Loading...
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <linux/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/sched/mm.h>
85#include <linux/mutex.h>
86#include <linux/string.h>
87#include <linux/mm.h>
88#include <linux/socket.h>
89#include <linux/sockios.h>
90#include <linux/errno.h>
91#include <linux/interrupt.h>
92#include <linux/if_ether.h>
93#include <linux/netdevice.h>
94#include <linux/etherdevice.h>
95#include <linux/ethtool.h>
96#include <linux/notifier.h>
97#include <linux/skbuff.h>
98#include <linux/bpf.h>
99#include <linux/bpf_trace.h>
100#include <net/net_namespace.h>
101#include <net/sock.h>
102#include <net/busy_poll.h>
103#include <linux/rtnetlink.h>
104#include <linux/stat.h>
105#include <net/dst.h>
106#include <net/dst_metadata.h>
107#include <net/pkt_sched.h>
108#include <net/pkt_cls.h>
109#include <net/checksum.h>
110#include <net/xfrm.h>
111#include <linux/highmem.h>
112#include <linux/init.h>
113#include <linux/module.h>
114#include <linux/netpoll.h>
115#include <linux/rcupdate.h>
116#include <linux/delay.h>
117#include <net/iw_handler.h>
118#include <asm/current.h>
119#include <linux/audit.h>
120#include <linux/dmaengine.h>
121#include <linux/err.h>
122#include <linux/ctype.h>
123#include <linux/if_arp.h>
124#include <linux/if_vlan.h>
125#include <linux/ip.h>
126#include <net/ip.h>
127#include <net/mpls.h>
128#include <linux/ipv6.h>
129#include <linux/in.h>
130#include <linux/jhash.h>
131#include <linux/random.h>
132#include <trace/events/napi.h>
133#include <trace/events/net.h>
134#include <trace/events/skb.h>
135#include <linux/pci.h>
136#include <linux/inetdevice.h>
137#include <linux/cpu_rmap.h>
138#include <linux/static_key.h>
139#include <linux/hashtable.h>
140#include <linux/vmalloc.h>
141#include <linux/if_macvlan.h>
142#include <linux/errqueue.h>
143#include <linux/hrtimer.h>
144#include <linux/netfilter_ingress.h>
145#include <linux/crash_dump.h>
146#include <linux/sctp.h>
147#include <net/udp_tunnel.h>
148#include <linux/net_namespace.h>
149
150#include "net-sysfs.h"
151
152/* Instead of increasing this, you should create a hash table. */
153#define MAX_GRO_SKBS 8
154
155/* This should be increased if a protocol with a bigger head is added. */
156#define GRO_MAX_HEAD (MAX_HEADER + 128)
157
158static DEFINE_SPINLOCK(ptype_lock);
159static DEFINE_SPINLOCK(offload_lock);
160struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
161struct list_head ptype_all __read_mostly; /* Taps */
162static struct list_head offload_base __read_mostly;
163
164static int netif_rx_internal(struct sk_buff *skb);
165static int call_netdevice_notifiers_info(unsigned long val,
166 struct netdev_notifier_info *info);
167static struct napi_struct *napi_by_id(unsigned int napi_id);
168
169/*
170 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
171 * semaphore.
172 *
173 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
174 *
175 * Writers must hold the rtnl semaphore while they loop through the
176 * dev_base_head list, and hold dev_base_lock for writing when they do the
177 * actual updates. This allows pure readers to access the list even
178 * while a writer is preparing to update it.
179 *
180 * To put it another way, dev_base_lock is held for writing only to
181 * protect against pure readers; the rtnl semaphore provides the
182 * protection against other writers.
183 *
184 * See, for example usages, register_netdevice() and
185 * unregister_netdevice(), which must be called with the rtnl
186 * semaphore held.
187 */
188DEFINE_RWLOCK(dev_base_lock);
189EXPORT_SYMBOL(dev_base_lock);
190
191static DEFINE_MUTEX(ifalias_mutex);
192
193/* protects napi_hash addition/deletion and napi_gen_id */
194static DEFINE_SPINLOCK(napi_hash_lock);
195
196static unsigned int napi_gen_id = NR_CPUS;
197static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
198
199static seqcount_t devnet_rename_seq;
200
201static inline void dev_base_seq_inc(struct net *net)
202{
203 while (++net->dev_base_seq == 0)
204 ;
205}
206
207static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
208{
209 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
210
211 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
212}
213
214static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
215{
216 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
217}
218
/*
 * Take the lock of @sd's input packet queue.
 * Compiles to nothing unless CONFIG_RPS is set.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_lock(queue_lock);
#endif
}
225
/*
 * Release the lock of @sd's input packet queue (counterpart of rps_lock()).
 * Compiles to nothing unless CONFIG_RPS is set.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_unlock(queue_lock);
#endif
}
232
233/* Device list insertion */
234static void list_netdevice(struct net_device *dev)
235{
236 struct net *net = dev_net(dev);
237
238 ASSERT_RTNL();
239
240 write_lock_bh(&dev_base_lock);
241 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
242 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
243 hlist_add_head_rcu(&dev->index_hlist,
244 dev_index_hash(net, dev->ifindex));
245 write_unlock_bh(&dev_base_lock);
246
247 dev_base_seq_inc(net);
248}
249
250/* Device list removal
251 * caller must respect a RCU grace period before freeing/reusing dev
252 */
253static void unlist_netdevice(struct net_device *dev)
254{
255 ASSERT_RTNL();
256
257 /* Unlink dev from the device chain */
258 write_lock_bh(&dev_base_lock);
259 list_del_rcu(&dev->dev_list);
260 hlist_del_rcu(&dev->name_hlist);
261 hlist_del_rcu(&dev->index_hlist);
262 write_unlock_bh(&dev_base_lock);
263
264 dev_base_seq_inc(dev_net(dev));
265}
266
267/*
268 * Our notifier list
269 */
270
271static RAW_NOTIFIER_HEAD(netdev_chain);
272
273/*
274 * Device drivers call our routines to queue packets here. We empty the
275 * queue in the local softnet handler.
276 */
277
278DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
279EXPORT_PER_CPU_SYMBOL(softnet_data);
280
281#ifdef CONFIG_LOCKDEP
282/*
283 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
284 * according to dev->type
285 */
286static const unsigned short netdev_lock_type[] = {
287 ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
288 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
289 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
290 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
291 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
292 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
293 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
294 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
295 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
296 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
297 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
298 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
299 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
300 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
301 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
302
303static const char *const netdev_lock_name[] = {
304 "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
305 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
306 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
307 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
308 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
309 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
310 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
311 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
312 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
313 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
314 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
315 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
316 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
317 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
318 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
319
320static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
321static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
322
323static inline unsigned short netdev_lock_pos(unsigned short dev_type)
324{
325 int i;
326
327 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
328 if (netdev_lock_type[i] == dev_type)
329 return i;
330 /* the last key is used by default */
331 return ARRAY_SIZE(netdev_lock_type) - 1;
332}
333
334static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
335 unsigned short dev_type)
336{
337 int i;
338
339 i = netdev_lock_pos(dev_type);
340 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
341 netdev_lock_name[i]);
342}
343
344static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
345{
346 int i;
347
348 i = netdev_lock_pos(dev->type);
349 lockdep_set_class_and_name(&dev->addr_list_lock,
350 &netdev_addr_lock_key[i],
351 netdev_lock_name[i]);
352}
353#else
354static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
355 unsigned short dev_type)
356{
357}
358static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
359{
360}
361#endif
362
363/*******************************************************************************
364 *
365 * Protocol management and registration routines
366 *
367 *******************************************************************************/
368
369
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the search for protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets were
 *	first on the list, it could not tell that the packet is cloned
 *	and should be copied-on-write, so it would change the packet in
 *	place and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
385
386static inline struct list_head *ptype_head(const struct packet_type *pt)
387{
388 if (pt->type == htons(ETH_P_ALL))
389 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
390 else
391 return pt->dev ? &pt->dev->ptype_specific :
392 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
393}
394
395/**
396 * dev_add_pack - add packet handler
397 * @pt: packet type declaration
398 *
399 * Add a protocol handler to the networking stack. The passed &packet_type
400 * is linked into kernel lists and may not be freed until it has been
401 * removed from the kernel lists.
402 *
403 * This call does not sleep therefore it can not
404 * guarantee all CPU's that are in middle of receiving packets
405 * will see the new packet type (until the next received packet).
406 */
407
408void dev_add_pack(struct packet_type *pt)
409{
410 struct list_head *head = ptype_head(pt);
411
412 spin_lock(&ptype_lock);
413 list_add_rcu(&pt->list, head);
414 spin_unlock(&ptype_lock);
415}
416EXPORT_SYMBOL(dev_add_pack);
417
418/**
419 * __dev_remove_pack - remove packet handler
420 * @pt: packet type declaration
421 *
422 * Remove a protocol handler that was previously added to the kernel
423 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
424 * from the kernel lists and can be freed or reused once this function
425 * returns.
426 *
427 * The packet type might still be in use by receivers
428 * and must not be freed until after all the CPU's have gone
429 * through a quiescent state.
430 */
431void __dev_remove_pack(struct packet_type *pt)
432{
433 struct list_head *head = ptype_head(pt);
434 struct packet_type *pt1;
435
436 spin_lock(&ptype_lock);
437
438 list_for_each_entry(pt1, head, list) {
439 if (pt == pt1) {
440 list_del_rcu(&pt->list);
441 goto out;
442 }
443 }
444
445 pr_warn("dev_remove_pack: %p not found\n", pt);
446out:
447 spin_unlock(&ptype_lock);
448}
449EXPORT_SYMBOL(__dev_remove_pack);
450
/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously added with
 *	dev_add_pack() and then call synchronize_net().  @pt can be freed or
 *	reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first, then wait so that no receiver still references @pt. */
	__dev_remove_pack(pt);
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
470
471
472/**
473 * dev_add_offload - register offload handlers
474 * @po: protocol offload declaration
475 *
476 * Add protocol offload handlers to the networking stack. The passed
477 * &proto_offload is linked into kernel lists and may not be freed until
478 * it has been removed from the kernel lists.
479 *
480 * This call does not sleep therefore it can not
481 * guarantee all CPU's that are in middle of receiving packets
482 * will see the new offload handlers (until the next received packet).
483 */
484void dev_add_offload(struct packet_offload *po)
485{
486 struct packet_offload *elem;
487
488 spin_lock(&offload_lock);
489 list_for_each_entry(elem, &offload_base, list) {
490 if (po->priority < elem->priority)
491 break;
492 }
493 list_add_rcu(&po->list, elem->list.prev);
494 spin_unlock(&offload_lock);
495}
496EXPORT_SYMBOL(dev_add_offload);
497
498/**
499 * __dev_remove_offload - remove offload handler
500 * @po: packet offload declaration
501 *
502 * Remove a protocol offload handler that was previously added to the
503 * kernel offload handlers by dev_add_offload(). The passed &offload_type
504 * is removed from the kernel lists and can be freed or reused once this
505 * function returns.
506 *
507 * The packet type might still be in use by receivers
508 * and must not be freed until after all the CPU's have gone
509 * through a quiescent state.
510 */
511static void __dev_remove_offload(struct packet_offload *po)
512{
513 struct list_head *head = &offload_base;
514 struct packet_offload *po1;
515
516 spin_lock(&offload_lock);
517
518 list_for_each_entry(po1, head, list) {
519 if (po == po1) {
520 list_del_rcu(&po->list);
521 goto out;
522 }
523 }
524
525 pr_warn("dev_remove_offload: %p not found\n", po);
526out:
527 spin_unlock(&offload_lock);
528}
529
/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Unlink a packet offload handler that was previously added with
 *	dev_add_offload() and then call synchronize_net().  @po can be freed
 *	or reused once this function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the offload
 *	after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	/* Unlink first, then wait so that no receiver still references @po. */
	__dev_remove_offload(po);
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
549
550/******************************************************************************
551 *
552 * Device Boot-time Settings Routines
553 *
554 ******************************************************************************/
555
556/* Boot time configuration table */
557static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
558
559/**
560 * netdev_boot_setup_add - add new setup entry
561 * @name: name of the device
562 * @map: configured settings for the device
563 *
564 * Adds new setup entry to the dev_boot_setup list. The function
565 * returns 0 on error and 1 on success. This is a generic routine to
566 * all netdevices.
567 */
568static int netdev_boot_setup_add(char *name, struct ifmap *map)
569{
570 struct netdev_boot_setup *s;
571 int i;
572
573 s = dev_boot_setup;
574 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
575 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
576 memset(s[i].name, 0, sizeof(s[i].name));
577 strlcpy(s[i].name, name, IFNAMSIZ);
578 memcpy(&s[i].map, map, sizeof(s[i].map));
579 break;
580 }
581 }
582
583 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
584}
585
586/**
587 * netdev_boot_setup_check - check boot time settings
588 * @dev: the netdevice
589 *
590 * Check boot time settings for the device.
591 * The found settings are set for the device to be used
592 * later in the device probing.
593 * Returns 0 if no settings found, 1 if they are.
594 */
595int netdev_boot_setup_check(struct net_device *dev)
596{
597 struct netdev_boot_setup *s = dev_boot_setup;
598 int i;
599
600 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
601 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
602 !strcmp(dev->name, s[i].name)) {
603 dev->irq = s[i].map.irq;
604 dev->base_addr = s[i].map.base_addr;
605 dev->mem_start = s[i].map.mem_start;
606 dev->mem_end = s[i].map.mem_end;
607 return 1;
608 }
609 }
610 return 0;
611}
612EXPORT_SYMBOL(netdev_boot_setup_check);
613
614
615/**
616 * netdev_boot_base - get address from boot time settings
617 * @prefix: prefix for network device
618 * @unit: id for network device
619 *
620 * Check boot time settings for the base address of device.
621 * The found settings are set for the device to be used
622 * later in the device probing.
623 * Returns 0 if no settings found.
624 */
625unsigned long netdev_boot_base(const char *prefix, int unit)
626{
627 const struct netdev_boot_setup *s = dev_boot_setup;
628 char name[IFNAMSIZ];
629 int i;
630
631 sprintf(name, "%s%d", prefix, unit);
632
633 /*
634 * If device already registered then return base of 1
635 * to indicate not to probe for this interface
636 */
637 if (__dev_get_by_name(&init_net, name))
638 return 1;
639
640 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
641 if (!strcmp(name, s[i].name))
642 return s[i].map.base_addr;
643 return 0;
644}
645
646/*
647 * Saves at boot time configured settings for any netdevice.
648 */
649int __init netdev_boot_setup(char *str)
650{
651 int ints[5];
652 struct ifmap map;
653
654 str = get_options(str, ARRAY_SIZE(ints), ints);
655 if (!str || !*str)
656 return 0;
657
658 /* Save settings */
659 memset(&map, 0, sizeof(map));
660 if (ints[0] > 0)
661 map.irq = ints[1];
662 if (ints[0] > 1)
663 map.base_addr = ints[2];
664 if (ints[0] > 2)
665 map.mem_start = ints[3];
666 if (ints[0] > 3)
667 map.mem_end = ints[4];
668
669 /* Add new entry to the list */
670 return netdev_boot_setup_add(str, &map);
671}
672
673__setup("netdev=", netdev_boot_setup);
674
675/*******************************************************************************
676 *
677 * Device Interface Subroutines
678 *
679 *******************************************************************************/
680
681/**
682 * dev_get_iflink - get 'iflink' value of a interface
683 * @dev: targeted interface
684 *
685 * Indicates the ifindex the interface is linked to.
686 * Physical interfaces have the same 'ifindex' and 'iflink' values.
687 */
688
689int dev_get_iflink(const struct net_device *dev)
690{
691 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
692 return dev->netdev_ops->ndo_get_iflink(dev);
693
694 return dev->ifindex;
695}
696EXPORT_SYMBOL(dev_get_iflink);
697
698/**
699 * dev_fill_metadata_dst - Retrieve tunnel egress information.
700 * @dev: targeted interface
701 * @skb: The packet.
702 *
703 * For better visibility of tunnel traffic OVS needs to retrieve
704 * egress tunnel information for a packet. Following API allows
705 * user to get this info.
706 */
707int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
708{
709 struct ip_tunnel_info *info;
710
711 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
712 return -EINVAL;
713
714 info = skb_tunnel_info_unclone(skb);
715 if (!info)
716 return -ENOMEM;
717 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
718 return -EINVAL;
719
720 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
721}
722EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
723
724/**
725 * __dev_get_by_name - find a device by its name
726 * @net: the applicable net namespace
727 * @name: name to find
728 *
729 * Find an interface by name. Must be called under RTNL semaphore
730 * or @dev_base_lock. If the name is found a pointer to the device
731 * is returned. If the name is not found then %NULL is returned. The
732 * reference counters are not incremented so the caller must be
733 * careful with locks.
734 */
735
736struct net_device *__dev_get_by_name(struct net *net, const char *name)
737{
738 struct net_device *dev;
739 struct hlist_head *head = dev_name_hash(net, name);
740
741 hlist_for_each_entry(dev, head, name_hlist)
742 if (!strncmp(dev->name, name, IFNAMSIZ))
743 return dev;
744
745 return NULL;
746}
747EXPORT_SYMBOL(__dev_get_by_name);
748
749/**
750 * dev_get_by_name_rcu - find a device by its name
751 * @net: the applicable net namespace
752 * @name: name to find
753 *
754 * Find an interface by name.
755 * If the name is found a pointer to the device is returned.
756 * If the name is not found then %NULL is returned.
757 * The reference counters are not incremented so the caller must be
758 * careful with locks. The caller must hold RCU lock.
759 */
760
761struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
762{
763 struct net_device *dev;
764 struct hlist_head *head = dev_name_hash(net, name);
765
766 hlist_for_each_entry_rcu(dev, head, name_hlist)
767 if (!strncmp(dev->name, name, IFNAMSIZ))
768 return dev;
769
770 return NULL;
771}
772EXPORT_SYMBOL(dev_get_by_name_rcu);
773
774/**
775 * dev_get_by_name - find a device by its name
776 * @net: the applicable net namespace
777 * @name: name to find
778 *
779 * Find an interface by name. This can be called from any
780 * context and does its own locking. The returned handle has
781 * the usage count incremented and the caller must use dev_put() to
782 * release it when it is no longer needed. %NULL is returned if no
783 * matching device is found.
784 */
785
786struct net_device *dev_get_by_name(struct net *net, const char *name)
787{
788 struct net_device *dev;
789
790 rcu_read_lock();
791 dev = dev_get_by_name_rcu(net, name);
792 if (dev)
793 dev_hold(dev);
794 rcu_read_unlock();
795 return dev;
796}
797EXPORT_SYMBOL(dev_get_by_name);
798
799/**
800 * __dev_get_by_index - find a device by its ifindex
801 * @net: the applicable net namespace
802 * @ifindex: index of device
803 *
804 * Search for an interface by index. Returns %NULL if the device
805 * is not found or a pointer to the device. The device has not
806 * had its reference counter increased so the caller must be careful
807 * about locking. The caller must hold either the RTNL semaphore
808 * or @dev_base_lock.
809 */
810
811struct net_device *__dev_get_by_index(struct net *net, int ifindex)
812{
813 struct net_device *dev;
814 struct hlist_head *head = dev_index_hash(net, ifindex);
815
816 hlist_for_each_entry(dev, head, index_hlist)
817 if (dev->ifindex == ifindex)
818 return dev;
819
820 return NULL;
821}
822EXPORT_SYMBOL(__dev_get_by_index);
823
824/**
825 * dev_get_by_index_rcu - find a device by its ifindex
826 * @net: the applicable net namespace
827 * @ifindex: index of device
828 *
829 * Search for an interface by index. Returns %NULL if the device
830 * is not found or a pointer to the device. The device has not
831 * had its reference counter increased so the caller must be careful
832 * about locking. The caller must hold RCU lock.
833 */
834
835struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
836{
837 struct net_device *dev;
838 struct hlist_head *head = dev_index_hash(net, ifindex);
839
840 hlist_for_each_entry_rcu(dev, head, index_hlist)
841 if (dev->ifindex == ifindex)
842 return dev;
843
844 return NULL;
845}
846EXPORT_SYMBOL(dev_get_by_index_rcu);
847
848
849/**
850 * dev_get_by_index - find a device by its ifindex
851 * @net: the applicable net namespace
852 * @ifindex: index of device
853 *
854 * Search for an interface by index. Returns NULL if the device
855 * is not found or a pointer to the device. The device returned has
856 * had a reference added and the pointer is safe until the user calls
857 * dev_put to indicate they have finished with it.
858 */
859
860struct net_device *dev_get_by_index(struct net *net, int ifindex)
861{
862 struct net_device *dev;
863
864 rcu_read_lock();
865 dev = dev_get_by_index_rcu(net, ifindex);
866 if (dev)
867 dev_hold(dev);
868 rcu_read_unlock();
869 return dev;
870}
871EXPORT_SYMBOL(dev_get_by_index);
872
873/**
874 * dev_get_by_napi_id - find a device by napi_id
875 * @napi_id: ID of the NAPI struct
876 *
877 * Search for an interface by NAPI ID. Returns %NULL if the device
878 * is not found or a pointer to the device. The device has not had
879 * its reference counter increased so the caller must be careful
880 * about locking. The caller must hold RCU lock.
881 */
882
883struct net_device *dev_get_by_napi_id(unsigned int napi_id)
884{
885 struct napi_struct *napi;
886
887 WARN_ON_ONCE(!rcu_read_lock_held());
888
889 if (napi_id < MIN_NAPI_ID)
890 return NULL;
891
892 napi = napi_by_id(napi_id);
893
894 return napi ? napi->dev : NULL;
895}
896EXPORT_SYMBOL(dev_get_by_napi_id);
897
898/**
899 * netdev_get_name - get a netdevice name, knowing its ifindex.
900 * @net: network namespace
901 * @name: a pointer to the buffer where the name will be stored.
902 * @ifindex: the ifindex of the interface to get the name from.
903 *
904 * The use of raw_seqcount_begin() and cond_resched() before
905 * retrying is required as we want to give the writers a chance
906 * to complete when CONFIG_PREEMPT is not set.
907 */
908int netdev_get_name(struct net *net, char *name, int ifindex)
909{
910 struct net_device *dev;
911 unsigned int seq;
912
913retry:
914 seq = raw_seqcount_begin(&devnet_rename_seq);
915 rcu_read_lock();
916 dev = dev_get_by_index_rcu(net, ifindex);
917 if (!dev) {
918 rcu_read_unlock();
919 return -ENODEV;
920 }
921
922 strcpy(name, dev->name);
923 rcu_read_unlock();
924 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
925 cond_resched();
926 goto retry;
927 }
928
929 return 0;
930}
931
932/**
933 * dev_getbyhwaddr_rcu - find a device by its hardware address
934 * @net: the applicable net namespace
935 * @type: media type of device
936 * @ha: hardware address
937 *
938 * Search for an interface by MAC address. Returns NULL if the device
939 * is not found or a pointer to the device.
940 * The caller must hold RCU or RTNL.
941 * The returned device has not had its ref count increased
942 * and the caller must therefore be careful about locking
943 *
944 */
945
946struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
947 const char *ha)
948{
949 struct net_device *dev;
950
951 for_each_netdev_rcu(net, dev)
952 if (dev->type == type &&
953 !memcmp(dev->dev_addr, ha, dev->addr_len))
954 return dev;
955
956 return NULL;
957}
958EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
959
960struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
961{
962 struct net_device *dev;
963
964 ASSERT_RTNL();
965 for_each_netdev(net, dev)
966 if (dev->type == type)
967 return dev;
968
969 return NULL;
970}
971EXPORT_SYMBOL(__dev_getfirstbyhwtype);
972
973struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
974{
975 struct net_device *dev, *ret = NULL;
976
977 rcu_read_lock();
978 for_each_netdev_rcu(net, dev)
979 if (dev->type == type) {
980 dev_hold(dev);
981 ret = dev;
982 break;
983 }
984 rcu_read_unlock();
985 return ret;
986}
987EXPORT_SYMBOL(dev_getfirstbyhwtype);
988
989/**
990 * __dev_get_by_flags - find any device with given flags
991 * @net: the applicable net namespace
992 * @if_flags: IFF_* values
993 * @mask: bitmask of bits in if_flags to check
994 *
995 * Search for any interface with the given flags. Returns NULL if a device
996 * is not found or a pointer to the device. Must be called inside
997 * rtnl_lock(), and result refcount is unchanged.
998 */
999
1000struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1001 unsigned short mask)
1002{
1003 struct net_device *dev, *ret;
1004
1005 ASSERT_RTNL();
1006
1007 ret = NULL;
1008 for_each_netdev(net, dev) {
1009 if (((dev->flags ^ if_flags) & mask) == 0) {
1010 ret = dev;
1011 break;
1012 }
1013 }
1014 return ret;
1015}
1016EXPORT_SYMBOL(__dev_get_by_flags);
1017
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to allow sysfs to
 *	work.  A name is rejected when it is empty, is not NUL-terminated
 *	within IFNAMSIZ bytes, is "." or "..", or contains '/', ':' or any
 *	kind of whitespace.
 *
 *	Returns true if @name is acceptable, false otherwise.
 */
bool dev_valid_name(const char *name)
{
	const char *p;

	if (name[0] == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
		return false;

	for (p = name; *p != '\0'; p++) {
		if (*p == '/' || *p == ':' || isspace(*p))
			return false;
	}

	return true;
}
EXPORT_SYMBOL(dev_valid_name);
1043
1044/**
1045 * __dev_alloc_name - allocate a name for a device
1046 * @net: network namespace to allocate the device name in
1047 * @name: name format string
1048 * @buf: scratch buffer and result name string
1049 *
1050 * Passed a format string - eg "lt%d" it will try and find a suitable
1051 * id. It scans list of devices to build up a free map, then chooses
1052 * the first empty slot. The caller must hold the dev_base or rtnl lock
1053 * while allocating the name and adding the device in order to avoid
1054 * duplicates.
1055 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1056 * Returns the number of the unit assigned or a negative errno code.
1057 */
1058
1059static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1060{
1061 int i = 0;
1062 const char *p;
1063 const int max_netdevices = 8*PAGE_SIZE;
1064 unsigned long *inuse;
1065 struct net_device *d;
1066
1067 if (!dev_valid_name(name))
1068 return -EINVAL;
1069
1070 p = strchr(name, '%');
1071 if (p) {
1072 /*
1073 * Verify the string as this thing may have come from
1074 * the user. There must be either one "%d" and no other "%"
1075 * characters.
1076 */
1077 if (p[1] != 'd' || strchr(p + 2, '%'))
1078 return -EINVAL;
1079
1080 /* Use one page as a bit array of possible slots */
1081 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1082 if (!inuse)
1083 return -ENOMEM;
1084
1085 for_each_netdev(net, d) {
1086 if (!sscanf(d->name, name, &i))
1087 continue;
1088 if (i < 0 || i >= max_netdevices)
1089 continue;
1090
1091 /* avoid cases where sscanf is not exact inverse of printf */
1092 snprintf(buf, IFNAMSIZ, name, i);
1093 if (!strncmp(buf, d->name, IFNAMSIZ))
1094 set_bit(i, inuse);
1095 }
1096
1097 i = find_first_zero_bit(inuse, max_netdevices);
1098 free_page((unsigned long) inuse);
1099 }
1100
1101 snprintf(buf, IFNAMSIZ, name, i);
1102 if (!__dev_get_by_name(net, buf))
1103 return i;
1104
1105 /* It is possible to run out of possible slots
1106 * when the name is long and there isn't enough space left
1107 * for the digits, or if all bits are used.
1108 */
1109 return -ENFILE;
1110}
1111
1112static int dev_alloc_name_ns(struct net *net,
1113 struct net_device *dev,
1114 const char *name)
1115{
1116 char buf[IFNAMSIZ];
1117 int ret;
1118
1119 BUG_ON(!net);
1120 ret = __dev_alloc_name(net, name, buf);
1121 if (ret >= 0)
1122 strlcpy(dev->name, buf, IFNAMSIZ);
1123 return ret;
1124}
1125
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Allocate a name for @dev inside the namespace returned by
 *	dev_net(@dev).  Given a format string such as "lt%d", the device
 *	list is scanned to build a free map and the first empty slot is
 *	chosen.  The caller must hold the dev_base or rtnl lock while
 *	allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	struct net *net = dev_net(dev);

	return dev_alloc_name_ns(net, dev, name);
}
EXPORT_SYMBOL(dev_alloc_name);
1145
1146int dev_get_valid_name(struct net *net, struct net_device *dev,
1147 const char *name)
1148{
1149 BUG_ON(!net);
1150
1151 if (!dev_valid_name(name))
1152 return -EINVAL;
1153
1154 if (strchr(name, '%'))
1155 return dev_alloc_name_ns(net, dev, name);
1156 else if (__dev_get_by_name(net, name))
1157 return -EEXIST;
1158 else if (dev->name != name)
1159 strlcpy(dev->name, name, IFNAMSIZ);
1160
1161 return 0;
1162}
1163EXPORT_SYMBOL(dev_get_valid_name);
1164
1165/**
1166 * dev_change_name - change name of a device
1167 * @dev: device
1168 * @newname: name (or format string) must be at least IFNAMSIZ
1169 *
1170 * Change name of a device, can pass format strings "eth%d".
1171 * for wildcarding.
1172 */
1173int dev_change_name(struct net_device *dev, const char *newname)
1174{
1175 unsigned char old_assign_type;
1176 char oldname[IFNAMSIZ];
1177 int err = 0;
1178 int ret;
1179 struct net *net;
1180
1181 ASSERT_RTNL();
1182 BUG_ON(!dev_net(dev));
1183
1184 net = dev_net(dev);
1185 if (dev->flags & IFF_UP)
1186 return -EBUSY;
1187
1188 write_seqcount_begin(&devnet_rename_seq);
1189
1190 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1191 write_seqcount_end(&devnet_rename_seq);
1192 return 0;
1193 }
1194
1195 memcpy(oldname, dev->name, IFNAMSIZ);
1196
1197 err = dev_get_valid_name(net, dev, newname);
1198 if (err < 0) {
1199 write_seqcount_end(&devnet_rename_seq);
1200 return err;
1201 }
1202
1203 if (oldname[0] && !strchr(oldname, '%'))
1204 netdev_info(dev, "renamed from %s\n", oldname);
1205
1206 old_assign_type = dev->name_assign_type;
1207 dev->name_assign_type = NET_NAME_RENAMED;
1208
1209rollback:
1210 ret = device_rename(&dev->dev, dev->name);
1211 if (ret) {
1212 memcpy(dev->name, oldname, IFNAMSIZ);
1213 dev->name_assign_type = old_assign_type;
1214 write_seqcount_end(&devnet_rename_seq);
1215 return ret;
1216 }
1217
1218 write_seqcount_end(&devnet_rename_seq);
1219
1220 netdev_adjacent_rename_links(dev, oldname);
1221
1222 write_lock_bh(&dev_base_lock);
1223 hlist_del_rcu(&dev->name_hlist);
1224 write_unlock_bh(&dev_base_lock);
1225
1226 synchronize_rcu();
1227
1228 write_lock_bh(&dev_base_lock);
1229 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1230 write_unlock_bh(&dev_base_lock);
1231
1232 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1233 ret = notifier_to_errno(ret);
1234
1235 if (ret) {
1236 /* err >= 0 after dev_alloc_name() or stores the first errno */
1237 if (err >= 0) {
1238 err = ret;
1239 write_seqcount_begin(&devnet_rename_seq);
1240 memcpy(dev->name, oldname, IFNAMSIZ);
1241 memcpy(oldname, newname, IFNAMSIZ);
1242 dev->name_assign_type = old_assign_type;
1243 old_assign_type = NET_NAME_RENAMED;
1244 goto rollback;
1245 } else {
1246 pr_err("%s: name change rollback failed: %d\n",
1247 dev->name, ret);
1248 }
1249 }
1250
1251 return err;
1252}
1253
1254/**
1255 * dev_set_alias - change ifalias of a device
1256 * @dev: device
1257 * @alias: name up to IFALIASZ
1258 * @len: limit of bytes to copy from info
1259 *
1260 * Set ifalias for a device,
1261 */
1262int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1263{
1264 struct dev_ifalias *new_alias = NULL;
1265
1266 if (len >= IFALIASZ)
1267 return -EINVAL;
1268
1269 if (len) {
1270 new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1271 if (!new_alias)
1272 return -ENOMEM;
1273
1274 memcpy(new_alias->ifalias, alias, len);
1275 new_alias->ifalias[len] = 0;
1276 }
1277
1278 mutex_lock(&ifalias_mutex);
1279 rcu_swap_protected(dev->ifalias, new_alias,
1280 mutex_is_locked(&ifalias_mutex));
1281 mutex_unlock(&ifalias_mutex);
1282
1283 if (new_alias)
1284 kfree_rcu(new_alias, rcuhead);
1285
1286 return len;
1287}
1288
1289/**
1290 * dev_get_alias - get ifalias of a device
1291 * @dev: device
1292 * @name: buffer to store name of ifalias
1293 * @len: size of buffer
1294 *
1295 * get ifalias for a device. Caller must make sure dev cannot go
1296 * away, e.g. rcu read lock or own a reference count to device.
1297 */
1298int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1299{
1300 const struct dev_ifalias *alias;
1301 int ret = 0;
1302
1303 rcu_read_lock();
1304 alias = rcu_dereference(dev->ifalias);
1305 if (alias)
1306 ret = snprintf(name, len, "%s", alias->ifalias);
1307 rcu_read_unlock();
1308
1309 return ret;
1310}
1311
1312/**
1313 * netdev_features_change - device changes features
1314 * @dev: device to cause notification
1315 *
1316 * Called to indicate a device has changed features.
1317 */
1318void netdev_features_change(struct net_device *dev)
1319{
1320 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1321}
1322EXPORT_SYMBOL(netdev_features_change);
1323
1324/**
1325 * netdev_state_change - device changes state
1326 * @dev: device to cause notification
1327 *
1328 * Called to indicate a device has changed state. This function calls
1329 * the notifier chains for netdev_chain and sends a NEWLINK message
1330 * to the routing socket.
1331 */
1332void netdev_state_change(struct net_device *dev)
1333{
1334 if (dev->flags & IFF_UP) {
1335 struct netdev_notifier_change_info change_info = {
1336 .info.dev = dev,
1337 };
1338
1339 call_netdevice_notifiers_info(NETDEV_CHANGE,
1340 &change_info.info);
1341 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1342 }
1343}
1344EXPORT_SYMBOL(netdev_state_change);
1345
1346/**
1347 * netdev_notify_peers - notify network peers about existence of @dev
1348 * @dev: network device
1349 *
1350 * Generate traffic such that interested network peers are aware of
1351 * @dev, such as by generating a gratuitous ARP. This may be used when
1352 * a device wants to inform the rest of the network about some sort of
1353 * reconfiguration such as a failover event or virtual machine
1354 * migration.
1355 */
1356void netdev_notify_peers(struct net_device *dev)
1357{
1358 rtnl_lock();
1359 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1360 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1361 rtnl_unlock();
1362}
1363EXPORT_SYMBOL(netdev_notify_peers);
1364
1365static int __dev_open(struct net_device *dev)
1366{
1367 const struct net_device_ops *ops = dev->netdev_ops;
1368 int ret;
1369
1370 ASSERT_RTNL();
1371
1372 if (!netif_device_present(dev))
1373 return -ENODEV;
1374
1375 /* Block netpoll from trying to do any rx path servicing.
1376 * If we don't do this there is a chance ndo_poll_controller
1377 * or ndo_poll may be running while we open the device
1378 */
1379 netpoll_poll_disable(dev);
1380
1381 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1382 ret = notifier_to_errno(ret);
1383 if (ret)
1384 return ret;
1385
1386 set_bit(__LINK_STATE_START, &dev->state);
1387
1388 if (ops->ndo_validate_addr)
1389 ret = ops->ndo_validate_addr(dev);
1390
1391 if (!ret && ops->ndo_open)
1392 ret = ops->ndo_open(dev);
1393
1394 netpoll_poll_enable(dev);
1395
1396 if (ret)
1397 clear_bit(__LINK_STATE_START, &dev->state);
1398 else {
1399 dev->flags |= IFF_UP;
1400 dev_set_rx_mode(dev);
1401 dev_activate(dev);
1402 add_device_randomness(dev->dev_addr, dev->addr_len);
1403 }
1404
1405 return ret;
1406}
1407
1408/**
1409 * dev_open - prepare an interface for use.
1410 * @dev: device to open
1411 *
1412 * Takes a device from down to up state. The device's private open
1413 * function is invoked and then the multicast lists are loaded. Finally
1414 * the device is moved into the up state and a %NETDEV_UP message is
1415 * sent to the netdev notifier chain.
1416 *
1417 * Calling this function on an active interface is a nop. On a failure
1418 * a negative errno code is returned.
1419 */
1420int dev_open(struct net_device *dev)
1421{
1422 int ret;
1423
1424 if (dev->flags & IFF_UP)
1425 return 0;
1426
1427 ret = __dev_open(dev);
1428 if (ret < 0)
1429 return ret;
1430
1431 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1432 call_netdevice_notifiers(NETDEV_UP, dev);
1433
1434 return ret;
1435}
1436EXPORT_SYMBOL(dev_open);
1437
1438static void __dev_close_many(struct list_head *head)
1439{
1440 struct net_device *dev;
1441
1442 ASSERT_RTNL();
1443 might_sleep();
1444
1445 list_for_each_entry(dev, head, close_list) {
1446 /* Temporarily disable netpoll until the interface is down */
1447 netpoll_poll_disable(dev);
1448
1449 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1450
1451 clear_bit(__LINK_STATE_START, &dev->state);
1452
1453 /* Synchronize to scheduled poll. We cannot touch poll list, it
1454 * can be even on different cpu. So just clear netif_running().
1455 *
1456 * dev->stop() will invoke napi_disable() on all of it's
1457 * napi_struct instances on this device.
1458 */
1459 smp_mb__after_atomic(); /* Commit netif_running(). */
1460 }
1461
1462 dev_deactivate_many(head);
1463
1464 list_for_each_entry(dev, head, close_list) {
1465 const struct net_device_ops *ops = dev->netdev_ops;
1466
1467 /*
1468 * Call the device specific close. This cannot fail.
1469 * Only if device is UP
1470 *
1471 * We allow it to be called even after a DETACH hot-plug
1472 * event.
1473 */
1474 if (ops->ndo_stop)
1475 ops->ndo_stop(dev);
1476
1477 dev->flags &= ~IFF_UP;
1478 netpoll_poll_enable(dev);
1479 }
1480}
1481
1482static void __dev_close(struct net_device *dev)
1483{
1484 LIST_HEAD(single);
1485
1486 list_add(&dev->close_list, &single);
1487 __dev_close_many(&single);
1488 list_del(&single);
1489}
1490
1491void dev_close_many(struct list_head *head, bool unlink)
1492{
1493 struct net_device *dev, *tmp;
1494
1495 /* Remove the devices that don't need to be closed */
1496 list_for_each_entry_safe(dev, tmp, head, close_list)
1497 if (!(dev->flags & IFF_UP))
1498 list_del_init(&dev->close_list);
1499
1500 __dev_close_many(head);
1501
1502 list_for_each_entry_safe(dev, tmp, head, close_list) {
1503 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1504 call_netdevice_notifiers(NETDEV_DOWN, dev);
1505 if (unlink)
1506 list_del_init(&dev->close_list);
1507 }
1508}
1509EXPORT_SYMBOL(dev_close_many);
1510
1511/**
1512 * dev_close - shutdown an interface.
1513 * @dev: device to shutdown
1514 *
1515 * This function moves an active device into down state. A
1516 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1517 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1518 * chain.
1519 */
1520void dev_close(struct net_device *dev)
1521{
1522 if (dev->flags & IFF_UP) {
1523 LIST_HEAD(single);
1524
1525 list_add(&dev->close_list, &single);
1526 dev_close_many(&single, true);
1527 list_del(&single);
1528 }
1529}
1530EXPORT_SYMBOL(dev_close);
1531
1532
1533/**
1534 * dev_disable_lro - disable Large Receive Offload on a device
1535 * @dev: device
1536 *
1537 * Disable Large Receive Offload (LRO) on a net device. Must be
1538 * called under RTNL. This is needed if received packets may be
1539 * forwarded to another interface.
1540 */
1541void dev_disable_lro(struct net_device *dev)
1542{
1543 struct net_device *lower_dev;
1544 struct list_head *iter;
1545
1546 dev->wanted_features &= ~NETIF_F_LRO;
1547 netdev_update_features(dev);
1548
1549 if (unlikely(dev->features & NETIF_F_LRO))
1550 netdev_WARN(dev, "failed to disable LRO!\n");
1551
1552 netdev_for_each_lower_dev(dev, lower_dev, iter)
1553 dev_disable_lro(lower_dev);
1554}
1555EXPORT_SYMBOL(dev_disable_lro);
1556
1557/**
1558 * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1559 * @dev: device
1560 *
1561 * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1562 * called under RTNL. This is needed if Generic XDP is installed on
1563 * the device.
1564 */
1565static void dev_disable_gro_hw(struct net_device *dev)
1566{
1567 dev->wanted_features &= ~NETIF_F_GRO_HW;
1568 netdev_update_features(dev);
1569
1570 if (unlikely(dev->features & NETIF_F_GRO_HW))
1571 netdev_WARN(dev, "failed to disable GRO_HW!\n");
1572}
1573
1574const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1575{
1576#define N(val) \
1577 case NETDEV_##val: \
1578 return "NETDEV_" __stringify(val);
1579 switch (cmd) {
1580 N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1581 N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1582 N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1583 N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1584 N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1585 N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1586 N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1587 N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1588 N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1589 };
1590#undef N
1591 return "UNKNOWN_NETDEV_EVENT";
1592}
1593EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1594
1595static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1596 struct net_device *dev)
1597{
1598 struct netdev_notifier_info info = {
1599 .dev = dev,
1600 };
1601
1602 return nb->notifier_call(nb, val, &info);
1603}
1604
1605static int dev_boot_phase = 1;
1606
1607/**
1608 * register_netdevice_notifier - register a network notifier block
1609 * @nb: notifier
1610 *
1611 * Register a notifier to be called when network device events occur.
1612 * The notifier passed is linked into the kernel structures and must
1613 * not be reused until it has been unregistered. A negative errno code
1614 * is returned on a failure.
1615 *
1616 * When registered all registration and up events are replayed
1617 * to the new notifier to allow device to have a race free
1618 * view of the network device list.
1619 */
1620
1621int register_netdevice_notifier(struct notifier_block *nb)
1622{
1623 struct net_device *dev;
1624 struct net_device *last;
1625 struct net *net;
1626 int err;
1627
1628 /* Close race with setup_net() and cleanup_net() */
1629 down_write(&pernet_ops_rwsem);
1630 rtnl_lock();
1631 err = raw_notifier_chain_register(&netdev_chain, nb);
1632 if (err)
1633 goto unlock;
1634 if (dev_boot_phase)
1635 goto unlock;
1636 for_each_net(net) {
1637 for_each_netdev(net, dev) {
1638 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1639 err = notifier_to_errno(err);
1640 if (err)
1641 goto rollback;
1642
1643 if (!(dev->flags & IFF_UP))
1644 continue;
1645
1646 call_netdevice_notifier(nb, NETDEV_UP, dev);
1647 }
1648 }
1649
1650unlock:
1651 rtnl_unlock();
1652 up_write(&pernet_ops_rwsem);
1653 return err;
1654
1655rollback:
1656 last = dev;
1657 for_each_net(net) {
1658 for_each_netdev(net, dev) {
1659 if (dev == last)
1660 goto outroll;
1661
1662 if (dev->flags & IFF_UP) {
1663 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1664 dev);
1665 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1666 }
1667 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1668 }
1669 }
1670
1671outroll:
1672 raw_notifier_chain_unregister(&netdev_chain, nb);
1673 goto unlock;
1674}
1675EXPORT_SYMBOL(register_netdevice_notifier);
1676
1677/**
1678 * unregister_netdevice_notifier - unregister a network notifier block
1679 * @nb: notifier
1680 *
1681 * Unregister a notifier previously registered by
1682 * register_netdevice_notifier(). The notifier is unlinked into the
1683 * kernel structures and may then be reused. A negative errno code
1684 * is returned on a failure.
1685 *
1686 * After unregistering unregister and down device events are synthesized
1687 * for all devices on the device list to the removed notifier to remove
1688 * the need for special case cleanup code.
1689 */
1690
1691int unregister_netdevice_notifier(struct notifier_block *nb)
1692{
1693 struct net_device *dev;
1694 struct net *net;
1695 int err;
1696
1697 /* Close race with setup_net() and cleanup_net() */
1698 down_write(&pernet_ops_rwsem);
1699 rtnl_lock();
1700 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1701 if (err)
1702 goto unlock;
1703
1704 for_each_net(net) {
1705 for_each_netdev(net, dev) {
1706 if (dev->flags & IFF_UP) {
1707 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1708 dev);
1709 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1710 }
1711 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1712 }
1713 }
1714unlock:
1715 rtnl_unlock();
1716 up_write(&pernet_ops_rwsem);
1717 return err;
1718}
1719EXPORT_SYMBOL(unregister_netdevice_notifier);
1720
1721/**
1722 * call_netdevice_notifiers_info - call all network notifier blocks
1723 * @val: value passed unmodified to notifier function
1724 * @info: notifier information data
1725 *
1726 * Call all network notifier blocks. Parameters and return value
1727 * are as for raw_notifier_call_chain().
1728 */
1729
1730static int call_netdevice_notifiers_info(unsigned long val,
1731 struct netdev_notifier_info *info)
1732{
1733 ASSERT_RTNL();
1734 return raw_notifier_call_chain(&netdev_chain, val, info);
1735}
1736
1737/**
1738 * call_netdevice_notifiers - call all network notifier blocks
1739 * @val: value passed unmodified to notifier function
1740 * @dev: net_device pointer passed unmodified to notifier function
1741 *
1742 * Call all network notifier blocks. Parameters and return value
1743 * are as for raw_notifier_call_chain().
1744 */
1745
1746int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1747{
1748 struct netdev_notifier_info info = {
1749 .dev = dev,
1750 };
1751
1752 return call_netdevice_notifiers_info(val, &info);
1753}
1754EXPORT_SYMBOL(call_netdevice_notifiers);
1755
#ifdef CONFIG_NET_INGRESS
/* Static key counted up/down by the two helpers below */
static struct static_key ingress_needed __read_mostly;

/* Take one count on the ingress_needed static key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one count on the ingress_needed static key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1771
#ifdef CONFIG_NET_EGRESS
/* Static key counted up/down by the two helpers below */
static struct static_key egress_needed __read_mostly;

/* Take one count on the egress_needed static key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one count on the egress_needed static key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1787
1788static struct static_key netstamp_needed __read_mostly;
1789#ifdef HAVE_JUMP_LABEL
1790static atomic_t netstamp_needed_deferred;
1791static atomic_t netstamp_wanted;
1792static void netstamp_clear(struct work_struct *work)
1793{
1794 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1795 int wanted;
1796
1797 wanted = atomic_add_return(deferred, &netstamp_wanted);
1798 if (wanted > 0)
1799 static_key_enable(&netstamp_needed);
1800 else
1801 static_key_disable(&netstamp_needed);
1802}
1803static DECLARE_WORK(netstamp_work, netstamp_clear);
1804#endif
1805
1806void net_enable_timestamp(void)
1807{
1808#ifdef HAVE_JUMP_LABEL
1809 int wanted;
1810
1811 while (1) {
1812 wanted = atomic_read(&netstamp_wanted);
1813 if (wanted <= 0)
1814 break;
1815 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1816 return;
1817 }
1818 atomic_inc(&netstamp_needed_deferred);
1819 schedule_work(&netstamp_work);
1820#else
1821 static_key_slow_inc(&netstamp_needed);
1822#endif
1823}
1824EXPORT_SYMBOL(net_enable_timestamp);
1825
1826void net_disable_timestamp(void)
1827{
1828#ifdef HAVE_JUMP_LABEL
1829 int wanted;
1830
1831 while (1) {
1832 wanted = atomic_read(&netstamp_wanted);
1833 if (wanted <= 1)
1834 break;
1835 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1836 return;
1837 }
1838 atomic_dec(&netstamp_needed_deferred);
1839 schedule_work(&netstamp_work);
1840#else
1841 static_key_slow_dec(&netstamp_needed);
1842#endif
1843}
1844EXPORT_SYMBOL(net_disable_timestamp);
1845
1846static inline void net_timestamp_set(struct sk_buff *skb)
1847{
1848 skb->tstamp = 0;
1849 if (static_key_false(&netstamp_needed))
1850 __net_timestamp(skb);
1851}
1852
/*
 * net_timestamp_check - stamp @SKB when timestamps are wanted
 * @COND: extra condition that must hold for the stamp to be taken
 * @SKB:  buffer to stamp; left alone if it already carries a tstamp
 *
 * Does nothing unless the netstamp_needed key is enabled.  Wrapped in
 * do { } while (0) so the macro expands to a single statement and cannot
 * capture a following "else"; invoke it with a trailing semicolon.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp)		\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1858
1859bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1860{
1861 unsigned int len;
1862
1863 if (!(dev->flags & IFF_UP))
1864 return false;
1865
1866 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1867 if (skb->len <= len)
1868 return true;
1869
1870 /* if TSO is enabled, we don't care about the length as the packet
1871 * could be forwarded without being segmented before
1872 */
1873 if (skb_is_gso(skb))
1874 return true;
1875
1876 return false;
1877}
1878EXPORT_SYMBOL_GPL(is_skb_forwardable);
1879
1880int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1881{
1882 int ret = ____dev_forward_skb(dev, skb);
1883
1884 if (likely(!ret)) {
1885 skb->protocol = eth_type_trans(skb, dev);
1886 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1887 }
1888
1889 return ret;
1890}
1891EXPORT_SYMBOL_GPL(__dev_forward_skb);
1892
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 *
 * A non-zero result from __dev_forward_skb() is returned as is; only
 * when it returns 0 is the skb handed to netif_rx_internal().
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1916
1917static inline int deliver_skb(struct sk_buff *skb,
1918 struct packet_type *pt_prev,
1919 struct net_device *orig_dev)
1920{
1921 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
1922 return -ENOMEM;
1923 refcount_inc(&skb->users);
1924 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1925}
1926
1927static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1928 struct packet_type **pt,
1929 struct net_device *orig_dev,
1930 __be16 type,
1931 struct list_head *ptype_list)
1932{
1933 struct packet_type *ptype, *pt_prev = *pt;
1934
1935 list_for_each_entry_rcu(ptype, ptype_list, list) {
1936 if (ptype->type != type)
1937 continue;
1938 if (pt_prev)
1939 deliver_skb(skb, pt_prev, orig_dev);
1940 pt_prev = ptype;
1941 }
1942 *pt = pt_prev;
1943}
1944
1945static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1946{
1947 if (!ptype->af_packet_priv || !skb->sk)
1948 return false;
1949
1950 if (ptype->id_match)
1951 return ptype->id_match(ptype, skb->sk);
1952 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1953 return true;
1954
1955 return false;
1956}
1957
1958/*
1959 * Support routine. Sends outgoing frames to any network
1960 * taps currently in use.
1961 */
1962
1963void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1964{
1965 struct packet_type *ptype;
1966 struct sk_buff *skb2 = NULL;
1967 struct packet_type *pt_prev = NULL;
1968 struct list_head *ptype_list = &ptype_all;
1969
1970 rcu_read_lock();
1971again:
1972 list_for_each_entry_rcu(ptype, ptype_list, list) {
1973 /* Never send packets back to the socket
1974 * they originated from - MvS (miquels@drinkel.ow.org)
1975 */
1976 if (skb_loop_sk(ptype, skb))
1977 continue;
1978
1979 if (pt_prev) {
1980 deliver_skb(skb2, pt_prev, skb->dev);
1981 pt_prev = ptype;
1982 continue;
1983 }
1984
1985 /* need to clone skb, done only once */
1986 skb2 = skb_clone(skb, GFP_ATOMIC);
1987 if (!skb2)
1988 goto out_unlock;
1989
1990 net_timestamp_set(skb2);
1991
1992 /* skb->nh should be correctly
1993 * set by sender, so that the second statement is
1994 * just protection against buggy protocols.
1995 */
1996 skb_reset_mac_header(skb2);
1997
1998 if (skb_network_header(skb2) < skb2->data ||
1999 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2000 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2001 ntohs(skb2->protocol),
2002 dev->name);
2003 skb_reset_network_header(skb2);
2004 }
2005
2006 skb2->transport_header = skb2->network_header;
2007 skb2->pkt_type = PACKET_OUTGOING;
2008 pt_prev = ptype;
2009 }
2010
2011 if (ptype_list == &ptype_all) {
2012 ptype_list = &dev->ptype_all;
2013 goto again;
2014 }
2015out_unlock:
2016 if (pt_prev) {
2017 if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2018 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2019 else
2020 kfree_skb(skb2);
2021 }
2022 rcu_read_unlock();
2023}
2024EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2025
2026/**
2027 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2028 * @dev: Network device
2029 * @txq: number of queues available
2030 *
2031 * If real_num_tx_queues is changed the tc mappings may no longer be
2032 * valid. To resolve this verify the tc mapping remains valid and if
2033 * not NULL the mapping. With no priorities mapping to this
2034 * offset/count pair it will no longer be used. In the worst case TC0
2035 * is invalid nothing can be done so disable priority mappings. If is
2036 * expected that drivers will fix this mapping if they can before
2037 * calling netif_set_real_num_tx_queues.
2038 */
2039static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2040{
2041 int i;
2042 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2043
2044 /* If TC0 is invalidated disable TC mapping */
2045 if (tc->offset + tc->count > txq) {
2046 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2047 dev->num_tc = 0;
2048 return;
2049 }
2050
2051 /* Invalidated prio to tc mappings set to TC0 */
2052 for (i = 1; i < TC_BITMASK + 1; i++) {
2053 int q = netdev_get_prio_tc_map(dev, i);
2054
2055 tc = &dev->tc_to_txq[q];
2056 if (tc->offset + tc->count > txq) {
2057 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2058 i, q);
2059 netdev_set_prio_tc_map(dev, i, 0);
2060 }
2061 }
2062}
2063
2064int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2065{
2066 if (dev->num_tc) {
2067 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2068 int i;
2069
2070 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2071 if ((txq - tc->offset) < tc->count)
2072 return i;
2073 }
2074
2075 return -1;
2076 }
2077
2078 return 0;
2079}
2080EXPORT_SYMBOL(netdev_txq_to_tc);
2081
2082#ifdef CONFIG_XPS
/* Mutex held around XPS map updates in the functions below */
static DEFINE_MUTEX(xps_map_mutex);

/* Dereference an XPS RCU pointer; lockdep checks xps_map_mutex is held */
#define xmap_dereference(P)						\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2086
2087static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2088 int tci, u16 index)
2089{
2090 struct xps_map *map = NULL;
2091 int pos;
2092
2093 if (dev_maps)
2094 map = xmap_dereference(dev_maps->cpu_map[tci]);
2095 if (!map)
2096 return false;
2097
2098 for (pos = map->len; pos--;) {
2099 if (map->queues[pos] != index)
2100 continue;
2101
2102 if (map->len > 1) {
2103 map->queues[pos] = map->queues[--map->len];
2104 break;
2105 }
2106
2107 RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
2108 kfree_rcu(map, rcu);
2109 return false;
2110 }
2111
2112 return true;
2113}
2114
2115static bool remove_xps_queue_cpu(struct net_device *dev,
2116 struct xps_dev_maps *dev_maps,
2117 int cpu, u16 offset, u16 count)
2118{
2119 int num_tc = dev->num_tc ? : 1;
2120 bool active = false;
2121 int tci;
2122
2123 for (tci = cpu * num_tc; num_tc--; tci++) {
2124 int i, j;
2125
2126 for (i = count, j = offset; i--; j++) {
2127 if (!remove_xps_queue(dev_maps, tci, j))
2128 break;
2129 }
2130
2131 active |= i < 0;
2132 }
2133
2134 return active;
2135}
2136
2137static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2138 u16 count)
2139{
2140 struct xps_dev_maps *dev_maps;
2141 int cpu, i;
2142 bool active = false;
2143
2144 mutex_lock(&xps_map_mutex);
2145 dev_maps = xmap_dereference(dev->xps_maps);
2146
2147 if (!dev_maps)
2148 goto out_no_maps;
2149
2150 for_each_possible_cpu(cpu)
2151 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2152 offset, count);
2153
2154 if (!active) {
2155 RCU_INIT_POINTER(dev->xps_maps, NULL);
2156 kfree_rcu(dev_maps, rcu);
2157 }
2158
2159 for (i = offset + (count - 1); count--; i--)
2160 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2161 NUMA_NO_NODE);
2162
2163out_no_maps:
2164 mutex_unlock(&xps_map_mutex);
2165}
2166
2167static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2168{
2169 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2170}
2171
2172static struct xps_map *expand_xps_map(struct xps_map *map,
2173 int cpu, u16 index)
2174{
2175 struct xps_map *new_map;
2176 int alloc_len = XPS_MIN_MAP_ALLOC;
2177 int i, pos;
2178
2179 for (pos = 0; map && pos < map->len; pos++) {
2180 if (map->queues[pos] != index)
2181 continue;
2182 return map;
2183 }
2184
2185 /* Need to add queue to this CPU's existing map */
2186 if (map) {
2187 if (pos < map->alloc_len)
2188 return map;
2189
2190 alloc_len = map->alloc_len * 2;
2191 }
2192
2193 /* Need to allocate new map to store queue on this CPU's map */
2194 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2195 cpu_to_node(cpu));
2196 if (!new_map)
2197 return NULL;
2198
2199 for (i = 0; i < pos; i++)
2200 new_map->queues[i] = map->queues[i];
2201 new_map->alloc_len = alloc_len;
2202 new_map->len = pos;
2203
2204 return new_map;
2205}
2206
/*
 * netif_set_xps_queue - make Tx queue @index selectable from the CPUs in @mask
 * @dev:   device owning the queue
 * @mask:  CPUs that should map to the queue (only online CPUs are used)
 * @index: Tx queue index
 *
 * Builds a new xps_dev_maps table under xps_map_mutex, publishes it with
 * rcu_assign_pointer() and releases replaced maps with kfree_rcu().  Map
 * slots are indexed by tci = cpu * num_tc + tc, where tc is the traffic
 * class of @index (0 when the device has no traffic classes).
 *
 * Returns 0 on success, -EINVAL if netdev_txq_to_tc() reports a negative
 * class for @index, or -ENOMEM if an allocation fails (in which case the
 * maps allocated by this call are freed again).
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	/* numa_node_id: -2 = not set yet, -1 = selected CPUs span several nodes */
	int i, cpu, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	bool active = false;

	if (dev->num_tc) {
		num_tc = dev->num_tc;
		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	/* the table is never allocated smaller than L1_CACHE_BYTES */
	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_cpu_and(cpu, cpu_online_mask, mask) {
		/* the new table is allocated lazily, on the first selected CPU */
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		tci = cpu * num_tc + tc;
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
				 NULL;

		/* may return the old map itself when it already has room */
		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
	}

	/* no online CPU in @mask: nothing to add, only removals remain */
	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		/* copy maps belonging to foreign traffic classes */
		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}

		/* We need to explicitly update tci as previous loop
		 * could break out early if dev_maps is NULL.
		 */
		tci = cpu * num_tc + tc;

		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			/* append only if the queue is not already listed */
			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}

		/* copy maps belonging to foreign traffic classes */
		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}
	}

	/* publish the new table */
	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	for_each_possible_cpu(cpu) {
		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			/* free only old maps that were not carried over */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}
	}

	kfree_rcu(dev_maps, rcu);

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		/* slots of the traffic classes below tc */
		for (i = tc, tci = cpu * num_tc; i--; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
		/* the queue's own class slot, only for CPUs that were not selected */
		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
			active |= remove_xps_queue(dev_maps, tci, index);
		/* slots of the traffic classes above tc */
		for (i = num_tc - tc, tci++; --i; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			map = dev_maps ?
			      xmap_dereference(dev_maps->cpu_map[tci]) :
			      NULL;
			/* maps shared with the old table must stay alive */
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2366
2367#endif
/*
 * Drop every traffic-class setting on @dev: the XPS entries of all Tx
 * queues (only when CONFIG_XPS is enabled), the class count, and the
 * tc_to_txq and prio_tc_map tables, which are zero-filled.
 */
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
	/* index 0: reset starts at the first Tx queue */
	netif_reset_xps_queues_gt(dev, 0);
#endif
	dev->num_tc = 0;
	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);
2378
2379int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2380{
2381 if (tc >= dev->num_tc)
2382 return -EINVAL;
2383
2384#ifdef CONFIG_XPS
2385 netif_reset_xps_queues(dev, offset, count);
2386#endif
2387 dev->tc_to_txq[tc].count = count;
2388 dev->tc_to_txq[tc].offset = offset;
2389 return 0;
2390}
2391EXPORT_SYMBOL(netdev_set_tc_queue);
2392
2393int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2394{
2395 if (num_tc > TC_MAX_QUEUE)
2396 return -EINVAL;
2397
2398#ifdef CONFIG_XPS
2399 netif_reset_xps_queues_gt(dev, 0);
2400#endif
2401 dev->num_tc = num_tc;
2402 return 0;
2403}
2404EXPORT_SYMBOL(netdev_set_num_tc);
2405
2406/*
2407 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2408 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
2409 */
2410int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2411{
2412 bool disabling;
2413 int rc;
2414
2415 disabling = txq < dev->real_num_tx_queues;
2416
2417 if (txq < 1 || txq > dev->num_tx_queues)
2418 return -EINVAL;
2419
2420 if (dev->reg_state == NETREG_REGISTERED ||
2421 dev->reg_state == NETREG_UNREGISTERING) {
2422 ASSERT_RTNL();
2423
2424 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2425 txq);
2426 if (rc)
2427 return rc;
2428
2429 if (dev->num_tc)
2430 netif_setup_tc(dev, txq);
2431
2432 dev->real_num_tx_queues = txq;
2433
2434 if (disabling) {
2435 synchronize_net();
2436 qdisc_reset_all_tx_gt(dev, txq);
2437#ifdef CONFIG_XPS
2438 netif_reset_xps_queues_gt(dev, txq);
2439#endif
2440 }
2441 } else {
2442 dev->real_num_tx_queues = txq;
2443 }
2444
2445 return 0;
2446}
2447EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2448
2449#ifdef CONFIG_SYSFS
2450/**
2451 * netif_set_real_num_rx_queues - set actual number of RX queues used
2452 * @dev: Network device
2453 * @rxq: Actual number of RX queues
2454 *
2455 * This must be called either with the rtnl_lock held or before
2456 * registration of the net device. Returns 0 on success, or a
2457 * negative error code. If called before registration, it always
2458 * succeeds.
2459 */
2460int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2461{
2462 int rc;
2463
2464 if (rxq < 1 || rxq > dev->num_rx_queues)
2465 return -EINVAL;
2466
2467 if (dev->reg_state == NETREG_REGISTERED) {
2468 ASSERT_RTNL();
2469
2470 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2471 rxq);
2472 if (rc)
2473 return rc;
2474 }
2475
2476 dev->real_num_rx_queues = rxq;
2477 return 0;
2478}
2479EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2480#endif
2481
2482/**
2483 * netif_get_num_default_rss_queues - default number of RSS queues
2484 *
2485 * This routine should set an upper limit on the number of RSS queues
2486 * used by default by multiqueue devices.
2487 */
2488int netif_get_num_default_rss_queues(void)
2489{
2490 return is_kdump_kernel() ?
2491 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2492}
2493EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2494
2495static void __netif_reschedule(struct Qdisc *q)
2496{
2497 struct softnet_data *sd;
2498 unsigned long flags;
2499
2500 local_irq_save(flags);
2501 sd = this_cpu_ptr(&softnet_data);
2502 q->next_sched = NULL;
2503 *sd->output_queue_tailp = q;
2504 sd->output_queue_tailp = &q->next_sched;
2505 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2506 local_irq_restore(flags);
2507}
2508
2509void __netif_schedule(struct Qdisc *q)
2510{
2511 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2512 __netif_reschedule(q);
2513}
2514EXPORT_SYMBOL(__netif_schedule);
2515
/*
 * Overlay for skb->cb (see get_kfree_skb_cb()) that stores why an skb
 * queued for deferred freeing is being released.
 */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};
2519
2520static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2521{
2522 return (struct dev_kfree_skb_cb *)skb->cb;
2523}
2524
2525void netif_schedule_queue(struct netdev_queue *txq)
2526{
2527 rcu_read_lock();
2528 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2529 struct Qdisc *q = rcu_dereference(txq->qdisc);
2530
2531 __netif_schedule(q);
2532 }
2533 rcu_read_unlock();
2534}
2535EXPORT_SYMBOL(netif_schedule_queue);
2536
2537void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2538{
2539 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2540 struct Qdisc *q;
2541
2542 rcu_read_lock();
2543 q = rcu_dereference(dev_queue->qdisc);
2544 __netif_schedule(q);
2545 rcu_read_unlock();
2546 }
2547}
2548EXPORT_SYMBOL(netif_tx_wake_queue);
2549
/*
 * Drop one reference on @skb and, if that was the last one, defer the
 * actual free to the NET_TX softirq.
 *
 * @skb:    buffer to release; NULL is accepted and ignored
 * @reason: recorded in the skb control buffer for the code that later
 *          processes the completion queue
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (unlikely(!skb))
		return;

	if (likely(refcount_read(&skb->users) == 1)) {
		/* sole reference: set the count to zero directly */
		smp_rmb();
		refcount_set(&skb->users, 0);
	} else if (likely(!refcount_dec_and_test(&skb->users))) {
		/* other references remain, nothing to free yet */
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	/* push onto this CPU's completion queue with interrupts off */
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
2571
2572void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2573{
2574 if (in_irq() || irqs_disabled())
2575 __dev_kfree_skb_irq(skb, reason);
2576 else
2577 dev_kfree_skb(skb);
2578}
2579EXPORT_SYMBOL(__dev_kfree_skb_any);
2580
2581
2582/**
2583 * netif_device_detach - mark device as removed
2584 * @dev: network device
2585 *
2586 * Mark device as removed from system and therefore no longer available.
2587 */
2588void netif_device_detach(struct net_device *dev)
2589{
2590 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2591 netif_running(dev)) {
2592 netif_tx_stop_all_queues(dev);
2593 }
2594}
2595EXPORT_SYMBOL(netif_device_detach);
2596
2597/**
2598 * netif_device_attach - mark device as attached
2599 * @dev: network device
2600 *
2601 * Mark device as attached from system and restart if needed.
2602 */
2603void netif_device_attach(struct net_device *dev)
2604{
2605 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2606 netif_running(dev)) {
2607 netif_tx_wake_all_queues(dev);
2608 __netdev_watchdog_up(dev);
2609 }
2610}
2611EXPORT_SYMBOL(netif_device_attach);
2612
2613/*
2614 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2615 * to be used as a distribution range.
2616 */
2617u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2618 unsigned int num_tx_queues)
2619{
2620 u32 hash;
2621 u16 qoffset = 0;
2622 u16 qcount = num_tx_queues;
2623
2624 if (skb_rx_queue_recorded(skb)) {
2625 hash = skb_get_rx_queue(skb);
2626 while (unlikely(hash >= num_tx_queues))
2627 hash -= num_tx_queues;
2628 return hash;
2629 }
2630
2631 if (dev->num_tc) {
2632 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2633
2634 qoffset = dev->tc_to_txq[tc].offset;
2635 qcount = dev->tc_to_txq[tc].count;
2636 }
2637
2638 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2639}
2640EXPORT_SYMBOL(__skb_tx_hash);
2641
2642static void skb_warn_bad_offload(const struct sk_buff *skb)
2643{
2644 static const netdev_features_t null_features;
2645 struct net_device *dev = skb->dev;
2646 const char *name = "";
2647
2648 if (!net_ratelimit())
2649 return;
2650
2651 if (dev) {
2652 if (dev->dev.parent)
2653 name = dev_driver_string(dev->dev.parent);
2654 else
2655 name = netdev_name(dev);
2656 }
2657 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2658 "gso_type=%d ip_summed=%d\n",
2659 name, dev ? &dev->features : &null_features,
2660 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2661 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2662 skb_shinfo(skb)->gso_type, skb->ip_summed);
2663}
2664
2665/*
2666 * Invalidate hardware checksum when packet is to be mangled, and
2667 * complete checksum manually on outgoing path.
2668 */
2669int skb_checksum_help(struct sk_buff *skb)
2670{
2671 __wsum csum;
2672 int ret = 0, offset;
2673
2674 if (skb->ip_summed == CHECKSUM_COMPLETE)
2675 goto out_set_summed;
2676
2677 if (unlikely(skb_shinfo(skb)->gso_size)) {
2678 skb_warn_bad_offload(skb);
2679 return -EINVAL;
2680 }
2681
2682 /* Before computing a checksum, we should make sure no frag could
2683 * be modified by an external entity : checksum could be wrong.
2684 */
2685 if (skb_has_shared_frag(skb)) {
2686 ret = __skb_linearize(skb);
2687 if (ret)
2688 goto out;
2689 }
2690
2691 offset = skb_checksum_start_offset(skb);
2692 BUG_ON(offset >= skb_headlen(skb));
2693 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2694
2695 offset += skb->csum_offset;
2696 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2697
2698 if (skb_cloned(skb) &&
2699 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2700 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2701 if (ret)
2702 goto out;
2703 }
2704
2705 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2706out_set_summed:
2707 skb->ip_summed = CHECKSUM_NONE;
2708out:
2709 return ret;
2710}
2711EXPORT_SYMBOL(skb_checksum_help);
2712
2713int skb_crc32c_csum_help(struct sk_buff *skb)
2714{
2715 __le32 crc32c_csum;
2716 int ret = 0, offset, start;
2717
2718 if (skb->ip_summed != CHECKSUM_PARTIAL)
2719 goto out;
2720
2721 if (unlikely(skb_is_gso(skb)))
2722 goto out;
2723
2724 /* Before computing a checksum, we should make sure no frag could
2725 * be modified by an external entity : checksum could be wrong.
2726 */
2727 if (unlikely(skb_has_shared_frag(skb))) {
2728 ret = __skb_linearize(skb);
2729 if (ret)
2730 goto out;
2731 }
2732 start = skb_checksum_start_offset(skb);
2733 offset = start + offsetof(struct sctphdr, checksum);
2734 if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
2735 ret = -EINVAL;
2736 goto out;
2737 }
2738 if (skb_cloned(skb) &&
2739 !skb_clone_writable(skb, offset + sizeof(__le32))) {
2740 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2741 if (ret)
2742 goto out;
2743 }
2744 crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
2745 skb->len - start, ~(__u32)0,
2746 crc32c_csum_stub));
2747 *(__le32 *)(skb->data + offset) = crc32c_csum;
2748 skb->ip_summed = CHECKSUM_NONE;
2749 skb->csum_not_inet = 0;
2750out:
2751 return ret;
2752}
2753
2754__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2755{
2756 __be16 type = skb->protocol;
2757
2758 /* Tunnel gso handlers can set protocol to ethernet. */
2759 if (type == htons(ETH_P_TEB)) {
2760 struct ethhdr *eth;
2761
2762 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2763 return 0;
2764
2765 eth = (struct ethhdr *)skb->data;
2766 type = eth->h_proto;
2767 }
2768
2769 return __vlan_get_protocol(skb, type, depth);
2770}
2771
2772/**
2773 * skb_mac_gso_segment - mac layer segmentation handler.
2774 * @skb: buffer to segment
2775 * @features: features for the output path (see dev->features)
2776 */
2777struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2778 netdev_features_t features)
2779{
2780 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2781 struct packet_offload *ptype;
2782 int vlan_depth = skb->mac_len;
2783 __be16 type = skb_network_protocol(skb, &vlan_depth);
2784
2785 if (unlikely(!type))
2786 return ERR_PTR(-EINVAL);
2787
2788 __skb_pull(skb, vlan_depth);
2789
2790 rcu_read_lock();
2791 list_for_each_entry_rcu(ptype, &offload_base, list) {
2792 if (ptype->type == type && ptype->callbacks.gso_segment) {
2793 segs = ptype->callbacks.gso_segment(skb, features);
2794 break;
2795 }
2796 }
2797 rcu_read_unlock();
2798
2799 __skb_push(skb, skb->data - skb_mac_header(skb));
2800
2801 return segs;
2802}
2803EXPORT_SYMBOL(skb_mac_gso_segment);
2804
2805
2806/* openvswitch calls this on rx path, so we need a different check.
2807 */
2808static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2809{
2810 if (tx_path)
2811 return skb->ip_summed != CHECKSUM_PARTIAL &&
2812 skb->ip_summed != CHECKSUM_UNNECESSARY;
2813
2814 return skb->ip_summed == CHECKSUM_NONE;
2815}
2816
2817/**
2818 * __skb_gso_segment - Perform segmentation on skb.
2819 * @skb: buffer to segment
2820 * @features: features for the output path (see dev->features)
2821 * @tx_path: whether it is called in TX path
2822 *
2823 * This function segments the given skb and returns a list of segments.
2824 *
2825 * It may return NULL if the skb requires no segmentation. This is
2826 * only possible when GSO is used for verifying header integrity.
2827 *
2828 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2829 */
2830struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2831 netdev_features_t features, bool tx_path)
2832{
2833 struct sk_buff *segs;
2834
2835 if (unlikely(skb_needs_check(skb, tx_path))) {
2836 int err;
2837
2838 /* We're going to init ->check field in TCP or UDP header */
2839 err = skb_cow_head(skb, 0);
2840 if (err < 0)
2841 return ERR_PTR(err);
2842 }
2843
2844 /* Only report GSO partial support if it will enable us to
2845 * support segmentation on this frame without needing additional
2846 * work.
2847 */
2848 if (features & NETIF_F_GSO_PARTIAL) {
2849 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2850 struct net_device *dev = skb->dev;
2851
2852 partial_features |= dev->features & dev->gso_partial_features;
2853 if (!skb_gso_ok(skb, features | partial_features))
2854 features &= ~NETIF_F_GSO_PARTIAL;
2855 }
2856
2857 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2858 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2859
2860 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2861 SKB_GSO_CB(skb)->encap_level = 0;
2862
2863 skb_reset_mac_header(skb);
2864 skb_reset_mac_len(skb);
2865
2866 segs = skb_mac_gso_segment(skb, features);
2867
2868 if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2869 skb_warn_bad_offload(skb);
2870
2871 return segs;
2872}
2873EXPORT_SYMBOL(__skb_gso_segment);
2874
2875/* Take action when hardware reception checksum errors are detected. */
2876#ifdef CONFIG_BUG
2877void netdev_rx_csum_fault(struct net_device *dev)
2878{
2879 if (net_ratelimit()) {
2880 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2881 dump_stack();
2882 }
2883}
2884EXPORT_SYMBOL(netdev_rx_csum_fault);
2885#endif
2886
2887/* Actually, we should eliminate this check as soon as we know, that:
2888 * 1. IOMMU is present and allows to map all the memory.
2889 * 2. No high memory really exists on this machine.
2890 */
2891
/*
 * Tell whether @skb has page fragments that @dev cannot DMA from.
 *
 * Only meaningful with CONFIG_HIGHMEM; without it the answer is always 0.
 * Returns 1 when the device lacks NETIF_F_HIGHDMA and a fragment sits in
 * high memory, or when PCI_DMA_BUS_IS_PHYS holds and a fragment's page
 * lies beyond the parent device's DMA mask (or the parent has no mask).
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int n;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[n];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[n];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		/* last byte of the page must be reachable through the mask */
		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2922
2923/* If MPLS offload request, verify we are testing hardware MPLS features
2924 * instead of standard features for the netdev.
2925 */
2926#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2927static netdev_features_t net_mpls_features(struct sk_buff *skb,
2928 netdev_features_t features,
2929 __be16 type)
2930{
2931 if (eth_p_mpls(type))
2932 features &= skb->dev->mpls_features;
2933
2934 return features;
2935}
2936#else
2937static netdev_features_t net_mpls_features(struct sk_buff *skb,
2938 netdev_features_t features,
2939 __be16 type)
2940{
2941 return features;
2942}
2943#endif
2944
2945static netdev_features_t harmonize_features(struct sk_buff *skb,
2946 netdev_features_t features)
2947{
2948 int tmp;
2949 __be16 type;
2950
2951 type = skb_network_protocol(skb, &tmp);
2952 features = net_mpls_features(skb, features, type);
2953
2954 if (skb->ip_summed != CHECKSUM_NONE &&
2955 !can_checksum_protocol(features, type)) {
2956 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2957 }
2958 if (illegal_highdma(skb->dev, skb))
2959 features &= ~NETIF_F_SG;
2960
2961 return features;
2962}
2963
/*
 * Features-check helper that imposes no restriction: @features are
 * returned exactly as given; @skb and @dev are not looked at.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);
2971
/*
 * Default features check: delegates to vlan_features_check(); @dev is not
 * used.
 */
static netdev_features_t dflt_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
2978
2979static netdev_features_t gso_features_check(const struct sk_buff *skb,
2980 struct net_device *dev,
2981 netdev_features_t features)
2982{
2983 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2984
2985 if (gso_segs > dev->gso_max_segs)
2986 return features & ~NETIF_F_GSO_MASK;
2987
2988 /* Support for GSO partial features requires software
2989 * intervention before we can actually process the packets
2990 * so we need to strip support for any partial features now
2991 * and we can pull them back in after we have partially
2992 * segmented the frame.
2993 */
2994 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2995 features &= ~dev->gso_partial_features;
2996
2997 /* Make sure to clear the IPv4 ID mangling feature if the
2998 * IPv4 header has the potential to be fragmented.
2999 */
3000 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3001 struct iphdr *iph = skb->encapsulation ?
3002 inner_ip_hdr(skb) : ip_hdr(skb);
3003
3004 if (!(iph->frag_off & htons(IP_DF)))
3005 features &= ~NETIF_F_TSO_MANGLEID;
3006 }
3007
3008 return features;
3009}
3010
3011netdev_features_t netif_skb_features(struct sk_buff *skb)
3012{
3013 struct net_device *dev = skb->dev;
3014 netdev_features_t features = dev->features;
3015
3016 if (skb_is_gso(skb))
3017 features = gso_features_check(skb, dev, features);
3018
3019 /* If encapsulation offload request, verify we are testing
3020 * hardware encapsulation features instead of standard
3021 * features for the netdev
3022 */
3023 if (skb->encapsulation)
3024 features &= dev->hw_enc_features;
3025
3026 if (skb_vlan_tagged(skb))
3027 features = netdev_intersect_features(features,
3028 dev->vlan_features |
3029 NETIF_F_HW_VLAN_CTAG_TX |
3030 NETIF_F_HW_VLAN_STAG_TX);
3031
3032 if (dev->netdev_ops->ndo_features_check)
3033 features &= dev->netdev_ops->ndo_features_check(skb, dev,
3034 features);
3035 else
3036 features &= dflt_features_check(skb, dev, features);
3037
3038 return harmonize_features(skb, features);
3039}
3040EXPORT_SYMBOL(netif_skb_features);
3041
3042static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3043 struct netdev_queue *txq, bool more)
3044{
3045 unsigned int len;
3046 int rc;
3047
3048 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
3049 dev_queue_xmit_nit(skb, dev);
3050
3051 len = skb->len;
3052 trace_net_dev_start_xmit(skb, dev);
3053 rc = netdev_start_xmit(skb, dev, txq, more);
3054 trace_net_dev_xmit(skb, rc, dev, len);
3055
3056 return rc;
3057}
3058
3059struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3060 struct netdev_queue *txq, int *ret)
3061{
3062 struct sk_buff *skb = first;
3063 int rc = NETDEV_TX_OK;
3064
3065 while (skb) {
3066 struct sk_buff *next = skb->next;
3067
3068 skb->next = NULL;
3069 rc = xmit_one(skb, dev, txq, next != NULL);
3070 if (unlikely(!dev_xmit_complete(rc))) {
3071 skb->next = next;
3072 goto out;
3073 }
3074
3075 skb = next;
3076 if (netif_xmit_stopped(txq) && skb) {
3077 rc = NETDEV_TX_BUSY;
3078 break;
3079 }
3080 }
3081
3082out:
3083 *ret = rc;
3084 return skb;
3085}
3086
3087static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3088 netdev_features_t features)
3089{
3090 if (skb_vlan_tag_present(skb) &&
3091 !vlan_hw_offload_capable(features, skb->vlan_proto))
3092 skb = __vlan_hwaccel_push_inside(skb);
3093 return skb;
3094}
3095
3096int skb_csum_hwoffload_help(struct sk_buff *skb,
3097 const netdev_features_t features)
3098{
3099 if (unlikely(skb->csum_not_inet))
3100 return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3101 skb_crc32c_csum_help(skb);
3102
3103 return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
3104}
3105EXPORT_SYMBOL(skb_csum_hwoffload_help);
3106
/*
 * Prepare one @skb for transmission on @dev according to the features
 * computed by netif_skb_features(): push an un-offloadable VLAN tag into
 * the packet, software-segment GSO packets the device cannot handle,
 * linearize and complete checksums where required, then pass the result
 * through validate_xmit_xfrm() (which receives @again).
 *
 * Returns the skb (or segment list) to transmit.  On failure returns NULL
 * after bumping dev->tx_dropped; the skb is freed here, except when the
 * VLAN push already returned NULL.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
	netdev_features_t features;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			/* the segment list replaces the original skb */
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			/* record the checksum start as the (inner) transport header */
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (skb_csum_hwoffload_help(skb, features))
				goto out_kfree_skb;
		}
	}

	skb = validate_xmit_xfrm(skb, features, again);

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	atomic_long_inc(&dev->tx_dropped);
	return NULL;
}
3157
3158struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3159{
3160 struct sk_buff *next, *head = NULL, *tail;
3161
3162 for (; skb != NULL; skb = next) {
3163 next = skb->next;
3164 skb->next = NULL;
3165
3166 /* in case skb wont be segmented, point to itself */
3167 skb->prev = skb;
3168
3169 skb = validate_xmit_skb(skb, dev, again);
3170 if (!skb)
3171 continue;
3172
3173 if (!head)
3174 head = skb;
3175 else
3176 tail->next = skb;
3177 /* If skb was segmented, skb->prev points to
3178 * the last segment. If not, it still contains skb.
3179 */
3180 tail = skb->prev;
3181 }
3182 return head;
3183}
3184EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3185
3186static void qdisc_pkt_len_init(struct sk_buff *skb)
3187{
3188 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3189
3190 qdisc_skb_cb(skb)->pkt_len = skb->len;
3191
3192 /* To get more precise estimation of bytes sent on wire,
3193 * we add to pkt_len the headers size of all segments
3194 */
3195 if (shinfo->gso_size) {
3196 unsigned int hdr_len;
3197 u16 gso_segs = shinfo->gso_segs;
3198
3199 /* mac layer + network layer */
3200 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3201
3202 /* + transport layer */
3203 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3204 const struct tcphdr *th;
3205 struct tcphdr _tcphdr;
3206
3207 th = skb_header_pointer(skb, skb_transport_offset(skb),
3208 sizeof(_tcphdr), &_tcphdr);
3209 if (likely(th))
3210 hdr_len += __tcp_hdrlen(th);
3211 } else {
3212 struct udphdr _udphdr;
3213
3214 if (skb_header_pointer(skb, skb_transport_offset(skb),
3215 sizeof(_udphdr), &_udphdr))
3216 hdr_len += sizeof(struct udphdr);
3217 }
3218
3219 if (shinfo->gso_type & SKB_GSO_DODGY)
3220 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3221 shinfo->gso_size);
3222
3223 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3224 }
3225}
3226
/*
 * Submit @skb to qdisc @q of queue @txq on @dev.
 *
 * Qdiscs flagged TCQ_F_NOLOCK are enqueued and run without taking the
 * root lock.  For all others the root lock is held; if the qdisc is idle,
 * empty and flagged TCQ_F_CAN_BYPASS the skb is sent directly through
 * sch_direct_xmit(), otherwise it is enqueued and the qdisc is run if
 * nobody else is running it.  A deactivated qdisc drops the skb.
 *
 * skbs collected on the to_free list are freed after the root lock has
 * been released.
 *
 * Returns a NET_XMIT_* code.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);

	/* lockless qdisc: no root lock and no busylock handling */
	if (q->flags & TCQ_F_NOLOCK) {
		if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
			__qdisc_drop(skb, &to_free);
			rc = NET_XMIT_DROP;
		} else {
			rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
			__qdisc_run(q);
		}

		if (unlikely(to_free))
			kfree_skb_list(to_free);
		return rc;
	}

	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* busylock is dropped before running the qdisc */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}

		qdisc_run_end(q);
		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
			qdisc_run_end(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	/* still true only if busylock was taken and not released above */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
3304
3305#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3306static void skb_update_prio(struct sk_buff *skb)
3307{
3308 const struct netprio_map *map;
3309 const struct sock *sk;
3310 unsigned int prioidx;
3311
3312 if (skb->priority)
3313 return;
3314 map = rcu_dereference_bh(skb->dev->priomap);
3315 if (!map)
3316 return;
3317 sk = skb_to_full_sk(skb);
3318 if (!sk)
3319 return;
3320
3321 prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3322
3323 if (prioidx < map->priomap_len)
3324 skb->priority = map->priomap[prioidx];
3325}
3326#else
3327#define skb_update_prio(skb)
3328#endif
3329
3330DEFINE_PER_CPU(int, xmit_recursion);
3331EXPORT_SYMBOL(xmit_recursion);
3332
3333/**
3334 * dev_loopback_xmit - loop back @skb
3335 * @net: network namespace this loopback is happening in
3336 * @sk: sk needed to be a netfilter okfn
3337 * @skb: buffer to transmit
3338 */
3339int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3340{
3341 skb_reset_mac_header(skb);
3342 __skb_pull(skb, skb_network_offset(skb));
3343 skb->pkt_type = PACKET_LOOPBACK;
3344 skb->ip_summed = CHECKSUM_UNNECESSARY;
3345 WARN_ON(!skb_dst(skb));
3346 skb_dst_force(skb);
3347 netif_rx_ni(skb);
3348 return 0;
3349}
3350EXPORT_SYMBOL(dev_loopback_xmit);
3351
#ifdef CONFIG_NET_EGRESS
/*
 * Run the egress classifier chain attached to @dev, if there is one.
 *
 * Returns @skb when transmission should continue.  Returns NULL when the
 * classifier verdict took the packet away (dropped, consumed or
 * redirected); in that case *@ret holds the NET_XMIT_* code to report.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct mini_Qdisc *egress_q = rcu_dereference_bh(dev->miniq_egress);
	struct tcf_result res;
	int verdict;

	if (!egress_q)
		return skb;

	/* The caller has already set qdisc_skb_cb(skb)->pkt_len. */
	mini_qdisc_bstats_cpu_update(egress_q, skb);

	verdict = tcf_classify(skb, egress_q->filter_list, &res, false);
	switch (verdict) {
	case TC_ACT_SHOT:
		mini_qdisc_qstats_cpu_drop(egress_q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;

	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;

	case TC_ACT_REDIRECT:
		/* On egress the mac_header needs no push/pop before this. */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;

	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;

	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3393
/*
 * Look up a transmit queue for @skb in the device's XPS maps.
 *
 * Returns the chosen queue index, or -1 when XPS is compiled out, no map
 * applies, or the mapped queue is not below dev->real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *maps;
	struct xps_map *cpu_map;
	unsigned int tci;
	int txq_idx = -1;

	rcu_read_lock();

	maps = rcu_dereference(dev->xps_maps);
	if (!maps)
		goto unlock;

	/* sender_cpu is stored off by one; with traffic classes the slot
	 * is further offset by the class derived from skb->priority.
	 */
	tci = skb->sender_cpu - 1;
	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	cpu_map = rcu_dereference(maps->cpu_map[tci]);
	if (!cpu_map)
		goto unlock;

	/* The flow hash is only needed when there is a choice to make. */
	if (cpu_map->len == 1)
		txq_idx = cpu_map->queues[0];
	else
		txq_idx = cpu_map->queues[reciprocal_scale(skb_get_hash(skb),
							   cpu_map->len)];

	if (unlikely(txq_idx >= dev->real_num_tx_queues))
		txq_idx = -1;

unlock:
	rcu_read_unlock();

	return txq_idx;
#else
	return -1;
#endif
}
3429
3430static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3431{
3432 struct sock *sk = skb->sk;
3433 int queue_index = sk_tx_queue_get(sk);
3434
3435 if (queue_index < 0 || skb->ooo_okay ||
3436 queue_index >= dev->real_num_tx_queues) {
3437 int new_index = get_xps_queue(dev, skb);
3438
3439 if (new_index < 0)
3440 new_index = skb_tx_hash(dev, skb);
3441
3442 if (queue_index != new_index && sk &&
3443 sk_fullsock(sk) &&
3444 rcu_access_pointer(sk->sk_dst_cache))
3445 sk_tx_queue_set(sk, new_index);
3446
3447 queue_index = new_index;
3448 }
3449
3450 return queue_index;
3451}
3452
3453struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3454 struct sk_buff *skb,
3455 void *accel_priv)
3456{
3457 int queue_index = 0;
3458
3459#ifdef CONFIG_XPS
3460 u32 sender_cpu = skb->sender_cpu - 1;
3461
3462 if (sender_cpu >= (u32)NR_CPUS)
3463 skb->sender_cpu = raw_smp_processor_id() + 1;
3464#endif
3465
3466 if (dev->real_num_tx_queues != 1) {
3467 const struct net_device_ops *ops = dev->netdev_ops;
3468
3469 if (ops->ndo_select_queue)
3470 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3471 __netdev_pick_tx);
3472 else
3473 queue_index = __netdev_pick_tx(dev, skb);
3474
3475 queue_index = netdev_cap_txqueue(dev, queue_index);
3476 }
3477
3478 skb_set_queue_mapping(skb, queue_index);
3479 return netdev_get_tx_queue(dev, queue_index);
3480}
3481
3482/**
3483 * __dev_queue_xmit - transmit a buffer
3484 * @skb: buffer to transmit
3485 * @accel_priv: private data used for L2 forwarding offload
3486 *
3487 * Queue a buffer for transmission to a network device. The caller must
3488 * have set the device and priority and built the buffer before calling
3489 * this function. The function can be called from an interrupt.
3490 *
3491 * A negative errno code is returned on a failure. A success does not
3492 * guarantee the frame will be transmitted as it may be dropped due
3493 * to congestion or traffic shaping.
3494 *
3495 * -----------------------------------------------------------------------------------
3496 * I notice this method can also return errors from the queue disciplines,
3497 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3498 * be positive.
3499 *
3500 * Regardless of the return value, the skb is consumed, so it is currently
3501 * difficult to retry a send to this method. (You can bump the ref count
3502 * before sending to hold a reference for retry if you are careful.)
3503 *
3504 * When calling this method, interrupts MUST be enabled. This is because
3505 * the BH enable code must have IRQs enabled so that it will not deadlock.
3506 * --BLG
3507 */
3508static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3509{
3510 struct net_device *dev = skb->dev;
3511 struct netdev_queue *txq;
3512 struct Qdisc *q;
3513 int rc = -ENOMEM;
3514 bool again = false;
3515
3516 skb_reset_mac_header(skb);
3517
3518 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3519 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3520
3521 /* Disable soft irqs for various locks below. Also
3522 * stops preemption for RCU.
3523 */
3524 rcu_read_lock_bh();
3525
3526 skb_update_prio(skb);
3527
3528 qdisc_pkt_len_init(skb);
3529#ifdef CONFIG_NET_CLS_ACT
3530 skb->tc_at_ingress = 0;
3531# ifdef CONFIG_NET_EGRESS
3532 if (static_key_false(&egress_needed)) {
3533 skb = sch_handle_egress(skb, &rc, dev);
3534 if (!skb)
3535 goto out;
3536 }
3537# endif
3538#endif
3539 /* If device/qdisc don't need skb->dst, release it right now while
3540 * its hot in this cpu cache.
3541 */
3542 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3543 skb_dst_drop(skb);
3544 else
3545 skb_dst_force(skb);
3546
3547 txq = netdev_pick_tx(dev, skb, accel_priv);
3548 q = rcu_dereference_bh(txq->qdisc);
3549
3550 trace_net_dev_queue(skb);
3551 if (q->enqueue) {
3552 rc = __dev_xmit_skb(skb, q, dev, txq);
3553 goto out;
3554 }
3555
3556 /* The device has no queue. Common case for software devices:
3557 * loopback, all the sorts of tunnels...
3558
3559 * Really, it is unlikely that netif_tx_lock protection is necessary
3560 * here. (f.e. loopback and IP tunnels are clean ignoring statistics
3561 * counters.)
3562 * However, it is possible, that they rely on protection
3563 * made by us here.
3564
3565 * Check this and shot the lock. It is not prone from deadlocks.
3566 *Either shot noqueue qdisc, it is even simpler 8)
3567 */
3568 if (dev->flags & IFF_UP) {
3569 int cpu = smp_processor_id(); /* ok because BHs are off */
3570
3571 if (txq->xmit_lock_owner != cpu) {
3572 if (unlikely(__this_cpu_read(xmit_recursion) >
3573 XMIT_RECURSION_LIMIT))
3574 goto recursion_alert;
3575
3576 skb = validate_xmit_skb(skb, dev, &again);
3577 if (!skb)
3578 goto out;
3579
3580 HARD_TX_LOCK(dev, txq, cpu);
3581
3582 if (!netif_xmit_stopped(txq)) {
3583 __this_cpu_inc(xmit_recursion);
3584 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3585 __this_cpu_dec(xmit_recursion);
3586 if (dev_xmit_complete(rc)) {
3587 HARD_TX_UNLOCK(dev, txq);
3588 goto out;
3589 }
3590 }
3591 HARD_TX_UNLOCK(dev, txq);
3592 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3593 dev->name);
3594 } else {
3595 /* Recursion is detected! It is possible,
3596 * unfortunately
3597 */
3598recursion_alert:
3599 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3600 dev->name);
3601 }
3602 }
3603
3604 rc = -ENETDOWN;
3605 rcu_read_unlock_bh();
3606
3607 atomic_long_inc(&dev->tx_dropped);
3608 kfree_skb_list(skb);
3609 return rc;
3610out:
3611 rcu_read_unlock_bh();
3612 return rc;
3613}
3614
3615int dev_queue_xmit(struct sk_buff *skb)
3616{
3617 return __dev_queue_xmit(skb, NULL);
3618}
3619EXPORT_SYMBOL(dev_queue_xmit);
3620
/*
 * Queue @skb for transmission, handing @accel_priv (private data used for
 * L2 forwarding offload) on to __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3626
3627
/*************************************************************************
 *			Receiver routines
 *************************************************************************/
3631
3632int netdev_max_backlog __read_mostly = 1000;
3633EXPORT_SYMBOL(netdev_max_backlog);
3634
3635int netdev_tstamp_prequeue __read_mostly = 1;
3636int netdev_budget __read_mostly = 300;
3637unsigned int __read_mostly netdev_budget_usecs = 2000;
3638int weight_p __read_mostly = 64; /* old backlog weight */
3639int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
3640int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
3641int dev_rx_weight __read_mostly = 64;
3642int dev_tx_weight __read_mostly = 64;
3643
3644/* Called with irq disabled */
3645static inline void ____napi_schedule(struct softnet_data *sd,
3646 struct napi_struct *napi)
3647{
3648 list_add_tail(&napi->poll_list, &sd->poll_list);
3649 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3650}
3651
3652#ifdef CONFIG_RPS
3653
3654/* One global table that all flow-based protocols share. */
3655struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3656EXPORT_SYMBOL(rps_sock_flow_table);
3657u32 rps_cpu_mask __read_mostly;
3658EXPORT_SYMBOL(rps_cpu_mask);
3659
3660struct static_key rps_needed __read_mostly;
3661EXPORT_SYMBOL(rps_needed);
3662struct static_key rfs_needed __read_mostly;
3663EXPORT_SYMBOL(rfs_needed);
3664
3665static struct rps_dev_flow *
3666set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3667 struct rps_dev_flow *rflow, u16 next_cpu)
3668{
3669 if (next_cpu < nr_cpu_ids) {
3670#ifdef CONFIG_RFS_ACCEL
3671 struct netdev_rx_queue *rxqueue;
3672 struct rps_dev_flow_table *flow_table;
3673 struct rps_dev_flow *old_rflow;
3674 u32 flow_id;
3675 u16 rxq_index;
3676 int rc;
3677
3678 /* Should we steer this flow to a different hardware queue? */
3679 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3680 !(dev->features & NETIF_F_NTUPLE))
3681 goto out;
3682 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3683 if (rxq_index == skb_get_rx_queue(skb))
3684 goto out;
3685
3686 rxqueue = dev->_rx + rxq_index;
3687 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3688 if (!flow_table)
3689 goto out;
3690 flow_id = skb_get_hash(skb) & flow_table->mask;
3691 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3692 rxq_index, flow_id);
3693 if (rc < 0)
3694 goto out;
3695 old_rflow = rflow;
3696 rflow = &flow_table->flows[flow_id];
3697 rflow->filter = rc;
3698 if (old_rflow->filter == rflow->filter)
3699 old_rflow->filter = RPS_NO_FILTER;
3700 out:
3701#endif
3702 rflow->last_qtail =
3703 per_cpu(softnet_data, next_cpu).input_queue_head;
3704 }
3705
3706 rflow->cpu = next_cpu;
3707 return rflow;
3708}
3709
3710/*
3711 * get_rps_cpu is called from netif_receive_skb and returns the target
3712 * CPU from the RPS map of the receiving queue for a given skb.
3713 * rcu_read_lock must be held on entry.
3714 */
3715static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3716 struct rps_dev_flow **rflowp)
3717{
3718 const struct rps_sock_flow_table *sock_flow_table;
3719 struct netdev_rx_queue *rxqueue = dev->_rx;
3720 struct rps_dev_flow_table *flow_table;
3721 struct rps_map *map;
3722 int cpu = -1;
3723 u32 tcpu;
3724 u32 hash;
3725
3726 if (skb_rx_queue_recorded(skb)) {
3727 u16 index = skb_get_rx_queue(skb);
3728
3729 if (unlikely(index >= dev->real_num_rx_queues)) {
3730 WARN_ONCE(dev->real_num_rx_queues > 1,
3731 "%s received packet on queue %u, but number "
3732 "of RX queues is %u\n",
3733 dev->name, index, dev->real_num_rx_queues);
3734 goto done;
3735 }
3736 rxqueue += index;
3737 }
3738
3739 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3740
3741 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3742 map = rcu_dereference(rxqueue->rps_map);
3743 if (!flow_table && !map)
3744 goto done;
3745
3746 skb_reset_network_header(skb);
3747 hash = skb_get_hash(skb);
3748 if (!hash)
3749 goto done;
3750
3751 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3752 if (flow_table && sock_flow_table) {
3753 struct rps_dev_flow *rflow;
3754 u32 next_cpu;
3755 u32 ident;
3756
3757 /* First check into global flow table if there is a match */
3758 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3759 if ((ident ^ hash) & ~rps_cpu_mask)
3760 goto try_rps;
3761
3762 next_cpu = ident & rps_cpu_mask;
3763
3764 /* OK, now we know there is a match,
3765 * we can look at the local (per receive queue) flow table
3766 */
3767 rflow = &flow_table->flows[hash & flow_table->mask];
3768 tcpu = rflow->cpu;
3769
3770 /*
3771 * If the desired CPU (where last recvmsg was done) is
3772 * different from current CPU (one in the rx-queue flow
3773 * table entry), switch if one of the following holds:
3774 * - Current CPU is unset (>= nr_cpu_ids).
3775 * - Current CPU is offline.
3776 * - The current CPU's queue tail has advanced beyond the
3777 * last packet that was enqueued using this table entry.
3778 * This guarantees that all previous packets for the flow
3779 * have been dequeued, thus preserving in order delivery.
3780 */
3781 if (unlikely(tcpu != next_cpu) &&
3782 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3783 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3784 rflow->last_qtail)) >= 0)) {
3785 tcpu = next_cpu;
3786 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3787 }
3788
3789 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3790 *rflowp = rflow;
3791 cpu = tcpu;
3792 goto done;
3793 }
3794 }
3795
3796try_rps:
3797
3798 if (map) {
3799 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3800 if (cpu_online(tcpu)) {
3801 cpu = tcpu;
3802 goto done;
3803 }
3804 }
3805
3806done:
3807 return cpu;
3808}
3809
3810#ifdef CONFIG_RFS_ACCEL
3811
3812/**
3813 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3814 * @dev: Device on which the filter was set
3815 * @rxq_index: RX queue index
3816 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3817 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3818 *
3819 * Drivers that implement ndo_rx_flow_steer() should periodically call
3820 * this function for each installed filter and remove the filters for
3821 * which it returns %true.
3822 */
3823bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3824 u32 flow_id, u16 filter_id)
3825{
3826 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3827 struct rps_dev_flow_table *flow_table;
3828 struct rps_dev_flow *rflow;
3829 bool expire = true;
3830 unsigned int cpu;
3831
3832 rcu_read_lock();
3833 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3834 if (flow_table && flow_id <= flow_table->mask) {
3835 rflow = &flow_table->flows[flow_id];
3836 cpu = READ_ONCE(rflow->cpu);
3837 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3838 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3839 rflow->last_qtail) <
3840 (int)(10 * flow_table->mask)))
3841 expire = false;
3842 }
3843 rcu_read_unlock();
3844 return expire;
3845}
3846EXPORT_SYMBOL(rps_may_expire_flow);
3847
3848#endif /* CONFIG_RFS_ACCEL */
3849
3850/* Called from hardirq (IPI) context */
3851static void rps_trigger_softirq(void *data)
3852{
3853 struct softnet_data *sd = data;
3854
3855 ____napi_schedule(sd, &sd->backlog);
3856 sd->received_rps++;
3857}
3858
3859#endif /* CONFIG_RPS */
3860
/*
 * Check whether @sd belongs to another CPU.
 * If so, chain it onto this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = this_cpu_ptr(&softnet_data);

	if (sd == local_sd)
		return 0;

	/* Push @sd at the head of the local list. */
	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3881
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 4096 (1 << 12). */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3885
3886static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3887{
3888#ifdef CONFIG_NET_FLOW_LIMIT
3889 struct sd_flow_limit *fl;
3890 struct softnet_data *sd;
3891 unsigned int old_flow, new_flow;
3892
3893 if (qlen < (netdev_max_backlog >> 1))
3894 return false;
3895
3896 sd = this_cpu_ptr(&softnet_data);
3897
3898 rcu_read_lock();
3899 fl = rcu_dereference(sd->flow_limit);
3900 if (fl) {
3901 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3902 old_flow = fl->history[fl->history_head];
3903 fl->history[fl->history_head] = new_flow;
3904
3905 fl->history_head++;
3906 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3907
3908 if (likely(fl->buckets[old_flow]))
3909 fl->buckets[old_flow]--;
3910
3911 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3912 fl->count++;
3913 rcu_read_unlock();
3914 return true;
3915 }
3916 }
3917 rcu_read_unlock();
3918#endif
3919 return false;
3920}
3921
3922/*
3923 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3924 * queue (may be a remote CPU queue).
3925 */
3926static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3927 unsigned int *qtail)
3928{
3929 struct softnet_data *sd;
3930 unsigned long flags;
3931 unsigned int qlen;
3932
3933 sd = &per_cpu(softnet_data, cpu);
3934
3935 local_irq_save(flags);
3936
3937 rps_lock(sd);
3938 if (!netif_running(skb->dev))
3939 goto drop;
3940 qlen = skb_queue_len(&sd->input_pkt_queue);
3941 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3942 if (qlen) {
3943enqueue:
3944 __skb_queue_tail(&sd->input_pkt_queue, skb);
3945 input_queue_tail_incr_save(sd, qtail);
3946 rps_unlock(sd);
3947 local_irq_restore(flags);
3948 return NET_RX_SUCCESS;
3949 }
3950
3951 /* Schedule NAPI for backlog device
3952 * We can use non atomic operation since we own the queue lock
3953 */
3954 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3955 if (!rps_ipi_queued(sd))
3956 ____napi_schedule(sd, &sd->backlog);
3957 }
3958 goto enqueue;
3959 }
3960
3961drop:
3962 sd->dropped++;
3963 rps_unlock(sd);
3964
3965 local_irq_restore(flags);
3966
3967 atomic_long_inc(&skb->dev->rx_dropped);
3968 kfree_skb(skb);
3969 return NET_RX_DROP;
3970}
3971
3972static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
3973{
3974 struct net_device *dev = skb->dev;
3975 struct netdev_rx_queue *rxqueue;
3976
3977 rxqueue = dev->_rx;
3978
3979 if (skb_rx_queue_recorded(skb)) {
3980 u16 index = skb_get_rx_queue(skb);
3981
3982 if (unlikely(index >= dev->real_num_rx_queues)) {
3983 WARN_ONCE(dev->real_num_rx_queues > 1,
3984 "%s received packet on queue %u, but number "
3985 "of RX queues is %u\n",
3986 dev->name, index, dev->real_num_rx_queues);
3987
3988 return rxqueue; /* Return first rxqueue */
3989 }
3990 rxqueue += index;
3991 }
3992 return rxqueue;
3993}
3994
3995static u32 netif_receive_generic_xdp(struct sk_buff *skb,
3996 struct bpf_prog *xdp_prog)
3997{
3998 struct netdev_rx_queue *rxqueue;
3999 u32 metalen, act = XDP_DROP;
4000 struct xdp_buff xdp;
4001 void *orig_data;
4002 int hlen, off;
4003 u32 mac_len;
4004
4005 /* Reinjected packets coming from act_mirred or similar should
4006 * not get XDP generic processing.
4007 */
4008 if (skb_cloned(skb))
4009 return XDP_PASS;
4010
4011 /* XDP packets must be linear and must have sufficient headroom
4012 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
4013 * native XDP provides, thus we need to do it here as well.
4014 */
4015 if (skb_is_nonlinear(skb) ||
4016 skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4017 int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4018 int troom = skb->tail + skb->data_len - skb->end;
4019
4020 /* In case we have to go down the path and also linearize,
4021 * then lets do the pskb_expand_head() work just once here.
4022 */
4023 if (pskb_expand_head(skb,
4024 hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4025 troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4026 goto do_drop;
4027 if (skb_linearize(skb))
4028 goto do_drop;
4029 }
4030
4031 /* The XDP program wants to see the packet starting at the MAC
4032 * header.
4033 */
4034 mac_len = skb->data - skb_mac_header(skb);
4035 hlen = skb_headlen(skb) + mac_len;
4036 xdp.data = skb->data - mac_len;
4037 xdp.data_meta = xdp.data;
4038 xdp.data_end = xdp.data + hlen;
4039 xdp.data_hard_start = skb->data - skb_headroom(skb);
4040 orig_data = xdp.data;
4041
4042 rxqueue = netif_get_rxqueue(skb);
4043 xdp.rxq = &rxqueue->xdp_rxq;
4044
4045 act = bpf_prog_run_xdp(xdp_prog, &xdp);
4046
4047 off = xdp.data - orig_data;
4048 if (off > 0)
4049 __skb_pull(skb, off);
4050 else if (off < 0)
4051 __skb_push(skb, -off);
4052 skb->mac_header += off;
4053
4054 switch (act) {
4055 case XDP_REDIRECT:
4056 case XDP_TX:
4057 __skb_push(skb, mac_len);
4058 break;
4059 case XDP_PASS:
4060 metalen = xdp.data - xdp.data_meta;
4061 if (metalen)
4062 skb_metadata_set(skb, metalen);
4063 break;
4064 default:
4065 bpf_warn_invalid_xdp_action(act);
4066 /* fall through */
4067 case XDP_ABORTED:
4068 trace_xdp_exception(skb->dev, xdp_prog, act);
4069 /* fall through */
4070 case XDP_DROP:
4071 do_drop:
4072 kfree_skb(skb);
4073 break;
4074 }
4075
4076 return act;
4077}
4078
4079/* When doing generic XDP we have to bypass the qdisc layer and the
4080 * network taps in order to match in-driver-XDP behavior.
4081 */
4082void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4083{
4084 struct net_device *dev = skb->dev;
4085 struct netdev_queue *txq;
4086 bool free_skb = true;
4087 int cpu, rc;
4088
4089 txq = netdev_pick_tx(dev, skb, NULL);
4090 cpu = smp_processor_id();
4091 HARD_TX_LOCK(dev, txq, cpu);
4092 if (!netif_xmit_stopped(txq)) {
4093 rc = netdev_start_xmit(skb, dev, txq, 0);
4094 if (dev_xmit_complete(rc))
4095 free_skb = false;
4096 }
4097 HARD_TX_UNLOCK(dev, txq);
4098 if (free_skb) {
4099 trace_xdp_exception(dev, xdp_prog, XDP_TX);
4100 kfree_skb(skb);
4101 }
4102}
4103EXPORT_SYMBOL_GPL(generic_xdp_tx);
4104
4105static struct static_key generic_xdp_needed __read_mostly;
4106
4107int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4108{
4109 if (xdp_prog) {
4110 u32 act = netif_receive_generic_xdp(skb, xdp_prog);
4111 int err;
4112
4113 if (act != XDP_PASS) {
4114 switch (act) {
4115 case XDP_REDIRECT:
4116 err = xdp_do_generic_redirect(skb->dev, skb,
4117 xdp_prog);
4118 if (err)
4119 goto out_redir;
4120 /* fallthru to submit skb */
4121 case XDP_TX:
4122 generic_xdp_tx(skb, xdp_prog);
4123 break;
4124 }
4125 return XDP_DROP;
4126 }
4127 }
4128 return XDP_PASS;
4129out_redir:
4130 kfree_skb(skb);
4131 return XDP_DROP;
4132}
4133EXPORT_SYMBOL_GPL(do_xdp_generic);
4134
4135static int netif_rx_internal(struct sk_buff *skb)
4136{
4137 int ret;
4138
4139 net_timestamp_check(netdev_tstamp_prequeue, skb);
4140
4141 trace_netif_rx(skb);
4142
4143 if (static_key_false(&generic_xdp_needed)) {
4144 int ret;
4145
4146 preempt_disable();
4147 rcu_read_lock();
4148 ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4149 rcu_read_unlock();
4150 preempt_enable();
4151
4152 /* Consider XDP consuming the packet a success from
4153 * the netdev point of view we do not want to count
4154 * this as an error.
4155 */
4156 if (ret != XDP_PASS)
4157 return NET_RX_SUCCESS;
4158 }
4159
4160#ifdef CONFIG_RPS
4161 if (static_key_false(&rps_needed)) {
4162 struct rps_dev_flow voidflow, *rflow = &voidflow;
4163 int cpu;
4164
4165 preempt_disable();
4166 rcu_read_lock();
4167
4168 cpu = get_rps_cpu(skb->dev, skb, &rflow);
4169 if (cpu < 0)
4170 cpu = smp_processor_id();
4171
4172 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4173
4174 rcu_read_unlock();
4175 preempt_enable();
4176 } else
4177#endif
4178 {
4179 unsigned int qtail;
4180
4181 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4182 put_cpu();
4183 }
4184 return ret;
4185}
4186
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it
 *	for the upper (protocol) levels to process.  The call itself always
 *	succeeds; the buffer may still be dropped during processing for
 *	congestion control or by the protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int status;

	trace_netif_rx_entry(skb);

	status = netif_rx_internal(skb);
	return status;
}
EXPORT_SYMBOL(netif_rx);
4209
/*
 * Variant of netif_rx() that, with preemption disabled, queues the packet
 * and then runs any pending softirqs itself via do_softirq().
 * Returns the value of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int status;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	status = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return status;
}
EXPORT_SYMBOL(netif_rx_ni);
4225
4226static __latent_entropy void net_tx_action(struct softirq_action *h)
4227{
4228 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4229
4230 if (sd->completion_queue) {
4231 struct sk_buff *clist;
4232
4233 local_irq_disable();
4234 clist = sd->completion_queue;
4235 sd->completion_queue = NULL;
4236 local_irq_enable();
4237
4238 while (clist) {
4239 struct sk_buff *skb = clist;
4240
4241 clist = clist->next;
4242
4243 WARN_ON(refcount_read(&skb->users));
4244 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4245 trace_consume_skb(skb);
4246 else
4247 trace_kfree_skb(skb, net_tx_action);
4248
4249 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4250 __kfree_skb(skb);
4251 else
4252 __kfree_skb_defer(skb);
4253 }
4254
4255 __kfree_skb_flush();
4256 }
4257
4258 if (sd->output_queue) {
4259 struct Qdisc *head;
4260
4261 local_irq_disable();
4262 head = sd->output_queue;
4263 sd->output_queue = NULL;
4264 sd->output_queue_tailp = &sd->output_queue;
4265 local_irq_enable();
4266
4267 while (head) {
4268 struct Qdisc *q = head;
4269 spinlock_t *root_lock = NULL;
4270
4271 head = head->next_sched;
4272
4273 if (!(q->flags & TCQ_F_NOLOCK)) {
4274 root_lock = qdisc_lock(q);
4275 spin_lock(root_lock);
4276 }
4277 /* We need to make sure head->next_sched is read
4278 * before clearing __QDISC_STATE_SCHED
4279 */
4280 smp_mb__before_atomic();
4281 clear_bit(__QDISC_STATE_SCHED, &q->state);
4282 qdisc_run(q);
4283 if (root_lock)
4284 spin_unlock(root_lock);
4285 }
4286 }
4287
4288 xfrm_dev_backlog(sd);
4289}
4290
4291#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4292/* This hook is defined here for ATM LANE */
4293int (*br_fdb_test_addr_hook)(struct net_device *dev,
4294 unsigned char *addr) __read_mostly;
4295EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4296#endif
4297
/*
 * Run the ingress classifier chain of skb->dev, if there is one.
 *
 * A pending *@pt_prev is delivered first (its result is stored in *@ret)
 * and then cleared.  Returns @skb when receive processing should go on,
 * or NULL when the verdict dropped, consumed or redirected the packet.
 * Without CONFIG_NET_CLS_ACT this is a no-op returning @skb.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct mini_Qdisc *ingress_q = rcu_dereference_bh(skb->dev->miniq_ingress);
	struct tcf_result res;
	int verdict;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!ingress_q)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_at_ingress = 1;
	mini_qdisc_bstats_cpu_update(ingress_q, skb);

	verdict = tcf_classify(skb, ingress_q->filter_list, &res, false);
	switch (verdict) {
	case TC_ACT_SHOT:
		mini_qdisc_qstats_cpu_drop(ingress_q);
		kfree_skb(skb);
		return NULL;

	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
	case TC_ACT_TRAP:
		consume_skb(skb);
		return NULL;

	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;

	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;

	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
4351
4352/**
4353 * netdev_is_rx_handler_busy - check if receive handler is registered
4354 * @dev: device to check
4355 *
4356 * Check if a receive handler is already registered for a given device.
4357 * Return true if there one.
4358 *
4359 * The caller must hold the rtnl_mutex.
4360 */
4361bool netdev_is_rx_handler_busy(struct net_device *dev)
4362{
4363 ASSERT_RTNL();
4364 return dev && rtnl_dereference(dev->rx_handler);
4365}
4366EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
4367
4368/**
4369 * netdev_rx_handler_register - register receive handler
4370 * @dev: device to register a handler for
4371 * @rx_handler: receive handler to register
4372 * @rx_handler_data: data pointer that is used by rx handler
4373 *
4374 * Register a receive handler for a device. This handler will then be
4375 * called from __netif_receive_skb. A negative errno code is returned
4376 * on a failure.
4377 *
4378 * The caller must hold the rtnl_mutex.
4379 *
4380 * For a general description of rx_handler, see enum rx_handler_result.
4381 */
4382int netdev_rx_handler_register(struct net_device *dev,
4383 rx_handler_func_t *rx_handler,
4384 void *rx_handler_data)
4385{
4386 if (netdev_is_rx_handler_busy(dev))
4387 return -EBUSY;
4388
4389 if (dev->priv_flags & IFF_NO_RX_HANDLER)
4390 return -EINVAL;
4391
4392 /* Note: rx_handler_data must be set before rx_handler */
4393 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4394 rcu_assign_pointer(dev->rx_handler, rx_handler);
4395
4396 return 0;
4397}
4398EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4399
4400/**
4401 * netdev_rx_handler_unregister - unregister receive handler
4402 * @dev: device to unregister a handler from
4403 *
4404 * Unregister a receive handler from a device.
4405 *
4406 * The caller must hold the rtnl_mutex.
4407 */
4408void netdev_rx_handler_unregister(struct net_device *dev)
4409{
4410
4411 ASSERT_RTNL();
4412 RCU_INIT_POINTER(dev->rx_handler, NULL);
4413 /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4414 * section has a guarantee to see a non NULL rx_handler_data
4415 * as well.
4416 */
4417 synchronize_net();
4418 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4419}
4420EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4421
4422/*
4423 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4424 * the special handling of PFMEMALLOC skbs.
4425 */
4426static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4427{
4428 switch (skb->protocol) {
4429 case htons(ETH_P_ARP):
4430 case htons(ETH_P_IP):
4431 case htons(ETH_P_IPV6):
4432 case htons(ETH_P_8021Q):
4433 case htons(ETH_P_8021AD):
4434 return true;
4435 default:
4436 return false;
4437 }
4438}
4439
/*
 * Run the netfilter ingress hook for @skb when one is active.
 *
 * A pending *@pt_prev is delivered first (its result is stored in *@ret)
 * and then cleared.  Returns the value of nf_hook_ingress(), or 0 when
 * the hook is inactive or CONFIG_NETFILTER_INGRESS is not set.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	verdict = nf_hook_ingress(skb);
	rcu_read_unlock();

	return verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4460
4461static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4462{
4463 struct packet_type *ptype, *pt_prev;
4464 rx_handler_func_t *rx_handler;
4465 struct net_device *orig_dev;
4466 bool deliver_exact = false;
4467 int ret = NET_RX_DROP;
4468 __be16 type;
4469
4470 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4471
4472 trace_netif_receive_skb(skb);
4473
4474 orig_dev = skb->dev;
4475
4476 skb_reset_network_header(skb);
4477 if (!skb_transport_header_was_set(skb))
4478 skb_reset_transport_header(skb);
4479 skb_reset_mac_len(skb);
4480
4481 pt_prev = NULL;
4482
4483another_round:
4484 skb->skb_iif = skb->dev->ifindex;
4485
4486 __this_cpu_inc(softnet_data.processed);
4487
4488 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4489 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4490 skb = skb_vlan_untag(skb);
4491 if (unlikely(!skb))
4492 goto out;
4493 }
4494
4495 if (skb_skip_tc_classify(skb))
4496 goto skip_classify;
4497
4498 if (pfmemalloc)
4499 goto skip_taps;
4500
4501 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4502 if (pt_prev)
4503 ret = deliver_skb(skb, pt_prev, orig_dev);
4504 pt_prev = ptype;
4505 }
4506
4507 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4508 if (pt_prev)
4509 ret = deliver_skb(skb, pt_prev, orig_dev);
4510 pt_prev = ptype;
4511 }
4512
4513skip_taps:
4514#ifdef CONFIG_NET_INGRESS
4515 if (static_key_false(&ingress_needed)) {
4516 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4517 if (!skb)
4518 goto out;
4519
4520 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4521 goto out;
4522 }
4523#endif
4524 skb_reset_tc(skb);
4525skip_classify:
4526 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4527 goto drop;
4528
4529 if (skb_vlan_tag_present(skb)) {
4530 if (pt_prev) {
4531 ret = deliver_skb(skb, pt_prev, orig_dev);
4532 pt_prev = NULL;
4533 }
4534 if (vlan_do_receive(&skb))
4535 goto another_round;
4536 else if (unlikely(!skb))
4537 goto out;
4538 }
4539
4540 rx_handler = rcu_dereference(skb->dev->rx_handler);
4541 if (rx_handler) {
4542 if (pt_prev) {
4543 ret = deliver_skb(skb, pt_prev, orig_dev);
4544 pt_prev = NULL;
4545 }
4546 switch (rx_handler(&skb)) {
4547 case RX_HANDLER_CONSUMED:
4548 ret = NET_RX_SUCCESS;
4549 goto out;
4550 case RX_HANDLER_ANOTHER:
4551 goto another_round;
4552 case RX_HANDLER_EXACT:
4553 deliver_exact = true;
4554 case RX_HANDLER_PASS:
4555 break;
4556 default:
4557 BUG();
4558 }
4559 }
4560
4561 if (unlikely(skb_vlan_tag_present(skb))) {
4562 if (skb_vlan_tag_get_id(skb))
4563 skb->pkt_type = PACKET_OTHERHOST;
4564 /* Note: we might in the future use prio bits
4565 * and set skb->priority like in vlan_do_receive()
4566 * For the time being, just ignore Priority Code Point
4567 */
4568 skb->vlan_tci = 0;
4569 }
4570
4571 type = skb->protocol;
4572
4573 /* deliver only exact match when indicated */
4574 if (likely(!deliver_exact)) {
4575 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4576 &ptype_base[ntohs(type) &
4577 PTYPE_HASH_MASK]);
4578 }
4579
4580 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4581 &orig_dev->ptype_specific);
4582
4583 if (unlikely(skb->dev != orig_dev)) {
4584 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4585 &skb->dev->ptype_specific);
4586 }
4587
4588 if (pt_prev) {
4589 if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
4590 goto drop;
4591 else
4592 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4593 } else {
4594drop:
4595 if (!deliver_exact)
4596 atomic_long_inc(&skb->dev->rx_dropped);
4597 else
4598 atomic_long_inc(&skb->dev->rx_nohandler);
4599 kfree_skb(skb);
4600 /* Jamal, now you will not able to escape explaining
4601 * me how you were going to use this. :-)
4602 */
4603 ret = NET_RX_DROP;
4604 }
4605
4606out:
4607 return ret;
4608}
4609
4610/**
4611 * netif_receive_skb_core - special purpose version of netif_receive_skb
4612 * @skb: buffer to process
4613 *
4614 * More direct receive version of netif_receive_skb(). It should
4615 * only be used by callers that have a need to skip RPS and Generic XDP.
4616 * Caller must also take care of handling if (page_is_)pfmemalloc.
4617 *
4618 * This function may only be called from softirq context and interrupts
4619 * should be enabled.
4620 *
4621 * Return values (usually ignored):
4622 * NET_RX_SUCCESS: no congestion
4623 * NET_RX_DROP: packet was dropped
4624 */
4625int netif_receive_skb_core(struct sk_buff *skb)
4626{
4627 int ret;
4628
4629 rcu_read_lock();
4630 ret = __netif_receive_skb_core(skb, false);
4631 rcu_read_unlock();
4632
4633 return ret;
4634}
4635EXPORT_SYMBOL(netif_receive_skb_core);
4636
4637static int __netif_receive_skb(struct sk_buff *skb)
4638{
4639 int ret;
4640
4641 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4642 unsigned int noreclaim_flag;
4643
4644 /*
4645 * PFMEMALLOC skbs are special, they should
4646 * - be delivered to SOCK_MEMALLOC sockets only
4647 * - stay away from userspace
4648 * - have bounded memory usage
4649 *
4650 * Use PF_MEMALLOC as this saves us from propagating the allocation
4651 * context down to all allocation sites.
4652 */
4653 noreclaim_flag = memalloc_noreclaim_save();
4654 ret = __netif_receive_skb_core(skb, true);
4655 memalloc_noreclaim_restore(noreclaim_flag);
4656 } else
4657 ret = __netif_receive_skb_core(skb, false);
4658
4659 return ret;
4660}
4661
4662static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
4663{
4664 struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
4665 struct bpf_prog *new = xdp->prog;
4666 int ret = 0;
4667
4668 switch (xdp->command) {
4669 case XDP_SETUP_PROG:
4670 rcu_assign_pointer(dev->xdp_prog, new);
4671 if (old)
4672 bpf_prog_put(old);
4673
4674 if (old && !new) {
4675 static_key_slow_dec(&generic_xdp_needed);
4676 } else if (new && !old) {
4677 static_key_slow_inc(&generic_xdp_needed);
4678 dev_disable_lro(dev);
4679 dev_disable_gro_hw(dev);
4680 }
4681 break;
4682
4683 case XDP_QUERY_PROG:
4684 xdp->prog_attached = !!old;
4685 xdp->prog_id = old ? old->aux->id : 0;
4686 break;
4687
4688 default:
4689 ret = -EINVAL;
4690 break;
4691 }
4692
4693 return ret;
4694}
4695
4696static int netif_receive_skb_internal(struct sk_buff *skb)
4697{
4698 int ret;
4699
4700 net_timestamp_check(netdev_tstamp_prequeue, skb);
4701
4702 if (skb_defer_rx_timestamp(skb))
4703 return NET_RX_SUCCESS;
4704
4705 if (static_key_false(&generic_xdp_needed)) {
4706 int ret;
4707
4708 preempt_disable();
4709 rcu_read_lock();
4710 ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
4711 rcu_read_unlock();
4712 preempt_enable();
4713
4714 if (ret != XDP_PASS)
4715 return NET_RX_DROP;
4716 }
4717
4718 rcu_read_lock();
4719#ifdef CONFIG_RPS
4720 if (static_key_false(&rps_needed)) {
4721 struct rps_dev_flow voidflow, *rflow = &voidflow;
4722 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4723
4724 if (cpu >= 0) {
4725 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4726 rcu_read_unlock();
4727 return ret;
4728 }
4729 }
4730#endif
4731 ret = __netif_receive_skb(skb);
4732 rcu_read_unlock();
4733 return ret;
4734}
4735
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);

	rc = netif_receive_skb_internal(skb);
	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
4758
4759DEFINE_PER_CPU(struct work_struct, flush_works);
4760
4761/* Network device is going away, flush any packets still pending */
4762static void flush_backlog(struct work_struct *work)
4763{
4764 struct sk_buff *skb, *tmp;
4765 struct softnet_data *sd;
4766
4767 local_bh_disable();
4768 sd = this_cpu_ptr(&softnet_data);
4769
4770 local_irq_disable();
4771 rps_lock(sd);
4772 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4773 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4774 __skb_unlink(skb, &sd->input_pkt_queue);
4775 kfree_skb(skb);
4776 input_queue_head_incr(sd);
4777 }
4778 }
4779 rps_unlock(sd);
4780 local_irq_enable();
4781
4782 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4783 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4784 __skb_unlink(skb, &sd->process_queue);
4785 kfree_skb(skb);
4786 input_queue_head_incr(sd);
4787 }
4788 }
4789 local_bh_enable();
4790}
4791
4792static void flush_all_backlogs(void)
4793{
4794 unsigned int cpu;
4795
4796 get_online_cpus();
4797
4798 for_each_online_cpu(cpu)
4799 queue_work_on(cpu, system_highpri_wq,
4800 per_cpu_ptr(&flush_works, cpu));
4801
4802 for_each_online_cpu(cpu)
4803 flush_work(per_cpu_ptr(&flush_works, cpu));
4804
4805 put_online_cpus();
4806}
4807
4808static int napi_gro_complete(struct sk_buff *skb)
4809{
4810 struct packet_offload *ptype;
4811 __be16 type = skb->protocol;
4812 struct list_head *head = &offload_base;
4813 int err = -ENOENT;
4814
4815 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4816
4817 if (NAPI_GRO_CB(skb)->count == 1) {
4818 skb_shinfo(skb)->gso_size = 0;
4819 goto out;
4820 }
4821
4822 rcu_read_lock();
4823 list_for_each_entry_rcu(ptype, head, list) {
4824 if (ptype->type != type || !ptype->callbacks.gro_complete)
4825 continue;
4826
4827 err = ptype->callbacks.gro_complete(skb, 0);
4828 break;
4829 }
4830 rcu_read_unlock();
4831
4832 if (err) {
4833 WARN_ON(&ptype->list == head);
4834 kfree_skb(skb);
4835 return NET_RX_SUCCESS;
4836 }
4837
4838out:
4839 return netif_receive_skb_internal(skb);
4840}
4841
4842/* napi->gro_list contains packets ordered by age.
4843 * youngest packets at the head of it.
4844 * Complete skbs in reverse order to reduce latencies.
4845 */
4846void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4847{
4848 struct sk_buff *skb, *prev = NULL;
4849
4850 /* scan list and build reverse chain */
4851 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4852 skb->prev = prev;
4853 prev = skb;
4854 }
4855
4856 for (skb = prev; skb; skb = prev) {
4857 skb->next = NULL;
4858
4859 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4860 return;
4861
4862 prev = skb->prev;
4863 napi_gro_complete(skb);
4864 napi->gro_count--;
4865 }
4866
4867 napi->gro_list = NULL;
4868}
4869EXPORT_SYMBOL(napi_gro_flush);
4870
4871static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4872{
4873 struct sk_buff *p;
4874 unsigned int maclen = skb->dev->hard_header_len;
4875 u32 hash = skb_get_hash_raw(skb);
4876
4877 for (p = napi->gro_list; p; p = p->next) {
4878 unsigned long diffs;
4879
4880 NAPI_GRO_CB(p)->flush = 0;
4881
4882 if (hash != skb_get_hash_raw(p)) {
4883 NAPI_GRO_CB(p)->same_flow = 0;
4884 continue;
4885 }
4886
4887 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4888 diffs |= p->vlan_tci ^ skb->vlan_tci;
4889 diffs |= skb_metadata_dst_cmp(p, skb);
4890 diffs |= skb_metadata_differs(p, skb);
4891 if (maclen == ETH_HLEN)
4892 diffs |= compare_ether_header(skb_mac_header(p),
4893 skb_mac_header(skb));
4894 else if (!diffs)
4895 diffs = memcmp(skb_mac_header(p),
4896 skb_mac_header(skb),
4897 maclen);
4898 NAPI_GRO_CB(p)->same_flow = !diffs;
4899 }
4900}
4901
4902static void skb_gro_reset_offset(struct sk_buff *skb)
4903{
4904 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4905 const skb_frag_t *frag0 = &pinfo->frags[0];
4906
4907 NAPI_GRO_CB(skb)->data_offset = 0;
4908 NAPI_GRO_CB(skb)->frag0 = NULL;
4909 NAPI_GRO_CB(skb)->frag0_len = 0;
4910
4911 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4912 pinfo->nr_frags &&
4913 !PageHighMem(skb_frag_page(frag0))) {
4914 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4915 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4916 skb_frag_size(frag0),
4917 skb->end - skb->tail);
4918 }
4919}
4920
4921static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4922{
4923 struct skb_shared_info *pinfo = skb_shinfo(skb);
4924
4925 BUG_ON(skb->end - skb->tail < grow);
4926
4927 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4928
4929 skb->data_len -= grow;
4930 skb->tail += grow;
4931
4932 pinfo->frags[0].page_offset += grow;
4933 skb_frag_size_sub(&pinfo->frags[0], grow);
4934
4935 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4936 skb_frag_unref(skb, 0);
4937 memmove(pinfo->frags, pinfo->frags + 1,
4938 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4939 }
4940}
4941
4942static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4943{
4944 struct sk_buff **pp = NULL;
4945 struct packet_offload *ptype;
4946 __be16 type = skb->protocol;
4947 struct list_head *head = &offload_base;
4948 int same_flow;
4949 enum gro_result ret;
4950 int grow;
4951
4952 if (netif_elide_gro(skb->dev))
4953 goto normal;
4954
4955 gro_list_prepare(napi, skb);
4956
4957 rcu_read_lock();
4958 list_for_each_entry_rcu(ptype, head, list) {
4959 if (ptype->type != type || !ptype->callbacks.gro_receive)
4960 continue;
4961
4962 skb_set_network_header(skb, skb_gro_offset(skb));
4963 skb_reset_mac_len(skb);
4964 NAPI_GRO_CB(skb)->same_flow = 0;
4965 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4966 NAPI_GRO_CB(skb)->free = 0;
4967 NAPI_GRO_CB(skb)->encap_mark = 0;
4968 NAPI_GRO_CB(skb)->recursion_counter = 0;
4969 NAPI_GRO_CB(skb)->is_fou = 0;
4970 NAPI_GRO_CB(skb)->is_atomic = 1;
4971 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4972
4973 /* Setup for GRO checksum validation */
4974 switch (skb->ip_summed) {
4975 case CHECKSUM_COMPLETE:
4976 NAPI_GRO_CB(skb)->csum = skb->csum;
4977 NAPI_GRO_CB(skb)->csum_valid = 1;
4978 NAPI_GRO_CB(skb)->csum_cnt = 0;
4979 break;
4980 case CHECKSUM_UNNECESSARY:
4981 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4982 NAPI_GRO_CB(skb)->csum_valid = 0;
4983 break;
4984 default:
4985 NAPI_GRO_CB(skb)->csum_cnt = 0;
4986 NAPI_GRO_CB(skb)->csum_valid = 0;
4987 }
4988
4989 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4990 break;
4991 }
4992 rcu_read_unlock();
4993
4994 if (&ptype->list == head)
4995 goto normal;
4996
4997 if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
4998 ret = GRO_CONSUMED;
4999 goto ok;
5000 }
5001
5002 same_flow = NAPI_GRO_CB(skb)->same_flow;
5003 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
5004
5005 if (pp) {
5006 struct sk_buff *nskb = *pp;
5007
5008 *pp = nskb->next;
5009 nskb->next = NULL;
5010 napi_gro_complete(nskb);
5011 napi->gro_count--;
5012 }
5013
5014 if (same_flow)
5015 goto ok;
5016
5017 if (NAPI_GRO_CB(skb)->flush)
5018 goto normal;
5019
5020 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
5021 struct sk_buff *nskb = napi->gro_list;
5022
5023 /* locate the end of the list to select the 'oldest' flow */
5024 while (nskb->next) {
5025 pp = &nskb->next;
5026 nskb = *pp;
5027 }
5028 *pp = NULL;
5029 nskb->next = NULL;
5030 napi_gro_complete(nskb);
5031 } else {
5032 napi->gro_count++;
5033 }
5034 NAPI_GRO_CB(skb)->count = 1;
5035 NAPI_GRO_CB(skb)->age = jiffies;
5036 NAPI_GRO_CB(skb)->last = skb;
5037 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
5038 skb->next = napi->gro_list;
5039 napi->gro_list = skb;
5040 ret = GRO_HELD;
5041
5042pull:
5043 grow = skb_gro_offset(skb) - skb_headlen(skb);
5044 if (grow > 0)
5045 gro_pull_from_frag0(skb, grow);
5046ok:
5047 return ret;
5048
5049normal:
5050 ret = GRO_NORMAL;
5051 goto pull;
5052}
5053
5054struct packet_offload *gro_find_receive_by_type(__be16 type)
5055{
5056 struct list_head *offload_head = &offload_base;
5057 struct packet_offload *ptype;
5058
5059 list_for_each_entry_rcu(ptype, offload_head, list) {
5060 if (ptype->type != type || !ptype->callbacks.gro_receive)
5061 continue;
5062 return ptype;
5063 }
5064 return NULL;
5065}
5066EXPORT_SYMBOL(gro_find_receive_by_type);
5067
5068struct packet_offload *gro_find_complete_by_type(__be16 type)
5069{
5070 struct list_head *offload_head = &offload_base;
5071 struct packet_offload *ptype;
5072
5073 list_for_each_entry_rcu(ptype, offload_head, list) {
5074 if (ptype->type != type || !ptype->callbacks.gro_complete)
5075 continue;
5076 return ptype;
5077 }
5078 return NULL;
5079}
5080EXPORT_SYMBOL(gro_find_complete_by_type);
5081
5082static void napi_skb_free_stolen_head(struct sk_buff *skb)
5083{
5084 skb_dst_drop(skb);
5085 secpath_reset(skb);
5086 kmem_cache_free(skbuff_head_cache, skb);
5087}
5088
5089static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5090{
5091 switch (ret) {
5092 case GRO_NORMAL:
5093 if (netif_receive_skb_internal(skb))
5094 ret = GRO_DROP;
5095 break;
5096
5097 case GRO_DROP:
5098 kfree_skb(skb);
5099 break;
5100
5101 case GRO_MERGED_FREE:
5102 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5103 napi_skb_free_stolen_head(skb);
5104 else
5105 __kfree_skb(skb);
5106 break;
5107
5108 case GRO_HELD:
5109 case GRO_MERGED:
5110 case GRO_CONSUMED:
5111 break;
5112 }
5113
5114 return ret;
5115}
5116
5117gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5118{
5119 skb_mark_napi_id(skb, napi);
5120 trace_napi_gro_receive_entry(skb);
5121
5122 skb_gro_reset_offset(skb);
5123
5124 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
5125}
5126EXPORT_SYMBOL(napi_gro_receive);
5127
5128static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
5129{
5130 if (unlikely(skb->pfmemalloc)) {
5131 consume_skb(skb);
5132 return;
5133 }
5134 __skb_pull(skb, skb_headlen(skb));
5135 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
5136 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
5137 skb->vlan_tci = 0;
5138 skb->dev = napi->dev;
5139 skb->skb_iif = 0;
5140 skb->encapsulation = 0;
5141 skb_shinfo(skb)->gso_type = 0;
5142 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
5143 secpath_reset(skb);
5144
5145 napi->skb = skb;
5146}
5147
5148struct sk_buff *napi_get_frags(struct napi_struct *napi)
5149{
5150 struct sk_buff *skb = napi->skb;
5151
5152 if (!skb) {
5153 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
5154 if (skb) {
5155 napi->skb = skb;
5156 skb_mark_napi_id(skb, napi);
5157 }
5158 }
5159 return skb;
5160}
5161EXPORT_SYMBOL(napi_get_frags);
5162
5163static gro_result_t napi_frags_finish(struct napi_struct *napi,
5164 struct sk_buff *skb,
5165 gro_result_t ret)
5166{
5167 switch (ret) {
5168 case GRO_NORMAL:
5169 case GRO_HELD:
5170 __skb_push(skb, ETH_HLEN);
5171 skb->protocol = eth_type_trans(skb, skb->dev);
5172 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
5173 ret = GRO_DROP;
5174 break;
5175
5176 case GRO_DROP:
5177 napi_reuse_skb(napi, skb);
5178 break;
5179
5180 case GRO_MERGED_FREE:
5181 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
5182 napi_skb_free_stolen_head(skb);
5183 else
5184 napi_reuse_skb(napi, skb);
5185 break;
5186
5187 case GRO_MERGED:
5188 case GRO_CONSUMED:
5189 break;
5190 }
5191
5192 return ret;
5193}
5194
5195/* Upper GRO stack assumes network header starts at gro_offset=0
5196 * Drivers could call both napi_gro_frags() and napi_gro_receive()
5197 * We copy ethernet header into skb->data to have a common layout.
5198 */
5199static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
5200{
5201 struct sk_buff *skb = napi->skb;
5202 const struct ethhdr *eth;
5203 unsigned int hlen = sizeof(*eth);
5204
5205 napi->skb = NULL;
5206
5207 skb_reset_mac_header(skb);
5208 skb_gro_reset_offset(skb);
5209
5210 eth = skb_gro_header_fast(skb, 0);
5211 if (unlikely(skb_gro_header_hard(skb, hlen))) {
5212 eth = skb_gro_header_slow(skb, hlen, 0);
5213 if (unlikely(!eth)) {
5214 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
5215 __func__, napi->dev->name);
5216 napi_reuse_skb(napi, skb);
5217 return NULL;
5218 }
5219 } else {
5220 gro_pull_from_frag0(skb, hlen);
5221 NAPI_GRO_CB(skb)->frag0 += hlen;
5222 NAPI_GRO_CB(skb)->frag0_len -= hlen;
5223 }
5224 __skb_pull(skb, hlen);
5225
5226 /*
5227 * This works because the only protocols we care about don't require
5228 * special handling.
5229 * We'll fix it up properly in napi_frags_finish()
5230 */
5231 skb->protocol = eth->h_proto;
5232
5233 return skb;
5234}
5235
5236gro_result_t napi_gro_frags(struct napi_struct *napi)
5237{
5238 struct sk_buff *skb = napi_frags_skb(napi);
5239
5240 if (!skb)
5241 return GRO_DROP;
5242
5243 trace_napi_gro_frags_entry(skb);
5244
5245 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5246}
5247EXPORT_SYMBOL(napi_gro_frags);
5248
5249/* Compute the checksum from gro_offset and return the folded value
5250 * after adding in any pseudo checksum.
5251 */
5252__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
5253{
5254 __wsum wsum;
5255 __sum16 sum;
5256
5257 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
5258
5259 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
5260 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
5261 if (likely(!sum)) {
5262 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
5263 !skb->csum_complete_sw)
5264 netdev_rx_csum_fault(skb->dev);
5265 }
5266
5267 NAPI_GRO_CB(skb)->csum = wsum;
5268 NAPI_GRO_CB(skb)->csum_valid = 1;
5269
5270 return sum;
5271}
5272EXPORT_SYMBOL(__skb_gro_checksum_complete);
5273
/*
 * Walk the rps_ipi_next chain starting at @remsd and send an async IPI to
 * every entry whose CPU is online.  No-op without CONFIG_RPS.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
	struct softnet_data *next;

	for (; remsd; remsd = next) {
		/* Fetch the link before the IPI is sent. */
		next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu, &remsd->csd);
	}
#endif
}
5286
/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		/* Detach the list while irqs are still off. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		net_rps_send_ipi(pending);
		return;
	}
#endif
	local_irq_enable();
}
5307
5308static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
5309{
5310#ifdef CONFIG_RPS
5311 return sd->rps_ipi_list != NULL;
5312#else
5313 return false;
5314#endif
5315}
5316
5317static int process_backlog(struct napi_struct *napi, int quota)
5318{
5319 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
5320 bool again = true;
5321 int work = 0;
5322
5323 /* Check if we have pending ipi, its better to send them now,
5324 * not waiting net_rx_action() end.
5325 */
5326 if (sd_has_rps_ipi_waiting(sd)) {
5327 local_irq_disable();
5328 net_rps_action_and_irq_enable(sd);
5329 }
5330
5331 napi->weight = dev_rx_weight;
5332 while (again) {
5333 struct sk_buff *skb;
5334
5335 while ((skb = __skb_dequeue(&sd->process_queue))) {
5336 rcu_read_lock();
5337 __netif_receive_skb(skb);
5338 rcu_read_unlock();
5339 input_queue_head_incr(sd);
5340 if (++work >= quota)
5341 return work;
5342
5343 }
5344
5345 local_irq_disable();
5346 rps_lock(sd);
5347 if (skb_queue_empty(&sd->input_pkt_queue)) {
5348 /*
5349 * Inline a custom version of __napi_complete().
5350 * only current cpu owns and manipulates this napi,
5351 * and NAPI_STATE_SCHED is the only possible flag set
5352 * on backlog.
5353 * We can use a plain write instead of clear_bit(),
5354 * and we dont need an smp_mb() memory barrier.
5355 */
5356 napi->state = 0;
5357 again = false;
5358 } else {
5359 skb_queue_splice_tail_init(&sd->input_pkt_queue,
5360 &sd->process_queue);
5361 }
5362 rps_unlock(sd);
5363 local_irq_enable();
5364 }
5365
5366 return work;
5367}
5368
5369/**
5370 * __napi_schedule - schedule for receive
5371 * @n: entry to schedule
5372 *
5373 * The entry's receive function will be scheduled to run.
5374 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
5375 */
5376void __napi_schedule(struct napi_struct *n)
5377{
5378 unsigned long flags;
5379
5380 local_irq_save(flags);
5381 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5382 local_irq_restore(flags);
5383}
5384EXPORT_SYMBOL(__napi_schedule);
5385
5386/**
5387 * napi_schedule_prep - check if napi can be scheduled
5388 * @n: napi context
5389 *
5390 * Test if NAPI routine is already running, and if not mark
5391 * it as running. This is used as a condition variable
5392 * insure only one NAPI poll instance runs. We also make
5393 * sure there is no pending NAPI disable.
5394 */
5395bool napi_schedule_prep(struct napi_struct *n)
5396{
5397 unsigned long val, new;
5398
5399 do {
5400 val = READ_ONCE(n->state);
5401 if (unlikely(val & NAPIF_STATE_DISABLE))
5402 return false;
5403 new = val | NAPIF_STATE_SCHED;
5404
5405 /* Sets STATE_MISSED bit if STATE_SCHED was already set
5406 * This was suggested by Alexander Duyck, as compiler
5407 * emits better code than :
5408 * if (val & NAPIF_STATE_SCHED)
5409 * new |= NAPIF_STATE_MISSED;
5410 */
5411 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
5412 NAPIF_STATE_MISSED;
5413 } while (cmpxchg(&n->state, val, new) != val);
5414
5415 return !(val & NAPIF_STATE_SCHED);
5416}
5417EXPORT_SYMBOL(napi_schedule_prep);
5418
5419/**
5420 * __napi_schedule_irqoff - schedule for receive
5421 * @n: entry to schedule
5422 *
5423 * Variant of __napi_schedule() assuming hard irqs are masked
5424 */
5425void __napi_schedule_irqoff(struct napi_struct *n)
5426{
5427 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
5428}
5429EXPORT_SYMBOL(__napi_schedule_irqoff);
5430
5431bool napi_complete_done(struct napi_struct *n, int work_done)
5432{
5433 unsigned long flags, val, new;
5434
5435 /*
5436 * 1) Don't let napi dequeue from the cpu poll list
5437 * just in case its running on a different cpu.
5438 * 2) If we are busy polling, do nothing here, we have
5439 * the guarantee we will be called later.
5440 */
5441 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
5442 NAPIF_STATE_IN_BUSY_POLL)))
5443 return false;
5444
5445 if (n->gro_list) {
5446 unsigned long timeout = 0;
5447
5448 if (work_done)
5449 timeout = n->dev->gro_flush_timeout;
5450
5451 if (timeout)
5452 hrtimer_start(&n->timer, ns_to_ktime(timeout),
5453 HRTIMER_MODE_REL_PINNED);
5454 else
5455 napi_gro_flush(n, false);
5456 }
5457 if (unlikely(!list_empty(&n->poll_list))) {
5458 /* If n->poll_list is not empty, we need to mask irqs */
5459 local_irq_save(flags);
5460 list_del_init(&n->poll_list);
5461 local_irq_restore(flags);
5462 }
5463
5464 do {
5465 val = READ_ONCE(n->state);
5466
5467 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5468
5469 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5470
5471 /* If STATE_MISSED was set, leave STATE_SCHED set,
5472 * because we will call napi->poll() one more time.
5473 * This C code was suggested by Alexander Duyck to help gcc.
5474 */
5475 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5476 NAPIF_STATE_SCHED;
5477 } while (cmpxchg(&n->state, val, new) != val);
5478
5479 if (unlikely(val & NAPIF_STATE_MISSED)) {
5480 __napi_schedule(n);
5481 return false;
5482 }
5483
5484 return true;
5485}
5486EXPORT_SYMBOL(napi_complete_done);
5487
5488/* must be called under rcu_read_lock(), as we dont take a reference */
5489static struct napi_struct *napi_by_id(unsigned int napi_id)
5490{
5491 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5492 struct napi_struct *napi;
5493
5494 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5495 if (napi->napi_id == napi_id)
5496 return napi;
5497
5498 return NULL;
5499}
5500
5501#if defined(CONFIG_NET_RX_BUSY_POLL)
5502
5503#define BUSY_POLL_BUDGET 8
5504
5505static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5506{
5507 int rc;
5508
5509 /* Busy polling means there is a high chance device driver hard irq
5510 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5511 * set in napi_schedule_prep().
5512 * Since we are about to call napi->poll() once more, we can safely
5513 * clear NAPI_STATE_MISSED.
5514 *
5515 * Note: x86 could use a single "lock and ..." instruction
5516 * to perform these two clear_bit()
5517 */
5518 clear_bit(NAPI_STATE_MISSED, &napi->state);
5519 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5520
5521 local_bh_disable();
5522
5523 /* All we really want here is to re-enable device interrupts.
5524 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5525 */
5526 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5527 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5528 netpoll_poll_unlock(have_poll_lock);
5529 if (rc == BUSY_POLL_BUDGET)
5530 __napi_schedule(napi);
5531 local_bh_enable();
5532}
5533
5534void napi_busy_loop(unsigned int napi_id,
5535 bool (*loop_end)(void *, unsigned long),
5536 void *loop_end_arg)
5537{
5538 unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
5539 int (*napi_poll)(struct napi_struct *napi, int budget);
5540 void *have_poll_lock = NULL;
5541 struct napi_struct *napi;
5542
5543restart:
5544 napi_poll = NULL;
5545
5546 rcu_read_lock();
5547
5548 napi = napi_by_id(napi_id);
5549 if (!napi)
5550 goto out;
5551
5552 preempt_disable();
5553 for (;;) {
5554 int work = 0;
5555
5556 local_bh_disable();
5557 if (!napi_poll) {
5558 unsigned long val = READ_ONCE(napi->state);
5559
5560 /* If multiple threads are competing for this napi,
5561 * we avoid dirtying napi->state as much as we can.
5562 */
5563 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5564 NAPIF_STATE_IN_BUSY_POLL))
5565 goto count;
5566 if (cmpxchg(&napi->state, val,
5567 val | NAPIF_STATE_IN_BUSY_POLL |
5568 NAPIF_STATE_SCHED) != val)
5569 goto count;
5570 have_poll_lock = netpoll_poll_lock(napi);
5571 napi_poll = napi->poll;
5572 }
5573 work = napi_poll(napi, BUSY_POLL_BUDGET);
5574 trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
5575count:
5576 if (work > 0)
5577 __NET_ADD_STATS(dev_net(napi->dev),
5578 LINUX_MIB_BUSYPOLLRXPACKETS, work);
5579 local_bh_enable();
5580
5581 if (!loop_end || loop_end(loop_end_arg, start_time))
5582 break;
5583
5584 if (unlikely(need_resched())) {
5585 if (napi_poll)
5586 busy_poll_stop(napi, have_poll_lock);
5587 preempt_enable();
5588 rcu_read_unlock();
5589 cond_resched();
5590 if (loop_end(loop_end_arg, start_time))
5591 return;
5592 goto restart;
5593 }
5594 cpu_relax();
5595 }
5596 if (napi_poll)
5597 busy_poll_stop(napi, have_poll_lock);
5598 preempt_enable();
5599out:
5600 rcu_read_unlock();
5601}
5602EXPORT_SYMBOL(napi_busy_loop);
5603
5604#endif /* CONFIG_NET_RX_BUSY_POLL */
5605
5606static void napi_hash_add(struct napi_struct *napi)
5607{
5608 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5609 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5610 return;
5611
5612 spin_lock(&napi_hash_lock);
5613
5614 /* 0..NR_CPUS range is reserved for sender_cpu use */
5615 do {
5616 if (unlikely(++napi_gen_id < MIN_NAPI_ID))
5617 napi_gen_id = MIN_NAPI_ID;
5618 } while (napi_by_id(napi_gen_id));
5619 napi->napi_id = napi_gen_id;
5620
5621 hlist_add_head_rcu(&napi->napi_hash_node,
5622 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5623
5624 spin_unlock(&napi_hash_lock);
5625}
5626
5627/* Warning : caller is responsible to make sure rcu grace period
5628 * is respected before freeing memory containing @napi
5629 */
5630bool napi_hash_del(struct napi_struct *napi)
5631{
5632 bool rcu_sync_needed = false;
5633
5634 spin_lock(&napi_hash_lock);
5635
5636 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5637 rcu_sync_needed = true;
5638 hlist_del_rcu(&napi->napi_hash_node);
5639 }
5640 spin_unlock(&napi_hash_lock);
5641 return rcu_sync_needed;
5642}
5643EXPORT_SYMBOL_GPL(napi_hash_del);
5644
5645static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5646{
5647 struct napi_struct *napi;
5648
5649 napi = container_of(timer, struct napi_struct, timer);
5650
5651 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
5652 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5653 */
5654 if (napi->gro_list && !napi_disable_pending(napi) &&
5655 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5656 __napi_schedule_irqoff(napi);
5657
5658 return HRTIMER_NORESTART;
5659}
5660
5661void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5662 int (*poll)(struct napi_struct *, int), int weight)
5663{
5664 INIT_LIST_HEAD(&napi->poll_list);
5665 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5666 napi->timer.function = napi_watchdog;
5667 napi->gro_count = 0;
5668 napi->gro_list = NULL;
5669 napi->skb = NULL;
5670 napi->poll = poll;
5671 if (weight > NAPI_POLL_WEIGHT)
5672 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5673 weight, dev->name);
5674 napi->weight = weight;
5675 list_add(&napi->dev_list, &dev->napi_list);
5676 napi->dev = dev;
5677#ifdef CONFIG_NETPOLL
5678 napi->poll_owner = -1;
5679#endif
5680 set_bit(NAPI_STATE_SCHED, &napi->state);
5681 napi_hash_add(napi);
5682}
5683EXPORT_SYMBOL(netif_napi_add);
5684
5685void napi_disable(struct napi_struct *n)
5686{
5687 might_sleep();
5688 set_bit(NAPI_STATE_DISABLE, &n->state);
5689
5690 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5691 msleep(1);
5692 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5693 msleep(1);
5694
5695 hrtimer_cancel(&n->timer);
5696
5697 clear_bit(NAPI_STATE_DISABLE, &n->state);
5698}
5699EXPORT_SYMBOL(napi_disable);
5700
5701/* Must be called in process context */
5702void netif_napi_del(struct napi_struct *napi)
5703{
5704 might_sleep();
5705 if (napi_hash_del(napi))
5706 synchronize_net();
5707 list_del_init(&napi->dev_list);
5708 napi_free_frags(napi);
5709
5710 kfree_skb_list(napi->gro_list);
5711 napi->gro_list = NULL;
5712 napi->gro_count = 0;
5713}
5714EXPORT_SYMBOL(netif_napi_del);
5715
5716static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5717{
5718 void *have;
5719 int work, weight;
5720
5721 list_del_init(&n->poll_list);
5722
5723 have = netpoll_poll_lock(n);
5724
5725 weight = n->weight;
5726
5727 /* This NAPI_STATE_SCHED test is for avoiding a race
5728 * with netpoll's poll_napi(). Only the entity which
5729 * obtains the lock and sees NAPI_STATE_SCHED set will
5730 * actually make the ->poll() call. Therefore we avoid
5731 * accidentally calling ->poll() when NAPI is not scheduled.
5732 */
5733 work = 0;
5734 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5735 work = n->poll(n, weight);
5736 trace_napi_poll(n, work, weight);
5737 }
5738
5739 WARN_ON_ONCE(work > weight);
5740
5741 if (likely(work < weight))
5742 goto out_unlock;
5743
5744 /* Drivers must not modify the NAPI state if they
5745 * consume the entire weight. In such cases this code
5746 * still "owns" the NAPI instance and therefore can
5747 * move the instance around on the list at-will.
5748 */
5749 if (unlikely(napi_disable_pending(n))) {
5750 napi_complete(n);
5751 goto out_unlock;
5752 }
5753
5754 if (n->gro_list) {
5755 /* flush too old packets
5756 * If HZ < 1000, flush all packets.
5757 */
5758 napi_gro_flush(n, HZ >= 1000);
5759 }
5760
5761 /* Some drivers may have called napi_schedule
5762 * prior to exhausting their budget.
5763 */
5764 if (unlikely(!list_empty(&n->poll_list))) {
5765 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5766 n->dev ? n->dev->name : "backlog");
5767 goto out_unlock;
5768 }
5769
5770 list_add_tail(&n->poll_list, repoll);
5771
5772out_unlock:
5773 netpoll_poll_unlock(have);
5774
5775 return work;
5776}
5777
5778static __latent_entropy void net_rx_action(struct softirq_action *h)
5779{
5780 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5781 unsigned long time_limit = jiffies +
5782 usecs_to_jiffies(netdev_budget_usecs);
5783 int budget = netdev_budget;
5784 LIST_HEAD(list);
5785 LIST_HEAD(repoll);
5786
5787 local_irq_disable();
5788 list_splice_init(&sd->poll_list, &list);
5789 local_irq_enable();
5790
5791 for (;;) {
5792 struct napi_struct *n;
5793
5794 if (list_empty(&list)) {
5795 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5796 goto out;
5797 break;
5798 }
5799
5800 n = list_first_entry(&list, struct napi_struct, poll_list);
5801 budget -= napi_poll(n, &repoll);
5802
5803 /* If softirq window is exhausted then punt.
5804 * Allow this to run for 2 jiffies since which will allow
5805 * an average latency of 1.5/HZ.
5806 */
5807 if (unlikely(budget <= 0 ||
5808 time_after_eq(jiffies, time_limit))) {
5809 sd->time_squeeze++;
5810 break;
5811 }
5812 }
5813
5814 local_irq_disable();
5815
5816 list_splice_tail_init(&sd->poll_list, &list);
5817 list_splice_tail(&repoll, &list);
5818 list_splice(&list, &sd->poll_list);
5819 if (!list_empty(&sd->poll_list))
5820 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5821
5822 net_rps_action_and_irq_enable(sd);
5823out:
5824 __kfree_skb_flush();
5825}
5826
5827struct netdev_adjacent {
5828 struct net_device *dev;
5829
5830 /* upper master flag, there can only be one master device per list */
5831 bool master;
5832
5833 /* counter for the number of times this device was added to us */
5834 u16 ref_nr;
5835
5836 /* private field for the users */
5837 void *private;
5838
5839 struct list_head list;
5840 struct rcu_head rcu;
5841};
5842
5843static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5844 struct list_head *adj_list)
5845{
5846 struct netdev_adjacent *adj;
5847
5848 list_for_each_entry(adj, adj_list, list) {
5849 if (adj->dev == adj_dev)
5850 return adj;
5851 }
5852 return NULL;
5853}
5854
/* Walker callback: returns non-zero when @upper_dev is the device passed
 * in through @data, which stops the walk.
 */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	return upper_dev == (struct net_device *)data;
}
5861
5862/**
5863 * netdev_has_upper_dev - Check if device is linked to an upper device
5864 * @dev: device
5865 * @upper_dev: upper device to check
5866 *
5867 * Find out if a device is linked to specified upper device and return true
5868 * in case it is. Note that this checks only immediate upper device,
5869 * not through a complete stack of devices. The caller must hold the RTNL lock.
5870 */
5871bool netdev_has_upper_dev(struct net_device *dev,
5872 struct net_device *upper_dev)
5873{
5874 ASSERT_RTNL();
5875
5876 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5877 upper_dev);
5878}
5879EXPORT_SYMBOL(netdev_has_upper_dev);
5880
5881/**
5882 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5883 * @dev: device
5884 * @upper_dev: upper device to check
5885 *
5886 * Find out if a device is linked to specified upper device and return true
5887 * in case it is. Note that this checks the entire upper device chain.
5888 * The caller must hold rcu lock.
5889 */
5890
5891bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5892 struct net_device *upper_dev)
5893{
5894 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5895 upper_dev);
5896}
5897EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5898
5899/**
5900 * netdev_has_any_upper_dev - Check if device is linked to some device
5901 * @dev: device
5902 *
5903 * Find out if a device is linked to an upper device and return true in case
5904 * it is. The caller must hold the RTNL lock.
5905 */
5906bool netdev_has_any_upper_dev(struct net_device *dev)
5907{
5908 ASSERT_RTNL();
5909
5910 return !list_empty(&dev->adj_list.upper);
5911}
5912EXPORT_SYMBOL(netdev_has_any_upper_dev);
5913
5914/**
5915 * netdev_master_upper_dev_get - Get master upper device
5916 * @dev: device
5917 *
5918 * Find a master upper device and return pointer to it or NULL in case
5919 * it's not there. The caller must hold the RTNL lock.
5920 */
5921struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5922{
5923 struct netdev_adjacent *upper;
5924
5925 ASSERT_RTNL();
5926
5927 if (list_empty(&dev->adj_list.upper))
5928 return NULL;
5929
5930 upper = list_first_entry(&dev->adj_list.upper,
5931 struct netdev_adjacent, list);
5932 if (likely(upper->master))
5933 return upper->dev;
5934 return NULL;
5935}
5936EXPORT_SYMBOL(netdev_master_upper_dev_get);
5937
5938/**
5939 * netdev_has_any_lower_dev - Check if device is linked to some device
5940 * @dev: device
5941 *
5942 * Find out if a device is linked to a lower device and return true in case
5943 * it is. The caller must hold the RTNL lock.
5944 */
5945static bool netdev_has_any_lower_dev(struct net_device *dev)
5946{
5947 ASSERT_RTNL();
5948
5949 return !list_empty(&dev->adj_list.lower);
5950}
5951
5952void *netdev_adjacent_get_private(struct list_head *adj_list)
5953{
5954 struct netdev_adjacent *adj;
5955
5956 adj = list_entry(adj_list, struct netdev_adjacent, list);
5957
5958 return adj->private;
5959}
5960EXPORT_SYMBOL(netdev_adjacent_get_private);
5961
5962/**
5963 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5964 * @dev: device
5965 * @iter: list_head ** of the current position
5966 *
5967 * Gets the next device from the dev's upper list, starting from iter
5968 * position. The caller must hold RCU read lock.
5969 */
5970struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5971 struct list_head **iter)
5972{
5973 struct netdev_adjacent *upper;
5974
5975 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5976
5977 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5978
5979 if (&upper->list == &dev->adj_list.upper)
5980 return NULL;
5981
5982 *iter = &upper->list;
5983
5984 return upper->dev;
5985}
5986EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5987
5988static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5989 struct list_head **iter)
5990{
5991 struct netdev_adjacent *upper;
5992
5993 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5994
5995 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5996
5997 if (&upper->list == &dev->adj_list.upper)
5998 return NULL;
5999
6000 *iter = &upper->list;
6001
6002 return upper->dev;
6003}
6004
6005int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
6006 int (*fn)(struct net_device *dev,
6007 void *data),
6008 void *data)
6009{
6010 struct net_device *udev;
6011 struct list_head *iter;
6012 int ret;
6013
6014 for (iter = &dev->adj_list.upper,
6015 udev = netdev_next_upper_dev_rcu(dev, &iter);
6016 udev;
6017 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
6018 /* first is the upper device itself */
6019 ret = fn(udev, data);
6020 if (ret)
6021 return ret;
6022
6023 /* then look at all of its upper devices */
6024 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
6025 if (ret)
6026 return ret;
6027 }
6028
6029 return 0;
6030}
6031EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
6032
6033/**
6034 * netdev_lower_get_next_private - Get the next ->private from the
6035 * lower neighbour list
6036 * @dev: device
6037 * @iter: list_head ** of the current position
6038 *
6039 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6040 * list, starting from iter position. The caller must hold either hold the
6041 * RTNL lock or its own locking that guarantees that the neighbour lower
6042 * list will remain unchanged.
6043 */
6044void *netdev_lower_get_next_private(struct net_device *dev,
6045 struct list_head **iter)
6046{
6047 struct netdev_adjacent *lower;
6048
6049 lower = list_entry(*iter, struct netdev_adjacent, list);
6050
6051 if (&lower->list == &dev->adj_list.lower)
6052 return NULL;
6053
6054 *iter = lower->list.next;
6055
6056 return lower->private;
6057}
6058EXPORT_SYMBOL(netdev_lower_get_next_private);
6059
6060/**
6061 * netdev_lower_get_next_private_rcu - Get the next ->private from the
6062 * lower neighbour list, RCU
6063 * variant
6064 * @dev: device
6065 * @iter: list_head ** of the current position
6066 *
6067 * Gets the next netdev_adjacent->private from the dev's lower neighbour
6068 * list, starting from iter position. The caller must hold RCU read lock.
6069 */
6070void *netdev_lower_get_next_private_rcu(struct net_device *dev,
6071 struct list_head **iter)
6072{
6073 struct netdev_adjacent *lower;
6074
6075 WARN_ON_ONCE(!rcu_read_lock_held());
6076
6077 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6078
6079 if (&lower->list == &dev->adj_list.lower)
6080 return NULL;
6081
6082 *iter = &lower->list;
6083
6084 return lower->private;
6085}
6086EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
6087
6088/**
6089 * netdev_lower_get_next - Get the next device from the lower neighbour
6090 * list
6091 * @dev: device
6092 * @iter: list_head ** of the current position
6093 *
6094 * Gets the next netdev_adjacent from the dev's lower neighbour
6095 * list, starting from iter position. The caller must hold RTNL lock or
6096 * its own locking that guarantees that the neighbour lower
6097 * list will remain unchanged.
6098 */
6099void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
6100{
6101 struct netdev_adjacent *lower;
6102
6103 lower = list_entry(*iter, struct netdev_adjacent, list);
6104
6105 if (&lower->list == &dev->adj_list.lower)
6106 return NULL;
6107
6108 *iter = lower->list.next;
6109
6110 return lower->dev;
6111}
6112EXPORT_SYMBOL(netdev_lower_get_next);
6113
6114static struct net_device *netdev_next_lower_dev(struct net_device *dev,
6115 struct list_head **iter)
6116{
6117 struct netdev_adjacent *lower;
6118
6119 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
6120
6121 if (&lower->list == &dev->adj_list.lower)
6122 return NULL;
6123
6124 *iter = &lower->list;
6125
6126 return lower->dev;
6127}
6128
6129int netdev_walk_all_lower_dev(struct net_device *dev,
6130 int (*fn)(struct net_device *dev,
6131 void *data),
6132 void *data)
6133{
6134 struct net_device *ldev;
6135 struct list_head *iter;
6136 int ret;
6137
6138 for (iter = &dev->adj_list.lower,
6139 ldev = netdev_next_lower_dev(dev, &iter);
6140 ldev;
6141 ldev = netdev_next_lower_dev(dev, &iter)) {
6142 /* first is the lower device itself */
6143 ret = fn(ldev, data);
6144 if (ret)
6145 return ret;
6146
6147 /* then look at all of its lower devices */
6148 ret = netdev_walk_all_lower_dev(ldev, fn, data);
6149 if (ret)
6150 return ret;
6151 }
6152
6153 return 0;
6154}
6155EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
6156
6157static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
6158 struct list_head **iter)
6159{
6160 struct netdev_adjacent *lower;
6161
6162 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
6163 if (&lower->list == &dev->adj_list.lower)
6164 return NULL;
6165
6166 *iter = &lower->list;
6167
6168 return lower->dev;
6169}
6170
6171int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
6172 int (*fn)(struct net_device *dev,
6173 void *data),
6174 void *data)
6175{
6176 struct net_device *ldev;
6177 struct list_head *iter;
6178 int ret;
6179
6180 for (iter = &dev->adj_list.lower,
6181 ldev = netdev_next_lower_dev_rcu(dev, &iter);
6182 ldev;
6183 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
6184 /* first is the lower device itself */
6185 ret = fn(ldev, data);
6186 if (ret)
6187 return ret;
6188
6189 /* then look at all of its lower devices */
6190 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
6191 if (ret)
6192 return ret;
6193 }
6194
6195 return 0;
6196}
6197EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
6198
6199/**
6200 * netdev_lower_get_first_private_rcu - Get the first ->private from the
6201 * lower neighbour list, RCU
6202 * variant
6203 * @dev: device
6204 *
6205 * Gets the first netdev_adjacent->private from the dev's lower neighbour
6206 * list. The caller must hold RCU read lock.
6207 */
6208void *netdev_lower_get_first_private_rcu(struct net_device *dev)
6209{
6210 struct netdev_adjacent *lower;
6211
6212 lower = list_first_or_null_rcu(&dev->adj_list.lower,
6213 struct netdev_adjacent, list);
6214 if (lower)
6215 return lower->private;
6216 return NULL;
6217}
6218EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
6219
6220/**
6221 * netdev_master_upper_dev_get_rcu - Get master upper device
6222 * @dev: device
6223 *
6224 * Find a master upper device and return pointer to it or NULL in case
6225 * it's not there. The caller must hold the RCU read lock.
6226 */
6227struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
6228{
6229 struct netdev_adjacent *upper;
6230
6231 upper = list_first_or_null_rcu(&dev->adj_list.upper,
6232 struct netdev_adjacent, list);
6233 if (upper && likely(upper->master))
6234 return upper->dev;
6235 return NULL;
6236}
6237EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
6238
6239static int netdev_adjacent_sysfs_add(struct net_device *dev,
6240 struct net_device *adj_dev,
6241 struct list_head *dev_list)
6242{
6243 char linkname[IFNAMSIZ+7];
6244
6245 sprintf(linkname, dev_list == &dev->adj_list.upper ?
6246 "upper_%s" : "lower_%s", adj_dev->name);
6247 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
6248 linkname);
6249}
6250static void netdev_adjacent_sysfs_del(struct net_device *dev,
6251 char *name,
6252 struct list_head *dev_list)
6253{
6254 char linkname[IFNAMSIZ+7];
6255
6256 sprintf(linkname, dev_list == &dev->adj_list.upper ?
6257 "upper_%s" : "lower_%s", name);
6258 sysfs_remove_link(&(dev->dev.kobj), linkname);
6259}
6260
6261static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
6262 struct net_device *adj_dev,
6263 struct list_head *dev_list)
6264{
6265 return (dev_list == &dev->adj_list.upper ||
6266 dev_list == &dev->adj_list.lower) &&
6267 net_eq(dev_net(dev), dev_net(adj_dev));
6268}
6269
6270static int __netdev_adjacent_dev_insert(struct net_device *dev,
6271 struct net_device *adj_dev,
6272 struct list_head *dev_list,
6273 void *private, bool master)
6274{
6275 struct netdev_adjacent *adj;
6276 int ret;
6277
6278 adj = __netdev_find_adj(adj_dev, dev_list);
6279
6280 if (adj) {
6281 adj->ref_nr += 1;
6282 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
6283 dev->name, adj_dev->name, adj->ref_nr);
6284
6285 return 0;
6286 }
6287
6288 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
6289 if (!adj)
6290 return -ENOMEM;
6291
6292 adj->dev = adj_dev;
6293 adj->master = master;
6294 adj->ref_nr = 1;
6295 adj->private = private;
6296 dev_hold(adj_dev);
6297
6298 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
6299 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
6300
6301 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
6302 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
6303 if (ret)
6304 goto free_adj;
6305 }
6306
6307 /* Ensure that master link is always the first item in list. */
6308 if (master) {
6309 ret = sysfs_create_link(&(dev->dev.kobj),
6310 &(adj_dev->dev.kobj), "master");
6311 if (ret)
6312 goto remove_symlinks;
6313
6314 list_add_rcu(&adj->list, dev_list);
6315 } else {
6316 list_add_tail_rcu(&adj->list, dev_list);
6317 }
6318
6319 return 0;
6320
6321remove_symlinks:
6322 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6323 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6324free_adj:
6325 kfree(adj);
6326 dev_put(adj_dev);
6327
6328 return ret;
6329}
6330
6331static void __netdev_adjacent_dev_remove(struct net_device *dev,
6332 struct net_device *adj_dev,
6333 u16 ref_nr,
6334 struct list_head *dev_list)
6335{
6336 struct netdev_adjacent *adj;
6337
6338 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
6339 dev->name, adj_dev->name, ref_nr);
6340
6341 adj = __netdev_find_adj(adj_dev, dev_list);
6342
6343 if (!adj) {
6344 pr_err("Adjacency does not exist for device %s from %s\n",
6345 dev->name, adj_dev->name);
6346 WARN_ON(1);
6347 return;
6348 }
6349
6350 if (adj->ref_nr > ref_nr) {
6351 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
6352 dev->name, adj_dev->name, ref_nr,
6353 adj->ref_nr - ref_nr);
6354 adj->ref_nr -= ref_nr;
6355 return;
6356 }
6357
6358 if (adj->master)
6359 sysfs_remove_link(&(dev->dev.kobj), "master");
6360
6361 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
6362 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
6363
6364 list_del_rcu(&adj->list);
6365 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
6366 adj_dev->name, dev->name, adj_dev->name);
6367 dev_put(adj_dev);
6368 kfree_rcu(adj, rcu);
6369}
6370
6371static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
6372 struct net_device *upper_dev,
6373 struct list_head *up_list,
6374 struct list_head *down_list,
6375 void *private, bool master)
6376{
6377 int ret;
6378
6379 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
6380 private, master);
6381 if (ret)
6382 return ret;
6383
6384 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
6385 private, false);
6386 if (ret) {
6387 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
6388 return ret;
6389 }
6390
6391 return 0;
6392}
6393
6394static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
6395 struct net_device *upper_dev,
6396 u16 ref_nr,
6397 struct list_head *up_list,
6398 struct list_head *down_list)
6399{
6400 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
6401 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
6402}
6403
6404static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
6405 struct net_device *upper_dev,
6406 void *private, bool master)
6407{
6408 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
6409 &dev->adj_list.upper,
6410 &upper_dev->adj_list.lower,
6411 private, master);
6412}
6413
6414static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
6415 struct net_device *upper_dev)
6416{
6417 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
6418 &dev->adj_list.upper,
6419 &upper_dev->adj_list.lower);
6420}
6421
6422static int __netdev_upper_dev_link(struct net_device *dev,
6423 struct net_device *upper_dev, bool master,
6424 void *upper_priv, void *upper_info,
6425 struct netlink_ext_ack *extack)
6426{
6427 struct netdev_notifier_changeupper_info changeupper_info = {
6428 .info = {
6429 .dev = dev,
6430 .extack = extack,
6431 },
6432 .upper_dev = upper_dev,
6433 .master = master,
6434 .linking = true,
6435 .upper_info = upper_info,
6436 };
6437 struct net_device *master_dev;
6438 int ret = 0;
6439
6440 ASSERT_RTNL();
6441
6442 if (dev == upper_dev)
6443 return -EBUSY;
6444
6445 /* To prevent loops, check if dev is not upper device to upper_dev. */
6446 if (netdev_has_upper_dev(upper_dev, dev))
6447 return -EBUSY;
6448
6449 if (!master) {
6450 if (netdev_has_upper_dev(dev, upper_dev))
6451 return -EEXIST;
6452 } else {
6453 master_dev = netdev_master_upper_dev_get(dev);
6454 if (master_dev)
6455 return master_dev == upper_dev ? -EEXIST : -EBUSY;
6456 }
6457
6458 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
6459 &changeupper_info.info);
6460 ret = notifier_to_errno(ret);
6461 if (ret)
6462 return ret;
6463
6464 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6465 master);
6466 if (ret)
6467 return ret;
6468
6469 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
6470 &changeupper_info.info);
6471 ret = notifier_to_errno(ret);
6472 if (ret)
6473 goto rollback;
6474
6475 return 0;
6476
6477rollback:
6478 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6479
6480 return ret;
6481}
6482
6483/**
6484 * netdev_upper_dev_link - Add a link to the upper device
6485 * @dev: device
6486 * @upper_dev: new upper device
6487 * @extack: netlink extended ack
6488 *
6489 * Adds a link to device which is upper to this one. The caller must hold
6490 * the RTNL lock. On a failure a negative errno code is returned.
6491 * On success the reference counts are adjusted and the function
6492 * returns zero.
6493 */
6494int netdev_upper_dev_link(struct net_device *dev,
6495 struct net_device *upper_dev,
6496 struct netlink_ext_ack *extack)
6497{
6498 return __netdev_upper_dev_link(dev, upper_dev, false,
6499 NULL, NULL, extack);
6500}
6501EXPORT_SYMBOL(netdev_upper_dev_link);
6502
6503/**
6504 * netdev_master_upper_dev_link - Add a master link to the upper device
6505 * @dev: device
6506 * @upper_dev: new upper device
6507 * @upper_priv: upper device private
6508 * @upper_info: upper info to be passed down via notifier
6509 * @extack: netlink extended ack
6510 *
6511 * Adds a link to device which is upper to this one. In this case, only
6512 * one master upper device can be linked, although other non-master devices
6513 * might be linked as well. The caller must hold the RTNL lock.
6514 * On a failure a negative errno code is returned. On success the reference
6515 * counts are adjusted and the function returns zero.
6516 */
6517int netdev_master_upper_dev_link(struct net_device *dev,
6518 struct net_device *upper_dev,
6519 void *upper_priv, void *upper_info,
6520 struct netlink_ext_ack *extack)
6521{
6522 return __netdev_upper_dev_link(dev, upper_dev, true,
6523 upper_priv, upper_info, extack);
6524}
6525EXPORT_SYMBOL(netdev_master_upper_dev_link);
6526
6527/**
6528 * netdev_upper_dev_unlink - Removes a link to upper device
6529 * @dev: device
6530 * @upper_dev: new upper device
6531 *
6532 * Removes a link to device which is upper to this one. The caller must hold
6533 * the RTNL lock.
6534 */
6535void netdev_upper_dev_unlink(struct net_device *dev,
6536 struct net_device *upper_dev)
6537{
6538 struct netdev_notifier_changeupper_info changeupper_info = {
6539 .info = {
6540 .dev = dev,
6541 },
6542 .upper_dev = upper_dev,
6543 .linking = false,
6544 };
6545
6546 ASSERT_RTNL();
6547
6548 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6549
6550 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
6551 &changeupper_info.info);
6552
6553 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6554
6555 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
6556 &changeupper_info.info);
6557}
6558EXPORT_SYMBOL(netdev_upper_dev_unlink);
6559
6560/**
6561 * netdev_bonding_info_change - Dispatch event about slave change
6562 * @dev: device
6563 * @bonding_info: info to dispatch
6564 *
6565 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6566 * The caller must hold the RTNL lock.
6567 */
6568void netdev_bonding_info_change(struct net_device *dev,
6569 struct netdev_bonding_info *bonding_info)
6570{
6571 struct netdev_notifier_bonding_info info = {
6572 .info.dev = dev,
6573 };
6574
6575 memcpy(&info.bonding_info, bonding_info,
6576 sizeof(struct netdev_bonding_info));
6577 call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
6578 &info.info);
6579}
6580EXPORT_SYMBOL(netdev_bonding_info_change);
6581
6582static void netdev_adjacent_add_links(struct net_device *dev)
6583{
6584 struct netdev_adjacent *iter;
6585
6586 struct net *net = dev_net(dev);
6587
6588 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6589 if (!net_eq(net, dev_net(iter->dev)))
6590 continue;
6591 netdev_adjacent_sysfs_add(iter->dev, dev,
6592 &iter->dev->adj_list.lower);
6593 netdev_adjacent_sysfs_add(dev, iter->dev,
6594 &dev->adj_list.upper);
6595 }
6596
6597 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6598 if (!net_eq(net, dev_net(iter->dev)))
6599 continue;
6600 netdev_adjacent_sysfs_add(iter->dev, dev,
6601 &iter->dev->adj_list.upper);
6602 netdev_adjacent_sysfs_add(dev, iter->dev,
6603 &dev->adj_list.lower);
6604 }
6605}
6606
6607static void netdev_adjacent_del_links(struct net_device *dev)
6608{
6609 struct netdev_adjacent *iter;
6610
6611 struct net *net = dev_net(dev);
6612
6613 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6614 if (!net_eq(net, dev_net(iter->dev)))
6615 continue;
6616 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6617 &iter->dev->adj_list.lower);
6618 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6619 &dev->adj_list.upper);
6620 }
6621
6622 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6623 if (!net_eq(net, dev_net(iter->dev)))
6624 continue;
6625 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6626 &iter->dev->adj_list.upper);
6627 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6628 &dev->adj_list.lower);
6629 }
6630}
6631
6632void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6633{
6634 struct netdev_adjacent *iter;
6635
6636 struct net *net = dev_net(dev);
6637
6638 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6639 if (!net_eq(net, dev_net(iter->dev)))
6640 continue;
6641 netdev_adjacent_sysfs_del(iter->dev, oldname,
6642 &iter->dev->adj_list.lower);
6643 netdev_adjacent_sysfs_add(iter->dev, dev,
6644 &iter->dev->adj_list.lower);
6645 }
6646
6647 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6648 if (!net_eq(net, dev_net(iter->dev)))
6649 continue;
6650 netdev_adjacent_sysfs_del(iter->dev, oldname,
6651 &iter->dev->adj_list.upper);
6652 netdev_adjacent_sysfs_add(iter->dev, dev,
6653 &iter->dev->adj_list.upper);
6654 }
6655}
6656
6657void *netdev_lower_dev_get_private(struct net_device *dev,
6658 struct net_device *lower_dev)
6659{
6660 struct netdev_adjacent *lower;
6661
6662 if (!lower_dev)
6663 return NULL;
6664 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6665 if (!lower)
6666 return NULL;
6667
6668 return lower->private;
6669}
6670EXPORT_SYMBOL(netdev_lower_dev_get_private);
6671
6672
6673int dev_get_nest_level(struct net_device *dev)
6674{
6675 struct net_device *lower = NULL;
6676 struct list_head *iter;
6677 int max_nest = -1;
6678 int nest;
6679
6680 ASSERT_RTNL();
6681
6682 netdev_for_each_lower_dev(dev, lower, iter) {
6683 nest = dev_get_nest_level(lower);
6684 if (max_nest < nest)
6685 max_nest = nest;
6686 }
6687
6688 return max_nest + 1;
6689}
6690EXPORT_SYMBOL(dev_get_nest_level);
6691
6692/**
6693 * netdev_lower_change - Dispatch event about lower device state change
6694 * @lower_dev: device
6695 * @lower_state_info: state to dispatch
6696 *
6697 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6698 * The caller must hold the RTNL lock.
6699 */
6700void netdev_lower_state_changed(struct net_device *lower_dev,
6701 void *lower_state_info)
6702{
6703 struct netdev_notifier_changelowerstate_info changelowerstate_info = {
6704 .info.dev = lower_dev,
6705 };
6706
6707 ASSERT_RTNL();
6708 changelowerstate_info.lower_state_info = lower_state_info;
6709 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
6710 &changelowerstate_info.info);
6711}
6712EXPORT_SYMBOL(netdev_lower_state_changed);
6713
6714static void dev_change_rx_flags(struct net_device *dev, int flags)
6715{
6716 const struct net_device_ops *ops = dev->netdev_ops;
6717
6718 if (ops->ndo_change_rx_flags)
6719 ops->ndo_change_rx_flags(dev, flags);
6720}
6721
6722static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6723{
6724 unsigned int old_flags = dev->flags;
6725 kuid_t uid;
6726 kgid_t gid;
6727
6728 ASSERT_RTNL();
6729
6730 dev->flags |= IFF_PROMISC;
6731 dev->promiscuity += inc;
6732 if (dev->promiscuity == 0) {
6733 /*
6734 * Avoid overflow.
6735 * If inc causes overflow, untouch promisc and return error.
6736 */
6737 if (inc < 0)
6738 dev->flags &= ~IFF_PROMISC;
6739 else {
6740 dev->promiscuity -= inc;
6741 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6742 dev->name);
6743 return -EOVERFLOW;
6744 }
6745 }
6746 if (dev->flags != old_flags) {
6747 pr_info("device %s %s promiscuous mode\n",
6748 dev->name,
6749 dev->flags & IFF_PROMISC ? "entered" : "left");
6750 if (audit_enabled) {
6751 current_uid_gid(&uid, &gid);
6752 audit_log(current->audit_context, GFP_ATOMIC,
6753 AUDIT_ANOM_PROMISCUOUS,
6754 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6755 dev->name, (dev->flags & IFF_PROMISC),
6756 (old_flags & IFF_PROMISC),
6757 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6758 from_kuid(&init_user_ns, uid),
6759 from_kgid(&init_user_ns, gid),
6760 audit_get_sessionid(current));
6761 }
6762
6763 dev_change_rx_flags(dev, IFF_PROMISC);
6764 }
6765 if (notify)
6766 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6767 return 0;
6768}
6769
6770/**
6771 * dev_set_promiscuity - update promiscuity count on a device
6772 * @dev: device
6773 * @inc: modifier
6774 *
6775 * Add or remove promiscuity from a device. While the count in the device
6776 * remains above zero the interface remains promiscuous. Once it hits zero
6777 * the device reverts back to normal filtering operation. A negative inc
6778 * value is used to drop promiscuity on the device.
6779 * Return 0 if successful or a negative errno code on error.
6780 */
6781int dev_set_promiscuity(struct net_device *dev, int inc)
6782{
6783 unsigned int old_flags = dev->flags;
6784 int err;
6785
6786 err = __dev_set_promiscuity(dev, inc, true);
6787 if (err < 0)
6788 return err;
6789 if (dev->flags != old_flags)
6790 dev_set_rx_mode(dev);
6791 return err;
6792}
6793EXPORT_SYMBOL(dev_set_promiscuity);
6794
6795static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6796{
6797 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6798
6799 ASSERT_RTNL();
6800
6801 dev->flags |= IFF_ALLMULTI;
6802 dev->allmulti += inc;
6803 if (dev->allmulti == 0) {
6804 /*
6805 * Avoid overflow.
6806 * If inc causes overflow, untouch allmulti and return error.
6807 */
6808 if (inc < 0)
6809 dev->flags &= ~IFF_ALLMULTI;
6810 else {
6811 dev->allmulti -= inc;
6812 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6813 dev->name);
6814 return -EOVERFLOW;
6815 }
6816 }
6817 if (dev->flags ^ old_flags) {
6818 dev_change_rx_flags(dev, IFF_ALLMULTI);
6819 dev_set_rx_mode(dev);
6820 if (notify)
6821 __dev_notify_flags(dev, old_flags,
6822 dev->gflags ^ old_gflags);
6823 }
6824 return 0;
6825}
6826
6827/**
6828 * dev_set_allmulti - update allmulti count on a device
6829 * @dev: device
6830 * @inc: modifier
6831 *
6832 * Add or remove reception of all multicast frames to a device. While the
6833 * count in the device remains above zero the interface remains listening
6834 * to all interfaces. Once it hits zero the device reverts back to normal
6835 * filtering operation. A negative @inc value is used to drop the counter
6836 * when releasing a resource needing all multicasts.
6837 * Return 0 if successful or a negative errno code on error.
6838 */
6839
6840int dev_set_allmulti(struct net_device *dev, int inc)
6841{
6842 return __dev_set_allmulti(dev, inc, true);
6843}
6844EXPORT_SYMBOL(dev_set_allmulti);
6845
6846/*
6847 * Upload unicast and multicast address lists to device and
6848 * configure RX filtering. When the device doesn't support unicast
6849 * filtering it is put in promiscuous mode while unicast addresses
6850 * are present.
6851 */
6852void __dev_set_rx_mode(struct net_device *dev)
6853{
6854 const struct net_device_ops *ops = dev->netdev_ops;
6855
6856 /* dev_open will call this function so the list will stay sane. */
6857 if (!(dev->flags&IFF_UP))
6858 return;
6859
6860 if (!netif_device_present(dev))
6861 return;
6862
6863 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6864 /* Unicast addresses changes may only happen under the rtnl,
6865 * therefore calling __dev_set_promiscuity here is safe.
6866 */
6867 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6868 __dev_set_promiscuity(dev, 1, false);
6869 dev->uc_promisc = true;
6870 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6871 __dev_set_promiscuity(dev, -1, false);
6872 dev->uc_promisc = false;
6873 }
6874 }
6875
6876 if (ops->ndo_set_rx_mode)
6877 ops->ndo_set_rx_mode(dev);
6878}
6879
/*
 * Locked variant of __dev_set_rx_mode(): the update runs with the
 * device address lock taken via netif_addr_lock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6886
6887/**
6888 * dev_get_flags - get flags reported to userspace
6889 * @dev: device
6890 *
6891 * Get the combination of flag bits exported through APIs to userspace.
6892 */
6893unsigned int dev_get_flags(const struct net_device *dev)
6894{
6895 unsigned int flags;
6896
6897 flags = (dev->flags & ~(IFF_PROMISC |
6898 IFF_ALLMULTI |
6899 IFF_RUNNING |
6900 IFF_LOWER_UP |
6901 IFF_DORMANT)) |
6902 (dev->gflags & (IFF_PROMISC |
6903 IFF_ALLMULTI));
6904
6905 if (netif_running(dev)) {
6906 if (netif_oper_up(dev))
6907 flags |= IFF_RUNNING;
6908 if (netif_carrier_ok(dev))
6909 flags |= IFF_LOWER_UP;
6910 if (netif_dormant(dev))
6911 flags |= IFF_DORMANT;
6912 }
6913
6914 return flags;
6915}
6916EXPORT_SYMBOL(dev_get_flags);
6917
6918int __dev_change_flags(struct net_device *dev, unsigned int flags)
6919{
6920 unsigned int old_flags = dev->flags;
6921 int ret;
6922
6923 ASSERT_RTNL();
6924
6925 /*
6926 * Set the flags on our device.
6927 */
6928
6929 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6930 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6931 IFF_AUTOMEDIA)) |
6932 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6933 IFF_ALLMULTI));
6934
6935 /*
6936 * Load in the correct multicast list now the flags have changed.
6937 */
6938
6939 if ((old_flags ^ flags) & IFF_MULTICAST)
6940 dev_change_rx_flags(dev, IFF_MULTICAST);
6941
6942 dev_set_rx_mode(dev);
6943
6944 /*
6945 * Have we downed the interface. We handle IFF_UP ourselves
6946 * according to user attempts to set it, rather than blindly
6947 * setting it.
6948 */
6949
6950 ret = 0;
6951 if ((old_flags ^ flags) & IFF_UP) {
6952 if (old_flags & IFF_UP)
6953 __dev_close(dev);
6954 else
6955 ret = __dev_open(dev);
6956 }
6957
6958 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6959 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6960 unsigned int old_flags = dev->flags;
6961
6962 dev->gflags ^= IFF_PROMISC;
6963
6964 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6965 if (dev->flags != old_flags)
6966 dev_set_rx_mode(dev);
6967 }
6968
6969 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6970 * is important. Some (broken) drivers set IFF_PROMISC, when
6971 * IFF_ALLMULTI is requested not asking us and not reporting.
6972 */
6973 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6974 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6975
6976 dev->gflags ^= IFF_ALLMULTI;
6977 __dev_set_allmulti(dev, inc, false);
6978 }
6979
6980 return ret;
6981}
6982
6983void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6984 unsigned int gchanges)
6985{
6986 unsigned int changes = dev->flags ^ old_flags;
6987
6988 if (gchanges)
6989 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6990
6991 if (changes & IFF_UP) {
6992 if (dev->flags & IFF_UP)
6993 call_netdevice_notifiers(NETDEV_UP, dev);
6994 else
6995 call_netdevice_notifiers(NETDEV_DOWN, dev);
6996 }
6997
6998 if (dev->flags & IFF_UP &&
6999 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
7000 struct netdev_notifier_change_info change_info = {
7001 .info = {
7002 .dev = dev,
7003 },
7004 .flags_changed = changes,
7005 };
7006
7007 call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
7008 }
7009}
7010
7011/**
7012 * dev_change_flags - change device settings
7013 * @dev: device
7014 * @flags: device state flags
7015 *
7016 * Change settings on device based state flags. The flags are
7017 * in the userspace exported format.
7018 */
7019int dev_change_flags(struct net_device *dev, unsigned int flags)
7020{
7021 int ret;
7022 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
7023
7024 ret = __dev_change_flags(dev, flags);
7025 if (ret < 0)
7026 return ret;
7027
7028 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
7029 __dev_notify_flags(dev, old_flags, changes);
7030 return ret;
7031}
7032EXPORT_SYMBOL(dev_change_flags);
7033
7034int __dev_set_mtu(struct net_device *dev, int new_mtu)
7035{
7036 const struct net_device_ops *ops = dev->netdev_ops;
7037
7038 if (ops->ndo_change_mtu)
7039 return ops->ndo_change_mtu(dev, new_mtu);
7040
7041 dev->mtu = new_mtu;
7042 return 0;
7043}
7044EXPORT_SYMBOL(__dev_set_mtu);
7045
7046/**
7047 * dev_set_mtu - Change maximum transfer unit
7048 * @dev: device
7049 * @new_mtu: new transfer unit
7050 *
7051 * Change the maximum transfer size of the network device.
7052 */
7053int dev_set_mtu(struct net_device *dev, int new_mtu)
7054{
7055 int err, orig_mtu;
7056
7057 if (new_mtu == dev->mtu)
7058 return 0;
7059
7060 /* MTU must be positive, and in range */
7061 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
7062 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
7063 dev->name, new_mtu, dev->min_mtu);
7064 return -EINVAL;
7065 }
7066
7067 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
7068 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
7069 dev->name, new_mtu, dev->max_mtu);
7070 return -EINVAL;
7071 }
7072
7073 if (!netif_device_present(dev))
7074 return -ENODEV;
7075
7076 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
7077 err = notifier_to_errno(err);
7078 if (err)
7079 return err;
7080
7081 orig_mtu = dev->mtu;
7082 err = __dev_set_mtu(dev, new_mtu);
7083
7084 if (!err) {
7085 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
7086 err = notifier_to_errno(err);
7087 if (err) {
7088 /* setting mtu back and notifying everyone again,
7089 * so that they have a chance to revert changes.
7090 */
7091 __dev_set_mtu(dev, orig_mtu);
7092 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
7093 }
7094 }
7095 return err;
7096}
7097EXPORT_SYMBOL(dev_set_mtu);
7098
7099/**
7100 * dev_change_tx_queue_len - Change TX queue length of a netdevice
7101 * @dev: device
7102 * @new_len: new tx queue length
7103 */
7104int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
7105{
7106 unsigned int orig_len = dev->tx_queue_len;
7107 int res;
7108
7109 if (new_len != (unsigned int)new_len)
7110 return -ERANGE;
7111
7112 if (new_len != orig_len) {
7113 dev->tx_queue_len = new_len;
7114 res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
7115 res = notifier_to_errno(res);
7116 if (res) {
7117 netdev_err(dev,
7118 "refused to change device tx_queue_len\n");
7119 dev->tx_queue_len = orig_len;
7120 return res;
7121 }
7122 return dev_qdisc_change_tx_queue_len(dev);
7123 }
7124
7125 return 0;
7126}
7127
7128/**
7129 * dev_set_group - Change group this device belongs to
7130 * @dev: device
7131 * @new_group: group this device should belong to
7132 */
7133void dev_set_group(struct net_device *dev, int new_group)
7134{
7135 dev->group = new_group;
7136}
7137EXPORT_SYMBOL(dev_set_group);
7138
7139/**
7140 * dev_set_mac_address - Change Media Access Control Address
7141 * @dev: device
7142 * @sa: new address
7143 *
7144 * Change the hardware (MAC) address of the device
7145 */
7146int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
7147{
7148 const struct net_device_ops *ops = dev->netdev_ops;
7149 int err;
7150
7151 if (!ops->ndo_set_mac_address)
7152 return -EOPNOTSUPP;
7153 if (sa->sa_family != dev->type)
7154 return -EINVAL;
7155 if (!netif_device_present(dev))
7156 return -ENODEV;
7157 err = ops->ndo_set_mac_address(dev, sa);
7158 if (err)
7159 return err;
7160 dev->addr_assign_type = NET_ADDR_SET;
7161 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7162 add_device_randomness(dev->dev_addr, dev->addr_len);
7163 return 0;
7164}
7165EXPORT_SYMBOL(dev_set_mac_address);
7166
7167/**
7168 * dev_change_carrier - Change device carrier
7169 * @dev: device
7170 * @new_carrier: new value
7171 *
7172 * Change device carrier
7173 */
7174int dev_change_carrier(struct net_device *dev, bool new_carrier)
7175{
7176 const struct net_device_ops *ops = dev->netdev_ops;
7177
7178 if (!ops->ndo_change_carrier)
7179 return -EOPNOTSUPP;
7180 if (!netif_device_present(dev))
7181 return -ENODEV;
7182 return ops->ndo_change_carrier(dev, new_carrier);
7183}
7184EXPORT_SYMBOL(dev_change_carrier);
7185
7186/**
7187 * dev_get_phys_port_id - Get device physical port ID
7188 * @dev: device
7189 * @ppid: port ID
7190 *
7191 * Get device physical port ID
7192 */
7193int dev_get_phys_port_id(struct net_device *dev,
7194 struct netdev_phys_item_id *ppid)
7195{
7196 const struct net_device_ops *ops = dev->netdev_ops;
7197
7198 if (!ops->ndo_get_phys_port_id)
7199 return -EOPNOTSUPP;
7200 return ops->ndo_get_phys_port_id(dev, ppid);
7201}
7202EXPORT_SYMBOL(dev_get_phys_port_id);
7203
7204/**
7205 * dev_get_phys_port_name - Get device physical port name
7206 * @dev: device
7207 * @name: port name
7208 * @len: limit of bytes to copy to name
7209 *
7210 * Get device physical port name
7211 */
7212int dev_get_phys_port_name(struct net_device *dev,
7213 char *name, size_t len)
7214{
7215 const struct net_device_ops *ops = dev->netdev_ops;
7216
7217 if (!ops->ndo_get_phys_port_name)
7218 return -EOPNOTSUPP;
7219 return ops->ndo_get_phys_port_name(dev, name, len);
7220}
7221EXPORT_SYMBOL(dev_get_phys_port_name);
7222
7223/**
7224 * dev_change_proto_down - update protocol port state information
7225 * @dev: device
7226 * @proto_down: new value
7227 *
7228 * This info can be used by switch drivers to set the phys state of the
7229 * port.
7230 */
7231int dev_change_proto_down(struct net_device *dev, bool proto_down)
7232{
7233 const struct net_device_ops *ops = dev->netdev_ops;
7234
7235 if (!ops->ndo_change_proto_down)
7236 return -EOPNOTSUPP;
7237 if (!netif_device_present(dev))
7238 return -ENODEV;
7239 return ops->ndo_change_proto_down(dev, proto_down);
7240}
7241EXPORT_SYMBOL(dev_change_proto_down);
7242
7243void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
7244 struct netdev_bpf *xdp)
7245{
7246 memset(xdp, 0, sizeof(*xdp));
7247 xdp->command = XDP_QUERY_PROG;
7248
7249 /* Query must always succeed. */
7250 WARN_ON(bpf_op(dev, xdp) < 0);
7251}
7252
7253static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
7254{
7255 struct netdev_bpf xdp;
7256
7257 __dev_xdp_query(dev, bpf_op, &xdp);
7258
7259 return xdp.prog_attached;
7260}
7261
7262static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
7263 struct netlink_ext_ack *extack, u32 flags,
7264 struct bpf_prog *prog)
7265{
7266 struct netdev_bpf xdp;
7267
7268 memset(&xdp, 0, sizeof(xdp));
7269 if (flags & XDP_FLAGS_HW_MODE)
7270 xdp.command = XDP_SETUP_PROG_HW;
7271 else
7272 xdp.command = XDP_SETUP_PROG;
7273 xdp.extack = extack;
7274 xdp.flags = flags;
7275 xdp.prog = prog;
7276
7277 return bpf_op(dev, &xdp);
7278}
7279
7280static void dev_xdp_uninstall(struct net_device *dev)
7281{
7282 struct netdev_bpf xdp;
7283 bpf_op_t ndo_bpf;
7284
7285 /* Remove generic XDP */
7286 WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
7287
7288 /* Remove from the driver */
7289 ndo_bpf = dev->netdev_ops->ndo_bpf;
7290 if (!ndo_bpf)
7291 return;
7292
7293 __dev_xdp_query(dev, ndo_bpf, &xdp);
7294 if (xdp.prog_attached == XDP_ATTACHED_NONE)
7295 return;
7296
7297 /* Program removal should always succeed */
7298 WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
7299}
7300
7301/**
7302 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
7303 * @dev: device
7304 * @extack: netlink extended ack
7305 * @fd: new program fd or negative value to clear
7306 * @flags: xdp-related flags
7307 *
7308 * Set or clear a bpf program for a device
7309 */
7310int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
7311 int fd, u32 flags)
7312{
7313 const struct net_device_ops *ops = dev->netdev_ops;
7314 struct bpf_prog *prog = NULL;
7315 bpf_op_t bpf_op, bpf_chk;
7316 int err;
7317
7318 ASSERT_RTNL();
7319
7320 bpf_op = bpf_chk = ops->ndo_bpf;
7321 if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
7322 return -EOPNOTSUPP;
7323 if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
7324 bpf_op = generic_xdp_install;
7325 if (bpf_op == bpf_chk)
7326 bpf_chk = generic_xdp_install;
7327
7328 if (fd >= 0) {
7329 if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
7330 return -EEXIST;
7331 if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
7332 __dev_xdp_attached(dev, bpf_op))
7333 return -EBUSY;
7334
7335 prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
7336 bpf_op == ops->ndo_bpf);
7337 if (IS_ERR(prog))
7338 return PTR_ERR(prog);
7339
7340 if (!(flags & XDP_FLAGS_HW_MODE) &&
7341 bpf_prog_is_dev_bound(prog->aux)) {
7342 NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
7343 bpf_prog_put(prog);
7344 return -EINVAL;
7345 }
7346 }
7347
7348 err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
7349 if (err < 0 && prog)
7350 bpf_prog_put(prog);
7351
7352 return err;
7353}
7354
7355/**
7356 * dev_new_index - allocate an ifindex
7357 * @net: the applicable net namespace
7358 *
7359 * Returns a suitable unique value for a new device interface
7360 * number. The caller must hold the rtnl semaphore or the
7361 * dev_base_lock to be sure it remains unique.
7362 */
7363static int dev_new_index(struct net *net)
7364{
7365 int ifindex = net->ifindex;
7366
7367 for (;;) {
7368 if (++ifindex <= 0)
7369 ifindex = 1;
7370 if (!__dev_get_by_index(net, ifindex))
7371 return net->ifindex = ifindex;
7372 }
7373}
7374
/* Delayed registration/unregistration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
7378
7379static void net_set_todo(struct net_device *dev)
7380{
7381 list_add_tail(&dev->todo_list, &net_todo_list);
7382 dev_net(dev)->dev_unreg_count++;
7383}
7384
7385static void rollback_registered_many(struct list_head *head)
7386{
7387 struct net_device *dev, *tmp;
7388 LIST_HEAD(close_head);
7389
7390 BUG_ON(dev_boot_phase);
7391 ASSERT_RTNL();
7392
7393 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
7394 /* Some devices call without registering
7395 * for initialization unwind. Remove those
7396 * devices and proceed with the remaining.
7397 */
7398 if (dev->reg_state == NETREG_UNINITIALIZED) {
7399 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
7400 dev->name, dev);
7401
7402 WARN_ON(1);
7403 list_del(&dev->unreg_list);
7404 continue;
7405 }
7406 dev->dismantle = true;
7407 BUG_ON(dev->reg_state != NETREG_REGISTERED);
7408 }
7409
7410 /* If device is running, close it first. */
7411 list_for_each_entry(dev, head, unreg_list)
7412 list_add_tail(&dev->close_list, &close_head);
7413 dev_close_many(&close_head, true);
7414
7415 list_for_each_entry(dev, head, unreg_list) {
7416 /* And unlink it from device chain. */
7417 unlist_netdevice(dev);
7418
7419 dev->reg_state = NETREG_UNREGISTERING;
7420 }
7421 flush_all_backlogs();
7422
7423 synchronize_net();
7424
7425 list_for_each_entry(dev, head, unreg_list) {
7426 struct sk_buff *skb = NULL;
7427
7428 /* Shutdown queueing discipline. */
7429 dev_shutdown(dev);
7430
7431 dev_xdp_uninstall(dev);
7432
7433 /* Notify protocols, that we are about to destroy
7434 * this device. They should clean all the things.
7435 */
7436 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7437
7438 if (!dev->rtnl_link_ops ||
7439 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7440 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
7441 GFP_KERNEL, NULL, 0);
7442
7443 /*
7444 * Flush the unicast and multicast chains
7445 */
7446 dev_uc_flush(dev);
7447 dev_mc_flush(dev);
7448
7449 if (dev->netdev_ops->ndo_uninit)
7450 dev->netdev_ops->ndo_uninit(dev);
7451
7452 if (skb)
7453 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
7454
7455 /* Notifier chain MUST detach us all upper devices. */
7456 WARN_ON(netdev_has_any_upper_dev(dev));
7457 WARN_ON(netdev_has_any_lower_dev(dev));
7458
7459 /* Remove entries from kobject tree */
7460 netdev_unregister_kobject(dev);
7461#ifdef CONFIG_XPS
7462 /* Remove XPS queueing entries */
7463 netif_reset_xps_queues_gt(dev, 0);
7464#endif
7465 }
7466
7467 synchronize_net();
7468
7469 list_for_each_entry(dev, head, unreg_list)
7470 dev_put(dev);
7471}
7472
7473static void rollback_registered(struct net_device *dev)
7474{
7475 LIST_HEAD(single);
7476
7477 list_add(&dev->unreg_list, &single);
7478 rollback_registered_many(&single);
7479 list_del(&single);
7480}
7481
7482static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
7483 struct net_device *upper, netdev_features_t features)
7484{
7485 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
7486 netdev_features_t feature;
7487 int feature_bit;
7488
7489 for_each_netdev_feature(&upper_disables, feature_bit) {
7490 feature = __NETIF_F_BIT(feature_bit);
7491 if (!(upper->wanted_features & feature)
7492 && (features & feature)) {
7493 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
7494 &feature, upper->name);
7495 features &= ~feature;
7496 }
7497 }
7498
7499 return features;
7500}
7501
7502static void netdev_sync_lower_features(struct net_device *upper,
7503 struct net_device *lower, netdev_features_t features)
7504{
7505 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
7506 netdev_features_t feature;
7507 int feature_bit;
7508
7509 for_each_netdev_feature(&upper_disables, feature_bit) {
7510 feature = __NETIF_F_BIT(feature_bit);
7511 if (!(features & feature) && (lower->features & feature)) {
7512 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
7513 &feature, lower->name);
7514 lower->wanted_features &= ~feature;
7515 netdev_update_features(lower);
7516
7517 if (unlikely(lower->features & feature))
7518 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
7519 &feature, lower->name);
7520 }
7521 }
7522}
7523
7524static netdev_features_t netdev_fix_features(struct net_device *dev,
7525 netdev_features_t features)
7526{
7527 /* Fix illegal checksum combinations */
7528 if ((features & NETIF_F_HW_CSUM) &&
7529 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7530 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7531 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7532 }
7533
7534 /* TSO requires that SG is present as well. */
7535 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7536 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7537 features &= ~NETIF_F_ALL_TSO;
7538 }
7539
7540 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7541 !(features & NETIF_F_IP_CSUM)) {
7542 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7543 features &= ~NETIF_F_TSO;
7544 features &= ~NETIF_F_TSO_ECN;
7545 }
7546
7547 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7548 !(features & NETIF_F_IPV6_CSUM)) {
7549 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7550 features &= ~NETIF_F_TSO6;
7551 }
7552
7553 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7554 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7555 features &= ~NETIF_F_TSO_MANGLEID;
7556
7557 /* TSO ECN requires that TSO is present as well. */
7558 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7559 features &= ~NETIF_F_TSO_ECN;
7560
7561 /* Software GSO depends on SG. */
7562 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7563 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7564 features &= ~NETIF_F_GSO;
7565 }
7566
7567 /* GSO partial features require GSO partial be set */
7568 if ((features & dev->gso_partial_features) &&
7569 !(features & NETIF_F_GSO_PARTIAL)) {
7570 netdev_dbg(dev,
7571 "Dropping partially supported GSO features since no GSO partial.\n");
7572 features &= ~dev->gso_partial_features;
7573 }
7574
7575 if (!(features & NETIF_F_RXCSUM)) {
7576 /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
7577 * successfully merged by hardware must also have the
7578 * checksum verified by hardware. If the user does not
7579 * want to enable RXCSUM, logically, we should disable GRO_HW.
7580 */
7581 if (features & NETIF_F_GRO_HW) {
7582 netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
7583 features &= ~NETIF_F_GRO_HW;
7584 }
7585 }
7586
7587 /* LRO/HW-GRO features cannot be combined with RX-FCS */
7588 if (features & NETIF_F_RXFCS) {
7589 if (features & NETIF_F_LRO) {
7590 netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
7591 features &= ~NETIF_F_LRO;
7592 }
7593
7594 if (features & NETIF_F_GRO_HW) {
7595 netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
7596 features &= ~NETIF_F_GRO_HW;
7597 }
7598 }
7599
7600 return features;
7601}
7602
7603int __netdev_update_features(struct net_device *dev)
7604{
7605 struct net_device *upper, *lower;
7606 netdev_features_t features;
7607 struct list_head *iter;
7608 int err = -1;
7609
7610 ASSERT_RTNL();
7611
7612 features = netdev_get_wanted_features(dev);
7613
7614 if (dev->netdev_ops->ndo_fix_features)
7615 features = dev->netdev_ops->ndo_fix_features(dev, features);
7616
7617 /* driver might be less strict about feature dependencies */
7618 features = netdev_fix_features(dev, features);
7619
7620 /* some features can't be enabled if they're off an an upper device */
7621 netdev_for_each_upper_dev_rcu(dev, upper, iter)
7622 features = netdev_sync_upper_features(dev, upper, features);
7623
7624 if (dev->features == features)
7625 goto sync_lower;
7626
7627 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7628 &dev->features, &features);
7629
7630 if (dev->netdev_ops->ndo_set_features)
7631 err = dev->netdev_ops->ndo_set_features(dev, features);
7632 else
7633 err = 0;
7634
7635 if (unlikely(err < 0)) {
7636 netdev_err(dev,
7637 "set_features() failed (%d); wanted %pNF, left %pNF\n",
7638 err, &features, &dev->features);
7639 /* return non-0 since some features might have changed and
7640 * it's better to fire a spurious notification than miss it
7641 */
7642 return -1;
7643 }
7644
7645sync_lower:
7646 /* some features must be disabled on lower devices when disabled
7647 * on an upper device (think: bonding master or bridge)
7648 */
7649 netdev_for_each_lower_dev(dev, lower, iter)
7650 netdev_sync_lower_features(dev, lower, features);
7651
7652 if (!err) {
7653 netdev_features_t diff = features ^ dev->features;
7654
7655 if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
7656 /* udp_tunnel_{get,drop}_rx_info both need
7657 * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
7658 * device, or they won't do anything.
7659 * Thus we need to update dev->features
7660 * *before* calling udp_tunnel_get_rx_info,
7661 * but *after* calling udp_tunnel_drop_rx_info.
7662 */
7663 if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
7664 dev->features = features;
7665 udp_tunnel_get_rx_info(dev);
7666 } else {
7667 udp_tunnel_drop_rx_info(dev);
7668 }
7669 }
7670
7671 if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
7672 if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
7673 dev->features = features;
7674 err |= vlan_get_rx_ctag_filter_info(dev);
7675 } else {
7676 vlan_drop_rx_ctag_filter_info(dev);
7677 }
7678 }
7679
7680 if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
7681 if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
7682 dev->features = features;
7683 err |= vlan_get_rx_stag_filter_info(dev);
7684 } else {
7685 vlan_drop_rx_stag_filter_info(dev);
7686 }
7687 }
7688
7689 dev->features = features;
7690 }
7691
7692 return err < 0 ? 0 : 1;
7693}
7694
/**
 * netdev_update_features - recalculate device features
 * @dev: the device to check
 *
 * Recalculate dev->features set and send notifications if it
 * has changed. Should be called after driver or hardware dependent
 * conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	/* Zero means there is nothing to announce. */
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7709
/**
 * netdev_change_features - recalculate device features
 * @dev: the device to check
 *
 * Recalculate dev->features set and send notifications even
 * if they have not changed. Should be called instead of
 * netdev_update_features() if also dev->vlan_features might
 * have changed to allow the changes to be propagated to stacked
 * VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: always notify. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7726
7727/**
7728 * netif_stacked_transfer_operstate - transfer operstate
7729 * @rootdev: the root or lower level device to transfer state from
7730 * @dev: the device to transfer operstate to
7731 *
7732 * Transfer operational state from root to device. This is normally
7733 * called when a stacking relationship exists between the root
7734 * device and the device(a leaf device).
7735 */
7736void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7737 struct net_device *dev)
7738{
7739 if (rootdev->operstate == IF_OPER_DORMANT)
7740 netif_dormant_on(dev);
7741 else
7742 netif_dormant_off(dev);
7743
7744 if (netif_carrier_ok(rootdev))
7745 netif_carrier_on(dev);
7746 else
7747 netif_carrier_off(dev);
7748}
7749EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7750
7751static int netif_alloc_rx_queues(struct net_device *dev)
7752{
7753 unsigned int i, count = dev->num_rx_queues;
7754 struct netdev_rx_queue *rx;
7755 size_t sz = count * sizeof(*rx);
7756 int err = 0;
7757
7758 BUG_ON(count < 1);
7759
7760 rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
7761 if (!rx)
7762 return -ENOMEM;
7763
7764 dev->_rx = rx;
7765
7766 for (i = 0; i < count; i++) {
7767 rx[i].dev = dev;
7768
7769 /* XDP RX-queue setup */
7770 err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
7771 if (err < 0)
7772 goto err_rxq_info;
7773 }
7774 return 0;
7775
7776err_rxq_info:
7777 /* Rollback successful reg's and free other resources */
7778 while (i--)
7779 xdp_rxq_info_unreg(&rx[i].xdp_rxq);
7780 kvfree(dev->_rx);
7781 dev->_rx = NULL;
7782 return err;
7783}
7784
7785static void netif_free_rx_queues(struct net_device *dev)
7786{
7787 unsigned int i, count = dev->num_rx_queues;
7788
7789 /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
7790 if (!dev->_rx)
7791 return;
7792
7793 for (i = 0; i < count; i++)
7794 xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
7795
7796 kvfree(dev->_rx);
7797}
7798
7799static void netdev_init_one_queue(struct net_device *dev,
7800 struct netdev_queue *queue, void *_unused)
7801{
7802 /* Initialize queue lock */
7803 spin_lock_init(&queue->_xmit_lock);
7804 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7805 queue->xmit_lock_owner = -1;
7806 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7807 queue->dev = dev;
7808#ifdef CONFIG_BQL
7809 dql_init(&queue->dql, HZ);
7810#endif
7811}
7812
7813static void netif_free_tx_queues(struct net_device *dev)
7814{
7815 kvfree(dev->_tx);
7816}
7817
7818static int netif_alloc_netdev_queues(struct net_device *dev)
7819{
7820 unsigned int count = dev->num_tx_queues;
7821 struct netdev_queue *tx;
7822 size_t sz = count * sizeof(*tx);
7823
7824 if (count < 1 || count > 0xffff)
7825 return -EINVAL;
7826
7827 tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
7828 if (!tx)
7829 return -ENOMEM;
7830
7831 dev->_tx = tx;
7832
7833 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7834 spin_lock_init(&dev->tx_global_lock);
7835
7836 return 0;
7837}
7838
7839void netif_tx_stop_all_queues(struct net_device *dev)
7840{
7841 unsigned int i;
7842
7843 for (i = 0; i < dev->num_tx_queues; i++) {
7844 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7845
7846 netif_tx_stop_queue(txq);
7847 }
7848}
7849EXPORT_SYMBOL(netif_tx_stop_all_queues);
7850
7851/**
7852 * register_netdevice - register a network device
7853 * @dev: device to register
7854 *
7855 * Take a completed network device structure and add it to the kernel
7856 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7857 * chain. 0 is returned on success. A negative errno code is returned
7858 * on a failure to set up the device, or if the name is a duplicate.
7859 *
7860 * Callers must hold the rtnl semaphore. You may want
7861 * register_netdev() instead of this.
7862 *
7863 * BUGS:
7864 * The locking appears insufficient to guarantee two parallel registers
7865 * will not get the same name.
7866 */
7867
7868int register_netdevice(struct net_device *dev)
7869{
7870 int ret;
7871 struct net *net = dev_net(dev);
7872
7873 BUG_ON(dev_boot_phase);
7874 ASSERT_RTNL();
7875
7876 might_sleep();
7877
7878 /* When net_device's are persistent, this will be fatal. */
7879 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7880 BUG_ON(!net);
7881
7882 spin_lock_init(&dev->addr_list_lock);
7883 netdev_set_addr_lockdep_class(dev);
7884
7885 ret = dev_get_valid_name(net, dev, dev->name);
7886 if (ret < 0)
7887 goto out;
7888
7889 /* Init, if this function is available */
7890 if (dev->netdev_ops->ndo_init) {
7891 ret = dev->netdev_ops->ndo_init(dev);
7892 if (ret) {
7893 if (ret > 0)
7894 ret = -EIO;
7895 goto out;
7896 }
7897 }
7898
7899 if (((dev->hw_features | dev->features) &
7900 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7901 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7902 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7903 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7904 ret = -EINVAL;
7905 goto err_uninit;
7906 }
7907
7908 ret = -EBUSY;
7909 if (!dev->ifindex)
7910 dev->ifindex = dev_new_index(net);
7911 else if (__dev_get_by_index(net, dev->ifindex))
7912 goto err_uninit;
7913
7914 /* Transfer changeable features to wanted_features and enable
7915 * software offloads (GSO and GRO).
7916 */
7917 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7918 dev->features |= NETIF_F_SOFT_FEATURES;
7919
7920 if (dev->netdev_ops->ndo_udp_tunnel_add) {
7921 dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
7922 dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
7923 }
7924
7925 dev->wanted_features = dev->features & dev->hw_features;
7926
7927 if (!(dev->flags & IFF_LOOPBACK))
7928 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7929
7930 /* If IPv4 TCP segmentation offload is supported we should also
7931 * allow the device to enable segmenting the frame with the option
7932 * of ignoring a static IP ID value. This doesn't enable the
7933 * feature itself but allows the user to enable it later.
7934 */
7935 if (dev->hw_features & NETIF_F_TSO)
7936 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7937 if (dev->vlan_features & NETIF_F_TSO)
7938 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7939 if (dev->mpls_features & NETIF_F_TSO)
7940 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7941 if (dev->hw_enc_features & NETIF_F_TSO)
7942 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7943
7944 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7945 */
7946 dev->vlan_features |= NETIF_F_HIGHDMA;
7947
7948 /* Make NETIF_F_SG inheritable to tunnel devices.
7949 */
7950 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7951
7952 /* Make NETIF_F_SG inheritable to MPLS.
7953 */
7954 dev->mpls_features |= NETIF_F_SG;
7955
7956 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7957 ret = notifier_to_errno(ret);
7958 if (ret)
7959 goto err_uninit;
7960
7961 ret = netdev_register_kobject(dev);
7962 if (ret)
7963 goto err_uninit;
7964 dev->reg_state = NETREG_REGISTERED;
7965
7966 __netdev_update_features(dev);
7967
7968 /*
7969 * Default initial state at registry is that the
7970 * device is present.
7971 */
7972
7973 set_bit(__LINK_STATE_PRESENT, &dev->state);
7974
7975 linkwatch_init_dev(dev);
7976
7977 dev_init_scheduler(dev);
7978 dev_hold(dev);
7979 list_netdevice(dev);
7980 add_device_randomness(dev->dev_addr, dev->addr_len);
7981
7982 /* If the device has permanent device address, driver should
7983 * set dev_addr and also addr_assign_type should be set to
7984 * NET_ADDR_PERM (default value).
7985 */
7986 if (dev->addr_assign_type == NET_ADDR_PERM)
7987 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7988
7989 /* Notify protocols, that a new device appeared. */
7990 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7991 ret = notifier_to_errno(ret);
7992 if (ret) {
7993 rollback_registered(dev);
7994 dev->reg_state = NETREG_UNREGISTERED;
7995 }
7996 /*
7997 * Prevent userspace races by waiting until the network
7998 * device is fully setup before sending notifications.
7999 */
8000 if (!dev->rtnl_link_ops ||
8001 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
8002 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8003
8004out:
8005 return ret;
8006
8007err_uninit:
8008 if (dev->netdev_ops->ndo_uninit)
8009 dev->netdev_ops->ndo_uninit(dev);
8010 if (dev->priv_destructor)
8011 dev->priv_destructor(dev);
8012 goto out;
8013}
8014EXPORT_SYMBOL(register_netdevice);
8015
8016/**
8017 * init_dummy_netdev - init a dummy network device for NAPI
8018 * @dev: device to init
8019 *
8020 * This takes a network device structure and initialize the minimum
8021 * amount of fields so it can be used to schedule NAPI polls without
8022 * registering a full blown interface. This is to be used by drivers
8023 * that need to tie several hardware interfaces to a single NAPI
8024 * poll scheduler due to HW limitations.
8025 */
8026int init_dummy_netdev(struct net_device *dev)
8027{
8028 /* Clear everything. Note we don't initialize spinlocks
8029 * are they aren't supposed to be taken by any of the
8030 * NAPI code and this dummy netdev is supposed to be
8031 * only ever used for NAPI polls
8032 */
8033 memset(dev, 0, sizeof(struct net_device));
8034
8035 /* make sure we BUG if trying to hit standard
8036 * register/unregister code path
8037 */
8038 dev->reg_state = NETREG_DUMMY;
8039
8040 /* NAPI wants this */
8041 INIT_LIST_HEAD(&dev->napi_list);
8042
8043 /* a dummy interface is started by default */
8044 set_bit(__LINK_STATE_PRESENT, &dev->state);
8045 set_bit(__LINK_STATE_START, &dev->state);
8046
8047 /* Note : We dont allocate pcpu_refcnt for dummy devices,
8048 * because users of this 'device' dont need to change
8049 * its refcount.
8050 */
8051
8052 return 0;
8053}
8054EXPORT_SYMBOL_GPL(init_dummy_netdev);
8055
8056
8057/**
8058 * register_netdev - register a network device
8059 * @dev: device to register
8060 *
8061 * Take a completed network device structure and add it to the kernel
8062 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
8063 * chain. 0 is returned on success. A negative errno code is returned
8064 * on a failure to set up the device, or if the name is a duplicate.
8065 *
8066 * This is a wrapper around register_netdevice that takes the rtnl semaphore
8067 * and expands the device name if you passed a format string to
8068 * alloc_netdev.
8069 */
8070int register_netdev(struct net_device *dev)
8071{
8072 int err;
8073
8074 if (rtnl_lock_killable())
8075 return -EINTR;
8076 err = register_netdevice(dev);
8077 rtnl_unlock();
8078 return err;
8079}
8080EXPORT_SYMBOL(register_netdev);
8081
8082int netdev_refcnt_read(const struct net_device *dev)
8083{
8084 int i, refcnt = 0;
8085
8086 for_each_possible_cpu(i)
8087 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
8088 return refcnt;
8089}
8090EXPORT_SYMBOL(netdev_refcnt_read);
8091
8092/**
8093 * netdev_wait_allrefs - wait until all references are gone.
8094 * @dev: target net_device
8095 *
8096 * This is called when unregistering network devices.
8097 *
8098 * Any protocol or device that holds a reference should register
8099 * for netdevice notification, and cleanup and put back the
8100 * reference if they receive an UNREGISTER event.
8101 * We can get stuck here if buggy protocols don't correctly
8102 * call dev_put.
8103 */
8104static void netdev_wait_allrefs(struct net_device *dev)
8105{
8106 unsigned long rebroadcast_time, warning_time;
8107 int refcnt;
8108
8109 linkwatch_forget_dev(dev);
8110
8111 rebroadcast_time = warning_time = jiffies;
8112 refcnt = netdev_refcnt_read(dev);
8113
8114 while (refcnt != 0) {
8115 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
8116 rtnl_lock();
8117
8118 /* Rebroadcast unregister notification */
8119 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8120
8121 __rtnl_unlock();
8122 rcu_barrier();
8123 rtnl_lock();
8124
8125 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
8126 &dev->state)) {
8127 /* We must not have linkwatch events
8128 * pending on unregister. If this
8129 * happens, we simply run the queue
8130 * unscheduled, resulting in a noop
8131 * for this device.
8132 */
8133 linkwatch_run_queue();
8134 }
8135
8136 __rtnl_unlock();
8137
8138 rebroadcast_time = jiffies;
8139 }
8140
8141 msleep(250);
8142
8143 refcnt = netdev_refcnt_read(dev);
8144
8145 if (time_after(jiffies, warning_time + 10 * HZ)) {
8146 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
8147 dev->name, refcnt);
8148 warning_time = jiffies;
8149 }
8150 }
8151}
8152
8153/* The sequence is:
8154 *
8155 * rtnl_lock();
8156 * ...
8157 * register_netdevice(x1);
8158 * register_netdevice(x2);
8159 * ...
8160 * unregister_netdevice(y1);
8161 * unregister_netdevice(y2);
8162 * ...
8163 * rtnl_unlock();
8164 * free_netdev(y1);
8165 * free_netdev(y2);
8166 *
8167 * We are invoked by rtnl_unlock().
8168 * This allows us to deal with problems:
8169 * 1) We can delete sysfs objects which invoke hotplug
8170 * without deadlocking with linkwatch via keventd.
8171 * 2) Since we run with the RTNL semaphore not held, we can sleep
8172 * safely in order to wait for the netdev refcnt to drop to zero.
8173 *
8174 * We must not return until all unregister events added during
8175 * the interval the lock was held have been completed.
8176 */
8177void netdev_run_todo(void)
8178{
8179 struct list_head list;
8180
8181 /* Snapshot list, allow later requests */
8182 list_replace_init(&net_todo_list, &list);
8183
8184 __rtnl_unlock();
8185
8186
8187 /* Wait for rcu callbacks to finish before next phase */
8188 if (!list_empty(&list))
8189 rcu_barrier();
8190
8191 while (!list_empty(&list)) {
8192 struct net_device *dev
8193 = list_first_entry(&list, struct net_device, todo_list);
8194 list_del(&dev->todo_list);
8195
8196 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
8197 pr_err("network todo '%s' but state %d\n",
8198 dev->name, dev->reg_state);
8199 dump_stack();
8200 continue;
8201 }
8202
8203 dev->reg_state = NETREG_UNREGISTERED;
8204
8205 netdev_wait_allrefs(dev);
8206
8207 /* paranoia */
8208 BUG_ON(netdev_refcnt_read(dev));
8209 BUG_ON(!list_empty(&dev->ptype_all));
8210 BUG_ON(!list_empty(&dev->ptype_specific));
8211 WARN_ON(rcu_access_pointer(dev->ip_ptr));
8212 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
8213#if IS_ENABLED(CONFIG_DECNET)
8214 WARN_ON(dev->dn_ptr);
8215#endif
8216 if (dev->priv_destructor)
8217 dev->priv_destructor(dev);
8218 if (dev->needs_free_netdev)
8219 free_netdev(dev);
8220
8221 /* Report a network device has been unregistered */
8222 rtnl_lock();
8223 dev_net(dev)->dev_unreg_count--;
8224 __rtnl_unlock();
8225 wake_up(&netdev_unregistering_wq);
8226
8227 /* Free network device */
8228 kobject_put(&dev->dev.kobj);
8229 }
8230}
8231
8232/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
8233 * all the same fields in the same order as net_device_stats, with only
8234 * the type differing, but rtnl_link_stats64 may have additional fields
8235 * at the end for newer counters.
8236 */
8237void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
8238 const struct net_device_stats *netdev_stats)
8239{
8240#if BITS_PER_LONG == 64
8241 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
8242 memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
8243 /* zero out counters that only exist in rtnl_link_stats64 */
8244 memset((char *)stats64 + sizeof(*netdev_stats), 0,
8245 sizeof(*stats64) - sizeof(*netdev_stats));
8246#else
8247 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
8248 const unsigned long *src = (const unsigned long *)netdev_stats;
8249 u64 *dst = (u64 *)stats64;
8250
8251 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
8252 for (i = 0; i < n; i++)
8253 dst[i] = src[i];
8254 /* zero out counters that only exist in rtnl_link_stats64 */
8255 memset((char *)stats64 + n * sizeof(u64), 0,
8256 sizeof(*stats64) - n * sizeof(u64));
8257#endif
8258}
8259EXPORT_SYMBOL(netdev_stats_to_stats64);
8260
8261/**
8262 * dev_get_stats - get network device statistics
8263 * @dev: device to get statistics from
8264 * @storage: place to store stats
8265 *
8266 * Get network statistics from device. Return @storage.
8267 * The device driver may provide its own method by setting
8268 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
8269 * otherwise the internal statistics structure is used.
8270 */
8271struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
8272 struct rtnl_link_stats64 *storage)
8273{
8274 const struct net_device_ops *ops = dev->netdev_ops;
8275
8276 if (ops->ndo_get_stats64) {
8277 memset(storage, 0, sizeof(*storage));
8278 ops->ndo_get_stats64(dev, storage);
8279 } else if (ops->ndo_get_stats) {
8280 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
8281 } else {
8282 netdev_stats_to_stats64(storage, &dev->stats);
8283 }
8284 storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
8285 storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
8286 storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
8287 return storage;
8288}
8289EXPORT_SYMBOL(dev_get_stats);
8290
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only after the queue is fully set up */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
8308
8309static const struct ethtool_ops default_ethtool_ops;
8310
8311void netdev_set_default_ethtool_ops(struct net_device *dev,
8312 const struct ethtool_ops *ops)
8313{
8314 if (dev->ethtool_ops == &default_ethtool_ops)
8315 dev->ethtool_ops = ops;
8316}
8317EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
8318
8319void netdev_freemem(struct net_device *dev)
8320{
8321 char *addr = (char *)dev - dev->padded;
8322
8323 kvfree(addr);
8324}
8325
8326/**
8327 * alloc_netdev_mqs - allocate network device
8328 * @sizeof_priv: size of private data to allocate space for
8329 * @name: device name format string
8330 * @name_assign_type: origin of device name
8331 * @setup: callback to initialize device
8332 * @txqs: the number of TX subqueues to allocate
8333 * @rxqs: the number of RX subqueues to allocate
8334 *
8335 * Allocates a struct net_device with private data area for driver use
8336 * and performs basic initialization. Also allocates subqueue structs
8337 * for each queue on the device.
8338 */
8339struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
8340 unsigned char name_assign_type,
8341 void (*setup)(struct net_device *),
8342 unsigned int txqs, unsigned int rxqs)
8343{
8344 struct net_device *dev;
8345 unsigned int alloc_size;
8346 struct net_device *p;
8347
8348 BUG_ON(strlen(name) >= sizeof(dev->name));
8349
8350 if (txqs < 1) {
8351 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
8352 return NULL;
8353 }
8354
8355 if (rxqs < 1) {
8356 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
8357 return NULL;
8358 }
8359
8360 alloc_size = sizeof(struct net_device);
8361 if (sizeof_priv) {
8362 /* ensure 32-byte alignment of private area */
8363 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
8364 alloc_size += sizeof_priv;
8365 }
8366 /* ensure 32-byte alignment of whole construct */
8367 alloc_size += NETDEV_ALIGN - 1;
8368
8369 p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
8370 if (!p)
8371 return NULL;
8372
8373 dev = PTR_ALIGN(p, NETDEV_ALIGN);
8374 dev->padded = (char *)dev - (char *)p;
8375
8376 dev->pcpu_refcnt = alloc_percpu(int);
8377 if (!dev->pcpu_refcnt)
8378 goto free_dev;
8379
8380 if (dev_addr_init(dev))
8381 goto free_pcpu;
8382
8383 dev_mc_init(dev);
8384 dev_uc_init(dev);
8385
8386 dev_net_set(dev, &init_net);
8387
8388 dev->gso_max_size = GSO_MAX_SIZE;
8389 dev->gso_max_segs = GSO_MAX_SEGS;
8390
8391 INIT_LIST_HEAD(&dev->napi_list);
8392 INIT_LIST_HEAD(&dev->unreg_list);
8393 INIT_LIST_HEAD(&dev->close_list);
8394 INIT_LIST_HEAD(&dev->link_watch_list);
8395 INIT_LIST_HEAD(&dev->adj_list.upper);
8396 INIT_LIST_HEAD(&dev->adj_list.lower);
8397 INIT_LIST_HEAD(&dev->ptype_all);
8398 INIT_LIST_HEAD(&dev->ptype_specific);
8399#ifdef CONFIG_NET_SCHED
8400 hash_init(dev->qdisc_hash);
8401#endif
8402 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8403 setup(dev);
8404
8405 if (!dev->tx_queue_len) {
8406 dev->priv_flags |= IFF_NO_QUEUE;
8407 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
8408 }
8409
8410 dev->num_tx_queues = txqs;
8411 dev->real_num_tx_queues = txqs;
8412 if (netif_alloc_netdev_queues(dev))
8413 goto free_all;
8414
8415 dev->num_rx_queues = rxqs;
8416 dev->real_num_rx_queues = rxqs;
8417 if (netif_alloc_rx_queues(dev))
8418 goto free_all;
8419
8420 strcpy(dev->name, name);
8421 dev->name_assign_type = name_assign_type;
8422 dev->group = INIT_NETDEV_GROUP;
8423 if (!dev->ethtool_ops)
8424 dev->ethtool_ops = &default_ethtool_ops;
8425
8426 nf_hook_ingress_init(dev);
8427
8428 return dev;
8429
8430free_all:
8431 free_netdev(dev);
8432 return NULL;
8433
8434free_pcpu:
8435 free_percpu(dev->pcpu_refcnt);
8436free_dev:
8437 netdev_freemem(dev);
8438 return NULL;
8439}
8440EXPORT_SYMBOL(alloc_netdev_mqs);
8441
8442/**
8443 * free_netdev - free network device
8444 * @dev: device
8445 *
8446 * This function does the last stage of destroying an allocated device
8447 * interface. The reference to the device object is released. If this
8448 * is the last reference then it will be freed.Must be called in process
8449 * context.
8450 */
8451void free_netdev(struct net_device *dev)
8452{
8453 struct napi_struct *p, *n;
8454
8455 might_sleep();
8456 netif_free_tx_queues(dev);
8457 netif_free_rx_queues(dev);
8458
8459 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
8460
8461 /* Flush device addresses */
8462 dev_addr_flush(dev);
8463
8464 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
8465 netif_napi_del(p);
8466
8467 free_percpu(dev->pcpu_refcnt);
8468 dev->pcpu_refcnt = NULL;
8469
8470 /* Compatibility with error handling in drivers */
8471 if (dev->reg_state == NETREG_UNINITIALIZED) {
8472 netdev_freemem(dev);
8473 return;
8474 }
8475
8476 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
8477 dev->reg_state = NETREG_RELEASED;
8478
8479 /* will free via device release */
8480 put_device(&dev->dev);
8481}
8482EXPORT_SYMBOL(free_netdev);
8483
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait until packets that are currently being received are done.
 *	Packets that start later are not held back.  The expedited RCU
 *	grace period is used when the rtnl lock is held.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked())
		synchronize_rcu();
	else
		synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
8499
8500/**
8501 * unregister_netdevice_queue - remove device from the kernel
8502 * @dev: device
8503 * @head: list
8504 *
8505 * This function shuts down a device interface and removes it
8506 * from the kernel tables.
8507 * If head not NULL, device is queued to be unregistered later.
8508 *
8509 * Callers must hold the rtnl semaphore. You may want
8510 * unregister_netdev() instead of this.
8511 */
8512
8513void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
8514{
8515 ASSERT_RTNL();
8516
8517 if (head) {
8518 list_move_tail(&dev->unreg_list, head);
8519 } else {
8520 rollback_registered(dev);
8521 /* Finish processing unregister after unlock */
8522 net_set_todo(dev);
8523 }
8524}
8525EXPORT_SYMBOL(unregister_netdevice_queue);
8526
8527/**
8528 * unregister_netdevice_many - unregister many devices
8529 * @head: list of devices
8530 *
8531 * Note: As most callers use a stack allocated list_head,
8532 * we force a list_del() to make sure stack wont be corrupted later.
8533 */
8534void unregister_netdevice_many(struct list_head *head)
8535{
8536 struct net_device *dev;
8537
8538 if (!list_empty(head)) {
8539 rollback_registered_many(head);
8540 list_for_each_entry(dev, head, unreg_list)
8541 net_set_todo(dev);
8542 list_del(head);
8543 }
8544}
8545EXPORT_SYMBOL(unregister_netdevice_many);
8546
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel tables.
 *
 *	Thin wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  This is normally the function to use,
 *	rather than unregister_netdevice() itself.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
8565
8566/**
8567 * dev_change_net_namespace - move device to different nethost namespace
8568 * @dev: device
8569 * @net: network namespace
8570 * @pat: If not NULL name pattern to try if the current device name
8571 * is already taken in the destination network namespace.
8572 *
8573 * This function shuts down a device interface and moves it
8574 * to a new network namespace. On success 0 is returned, on
8575 * a failure a netagive errno code is returned.
8576 *
8577 * Callers must hold the rtnl semaphore.
8578 */
8579
8580int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
8581{
8582 int err, new_nsid, new_ifindex;
8583
8584 ASSERT_RTNL();
8585
8586 /* Don't allow namespace local devices to be moved. */
8587 err = -EINVAL;
8588 if (dev->features & NETIF_F_NETNS_LOCAL)
8589 goto out;
8590
8591 /* Ensure the device has been registrered */
8592 if (dev->reg_state != NETREG_REGISTERED)
8593 goto out;
8594
8595 /* Get out if there is nothing todo */
8596 err = 0;
8597 if (net_eq(dev_net(dev), net))
8598 goto out;
8599
8600 /* Pick the destination device name, and ensure
8601 * we can use it in the destination network namespace.
8602 */
8603 err = -EEXIST;
8604 if (__dev_get_by_name(net, dev->name)) {
8605 /* We get here if we can't use the current device name */
8606 if (!pat)
8607 goto out;
8608 if (dev_get_valid_name(net, dev, pat) < 0)
8609 goto out;
8610 }
8611
8612 /*
8613 * And now a mini version of register_netdevice unregister_netdevice.
8614 */
8615
8616 /* If device is running close it first. */
8617 dev_close(dev);
8618
8619 /* And unlink it from device chain */
8620 err = -ENODEV;
8621 unlist_netdevice(dev);
8622
8623 synchronize_net();
8624
8625 /* Shutdown queueing discipline. */
8626 dev_shutdown(dev);
8627
8628 /* Notify protocols, that we are about to destroy
8629 * this device. They should clean all the things.
8630 *
8631 * Note that dev->reg_state stays at NETREG_REGISTERED.
8632 * This is wanted because this way 8021q and macvlan know
8633 * the device is just moving and can keep their slaves up.
8634 */
8635 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8636 rcu_barrier();
8637
8638 new_nsid = peernet2id_alloc(dev_net(dev), net);
8639 /* If there is an ifindex conflict assign a new one */
8640 if (__dev_get_by_index(net, dev->ifindex))
8641 new_ifindex = dev_new_index(net);
8642 else
8643 new_ifindex = dev->ifindex;
8644
8645 rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
8646 new_ifindex);
8647
8648 /*
8649 * Flush the unicast and multicast chains
8650 */
8651 dev_uc_flush(dev);
8652 dev_mc_flush(dev);
8653
8654 /* Send a netdev-removed uevent to the old namespace */
8655 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8656 netdev_adjacent_del_links(dev);
8657
8658 /* Actually switch the network namespace */
8659 dev_net_set(dev, net);
8660 dev->ifindex = new_ifindex;
8661
8662 /* Send a netdev-add uevent to the new namespace */
8663 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8664 netdev_adjacent_add_links(dev);
8665
8666 /* Fixup kobjects */
8667 err = device_rename(&dev->dev, dev->name);
8668 WARN_ON(err);
8669
8670 /* Add the device back in the hashes */
8671 list_netdevice(dev);
8672
8673 /* Notify protocols, that a new device appeared. */
8674 call_netdevice_notifiers(NETDEV_REGISTER, dev);
8675
8676 /*
8677 * Prevent userspace races by waiting until the network
8678 * device is fully setup before sending notifications.
8679 */
8680 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8681
8682 synchronize_net();
8683 err = 0;
8684out:
8685 return err;
8686}
8687EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8688
8689static int dev_cpu_dead(unsigned int oldcpu)
8690{
8691 struct sk_buff **list_skb;
8692 struct sk_buff *skb;
8693 unsigned int cpu;
8694 struct softnet_data *sd, *oldsd, *remsd = NULL;
8695
8696 local_irq_disable();
8697 cpu = smp_processor_id();
8698 sd = &per_cpu(softnet_data, cpu);
8699 oldsd = &per_cpu(softnet_data, oldcpu);
8700
8701 /* Find end of our completion_queue. */
8702 list_skb = &sd->completion_queue;
8703 while (*list_skb)
8704 list_skb = &(*list_skb)->next;
8705 /* Append completion queue from offline CPU. */
8706 *list_skb = oldsd->completion_queue;
8707 oldsd->completion_queue = NULL;
8708
8709 /* Append output queue from offline CPU. */
8710 if (oldsd->output_queue) {
8711 *sd->output_queue_tailp = oldsd->output_queue;
8712 sd->output_queue_tailp = oldsd->output_queue_tailp;
8713 oldsd->output_queue = NULL;
8714 oldsd->output_queue_tailp = &oldsd->output_queue;
8715 }
8716 /* Append NAPI poll list from offline CPU, with one exception :
8717 * process_backlog() must be called by cpu owning percpu backlog.
8718 * We properly handle process_queue & input_pkt_queue later.
8719 */
8720 while (!list_empty(&oldsd->poll_list)) {
8721 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8722 struct napi_struct,
8723 poll_list);
8724
8725 list_del_init(&napi->poll_list);
8726 if (napi->poll == process_backlog)
8727 napi->state = 0;
8728 else
8729 ____napi_schedule(sd, napi);
8730 }
8731
8732 raise_softirq_irqoff(NET_TX_SOFTIRQ);
8733 local_irq_enable();
8734
8735#ifdef CONFIG_RPS
8736 remsd = oldsd->rps_ipi_list;
8737 oldsd->rps_ipi_list = NULL;
8738#endif
8739 /* send out pending IPI's on offline CPU */
8740 net_rps_send_ipi(remsd);
8741
8742 /* Process offline CPU's input_pkt_queue */
8743 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8744 netif_rx_ni(skb);
8745 input_queue_head_incr(oldsd);
8746 }
8747 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8748 netif_rx_ni(skb);
8749 input_queue_head_incr(oldsd);
8750 }
8751
8752 return 0;
8753}
8754
8755/**
8756 * netdev_increment_features - increment feature set by one
8757 * @all: current feature set
8758 * @one: new feature set
8759 * @mask: mask feature set
8760 *
8761 * Computes a new feature set after adding a device with feature set
8762 * @one to the master device with current feature set @all. Will not
8763 * enable anything that is off in @mask. Returns the new feature set.
8764 */
8765netdev_features_t netdev_increment_features(netdev_features_t all,
8766 netdev_features_t one, netdev_features_t mask)
8767{
8768 if (mask & NETIF_F_HW_CSUM)
8769 mask |= NETIF_F_CSUM_MASK;
8770 mask |= NETIF_F_VLAN_CHALLENGED;
8771
8772 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8773 all &= one | ~NETIF_F_ALL_FOR_ALL;
8774
8775 /* If one device supports hw checksumming, set for all. */
8776 if (all & NETIF_F_HW_CSUM)
8777 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8778
8779 return all;
8780}
8781EXPORT_SYMBOL(netdev_increment_features);
8782
8783static struct hlist_head * __net_init netdev_create_hash(void)
8784{
8785 int i;
8786 struct hlist_head *hash;
8787
8788 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8789 if (hash != NULL)
8790 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8791 INIT_HLIST_HEAD(&hash[i]);
8792
8793 return hash;
8794}
8795
8796/* Initialize per network namespace state */
8797static int __net_init netdev_init(struct net *net)
8798{
8799 if (net != &init_net)
8800 INIT_LIST_HEAD(&net->dev_base_head);
8801
8802 net->dev_name_head = netdev_create_hash();
8803 if (net->dev_name_head == NULL)
8804 goto err_name;
8805
8806 net->dev_index_head = netdev_create_hash();
8807 if (net->dev_index_head == NULL)
8808 goto err_idx;
8809
8810 return 0;
8811
8812err_idx:
8813 kfree(net->dev_name_head);
8814err_name:
8815 return -ENOMEM;
8816}
8817
8818/**
8819 * netdev_drivername - network driver for the device
8820 * @dev: network device
8821 *
8822 * Determine network driver for device.
8823 */
8824const char *netdev_drivername(const struct net_device *dev)
8825{
8826 const struct device_driver *driver;
8827 const struct device *parent;
8828 const char *empty = "";
8829
8830 parent = dev->dev.parent;
8831 if (!parent)
8832 return empty;
8833
8834 driver = parent->driver;
8835 if (driver && driver->name)
8836 return driver->name;
8837 return empty;
8838}
8839
8840static void __netdev_printk(const char *level, const struct net_device *dev,
8841 struct va_format *vaf)
8842{
8843 if (dev && dev->dev.parent) {
8844 dev_printk_emit(level[1] - '0',
8845 dev->dev.parent,
8846 "%s %s %s%s: %pV",
8847 dev_driver_string(dev->dev.parent),
8848 dev_name(dev->dev.parent),
8849 netdev_name(dev), netdev_reg_state(dev),
8850 vaf);
8851 } else if (dev) {
8852 printk("%s%s%s: %pV",
8853 level, netdev_name(dev), netdev_reg_state(dev), vaf);
8854 } else {
8855 printk("%s(NULL net_device): %pV", level, vaf);
8856 }
8857}
8858
8859void netdev_printk(const char *level, const struct net_device *dev,
8860 const char *format, ...)
8861{
8862 struct va_format vaf;
8863 va_list args;
8864
8865 va_start(args, format);
8866
8867 vaf.fmt = format;
8868 vaf.va = &args;
8869
8870 __netdev_printk(level, dev, &vaf);
8871
8872 va_end(args);
8873}
8874EXPORT_SYMBOL(netdev_printk);
8875
8876#define define_netdev_printk_level(func, level) \
8877void func(const struct net_device *dev, const char *fmt, ...) \
8878{ \
8879 struct va_format vaf; \
8880 va_list args; \
8881 \
8882 va_start(args, fmt); \
8883 \
8884 vaf.fmt = fmt; \
8885 vaf.va = &args; \
8886 \
8887 __netdev_printk(level, dev, &vaf); \
8888 \
8889 va_end(args); \
8890} \
8891EXPORT_SYMBOL(func);
8892
8893define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8894define_netdev_printk_level(netdev_alert, KERN_ALERT);
8895define_netdev_printk_level(netdev_crit, KERN_CRIT);
8896define_netdev_printk_level(netdev_err, KERN_ERR);
8897define_netdev_printk_level(netdev_warn, KERN_WARNING);
8898define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8899define_netdev_printk_level(netdev_info, KERN_INFO);
8900
8901static void __net_exit netdev_exit(struct net *net)
8902{
8903 kfree(net->dev_name_head);
8904 kfree(net->dev_index_head);
8905 if (net != &init_net)
8906 WARN_ON_ONCE(!list_empty(&net->dev_base_head));
8907}
8908
8909static struct pernet_operations __net_initdata netdev_net_ops = {
8910 .init = netdev_init,
8911 .exit = netdev_exit,
8912};
8913
8914static void __net_exit default_device_exit(struct net *net)
8915{
8916 struct net_device *dev, *aux;
8917 /*
8918 * Push all migratable network devices back to the
8919 * initial network namespace
8920 */
8921 rtnl_lock();
8922 for_each_netdev_safe(net, dev, aux) {
8923 int err;
8924 char fb_name[IFNAMSIZ];
8925
8926 /* Ignore unmoveable devices (i.e. loopback) */
8927 if (dev->features & NETIF_F_NETNS_LOCAL)
8928 continue;
8929
8930 /* Leave virtual devices for the generic cleanup */
8931 if (dev->rtnl_link_ops)
8932 continue;
8933
8934 /* Push remaining network devices to init_net */
8935 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8936 err = dev_change_net_namespace(dev, &init_net, fb_name);
8937 if (err) {
8938 pr_emerg("%s: failed to move %s to init_net: %d\n",
8939 __func__, dev->name, err);
8940 BUG();
8941 }
8942 }
8943 rtnl_unlock();
8944}
8945
8946static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8947{
8948 /* Return with the rtnl_lock held when there are no network
8949 * devices unregistering in any network namespace in net_list.
8950 */
8951 struct net *net;
8952 bool unregistering;
8953 DEFINE_WAIT_FUNC(wait, woken_wake_function);
8954
8955 add_wait_queue(&netdev_unregistering_wq, &wait);
8956 for (;;) {
8957 unregistering = false;
8958 rtnl_lock();
8959 list_for_each_entry(net, net_list, exit_list) {
8960 if (net->dev_unreg_count > 0) {
8961 unregistering = true;
8962 break;
8963 }
8964 }
8965 if (!unregistering)
8966 break;
8967 __rtnl_unlock();
8968
8969 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8970 }
8971 remove_wait_queue(&netdev_unregistering_wq, &wait);
8972}
8973
8974static void __net_exit default_device_exit_batch(struct list_head *net_list)
8975{
8976 /* At exit all network devices most be removed from a network
8977 * namespace. Do this in the reverse order of registration.
8978 * Do this across as many network namespaces as possible to
8979 * improve batching efficiency.
8980 */
8981 struct net_device *dev;
8982 struct net *net;
8983 LIST_HEAD(dev_kill_list);
8984
8985 /* To prevent network device cleanup code from dereferencing
8986 * loopback devices or network devices that have been freed
8987 * wait here for all pending unregistrations to complete,
8988 * before unregistring the loopback device and allowing the
8989 * network namespace be freed.
8990 *
8991 * The netdev todo list containing all network devices
8992 * unregistrations that happen in default_device_exit_batch
8993 * will run in the rtnl_unlock() at the end of
8994 * default_device_exit_batch.
8995 */
8996 rtnl_lock_unregistering(net_list);
8997 list_for_each_entry(net, net_list, exit_list) {
8998 for_each_netdev_reverse(net, dev) {
8999 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
9000 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
9001 else
9002 unregister_netdevice_queue(dev, &dev_kill_list);
9003 }
9004 }
9005 unregister_netdevice_many(&dev_kill_list);
9006 rtnl_unlock();
9007}
9008
/*
 * Namespace exit hooks: .exit moves migratable devices back to init_net,
 * .exit_batch then unregisters whatever devices are left.
 */
static struct pernet_operations __net_initdata default_device_ops = {
 .exit = default_device_exit,
 .exit_batch = default_device_exit_batch,
};
9013
9014/*
9015 * Initialize the DEV module. At boot time this walks the device list and
9016 * unhooks any devices that fail to initialise (normally hardware not
9017 * present) and leaves us with a valid list of present and active devices.
9018 *
9019 */
9020
9021/*
9022 * This is called single threaded during boot, so no need
9023 * to take the rtnl semaphore.
9024 */
9025static int __init net_dev_init(void)
9026{
9027 int i, rc = -ENOMEM;
9028
9029 BUG_ON(!dev_boot_phase);
9030
9031 if (dev_proc_init())
9032 goto out;
9033
9034 if (netdev_kobject_init())
9035 goto out;
9036
9037 INIT_LIST_HEAD(&ptype_all);
9038 for (i = 0; i < PTYPE_HASH_SIZE; i++)
9039 INIT_LIST_HEAD(&ptype_base[i]);
9040
9041 INIT_LIST_HEAD(&offload_base);
9042
9043 if (register_pernet_subsys(&netdev_net_ops))
9044 goto out;
9045
9046 /*
9047 * Initialise the packet receive queues.
9048 */
9049
9050 for_each_possible_cpu(i) {
9051 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
9052 struct softnet_data *sd = &per_cpu(softnet_data, i);
9053
9054 INIT_WORK(flush, flush_backlog);
9055
9056 skb_queue_head_init(&sd->input_pkt_queue);
9057 skb_queue_head_init(&sd->process_queue);
9058#ifdef CONFIG_XFRM_OFFLOAD
9059 skb_queue_head_init(&sd->xfrm_backlog);
9060#endif
9061 INIT_LIST_HEAD(&sd->poll_list);
9062 sd->output_queue_tailp = &sd->output_queue;
9063#ifdef CONFIG_RPS
9064 sd->csd.func = rps_trigger_softirq;
9065 sd->csd.info = sd;
9066 sd->cpu = i;
9067#endif
9068
9069 sd->backlog.poll = process_backlog;
9070 sd->backlog.weight = weight_p;
9071 }
9072
9073 dev_boot_phase = 0;
9074
9075 /* The loopback device is special if any other network devices
9076 * is present in a network namespace the loopback device must
9077 * be present. Since we now dynamically allocate and free the
9078 * loopback device ensure this invariant is maintained by
9079 * keeping the loopback device as the first device on the
9080 * list of network devices. Ensuring the loopback devices
9081 * is the first device that appears and the last network device
9082 * that disappears.
9083 */
9084 if (register_pernet_device(&loopback_net_ops))
9085 goto out;
9086
9087 if (register_pernet_device(&default_device_ops))
9088 goto out;
9089
9090 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
9091 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
9092
9093 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
9094 NULL, dev_cpu_dead);
9095 WARN_ON(rc < 0);
9096 rc = 0;
9097out:
9098 return rc;
9099}
9100
9101subsys_initcall(net_dev_init);
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <linux/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <linux/bpf.h>
98#include <net/net_namespace.h>
99#include <net/sock.h>
100#include <net/busy_poll.h>
101#include <linux/rtnetlink.h>
102#include <linux/stat.h>
103#include <net/dst.h>
104#include <net/dst_metadata.h>
105#include <net/pkt_sched.h>
106#include <net/checksum.h>
107#include <net/xfrm.h>
108#include <linux/highmem.h>
109#include <linux/init.h>
110#include <linux/module.h>
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
114#include <net/iw_handler.h>
115#include <asm/current.h>
116#include <linux/audit.h>
117#include <linux/dmaengine.h>
118#include <linux/err.h>
119#include <linux/ctype.h>
120#include <linux/if_arp.h>
121#include <linux/if_vlan.h>
122#include <linux/ip.h>
123#include <net/ip.h>
124#include <net/mpls.h>
125#include <linux/ipv6.h>
126#include <linux/in.h>
127#include <linux/jhash.h>
128#include <linux/random.h>
129#include <trace/events/napi.h>
130#include <trace/events/net.h>
131#include <trace/events/skb.h>
132#include <linux/pci.h>
133#include <linux/inetdevice.h>
134#include <linux/cpu_rmap.h>
135#include <linux/static_key.h>
136#include <linux/hashtable.h>
137#include <linux/vmalloc.h>
138#include <linux/if_macvlan.h>
139#include <linux/errqueue.h>
140#include <linux/hrtimer.h>
141#include <linux/netfilter_ingress.h>
142#include <linux/crash_dump.h>
143
144#include "net-sysfs.h"
145
146/* Instead of increasing this, you should create a hash table. */
147#define MAX_GRO_SKBS 8
148
149/* This should be increased if a protocol with a bigger head is added. */
150#define GRO_MAX_HEAD (MAX_HEADER + 128)
151
/* Held by dev_add_pack()/__dev_remove_pack() while editing ptype lists. */
static DEFINE_SPINLOCK(ptype_lock);
/* Held by dev_add_offload()/__dev_remove_offload() while editing offload_base. */
static DEFINE_SPINLOCK(offload_lock);
/* Protocol handlers not bound to a device, bucketed by ptype_head(). */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly; /* Taps */
/* Registered packet_offload handlers, kept ordered by priority. */
static struct list_head offload_base __read_mostly;

/* Forward declarations of file-local functions. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
 struct net_device *dev,
 struct netdev_notifier_info *info);
162
163/*
164 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165 * semaphore.
166 *
167 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168 *
169 * Writers must hold the rtnl semaphore while they loop through the
170 * dev_base_head list, and hold dev_base_lock for writing when they do the
171 * actual updates. This allows pure readers to access the list even
172 * while a writer is preparing to update it.
173 *
174 * To put it another way, dev_base_lock is held for writing only to
175 * protect against pure readers; the rtnl semaphore provides the
176 * protection against other writers.
177 *
178 * See, for example usages, register_netdevice() and
179 * unregister_netdevice(), which must be called with the rtnl
180 * semaphore held.
181 */
182DEFINE_RWLOCK(dev_base_lock);
183EXPORT_SYMBOL(dev_base_lock);
184
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* NAPI id generator; starts at NR_CPUS. */
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

/*
 * Sampled by netdev_get_name() to detect that a name changed while it was
 * being copied. NOTE(review): the write side is outside this excerpt.
 */
static seqcount_t devnet_rename_seq;
192
193static inline void dev_base_seq_inc(struct net *net)
194{
195 while (++net->dev_base_seq == 0);
196}
197
198static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199{
200 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
201
202 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203}
204
205static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206{
207 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208}
209
/* Take @sd's input_pkt_queue lock; a no-op unless CONFIG_RPS is set. */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
 spin_lock(&sd->input_pkt_queue.lock);
#endif
}
216
/* Release @sd's input_pkt_queue lock; a no-op unless CONFIG_RPS is set. */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
 spin_unlock(&sd->input_pkt_queue.lock);
#endif
}
223
224/* Device list insertion */
225static void list_netdevice(struct net_device *dev)
226{
227 struct net *net = dev_net(dev);
228
229 ASSERT_RTNL();
230
231 write_lock_bh(&dev_base_lock);
232 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 hlist_add_head_rcu(&dev->index_hlist,
235 dev_index_hash(net, dev->ifindex));
236 write_unlock_bh(&dev_base_lock);
237
238 dev_base_seq_inc(net);
239}
240
241/* Device list removal
242 * caller must respect a RCU grace period before freeing/reusing dev
243 */
244static void unlist_netdevice(struct net_device *dev)
245{
246 ASSERT_RTNL();
247
248 /* Unlink dev from the device chain */
249 write_lock_bh(&dev_base_lock);
250 list_del_rcu(&dev->dev_list);
251 hlist_del_rcu(&dev->name_hlist);
252 hlist_del_rcu(&dev->index_hlist);
253 write_unlock_bh(&dev_base_lock);
254
255 dev_base_seq_inc(dev_net(dev));
256}
257
258/*
259 * Our notifier list
260 */
261
262static RAW_NOTIFIER_HEAD(netdev_chain);
263
264/*
265 * Device drivers call our routines to queue packets here. We empty the
266 * queue in the local softnet handler.
267 */
268
269DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270EXPORT_PER_CPU_SYMBOL(softnet_data);
271
272#ifdef CONFIG_LOCKDEP
273/*
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
276 */
/*
 * Device types that get a lockdep class of their own; searched linearly by
 * netdev_lock_pos(), which falls back to the last entry for unknown types.
 */
static const unsigned short netdev_lock_type[] =
 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names; indexed with the same position as netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One xmit-lock key and one address-list-lock key per table entry. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313
314static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315{
316 int i;
317
318 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 if (netdev_lock_type[i] == dev_type)
320 return i;
321 /* the last key is used by default */
322 return ARRAY_SIZE(netdev_lock_type) - 1;
323}
324
325static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 unsigned short dev_type)
327{
328 int i;
329
330 i = netdev_lock_pos(dev_type);
331 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 netdev_lock_name[i]);
333}
334
335static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336{
337 int i;
338
339 i = netdev_lock_pos(dev->type);
340 lockdep_set_class_and_name(&dev->addr_list_lock,
341 &netdev_addr_lock_key[i],
342 netdev_lock_name[i]);
343}
344#else
/* Without CONFIG_LOCKDEP there is no lock class to assign. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 unsigned short dev_type)
{
}
/* Without CONFIG_LOCKDEP there is no lock class to assign. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
352#endif
353
354/*******************************************************************************
355
356 Protocol management and registration routines
357
358*******************************************************************************/
359
360/*
361 * Add a protocol ID to the list. Now that the input handler is
362 * smarter we can dispense with all the messy stuff that used to be
363 * here.
364 *
365 * BEWARE!!! Protocol handlers, mangling input packets,
366 * MUST BE last in hash buckets and checking protocol handlers
367 * MUST start from promiscuous ptype_all chain in net_bh.
368 * It is true now, do not change it.
369 * Explanation follows: if protocol handler, mangling packet, will
370 * be the first on list, it is not able to sense, that packet
371 * is cloned and should be copied-on-write, so that it will
372 * change it and subsequent readers will get broken packet.
373 * --ANK (980803)
374 */
375
376static inline struct list_head *ptype_head(const struct packet_type *pt)
377{
378 if (pt->type == htons(ETH_P_ALL))
379 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380 else
381 return pt->dev ? &pt->dev->ptype_specific :
382 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383}
384
385/**
386 * dev_add_pack - add packet handler
387 * @pt: packet type declaration
388 *
389 * Add a protocol handler to the networking stack. The passed &packet_type
390 * is linked into kernel lists and may not be freed until it has been
391 * removed from the kernel lists.
392 *
393 * This call does not sleep therefore it can not
394 * guarantee all CPU's that are in middle of receiving packets
395 * will see the new packet type (until the next received packet).
396 */
397
398void dev_add_pack(struct packet_type *pt)
399{
400 struct list_head *head = ptype_head(pt);
401
402 spin_lock(&ptype_lock);
403 list_add_rcu(&pt->list, head);
404 spin_unlock(&ptype_lock);
405}
406EXPORT_SYMBOL(dev_add_pack);
407
408/**
409 * __dev_remove_pack - remove packet handler
410 * @pt: packet type declaration
411 *
412 * Remove a protocol handler that was previously added to the kernel
413 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
414 * from the kernel lists and can be freed or reused once this function
415 * returns.
416 *
417 * The packet type might still be in use by receivers
418 * and must not be freed until after all the CPU's have gone
419 * through a quiescent state.
420 */
421void __dev_remove_pack(struct packet_type *pt)
422{
423 struct list_head *head = ptype_head(pt);
424 struct packet_type *pt1;
425
426 spin_lock(&ptype_lock);
427
428 list_for_each_entry(pt1, head, list) {
429 if (pt == pt1) {
430 list_del_rcu(&pt->list);
431 goto out;
432 }
433 }
434
435 pr_warn("dev_remove_pack: %p not found\n", pt);
436out:
437 spin_unlock(&ptype_lock);
438}
439EXPORT_SYMBOL(__dev_remove_pack);
440
441/**
442 * dev_remove_pack - remove packet handler
443 * @pt: packet type declaration
444 *
445 * Remove a protocol handler that was previously added to the kernel
446 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
447 * from the kernel lists and can be freed or reused once this function
448 * returns.
449 *
450 * This call sleeps to guarantee that no CPU is looking at the packet
451 * type after return.
452 */
void dev_remove_pack(struct packet_type *pt)
{
 /* Unlink @pt; this step alone does not wait for concurrent users. */
 __dev_remove_pack(pt);

 /* Wait so that no CPU is still looking at @pt when we return. */
 synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
460
461
462/**
463 * dev_add_offload - register offload handlers
464 * @po: protocol offload declaration
465 *
466 * Add protocol offload handlers to the networking stack. The passed
467 * &proto_offload is linked into kernel lists and may not be freed until
468 * it has been removed from the kernel lists.
469 *
470 * This call does not sleep therefore it can not
471 * guarantee all CPU's that are in middle of receiving packets
472 * will see the new offload handlers (until the next received packet).
473 */
474void dev_add_offload(struct packet_offload *po)
475{
476 struct packet_offload *elem;
477
478 spin_lock(&offload_lock);
479 list_for_each_entry(elem, &offload_base, list) {
480 if (po->priority < elem->priority)
481 break;
482 }
483 list_add_rcu(&po->list, elem->list.prev);
484 spin_unlock(&offload_lock);
485}
486EXPORT_SYMBOL(dev_add_offload);
487
488/**
489 * __dev_remove_offload - remove offload handler
490 * @po: packet offload declaration
491 *
492 * Remove a protocol offload handler that was previously added to the
493 * kernel offload handlers by dev_add_offload(). The passed &offload_type
494 * is removed from the kernel lists and can be freed or reused once this
495 * function returns.
496 *
497 * The packet type might still be in use by receivers
498 * and must not be freed until after all the CPU's have gone
499 * through a quiescent state.
500 */
501static void __dev_remove_offload(struct packet_offload *po)
502{
503 struct list_head *head = &offload_base;
504 struct packet_offload *po1;
505
506 spin_lock(&offload_lock);
507
508 list_for_each_entry(po1, head, list) {
509 if (po == po1) {
510 list_del_rcu(&po->list);
511 goto out;
512 }
513 }
514
515 pr_warn("dev_remove_offload: %p not found\n", po);
516out:
517 spin_unlock(&offload_lock);
518}
519
520/**
521 * dev_remove_offload - remove packet offload handler
522 * @po: packet offload declaration
523 *
524 * Remove a packet offload handler that was previously added to the kernel
525 * offload handlers by dev_add_offload(). The passed &offload_type is
526 * removed from the kernel lists and can be freed or reused once this
527 * function returns.
528 *
529 * This call sleeps to guarantee that no CPU is looking at the packet
530 * type after return.
531 */
void dev_remove_offload(struct packet_offload *po)
{
 /* Unlink @po; this step alone does not wait for concurrent users. */
 __dev_remove_offload(po);

 /* Wait so that no CPU is still looking at @po when we return. */
 synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
539
540/******************************************************************************
541
542 Device Boot-time Settings Routines
543
544*******************************************************************************/
545
/*
 * Boot time configuration table. A slot whose name starts with '\0' or ' '
 * is treated as free by netdev_boot_setup_add().
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548
549/**
550 * netdev_boot_setup_add - add new setup entry
551 * @name: name of the device
552 * @map: configured settings for the device
553 *
554 * Adds new setup entry to the dev_boot_setup list. The function
555 * returns 0 on error and 1 on success. This is a generic routine to
556 * all netdevices.
557 */
558static int netdev_boot_setup_add(char *name, struct ifmap *map)
559{
560 struct netdev_boot_setup *s;
561 int i;
562
563 s = dev_boot_setup;
564 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 memset(s[i].name, 0, sizeof(s[i].name));
567 strlcpy(s[i].name, name, IFNAMSIZ);
568 memcpy(&s[i].map, map, sizeof(s[i].map));
569 break;
570 }
571 }
572
573 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574}
575
576/**
577 * netdev_boot_setup_check - check boot time settings
578 * @dev: the netdevice
579 *
580 * Check boot time settings for the device.
581 * The found settings are set for the device to be used
582 * later in the device probing.
583 * Returns 0 if no settings found, 1 if they are.
584 */
585int netdev_boot_setup_check(struct net_device *dev)
586{
587 struct netdev_boot_setup *s = dev_boot_setup;
588 int i;
589
590 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 !strcmp(dev->name, s[i].name)) {
593 dev->irq = s[i].map.irq;
594 dev->base_addr = s[i].map.base_addr;
595 dev->mem_start = s[i].map.mem_start;
596 dev->mem_end = s[i].map.mem_end;
597 return 1;
598 }
599 }
600 return 0;
601}
602EXPORT_SYMBOL(netdev_boot_setup_check);
603
604
605/**
606 * netdev_boot_base - get address from boot time settings
607 * @prefix: prefix for network device
608 * @unit: id for network device
609 *
610 * Check boot time settings for the base address of device.
611 * The found settings are set for the device to be used
612 * later in the device probing.
613 * Returns 0 if no settings found.
614 */
615unsigned long netdev_boot_base(const char *prefix, int unit)
616{
617 const struct netdev_boot_setup *s = dev_boot_setup;
618 char name[IFNAMSIZ];
619 int i;
620
621 sprintf(name, "%s%d", prefix, unit);
622
623 /*
624 * If device already registered then return base of 1
625 * to indicate not to probe for this interface
626 */
627 if (__dev_get_by_name(&init_net, name))
628 return 1;
629
630 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 if (!strcmp(name, s[i].name))
632 return s[i].map.base_addr;
633 return 0;
634}
635
636/*
637 * Saves at boot time configured settings for any netdevice.
638 */
639int __init netdev_boot_setup(char *str)
640{
641 int ints[5];
642 struct ifmap map;
643
644 str = get_options(str, ARRAY_SIZE(ints), ints);
645 if (!str || !*str)
646 return 0;
647
648 /* Save settings */
649 memset(&map, 0, sizeof(map));
650 if (ints[0] > 0)
651 map.irq = ints[1];
652 if (ints[0] > 1)
653 map.base_addr = ints[2];
654 if (ints[0] > 2)
655 map.mem_start = ints[3];
656 if (ints[0] > 3)
657 map.mem_end = ints[4];
658
659 /* Add new entry to the list */
660 return netdev_boot_setup_add(str, &map);
661}
662
663__setup("netdev=", netdev_boot_setup);
664
665/*******************************************************************************
666
667 Device Interface Subroutines
668
669*******************************************************************************/
670
671/**
672 * dev_get_iflink - get 'iflink' value of a interface
673 * @dev: targeted interface
674 *
675 * Indicates the ifindex the interface is linked to.
676 * Physical interfaces have the same 'ifindex' and 'iflink' values.
677 */
678
679int dev_get_iflink(const struct net_device *dev)
680{
681 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 return dev->netdev_ops->ndo_get_iflink(dev);
683
684 return dev->ifindex;
685}
686EXPORT_SYMBOL(dev_get_iflink);
687
688/**
689 * dev_fill_metadata_dst - Retrieve tunnel egress information.
690 * @dev: targeted interface
691 * @skb: The packet.
692 *
693 * For better visibility of tunnel traffic OVS needs to retrieve
694 * egress tunnel information for a packet. Following API allows
695 * user to get this info.
696 */
697int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698{
699 struct ip_tunnel_info *info;
700
701 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
702 return -EINVAL;
703
704 info = skb_tunnel_info_unclone(skb);
705 if (!info)
706 return -ENOMEM;
707 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 return -EINVAL;
709
710 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711}
712EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713
714/**
715 * __dev_get_by_name - find a device by its name
716 * @net: the applicable net namespace
717 * @name: name to find
718 *
719 * Find an interface by name. Must be called under RTNL semaphore
720 * or @dev_base_lock. If the name is found a pointer to the device
721 * is returned. If the name is not found then %NULL is returned. The
722 * reference counters are not incremented so the caller must be
723 * careful with locks.
724 */
725
726struct net_device *__dev_get_by_name(struct net *net, const char *name)
727{
728 struct net_device *dev;
729 struct hlist_head *head = dev_name_hash(net, name);
730
731 hlist_for_each_entry(dev, head, name_hlist)
732 if (!strncmp(dev->name, name, IFNAMSIZ))
733 return dev;
734
735 return NULL;
736}
737EXPORT_SYMBOL(__dev_get_by_name);
738
739/**
740 * dev_get_by_name_rcu - find a device by its name
741 * @net: the applicable net namespace
742 * @name: name to find
743 *
744 * Find an interface by name.
745 * If the name is found a pointer to the device is returned.
746 * If the name is not found then %NULL is returned.
747 * The reference counters are not incremented so the caller must be
748 * careful with locks. The caller must hold RCU lock.
749 */
750
751struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752{
753 struct net_device *dev;
754 struct hlist_head *head = dev_name_hash(net, name);
755
756 hlist_for_each_entry_rcu(dev, head, name_hlist)
757 if (!strncmp(dev->name, name, IFNAMSIZ))
758 return dev;
759
760 return NULL;
761}
762EXPORT_SYMBOL(dev_get_by_name_rcu);
763
764/**
765 * dev_get_by_name - find a device by its name
766 * @net: the applicable net namespace
767 * @name: name to find
768 *
769 * Find an interface by name. This can be called from any
770 * context and does its own locking. The returned handle has
771 * the usage count incremented and the caller must use dev_put() to
772 * release it when it is no longer needed. %NULL is returned if no
773 * matching device is found.
774 */
775
776struct net_device *dev_get_by_name(struct net *net, const char *name)
777{
778 struct net_device *dev;
779
780 rcu_read_lock();
781 dev = dev_get_by_name_rcu(net, name);
782 if (dev)
783 dev_hold(dev);
784 rcu_read_unlock();
785 return dev;
786}
787EXPORT_SYMBOL(dev_get_by_name);
788
789/**
790 * __dev_get_by_index - find a device by its ifindex
791 * @net: the applicable net namespace
792 * @ifindex: index of device
793 *
794 * Search for an interface by index. Returns %NULL if the device
795 * is not found or a pointer to the device. The device has not
796 * had its reference counter increased so the caller must be careful
797 * about locking. The caller must hold either the RTNL semaphore
798 * or @dev_base_lock.
799 */
800
801struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802{
803 struct net_device *dev;
804 struct hlist_head *head = dev_index_hash(net, ifindex);
805
806 hlist_for_each_entry(dev, head, index_hlist)
807 if (dev->ifindex == ifindex)
808 return dev;
809
810 return NULL;
811}
812EXPORT_SYMBOL(__dev_get_by_index);
813
814/**
815 * dev_get_by_index_rcu - find a device by its ifindex
816 * @net: the applicable net namespace
817 * @ifindex: index of device
818 *
819 * Search for an interface by index. Returns %NULL if the device
820 * is not found or a pointer to the device. The device has not
821 * had its reference counter increased so the caller must be careful
822 * about locking. The caller must hold RCU lock.
823 */
824
825struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826{
827 struct net_device *dev;
828 struct hlist_head *head = dev_index_hash(net, ifindex);
829
830 hlist_for_each_entry_rcu(dev, head, index_hlist)
831 if (dev->ifindex == ifindex)
832 return dev;
833
834 return NULL;
835}
836EXPORT_SYMBOL(dev_get_by_index_rcu);
837
838
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Look an interface up by index. Returns NULL when no device has
 *	that index. Otherwise the device is returned with a reference
 *	added via dev_hold(); the pointer stays safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *found;

	/* The lookup and the dev_hold() both happen inside the RCU
	 * read-side section.
	 */
	rcu_read_lock();
	found = dev_get_by_index_rcu(net, ifindex);
	if (found != NULL)
		dev_hold(found);
	rcu_read_unlock();

	return found;
}
EXPORT_SYMBOL(dev_get_by_index);
862
863/**
864 * netdev_get_name - get a netdevice name, knowing its ifindex.
865 * @net: network namespace
866 * @name: a pointer to the buffer where the name will be stored.
867 * @ifindex: the ifindex of the interface to get the name from.
868 *
869 * The use of raw_seqcount_begin() and cond_resched() before
870 * retrying is required as we want to give the writers a chance
871 * to complete when CONFIG_PREEMPT is not set.
872 */
873int netdev_get_name(struct net *net, char *name, int ifindex)
874{
875 struct net_device *dev;
876 unsigned int seq;
877
878retry:
879 seq = raw_seqcount_begin(&devnet_rename_seq);
880 rcu_read_lock();
881 dev = dev_get_by_index_rcu(net, ifindex);
882 if (!dev) {
883 rcu_read_unlock();
884 return -ENODEV;
885 }
886
887 strcpy(name, dev->name);
888 rcu_read_unlock();
889 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 cond_resched();
891 goto retry;
892 }
893
894 return 0;
895}
896
897/**
898 * dev_getbyhwaddr_rcu - find a device by its hardware address
899 * @net: the applicable net namespace
900 * @type: media type of device
901 * @ha: hardware address
902 *
903 * Search for an interface by MAC address. Returns NULL if the device
904 * is not found or a pointer to the device.
905 * The caller must hold RCU or RTNL.
906 * The returned device has not had its ref count increased
907 * and the caller must therefore be careful about locking
908 *
909 */
910
911struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 const char *ha)
913{
914 struct net_device *dev;
915
916 for_each_netdev_rcu(net, dev)
917 if (dev->type == type &&
918 !memcmp(dev->dev_addr, ha, dev->addr_len))
919 return dev;
920
921 return NULL;
922}
923EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
924
925struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926{
927 struct net_device *dev;
928
929 ASSERT_RTNL();
930 for_each_netdev(net, dev)
931 if (dev->type == type)
932 return dev;
933
934 return NULL;
935}
936EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937
938struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939{
940 struct net_device *dev, *ret = NULL;
941
942 rcu_read_lock();
943 for_each_netdev_rcu(net, dev)
944 if (dev->type == type) {
945 dev_hold(dev);
946 ret = dev;
947 break;
948 }
949 rcu_read_unlock();
950 return ret;
951}
952EXPORT_SYMBOL(dev_getfirstbyhwtype);
953
954/**
955 * __dev_get_by_flags - find any device with given flags
956 * @net: the applicable net namespace
957 * @if_flags: IFF_* values
958 * @mask: bitmask of bits in if_flags to check
959 *
960 * Search for any interface with the given flags. Returns NULL if a device
961 * is not found or a pointer to the device. Must be called inside
962 * rtnl_lock(), and result refcount is unchanged.
963 */
964
965struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 unsigned short mask)
967{
968 struct net_device *dev, *ret;
969
970 ASSERT_RTNL();
971
972 ret = NULL;
973 for_each_netdev(net, dev) {
974 if (((dev->flags ^ if_flags) & mask) == 0) {
975 ret = dev;
976 break;
977 }
978 }
979 return ret;
980}
981EXPORT_SYMBOL(__dev_get_by_flags);
982
983/**
984 * dev_valid_name - check if name is okay for network device
985 * @name: name string
986 *
987 * Network device names need to be valid file names to
988 * to allow sysfs to work. We also disallow any kind of
989 * whitespace.
990 */
991bool dev_valid_name(const char *name)
992{
993 if (*name == '\0')
994 return false;
995 if (strlen(name) >= IFNAMSIZ)
996 return false;
997 if (!strcmp(name, ".") || !strcmp(name, ".."))
998 return false;
999
1000 while (*name) {
1001 if (*name == '/' || *name == ':' || isspace(*name))
1002 return false;
1003 name++;
1004 }
1005 return true;
1006}
1007EXPORT_SYMBOL(dev_valid_name);
1008
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code:
 *	-EINVAL for a malformed format, -ENOMEM when the bitmap page cannot
 *	be allocated, -ENFILE when the resulting name is already in use.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	/* Stays 0 when @name holds no '%' within its first IFNAMSIZ-1 chars. */
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  The only '%' allowed is a single "%d";
		 * any other '%' character makes the format invalid.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			/* Skip devices whose name does not parse with the format. */
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		/* Lowest unit number not claimed by an existing device. */
		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	/* No formatting when the caller passed the same buffer for both. */
	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
1074
1075/**
1076 * dev_alloc_name - allocate a name for a device
1077 * @dev: device
1078 * @name: name format string
1079 *
1080 * Passed a format string - eg "lt%d" it will try and find a suitable
1081 * id. It scans list of devices to build up a free map, then chooses
1082 * the first empty slot. The caller must hold the dev_base or rtnl lock
1083 * while allocating the name and adding the device in order to avoid
1084 * duplicates.
1085 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086 * Returns the number of the unit assigned or a negative errno code.
1087 */
1088
1089int dev_alloc_name(struct net_device *dev, const char *name)
1090{
1091 char buf[IFNAMSIZ];
1092 struct net *net;
1093 int ret;
1094
1095 BUG_ON(!dev_net(dev));
1096 net = dev_net(dev);
1097 ret = __dev_alloc_name(net, name, buf);
1098 if (ret >= 0)
1099 strlcpy(dev->name, buf, IFNAMSIZ);
1100 return ret;
1101}
1102EXPORT_SYMBOL(dev_alloc_name);
1103
1104static int dev_alloc_name_ns(struct net *net,
1105 struct net_device *dev,
1106 const char *name)
1107{
1108 char buf[IFNAMSIZ];
1109 int ret;
1110
1111 ret = __dev_alloc_name(net, name, buf);
1112 if (ret >= 0)
1113 strlcpy(dev->name, buf, IFNAMSIZ);
1114 return ret;
1115}
1116
1117static int dev_get_valid_name(struct net *net,
1118 struct net_device *dev,
1119 const char *name)
1120{
1121 BUG_ON(!net);
1122
1123 if (!dev_valid_name(name))
1124 return -EINVAL;
1125
1126 if (strchr(name, '%'))
1127 return dev_alloc_name_ns(net, dev, name);
1128 else if (__dev_get_by_name(net, name))
1129 return -EEXIST;
1130 else if (dev->name != name)
1131 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133 return 0;
1134}
1135
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 *
 *	Must be called with RTNL held (asserted). Returns -EBUSY when the
 *	device is up, 0 when @newname equals the current name or the rename
 *	succeeded, the error from dev_get_valid_name() or device_rename(),
 *	or the errno of a NETDEV_CHANGENAME notifier after the old name has
 *	been restored.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	/* Every exit below that follows this must end the write section. */
	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	/* Remember the current name so it can be restored on failure. */
	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

	/* Entered a second time, via goto, when a notifier vetoes the new
	 * name; by then dev->name and oldname have been swapped.
	 */
rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	/* Unhash, wait for synchronize_rcu(), then rehash under the
	 * current dev->name.
	 */
	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			/* First failure: record it, swap the names and the
			 * assign types, and run the rename again in reverse.
			 */
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			/* The rollback itself was vetoed; only report it. */
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
1224
1225/**
1226 * dev_set_alias - change ifalias of a device
1227 * @dev: device
1228 * @alias: name up to IFALIASZ
1229 * @len: limit of bytes to copy from info
1230 *
1231 * Set ifalias for a device,
1232 */
1233int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234{
1235 char *new_ifalias;
1236
1237 ASSERT_RTNL();
1238
1239 if (len >= IFALIASZ)
1240 return -EINVAL;
1241
1242 if (!len) {
1243 kfree(dev->ifalias);
1244 dev->ifalias = NULL;
1245 return 0;
1246 }
1247
1248 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 if (!new_ifalias)
1250 return -ENOMEM;
1251 dev->ifalias = new_ifalias;
1252
1253 strlcpy(dev->ifalias, alias, len+1);
1254 return len;
1255}
1256
1257
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features. Sends
 *	%NETDEV_FEAT_CHANGE for @dev through call_netdevice_notifiers();
 *	the notifier result is discarded.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);
1269
1270/**
1271 * netdev_state_change - device changes state
1272 * @dev: device to cause notification
1273 *
1274 * Called to indicate a device has changed state. This function calls
1275 * the notifier chains for netdev_chain and sends a NEWLINK message
1276 * to the routing socket.
1277 */
1278void netdev_state_change(struct net_device *dev)
1279{
1280 if (dev->flags & IFF_UP) {
1281 struct netdev_notifier_change_info change_info;
1282
1283 change_info.flags_changed = 0;
1284 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 &change_info.info);
1286 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 }
1288}
1289EXPORT_SYMBOL(netdev_state_change);
1290
/**
 * 	netdev_notify_peers - notify network peers about existence of @dev
 * 	@dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 *
 * Takes and releases the RTNL lock itself around the
 * %NETDEV_NOTIFY_PEERS notification.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
1308
1309static int __dev_open(struct net_device *dev)
1310{
1311 const struct net_device_ops *ops = dev->netdev_ops;
1312 int ret;
1313
1314 ASSERT_RTNL();
1315
1316 if (!netif_device_present(dev))
1317 return -ENODEV;
1318
1319 /* Block netpoll from trying to do any rx path servicing.
1320 * If we don't do this there is a chance ndo_poll_controller
1321 * or ndo_poll may be running while we open the device
1322 */
1323 netpoll_poll_disable(dev);
1324
1325 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 ret = notifier_to_errno(ret);
1327 if (ret)
1328 return ret;
1329
1330 set_bit(__LINK_STATE_START, &dev->state);
1331
1332 if (ops->ndo_validate_addr)
1333 ret = ops->ndo_validate_addr(dev);
1334
1335 if (!ret && ops->ndo_open)
1336 ret = ops->ndo_open(dev);
1337
1338 netpoll_poll_enable(dev);
1339
1340 if (ret)
1341 clear_bit(__LINK_STATE_START, &dev->state);
1342 else {
1343 dev->flags |= IFF_UP;
1344 dev_set_rx_mode(dev);
1345 dev_activate(dev);
1346 add_device_randomness(dev->dev_addr, dev->addr_len);
1347 }
1348
1349 return ret;
1350}
1351
1352/**
1353 * dev_open - prepare an interface for use.
1354 * @dev: device to open
1355 *
1356 * Takes a device from down to up state. The device's private open
1357 * function is invoked and then the multicast lists are loaded. Finally
1358 * the device is moved into the up state and a %NETDEV_UP message is
1359 * sent to the netdev notifier chain.
1360 *
1361 * Calling this function on an active interface is a nop. On a failure
1362 * a negative errno code is returned.
1363 */
1364int dev_open(struct net_device *dev)
1365{
1366 int ret;
1367
1368 if (dev->flags & IFF_UP)
1369 return 0;
1370
1371 ret = __dev_open(dev);
1372 if (ret < 0)
1373 return ret;
1374
1375 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378 return ret;
1379}
1380EXPORT_SYMBOL(dev_open);
1381
/*
 * Take every device linked on @head through close_list down, without
 * sending NETDEV_DOWN. Must be called with RTNL held (asserted) and may
 * sleep. Always returns 0.
 *
 * Works in three passes: announce NETDEV_GOING_DOWN and clear
 * __LINK_STATE_START on each device, deactivate them all together, then
 * call each driver's ndo_stop() and clear IFF_UP.
 */
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		/* Pairs with the netpoll_poll_disable() in the first pass. */
		netpoll_poll_enable(dev);
	}

	return 0;
}
1427
1428static int __dev_close(struct net_device *dev)
1429{
1430 int retval;
1431 LIST_HEAD(single);
1432
1433 list_add(&dev->close_list, &single);
1434 retval = __dev_close_many(&single);
1435 list_del(&single);
1436
1437 return retval;
1438}
1439
1440int dev_close_many(struct list_head *head, bool unlink)
1441{
1442 struct net_device *dev, *tmp;
1443
1444 /* Remove the devices that don't need to be closed */
1445 list_for_each_entry_safe(dev, tmp, head, close_list)
1446 if (!(dev->flags & IFF_UP))
1447 list_del_init(&dev->close_list);
1448
1449 __dev_close_many(head);
1450
1451 list_for_each_entry_safe(dev, tmp, head, close_list) {
1452 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454 if (unlink)
1455 list_del_init(&dev->close_list);
1456 }
1457
1458 return 0;
1459}
1460EXPORT_SYMBOL(dev_close_many);
1461
1462/**
1463 * dev_close - shutdown an interface.
1464 * @dev: device to shutdown
1465 *
1466 * This function moves an active device into down state. A
1467 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469 * chain.
1470 */
1471int dev_close(struct net_device *dev)
1472{
1473 if (dev->flags & IFF_UP) {
1474 LIST_HEAD(single);
1475
1476 list_add(&dev->close_list, &single);
1477 dev_close_many(&single, true);
1478 list_del(&single);
1479 }
1480 return 0;
1481}
1482EXPORT_SYMBOL(dev_close);
1483
1484
1485/**
1486 * dev_disable_lro - disable Large Receive Offload on a device
1487 * @dev: device
1488 *
1489 * Disable Large Receive Offload (LRO) on a net device. Must be
1490 * called under RTNL. This is needed if received packets may be
1491 * forwarded to another interface.
1492 */
1493void dev_disable_lro(struct net_device *dev)
1494{
1495 struct net_device *lower_dev;
1496 struct list_head *iter;
1497
1498 dev->wanted_features &= ~NETIF_F_LRO;
1499 netdev_update_features(dev);
1500
1501 if (unlikely(dev->features & NETIF_F_LRO))
1502 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504 netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 dev_disable_lro(lower_dev);
1506}
1507EXPORT_SYMBOL(dev_disable_lro);
1508
1509static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 struct net_device *dev)
1511{
1512 struct netdev_notifier_info info;
1513
1514 netdev_notifier_info_init(&info, dev);
1515 return nb->notifier_call(nb, val, &info);
1516}
1517
/* While non-zero, register_netdevice_notifier() does not replay
 * REGISTER/UP events to a newly registered notifier.
 */
static int dev_boot_phase = 1;
1519
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 * 	When registered all registration and up events are replayed
 *	to the new notifier to allow the caller to have a race free
 *	view of the network device list. If the notifier rejects one of
 *	the replayed %NETDEV_REGISTER events, the devices already announced
 *	are withdrawn again and the notifier is unregistered.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	/* No replay during the boot phase. */
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	/* Walk the devices again in the same order and undo the replay
	 * for each one seen before the device that was rejected.
	 */
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587/**
1588 * unregister_netdevice_notifier - unregister a network notifier block
1589 * @nb: notifier
1590 *
1591 * Unregister a notifier previously registered by
1592 * register_netdevice_notifier(). The notifier is unlinked into the
1593 * kernel structures and may then be reused. A negative errno code
1594 * is returned on a failure.
1595 *
1596 * After unregistering unregister and down device events are synthesized
1597 * for all devices on the device list to the removed notifier to remove
1598 * the need for special case cleanup code.
1599 */
1600
1601int unregister_netdevice_notifier(struct notifier_block *nb)
1602{
1603 struct net_device *dev;
1604 struct net *net;
1605 int err;
1606
1607 rtnl_lock();
1608 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609 if (err)
1610 goto unlock;
1611
1612 for_each_net(net) {
1613 for_each_netdev(net, dev) {
1614 if (dev->flags & IFF_UP) {
1615 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 dev);
1617 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 }
1619 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620 }
1621 }
1622unlock:
1623 rtnl_unlock();
1624 return err;
1625}
1626EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain(). @info is initialised for
 *	@dev here before the chain is called. RTNL must be held (asserted).
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}
1646
1647/**
1648 * call_netdevice_notifiers - call all network notifier blocks
1649 * @val: value passed unmodified to notifier function
1650 * @dev: net_device pointer passed unmodified to notifier function
1651 *
1652 * Call all network notifier blocks. Parameters and return value
1653 * are as for raw_notifier_call_chain().
1654 */
1655
1656int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657{
1658 struct netdev_notifier_info info;
1659
1660 return call_netdevice_notifiers_info(val, dev, &info);
1661}
1662EXPORT_SYMBOL(call_netdevice_notifiers);
1663
#ifdef CONFIG_NET_INGRESS
/* Static key adjusted only through the two helpers below. */
static struct static_key ingress_needed __read_mostly;

/* Calls static_key_slow_inc() on the ingress_needed key. */
void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Calls static_key_slow_dec() on the ingress_needed key. */
void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
1679
#ifdef CONFIG_NET_EGRESS
/* Static key adjusted only through the two helpers below. */
static struct static_key egress_needed __read_mostly;

/* Calls static_key_slow_inc() on the egress_needed key. */
void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Calls static_key_slow_dec() on the egress_needed key. */
void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
1695
/* Static key tested by net_timestamp_set() and net_timestamp_check(). */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* Net sum of enable/disable requests not yet folded into netstamp_wanted. */
static atomic_t netstamp_needed_deferred;
/* Running count of timestamp users; the key is enabled while it is > 0. */
static atomic_t netstamp_wanted;

/* Work handler: fold the deferred requests into netstamp_wanted and
 * switch the netstamp_needed key on or off to match the new count.
 */
static void netstamp_clear(struct work_struct *work)
{
	/* Take everything deferred so far and reset the counter to 0. */
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted;

	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
		static_key_enable(&netstamp_needed);
	else
		static_key_disable(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif
1713
1714void net_enable_timestamp(void)
1715{
1716#ifdef HAVE_JUMP_LABEL
1717 int wanted;
1718
1719 while (1) {
1720 wanted = atomic_read(&netstamp_wanted);
1721 if (wanted <= 0)
1722 break;
1723 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1724 return;
1725 }
1726 atomic_inc(&netstamp_needed_deferred);
1727 schedule_work(&netstamp_work);
1728#else
1729 static_key_slow_inc(&netstamp_needed);
1730#endif
1731}
1732EXPORT_SYMBOL(net_enable_timestamp);
1733
1734void net_disable_timestamp(void)
1735{
1736#ifdef HAVE_JUMP_LABEL
1737 int wanted;
1738
1739 while (1) {
1740 wanted = atomic_read(&netstamp_wanted);
1741 if (wanted <= 1)
1742 break;
1743 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1744 return;
1745 }
1746 atomic_dec(&netstamp_needed_deferred);
1747 schedule_work(&netstamp_work);
1748#else
1749 static_key_slow_dec(&netstamp_needed);
1750#endif
1751}
1752EXPORT_SYMBOL(net_disable_timestamp);
1753
/* Clear @skb's timestamp, then take a fresh one with __net_timestamp()
 * if the netstamp_needed key is enabled.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}
1760
/*
 * Timestamp SKB when the netstamp_needed key is enabled, COND is true
 * and SKB has no timestamp yet.
 *
 * NOTE(review): this expands to a bare "if" statement rather than a
 * do { } while (0) block, so take care when using it next to an "else".
 */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp)	\
			__net_timestamp(SKB);		\
	}						\
1766
1767bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1768{
1769 unsigned int len;
1770
1771 if (!(dev->flags & IFF_UP))
1772 return false;
1773
1774 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1775 if (skb->len <= len)
1776 return true;
1777
1778 /* if TSO is enabled, we don't care about the length as the packet
1779 * could be forwarded without being segmented before
1780 */
1781 if (skb_is_gso(skb))
1782 return true;
1783
1784 return false;
1785}
1786EXPORT_SYMBOL_GPL(is_skb_forwardable);
1787
1788int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1789{
1790 int ret = ____dev_forward_skb(dev, skb);
1791
1792 if (likely(!ret)) {
1793 skb->protocol = eth_type_trans(skb, dev);
1794 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1795 }
1796
1797 return ret;
1798}
1799EXPORT_SYMBOL_GPL(__dev_forward_skb);
1800
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	/* Only queue the packet when the preparation step succeeded. */
	if (err)
		return err;

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1824
/*
 * Hand @skb to the packet handler @pt_prev.
 *
 * Returns -ENOMEM when skb_orphan_frags() fails. Otherwise one extra
 * count is taken on skb->users before the handler is called, and the
 * handler's return value is passed back.
 */
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
1834
1835static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1836 struct packet_type **pt,
1837 struct net_device *orig_dev,
1838 __be16 type,
1839 struct list_head *ptype_list)
1840{
1841 struct packet_type *ptype, *pt_prev = *pt;
1842
1843 list_for_each_entry_rcu(ptype, ptype_list, list) {
1844 if (ptype->type != type)
1845 continue;
1846 if (pt_prev)
1847 deliver_skb(skb, pt_prev, orig_dev);
1848 pt_prev = ptype;
1849 }
1850 *pt = pt_prev;
1851}
1852
1853static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1854{
1855 if (!ptype->af_packet_priv || !skb->sk)
1856 return false;
1857
1858 if (ptype->id_match)
1859 return ptype->id_match(ptype, skb->sk);
1860 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1861 return true;
1862
1863 return false;
1864}
1865
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 *
 *	The global ptype_all list is walked first, then @dev's own
 *	ptype_all list. @skb is cloned once, when the first eligible tap
 *	is found, and every tap receives that clone.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	/* Tap found but not yet served; delivery lags one tap behind. */
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	/* Second pass: the taps bound to this device. */
	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	/* The last tap is called directly rather than via deliver_skb(). */
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1929
1930/**
1931 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1932 * @dev: Network device
1933 * @txq: number of queues available
1934 *
1935 * If real_num_tx_queues is changed the tc mappings may no longer be
1936 * valid. To resolve this verify the tc mapping remains valid and if
1937 * not NULL the mapping. With no priorities mapping to this
1938 * offset/count pair it will no longer be used. In the worst case TC0
1939 * is invalid nothing can be done so disable priority mappings. If is
1940 * expected that drivers will fix this mapping if they can before
1941 * calling netif_set_real_num_tx_queues.
1942 */
1943static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1944{
1945 int i;
1946 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1947
1948 /* If TC0 is invalidated disable TC mapping */
1949 if (tc->offset + tc->count > txq) {
1950 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1951 dev->num_tc = 0;
1952 return;
1953 }
1954
1955 /* Invalidated prio to tc mappings set to TC0 */
1956 for (i = 1; i < TC_BITMASK + 1; i++) {
1957 int q = netdev_get_prio_tc_map(dev, i);
1958
1959 tc = &dev->tc_to_txq[q];
1960 if (tc->offset + tc->count > txq) {
1961 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1962 i, q);
1963 netdev_set_prio_tc_map(dev, i, 0);
1964 }
1965 }
1966}
1967
1968int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
1969{
1970 if (dev->num_tc) {
1971 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1972 int i;
1973
1974 for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
1975 if ((txq - tc->offset) < tc->count)
1976 return i;
1977 }
1978
1979 return -1;
1980 }
1981
1982 return 0;
1983}
1984
1985#ifdef CONFIG_XPS
/* Mutex named in the lockdep check of xmap_dereference(). */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference RCU pointer P; lockdep checks that xps_map_mutex is held. */
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1989
/*
 * Remove queue @index from the map stored at dev_maps->cpu_map[@tci].
 *
 * Returns false when there is no map in that slot, or when @index was
 * the map's only entry and the map has therefore been freed. Returns
 * true when a map remains in the slot, whether or not @index was found.
 */
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
			     int tci, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[tci]);
	if (!map)
		return false;

	/* Scan from the last entry down to the first. */
	for (pos = map->len; pos--;) {
		if (map->queues[pos] != index)
			continue;

		if (map->len > 1) {
			/* Fill the hole with the last entry and shrink. */
			map->queues[pos] = map->queues[--map->len];
			break;
		}

		/* Last entry gone: unpublish the map and free it via RCU. */
		RCU_INIT_POINTER(dev_maps->cpu_map[tci], NULL);
		kfree_rcu(map, rcu);
		return false;
	}

	return true;
}
2017
2018static bool remove_xps_queue_cpu(struct net_device *dev,
2019 struct xps_dev_maps *dev_maps,
2020 int cpu, u16 offset, u16 count)
2021{
2022 int num_tc = dev->num_tc ? : 1;
2023 bool active = false;
2024 int tci;
2025
2026 for (tci = cpu * num_tc; num_tc--; tci++) {
2027 int i, j;
2028
2029 for (i = count, j = offset; i--; j++) {
2030 if (!remove_xps_queue(dev_maps, cpu, j))
2031 break;
2032 }
2033
2034 active |= i < 0;
2035 }
2036
2037 return active;
2038}
2039
2040static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2041 u16 count)
2042{
2043 struct xps_dev_maps *dev_maps;
2044 int cpu, i;
2045 bool active = false;
2046
2047 mutex_lock(&xps_map_mutex);
2048 dev_maps = xmap_dereference(dev->xps_maps);
2049
2050 if (!dev_maps)
2051 goto out_no_maps;
2052
2053 for_each_possible_cpu(cpu)
2054 active |= remove_xps_queue_cpu(dev, dev_maps, cpu,
2055 offset, count);
2056
2057 if (!active) {
2058 RCU_INIT_POINTER(dev->xps_maps, NULL);
2059 kfree_rcu(dev_maps, rcu);
2060 }
2061
2062 for (i = offset + (count - 1); count--; i--)
2063 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2064 NUMA_NO_NODE);
2065
2066out_no_maps:
2067 mutex_unlock(&xps_map_mutex);
2068}
2069
2070static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2071{
2072 netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2073}
2074
2075static struct xps_map *expand_xps_map(struct xps_map *map,
2076 int cpu, u16 index)
2077{
2078 struct xps_map *new_map;
2079 int alloc_len = XPS_MIN_MAP_ALLOC;
2080 int i, pos;
2081
2082 for (pos = 0; map && pos < map->len; pos++) {
2083 if (map->queues[pos] != index)
2084 continue;
2085 return map;
2086 }
2087
2088 /* Need to add queue to this CPU's existing map */
2089 if (map) {
2090 if (pos < map->alloc_len)
2091 return map;
2092
2093 alloc_len = map->alloc_len * 2;
2094 }
2095
2096 /* Need to allocate new map to store queue on this CPU's map */
2097 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2098 cpu_to_node(cpu));
2099 if (!new_map)
2100 return NULL;
2101
2102 for (i = 0; i < pos; i++)
2103 new_map->queues[i] = map->queues[i];
2104 new_map->alloc_len = alloc_len;
2105 new_map->len = pos;
2106
2107 return new_map;
2108}
2109
/**
 *	netif_set_xps_queue - assign a TX queue to a set of CPUs for XPS
 *	@dev: network device
 *	@mask: CPUs that should use the queue (only online CPUs are honoured)
 *	@index: TX queue index
 *
 *	Builds a new per-device XPS table in which @index is listed for every
 *	online CPU in @mask (in the traffic class that owns @index), publishes
 *	it with rcu_assign_pointer(), frees replaced maps via kfree_rcu(), and
 *	finally removes @index from all other CPU/traffic-class slots.
 *
 *	Returns 0 on success, -EINVAL if @index belongs to no traffic class,
 *	or -ENOMEM if a map could not be allocated.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	/* numa_node_id: -2 = no CPU seen yet, -1 = CPUs span several nodes */
	int i, cpu, tci, numa_node_id = -2;
	int maps_sz, num_tc = 1, tc = 0;
	struct xps_map *map, *new_map;
	bool active = false;

	if (dev->num_tc) {
		num_tc = dev->num_tc;
		tc = netdev_txq_to_tc(dev, index);
		if (tc < 0)
			return -EINVAL;
	}

	/* table is sized per traffic class, but never below a cache line */
	maps_sz = XPS_DEV_MAPS_SIZE(num_tc);
	if (maps_sz < L1_CACHE_BYTES)
		maps_sz = L1_CACHE_BYTES;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_cpu_and(cpu, cpu_online_mask, mask) {
		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		tci = cpu * num_tc + tc;
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[tci]) :
				 NULL;

		/* may return the old map itself if it already has room */
		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
	}

	/* no online CPU in @mask: only the removal pass below applies */
	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		/* copy maps belonging to foreign traffic classes */
		for (i = tc, tci = cpu * num_tc; dev_maps && i--; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}

		/* We need to explicitly update tci as previous loop
		 * could break out early if dev_maps is NULL.
		 */
		tci = cpu * num_tc + tc;

		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			/* append only if the queue is not already listed */
			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}

		/* copy maps belonging to foreign traffic classes */
		for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[tci], map);
		}
	}

	/* publish the fully populated table */
	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (!dev_maps)
		goto out_no_old_maps;

	for_each_possible_cpu(cpu) {
		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			map = xmap_dereference(dev_maps->cpu_map[tci]);
			/* free only maps that were replaced, not carried over */
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}
	}

	kfree_rcu(dev_maps, rcu);

out_no_old_maps:
	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		/* slots of traffic classes below tc */
		for (i = tc, tci = cpu * num_tc; i--; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
		/* slot of tc itself, only for CPUs that were not selected */
		if (!cpumask_test_cpu(cpu, mask) || !cpu_online(cpu))
			active |= remove_xps_queue(dev_maps, tci, index);
		/* slots of traffic classes above tc */
		for (i = num_tc - tc, tci++; --i; tci++)
			active |= remove_xps_queue(dev_maps, tci, index);
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		for (i = num_tc, tci = cpu * num_tc; i--; tci++) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[tci]);
			map = dev_maps ?
			      xmap_dereference(dev_maps->cpu_map[tci]) :
			      NULL;
			/* new_dev_maps was never published, so plain kfree() */
			if (new_map && new_map != map)
				kfree(new_map);
		}
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
2269
2270#endif
2271void netdev_reset_tc(struct net_device *dev)
2272{
2273#ifdef CONFIG_XPS
2274 netif_reset_xps_queues_gt(dev, 0);
2275#endif
2276 dev->num_tc = 0;
2277 memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2278 memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2279}
2280EXPORT_SYMBOL(netdev_reset_tc);
2281
2282int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2283{
2284 if (tc >= dev->num_tc)
2285 return -EINVAL;
2286
2287#ifdef CONFIG_XPS
2288 netif_reset_xps_queues(dev, offset, count);
2289#endif
2290 dev->tc_to_txq[tc].count = count;
2291 dev->tc_to_txq[tc].offset = offset;
2292 return 0;
2293}
2294EXPORT_SYMBOL(netdev_set_tc_queue);
2295
2296int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2297{
2298 if (num_tc > TC_MAX_QUEUE)
2299 return -EINVAL;
2300
2301#ifdef CONFIG_XPS
2302 netif_reset_xps_queues_gt(dev, 0);
2303#endif
2304 dev->num_tc = num_tc;
2305 return 0;
2306}
2307EXPORT_SYMBOL(netdev_set_num_tc);
2308
2309/*
2310 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2311 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2312 */
2313int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2314{
2315 int rc;
2316
2317 if (txq < 1 || txq > dev->num_tx_queues)
2318 return -EINVAL;
2319
2320 if (dev->reg_state == NETREG_REGISTERED ||
2321 dev->reg_state == NETREG_UNREGISTERING) {
2322 ASSERT_RTNL();
2323
2324 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2325 txq);
2326 if (rc)
2327 return rc;
2328
2329 if (dev->num_tc)
2330 netif_setup_tc(dev, txq);
2331
2332 if (txq < dev->real_num_tx_queues) {
2333 qdisc_reset_all_tx_gt(dev, txq);
2334#ifdef CONFIG_XPS
2335 netif_reset_xps_queues_gt(dev, txq);
2336#endif
2337 }
2338 }
2339
2340 dev->real_num_tx_queues = txq;
2341 return 0;
2342}
2343EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2344
#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	Must be called with the rtnl_lock held, or before the net device
 *	is registered.  Returns 0 on success or a negative error code:
 *	-EINVAL when @rxq is 0 or larger than num_rx_queues, otherwise
 *	whatever the RX queue kobject update reports.  The kobject update
 *	is skipped for a device that is not registered.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;

	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
2377
2378/**
2379 * netif_get_num_default_rss_queues - default number of RSS queues
2380 *
2381 * This routine should set an upper limit on the number of RSS queues
2382 * used by default by multiqueue devices.
2383 */
2384int netif_get_num_default_rss_queues(void)
2385{
2386 return is_kdump_kernel() ?
2387 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2388}
2389EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2390
/* Append qdisc @q to the tail of this CPU's softnet output queue and
 * raise NET_TX_SOFTIRQ.  The list update is done with local interrupts
 * disabled.
 */
static void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	/* q becomes the new tail: terminate it, link it, advance tailp */
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
2404
2405void __netif_schedule(struct Qdisc *q)
2406{
2407 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2408 __netif_reschedule(q);
2409}
2410EXPORT_SYMBOL(__netif_schedule);
2411
/* View of skb->cb used while an skb is queued for deferred freeing:
 * get_kfree_skb_cb() casts skb->cb to this type so the free reason can
 * be stored alongside the skb.
 */
struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};
2415
2416static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2417{
2418 return (struct dev_kfree_skb_cb *)skb->cb;
2419}
2420
2421void netif_schedule_queue(struct netdev_queue *txq)
2422{
2423 rcu_read_lock();
2424 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2425 struct Qdisc *q = rcu_dereference(txq->qdisc);
2426
2427 __netif_schedule(q);
2428 }
2429 rcu_read_unlock();
2430}
2431EXPORT_SYMBOL(netif_schedule_queue);
2432
2433/**
2434 * netif_wake_subqueue - allow sending packets on subqueue
2435 * @dev: network device
2436 * @queue_index: sub queue index
2437 *
2438 * Resume individual transmit queue of a device with multiple transmit queues.
2439 */
2440void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2441{
2442 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2443
2444 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2445 struct Qdisc *q;
2446
2447 rcu_read_lock();
2448 q = rcu_dereference(txq->qdisc);
2449 __netif_schedule(q);
2450 rcu_read_unlock();
2451 }
2452}
2453EXPORT_SYMBOL(netif_wake_subqueue);
2454
2455void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2456{
2457 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2458 struct Qdisc *q;
2459
2460 rcu_read_lock();
2461 q = rcu_dereference(dev_queue->qdisc);
2462 __netif_schedule(q);
2463 rcu_read_unlock();
2464 }
2465}
2466EXPORT_SYMBOL(netif_tx_wake_queue);
2467
/* Release one reference on @skb; if it was the last one, defer the actual
 * free by pushing @skb onto this CPU's softnet completion queue and raising
 * NET_TX_SOFTIRQ.  @reason is stored in skb->cb for whoever drains the
 * queue.
 */
void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		/* sole owner: skip the atomic decrement and store 0 directly */
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		/* other references remain, nothing to free */
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	/* push onto the head of the per-CPU completion list */
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);
2486
2487void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2488{
2489 if (in_irq() || irqs_disabled())
2490 __dev_kfree_skb_irq(skb, reason);
2491 else
2492 dev_kfree_skb(skb);
2493}
2494EXPORT_SYMBOL(__dev_kfree_skb_any);
2495
2496
2497/**
2498 * netif_device_detach - mark device as removed
2499 * @dev: network device
2500 *
2501 * Mark device as removed from system and therefore no longer available.
2502 */
2503void netif_device_detach(struct net_device *dev)
2504{
2505 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2506 netif_running(dev)) {
2507 netif_tx_stop_all_queues(dev);
2508 }
2509}
2510EXPORT_SYMBOL(netif_device_detach);
2511
2512/**
2513 * netif_device_attach - mark device as attached
2514 * @dev: network device
2515 *
2516 * Mark device as attached from system and restart if needed.
2517 */
2518void netif_device_attach(struct net_device *dev)
2519{
2520 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2521 netif_running(dev)) {
2522 netif_tx_wake_all_queues(dev);
2523 __netdev_watchdog_up(dev);
2524 }
2525}
2526EXPORT_SYMBOL(netif_device_attach);
2527
2528/*
2529 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2530 * to be used as a distribution range.
2531 */
2532u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2533 unsigned int num_tx_queues)
2534{
2535 u32 hash;
2536 u16 qoffset = 0;
2537 u16 qcount = num_tx_queues;
2538
2539 if (skb_rx_queue_recorded(skb)) {
2540 hash = skb_get_rx_queue(skb);
2541 while (unlikely(hash >= num_tx_queues))
2542 hash -= num_tx_queues;
2543 return hash;
2544 }
2545
2546 if (dev->num_tc) {
2547 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2548 qoffset = dev->tc_to_txq[tc].offset;
2549 qcount = dev->tc_to_txq[tc].count;
2550 }
2551
2552 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2553}
2554EXPORT_SYMBOL(__skb_tx_hash);
2555
2556static void skb_warn_bad_offload(const struct sk_buff *skb)
2557{
2558 static const netdev_features_t null_features;
2559 struct net_device *dev = skb->dev;
2560 const char *name = "";
2561
2562 if (!net_ratelimit())
2563 return;
2564
2565 if (dev) {
2566 if (dev->dev.parent)
2567 name = dev_driver_string(dev->dev.parent);
2568 else
2569 name = netdev_name(dev);
2570 }
2571 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2572 "gso_type=%d ip_summed=%d\n",
2573 name, dev ? &dev->features : &null_features,
2574 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2575 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2576 skb_shinfo(skb)->gso_type, skb->ip_summed);
2577}
2578
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * Returns 0 on success, -EINVAL if the skb is a GSO packet, or the error
 * from linearizing / expanding the skb head.  On success ip_summed is
 * left as CHECKSUM_NONE.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	/* nothing to compute here, only the ip_summed state is reset */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	/* a GSO skb cannot be checksummed as one packet */
	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	/* checksum everything from the checksum start to the end of skb */
	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	/* offset now points at the checksum field itself */
	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	/* get a private, writable head before storing into it */
	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	/* a folded value of 0 is stored as CSUM_MANGLED_0 instead */
	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
2626
2627__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2628{
2629 __be16 type = skb->protocol;
2630
2631 /* Tunnel gso handlers can set protocol to ethernet. */
2632 if (type == htons(ETH_P_TEB)) {
2633 struct ethhdr *eth;
2634
2635 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636 return 0;
2637
2638 eth = (struct ethhdr *)skb_mac_header(skb);
2639 type = eth->h_proto;
2640 }
2641
2642 return __vlan_get_protocol(skb, type, depth);
2643}
2644
2645/**
2646 * skb_mac_gso_segment - mac layer segmentation handler.
2647 * @skb: buffer to segment
2648 * @features: features for the output path (see dev->features)
2649 */
2650struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2651 netdev_features_t features)
2652{
2653 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2654 struct packet_offload *ptype;
2655 int vlan_depth = skb->mac_len;
2656 __be16 type = skb_network_protocol(skb, &vlan_depth);
2657
2658 if (unlikely(!type))
2659 return ERR_PTR(-EINVAL);
2660
2661 __skb_pull(skb, vlan_depth);
2662
2663 rcu_read_lock();
2664 list_for_each_entry_rcu(ptype, &offload_base, list) {
2665 if (ptype->type == type && ptype->callbacks.gso_segment) {
2666 segs = ptype->callbacks.gso_segment(skb, features);
2667 break;
2668 }
2669 }
2670 rcu_read_unlock();
2671
2672 __skb_push(skb, skb->data - skb_mac_header(skb));
2673
2674 return segs;
2675}
2676EXPORT_SYMBOL(skb_mac_gso_segment);
2677
2678
2679/* openvswitch calls this on rx path, so we need a different check.
2680 */
2681static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682{
2683 if (tx_path)
2684 return skb->ip_summed != CHECKSUM_PARTIAL;
2685 else
2686 return skb->ip_summed == CHECKSUM_NONE;
2687}
2688
2689/**
2690 * __skb_gso_segment - Perform segmentation on skb.
2691 * @skb: buffer to segment
2692 * @features: features for the output path (see dev->features)
2693 * @tx_path: whether it is called in TX path
2694 *
2695 * This function segments the given skb and returns a list of segments.
2696 *
2697 * It may return NULL if the skb requires no segmentation. This is
2698 * only possible when GSO is used for verifying header integrity.
2699 *
2700 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2701 */
2702struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2703 netdev_features_t features, bool tx_path)
2704{
2705 if (unlikely(skb_needs_check(skb, tx_path))) {
2706 int err;
2707
2708 skb_warn_bad_offload(skb);
2709
2710 err = skb_cow_head(skb, 0);
2711 if (err < 0)
2712 return ERR_PTR(err);
2713 }
2714
2715 /* Only report GSO partial support if it will enable us to
2716 * support segmentation on this frame without needing additional
2717 * work.
2718 */
2719 if (features & NETIF_F_GSO_PARTIAL) {
2720 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2721 struct net_device *dev = skb->dev;
2722
2723 partial_features |= dev->features & dev->gso_partial_features;
2724 if (!skb_gso_ok(skb, features | partial_features))
2725 features &= ~NETIF_F_GSO_PARTIAL;
2726 }
2727
2728 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2729 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2730
2731 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2732 SKB_GSO_CB(skb)->encap_level = 0;
2733
2734 skb_reset_mac_header(skb);
2735 skb_reset_mac_len(skb);
2736
2737 return skb_mac_gso_segment(skb, features);
2738}
2739EXPORT_SYMBOL(__skb_gso_segment);
2740
/* Report a hardware receive checksum failure: rate-limited error message
 * naming the device (or "<unknown>") followed by a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2752
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if some page fragment of @skb cannot be DMA'd by @dev:
 * either it lives in high memory and the device lacks NETIF_F_HIGHDMA,
 * or (physical-bus case) it lies beyond the parent device's DMA mask.
 * Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int f;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (f = 0; f < shinfo->nr_frags; f++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[f])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;

		for (f = 0; f < shinfo->nr_frags; f++) {
			skb_frag_t *frag = &shinfo->frags[f];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!pdev->dma_mask ||
			    addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2785
2786/* If MPLS offload request, verify we are testing hardware MPLS features
2787 * instead of standard features for the netdev.
2788 */
2789#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2790static netdev_features_t net_mpls_features(struct sk_buff *skb,
2791 netdev_features_t features,
2792 __be16 type)
2793{
2794 if (eth_p_mpls(type))
2795 features &= skb->dev->mpls_features;
2796
2797 return features;
2798}
2799#else
2800static netdev_features_t net_mpls_features(struct sk_buff *skb,
2801 netdev_features_t features,
2802 __be16 type)
2803{
2804 return features;
2805}
2806#endif
2807
2808static netdev_features_t harmonize_features(struct sk_buff *skb,
2809 netdev_features_t features)
2810{
2811 int tmp;
2812 __be16 type;
2813
2814 type = skb_network_protocol(skb, &tmp);
2815 features = net_mpls_features(skb, features, type);
2816
2817 if (skb->ip_summed != CHECKSUM_NONE &&
2818 !can_checksum_protocol(features, type)) {
2819 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2820 }
2821 if (illegal_highdma(skb->dev, skb))
2822 features &= ~NETIF_F_SG;
2823
2824 return features;
2825}
2826
/* Feature-check callback that imposes no restriction: returns @features
 * unchanged, ignoring @skb and @dev.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);
2834
/* Default feature check, used by netif_skb_features() when the driver
 * provides no ndo_features_check: delegates to vlan_features_check().
 * @dev is unused.
 */
static netdev_features_t dflt_features_check(const struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}
2841
2842static netdev_features_t gso_features_check(const struct sk_buff *skb,
2843 struct net_device *dev,
2844 netdev_features_t features)
2845{
2846 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2847
2848 if (gso_segs > dev->gso_max_segs)
2849 return features & ~NETIF_F_GSO_MASK;
2850
2851 /* Support for GSO partial features requires software
2852 * intervention before we can actually process the packets
2853 * so we need to strip support for any partial features now
2854 * and we can pull them back in after we have partially
2855 * segmented the frame.
2856 */
2857 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2858 features &= ~dev->gso_partial_features;
2859
2860 /* Make sure to clear the IPv4 ID mangling feature if the
2861 * IPv4 header has the potential to be fragmented.
2862 */
2863 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2864 struct iphdr *iph = skb->encapsulation ?
2865 inner_ip_hdr(skb) : ip_hdr(skb);
2866
2867 if (!(iph->frag_off & htons(IP_DF)))
2868 features &= ~NETIF_F_TSO_MANGLEID;
2869 }
2870
2871 return features;
2872}
2873
2874netdev_features_t netif_skb_features(struct sk_buff *skb)
2875{
2876 struct net_device *dev = skb->dev;
2877 netdev_features_t features = dev->features;
2878
2879 if (skb_is_gso(skb))
2880 features = gso_features_check(skb, dev, features);
2881
2882 /* If encapsulation offload request, verify we are testing
2883 * hardware encapsulation features instead of standard
2884 * features for the netdev
2885 */
2886 if (skb->encapsulation)
2887 features &= dev->hw_enc_features;
2888
2889 if (skb_vlan_tagged(skb))
2890 features = netdev_intersect_features(features,
2891 dev->vlan_features |
2892 NETIF_F_HW_VLAN_CTAG_TX |
2893 NETIF_F_HW_VLAN_STAG_TX);
2894
2895 if (dev->netdev_ops->ndo_features_check)
2896 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897 features);
2898 else
2899 features &= dflt_features_check(skb, dev, features);
2900
2901 return harmonize_features(skb, features);
2902}
2903EXPORT_SYMBOL(netif_skb_features);
2904
2905static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2906 struct netdev_queue *txq, bool more)
2907{
2908 unsigned int len;
2909 int rc;
2910
2911 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2912 dev_queue_xmit_nit(skb, dev);
2913
2914 len = skb->len;
2915 trace_net_dev_start_xmit(skb, dev);
2916 rc = netdev_start_xmit(skb, dev, txq, more);
2917 trace_net_dev_xmit(skb, rc, dev, len);
2918
2919 return rc;
2920}
2921
2922struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2923 struct netdev_queue *txq, int *ret)
2924{
2925 struct sk_buff *skb = first;
2926 int rc = NETDEV_TX_OK;
2927
2928 while (skb) {
2929 struct sk_buff *next = skb->next;
2930
2931 skb->next = NULL;
2932 rc = xmit_one(skb, dev, txq, next != NULL);
2933 if (unlikely(!dev_xmit_complete(rc))) {
2934 skb->next = next;
2935 goto out;
2936 }
2937
2938 skb = next;
2939 if (netif_xmit_stopped(txq) && skb) {
2940 rc = NETDEV_TX_BUSY;
2941 break;
2942 }
2943 }
2944
2945out:
2946 *ret = rc;
2947 return skb;
2948}
2949
2950static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2951 netdev_features_t features)
2952{
2953 if (skb_vlan_tag_present(skb) &&
2954 !vlan_hw_offload_capable(features, skb->vlan_proto))
2955 skb = __vlan_hwaccel_push_inside(skb);
2956 return skb;
2957}
2958
2959static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2960{
2961 netdev_features_t features;
2962
2963 features = netif_skb_features(skb);
2964 skb = validate_xmit_vlan(skb, features);
2965 if (unlikely(!skb))
2966 goto out_null;
2967
2968 if (netif_needs_gso(skb, features)) {
2969 struct sk_buff *segs;
2970
2971 segs = skb_gso_segment(skb, features);
2972 if (IS_ERR(segs)) {
2973 goto out_kfree_skb;
2974 } else if (segs) {
2975 consume_skb(skb);
2976 skb = segs;
2977 }
2978 } else {
2979 if (skb_needs_linearize(skb, features) &&
2980 __skb_linearize(skb))
2981 goto out_kfree_skb;
2982
2983 /* If packet is not checksummed and device does not
2984 * support checksumming for this protocol, complete
2985 * checksumming here.
2986 */
2987 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2988 if (skb->encapsulation)
2989 skb_set_inner_transport_header(skb,
2990 skb_checksum_start_offset(skb));
2991 else
2992 skb_set_transport_header(skb,
2993 skb_checksum_start_offset(skb));
2994 if (!(features & NETIF_F_CSUM_MASK) &&
2995 skb_checksum_help(skb))
2996 goto out_kfree_skb;
2997 }
2998 }
2999
3000 return skb;
3001
3002out_kfree_skb:
3003 kfree_skb(skb);
3004out_null:
3005 atomic_long_inc(&dev->tx_dropped);
3006 return NULL;
3007}
3008
3009struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3010{
3011 struct sk_buff *next, *head = NULL, *tail;
3012
3013 for (; skb != NULL; skb = next) {
3014 next = skb->next;
3015 skb->next = NULL;
3016
3017 /* in case skb wont be segmented, point to itself */
3018 skb->prev = skb;
3019
3020 skb = validate_xmit_skb(skb, dev);
3021 if (!skb)
3022 continue;
3023
3024 if (!head)
3025 head = skb;
3026 else
3027 tail->next = skb;
3028 /* If skb was segmented, skb->prev points to
3029 * the last segment. If not, it still contains skb.
3030 */
3031 tail = skb->prev;
3032 }
3033 return head;
3034}
3035EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3036
3037static void qdisc_pkt_len_init(struct sk_buff *skb)
3038{
3039 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3040
3041 qdisc_skb_cb(skb)->pkt_len = skb->len;
3042
3043 /* To get more precise estimation of bytes sent on wire,
3044 * we add to pkt_len the headers size of all segments
3045 */
3046 if (shinfo->gso_size) {
3047 unsigned int hdr_len;
3048 u16 gso_segs = shinfo->gso_segs;
3049
3050 /* mac layer + network layer */
3051 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3052
3053 /* + transport layer */
3054 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3055 hdr_len += tcp_hdrlen(skb);
3056 else
3057 hdr_len += sizeof(struct udphdr);
3058
3059 if (shinfo->gso_type & SKB_GSO_DODGY)
3060 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3061 shinfo->gso_size);
3062
3063 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3064 }
3065}
3066
/* Enqueue @skb on qdisc @q (or transmit it directly when the qdisc can be
 * bypassed) and run the qdisc if nobody else is running it.
 *
 * Returns NET_XMIT_DROP if the qdisc is deactivated, NET_XMIT_SUCCESS on
 * the bypass path, otherwise the masked result of q->enqueue().  Skbs
 * collected on the to_free list are freed after the root lock is dropped.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	struct sk_buff *to_free = NULL;
	bool contended;
	int rc;

	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits qdisc->running owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		__qdisc_drop(skb, &to_free);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			/* release busylock before the longer qdisc run */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			/* release busylock before the longer qdisc run */
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	/* free dropped skbs outside the root lock */
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	/* busylock still held if neither run path above released it */
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}
3128
3129#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3130static void skb_update_prio(struct sk_buff *skb)
3131{
3132 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3133
3134 if (!skb->priority && skb->sk && map) {
3135 unsigned int prioidx =
3136 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3137
3138 if (prioidx < map->priomap_len)
3139 skb->priority = map->priomap[prioidx];
3140 }
3141}
3142#else
3143#define skb_update_prio(skb)
3144#endif
3145
/* Per-CPU integer counter, exported to modules.
 * NOTE(review): presumably tracks the nesting depth of transmit calls on
 * the current CPU; its users are outside this part of the file, so confirm.
 */
DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);
3148
/**
 *	dev_loopback_xmit - loop back @skb
 *	@net: network namespace this loopback is happening in
 *	@sk:  sk needed to be a netfilter okfn
 *	@skb: buffer to transmit
 *
 *	Re-injects @skb into the receive path via netif_rx_ni() as a
 *	PACKET_LOOPBACK packet.  Neither @net nor @sk is used in the body.
 *	Always returns 0.
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	/* current data pointer becomes the MAC header, then skip to L3 */
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	/* mark the checksum as not needing verification on receive */
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	/* a looped-back skb is expected to still carry its dst entry */
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
3167
#ifdef CONFIG_NET_EGRESS
/*
 * Run the device's egress classifier chain over @skb.
 *
 * Returns @skb when transmission should continue, or NULL when the
 * classifier verdict disposed of the buffer; in that case *@ret holds
 * the NET_XMIT_* code to hand back to the sender.
 */
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
	struct tcf_proto *filter = rcu_dereference_bh(dev->egress_cl_list);
	struct tcf_result res;
	int verdict;

	/* No egress filters attached to this device. */
	if (!filter)
		return skb;

	/* The caller has already set skb->tc_verd and
	 * qdisc_skb_cb(skb)->pkt_len.
	 */
	qdisc_bstats_cpu_update(filter->q, skb);

	verdict = tc_classify(skb, filter, &res, false);
	switch (verdict) {
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filter->q);
		*ret = NET_XMIT_DROP;
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		*ret = NET_XMIT_SUCCESS;
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* No need to push/pop skb's mac_header here on egress! */
		skb_do_redirect(skb);
		*ret = NET_XMIT_SUCCESS;
		return NULL;
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	default:
		break;
	}

	return skb;
}
#endif /* CONFIG_NET_EGRESS */
3210
/*
 * Look up a transmit queue for @skb in the device's XPS maps.
 *
 * The map slot is chosen from skb->sender_cpu - 1, further scaled and
 * offset by traffic class when the device has num_tc set.  Returns the
 * queue index, or -1 when XPS is compiled out, no map applies, or the
 * mapped queue is not below real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	unsigned int tci;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	tci = skb->sender_cpu - 1;
	if (dev->num_tc) {
		tci *= dev->num_tc;
		tci += netdev_get_prio_tc_map(dev, skb->priority);
	}

	map = rcu_dereference(dev_maps->cpu_map[tci]);
	if (!map)
		goto unlock;

	/* A single-entry map needs no hashing. */
	if (map->len == 1)
		queue_index = map->queues[0];
	else
		queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
							   map->len)];

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
3246
3247static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3248{
3249 struct sock *sk = skb->sk;
3250 int queue_index = sk_tx_queue_get(sk);
3251
3252 if (queue_index < 0 || skb->ooo_okay ||
3253 queue_index >= dev->real_num_tx_queues) {
3254 int new_index = get_xps_queue(dev, skb);
3255 if (new_index < 0)
3256 new_index = skb_tx_hash(dev, skb);
3257
3258 if (queue_index != new_index && sk &&
3259 sk_fullsock(sk) &&
3260 rcu_access_pointer(sk->sk_dst_cache))
3261 sk_tx_queue_set(sk, new_index);
3262
3263 queue_index = new_index;
3264 }
3265
3266 return queue_index;
3267}
3268
3269struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3270 struct sk_buff *skb,
3271 void *accel_priv)
3272{
3273 int queue_index = 0;
3274
3275#ifdef CONFIG_XPS
3276 u32 sender_cpu = skb->sender_cpu - 1;
3277
3278 if (sender_cpu >= (u32)NR_CPUS)
3279 skb->sender_cpu = raw_smp_processor_id() + 1;
3280#endif
3281
3282 if (dev->real_num_tx_queues != 1) {
3283 const struct net_device_ops *ops = dev->netdev_ops;
3284 if (ops->ndo_select_queue)
3285 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3286 __netdev_pick_tx);
3287 else
3288 queue_index = __netdev_pick_tx(dev, skb);
3289
3290 if (!accel_priv)
3291 queue_index = netdev_cap_txqueue(dev, queue_index);
3292 }
3293
3294 skb_set_queue_mapping(skb, queue_index);
3295 return netdev_get_tx_queue(dev, queue_index);
3296}
3297
3298/**
3299 * __dev_queue_xmit - transmit a buffer
3300 * @skb: buffer to transmit
3301 * @accel_priv: private data used for L2 forwarding offload
3302 *
3303 * Queue a buffer for transmission to a network device. The caller must
3304 * have set the device and priority and built the buffer before calling
3305 * this function. The function can be called from an interrupt.
3306 *
3307 * A negative errno code is returned on a failure. A success does not
3308 * guarantee the frame will be transmitted as it may be dropped due
3309 * to congestion or traffic shaping.
3310 *
3311 * -----------------------------------------------------------------------------------
3312 * I notice this method can also return errors from the queue disciplines,
3313 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3314 * be positive.
3315 *
3316 * Regardless of the return value, the skb is consumed, so it is currently
3317 * difficult to retry a send to this method. (You can bump the ref count
3318 * before sending to hold a reference for retry if you are careful.)
3319 *
3320 * When calling this method, interrupts MUST be enabled. This is because
3321 * the BH enable code must have IRQs enabled so that it will not deadlock.
3322 * --BLG
3323 */
3324static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3325{
3326 struct net_device *dev = skb->dev;
3327 struct netdev_queue *txq;
3328 struct Qdisc *q;
3329 int rc = -ENOMEM;
3330
3331 skb_reset_mac_header(skb);
3332
3333 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3334 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3335
3336 /* Disable soft irqs for various locks below. Also
3337 * stops preemption for RCU.
3338 */
3339 rcu_read_lock_bh();
3340
3341 skb_update_prio(skb);
3342
3343 qdisc_pkt_len_init(skb);
3344#ifdef CONFIG_NET_CLS_ACT
3345 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3346# ifdef CONFIG_NET_EGRESS
3347 if (static_key_false(&egress_needed)) {
3348 skb = sch_handle_egress(skb, &rc, dev);
3349 if (!skb)
3350 goto out;
3351 }
3352# endif
3353#endif
3354 /* If device/qdisc don't need skb->dst, release it right now while
3355 * its hot in this cpu cache.
3356 */
3357 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3358 skb_dst_drop(skb);
3359 else
3360 skb_dst_force(skb);
3361
3362 txq = netdev_pick_tx(dev, skb, accel_priv);
3363 q = rcu_dereference_bh(txq->qdisc);
3364
3365 trace_net_dev_queue(skb);
3366 if (q->enqueue) {
3367 rc = __dev_xmit_skb(skb, q, dev, txq);
3368 goto out;
3369 }
3370
3371 /* The device has no queue. Common case for software devices:
3372 loopback, all the sorts of tunnels...
3373
3374 Really, it is unlikely that netif_tx_lock protection is necessary
3375 here. (f.e. loopback and IP tunnels are clean ignoring statistics
3376 counters.)
3377 However, it is possible, that they rely on protection
3378 made by us here.
3379
3380 Check this and shot the lock. It is not prone from deadlocks.
3381 Either shot noqueue qdisc, it is even simpler 8)
3382 */
3383 if (dev->flags & IFF_UP) {
3384 int cpu = smp_processor_id(); /* ok because BHs are off */
3385
3386 if (txq->xmit_lock_owner != cpu) {
3387 if (unlikely(__this_cpu_read(xmit_recursion) >
3388 XMIT_RECURSION_LIMIT))
3389 goto recursion_alert;
3390
3391 skb = validate_xmit_skb(skb, dev);
3392 if (!skb)
3393 goto out;
3394
3395 HARD_TX_LOCK(dev, txq, cpu);
3396
3397 if (!netif_xmit_stopped(txq)) {
3398 __this_cpu_inc(xmit_recursion);
3399 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3400 __this_cpu_dec(xmit_recursion);
3401 if (dev_xmit_complete(rc)) {
3402 HARD_TX_UNLOCK(dev, txq);
3403 goto out;
3404 }
3405 }
3406 HARD_TX_UNLOCK(dev, txq);
3407 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3408 dev->name);
3409 } else {
3410 /* Recursion is detected! It is possible,
3411 * unfortunately
3412 */
3413recursion_alert:
3414 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3415 dev->name);
3416 }
3417 }
3418
3419 rc = -ENETDOWN;
3420 rcu_read_unlock_bh();
3421
3422 atomic_long_inc(&dev->tx_dropped);
3423 kfree_skb_list(skb);
3424 return rc;
3425out:
3426 rcu_read_unlock_bh();
3427 return rc;
3428}
3429
3430int dev_queue_xmit(struct sk_buff *skb)
3431{
3432 return __dev_queue_xmit(skb, NULL);
3433}
3434EXPORT_SYMBOL(dev_queue_xmit);
3435
/*
 * Queue @skb for transmission, handing @accel_priv (private data used
 * for L2 forwarding offload) through to __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	int rc = __dev_queue_xmit(skb, accel_priv);

	return rc;
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
3441
3442
3443/*=======================================================================
3444 Receiver routines
3445 =======================================================================*/
3446
3447int netdev_max_backlog __read_mostly = 1000;
3448EXPORT_SYMBOL(netdev_max_backlog);
3449
3450int netdev_tstamp_prequeue __read_mostly = 1;
3451int netdev_budget __read_mostly = 300;
3452int weight_p __read_mostly = 64; /* old backlog weight */
3453
3454/* Called with irq disabled */
3455static inline void ____napi_schedule(struct softnet_data *sd,
3456 struct napi_struct *napi)
3457{
3458 list_add_tail(&napi->poll_list, &sd->poll_list);
3459 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3460}
3461
3462#ifdef CONFIG_RPS
3463
3464/* One global table that all flow-based protocols share. */
3465struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3466EXPORT_SYMBOL(rps_sock_flow_table);
3467u32 rps_cpu_mask __read_mostly;
3468EXPORT_SYMBOL(rps_cpu_mask);
3469
3470struct static_key rps_needed __read_mostly;
3471EXPORT_SYMBOL(rps_needed);
3472struct static_key rfs_needed __read_mostly;
3473EXPORT_SYMBOL(rfs_needed);
3474
3475static struct rps_dev_flow *
3476set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3477 struct rps_dev_flow *rflow, u16 next_cpu)
3478{
3479 if (next_cpu < nr_cpu_ids) {
3480#ifdef CONFIG_RFS_ACCEL
3481 struct netdev_rx_queue *rxqueue;
3482 struct rps_dev_flow_table *flow_table;
3483 struct rps_dev_flow *old_rflow;
3484 u32 flow_id;
3485 u16 rxq_index;
3486 int rc;
3487
3488 /* Should we steer this flow to a different hardware queue? */
3489 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3490 !(dev->features & NETIF_F_NTUPLE))
3491 goto out;
3492 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3493 if (rxq_index == skb_get_rx_queue(skb))
3494 goto out;
3495
3496 rxqueue = dev->_rx + rxq_index;
3497 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3498 if (!flow_table)
3499 goto out;
3500 flow_id = skb_get_hash(skb) & flow_table->mask;
3501 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3502 rxq_index, flow_id);
3503 if (rc < 0)
3504 goto out;
3505 old_rflow = rflow;
3506 rflow = &flow_table->flows[flow_id];
3507 rflow->filter = rc;
3508 if (old_rflow->filter == rflow->filter)
3509 old_rflow->filter = RPS_NO_FILTER;
3510 out:
3511#endif
3512 rflow->last_qtail =
3513 per_cpu(softnet_data, next_cpu).input_queue_head;
3514 }
3515
3516 rflow->cpu = next_cpu;
3517 return rflow;
3518}
3519
3520/*
3521 * get_rps_cpu is called from netif_receive_skb and returns the target
3522 * CPU from the RPS map of the receiving queue for a given skb.
3523 * rcu_read_lock must be held on entry.
3524 */
3525static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3526 struct rps_dev_flow **rflowp)
3527{
3528 const struct rps_sock_flow_table *sock_flow_table;
3529 struct netdev_rx_queue *rxqueue = dev->_rx;
3530 struct rps_dev_flow_table *flow_table;
3531 struct rps_map *map;
3532 int cpu = -1;
3533 u32 tcpu;
3534 u32 hash;
3535
3536 if (skb_rx_queue_recorded(skb)) {
3537 u16 index = skb_get_rx_queue(skb);
3538
3539 if (unlikely(index >= dev->real_num_rx_queues)) {
3540 WARN_ONCE(dev->real_num_rx_queues > 1,
3541 "%s received packet on queue %u, but number "
3542 "of RX queues is %u\n",
3543 dev->name, index, dev->real_num_rx_queues);
3544 goto done;
3545 }
3546 rxqueue += index;
3547 }
3548
3549 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3550
3551 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3552 map = rcu_dereference(rxqueue->rps_map);
3553 if (!flow_table && !map)
3554 goto done;
3555
3556 skb_reset_network_header(skb);
3557 hash = skb_get_hash(skb);
3558 if (!hash)
3559 goto done;
3560
3561 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3562 if (flow_table && sock_flow_table) {
3563 struct rps_dev_flow *rflow;
3564 u32 next_cpu;
3565 u32 ident;
3566
3567 /* First check into global flow table if there is a match */
3568 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3569 if ((ident ^ hash) & ~rps_cpu_mask)
3570 goto try_rps;
3571
3572 next_cpu = ident & rps_cpu_mask;
3573
3574 /* OK, now we know there is a match,
3575 * we can look at the local (per receive queue) flow table
3576 */
3577 rflow = &flow_table->flows[hash & flow_table->mask];
3578 tcpu = rflow->cpu;
3579
3580 /*
3581 * If the desired CPU (where last recvmsg was done) is
3582 * different from current CPU (one in the rx-queue flow
3583 * table entry), switch if one of the following holds:
3584 * - Current CPU is unset (>= nr_cpu_ids).
3585 * - Current CPU is offline.
3586 * - The current CPU's queue tail has advanced beyond the
3587 * last packet that was enqueued using this table entry.
3588 * This guarantees that all previous packets for the flow
3589 * have been dequeued, thus preserving in order delivery.
3590 */
3591 if (unlikely(tcpu != next_cpu) &&
3592 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3593 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3594 rflow->last_qtail)) >= 0)) {
3595 tcpu = next_cpu;
3596 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3597 }
3598
3599 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3600 *rflowp = rflow;
3601 cpu = tcpu;
3602 goto done;
3603 }
3604 }
3605
3606try_rps:
3607
3608 if (map) {
3609 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3610 if (cpu_online(tcpu)) {
3611 cpu = tcpu;
3612 goto done;
3613 }
3614 }
3615
3616done:
3617 return cpu;
3618}
3619
3620#ifdef CONFIG_RFS_ACCEL
3621
3622/**
3623 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3624 * @dev: Device on which the filter was set
3625 * @rxq_index: RX queue index
3626 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3627 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3628 *
3629 * Drivers that implement ndo_rx_flow_steer() should periodically call
3630 * this function for each installed filter and remove the filters for
3631 * which it returns %true.
3632 */
3633bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3634 u32 flow_id, u16 filter_id)
3635{
3636 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3637 struct rps_dev_flow_table *flow_table;
3638 struct rps_dev_flow *rflow;
3639 bool expire = true;
3640 unsigned int cpu;
3641
3642 rcu_read_lock();
3643 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3644 if (flow_table && flow_id <= flow_table->mask) {
3645 rflow = &flow_table->flows[flow_id];
3646 cpu = ACCESS_ONCE(rflow->cpu);
3647 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3648 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3649 rflow->last_qtail) <
3650 (int)(10 * flow_table->mask)))
3651 expire = false;
3652 }
3653 rcu_read_unlock();
3654 return expire;
3655}
3656EXPORT_SYMBOL(rps_may_expire_flow);
3657
3658#endif /* CONFIG_RFS_ACCEL */
3659
3660/* Called from hardirq (IPI) context */
3661static void rps_trigger_softirq(void *data)
3662{
3663 struct softnet_data *sd = data;
3664
3665 ____napi_schedule(sd, &sd->backlog);
3666 sd->received_rps++;
3667}
3668
3669#endif /* CONFIG_RPS */
3670
/*
 * Check whether @sd belongs to another CPU.
 * If so, chain it onto this CPU's IPI list, raise NET_RX_SOFTIRQ and
 * return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *self = this_cpu_ptr(&softnet_data);

	if (sd == self)
		return 0;

	/* Push @sd on the head of our singly linked IPI list. */
	sd->rps_ipi_next = self->rps_ipi_list;
	self->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3691
#ifdef CONFIG_NET_FLOW_LIMIT
/* NOTE(review): presumably the flow limit bucket table size - confirm. */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3695
3696static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3697{
3698#ifdef CONFIG_NET_FLOW_LIMIT
3699 struct sd_flow_limit *fl;
3700 struct softnet_data *sd;
3701 unsigned int old_flow, new_flow;
3702
3703 if (qlen < (netdev_max_backlog >> 1))
3704 return false;
3705
3706 sd = this_cpu_ptr(&softnet_data);
3707
3708 rcu_read_lock();
3709 fl = rcu_dereference(sd->flow_limit);
3710 if (fl) {
3711 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3712 old_flow = fl->history[fl->history_head];
3713 fl->history[fl->history_head] = new_flow;
3714
3715 fl->history_head++;
3716 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3717
3718 if (likely(fl->buckets[old_flow]))
3719 fl->buckets[old_flow]--;
3720
3721 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3722 fl->count++;
3723 rcu_read_unlock();
3724 return true;
3725 }
3726 }
3727 rcu_read_unlock();
3728#endif
3729 return false;
3730}
3731
3732/*
3733 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3734 * queue (may be a remote CPU queue).
3735 */
3736static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3737 unsigned int *qtail)
3738{
3739 struct softnet_data *sd;
3740 unsigned long flags;
3741 unsigned int qlen;
3742
3743 sd = &per_cpu(softnet_data, cpu);
3744
3745 local_irq_save(flags);
3746
3747 rps_lock(sd);
3748 if (!netif_running(skb->dev))
3749 goto drop;
3750 qlen = skb_queue_len(&sd->input_pkt_queue);
3751 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3752 if (qlen) {
3753enqueue:
3754 __skb_queue_tail(&sd->input_pkt_queue, skb);
3755 input_queue_tail_incr_save(sd, qtail);
3756 rps_unlock(sd);
3757 local_irq_restore(flags);
3758 return NET_RX_SUCCESS;
3759 }
3760
3761 /* Schedule NAPI for backlog device
3762 * We can use non atomic operation since we own the queue lock
3763 */
3764 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3765 if (!rps_ipi_queued(sd))
3766 ____napi_schedule(sd, &sd->backlog);
3767 }
3768 goto enqueue;
3769 }
3770
3771drop:
3772 sd->dropped++;
3773 rps_unlock(sd);
3774
3775 local_irq_restore(flags);
3776
3777 atomic_long_inc(&skb->dev->rx_dropped);
3778 kfree_skb(skb);
3779 return NET_RX_DROP;
3780}
3781
3782static int netif_rx_internal(struct sk_buff *skb)
3783{
3784 int ret;
3785
3786 net_timestamp_check(netdev_tstamp_prequeue, skb);
3787
3788 trace_netif_rx(skb);
3789#ifdef CONFIG_RPS
3790 if (static_key_false(&rps_needed)) {
3791 struct rps_dev_flow voidflow, *rflow = &voidflow;
3792 int cpu;
3793
3794 preempt_disable();
3795 rcu_read_lock();
3796
3797 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3798 if (cpu < 0)
3799 cpu = smp_processor_id();
3800
3801 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3802
3803 rcu_read_unlock();
3804 preempt_enable();
3805 } else
3806#endif
3807 {
3808 unsigned int qtail;
3809 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3810 put_cpu();
3811 }
3812 return ret;
3813}
3814
/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped)
 *
 */
int netif_rx(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_entry(skb);

	ret = netif_rx_internal(skb);
	return ret;
}
EXPORT_SYMBOL(netif_rx);
3837
/*
 * Variant of netif_rx() that, with preemption disabled, queues @skb and
 * then runs any softirq left pending on the local CPU.  Returns the
 * result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();

	ret = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3853
3854static __latent_entropy void net_tx_action(struct softirq_action *h)
3855{
3856 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3857
3858 if (sd->completion_queue) {
3859 struct sk_buff *clist;
3860
3861 local_irq_disable();
3862 clist = sd->completion_queue;
3863 sd->completion_queue = NULL;
3864 local_irq_enable();
3865
3866 while (clist) {
3867 struct sk_buff *skb = clist;
3868 clist = clist->next;
3869
3870 WARN_ON(atomic_read(&skb->users));
3871 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3872 trace_consume_skb(skb);
3873 else
3874 trace_kfree_skb(skb, net_tx_action);
3875
3876 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3877 __kfree_skb(skb);
3878 else
3879 __kfree_skb_defer(skb);
3880 }
3881
3882 __kfree_skb_flush();
3883 }
3884
3885 if (sd->output_queue) {
3886 struct Qdisc *head;
3887
3888 local_irq_disable();
3889 head = sd->output_queue;
3890 sd->output_queue = NULL;
3891 sd->output_queue_tailp = &sd->output_queue;
3892 local_irq_enable();
3893
3894 while (head) {
3895 struct Qdisc *q = head;
3896 spinlock_t *root_lock;
3897
3898 head = head->next_sched;
3899
3900 root_lock = qdisc_lock(q);
3901 spin_lock(root_lock);
3902 /* We need to make sure head->next_sched is read
3903 * before clearing __QDISC_STATE_SCHED
3904 */
3905 smp_mb__before_atomic();
3906 clear_bit(__QDISC_STATE_SCHED, &q->state);
3907 qdisc_run(q);
3908 spin_unlock(root_lock);
3909 }
3910 }
3911}
3912
3913#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3914/* This hook is defined here for ATM LANE */
3915int (*br_fdb_test_addr_hook)(struct net_device *dev,
3916 unsigned char *addr) __read_mostly;
3917EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3918#endif
3919
/*
 * Run the receiving device's ingress classifier chain over @skb.
 *
 * A pending *@pt_prev handler is delivered to first (result stored in
 * *@ret) and cleared.  Returns @skb when receive processing should
 * continue, or NULL when the classifier verdict disposed of the buffer.
 * Without CONFIG_NET_CLS_ACT the skb is returned untouched.
 */
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
		   struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *filter = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result res;
	int verdict;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!filter)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(filter->q, skb);

	verdict = tc_classify(skb, filter, &res, false);
	switch (verdict) {
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(filter->q);
		kfree_skb(skb);
		return NULL;
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		consume_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(res.classid);
		break;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}
3971
3972/**
3973 * netdev_is_rx_handler_busy - check if receive handler is registered
3974 * @dev: device to check
3975 *
3976 * Check if a receive handler is already registered for a given device.
3977 * Return true if there one.
3978 *
3979 * The caller must hold the rtnl_mutex.
3980 */
3981bool netdev_is_rx_handler_busy(struct net_device *dev)
3982{
3983 ASSERT_RTNL();
3984 return dev && rtnl_dereference(dev->rx_handler);
3985}
3986EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3987
3988/**
3989 * netdev_rx_handler_register - register receive handler
3990 * @dev: device to register a handler for
3991 * @rx_handler: receive handler to register
3992 * @rx_handler_data: data pointer that is used by rx handler
3993 *
3994 * Register a receive handler for a device. This handler will then be
3995 * called from __netif_receive_skb. A negative errno code is returned
3996 * on a failure.
3997 *
3998 * The caller must hold the rtnl_mutex.
3999 *
4000 * For a general description of rx_handler, see enum rx_handler_result.
4001 */
4002int netdev_rx_handler_register(struct net_device *dev,
4003 rx_handler_func_t *rx_handler,
4004 void *rx_handler_data)
4005{
4006 ASSERT_RTNL();
4007
4008 if (dev->rx_handler)
4009 return -EBUSY;
4010
4011 /* Note: rx_handler_data must be set before rx_handler */
4012 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4013 rcu_assign_pointer(dev->rx_handler, rx_handler);
4014
4015 return 0;
4016}
4017EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4018
4019/**
4020 * netdev_rx_handler_unregister - unregister receive handler
4021 * @dev: device to unregister a handler from
4022 *
4023 * Unregister a receive handler from a device.
4024 *
4025 * The caller must hold the rtnl_mutex.
4026 */
4027void netdev_rx_handler_unregister(struct net_device *dev)
4028{
4029
4030 ASSERT_RTNL();
4031 RCU_INIT_POINTER(dev->rx_handler, NULL);
4032 /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
4033 * section has a guarantee to see a non NULL rx_handler_data
4034 * as well.
4035 */
4036 synchronize_net();
4037 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4038}
4039EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4040
4041/*
4042 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4043 * the special handling of PFMEMALLOC skbs.
4044 */
4045static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4046{
4047 switch (skb->protocol) {
4048 case htons(ETH_P_ARP):
4049 case htons(ETH_P_IP):
4050 case htons(ETH_P_IPV6):
4051 case htons(ETH_P_8021Q):
4052 case htons(ETH_P_8021AD):
4053 return true;
4054 default:
4055 return false;
4056 }
4057}
4058
/*
 * Run the netfilter ingress hook on @skb when one is active for it.
 *
 * A pending *@pt_prev handler is delivered to first (result stored in
 * *@ret) and cleared.  Returns the hook's verdict, or 0 when no hook is
 * active or CONFIG_NETFILTER_INGRESS is off.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	int verdict;

	if (!nf_hook_ingress_active(skb))
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	rcu_read_lock();
	verdict = nf_hook_ingress(skb);
	rcu_read_unlock();

	return verdict;
#else
	return 0;
#endif /* CONFIG_NETFILTER_INGRESS */
}
4079
4080static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4081{
4082 struct packet_type *ptype, *pt_prev;
4083 rx_handler_func_t *rx_handler;
4084 struct net_device *orig_dev;
4085 bool deliver_exact = false;
4086 int ret = NET_RX_DROP;
4087 __be16 type;
4088
4089 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4090
4091 trace_netif_receive_skb(skb);
4092
4093 orig_dev = skb->dev;
4094
4095 skb_reset_network_header(skb);
4096 if (!skb_transport_header_was_set(skb))
4097 skb_reset_transport_header(skb);
4098 skb_reset_mac_len(skb);
4099
4100 pt_prev = NULL;
4101
4102another_round:
4103 skb->skb_iif = skb->dev->ifindex;
4104
4105 __this_cpu_inc(softnet_data.processed);
4106
4107 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4108 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4109 skb = skb_vlan_untag(skb);
4110 if (unlikely(!skb))
4111 goto out;
4112 }
4113
4114#ifdef CONFIG_NET_CLS_ACT
4115 if (skb->tc_verd & TC_NCLS) {
4116 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4117 goto ncls;
4118 }
4119#endif
4120
4121 if (pfmemalloc)
4122 goto skip_taps;
4123
4124 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4125 if (pt_prev)
4126 ret = deliver_skb(skb, pt_prev, orig_dev);
4127 pt_prev = ptype;
4128 }
4129
4130 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4131 if (pt_prev)
4132 ret = deliver_skb(skb, pt_prev, orig_dev);
4133 pt_prev = ptype;
4134 }
4135
4136skip_taps:
4137#ifdef CONFIG_NET_INGRESS
4138 if (static_key_false(&ingress_needed)) {
4139 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4140 if (!skb)
4141 goto out;
4142
4143 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4144 goto out;
4145 }
4146#endif
4147#ifdef CONFIG_NET_CLS_ACT
4148 skb->tc_verd = 0;
4149ncls:
4150#endif
4151 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4152 goto drop;
4153
4154 if (skb_vlan_tag_present(skb)) {
4155 if (pt_prev) {
4156 ret = deliver_skb(skb, pt_prev, orig_dev);
4157 pt_prev = NULL;
4158 }
4159 if (vlan_do_receive(&skb))
4160 goto another_round;
4161 else if (unlikely(!skb))
4162 goto out;
4163 }
4164
4165 rx_handler = rcu_dereference(skb->dev->rx_handler);
4166 if (rx_handler) {
4167 if (pt_prev) {
4168 ret = deliver_skb(skb, pt_prev, orig_dev);
4169 pt_prev = NULL;
4170 }
4171 switch (rx_handler(&skb)) {
4172 case RX_HANDLER_CONSUMED:
4173 ret = NET_RX_SUCCESS;
4174 goto out;
4175 case RX_HANDLER_ANOTHER:
4176 goto another_round;
4177 case RX_HANDLER_EXACT:
4178 deliver_exact = true;
4179 case RX_HANDLER_PASS:
4180 break;
4181 default:
4182 BUG();
4183 }
4184 }
4185
4186 if (unlikely(skb_vlan_tag_present(skb))) {
4187 if (skb_vlan_tag_get_id(skb))
4188 skb->pkt_type = PACKET_OTHERHOST;
4189 /* Note: we might in the future use prio bits
4190 * and set skb->priority like in vlan_do_receive()
4191 * For the time being, just ignore Priority Code Point
4192 */
4193 skb->vlan_tci = 0;
4194 }
4195
4196 type = skb->protocol;
4197
4198 /* deliver only exact match when indicated */
4199 if (likely(!deliver_exact)) {
4200 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4201 &ptype_base[ntohs(type) &
4202 PTYPE_HASH_MASK]);
4203 }
4204
4205 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4206 &orig_dev->ptype_specific);
4207
4208 if (unlikely(skb->dev != orig_dev)) {
4209 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4210 &skb->dev->ptype_specific);
4211 }
4212
4213 if (pt_prev) {
4214 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4215 goto drop;
4216 else
4217 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4218 } else {
4219drop:
4220 if (!deliver_exact)
4221 atomic_long_inc(&skb->dev->rx_dropped);
4222 else
4223 atomic_long_inc(&skb->dev->rx_nohandler);
4224 kfree_skb(skb);
4225 /* Jamal, now you will not able to escape explaining
4226 * me how you were going to use this. :-)
4227 */
4228 ret = NET_RX_DROP;
4229 }
4230
4231out:
4232 return ret;
4233}
4234
4235static int __netif_receive_skb(struct sk_buff *skb)
4236{
4237 int ret;
4238
4239 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4240 unsigned long pflags = current->flags;
4241
4242 /*
4243 * PFMEMALLOC skbs are special, they should
4244 * - be delivered to SOCK_MEMALLOC sockets only
4245 * - stay away from userspace
4246 * - have bounded memory usage
4247 *
4248 * Use PF_MEMALLOC as this saves us from propagating the allocation
4249 * context down to all allocation sites.
4250 */
4251 current->flags |= PF_MEMALLOC;
4252 ret = __netif_receive_skb_core(skb, true);
4253 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4254 } else
4255 ret = __netif_receive_skb_core(skb, false);
4256
4257 return ret;
4258}
4259
4260static int netif_receive_skb_internal(struct sk_buff *skb)
4261{
4262 int ret;
4263
4264 net_timestamp_check(netdev_tstamp_prequeue, skb);
4265
4266 if (skb_defer_rx_timestamp(skb))
4267 return NET_RX_SUCCESS;
4268
4269 rcu_read_lock();
4270
4271#ifdef CONFIG_RPS
4272 if (static_key_false(&rps_needed)) {
4273 struct rps_dev_flow voidflow, *rflow = &voidflow;
4274 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4275
4276 if (cpu >= 0) {
4277 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4278 rcu_read_unlock();
4279 return ret;
4280 }
4281 }
4282#endif
4283 ret = __netif_receive_skb(skb);
4284 rcu_read_unlock();
4285 return ret;
4286}
4287
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	trace_netif_receive_skb_entry(skb);

	ret = netif_receive_skb_internal(skb);
	return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
4310
4311DEFINE_PER_CPU(struct work_struct, flush_works);
4312
4313/* Network device is going away, flush any packets still pending */
4314static void flush_backlog(struct work_struct *work)
4315{
4316 struct sk_buff *skb, *tmp;
4317 struct softnet_data *sd;
4318
4319 local_bh_disable();
4320 sd = this_cpu_ptr(&softnet_data);
4321
4322 local_irq_disable();
4323 rps_lock(sd);
4324 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4325 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4326 __skb_unlink(skb, &sd->input_pkt_queue);
4327 kfree_skb(skb);
4328 input_queue_head_incr(sd);
4329 }
4330 }
4331 rps_unlock(sd);
4332 local_irq_enable();
4333
4334 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4335 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4336 __skb_unlink(skb, &sd->process_queue);
4337 kfree_skb(skb);
4338 input_queue_head_incr(sd);
4339 }
4340 }
4341 local_bh_enable();
4342}
4343
4344static void flush_all_backlogs(void)
4345{
4346 unsigned int cpu;
4347
4348 get_online_cpus();
4349
4350 for_each_online_cpu(cpu)
4351 queue_work_on(cpu, system_highpri_wq,
4352 per_cpu_ptr(&flush_works, cpu));
4353
4354 for_each_online_cpu(cpu)
4355 flush_work(per_cpu_ptr(&flush_works, cpu));
4356
4357 put_online_cpus();
4358}
4359
4360static int napi_gro_complete(struct sk_buff *skb)
4361{
4362 struct packet_offload *ptype;
4363 __be16 type = skb->protocol;
4364 struct list_head *head = &offload_base;
4365 int err = -ENOENT;
4366
4367 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4368
4369 if (NAPI_GRO_CB(skb)->count == 1) {
4370 skb_shinfo(skb)->gso_size = 0;
4371 goto out;
4372 }
4373
4374 rcu_read_lock();
4375 list_for_each_entry_rcu(ptype, head, list) {
4376 if (ptype->type != type || !ptype->callbacks.gro_complete)
4377 continue;
4378
4379 err = ptype->callbacks.gro_complete(skb, 0);
4380 break;
4381 }
4382 rcu_read_unlock();
4383
4384 if (err) {
4385 WARN_ON(&ptype->list == head);
4386 kfree_skb(skb);
4387 return NET_RX_SUCCESS;
4388 }
4389
4390out:
4391 return netif_receive_skb_internal(skb);
4392}
4393
4394/* napi->gro_list contains packets ordered by age.
4395 * youngest packets at the head of it.
4396 * Complete skbs in reverse order to reduce latencies.
4397 */
4398void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4399{
4400 struct sk_buff *skb, *prev = NULL;
4401
4402 /* scan list and build reverse chain */
4403 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4404 skb->prev = prev;
4405 prev = skb;
4406 }
4407
4408 for (skb = prev; skb; skb = prev) {
4409 skb->next = NULL;
4410
4411 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4412 return;
4413
4414 prev = skb->prev;
4415 napi_gro_complete(skb);
4416 napi->gro_count--;
4417 }
4418
4419 napi->gro_list = NULL;
4420}
4421EXPORT_SYMBOL(napi_gro_flush);
4422
4423static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4424{
4425 struct sk_buff *p;
4426 unsigned int maclen = skb->dev->hard_header_len;
4427 u32 hash = skb_get_hash_raw(skb);
4428
4429 for (p = napi->gro_list; p; p = p->next) {
4430 unsigned long diffs;
4431
4432 NAPI_GRO_CB(p)->flush = 0;
4433
4434 if (hash != skb_get_hash_raw(p)) {
4435 NAPI_GRO_CB(p)->same_flow = 0;
4436 continue;
4437 }
4438
4439 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4440 diffs |= p->vlan_tci ^ skb->vlan_tci;
4441 diffs |= skb_metadata_dst_cmp(p, skb);
4442 if (maclen == ETH_HLEN)
4443 diffs |= compare_ether_header(skb_mac_header(p),
4444 skb_mac_header(skb));
4445 else if (!diffs)
4446 diffs = memcmp(skb_mac_header(p),
4447 skb_mac_header(skb),
4448 maclen);
4449 NAPI_GRO_CB(p)->same_flow = !diffs;
4450 }
4451}
4452
4453static void skb_gro_reset_offset(struct sk_buff *skb)
4454{
4455 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4456 const skb_frag_t *frag0 = &pinfo->frags[0];
4457
4458 NAPI_GRO_CB(skb)->data_offset = 0;
4459 NAPI_GRO_CB(skb)->frag0 = NULL;
4460 NAPI_GRO_CB(skb)->frag0_len = 0;
4461
4462 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4463 pinfo->nr_frags &&
4464 !PageHighMem(skb_frag_page(frag0))) {
4465 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4466 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4467 skb_frag_size(frag0),
4468 skb->end - skb->tail);
4469 }
4470}
4471
4472static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4473{
4474 struct skb_shared_info *pinfo = skb_shinfo(skb);
4475
4476 BUG_ON(skb->end - skb->tail < grow);
4477
4478 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4479
4480 skb->data_len -= grow;
4481 skb->tail += grow;
4482
4483 pinfo->frags[0].page_offset += grow;
4484 skb_frag_size_sub(&pinfo->frags[0], grow);
4485
4486 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4487 skb_frag_unref(skb, 0);
4488 memmove(pinfo->frags, pinfo->frags + 1,
4489 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4490 }
4491}
4492
4493static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4494{
4495 struct sk_buff **pp = NULL;
4496 struct packet_offload *ptype;
4497 __be16 type = skb->protocol;
4498 struct list_head *head = &offload_base;
4499 int same_flow;
4500 enum gro_result ret;
4501 int grow;
4502
4503 if (!(skb->dev->features & NETIF_F_GRO))
4504 goto normal;
4505
4506 if (skb->csum_bad)
4507 goto normal;
4508
4509 gro_list_prepare(napi, skb);
4510
4511 rcu_read_lock();
4512 list_for_each_entry_rcu(ptype, head, list) {
4513 if (ptype->type != type || !ptype->callbacks.gro_receive)
4514 continue;
4515
4516 skb_set_network_header(skb, skb_gro_offset(skb));
4517 skb_reset_mac_len(skb);
4518 NAPI_GRO_CB(skb)->same_flow = 0;
4519 NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
4520 NAPI_GRO_CB(skb)->free = 0;
4521 NAPI_GRO_CB(skb)->encap_mark = 0;
4522 NAPI_GRO_CB(skb)->recursion_counter = 0;
4523 NAPI_GRO_CB(skb)->is_fou = 0;
4524 NAPI_GRO_CB(skb)->is_atomic = 1;
4525 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4526
4527 /* Setup for GRO checksum validation */
4528 switch (skb->ip_summed) {
4529 case CHECKSUM_COMPLETE:
4530 NAPI_GRO_CB(skb)->csum = skb->csum;
4531 NAPI_GRO_CB(skb)->csum_valid = 1;
4532 NAPI_GRO_CB(skb)->csum_cnt = 0;
4533 break;
4534 case CHECKSUM_UNNECESSARY:
4535 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4536 NAPI_GRO_CB(skb)->csum_valid = 0;
4537 break;
4538 default:
4539 NAPI_GRO_CB(skb)->csum_cnt = 0;
4540 NAPI_GRO_CB(skb)->csum_valid = 0;
4541 }
4542
4543 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4544 break;
4545 }
4546 rcu_read_unlock();
4547
4548 if (&ptype->list == head)
4549 goto normal;
4550
4551 same_flow = NAPI_GRO_CB(skb)->same_flow;
4552 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4553
4554 if (pp) {
4555 struct sk_buff *nskb = *pp;
4556
4557 *pp = nskb->next;
4558 nskb->next = NULL;
4559 napi_gro_complete(nskb);
4560 napi->gro_count--;
4561 }
4562
4563 if (same_flow)
4564 goto ok;
4565
4566 if (NAPI_GRO_CB(skb)->flush)
4567 goto normal;
4568
4569 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4570 struct sk_buff *nskb = napi->gro_list;
4571
4572 /* locate the end of the list to select the 'oldest' flow */
4573 while (nskb->next) {
4574 pp = &nskb->next;
4575 nskb = *pp;
4576 }
4577 *pp = NULL;
4578 nskb->next = NULL;
4579 napi_gro_complete(nskb);
4580 } else {
4581 napi->gro_count++;
4582 }
4583 NAPI_GRO_CB(skb)->count = 1;
4584 NAPI_GRO_CB(skb)->age = jiffies;
4585 NAPI_GRO_CB(skb)->last = skb;
4586 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4587 skb->next = napi->gro_list;
4588 napi->gro_list = skb;
4589 ret = GRO_HELD;
4590
4591pull:
4592 grow = skb_gro_offset(skb) - skb_headlen(skb);
4593 if (grow > 0)
4594 gro_pull_from_frag0(skb, grow);
4595ok:
4596 return ret;
4597
4598normal:
4599 ret = GRO_NORMAL;
4600 goto pull;
4601}
4602
4603struct packet_offload *gro_find_receive_by_type(__be16 type)
4604{
4605 struct list_head *offload_head = &offload_base;
4606 struct packet_offload *ptype;
4607
4608 list_for_each_entry_rcu(ptype, offload_head, list) {
4609 if (ptype->type != type || !ptype->callbacks.gro_receive)
4610 continue;
4611 return ptype;
4612 }
4613 return NULL;
4614}
4615EXPORT_SYMBOL(gro_find_receive_by_type);
4616
4617struct packet_offload *gro_find_complete_by_type(__be16 type)
4618{
4619 struct list_head *offload_head = &offload_base;
4620 struct packet_offload *ptype;
4621
4622 list_for_each_entry_rcu(ptype, offload_head, list) {
4623 if (ptype->type != type || !ptype->callbacks.gro_complete)
4624 continue;
4625 return ptype;
4626 }
4627 return NULL;
4628}
4629EXPORT_SYMBOL(gro_find_complete_by_type);
4630
4631static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4632{
4633 switch (ret) {
4634 case GRO_NORMAL:
4635 if (netif_receive_skb_internal(skb))
4636 ret = GRO_DROP;
4637 break;
4638
4639 case GRO_DROP:
4640 kfree_skb(skb);
4641 break;
4642
4643 case GRO_MERGED_FREE:
4644 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4645 skb_dst_drop(skb);
4646 kmem_cache_free(skbuff_head_cache, skb);
4647 } else {
4648 __kfree_skb(skb);
4649 }
4650 break;
4651
4652 case GRO_HELD:
4653 case GRO_MERGED:
4654 break;
4655 }
4656
4657 return ret;
4658}
4659
4660gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4661{
4662 skb_mark_napi_id(skb, napi);
4663 trace_napi_gro_receive_entry(skb);
4664
4665 skb_gro_reset_offset(skb);
4666
4667 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4668}
4669EXPORT_SYMBOL(napi_gro_receive);
4670
4671static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4672{
4673 if (unlikely(skb->pfmemalloc)) {
4674 consume_skb(skb);
4675 return;
4676 }
4677 __skb_pull(skb, skb_headlen(skb));
4678 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4679 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4680 skb->vlan_tci = 0;
4681 skb->dev = napi->dev;
4682 skb->skb_iif = 0;
4683 skb->encapsulation = 0;
4684 skb_shinfo(skb)->gso_type = 0;
4685 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4686
4687 napi->skb = skb;
4688}
4689
4690struct sk_buff *napi_get_frags(struct napi_struct *napi)
4691{
4692 struct sk_buff *skb = napi->skb;
4693
4694 if (!skb) {
4695 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4696 if (skb) {
4697 napi->skb = skb;
4698 skb_mark_napi_id(skb, napi);
4699 }
4700 }
4701 return skb;
4702}
4703EXPORT_SYMBOL(napi_get_frags);
4704
4705static gro_result_t napi_frags_finish(struct napi_struct *napi,
4706 struct sk_buff *skb,
4707 gro_result_t ret)
4708{
4709 switch (ret) {
4710 case GRO_NORMAL:
4711 case GRO_HELD:
4712 __skb_push(skb, ETH_HLEN);
4713 skb->protocol = eth_type_trans(skb, skb->dev);
4714 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4715 ret = GRO_DROP;
4716 break;
4717
4718 case GRO_DROP:
4719 case GRO_MERGED_FREE:
4720 napi_reuse_skb(napi, skb);
4721 break;
4722
4723 case GRO_MERGED:
4724 break;
4725 }
4726
4727 return ret;
4728}
4729
4730/* Upper GRO stack assumes network header starts at gro_offset=0
4731 * Drivers could call both napi_gro_frags() and napi_gro_receive()
4732 * We copy ethernet header into skb->data to have a common layout.
4733 */
4734static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4735{
4736 struct sk_buff *skb = napi->skb;
4737 const struct ethhdr *eth;
4738 unsigned int hlen = sizeof(*eth);
4739
4740 napi->skb = NULL;
4741
4742 skb_reset_mac_header(skb);
4743 skb_gro_reset_offset(skb);
4744
4745 eth = skb_gro_header_fast(skb, 0);
4746 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4747 eth = skb_gro_header_slow(skb, hlen, 0);
4748 if (unlikely(!eth)) {
4749 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4750 __func__, napi->dev->name);
4751 napi_reuse_skb(napi, skb);
4752 return NULL;
4753 }
4754 } else {
4755 gro_pull_from_frag0(skb, hlen);
4756 NAPI_GRO_CB(skb)->frag0 += hlen;
4757 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4758 }
4759 __skb_pull(skb, hlen);
4760
4761 /*
4762 * This works because the only protocols we care about don't require
4763 * special handling.
4764 * We'll fix it up properly in napi_frags_finish()
4765 */
4766 skb->protocol = eth->h_proto;
4767
4768 return skb;
4769}
4770
4771gro_result_t napi_gro_frags(struct napi_struct *napi)
4772{
4773 struct sk_buff *skb = napi_frags_skb(napi);
4774
4775 if (!skb)
4776 return GRO_DROP;
4777
4778 trace_napi_gro_frags_entry(skb);
4779
4780 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4781}
4782EXPORT_SYMBOL(napi_gro_frags);
4783
4784/* Compute the checksum from gro_offset and return the folded value
4785 * after adding in any pseudo checksum.
4786 */
4787__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4788{
4789 __wsum wsum;
4790 __sum16 sum;
4791
4792 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4793
4794 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4795 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4796 if (likely(!sum)) {
4797 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4798 !skb->csum_complete_sw)
4799 netdev_rx_csum_fault(skb->dev);
4800 }
4801
4802 NAPI_GRO_CB(skb)->csum = wsum;
4803 NAPI_GRO_CB(skb)->csum_valid = 1;
4804
4805 return sum;
4806}
4807EXPORT_SYMBOL(__skb_gro_checksum_complete);
4808
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 *
 * The pending list is detached from @sd while irqs are still off; the
 * IPIs themselves are sent after irqs have been re-enabled, and only to
 * CPUs that are online.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending)
		sd->rps_ipi_list = NULL;
#endif

	local_irq_enable();

#ifdef CONFIG_RPS
	/* Send pending IPI's to kick RPS processing on remote cpus. */
	while (pending) {
		struct softnet_data *next = pending->rps_ipi_next;

		if (cpu_online(pending->cpu))
			smp_call_function_single_async(pending->cpu,
						       &pending->csd);
		pending = next;
	}
#endif
}
4836
4837static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4838{
4839#ifdef CONFIG_RPS
4840 return sd->rps_ipi_list != NULL;
4841#else
4842 return false;
4843#endif
4844}
4845
4846static int process_backlog(struct napi_struct *napi, int quota)
4847{
4848 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4849 bool again = true;
4850 int work = 0;
4851
4852 /* Check if we have pending ipi, its better to send them now,
4853 * not waiting net_rx_action() end.
4854 */
4855 if (sd_has_rps_ipi_waiting(sd)) {
4856 local_irq_disable();
4857 net_rps_action_and_irq_enable(sd);
4858 }
4859
4860 napi->weight = weight_p;
4861 while (again) {
4862 struct sk_buff *skb;
4863
4864 while ((skb = __skb_dequeue(&sd->process_queue))) {
4865 rcu_read_lock();
4866 __netif_receive_skb(skb);
4867 rcu_read_unlock();
4868 input_queue_head_incr(sd);
4869 if (++work >= quota)
4870 return work;
4871
4872 }
4873
4874 local_irq_disable();
4875 rps_lock(sd);
4876 if (skb_queue_empty(&sd->input_pkt_queue)) {
4877 /*
4878 * Inline a custom version of __napi_complete().
4879 * only current cpu owns and manipulates this napi,
4880 * and NAPI_STATE_SCHED is the only possible flag set
4881 * on backlog.
4882 * We can use a plain write instead of clear_bit(),
4883 * and we dont need an smp_mb() memory barrier.
4884 */
4885 napi->state = 0;
4886 again = false;
4887 } else {
4888 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4889 &sd->process_queue);
4890 }
4891 rps_unlock(sd);
4892 local_irq_enable();
4893 }
4894
4895 return work;
4896}
4897
4898/**
4899 * __napi_schedule - schedule for receive
4900 * @n: entry to schedule
4901 *
4902 * The entry's receive function will be scheduled to run.
4903 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4904 */
4905void __napi_schedule(struct napi_struct *n)
4906{
4907 unsigned long flags;
4908
4909 local_irq_save(flags);
4910 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4911 local_irq_restore(flags);
4912}
4913EXPORT_SYMBOL(__napi_schedule);
4914
4915/**
4916 * napi_schedule_prep - check if napi can be scheduled
4917 * @n: napi context
4918 *
4919 * Test if NAPI routine is already running, and if not mark
4920 * it as running. This is used as a condition variable
4921 * insure only one NAPI poll instance runs. We also make
4922 * sure there is no pending NAPI disable.
4923 */
4924bool napi_schedule_prep(struct napi_struct *n)
4925{
4926 unsigned long val, new;
4927
4928 do {
4929 val = READ_ONCE(n->state);
4930 if (unlikely(val & NAPIF_STATE_DISABLE))
4931 return false;
4932 new = val | NAPIF_STATE_SCHED;
4933
4934 /* Sets STATE_MISSED bit if STATE_SCHED was already set
4935 * This was suggested by Alexander Duyck, as compiler
4936 * emits better code than :
4937 * if (val & NAPIF_STATE_SCHED)
4938 * new |= NAPIF_STATE_MISSED;
4939 */
4940 new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
4941 NAPIF_STATE_MISSED;
4942 } while (cmpxchg(&n->state, val, new) != val);
4943
4944 return !(val & NAPIF_STATE_SCHED);
4945}
4946EXPORT_SYMBOL(napi_schedule_prep);
4947
4948/**
4949 * __napi_schedule_irqoff - schedule for receive
4950 * @n: entry to schedule
4951 *
4952 * Variant of __napi_schedule() assuming hard irqs are masked
4953 */
4954void __napi_schedule_irqoff(struct napi_struct *n)
4955{
4956 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4957}
4958EXPORT_SYMBOL(__napi_schedule_irqoff);
4959
4960bool __napi_complete(struct napi_struct *n)
4961{
4962 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4963
4964 /* Some drivers call us directly, instead of calling
4965 * napi_complete_done().
4966 */
4967 if (unlikely(test_bit(NAPI_STATE_IN_BUSY_POLL, &n->state)))
4968 return false;
4969
4970 list_del_init(&n->poll_list);
4971 smp_mb__before_atomic();
4972 clear_bit(NAPI_STATE_SCHED, &n->state);
4973 return true;
4974}
4975EXPORT_SYMBOL(__napi_complete);
4976
4977bool napi_complete_done(struct napi_struct *n, int work_done)
4978{
4979 unsigned long flags, val, new;
4980
4981 /*
4982 * 1) Don't let napi dequeue from the cpu poll list
4983 * just in case its running on a different cpu.
4984 * 2) If we are busy polling, do nothing here, we have
4985 * the guarantee we will be called later.
4986 */
4987 if (unlikely(n->state & (NAPIF_STATE_NPSVC |
4988 NAPIF_STATE_IN_BUSY_POLL)))
4989 return false;
4990
4991 if (n->gro_list) {
4992 unsigned long timeout = 0;
4993
4994 if (work_done)
4995 timeout = n->dev->gro_flush_timeout;
4996
4997 if (timeout)
4998 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4999 HRTIMER_MODE_REL_PINNED);
5000 else
5001 napi_gro_flush(n, false);
5002 }
5003 if (unlikely(!list_empty(&n->poll_list))) {
5004 /* If n->poll_list is not empty, we need to mask irqs */
5005 local_irq_save(flags);
5006 list_del_init(&n->poll_list);
5007 local_irq_restore(flags);
5008 }
5009
5010 do {
5011 val = READ_ONCE(n->state);
5012
5013 WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
5014
5015 new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
5016
5017 /* If STATE_MISSED was set, leave STATE_SCHED set,
5018 * because we will call napi->poll() one more time.
5019 * This C code was suggested by Alexander Duyck to help gcc.
5020 */
5021 new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
5022 NAPIF_STATE_SCHED;
5023 } while (cmpxchg(&n->state, val, new) != val);
5024
5025 if (unlikely(val & NAPIF_STATE_MISSED)) {
5026 __napi_schedule(n);
5027 return false;
5028 }
5029
5030 return true;
5031}
5032EXPORT_SYMBOL(napi_complete_done);
5033
5034/* must be called under rcu_read_lock(), as we dont take a reference */
5035static struct napi_struct *napi_by_id(unsigned int napi_id)
5036{
5037 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
5038 struct napi_struct *napi;
5039
5040 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
5041 if (napi->napi_id == napi_id)
5042 return napi;
5043
5044 return NULL;
5045}
5046
5047#if defined(CONFIG_NET_RX_BUSY_POLL)
5048
5049#define BUSY_POLL_BUDGET 8
5050
5051static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
5052{
5053 int rc;
5054
5055 /* Busy polling means there is a high chance device driver hard irq
5056 * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
5057 * set in napi_schedule_prep().
5058 * Since we are about to call napi->poll() once more, we can safely
5059 * clear NAPI_STATE_MISSED.
5060 *
5061 * Note: x86 could use a single "lock and ..." instruction
5062 * to perform these two clear_bit()
5063 */
5064 clear_bit(NAPI_STATE_MISSED, &napi->state);
5065 clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
5066
5067 local_bh_disable();
5068
5069 /* All we really want here is to re-enable device interrupts.
5070 * Ideally, a new ndo_busy_poll_stop() could avoid another round.
5071 */
5072 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5073 netpoll_poll_unlock(have_poll_lock);
5074 if (rc == BUSY_POLL_BUDGET)
5075 __napi_schedule(napi);
5076 local_bh_enable();
5077 if (local_softirq_pending())
5078 do_softirq();
5079}
5080
5081bool sk_busy_loop(struct sock *sk, int nonblock)
5082{
5083 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
5084 int (*napi_poll)(struct napi_struct *napi, int budget);
5085 int (*busy_poll)(struct napi_struct *dev);
5086 void *have_poll_lock = NULL;
5087 struct napi_struct *napi;
5088 int rc;
5089
5090restart:
5091 rc = false;
5092 napi_poll = NULL;
5093
5094 rcu_read_lock();
5095
5096 napi = napi_by_id(sk->sk_napi_id);
5097 if (!napi)
5098 goto out;
5099
5100 /* Note: ndo_busy_poll method is optional in linux-4.5 */
5101 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
5102
5103 preempt_disable();
5104 for (;;) {
5105 rc = 0;
5106 local_bh_disable();
5107 if (busy_poll) {
5108 rc = busy_poll(napi);
5109 goto count;
5110 }
5111 if (!napi_poll) {
5112 unsigned long val = READ_ONCE(napi->state);
5113
5114 /* If multiple threads are competing for this napi,
5115 * we avoid dirtying napi->state as much as we can.
5116 */
5117 if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
5118 NAPIF_STATE_IN_BUSY_POLL))
5119 goto count;
5120 if (cmpxchg(&napi->state, val,
5121 val | NAPIF_STATE_IN_BUSY_POLL |
5122 NAPIF_STATE_SCHED) != val)
5123 goto count;
5124 have_poll_lock = netpoll_poll_lock(napi);
5125 napi_poll = napi->poll;
5126 }
5127 rc = napi_poll(napi, BUSY_POLL_BUDGET);
5128 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5129count:
5130 if (rc > 0)
5131 __NET_ADD_STATS(sock_net(sk),
5132 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5133 local_bh_enable();
5134
5135 if (rc == LL_FLUSH_FAILED)
5136 break; /* permanent failure */
5137
5138 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
5139 busy_loop_timeout(end_time))
5140 break;
5141
5142 if (unlikely(need_resched())) {
5143 if (napi_poll)
5144 busy_poll_stop(napi, have_poll_lock);
5145 preempt_enable();
5146 rcu_read_unlock();
5147 cond_resched();
5148 rc = !skb_queue_empty(&sk->sk_receive_queue);
5149 if (rc || busy_loop_timeout(end_time))
5150 return rc;
5151 goto restart;
5152 }
5153 cpu_relax();
5154 }
5155 if (napi_poll)
5156 busy_poll_stop(napi, have_poll_lock);
5157 preempt_enable();
5158 rc = !skb_queue_empty(&sk->sk_receive_queue);
5159out:
5160 rcu_read_unlock();
5161 return rc;
5162}
5163EXPORT_SYMBOL(sk_busy_loop);
5164
5165#endif /* CONFIG_NET_RX_BUSY_POLL */
5166
5167static void napi_hash_add(struct napi_struct *napi)
5168{
5169 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5170 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5171 return;
5172
5173 spin_lock(&napi_hash_lock);
5174
5175 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5176 do {
5177 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5178 napi_gen_id = NR_CPUS + 1;
5179 } while (napi_by_id(napi_gen_id));
5180 napi->napi_id = napi_gen_id;
5181
5182 hlist_add_head_rcu(&napi->napi_hash_node,
5183 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5184
5185 spin_unlock(&napi_hash_lock);
5186}
5187
5188/* Warning : caller is responsible to make sure rcu grace period
5189 * is respected before freeing memory containing @napi
5190 */
5191bool napi_hash_del(struct napi_struct *napi)
5192{
5193 bool rcu_sync_needed = false;
5194
5195 spin_lock(&napi_hash_lock);
5196
5197 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5198 rcu_sync_needed = true;
5199 hlist_del_rcu(&napi->napi_hash_node);
5200 }
5201 spin_unlock(&napi_hash_lock);
5202 return rcu_sync_needed;
5203}
5204EXPORT_SYMBOL_GPL(napi_hash_del);
5205
5206static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5207{
5208 struct napi_struct *napi;
5209
5210 napi = container_of(timer, struct napi_struct, timer);
5211
5212 /* Note : we use a relaxed variant of napi_schedule_prep() not setting
5213 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
5214 */
5215 if (napi->gro_list && !napi_disable_pending(napi) &&
5216 !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
5217 __napi_schedule_irqoff(napi);
5218
5219 return HRTIMER_NORESTART;
5220}
5221
5222void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5223 int (*poll)(struct napi_struct *, int), int weight)
5224{
5225 INIT_LIST_HEAD(&napi->poll_list);
5226 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5227 napi->timer.function = napi_watchdog;
5228 napi->gro_count = 0;
5229 napi->gro_list = NULL;
5230 napi->skb = NULL;
5231 napi->poll = poll;
5232 if (weight > NAPI_POLL_WEIGHT)
5233 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5234 weight, dev->name);
5235 napi->weight = weight;
5236 list_add(&napi->dev_list, &dev->napi_list);
5237 napi->dev = dev;
5238#ifdef CONFIG_NETPOLL
5239 napi->poll_owner = -1;
5240#endif
5241 set_bit(NAPI_STATE_SCHED, &napi->state);
5242 napi_hash_add(napi);
5243}
5244EXPORT_SYMBOL(netif_napi_add);
5245
5246void napi_disable(struct napi_struct *n)
5247{
5248 might_sleep();
5249 set_bit(NAPI_STATE_DISABLE, &n->state);
5250
5251 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5252 msleep(1);
5253 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5254 msleep(1);
5255
5256 hrtimer_cancel(&n->timer);
5257
5258 clear_bit(NAPI_STATE_DISABLE, &n->state);
5259}
5260EXPORT_SYMBOL(napi_disable);
5261
5262/* Must be called in process context */
5263void netif_napi_del(struct napi_struct *napi)
5264{
5265 might_sleep();
5266 if (napi_hash_del(napi))
5267 synchronize_net();
5268 list_del_init(&napi->dev_list);
5269 napi_free_frags(napi);
5270
5271 kfree_skb_list(napi->gro_list);
5272 napi->gro_list = NULL;
5273 napi->gro_count = 0;
5274}
5275EXPORT_SYMBOL(netif_napi_del);
5276
5277static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5278{
5279 void *have;
5280 int work, weight;
5281
5282 list_del_init(&n->poll_list);
5283
5284 have = netpoll_poll_lock(n);
5285
5286 weight = n->weight;
5287
5288 /* This NAPI_STATE_SCHED test is for avoiding a race
5289 * with netpoll's poll_napi(). Only the entity which
5290 * obtains the lock and sees NAPI_STATE_SCHED set will
5291 * actually make the ->poll() call. Therefore we avoid
5292 * accidentally calling ->poll() when NAPI is not scheduled.
5293 */
5294 work = 0;
5295 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5296 work = n->poll(n, weight);
5297 trace_napi_poll(n, work, weight);
5298 }
5299
5300 WARN_ON_ONCE(work > weight);
5301
5302 if (likely(work < weight))
5303 goto out_unlock;
5304
5305 /* Drivers must not modify the NAPI state if they
5306 * consume the entire weight. In such cases this code
5307 * still "owns" the NAPI instance and therefore can
5308 * move the instance around on the list at-will.
5309 */
5310 if (unlikely(napi_disable_pending(n))) {
5311 napi_complete(n);
5312 goto out_unlock;
5313 }
5314
5315 if (n->gro_list) {
5316 /* flush too old packets
5317 * If HZ < 1000, flush all packets.
5318 */
5319 napi_gro_flush(n, HZ >= 1000);
5320 }
5321
5322 /* Some drivers may have called napi_schedule
5323 * prior to exhausting their budget.
5324 */
5325 if (unlikely(!list_empty(&n->poll_list))) {
5326 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5327 n->dev ? n->dev->name : "backlog");
5328 goto out_unlock;
5329 }
5330
5331 list_add_tail(&n->poll_list, repoll);
5332
5333out_unlock:
5334 netpoll_poll_unlock(have);
5335
5336 return work;
5337}
5338
5339static __latent_entropy void net_rx_action(struct softirq_action *h)
5340{
5341 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5342 unsigned long time_limit = jiffies + 2;
5343 int budget = netdev_budget;
5344 LIST_HEAD(list);
5345 LIST_HEAD(repoll);
5346
5347 local_irq_disable();
5348 list_splice_init(&sd->poll_list, &list);
5349 local_irq_enable();
5350
5351 for (;;) {
5352 struct napi_struct *n;
5353
5354 if (list_empty(&list)) {
5355 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5356 goto out;
5357 break;
5358 }
5359
5360 n = list_first_entry(&list, struct napi_struct, poll_list);
5361 budget -= napi_poll(n, &repoll);
5362
5363 /* If softirq window is exhausted then punt.
5364 * Allow this to run for 2 jiffies since which will allow
5365 * an average latency of 1.5/HZ.
5366 */
5367 if (unlikely(budget <= 0 ||
5368 time_after_eq(jiffies, time_limit))) {
5369 sd->time_squeeze++;
5370 break;
5371 }
5372 }
5373
5374 local_irq_disable();
5375
5376 list_splice_tail_init(&sd->poll_list, &list);
5377 list_splice_tail(&repoll, &list);
5378 list_splice(&list, &sd->poll_list);
5379 if (!list_empty(&sd->poll_list))
5380 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5381
5382 net_rps_action_and_irq_enable(sd);
5383out:
5384 __kfree_skb_flush();
5385}
5386
5387struct netdev_adjacent {
5388 struct net_device *dev;
5389
5390 /* upper master flag, there can only be one master device per list */
5391 bool master;
5392
5393 /* counter for the number of times this device was added to us */
5394 u16 ref_nr;
5395
5396 /* private field for the users */
5397 void *private;
5398
5399 struct list_head list;
5400 struct rcu_head rcu;
5401};
5402
5403static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5404 struct list_head *adj_list)
5405{
5406 struct netdev_adjacent *adj;
5407
5408 list_for_each_entry(adj, adj_list, list) {
5409 if (adj->dev == adj_dev)
5410 return adj;
5411 }
5412 return NULL;
5413}
5414
/*
 * Walk callback: @data is the device being searched for. Returns non-zero
 * when the visited @upper_dev is that device.
 */
static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
{
	struct net_device *wanted = data;

	if (upper_dev == wanted)
		return 1;

	return 0;
}
5421
5422/**
5423 * netdev_has_upper_dev - Check if device is linked to an upper device
5424 * @dev: device
5425 * @upper_dev: upper device to check
5426 *
5427 * Find out if a device is linked to specified upper device and return true
5428 * in case it is. Note that this checks only immediate upper device,
5429 * not through a complete stack of devices. The caller must hold the RTNL lock.
5430 */
5431bool netdev_has_upper_dev(struct net_device *dev,
5432 struct net_device *upper_dev)
5433{
5434 ASSERT_RTNL();
5435
5436 return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5437 upper_dev);
5438}
5439EXPORT_SYMBOL(netdev_has_upper_dev);
5440
5441/**
5442 * netdev_has_upper_dev_all - Check if device is linked to an upper device
5443 * @dev: device
5444 * @upper_dev: upper device to check
5445 *
5446 * Find out if a device is linked to specified upper device and return true
5447 * in case it is. Note that this checks the entire upper device chain.
5448 * The caller must hold rcu lock.
5449 */
5450
5451bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5452 struct net_device *upper_dev)
5453{
5454 return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5455 upper_dev);
5456}
5457EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5458
5459/**
5460 * netdev_has_any_upper_dev - Check if device is linked to some device
5461 * @dev: device
5462 *
5463 * Find out if a device is linked to an upper device and return true in case
5464 * it is. The caller must hold the RTNL lock.
5465 */
5466static bool netdev_has_any_upper_dev(struct net_device *dev)
5467{
5468 ASSERT_RTNL();
5469
5470 return !list_empty(&dev->adj_list.upper);
5471}
5472
5473/**
5474 * netdev_master_upper_dev_get - Get master upper device
5475 * @dev: device
5476 *
5477 * Find a master upper device and return pointer to it or NULL in case
5478 * it's not there. The caller must hold the RTNL lock.
5479 */
5480struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5481{
5482 struct netdev_adjacent *upper;
5483
5484 ASSERT_RTNL();
5485
5486 if (list_empty(&dev->adj_list.upper))
5487 return NULL;
5488
5489 upper = list_first_entry(&dev->adj_list.upper,
5490 struct netdev_adjacent, list);
5491 if (likely(upper->master))
5492 return upper->dev;
5493 return NULL;
5494}
5495EXPORT_SYMBOL(netdev_master_upper_dev_get);
5496
5497/**
5498 * netdev_has_any_lower_dev - Check if device is linked to some device
5499 * @dev: device
5500 *
5501 * Find out if a device is linked to a lower device and return true in case
5502 * it is. The caller must hold the RTNL lock.
5503 */
5504static bool netdev_has_any_lower_dev(struct net_device *dev)
5505{
5506 ASSERT_RTNL();
5507
5508 return !list_empty(&dev->adj_list.lower);
5509}
5510
5511void *netdev_adjacent_get_private(struct list_head *adj_list)
5512{
5513 struct netdev_adjacent *adj;
5514
5515 adj = list_entry(adj_list, struct netdev_adjacent, list);
5516
5517 return adj->private;
5518}
5519EXPORT_SYMBOL(netdev_adjacent_get_private);
5520
5521/**
5522 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5523 * @dev: device
5524 * @iter: list_head ** of the current position
5525 *
5526 * Gets the next device from the dev's upper list, starting from iter
5527 * position. The caller must hold RCU read lock.
5528 */
5529struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5530 struct list_head **iter)
5531{
5532 struct netdev_adjacent *upper;
5533
5534 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5535
5536 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5537
5538 if (&upper->list == &dev->adj_list.upper)
5539 return NULL;
5540
5541 *iter = &upper->list;
5542
5543 return upper->dev;
5544}
5545EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5546
5547static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5548 struct list_head **iter)
5549{
5550 struct netdev_adjacent *upper;
5551
5552 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5553
5554 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5555
5556 if (&upper->list == &dev->adj_list.upper)
5557 return NULL;
5558
5559 *iter = &upper->list;
5560
5561 return upper->dev;
5562}
5563
5564int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5565 int (*fn)(struct net_device *dev,
5566 void *data),
5567 void *data)
5568{
5569 struct net_device *udev;
5570 struct list_head *iter;
5571 int ret;
5572
5573 for (iter = &dev->adj_list.upper,
5574 udev = netdev_next_upper_dev_rcu(dev, &iter);
5575 udev;
5576 udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5577 /* first is the upper device itself */
5578 ret = fn(udev, data);
5579 if (ret)
5580 return ret;
5581
5582 /* then look at all of its upper devices */
5583 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5584 if (ret)
5585 return ret;
5586 }
5587
5588 return 0;
5589}
5590EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5591
5592/**
5593 * netdev_lower_get_next_private - Get the next ->private from the
5594 * lower neighbour list
5595 * @dev: device
5596 * @iter: list_head ** of the current position
5597 *
5598 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5599 * list, starting from iter position. The caller must hold either hold the
5600 * RTNL lock or its own locking that guarantees that the neighbour lower
5601 * list will remain unchanged.
5602 */
5603void *netdev_lower_get_next_private(struct net_device *dev,
5604 struct list_head **iter)
5605{
5606 struct netdev_adjacent *lower;
5607
5608 lower = list_entry(*iter, struct netdev_adjacent, list);
5609
5610 if (&lower->list == &dev->adj_list.lower)
5611 return NULL;
5612
5613 *iter = lower->list.next;
5614
5615 return lower->private;
5616}
5617EXPORT_SYMBOL(netdev_lower_get_next_private);
5618
5619/**
5620 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5621 * lower neighbour list, RCU
5622 * variant
5623 * @dev: device
5624 * @iter: list_head ** of the current position
5625 *
5626 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5627 * list, starting from iter position. The caller must hold RCU read lock.
5628 */
5629void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5630 struct list_head **iter)
5631{
5632 struct netdev_adjacent *lower;
5633
5634 WARN_ON_ONCE(!rcu_read_lock_held());
5635
5636 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5637
5638 if (&lower->list == &dev->adj_list.lower)
5639 return NULL;
5640
5641 *iter = &lower->list;
5642
5643 return lower->private;
5644}
5645EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5646
5647/**
5648 * netdev_lower_get_next - Get the next device from the lower neighbour
5649 * list
5650 * @dev: device
5651 * @iter: list_head ** of the current position
5652 *
5653 * Gets the next netdev_adjacent from the dev's lower neighbour
5654 * list, starting from iter position. The caller must hold RTNL lock or
5655 * its own locking that guarantees that the neighbour lower
5656 * list will remain unchanged.
5657 */
5658void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5659{
5660 struct netdev_adjacent *lower;
5661
5662 lower = list_entry(*iter, struct netdev_adjacent, list);
5663
5664 if (&lower->list == &dev->adj_list.lower)
5665 return NULL;
5666
5667 *iter = lower->list.next;
5668
5669 return lower->dev;
5670}
5671EXPORT_SYMBOL(netdev_lower_get_next);
5672
5673static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5674 struct list_head **iter)
5675{
5676 struct netdev_adjacent *lower;
5677
5678 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5679
5680 if (&lower->list == &dev->adj_list.lower)
5681 return NULL;
5682
5683 *iter = &lower->list;
5684
5685 return lower->dev;
5686}
5687
5688int netdev_walk_all_lower_dev(struct net_device *dev,
5689 int (*fn)(struct net_device *dev,
5690 void *data),
5691 void *data)
5692{
5693 struct net_device *ldev;
5694 struct list_head *iter;
5695 int ret;
5696
5697 for (iter = &dev->adj_list.lower,
5698 ldev = netdev_next_lower_dev(dev, &iter);
5699 ldev;
5700 ldev = netdev_next_lower_dev(dev, &iter)) {
5701 /* first is the lower device itself */
5702 ret = fn(ldev, data);
5703 if (ret)
5704 return ret;
5705
5706 /* then look at all of its lower devices */
5707 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5708 if (ret)
5709 return ret;
5710 }
5711
5712 return 0;
5713}
5714EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5715
5716static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5717 struct list_head **iter)
5718{
5719 struct netdev_adjacent *lower;
5720
5721 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5722 if (&lower->list == &dev->adj_list.lower)
5723 return NULL;
5724
5725 *iter = &lower->list;
5726
5727 return lower->dev;
5728}
5729
5730int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5731 int (*fn)(struct net_device *dev,
5732 void *data),
5733 void *data)
5734{
5735 struct net_device *ldev;
5736 struct list_head *iter;
5737 int ret;
5738
5739 for (iter = &dev->adj_list.lower,
5740 ldev = netdev_next_lower_dev_rcu(dev, &iter);
5741 ldev;
5742 ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5743 /* first is the lower device itself */
5744 ret = fn(ldev, data);
5745 if (ret)
5746 return ret;
5747
5748 /* then look at all of its lower devices */
5749 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5750 if (ret)
5751 return ret;
5752 }
5753
5754 return 0;
5755}
5756EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5757
5758/**
5759 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5760 * lower neighbour list, RCU
5761 * variant
5762 * @dev: device
5763 *
5764 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5765 * list. The caller must hold RCU read lock.
5766 */
5767void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5768{
5769 struct netdev_adjacent *lower;
5770
5771 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5772 struct netdev_adjacent, list);
5773 if (lower)
5774 return lower->private;
5775 return NULL;
5776}
5777EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5778
5779/**
5780 * netdev_master_upper_dev_get_rcu - Get master upper device
5781 * @dev: device
5782 *
5783 * Find a master upper device and return pointer to it or NULL in case
5784 * it's not there. The caller must hold the RCU read lock.
5785 */
5786struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5787{
5788 struct netdev_adjacent *upper;
5789
5790 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5791 struct netdev_adjacent, list);
5792 if (upper && likely(upper->master))
5793 return upper->dev;
5794 return NULL;
5795}
5796EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5797
5798static int netdev_adjacent_sysfs_add(struct net_device *dev,
5799 struct net_device *adj_dev,
5800 struct list_head *dev_list)
5801{
5802 char linkname[IFNAMSIZ+7];
5803 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5804 "upper_%s" : "lower_%s", adj_dev->name);
5805 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5806 linkname);
5807}
5808static void netdev_adjacent_sysfs_del(struct net_device *dev,
5809 char *name,
5810 struct list_head *dev_list)
5811{
5812 char linkname[IFNAMSIZ+7];
5813 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5814 "upper_%s" : "lower_%s", name);
5815 sysfs_remove_link(&(dev->dev.kobj), linkname);
5816}
5817
5818static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5819 struct net_device *adj_dev,
5820 struct list_head *dev_list)
5821{
5822 return (dev_list == &dev->adj_list.upper ||
5823 dev_list == &dev->adj_list.lower) &&
5824 net_eq(dev_net(dev), dev_net(adj_dev));
5825}
5826
5827static int __netdev_adjacent_dev_insert(struct net_device *dev,
5828 struct net_device *adj_dev,
5829 struct list_head *dev_list,
5830 void *private, bool master)
5831{
5832 struct netdev_adjacent *adj;
5833 int ret;
5834
5835 adj = __netdev_find_adj(adj_dev, dev_list);
5836
5837 if (adj) {
5838 adj->ref_nr += 1;
5839 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5840 dev->name, adj_dev->name, adj->ref_nr);
5841
5842 return 0;
5843 }
5844
5845 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5846 if (!adj)
5847 return -ENOMEM;
5848
5849 adj->dev = adj_dev;
5850 adj->master = master;
5851 adj->ref_nr = 1;
5852 adj->private = private;
5853 dev_hold(adj_dev);
5854
5855 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5856 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5857
5858 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5859 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5860 if (ret)
5861 goto free_adj;
5862 }
5863
5864 /* Ensure that master link is always the first item in list. */
5865 if (master) {
5866 ret = sysfs_create_link(&(dev->dev.kobj),
5867 &(adj_dev->dev.kobj), "master");
5868 if (ret)
5869 goto remove_symlinks;
5870
5871 list_add_rcu(&adj->list, dev_list);
5872 } else {
5873 list_add_tail_rcu(&adj->list, dev_list);
5874 }
5875
5876 return 0;
5877
5878remove_symlinks:
5879 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5880 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5881free_adj:
5882 kfree(adj);
5883 dev_put(adj_dev);
5884
5885 return ret;
5886}
5887
5888static void __netdev_adjacent_dev_remove(struct net_device *dev,
5889 struct net_device *adj_dev,
5890 u16 ref_nr,
5891 struct list_head *dev_list)
5892{
5893 struct netdev_adjacent *adj;
5894
5895 pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5896 dev->name, adj_dev->name, ref_nr);
5897
5898 adj = __netdev_find_adj(adj_dev, dev_list);
5899
5900 if (!adj) {
5901 pr_err("Adjacency does not exist for device %s from %s\n",
5902 dev->name, adj_dev->name);
5903 WARN_ON(1);
5904 return;
5905 }
5906
5907 if (adj->ref_nr > ref_nr) {
5908 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5909 dev->name, adj_dev->name, ref_nr,
5910 adj->ref_nr - ref_nr);
5911 adj->ref_nr -= ref_nr;
5912 return;
5913 }
5914
5915 if (adj->master)
5916 sysfs_remove_link(&(dev->dev.kobj), "master");
5917
5918 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5919 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5920
5921 list_del_rcu(&adj->list);
5922 pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5923 adj_dev->name, dev->name, adj_dev->name);
5924 dev_put(adj_dev);
5925 kfree_rcu(adj, rcu);
5926}
5927
5928static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5929 struct net_device *upper_dev,
5930 struct list_head *up_list,
5931 struct list_head *down_list,
5932 void *private, bool master)
5933{
5934 int ret;
5935
5936 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5937 private, master);
5938 if (ret)
5939 return ret;
5940
5941 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5942 private, false);
5943 if (ret) {
5944 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5945 return ret;
5946 }
5947
5948 return 0;
5949}
5950
5951static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5952 struct net_device *upper_dev,
5953 u16 ref_nr,
5954 struct list_head *up_list,
5955 struct list_head *down_list)
5956{
5957 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5958 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5959}
5960
5961static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5962 struct net_device *upper_dev,
5963 void *private, bool master)
5964{
5965 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5966 &dev->adj_list.upper,
5967 &upper_dev->adj_list.lower,
5968 private, master);
5969}
5970
5971static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5972 struct net_device *upper_dev)
5973{
5974 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5975 &dev->adj_list.upper,
5976 &upper_dev->adj_list.lower);
5977}
5978
5979static int __netdev_upper_dev_link(struct net_device *dev,
5980 struct net_device *upper_dev, bool master,
5981 void *upper_priv, void *upper_info)
5982{
5983 struct netdev_notifier_changeupper_info changeupper_info;
5984 int ret = 0;
5985
5986 ASSERT_RTNL();
5987
5988 if (dev == upper_dev)
5989 return -EBUSY;
5990
5991 /* To prevent loops, check if dev is not upper device to upper_dev. */
5992 if (netdev_has_upper_dev(upper_dev, dev))
5993 return -EBUSY;
5994
5995 if (netdev_has_upper_dev(dev, upper_dev))
5996 return -EEXIST;
5997
5998 if (master && netdev_master_upper_dev_get(dev))
5999 return -EBUSY;
6000
6001 changeupper_info.upper_dev = upper_dev;
6002 changeupper_info.master = master;
6003 changeupper_info.linking = true;
6004 changeupper_info.upper_info = upper_info;
6005
6006 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6007 &changeupper_info.info);
6008 ret = notifier_to_errno(ret);
6009 if (ret)
6010 return ret;
6011
6012 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
6013 master);
6014 if (ret)
6015 return ret;
6016
6017 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6018 &changeupper_info.info);
6019 ret = notifier_to_errno(ret);
6020 if (ret)
6021 goto rollback;
6022
6023 return 0;
6024
6025rollback:
6026 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6027
6028 return ret;
6029}
6030
6031/**
6032 * netdev_upper_dev_link - Add a link to the upper device
6033 * @dev: device
6034 * @upper_dev: new upper device
6035 *
6036 * Adds a link to device which is upper to this one. The caller must hold
6037 * the RTNL lock. On a failure a negative errno code is returned.
6038 * On success the reference counts are adjusted and the function
6039 * returns zero.
6040 */
6041int netdev_upper_dev_link(struct net_device *dev,
6042 struct net_device *upper_dev)
6043{
6044 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
6045}
6046EXPORT_SYMBOL(netdev_upper_dev_link);
6047
6048/**
6049 * netdev_master_upper_dev_link - Add a master link to the upper device
6050 * @dev: device
6051 * @upper_dev: new upper device
6052 * @upper_priv: upper device private
6053 * @upper_info: upper info to be passed down via notifier
6054 *
6055 * Adds a link to device which is upper to this one. In this case, only
6056 * one master upper device can be linked, although other non-master devices
6057 * might be linked as well. The caller must hold the RTNL lock.
6058 * On a failure a negative errno code is returned. On success the reference
6059 * counts are adjusted and the function returns zero.
6060 */
6061int netdev_master_upper_dev_link(struct net_device *dev,
6062 struct net_device *upper_dev,
6063 void *upper_priv, void *upper_info)
6064{
6065 return __netdev_upper_dev_link(dev, upper_dev, true,
6066 upper_priv, upper_info);
6067}
6068EXPORT_SYMBOL(netdev_master_upper_dev_link);
6069
6070/**
6071 * netdev_upper_dev_unlink - Removes a link to upper device
6072 * @dev: device
6073 * @upper_dev: new upper device
6074 *
6075 * Removes a link to device which is upper to this one. The caller must hold
6076 * the RTNL lock.
6077 */
6078void netdev_upper_dev_unlink(struct net_device *dev,
6079 struct net_device *upper_dev)
6080{
6081 struct netdev_notifier_changeupper_info changeupper_info;
6082 ASSERT_RTNL();
6083
6084 changeupper_info.upper_dev = upper_dev;
6085 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
6086 changeupper_info.linking = false;
6087
6088 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
6089 &changeupper_info.info);
6090
6091 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
6092
6093 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
6094 &changeupper_info.info);
6095}
6096EXPORT_SYMBOL(netdev_upper_dev_unlink);
6097
6098/**
6099 * netdev_bonding_info_change - Dispatch event about slave change
6100 * @dev: device
6101 * @bonding_info: info to dispatch
6102 *
6103 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
6104 * The caller must hold the RTNL lock.
6105 */
6106void netdev_bonding_info_change(struct net_device *dev,
6107 struct netdev_bonding_info *bonding_info)
6108{
6109 struct netdev_notifier_bonding_info info;
6110
6111 memcpy(&info.bonding_info, bonding_info,
6112 sizeof(struct netdev_bonding_info));
6113 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
6114 &info.info);
6115}
6116EXPORT_SYMBOL(netdev_bonding_info_change);
6117
6118static void netdev_adjacent_add_links(struct net_device *dev)
6119{
6120 struct netdev_adjacent *iter;
6121
6122 struct net *net = dev_net(dev);
6123
6124 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6125 if (!net_eq(net, dev_net(iter->dev)))
6126 continue;
6127 netdev_adjacent_sysfs_add(iter->dev, dev,
6128 &iter->dev->adj_list.lower);
6129 netdev_adjacent_sysfs_add(dev, iter->dev,
6130 &dev->adj_list.upper);
6131 }
6132
6133 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6134 if (!net_eq(net, dev_net(iter->dev)))
6135 continue;
6136 netdev_adjacent_sysfs_add(iter->dev, dev,
6137 &iter->dev->adj_list.upper);
6138 netdev_adjacent_sysfs_add(dev, iter->dev,
6139 &dev->adj_list.lower);
6140 }
6141}
6142
6143static void netdev_adjacent_del_links(struct net_device *dev)
6144{
6145 struct netdev_adjacent *iter;
6146
6147 struct net *net = dev_net(dev);
6148
6149 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6150 if (!net_eq(net, dev_net(iter->dev)))
6151 continue;
6152 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6153 &iter->dev->adj_list.lower);
6154 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6155 &dev->adj_list.upper);
6156 }
6157
6158 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6159 if (!net_eq(net, dev_net(iter->dev)))
6160 continue;
6161 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6162 &iter->dev->adj_list.upper);
6163 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6164 &dev->adj_list.lower);
6165 }
6166}
6167
6168void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6169{
6170 struct netdev_adjacent *iter;
6171
6172 struct net *net = dev_net(dev);
6173
6174 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6175 if (!net_eq(net, dev_net(iter->dev)))
6176 continue;
6177 netdev_adjacent_sysfs_del(iter->dev, oldname,
6178 &iter->dev->adj_list.lower);
6179 netdev_adjacent_sysfs_add(iter->dev, dev,
6180 &iter->dev->adj_list.lower);
6181 }
6182
6183 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6184 if (!net_eq(net, dev_net(iter->dev)))
6185 continue;
6186 netdev_adjacent_sysfs_del(iter->dev, oldname,
6187 &iter->dev->adj_list.upper);
6188 netdev_adjacent_sysfs_add(iter->dev, dev,
6189 &iter->dev->adj_list.upper);
6190 }
6191}
6192
6193void *netdev_lower_dev_get_private(struct net_device *dev,
6194 struct net_device *lower_dev)
6195{
6196 struct netdev_adjacent *lower;
6197
6198 if (!lower_dev)
6199 return NULL;
6200 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6201 if (!lower)
6202 return NULL;
6203
6204 return lower->private;
6205}
6206EXPORT_SYMBOL(netdev_lower_dev_get_private);
6207
6208
6209int dev_get_nest_level(struct net_device *dev)
6210{
6211 struct net_device *lower = NULL;
6212 struct list_head *iter;
6213 int max_nest = -1;
6214 int nest;
6215
6216 ASSERT_RTNL();
6217
6218 netdev_for_each_lower_dev(dev, lower, iter) {
6219 nest = dev_get_nest_level(lower);
6220 if (max_nest < nest)
6221 max_nest = nest;
6222 }
6223
6224 return max_nest + 1;
6225}
6226EXPORT_SYMBOL(dev_get_nest_level);
6227
6228/**
6229 * netdev_lower_change - Dispatch event about lower device state change
6230 * @lower_dev: device
6231 * @lower_state_info: state to dispatch
6232 *
6233 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6234 * The caller must hold the RTNL lock.
6235 */
6236void netdev_lower_state_changed(struct net_device *lower_dev,
6237 void *lower_state_info)
6238{
6239 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6240
6241 ASSERT_RTNL();
6242 changelowerstate_info.lower_state_info = lower_state_info;
6243 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6244 &changelowerstate_info.info);
6245}
6246EXPORT_SYMBOL(netdev_lower_state_changed);
6247
6248int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6249 struct neighbour *n)
6250{
6251 struct net_device *lower_dev, *stop_dev;
6252 struct list_head *iter;
6253 int err;
6254
6255 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6256 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6257 continue;
6258 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6259 if (err) {
6260 stop_dev = lower_dev;
6261 goto rollback;
6262 }
6263 }
6264 return 0;
6265
6266rollback:
6267 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6268 if (lower_dev == stop_dev)
6269 break;
6270 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6271 continue;
6272 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6273 }
6274 return err;
6275}
6276EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6277
6278void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6279 struct neighbour *n)
6280{
6281 struct net_device *lower_dev;
6282 struct list_head *iter;
6283
6284 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6285 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6286 continue;
6287 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6288 }
6289}
6290EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6291
6292static void dev_change_rx_flags(struct net_device *dev, int flags)
6293{
6294 const struct net_device_ops *ops = dev->netdev_ops;
6295
6296 if (ops->ndo_change_rx_flags)
6297 ops->ndo_change_rx_flags(dev, flags);
6298}
6299
6300static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6301{
6302 unsigned int old_flags = dev->flags;
6303 kuid_t uid;
6304 kgid_t gid;
6305
6306 ASSERT_RTNL();
6307
6308 dev->flags |= IFF_PROMISC;
6309 dev->promiscuity += inc;
6310 if (dev->promiscuity == 0) {
6311 /*
6312 * Avoid overflow.
6313 * If inc causes overflow, untouch promisc and return error.
6314 */
6315 if (inc < 0)
6316 dev->flags &= ~IFF_PROMISC;
6317 else {
6318 dev->promiscuity -= inc;
6319 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6320 dev->name);
6321 return -EOVERFLOW;
6322 }
6323 }
6324 if (dev->flags != old_flags) {
6325 pr_info("device %s %s promiscuous mode\n",
6326 dev->name,
6327 dev->flags & IFF_PROMISC ? "entered" : "left");
6328 if (audit_enabled) {
6329 current_uid_gid(&uid, &gid);
6330 audit_log(current->audit_context, GFP_ATOMIC,
6331 AUDIT_ANOM_PROMISCUOUS,
6332 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6333 dev->name, (dev->flags & IFF_PROMISC),
6334 (old_flags & IFF_PROMISC),
6335 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6336 from_kuid(&init_user_ns, uid),
6337 from_kgid(&init_user_ns, gid),
6338 audit_get_sessionid(current));
6339 }
6340
6341 dev_change_rx_flags(dev, IFF_PROMISC);
6342 }
6343 if (notify)
6344 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6345 return 0;
6346}
6347
6348/**
6349 * dev_set_promiscuity - update promiscuity count on a device
6350 * @dev: device
6351 * @inc: modifier
6352 *
6353 * Add or remove promiscuity from a device. While the count in the device
6354 * remains above zero the interface remains promiscuous. Once it hits zero
6355 * the device reverts back to normal filtering operation. A negative inc
6356 * value is used to drop promiscuity on the device.
6357 * Return 0 if successful or a negative errno code on error.
6358 */
6359int dev_set_promiscuity(struct net_device *dev, int inc)
6360{
6361 unsigned int old_flags = dev->flags;
6362 int err;
6363
6364 err = __dev_set_promiscuity(dev, inc, true);
6365 if (err < 0)
6366 return err;
6367 if (dev->flags != old_flags)
6368 dev_set_rx_mode(dev);
6369 return err;
6370}
6371EXPORT_SYMBOL(dev_set_promiscuity);
6372
6373static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6374{
6375 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6376
6377 ASSERT_RTNL();
6378
6379 dev->flags |= IFF_ALLMULTI;
6380 dev->allmulti += inc;
6381 if (dev->allmulti == 0) {
6382 /*
6383 * Avoid overflow.
6384 * If inc causes overflow, untouch allmulti and return error.
6385 */
6386 if (inc < 0)
6387 dev->flags &= ~IFF_ALLMULTI;
6388 else {
6389 dev->allmulti -= inc;
6390 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6391 dev->name);
6392 return -EOVERFLOW;
6393 }
6394 }
6395 if (dev->flags ^ old_flags) {
6396 dev_change_rx_flags(dev, IFF_ALLMULTI);
6397 dev_set_rx_mode(dev);
6398 if (notify)
6399 __dev_notify_flags(dev, old_flags,
6400 dev->gflags ^ old_gflags);
6401 }
6402 return 0;
6403}
6404
6405/**
6406 * dev_set_allmulti - update allmulti count on a device
6407 * @dev: device
6408 * @inc: modifier
6409 *
6410 * Add or remove reception of all multicast frames to a device. While the
6411 * count in the device remains above zero the interface remains listening
6412 * to all interfaces. Once it hits zero the device reverts back to normal
6413 * filtering operation. A negative @inc value is used to drop the counter
6414 * when releasing a resource needing all multicasts.
6415 * Return 0 if successful or a negative errno code on error.
6416 */
6417
6418int dev_set_allmulti(struct net_device *dev, int inc)
6419{
6420 return __dev_set_allmulti(dev, inc, true);
6421}
6422EXPORT_SYMBOL(dev_set_allmulti);
6423
6424/*
6425 * Upload unicast and multicast address lists to device and
6426 * configure RX filtering. When the device doesn't support unicast
6427 * filtering it is put in promiscuous mode while unicast addresses
6428 * are present.
6429 */
6430void __dev_set_rx_mode(struct net_device *dev)
6431{
6432 const struct net_device_ops *ops = dev->netdev_ops;
6433
6434 /* dev_open will call this function so the list will stay sane. */
6435 if (!(dev->flags&IFF_UP))
6436 return;
6437
6438 if (!netif_device_present(dev))
6439 return;
6440
6441 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6442 /* Unicast addresses changes may only happen under the rtnl,
6443 * therefore calling __dev_set_promiscuity here is safe.
6444 */
6445 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6446 __dev_set_promiscuity(dev, 1, false);
6447 dev->uc_promisc = true;
6448 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6449 __dev_set_promiscuity(dev, -1, false);
6450 dev->uc_promisc = false;
6451 }
6452 }
6453
6454 if (ops->ndo_set_rx_mode)
6455 ops->ndo_set_rx_mode(dev);
6456}
6457
/* Run __dev_set_rx_mode() with the device address lock held (BH disabled). */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
6464
6465/**
6466 * dev_get_flags - get flags reported to userspace
6467 * @dev: device
6468 *
6469 * Get the combination of flag bits exported through APIs to userspace.
6470 */
6471unsigned int dev_get_flags(const struct net_device *dev)
6472{
6473 unsigned int flags;
6474
6475 flags = (dev->flags & ~(IFF_PROMISC |
6476 IFF_ALLMULTI |
6477 IFF_RUNNING |
6478 IFF_LOWER_UP |
6479 IFF_DORMANT)) |
6480 (dev->gflags & (IFF_PROMISC |
6481 IFF_ALLMULTI));
6482
6483 if (netif_running(dev)) {
6484 if (netif_oper_up(dev))
6485 flags |= IFF_RUNNING;
6486 if (netif_carrier_ok(dev))
6487 flags |= IFF_LOWER_UP;
6488 if (netif_dormant(dev))
6489 flags |= IFF_DORMANT;
6490 }
6491
6492 return flags;
6493}
6494EXPORT_SYMBOL(dev_get_flags);
6495
6496int __dev_change_flags(struct net_device *dev, unsigned int flags)
6497{
6498 unsigned int old_flags = dev->flags;
6499 int ret;
6500
6501 ASSERT_RTNL();
6502
6503 /*
6504 * Set the flags on our device.
6505 */
6506
6507 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6508 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6509 IFF_AUTOMEDIA)) |
6510 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6511 IFF_ALLMULTI));
6512
6513 /*
6514 * Load in the correct multicast list now the flags have changed.
6515 */
6516
6517 if ((old_flags ^ flags) & IFF_MULTICAST)
6518 dev_change_rx_flags(dev, IFF_MULTICAST);
6519
6520 dev_set_rx_mode(dev);
6521
6522 /*
6523 * Have we downed the interface. We handle IFF_UP ourselves
6524 * according to user attempts to set it, rather than blindly
6525 * setting it.
6526 */
6527
6528 ret = 0;
6529 if ((old_flags ^ flags) & IFF_UP)
6530 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6531
6532 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6533 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6534 unsigned int old_flags = dev->flags;
6535
6536 dev->gflags ^= IFF_PROMISC;
6537
6538 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6539 if (dev->flags != old_flags)
6540 dev_set_rx_mode(dev);
6541 }
6542
6543 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6544 is important. Some (broken) drivers set IFF_PROMISC, when
6545 IFF_ALLMULTI is requested not asking us and not reporting.
6546 */
6547 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6548 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6549
6550 dev->gflags ^= IFF_ALLMULTI;
6551 __dev_set_allmulti(dev, inc, false);
6552 }
6553
6554 return ret;
6555}
6556
6557void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6558 unsigned int gchanges)
6559{
6560 unsigned int changes = dev->flags ^ old_flags;
6561
6562 if (gchanges)
6563 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6564
6565 if (changes & IFF_UP) {
6566 if (dev->flags & IFF_UP)
6567 call_netdevice_notifiers(NETDEV_UP, dev);
6568 else
6569 call_netdevice_notifiers(NETDEV_DOWN, dev);
6570 }
6571
6572 if (dev->flags & IFF_UP &&
6573 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6574 struct netdev_notifier_change_info change_info;
6575
6576 change_info.flags_changed = changes;
6577 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6578 &change_info.info);
6579 }
6580}
6581
6582/**
6583 * dev_change_flags - change device settings
6584 * @dev: device
6585 * @flags: device state flags
6586 *
6587 * Change settings on device based state flags. The flags are
6588 * in the userspace exported format.
6589 */
6590int dev_change_flags(struct net_device *dev, unsigned int flags)
6591{
6592 int ret;
6593 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6594
6595 ret = __dev_change_flags(dev, flags);
6596 if (ret < 0)
6597 return ret;
6598
6599 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6600 __dev_notify_flags(dev, old_flags, changes);
6601 return ret;
6602}
6603EXPORT_SYMBOL(dev_change_flags);
6604
6605static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6606{
6607 const struct net_device_ops *ops = dev->netdev_ops;
6608
6609 if (ops->ndo_change_mtu)
6610 return ops->ndo_change_mtu(dev, new_mtu);
6611
6612 dev->mtu = new_mtu;
6613 return 0;
6614}
6615
6616/**
6617 * dev_set_mtu - Change maximum transfer unit
6618 * @dev: device
6619 * @new_mtu: new transfer unit
6620 *
6621 * Change the maximum transfer size of the network device.
6622 */
6623int dev_set_mtu(struct net_device *dev, int new_mtu)
6624{
6625 int err, orig_mtu;
6626
6627 if (new_mtu == dev->mtu)
6628 return 0;
6629
6630 /* MTU must be positive, and in range */
6631 if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6632 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6633 dev->name, new_mtu, dev->min_mtu);
6634 return -EINVAL;
6635 }
6636
6637 if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6638 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6639 dev->name, new_mtu, dev->max_mtu);
6640 return -EINVAL;
6641 }
6642
6643 if (!netif_device_present(dev))
6644 return -ENODEV;
6645
6646 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6647 err = notifier_to_errno(err);
6648 if (err)
6649 return err;
6650
6651 orig_mtu = dev->mtu;
6652 err = __dev_set_mtu(dev, new_mtu);
6653
6654 if (!err) {
6655 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6656 err = notifier_to_errno(err);
6657 if (err) {
6658 /* setting mtu back and notifying everyone again,
6659 * so that they have a chance to revert changes.
6660 */
6661 __dev_set_mtu(dev, orig_mtu);
6662 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6663 }
6664 }
6665 return err;
6666}
6667EXPORT_SYMBOL(dev_set_mtu);
6668
6669/**
6670 * dev_set_group - Change group this device belongs to
6671 * @dev: device
6672 * @new_group: group this device should belong to
6673 */
6674void dev_set_group(struct net_device *dev, int new_group)
6675{
6676 dev->group = new_group;
6677}
6678EXPORT_SYMBOL(dev_set_group);
6679
6680/**
6681 * dev_set_mac_address - Change Media Access Control Address
6682 * @dev: device
6683 * @sa: new address
6684 *
6685 * Change the hardware (MAC) address of the device
6686 */
6687int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6688{
6689 const struct net_device_ops *ops = dev->netdev_ops;
6690 int err;
6691
6692 if (!ops->ndo_set_mac_address)
6693 return -EOPNOTSUPP;
6694 if (sa->sa_family != dev->type)
6695 return -EINVAL;
6696 if (!netif_device_present(dev))
6697 return -ENODEV;
6698 err = ops->ndo_set_mac_address(dev, sa);
6699 if (err)
6700 return err;
6701 dev->addr_assign_type = NET_ADDR_SET;
6702 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6703 add_device_randomness(dev->dev_addr, dev->addr_len);
6704 return 0;
6705}
6706EXPORT_SYMBOL(dev_set_mac_address);
6707
6708/**
6709 * dev_change_carrier - Change device carrier
6710 * @dev: device
6711 * @new_carrier: new value
6712 *
6713 * Change device carrier
6714 */
6715int dev_change_carrier(struct net_device *dev, bool new_carrier)
6716{
6717 const struct net_device_ops *ops = dev->netdev_ops;
6718
6719 if (!ops->ndo_change_carrier)
6720 return -EOPNOTSUPP;
6721 if (!netif_device_present(dev))
6722 return -ENODEV;
6723 return ops->ndo_change_carrier(dev, new_carrier);
6724}
6725EXPORT_SYMBOL(dev_change_carrier);
6726
6727/**
6728 * dev_get_phys_port_id - Get device physical port ID
6729 * @dev: device
6730 * @ppid: port ID
6731 *
6732 * Get device physical port ID
6733 */
6734int dev_get_phys_port_id(struct net_device *dev,
6735 struct netdev_phys_item_id *ppid)
6736{
6737 const struct net_device_ops *ops = dev->netdev_ops;
6738
6739 if (!ops->ndo_get_phys_port_id)
6740 return -EOPNOTSUPP;
6741 return ops->ndo_get_phys_port_id(dev, ppid);
6742}
6743EXPORT_SYMBOL(dev_get_phys_port_id);
6744
6745/**
6746 * dev_get_phys_port_name - Get device physical port name
6747 * @dev: device
6748 * @name: port name
6749 * @len: limit of bytes to copy to name
6750 *
6751 * Get device physical port name
6752 */
6753int dev_get_phys_port_name(struct net_device *dev,
6754 char *name, size_t len)
6755{
6756 const struct net_device_ops *ops = dev->netdev_ops;
6757
6758 if (!ops->ndo_get_phys_port_name)
6759 return -EOPNOTSUPP;
6760 return ops->ndo_get_phys_port_name(dev, name, len);
6761}
6762EXPORT_SYMBOL(dev_get_phys_port_name);
6763
6764/**
6765 * dev_change_proto_down - update protocol port state information
6766 * @dev: device
6767 * @proto_down: new value
6768 *
6769 * This info can be used by switch drivers to set the phys state of the
6770 * port.
6771 */
6772int dev_change_proto_down(struct net_device *dev, bool proto_down)
6773{
6774 const struct net_device_ops *ops = dev->netdev_ops;
6775
6776 if (!ops->ndo_change_proto_down)
6777 return -EOPNOTSUPP;
6778 if (!netif_device_present(dev))
6779 return -ENODEV;
6780 return ops->ndo_change_proto_down(dev, proto_down);
6781}
6782EXPORT_SYMBOL(dev_change_proto_down);
6783
6784/**
6785 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6786 * @dev: device
6787 * @fd: new program fd or negative value to clear
6788 * @flags: xdp-related flags
6789 *
6790 * Set or clear a bpf program for a device
6791 */
6792int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags)
6793{
6794 const struct net_device_ops *ops = dev->netdev_ops;
6795 struct bpf_prog *prog = NULL;
6796 struct netdev_xdp xdp;
6797 int err;
6798
6799 ASSERT_RTNL();
6800
6801 if (!ops->ndo_xdp)
6802 return -EOPNOTSUPP;
6803 if (fd >= 0) {
6804 if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
6805 memset(&xdp, 0, sizeof(xdp));
6806 xdp.command = XDP_QUERY_PROG;
6807
6808 err = ops->ndo_xdp(dev, &xdp);
6809 if (err < 0)
6810 return err;
6811 if (xdp.prog_attached)
6812 return -EBUSY;
6813 }
6814
6815 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6816 if (IS_ERR(prog))
6817 return PTR_ERR(prog);
6818 }
6819
6820 memset(&xdp, 0, sizeof(xdp));
6821 xdp.command = XDP_SETUP_PROG;
6822 xdp.prog = prog;
6823
6824 err = ops->ndo_xdp(dev, &xdp);
6825 if (err < 0 && prog)
6826 bpf_prog_put(prog);
6827
6828 return err;
6829}
6830EXPORT_SYMBOL(dev_change_xdp_fd);
6831
6832/**
6833 * dev_new_index - allocate an ifindex
6834 * @net: the applicable net namespace
6835 *
6836 * Returns a suitable unique value for a new device interface
6837 * number. The caller must hold the rtnl semaphore or the
6838 * dev_base_lock to be sure it remains unique.
6839 */
6840static int dev_new_index(struct net *net)
6841{
6842 int ifindex = net->ifindex;
6843 for (;;) {
6844 if (++ifindex <= 0)
6845 ifindex = 1;
6846 if (!__dev_get_by_index(net, ifindex))
6847 return net->ifindex = ifindex;
6848 }
6849}
6850
6851/* Delayed registration/unregisteration */
6852static LIST_HEAD(net_todo_list);
6853DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6854
6855static void net_set_todo(struct net_device *dev)
6856{
6857 list_add_tail(&dev->todo_list, &net_todo_list);
6858 dev_net(dev)->dev_unreg_count++;
6859}
6860
6861static void rollback_registered_many(struct list_head *head)
6862{
6863 struct net_device *dev, *tmp;
6864 LIST_HEAD(close_head);
6865
6866 BUG_ON(dev_boot_phase);
6867 ASSERT_RTNL();
6868
6869 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6870 /* Some devices call without registering
6871 * for initialization unwind. Remove those
6872 * devices and proceed with the remaining.
6873 */
6874 if (dev->reg_state == NETREG_UNINITIALIZED) {
6875 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6876 dev->name, dev);
6877
6878 WARN_ON(1);
6879 list_del(&dev->unreg_list);
6880 continue;
6881 }
6882 dev->dismantle = true;
6883 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6884 }
6885
6886 /* If device is running, close it first. */
6887 list_for_each_entry(dev, head, unreg_list)
6888 list_add_tail(&dev->close_list, &close_head);
6889 dev_close_many(&close_head, true);
6890
6891 list_for_each_entry(dev, head, unreg_list) {
6892 /* And unlink it from device chain. */
6893 unlist_netdevice(dev);
6894
6895 dev->reg_state = NETREG_UNREGISTERING;
6896 }
6897 flush_all_backlogs();
6898
6899 synchronize_net();
6900
6901 list_for_each_entry(dev, head, unreg_list) {
6902 struct sk_buff *skb = NULL;
6903
6904 /* Shutdown queueing discipline. */
6905 dev_shutdown(dev);
6906
6907
6908 /* Notify protocols, that we are about to destroy
6909 this device. They should clean all the things.
6910 */
6911 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6912
6913 if (!dev->rtnl_link_ops ||
6914 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6915 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6916 GFP_KERNEL);
6917
6918 /*
6919 * Flush the unicast and multicast chains
6920 */
6921 dev_uc_flush(dev);
6922 dev_mc_flush(dev);
6923
6924 if (dev->netdev_ops->ndo_uninit)
6925 dev->netdev_ops->ndo_uninit(dev);
6926
6927 if (skb)
6928 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6929
6930 /* Notifier chain MUST detach us all upper devices. */
6931 WARN_ON(netdev_has_any_upper_dev(dev));
6932 WARN_ON(netdev_has_any_lower_dev(dev));
6933
6934 /* Remove entries from kobject tree */
6935 netdev_unregister_kobject(dev);
6936#ifdef CONFIG_XPS
6937 /* Remove XPS queueing entries */
6938 netif_reset_xps_queues_gt(dev, 0);
6939#endif
6940 }
6941
6942 synchronize_net();
6943
6944 list_for_each_entry(dev, head, unreg_list)
6945 dev_put(dev);
6946}
6947
6948static void rollback_registered(struct net_device *dev)
6949{
6950 LIST_HEAD(single);
6951
6952 list_add(&dev->unreg_list, &single);
6953 rollback_registered_many(&single);
6954 list_del(&single);
6955}
6956
6957static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6958 struct net_device *upper, netdev_features_t features)
6959{
6960 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6961 netdev_features_t feature;
6962 int feature_bit;
6963
6964 for_each_netdev_feature(&upper_disables, feature_bit) {
6965 feature = __NETIF_F_BIT(feature_bit);
6966 if (!(upper->wanted_features & feature)
6967 && (features & feature)) {
6968 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6969 &feature, upper->name);
6970 features &= ~feature;
6971 }
6972 }
6973
6974 return features;
6975}
6976
6977static void netdev_sync_lower_features(struct net_device *upper,
6978 struct net_device *lower, netdev_features_t features)
6979{
6980 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6981 netdev_features_t feature;
6982 int feature_bit;
6983
6984 for_each_netdev_feature(&upper_disables, feature_bit) {
6985 feature = __NETIF_F_BIT(feature_bit);
6986 if (!(features & feature) && (lower->features & feature)) {
6987 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6988 &feature, lower->name);
6989 lower->wanted_features &= ~feature;
6990 netdev_update_features(lower);
6991
6992 if (unlikely(lower->features & feature))
6993 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6994 &feature, lower->name);
6995 }
6996 }
6997}
6998
6999static netdev_features_t netdev_fix_features(struct net_device *dev,
7000 netdev_features_t features)
7001{
7002 /* Fix illegal checksum combinations */
7003 if ((features & NETIF_F_HW_CSUM) &&
7004 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
7005 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
7006 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
7007 }
7008
7009 /* TSO requires that SG is present as well. */
7010 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
7011 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
7012 features &= ~NETIF_F_ALL_TSO;
7013 }
7014
7015 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
7016 !(features & NETIF_F_IP_CSUM)) {
7017 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
7018 features &= ~NETIF_F_TSO;
7019 features &= ~NETIF_F_TSO_ECN;
7020 }
7021
7022 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
7023 !(features & NETIF_F_IPV6_CSUM)) {
7024 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
7025 features &= ~NETIF_F_TSO6;
7026 }
7027
7028 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
7029 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
7030 features &= ~NETIF_F_TSO_MANGLEID;
7031
7032 /* TSO ECN requires that TSO is present as well. */
7033 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
7034 features &= ~NETIF_F_TSO_ECN;
7035
7036 /* Software GSO depends on SG. */
7037 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
7038 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
7039 features &= ~NETIF_F_GSO;
7040 }
7041
7042 /* UFO needs SG and checksumming */
7043 if (features & NETIF_F_UFO) {
7044 /* maybe split UFO into V4 and V6? */
7045 if (!(features & NETIF_F_HW_CSUM) &&
7046 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
7047 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
7048 netdev_dbg(dev,
7049 "Dropping NETIF_F_UFO since no checksum offload features.\n");
7050 features &= ~NETIF_F_UFO;
7051 }
7052
7053 if (!(features & NETIF_F_SG)) {
7054 netdev_dbg(dev,
7055 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
7056 features &= ~NETIF_F_UFO;
7057 }
7058 }
7059
7060 /* GSO partial features require GSO partial be set */
7061 if ((features & dev->gso_partial_features) &&
7062 !(features & NETIF_F_GSO_PARTIAL)) {
7063 netdev_dbg(dev,
7064 "Dropping partially supported GSO features since no GSO partial.\n");
7065 features &= ~dev->gso_partial_features;
7066 }
7067
7068#ifdef CONFIG_NET_RX_BUSY_POLL
7069 if (dev->netdev_ops->ndo_busy_poll)
7070 features |= NETIF_F_BUSY_POLL;
7071 else
7072#endif
7073 features &= ~NETIF_F_BUSY_POLL;
7074
7075 return features;
7076}
7077
7078int __netdev_update_features(struct net_device *dev)
7079{
7080 struct net_device *upper, *lower;
7081 netdev_features_t features;
7082 struct list_head *iter;
7083 int err = -1;
7084
7085 ASSERT_RTNL();
7086
7087 features = netdev_get_wanted_features(dev);
7088
7089 if (dev->netdev_ops->ndo_fix_features)
7090 features = dev->netdev_ops->ndo_fix_features(dev, features);
7091
7092 /* driver might be less strict about feature dependencies */
7093 features = netdev_fix_features(dev, features);
7094
7095 /* some features can't be enabled if they're off an an upper device */
7096 netdev_for_each_upper_dev_rcu(dev, upper, iter)
7097 features = netdev_sync_upper_features(dev, upper, features);
7098
7099 if (dev->features == features)
7100 goto sync_lower;
7101
7102 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
7103 &dev->features, &features);
7104
7105 if (dev->netdev_ops->ndo_set_features)
7106 err = dev->netdev_ops->ndo_set_features(dev, features);
7107 else
7108 err = 0;
7109
7110 if (unlikely(err < 0)) {
7111 netdev_err(dev,
7112 "set_features() failed (%d); wanted %pNF, left %pNF\n",
7113 err, &features, &dev->features);
7114 /* return non-0 since some features might have changed and
7115 * it's better to fire a spurious notification than miss it
7116 */
7117 return -1;
7118 }
7119
7120sync_lower:
7121 /* some features must be disabled on lower devices when disabled
7122 * on an upper device (think: bonding master or bridge)
7123 */
7124 netdev_for_each_lower_dev(dev, lower, iter)
7125 netdev_sync_lower_features(dev, lower, features);
7126
7127 if (!err)
7128 dev->features = features;
7129
7130 return err < 0 ? 0 : 1;
7131}
7132
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	/* Any non-zero result, including the -1 failure value, notifies. */
	if (!changed)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
7147
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: notification is unconditional. */
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
7164
7165/**
7166 * netif_stacked_transfer_operstate - transfer operstate
7167 * @rootdev: the root or lower level device to transfer state from
7168 * @dev: the device to transfer operstate to
7169 *
7170 * Transfer operational state from root to device. This is normally
7171 * called when a stacking relationship exists between the root
7172 * device and the device(a leaf device).
7173 */
7174void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7175 struct net_device *dev)
7176{
7177 if (rootdev->operstate == IF_OPER_DORMANT)
7178 netif_dormant_on(dev);
7179 else
7180 netif_dormant_off(dev);
7181
7182 if (netif_carrier_ok(rootdev)) {
7183 if (!netif_carrier_ok(dev))
7184 netif_carrier_on(dev);
7185 } else {
7186 if (netif_carrier_ok(dev))
7187 netif_carrier_off(dev);
7188 }
7189}
7190EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7191
#ifdef CONFIG_SYSFS
/*
 * Allocate the zeroed array of dev->num_rx_queues receive queues, store it
 * in dev->_rx and point every entry back at @dev.  kzalloc() is tried
 * first with vzalloc() as fallback.  Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nqueues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	size_t bytes = nqueues * sizeof(*queues);
	unsigned int idx;

	BUG_ON(nqueues < 1);

	queues = kzalloc(bytes, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!queues)
		queues = vzalloc(bytes);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (idx = 0; idx < nqueues; idx++)
		queues[idx].dev = dev;
	return 0;
}
#endif
7214
7215static void netdev_init_one_queue(struct net_device *dev,
7216 struct netdev_queue *queue, void *_unused)
7217{
7218 /* Initialize queue lock */
7219 spin_lock_init(&queue->_xmit_lock);
7220 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7221 queue->xmit_lock_owner = -1;
7222 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7223 queue->dev = dev;
7224#ifdef CONFIG_BQL
7225 dql_init(&queue->dql, HZ);
7226#endif
7227}
7228
7229static void netif_free_tx_queues(struct net_device *dev)
7230{
7231 kvfree(dev->_tx);
7232}
7233
7234static int netif_alloc_netdev_queues(struct net_device *dev)
7235{
7236 unsigned int count = dev->num_tx_queues;
7237 struct netdev_queue *tx;
7238 size_t sz = count * sizeof(*tx);
7239
7240 if (count < 1 || count > 0xffff)
7241 return -EINVAL;
7242
7243 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7244 if (!tx) {
7245 tx = vzalloc(sz);
7246 if (!tx)
7247 return -ENOMEM;
7248 }
7249 dev->_tx = tx;
7250
7251 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7252 spin_lock_init(&dev->tx_global_lock);
7253
7254 return 0;
7255}
7256
7257void netif_tx_stop_all_queues(struct net_device *dev)
7258{
7259 unsigned int i;
7260
7261 for (i = 0; i < dev->num_tx_queues; i++) {
7262 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7263 netif_tx_stop_queue(txq);
7264 }
7265}
7266EXPORT_SYMBOL(netif_tx_stop_all_queues);
7267
7268/**
7269 * register_netdevice - register a network device
7270 * @dev: device to register
7271 *
7272 * Take a completed network device structure and add it to the kernel
7273 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7274 * chain. 0 is returned on success. A negative errno code is returned
7275 * on a failure to set up the device, or if the name is a duplicate.
7276 *
7277 * Callers must hold the rtnl semaphore. You may want
7278 * register_netdev() instead of this.
7279 *
7280 * BUGS:
7281 * The locking appears insufficient to guarantee two parallel registers
7282 * will not get the same name.
7283 */
7284
7285int register_netdevice(struct net_device *dev)
7286{
7287 int ret;
7288 struct net *net = dev_net(dev);
7289
7290 BUG_ON(dev_boot_phase);
7291 ASSERT_RTNL();
7292
7293 might_sleep();
7294
7295 /* When net_device's are persistent, this will be fatal. */
7296 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7297 BUG_ON(!net);
7298
7299 spin_lock_init(&dev->addr_list_lock);
7300 netdev_set_addr_lockdep_class(dev);
7301
7302 ret = dev_get_valid_name(net, dev, dev->name);
7303 if (ret < 0)
7304 goto out;
7305
7306 /* Init, if this function is available */
7307 if (dev->netdev_ops->ndo_init) {
7308 ret = dev->netdev_ops->ndo_init(dev);
7309 if (ret) {
7310 if (ret > 0)
7311 ret = -EIO;
7312 goto out;
7313 }
7314 }
7315
7316 if (((dev->hw_features | dev->features) &
7317 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7318 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7319 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7320 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7321 ret = -EINVAL;
7322 goto err_uninit;
7323 }
7324
7325 ret = -EBUSY;
7326 if (!dev->ifindex)
7327 dev->ifindex = dev_new_index(net);
7328 else if (__dev_get_by_index(net, dev->ifindex))
7329 goto err_uninit;
7330
7331 /* Transfer changeable features to wanted_features and enable
7332 * software offloads (GSO and GRO).
7333 */
7334 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7335 dev->features |= NETIF_F_SOFT_FEATURES;
7336 dev->wanted_features = dev->features & dev->hw_features;
7337
7338 if (!(dev->flags & IFF_LOOPBACK))
7339 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7340
7341 /* If IPv4 TCP segmentation offload is supported we should also
7342 * allow the device to enable segmenting the frame with the option
7343 * of ignoring a static IP ID value. This doesn't enable the
7344 * feature itself but allows the user to enable it later.
7345 */
7346 if (dev->hw_features & NETIF_F_TSO)
7347 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7348 if (dev->vlan_features & NETIF_F_TSO)
7349 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7350 if (dev->mpls_features & NETIF_F_TSO)
7351 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7352 if (dev->hw_enc_features & NETIF_F_TSO)
7353 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7354
7355 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7356 */
7357 dev->vlan_features |= NETIF_F_HIGHDMA;
7358
7359 /* Make NETIF_F_SG inheritable to tunnel devices.
7360 */
7361 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7362
7363 /* Make NETIF_F_SG inheritable to MPLS.
7364 */
7365 dev->mpls_features |= NETIF_F_SG;
7366
7367 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7368 ret = notifier_to_errno(ret);
7369 if (ret)
7370 goto err_uninit;
7371
7372 ret = netdev_register_kobject(dev);
7373 if (ret)
7374 goto err_uninit;
7375 dev->reg_state = NETREG_REGISTERED;
7376
7377 __netdev_update_features(dev);
7378
7379 /*
7380 * Default initial state at registry is that the
7381 * device is present.
7382 */
7383
7384 set_bit(__LINK_STATE_PRESENT, &dev->state);
7385
7386 linkwatch_init_dev(dev);
7387
7388 dev_init_scheduler(dev);
7389 dev_hold(dev);
7390 list_netdevice(dev);
7391 add_device_randomness(dev->dev_addr, dev->addr_len);
7392
7393 /* If the device has permanent device address, driver should
7394 * set dev_addr and also addr_assign_type should be set to
7395 * NET_ADDR_PERM (default value).
7396 */
7397 if (dev->addr_assign_type == NET_ADDR_PERM)
7398 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7399
7400 /* Notify protocols, that a new device appeared. */
7401 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7402 ret = notifier_to_errno(ret);
7403 if (ret) {
7404 rollback_registered(dev);
7405 dev->reg_state = NETREG_UNREGISTERED;
7406 }
7407 /*
7408 * Prevent userspace races by waiting until the network
7409 * device is fully setup before sending notifications.
7410 */
7411 if (!dev->rtnl_link_ops ||
7412 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7413 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7414
7415out:
7416 return ret;
7417
7418err_uninit:
7419 if (dev->netdev_ops->ndo_uninit)
7420 dev->netdev_ops->ndo_uninit(dev);
7421 goto out;
7422}
7423EXPORT_SYMBOL(register_netdevice);
7424
7425/**
7426 * init_dummy_netdev - init a dummy network device for NAPI
7427 * @dev: device to init
7428 *
7429 * This takes a network device structure and initialize the minimum
7430 * amount of fields so it can be used to schedule NAPI polls without
7431 * registering a full blown interface. This is to be used by drivers
7432 * that need to tie several hardware interfaces to a single NAPI
7433 * poll scheduler due to HW limitations.
7434 */
7435int init_dummy_netdev(struct net_device *dev)
7436{
7437 /* Clear everything. Note we don't initialize spinlocks
7438 * are they aren't supposed to be taken by any of the
7439 * NAPI code and this dummy netdev is supposed to be
7440 * only ever used for NAPI polls
7441 */
7442 memset(dev, 0, sizeof(struct net_device));
7443
7444 /* make sure we BUG if trying to hit standard
7445 * register/unregister code path
7446 */
7447 dev->reg_state = NETREG_DUMMY;
7448
7449 /* NAPI wants this */
7450 INIT_LIST_HEAD(&dev->napi_list);
7451
7452 /* a dummy interface is started by default */
7453 set_bit(__LINK_STATE_PRESENT, &dev->state);
7454 set_bit(__LINK_STATE_START, &dev->state);
7455
7456 /* Note : We dont allocate pcpu_refcnt for dummy devices,
7457 * because users of this 'device' dont need to change
7458 * its refcount.
7459 */
7460
7461 return 0;
7462}
7463EXPORT_SYMBOL_GPL(init_dummy_netdev);
7464
7465
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice that takes the rtnl semaphore
 *	and expands the device name if you passed a format string to
 *	alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
7489
7490int netdev_refcnt_read(const struct net_device *dev)
7491{
7492 int i, refcnt = 0;
7493
7494 for_each_possible_cpu(i)
7495 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7496 return refcnt;
7497}
7498EXPORT_SYMBOL(netdev_refcnt_read);
7499
7500/**
7501 * netdev_wait_allrefs - wait until all references are gone.
7502 * @dev: target net_device
7503 *
7504 * This is called when unregistering network devices.
7505 *
7506 * Any protocol or device that holds a reference should register
7507 * for netdevice notification, and cleanup and put back the
7508 * reference if they receive an UNREGISTER event.
7509 * We can get stuck here if buggy protocols don't correctly
7510 * call dev_put.
7511 */
7512static void netdev_wait_allrefs(struct net_device *dev)
7513{
7514 unsigned long rebroadcast_time, warning_time;
7515 int refcnt;
7516
7517 linkwatch_forget_dev(dev);
7518
7519 rebroadcast_time = warning_time = jiffies;
7520 refcnt = netdev_refcnt_read(dev);
7521
7522 while (refcnt != 0) {
7523 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7524 rtnl_lock();
7525
7526 /* Rebroadcast unregister notification */
7527 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7528
7529 __rtnl_unlock();
7530 rcu_barrier();
7531 rtnl_lock();
7532
7533 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7534 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7535 &dev->state)) {
7536 /* We must not have linkwatch events
7537 * pending on unregister. If this
7538 * happens, we simply run the queue
7539 * unscheduled, resulting in a noop
7540 * for this device.
7541 */
7542 linkwatch_run_queue();
7543 }
7544
7545 __rtnl_unlock();
7546
7547 rebroadcast_time = jiffies;
7548 }
7549
7550 msleep(250);
7551
7552 refcnt = netdev_refcnt_read(dev);
7553
7554 if (time_after(jiffies, warning_time + 10 * HZ)) {
7555 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7556 dev->name, refcnt);
7557 warning_time = jiffies;
7558 }
7559 }
7560}
7561
7562/* The sequence is:
7563 *
7564 * rtnl_lock();
7565 * ...
7566 * register_netdevice(x1);
7567 * register_netdevice(x2);
7568 * ...
7569 * unregister_netdevice(y1);
7570 * unregister_netdevice(y2);
7571 * ...
7572 * rtnl_unlock();
7573 * free_netdev(y1);
7574 * free_netdev(y2);
7575 *
7576 * We are invoked by rtnl_unlock().
7577 * This allows us to deal with problems:
7578 * 1) We can delete sysfs objects which invoke hotplug
7579 * without deadlocking with linkwatch via keventd.
7580 * 2) Since we run with the RTNL semaphore not held, we can sleep
7581 * safely in order to wait for the netdev refcnt to drop to zero.
7582 *
7583 * We must not return until all unregister events added during
7584 * the interval the lock was held have been completed.
7585 */
7586void netdev_run_todo(void)
7587{
7588 struct list_head list;
7589
7590 /* Snapshot list, allow later requests */
7591 list_replace_init(&net_todo_list, &list);
7592
7593 __rtnl_unlock();
7594
7595
7596 /* Wait for rcu callbacks to finish before next phase */
7597 if (!list_empty(&list))
7598 rcu_barrier();
7599
7600 while (!list_empty(&list)) {
7601 struct net_device *dev
7602 = list_first_entry(&list, struct net_device, todo_list);
7603 list_del(&dev->todo_list);
7604
7605 rtnl_lock();
7606 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7607 __rtnl_unlock();
7608
7609 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7610 pr_err("network todo '%s' but state %d\n",
7611 dev->name, dev->reg_state);
7612 dump_stack();
7613 continue;
7614 }
7615
7616 dev->reg_state = NETREG_UNREGISTERED;
7617
7618 netdev_wait_allrefs(dev);
7619
7620 /* paranoia */
7621 BUG_ON(netdev_refcnt_read(dev));
7622 BUG_ON(!list_empty(&dev->ptype_all));
7623 BUG_ON(!list_empty(&dev->ptype_specific));
7624 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7625 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7626 WARN_ON(dev->dn_ptr);
7627
7628 if (dev->destructor)
7629 dev->destructor(dev);
7630
7631 /* Report a network device has been unregistered */
7632 rtnl_lock();
7633 dev_net(dev)->dev_unreg_count--;
7634 __rtnl_unlock();
7635 wake_up(&netdev_unregistering_wq);
7636
7637 /* Free network device */
7638 kobject_put(&dev->dev.kobj);
7639 }
7640}
7641
7642/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7643 * all the same fields in the same order as net_device_stats, with only
7644 * the type differing, but rtnl_link_stats64 may have additional fields
7645 * at the end for newer counters.
7646 */
7647void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7648 const struct net_device_stats *netdev_stats)
7649{
7650#if BITS_PER_LONG == 64
7651 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7652 memcpy(stats64, netdev_stats, sizeof(*stats64));
7653 /* zero out counters that only exist in rtnl_link_stats64 */
7654 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7655 sizeof(*stats64) - sizeof(*netdev_stats));
7656#else
7657 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7658 const unsigned long *src = (const unsigned long *)netdev_stats;
7659 u64 *dst = (u64 *)stats64;
7660
7661 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7662 for (i = 0; i < n; i++)
7663 dst[i] = src[i];
7664 /* zero out counters that only exist in rtnl_link_stats64 */
7665 memset((char *)stats64 + n * sizeof(u64), 0,
7666 sizeof(*stats64) - n * sizeof(u64));
7667#endif
7668}
7669EXPORT_SYMBOL(netdev_stats_to_stats64);
7670
7671/**
7672 * dev_get_stats - get network device statistics
7673 * @dev: device to get statistics from
7674 * @storage: place to store stats
7675 *
7676 * Get network statistics from device. Return @storage.
7677 * The device driver may provide its own method by setting
7678 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7679 * otherwise the internal statistics structure is used.
7680 */
7681struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7682 struct rtnl_link_stats64 *storage)
7683{
7684 const struct net_device_ops *ops = dev->netdev_ops;
7685
7686 if (ops->ndo_get_stats64) {
7687 memset(storage, 0, sizeof(*storage));
7688 ops->ndo_get_stats64(dev, storage);
7689 } else if (ops->ndo_get_stats) {
7690 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7691 } else {
7692 netdev_stats_to_stats64(storage, &dev->stats);
7693 }
7694 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7695 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7696 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7697 return storage;
7698}
7699EXPORT_SYMBOL(dev_get_stats);
7700
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	/* Start from whatever ingress queue the device already has. */
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			/*
			 * Initialise the queue and point both qdisc slots at
			 * noop_qdisc before publishing it on the device.
			 */
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	/* NULL when none existed and none could be (or was) allocated. */
	return ingress;
}
7718
7719static const struct ethtool_ops default_ethtool_ops;
7720
7721void netdev_set_default_ethtool_ops(struct net_device *dev,
7722 const struct ethtool_ops *ops)
7723{
7724 if (dev->ethtool_ops == &default_ethtool_ops)
7725 dev->ethtool_ops = ops;
7726}
7727EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7728
7729void netdev_freemem(struct net_device *dev)
7730{
7731 char *addr = (char *)dev - dev->padded;
7732
7733 kvfree(addr);
7734}
7735
7736/**
7737 * alloc_netdev_mqs - allocate network device
7738 * @sizeof_priv: size of private data to allocate space for
7739 * @name: device name format string
7740 * @name_assign_type: origin of device name
7741 * @setup: callback to initialize device
7742 * @txqs: the number of TX subqueues to allocate
7743 * @rxqs: the number of RX subqueues to allocate
7744 *
7745 * Allocates a struct net_device with private data area for driver use
7746 * and performs basic initialization. Also allocates subqueue structs
7747 * for each queue on the device.
7748 */
7749struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7750 unsigned char name_assign_type,
7751 void (*setup)(struct net_device *),
7752 unsigned int txqs, unsigned int rxqs)
7753{
7754 struct net_device *dev;
7755 size_t alloc_size;
7756 struct net_device *p;
7757
7758 BUG_ON(strlen(name) >= sizeof(dev->name));
7759
7760 if (txqs < 1) {
7761 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7762 return NULL;
7763 }
7764
7765#ifdef CONFIG_SYSFS
7766 if (rxqs < 1) {
7767 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7768 return NULL;
7769 }
7770#endif
7771
7772 alloc_size = sizeof(struct net_device);
7773 if (sizeof_priv) {
7774 /* ensure 32-byte alignment of private area */
7775 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7776 alloc_size += sizeof_priv;
7777 }
7778 /* ensure 32-byte alignment of whole construct */
7779 alloc_size += NETDEV_ALIGN - 1;
7780
7781 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7782 if (!p)
7783 p = vzalloc(alloc_size);
7784 if (!p)
7785 return NULL;
7786
7787 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7788 dev->padded = (char *)dev - (char *)p;
7789
7790 dev->pcpu_refcnt = alloc_percpu(int);
7791 if (!dev->pcpu_refcnt)
7792 goto free_dev;
7793
7794 if (dev_addr_init(dev))
7795 goto free_pcpu;
7796
7797 dev_mc_init(dev);
7798 dev_uc_init(dev);
7799
7800 dev_net_set(dev, &init_net);
7801
7802 dev->gso_max_size = GSO_MAX_SIZE;
7803 dev->gso_max_segs = GSO_MAX_SEGS;
7804
7805 INIT_LIST_HEAD(&dev->napi_list);
7806 INIT_LIST_HEAD(&dev->unreg_list);
7807 INIT_LIST_HEAD(&dev->close_list);
7808 INIT_LIST_HEAD(&dev->link_watch_list);
7809 INIT_LIST_HEAD(&dev->adj_list.upper);
7810 INIT_LIST_HEAD(&dev->adj_list.lower);
7811 INIT_LIST_HEAD(&dev->ptype_all);
7812 INIT_LIST_HEAD(&dev->ptype_specific);
7813#ifdef CONFIG_NET_SCHED
7814 hash_init(dev->qdisc_hash);
7815#endif
7816 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7817 setup(dev);
7818
7819 if (!dev->tx_queue_len) {
7820 dev->priv_flags |= IFF_NO_QUEUE;
7821 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
7822 }
7823
7824 dev->num_tx_queues = txqs;
7825 dev->real_num_tx_queues = txqs;
7826 if (netif_alloc_netdev_queues(dev))
7827 goto free_all;
7828
7829#ifdef CONFIG_SYSFS
7830 dev->num_rx_queues = rxqs;
7831 dev->real_num_rx_queues = rxqs;
7832 if (netif_alloc_rx_queues(dev))
7833 goto free_all;
7834#endif
7835
7836 strcpy(dev->name, name);
7837 dev->name_assign_type = name_assign_type;
7838 dev->group = INIT_NETDEV_GROUP;
7839 if (!dev->ethtool_ops)
7840 dev->ethtool_ops = &default_ethtool_ops;
7841
7842 nf_hook_ingress_init(dev);
7843
7844 return dev;
7845
7846free_all:
7847 free_netdev(dev);
7848 return NULL;
7849
7850free_pcpu:
7851 free_percpu(dev->pcpu_refcnt);
7852free_dev:
7853 netdev_freemem(dev);
7854 return NULL;
7855}
7856EXPORT_SYMBOL(alloc_netdev_mqs);
7857
7858/**
7859 * free_netdev - free network device
7860 * @dev: device
7861 *
7862 * This function does the last stage of destroying an allocated device
7863 * interface. The reference to the device object is released.
7864 * If this is the last reference then it will be freed.
7865 * Must be called in process context.
7866 */
7867void free_netdev(struct net_device *dev)
7868{
7869 struct napi_struct *p, *n;
7870
7871 might_sleep();
7872 netif_free_tx_queues(dev);
7873#ifdef CONFIG_SYSFS
7874 kvfree(dev->_rx);
7875#endif
7876
7877 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7878
7879 /* Flush device addresses */
7880 dev_addr_flush(dev);
7881
7882 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7883 netif_napi_del(p);
7884
7885 free_percpu(dev->pcpu_refcnt);
7886 dev->pcpu_refcnt = NULL;
7887
7888 /* Compatibility with error handling in drivers */
7889 if (dev->reg_state == NETREG_UNINITIALIZED) {
7890 netdev_freemem(dev);
7891 return;
7892 }
7893
7894 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7895 dev->reg_state = NETREG_RELEASED;
7896
7897 /* will free via device release */
7898 put_device(&dev->dev);
7899}
7900EXPORT_SYMBOL(free_netdev);
7901
7902/**
7903 * synchronize_net - Synchronize with packet receive processing
7904 *
7905 * Wait for packets currently being received to be done.
7906 * Does not block later packets from starting.
7907 */
void synchronize_net(void)
{
	might_sleep();

	/* Without RTNL held the ordinary grace period is used. */
	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	/* RTNL is held: use the expedited grace period instead. */
	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
7917
7918/**
7919 * unregister_netdevice_queue - remove device from the kernel
7920 * @dev: device
7921 * @head: list
7922 *
7923 * This function shuts down a device interface and removes it
7924 * from the kernel tables.
7925 * If head not NULL, device is queued to be unregistered later.
7926 *
7927 * Callers must hold the rtnl semaphore. You may want
7928 * unregister_netdev() instead of this.
7929 */
7930
7931void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7932{
7933 ASSERT_RTNL();
7934
7935 if (head) {
7936 list_move_tail(&dev->unreg_list, head);
7937 } else {
7938 rollback_registered(dev);
7939 /* Finish processing unregister after unlock */
7940 net_set_todo(dev);
7941 }
7942}
7943EXPORT_SYMBOL(unregister_netdevice_queue);
7944
7945/**
7946 * unregister_netdevice_many - unregister many devices
7947 * @head: list of devices
7948 *
7949 * Note: As most callers use a stack allocated list_head,
7950 * we force a list_del() to make sure stack won't be corrupted later.
7951 */
7952void unregister_netdevice_many(struct list_head *head)
7953{
7954 struct net_device *dev;
7955
7956 if (!list_empty(head)) {
7957 rollback_registered_many(head);
7958 list_for_each_entry(dev, head, unreg_list)
7959 net_set_todo(dev);
7960 list_del(head);
7961 }
7962}
7963EXPORT_SYMBOL(unregister_netdevice_many);
7964
7965/**
7966 * unregister_netdev - remove device from the kernel
7967 * @dev: device
7968 *
7969 * This function shuts down a device interface and removes it
7970 * from the kernel tables.
7971 *
7972 * This is just a wrapper for unregister_netdevice that takes
7973 * the rtnl semaphore. In general you want to use this and not
7974 * unregister_netdevice.
7975 */
void unregister_netdev(struct net_device *dev)
{
	/* Take RTNL on behalf of the caller around the real work. */
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
7983
7984/**
7985 * dev_change_net_namespace - move device to different nethost namespace
7986 * @dev: device
7987 * @net: network namespace
7988 * @pat: If not NULL name pattern to try if the current device name
7989 * is already taken in the destination network namespace.
7990 *
7991 * This function shuts down a device interface and moves it
7992 * to a new network namespace. On success 0 is returned, on
7993 * a failure a negative errno code is returned.
7994 *
7995 * Callers must hold the rtnl semaphore.
7996 */
7997
7998int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7999{
8000 int err;
8001
8002 ASSERT_RTNL();
8003
8004 /* Don't allow namespace local devices to be moved. */
8005 err = -EINVAL;
8006 if (dev->features & NETIF_F_NETNS_LOCAL)
8007 goto out;
8008
8009 /* Ensure the device has been registrered */
8010 if (dev->reg_state != NETREG_REGISTERED)
8011 goto out;
8012
8013 /* Get out if there is nothing todo */
8014 err = 0;
8015 if (net_eq(dev_net(dev), net))
8016 goto out;
8017
8018 /* Pick the destination device name, and ensure
8019 * we can use it in the destination network namespace.
8020 */
8021 err = -EEXIST;
8022 if (__dev_get_by_name(net, dev->name)) {
8023 /* We get here if we can't use the current device name */
8024 if (!pat)
8025 goto out;
8026 if (dev_get_valid_name(net, dev, pat) < 0)
8027 goto out;
8028 }
8029
8030 /*
8031 * And now a mini version of register_netdevice unregister_netdevice.
8032 */
8033
8034 /* If device is running close it first. */
8035 dev_close(dev);
8036
8037 /* And unlink it from device chain */
8038 err = -ENODEV;
8039 unlist_netdevice(dev);
8040
8041 synchronize_net();
8042
8043 /* Shutdown queueing discipline. */
8044 dev_shutdown(dev);
8045
8046 /* Notify protocols, that we are about to destroy
8047 this device. They should clean all the things.
8048
8049 Note that dev->reg_state stays at NETREG_REGISTERED.
8050 This is wanted because this way 8021q and macvlan know
8051 the device is just moving and can keep their slaves up.
8052 */
8053 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
8054 rcu_barrier();
8055 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
8056 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
8057
8058 /*
8059 * Flush the unicast and multicast chains
8060 */
8061 dev_uc_flush(dev);
8062 dev_mc_flush(dev);
8063
8064 /* Send a netdev-removed uevent to the old namespace */
8065 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
8066 netdev_adjacent_del_links(dev);
8067
8068 /* Actually switch the network namespace */
8069 dev_net_set(dev, net);
8070
8071 /* If there is an ifindex conflict assign a new one */
8072 if (__dev_get_by_index(net, dev->ifindex))
8073 dev->ifindex = dev_new_index(net);
8074
8075 /* Send a netdev-add uevent to the new namespace */
8076 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
8077 netdev_adjacent_add_links(dev);
8078
8079 /* Fixup kobjects */
8080 err = device_rename(&dev->dev, dev->name);
8081 WARN_ON(err);
8082
8083 /* Add the device back in the hashes */
8084 list_netdevice(dev);
8085
8086 /* Notify protocols, that a new device appeared. */
8087 call_netdevice_notifiers(NETDEV_REGISTER, dev);
8088
8089 /*
8090 * Prevent userspace races by waiting until the network
8091 * device is fully setup before sending notifications.
8092 */
8093 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
8094
8095 synchronize_net();
8096 err = 0;
8097out:
8098 return err;
8099}
8100EXPORT_SYMBOL_GPL(dev_change_net_namespace);
8101
8102static int dev_cpu_dead(unsigned int oldcpu)
8103{
8104 struct sk_buff **list_skb;
8105 struct sk_buff *skb;
8106 unsigned int cpu;
8107 struct softnet_data *sd, *oldsd;
8108
8109 local_irq_disable();
8110 cpu = smp_processor_id();
8111 sd = &per_cpu(softnet_data, cpu);
8112 oldsd = &per_cpu(softnet_data, oldcpu);
8113
8114 /* Find end of our completion_queue. */
8115 list_skb = &sd->completion_queue;
8116 while (*list_skb)
8117 list_skb = &(*list_skb)->next;
8118 /* Append completion queue from offline CPU. */
8119 *list_skb = oldsd->completion_queue;
8120 oldsd->completion_queue = NULL;
8121
8122 /* Append output queue from offline CPU. */
8123 if (oldsd->output_queue) {
8124 *sd->output_queue_tailp = oldsd->output_queue;
8125 sd->output_queue_tailp = oldsd->output_queue_tailp;
8126 oldsd->output_queue = NULL;
8127 oldsd->output_queue_tailp = &oldsd->output_queue;
8128 }
8129 /* Append NAPI poll list from offline CPU, with one exception :
8130 * process_backlog() must be called by cpu owning percpu backlog.
8131 * We properly handle process_queue & input_pkt_queue later.
8132 */
8133 while (!list_empty(&oldsd->poll_list)) {
8134 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
8135 struct napi_struct,
8136 poll_list);
8137
8138 list_del_init(&napi->poll_list);
8139 if (napi->poll == process_backlog)
8140 napi->state = 0;
8141 else
8142 ____napi_schedule(sd, napi);
8143 }
8144
8145 raise_softirq_irqoff(NET_TX_SOFTIRQ);
8146 local_irq_enable();
8147
8148 /* Process offline CPU's input_pkt_queue */
8149 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8150 netif_rx_ni(skb);
8151 input_queue_head_incr(oldsd);
8152 }
8153 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8154 netif_rx_ni(skb);
8155 input_queue_head_incr(oldsd);
8156 }
8157
8158 return 0;
8159}
8160
8161/**
8162 * netdev_increment_features - increment feature set by one
8163 * @all: current feature set
8164 * @one: new feature set
8165 * @mask: mask feature set
8166 *
8167 * Computes a new feature set after adding a device with feature set
8168 * @one to the master device with current feature set @all. Will not
8169 * enable anything that is off in @mask. Returns the new feature set.
8170 */
8171netdev_features_t netdev_increment_features(netdev_features_t all,
8172 netdev_features_t one, netdev_features_t mask)
8173{
8174 if (mask & NETIF_F_HW_CSUM)
8175 mask |= NETIF_F_CSUM_MASK;
8176 mask |= NETIF_F_VLAN_CHALLENGED;
8177
8178 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8179 all &= one | ~NETIF_F_ALL_FOR_ALL;
8180
8181 /* If one device supports hw checksumming, set for all. */
8182 if (all & NETIF_F_HW_CSUM)
8183 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8184
8185 return all;
8186}
8187EXPORT_SYMBOL(netdev_increment_features);
8188
8189static struct hlist_head * __net_init netdev_create_hash(void)
8190{
8191 int i;
8192 struct hlist_head *hash;
8193
8194 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8195 if (hash != NULL)
8196 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8197 INIT_HLIST_HEAD(&hash[i]);
8198
8199 return hash;
8200}
8201
8202/* Initialize per network namespace state */
8203static int __net_init netdev_init(struct net *net)
8204{
8205 if (net != &init_net)
8206 INIT_LIST_HEAD(&net->dev_base_head);
8207
8208 net->dev_name_head = netdev_create_hash();
8209 if (net->dev_name_head == NULL)
8210 goto err_name;
8211
8212 net->dev_index_head = netdev_create_hash();
8213 if (net->dev_index_head == NULL)
8214 goto err_idx;
8215
8216 return 0;
8217
8218err_idx:
8219 kfree(net->dev_name_head);
8220err_name:
8221 return -ENOMEM;
8222}
8223
8224/**
8225 * netdev_drivername - network driver for the device
8226 * @dev: network device
8227 *
8228 * Determine network driver for device.
8229 */
8230const char *netdev_drivername(const struct net_device *dev)
8231{
8232 const struct device_driver *driver;
8233 const struct device *parent;
8234 const char *empty = "";
8235
8236 parent = dev->dev.parent;
8237 if (!parent)
8238 return empty;
8239
8240 driver = parent->driver;
8241 if (driver && driver->name)
8242 return driver->name;
8243 return empty;
8244}
8245
8246static void __netdev_printk(const char *level, const struct net_device *dev,
8247 struct va_format *vaf)
8248{
8249 if (dev && dev->dev.parent) {
8250 dev_printk_emit(level[1] - '0',
8251 dev->dev.parent,
8252 "%s %s %s%s: %pV",
8253 dev_driver_string(dev->dev.parent),
8254 dev_name(dev->dev.parent),
8255 netdev_name(dev), netdev_reg_state(dev),
8256 vaf);
8257 } else if (dev) {
8258 printk("%s%s%s: %pV",
8259 level, netdev_name(dev), netdev_reg_state(dev), vaf);
8260 } else {
8261 printk("%s(NULL net_device): %pV", level, vaf);
8262 }
8263}
8264
8265void netdev_printk(const char *level, const struct net_device *dev,
8266 const char *format, ...)
8267{
8268 struct va_format vaf;
8269 va_list args;
8270
8271 va_start(args, format);
8272
8273 vaf.fmt = format;
8274 vaf.va = &args;
8275
8276 __netdev_printk(level, dev, &vaf);
8277
8278 va_end(args);
8279}
8280EXPORT_SYMBOL(netdev_printk);
8281
8282#define define_netdev_printk_level(func, level) \
8283void func(const struct net_device *dev, const char *fmt, ...) \
8284{ \
8285 struct va_format vaf; \
8286 va_list args; \
8287 \
8288 va_start(args, fmt); \
8289 \
8290 vaf.fmt = fmt; \
8291 vaf.va = &args; \
8292 \
8293 __netdev_printk(level, dev, &vaf); \
8294 \
8295 va_end(args); \
8296} \
8297EXPORT_SYMBOL(func);
8298
8299define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8300define_netdev_printk_level(netdev_alert, KERN_ALERT);
8301define_netdev_printk_level(netdev_crit, KERN_CRIT);
8302define_netdev_printk_level(netdev_err, KERN_ERR);
8303define_netdev_printk_level(netdev_warn, KERN_WARNING);
8304define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8305define_netdev_printk_level(netdev_info, KERN_INFO);
8306
8307static void __net_exit netdev_exit(struct net *net)
8308{
8309 kfree(net->dev_name_head);
8310 kfree(net->dev_index_head);
8311}
8312
8313static struct pernet_operations __net_initdata netdev_net_ops = {
8314 .init = netdev_init,
8315 .exit = netdev_exit,
8316};
8317
8318static void __net_exit default_device_exit(struct net *net)
8319{
8320 struct net_device *dev, *aux;
8321 /*
8322 * Push all migratable network devices back to the
8323 * initial network namespace
8324 */
8325 rtnl_lock();
8326 for_each_netdev_safe(net, dev, aux) {
8327 int err;
8328 char fb_name[IFNAMSIZ];
8329
8330 /* Ignore unmoveable devices (i.e. loopback) */
8331 if (dev->features & NETIF_F_NETNS_LOCAL)
8332 continue;
8333
8334 /* Leave virtual devices for the generic cleanup */
8335 if (dev->rtnl_link_ops)
8336 continue;
8337
8338 /* Push remaining network devices to init_net */
8339 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8340 err = dev_change_net_namespace(dev, &init_net, fb_name);
8341 if (err) {
8342 pr_emerg("%s: failed to move %s to init_net: %d\n",
8343 __func__, dev->name, err);
8344 BUG();
8345 }
8346 }
8347 rtnl_unlock();
8348}
8349
8350static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8351{
8352 /* Return with the rtnl_lock held when there are no network
8353 * devices unregistering in any network namespace in net_list.
8354 */
8355 struct net *net;
8356 bool unregistering;
8357 DEFINE_WAIT_FUNC(wait, woken_wake_function);
8358
8359 add_wait_queue(&netdev_unregistering_wq, &wait);
8360 for (;;) {
8361 unregistering = false;
8362 rtnl_lock();
8363 list_for_each_entry(net, net_list, exit_list) {
8364 if (net->dev_unreg_count > 0) {
8365 unregistering = true;
8366 break;
8367 }
8368 }
8369 if (!unregistering)
8370 break;
8371 __rtnl_unlock();
8372
8373 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8374 }
8375 remove_wait_queue(&netdev_unregistering_wq, &wait);
8376}
8377
8378static void __net_exit default_device_exit_batch(struct list_head *net_list)
8379{
8380 /* At exit all network devices most be removed from a network
8381 * namespace. Do this in the reverse order of registration.
8382 * Do this across as many network namespaces as possible to
8383 * improve batching efficiency.
8384 */
8385 struct net_device *dev;
8386 struct net *net;
8387 LIST_HEAD(dev_kill_list);
8388
8389 /* To prevent network device cleanup code from dereferencing
8390 * loopback devices or network devices that have been freed
8391 * wait here for all pending unregistrations to complete,
8392 * before unregistring the loopback device and allowing the
8393 * network namespace be freed.
8394 *
8395 * The netdev todo list containing all network devices
8396 * unregistrations that happen in default_device_exit_batch
8397 * will run in the rtnl_unlock() at the end of
8398 * default_device_exit_batch.
8399 */
8400 rtnl_lock_unregistering(net_list);
8401 list_for_each_entry(net, net_list, exit_list) {
8402 for_each_netdev_reverse(net, dev) {
8403 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8404 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8405 else
8406 unregister_netdevice_queue(dev, &dev_kill_list);
8407 }
8408 }
8409 unregister_netdevice_many(&dev_kill_list);
8410 rtnl_unlock();
8411}
8412
8413static struct pernet_operations __net_initdata default_device_ops = {
8414 .exit = default_device_exit,
8415 .exit_batch = default_device_exit_batch,
8416};
8417
8418/*
8419 * Initialize the DEV module. At boot time this walks the device list and
8420 * unhooks any devices that fail to initialise (normally hardware not
8421 * present) and leaves us with a valid list of present and active devices.
8422 *
8423 */
8424
8425/*
8426 * This is called single threaded during boot, so no need
8427 * to take the rtnl semaphore.
8428 */
8429static int __init net_dev_init(void)
8430{
8431 int i, rc = -ENOMEM;
8432
8433 BUG_ON(!dev_boot_phase);
8434
8435 if (dev_proc_init())
8436 goto out;
8437
8438 if (netdev_kobject_init())
8439 goto out;
8440
8441 INIT_LIST_HEAD(&ptype_all);
8442 for (i = 0; i < PTYPE_HASH_SIZE; i++)
8443 INIT_LIST_HEAD(&ptype_base[i]);
8444
8445 INIT_LIST_HEAD(&offload_base);
8446
8447 if (register_pernet_subsys(&netdev_net_ops))
8448 goto out;
8449
8450 /*
8451 * Initialise the packet receive queues.
8452 */
8453
8454 for_each_possible_cpu(i) {
8455 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8456 struct softnet_data *sd = &per_cpu(softnet_data, i);
8457
8458 INIT_WORK(flush, flush_backlog);
8459
8460 skb_queue_head_init(&sd->input_pkt_queue);
8461 skb_queue_head_init(&sd->process_queue);
8462 INIT_LIST_HEAD(&sd->poll_list);
8463 sd->output_queue_tailp = &sd->output_queue;
8464#ifdef CONFIG_RPS
8465 sd->csd.func = rps_trigger_softirq;
8466 sd->csd.info = sd;
8467 sd->cpu = i;
8468#endif
8469
8470 sd->backlog.poll = process_backlog;
8471 sd->backlog.weight = weight_p;
8472 }
8473
8474 dev_boot_phase = 0;
8475
8476 /* The loopback device is special if any other network devices
8477 * is present in a network namespace the loopback device must
8478 * be present. Since we now dynamically allocate and free the
8479 * loopback device ensure this invariant is maintained by
8480 * keeping the loopback device as the first device on the
8481 * list of network devices. Ensuring the loopback devices
8482 * is the first device that appears and the last network device
8483 * that disappears.
8484 */
8485 if (register_pernet_device(&loopback_net_ops))
8486 goto out;
8487
8488 if (register_pernet_device(&default_device_ops))
8489 goto out;
8490
8491 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8492 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8493
8494 rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
8495 NULL, dev_cpu_dead);
8496 WARN_ON(rc < 0);
8497 dst_subsys_init();
8498 rc = 0;
8499out:
8500 return rc;
8501}
8502
8503subsys_initcall(net_dev_init);