/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>

#include <asm/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
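
/*
 * Usage note (editorial sketch, not part of the original source):
 * with RT6_DEBUG >= 3 a call such as
 *
 *         RT6_TRACE("%s() => %p\n", __func__, match);
 *
 * expands to printk(KERN_DEBUG ...).  At the default level of 2 the
 * whole statement compiles away to an empty loop, so the arguments
 * are never evaluated and must not carry needed side effects.
 */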

static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
                                    const struct in6_addr *dest);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_default_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
                           struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex);
#endif

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        struct rt6_info *rt = (struct rt6_info *) dst;
        struct inet_peer *peer;
        u32 *p = NULL;

        if (!(rt->dst.flags & DST_HOST))
                return NULL;

        if (!rt->rt6i_peer)
                rt6_bind_peer(rt, 1);

        peer = rt->rt6i_peer;
        if (peer) {
                u32 *old_p = __DST_METRICS_PTR(old);
                unsigned long prev, new;

                p = peer->metrics;
                if (inet_metrics_new(peer))
                        memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

                new = (unsigned long) p;
                prev = cmpxchg(&dst->_metrics, old, new);

                if (prev != old) {
                        p = __DST_METRICS_PTR(prev);
                        if (prev & DST_METRICS_READ_ONLY)
                                p = NULL;
                }
        }
        return p;
}
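
/*
 * Editorial note (not in the original source): ipv6_cow_metrics()
 * implements copy-on-write for the metrics array.  dst->_metrics
 * normally points at a shared read-only template; on the first write
 * the inet_peer's private array is filled from the old pointer and
 * swapped in with cmpxchg().  If another CPU won the race, the
 * winner's pointer is used instead -- unless it is still marked
 * DST_METRICS_READ_ONLY, in which case NULL tells the caller that the
 * metrics cannot be written at all.
 */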

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
        return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
}

static struct dst_ops ip6_dst_ops_template = {
        .family                 = AF_INET6,
        .protocol               = cpu_to_be16(ETH_P_IPV6),
        .gc                     = ip6_dst_gc,
        .gc_thresh              = 1024,
        .check                  = ip6_dst_check,
        .default_advmss         = ip6_default_advmss,
        .default_mtu            = ip6_default_mtu,
        .cow_metrics            = ipv6_cow_metrics,
        .destroy                = ip6_dst_destroy,
        .ifdown                 = ip6_dst_ifdown,
        .negative_advice        = ip6_negative_advice,
        .link_failure           = ip6_link_failure,
        .update_pmtu            = ip6_rt_update_pmtu,
        .local_out              = __ip6_local_out,
        .neigh_lookup           = ip6_neigh_lookup,
};

static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
        return 0;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
                                         unsigned long old)
{
        return NULL;
}

static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 = AF_INET6,
        .protocol               = cpu_to_be16(ETH_P_IPV6),
        .destroy                = ip6_dst_destroy,
        .check                  = ip6_dst_check,
        .default_mtu            = ip6_blackhole_default_mtu,
        .default_advmss         = ip6_default_advmss,
        .update_pmtu            = ip6_rt_blackhole_update_pmtu,
        .cow_metrics            = ip6_rt_blackhole_cow_metrics,
        .neigh_lookup           = ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 255,
};

static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};

#endif

/* allocate dst with ip6_dst_ops */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
                                             struct net_device *dev,
                                             int flags)
{
        struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);

        if (rt != NULL)
                memset(&rt->rt6i_table, 0,
                       sizeof(*rt) - sizeof(struct dst_entry));

        return rt;
}

static void ip6_dst_destroy(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct inet6_dev *idev = rt->rt6i_idev;
        struct inet_peer *peer = rt->rt6i_peer;

        if (!(rt->dst.flags & DST_HOST))
                dst_destroy_metrics_generic(dst);

        if (idev != NULL) {
                rt->rt6i_idev = NULL;
                in6_dev_put(idev);
        }
        if (peer) {
                rt->rt6i_peer = NULL;
                inet_putpeer(peer);
        }
}

static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);

static u32 rt6_peer_genid(void)
{
        return atomic_read(&__rt6_peer_genid);
}

void rt6_bind_peer(struct rt6_info *rt, int create)
{
        struct inet_peer *peer;

        peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
        if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
                inet_putpeer(peer);
        else
                rt->rt6i_peer_genid = rt6_peer_genid();
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                           int how)
{
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct inet6_dev *idev = rt->rt6i_idev;
        struct net_device *loopback_dev =
                dev_net(dev)->loopback_dev;

        if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
                struct inet6_dev *loopback_idev =
                        in6_dev_get(loopback_dev);
                if (loopback_idev != NULL) {
                        rt->rt6i_idev = loopback_idev;
                        in6_dev_put(idev);
                }
        }
}

static __inline__ int rt6_check_expired(const struct rt6_info *rt)
{
        return (rt->rt6i_flags & RTF_EXPIRES) &&
                time_after(jiffies, rt->rt6i_expires);
}

static inline int rt6_need_strict(const struct in6_addr *daddr)
{
        return ipv6_addr_type(daddr) &
                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}
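
/*
 * Editorial example (not in the original source): rt6_need_strict()
 * is true exactly for destinations whose routing is inherently scoped
 * to an interface, e.g.
 *
 *         ff02::1      (multicast)   -> strict
 *         fe80::1      (link-local)  -> strict
 *         ::1          (loopback)    -> strict
 *         2001:db8::1  (global)      -> not strict
 *
 * For such destinations the callers below add RT6_LOOKUP_F_IFACE so
 * that a route is only accepted on the requested outgoing interface.
 */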

/*
 *	Route lookup. The caller is assumed to hold the relevant
 *	table->tb6_lock.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
                                                struct rt6_info *rt,
                                                const struct in6_addr *saddr,
                                                int oif,
                                                int flags)
{
        struct rt6_info *local = NULL;
        struct rt6_info *sprt;

        if (!oif && ipv6_addr_any(saddr))
                goto out;

        for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
                struct net_device *dev = sprt->rt6i_dev;

                if (oif) {
                        if (dev->ifindex == oif)
                                return sprt;
                        if (dev->flags & IFF_LOOPBACK) {
                                if (sprt->rt6i_idev == NULL ||
                                    sprt->rt6i_idev->dev->ifindex != oif) {
                                        if (flags & RT6_LOOKUP_F_IFACE && oif)
                                                continue;
                                        if (local && (!oif ||
                                                      local->rt6i_idev->dev->ifindex == oif))
                                                continue;
                                }
                                local = sprt;
                        }
                } else {
                        if (ipv6_chk_addr(net, saddr, dev,
                                          flags & RT6_LOOKUP_F_IFACE))
                                return sprt;
                }
        }

        if (oif) {
                if (local)
                        return local;

                if (flags & RT6_LOOKUP_F_IFACE)
                        return net->ipv6.ip6_null_entry;
        }
out:
        return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh;
        /*
         * Okay, this does not seem to be appropriate
         * for now; however, we need to check whether
         * it really is -- aka Router Reachability Probing.
         *
         * Router Reachability Probes MUST be rate-limited
         * to no more than one per minute.
         */
        rcu_read_lock();
        neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
        if (!neigh || (neigh->nud_state & NUD_VALID))
                goto out;
        read_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                neigh->updated = jiffies;
                read_unlock_bh(&neigh->lock);

                target = (struct in6_addr *)&neigh->primary_key;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        } else {
                read_unlock_bh(&neigh->lock);
        }
out:
        rcu_read_unlock();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
        struct net_device *dev = rt->rt6i_dev;
        if (!oif || dev->ifindex == oif)
                return 2;
        if ((dev->flags & IFF_LOOPBACK) &&
            rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
                return 1;
        return 0;
}

static inline int rt6_check_neigh(struct rt6_info *rt)
{
        struct neighbour *neigh;
        int m;

        rcu_read_lock();
        neigh = dst_get_neighbour(&rt->dst);
        if (rt->rt6i_flags & RTF_NONEXTHOP ||
            !(rt->rt6i_flags & RTF_GATEWAY))
                m = 1;
        else if (neigh) {
                read_lock_bh(&neigh->lock);
                if (neigh->nud_state & NUD_VALID)
                        m = 2;
#ifdef CONFIG_IPV6_ROUTER_PREF
                else if (neigh->nud_state & NUD_FAILED)
                        m = 0;
#endif
                else
                        m = 1;
                read_unlock_bh(&neigh->lock);
        } else
                m = 0;
        rcu_read_unlock();
        return m;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
                           int strict)
{
        int m, n;

        m = rt6_check_dev(rt, oif);
        if (!m && (strict & RT6_LOOKUP_F_IFACE))
                return -1;
#ifdef CONFIG_IPV6_ROUTER_PREF
        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
        n = rt6_check_neigh(rt);
        if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
                return -1;
        return m;
}
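
/*
 * Editorial note (not in the original source): the score packs
 * independent criteria into one int.  rt6_check_dev() contributes
 * bits 0-1 (2 = exact oif match, 1 = loopback device whose inet6
 * alias matches oif, 0 = mismatch) and, with CONFIG_IPV6_ROUTER_PREF,
 * the decoded RA preference (0-3) lands in bits 2-3, so router
 * preference outranks a mere interface match.  Neighbour reachability
 * from rt6_check_neigh() is not added to the score at all; it only
 * vetoes a route (return -1) when RT6_LOOKUP_F_REACHABLE is set.
 */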

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
                                   int *mpri, struct rt6_info *match)
{
        int m;

        if (rt6_check_expired(rt))
                goto out;

        m = rt6_score_route(rt, oif, strict);
        if (m < 0)
                goto out;

        if (m > *mpri) {
                if (strict & RT6_LOOKUP_F_REACHABLE)
                        rt6_probe(match);
                *mpri = m;
                match = rt;
        } else if (strict & RT6_LOOKUP_F_REACHABLE) {
                rt6_probe(rt);
        }

out:
        return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
                                     struct rt6_info *rr_head,
                                     u32 metric, int oif, int strict)
{
        struct rt6_info *rt, *match;
        int mpri = -1;

        match = NULL;
        for (rt = rr_head; rt && rt->rt6i_metric == metric;
             rt = rt->dst.rt6_next)
                match = find_match(rt, oif, strict, &mpri, match);
        for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
             rt = rt->dst.rt6_next)
                match = find_match(rt, oif, strict, &mpri, match);

        return match;
}

static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
        struct rt6_info *match, *rt0;
        struct net *net;

        RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
                  __func__, fn->leaf, oif);

        rt0 = fn->rr_ptr;
        if (!rt0)
                fn->rr_ptr = rt0 = fn->leaf;

        match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);

        if (!match &&
            (strict & RT6_LOOKUP_F_REACHABLE)) {
                struct rt6_info *next = rt0->dst.rt6_next;

                /* no entries matched; do round-robin */
                if (!next || next->rt6i_metric != rt0->rt6i_metric)
                        next = fn->leaf;

                if (next != rt0)
                        fn->rr_ptr = next;
        }

        RT6_TRACE("%s() => %p\n",
                  __func__, match);

        net = dev_net(rt0->rt6i_dev);
        return match ? match : net->ipv6.ip6_null_entry;
}
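
/*
 * Editorial note (not in the original source): fn->rr_ptr remembers
 * where the previous selection stopped.  When no route scored high
 * enough under RT6_LOOKUP_F_REACHABLE, rt6_select() advances rr_ptr
 * to the next route of the same metric (wrapping back to fn->leaf),
 * which produces the round-robin behaviour over equally reachable
 * routers described in the changelog at the top of this file.
 */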

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                if (!addrconf_finite_timeout(lifetime)) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                dst_release(&rt->dst);
        }
        return 0;
}
#endif

#define BACKTRACK(__net, saddr)                                         \
do {                                                                    \
        if (rt == __net->ipv6.ip6_null_entry) {                         \
                struct fib6_node *pn;                                   \
                while (1) {                                             \
                        if (fn->fn_flags & RTN_TL_ROOT)                 \
                                goto out;                               \
                        pn = fn->parent;                                \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else                                            \
                                fn = pn;                                \
                        if (fn->fn_flags & RTN_RTINFO)                  \
                                goto restart;                           \
                }                                                       \
        }                                                               \
} while (0)
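
/*
 * Editorial note (not in the original source): BACKTRACK() is only
 * meaningful inside a lookup loop that defines "restart:" and "out:"
 * labels, with "fn" holding the current fib6_node and "rt" the
 * candidate route.  If the candidate is the null entry, it walks up
 * toward the tree root -- descending into a source-routing subtree
 * when one exists -- and jumps back to restart: at the first node
 * that carries routes, or to out: once the root is reached.
 */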

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
        BACKTRACK(net, &fl6->saddr);
out:
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);
        return rt;
}

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif, int strict)
{
        struct flowi6 fl6 = {
                .flowi6_oif = oif,
                .daddr = *daddr,
        };
        struct dst_entry *dst;
        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

        if (saddr) {
                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        }

        dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
        if (dst->error == 0)
                return (struct rt6_info *) dst;

        dst_release(dst);

        return NULL;
}

EXPORT_SYMBOL(rt6_lookup);
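
/*
 * Editorial sketch (not part of the original source): a typical
 * in-kernel caller resolves a destination and drops the reference
 * when done; the variable names are illustrative only:
 *
 *         struct rt6_info *rt;
 *
 *         rt = rt6_lookup(net, &daddr, NULL, 0, 0);
 *         if (rt) {
 *                 ... use rt->rt6i_dev, rt->rt6i_gateway ...
 *                 dst_release(&rt->dst);
 *         }
 *
 * Note that, unlike the policy lookup helpers, rt6_lookup() returns
 * NULL on error rather than the null entry, and the returned route is
 * already held via dst_use().
 */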

/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes ownership of the new route entry; if the addition fails
   for any reason, the route is freed. In any case, if the caller
   does not hold a reference to it, it may be destroyed.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
{
        int err;
        struct fib6_table *table;

        table = rt->rt6i_table;
        write_lock_bh(&table->tb6_lock);
        err = fib6_add(&table->tb6_root, rt, info);
        write_unlock_bh(&table->tb6_lock);

        return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
        struct nl_info info = {
                .nl_net = dev_net(rt->rt6i_dev),
        };
        return __ip6_ins_rt(rt, &info);
}

static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
                                      const struct in6_addr *daddr,
                                      const struct in6_addr *saddr)
{
        struct rt6_info *rt;

        /*
         *      Clone the route.
         */

        rt = ip6_rt_copy(ort, daddr);

        if (rt) {
                struct neighbour *neigh;
                int attempts = !in_softirq();

                if (!(rt->rt6i_flags & RTF_GATEWAY)) {
                        if (rt->rt6i_dst.plen != 128 &&
                            ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
                                rt->rt6i_flags |= RTF_ANYCAST;
                        ipv6_addr_copy(&rt->rt6i_gateway, daddr);
                }

                rt->rt6i_flags |= RTF_CACHE;

#ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
                        rt->rt6i_src.plen = 128;
                }
#endif

retry:
                neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
                if (IS_ERR(neigh)) {
                        struct net *net = dev_net(rt->rt6i_dev);
                        int saved_rt_min_interval =
                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
                        int saved_rt_elasticity =
                                net->ipv6.sysctl.ip6_rt_gc_elasticity;

                        if (attempts-- > 0) {
                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);

                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
                                        saved_rt_elasticity;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
                                        saved_rt_min_interval;
                                goto retry;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ipv6: Neighbour table overflow.\n");
                        dst_free(&rt->dst);
                        return NULL;
                }
                dst_set_neighbour(&rt->dst, neigh);
        }

        return rt;
}

static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
                                        const struct in6_addr *daddr)
{
        struct rt6_info *rt = ip6_rt_copy(ort, daddr);

        if (rt) {
                rt->rt6i_flags |= RTF_CACHE;
                dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
        }
        return rt;
}
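
/*
 * Editorial note (not in the original source): rt6_alloc_cow() and
 * rt6_alloc_clone() both create an RTF_CACHE copy of a network route
 * for one destination, but differ in how the neighbour is obtained.
 * The COW variant resolves a (possibly new) nexthop via ndisc and may
 * force a GC pass when the neighbour table overflows, while the clone
 * variant simply shares the parent route's neighbour.  ip6_pol_route()
 * below picks COW when the route has no neighbour and no RTF_NONEXTHOP
 * flag, and clone for non-host routes that already have one.
 */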

static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:
        rt = rt6_select(fn, oif, strict | reachable);

        BACKTRACK(net, &fl6->saddr);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
        else if (!(rt->dst.flags & DST_HOST))
                nrt = rt6_alloc_clone(rt, &fl6->daddr);
        else
                goto out2;

        dst_release(&rt->dst);
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
                                            struct flowi6 *fl6, int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

void ip6_route_input(struct sk_buff *skb)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct flowi6 fl6 = {
                .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
                .flowi6_mark = skb->mark,
                .flowi6_proto = iph->nexthdr,
        };

        if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
                flags |= RT6_LOOKUP_F_IFACE;

        skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
{
        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
                                   struct flowi6 *fl6)
{
        int flags = 0;

        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
                flags |= RT6_LOOKUP_F_IFACE;

        if (!ipv6_addr_any(&fl6->saddr))
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        else if (sk)
                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}

EXPORT_SYMBOL(ip6_route_output);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
        struct dst_entry *new = NULL;

        rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
        if (rt) {
                memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));

                new = &rt->dst;

                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard;

                if (dst_metrics_read_only(&ort->dst))
                        new->_metrics = ort->dst._metrics;
                else
                        dst_copy_metrics(new, &ort->dst);
                rt->rt6i_idev = ort->rt6i_idev;
                if (rt->rt6i_idev)
                        in6_dev_hold(rt->rt6i_idev);
                rt->rt6i_expires = 0;

                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
                rt->rt6i_metric = 0;

                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

                dst_free(new);
        }

        dst_release(dst_orig);
        return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rt6_info *rt;

        rt = (struct rt6_info *) dst;

        if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
                if (rt->rt6i_peer_genid != rt6_peer_genid()) {
                        if (!rt->rt6i_peer)
                                rt6_bind_peer(rt, 0);
                        rt->rt6i_peer_genid = rt6_peer_genid();
                }
                return dst;
        }
        return NULL;
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
        struct rt6_info *rt = (struct rt6_info *) dst;

        if (rt) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        if (rt6_check_expired(rt)) {
                                ip6_del_rt(rt);
                                dst = NULL;
                        }
                } else {
                        dst_release(dst);
                        dst = NULL;
                }
        }
        return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
        struct rt6_info *rt;

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

        rt = (struct rt6_info *) skb_dst(skb);
        if (rt) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        dst_set_expires(&rt->dst, 0);
                        rt->rt6i_flags |= RTF_EXPIRES;
                } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
                        rt->rt6i_node->fn_sernum = -1;
        }
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
        struct rt6_info *rt6 = (struct rt6_info *)dst;

        if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
                rt6->rt6i_flags |= RTF_MODIFIED;
                if (mtu < IPV6_MIN_MTU) {
                        u32 features = dst_metric(dst, RTAX_FEATURES);
                        mtu = IPV6_MIN_MTU;
                        features |= RTAX_FEATURE_ALLFRAG;
                        dst_metric_set(dst, RTAX_FEATURES, features);
                }
                dst_metric_set(dst, RTAX_MTU, mtu);
        }
}

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
        struct net_device *dev = dst->dev;
        unsigned int mtu = dst_mtu(dst);
        struct net *net = dev_net(dev);

        mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

        if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
                mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

        /*
         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
         * IPV6_MAXPLEN is also valid and means: "any MSS,
         * rely only on pmtu discovery"
         */
        if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
                mtu = IPV6_MAXPLEN;
        return mtu;
}
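
/*
 * Editorial example (not in the original source): for a standard
 * Ethernet device (MTU 1500) the advertised MSS works out to
 *
 *         1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *              = 1500 - 40 - 20 = 1440 bytes,
 *
 * clamped below by the ip6_rt_min_advmss sysctl; anything above
 * IPV6_MAXPLEN - 20 is rounded up to IPV6_MAXPLEN, which per the
 * comment above means "any MSS, rely only on pmtu discovery".
 */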

static unsigned int ip6_default_mtu(const struct dst_entry *dst)
{
        unsigned int mtu = IPV6_MIN_MTU;
        struct inet6_dev *idev;

        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                                  struct neighbour *neigh,
                                  const struct in6_addr *addr)
{
        struct rt6_info *rt;
        struct inet6_dev *idev = in6_dev_get(dev);
        struct net *net = dev_net(dev);

        if (unlikely(idev == NULL))
                return NULL;

        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
        if (unlikely(rt == NULL)) {
                in6_dev_put(idev);
                goto out;
        }

        if (neigh)
                neigh_hold(neigh);
        else {
                neigh = ndisc_get_neigh(dev, addr);
                if (IS_ERR(neigh))
                        neigh = NULL;
        }

        rt->dst.flags |= DST_HOST;
        rt->dst.output = ip6_output;
        dst_set_neighbour(&rt->dst, neigh);
        atomic_set(&rt->dst.__refcnt, 1);
        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);

        ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev = idev;

        spin_lock_bh(&icmp6_dst_lock);
        rt->dst.next = icmp6_dst_gc_list;
        icmp6_dst_gc_list = &rt->dst;
        spin_unlock_bh(&icmp6_dst_lock);

        fib6_force_start_gc(net);

out:
        return &rt->dst;
}

int icmp6_dst_gc(void)
{
        struct dst_entry *dst, **pprev;
        int more = 0;

        spin_lock_bh(&icmp6_dst_lock);
        pprev = &icmp6_dst_gc_list;

        while ((dst = *pprev) != NULL) {
                if (!atomic_read(&dst->__refcnt)) {
                        *pprev = dst->next;
                        dst_free(dst);
                } else {
                        pprev = &dst->next;
                        ++more;
                }
        }

        spin_unlock_bh(&icmp6_dst_lock);

        return more;
}

static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
                            void *arg)
{
        struct dst_entry *dst, **pprev;

        spin_lock_bh(&icmp6_dst_lock);
        pprev = &icmp6_dst_gc_list;
        while ((dst = *pprev) != NULL) {
                struct rt6_info *rt = (struct rt6_info *) dst;
                if (func(rt, arg)) {
                        *pprev = dst->next;
                        dst_free(dst);
                } else {
                        pprev = &dst->next;
                }
        }
        spin_unlock_bh(&icmp6_dst_lock);
}

static int ip6_dst_gc(struct dst_ops *ops)
{
        unsigned long now = jiffies;
        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
        int entries;

        entries = dst_entries_get_fast(ops);
        if (time_after(rt_last_gc + rt_min_interval, now) &&
            entries <= rt_max_size)
                goto out;

        net->ipv6.ip6_rt_gc_expire++;
        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
        net->ipv6.ip6_rt_last_gc = now;
        entries = dst_entries_get_slow(ops);
        if (entries < ops->gc_thresh)
                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
        return entries > rt_max_size;
}
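
/*
 * Editorial note (not in the original source): ip6_rt_gc_expire is an
 * adaptive age threshold handed to fib6_run_gc().  Every invocation
 * decays it (expire -= expire >> elasticity), so sustained pressure
 * prunes progressively younger cache entries; once a pass brings the
 * entry count back under gc_thresh it is reset to half of
 * ip6_rt_gc_timeout, relaxing collection again during quiet periods.
 */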

/* Clean the host part of a prefix. Not strictly necessary in a radix
   tree, but it results in cleaner routing tables.

   Remove this only once everything is known to work!
 */

int ip6_dst_hoplimit(struct dst_entry *dst)
{
        int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
        if (hoplimit == 0) {
                struct net_device *dev = dst->dev;
                struct inet6_dev *idev;

                rcu_read_lock();
                idev = __in6_dev_get(dev);
                if (idev)
                        hoplimit = idev->cnf.hop_limit;
                else
                        hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
                rcu_read_unlock();
        }
        return hoplimit;
}
EXPORT_SYMBOL(ip6_dst_hoplimit);

/*
 *	Route addition / deletion
 */
int ip6_route_add(struct fib6_config *cfg)
{
        int err;
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len)
                return -EINVAL;
#endif
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        table = fib6_new_table(net, cfg->fc_table);
        if (table == NULL) {
                err = -ENOBUFS;
                goto out;
        }

        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);

        if (rt == NULL) {
                err = -ENOMEM;
                goto out;
        }

        rt->dst.obsolete = -1;
        rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
                           jiffies + clock_t_to_jiffies(cfg->fc_expires) :
                           0;

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
                rt->dst.flags |= DST_HOST;

        if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
                u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
                if (!metrics) {
                        err = -ENOMEM;
                        goto out;
                }
                dst_init_metrics(&rt->dst, metrics, 0);
        }
#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here;
           they would result in kernel looping. Promote
           them to reject routes instead.
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags & IFF_LOOPBACK) && !(addr_type & IPV6_ADDR_LOOPBACK)
             && !(cfg->fc_flags & RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
                rt->dst.error = -ENETUNREACH;
                rt->rt6i_flags = RTF_REJECT | RTF_NONEXTHOP;
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
                gwa_type = ipv6_addr_type(gw_addr);

                if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt;

                        /* IPv6 strictly inhibits using non-link-local
                           addresses as a nexthop address.
                           Otherwise, the router will not be able to send
                           redirects. That is very good, but in some (rare!)
                           circumstances (SIT, PtP, NBMA NOARP links) it is
                           handy to allow some exceptions. --ANK
                         */
                        err = -EINVAL;
                        if (!(gwa_type & IPV6_ADDR_UNICAST))
                                goto out;

                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

                        err = -EHOSTUNREACH;
                        if (grt == NULL)
                                goto out;
                        if (dev) {
                                if (dev != grt->rt6i_dev) {
                                        dst_release(&grt->dst);
                                        goto out;
                                }
                        } else {
                                dev = grt->rt6i_dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        if (!(grt->rt6i_flags & RTF_GATEWAY))
                                err = 0;
                        dst_release(&grt->dst);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (dev == NULL || (dev->flags & IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (dev == NULL)
                goto out;

        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        err = -EINVAL;
                        goto out;
                }
                ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
                struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
                if (IS_ERR(n)) {
                        err = PTR_ERR(n);
                        goto out;
                }
                dst_set_neighbour(&rt->dst, n);
        }

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        if (cfg->fc_mx) {
                struct nlattr *nla;
                int remaining;

                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                        int type = nla_type(nla);

                        if (type) {
                                if (type > RTAX_MAX) {
                                        err = -EINVAL;
                                        goto out;
                                }

                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
                        }
                }
        }

        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);

out:
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);
        return err;
}

static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
        int err;
        struct fib6_table *table;
        struct net *net = dev_net(rt->rt6i_dev);

        if (rt == net->ipv6.ip6_null_entry)
                return -ENOENT;

        table = rt->rt6i_table;
        write_lock_bh(&table->tb6_lock);

        err = fib6_del(rt, info);
        dst_release(&rt->dst);

        write_unlock_bh(&table->tb6_lock);

        return err;
}

int ip6_del_rt(struct rt6_info *rt)
{
        struct nl_info info = {
                .nl_net = dev_net(rt->rt6i_dev),
        };
        return __ip6_del_rt(rt, &info);
}

static int ip6_route_del(struct fib6_config *cfg)
{
        struct fib6_table *table;
        struct fib6_node *fn;
        struct rt6_info *rt;
        int err = -ESRCH;

        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
        if (table == NULL)
                return err;

        read_lock_bh(&table->tb6_lock);

        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
                         &cfg->fc_src, cfg->fc_src_len);

        if (fn) {
                for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
                        if (cfg->fc_ifindex &&
                            (rt->rt6i_dev == NULL ||
                             rt->rt6i_dev->ifindex != cfg->fc_ifindex))
                                continue;
                        if (cfg->fc_flags & RTF_GATEWAY &&
                            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
                                continue;
                        if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
                                continue;
                        dst_hold(&rt->dst);
                        read_unlock_bh(&table->tb6_lock);

                        return __ip6_del_rt(rt, &cfg->fc_nlinfo);
                }
        }
        read_unlock_bh(&table->tb6_lock);

        return err;
}

/*
 *	Handle redirects
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             int flags)
{
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
        struct rt6_info *rt;
        struct fib6_node *fn;

        /*
         * Get the "current" route for this destination and
         * check if the redirect has come from an appropriate router.
         *
         * RFC 2461 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
                /*
                 * The current route is on-link, so a redirect for it is
                 * always invalid.
                 *
                 * Seems the previous statement is not true. We could be
                 * a node which some other node regards as on-link (e.g.
                 * proxy ndisc). But then the router serving it might
                 * decide that we should know the truth 8)8)
                 * --ANK (980726).
                 */
                if (rt6_check_expired(rt))
                        continue;
                if (!(rt->rt6i_flags & RTF_GATEWAY))
                        continue;
                if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
                        continue;
                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
                        continue;
                break;
        }

        if (!rt)
                rt = net->ipv6.ip6_null_entry;
        BACKTRACK(net, &fl6->saddr);
out:
        dst_hold(&rt->dst);

        read_unlock_bh(&table->tb6_lock);

        return rt;
}

static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
                                           const struct in6_addr *src,
                                           const struct in6_addr *gateway,
                                           struct net_device *dev)
{
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct net *net = dev_net(dev);
        struct ip6rd_flowi rdfl = {
                .fl6 = {
                        .flowi6_oif = dev->ifindex,
                        .daddr = *dest,
                        .saddr = *src,
                },
        };

        ipv6_addr_copy(&rdfl.gateway, gateway);

        if (rt6_need_strict(dest))
                flags |= RT6_LOOKUP_F_IFACE;

        return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
                                                   flags, __ip6_route_redirect);
}

void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
                  const struct in6_addr *saddr,
                  struct neighbour *neigh, u8 *lladdr, int on_link)
{
        struct rt6_info *rt, *nrt = NULL;
        struct netevent_redirect netevent;
        struct net *net = dev_net(neigh->dev);

        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);

        if (rt == net->ipv6.ip6_null_entry) {
                if (net_ratelimit())
                        printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
                               "for redirect target\n");
                goto out;
        }

        /*
         *	We have finally decided to accept it.
         */

        neigh_update(neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE |
                     NEIGH_UPDATE_F_OVERRIDE |
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER |
                                     NEIGH_UPDATE_F_ISROUTER))
                     );

        /*
         * Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm(&rt->dst);

        /* Duplicate redirect: silently ignore. */
        if (neigh == dst_get_neighbour_raw(&rt->dst))
                goto out;

        nrt = ip6_rt_copy(rt, dest);
        if (nrt == NULL)
                goto out;

        nrt->rt6i_flags = RTF_GATEWAY | RTF_UP | RTF_DYNAMIC | RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr *)neigh->primary_key);
        dst_set_neighbour(&nrt->dst, neigh_clone(neigh));

        if (ip6_ins_rt(nrt))
                goto out;

        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

        if (rt->rt6i_flags & RTF_CACHE) {
                ip6_del_rt(rt);
                return;
        }

out:
        dst_release(&rt->dst);
}

/*
 *	Handle ICMP "packet too big" messages,
 *	i.e. Path MTU discovery
 */

static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
                             struct net *net, u32 pmtu, int ifindex)
{
        struct rt6_info *rt, *nrt;
        int allfrag = 0;
again:
        rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
        if (rt == NULL)
                return;

        if (rt6_check_expired(rt)) {
                ip6_del_rt(rt);
                goto again;
        }

        if (pmtu >= dst_mtu(&rt->dst))
                goto out;

        if (pmtu < IPV6_MIN_MTU) {
                /*
                 * According to RFC 2460, when a node receives a Too Big
                 * message reporting a PMTU that is less than the IPv6
                 * minimum link MTU (1280), the PMTU is set to the minimum
                 * link MTU and a fragment header should always be included
                 * from then on.
                 */
                pmtu = IPV6_MIN_MTU;
                allfrag = 1;
        }

        /* New mtu received -> path was valid.
           Too Big messages are sent only in response to data packets,
           so this nexthop apparently is reachable. --ANK
         */
        dst_confirm(&rt->dst);

        /* Host route. If it is static, it would be better
           not to override it, but to add a new one, so that
           when the cache entry expires the old pmtu
           comes back automatically.
         */
        if (rt->rt6i_flags & RTF_CACHE) {
                dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
                if (allfrag) {
                        u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
                        features |= RTAX_FEATURE_ALLFRAG;
                        dst_metric_set(&rt->dst, RTAX_FEATURES, features);
                }
                dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
                rt->rt6i_flags |= RTF_MODIFIED | RTF_EXPIRES;
                goto out;
        }

        /* Network route.
           Two cases are possible:
           1. It is a connected route. Action: COW it.
           2. It is a gatewayed or NONEXTHOP route. Action: clone it.
         */
        if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, daddr, saddr);
        else
                nrt = rt6_alloc_clone(rt, daddr);

        if (nrt) {
                dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
                if (allfrag) {
                        u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
                        features |= RTAX_FEATURE_ALLFRAG;
                        dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
                }

                /* According to RFC 1981, a PMTU increase shouldn't be
                 * detected within 5 minutes; the recommended timer is
                 * 10 minutes. Here this route expiration time is set to
                 * ip6_rt_mtu_expires, which is 10 minutes. After 10
                 * minutes the decreased pmtu expires and detection of a
                 * PMTU increase happens automatically.
                 */
                dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
                nrt->rt6i_flags |= RTF_DYNAMIC | RTF_EXPIRES;

                ip6_ins_rt(nrt);
        }
out:
        dst_release(&rt->dst);
}

void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
                        struct net_device *dev, u32 pmtu)
{
        struct net *net = dev_net(dev);

        /*
         * RFC 1981 states that a node "MUST reduce the size of the packets it
         * is sending along the path" that caused the Packet Too Big message.
         * Since it's not possible in the general case to determine which
         * interface was used to send the original packet, we update the MTU
         * on the interface that will be used to send future packets. We also
         * update the MTU on the interface that received the Packet Too Big in
         * case the original packet was forced out that interface with
         * SO_BINDTODEVICE or similar. This is the next best thing to the
         * correct behaviour, which would be to update the MTU on all
         * interfaces.
         */
        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
}

/*
 *	Misc support functions
 */

static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
                                    const struct in6_addr *dest)
{
        struct net *net = dev_net(ort->rt6i_dev);
        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
                                            ort->dst.dev, 0);

        if (rt) {
                rt->dst.input = ort->dst.input;
                rt->dst.output = ort->dst.output;
                rt->dst.flags |= DST_HOST;

                ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
                rt->rt6i_dst.plen = 128;
                dst_copy_metrics(&rt->dst, &ort->dst);
                rt->dst.error = ort->dst.error;
                rt->rt6i_idev = ort->rt6i_idev;
                if (rt->rt6i_idev)
                        in6_dev_hold(rt->rt6i_idev);
                rt->dst.lastuse = jiffies;
                rt->rt6i_expires = 0;

                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
                rt->rt6i_metric = 0;

#ifdef CONFIG_IPV6_SUBTREES
                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
                memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
                rt->rt6i_table = ort->rt6i_table;
        }
        return rt;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex)
{
        struct fib6_node *fn;
        struct rt6_info *rt = NULL;
        struct fib6_table *table;

        table = fib6_get_table(net, RT6_TABLE_INFO);
        if (table == NULL)
                return NULL;

        write_lock_bh(&table->tb6_lock);
        fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
        if (!fn)
                goto out;

        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
                if (rt->rt6i_dev->ifindex != ifindex)
                        continue;
                if ((rt->rt6i_flags & (RTF_ROUTEINFO | RTF_GATEWAY)) != (RTF_ROUTEINFO | RTF_GATEWAY))
                        continue;
                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
                        continue;
                dst_hold(&rt->dst);
                break;
        }
out:
        write_unlock_bh(&table->tb6_lock);
        return rt;
}

static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref)
{
        struct fib6_config cfg = {
                .fc_table       = RT6_TABLE_INFO,
                .fc_metric      = IP6_RT_PRIO_USER,
                .fc_ifindex     = ifindex,
                .fc_dst_len     = prefixlen,
                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
                                  RTF_UP | RTF_PREF(pref),
                .fc_nlinfo.pid = 0,
                .fc_nlinfo.nlh = NULL,
                .fc_nlinfo.nl_net = net,
        };

        ipv6_addr_copy(&cfg.fc_dst, prefix);
        ipv6_addr_copy(&cfg.fc_gateway, gwaddr);

        /* We should treat it as a default route if prefix length is 0. */
        if (!prefixlen)
                cfg.fc_flags |= RTF_DEFAULT;

        ip6_route_add(&cfg);

        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
}
#endif

struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
        struct rt6_info *rt;
        struct fib6_table *table;

        table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
        if (table == NULL)
                return NULL;

        write_lock_bh(&table->tb6_lock);
        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
                if (dev == rt->rt6i_dev &&
                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
                        break;
        }
        if (rt)
                dst_hold(&rt->dst);
        write_unlock_bh(&table->tb6_lock);
        return rt;
}

struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
                                     struct net_device *dev,
                                     unsigned int pref)
{
        struct fib6_config cfg = {
                .fc_table       = RT6_TABLE_DFLT,
                .fc_metric      = IP6_RT_PRIO_USER,
                .fc_ifindex     = dev->ifindex,
                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
                .fc_nlinfo.pid = 0,
                .fc_nlinfo.nlh = NULL,
                .fc_nlinfo.nl_net = dev_net(dev),
        };

        ipv6_addr_copy(&cfg.fc_gateway, gwaddr);

        ip6_route_add(&cfg);

        return rt6_get_dflt_router(gwaddr, dev);
}

void rt6_purge_dflt_routers(struct net *net)
{
        struct rt6_info *rt;
        struct fib6_table *table;

        /* NOTE: Keep consistent with rt6_get_dflt_router */
        table = fib6_get_table(net, RT6_TABLE_DFLT);
        if (table == NULL)
                return;

restart:
        read_lock_bh(&table->tb6_lock);
        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
                        dst_hold(&rt->dst);
                        read_unlock_bh(&table->tb6_lock);
                        ip6_del_rt(rt);
                        goto restart;
                }
        }
        read_unlock_bh(&table->tb6_lock);
}

static void rtmsg_to_fib6_config(struct net *net,
                                 struct in6_rtmsg *rtmsg,
                                 struct fib6_config *cfg)
{
        memset(cfg, 0, sizeof(*cfg));

        cfg->fc_table = RT6_TABLE_MAIN;
        cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
        cfg->fc_metric = rtmsg->rtmsg_metric;
        cfg->fc_expires = rtmsg->rtmsg_info;
        cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
        cfg->fc_src_len = rtmsg->rtmsg_src_len;
        cfg->fc_flags = rtmsg->rtmsg_flags;

        cfg->fc_nlinfo.nl_net = net;

        ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
        ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
        ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
}

int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
        struct fib6_config cfg;
        struct in6_rtmsg rtmsg;
        int err;

        switch (cmd) {
        case SIOCADDRT:         /* Add a route */
        case SIOCDELRT:         /* Delete a route */
                if (!capable(CAP_NET_ADMIN))
                        return -EPERM;
                err = copy_from_user(&rtmsg, arg,
                                     sizeof(struct in6_rtmsg));
                if (err)
                        return -EFAULT;

                rtmsg_to_fib6_config(net, &rtmsg, &cfg);

                rtnl_lock();
                switch (cmd) {
                case SIOCADDRT:
                        err = ip6_route_add(&cfg);
                        break;
                case SIOCDELRT:
                        err = ip6_route_del(&cfg);
                        break;
                default:
                        err = -EINVAL;
                }
                rtnl_unlock();

                return err;
        }

        return -EINVAL;
}
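
/*
 * Editorial sketch (not part of the original source): the legacy
 * ioctl interface above is what "route -A inet6 add" style tools use.
 * From userspace, adding a route looks roughly like this (error
 * handling omitted; the addresses and device name are illustrative):
 *
 *         struct in6_rtmsg rtmsg = { 0 };
 *
 *         rtmsg.rtmsg_dst_len = 64;
 *         rtmsg.rtmsg_ifindex = if_nametoindex("eth0");
 *         rtmsg.rtmsg_metric  = 1;
 *         rtmsg.rtmsg_flags   = RTF_UP;
 *         inet_pton(AF_INET6, "2001:db8::", &rtmsg.rtmsg_dst);
 *         ioctl(fd, SIOCADDRT, &rtmsg);   (fd: any AF_INET6 socket)
 *
 * CAP_NET_ADMIN is required, and the route lands in RT6_TABLE_MAIN
 * via rtmsg_to_fib6_config() above.
 */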

/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
        int type;
        struct dst_entry *dst = skb_dst(skb);
        switch (ipstats_mib_noroutes) {
        case IPSTATS_MIB_INNOROUTES:
                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
                if (type == IPV6_ADDR_ANY) {
                        IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
                                      IPSTATS_MIB_INADDRERRORS);
                        break;
                }
                /* FALLTHROUGH */
        case IPSTATS_MIB_OUTNOROUTES:
                IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
                              ipstats_mib_noroutes);
                break;
        }
        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
        kfree_skb(skb);
        return 0;
}

static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct sk_buff *skb)
{
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

#endif

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
                                    const struct in6_addr *addr,
                                    int anycast)
{
        struct net *net = dev_net(idev->dev);
        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
                                            net->loopback_dev, 0);
        struct neighbour *neigh;

        if (rt == NULL) {
                if (net_ratelimit())
                        pr_warning("IPv6: Maximum number of routes reached,"
                                   " consider increasing route/max_size.\n");
                return ERR_PTR(-ENOMEM);
        }

        in6_dev_hold(idev);

        rt->dst.flags |= DST_HOST;
        rt->dst.input = ip6_input;
        rt->dst.output = ip6_output;
        rt->rt6i_idev = idev;
        rt->dst.obsolete = -1;

        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
        if (anycast)
                rt->rt6i_flags |= RTF_ANYCAST;
        else
                rt->rt6i_flags |= RTF_LOCAL;
        neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
        if (IS_ERR(neigh)) {
                dst_free(&rt->dst);

                return ERR_CAST(neigh);
        }
        dst_set_neighbour(&rt->dst, neigh);

        ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
        rt->rt6i_dst.plen = 128;
        rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);

        atomic_set(&rt->dst.__refcnt, 1);

        return rt;
}
2084
2085int ip6_route_get_saddr(struct net *net,
2086 struct rt6_info *rt,
2087 const struct in6_addr *daddr,
2088 unsigned int prefs,
2089 struct in6_addr *saddr)
2090{
2091 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2092 int err = 0;
2093 if (rt->rt6i_prefsrc.plen)
2094 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2095 else
2096 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2097 daddr, prefs, saddr);
2098 return err;
2099}
2100
2101/* remove deleted ip from prefsrc entries */
2102struct arg_dev_net_ip {
2103 struct net_device *dev;
2104 struct net *net;
2105 struct in6_addr *addr;
2106};
2107
2108static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2109{
2110 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2111 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2112 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2113
2114 if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2115 rt != net->ipv6.ip6_null_entry &&
2116 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2117 /* remove prefsrc entry */
2118 rt->rt6i_prefsrc.plen = 0;
2119 }
2120 return 0;
2121}
2122
2123void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2124{
2125 struct net *net = dev_net(ifp->idev->dev);
2126 struct arg_dev_net_ip adni = {
2127 .dev = ifp->idev->dev,
2128 .net = net,
2129 .addr = &ifp->addr,
2130 };
2131 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2132}
2133
2134struct arg_dev_net {
2135 struct net_device *dev;
2136 struct net *net;
2137};
2138
2139static int fib6_ifdown(struct rt6_info *rt, void *arg)
2140{
2141 const struct arg_dev_net *adn = arg;
2142 const struct net_device *dev = adn->dev;
2143
2144 if ((rt->rt6i_dev == dev || dev == NULL) &&
2145 rt != adn->net->ipv6.ip6_null_entry) {
2146 RT6_TRACE("deleted by ifdown %p\n", rt);
2147 return -1;
2148 }
2149 return 0;
2150}
2151
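/* Flush every route that references @dev (or all devices when
 * dev == NULL) from the FIB and from the ICMPv6 dst cache;
 * fib6_ifdown() above skips the per-netns null entry.
 */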
2152void rt6_ifdown(struct net *net, struct net_device *dev)
2153{
2154 struct arg_dev_net adn = {
2155 .dev = dev,
2156 .net = net,
2157 };
2158
2159 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2160 icmp6_clean_all(fib6_ifdown, &adn);
2161}
2162
2163struct rt6_mtu_change_arg {
2165 struct net_device *dev;
2166 unsigned mtu;
2167};
2168
2169static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2170{
2171 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2172 struct inet6_dev *idev;
2173
2174 /* In IPv6, PMTU discovery is not optional,
2175 so the RTAX_MTU lock cannot disable it.
2176 We still use this lock to block changes
2177 caused by addrconf/ndisc.
2178 */
2179
2180 idev = __in6_dev_get(arg->dev);
2181 if (idev == NULL)
2182 return 0;
2183
2184 /* For an administrative MTU increase, there is no way to discover
2185 an IPv6 PMTU increase, so the PMTU increase should be updated here.
2186 Since RFC 1981 doesn't cover administrative MTU increases,
2187 updating the PMTU on increase is a MUST (e.g. for jumbo frames).
2188 */
2189 /*
2190 If the new MTU is less than the route PMTU, the new MTU will be the
2191 lowest MTU in the path; update the route PMTU to reflect the
2192 decrease. If the new MTU is greater than the route PMTU, and the
2193 old MTU was the lowest MTU in the path, update the route PMTU
2194 to reflect the increase. In this case, if another node on the
2195 path also has the lowest MTU, a Packet Too Big message will
2196 trigger PMTU discovery again.
2197 */
2198 if (rt->rt6i_dev == arg->dev &&
2199 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2200 (dst_mtu(&rt->dst) >= arg->mtu ||
2201 (dst_mtu(&rt->dst) < arg->mtu &&
2202 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2203 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2204 }
2205 return 0;
2206}
2207
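/* Propagate a device MTU change to every affected route, using the
 * rules spelled out in rt6_mtu_change_route() above.
 */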
2208void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2209{
2210 struct rt6_mtu_change_arg arg = {
2211 .dev = dev,
2212 .mtu = mtu,
2213 };
2214
2215 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2216}
2217
2218static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2219 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2220 [RTA_OIF] = { .type = NLA_U32 },
2221 [RTA_IIF] = { .type = NLA_U32 },
2222 [RTA_PRIORITY] = { .type = NLA_U32 },
2223 [RTA_METRICS] = { .type = NLA_NESTED },
2224};
2225
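/* Translate an RTM_NEWROUTE/RTM_DELROUTE request into a fib6_config.
 * For example, "ip -6 route add 2001:db8::/64 via fe80::1 dev eth0"
 * arrives here with RTA_DST, RTA_GATEWAY and RTA_OIF set. RTA_DST and
 * RTA_SRC are not in rtm_ipv6_policy above, so their lengths are
 * checked by hand against the prefix lengths in the rtmsg header.
 */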
2226static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2227 struct fib6_config *cfg)
2228{
2229 struct rtmsg *rtm;
2230 struct nlattr *tb[RTA_MAX+1];
2231 int err;
2232
2233 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2234 if (err < 0)
2235 goto errout;
2236
2237 err = -EINVAL;
2238 rtm = nlmsg_data(nlh);
2239 memset(cfg, 0, sizeof(*cfg));
2240
2241 cfg->fc_table = rtm->rtm_table;
2242 cfg->fc_dst_len = rtm->rtm_dst_len;
2243 cfg->fc_src_len = rtm->rtm_src_len;
2244 cfg->fc_flags = RTF_UP;
2245 cfg->fc_protocol = rtm->rtm_protocol;
2246
2247 if (rtm->rtm_type == RTN_UNREACHABLE)
2248 cfg->fc_flags |= RTF_REJECT;
2249
2250 if (rtm->rtm_type == RTN_LOCAL)
2251 cfg->fc_flags |= RTF_LOCAL;
2252
2253 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2254 cfg->fc_nlinfo.nlh = nlh;
2255 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2256
2257 if (tb[RTA_GATEWAY]) {
2258 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2259 cfg->fc_flags |= RTF_GATEWAY;
2260 }
2261
2262 if (tb[RTA_DST]) {
2263 int plen = (rtm->rtm_dst_len + 7) >> 3;
2264
2265 if (nla_len(tb[RTA_DST]) < plen)
2266 goto errout;
2267
2268 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2269 }
2270
2271 if (tb[RTA_SRC]) {
2272 int plen = (rtm->rtm_src_len + 7) >> 3;
2273
2274 if (nla_len(tb[RTA_SRC]) < plen)
2275 goto errout;
2276
2277 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2278 }
2279
2280 if (tb[RTA_PREFSRC])
2281 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2282
2283 if (tb[RTA_OIF])
2284 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2285
2286 if (tb[RTA_PRIORITY])
2287 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2288
2289 if (tb[RTA_METRICS]) {
2290 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2291 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2292 }
2293
2294 if (tb[RTA_TABLE])
2295 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2296
2297 err = 0;
2298errout:
2299 return err;
2300}
2301
2302static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2303{
2304 struct fib6_config cfg;
2305 int err;
2306
2307 err = rtm_to_fib6_config(skb, nlh, &cfg);
2308 if (err < 0)
2309 return err;
2310
2311 return ip6_route_del(&cfg);
2312}
2313
2314static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2315{
2316 struct fib6_config cfg;
2317 int err;
2318
2319 err = rtm_to_fib6_config(skb, nlh, &cfg);
2320 if (err < 0)
2321 return err;
2322
2323 return ip6_route_add(&cfg);
2324}
2325
2326static inline size_t rt6_nlmsg_size(void)
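/* Worst-case netlink message size for one route; used to size the
 * skb allocated for route change notifications in inet6_rt_notify().
 */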
2327{
2328 return NLMSG_ALIGN(sizeof(struct rtmsg))
2329 + nla_total_size(16) /* RTA_SRC */
2330 + nla_total_size(16) /* RTA_DST */
2331 + nla_total_size(16) /* RTA_GATEWAY */
2332 + nla_total_size(16) /* RTA_PREFSRC */
2333 + nla_total_size(4) /* RTA_TABLE */
2334 + nla_total_size(4) /* RTA_IIF */
2335 + nla_total_size(4) /* RTA_OIF */
2336 + nla_total_size(4) /* RTA_PRIORITY */
2337 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2338 + nla_total_size(sizeof(struct rta_cacheinfo));
2339}
2340
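/* Fill one routing netlink message describing @rt. Normally returns
 * 1 when the route is deliberately skipped (prefix-only dumps),
 * -EMSGSIZE when the message does not fit, or the nlmsg_end() result
 * on success.
 */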
2341static int rt6_fill_node(struct net *net,
2342 struct sk_buff *skb, struct rt6_info *rt,
2343 struct in6_addr *dst, struct in6_addr *src,
2344 int iif, int type, u32 pid, u32 seq,
2345 int prefix, int nowait, unsigned int flags)
2346{
2347 struct rtmsg *rtm;
2348 struct nlmsghdr *nlh;
2349 long expires;
2350 u32 table;
2351 struct neighbour *n;
2352
2353 if (prefix) { /* user wants prefix routes only */
2354 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2355 /* success since this is not a prefix route */
2356 return 1;
2357 }
2358 }
2359
2360 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2361 if (nlh == NULL)
2362 return -EMSGSIZE;
2363
2364 rtm = nlmsg_data(nlh);
2365 rtm->rtm_family = AF_INET6;
2366 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2367 rtm->rtm_src_len = rt->rt6i_src.plen;
2368 rtm->rtm_tos = 0;
2369 if (rt->rt6i_table)
2370 table = rt->rt6i_table->tb6_id;
2371 else
2372 table = RT6_TABLE_UNSPEC;
2373 rtm->rtm_table = table;
2374 NLA_PUT_U32(skb, RTA_TABLE, table);
2375 if (rt->rt6i_flags&RTF_REJECT)
2376 rtm->rtm_type = RTN_UNREACHABLE;
2377 else if (rt->rt6i_flags&RTF_LOCAL)
2378 rtm->rtm_type = RTN_LOCAL;
2379 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2380 rtm->rtm_type = RTN_LOCAL;
2381 else
2382 rtm->rtm_type = RTN_UNICAST;
2383 rtm->rtm_flags = 0;
2384 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2385 rtm->rtm_protocol = rt->rt6i_protocol;
2386 if (rt->rt6i_flags&RTF_DYNAMIC)
2387 rtm->rtm_protocol = RTPROT_REDIRECT;
2388 else if (rt->rt6i_flags & RTF_ADDRCONF)
2389 rtm->rtm_protocol = RTPROT_KERNEL;
2390 else if (rt->rt6i_flags&RTF_DEFAULT)
2391 rtm->rtm_protocol = RTPROT_RA;
2392
2393 if (rt->rt6i_flags&RTF_CACHE)
2394 rtm->rtm_flags |= RTM_F_CLONED;
2395
2396 if (dst) {
2397 NLA_PUT(skb, RTA_DST, 16, dst);
2398 rtm->rtm_dst_len = 128;
2399 } else if (rtm->rtm_dst_len)
2400 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2401#ifdef CONFIG_IPV6_SUBTREES
2402 if (src) {
2403 NLA_PUT(skb, RTA_SRC, 16, src);
2404 rtm->rtm_src_len = 128;
2405 } else if (rtm->rtm_src_len)
2406 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2407#endif
2408 if (iif) {
2409#ifdef CONFIG_IPV6_MROUTE
2410 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2411 int err = ip6mr_get_route(net, skb, rtm, nowait);
2412 if (err <= 0) {
2413 if (!nowait) {
2414 if (err == 0)
2415 return 0;
2416 goto nla_put_failure;
2417 } else {
2418 if (err == -EMSGSIZE)
2419 goto nla_put_failure;
2420 }
2421 }
2422 } else
2423#endif
2424 NLA_PUT_U32(skb, RTA_IIF, iif);
2425 } else if (dst) {
2426 struct in6_addr saddr_buf;
2427 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2428 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2429 }
2430
2431 if (rt->rt6i_prefsrc.plen) {
2432 struct in6_addr saddr_buf;
2433 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2434 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2435 }
2436
2437 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2438 goto nla_put_failure;
2439
2440 rcu_read_lock();
2441 n = dst_get_neighbour(&rt->dst);
2442 if (n)
2443 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2444 rcu_read_unlock();
2445
2446 if (rt->dst.dev)
2447 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2448
2449 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2450
2451 if (!(rt->rt6i_flags & RTF_EXPIRES))
2452 expires = 0;
2453 else if (rt->rt6i_expires - jiffies < INT_MAX)
2454 expires = rt->rt6i_expires - jiffies;
2455 else
2456 expires = INT_MAX;
2457
2458 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2459 expires, rt->dst.error) < 0)
2460 goto nla_put_failure;
2461
2462 return nlmsg_end(skb, nlh);
2463
2464nla_put_failure:
2465 nlmsg_cancel(skb, nlh);
2466 return -EMSGSIZE;
2467}
2468
2469int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2470{
2471 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2472 int prefix;
2473
2474 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2475 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2476 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2477 } else
2478 prefix = 0;
2479
2480 return rt6_fill_node(arg->net,
2481 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2482 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2483 prefix, 0, NLM_F_MULTI);
2484}
2485
2486static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2487{
2488 struct net *net = sock_net(in_skb->sk);
2489 struct nlattr *tb[RTA_MAX+1];
2490 struct rt6_info *rt;
2491 struct sk_buff *skb;
2492 struct rtmsg *rtm;
2493 struct flowi6 fl6;
2494 int err, iif = 0;
2495
2496 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2497 if (err < 0)
2498 goto errout;
2499
2500 err = -EINVAL;
2501 memset(&fl6, 0, sizeof(fl6));
2502
2503 if (tb[RTA_SRC]) {
2504 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2505 goto errout;
2506
2507 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2508 }
2509
2510 if (tb[RTA_DST]) {
2511 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2512 goto errout;
2513
2514 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2515 }
2516
2517 if (tb[RTA_IIF])
2518 iif = nla_get_u32(tb[RTA_IIF]);
2519
2520 if (tb[RTA_OIF])
2521 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2522
2523 if (iif) {
2524 struct net_device *dev;
2525 dev = __dev_get_by_index(net, iif);
2526 if (!dev) {
2527 err = -ENODEV;
2528 goto errout;
2529 }
2530 }
2531
2532 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2533 if (skb == NULL) {
2534 err = -ENOBUFS;
2535 goto errout;
2536 }
2537
2538 /* Reserve room for dummy headers; this skb can pass
2539 through a good chunk of the routing engine.
2540 */
2541 skb_reset_mac_header(skb);
2542 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2543
2544 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2545 skb_dst_set(skb, &rt->dst);
2546
2547 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2548 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2549 nlh->nlmsg_seq, 0, 0, 0);
2550 if (err < 0) {
2551 kfree_skb(skb);
2552 goto errout;
2553 }
2554
2555 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2556errout:
2557 return err;
2558}
2559
2560void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2561{
2562 struct sk_buff *skb;
2563 struct net *net = info->nl_net;
2564 u32 seq;
2565 int err;
2566
2567 err = -ENOBUFS;
2568 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2569
2570 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2571 if (skb == NULL)
2572 goto errout;
2573
2574 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2575 event, info->pid, seq, 0, 0, 0);
2576 if (err < 0) {
2577 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2578 WARN_ON(err == -EMSGSIZE);
2579 kfree_skb(skb);
2580 goto errout;
2581 }
2582 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2583 info->nlh, gfp_any());
2584 return;
2585errout:
2586 if (err < 0)
2587 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2588}
2589
2590static int ip6_route_dev_notify(struct notifier_block *this,
2591 unsigned long event, void *data)
2592{
2593 struct net_device *dev = (struct net_device *)data;
2594 struct net *net = dev_net(dev);
2595
2596 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2597 net->ipv6.ip6_null_entry->dst.dev = dev;
2598 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2599#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2600 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2601 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2602 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2603 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2604#endif
2605 }
2606
2607 return NOTIFY_OK;
2608}
2609
2610/*
2611 * /proc
2612 */
2613
2614#ifdef CONFIG_PROC_FS
2615
2616struct rt6_proc_arg {
2618 char *buffer;
2619 int offset;
2620 int length;
2621 int skip;
2622 int len;
2623};
2624
2625static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2626{
2627 struct seq_file *m = p_arg;
2628 struct neighbour *n;
2629
2630 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2631
2632#ifdef CONFIG_IPV6_SUBTREES
2633 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2634#else
2635 seq_puts(m, "00000000000000000000000000000000 00 ");
2636#endif
2637 rcu_read_lock();
2638 n = dst_get_neighbour(&rt->dst);
2639 if (n) {
2640 seq_printf(m, "%pi6", n->primary_key);
2641 } else {
2642 seq_puts(m, "00000000000000000000000000000000");
2643 }
2644 rcu_read_unlock();
2645 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2646 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2647 rt->dst.__use, rt->rt6i_flags,
2648 rt->rt6i_dev ? rt->rt6i_dev->name : "");
2649 return 0;
2650}
2651
2652static int ipv6_route_show(struct seq_file *m, void *v)
2653{
2654 struct net *net = (struct net *)m->private;
2655 fib6_clean_all(net, rt6_info_route, 0, m);
2656 return 0;
2657}
2658
2659static int ipv6_route_open(struct inode *inode, struct file *file)
2660{
2661 return single_open_net(inode, file, ipv6_route_show);
2662}
2663
2664static const struct file_operations ipv6_route_proc_fops = {
2665 .owner = THIS_MODULE,
2666 .open = ipv6_route_open,
2667 .read = seq_read,
2668 .llseek = seq_lseek,
2669 .release = single_release_net,
2670};
2671
2672static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2673{
2674 struct net *net = (struct net *)seq->private;
2675 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2676 net->ipv6.rt6_stats->fib_nodes,
2677 net->ipv6.rt6_stats->fib_route_nodes,
2678 net->ipv6.rt6_stats->fib_rt_alloc,
2679 net->ipv6.rt6_stats->fib_rt_entries,
2680 net->ipv6.rt6_stats->fib_rt_cache,
2681 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2682 net->ipv6.rt6_stats->fib_discarded_routes);
2683
2684 return 0;
2685}
2686
2687static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2688{
2689 return single_open_net(inode, file, rt6_stats_seq_show);
2690}
2691
2692static const struct file_operations rt6_stats_seq_fops = {
2693 .owner = THIS_MODULE,
2694 .open = rt6_stats_seq_open,
2695 .read = seq_read,
2696 .llseek = seq_lseek,
2697 .release = single_release_net,
2698};
2699#endif /* CONFIG_PROC_FS */
2700
2701#ifdef CONFIG_SYSCTL
2702
2703static
2704int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2705 void __user *buffer, size_t *lenp, loff_t *ppos)
2706{
2707 struct net *net;
2708 int delay;
2709 if (!write)
2710 return -EINVAL;
2711
2712 net = (struct net *)ctl->extra1;
2713 delay = net->ipv6.sysctl.flush_delay;
2714 proc_dointvec(ctl, write, buffer, lenp, ppos);
2715 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2716 return 0;
2717}
2718
2719ctl_table ipv6_route_table_template[] = {
2720 {
2721 .procname = "flush",
2722 .data = &init_net.ipv6.sysctl.flush_delay,
2723 .maxlen = sizeof(int),
2724 .mode = 0200,
2725 .proc_handler = ipv6_sysctl_rtcache_flush
2726 },
2727 {
2728 .procname = "gc_thresh",
2729 .data = &ip6_dst_ops_template.gc_thresh,
2730 .maxlen = sizeof(int),
2731 .mode = 0644,
2732 .proc_handler = proc_dointvec,
2733 },
2734 {
2735 .procname = "max_size",
2736 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2737 .maxlen = sizeof(int),
2738 .mode = 0644,
2739 .proc_handler = proc_dointvec,
2740 },
2741 {
2742 .procname = "gc_min_interval",
2743 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2744 .maxlen = sizeof(int),
2745 .mode = 0644,
2746 .proc_handler = proc_dointvec_jiffies,
2747 },
2748 {
2749 .procname = "gc_timeout",
2750 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2751 .maxlen = sizeof(int),
2752 .mode = 0644,
2753 .proc_handler = proc_dointvec_jiffies,
2754 },
2755 {
2756 .procname = "gc_interval",
2757 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2758 .maxlen = sizeof(int),
2759 .mode = 0644,
2760 .proc_handler = proc_dointvec_jiffies,
2761 },
2762 {
2763 .procname = "gc_elasticity",
2764 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2765 .maxlen = sizeof(int),
2766 .mode = 0644,
2767 .proc_handler = proc_dointvec,
2768 },
2769 {
2770 .procname = "mtu_expires",
2771 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2772 .maxlen = sizeof(int),
2773 .mode = 0644,
2774 .proc_handler = proc_dointvec_jiffies,
2775 },
2776 {
2777 .procname = "min_adv_mss",
2778 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2779 .maxlen = sizeof(int),
2780 .mode = 0644,
2781 .proc_handler = proc_dointvec,
2782 },
2783 {
2784 .procname = "gc_min_interval_ms",
2785 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2786 .maxlen = sizeof(int),
2787 .mode = 0644,
2788 .proc_handler = proc_dointvec_ms_jiffies,
2789 },
2790 { }
2791};
2792
2793struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2794{
2795 struct ctl_table *table;
2796
2797 table = kmemdup(ipv6_route_table_template,
2798 sizeof(ipv6_route_table_template),
2799 GFP_KERNEL);
2800
2801 if (table) {
2802 table[0].data = &net->ipv6.sysctl.flush_delay;
2803 table[0].extra1 = net;
2804 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2805 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2806 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2807 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2808 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2809 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2810 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2811 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2812 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2813 }
2814
2815 return table;
2816}
2817#endif
2818
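/* Per-namespace setup: clone the dst_ops template and the special
 * null/prohibit/blackhole route templates, then install the sysctl
 * defaults. Undone by ip6_route_net_exit() below.
 */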
2819static int __net_init ip6_route_net_init(struct net *net)
2820{
2821 int ret = -ENOMEM;
2822
2823 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2824 sizeof(net->ipv6.ip6_dst_ops));
2825
2826 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2827 goto out_ip6_dst_ops;
2828
2829 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2830 sizeof(*net->ipv6.ip6_null_entry),
2831 GFP_KERNEL);
2832 if (!net->ipv6.ip6_null_entry)
2833 goto out_ip6_dst_entries;
2834 net->ipv6.ip6_null_entry->dst.path =
2835 (struct dst_entry *)net->ipv6.ip6_null_entry;
2836 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2837 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2838 ip6_template_metrics, true);
2839
2840#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2841 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2842 sizeof(*net->ipv6.ip6_prohibit_entry),
2843 GFP_KERNEL);
2844 if (!net->ipv6.ip6_prohibit_entry)
2845 goto out_ip6_null_entry;
2846 net->ipv6.ip6_prohibit_entry->dst.path =
2847 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2848 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2849 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2850 ip6_template_metrics, true);
2851
2852 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2853 sizeof(*net->ipv6.ip6_blk_hole_entry),
2854 GFP_KERNEL);
2855 if (!net->ipv6.ip6_blk_hole_entry)
2856 goto out_ip6_prohibit_entry;
2857 net->ipv6.ip6_blk_hole_entry->dst.path =
2858 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2859 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2860 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2861 ip6_template_metrics, true);
2862#endif
2863
2864 net->ipv6.sysctl.flush_delay = 0;
2865 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2866 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2867 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2868 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2869 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2870 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2871 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2872
2873#ifdef CONFIG_PROC_FS
2874 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2875 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2876#endif
2877 net->ipv6.ip6_rt_gc_expire = 30*HZ;
2878
2879 ret = 0;
2880out:
2881 return ret;
2882
2883#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2884out_ip6_prohibit_entry:
2885 kfree(net->ipv6.ip6_prohibit_entry);
2886out_ip6_null_entry:
2887 kfree(net->ipv6.ip6_null_entry);
2888#endif
2889out_ip6_dst_entries:
2890 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2891out_ip6_dst_ops:
2892 goto out;
2893}
2894
2895static void __net_exit ip6_route_net_exit(struct net *net)
2896{
2897#ifdef CONFIG_PROC_FS
2898 proc_net_remove(net, "ipv6_route");
2899 proc_net_remove(net, "rt6_stats");
2900#endif
2901 kfree(net->ipv6.ip6_null_entry);
2902#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2903 kfree(net->ipv6.ip6_prohibit_entry);
2904 kfree(net->ipv6.ip6_blk_hole_entry);
2905#endif
2906 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2907}
2908
2909static struct pernet_operations ip6_route_net_ops = {
2910 .init = ip6_route_net_init,
2911 .exit = ip6_route_net_exit,
2912};
2913
2914static struct notifier_block ip6_route_dev_notifier = {
2915 .notifier_call = ip6_route_dev_notify,
2916 .priority = 0,
2917};
2918
2919int __init ip6_route_init(void)
2920{
2921 int ret;
2922
2923 ret = -ENOMEM;
2924 ip6_dst_ops_template.kmem_cachep =
2925 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2926 SLAB_HWCACHE_ALIGN, NULL);
2927 if (!ip6_dst_ops_template.kmem_cachep)
2928 goto out;
2929
2930 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2931 if (ret)
2932 goto out_kmem_cache;
2933
2934 ret = register_pernet_subsys(&ip6_route_net_ops);
2935 if (ret)
2936 goto out_dst_entries;
2937
2938 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2939
2940 /* The loopback device is registered before this code runs, so the
2941 * loopback reference in rt6_info is not taken there; take it
2942 * manually for init_net */
2943 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2944 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2945 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2946 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2947 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2948 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2949 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2950 #endif
2951 ret = fib6_init();
2952 if (ret)
2953 goto out_register_subsys;
2954
2955 ret = xfrm6_init();
2956 if (ret)
2957 goto out_fib6_init;
2958
2959 ret = fib6_rules_init();
2960 if (ret)
2961 goto xfrm6_init;
2962
2963 ret = -ENOBUFS;
2964 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2965 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2966 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2967 goto fib6_rules_init;
2968
2969 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2970 if (ret)
2971 goto fib6_rules_init;
2972
2973out:
2974 return ret;
2975
2976fib6_rules_init:
2977 fib6_rules_cleanup();
2978xfrm6_init:
2979 xfrm6_fini();
2980out_fib6_init:
2981 fib6_gc_cleanup();
2982out_register_subsys:
2983 unregister_pernet_subsys(&ip6_route_net_ops);
2984out_dst_entries:
2985 dst_entries_destroy(&ip6_dst_blackhole_ops);
2986out_kmem_cache:
2987 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2988 goto out;
2989}
2990
2991void ip6_route_cleanup(void)
2992{
2993 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2994 fib6_rules_cleanup();
2995 xfrm6_fini();
2996 fib6_gc_cleanup();
2997 unregister_pernet_subsys(&ip6_route_net_ops);
2998 dst_entries_destroy(&ip6_dst_blackhole_ops);
2999 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3000}
1// SPDX-License-Identifier: GPL-2.0-or-later
22
23#define pr_fmt(fmt) "IPv6: " fmt
24
25#include <linux/capability.h>
26#include <linux/errno.h>
27#include <linux/export.h>
28#include <linux/types.h>
29#include <linux/times.h>
30#include <linux/socket.h>
31#include <linux/sockios.h>
32#include <linux/net.h>
33#include <linux/route.h>
34#include <linux/netdevice.h>
35#include <linux/in6.h>
36#include <linux/mroute6.h>
37#include <linux/init.h>
38#include <linux/if_arp.h>
39#include <linux/proc_fs.h>
40#include <linux/seq_file.h>
41#include <linux/nsproxy.h>
42#include <linux/slab.h>
43#include <linux/jhash.h>
44#include <linux/siphash.h>
45#include <net/net_namespace.h>
46#include <net/snmp.h>
47#include <net/ipv6.h>
48#include <net/ip6_fib.h>
49#include <net/ip6_route.h>
50#include <net/ndisc.h>
51#include <net/addrconf.h>
52#include <net/tcp.h>
53#include <linux/rtnetlink.h>
54#include <net/dst.h>
55#include <net/dst_metadata.h>
56#include <net/xfrm.h>
57#include <net/netevent.h>
58#include <net/netlink.h>
59#include <net/rtnh.h>
60#include <net/lwtunnel.h>
61#include <net/ip_tunnels.h>
62#include <net/l3mdev.h>
63#include <net/ip.h>
64#include <linux/uaccess.h>
65#include <linux/btf_ids.h>
66
67#ifdef CONFIG_SYSCTL
68#include <linux/sysctl.h>
69#endif
70
71static int ip6_rt_type_to_error(u8 fib6_type);
72
73#define CREATE_TRACE_POINTS
74#include <trace/events/fib6.h>
75EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
76#undef CREATE_TRACE_POINTS
77
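/* Neighbour-reachability verdicts used when scoring routers: the
 * negative values veto or deprioritize a nexthop, RT6_NUD_SUCCEED
 * accepts it. See rt6_check_neigh() and rt6_score_route().
 */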
78enum rt6_nud_state {
79 RT6_NUD_FAIL_HARD = -3,
80 RT6_NUD_FAIL_PROBE = -2,
81 RT6_NUD_FAIL_DO_RR = -1,
82 RT6_NUD_SUCCEED = 1
83};
84
85INDIRECT_CALLABLE_SCOPE
86struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
87static unsigned int ip6_default_advmss(const struct dst_entry *dst);
88INDIRECT_CALLABLE_SCOPE
89unsigned int ip6_mtu(const struct dst_entry *dst);
90static void ip6_negative_advice(struct sock *sk,
91 struct dst_entry *dst);
92static void ip6_dst_destroy(struct dst_entry *);
93static void ip6_dst_ifdown(struct dst_entry *,
94 struct net_device *dev);
95static void ip6_dst_gc(struct dst_ops *ops);
96
97static int ip6_pkt_discard(struct sk_buff *skb);
98static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
99static int ip6_pkt_prohibit(struct sk_buff *skb);
100static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
101static void ip6_link_failure(struct sk_buff *skb);
102static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
103 struct sk_buff *skb, u32 mtu,
104 bool confirm_neigh);
105static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
106 struct sk_buff *skb);
107static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
108 int strict);
109static size_t rt6_nlmsg_size(struct fib6_info *f6i);
110static int rt6_fill_node(struct net *net, struct sk_buff *skb,
111 struct fib6_info *rt, struct dst_entry *dst,
112 struct in6_addr *dest, struct in6_addr *src,
113 int iif, int type, u32 portid, u32 seq,
114 unsigned int flags);
115static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
116 const struct in6_addr *daddr,
117 const struct in6_addr *saddr);
118
119#ifdef CONFIG_IPV6_ROUTE_INFO
120static struct fib6_info *rt6_add_route_info(struct net *net,
121 const struct in6_addr *prefix, int prefixlen,
122 const struct in6_addr *gwaddr,
123 struct net_device *dev,
124 unsigned int pref);
125static struct fib6_info *rt6_get_route_info(struct net *net,
126 const struct in6_addr *prefix, int prefixlen,
127 const struct in6_addr *gwaddr,
128 struct net_device *dev);
129#endif
130
131struct uncached_list {
132 spinlock_t lock;
133 struct list_head head;
134 struct list_head quarantine;
135};
136
137static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
138
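/* rt6_info dsts that live outside the FIB tree (e.g. those created
 * for ICMPv6 output) are tracked on per-cpu lists so that
 * rt6_uncached_list_flush_dev() below can retarget them at the
 * blackhole device when their device goes away.
 */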
139void rt6_uncached_list_add(struct rt6_info *rt)
140{
141 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
142
143 rt->dst.rt_uncached_list = ul;
144
145 spin_lock_bh(&ul->lock);
146 list_add_tail(&rt->dst.rt_uncached, &ul->head);
147 spin_unlock_bh(&ul->lock);
148}
149
150void rt6_uncached_list_del(struct rt6_info *rt)
151{
152 if (!list_empty(&rt->dst.rt_uncached)) {
153 struct uncached_list *ul = rt->dst.rt_uncached_list;
154
155 spin_lock_bh(&ul->lock);
156 list_del_init(&rt->dst.rt_uncached);
157 spin_unlock_bh(&ul->lock);
158 }
159}
160
161static void rt6_uncached_list_flush_dev(struct net_device *dev)
162{
163 int cpu;
164
165 for_each_possible_cpu(cpu) {
166 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
167 struct rt6_info *rt, *safe;
168
169 if (list_empty(&ul->head))
170 continue;
171
172 spin_lock_bh(&ul->lock);
173 list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
174 struct inet6_dev *rt_idev = rt->rt6i_idev;
175 struct net_device *rt_dev = rt->dst.dev;
176 bool handled = false;
177
178 if (rt_idev->dev == dev) {
179 rt->rt6i_idev = in6_dev_get(blackhole_netdev);
180 in6_dev_put(rt_idev);
181 handled = true;
182 }
183
184 if (rt_dev == dev) {
185 rt->dst.dev = blackhole_netdev;
186 netdev_ref_replace(rt_dev, blackhole_netdev,
187 &rt->dst.dev_tracker,
188 GFP_ATOMIC);
189 handled = true;
190 }
191 if (handled)
192 list_move(&rt->dst.rt_uncached,
193 &ul->quarantine);
194 }
195 spin_unlock_bh(&ul->lock);
196 }
197}
198
199static inline const void *choose_neigh_daddr(const struct in6_addr *p,
200 struct sk_buff *skb,
201 const void *daddr)
202{
203 if (!ipv6_addr_any(p))
204 return (const void *) p;
205 else if (skb)
206 return &ipv6_hdr(skb)->daddr;
207 return daddr;
208}
209
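/* Resolve (or create) the neighbour entry for a nexthop; @daddr is
 * only consulted when the gateway is unspecified and no skb is
 * available to supply a destination.
 */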
210struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
211 struct net_device *dev,
212 struct sk_buff *skb,
213 const void *daddr)
214{
215 struct neighbour *n;
216
217 daddr = choose_neigh_daddr(gw, skb, daddr);
218 n = __ipv6_neigh_lookup(dev, daddr);
219 if (n)
220 return n;
221
222 n = neigh_create(&nd_tbl, daddr, dev);
223 return IS_ERR(n) ? NULL : n;
224}
225
226static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
227 struct sk_buff *skb,
228 const void *daddr)
229{
230 const struct rt6_info *rt = dst_rt6_info(dst);
231
232 return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
233 dst->dev, skb, daddr);
234}
235
236static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
237{
238 const struct rt6_info *rt = dst_rt6_info(dst);
239 struct net_device *dev = dst->dev;
240
241 daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
242 if (!daddr)
243 return;
244 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
245 return;
246 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
247 return;
248 __ipv6_confirm_neigh(dev, daddr);
249}
250
251static struct dst_ops ip6_dst_ops_template = {
252 .family = AF_INET6,
253 .gc = ip6_dst_gc,
254 .gc_thresh = 1024,
255 .check = ip6_dst_check,
256 .default_advmss = ip6_default_advmss,
257 .mtu = ip6_mtu,
258 .cow_metrics = dst_cow_metrics_generic,
259 .destroy = ip6_dst_destroy,
260 .ifdown = ip6_dst_ifdown,
261 .negative_advice = ip6_negative_advice,
262 .link_failure = ip6_link_failure,
263 .update_pmtu = ip6_rt_update_pmtu,
264 .redirect = rt6_do_redirect,
265 .local_out = __ip6_local_out,
266 .neigh_lookup = ip6_dst_neigh_lookup,
267 .confirm_neigh = ip6_confirm_neigh,
268};
269
270static struct dst_ops ip6_dst_blackhole_ops = {
271 .family = AF_INET6,
272 .default_advmss = ip6_default_advmss,
273 .neigh_lookup = ip6_dst_neigh_lookup,
274 .check = ip6_dst_check,
275 .destroy = ip6_dst_destroy,
276 .cow_metrics = dst_cow_metrics_generic,
277 .update_pmtu = dst_blackhole_update_pmtu,
278 .redirect = dst_blackhole_redirect,
279 .mtu = dst_blackhole_mtu,
280};
281
282static const u32 ip6_template_metrics[RTAX_MAX] = {
283 [RTAX_HOPLIMIT - 1] = 0,
284};
285
286static const struct fib6_info fib6_null_entry_template = {
287 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
288 .fib6_protocol = RTPROT_KERNEL,
289 .fib6_metric = ~(u32)0,
290 .fib6_ref = REFCOUNT_INIT(1),
291 .fib6_type = RTN_UNREACHABLE,
292 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
293};
294
295static const struct rt6_info ip6_null_entry_template = {
296 .dst = {
297 .__rcuref = RCUREF_INIT(1),
298 .__use = 1,
299 .obsolete = DST_OBSOLETE_FORCE_CHK,
300 .error = -ENETUNREACH,
301 .input = ip6_pkt_discard,
302 .output = ip6_pkt_discard_out,
303 },
304 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
305};
306
307#ifdef CONFIG_IPV6_MULTIPLE_TABLES
308
309static const struct rt6_info ip6_prohibit_entry_template = {
310 .dst = {
311 .__rcuref = RCUREF_INIT(1),
312 .__use = 1,
313 .obsolete = DST_OBSOLETE_FORCE_CHK,
314 .error = -EACCES,
315 .input = ip6_pkt_prohibit,
316 .output = ip6_pkt_prohibit_out,
317 },
318 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
319};
320
321static const struct rt6_info ip6_blk_hole_entry_template = {
322 .dst = {
323 .__rcuref = RCUREF_INIT(1),
324 .__use = 1,
325 .obsolete = DST_OBSOLETE_FORCE_CHK,
326 .error = -EINVAL,
327 .input = dst_discard,
328 .output = dst_discard_out,
329 },
330 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
331};
332
333#endif
334
335static void rt6_info_init(struct rt6_info *rt)
336{
337 memset_after(rt, 0, dst);
338}
339
340/* allocate dst with ip6_dst_ops */
341struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
342 int flags)
343{
344 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
345 DST_OBSOLETE_FORCE_CHK, flags);
346
347 if (rt) {
348 rt6_info_init(rt);
349 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
350 }
351
352 return rt;
353}
354EXPORT_SYMBOL(ip6_dst_alloc);
355
356static void ip6_dst_destroy(struct dst_entry *dst)
357{
358 struct rt6_info *rt = dst_rt6_info(dst);
359 struct fib6_info *from;
360 struct inet6_dev *idev;
361
362 ip_dst_metrics_put(dst);
363 rt6_uncached_list_del(rt);
364
365 idev = rt->rt6i_idev;
366 if (idev) {
367 rt->rt6i_idev = NULL;
368 in6_dev_put(idev);
369 }
370
371 from = xchg((__force struct fib6_info **)&rt->from, NULL);
372 fib6_info_release(from);
373}
374
375static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
376{
377 struct rt6_info *rt = dst_rt6_info(dst);
378 struct inet6_dev *idev = rt->rt6i_idev;
379
380 if (idev && idev->dev != blackhole_netdev) {
381 struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
382
383 if (blackhole_idev) {
384 rt->rt6i_idev = blackhole_idev;
385 in6_dev_put(idev);
386 }
387 }
388}
389
390static bool __rt6_check_expired(const struct rt6_info *rt)
391{
392 if (rt->rt6i_flags & RTF_EXPIRES)
393 return time_after(jiffies, rt->dst.expires);
394 else
395 return false;
396}
397
398static bool rt6_check_expired(const struct rt6_info *rt)
399{
400 struct fib6_info *from;
401
402 from = rcu_dereference(rt->from);
403
404 if (rt->rt6i_flags & RTF_EXPIRES) {
405 if (time_after(jiffies, rt->dst.expires))
406 return true;
407 } else if (from) {
408 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
409 fib6_check_expired(from);
410 }
411 return false;
412}
413
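/* Multipath selection: starting from the first match in @res, pick
 * the sibling route (or nexthop-group leg) whose hash upper bound
 * covers fl6->mp_hash, keeping the original match as the fallback.
 */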
414void fib6_select_path(const struct net *net, struct fib6_result *res,
415 struct flowi6 *fl6, int oif, bool have_oif_match,
416 const struct sk_buff *skb, int strict)
417{
418 struct fib6_info *sibling, *next_sibling;
419 struct fib6_info *match = res->f6i;
420
421 if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
422 goto out;
423
424 if (match->nh && have_oif_match && res->nh)
425 return;
426
427 if (skb)
428 IP6CB(skb)->flags |= IP6SKB_MULTIPATH;
429
430 /* We might have already computed the hash for ICMPv6 errors. In such
431 * a case it will always be non-zero. Otherwise now is the time to do it.
432 */
433 if (!fl6->mp_hash &&
434 (!match->nh || nexthop_is_multipath(match->nh)))
435 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
436
437 if (unlikely(match->nh)) {
438 nexthop_path_fib6_result(res, fl6->mp_hash);
439 return;
440 }
441
442 if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
443 goto out;
444
445 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
446 fib6_siblings) {
447 const struct fib6_nh *nh = sibling->fib6_nh;
448 int nh_upper_bound;
449
450 nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
451 if (fl6->mp_hash > nh_upper_bound)
452 continue;
453 if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
454 break;
455 match = sibling;
456 break;
457 }
458
459out:
460 res->f6i = match;
461 res->nh = match->fib6_nh;
462}
463
464/*
465 * Route lookup. rcu_read_lock() should be held.
466 */
467
468static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
469 const struct in6_addr *saddr, int oif, int flags)
470{
471 const struct net_device *dev;
472
473 if (nh->fib_nh_flags & RTNH_F_DEAD)
474 return false;
475
476 dev = nh->fib_nh_dev;
477 if (oif) {
478 if (dev->ifindex == oif)
479 return true;
480 } else {
481 if (ipv6_chk_addr(net, saddr, dev,
482 flags & RT6_LOOKUP_F_IFACE))
483 return true;
484 }
485
486 return false;
487}
488
489struct fib6_nh_dm_arg {
490 struct net *net;
491 const struct in6_addr *saddr;
492 int oif;
493 int flags;
494 struct fib6_nh *nh;
495};
496
497static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
498{
499 struct fib6_nh_dm_arg *arg = _arg;
500
501 arg->nh = nh;
502 return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
503 arg->flags);
504}
505
506/* returns fib6_nh from nexthop or NULL */
507static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
508 struct fib6_result *res,
509 const struct in6_addr *saddr,
510 int oif, int flags)
511{
512 struct fib6_nh_dm_arg arg = {
513 .net = net,
514 .saddr = saddr,
515 .oif = oif,
516 .flags = flags,
517 };
518
519 if (nexthop_is_blackhole(nh))
520 return NULL;
521
522 if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
523 return arg.nh;
524
525 return NULL;
526}
527
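/* Walk the leaf chain starting at res->f6i and select the first route
 * with a usable nexthop device for @oif (or for @saddr when oif is 0),
 * honouring RT6_LOOKUP_F_IFACE strictness and blackhole nexthops.
 */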
528static void rt6_device_match(struct net *net, struct fib6_result *res,
529 const struct in6_addr *saddr, int oif, int flags)
530{
531 struct fib6_info *f6i = res->f6i;
532 struct fib6_info *spf6i;
533 struct fib6_nh *nh;
534
535 if (!oif && ipv6_addr_any(saddr)) {
536 if (unlikely(f6i->nh)) {
537 nh = nexthop_fib6_nh(f6i->nh);
538 if (nexthop_is_blackhole(f6i->nh))
539 goto out_blackhole;
540 } else {
541 nh = f6i->fib6_nh;
542 }
543 if (!(nh->fib_nh_flags & RTNH_F_DEAD))
544 goto out;
545 }
546
547 for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
548 bool matched = false;
549
550 if (unlikely(spf6i->nh)) {
551 nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
552 oif, flags);
553 if (nh)
554 matched = true;
555 } else {
556 nh = spf6i->fib6_nh;
557 if (__rt6_device_match(net, nh, saddr, oif, flags))
558 matched = true;
559 }
560 if (matched) {
561 res->f6i = spf6i;
562 goto out;
563 }
564 }
565
566 if (oif && flags & RT6_LOOKUP_F_IFACE) {
567 res->f6i = net->ipv6.fib6_null_entry;
568 nh = res->f6i->fib6_nh;
569 goto out;
570 }
571
572 if (unlikely(f6i->nh)) {
573 nh = nexthop_fib6_nh(f6i->nh);
574 if (nexthop_is_blackhole(f6i->nh))
575 goto out_blackhole;
576 } else {
577 nh = f6i->fib6_nh;
578 }
579
580 if (nh->fib_nh_flags & RTNH_F_DEAD) {
581 res->f6i = net->ipv6.fib6_null_entry;
582 nh = res->f6i->fib6_nh;
583 }
584out:
585 res->nh = nh;
586 res->fib6_type = res->f6i->fib6_type;
587 res->fib6_flags = res->f6i->fib6_flags;
588 return;
589
590out_blackhole:
591 res->fib6_flags |= RTF_REJECT;
592 res->fib6_type = RTN_BLACKHOLE;
593 res->nh = nh;
594}
595
596#ifdef CONFIG_IPV6_ROUTER_PREF
597struct __rt6_probe_work {
598 struct work_struct work;
599 struct in6_addr target;
600 struct net_device *dev;
601 netdevice_tracker dev_tracker;
602};
603
604static void rt6_probe_deferred(struct work_struct *w)
605{
606 struct in6_addr mcaddr;
607 struct __rt6_probe_work *work =
608 container_of(w, struct __rt6_probe_work, work);
609
610 addrconf_addr_solict_mult(&work->target, &mcaddr);
611 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
612 netdev_put(work->dev, &work->dev_tracker);
613 kfree(work);
614}
615
616static void rt6_probe(struct fib6_nh *fib6_nh)
617{
618 struct __rt6_probe_work *work = NULL;
619 const struct in6_addr *nh_gw;
620 unsigned long last_probe;
621 struct neighbour *neigh;
622 struct net_device *dev;
623 struct inet6_dev *idev;
624
625 /*
626 * Okay, this does not seem appropriate for now;
627 * however, we need to check whether it really is,
628 * aka Router Reachability Probing.
629 *
630 * A Router Reachability Probe MUST be rate-limited
631 * to no more than one per minute.
632 */
633 if (!fib6_nh->fib_nh_gw_family)
634 return;
635
636 nh_gw = &fib6_nh->fib_nh_gw6;
637 dev = fib6_nh->fib_nh_dev;
638 rcu_read_lock();
639 last_probe = READ_ONCE(fib6_nh->last_probe);
640 idev = __in6_dev_get(dev);
641 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
642 if (neigh) {
643 if (READ_ONCE(neigh->nud_state) & NUD_VALID)
644 goto out;
645
646 write_lock_bh(&neigh->lock);
647 if (!(neigh->nud_state & NUD_VALID) &&
648 time_after(jiffies,
649 neigh->updated +
650 READ_ONCE(idev->cnf.rtr_probe_interval))) {
651 work = kmalloc(sizeof(*work), GFP_ATOMIC);
652 if (work)
653 __neigh_set_probe_once(neigh);
654 }
655 write_unlock_bh(&neigh->lock);
656 } else if (time_after(jiffies, last_probe +
657 READ_ONCE(idev->cnf.rtr_probe_interval))) {
658 work = kmalloc(sizeof(*work), GFP_ATOMIC);
659 }
660
661 if (!work || cmpxchg(&fib6_nh->last_probe,
662 last_probe, jiffies) != last_probe) {
663 kfree(work);
664 } else {
665 INIT_WORK(&work->work, rt6_probe_deferred);
666 work->target = *nh_gw;
667 netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
668 work->dev = dev;
669 schedule_work(&work->work);
670 }
671
672out:
673 rcu_read_unlock();
674}
675#else
676static inline void rt6_probe(struct fib6_nh *fib6_nh)
677{
678}
679#endif
680
681/*
682 * Default Router Selection (RFC 2461 6.3.6)
683 */
684static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
685{
686 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
687 struct neighbour *neigh;
688
689 rcu_read_lock();
690 neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
691 &fib6_nh->fib_nh_gw6);
692 if (neigh) {
693 u8 nud_state = READ_ONCE(neigh->nud_state);
694
695 if (nud_state & NUD_VALID)
696 ret = RT6_NUD_SUCCEED;
697#ifdef CONFIG_IPV6_ROUTER_PREF
698 else if (!(nud_state & NUD_FAILED))
699 ret = RT6_NUD_SUCCEED;
700 else
701 ret = RT6_NUD_FAIL_PROBE;
702#endif
703 } else {
704 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
705 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
706 }
707 rcu_read_unlock();
708
709 return ret;
710}
711
712static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
713 int strict)
714{
715 int m = 0;
716
717 if (!oif || nh->fib_nh_dev->ifindex == oif)
718 m = 2;
719
720 if (!m && (strict & RT6_LOOKUP_F_IFACE))
721 return RT6_NUD_FAIL_HARD;
722#ifdef CONFIG_IPV6_ROUTER_PREF
723 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
724#endif
725 if ((strict & RT6_LOOKUP_F_REACHABLE) &&
726 !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
727 int n = rt6_check_neigh(nh);
728 if (n < 0)
729 return n;
730 }
731 return m;
732}
733
734static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
735 int oif, int strict, int *mpri, bool *do_rr)
736{
737 bool match_do_rr = false;
738 bool rc = false;
739 int m;
740
741 if (nh->fib_nh_flags & RTNH_F_DEAD)
742 goto out;
743
744 if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
745 nh->fib_nh_flags & RTNH_F_LINKDOWN &&
746 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
747 goto out;
748
749 m = rt6_score_route(nh, fib6_flags, oif, strict);
750 if (m == RT6_NUD_FAIL_DO_RR) {
751 match_do_rr = true;
752 m = 0; /* lowest valid score */
753 } else if (m == RT6_NUD_FAIL_HARD) {
754 goto out;
755 }
756
757 if (strict & RT6_LOOKUP_F_REACHABLE)
758 rt6_probe(nh);
759
760 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
761 if (m > *mpri) {
762 *do_rr = match_do_rr;
763 *mpri = m;
764 rc = true;
765 }
766out:
767 return rc;
768}
769
770struct fib6_nh_frl_arg {
771 u32 flags;
772 int oif;
773 int strict;
774 int *mpri;
775 bool *do_rr;
776 struct fib6_nh *nh;
777};
778
779static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
780{
781 struct fib6_nh_frl_arg *arg = _arg;
782
783 arg->nh = nh;
784 return find_match(nh, arg->flags, arg->oif, arg->strict,
785 arg->mpri, arg->do_rr);
786}
787
788static void __find_rr_leaf(struct fib6_info *f6i_start,
789 struct fib6_info *nomatch, u32 metric,
790 struct fib6_result *res, struct fib6_info **cont,
791 int oif, int strict, bool *do_rr, int *mpri)
792{
793 struct fib6_info *f6i;
794
795 for (f6i = f6i_start;
796 f6i && f6i != nomatch;
797 f6i = rcu_dereference(f6i->fib6_next)) {
798 bool matched = false;
799 struct fib6_nh *nh;
800
801 if (cont && f6i->fib6_metric != metric) {
802 *cont = f6i;
803 return;
804 }
805
806 if (fib6_check_expired(f6i))
807 continue;
808
809 if (unlikely(f6i->nh)) {
810 struct fib6_nh_frl_arg arg = {
811 .flags = f6i->fib6_flags,
812 .oif = oif,
813 .strict = strict,
814 .mpri = mpri,
815 .do_rr = do_rr
816 };
817
818 if (nexthop_is_blackhole(f6i->nh)) {
819 res->fib6_flags = RTF_REJECT;
820 res->fib6_type = RTN_BLACKHOLE;
821 res->f6i = f6i;
822 res->nh = nexthop_fib6_nh(f6i->nh);
823 return;
824 }
825 if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
826 &arg)) {
827 matched = true;
828 nh = arg.nh;
829 }
830 } else {
831 nh = f6i->fib6_nh;
832 if (find_match(nh, f6i->fib6_flags, oif, strict,
833 mpri, do_rr))
834 matched = true;
835 }
836 if (matched) {
837 res->f6i = f6i;
838 res->nh = nh;
839 res->fib6_flags = f6i->fib6_flags;
840 res->fib6_type = f6i->fib6_type;
841 }
842 }
843}
844
845static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
846 struct fib6_info *rr_head, int oif, int strict,
847 bool *do_rr, struct fib6_result *res)
848{
849 u32 metric = rr_head->fib6_metric;
850 struct fib6_info *cont = NULL;
851 int mpri = -1;
852
853 __find_rr_leaf(rr_head, NULL, metric, res, &cont,
854 oif, strict, do_rr, &mpri);
855
856 __find_rr_leaf(leaf, rr_head, metric, res, &cont,
857 oif, strict, do_rr, &mpri);
858
859 if (res->f6i || !cont)
860 return;
861
862 __find_rr_leaf(cont, NULL, metric, res, NULL,
863 oif, strict, do_rr, &mpri);
864}
865
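/* Round-robin selection among routes of equal metric: score each
 * candidate nexthop via find_rr_leaf() and rotate fn->rr_ptr when the
 * scorer asks for it, so unreachable routers are cycled past.
 */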
866static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
867 struct fib6_result *res, int strict)
868{
869 struct fib6_info *leaf = rcu_dereference(fn->leaf);
870 struct fib6_info *rt0;
871 bool do_rr = false;
872 int key_plen;
873
874 /* make sure this function or its helpers sets f6i */
875 res->f6i = NULL;
876
877 if (!leaf || leaf == net->ipv6.fib6_null_entry)
878 goto out;
879
880 rt0 = rcu_dereference(fn->rr_ptr);
881 if (!rt0)
882 rt0 = leaf;
883
884 /* Double check to make sure fn is not an intermediate node
885 * and fn->leaf does not point to its child's leaf
886 * (This might happen if all routes under fn are deleted from
887 * the tree and fib6_repair_tree() is called on the node.)
888 */
889 key_plen = rt0->fib6_dst.plen;
890#ifdef CONFIG_IPV6_SUBTREES
891 if (rt0->fib6_src.plen)
892 key_plen = rt0->fib6_src.plen;
893#endif
894 if (fn->fn_bit != key_plen)
895 goto out;
896
897 find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
898 if (do_rr) {
899 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
900
901 /* no entries matched; do round-robin */
902 if (!next || next->fib6_metric != rt0->fib6_metric)
903 next = leaf;
904
905 if (next != rt0) {
906 spin_lock_bh(&leaf->fib6_table->tb6_lock);
907 /* make sure next is not being deleted from the tree */
908 if (next->fib6_node)
909 rcu_assign_pointer(fn->rr_ptr, next);
910 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
911 }
912 }
913
914out:
915 if (!res->f6i) {
916 res->f6i = net->ipv6.fib6_null_entry;
917 res->nh = res->f6i->fib6_nh;
918 res->fib6_flags = res->f6i->fib6_flags;
919 res->fib6_type = res->f6i->fib6_type;
920 }
921}
922
923static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
924{
925 return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
926 res->nh->fib_nh_gw_family;
927}
928
929#ifdef CONFIG_IPV6_ROUTE_INFO
930int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
931 const struct in6_addr *gwaddr)
932{
933 struct net *net = dev_net(dev);
934 struct route_info *rinfo = (struct route_info *) opt;
935 struct in6_addr prefix_buf, *prefix;
936 struct fib6_table *table;
937 unsigned int pref;
938 unsigned long lifetime;
939 struct fib6_info *rt;
940
941 if (len < sizeof(struct route_info)) {
942 return -EINVAL;
943 }
944
945 /* Sanity check for prefix_len and length */
946 if (rinfo->length > 3) {
947 return -EINVAL;
948 } else if (rinfo->prefix_len > 128) {
949 return -EINVAL;
950 } else if (rinfo->prefix_len > 64) {
951 if (rinfo->length < 2) {
952 return -EINVAL;
953 }
954 } else if (rinfo->prefix_len > 0) {
955 if (rinfo->length < 1) {
956 return -EINVAL;
957 }
958 }
959
960 pref = rinfo->route_pref;
961 if (pref == ICMPV6_ROUTER_PREF_INVALID)
962 return -EINVAL;
963
964 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
965
966 if (rinfo->length == 3)
967 prefix = (struct in6_addr *)rinfo->prefix;
968 else {
969 /* this call is safe: rinfo->prefix_len was validated above */
970 ipv6_addr_prefix(&prefix_buf,
971 (struct in6_addr *)rinfo->prefix,
972 rinfo->prefix_len);
973 prefix = &prefix_buf;
974 }
975
976 if (rinfo->prefix_len == 0)
977 rt = rt6_get_dflt_router(net, gwaddr, dev);
978 else
979 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
980 gwaddr, dev);
981
982 if (rt && !lifetime) {
983 ip6_del_rt(net, rt, false);
984 rt = NULL;
985 }
986
987 if (!rt && lifetime)
988 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
989 dev, pref);
990 else if (rt)
991 rt->fib6_flags = RTF_ROUTEINFO |
992 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
993
994 if (rt) {
995 table = rt->fib6_table;
996 spin_lock_bh(&table->tb6_lock);
997
998 if (!addrconf_finite_timeout(lifetime)) {
999 fib6_clean_expires(rt);
1000 fib6_remove_gc_list(rt);
1001 } else {
1002 fib6_set_expires(rt, jiffies + HZ * lifetime);
1003 fib6_add_gc_list(rt);
1004 }
1005
1006 spin_unlock_bh(&table->tb6_lock);
1007
1008 fib6_info_release(rt);
1009 }
1010 return 0;
1011}
1012#endif
1013
1014/*
1015 * Misc support functions
1016 */
1017
1018/* called with rcu_read_lock held */
1019static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
1020{
1021 struct net_device *dev = res->nh->fib_nh_dev;
1022
1023 if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1024 /* for copies of local routes, dst->dev needs to be the
1025 * device itself if it is a master device, the master device
1026 * if the device is enslaved, and the loopback device by default
1027 */
1028 if (netif_is_l3_slave(dev) &&
1029 !rt6_need_strict(&res->f6i->fib6_dst.addr))
1030 dev = l3mdev_master_dev_rcu(dev);
1031 else if (!netif_is_l3_master(dev))
1032 dev = dev_net(dev)->loopback_dev;
1033 /* the remaining case is netif_is_l3_master(dev) being true,
1034 * in which case we want dev itself to be returned
1035 */
1036 }
1037
1038 return dev;
1039}
1040
1041static const int fib6_prop[RTN_MAX + 1] = {
1042 [RTN_UNSPEC] = 0,
1043 [RTN_UNICAST] = 0,
1044 [RTN_LOCAL] = 0,
1045 [RTN_BROADCAST] = 0,
1046 [RTN_ANYCAST] = 0,
1047 [RTN_MULTICAST] = 0,
1048 [RTN_BLACKHOLE] = -EINVAL,
1049 [RTN_UNREACHABLE] = -EHOSTUNREACH,
1050 [RTN_PROHIBIT] = -EACCES,
1051 [RTN_THROW] = -EAGAIN,
1052 [RTN_NAT] = -EINVAL,
1053 [RTN_XRESOLVE] = -EINVAL,
1054};
1055
1056static int ip6_rt_type_to_error(u8 fib6_type)
1057{
1058 return fib6_prop[fib6_type];
1059}
1060
1061static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
1062{
1063 unsigned short flags = 0;
1064
1065 if (rt->dst_nocount)
1066 flags |= DST_NOCOUNT;
1067 if (rt->dst_nopolicy)
1068 flags |= DST_NOPOLICY;
1069
1070 return flags;
1071}
1072
1073static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
1074{
1075 rt->dst.error = ip6_rt_type_to_error(fib6_type);
1076
1077 switch (fib6_type) {
1078 case RTN_BLACKHOLE:
1079 rt->dst.output = dst_discard_out;
1080 rt->dst.input = dst_discard;
1081 break;
1082 case RTN_PROHIBIT:
1083 rt->dst.output = ip6_pkt_prohibit_out;
1084 rt->dst.input = ip6_pkt_prohibit;
1085 break;
1086 case RTN_THROW:
1087 case RTN_UNREACHABLE:
1088 default:
1089 rt->dst.output = ip6_pkt_discard_out;
1090 rt->dst.input = ip6_pkt_discard;
1091 break;
1092 }
1093}
1094
1095static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
1096{
1097 struct fib6_info *f6i = res->f6i;
1098
1099 if (res->fib6_flags & RTF_REJECT) {
1100 ip6_rt_init_dst_reject(rt, res->fib6_type);
1101 return;
1102 }
1103
1104 rt->dst.error = 0;
1105 rt->dst.output = ip6_output;
1106
1107 if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
1108 rt->dst.input = ip6_input;
1109 } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
1110 rt->dst.input = ip6_mc_input;
1111 } else {
1112 rt->dst.input = ip6_forward;
1113 }
1114
1115 if (res->nh->fib_nh_lws) {
1116 rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
1117 lwtunnel_set_redirect(&rt->dst);
1118 }
1119
1120 rt->dst.lastuse = jiffies;
1121}
1122
1123/* Caller must already hold reference to @from */
1124static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1125{
1126 rt->rt6i_flags &= ~RTF_EXPIRES;
1127 rcu_assign_pointer(rt->from, from);
1128 ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1129}
1130
1131/* Caller must already hold reference to f6i in result */
1132static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1133{
1134 const struct fib6_nh *nh = res->nh;
1135 const struct net_device *dev = nh->fib_nh_dev;
1136 struct fib6_info *f6i = res->f6i;
1137
1138 ip6_rt_init_dst(rt, res);
1139
1140 rt->rt6i_dst = f6i->fib6_dst;
1141 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1142 rt->rt6i_flags = res->fib6_flags;
1143 if (nh->fib_nh_gw_family) {
1144 rt->rt6i_gateway = nh->fib_nh_gw6;
1145 rt->rt6i_flags |= RTF_GATEWAY;
1146 }
1147 rt6_set_from(rt, f6i);
1148#ifdef CONFIG_IPV6_SUBTREES
1149 rt->rt6i_src = f6i->fib6_src;
1150#endif
1151}
1152
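/* Walk back up the trie (descending into a subtree when one exists)
 * until a node carrying route info is found; returns NULL once the
 * top-level root is reached.
 */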
1153static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1154 struct in6_addr *saddr)
1155{
1156 struct fib6_node *pn, *sn;
1157 while (1) {
1158 if (fn->fn_flags & RTN_TL_ROOT)
1159 return NULL;
1160 pn = rcu_dereference(fn->parent);
1161 sn = FIB6_SUBTREE(pn);
1162 if (sn && sn != fn)
1163 fn = fib6_node_lookup(sn, NULL, saddr);
1164 else
1165 fn = pn;
1166 if (fn->fn_flags & RTN_RTINFO)
1167 return fn;
1168 }
1169}
1170
1171static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1172{
1173 struct rt6_info *rt = *prt;
1174
1175 if (dst_hold_safe(&rt->dst))
1176 return true;
1177 if (net) {
1178 rt = net->ipv6.ip6_null_entry;
1179 dst_hold(&rt->dst);
1180 } else {
1181 rt = NULL;
1182 }
1183 *prt = rt;
1184 return false;
1185}
1186
1187/* called with rcu_read_lock held */
1188static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1189{
1190 struct net_device *dev = res->nh->fib_nh_dev;
1191 struct fib6_info *f6i = res->f6i;
1192 unsigned short flags;
1193 struct rt6_info *nrt;
1194
1195 if (!fib6_info_hold_safe(f6i))
1196 goto fallback;
1197
1198 flags = fib6_info_dst_flags(f6i);
1199 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1200 if (!nrt) {
1201 fib6_info_release(f6i);
1202 goto fallback;
1203 }
1204
1205 ip6_rt_copy_init(nrt, res);
1206 return nrt;
1207
1208fallback:
1209 nrt = dev_net(dev)->ipv6.ip6_null_entry;
1210 dst_hold(&nrt->dst);
1211 return nrt;
1212}
1213
1214INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
1215 struct fib6_table *table,
1216 struct flowi6 *fl6,
1217 const struct sk_buff *skb,
1218 int flags)
1219{
1220 struct fib6_result res = {};
1221 struct fib6_node *fn;
1222 struct rt6_info *rt;
1223
1224 rcu_read_lock();
1225 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1226restart:
1227 res.f6i = rcu_dereference(fn->leaf);
1228 if (!res.f6i)
1229 res.f6i = net->ipv6.fib6_null_entry;
1230 else
1231 rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1232 flags);
1233
1234 if (res.f6i == net->ipv6.fib6_null_entry) {
1235 fn = fib6_backtrack(fn, &fl6->saddr);
1236 if (fn)
1237 goto restart;
1238
1239 rt = net->ipv6.ip6_null_entry;
1240 dst_hold(&rt->dst);
1241 goto out;
1242 } else if (res.fib6_flags & RTF_REJECT) {
1243 goto do_create;
1244 }
1245
1246 fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1247 fl6->flowi6_oif != 0, skb, flags);
1248
1249 /* Search through exception table */
1250 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1251 if (rt) {
1252 if (ip6_hold_safe(net, &rt))
1253 dst_use_noref(&rt->dst, jiffies);
1254 } else {
1255do_create:
1256 rt = ip6_create_rt_rcu(&res);
1257 }
1258
1259out:
1260 trace_fib6_table_lookup(net, &res, table, fl6);
1261
1262 rcu_read_unlock();
1263
1264 return rt;
1265}
1266
1267struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1268 const struct sk_buff *skb, int flags)
1269{
1270 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1271}
1272EXPORT_SYMBOL_GPL(ip6_route_lookup);
1273
1274struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1275 const struct in6_addr *saddr, int oif,
1276 const struct sk_buff *skb, int strict)
1277{
1278 struct flowi6 fl6 = {
1279 .flowi6_oif = oif,
1280 .daddr = *daddr,
1281 };
1282 struct dst_entry *dst;
1283 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1284
1285 if (saddr) {
1286 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1287 flags |= RT6_LOOKUP_F_HAS_SADDR;
1288 }
1289
1290 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1291 if (dst->error == 0)
1292 return dst_rt6_info(dst);
1293
1294 dst_release(dst);
1295
1296 return NULL;
1297}
1298EXPORT_SYMBOL(rt6_lookup);
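
/* Minimal usage sketch (illustrative only; net and daddr are
 * placeholders supplied by the caller):
 *
 *	struct rt6_info *rt;
 *
 *	rt = rt6_lookup(net, &daddr, NULL, 0, NULL, 0);
 *	if (rt) {
 *		... use rt->dst ...
 *		ip6_rt_put(rt);	(releases the reference taken above)
 *	}
 */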
1299
1300/* ip6_ins_rt is called with table->tb6_lock free; it acquires the lock itself.
1301 * It takes a new route entry; if the addition fails for any reason, the
1302 * route is released.
1303 * Caller must hold dst before calling it.
1304 */
1305
1306static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1307 struct netlink_ext_ack *extack)
1308{
1309 int err;
1310 struct fib6_table *table;
1311
1312 table = rt->fib6_table;
1313 spin_lock_bh(&table->tb6_lock);
1314 err = fib6_add(&table->tb6_root, rt, info, extack);
1315 spin_unlock_bh(&table->tb6_lock);
1316
1317 return err;
1318}
1319
1320int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1321{
1322 struct nl_info info = { .nl_net = net, };
1323
1324 return __ip6_ins_rt(rt, &info, NULL);
1325}
1326
1327static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1328 const struct in6_addr *daddr,
1329 const struct in6_addr *saddr)
1330{
1331 struct fib6_info *f6i = res->f6i;
1332 struct net_device *dev;
1333 struct rt6_info *rt;
1334
1335 /*
1336 * Clone the route.
1337 */
1338
1339 if (!fib6_info_hold_safe(f6i))
1340 return NULL;
1341
1342 dev = ip6_rt_get_dev_rcu(res);
1343 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1344 if (!rt) {
1345 fib6_info_release(f6i);
1346 return NULL;
1347 }
1348
1349 ip6_rt_copy_init(rt, res);
1350 rt->rt6i_flags |= RTF_CACHE;
1351 rt->rt6i_dst.addr = *daddr;
1352 rt->rt6i_dst.plen = 128;
1353
1354 if (!rt6_is_gw_or_nonexthop(res)) {
1355 if (f6i->fib6_dst.plen != 128 &&
1356 ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1357 rt->rt6i_flags |= RTF_ANYCAST;
1358#ifdef CONFIG_IPV6_SUBTREES
1359 if (rt->rt6i_src.plen && saddr) {
1360 rt->rt6i_src.addr = *saddr;
1361 rt->rt6i_src.plen = 128;
1362 }
1363#endif
1364 }
1365
1366 return rt;
1367}
1368
1369static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1370{
1371 struct fib6_info *f6i = res->f6i;
1372 unsigned short flags = fib6_info_dst_flags(f6i);
1373 struct net_device *dev;
1374 struct rt6_info *pcpu_rt;
1375
1376 if (!fib6_info_hold_safe(f6i))
1377 return NULL;
1378
1379 rcu_read_lock();
1380 dev = ip6_rt_get_dev_rcu(res);
1381 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
1382 rcu_read_unlock();
1383 if (!pcpu_rt) {
1384 fib6_info_release(f6i);
1385 return NULL;
1386 }
1387 ip6_rt_copy_init(pcpu_rt, res);
1388 pcpu_rt->rt6i_flags |= RTF_PCPU;
1389
1390 if (f6i->nh)
1391 pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
1392
1393 return pcpu_rt;
1394}
1395
1396static bool rt6_is_valid(const struct rt6_info *rt6)
1397{
1398 return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
1399}
1400
1401/* It should be called with rcu_read_lock() acquired */
1402static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1403{
1404 struct rt6_info *pcpu_rt;
1405
1406 pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
1407
1408 if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
1409 struct rt6_info *prev, **p;
1410
1411 p = this_cpu_ptr(res->nh->rt6i_pcpu);
1412 prev = xchg(p, NULL);
1413 if (prev) {
1414 dst_dev_put(&prev->dst);
1415 dst_release(&prev->dst);
1416 }
1417
1418 pcpu_rt = NULL;
1419 }
1420
1421 return pcpu_rt;
1422}
1423
1424static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1425 const struct fib6_result *res)
1426{
1427 struct rt6_info *pcpu_rt, *prev, **p;
1428
1429 pcpu_rt = ip6_rt_pcpu_alloc(res);
1430 if (!pcpu_rt)
1431 return NULL;
1432
1433 p = this_cpu_ptr(res->nh->rt6i_pcpu);
1434 prev = cmpxchg(p, NULL, pcpu_rt);
1435 BUG_ON(prev);
1436
1437 if (res->f6i->fib6_destroying) {
1438 struct fib6_info *from;
1439
1440 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1441 fib6_info_release(from);
1442 }
1443
1444 return pcpu_rt;
1445}
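
/* Descriptive note: callers run with bottom halves disabled on this CPU
 * and this is the only path that fills this CPU's slot, so nothing can
 * race the cmpxchg() above once rt6_get_pcpu_route() has observed an
 * empty slot; BUG_ON(prev) documents that invariant. The
 * fib6_destroying re-check closes the window where the owning fib6_info
 * began dying before the new entry was published: the destroy path may
 * already have swept the per-cpu slots, so the entry drops its 'from'
 * reference itself.
 */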
1446
1447/* exception hash table implementation */
1449static DEFINE_SPINLOCK(rt6_exception_lock);
1450
1451/* Remove rt6_ex from hash table and free the memory
1452 * Caller must hold rt6_exception_lock
1453 */
1454static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1455 struct rt6_exception *rt6_ex)
1456{
1457 struct fib6_info *from;
1458 struct net *net;
1459
1460 if (!bucket || !rt6_ex)
1461 return;
1462
1463 net = dev_net(rt6_ex->rt6i->dst.dev);
1464 net->ipv6.rt6_stats->fib_rt_cache--;
1465
1466	/* completely purge the exception to allow releasing the held resources:
1467	 * some [sk] cache may keep the dst around for an unlimited time
1468 */
1469 from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1470 fib6_info_release(from);
1471 dst_dev_put(&rt6_ex->rt6i->dst);
1472
1473 hlist_del_rcu(&rt6_ex->hlist);
1474 dst_release(&rt6_ex->rt6i->dst);
1475 kfree_rcu(rt6_ex, rcu);
1476 WARN_ON_ONCE(!bucket->depth);
1477 bucket->depth--;
1478}
1479
1480/* Remove oldest rt6_ex in bucket and free the memory
1481 * Caller must hold rt6_exception_lock
1482 */
1483static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1484{
1485 struct rt6_exception *rt6_ex, *oldest = NULL;
1486
1487 if (!bucket)
1488 return;
1489
1490 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1491 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1492 oldest = rt6_ex;
1493 }
1494 rt6_remove_exception(bucket, oldest);
1495}
1496
1497static u32 rt6_exception_hash(const struct in6_addr *dst,
1498 const struct in6_addr *src)
1499{
1500 static siphash_aligned_key_t rt6_exception_key;
1501 struct {
1502 struct in6_addr dst;
1503 struct in6_addr src;
1504 } __aligned(SIPHASH_ALIGNMENT) combined = {
1505 .dst = *dst,
1506 };
1507 u64 val;
1508
1509 net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
1510
1511#ifdef CONFIG_IPV6_SUBTREES
1512 if (src)
1513 combined.src = *src;
1514#endif
1515 val = siphash(&combined, sizeof(combined), &rt6_exception_key);
1516
1517 return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1518}
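
/* Bucket selection sketch: siphash over the (dst, src) pair, keyed by a
 * boot-time random key, is folded by hash_64() down to
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, so the result always lies in
 * [0, FIB6_EXCEPTION_BUCKET_SIZE). The keyed hash is what keeps remote
 * peers from aiming many exceptions at one chosen bucket.
 */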
1519
1520/* Helper function to find the cached rt in the hash table
1521 * and update bucket pointer to point to the bucket for this
1522 * (daddr, saddr) pair
1523 * Caller must hold rt6_exception_lock
1524 */
1525static struct rt6_exception *
1526__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1527 const struct in6_addr *daddr,
1528 const struct in6_addr *saddr)
1529{
1530 struct rt6_exception *rt6_ex;
1531 u32 hval;
1532
1533 if (!(*bucket) || !daddr)
1534 return NULL;
1535
1536 hval = rt6_exception_hash(daddr, saddr);
1537 *bucket += hval;
1538
1539 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1540 struct rt6_info *rt6 = rt6_ex->rt6i;
1541 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1542
1543#ifdef CONFIG_IPV6_SUBTREES
1544 if (matched && saddr)
1545 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1546#endif
1547 if (matched)
1548 return rt6_ex;
1549 }
1550 return NULL;
1551}
1552
1553/* Helper function to find the cached rt in the hash table
1554 * and update bucket pointer to point to the bucket for this
1555 * (daddr, saddr) pair
1556 * Caller must hold rcu_read_lock()
1557 */
1558static struct rt6_exception *
1559__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1560 const struct in6_addr *daddr,
1561 const struct in6_addr *saddr)
1562{
1563 struct rt6_exception *rt6_ex;
1564 u32 hval;
1565
1566 WARN_ON_ONCE(!rcu_read_lock_held());
1567
1568 if (!(*bucket) || !daddr)
1569 return NULL;
1570
1571 hval = rt6_exception_hash(daddr, saddr);
1572 *bucket += hval;
1573
1574 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1575 struct rt6_info *rt6 = rt6_ex->rt6i;
1576 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1577
1578#ifdef CONFIG_IPV6_SUBTREES
1579 if (matched && saddr)
1580 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1581#endif
1582 if (matched)
1583 return rt6_ex;
1584 }
1585 return NULL;
1586}
1587
1588static unsigned int fib6_mtu(const struct fib6_result *res)
1589{
1590 const struct fib6_nh *nh = res->nh;
1591 unsigned int mtu;
1592
1593 if (res->f6i->fib6_pmtu) {
1594 mtu = res->f6i->fib6_pmtu;
1595 } else {
1596 struct net_device *dev = nh->fib_nh_dev;
1597 struct inet6_dev *idev;
1598
1599 rcu_read_lock();
1600 idev = __in6_dev_get(dev);
1601 mtu = READ_ONCE(idev->cnf.mtu6);
1602 rcu_read_unlock();
1603 }
1604
1605 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1606
1607 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1608}
1609
1610#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
1611
1612/* used when the flushed bit is not relevant and only access to the bucket
1613 * is needed (i.e., all bucket users except rt6_insert_exception);
1614 *
1615 * called under rcu lock; sometimes called with rt6_exception_lock held
1616 */
1617static
1618struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
1619 spinlock_t *lock)
1620{
1621 struct rt6_exception_bucket *bucket;
1622
1623 if (lock)
1624 bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1625 lockdep_is_held(lock));
1626 else
1627 bucket = rcu_dereference(nh->rt6i_exception_bucket);
1628
1629 /* remove bucket flushed bit if set */
1630 if (bucket) {
1631 unsigned long p = (unsigned long)bucket;
1632
1633 p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
1634 bucket = (struct rt6_exception_bucket *)p;
1635 }
1636
1637 return bucket;
1638}
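
/* The flushed marker is plain pointer tagging (sketch): the bucket array
 * comes from kcalloc() and is at least word aligned, so bit 0 of the
 * pointer is guaranteed clear and can carry the flag:
 *
 *	tagged = (struct rt6_exception_bucket *)
 *		 ((unsigned long)bucket | FIB6_EXCEPTION_BUCKET_FLUSHED);
 *
 * Readers that only need the array strip the bit (this helper); the
 * insert path instead tests it to avoid resurrecting a flushed list.
 */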
1639
1640static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
1641{
1642 unsigned long p = (unsigned long)bucket;
1643
1644 return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
1645}
1646
1647/* called with rt6_exception_lock held */
1648static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
1649 spinlock_t *lock)
1650{
1651 struct rt6_exception_bucket *bucket;
1652 unsigned long p;
1653
1654 bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1655 lockdep_is_held(lock));
1656
1657 p = (unsigned long)bucket;
1658 p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
1659 bucket = (struct rt6_exception_bucket *)p;
1660 rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1661}
1662
1663static int rt6_insert_exception(struct rt6_info *nrt,
1664 const struct fib6_result *res)
1665{
1666 struct net *net = dev_net(nrt->dst.dev);
1667 struct rt6_exception_bucket *bucket;
1668 struct fib6_info *f6i = res->f6i;
1669 struct in6_addr *src_key = NULL;
1670 struct rt6_exception *rt6_ex;
1671 struct fib6_nh *nh = res->nh;
1672 int max_depth;
1673 int err = 0;
1674
1675 spin_lock_bh(&rt6_exception_lock);
1676
1677 bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1678 lockdep_is_held(&rt6_exception_lock));
1679 if (!bucket) {
1680 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1681 GFP_ATOMIC);
1682 if (!bucket) {
1683 err = -ENOMEM;
1684 goto out;
1685 }
1686 rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1687 } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
1688 err = -EINVAL;
1689 goto out;
1690 }
1691
1692#ifdef CONFIG_IPV6_SUBTREES
1693 /* fib6_src.plen != 0 indicates f6i is in subtree
1694 * and exception table is indexed by a hash of
1695 * both fib6_dst and fib6_src.
1696 * Otherwise, the exception table is indexed by
1697 * a hash of only fib6_dst.
1698 */
1699 if (f6i->fib6_src.plen)
1700 src_key = &nrt->rt6i_src.addr;
1701#endif
1702 /* rt6_mtu_change() might lower mtu on f6i.
1703 * Only insert this exception route if its mtu
1704 * is less than f6i's mtu value.
1705 */
1706 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1707 err = -EINVAL;
1708 goto out;
1709 }
1710
1711 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1712 src_key);
1713 if (rt6_ex)
1714 rt6_remove_exception(bucket, rt6_ex);
1715
1716 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1717 if (!rt6_ex) {
1718 err = -ENOMEM;
1719 goto out;
1720 }
1721 rt6_ex->rt6i = nrt;
1722 rt6_ex->stamp = jiffies;
1723 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1724 bucket->depth++;
1725 net->ipv6.rt6_stats->fib_rt_cache++;
1726
1727	/* Randomize the max depth to mitigate side-channel attacks. */
1728 max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
1729 while (bucket->depth > max_depth)
1730 rt6_exception_remove_oldest(bucket);
1731
1732out:
1733 spin_unlock_bh(&rt6_exception_lock);
1734
1735 /* Update fn->fn_sernum to invalidate all cached dst */
1736 if (!err) {
1737 spin_lock_bh(&f6i->fib6_table->tb6_lock);
1738 fib6_update_sernum(net, f6i);
1739 spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1740 fib6_force_start_gc(net);
1741 }
1742
1743 return err;
1744}
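
/* Depth bound example (assuming the usual FIB6_MAX_DEPTH of 5): the
 * per-insertion limit is drawn uniformly from [5, 9], so a remote peer
 * probing eviction behaviour cannot infer the exact chain depth of a
 * bucket, while every chain still stays short enough to scan cheaply.
 */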
1745
1746static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
1747{
1748 struct rt6_exception_bucket *bucket;
1749 struct rt6_exception *rt6_ex;
1750 struct hlist_node *tmp;
1751 int i;
1752
1753 spin_lock_bh(&rt6_exception_lock);
1754
1755 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1756 if (!bucket)
1757 goto out;
1758
1759	/* Prevent rt6_insert_exception() from recreating the bucket list */
1760 if (!from)
1761 fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
1762
1763 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1764 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
1765 if (!from ||
1766 rcu_access_pointer(rt6_ex->rt6i->from) == from)
1767 rt6_remove_exception(bucket, rt6_ex);
1768 }
1769 WARN_ON_ONCE(!from && bucket->depth);
1770 bucket++;
1771 }
1772out:
1773 spin_unlock_bh(&rt6_exception_lock);
1774}
1775
1776static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
1777{
1778 struct fib6_info *f6i = arg;
1779
1780 fib6_nh_flush_exceptions(nh, f6i);
1781
1782 return 0;
1783}
1784
1785void rt6_flush_exceptions(struct fib6_info *f6i)
1786{
1787 if (f6i->nh)
1788 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
1789 f6i);
1790 else
1791 fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
1792}
1793
1794/* Find cached rt in the hash table inside passed in rt
1795 * Caller has to hold rcu_read_lock()
1796 */
1797static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1798 const struct in6_addr *daddr,
1799 const struct in6_addr *saddr)
1800{
1801 const struct in6_addr *src_key = NULL;
1802 struct rt6_exception_bucket *bucket;
1803 struct rt6_exception *rt6_ex;
1804 struct rt6_info *ret = NULL;
1805
1806#ifdef CONFIG_IPV6_SUBTREES
1807	/* fib6_src.plen != 0 indicates f6i is in a subtree
1808 * and exception table is indexed by a hash of
1809 * both fib6_dst and fib6_src.
1810 * However, the src addr used to create the hash
1811 * might not be exactly the passed in saddr which
1812 * is a /128 addr from the flow.
1813 * So we need to use f6i->fib6_src to redo lookup
1814 * if the passed in saddr does not find anything.
1815 * (See the logic in ip6_rt_cache_alloc() on how
1816 * rt->rt6i_src is updated.)
1817 */
1818 if (res->f6i->fib6_src.plen)
1819 src_key = saddr;
1820find_ex:
1821#endif
1822 bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
1823 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1824
1825 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1826 ret = rt6_ex->rt6i;
1827
1828#ifdef CONFIG_IPV6_SUBTREES
1829 /* Use fib6_src as src_key and redo lookup */
1830 if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1831 src_key = &res->f6i->fib6_src.addr;
1832 goto find_ex;
1833 }
1834#endif
1835
1836 return ret;
1837}
1838
1839/* Remove the passed in cached rt from the hash table that contains it */
1840static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
1841 const struct rt6_info *rt)
1842{
1843 const struct in6_addr *src_key = NULL;
1844 struct rt6_exception_bucket *bucket;
1845 struct rt6_exception *rt6_ex;
1846 int err;
1847
1848 if (!rcu_access_pointer(nh->rt6i_exception_bucket))
1849 return -ENOENT;
1850
1851 spin_lock_bh(&rt6_exception_lock);
1852 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1853
1854#ifdef CONFIG_IPV6_SUBTREES
1855 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1856 * and exception table is indexed by a hash of
1857 * both rt6i_dst and rt6i_src.
1858 * Otherwise, the exception table is indexed by
1859 * a hash of only rt6i_dst.
1860 */
1861 if (plen)
1862 src_key = &rt->rt6i_src.addr;
1863#endif
1864 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1865 &rt->rt6i_dst.addr,
1866 src_key);
1867 if (rt6_ex) {
1868 rt6_remove_exception(bucket, rt6_ex);
1869 err = 0;
1870 } else {
1871 err = -ENOENT;
1872 }
1873
1874 spin_unlock_bh(&rt6_exception_lock);
1875 return err;
1876}
1877
1878struct fib6_nh_excptn_arg {
1879 struct rt6_info *rt;
1880 int plen;
1881};
1882
1883static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
1884{
1885 struct fib6_nh_excptn_arg *arg = _arg;
1886 int err;
1887
1888 err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
1889 if (err == 0)
1890 return 1;
1891
1892 return 0;
1893}
1894
1895static int rt6_remove_exception_rt(struct rt6_info *rt)
1896{
1897 struct fib6_info *from;
1898
1899 from = rcu_dereference(rt->from);
1900 if (!from || !(rt->rt6i_flags & RTF_CACHE))
1901 return -EINVAL;
1902
1903 if (from->nh) {
1904 struct fib6_nh_excptn_arg arg = {
1905 .rt = rt,
1906 .plen = from->fib6_src.plen
1907 };
1908 int rc;
1909
1910 /* rc = 1 means an entry was found */
1911 rc = nexthop_for_each_fib6_nh(from->nh,
1912 rt6_nh_remove_exception_rt,
1913 &arg);
1914 return rc ? 0 : -ENOENT;
1915 }
1916
1917 return fib6_nh_remove_exception(from->fib6_nh,
1918 from->fib6_src.plen, rt);
1919}
1920
1921/* Find rt6_ex which contains the passed in rt cache and
1922 * refresh its stamp
1923 */
1924static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
1925 const struct rt6_info *rt)
1926{
1927 const struct in6_addr *src_key = NULL;
1928 struct rt6_exception_bucket *bucket;
1929 struct rt6_exception *rt6_ex;
1930
1931 bucket = fib6_nh_get_excptn_bucket(nh, NULL);
1932#ifdef CONFIG_IPV6_SUBTREES
1933 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1934 * and exception table is indexed by a hash of
1935 * both rt6i_dst and rt6i_src.
1936 * Otherwise, the exception table is indexed by
1937 * a hash of only rt6i_dst.
1938 */
1939 if (plen)
1940 src_key = &rt->rt6i_src.addr;
1941#endif
1942 rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
1943 if (rt6_ex)
1944 rt6_ex->stamp = jiffies;
1945}
1946
1947struct fib6_nh_match_arg {
1948 const struct net_device *dev;
1949 const struct in6_addr *gw;
1950 struct fib6_nh *match;
1951};
1952
1953/* determine if fib6_nh has given device and gateway */
1954static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
1955{
1956 struct fib6_nh_match_arg *arg = _arg;
1957
1958 if (arg->dev != nh->fib_nh_dev ||
1959 (arg->gw && !nh->fib_nh_gw_family) ||
1960 (!arg->gw && nh->fib_nh_gw_family) ||
1961 (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
1962 return 0;
1963
1964 arg->match = nh;
1965
1966 /* found a match, break the loop */
1967 return 1;
1968}
1969
1970static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1971{
1972 struct fib6_info *from;
1973 struct fib6_nh *fib6_nh;
1974
1975 rcu_read_lock();
1976
1977 from = rcu_dereference(rt->from);
1978 if (!from || !(rt->rt6i_flags & RTF_CACHE))
1979 goto unlock;
1980
1981 if (from->nh) {
1982 struct fib6_nh_match_arg arg = {
1983 .dev = rt->dst.dev,
1984 .gw = &rt->rt6i_gateway,
1985 };
1986
1987 nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
1988
1989 if (!arg.match)
1990 goto unlock;
1991 fib6_nh = arg.match;
1992 } else {
1993 fib6_nh = from->fib6_nh;
1994 }
1995 fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
1996unlock:
1997 rcu_read_unlock();
1998}
1999
2000static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
2001 struct rt6_info *rt, int mtu)
2002{
2003 /* If the new MTU is lower than the route PMTU, this new MTU will be the
2004 * lowest MTU in the path: always allow updating the route PMTU to
2005 * reflect PMTU decreases.
2006 *
2007 * If the new MTU is higher, and the route PMTU is equal to the local
2008 * MTU, this means the old MTU is the lowest in the path, so allow
2009 * updating it: if other nodes now have lower MTUs, PMTU discovery will
2010 * handle this.
2011 */
2012
2013 if (dst_mtu(&rt->dst) >= mtu)
2014 return true;
2015
2016 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
2017 return true;
2018
2019 return false;
2020}
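
/* Worked example (illustrative numbers, assuming idev->cnf.mtu6 still
 * holds the old local MTU when this runs): lowering the device MTU from
 * 1500 to 1400 always applies, since dst_mtu() >= mtu. Raising it from
 * 1400 to 1500 applies only to routes whose PMTU equals the old local
 * 1400 - i.e. we were the bottleneck; a route with PMTU 1280 learned
 * from a remote hop keeps its value and is left to PMTU discovery.
 */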
2021
2022static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
2023 const struct fib6_nh *nh, int mtu)
2024{
2025 struct rt6_exception_bucket *bucket;
2026 struct rt6_exception *rt6_ex;
2027 int i;
2028
2029 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2030 if (!bucket)
2031 return;
2032
2033 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2034 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
2035 struct rt6_info *entry = rt6_ex->rt6i;
2036
2037 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
2038 * route), the metrics of its rt->from have already
2039 * been updated.
2040 */
2041 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
2042 rt6_mtu_change_route_allowed(idev, entry, mtu))
2043 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
2044 }
2045 bucket++;
2046 }
2047}
2048
2049#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2050
2051static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
2052 const struct in6_addr *gateway)
2053{
2054 struct rt6_exception_bucket *bucket;
2055 struct rt6_exception *rt6_ex;
2056 struct hlist_node *tmp;
2057 int i;
2058
2059 if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2060 return;
2061
2062 spin_lock_bh(&rt6_exception_lock);
2063 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2064 if (bucket) {
2065 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2066 hlist_for_each_entry_safe(rt6_ex, tmp,
2067 &bucket->chain, hlist) {
2068 struct rt6_info *entry = rt6_ex->rt6i;
2069
2070 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
2071 RTF_CACHE_GATEWAY &&
2072 ipv6_addr_equal(gateway,
2073 &entry->rt6i_gateway)) {
2074 rt6_remove_exception(bucket, rt6_ex);
2075 }
2076 }
2077 bucket++;
2078 }
2079 }
2080
2081 spin_unlock_bh(&rt6_exception_lock);
2082}
2083
2084static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
2085 struct rt6_exception *rt6_ex,
2086 struct fib6_gc_args *gc_args,
2087 unsigned long now)
2088{
2089 struct rt6_info *rt = rt6_ex->rt6i;
2090
2091	/* we are pruning and obsoleting aged-out and non-gateway exceptions
2092	 * even if others still hold references to them, so that on the next
2093	 * dst_check() such references can be dropped.
2094	 * EXPIRES exceptions - e.g. pmtu-generated ones - are pruned when
2095	 * expired, independently of their aging, as per RFC 8201 section 4
2096 */
2097 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
2098 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
2099 pr_debug("aging clone %p\n", rt);
2100 rt6_remove_exception(bucket, rt6_ex);
2101 return;
2102 }
2103 } else if (time_after(jiffies, rt->dst.expires)) {
2104 pr_debug("purging expired route %p\n", rt);
2105 rt6_remove_exception(bucket, rt6_ex);
2106 return;
2107 }
2108
2109 if (rt->rt6i_flags & RTF_GATEWAY) {
2110 struct neighbour *neigh;
2111
2112 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
2113
2114 if (!(neigh && (neigh->flags & NTF_ROUTER))) {
2115 pr_debug("purging route %p via non-router but gateway\n",
2116 rt);
2117 rt6_remove_exception(bucket, rt6_ex);
2118 return;
2119 }
2120 }
2121
2122 gc_args->more++;
2123}
2124
2125static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
2126 struct fib6_gc_args *gc_args,
2127 unsigned long now)
2128{
2129 struct rt6_exception_bucket *bucket;
2130 struct rt6_exception *rt6_ex;
2131 struct hlist_node *tmp;
2132 int i;
2133
2134 if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2135 return;
2136
2137 rcu_read_lock_bh();
2138 spin_lock(&rt6_exception_lock);
2139 bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2140 if (bucket) {
2141 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2142 hlist_for_each_entry_safe(rt6_ex, tmp,
2143 &bucket->chain, hlist) {
2144 rt6_age_examine_exception(bucket, rt6_ex,
2145 gc_args, now);
2146 }
2147 bucket++;
2148 }
2149 }
2150 spin_unlock(&rt6_exception_lock);
2151 rcu_read_unlock_bh();
2152}
2153
2154struct fib6_nh_age_excptn_arg {
2155 struct fib6_gc_args *gc_args;
2156 unsigned long now;
2157};
2158
2159static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
2160{
2161 struct fib6_nh_age_excptn_arg *arg = _arg;
2162
2163 fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
2164 return 0;
2165}
2166
2167void rt6_age_exceptions(struct fib6_info *f6i,
2168 struct fib6_gc_args *gc_args,
2169 unsigned long now)
2170{
2171 if (f6i->nh) {
2172 struct fib6_nh_age_excptn_arg arg = {
2173 .gc_args = gc_args,
2174 .now = now
2175 };
2176
2177 nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
2178 &arg);
2179 } else {
2180 fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
2181 }
2182}
2183
2184/* must be called with rcu lock held */
2185int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
2186 struct flowi6 *fl6, struct fib6_result *res, int strict)
2187{
2188 struct fib6_node *fn, *saved_fn;
2189
2190 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2191 saved_fn = fn;
2192
2193redo_rt6_select:
2194 rt6_select(net, fn, oif, res, strict);
2195 if (res->f6i == net->ipv6.fib6_null_entry) {
2196 fn = fib6_backtrack(fn, &fl6->saddr);
2197 if (fn)
2198 goto redo_rt6_select;
2199 else if (strict & RT6_LOOKUP_F_REACHABLE) {
2200 /* also consider unreachable route */
2201 strict &= ~RT6_LOOKUP_F_REACHABLE;
2202 fn = saved_fn;
2203 goto redo_rt6_select;
2204 }
2205 }
2206
2207 trace_fib6_table_lookup(net, res, table, fl6);
2208
2209 return 0;
2210}
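
/* Lookup flow sketch: longest match first, then fib6_backtrack() climbs
 * towards the root on a miss. If that still only yields the null entry
 * while RT6_LOOKUP_F_REACHABLE was requested, the selection is retried
 * once from the saved node with the reachability requirement dropped,
 * so a probably-unreachable router is still preferred over no route.
 */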
2211
2212struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
2213 int oif, struct flowi6 *fl6,
2214 const struct sk_buff *skb, int flags)
2215{
2216 struct fib6_result res = {};
2217 struct rt6_info *rt = NULL;
2218 int strict = 0;
2219
2220 WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
2221 !rcu_read_lock_held());
2222
2223 strict |= flags & RT6_LOOKUP_F_IFACE;
2224 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
2225 if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
2226 strict |= RT6_LOOKUP_F_REACHABLE;
2227
2228 rcu_read_lock();
2229
2230 fib6_table_lookup(net, table, oif, fl6, &res, strict);
2231 if (res.f6i == net->ipv6.fib6_null_entry)
2232 goto out;
2233
2234 fib6_select_path(net, &res, fl6, oif, false, skb, strict);
2235
2236	/* Search through exception table */
2237 rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
2238 if (rt) {
2239 goto out;
2240 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
2241 !res.nh->fib_nh_gw_family)) {
2242 /* Create a RTF_CACHE clone which will not be
2243 * owned by the fib6 tree. It is for the special case where
2244 * the daddr in the skb during the neighbor look-up is different
2245		 * from the fl6->daddr used to look up the route here.
2246 */
2247 rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
2248
2249 if (rt) {
2250 /* 1 refcnt is taken during ip6_rt_cache_alloc().
2251 * As rt6_uncached_list_add() does not consume refcnt,
2252 * this refcnt is always returned to the caller even
2253			 * if the caller sets the RT6_LOOKUP_F_DST_NOREF flag.
2254 */
2255 rt6_uncached_list_add(rt);
2256 rcu_read_unlock();
2257
2258 return rt;
2259 }
2260 } else {
2261 /* Get a percpu copy */
2262 local_bh_disable();
2263 rt = rt6_get_pcpu_route(&res);
2264
2265 if (!rt)
2266 rt = rt6_make_pcpu_route(net, &res);
2267
2268 local_bh_enable();
2269 }
2270out:
2271 if (!rt)
2272 rt = net->ipv6.ip6_null_entry;
2273 if (!(flags & RT6_LOOKUP_F_DST_NOREF))
2274 ip6_hold_safe(net, &rt);
2275 rcu_read_unlock();
2276
2277 return rt;
2278}
2279EXPORT_SYMBOL_GPL(ip6_pol_route);
2280
2281INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
2282 struct fib6_table *table,
2283 struct flowi6 *fl6,
2284 const struct sk_buff *skb,
2285 int flags)
2286{
2287 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
2288}
2289
2290struct dst_entry *ip6_route_input_lookup(struct net *net,
2291 struct net_device *dev,
2292 struct flowi6 *fl6,
2293 const struct sk_buff *skb,
2294 int flags)
2295{
2296 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
2297 flags |= RT6_LOOKUP_F_IFACE;
2298
2299 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
2300}
2301EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
2302
2303static void ip6_multipath_l3_keys(const struct sk_buff *skb,
2304 struct flow_keys *keys,
2305 struct flow_keys *flkeys)
2306{
2307 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
2308 const struct ipv6hdr *key_iph = outer_iph;
2309 struct flow_keys *_flkeys = flkeys;
2310 const struct ipv6hdr *inner_iph;
2311 const struct icmp6hdr *icmph;
2312 struct ipv6hdr _inner_iph;
2313 struct icmp6hdr _icmph;
2314
2315 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2316 goto out;
2317
2318 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2319 sizeof(_icmph), &_icmph);
2320 if (!icmph)
2321 goto out;
2322
2323 if (!icmpv6_is_err(icmph->icmp6_type))
2324 goto out;
2325
2326 inner_iph = skb_header_pointer(skb,
2327 skb_transport_offset(skb) + sizeof(*icmph),
2328 sizeof(_inner_iph), &_inner_iph);
2329 if (!inner_iph)
2330 goto out;
2331
2332 key_iph = inner_iph;
2333 _flkeys = NULL;
2334out:
2335 if (_flkeys) {
2336 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2337 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2338 keys->tags.flow_label = _flkeys->tags.flow_label;
2339 keys->basic.ip_proto = _flkeys->basic.ip_proto;
2340 } else {
2341 keys->addrs.v6addrs.src = key_iph->saddr;
2342 keys->addrs.v6addrs.dst = key_iph->daddr;
2343 keys->tags.flow_label = ip6_flowlabel(key_iph);
2344 keys->basic.ip_proto = key_iph->nexthdr;
2345 }
2346}
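
/* Rationale sketch: for ICMPv6 errors the hash is keyed on the embedded
 * (offending) header, so the error takes the same multipath leg as the
 * flow that triggered it. E.g. a Packet Too Big elicited by a TCP flow
 * A -> B hashes on (A, B) from the inner header rather than on the
 * (router, A) addresses of the outer ICMPv6 packet.
 */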
2347
2348static u32 rt6_multipath_custom_hash_outer(const struct net *net,
2349 const struct sk_buff *skb,
2350 bool *p_has_inner)
2351{
2352 u32 hash_fields = ip6_multipath_hash_fields(net);
2353 struct flow_keys keys, hash_keys;
2354
2355 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2356 return 0;
2357
2358 memset(&hash_keys, 0, sizeof(hash_keys));
2359 skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
2360
2361 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2362 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2363 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2364 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2365 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2366 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2367 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2368 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
2369 hash_keys.tags.flow_label = keys.tags.flow_label;
2370 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2371 hash_keys.ports.src = keys.ports.src;
2372 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2373 hash_keys.ports.dst = keys.ports.dst;
2374
2375 *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
2376 return flow_hash_from_keys(&hash_keys);
2377}
2378
2379static u32 rt6_multipath_custom_hash_inner(const struct net *net,
2380 const struct sk_buff *skb,
2381 bool has_inner)
2382{
2383 u32 hash_fields = ip6_multipath_hash_fields(net);
2384 struct flow_keys keys, hash_keys;
2385
2386 /* We assume the packet carries an encapsulation, but if none was
2387 * encountered during dissection of the outer flow, then there is no
2388 * point in calling the flow dissector again.
2389 */
2390 if (!has_inner)
2391 return 0;
2392
2393 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
2394 return 0;
2395
2396 memset(&hash_keys, 0, sizeof(hash_keys));
2397 skb_flow_dissect_flow_keys(skb, &keys, 0);
2398
2399 if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
2400 return 0;
2401
2402 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2403 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2404 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
2405 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2406 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
2407 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2408 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2409 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2410 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
2411 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2412 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
2413 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2414 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
2415 hash_keys.tags.flow_label = keys.tags.flow_label;
2416 }
2417
2418 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
2419 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2420 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
2421 hash_keys.ports.src = keys.ports.src;
2422 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
2423 hash_keys.ports.dst = keys.ports.dst;
2424
2425 return flow_hash_from_keys(&hash_keys);
2426}
2427
2428static u32 rt6_multipath_custom_hash_skb(const struct net *net,
2429 const struct sk_buff *skb)
2430{
2431 u32 mhash, mhash_inner;
2432 bool has_inner = true;
2433
2434 mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
2435 mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);
2436
2437 return jhash_2words(mhash, mhash_inner, 0);
2438}
2439
2440static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
2441 const struct flowi6 *fl6)
2442{
2443 u32 hash_fields = ip6_multipath_hash_fields(net);
2444 struct flow_keys hash_keys;
2445
2446 if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2447 return 0;
2448
2449 memset(&hash_keys, 0, sizeof(hash_keys));
2450 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2451 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2452 hash_keys.addrs.v6addrs.src = fl6->saddr;
2453 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2454 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2455 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2456 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2457 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
2458 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2459 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2460 hash_keys.ports.src = fl6->fl6_sport;
2461 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2462 hash_keys.ports.dst = fl6->fl6_dport;
2463
2464 return flow_hash_from_keys(&hash_keys);
2465}
2466
2467/* if skb is set, it will be used and fl6 can be NULL */
2468u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2469 const struct sk_buff *skb, struct flow_keys *flkeys)
2470{
2471 struct flow_keys hash_keys;
2472 u32 mhash = 0;
2473
2474 switch (ip6_multipath_hash_policy(net)) {
2475 case 0:
2476 memset(&hash_keys, 0, sizeof(hash_keys));
2477 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2478 if (skb) {
2479 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2480 } else {
2481 hash_keys.addrs.v6addrs.src = fl6->saddr;
2482 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2483 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2484 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2485 }
2486 mhash = flow_hash_from_keys(&hash_keys);
2487 break;
2488 case 1:
2489 if (skb) {
2490 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2491 struct flow_keys keys;
2492
2493 /* short-circuit if we already have L4 hash present */
2494 if (skb->l4_hash)
2495 return skb_get_hash_raw(skb) >> 1;
2496
2497 memset(&hash_keys, 0, sizeof(hash_keys));
2498
2499 if (!flkeys) {
2500 skb_flow_dissect_flow_keys(skb, &keys, flag);
2501 flkeys = &keys;
2502 }
2503 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2504 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2505 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2506 hash_keys.ports.src = flkeys->ports.src;
2507 hash_keys.ports.dst = flkeys->ports.dst;
2508 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2509 } else {
2510 memset(&hash_keys, 0, sizeof(hash_keys));
2511 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2512 hash_keys.addrs.v6addrs.src = fl6->saddr;
2513 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2514 hash_keys.ports.src = fl6->fl6_sport;
2515 hash_keys.ports.dst = fl6->fl6_dport;
2516 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2517 }
2518 mhash = flow_hash_from_keys(&hash_keys);
2519 break;
2520 case 2:
2521 memset(&hash_keys, 0, sizeof(hash_keys));
2522 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2523 if (skb) {
2524 struct flow_keys keys;
2525
2526 if (!flkeys) {
2527 skb_flow_dissect_flow_keys(skb, &keys, 0);
2528 flkeys = &keys;
2529 }
2530
2531 /* Inner can be v4 or v6 */
2532 if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2533 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2534 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2535 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2536 } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2537 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2538 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2539 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2540 hash_keys.tags.flow_label = flkeys->tags.flow_label;
2541 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2542 } else {
2543 /* Same as case 0 */
2544 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2545 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2546 }
2547 } else {
2548 /* Same as case 0 */
2549 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2550 hash_keys.addrs.v6addrs.src = fl6->saddr;
2551 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2552 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2553 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2554 }
2555 mhash = flow_hash_from_keys(&hash_keys);
2556 break;
2557 case 3:
2558 if (skb)
2559 mhash = rt6_multipath_custom_hash_skb(net, skb);
2560 else
2561 mhash = rt6_multipath_custom_hash_fl6(net, fl6);
2562 break;
2563 }
2564
2565 return mhash >> 1;
2566}
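
/* The policy cases above track the net.ipv6.fib_multipath_hash_policy
 * sysctl (see Documentation/networking/ip-sysctl.rst): 0 - L3 (saddr,
 * daddr, flow label, proto), 1 - L4 five-tuple, 2 - L3 taken from the
 * inner header when the packet is encapsulated, 3 - fields chosen via
 * net.ipv6.fib_multipath_hash_fields. The final '>> 1' keeps the hash
 * within 31 bits so it compares safely against the signed per-nexthop
 * upper bound used by path selection.
 */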
2567
2568/* Called with rcu held */
2569void ip6_route_input(struct sk_buff *skb)
2570{
2571 const struct ipv6hdr *iph = ipv6_hdr(skb);
2572 struct net *net = dev_net(skb->dev);
2573 int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
2574 struct ip_tunnel_info *tun_info;
2575 struct flowi6 fl6 = {
2576 .flowi6_iif = skb->dev->ifindex,
2577 .daddr = iph->daddr,
2578 .saddr = iph->saddr,
2579 .flowlabel = ip6_flowinfo(iph),
2580 .flowi6_mark = skb->mark,
2581 .flowi6_proto = iph->nexthdr,
2582 };
2583 struct flow_keys *flkeys = NULL, _flkeys;
2584
2585 tun_info = skb_tunnel_info(skb);
2586 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2587 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2588
2589 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2590 flkeys = &_flkeys;
2591
2592 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2593 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2594 skb_dst_drop(skb);
2595 skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
2596 &fl6, skb, flags));
2597}
2598
2599INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
2600 struct fib6_table *table,
2601 struct flowi6 *fl6,
2602 const struct sk_buff *skb,
2603 int flags)
2604{
2605 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2606}
2607
2608static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
2609 const struct sock *sk,
2610 struct flowi6 *fl6,
2611 int flags)
2612{
2613 bool any_src;
2614
2615 if (ipv6_addr_type(&fl6->daddr) &
2616 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2617 struct dst_entry *dst;
2618
2619 /* This function does not take refcnt on the dst */
2620 dst = l3mdev_link_scope_lookup(net, fl6);
2621 if (dst)
2622 return dst;
2623 }
2624
2625 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2626
2627 flags |= RT6_LOOKUP_F_DST_NOREF;
2628 any_src = ipv6_addr_any(&fl6->saddr);
2629 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2630 (fl6->flowi6_oif && any_src))
2631 flags |= RT6_LOOKUP_F_IFACE;
2632
2633 if (!any_src)
2634 flags |= RT6_LOOKUP_F_HAS_SADDR;
2635 else if (sk)
2636 flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
2637
2638 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2639}
2640
2641struct dst_entry *ip6_route_output_flags(struct net *net,
2642 const struct sock *sk,
2643 struct flowi6 *fl6,
2644 int flags)
2645{
2646 struct dst_entry *dst;
2647 struct rt6_info *rt6;
2648
2649 rcu_read_lock();
2650 dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
2651 rt6 = dst_rt6_info(dst);
2652 /* For dst cached in uncached_list, refcnt is already taken. */
2653 if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
2654 dst = &net->ipv6.ip6_null_entry->dst;
2655 dst_hold(dst);
2656 }
2657 rcu_read_unlock();
2658
2659 return dst;
2660}
2661EXPORT_SYMBOL_GPL(ip6_route_output_flags);
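
/* Typical caller pattern (sketch; ip6_update_pmtu() further down in this
 * file is a real instance). ip6_route_output() is the flags == 0 inline
 * wrapper around this function:
 *
 *	struct flowi6 fl6 = { .daddr = ..., .flowi6_oif = oif };
 *	struct dst_entry *dst;
 *
 *	dst = ip6_route_output(net, NULL, &fl6);
 *	if (!dst->error)
 *		... use dst ...
 *	dst_release(dst);
 */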
2662
2663struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2664{
2665 struct rt6_info *rt, *ort = dst_rt6_info(dst_orig);
2666 struct net_device *loopback_dev = net->loopback_dev;
2667 struct dst_entry *new = NULL;
2668
2669 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
2670 DST_OBSOLETE_DEAD, 0);
2671 if (rt) {
2672 rt6_info_init(rt);
2673 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2674
2675 new = &rt->dst;
2676 new->__use = 1;
2677 new->input = dst_discard;
2678 new->output = dst_discard_out;
2679
2680 dst_copy_metrics(new, &ort->dst);
2681
2682 rt->rt6i_idev = in6_dev_get(loopback_dev);
2683 rt->rt6i_gateway = ort->rt6i_gateway;
2684 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2685
2686 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2687#ifdef CONFIG_IPV6_SUBTREES
2688 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2689#endif
2690 }
2691
2692 dst_release(dst_orig);
2693 return new ? new : ERR_PTR(-ENOMEM);
2694}
2695
2696/*
2697 * Destination cache support functions
2698 */
2699
2700static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2701{
2702 u32 rt_cookie = 0;
2703
2704 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2705 return false;
2706
2707 if (fib6_check_expired(f6i))
2708 return false;
2709
2710 return true;
2711}
2712
2713static struct dst_entry *rt6_check(struct rt6_info *rt,
2714 struct fib6_info *from,
2715 u32 cookie)
2716{
2717 u32 rt_cookie = 0;
2718
2719 if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
2720 rt_cookie != cookie)
2721 return NULL;
2722
2723 if (rt6_check_expired(rt))
2724 return NULL;
2725
2726 return &rt->dst;
2727}
2728
2729static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2730 struct fib6_info *from,
2731 u32 cookie)
2732{
2733 if (!__rt6_check_expired(rt) &&
2734 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2735 fib6_check(from, cookie))
2736 return &rt->dst;
2737 else
2738 return NULL;
2739}
2740
2741INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
2742 u32 cookie)
2743{
2744 struct dst_entry *dst_ret;
2745 struct fib6_info *from;
2746 struct rt6_info *rt;
2747
2748 rt = dst_rt6_info(dst);
2749
2750 if (rt->sernum)
2751 return rt6_is_valid(rt) ? dst : NULL;
2752
2753 rcu_read_lock();
2754
2755	/* All IPv6 dsts are created with ->obsolete set to the value
2756 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2757 * into this function always.
2758 */
2759
2760 from = rcu_dereference(rt->from);
2761
2762 if (from && (rt->rt6i_flags & RTF_PCPU ||
2763 unlikely(!list_empty(&rt->dst.rt_uncached))))
2764 dst_ret = rt6_dst_from_check(rt, from, cookie);
2765 else
2766 dst_ret = rt6_check(rt, from, cookie);
2767
2768 rcu_read_unlock();
2769
2770 return dst_ret;
2771}
2772EXPORT_INDIRECT_CALLABLE(ip6_dst_check);
2773
2774static void ip6_negative_advice(struct sock *sk,
2775 struct dst_entry *dst)
2776{
2777 struct rt6_info *rt = dst_rt6_info(dst);
2778
2779 if (rt->rt6i_flags & RTF_CACHE) {
2780 rcu_read_lock();
2781 if (rt6_check_expired(rt)) {
2782 /* counteract the dst_release() in sk_dst_reset() */
2783 dst_hold(dst);
2784 sk_dst_reset(sk);
2785
2786 rt6_remove_exception_rt(rt);
2787 }
2788 rcu_read_unlock();
2789 return;
2790 }
2791 sk_dst_reset(sk);
2792}
2793
2794static void ip6_link_failure(struct sk_buff *skb)
2795{
2796 struct rt6_info *rt;
2797
2798 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2799
2800 rt = dst_rt6_info(skb_dst(skb));
2801 if (rt) {
2802 rcu_read_lock();
2803 if (rt->rt6i_flags & RTF_CACHE) {
2804 rt6_remove_exception_rt(rt);
2805 } else {
2806 struct fib6_info *from;
2807 struct fib6_node *fn;
2808
2809 from = rcu_dereference(rt->from);
2810 if (from) {
2811 fn = rcu_dereference(from->fib6_node);
2812 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2813 WRITE_ONCE(fn->fn_sernum, -1);
2814 }
2815 }
2816 rcu_read_unlock();
2817 }
2818}
2819
2820static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2821{
2822 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2823 struct fib6_info *from;
2824
2825 rcu_read_lock();
2826 from = rcu_dereference(rt0->from);
2827 if (from)
2828 rt0->dst.expires = from->expires;
2829 rcu_read_unlock();
2830 }
2831
2832 dst_set_expires(&rt0->dst, timeout);
2833 rt0->rt6i_flags |= RTF_EXPIRES;
2834}
2835
2836static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2837{
2838 struct net *net = dev_net(rt->dst.dev);
2839
2840 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2841 rt->rt6i_flags |= RTF_MODIFIED;
2842 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2843}
2844
2845static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2846{
2847 return !(rt->rt6i_flags & RTF_CACHE) &&
2848 (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2849}
2850
2851static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2852 const struct ipv6hdr *iph, u32 mtu,
2853 bool confirm_neigh)
2854{
2855 const struct in6_addr *daddr, *saddr;
2856 struct rt6_info *rt6 = dst_rt6_info(dst);
2857
2858	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU).
2859 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
2860 * [see also comment in rt6_mtu_change_route()]
2861 */
2862
2863 if (iph) {
2864 daddr = &iph->daddr;
2865 saddr = &iph->saddr;
2866 } else if (sk) {
2867 daddr = &sk->sk_v6_daddr;
2868 saddr = &inet6_sk(sk)->saddr;
2869 } else {
2870 daddr = NULL;
2871 saddr = NULL;
2872 }
2873
2874 if (confirm_neigh)
2875 dst_confirm_neigh(dst, daddr);
2876
2877 if (mtu < IPV6_MIN_MTU)
2878 return;
2879 if (mtu >= dst_mtu(dst))
2880 return;
2881
2882 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2883 rt6_do_update_pmtu(rt6, mtu);
2884 /* update rt6_ex->stamp for cache */
2885 if (rt6->rt6i_flags & RTF_CACHE)
2886 rt6_update_exception_stamp_rt(rt6);
2887 } else if (daddr) {
2888 struct fib6_result res = {};
2889 struct rt6_info *nrt6;
2890
2891 rcu_read_lock();
2892 res.f6i = rcu_dereference(rt6->from);
2893 if (!res.f6i)
2894 goto out_unlock;
2895
2896 res.fib6_flags = res.f6i->fib6_flags;
2897 res.fib6_type = res.f6i->fib6_type;
2898
2899 if (res.f6i->nh) {
2900 struct fib6_nh_match_arg arg = {
2901 .dev = dst->dev,
2902 .gw = &rt6->rt6i_gateway,
2903 };
2904
2905 nexthop_for_each_fib6_nh(res.f6i->nh,
2906 fib6_nh_find_match, &arg);
2907
2908			/* fib6_info uses a nexthop that has no fib6_nh matching
2909			 * the dst->dev + gw pair. Should be impossible.
2910 */
2911 if (!arg.match)
2912 goto out_unlock;
2913
2914 res.nh = arg.match;
2915 } else {
2916 res.nh = res.f6i->fib6_nh;
2917 }
2918
2919 nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2920 if (nrt6) {
2921 rt6_do_update_pmtu(nrt6, mtu);
2922 if (rt6_insert_exception(nrt6, &res))
2923 dst_release_immediate(&nrt6->dst);
2924 }
2925out_unlock:
2926 rcu_read_unlock();
2927 }
2928}
2929
2930static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2931 struct sk_buff *skb, u32 mtu,
2932 bool confirm_neigh)
2933{
2934 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
2935 confirm_neigh);
2936}
2937
2938void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2939 int oif, u32 mark, kuid_t uid)
2940{
2941 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2942 struct dst_entry *dst;
2943 struct flowi6 fl6 = {
2944 .flowi6_oif = oif,
2945 .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2946 .daddr = iph->daddr,
2947 .saddr = iph->saddr,
2948 .flowlabel = ip6_flowinfo(iph),
2949 .flowi6_uid = uid,
2950 };
2951
2952 dst = ip6_route_output(net, NULL, &fl6);
2953 if (!dst->error)
2954 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
2955 dst_release(dst);
2956}
2957EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2958
2959void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2960{
2961 int oif = sk->sk_bound_dev_if;
2962 struct dst_entry *dst;
2963
2964 if (!oif && skb->dev)
2965 oif = l3mdev_master_ifindex(skb->dev);
2966
2967 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
2968 sk->sk_uid);
2969
2970 dst = __sk_dst_get(sk);
2971 if (!dst || !dst->obsolete ||
2972 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2973 return;
2974
2975 bh_lock_sock(sk);
2976 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2977 ip6_datagram_dst_update(sk, false);
2978 bh_unlock_sock(sk);
2979}
2980EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2981
2982void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2983 const struct flowi6 *fl6)
2984{
2985#ifdef CONFIG_IPV6_SUBTREES
2986 struct ipv6_pinfo *np = inet6_sk(sk);
2987#endif
2988
2989 ip6_dst_store(sk, dst,
2990 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2991 &sk->sk_v6_daddr : NULL,
2992#ifdef CONFIG_IPV6_SUBTREES
2993 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2994 &np->saddr :
2995#endif
2996 NULL);
2997}
2998
2999static bool ip6_redirect_nh_match(const struct fib6_result *res,
3000 struct flowi6 *fl6,
3001 const struct in6_addr *gw,
3002 struct rt6_info **ret)
3003{
3004 const struct fib6_nh *nh = res->nh;
3005
3006 if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
3007 fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
3008 return false;
3009
3010 /* rt_cache's gateway might be different from its 'parent'
3011 * in the case of an ip redirect.
3012 * So we keep searching in the exception table if the gateway
3013 * is different.
3014 */
3015 if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
3016 struct rt6_info *rt_cache;
3017
3018 rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
3019 if (rt_cache &&
3020 ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
3021 *ret = rt_cache;
3022 return true;
3023 }
3024 return false;
3025 }
3026 return true;
3027}
3028
3029struct fib6_nh_rd_arg {
3030 struct fib6_result *res;
3031 struct flowi6 *fl6;
3032 const struct in6_addr *gw;
3033 struct rt6_info **ret;
3034};
3035
3036static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
3037{
3038 struct fib6_nh_rd_arg *arg = _arg;
3039
3040 arg->res->nh = nh;
3041 return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
3042}
3043
3044/* Handle redirects */
3045struct ip6rd_flowi {
3046 struct flowi6 fl6;
3047 struct in6_addr gateway;
3048};
3049
3050INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
3051 struct fib6_table *table,
3052 struct flowi6 *fl6,
3053 const struct sk_buff *skb,
3054 int flags)
3055{
3056 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
3057 struct rt6_info *ret = NULL;
3058 struct fib6_result res = {};
3059 struct fib6_nh_rd_arg arg = {
3060 .res = &res,
3061 .fl6 = fl6,
3062 .gw = &rdfl->gateway,
3063 .ret = &ret
3064 };
3065 struct fib6_info *rt;
3066 struct fib6_node *fn;
3067
3068 /* Get the "current" route for this destination and
3069	 * check if the redirect has come from an appropriate router.
3070 *
3071 * RFC 4861 specifies that redirects should only be
3072 * accepted if they come from the nexthop to the target.
3073 * Due to the way the routes are chosen, this notion
3074 * is a bit fuzzy and one might need to check all possible
3075 * routes.
3076 */
3077
3078 rcu_read_lock();
3079 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
3080restart:
3081 for_each_fib6_node_rt_rcu(fn) {
3082 res.f6i = rt;
3083 if (fib6_check_expired(rt))
3084 continue;
3085 if (rt->fib6_flags & RTF_REJECT)
3086 break;
3087 if (unlikely(rt->nh)) {
3088 if (nexthop_is_blackhole(rt->nh))
3089 continue;
3090			/* on a match, res->nh is filled in and ret may be set */
3091 if (nexthop_for_each_fib6_nh(rt->nh,
3092 fib6_nh_redirect_match,
3093 &arg))
3094 goto out;
3095 } else {
3096 res.nh = rt->fib6_nh;
3097 if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
3098 &ret))
3099 goto out;
3100 }
3101 }
3102
3103 if (!rt)
3104 rt = net->ipv6.fib6_null_entry;
3105 else if (rt->fib6_flags & RTF_REJECT) {
3106 ret = net->ipv6.ip6_null_entry;
3107 goto out;
3108 }
3109
3110 if (rt == net->ipv6.fib6_null_entry) {
3111 fn = fib6_backtrack(fn, &fl6->saddr);
3112 if (fn)
3113 goto restart;
3114 }
3115
3116 res.f6i = rt;
3117 res.nh = rt->fib6_nh;
3118out:
3119 if (ret) {
3120 ip6_hold_safe(net, &ret);
3121 } else {
3122 res.fib6_flags = res.f6i->fib6_flags;
3123 res.fib6_type = res.f6i->fib6_type;
3124 ret = ip6_create_rt_rcu(&res);
3125 }
3126
3127 rcu_read_unlock();
3128
3129 trace_fib6_table_lookup(net, &res, table, fl6);
3130 return ret;
3131}
3132
3133static struct dst_entry *ip6_route_redirect(struct net *net,
3134 const struct flowi6 *fl6,
3135 const struct sk_buff *skb,
3136 const struct in6_addr *gateway)
3137{
3138 int flags = RT6_LOOKUP_F_HAS_SADDR;
3139 struct ip6rd_flowi rdfl;
3140
3141 rdfl.fl6 = *fl6;
3142 rdfl.gateway = *gateway;
3143
3144 return fib6_rule_lookup(net, &rdfl.fl6, skb,
3145 flags, __ip6_route_redirect);
3146}
3147
3148void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
3149 kuid_t uid)
3150{
3151 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
3152 struct dst_entry *dst;
3153 struct flowi6 fl6 = {
3154 .flowi6_iif = LOOPBACK_IFINDEX,
3155 .flowi6_oif = oif,
3156 .flowi6_mark = mark,
3157 .daddr = iph->daddr,
3158 .saddr = iph->saddr,
3159 .flowlabel = ip6_flowinfo(iph),
3160 .flowi6_uid = uid,
3161 };
3162
3163 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
3164 rt6_do_redirect(dst, NULL, skb);
3165 dst_release(dst);
3166}
3167EXPORT_SYMBOL_GPL(ip6_redirect);
3168
3169void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
3170{
3171 const struct ipv6hdr *iph = ipv6_hdr(skb);
3172 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
3173 struct dst_entry *dst;
3174 struct flowi6 fl6 = {
3175 .flowi6_iif = LOOPBACK_IFINDEX,
3176 .flowi6_oif = oif,
3177 .daddr = msg->dest,
3178 .saddr = iph->daddr,
3179 .flowi6_uid = sock_net_uid(net, NULL),
3180 };
3181
3182 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
3183 rt6_do_redirect(dst, NULL, skb);
3184 dst_release(dst);
3185}
3186
3187void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
3188{
3189 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
3190 READ_ONCE(sk->sk_mark), sk->sk_uid);
3191}
3192EXPORT_SYMBOL_GPL(ip6_sk_redirect);
3193
3194static unsigned int ip6_default_advmss(const struct dst_entry *dst)
3195{
3196 struct net_device *dev = dst->dev;
3197 unsigned int mtu = dst_mtu(dst);
3198 struct net *net = dev_net(dev);
3199
3200 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
3201
3202 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
3203 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
3204
3205 /*
3206 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
3207 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
3208 * IPV6_MAXPLEN is also valid and means: "any MSS,
3209 * rely only on pmtu discovery"
3210 */
3211 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
3212 mtu = IPV6_MAXPLEN;
3213 return mtu;
3214}
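
/* Example: with a standard 1500-byte Ethernet MTU and the default
 * ip6_rt_min_advmss, the computation above yields
 *
 *	advmss = 1500 - sizeof(struct ipv6hdr) - sizeof(struct tcphdr)
 *	       = 1500 - 40 - 20 = 1440
 *
 * which is well below IPV6_MAXPLEN - sizeof(struct tcphdr); only
 * jumbogram-capable links hit the IPV6_MAXPLEN clamp.
 */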
3215
3216INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
3217{
3218 return ip6_dst_mtu_maybe_forward(dst, false);
3219}
3220EXPORT_INDIRECT_CALLABLE(ip6_mtu);
3221
3222/* MTU selection:
3223 * 1. mtu on route is locked - use it
3224 * 2. mtu from nexthop exception
3225 * 3. mtu from egress device
3226 *
3227 * based on ip6_dst_mtu_forward and exception logic of
3228 * rt6_find_cached_rt; called with rcu_read_lock
3229 */
3230u32 ip6_mtu_from_fib6(const struct fib6_result *res,
3231 const struct in6_addr *daddr,
3232 const struct in6_addr *saddr)
3233{
3234 const struct fib6_nh *nh = res->nh;
3235 struct fib6_info *f6i = res->f6i;
3236 struct inet6_dev *idev;
3237 struct rt6_info *rt;
3238 u32 mtu = 0;
3239
3240 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
3241 mtu = f6i->fib6_pmtu;
3242 if (mtu)
3243 goto out;
3244 }
3245
3246 rt = rt6_find_cached_rt(res, daddr, saddr);
3247 if (unlikely(rt)) {
3248 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
3249 } else {
3250 struct net_device *dev = nh->fib_nh_dev;
3251
3252 mtu = IPV6_MIN_MTU;
3253 idev = __in6_dev_get(dev);
3254 if (idev)
3255 mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6));
3256 }
3257
3258 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
3259out:
3260 return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
3261}
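
/* Example of the precedence above: a route installed with
 * "ip -6 route add 2001:db8::/64 dev eth0 mtu lock 1280" always reports
 * 1280; otherwise a PMTU learned from an ICMPv6 Packet Too Big (stored
 * as a nexthop exception) wins over the egress device MTU, which itself
 * is floored at IPV6_MIN_MTU (1280) and capped at IP6_MAX_MTU.
 */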
3262
3263struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
3264 struct flowi6 *fl6)
3265{
3266 struct dst_entry *dst;
3267 struct rt6_info *rt;
3268 struct inet6_dev *idev = in6_dev_get(dev);
3269 struct net *net = dev_net(dev);
3270
3271 if (unlikely(!idev))
3272 return ERR_PTR(-ENODEV);
3273
3274 rt = ip6_dst_alloc(net, dev, 0);
3275 if (unlikely(!rt)) {
3276 in6_dev_put(idev);
3277 dst = ERR_PTR(-ENOMEM);
3278 goto out;
3279 }
3280
3281 rt->dst.input = ip6_input;
3282 rt->dst.output = ip6_output;
3283 rt->rt6i_gateway = fl6->daddr;
3284 rt->rt6i_dst.addr = fl6->daddr;
3285 rt->rt6i_dst.plen = 128;
3286 rt->rt6i_idev = idev;
3287 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
3288
3289	/* Add this dst to uncached_list so that rt6_disable_ip() can
3290	 * properly release the net_device.
3291	 */
3292 rt6_uncached_list_add(rt);
3293
3294 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
3295
3296out:
3297 return dst;
3298}
3299
3300static void ip6_dst_gc(struct dst_ops *ops)
3301{
3302 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
3303 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
3304 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
3305 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
3306 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
3307 unsigned int val;
3308 int entries;
3309
3310 if (time_after(rt_last_gc + rt_min_interval, jiffies))
3311 goto out;
3312
3313 fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
3314 entries = dst_entries_get_slow(ops);
3315 if (entries < ops->gc_thresh)
3316 atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
3317out:
3318 val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
3319 atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
3320}
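
/* The expire value acts as a decaying backoff: assuming the default
 * ip6_rt_gc_timeout of 60*HZ and elasticity of 9, it resets to 30*HZ
 * once the table drops below gc_thresh, and each GC pass under sustained
 * pressure shrinks it by val >> 9 (i.e. multiplies it by 511/512), so
 * progressively younger cached entries become eligible for eviction.
 */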
3321
3322static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
3323 const struct in6_addr *gw_addr, u32 tbid,
3324 int flags, struct fib6_result *res)
3325{
3326 struct flowi6 fl6 = {
3327 .flowi6_oif = cfg->fc_ifindex,
3328 .daddr = *gw_addr,
3329 .saddr = cfg->fc_prefsrc,
3330 };
3331 struct fib6_table *table;
3332 int err;
3333
3334 table = fib6_get_table(net, tbid);
3335 if (!table)
3336 return -EINVAL;
3337
3338 if (!ipv6_addr_any(&cfg->fc_prefsrc))
3339 flags |= RT6_LOOKUP_F_HAS_SADDR;
3340
3341 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
3342
3343 err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
3344 if (!err && res->f6i != net->ipv6.fib6_null_entry)
3345 fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
3346 cfg->fc_ifindex != 0, NULL, flags);
3347
3348 return err;
3349}
3350
3351static int ip6_route_check_nh_onlink(struct net *net,
3352 struct fib6_config *cfg,
3353 const struct net_device *dev,
3354 struct netlink_ext_ack *extack)
3355{
3356 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
3357 const struct in6_addr *gw_addr = &cfg->fc_gateway;
3358 struct fib6_result res = {};
3359 int err;
3360
3361 err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
3362 if (!err && !(res.fib6_flags & RTF_REJECT) &&
3363 /* ignore match if it is the default route */
3364 !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
3365 (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
3366 NL_SET_ERR_MSG(extack,
3367 "Nexthop has invalid gateway or device mismatch");
3368 err = -EINVAL;
3369 }
3370
3371 return err;
3372}
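
/* Example: the check above backs nexthops installed with e.g.
 *
 *	ip -6 route add 2001:db8:1::/64 via 2001:db8:2::1 dev eth0 onlink
 *
 * With onlink, the gateway need not be covered by a connected prefix;
 * the lookup only rejects the nexthop when the gateway resolves to a
 * specific (non-default) route that is not a unicast route out of the
 * same device, e.g. a local address or a prefix via another interface.
 */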
3373
3374static int ip6_route_check_nh(struct net *net,
3375 struct fib6_config *cfg,
3376 struct net_device **_dev,
3377 netdevice_tracker *dev_tracker,
3378 struct inet6_dev **idev)
3379{
3380 const struct in6_addr *gw_addr = &cfg->fc_gateway;
3381 struct net_device *dev = _dev ? *_dev : NULL;
3382 int flags = RT6_LOOKUP_F_IFACE;
3383 struct fib6_result res = {};
3384 int err = -EHOSTUNREACH;
3385
3386 if (cfg->fc_table) {
3387 err = ip6_nh_lookup_table(net, cfg, gw_addr,
3388 cfg->fc_table, flags, &res);
3389		/* gw_addr must not itself require a gateway or resolve to a
3390		 * reject route. If a device is given, it must match the result.
3391		 */
3392 if (err || res.fib6_flags & RTF_REJECT ||
3393 res.nh->fib_nh_gw_family ||
3394 (dev && dev != res.nh->fib_nh_dev))
3395 err = -EHOSTUNREACH;
3396 }
3397
3398 if (err < 0) {
3399 struct flowi6 fl6 = {
3400 .flowi6_oif = cfg->fc_ifindex,
3401 .daddr = *gw_addr,
3402 };
3403
3404 err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
3405 if (err || res.fib6_flags & RTF_REJECT ||
3406 res.nh->fib_nh_gw_family)
3407 err = -EHOSTUNREACH;
3408
3409 if (err)
3410 return err;
3411
3412 fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
3413 cfg->fc_ifindex != 0, NULL, flags);
3414 }
3415
3416 err = 0;
3417 if (dev) {
3418 if (dev != res.nh->fib_nh_dev)
3419 err = -EHOSTUNREACH;
3420 } else {
3421 *_dev = dev = res.nh->fib_nh_dev;
3422 netdev_hold(dev, dev_tracker, GFP_ATOMIC);
3423 *idev = in6_dev_get(dev);
3424 }
3425
3426 return err;
3427}
3428
3429static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
3430 struct net_device **_dev,
3431 netdevice_tracker *dev_tracker,
3432 struct inet6_dev **idev,
3433 struct netlink_ext_ack *extack)
3434{
3435 const struct in6_addr *gw_addr = &cfg->fc_gateway;
3436 int gwa_type = ipv6_addr_type(gw_addr);
3437	bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL);
3438 const struct net_device *dev = *_dev;
3439 bool need_addr_check = !dev;
3440 int err = -EINVAL;
3441
3442	/* If gw_addr is local, we may fail to detect it while the
3443	 * address is still TENTATIVE (DAD in progress): rt6_lookup()
3444	 * will return the already-added prefix route via the interface
3445	 * the prefix route was assigned to, which might be non-loopback.
3446	 */
3447 if (dev &&
3448 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3449 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3450 goto out;
3451 }
3452
3453 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
3454		/* IPv6 strictly forbids using non-link-local
3455		 * addresses as nexthop addresses; otherwise, the
3456		 * router would not be able to send redirects.
3457		 * That is a good rule, but in some (rare!) circumstances
3458		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
3459		 * some exceptions. --ANK
3460		 * We allow IPv4-mapped nexthops to support RFC 4798-style
3461		 * addressing.
3462		 */
3463 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
3464 NL_SET_ERR_MSG(extack, "Invalid gateway address");
3465 goto out;
3466 }
3467
3468 rcu_read_lock();
3469
3470 if (cfg->fc_flags & RTNH_F_ONLINK)
3471 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
3472 else
3473 err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
3474 idev);
3475
3476 rcu_read_unlock();
3477
3478 if (err)
3479 goto out;
3480 }
3481
3482 /* reload in case device was changed */
3483 dev = *_dev;
3484
3485 err = -EINVAL;
3486 if (!dev) {
3487 NL_SET_ERR_MSG(extack, "Egress device not specified");
3488 goto out;
3489 } else if (dev->flags & IFF_LOOPBACK) {
3490 NL_SET_ERR_MSG(extack,
3491 "Egress device can not be loopback device for this route");
3492 goto out;
3493 }
3494
3495 /* if we did not check gw_addr above, do so now that the
3496 * egress device has been resolved.
3497 */
3498 if (need_addr_check &&
3499 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3500 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3501 goto out;
3502 }
3503
3504 err = 0;
3505out:
3506 return err;
3507}
3508
3509static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
3510{
3511 if ((flags & RTF_REJECT) ||
3512 (dev && (dev->flags & IFF_LOOPBACK) &&
3513 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3514 !(flags & (RTF_ANYCAST | RTF_LOCAL))))
3515 return true;
3516
3517 return false;
3518}
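
/* Example: besides explicit RTF_REJECT routes, this classifies e.g.
 *
 *	ip -6 route add 2001:db8::/32 dev lo
 *
 * as a reject route: a non-loopback prefix routed via the loopback
 * device would loop packets inside the stack, so ip6_route_info_create()
 * promotes it to RTF_REJECT | RTF_NONEXTHOP.
 */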
3519
3520int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
3521 struct fib6_config *cfg, gfp_t gfp_flags,
3522 struct netlink_ext_ack *extack)
3523{
3524 netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker;
3525 struct net_device *dev = NULL;
3526 struct inet6_dev *idev = NULL;
3527 int addr_type;
3528 int err;
3529
3530 fib6_nh->fib_nh_family = AF_INET6;
3531#ifdef CONFIG_IPV6_ROUTER_PREF
3532 fib6_nh->last_probe = jiffies;
3533#endif
3534 if (cfg->fc_is_fdb) {
3535 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3536 fib6_nh->fib_nh_gw_family = AF_INET6;
3537 return 0;
3538 }
3539
3540 err = -ENODEV;
3541 if (cfg->fc_ifindex) {
3542 dev = netdev_get_by_index(net, cfg->fc_ifindex,
3543 dev_tracker, gfp_flags);
3544 if (!dev)
3545 goto out;
3546 idev = in6_dev_get(dev);
3547 if (!idev)
3548 goto out;
3549 }
3550
3551 if (cfg->fc_flags & RTNH_F_ONLINK) {
3552 if (!dev) {
3553 NL_SET_ERR_MSG(extack,
3554 "Nexthop device required for onlink");
3555 goto out;
3556 }
3557
3558 if (!(dev->flags & IFF_UP)) {
3559 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3560 err = -ENETDOWN;
3561 goto out;
3562 }
3563
3564 fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3565 }
3566
3567 fib6_nh->fib_nh_weight = 1;
3568
3569	/* We cannot add true routes via loopback here;
3570	 * they would result in kernel looping. Promote them to reject routes.
3571	 */
3572 addr_type = ipv6_addr_type(&cfg->fc_dst);
3573 if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3574 /* hold loopback dev/idev if we haven't done so. */
3575 if (dev != net->loopback_dev) {
3576 if (dev) {
3577 netdev_put(dev, dev_tracker);
3578 in6_dev_put(idev);
3579 }
3580 dev = net->loopback_dev;
3581 netdev_hold(dev, dev_tracker, gfp_flags);
3582 idev = in6_dev_get(dev);
3583 if (!idev) {
3584 err = -ENODEV;
3585 goto out;
3586 }
3587 }
3588 goto pcpu_alloc;
3589 }
3590
3591 if (cfg->fc_flags & RTF_GATEWAY) {
3592 err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
3593 &idev, extack);
3594 if (err)
3595 goto out;
3596
3597 fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3598 fib6_nh->fib_nh_gw_family = AF_INET6;
3599 }
3600
3601 err = -ENODEV;
3602 if (!dev)
3603 goto out;
3604
3605 if (idev->cnf.disable_ipv6) {
3606 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3607 err = -EACCES;
3608 goto out;
3609 }
3610
3611 if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3612 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3613 err = -ENETDOWN;
3614 goto out;
3615 }
3616
3617 if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3618 !netif_carrier_ok(dev))
3619 fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3620
3621 err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
3622 cfg->fc_encap_type, cfg, gfp_flags, extack);
3623 if (err)
3624 goto out;
3625
3626pcpu_alloc:
3627 fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
3628 if (!fib6_nh->rt6i_pcpu) {
3629 err = -ENOMEM;
3630 goto out;
3631 }
3632
3633 fib6_nh->fib_nh_dev = dev;
3634 fib6_nh->fib_nh_oif = dev->ifindex;
3635 err = 0;
3636out:
3637 if (idev)
3638 in6_dev_put(idev);
3639
3640 if (err) {
3641 lwtstate_put(fib6_nh->fib_nh_lws);
3642 fib6_nh->fib_nh_lws = NULL;
3643 netdev_put(dev, dev_tracker);
3644 }
3645
3646 return err;
3647}
3648
3649void fib6_nh_release(struct fib6_nh *fib6_nh)
3650{
3651 struct rt6_exception_bucket *bucket;
3652
3653 rcu_read_lock();
3654
3655 fib6_nh_flush_exceptions(fib6_nh, NULL);
3656 bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
3657 if (bucket) {
3658 rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
3659 kfree(bucket);
3660 }
3661
3662 rcu_read_unlock();
3663
3664 fib6_nh_release_dsts(fib6_nh);
3665 free_percpu(fib6_nh->rt6i_pcpu);
3666
3667 fib_nh_common_release(&fib6_nh->nh_common);
3668}
3669
3670void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
3671{
3672 int cpu;
3673
3674 if (!fib6_nh->rt6i_pcpu)
3675 return;
3676
3677 for_each_possible_cpu(cpu) {
3678 struct rt6_info *pcpu_rt, **ppcpu_rt;
3679
3680 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3681 pcpu_rt = xchg(ppcpu_rt, NULL);
3682 if (pcpu_rt) {
3683 dst_dev_put(&pcpu_rt->dst);
3684 dst_release(&pcpu_rt->dst);
3685 }
3686 }
3687}
3688
3689static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3690 gfp_t gfp_flags,
3691 struct netlink_ext_ack *extack)
3692{
3693 struct net *net = cfg->fc_nlinfo.nl_net;
3694 struct fib6_info *rt = NULL;
3695 struct nexthop *nh = NULL;
3696 struct fib6_table *table;
3697 struct fib6_nh *fib6_nh;
3698 int err = -EINVAL;
3699 int addr_type;
3700
3701 /* RTF_PCPU is an internal flag; can not be set by userspace */
3702 if (cfg->fc_flags & RTF_PCPU) {
3703 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3704 goto out;
3705 }
3706
3707 /* RTF_CACHE is an internal flag; can not be set by userspace */
3708 if (cfg->fc_flags & RTF_CACHE) {
3709 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3710 goto out;
3711 }
3712
3713 if (cfg->fc_type > RTN_MAX) {
3714 NL_SET_ERR_MSG(extack, "Invalid route type");
3715 goto out;
3716 }
3717
3718 if (cfg->fc_dst_len > 128) {
3719 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3720 goto out;
3721 }
3722 if (cfg->fc_src_len > 128) {
3723 NL_SET_ERR_MSG(extack, "Invalid source address length");
3724 goto out;
3725 }
3726#ifndef CONFIG_IPV6_SUBTREES
3727 if (cfg->fc_src_len) {
3728 NL_SET_ERR_MSG(extack,
3729 "Specifying source address requires IPV6_SUBTREES to be enabled");
3730 goto out;
3731 }
3732#endif
3733 if (cfg->fc_nh_id) {
3734 nh = nexthop_find_by_id(net, cfg->fc_nh_id);
3735 if (!nh) {
3736 NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
3737 goto out;
3738 }
3739 err = fib6_check_nexthop(nh, cfg, extack);
3740 if (err)
3741 goto out;
3742 }
3743
3744 err = -ENOBUFS;
3745 if (cfg->fc_nlinfo.nlh &&
3746 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3747 table = fib6_get_table(net, cfg->fc_table);
3748 if (!table) {
3749 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3750 table = fib6_new_table(net, cfg->fc_table);
3751 }
3752 } else {
3753 table = fib6_new_table(net, cfg->fc_table);
3754 }
3755
3756 if (!table)
3757 goto out;
3758
3759 err = -ENOMEM;
3760 rt = fib6_info_alloc(gfp_flags, !nh);
3761 if (!rt)
3762 goto out;
3763
3764 rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3765 extack);
3766 if (IS_ERR(rt->fib6_metrics)) {
3767 err = PTR_ERR(rt->fib6_metrics);
3768 /* Do not leave garbage there. */
3769 rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3770 goto out_free;
3771 }
3772
3773 if (cfg->fc_flags & RTF_ADDRCONF)
3774 rt->dst_nocount = true;
3775
3776 if (cfg->fc_flags & RTF_EXPIRES)
3777 fib6_set_expires(rt, jiffies +
3778 clock_t_to_jiffies(cfg->fc_expires));
3779
3780 if (cfg->fc_protocol == RTPROT_UNSPEC)
3781 cfg->fc_protocol = RTPROT_BOOT;
3782 rt->fib6_protocol = cfg->fc_protocol;
3783
3784 rt->fib6_table = table;
3785 rt->fib6_metric = cfg->fc_metric;
3786 rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3787 rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3788
3789 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3790 rt->fib6_dst.plen = cfg->fc_dst_len;
3791
3792#ifdef CONFIG_IPV6_SUBTREES
3793 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3794 rt->fib6_src.plen = cfg->fc_src_len;
3795#endif
3796 if (nh) {
3797 if (rt->fib6_src.plen) {
3798 NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
3799 goto out_free;
3800 }
3801 if (!nexthop_get(nh)) {
3802 NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
3803 goto out_free;
3804 }
3805 rt->nh = nh;
3806 fib6_nh = nexthop_fib6_nh(rt->nh);
3807 } else {
3808 err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
3809 if (err)
3810 goto out;
3811
3812 fib6_nh = rt->fib6_nh;
3813
3814		/* We cannot add true routes via loopback here; they would
3815		 * result in kernel looping. Promote them to reject routes.
3816		 */
3817 addr_type = ipv6_addr_type(&cfg->fc_dst);
3818 if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
3819 addr_type))
3820 rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3821 }
3822
3823 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3824 struct net_device *dev = fib6_nh->fib_nh_dev;
3825
3826 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3827 NL_SET_ERR_MSG(extack, "Invalid source address");
3828 err = -EINVAL;
3829 goto out;
3830 }
3831 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3832 rt->fib6_prefsrc.plen = 128;
3833 } else
3834 rt->fib6_prefsrc.plen = 0;
3835
3836 return rt;
3837out:
3838 fib6_info_release(rt);
3839 return ERR_PTR(err);
3840out_free:
3841 ip_fib_metrics_put(rt->fib6_metrics);
3842 kfree(rt);
3843 return ERR_PTR(err);
3844}
3845
3846int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3847 struct netlink_ext_ack *extack)
3848{
3849 struct fib6_info *rt;
3850 int err;
3851
3852 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3853 if (IS_ERR(rt))
3854 return PTR_ERR(rt);
3855
3856 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3857 fib6_info_release(rt);
3858
3859 return err;
3860}
3861
3862static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3863{
3864 struct net *net = info->nl_net;
3865 struct fib6_table *table;
3866 int err;
3867
3868 if (rt == net->ipv6.fib6_null_entry) {
3869 err = -ENOENT;
3870 goto out;
3871 }
3872
3873 table = rt->fib6_table;
3874 spin_lock_bh(&table->tb6_lock);
3875 err = fib6_del(rt, info);
3876 spin_unlock_bh(&table->tb6_lock);
3877
3878out:
3879 fib6_info_release(rt);
3880 return err;
3881}
3882
3883int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
3884{
3885 struct nl_info info = {
3886 .nl_net = net,
3887 .skip_notify = skip_notify
3888 };
3889
3890 return __ip6_del_rt(rt, &info);
3891}
3892
3893static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3894{
3895 struct nl_info *info = &cfg->fc_nlinfo;
3896 struct net *net = info->nl_net;
3897 struct sk_buff *skb = NULL;
3898 struct fib6_table *table;
3899 int err = -ENOENT;
3900
3901 if (rt == net->ipv6.fib6_null_entry)
3902 goto out_put;
3903 table = rt->fib6_table;
3904 spin_lock_bh(&table->tb6_lock);
3905
3906 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3907 struct fib6_info *sibling, *next_sibling;
3908 struct fib6_node *fn;
3909
3910 /* prefer to send a single notification with all hops */
3911 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3912 if (skb) {
3913 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3914
3915 if (rt6_fill_node(net, skb, rt, NULL,
3916 NULL, NULL, 0, RTM_DELROUTE,
3917 info->portid, seq, 0) < 0) {
3918 kfree_skb(skb);
3919 skb = NULL;
3920 } else
3921 info->skip_notify = 1;
3922 }
3923
3924 /* 'rt' points to the first sibling route. If it is not the
3925 * leaf, then we do not need to send a notification. Otherwise,
3926 * we need to check if the last sibling has a next route or not
3927 * and emit a replace or delete notification, respectively.
3928 */
3929 info->skip_notify_kernel = 1;
3930 fn = rcu_dereference_protected(rt->fib6_node,
3931 lockdep_is_held(&table->tb6_lock));
3932 if (rcu_access_pointer(fn->leaf) == rt) {
3933 struct fib6_info *last_sibling, *replace_rt;
3934
3935 last_sibling = list_last_entry(&rt->fib6_siblings,
3936 struct fib6_info,
3937 fib6_siblings);
3938 replace_rt = rcu_dereference_protected(
3939 last_sibling->fib6_next,
3940 lockdep_is_held(&table->tb6_lock));
3941 if (replace_rt)
3942 call_fib6_entry_notifiers_replace(net,
3943 replace_rt);
3944 else
3945 call_fib6_multipath_entry_notifiers(net,
3946 FIB_EVENT_ENTRY_DEL,
3947 rt, rt->fib6_nsiblings,
3948 NULL);
3949 }
3950 list_for_each_entry_safe(sibling, next_sibling,
3951 &rt->fib6_siblings,
3952 fib6_siblings) {
3953 err = fib6_del(sibling, info);
3954 if (err)
3955 goto out_unlock;
3956 }
3957 }
3958
3959 err = fib6_del(rt, info);
3960out_unlock:
3961 spin_unlock_bh(&table->tb6_lock);
3962out_put:
3963 fib6_info_release(rt);
3964
3965 if (skb) {
3966 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3967 info->nlh, gfp_any());
3968 }
3969 return err;
3970}
3971
3972static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3973{
3974 int rc = -ESRCH;
3975
3976 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3977 goto out;
3978
3979 if (cfg->fc_flags & RTF_GATEWAY &&
3980 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3981 goto out;
3982
3983 rc = rt6_remove_exception_rt(rt);
3984out:
3985 return rc;
3986}
3987
3988static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
3989 struct fib6_nh *nh)
3990{
3991 struct fib6_result res = {
3992 .f6i = rt,
3993 .nh = nh,
3994 };
3995 struct rt6_info *rt_cache;
3996
3997 rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
3998 if (rt_cache)
3999 return __ip6_del_cached_rt(rt_cache, cfg);
4000
4001 return 0;
4002}
4003
4004struct fib6_nh_del_cached_rt_arg {
4005 struct fib6_config *cfg;
4006 struct fib6_info *f6i;
4007};
4008
4009static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
4010{
4011 struct fib6_nh_del_cached_rt_arg *arg = _arg;
4012 int rc;
4013
4014 rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
4015 return rc != -ESRCH ? rc : 0;
4016}
4017
4018static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
4019{
4020 struct fib6_nh_del_cached_rt_arg arg = {
4021 .cfg = cfg,
4022 .f6i = f6i
4023 };
4024
4025 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
4026}
4027
4028static int ip6_route_del(struct fib6_config *cfg,
4029 struct netlink_ext_ack *extack)
4030{
4031 struct fib6_table *table;
4032 struct fib6_info *rt;
4033 struct fib6_node *fn;
4034 int err = -ESRCH;
4035
4036 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
4037 if (!table) {
4038 NL_SET_ERR_MSG(extack, "FIB table does not exist");
4039 return err;
4040 }
4041
4042 rcu_read_lock();
4043
4044 fn = fib6_locate(&table->tb6_root,
4045 &cfg->fc_dst, cfg->fc_dst_len,
4046 &cfg->fc_src, cfg->fc_src_len,
4047 !(cfg->fc_flags & RTF_CACHE));
4048
4049 if (fn) {
4050 for_each_fib6_node_rt_rcu(fn) {
4051 struct fib6_nh *nh;
4052
4053 if (rt->nh && cfg->fc_nh_id &&
4054 rt->nh->id != cfg->fc_nh_id)
4055 continue;
4056
4057 if (cfg->fc_flags & RTF_CACHE) {
4058 int rc = 0;
4059
4060 if (rt->nh) {
4061 rc = ip6_del_cached_rt_nh(cfg, rt);
4062 } else if (cfg->fc_nh_id) {
4063 continue;
4064 } else {
4065 nh = rt->fib6_nh;
4066 rc = ip6_del_cached_rt(cfg, rt, nh);
4067 }
4068 if (rc != -ESRCH) {
4069 rcu_read_unlock();
4070 return rc;
4071 }
4072 continue;
4073 }
4074
4075 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
4076 continue;
4077 if (cfg->fc_protocol &&
4078 cfg->fc_protocol != rt->fib6_protocol)
4079 continue;
4080
4081 if (rt->nh) {
4082 if (!fib6_info_hold_safe(rt))
4083 continue;
4084 rcu_read_unlock();
4085
4086 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
4087 }
4088 if (cfg->fc_nh_id)
4089 continue;
4090
4091 nh = rt->fib6_nh;
4092 if (cfg->fc_ifindex &&
4093 (!nh->fib_nh_dev ||
4094 nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
4095 continue;
4096 if (cfg->fc_flags & RTF_GATEWAY &&
4097 !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
4098 continue;
4099 if (!fib6_info_hold_safe(rt))
4100 continue;
4101 rcu_read_unlock();
4102
4103			/* if a gateway was specified, delete only that one hop */
4104 if (cfg->fc_flags & RTF_GATEWAY)
4105 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
4106
4107 return __ip6_del_rt_siblings(rt, cfg);
4108 }
4109 }
4110 rcu_read_unlock();
4111
4112 return err;
4113}
4114
4115static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
4116{
4117 struct netevent_redirect netevent;
4118 struct rt6_info *rt, *nrt = NULL;
4119 struct fib6_result res = {};
4120 struct ndisc_options ndopts;
4121 struct inet6_dev *in6_dev;
4122 struct neighbour *neigh;
4123 struct rd_msg *msg;
4124 int optlen, on_link;
4125 u8 *lladdr;
4126
4127 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
4128 optlen -= sizeof(*msg);
4129
4130 if (optlen < 0) {
4131 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
4132 return;
4133 }
4134
4135 msg = (struct rd_msg *)icmp6_hdr(skb);
4136
4137 if (ipv6_addr_is_multicast(&msg->dest)) {
4138 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
4139 return;
4140 }
4141
4142 on_link = 0;
4143 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
4144 on_link = 1;
4145 } else if (ipv6_addr_type(&msg->target) !=
4146 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
4147 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
4148 return;
4149 }
4150
4151 in6_dev = __in6_dev_get(skb->dev);
4152 if (!in6_dev)
4153 return;
4154 if (READ_ONCE(in6_dev->cnf.forwarding) ||
4155 !READ_ONCE(in6_dev->cnf.accept_redirects))
4156 return;
4157
4158 /* RFC2461 8.1:
4159 * The IP source address of the Redirect MUST be the same as the current
4160 * first-hop router for the specified ICMP Destination Address.
4161 */
4162
4163 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
4164 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
4165 return;
4166 }
4167
4168 lladdr = NULL;
4169 if (ndopts.nd_opts_tgt_lladdr) {
4170 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
4171 skb->dev);
4172 if (!lladdr) {
4173 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
4174 return;
4175 }
4176 }
4177
4178 rt = dst_rt6_info(dst);
4179 if (rt->rt6i_flags & RTF_REJECT) {
4180 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
4181 return;
4182 }
4183
4184	/* Redirect received -> path was valid.
4185	 * Redirects are sent only in response to data packets,
4186	 * so this nexthop is apparently reachable. --ANK
4187	 */
4188 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
4189
4190 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
4191 if (!neigh)
4192 return;
4193
4194 /*
4195 * We have finally decided to accept it.
4196 */
4197
4198 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
4199 NEIGH_UPDATE_F_WEAK_OVERRIDE|
4200 NEIGH_UPDATE_F_OVERRIDE|
4201 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
4202 NEIGH_UPDATE_F_ISROUTER)),
4203 NDISC_REDIRECT, &ndopts);
4204
4205 rcu_read_lock();
4206 res.f6i = rcu_dereference(rt->from);
4207 if (!res.f6i)
4208 goto out;
4209
4210 if (res.f6i->nh) {
4211 struct fib6_nh_match_arg arg = {
4212 .dev = dst->dev,
4213 .gw = &rt->rt6i_gateway,
4214 };
4215
4216 nexthop_for_each_fib6_nh(res.f6i->nh,
4217 fib6_nh_find_match, &arg);
4218
4219		/* The fib6_info uses a nexthop that has no fib6_nh
4220		 * using dst->dev. This should be impossible.
4221		 */
4222 if (!arg.match)
4223 goto out;
4224 res.nh = arg.match;
4225 } else {
4226 res.nh = res.f6i->fib6_nh;
4227 }
4228
4229 res.fib6_flags = res.f6i->fib6_flags;
4230 res.fib6_type = res.f6i->fib6_type;
4231 nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
4232 if (!nrt)
4233 goto out;
4234
4235 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
4236 if (on_link)
4237 nrt->rt6i_flags &= ~RTF_GATEWAY;
4238
4239 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
4240
4241 /* rt6_insert_exception() will take care of duplicated exceptions */
4242 if (rt6_insert_exception(nrt, &res)) {
4243 dst_release_immediate(&nrt->dst);
4244 goto out;
4245 }
4246
4247 netevent.old = &rt->dst;
4248 netevent.new = &nrt->dst;
4249 netevent.daddr = &msg->dest;
4250 netevent.neigh = neigh;
4251 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
4252
4253out:
4254 rcu_read_unlock();
4255 neigh_release(neigh);
4256}
4257
4258#ifdef CONFIG_IPV6_ROUTE_INFO
4259static struct fib6_info *rt6_get_route_info(struct net *net,
4260 const struct in6_addr *prefix, int prefixlen,
4261 const struct in6_addr *gwaddr,
4262 struct net_device *dev)
4263{
4264 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4265 int ifindex = dev->ifindex;
4266 struct fib6_node *fn;
4267 struct fib6_info *rt = NULL;
4268 struct fib6_table *table;
4269
4270 table = fib6_get_table(net, tb_id);
4271 if (!table)
4272 return NULL;
4273
4274 rcu_read_lock();
4275 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
4276 if (!fn)
4277 goto out;
4278
4279 for_each_fib6_node_rt_rcu(fn) {
4280 /* these routes do not use nexthops */
4281 if (rt->nh)
4282 continue;
4283 if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
4284 continue;
4285 if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
4286 !rt->fib6_nh->fib_nh_gw_family)
4287 continue;
4288 if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
4289 continue;
4290 if (!fib6_info_hold_safe(rt))
4291 continue;
4292 break;
4293 }
4294out:
4295 rcu_read_unlock();
4296 return rt;
4297}
4298
4299static struct fib6_info *rt6_add_route_info(struct net *net,
4300 const struct in6_addr *prefix, int prefixlen,
4301 const struct in6_addr *gwaddr,
4302 struct net_device *dev,
4303 unsigned int pref)
4304{
4305 struct fib6_config cfg = {
4306 .fc_metric = IP6_RT_PRIO_USER,
4307 .fc_ifindex = dev->ifindex,
4308 .fc_dst_len = prefixlen,
4309 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
4310 RTF_UP | RTF_PREF(pref),
4311 .fc_protocol = RTPROT_RA,
4312 .fc_type = RTN_UNICAST,
4313 .fc_nlinfo.portid = 0,
4314 .fc_nlinfo.nlh = NULL,
4315 .fc_nlinfo.nl_net = net,
4316 };
4317
4318 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4319 cfg.fc_dst = *prefix;
4320 cfg.fc_gateway = *gwaddr;
4321
4322	/* We should treat it as a default route if the prefix length is 0. */
4323 if (!prefixlen)
4324 cfg.fc_flags |= RTF_DEFAULT;
4325
4326 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
4327
4328 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
4329}
4330#endif
4331
4332struct fib6_info *rt6_get_dflt_router(struct net *net,
4333 const struct in6_addr *addr,
4334 struct net_device *dev)
4335{
4336 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
4337 struct fib6_info *rt;
4338 struct fib6_table *table;
4339
4340 table = fib6_get_table(net, tb_id);
4341 if (!table)
4342 return NULL;
4343
4344 rcu_read_lock();
4345 for_each_fib6_node_rt_rcu(&table->tb6_root) {
4346 struct fib6_nh *nh;
4347
4348 /* RA routes do not use nexthops */
4349 if (rt->nh)
4350 continue;
4351
4352 nh = rt->fib6_nh;
4353 if (dev == nh->fib_nh_dev &&
4354 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
4355 ipv6_addr_equal(&nh->fib_nh_gw6, addr))
4356 break;
4357 }
4358 if (rt && !fib6_info_hold_safe(rt))
4359 rt = NULL;
4360 rcu_read_unlock();
4361 return rt;
4362}
4363
4364struct fib6_info *rt6_add_dflt_router(struct net *net,
4365 const struct in6_addr *gwaddr,
4366 struct net_device *dev,
4367 unsigned int pref,
4368 u32 defrtr_usr_metric,
4369 int lifetime)
4370{
4371 struct fib6_config cfg = {
4372 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
4373 .fc_metric = defrtr_usr_metric,
4374 .fc_ifindex = dev->ifindex,
4375 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
4376 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
4377 .fc_protocol = RTPROT_RA,
4378 .fc_type = RTN_UNICAST,
4379 .fc_nlinfo.portid = 0,
4380 .fc_nlinfo.nlh = NULL,
4381 .fc_nlinfo.nl_net = net,
4382 .fc_expires = jiffies_to_clock_t(lifetime * HZ),
4383 };
4384
4385 cfg.fc_gateway = *gwaddr;
4386
4387 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
4388 struct fib6_table *table;
4389
4390 table = fib6_get_table(dev_net(dev), cfg.fc_table);
4391 if (table)
4392 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
4393 }
4394
4395 return rt6_get_dflt_router(net, gwaddr, dev);
4396}
4397
4398static void __rt6_purge_dflt_routers(struct net *net,
4399 struct fib6_table *table)
4400{
4401 struct fib6_info *rt;
4402
4403restart:
4404 rcu_read_lock();
4405 for_each_fib6_node_rt_rcu(&table->tb6_root) {
4406 struct net_device *dev = fib6_info_nh_dev(rt);
4407 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
4408
4409 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
4410 (!idev || idev->cnf.accept_ra != 2) &&
4411 fib6_info_hold_safe(rt)) {
4412 rcu_read_unlock();
4413 ip6_del_rt(net, rt, false);
4414 goto restart;
4415 }
4416 }
4417 rcu_read_unlock();
4418
4419 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
4420}
4421
4422void rt6_purge_dflt_routers(struct net *net)
4423{
4424 struct fib6_table *table;
4425 struct hlist_head *head;
4426 unsigned int h;
4427
4428 rcu_read_lock();
4429
4430 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
4431 head = &net->ipv6.fib_table_hash[h];
4432 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
4433 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
4434 __rt6_purge_dflt_routers(net, table);
4435 }
4436 }
4437
4438 rcu_read_unlock();
4439}
4440
4441static void rtmsg_to_fib6_config(struct net *net,
4442 struct in6_rtmsg *rtmsg,
4443 struct fib6_config *cfg)
4444{
4445 *cfg = (struct fib6_config){
4446 .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
4447 : RT6_TABLE_MAIN,
4448 .fc_ifindex = rtmsg->rtmsg_ifindex,
4449 .fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
4450 .fc_expires = rtmsg->rtmsg_info,
4451 .fc_dst_len = rtmsg->rtmsg_dst_len,
4452 .fc_src_len = rtmsg->rtmsg_src_len,
4453 .fc_flags = rtmsg->rtmsg_flags,
4454 .fc_type = rtmsg->rtmsg_type,
4455
4456 .fc_nlinfo.nl_net = net,
4457
4458 .fc_dst = rtmsg->rtmsg_dst,
4459 .fc_src = rtmsg->rtmsg_src,
4460 .fc_gateway = rtmsg->rtmsg_gateway,
4461 };
4462}
4463
4464int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
4465{
4466 struct fib6_config cfg;
4467 int err;
4468
4469 if (cmd != SIOCADDRT && cmd != SIOCDELRT)
4470 return -EINVAL;
4471 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
4472 return -EPERM;
4473
4474 rtmsg_to_fib6_config(net, rtmsg, &cfg);
4475
4476 rtnl_lock();
4477 switch (cmd) {
4478 case SIOCADDRT:
4479 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
4480 break;
4481 case SIOCDELRT:
4482 err = ip6_route_del(&cfg, NULL);
4483 break;
4484 }
4485 rtnl_unlock();
4486 return err;
4487}
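
/* Sketch (hypothetical userspace helper, minimal error handling): how
 * the legacy SIOCADDRT path above is typically reached.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/ipv6_route.h>

static int add_ipv6_route(const struct in6_addr *dst, int plen,
			  const struct in6_addr *gw, const char *ifname)
{
	struct in6_rtmsg rt;
	int fd, err;

	memset(&rt, 0, sizeof(rt));
	rt.rtmsg_dst = *dst;			/* destination prefix */
	rt.rtmsg_dst_len = plen;
	rt.rtmsg_gateway = *gw;			/* nexthop router */
	rt.rtmsg_flags = RTF_UP | RTF_GATEWAY;
	rt.rtmsg_metric = 1;
	rt.rtmsg_ifindex = if_nametoindex(ifname);

	fd = socket(AF_INET6, SOCK_DGRAM, 0);	/* any IPv6 socket works */
	if (fd < 0)
		return -1;
	err = ioctl(fd, SIOCADDRT, &rt);	/* lands in ipv6_route_ioctl() */
	close(fd);
	return err;
}
#endif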
4488
4489/*
4490 * Drop the packet on the floor
4491 */
4492
4493static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
4494{
4495 struct dst_entry *dst = skb_dst(skb);
4496 struct net *net = dev_net(dst->dev);
4497 struct inet6_dev *idev;
4498 SKB_DR(reason);
4499 int type;
4500
4501 if (netif_is_l3_master(skb->dev) ||
4502 dst->dev == net->loopback_dev)
4503 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
4504 else
4505 idev = ip6_dst_idev(dst);
4506
4507 switch (ipstats_mib_noroutes) {
4508 case IPSTATS_MIB_INNOROUTES:
4509 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
4510 if (type == IPV6_ADDR_ANY) {
4511 SKB_DR_SET(reason, IP_INADDRERRORS);
4512 IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
4513 break;
4514 }
4515 SKB_DR_SET(reason, IP_INNOROUTES);
4516 fallthrough;
4517 case IPSTATS_MIB_OUTNOROUTES:
4518 SKB_DR_OR(reason, IP_OUTNOROUTES);
4519 IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
4520 break;
4521 }
4522
4523	/* Start over by dropping the dst for the l3mdev case */
4524 if (netif_is_l3_master(skb->dev))
4525 skb_dst_drop(skb);
4526
4527 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
4528 kfree_skb_reason(skb, reason);
4529 return 0;
4530}
4531
4532static int ip6_pkt_discard(struct sk_buff *skb)
4533{
4534 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
4535}
4536
4537static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4538{
4539 skb->dev = skb_dst(skb)->dev;
4540 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
4541}
4542
4543static int ip6_pkt_prohibit(struct sk_buff *skb)
4544{
4545 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
4546}
4547
4548static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4549{
4550 skb->dev = skb_dst(skb)->dev;
4551 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
4552}
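
/* These handlers back the reject-style route types; roughly:
 *
 *	ip -6 route add unreachable 2001:db8::/32  -> ip6_pkt_discard*(),
 *		ICMPv6 destination unreachable, code 0 (no route)
 *	ip -6 route add prohibit 2001:db8::/32     -> ip6_pkt_prohibit*(),
 *		ICMPv6 destination unreachable, code 1 (adm. prohibited)
 *
 * blackhole routes drop silently and are wired up elsewhere.
 */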
4553
4554/*
4555 * Allocate a dst for local (unicast / anycast) address.
4556 */
4557
4558struct fib6_info *addrconf_f6i_alloc(struct net *net,
4559 struct inet6_dev *idev,
4560 const struct in6_addr *addr,
4561 bool anycast, gfp_t gfp_flags,
4562 struct netlink_ext_ack *extack)
4563{
4564 struct fib6_config cfg = {
4565 .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
4566 .fc_ifindex = idev->dev->ifindex,
4567 .fc_flags = RTF_UP | RTF_NONEXTHOP,
4568 .fc_dst = *addr,
4569 .fc_dst_len = 128,
4570 .fc_protocol = RTPROT_KERNEL,
4571 .fc_nlinfo.nl_net = net,
4572 .fc_ignore_dev_down = true,
4573 };
4574 struct fib6_info *f6i;
4575
4576 if (anycast) {
4577 cfg.fc_type = RTN_ANYCAST;
4578 cfg.fc_flags |= RTF_ANYCAST;
4579 } else {
4580 cfg.fc_type = RTN_LOCAL;
4581 cfg.fc_flags |= RTF_LOCAL;
4582 }
4583
4584 f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
4585 if (!IS_ERR(f6i)) {
4586 f6i->dst_nocount = true;
4587
4588 if (!anycast &&
4589 (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
4590 READ_ONCE(idev->cnf.disable_policy)))
4591 f6i->dst_nopolicy = true;
4592 }
4593
4594 return f6i;
4595}
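
/* This is the helper behind the kernel-managed entries in the local
 * table; e.g. after "ip -6 addr add 2001:db8::1/64 dev eth0",
 * "ip -6 route show table local" lists something like
 *
 *	local 2001:db8::1 dev eth0 proto kernel metric 0 pref medium
 *
 * Anycast addresses get an equivalent "anycast ..." entry via the
 * RTF_ANYCAST branch above.
 */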
4596
4597/* remove deleted ip from prefsrc entries */
4598struct arg_dev_net_ip {
4599 struct net *net;
4600 struct in6_addr *addr;
4601};
4602
4603static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
4604{
4605 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
4606 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
4607
4608 if (!rt->nh &&
4609 rt != net->ipv6.fib6_null_entry &&
4610 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) &&
4611 !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) {
4612 spin_lock_bh(&rt6_exception_lock);
4613 /* remove prefsrc entry */
4614 rt->fib6_prefsrc.plen = 0;
4615 spin_unlock_bh(&rt6_exception_lock);
4616 }
4617 return 0;
4618}
4619
4620void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
4621{
4622 struct net *net = dev_net(ifp->idev->dev);
4623 struct arg_dev_net_ip adni = {
4624 .net = net,
4625 .addr = &ifp->addr,
4626 };
4627 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
4628}
4629
4630#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
4631
4632/* Remove routers and update dst entries when a gateway turns into a host. */
4633static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
4634{
4635 struct in6_addr *gateway = (struct in6_addr *)arg;
4636 struct fib6_nh *nh;
4637
4638 /* RA routes do not use nexthops */
4639 if (rt->nh)
4640 return 0;
4641
4642 nh = rt->fib6_nh;
4643 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
4644 nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
4645 return -1;
4646
4647	/* Further clean up cached routes in the exception table.
4648	 * This is needed because a cached route may have a different
4649	 * gateway than its 'parent' in the case of an IP redirect.
4650	 */
4651 fib6_nh_exceptions_clean_tohost(nh, gateway);
4652
4653 return 0;
4654}
4655
4656void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
4657{
4658 fib6_clean_all(net, fib6_clean_tohost, gateway);
4659}
4660
4661struct arg_netdev_event {
4662 const struct net_device *dev;
4663 union {
4664 unsigned char nh_flags;
4665 unsigned long event;
4666 };
4667};
4668
4669static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
4670{
4671 struct fib6_info *iter;
4672 struct fib6_node *fn;
4673
4674 fn = rcu_dereference_protected(rt->fib6_node,
4675 lockdep_is_held(&rt->fib6_table->tb6_lock));
4676 iter = rcu_dereference_protected(fn->leaf,
4677 lockdep_is_held(&rt->fib6_table->tb6_lock));
4678 while (iter) {
4679 if (iter->fib6_metric == rt->fib6_metric &&
4680 rt6_qualify_for_ecmp(iter))
4681 return iter;
4682 iter = rcu_dereference_protected(iter->fib6_next,
4683 lockdep_is_held(&rt->fib6_table->tb6_lock));
4684 }
4685
4686 return NULL;
4687}
4688
4689/* only called for fib entries with builtin fib6_nh */
4690static bool rt6_is_dead(const struct fib6_info *rt)
4691{
4692 if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
4693 (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
4694 ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
4695 return true;
4696
4697 return false;
4698}
4699
4700static int rt6_multipath_total_weight(const struct fib6_info *rt)
4701{
4702 struct fib6_info *iter;
4703 int total = 0;
4704
4705 if (!rt6_is_dead(rt))
4706 total += rt->fib6_nh->fib_nh_weight;
4707
4708 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
4709 if (!rt6_is_dead(iter))
4710 total += iter->fib6_nh->fib_nh_weight;
4711 }
4712
4713 return total;
4714}
4715
4716static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
4717{
4718 int upper_bound = -1;
4719
4720 if (!rt6_is_dead(rt)) {
4721 *weight += rt->fib6_nh->fib_nh_weight;
4722 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
4723 total) - 1;
4724 }
4725 atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
4726}
4727
4728static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
4729{
4730 struct fib6_info *iter;
4731 int weight = 0;
4732
4733 rt6_upper_bound_set(rt, &weight, total);
4734
4735 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4736 rt6_upper_bound_set(iter, &weight, total);
4737}
4738
4739void rt6_multipath_rebalance(struct fib6_info *rt)
4740{
4741 struct fib6_info *first;
4742 int total;
4743
4744	/* If the entire multipath route was marked for flushing,
4745	 * there is no need to rebalance upon the removal of every
4746	 * sibling route.
4747	 */
4748 if (!rt->fib6_nsiblings || rt->should_flush)
4749 return;
4750
4751 /* During lookup routes are evaluated in order, so we need to
4752 * make sure upper bounds are assigned from the first sibling
4753 * onwards.
4754 */
4755 first = rt6_multipath_first_sibling(rt);
4756 if (WARN_ON_ONCE(!first))
4757 return;
4758
4759 total = rt6_multipath_total_weight(first);
4760 rt6_multipath_upper_bound_set(first, total);
4761}
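
/* Worked example of the hash-threshold assignment: two siblings with
 * weights 1 and 2 give total = 3, so
 *
 *	nh0: upper_bound = round(1 * 2^31 / 3) - 1 = 715827882
 *	nh1: upper_bound = round(3 * 2^31 / 3) - 1 = 2147483647
 *
 * A flow whose 31-bit multipath hash is <= 715827882 selects nh0, the
 * rest select nh1: the intended 1:2 split. Dead nexthops keep an upper
 * bound of -1 and are never selected.
 */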
4762
4763static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4764{
4765 const struct arg_netdev_event *arg = p_arg;
4766 struct net *net = dev_net(arg->dev);
4767
4768 if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
4769 rt->fib6_nh->fib_nh_dev == arg->dev) {
4770 rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
4771 fib6_update_sernum_upto_root(net, rt);
4772 rt6_multipath_rebalance(rt);
4773 }
4774
4775 return 0;
4776}
4777
4778void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4779{
4780 struct arg_netdev_event arg = {
4781 .dev = dev,
4782 {
4783 .nh_flags = nh_flags,
4784 },
4785 };
4786
4787 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4788 arg.nh_flags |= RTNH_F_LINKDOWN;
4789
4790 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4791}
4792
4793/* only called for fib entries with inline fib6_nh */
4794static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4795 const struct net_device *dev)
4796{
4797 struct fib6_info *iter;
4798
4799 if (rt->fib6_nh->fib_nh_dev == dev)
4800 return true;
4801 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4802 if (iter->fib6_nh->fib_nh_dev == dev)
4803 return true;
4804
4805 return false;
4806}
4807
4808static void rt6_multipath_flush(struct fib6_info *rt)
4809{
4810 struct fib6_info *iter;
4811
4812 rt->should_flush = 1;
4813 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4814 iter->should_flush = 1;
4815}
4816
4817static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4818 const struct net_device *down_dev)
4819{
4820 struct fib6_info *iter;
4821 unsigned int dead = 0;
4822
4823 if (rt->fib6_nh->fib_nh_dev == down_dev ||
4824 rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4825 dead++;
4826 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4827 if (iter->fib6_nh->fib_nh_dev == down_dev ||
4828 iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4829 dead++;
4830
4831 return dead;
4832}
4833
4834static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4835 const struct net_device *dev,
4836 unsigned char nh_flags)
4837{
4838 struct fib6_info *iter;
4839
4840 if (rt->fib6_nh->fib_nh_dev == dev)
4841 rt->fib6_nh->fib_nh_flags |= nh_flags;
4842 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4843 if (iter->fib6_nh->fib_nh_dev == dev)
4844 iter->fib6_nh->fib_nh_flags |= nh_flags;
4845}
4846
4847/* called with write lock held for table with rt */
4848static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4849{
4850 const struct arg_netdev_event *arg = p_arg;
4851 const struct net_device *dev = arg->dev;
4852 struct net *net = dev_net(dev);
4853
4854 if (rt == net->ipv6.fib6_null_entry || rt->nh)
4855 return 0;
4856
4857 switch (arg->event) {
4858 case NETDEV_UNREGISTER:
4859 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4860 case NETDEV_DOWN:
4861 if (rt->should_flush)
4862 return -1;
4863 if (!rt->fib6_nsiblings)
4864 return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4865 if (rt6_multipath_uses_dev(rt, dev)) {
4866 unsigned int count;
4867
4868 count = rt6_multipath_dead_count(rt, dev);
4869 if (rt->fib6_nsiblings + 1 == count) {
4870 rt6_multipath_flush(rt);
4871 return -1;
4872 }
4873 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4874 RTNH_F_LINKDOWN);
4875 fib6_update_sernum(net, rt);
4876 rt6_multipath_rebalance(rt);
4877 }
4878 return -2;
4879 case NETDEV_CHANGE:
4880 if (rt->fib6_nh->fib_nh_dev != dev ||
4881 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4882 break;
4883 rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
4884 rt6_multipath_rebalance(rt);
4885 break;
4886 }
4887
4888 return 0;
4889}
4890
4891void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4892{
4893 struct arg_netdev_event arg = {
4894 .dev = dev,
4895 {
4896 .event = event,
4897 },
4898 };
4899 struct net *net = dev_net(dev);
4900
4901 if (net->ipv6.sysctl.skip_notify_on_dev_down)
4902 fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4903 else
4904 fib6_clean_all(net, fib6_ifdown, &arg);
4905}
4906
4907void rt6_disable_ip(struct net_device *dev, unsigned long event)
4908{
4909 rt6_sync_down_dev(dev, event);
4910 rt6_uncached_list_flush_dev(dev);
4911 neigh_ifdown(&nd_tbl, dev);
4912}
4913
4914struct rt6_mtu_change_arg {
4915 struct net_device *dev;
4916 unsigned int mtu;
4917 struct fib6_info *f6i;
4918};
4919
4920static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4921{
4922 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4923 struct fib6_info *f6i = arg->f6i;
4924
4925	/* For an administrative MTU increase, there is no way to discover
4926	 * an IPv6 PMTU increase, so the PMTU must be updated here.
4927	 * Since RFC 1981 doesn't cover administrative MTU increases,
4928	 * updating PMTU on such an increase is a MUST (e.g. jumbo frames).
4929	 */
4930 if (nh->fib_nh_dev == arg->dev) {
4931 struct inet6_dev *idev = __in6_dev_get(arg->dev);
4932 u32 mtu = f6i->fib6_pmtu;
4933
4934 if (mtu >= arg->mtu ||
4935 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4936 fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4937
4938 spin_lock_bh(&rt6_exception_lock);
4939 rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4940 spin_unlock_bh(&rt6_exception_lock);
4941 }
4942
4943 return 0;
4944}
4945
4946static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
4947{
4948 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4949 struct inet6_dev *idev;
4950
4951	/* In IPv6, PMTU discovery is not optional,
4952	 * so the RTAX_MTU lock cannot disable it.
4953	 * We still use this lock to block changes
4954	 * caused by addrconf/ndisc.
4955	 */
4956
4957 idev = __in6_dev_get(arg->dev);
4958 if (!idev)
4959 return 0;
4960
4961 if (fib6_metric_locked(f6i, RTAX_MTU))
4962 return 0;
4963
4964 arg->f6i = f6i;
4965 if (f6i->nh) {
4966 /* fib6_nh_mtu_change only returns 0, so this is safe */
4967 return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
4968 arg);
4969 }
4970
4971 return fib6_nh_mtu_change(f6i->fib6_nh, arg);
4972}
4973
4974void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4975{
4976 struct rt6_mtu_change_arg arg = {
4977 .dev = dev,
4978 .mtu = mtu,
4979 };
4980
4981 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4982}
4983
4984static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4985 [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
4986 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4987 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4988 [RTA_OIF] = { .type = NLA_U32 },
4989 [RTA_IIF] = { .type = NLA_U32 },
4990 [RTA_PRIORITY] = { .type = NLA_U32 },
4991 [RTA_METRICS] = { .type = NLA_NESTED },
4992 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4993 [RTA_PREF] = { .type = NLA_U8 },
4994 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4995 [RTA_ENCAP] = { .type = NLA_NESTED },
4996 [RTA_EXPIRES] = { .type = NLA_U32 },
4997 [RTA_UID] = { .type = NLA_U32 },
4998 [RTA_MARK] = { .type = NLA_U32 },
4999 [RTA_TABLE] = { .type = NLA_U32 },
5000 [RTA_IP_PROTO] = { .type = NLA_U8 },
5001 [RTA_SPORT] = { .type = NLA_U16 },
5002 [RTA_DPORT] = { .type = NLA_U16 },
5003 [RTA_NH_ID] = { .type = NLA_U32 },
5004};
5005
5006static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
5007 struct fib6_config *cfg,
5008 struct netlink_ext_ack *extack)
5009{
5010 struct rtmsg *rtm;
5011 struct nlattr *tb[RTA_MAX+1];
5012 unsigned int pref;
5013 int err;
5014
5015 err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5016 rtm_ipv6_policy, extack);
5017 if (err < 0)
5018 goto errout;
5019
5020 err = -EINVAL;
5021 rtm = nlmsg_data(nlh);
5022
5023 if (rtm->rtm_tos) {
5024 NL_SET_ERR_MSG(extack,
5025 "Invalid dsfield (tos): option not available for IPv6");
5026 goto errout;
5027 }
5028
5029 *cfg = (struct fib6_config){
5030 .fc_table = rtm->rtm_table,
5031 .fc_dst_len = rtm->rtm_dst_len,
5032 .fc_src_len = rtm->rtm_src_len,
5033 .fc_flags = RTF_UP,
5034 .fc_protocol = rtm->rtm_protocol,
5035 .fc_type = rtm->rtm_type,
5036
5037 .fc_nlinfo.portid = NETLINK_CB(skb).portid,
5038 .fc_nlinfo.nlh = nlh,
5039 .fc_nlinfo.nl_net = sock_net(skb->sk),
5040 };
5041
5042 if (rtm->rtm_type == RTN_UNREACHABLE ||
5043 rtm->rtm_type == RTN_BLACKHOLE ||
5044 rtm->rtm_type == RTN_PROHIBIT ||
5045 rtm->rtm_type == RTN_THROW)
5046 cfg->fc_flags |= RTF_REJECT;
5047
5048 if (rtm->rtm_type == RTN_LOCAL)
5049 cfg->fc_flags |= RTF_LOCAL;
5050
5051 if (rtm->rtm_flags & RTM_F_CLONED)
5052 cfg->fc_flags |= RTF_CACHE;
5053
5054 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
5055
5056 if (tb[RTA_NH_ID]) {
5057 if (tb[RTA_GATEWAY] || tb[RTA_OIF] ||
5058 tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
5059 NL_SET_ERR_MSG(extack,
5060 "Nexthop specification and nexthop id are mutually exclusive");
5061 goto errout;
5062 }
5063 cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
5064 }
5065
5066 if (tb[RTA_GATEWAY]) {
5067 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
5068 cfg->fc_flags |= RTF_GATEWAY;
5069 }
5070 if (tb[RTA_VIA]) {
5071 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
5072 goto errout;
5073 }
5074
5075 if (tb[RTA_DST]) {
5076 int plen = (rtm->rtm_dst_len + 7) >> 3;
5077
5078 if (nla_len(tb[RTA_DST]) < plen)
5079 goto errout;
5080
5081 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
5082 }
5083
5084 if (tb[RTA_SRC]) {
5085 int plen = (rtm->rtm_src_len + 7) >> 3;
5086
5087 if (nla_len(tb[RTA_SRC]) < plen)
5088 goto errout;
5089
5090 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
5091 }
5092
5093 if (tb[RTA_PREFSRC])
5094 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
5095
5096 if (tb[RTA_OIF])
5097 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
5098
5099 if (tb[RTA_PRIORITY])
5100 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
5101
5102 if (tb[RTA_METRICS]) {
5103 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
5104 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
5105 }
5106
5107 if (tb[RTA_TABLE])
5108 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
5109
5110 if (tb[RTA_MULTIPATH]) {
5111 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
5112 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
5113
5114 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
5115 cfg->fc_mp_len, extack);
5116 if (err < 0)
5117 goto errout;
5118 }
5119
5120 if (tb[RTA_PREF]) {
5121 pref = nla_get_u8(tb[RTA_PREF]);
5122 if (pref != ICMPV6_ROUTER_PREF_LOW &&
5123 pref != ICMPV6_ROUTER_PREF_HIGH)
5124 pref = ICMPV6_ROUTER_PREF_MEDIUM;
5125 cfg->fc_flags |= RTF_PREF(pref);
5126 }
5127
5128 if (tb[RTA_ENCAP])
5129 cfg->fc_encap = tb[RTA_ENCAP];
5130
5131 if (tb[RTA_ENCAP_TYPE]) {
5132 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
5133
5134 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
5135 if (err < 0)
5136 goto errout;
5137 }
5138
5139 if (tb[RTA_EXPIRES]) {
5140 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
5141
5142 if (addrconf_finite_timeout(timeout)) {
5143 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
5144 cfg->fc_flags |= RTF_EXPIRES;
5145 }
5146 }
5147
5148 err = 0;
5149errout:
5150 return err;
5151}
5152
5153struct rt6_nh {
5154 struct fib6_info *fib6_info;
5155 struct fib6_config r_cfg;
5156 struct list_head next;
5157};
5158
5159static int ip6_route_info_append(struct net *net,
5160 struct list_head *rt6_nh_list,
5161 struct fib6_info *rt,
5162 struct fib6_config *r_cfg)
5163{
5164 struct rt6_nh *nh;
5165 int err = -EEXIST;
5166
5167 list_for_each_entry(nh, rt6_nh_list, next) {
5168 /* check if fib6_info already exists */
5169 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
5170 return err;
5171 }
5172
5173 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
5174 if (!nh)
5175 return -ENOMEM;
5176 nh->fib6_info = rt;
5177 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
5178 list_add_tail(&nh->next, rt6_nh_list);
5179
5180 return 0;
5181}
5182
5183static void ip6_route_mpath_notify(struct fib6_info *rt,
5184 struct fib6_info *rt_last,
5185 struct nl_info *info,
5186 __u16 nlflags)
5187{
5188	/* if this is an APPEND route, then rt points to the first route
5189	 * inserted and rt_last points to the last route inserted. Userspace
5190	 * wants a consistent dump of the route which starts at the first
5191	 * nexthop. Since sibling routes are always added at the end of
5192	 * the list, find the first sibling of the last route appended.
5193	 */
5194 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
5195 rt = list_first_entry(&rt_last->fib6_siblings,
5196 struct fib6_info,
5197 fib6_siblings);
5198 }
5199
5200 if (rt)
5201 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
5202}
5203
5204static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
5205{
5206 bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
5207 bool should_notify = false;
5208 struct fib6_info *leaf;
5209 struct fib6_node *fn;
5210
5211 rcu_read_lock();
5212 fn = rcu_dereference(rt->fib6_node);
5213 if (!fn)
5214 goto out;
5215
5216 leaf = rcu_dereference(fn->leaf);
5217 if (!leaf)
5218 goto out;
5219
5220 if (rt == leaf ||
5221 (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
5222 rt6_qualify_for_ecmp(leaf)))
5223 should_notify = true;
5224out:
5225 rcu_read_unlock();
5226
5227 return should_notify;
5228}
5229
5230static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
5231 struct netlink_ext_ack *extack)
5232{
5233 if (nla_len(nla) < sizeof(*gw)) {
5234 NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
5235 return -EINVAL;
5236 }
5237
5238 *gw = nla_get_in6_addr(nla);
5239
5240 return 0;
5241}
5242
5243static int ip6_route_multipath_add(struct fib6_config *cfg,
5244 struct netlink_ext_ack *extack)
5245{
5246 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
5247 struct nl_info *info = &cfg->fc_nlinfo;
5248 struct fib6_config r_cfg;
5249 struct rtnexthop *rtnh;
5250 struct fib6_info *rt;
5251 struct rt6_nh *err_nh;
5252 struct rt6_nh *nh, *nh_safe;
5253 __u16 nlflags;
5254 int remaining;
5255 int attrlen;
5256 int err = 1;
5257 int nhn = 0;
5258 int replace = (cfg->fc_nlinfo.nlh &&
5259 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
5260 LIST_HEAD(rt6_nh_list);
5261
5262 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
5263 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
5264 nlflags |= NLM_F_APPEND;
5265
5266 remaining = cfg->fc_mp_len;
5267 rtnh = (struct rtnexthop *)cfg->fc_mp;
5268
5269 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
5270 * fib6_info structs per nexthop
5271 */
5272 while (rtnh_ok(rtnh, remaining)) {
5273 memcpy(&r_cfg, cfg, sizeof(*cfg));
5274 if (rtnh->rtnh_ifindex)
5275 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5276
5277 attrlen = rtnh_attrlen(rtnh);
5278 if (attrlen > 0) {
5279 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5280
5281 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5282 if (nla) {
5283 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
5284 extack);
5285 if (err)
5286 goto cleanup;
5287
5288 r_cfg.fc_flags |= RTF_GATEWAY;
5289 }
5290 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
5291
5292 /* RTA_ENCAP_TYPE length checked in
5293 * lwtunnel_valid_encap_type_attr
5294 */
5295 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
5296 if (nla)
5297 r_cfg.fc_encap_type = nla_get_u16(nla);
5298 }
5299
5300 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
5301 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
5302 if (IS_ERR(rt)) {
5303 err = PTR_ERR(rt);
5304 rt = NULL;
5305 goto cleanup;
5306 }
5307 if (!rt6_qualify_for_ecmp(rt)) {
5308 err = -EINVAL;
5309 NL_SET_ERR_MSG(extack,
5310 "Device only routes can not be added for IPv6 using the multipath API.");
5311 fib6_info_release(rt);
5312 goto cleanup;
5313 }
5314
5315 rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
5316
5317 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
5318 rt, &r_cfg);
5319 if (err) {
5320 fib6_info_release(rt);
5321 goto cleanup;
5322 }
5323
5324 rtnh = rtnh_next(rtnh, &remaining);
5325 }
5326
5327 if (list_empty(&rt6_nh_list)) {
5328 NL_SET_ERR_MSG(extack,
5329 "Invalid nexthop configuration - no valid nexthops");
5330 return -EINVAL;
5331 }
5332
5333 /* for add and replace send one notification with all nexthops.
5334 * Skip the notification in fib6_add_rt2node and send one with
5335 * the full route when done
5336 */
5337 info->skip_notify = 1;
5338
5339 /* For add and replace, send one notification with all nexthops. For
5340 * append, send one notification with all appended nexthops.
5341 */
5342 info->skip_notify_kernel = 1;
5343
5344 err_nh = NULL;
5345 list_for_each_entry(nh, &rt6_nh_list, next) {
5346 err = __ip6_ins_rt(nh->fib6_info, info, extack);
5347
5348 if (err) {
5349 if (replace && nhn)
5350 NL_SET_ERR_MSG_MOD(extack,
5351 "multipath route replace failed (check consistency of installed routes)");
5352 err_nh = nh;
5353 goto add_errout;
5354 }
5355 /* save reference to last route successfully inserted */
5356 rt_last = nh->fib6_info;
5357
5358 /* save reference to first route for notification */
5359 if (!rt_notif)
5360 rt_notif = nh->fib6_info;
5361
5362 /* Because each route is added like a single route we remove
5363 * these flags after the first nexthop: if there is a collision,
5364 * we have already failed to add the first nexthop:
5365 * fib6_add_rt2node() has rejected it; when replacing, old
5366 * nexthops have been replaced by first new, the rest should
5367 * be added to it.
5368 */
5369 if (cfg->fc_nlinfo.nlh) {
5370 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
5371 NLM_F_REPLACE);
5372 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
5373 }
5374 nhn++;
5375 }
5376
5377 /* An in-kernel notification should only be sent in case the new
5378 * multipath route is added as the first route in the node, or if
5379 * it was appended to it. We pass 'rt_notif' since it is the first
5380 * sibling and might allow us to skip some checks in the replace case.
5381 */
5382 if (ip6_route_mpath_should_notify(rt_notif)) {
5383 enum fib_event_type fib_event;
5384
5385 if (rt_notif->fib6_nsiblings != nhn - 1)
5386 fib_event = FIB_EVENT_ENTRY_APPEND;
5387 else
5388 fib_event = FIB_EVENT_ENTRY_REPLACE;
5389
5390 err = call_fib6_multipath_entry_notifiers(info->nl_net,
5391 fib_event, rt_notif,
5392 nhn - 1, extack);
5393 if (err) {
5394 /* Delete all the siblings that were just added */
5395 err_nh = NULL;
5396 goto add_errout;
5397 }
5398 }
5399
5400 /* success ... tell user about new route */
5401 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5402 goto cleanup;
5403
5404add_errout:
5405 /* send notification for routes that were added so that
5406 * the delete notifications sent by ip6_route_del are
5407 * coherent
5408 */
5409 if (rt_notif)
5410 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5411
5412 /* Delete routes that were already added */
5413 list_for_each_entry(nh, &rt6_nh_list, next) {
5414 if (err_nh == nh)
5415 break;
5416 ip6_route_del(&nh->r_cfg, extack);
5417 }
5418
5419cleanup:
5420 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
5421 fib6_info_release(nh->fib6_info);
5422 list_del(&nh->next);
5423 kfree(nh);
5424 }
5425
5426 return err;
5427}
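
/* Wire format consumed by ip6_route_multipath_add() above (sketch): the
 * RTA_MULTIPATH payload is a sequence of struct rtnexthop entries, each
 * optionally followed by per-nexthop attributes, walked here with
 * rtnh_ok()/rtnh_next(). A hypothetical userspace encoder for one gateway
 * nexthop, using only <linux/rtnetlink.h> macros:
 *
 *	struct rtnexthop *rtnh = RTA_DATA(mp);	// mp: the RTA_MULTIPATH attr
 *	struct rtattr *rta = RTNH_DATA(rtnh);
 *
 *	rtnh->rtnh_flags = 0;
 *	rtnh->rtnh_hops = 0;		// weight 1: the kernel stores rtnh_hops + 1
 *	rtnh->rtnh_ifindex = ifindex;
 *	rta->rta_type = RTA_GATEWAY;
 *	rta->rta_len = RTA_LENGTH(16);	// full address, checked by fib6_gw_from_attr()
 *	memcpy(RTA_DATA(rta), &gw6, 16);
 *	rtnh->rtnh_len = RTNH_LENGTH(RTA_ALIGN(rta->rta_len));
 *
 * Repeating this per nexthop yields the list that becomes fib6_siblings.
 */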

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int last_err = 0;
	int remaining;
	int attrlen;
	int err;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
							extack);
				if (err) {
					last_err = err;
					goto next_rtnh;
				}

				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

next_rtnh:
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_nh_id &&
	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		return -EINVAL;
	}

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_metric == 0)
		cfg.fc_metric = IP6_RT_PRIO_USER;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

/* add the overhead of this fib6_nh to nexthop_len */
static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
	int *nexthop_len = arg;

	*nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */
			+ NLA_ALIGN(sizeof(struct rtnexthop))
			+ nla_total_size(16); /* RTA_GATEWAY */

	if (nh->fib_nh_lws) {
		/* RTA_ENCAP */
		*nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
		/* RTA_ENCAP_TYPE */
		*nexthop_len += nla_total_size(2);
	}

	return 0;
}

static size_t rt6_nlmsg_size(struct fib6_info *f6i)
{
	int nexthop_len;

	if (f6i->nh) {
		nexthop_len = nla_total_size(4); /* RTA_NH_ID */
		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
					 &nexthop_len);
	} else {
		struct fib6_info *sibling, *next_sibling;
		struct fib6_nh *nh = f6i->fib6_nh;

		nexthop_len = 0;
		if (f6i->fib6_nsiblings) {
			rt6_nh_nlmsg_size(nh, &nexthop_len);

			list_for_each_entry_safe(sibling, next_sibling,
						 &f6i->fib6_siblings, fib6_siblings) {
				rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
			}
		}
		nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + nexthop_len;
}
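
/* Worked example for the estimate above: nla_total_size(payload) is
 * NLA_ALIGN(NLA_HDRLEN + payload) with NLA_HDRLEN == 4, so each 16-byte
 * address attribute costs NLA_ALIGN(4 + 16) = 20 bytes and each u32 costs
 * NLA_ALIGN(4 + 4) = 8. The result is deliberately an upper bound:
 * rt6_fill_node() omits attributes that do not apply (e.g. RTA_GATEWAY for
 * gatewayless routes), but nlmsg_new() is sized for the worst case.
 */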

static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
				 unsigned char *flags)
{
	if (nexthop_is_multipath(nh)) {
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
			goto nla_put_failure;

		nla_nest_end(skb, mp);
	} else {
		struct fib6_nh *fib6_nh;

		fib6_nh = nexthop_fib6_nh(nh);
		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
				     flags, false) < 0)
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = dst_rt6_info(dst);
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	unsigned char nh_flags = 0;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;

		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;

		if (dst->lwtstate &&
		    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
				    rt->fib6_nh->fib_nh_weight, AF_INET6,
				    0) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
					    sibling->fib6_nh->fib_nh_weight,
					    AF_INET6, 0) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else if (rt->nh) {
		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
			goto nla_put_failure;

		if (nexthop_is_blackhole(rt->nh))
			rtm->rtm_type = RTN_BLACKHOLE;

		if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	} else {
		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (!dst) {
		if (READ_ONCE(rt->offload))
			rtm->rtm_flags |= RTM_F_OFFLOAD;
		if (READ_ONCE(rt->trap))
			rtm->rtm_flags |= RTM_F_TRAP;
		if (READ_ONCE(rt->offload_failed))
			rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
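
/* A hypothetical userspace reader for the message built above, iterating
 * attributes with the classic rtnetlink macros (sketch; nlh points at a
 * received RTM_NEWROUTE message, error handling elided):
 *
 *	struct rtmsg *r = NLMSG_DATA(nlh);
 *	int len = RTM_PAYLOAD(nlh);
 *	struct in6_addr dst = {};
 *	__u32 metric = 0;
 *	struct rtattr *rta;
 *
 *	for (rta = RTM_RTA(r); RTA_OK(rta, len); rta = RTA_NEXT(rta, len)) {
 *		switch (rta->rta_type) {
 *		case RTA_DST:
 *			memcpy(&dst, RTA_DATA(rta), sizeof(dst));
 *			break;
 *		case RTA_PRIORITY:
 *			metric = *(__u32 *)RTA_DATA(rta);
 *			break;
 *		}
 *	}
 */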

static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
{
	const struct net_device *dev = arg;

	if (nh->fib_nh_dev == dev)
		return 1;

	return 0;
}

static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
	if (f6i->nh) {
		struct net_device *_dev = (struct net_device *)dev;

		return !!nexthop_for_each_fib6_nh(f6i->nh,
						  fib6_info_nh_uses_dev,
						  _dev);
	}

	if (f6i->fib6_nh->fib_nh_dev == dev)
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
			if (sibling->fib6_nh->fib_nh_dev == dev)
				return true;
		}
	}

	return false;
}

struct fib6_nh_exception_dump_walker {
	struct rt6_rtnl_dump_arg *dump;
	struct fib6_info *rt;
	unsigned int flags;
	unsigned int skip;
	unsigned int count;
};

static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_nh_exception_dump_walker *w = arg;
	struct rt6_rtnl_dump_arg *dump = w->dump;
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i, err;

	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
	if (!bucket)
		return 0;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			if (w->skip) {
				w->skip--;
				continue;
			}

			/* Expiration of entries doesn't bump sernum, insertion
			 * does. Removal is triggered by insertion, so we can
			 * rely on the fact that if entries change between two
			 * partial dumps, this node is scanned again completely,
			 * see rt6_insert_exception() and fib6_dump_table().
			 *
			 * Count expired entries we go through as handled
			 * entries that we'll skip next time, in case of partial
			 * node dump. Otherwise, if entries expire meanwhile,
			 * we'll skip the wrong amount.
			 */
			if (rt6_check_expired(rt6_ex->rt6i)) {
				w->count++;
				continue;
			}

			err = rt6_fill_node(dump->net, dump->skb, w->rt,
					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
					    RTM_NEWROUTE,
					    NETLINK_CB(dump->cb->skb).portid,
					    dump->cb->nlh->nlmsg_seq, w->flags);
			if (err)
				return err;

			w->count++;
		}
		bucket++;
	}

	return 0;
}

/* Return -1 if done with node, number of handled routes on partial dump */
int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
	struct net *net = arg->net;
	int count = 0;

	if (rt == net->ipv6.fib6_null_entry)
		return -1;

	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return -1;
	}
	if (filter->filter_set &&
	    ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
	     (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
		return -1;
	}

	if (filter->filter_set ||
	    !filter->dump_routes || !filter->dump_exceptions) {
		flags |= NLM_F_DUMP_FILTERED;
	}

	if (filter->dump_routes) {
		if (skip) {
			skip--;
		} else {
			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
					  0, RTM_NEWROUTE,
					  NETLINK_CB(arg->cb->skb).portid,
					  arg->cb->nlh->nlmsg_seq, flags)) {
				return 0;
			}
			count++;
		}
	}

	if (filter->dump_exceptions) {
		struct fib6_nh_exception_dump_walker w = { .dump = arg,
							   .rt = rt,
							   .flags = flags,
							   .skip = skip,
							   .count = 0 };
		int err;

		rcu_read_lock();
		if (rt->nh) {
			err = nexthop_for_each_fib6_nh(rt->nh,
						       rt6_nh_dump_exceptions,
						       &w);
		} else {
			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
		}
		rcu_read_unlock();

		if (err)
			return count + w.count;
	}

	return -1;
}

static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}
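
/* Under strict checking, a valid request therefore sets rtm_family to
 * AF_INET6, leaves the other header fields zero, uses rtm_dst_len /
 * rtm_src_len of 0 or 128, and carries only the attributes accepted in the
 * switch above. Sketch of the header for a full-address lookup (addattr()
 * is a hypothetical helper appending one rtattr):
 *
 *	req.nlh.nlmsg_type = RTM_GETROUTE;
 *	req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *	req.rtm.rtm_family = AF_INET6;
 *	req.rtm.rtm_dst_len = 128;	// RTA_DST must hold a full address
 *	addattr(&req.nlh, RTA_DST, &addr, 16);
 */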

static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}

	rt = dst_rt6_info(dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

void fib6_rt_update(struct net *net, struct fib6_info *rt,
		    struct nl_info *info)
{
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
			    bool offload, bool trap, bool offload_failed)
{
	struct sk_buff *skb;
	int err;

	if (READ_ONCE(f6i->offload) == offload &&
	    READ_ONCE(f6i->trap) == trap &&
	    READ_ONCE(f6i->offload_failed) == offload_failed)
		return;

	WRITE_ONCE(f6i->offload, offload);
	WRITE_ONCE(f6i->trap, trap);

	/* 2 means send notifications only if offload_failed was changed. */
	if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
	    READ_ONCE(f6i->offload_failed) == offload_failed)
		return;

	WRITE_ONCE(f6i->offload_failed, offload_failed);

	if (!rcu_access_pointer(f6i->fib6_node))
		/* The route was removed from the tree, do not send
		 * notification.
		 */
		return;

	if (!net->ipv6.sysctl.fib_notify_on_flag_change)
		return;

	skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
			    0, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
	return;

errout:
	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
EXPORT_SYMBOL(fib6_info_hw_flags_set);

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 * /proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif /* CONFIG_PROC_FS */
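
/* The handler above backs /proc/net/rt6_stats; all seven fields are hex
 * (%04x) in the order printed. A minimal userspace reader (sketch, error
 * handling elided):
 *
 *	unsigned int nodes, rnodes, alloc, entries, cache, dsts, discarded;
 *	FILE *f = fopen("/proc/net/rt6_stats", "r");
 *
 *	if (f && fscanf(f, "%x %x %x %x %x %x %x", &nodes, &rnodes, &alloc,
 *			&entries, &cache, &dsts, &discarded) == 7)
 *		printf("fib nodes %u, route entries %u\n", nodes, entries);
 */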

#ifdef CONFIG_SYSCTL

static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}
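
/* This handler backs the write-only "flush" entry registered below as
 * /proc/sys/net/ipv6/route/flush: writing an integer to it runs
 * fib6_run_gc(). Userspace sketch (equivalent to
 * "echo -1 > /proc/sys/net/ipv6/route/flush"):
 *
 *	int fd = open("/proc/sys/net/ipv6/route/flush", O_WRONLY);
 *
 *	if (fd >= 0) {
 *		write(fd, "-1", 2);
 *		close(fd);
 *	}
 */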

static struct ctl_table ipv6_route_table_template[] = {
	{
		.procname = "max_size",
		.data = &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_thresh",
		.data = &ip6_dst_ops_template.gc_thresh,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "flush",
		.data = &init_net.ipv6.sysctl.flush_delay,
		.maxlen = sizeof(int),
		.mode = 0200,
		.proc_handler = ipv6_sysctl_rtcache_flush
	},
	{
		.procname = "gc_min_interval",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_timeout",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_interval",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "gc_elasticity",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "mtu_expires",
		.data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_jiffies,
	},
	{
		.procname = "min_adv_mss",
		.data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "gc_min_interval_ms",
		.data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen = sizeof(int),
		.mode = 0644,
		.proc_handler = proc_dointvec_ms_jiffies,
	},
	{
		.procname = "skip_notify_on_dev_down",
		.data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen = sizeof(u8),
		.mode = 0644,
		.proc_handler = proc_dou8vec_minmax,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.flush_delay;
		table[2].extra1 = net;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[1].procname = NULL;
	}

	return table;
}

size_t ipv6_route_sysctl_table_size(struct net *net)
{
	/* Don't export sysctls to unprivileged users */
	if (net->user_ns != &init_user_ns)
		return 1;

	return ARRAY_SIZE(ipv6_route_table_template);
}
#endif

static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;
	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
	       sizeof(*net->ipv6.fib6_null_entry));

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
#ifdef CONFIG_IPV6_SUBTREES
	net->ipv6.fib6_routes_require_src = 0;
#endif
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ipv6_route", 0, net->proc_net,
			     &ipv6_route_seq_ops,
			     sizeof(struct ipv6_route_iter)))
		return -ENOMEM;

	if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
				    rt6_stats_seq_show, NULL)) {
		remove_proc_entry("ipv6_route", net->proc_net);
		return -ENOMEM;
	}
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}

#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)

BTF_ID_LIST(btf_fib6_info_id)
BTF_ID(struct, fib6_info)

static const struct bpf_iter_seq_info ipv6_route_seq_info = {
	.seq_ops = &ipv6_route_seq_ops,
	.init_seq_private = bpf_iter_init_seq_net,
	.fini_seq_private = bpf_iter_fini_seq_net,
	.seq_priv_size = sizeof(struct ipv6_route_iter),
};

static struct bpf_iter_reg ipv6_route_reg_info = {
	.target = "ipv6_route",
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__ipv6_route, rt),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info = &ipv6_route_seq_info,
};

static int __init bpf_iter_register(void)
{
	ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
	return bpf_iter_reg_target(&ipv6_route_reg_info);
}

static void bpf_iter_unregister(void)
{
	bpf_iter_unreg_target(&ipv6_route_reg_info);
}
#endif
#endif
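
/* With the "ipv6_route" target registered above, a BPF program can walk
 * the FIB as an iterator. A minimal sketch, assuming a libbpf build
 * environment (the kernel's BPF selftests carry a complete version):
 *
 *	SEC("iter/ipv6_route")
 *	int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct fib6_info *rt = ctx->rt;
 *
 *		if (!rt)
 *			return 0;
 *		BPF_SEQ_PRINTF(seq, "metric %u\n", rt->fib6_metric);
 *		return 0;
 *	}
 */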

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	ret = bpf_iter_register();
	if (ret)
		goto out_register_late_subsys;
#endif
#endif

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		INIT_LIST_HEAD(&ul->quarantine);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_unregister();
#endif
#endif
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}