Linux Audio

Check our new training course

Loading...
v6.13.7
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *	Linux INET6 implementation
   4 *	FIB front-end.
   5 *
   6 *	Authors:
   7 *	Pedro Roque		<roque@di.fc.ul.pt>
   8 */
   9
  10/*	Changes:
  11 *
  12 *	YOSHIFUJI Hideaki @USAGI
  13 *		reworked default router selection.
  14 *		- respect outgoing interface
  15 *		- select from (probably) reachable routers (i.e.
  16 *		routers in REACHABLE, STALE, DELAY or PROBE states).
  17 *		- always select the same router if it is (probably)
  18 *		reachable.  otherwise, round-robin the list.
  19 *	Ville Nuorvala
  20 *		Fixed routing subtrees.
  21 */
  22
  23#define pr_fmt(fmt) "IPv6: " fmt
  24
  25#include <linux/capability.h>
  26#include <linux/errno.h>
  27#include <linux/export.h>
  28#include <linux/types.h>
  29#include <linux/times.h>
  30#include <linux/socket.h>
  31#include <linux/sockios.h>
  32#include <linux/net.h>
  33#include <linux/route.h>
  34#include <linux/netdevice.h>
  35#include <linux/in6.h>
  36#include <linux/mroute6.h>
  37#include <linux/init.h>
  38#include <linux/if_arp.h>
  39#include <linux/proc_fs.h>
  40#include <linux/seq_file.h>
  41#include <linux/nsproxy.h>
  42#include <linux/slab.h>
  43#include <linux/jhash.h>
  44#include <linux/siphash.h>
  45#include <net/net_namespace.h>
  46#include <net/snmp.h>
  47#include <net/ipv6.h>
  48#include <net/ip6_fib.h>
  49#include <net/ip6_route.h>
  50#include <net/ndisc.h>
  51#include <net/addrconf.h>
  52#include <net/tcp.h>
  53#include <linux/rtnetlink.h>
  54#include <net/dst.h>
  55#include <net/dst_metadata.h>
  56#include <net/xfrm.h>
  57#include <net/netevent.h>
  58#include <net/netlink.h>
  59#include <net/rtnh.h>
  60#include <net/lwtunnel.h>
  61#include <net/ip_tunnels.h>
  62#include <net/l3mdev.h>
  63#include <net/ip.h>
  64#include <linux/uaccess.h>
  65#include <linux/btf_ids.h>
  66
  67#ifdef CONFIG_SYSCTL
  68#include <linux/sysctl.h>
  69#endif
  70
  71static int ip6_rt_type_to_error(u8 fib6_type);
  72
  73#define CREATE_TRACE_POINTS
  74#include <trace/events/fib6.h>
  75EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
  76#undef CREATE_TRACE_POINTS
  77
  78enum rt6_nud_state {
  79	RT6_NUD_FAIL_HARD = -3,
  80	RT6_NUD_FAIL_PROBE = -2,
  81	RT6_NUD_FAIL_DO_RR = -1,
  82	RT6_NUD_SUCCEED = 1
  83};
  84
  85INDIRECT_CALLABLE_SCOPE
  86struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
  87static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
  88INDIRECT_CALLABLE_SCOPE
  89unsigned int		ip6_mtu(const struct dst_entry *dst);
  90static void		ip6_negative_advice(struct sock *sk,
  91					    struct dst_entry *dst);
  92static void		ip6_dst_destroy(struct dst_entry *);
  93static void		ip6_dst_ifdown(struct dst_entry *,
  94				       struct net_device *dev);
  95static void		 ip6_dst_gc(struct dst_ops *ops);
  96
  97static int		ip6_pkt_discard(struct sk_buff *skb);
  98static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
  99static int		ip6_pkt_prohibit(struct sk_buff *skb);
 100static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 101static void		ip6_link_failure(struct sk_buff *skb);
 102static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 103					   struct sk_buff *skb, u32 mtu,
 104					   bool confirm_neigh);
 105static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
 106					struct sk_buff *skb);
 107static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
 108			   int strict);
 109static size_t rt6_nlmsg_size(struct fib6_info *f6i);
 110static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 111			 struct fib6_info *rt, struct dst_entry *dst,
 112			 struct in6_addr *dest, struct in6_addr *src,
 113			 int iif, int type, u32 portid, u32 seq,
 114			 unsigned int flags);
 115static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
 116					   const struct in6_addr *daddr,
 117					   const struct in6_addr *saddr);
 118
 119#ifdef CONFIG_IPV6_ROUTE_INFO
 120static struct fib6_info *rt6_add_route_info(struct net *net,
 121					   const struct in6_addr *prefix, int prefixlen,
 122					   const struct in6_addr *gwaddr,
 123					   struct net_device *dev,
 124					   unsigned int pref);
 125static struct fib6_info *rt6_get_route_info(struct net *net,
 126					   const struct in6_addr *prefix, int prefixlen,
 127					   const struct in6_addr *gwaddr,
 128					   struct net_device *dev);
 129#endif
 130
 131struct uncached_list {
 132	spinlock_t		lock;
 133	struct list_head	head;
 134};
 135
 136static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
 137
 138void rt6_uncached_list_add(struct rt6_info *rt)
 139{
 140	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
 141
 142	rt->dst.rt_uncached_list = ul;
 143
 144	spin_lock_bh(&ul->lock);
 145	list_add_tail(&rt->dst.rt_uncached, &ul->head);
 146	spin_unlock_bh(&ul->lock);
 147}
 148
 149void rt6_uncached_list_del(struct rt6_info *rt)
 150{
 151	if (!list_empty(&rt->dst.rt_uncached)) {
 152		struct uncached_list *ul = rt->dst.rt_uncached_list;
 
 153
 154		spin_lock_bh(&ul->lock);
 155		list_del_init(&rt->dst.rt_uncached);
 
 156		spin_unlock_bh(&ul->lock);
 157	}
 158}
 159
 160static void rt6_uncached_list_flush_dev(struct net_device *dev)
 161{
 
 162	int cpu;
 163
 
 
 
 164	for_each_possible_cpu(cpu) {
 165		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
 166		struct rt6_info *rt, *safe;
 167
 168		if (list_empty(&ul->head))
 169			continue;
 170
 171		spin_lock_bh(&ul->lock);
 172		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
 173			struct inet6_dev *rt_idev = rt->rt6i_idev;
 174			struct net_device *rt_dev = rt->dst.dev;
 175			bool handled = false;
 176
 177			if (rt_idev && rt_idev->dev == dev) {
 178				rt->rt6i_idev = in6_dev_get(blackhole_netdev);
 179				in6_dev_put(rt_idev);
 180				handled = true;
 181			}
 182
 183			if (rt_dev == dev) {
 184				rt->dst.dev = blackhole_netdev;
 185				netdev_ref_replace(rt_dev, blackhole_netdev,
 186						   &rt->dst.dev_tracker,
 187						   GFP_ATOMIC);
 188				handled = true;
 189			}
 190			if (handled)
 191				list_del_init(&rt->dst.rt_uncached);
 192		}
 193		spin_unlock_bh(&ul->lock);
 194	}
 195}
 196
 197static inline const void *choose_neigh_daddr(const struct in6_addr *p,
 198					     struct sk_buff *skb,
 199					     const void *daddr)
 200{
 201	if (!ipv6_addr_any(p))
 202		return (const void *) p;
 203	else if (skb)
 204		return &ipv6_hdr(skb)->daddr;
 205	return daddr;
 206}
 207
 208struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
 209				   struct net_device *dev,
 210				   struct sk_buff *skb,
 211				   const void *daddr)
 212{
 213	struct neighbour *n;
 214
 215	daddr = choose_neigh_daddr(gw, skb, daddr);
 216	n = __ipv6_neigh_lookup(dev, daddr);
 217	if (n)
 218		return n;
 219
 220	n = neigh_create(&nd_tbl, daddr, dev);
 221	return IS_ERR(n) ? NULL : n;
 222}
 223
 224static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
 225					      struct sk_buff *skb,
 226					      const void *daddr)
 227{
 228	const struct rt6_info *rt = dst_rt6_info(dst);
 229
 230	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
 231				dst->dev, skb, daddr);
 232}
 233
 234static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 235{
 236	const struct rt6_info *rt = dst_rt6_info(dst);
 237	struct net_device *dev = dst->dev;
 
 238
 239	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
 240	if (!daddr)
 241		return;
 242	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
 243		return;
 244	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
 245		return;
 246	__ipv6_confirm_neigh(dev, daddr);
 247}
 248
 249static struct dst_ops ip6_dst_ops_template = {
 250	.family			=	AF_INET6,
 251	.gc			=	ip6_dst_gc,
 252	.gc_thresh		=	1024,
 253	.check			=	ip6_dst_check,
 254	.default_advmss		=	ip6_default_advmss,
 255	.mtu			=	ip6_mtu,
 256	.cow_metrics		=	dst_cow_metrics_generic,
 257	.destroy		=	ip6_dst_destroy,
 258	.ifdown			=	ip6_dst_ifdown,
 259	.negative_advice	=	ip6_negative_advice,
 260	.link_failure		=	ip6_link_failure,
 261	.update_pmtu		=	ip6_rt_update_pmtu,
 262	.redirect		=	rt6_do_redirect,
 263	.local_out		=	__ip6_local_out,
 264	.neigh_lookup		=	ip6_dst_neigh_lookup,
 265	.confirm_neigh		=	ip6_confirm_neigh,
 266};
 267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 268static struct dst_ops ip6_dst_blackhole_ops = {
 269	.family			= AF_INET6,
 270	.default_advmss		= ip6_default_advmss,
 271	.neigh_lookup		= ip6_dst_neigh_lookup,
 272	.check			= ip6_dst_check,
 273	.destroy		= ip6_dst_destroy,
 274	.cow_metrics		= dst_cow_metrics_generic,
 275	.update_pmtu		= dst_blackhole_update_pmtu,
 276	.redirect		= dst_blackhole_redirect,
 277	.mtu			= dst_blackhole_mtu,
 278};
 279
 280static const u32 ip6_template_metrics[RTAX_MAX] = {
 281	[RTAX_HOPLIMIT - 1] = 0,
 282};
 283
 284static const struct fib6_info fib6_null_entry_template = {
 285	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 286	.fib6_protocol  = RTPROT_KERNEL,
 287	.fib6_metric	= ~(u32)0,
 288	.fib6_ref	= REFCOUNT_INIT(1),
 289	.fib6_type	= RTN_UNREACHABLE,
 290	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
 291};
 292
 293static const struct rt6_info ip6_null_entry_template = {
 294	.dst = {
 295		.__rcuref	= RCUREF_INIT(1),
 296		.__use		= 1,
 297		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 298		.error		= -ENETUNREACH,
 299		.input		= ip6_pkt_discard,
 300		.output		= ip6_pkt_discard_out,
 301	},
 302	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 303};
 304
 305#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 306
 307static const struct rt6_info ip6_prohibit_entry_template = {
 308	.dst = {
 309		.__rcuref	= RCUREF_INIT(1),
 310		.__use		= 1,
 311		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 312		.error		= -EACCES,
 313		.input		= ip6_pkt_prohibit,
 314		.output		= ip6_pkt_prohibit_out,
 315	},
 316	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 317};
 318
 319static const struct rt6_info ip6_blk_hole_entry_template = {
 320	.dst = {
 321		.__rcuref	= RCUREF_INIT(1),
 322		.__use		= 1,
 323		.obsolete	= DST_OBSOLETE_FORCE_CHK,
 324		.error		= -EINVAL,
 325		.input		= dst_discard,
 326		.output		= dst_discard_out,
 327	},
 328	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 329};
 330
 331#endif
 332
 333static void rt6_info_init(struct rt6_info *rt)
 334{
 335	memset_after(rt, 0, dst);
 
 
 
 336}
 337
 338/* allocate dst with ip6_dst_ops */
 339struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
 340			       int flags)
 341{
 342	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
 343					DST_OBSOLETE_FORCE_CHK, flags);
 344
 345	if (rt) {
 346		rt6_info_init(rt);
 347		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
 348	}
 349
 350	return rt;
 351}
 352EXPORT_SYMBOL(ip6_dst_alloc);
 353
 354static void ip6_dst_destroy(struct dst_entry *dst)
 355{
 356	struct rt6_info *rt = dst_rt6_info(dst);
 357	struct fib6_info *from;
 358	struct inet6_dev *idev;
 359
 360	ip_dst_metrics_put(dst);
 361	rt6_uncached_list_del(rt);
 362
 363	idev = rt->rt6i_idev;
 364	if (idev) {
 365		rt->rt6i_idev = NULL;
 366		in6_dev_put(idev);
 367	}
 368
 369	from = unrcu_pointer(xchg(&rt->from, NULL));
 370	fib6_info_release(from);
 371}
 372
 373static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
 
 374{
 375	struct rt6_info *rt = dst_rt6_info(dst);
 376	struct inet6_dev *idev = rt->rt6i_idev;
 377	struct fib6_info *from;
 
 378
 379	if (idev && idev->dev != blackhole_netdev) {
 380		struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
 381
 382		if (blackhole_idev) {
 383			rt->rt6i_idev = blackhole_idev;
 384			in6_dev_put(idev);
 385		}
 386	}
 387	from = unrcu_pointer(xchg(&rt->from, NULL));
 388	fib6_info_release(from);
 389}
 390
 391static bool __rt6_check_expired(const struct rt6_info *rt)
 392{
 393	if (rt->rt6i_flags & RTF_EXPIRES)
 394		return time_after(jiffies, rt->dst.expires);
 395	else
 396		return false;
 397}
 398
 399static bool rt6_check_expired(const struct rt6_info *rt)
 400{
 401	struct fib6_info *from;
 402
 403	from = rcu_dereference(rt->from);
 404
 405	if (rt->rt6i_flags & RTF_EXPIRES) {
 406		if (time_after(jiffies, rt->dst.expires))
 407			return true;
 408	} else if (from) {
 409		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
 410			fib6_check_expired(from);
 411	}
 412	return false;
 413}
 414
 415void fib6_select_path(const struct net *net, struct fib6_result *res,
 416		      struct flowi6 *fl6, int oif, bool have_oif_match,
 417		      const struct sk_buff *skb, int strict)
 418{
 
 419	struct fib6_info *match = res->f6i;
 420	struct fib6_info *sibling;
 421
 422	if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
 423		goto out;
 424
 425	if (match->nh && have_oif_match && res->nh)
 426		return;
 427
 428	if (skb)
 429		IP6CB(skb)->flags |= IP6SKB_MULTIPATH;
 430
 431	/* We might have already computed the hash for ICMPv6 errors. In such
 432	 * case it will always be non-zero. Otherwise now is the time to do it.
 433	 */
 434	if (!fl6->mp_hash &&
 435	    (!match->nh || nexthop_is_multipath(match->nh)))
 436		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 437
 438	if (unlikely(match->nh)) {
 439		nexthop_path_fib6_result(res, fl6->mp_hash);
 440		return;
 441	}
 442
 443	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
 444		goto out;
 445
 446	list_for_each_entry_rcu(sibling, &match->fib6_siblings,
 447				fib6_siblings) {
 448		const struct fib6_nh *nh = sibling->fib6_nh;
 449		int nh_upper_bound;
 450
 451		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
 452		if (fl6->mp_hash > nh_upper_bound)
 453			continue;
 454		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
 455			break;
 456		match = sibling;
 457		break;
 458	}
 459
 460out:
 461	res->f6i = match;
 462	res->nh = match->fib6_nh;
 463}
 464
 465/*
 466 *	Route lookup. rcu_read_lock() should be held.
 467 */
 468
 469static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
 470			       const struct in6_addr *saddr, int oif, int flags)
 471{
 472	const struct net_device *dev;
 473
 474	if (nh->fib_nh_flags & RTNH_F_DEAD)
 475		return false;
 476
 477	dev = nh->fib_nh_dev;
 478	if (oif) {
 479		if (dev->ifindex == oif)
 480			return true;
 481	} else {
 482		if (ipv6_chk_addr(net, saddr, dev,
 483				  flags & RT6_LOOKUP_F_IFACE))
 484			return true;
 485	}
 486
 487	return false;
 488}
 489
 490struct fib6_nh_dm_arg {
 491	struct net		*net;
 492	const struct in6_addr	*saddr;
 493	int			oif;
 494	int			flags;
 495	struct fib6_nh		*nh;
 496};
 497
 498static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
 499{
 500	struct fib6_nh_dm_arg *arg = _arg;
 501
 502	arg->nh = nh;
 503	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
 504				  arg->flags);
 505}
 506
 507/* returns fib6_nh from nexthop or NULL */
 508static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
 509					struct fib6_result *res,
 510					const struct in6_addr *saddr,
 511					int oif, int flags)
 512{
 513	struct fib6_nh_dm_arg arg = {
 514		.net   = net,
 515		.saddr = saddr,
 516		.oif   = oif,
 517		.flags = flags,
 518	};
 519
 520	if (nexthop_is_blackhole(nh))
 521		return NULL;
 522
 523	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
 524		return arg.nh;
 525
 526	return NULL;
 527}
 528
 529static void rt6_device_match(struct net *net, struct fib6_result *res,
 530			     const struct in6_addr *saddr, int oif, int flags)
 531{
 532	struct fib6_info *f6i = res->f6i;
 533	struct fib6_info *spf6i;
 534	struct fib6_nh *nh;
 535
 536	if (!oif && ipv6_addr_any(saddr)) {
 537		if (unlikely(f6i->nh)) {
 538			nh = nexthop_fib6_nh(f6i->nh);
 539			if (nexthop_is_blackhole(f6i->nh))
 540				goto out_blackhole;
 541		} else {
 542			nh = f6i->fib6_nh;
 543		}
 544		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
 545			goto out;
 546	}
 547
 548	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
 549		bool matched = false;
 550
 551		if (unlikely(spf6i->nh)) {
 552			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
 553					      oif, flags);
 554			if (nh)
 555				matched = true;
 556		} else {
 557			nh = spf6i->fib6_nh;
 558			if (__rt6_device_match(net, nh, saddr, oif, flags))
 559				matched = true;
 560		}
 561		if (matched) {
 562			res->f6i = spf6i;
 563			goto out;
 564		}
 565	}
 566
 567	if (oif && flags & RT6_LOOKUP_F_IFACE) {
 568		res->f6i = net->ipv6.fib6_null_entry;
 569		nh = res->f6i->fib6_nh;
 570		goto out;
 571	}
 572
 573	if (unlikely(f6i->nh)) {
 574		nh = nexthop_fib6_nh(f6i->nh);
 575		if (nexthop_is_blackhole(f6i->nh))
 576			goto out_blackhole;
 577	} else {
 578		nh = f6i->fib6_nh;
 579	}
 580
 581	if (nh->fib_nh_flags & RTNH_F_DEAD) {
 582		res->f6i = net->ipv6.fib6_null_entry;
 583		nh = res->f6i->fib6_nh;
 584	}
 585out:
 586	res->nh = nh;
 587	res->fib6_type = res->f6i->fib6_type;
 588	res->fib6_flags = res->f6i->fib6_flags;
 589	return;
 590
 591out_blackhole:
 592	res->fib6_flags |= RTF_REJECT;
 593	res->fib6_type = RTN_BLACKHOLE;
 594	res->nh = nh;
 595}
 596
 597#ifdef CONFIG_IPV6_ROUTER_PREF
 598struct __rt6_probe_work {
 599	struct work_struct work;
 600	struct in6_addr target;
 601	struct net_device *dev;
 602	netdevice_tracker dev_tracker;
 603};
 604
 605static void rt6_probe_deferred(struct work_struct *w)
 606{
 607	struct in6_addr mcaddr;
 608	struct __rt6_probe_work *work =
 609		container_of(w, struct __rt6_probe_work, work);
 610
 611	addrconf_addr_solict_mult(&work->target, &mcaddr);
 612	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
 613	netdev_put(work->dev, &work->dev_tracker);
 614	kfree(work);
 615}
 616
 617static void rt6_probe(struct fib6_nh *fib6_nh)
 618{
 619	struct __rt6_probe_work *work = NULL;
 620	const struct in6_addr *nh_gw;
 621	unsigned long last_probe;
 622	struct neighbour *neigh;
 623	struct net_device *dev;
 624	struct inet6_dev *idev;
 625
 626	/*
 627	 * Okay, this does not seem to be appropriate
 628	 * for now, however, we need to check if it
 629	 * is really so; aka Router Reachability Probing.
 630	 *
 631	 * Router Reachability Probe MUST be rate-limited
 632	 * to no more than one per minute.
 633	 */
 634	if (!fib6_nh->fib_nh_gw_family)
 635		return;
 636
 637	nh_gw = &fib6_nh->fib_nh_gw6;
 638	dev = fib6_nh->fib_nh_dev;
 639	rcu_read_lock();
 640	last_probe = READ_ONCE(fib6_nh->last_probe);
 641	idev = __in6_dev_get(dev);
 642	if (!idev)
 643		goto out;
 644	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 645	if (neigh) {
 646		if (READ_ONCE(neigh->nud_state) & NUD_VALID)
 647			goto out;
 648
 649		write_lock_bh(&neigh->lock);
 650		if (!(neigh->nud_state & NUD_VALID) &&
 651		    time_after(jiffies,
 652			       neigh->updated +
 653			       READ_ONCE(idev->cnf.rtr_probe_interval))) {
 654			work = kmalloc(sizeof(*work), GFP_ATOMIC);
 655			if (work)
 656				__neigh_set_probe_once(neigh);
 657		}
 658		write_unlock_bh(&neigh->lock);
 659	} else if (time_after(jiffies, last_probe +
 660				       READ_ONCE(idev->cnf.rtr_probe_interval))) {
 661		work = kmalloc(sizeof(*work), GFP_ATOMIC);
 662	}
 663
 664	if (!work || cmpxchg(&fib6_nh->last_probe,
 665			     last_probe, jiffies) != last_probe) {
 666		kfree(work);
 667	} else {
 668		INIT_WORK(&work->work, rt6_probe_deferred);
 669		work->target = *nh_gw;
 670		netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
 671		work->dev = dev;
 672		schedule_work(&work->work);
 673	}
 674
 675out:
 676	rcu_read_unlock();
 677}
 678#else
 679static inline void rt6_probe(struct fib6_nh *fib6_nh)
 680{
 681}
 682#endif
 683
 684/*
 685 * Default Router Selection (RFC 2461 6.3.6)
 686 */
 687static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
 688{
 689	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 690	struct neighbour *neigh;
 691
 692	rcu_read_lock();
 693	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
 694					  &fib6_nh->fib_nh_gw6);
 695	if (neigh) {
 696		u8 nud_state = READ_ONCE(neigh->nud_state);
 697
 698		if (nud_state & NUD_VALID)
 699			ret = RT6_NUD_SUCCEED;
 700#ifdef CONFIG_IPV6_ROUTER_PREF
 701		else if (!(nud_state & NUD_FAILED))
 702			ret = RT6_NUD_SUCCEED;
 703		else
 704			ret = RT6_NUD_FAIL_PROBE;
 705#endif
 
 706	} else {
 707		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
 708		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
 709	}
 710	rcu_read_unlock();
 711
 712	return ret;
 713}
 714
 715static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
 716			   int strict)
 717{
 718	int m = 0;
 719
 720	if (!oif || nh->fib_nh_dev->ifindex == oif)
 721		m = 2;
 722
 723	if (!m && (strict & RT6_LOOKUP_F_IFACE))
 724		return RT6_NUD_FAIL_HARD;
 725#ifdef CONFIG_IPV6_ROUTER_PREF
 726	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
 727#endif
 728	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
 729	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
 730		int n = rt6_check_neigh(nh);
 731		if (n < 0)
 732			return n;
 733	}
 734	return m;
 735}
 736
 737static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
 738		       int oif, int strict, int *mpri, bool *do_rr)
 739{
 740	bool match_do_rr = false;
 741	bool rc = false;
 742	int m;
 743
 744	if (nh->fib_nh_flags & RTNH_F_DEAD)
 745		goto out;
 746
 747	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
 748	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
 749	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 750		goto out;
 751
 752	m = rt6_score_route(nh, fib6_flags, oif, strict);
 753	if (m == RT6_NUD_FAIL_DO_RR) {
 754		match_do_rr = true;
 755		m = 0; /* lowest valid score */
 756	} else if (m == RT6_NUD_FAIL_HARD) {
 757		goto out;
 758	}
 759
 760	if (strict & RT6_LOOKUP_F_REACHABLE)
 761		rt6_probe(nh);
 762
 763	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
 764	if (m > *mpri) {
 765		*do_rr = match_do_rr;
 766		*mpri = m;
 767		rc = true;
 768	}
 769out:
 770	return rc;
 771}
 772
 773struct fib6_nh_frl_arg {
 774	u32		flags;
 775	int		oif;
 776	int		strict;
 777	int		*mpri;
 778	bool		*do_rr;
 779	struct fib6_nh	*nh;
 780};
 781
 782static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
 783{
 784	struct fib6_nh_frl_arg *arg = _arg;
 785
 786	arg->nh = nh;
 787	return find_match(nh, arg->flags, arg->oif, arg->strict,
 788			  arg->mpri, arg->do_rr);
 789}
 790
 791static void __find_rr_leaf(struct fib6_info *f6i_start,
 792			   struct fib6_info *nomatch, u32 metric,
 793			   struct fib6_result *res, struct fib6_info **cont,
 794			   int oif, int strict, bool *do_rr, int *mpri)
 795{
 796	struct fib6_info *f6i;
 797
 798	for (f6i = f6i_start;
 799	     f6i && f6i != nomatch;
 800	     f6i = rcu_dereference(f6i->fib6_next)) {
 801		bool matched = false;
 802		struct fib6_nh *nh;
 803
 804		if (cont && f6i->fib6_metric != metric) {
 805			*cont = f6i;
 806			return;
 807		}
 808
 809		if (fib6_check_expired(f6i))
 810			continue;
 811
 812		if (unlikely(f6i->nh)) {
 813			struct fib6_nh_frl_arg arg = {
 814				.flags  = f6i->fib6_flags,
 815				.oif    = oif,
 816				.strict = strict,
 817				.mpri   = mpri,
 818				.do_rr  = do_rr
 819			};
 820
 821			if (nexthop_is_blackhole(f6i->nh)) {
 822				res->fib6_flags = RTF_REJECT;
 823				res->fib6_type = RTN_BLACKHOLE;
 824				res->f6i = f6i;
 825				res->nh = nexthop_fib6_nh(f6i->nh);
 826				return;
 827			}
 828			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
 829						     &arg)) {
 830				matched = true;
 831				nh = arg.nh;
 832			}
 833		} else {
 834			nh = f6i->fib6_nh;
 835			if (find_match(nh, f6i->fib6_flags, oif, strict,
 836				       mpri, do_rr))
 837				matched = true;
 838		}
 839		if (matched) {
 840			res->f6i = f6i;
 841			res->nh = nh;
 842			res->fib6_flags = f6i->fib6_flags;
 843			res->fib6_type = f6i->fib6_type;
 844		}
 845	}
 846}
 847
 848static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
 849			 struct fib6_info *rr_head, int oif, int strict,
 850			 bool *do_rr, struct fib6_result *res)
 851{
 852	u32 metric = rr_head->fib6_metric;
 853	struct fib6_info *cont = NULL;
 854	int mpri = -1;
 855
 856	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
 857		       oif, strict, do_rr, &mpri);
 858
 859	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
 860		       oif, strict, do_rr, &mpri);
 861
 862	if (res->f6i || !cont)
 863		return;
 864
 865	__find_rr_leaf(cont, NULL, metric, res, NULL,
 866		       oif, strict, do_rr, &mpri);
 867}
 868
 869static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
 870		       struct fib6_result *res, int strict)
 871{
 872	struct fib6_info *leaf = rcu_dereference(fn->leaf);
 873	struct fib6_info *rt0;
 874	bool do_rr = false;
 875	int key_plen;
 876
 877	/* make sure this function or its helpers sets f6i */
 878	res->f6i = NULL;
 879
 880	if (!leaf || leaf == net->ipv6.fib6_null_entry)
 881		goto out;
 882
 883	rt0 = rcu_dereference(fn->rr_ptr);
 884	if (!rt0)
 885		rt0 = leaf;
 886
 887	/* Double check to make sure fn is not an intermediate node
 888	 * and fn->leaf does not points to its child's leaf
 889	 * (This might happen if all routes under fn are deleted from
 890	 * the tree and fib6_repair_tree() is called on the node.)
 891	 */
 892	key_plen = rt0->fib6_dst.plen;
 893#ifdef CONFIG_IPV6_SUBTREES
 894	if (rt0->fib6_src.plen)
 895		key_plen = rt0->fib6_src.plen;
 896#endif
 897	if (fn->fn_bit != key_plen)
 898		goto out;
 899
 900	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
 901	if (do_rr) {
 902		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
 903
 904		/* no entries matched; do round-robin */
 905		if (!next || next->fib6_metric != rt0->fib6_metric)
 906			next = leaf;
 907
 908		if (next != rt0) {
 909			spin_lock_bh(&leaf->fib6_table->tb6_lock);
 910			/* make sure next is not being deleted from the tree */
 911			if (next->fib6_node)
 912				rcu_assign_pointer(fn->rr_ptr, next);
 913			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
 914		}
 915	}
 916
 917out:
 918	if (!res->f6i) {
 919		res->f6i = net->ipv6.fib6_null_entry;
 920		res->nh = res->f6i->fib6_nh;
 921		res->fib6_flags = res->f6i->fib6_flags;
 922		res->fib6_type = res->f6i->fib6_type;
 923	}
 924}
 925
 926static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
 927{
 928	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
 929	       res->nh->fib_nh_gw_family;
 930}
 931
 932#ifdef CONFIG_IPV6_ROUTE_INFO
 933int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 934		  const struct in6_addr *gwaddr)
 935{
 936	struct net *net = dev_net(dev);
 937	struct route_info *rinfo = (struct route_info *) opt;
 938	struct in6_addr prefix_buf, *prefix;
 939	struct fib6_table *table;
 940	unsigned int pref;
 941	unsigned long lifetime;
 942	struct fib6_info *rt;
 943
 944	if (len < sizeof(struct route_info)) {
 945		return -EINVAL;
 946	}
 947
 948	/* Sanity check for prefix_len and length */
 949	if (rinfo->length > 3) {
 950		return -EINVAL;
 951	} else if (rinfo->prefix_len > 128) {
 952		return -EINVAL;
 953	} else if (rinfo->prefix_len > 64) {
 954		if (rinfo->length < 2) {
 955			return -EINVAL;
 956		}
 957	} else if (rinfo->prefix_len > 0) {
 958		if (rinfo->length < 1) {
 959			return -EINVAL;
 960		}
 961	}
 962
 963	pref = rinfo->route_pref;
 964	if (pref == ICMPV6_ROUTER_PREF_INVALID)
 965		return -EINVAL;
 966
 967	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 968
 969	if (rinfo->length == 3)
 970		prefix = (struct in6_addr *)rinfo->prefix;
 971	else {
 972		/* this function is safe */
 973		ipv6_addr_prefix(&prefix_buf,
 974				 (struct in6_addr *)rinfo->prefix,
 975				 rinfo->prefix_len);
 976		prefix = &prefix_buf;
 977	}
 978
 979	if (rinfo->prefix_len == 0)
 980		rt = rt6_get_dflt_router(net, gwaddr, dev);
 981	else
 982		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
 983					gwaddr, dev);
 984
 985	if (rt && !lifetime) {
 986		ip6_del_rt(net, rt, false);
 987		rt = NULL;
 988	}
 989
 990	if (!rt && lifetime)
 991		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 992					dev, pref);
 993	else if (rt)
 994		rt->fib6_flags = RTF_ROUTEINFO |
 995				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 996
 997	if (rt) {
 998		table = rt->fib6_table;
 999		spin_lock_bh(&table->tb6_lock);
1000
1001		if (!addrconf_finite_timeout(lifetime)) {
1002			fib6_clean_expires(rt);
1003			fib6_remove_gc_list(rt);
1004		} else {
1005			fib6_set_expires(rt, jiffies + HZ * lifetime);
1006			fib6_add_gc_list(rt);
1007		}
1008
1009		spin_unlock_bh(&table->tb6_lock);
1010
1011		fib6_info_release(rt);
1012	}
1013	return 0;
1014}
1015#endif
1016
1017/*
1018 *	Misc support functions
1019 */
1020
1021/* called with rcu_lock held */
1022static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
1023{
1024	struct net_device *dev = res->nh->fib_nh_dev;
1025
1026	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1027		/* for copies of local routes, dst->dev needs to be the
1028		 * device if it is a master device, the master device if
1029		 * device is enslaved, and the loopback as the default
1030		 */
1031		if (netif_is_l3_slave(dev) &&
1032		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
1033			dev = l3mdev_master_dev_rcu(dev);
1034		else if (!netif_is_l3_master(dev))
1035			dev = dev_net(dev)->loopback_dev;
1036		/* last case is netif_is_l3_master(dev) is true in which
1037		 * case we want dev returned to be dev
1038		 */
1039	}
1040
1041	return dev;
1042}
1043
1044static const int fib6_prop[RTN_MAX + 1] = {
1045	[RTN_UNSPEC]	= 0,
1046	[RTN_UNICAST]	= 0,
1047	[RTN_LOCAL]	= 0,
1048	[RTN_BROADCAST]	= 0,
1049	[RTN_ANYCAST]	= 0,
1050	[RTN_MULTICAST]	= 0,
1051	[RTN_BLACKHOLE]	= -EINVAL,
1052	[RTN_UNREACHABLE] = -EHOSTUNREACH,
1053	[RTN_PROHIBIT]	= -EACCES,
1054	[RTN_THROW]	= -EAGAIN,
1055	[RTN_NAT]	= -EINVAL,
1056	[RTN_XRESOLVE]	= -EINVAL,
1057};
1058
1059static int ip6_rt_type_to_error(u8 fib6_type)
1060{
1061	return fib6_prop[fib6_type];
1062}
1063
1064static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
1065{
1066	unsigned short flags = 0;
1067
1068	if (rt->dst_nocount)
1069		flags |= DST_NOCOUNT;
1070	if (rt->dst_nopolicy)
1071		flags |= DST_NOPOLICY;
 
 
1072
1073	return flags;
1074}
1075
1076static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
1077{
1078	rt->dst.error = ip6_rt_type_to_error(fib6_type);
1079
1080	switch (fib6_type) {
1081	case RTN_BLACKHOLE:
1082		rt->dst.output = dst_discard_out;
1083		rt->dst.input = dst_discard;
1084		break;
1085	case RTN_PROHIBIT:
1086		rt->dst.output = ip6_pkt_prohibit_out;
1087		rt->dst.input = ip6_pkt_prohibit;
1088		break;
1089	case RTN_THROW:
1090	case RTN_UNREACHABLE:
1091	default:
1092		rt->dst.output = ip6_pkt_discard_out;
1093		rt->dst.input = ip6_pkt_discard;
1094		break;
1095	}
1096}
1097
1098static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
1099{
1100	struct fib6_info *f6i = res->f6i;
1101
1102	if (res->fib6_flags & RTF_REJECT) {
1103		ip6_rt_init_dst_reject(rt, res->fib6_type);
1104		return;
1105	}
1106
1107	rt->dst.error = 0;
1108	rt->dst.output = ip6_output;
1109
1110	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
1111		rt->dst.input = ip6_input;
1112	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
1113		rt->dst.input = ip6_mc_input;
1114	} else {
1115		rt->dst.input = ip6_forward;
1116	}
1117
1118	if (res->nh->fib_nh_lws) {
1119		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
1120		lwtunnel_set_redirect(&rt->dst);
1121	}
1122
1123	rt->dst.lastuse = jiffies;
1124}
1125
1126/* Caller must already hold reference to @from */
1127static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1128{
1129	rt->rt6i_flags &= ~RTF_EXPIRES;
1130	rcu_assign_pointer(rt->from, from);
1131	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1132}
1133
1134/* Caller must already hold reference to f6i in result */
1135static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1136{
1137	const struct fib6_nh *nh = res->nh;
1138	const struct net_device *dev = nh->fib_nh_dev;
1139	struct fib6_info *f6i = res->f6i;
1140
1141	ip6_rt_init_dst(rt, res);
1142
1143	rt->rt6i_dst = f6i->fib6_dst;
1144	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1145	rt->rt6i_flags = res->fib6_flags;
1146	if (nh->fib_nh_gw_family) {
1147		rt->rt6i_gateway = nh->fib_nh_gw6;
1148		rt->rt6i_flags |= RTF_GATEWAY;
1149	}
1150	rt6_set_from(rt, f6i);
1151#ifdef CONFIG_IPV6_SUBTREES
1152	rt->rt6i_src = f6i->fib6_src;
1153#endif
1154}
1155
1156static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1157					struct in6_addr *saddr)
1158{
1159	struct fib6_node *pn, *sn;
1160	while (1) {
1161		if (fn->fn_flags & RTN_TL_ROOT)
1162			return NULL;
1163		pn = rcu_dereference(fn->parent);
1164		sn = FIB6_SUBTREE(pn);
1165		if (sn && sn != fn)
1166			fn = fib6_node_lookup(sn, NULL, saddr);
1167		else
1168			fn = pn;
1169		if (fn->fn_flags & RTN_RTINFO)
1170			return fn;
1171	}
1172}
1173
1174static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1175{
1176	struct rt6_info *rt = *prt;
1177
1178	if (dst_hold_safe(&rt->dst))
1179		return true;
1180	if (net) {
1181		rt = net->ipv6.ip6_null_entry;
1182		dst_hold(&rt->dst);
1183	} else {
1184		rt = NULL;
1185	}
1186	*prt = rt;
1187	return false;
1188}
1189
1190/* called with rcu_lock held */
1191static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1192{
1193	struct net_device *dev = res->nh->fib_nh_dev;
1194	struct fib6_info *f6i = res->f6i;
1195	unsigned short flags;
1196	struct rt6_info *nrt;
1197
1198	if (!fib6_info_hold_safe(f6i))
1199		goto fallback;
1200
1201	flags = fib6_info_dst_flags(f6i);
1202	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1203	if (!nrt) {
1204		fib6_info_release(f6i);
1205		goto fallback;
1206	}
1207
1208	ip6_rt_copy_init(nrt, res);
1209	return nrt;
1210
1211fallback:
1212	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1213	dst_hold(&nrt->dst);
1214	return nrt;
1215}
1216
1217INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
1218					     struct fib6_table *table,
1219					     struct flowi6 *fl6,
1220					     const struct sk_buff *skb,
1221					     int flags)
1222{
1223	struct fib6_result res = {};
1224	struct fib6_node *fn;
1225	struct rt6_info *rt;
1226
 
 
 
1227	rcu_read_lock();
1228	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1229restart:
1230	res.f6i = rcu_dereference(fn->leaf);
1231	if (!res.f6i)
1232		res.f6i = net->ipv6.fib6_null_entry;
1233	else
1234		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1235				 flags);
1236
1237	if (res.f6i == net->ipv6.fib6_null_entry) {
1238		fn = fib6_backtrack(fn, &fl6->saddr);
1239		if (fn)
1240			goto restart;
1241
1242		rt = net->ipv6.ip6_null_entry;
1243		dst_hold(&rt->dst);
1244		goto out;
1245	} else if (res.fib6_flags & RTF_REJECT) {
1246		goto do_create;
1247	}
1248
1249	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1250			 fl6->flowi6_oif != 0, skb, flags);
1251
1252	/* Search through exception table */
1253	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1254	if (rt) {
1255		if (ip6_hold_safe(net, &rt))
1256			dst_use_noref(&rt->dst, jiffies);
1257	} else {
1258do_create:
1259		rt = ip6_create_rt_rcu(&res);
1260	}
1261
1262out:
1263	trace_fib6_table_lookup(net, &res, table, fl6);
1264
1265	rcu_read_unlock();
1266
1267	return rt;
1268}
1269
1270struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1271				   const struct sk_buff *skb, int flags)
1272{
1273	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1274}
1275EXPORT_SYMBOL_GPL(ip6_route_lookup);
1276
1277struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1278			    const struct in6_addr *saddr, int oif,
1279			    const struct sk_buff *skb, int strict)
1280{
1281	struct flowi6 fl6 = {
1282		.flowi6_oif = oif,
1283		.daddr = *daddr,
1284	};
1285	struct dst_entry *dst;
1286	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1287
1288	if (saddr) {
1289		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1290		flags |= RT6_LOOKUP_F_HAS_SADDR;
1291	}
1292
1293	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1294	if (dst->error == 0)
1295		return dst_rt6_info(dst);
1296
1297	dst_release(dst);
1298
1299	return NULL;
1300}
1301EXPORT_SYMBOL(rt6_lookup);
1302
1303/* ip6_ins_rt is called with FREE table->tb6_lock.
1304 * It takes new route entry, the addition fails by any reason the
1305 * route is released.
1306 * Caller must hold dst before calling it.
1307 */
1308
1309static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1310			struct netlink_ext_ack *extack)
1311{
1312	int err;
1313	struct fib6_table *table;
1314
1315	table = rt->fib6_table;
1316	spin_lock_bh(&table->tb6_lock);
1317	err = fib6_add(&table->tb6_root, rt, info, extack);
1318	spin_unlock_bh(&table->tb6_lock);
1319
1320	return err;
1321}
1322
1323int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1324{
1325	struct nl_info info = {	.nl_net = net, };
1326
1327	return __ip6_ins_rt(rt, &info, NULL);
1328}
1329
1330static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1331					   const struct in6_addr *daddr,
1332					   const struct in6_addr *saddr)
1333{
1334	struct fib6_info *f6i = res->f6i;
1335	struct net_device *dev;
1336	struct rt6_info *rt;
1337
1338	/*
1339	 *	Clone the route.
1340	 */
1341
1342	if (!fib6_info_hold_safe(f6i))
1343		return NULL;
1344
1345	dev = ip6_rt_get_dev_rcu(res);
1346	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1347	if (!rt) {
1348		fib6_info_release(f6i);
1349		return NULL;
1350	}
1351
1352	ip6_rt_copy_init(rt, res);
1353	rt->rt6i_flags |= RTF_CACHE;
 
1354	rt->rt6i_dst.addr = *daddr;
1355	rt->rt6i_dst.plen = 128;
1356
1357	if (!rt6_is_gw_or_nonexthop(res)) {
1358		if (f6i->fib6_dst.plen != 128 &&
1359		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1360			rt->rt6i_flags |= RTF_ANYCAST;
1361#ifdef CONFIG_IPV6_SUBTREES
1362		if (rt->rt6i_src.plen && saddr) {
1363			rt->rt6i_src.addr = *saddr;
1364			rt->rt6i_src.plen = 128;
1365		}
1366#endif
1367	}
1368
1369	return rt;
1370}
1371
1372static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1373{
1374	struct fib6_info *f6i = res->f6i;
1375	unsigned short flags = fib6_info_dst_flags(f6i);
1376	struct net_device *dev;
1377	struct rt6_info *pcpu_rt;
1378
1379	if (!fib6_info_hold_safe(f6i))
1380		return NULL;
1381
1382	rcu_read_lock();
1383	dev = ip6_rt_get_dev_rcu(res);
1384	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
1385	rcu_read_unlock();
1386	if (!pcpu_rt) {
1387		fib6_info_release(f6i);
1388		return NULL;
1389	}
1390	ip6_rt_copy_init(pcpu_rt, res);
1391	pcpu_rt->rt6i_flags |= RTF_PCPU;
1392
1393	if (f6i->nh)
1394		pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
1395
1396	return pcpu_rt;
1397}
1398
1399static bool rt6_is_valid(const struct rt6_info *rt6)
1400{
1401	return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
1402}
1403
1404/* It should be called with rcu_read_lock() acquired */
1405static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1406{
1407	struct rt6_info *pcpu_rt;
1408
1409	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
1410
1411	if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
1412		struct rt6_info *prev, **p;
1413
1414		p = this_cpu_ptr(res->nh->rt6i_pcpu);
1415		/* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
1416		prev = xchg(p, NULL);
1417		if (prev) {
1418			dst_dev_put(&prev->dst);
1419			dst_release(&prev->dst);
1420		}
1421
1422		pcpu_rt = NULL;
1423	}
1424
1425	return pcpu_rt;
1426}
1427
1428static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1429					    const struct fib6_result *res)
1430{
1431	struct rt6_info *pcpu_rt, *prev, **p;
1432
1433	pcpu_rt = ip6_rt_pcpu_alloc(res);
1434	if (!pcpu_rt)
1435		return NULL;
1436
1437	p = this_cpu_ptr(res->nh->rt6i_pcpu);
1438	prev = cmpxchg(p, NULL, pcpu_rt);
1439	BUG_ON(prev);
1440
1441	if (res->f6i->fib6_destroying) {
1442		struct fib6_info *from;
1443
1444		from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
1445		fib6_info_release(from);
1446	}
1447
1448	return pcpu_rt;
1449}
1450
/* Exception hash table implementation.
 *
 * rt6_exception_lock is the single spinlock taken by the functions
 * below that modify the exception buckets.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);
1454
1455/* Remove rt6_ex from hash table and free the memory
1456 * Caller must hold rt6_exception_lock
1457 */
1458static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1459				 struct rt6_exception *rt6_ex)
1460{
 
1461	struct net *net;
1462
1463	if (!bucket || !rt6_ex)
1464		return;
1465
1466	net = dev_net(rt6_ex->rt6i->dst.dev);
1467	net->ipv6.rt6_stats->fib_rt_cache--;
1468
1469	/* purge completely the exception to allow releasing the held resources:
1470	 * some [sk] cache may keep the dst around for unlimited time
1471	 */
 
 
1472	dst_dev_put(&rt6_ex->rt6i->dst);
1473
1474	hlist_del_rcu(&rt6_ex->hlist);
1475	dst_release(&rt6_ex->rt6i->dst);
1476	kfree_rcu(rt6_ex, rcu);
1477	WARN_ON_ONCE(!bucket->depth);
1478	bucket->depth--;
1479}
1480
1481/* Remove oldest rt6_ex in bucket and free the memory
1482 * Caller must hold rt6_exception_lock
1483 */
1484static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1485{
1486	struct rt6_exception *rt6_ex, *oldest = NULL;
1487
1488	if (!bucket)
1489		return;
1490
1491	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1492		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1493			oldest = rt6_ex;
1494	}
1495	rt6_remove_exception(bucket, oldest);
1496}
1497
1498static u32 rt6_exception_hash(const struct in6_addr *dst,
1499			      const struct in6_addr *src)
1500{
1501	static siphash_aligned_key_t rt6_exception_key;
1502	struct {
1503		struct in6_addr dst;
1504		struct in6_addr src;
1505	} __aligned(SIPHASH_ALIGNMENT) combined = {
1506		.dst = *dst,
1507	};
1508	u64 val;
1509
1510	net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
 
1511
1512#ifdef CONFIG_IPV6_SUBTREES
1513	if (src)
1514		combined.src = *src;
1515#endif
1516	val = siphash(&combined, sizeof(combined), &rt6_exception_key);
1517
1518	return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1519}
1520
1521/* Helper function to find the cached rt in the hash table
1522 * and update bucket pointer to point to the bucket for this
1523 * (daddr, saddr) pair
1524 * Caller must hold rt6_exception_lock
1525 */
1526static struct rt6_exception *
1527__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1528			      const struct in6_addr *daddr,
1529			      const struct in6_addr *saddr)
1530{
1531	struct rt6_exception *rt6_ex;
1532	u32 hval;
1533
1534	if (!(*bucket) || !daddr)
1535		return NULL;
1536
1537	hval = rt6_exception_hash(daddr, saddr);
1538	*bucket += hval;
1539
1540	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1541		struct rt6_info *rt6 = rt6_ex->rt6i;
1542		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1543
1544#ifdef CONFIG_IPV6_SUBTREES
1545		if (matched && saddr)
1546			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1547#endif
1548		if (matched)
1549			return rt6_ex;
1550	}
1551	return NULL;
1552}
1553
1554/* Helper function to find the cached rt in the hash table
1555 * and update bucket pointer to point to the bucket for this
1556 * (daddr, saddr) pair
1557 * Caller must hold rcu_read_lock()
1558 */
1559static struct rt6_exception *
1560__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1561			 const struct in6_addr *daddr,
1562			 const struct in6_addr *saddr)
1563{
1564	struct rt6_exception *rt6_ex;
1565	u32 hval;
1566
1567	WARN_ON_ONCE(!rcu_read_lock_held());
1568
1569	if (!(*bucket) || !daddr)
1570		return NULL;
1571
1572	hval = rt6_exception_hash(daddr, saddr);
1573	*bucket += hval;
1574
1575	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1576		struct rt6_info *rt6 = rt6_ex->rt6i;
1577		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1578
1579#ifdef CONFIG_IPV6_SUBTREES
1580		if (matched && saddr)
1581			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1582#endif
1583		if (matched)
1584			return rt6_ex;
1585	}
1586	return NULL;
1587}
1588
1589static unsigned int fib6_mtu(const struct fib6_result *res)
1590{
1591	const struct fib6_nh *nh = res->nh;
1592	unsigned int mtu;
1593
1594	if (res->f6i->fib6_pmtu) {
1595		mtu = res->f6i->fib6_pmtu;
1596	} else {
1597		struct net_device *dev = nh->fib_nh_dev;
1598		struct inet6_dev *idev;
1599
1600		rcu_read_lock();
1601		idev = __in6_dev_get(dev);
1602		mtu = READ_ONCE(idev->cnf.mtu6);
1603		rcu_read_unlock();
1604	}
1605
1606	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1607
1608	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1609}
1610
/* Tag kept in bit 0 of the nh->rt6i_exception_bucket pointer value */
#define FIB6_EXCEPTION_BUCKET_FLUSHED	0x1UL
1612
1613/* used when the flushed bit is not relevant, only access to the bucket
1614 * (ie., all bucket users except rt6_insert_exception);
1615 *
1616 * called under rcu lock; sometimes called with rt6_exception_lock held
1617 */
1618static
1619struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
1620						       spinlock_t *lock)
1621{
1622	struct rt6_exception_bucket *bucket;
1623
1624	if (lock)
1625		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1626						   lockdep_is_held(lock));
1627	else
1628		bucket = rcu_dereference(nh->rt6i_exception_bucket);
1629
1630	/* remove bucket flushed bit if set */
1631	if (bucket) {
1632		unsigned long p = (unsigned long)bucket;
1633
1634		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
1635		bucket = (struct rt6_exception_bucket *)p;
1636	}
1637
1638	return bucket;
1639}
1640
1641static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
1642{
1643	unsigned long p = (unsigned long)bucket;
1644
1645	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
1646}
1647
1648/* called with rt6_exception_lock held */
1649static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
1650					      spinlock_t *lock)
1651{
1652	struct rt6_exception_bucket *bucket;
1653	unsigned long p;
1654
1655	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1656					   lockdep_is_held(lock));
1657
1658	p = (unsigned long)bucket;
1659	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
1660	bucket = (struct rt6_exception_bucket *)p;
1661	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1662}
1663
1664static int rt6_insert_exception(struct rt6_info *nrt,
1665				const struct fib6_result *res)
1666{
1667	struct net *net = dev_net(nrt->dst.dev);
1668	struct rt6_exception_bucket *bucket;
1669	struct fib6_info *f6i = res->f6i;
1670	struct in6_addr *src_key = NULL;
1671	struct rt6_exception *rt6_ex;
1672	struct fib6_nh *nh = res->nh;
1673	int max_depth;
1674	int err = 0;
1675
1676	spin_lock_bh(&rt6_exception_lock);
1677
1678	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1679					  lockdep_is_held(&rt6_exception_lock));
1680	if (!bucket) {
1681		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1682				 GFP_ATOMIC);
1683		if (!bucket) {
1684			err = -ENOMEM;
1685			goto out;
1686		}
1687		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1688	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
1689		err = -EINVAL;
1690		goto out;
1691	}
1692
1693#ifdef CONFIG_IPV6_SUBTREES
1694	/* fib6_src.plen != 0 indicates f6i is in subtree
1695	 * and exception table is indexed by a hash of
1696	 * both fib6_dst and fib6_src.
1697	 * Otherwise, the exception table is indexed by
1698	 * a hash of only fib6_dst.
1699	 */
1700	if (f6i->fib6_src.plen)
1701		src_key = &nrt->rt6i_src.addr;
1702#endif
1703	/* rt6_mtu_change() might lower mtu on f6i.
1704	 * Only insert this exception route if its mtu
1705	 * is less than f6i's mtu value.
1706	 */
1707	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1708		err = -EINVAL;
1709		goto out;
1710	}
1711
1712	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1713					       src_key);
1714	if (rt6_ex)
1715		rt6_remove_exception(bucket, rt6_ex);
1716
1717	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1718	if (!rt6_ex) {
1719		err = -ENOMEM;
1720		goto out;
1721	}
1722	rt6_ex->rt6i = nrt;
1723	rt6_ex->stamp = jiffies;
1724	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1725	bucket->depth++;
1726	net->ipv6.rt6_stats->fib_rt_cache++;
1727
1728	/* Randomize max depth to avoid some side channels attacks. */
1729	max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
1730	while (bucket->depth > max_depth)
1731		rt6_exception_remove_oldest(bucket);
1732
1733out:
1734	spin_unlock_bh(&rt6_exception_lock);
1735
1736	/* Update fn->fn_sernum to invalidate all cached dst */
1737	if (!err) {
1738		spin_lock_bh(&f6i->fib6_table->tb6_lock);
1739		fib6_update_sernum(net, f6i);
1740		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1741		fib6_force_start_gc(net);
1742	}
1743
1744	return err;
1745}
1746
1747static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
1748{
1749	struct rt6_exception_bucket *bucket;
1750	struct rt6_exception *rt6_ex;
1751	struct hlist_node *tmp;
1752	int i;
1753
1754	spin_lock_bh(&rt6_exception_lock);
1755
1756	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1757	if (!bucket)
1758		goto out;
1759
1760	/* Prevent rt6_insert_exception() to recreate the bucket list */
1761	if (!from)
1762		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
1763
1764	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1765		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
1766			if (!from ||
1767			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
1768				rt6_remove_exception(bucket, rt6_ex);
1769		}
1770		WARN_ON_ONCE(!from && bucket->depth);
1771		bucket++;
1772	}
1773out:
1774	spin_unlock_bh(&rt6_exception_lock);
1775}
1776
/* nexthop_for_each_fib6_nh() callback: flush the exceptions of @nh that
 * belong to the fib6_info passed in @arg.  Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	fib6_nh_flush_exceptions(nh, (struct fib6_info *)arg);

	return 0;
}
1785
1786void rt6_flush_exceptions(struct fib6_info *f6i)
1787{
1788	if (f6i->nh)
1789		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
1790					 f6i);
1791	else
1792		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
1793}
1794
1795/* Find cached rt in the hash table inside passed in rt
1796 * Caller has to hold rcu_read_lock()
1797 */
1798static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1799					   const struct in6_addr *daddr,
1800					   const struct in6_addr *saddr)
1801{
1802	const struct in6_addr *src_key = NULL;
1803	struct rt6_exception_bucket *bucket;
1804	struct rt6_exception *rt6_ex;
1805	struct rt6_info *ret = NULL;
1806
1807#ifdef CONFIG_IPV6_SUBTREES
1808	/* fib6i_src.plen != 0 indicates f6i is in subtree
1809	 * and exception table is indexed by a hash of
1810	 * both fib6_dst and fib6_src.
1811	 * However, the src addr used to create the hash
1812	 * might not be exactly the passed in saddr which
1813	 * is a /128 addr from the flow.
1814	 * So we need to use f6i->fib6_src to redo lookup
1815	 * if the passed in saddr does not find anything.
1816	 * (See the logic in ip6_rt_cache_alloc() on how
1817	 * rt->rt6i_src is updated.)
1818	 */
1819	if (res->f6i->fib6_src.plen)
1820		src_key = saddr;
1821find_ex:
1822#endif
1823	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
1824	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1825
1826	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1827		ret = rt6_ex->rt6i;
1828
1829#ifdef CONFIG_IPV6_SUBTREES
1830	/* Use fib6_src as src_key and redo lookup */
1831	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1832		src_key = &res->f6i->fib6_src.addr;
1833		goto find_ex;
1834	}
1835#endif
1836
1837	return ret;
1838}
1839
1840/* Remove the passed in cached rt from the hash table that contains it */
1841static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
1842				    const struct rt6_info *rt)
1843{
1844	const struct in6_addr *src_key = NULL;
1845	struct rt6_exception_bucket *bucket;
1846	struct rt6_exception *rt6_ex;
1847	int err;
1848
1849	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
1850		return -ENOENT;
1851
1852	spin_lock_bh(&rt6_exception_lock);
1853	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1854
1855#ifdef CONFIG_IPV6_SUBTREES
1856	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1857	 * and exception table is indexed by a hash of
1858	 * both rt6i_dst and rt6i_src.
1859	 * Otherwise, the exception table is indexed by
1860	 * a hash of only rt6i_dst.
1861	 */
1862	if (plen)
1863		src_key = &rt->rt6i_src.addr;
1864#endif
1865	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1866					       &rt->rt6i_dst.addr,
1867					       src_key);
1868	if (rt6_ex) {
1869		rt6_remove_exception(bucket, rt6_ex);
1870		err = 0;
1871	} else {
1872		err = -ENOENT;
1873	}
1874
1875	spin_unlock_bh(&rt6_exception_lock);
1876	return err;
1877}
1878
/* Argument block for rt6_nh_remove_exception_rt().
 * @rt:   cached route to remove
 * @plen: source prefix length of the fib entry the route came from
 */
struct fib6_nh_excptn_arg {
	struct rt6_info	*rt;
	int		plen;
};
1883
1884static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
1885{
1886	struct fib6_nh_excptn_arg *arg = _arg;
1887	int err;
1888
1889	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
1890	if (err == 0)
1891		return 1;
1892
1893	return 0;
1894}
1895
1896static int rt6_remove_exception_rt(struct rt6_info *rt)
1897{
1898	struct fib6_info *from;
1899
1900	from = rcu_dereference(rt->from);
1901	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1902		return -EINVAL;
1903
1904	if (from->nh) {
1905		struct fib6_nh_excptn_arg arg = {
1906			.rt = rt,
1907			.plen = from->fib6_src.plen
1908		};
1909		int rc;
1910
1911		/* rc = 1 means an entry was found */
1912		rc = nexthop_for_each_fib6_nh(from->nh,
1913					      rt6_nh_remove_exception_rt,
1914					      &arg);
1915		return rc ? 0 : -ENOENT;
1916	}
1917
1918	return fib6_nh_remove_exception(from->fib6_nh,
1919					from->fib6_src.plen, rt);
1920}
1921
1922/* Find rt6_ex which contains the passed in rt cache and
1923 * refresh its stamp
1924 */
1925static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
1926				     const struct rt6_info *rt)
1927{
1928	const struct in6_addr *src_key = NULL;
1929	struct rt6_exception_bucket *bucket;
1930	struct rt6_exception *rt6_ex;
1931
1932	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
1933#ifdef CONFIG_IPV6_SUBTREES
1934	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1935	 * and exception table is indexed by a hash of
1936	 * both rt6i_dst and rt6i_src.
1937	 * Otherwise, the exception table is indexed by
1938	 * a hash of only rt6i_dst.
1939	 */
1940	if (plen)
1941		src_key = &rt->rt6i_src.addr;
1942#endif
1943	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
1944	if (rt6_ex)
1945		rt6_ex->stamp = jiffies;
1946}
1947
/* Argument block for fib6_nh_find_match().
 * @dev:   device the nexthop must use
 * @gw:    gateway the nexthop must have, or NULL for "no gateway"
 * @match: set by the callback to the matching fib6_nh
 */
struct fib6_nh_match_arg {
	const struct net_device *dev;
	const struct in6_addr	*gw;
	struct fib6_nh		*match;
};
1953
1954/* determine if fib6_nh has given device and gateway */
1955static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
1956{
1957	struct fib6_nh_match_arg *arg = _arg;
1958
1959	if (arg->dev != nh->fib_nh_dev ||
1960	    (arg->gw && !nh->fib_nh_gw_family) ||
1961	    (!arg->gw && nh->fib_nh_gw_family) ||
1962	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
1963		return 0;
1964
1965	arg->match = nh;
1966
1967	/* found a match, break the loop */
1968	return 1;
1969}
1970
1971static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1972{
1973	struct fib6_info *from;
1974	struct fib6_nh *fib6_nh;
1975
1976	rcu_read_lock();
1977
1978	from = rcu_dereference(rt->from);
1979	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1980		goto unlock;
1981
1982	if (from->nh) {
1983		struct fib6_nh_match_arg arg = {
1984			.dev = rt->dst.dev,
1985			.gw = &rt->rt6i_gateway,
1986		};
1987
1988		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
1989
1990		if (!arg.match)
1991			goto unlock;
1992		fib6_nh = arg.match;
1993	} else {
1994		fib6_nh = from->fib6_nh;
1995	}
1996	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
1997unlock:
1998	rcu_read_unlock();
1999}
2000
2001static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
2002					 struct rt6_info *rt, int mtu)
2003{
2004	/* If the new MTU is lower than the route PMTU, this new MTU will be the
2005	 * lowest MTU in the path: always allow updating the route PMTU to
2006	 * reflect PMTU decreases.
2007	 *
2008	 * If the new MTU is higher, and the route PMTU is equal to the local
2009	 * MTU, this means the old MTU is the lowest in the path, so allow
2010	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
2011	 * handle this.
2012	 */
2013
2014	if (dst_mtu(&rt->dst) >= mtu)
2015		return true;
2016
2017	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
2018		return true;
2019
2020	return false;
2021}
2022
2023static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
2024				       const struct fib6_nh *nh, int mtu)
2025{
2026	struct rt6_exception_bucket *bucket;
2027	struct rt6_exception *rt6_ex;
2028	int i;
2029
2030	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2031	if (!bucket)
2032		return;
2033
2034	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2035		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
2036			struct rt6_info *entry = rt6_ex->rt6i;
2037
2038			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
2039			 * route), the metrics of its rt->from have already
2040			 * been updated.
2041			 */
2042			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
2043			    rt6_mtu_change_route_allowed(idev, entry, mtu))
2044				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
2045		}
2046		bucket++;
2047	}
2048}
2049
/* Flag combination identifying a cached route that goes via a gateway */
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2051
2052static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
2053					    const struct in6_addr *gateway)
2054{
2055	struct rt6_exception_bucket *bucket;
2056	struct rt6_exception *rt6_ex;
2057	struct hlist_node *tmp;
2058	int i;
2059
2060	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2061		return;
2062
2063	spin_lock_bh(&rt6_exception_lock);
2064	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2065	if (bucket) {
2066		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2067			hlist_for_each_entry_safe(rt6_ex, tmp,
2068						  &bucket->chain, hlist) {
2069				struct rt6_info *entry = rt6_ex->rt6i;
2070
2071				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
2072				    RTF_CACHE_GATEWAY &&
2073				    ipv6_addr_equal(gateway,
2074						    &entry->rt6i_gateway)) {
2075					rt6_remove_exception(bucket, rt6_ex);
2076				}
2077			}
2078			bucket++;
2079		}
2080	}
2081
2082	spin_unlock_bh(&rt6_exception_lock);
2083}
2084
2085static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
2086				      struct rt6_exception *rt6_ex,
2087				      struct fib6_gc_args *gc_args,
2088				      unsigned long now)
2089{
2090	struct rt6_info *rt = rt6_ex->rt6i;
2091
2092	/* we are pruning and obsoleting aged-out and non gateway exceptions
2093	 * even if others have still references to them, so that on next
2094	 * dst_check() such references can be dropped.
2095	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
2096	 * expired, independently from their aging, as per RFC 8201 section 4
2097	 */
2098	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
2099		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
2100			pr_debug("aging clone %p\n", rt);
2101			rt6_remove_exception(bucket, rt6_ex);
2102			return;
2103		}
2104	} else if (time_after(jiffies, rt->dst.expires)) {
2105		pr_debug("purging expired route %p\n", rt);
2106		rt6_remove_exception(bucket, rt6_ex);
2107		return;
2108	}
2109
2110	if (rt->rt6i_flags & RTF_GATEWAY) {
2111		struct neighbour *neigh;
 
2112
2113		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
 
 
2114
2115		if (!(neigh && (neigh->flags & NTF_ROUTER))) {
2116			pr_debug("purging route %p via non-router but gateway\n",
2117				 rt);
2118			rt6_remove_exception(bucket, rt6_ex);
2119			return;
2120		}
2121	}
2122
2123	gc_args->more++;
2124}
2125
2126static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
2127				   struct fib6_gc_args *gc_args,
2128				   unsigned long now)
2129{
2130	struct rt6_exception_bucket *bucket;
2131	struct rt6_exception *rt6_ex;
2132	struct hlist_node *tmp;
2133	int i;
2134
2135	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2136		return;
2137
2138	rcu_read_lock_bh();
2139	spin_lock(&rt6_exception_lock);
2140	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2141	if (bucket) {
2142		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2143			hlist_for_each_entry_safe(rt6_ex, tmp,
2144						  &bucket->chain, hlist) {
2145				rt6_age_examine_exception(bucket, rt6_ex,
2146							  gc_args, now);
2147			}
2148			bucket++;
2149		}
2150	}
2151	spin_unlock(&rt6_exception_lock);
2152	rcu_read_unlock_bh();
2153}
2154
/* Argument block for rt6_nh_age_exceptions().
 * @gc_args: garbage collection arguments handed on to the ageing code
 * @now:     time stamp handed on to the ageing code
 */
struct fib6_nh_age_excptn_arg {
	struct fib6_gc_args	*gc_args;
	unsigned long		now;
};
2159
2160static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
2161{
2162	struct fib6_nh_age_excptn_arg *arg = _arg;
2163
2164	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
2165	return 0;
2166}
2167
2168void rt6_age_exceptions(struct fib6_info *f6i,
2169			struct fib6_gc_args *gc_args,
2170			unsigned long now)
2171{
2172	if (f6i->nh) {
2173		struct fib6_nh_age_excptn_arg arg = {
2174			.gc_args = gc_args,
2175			.now = now
2176		};
2177
2178		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
2179					 &arg);
2180	} else {
2181		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
2182	}
2183}
2184
2185/* must be called with rcu lock held */
2186int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
2187		      struct flowi6 *fl6, struct fib6_result *res, int strict)
2188{
2189	struct fib6_node *fn, *saved_fn;
2190
2191	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2192	saved_fn = fn;
2193
 
 
 
2194redo_rt6_select:
2195	rt6_select(net, fn, oif, res, strict);
2196	if (res->f6i == net->ipv6.fib6_null_entry) {
2197		fn = fib6_backtrack(fn, &fl6->saddr);
2198		if (fn)
2199			goto redo_rt6_select;
2200		else if (strict & RT6_LOOKUP_F_REACHABLE) {
2201			/* also consider unreachable route */
2202			strict &= ~RT6_LOOKUP_F_REACHABLE;
2203			fn = saved_fn;
2204			goto redo_rt6_select;
2205		}
2206	}
2207
2208	trace_fib6_table_lookup(net, res, table, fl6);
2209
2210	return 0;
2211}
2212
2213struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
2214			       int oif, struct flowi6 *fl6,
2215			       const struct sk_buff *skb, int flags)
2216{
2217	struct fib6_result res = {};
2218	struct rt6_info *rt = NULL;
2219	int strict = 0;
2220
2221	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
2222		     !rcu_read_lock_held());
2223
2224	strict |= flags & RT6_LOOKUP_F_IFACE;
2225	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
2226	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
2227		strict |= RT6_LOOKUP_F_REACHABLE;
2228
2229	rcu_read_lock();
2230
2231	fib6_table_lookup(net, table, oif, fl6, &res, strict);
2232	if (res.f6i == net->ipv6.fib6_null_entry)
2233		goto out;
2234
2235	fib6_select_path(net, &res, fl6, oif, false, skb, strict);
2236
2237	/*Search through exception table */
2238	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
2239	if (rt) {
2240		goto out;
2241	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
2242			    !res.nh->fib_nh_gw_family)) {
2243		/* Create a RTF_CACHE clone which will not be
2244		 * owned by the fib6 tree.  It is for the special case where
2245		 * the daddr in the skb during the neighbor look-up is different
2246		 * from the fl6->daddr used to look-up route here.
2247		 */
2248		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
2249
2250		if (rt) {
2251			/* 1 refcnt is taken during ip6_rt_cache_alloc().
2252			 * As rt6_uncached_list_add() does not consume refcnt,
2253			 * this refcnt is always returned to the caller even
2254			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
2255			 */
2256			rt6_uncached_list_add(rt);
 
2257			rcu_read_unlock();
2258
2259			return rt;
2260		}
2261	} else {
2262		/* Get a percpu copy */
2263		local_bh_disable();
2264		rt = rt6_get_pcpu_route(&res);
2265
2266		if (!rt)
2267			rt = rt6_make_pcpu_route(net, &res);
2268
2269		local_bh_enable();
2270	}
2271out:
2272	if (!rt)
2273		rt = net->ipv6.ip6_null_entry;
2274	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
2275		ip6_hold_safe(net, &rt);
2276	rcu_read_unlock();
2277
2278	return rt;
2279}
2280EXPORT_SYMBOL_GPL(ip6_pol_route);
2281
2282INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
2283					    struct fib6_table *table,
2284					    struct flowi6 *fl6,
2285					    const struct sk_buff *skb,
2286					    int flags)
2287{
2288	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
2289}
2290
2291struct dst_entry *ip6_route_input_lookup(struct net *net,
2292					 struct net_device *dev,
2293					 struct flowi6 *fl6,
2294					 const struct sk_buff *skb,
2295					 int flags)
2296{
2297	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
2298		flags |= RT6_LOOKUP_F_IFACE;
2299
2300	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
2301}
2302EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
2303
2304static void ip6_multipath_l3_keys(const struct sk_buff *skb,
2305				  struct flow_keys *keys,
2306				  struct flow_keys *flkeys)
2307{
2308	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
2309	const struct ipv6hdr *key_iph = outer_iph;
2310	struct flow_keys *_flkeys = flkeys;
2311	const struct ipv6hdr *inner_iph;
2312	const struct icmp6hdr *icmph;
2313	struct ipv6hdr _inner_iph;
2314	struct icmp6hdr _icmph;
2315
2316	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2317		goto out;
2318
2319	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2320				   sizeof(_icmph), &_icmph);
2321	if (!icmph)
2322		goto out;
2323
2324	if (!icmpv6_is_err(icmph->icmp6_type))
 
 
 
2325		goto out;
2326
2327	inner_iph = skb_header_pointer(skb,
2328				       skb_transport_offset(skb) + sizeof(*icmph),
2329				       sizeof(_inner_iph), &_inner_iph);
2330	if (!inner_iph)
2331		goto out;
2332
2333	key_iph = inner_iph;
2334	_flkeys = NULL;
2335out:
2336	if (_flkeys) {
2337		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2338		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2339		keys->tags.flow_label = _flkeys->tags.flow_label;
2340		keys->basic.ip_proto = _flkeys->basic.ip_proto;
2341	} else {
2342		keys->addrs.v6addrs.src = key_iph->saddr;
2343		keys->addrs.v6addrs.dst = key_iph->daddr;
2344		keys->tags.flow_label = ip6_flowlabel(key_iph);
2345		keys->basic.ip_proto = key_iph->nexthdr;
2346	}
2347}
2348
2349static u32 rt6_multipath_custom_hash_outer(const struct net *net,
2350					   const struct sk_buff *skb,
2351					   bool *p_has_inner)
2352{
2353	u32 hash_fields = ip6_multipath_hash_fields(net);
2354	struct flow_keys keys, hash_keys;
2355
2356	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2357		return 0;
2358
2359	memset(&hash_keys, 0, sizeof(hash_keys));
2360	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
2361
2362	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2363	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2364		hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2365	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2366		hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2367	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2368		hash_keys.basic.ip_proto = keys.basic.ip_proto;
2369	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
2370		hash_keys.tags.flow_label = keys.tags.flow_label;
2371	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2372		hash_keys.ports.src = keys.ports.src;
2373	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2374		hash_keys.ports.dst = keys.ports.dst;
2375
2376	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
2377	return fib_multipath_hash_from_keys(net, &hash_keys);
2378}
2379
2380static u32 rt6_multipath_custom_hash_inner(const struct net *net,
2381					   const struct sk_buff *skb,
2382					   bool has_inner)
2383{
2384	u32 hash_fields = ip6_multipath_hash_fields(net);
2385	struct flow_keys keys, hash_keys;
2386
2387	/* We assume the packet carries an encapsulation, but if none was
2388	 * encountered during dissection of the outer flow, then there is no
2389	 * point in calling the flow dissector again.
2390	 */
2391	if (!has_inner)
2392		return 0;
2393
2394	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
2395		return 0;
2396
2397	memset(&hash_keys, 0, sizeof(hash_keys));
2398	skb_flow_dissect_flow_keys(skb, &keys, 0);
2399
2400	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
2401		return 0;
2402
2403	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2404		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2405		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
2406			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2407		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
2408			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2409	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2410		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2411		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
2412			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2413		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
2414			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2415		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
2416			hash_keys.tags.flow_label = keys.tags.flow_label;
2417	}
2418
2419	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
2420		hash_keys.basic.ip_proto = keys.basic.ip_proto;
2421	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
2422		hash_keys.ports.src = keys.ports.src;
2423	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
2424		hash_keys.ports.dst = keys.ports.dst;
2425
2426	return fib_multipath_hash_from_keys(net, &hash_keys);
2427}
2428
2429static u32 rt6_multipath_custom_hash_skb(const struct net *net,
2430					 const struct sk_buff *skb)
2431{
2432	u32 mhash, mhash_inner;
2433	bool has_inner = true;
2434
2435	mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
2436	mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);
2437
2438	return jhash_2words(mhash, mhash_inner, 0);
2439}
2440
2441static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
2442					 const struct flowi6 *fl6)
2443{
2444	u32 hash_fields = ip6_multipath_hash_fields(net);
2445	struct flow_keys hash_keys;
2446
2447	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2448		return 0;
2449
2450	memset(&hash_keys, 0, sizeof(hash_keys));
2451	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2452	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2453		hash_keys.addrs.v6addrs.src = fl6->saddr;
2454	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2455		hash_keys.addrs.v6addrs.dst = fl6->daddr;
2456	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2457		hash_keys.basic.ip_proto = fl6->flowi6_proto;
2458	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
2459		hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2460	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2461		hash_keys.ports.src = fl6->fl6_sport;
2462	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2463		hash_keys.ports.dst = fl6->fl6_dport;
2464
2465	return fib_multipath_hash_from_keys(net, &hash_keys);
2466}
2467
2468/* if skb is set it will be used and fl6 can be NULL */
2469u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2470		       const struct sk_buff *skb, struct flow_keys *flkeys)
2471{
2472	struct flow_keys hash_keys;
2473	u32 mhash = 0;
2474
2475	switch (ip6_multipath_hash_policy(net)) {
2476	case 0:
2477		memset(&hash_keys, 0, sizeof(hash_keys));
2478		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2479		if (skb) {
2480			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2481		} else {
2482			hash_keys.addrs.v6addrs.src = fl6->saddr;
2483			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2484			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2485			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2486		}
2487		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
2488		break;
2489	case 1:
2490		if (skb) {
2491			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2492			struct flow_keys keys;
2493
2494			/* short-circuit if we already have L4 hash present */
2495			if (skb->l4_hash)
2496				return skb_get_hash_raw(skb) >> 1;
2497
2498			memset(&hash_keys, 0, sizeof(hash_keys));
2499
2500			if (!flkeys) {
2501				skb_flow_dissect_flow_keys(skb, &keys, flag);
2502				flkeys = &keys;
2503			}
2504			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2505			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2506			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2507			hash_keys.ports.src = flkeys->ports.src;
2508			hash_keys.ports.dst = flkeys->ports.dst;
2509			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2510		} else {
2511			memset(&hash_keys, 0, sizeof(hash_keys));
2512			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2513			hash_keys.addrs.v6addrs.src = fl6->saddr;
2514			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2515			hash_keys.ports.src = fl6->fl6_sport;
2516			hash_keys.ports.dst = fl6->fl6_dport;
2517			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2518		}
2519		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
2520		break;
2521	case 2:
2522		memset(&hash_keys, 0, sizeof(hash_keys));
2523		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2524		if (skb) {
2525			struct flow_keys keys;
2526
2527			if (!flkeys) {
2528				skb_flow_dissect_flow_keys(skb, &keys, 0);
2529				flkeys = &keys;
2530			}
2531
2532			/* Inner can be v4 or v6 */
2533			if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2534				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2535				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2536				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2537			} else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2538				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2539				hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2540				hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2541				hash_keys.tags.flow_label = flkeys->tags.flow_label;
2542				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2543			} else {
2544				/* Same as case 0 */
2545				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2546				ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2547			}
2548		} else {
2549			/* Same as case 0 */
2550			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2551			hash_keys.addrs.v6addrs.src = fl6->saddr;
2552			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2553			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2554			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2555		}
2556		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
2557		break;
2558	case 3:
2559		if (skb)
2560			mhash = rt6_multipath_custom_hash_skb(net, skb);
2561		else
2562			mhash = rt6_multipath_custom_hash_fl6(net, fl6);
2563		break;
2564	}
 
2565
2566	return mhash >> 1;
2567}
2568
2569/* Called with rcu held */
2570void ip6_route_input(struct sk_buff *skb)
2571{
2572	const struct ipv6hdr *iph = ipv6_hdr(skb);
2573	struct net *net = dev_net(skb->dev);
2574	int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
2575	struct ip_tunnel_info *tun_info;
2576	struct flowi6 fl6 = {
2577		.flowi6_iif = skb->dev->ifindex,
2578		.daddr = iph->daddr,
2579		.saddr = iph->saddr,
2580		.flowlabel = ip6_flowinfo(iph),
2581		.flowi6_mark = skb->mark,
2582		.flowi6_proto = iph->nexthdr,
2583	};
2584	struct flow_keys *flkeys = NULL, _flkeys;
2585
2586	tun_info = skb_tunnel_info(skb);
2587	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2588		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2589
2590	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2591		flkeys = &_flkeys;
2592
2593	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2594		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2595	skb_dst_drop(skb);
2596	skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
2597						      &fl6, skb, flags));
2598}
2599
2600INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
2601					     struct fib6_table *table,
2602					     struct flowi6 *fl6,
2603					     const struct sk_buff *skb,
2604					     int flags)
2605{
2606	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2607}
2608
2609static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
2610						      const struct sock *sk,
2611						      struct flowi6 *fl6,
2612						      int flags)
2613{
2614	bool any_src;
2615
2616	if (ipv6_addr_type(&fl6->daddr) &
2617	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2618		struct dst_entry *dst;
2619
2620		/* This function does not take refcnt on the dst */
2621		dst = l3mdev_link_scope_lookup(net, fl6);
2622		if (dst)
2623			return dst;
2624	}
2625
2626	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2627
2628	flags |= RT6_LOOKUP_F_DST_NOREF;
2629	any_src = ipv6_addr_any(&fl6->saddr);
2630	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2631	    (fl6->flowi6_oif && any_src))
2632		flags |= RT6_LOOKUP_F_IFACE;
2633
2634	if (!any_src)
2635		flags |= RT6_LOOKUP_F_HAS_SADDR;
2636	else if (sk)
2637		flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
2638
2639	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2640}
 
2641
2642struct dst_entry *ip6_route_output_flags(struct net *net,
2643					 const struct sock *sk,
2644					 struct flowi6 *fl6,
2645					 int flags)
2646{
2647	struct dst_entry *dst;
2648	struct rt6_info *rt6;
2649
2650	rcu_read_lock();
2651	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
2652	rt6 = dst_rt6_info(dst);
2653	/* For dst cached in uncached_list, refcnt is already taken. */
2654	if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
2655		dst = &net->ipv6.ip6_null_entry->dst;
2656		dst_hold(dst);
2657	}
2658	rcu_read_unlock();
2659
2660	return dst;
2661}
2662EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2663
2664struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2665{
2666	struct rt6_info *rt, *ort = dst_rt6_info(dst_orig);
2667	struct net_device *loopback_dev = net->loopback_dev;
2668	struct dst_entry *new = NULL;
2669
2670	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
2671		       DST_OBSOLETE_DEAD, 0);
2672	if (rt) {
2673		rt6_info_init(rt);
2674		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2675
2676		new = &rt->dst;
2677		new->__use = 1;
2678		new->input = dst_discard;
2679		new->output = dst_discard_out;
2680
2681		dst_copy_metrics(new, &ort->dst);
2682
2683		rt->rt6i_idev = in6_dev_get(loopback_dev);
2684		rt->rt6i_gateway = ort->rt6i_gateway;
2685		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2686
2687		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2688#ifdef CONFIG_IPV6_SUBTREES
2689		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2690#endif
2691	}
2692
2693	dst_release(dst_orig);
2694	return new ? new : ERR_PTR(-ENOMEM);
2695}
2696
2697/*
2698 *	Destination cache support functions
2699 */
2700
2701static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2702{
2703	u32 rt_cookie = 0;
2704
2705	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2706		return false;
2707
2708	if (fib6_check_expired(f6i))
2709		return false;
2710
2711	return true;
2712}
2713
2714static struct dst_entry *rt6_check(struct rt6_info *rt,
2715				   struct fib6_info *from,
2716				   u32 cookie)
2717{
2718	u32 rt_cookie = 0;
2719
2720	if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
2721	    rt_cookie != cookie)
2722		return NULL;
2723
2724	if (rt6_check_expired(rt))
2725		return NULL;
2726
2727	return &rt->dst;
2728}
2729
2730static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2731					    struct fib6_info *from,
2732					    u32 cookie)
2733{
2734	if (!__rt6_check_expired(rt) &&
2735	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2736	    fib6_check(from, cookie))
2737		return &rt->dst;
2738	else
2739		return NULL;
2740}
2741
2742INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
2743							u32 cookie)
2744{
2745	struct dst_entry *dst_ret;
2746	struct fib6_info *from;
2747	struct rt6_info *rt;
2748
2749	rt = dst_rt6_info(dst);
2750
2751	if (rt->sernum)
2752		return rt6_is_valid(rt) ? dst : NULL;
2753
2754	rcu_read_lock();
2755
2756	/* All IPV6 dsts are created with ->obsolete set to the value
2757	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2758	 * into this function always.
2759	 */
2760
2761	from = rcu_dereference(rt->from);
2762
2763	if (from && (rt->rt6i_flags & RTF_PCPU ||
2764	    unlikely(!list_empty(&rt->dst.rt_uncached))))
2765		dst_ret = rt6_dst_from_check(rt, from, cookie);
2766	else
2767		dst_ret = rt6_check(rt, from, cookie);
2768
2769	rcu_read_unlock();
2770
2771	return dst_ret;
2772}
2773EXPORT_INDIRECT_CALLABLE(ip6_dst_check);
2774
2775static void ip6_negative_advice(struct sock *sk,
2776				struct dst_entry *dst)
2777{
2778	struct rt6_info *rt = dst_rt6_info(dst);
2779
2780	if (rt->rt6i_flags & RTF_CACHE) {
2781		rcu_read_lock();
2782		if (rt6_check_expired(rt)) {
2783			/* rt/dst can not be destroyed yet,
2784			 * because of rcu_read_lock()
2785			 */
2786			sk_dst_reset(sk);
2787			rt6_remove_exception_rt(rt);
 
 
 
2788		}
2789		rcu_read_unlock();
2790		return;
2791	}
2792	sk_dst_reset(sk);
2793}
2794
2795static void ip6_link_failure(struct sk_buff *skb)
2796{
2797	struct rt6_info *rt;
2798
2799	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2800
2801	rt = dst_rt6_info(skb_dst(skb));
2802	if (rt) {
2803		rcu_read_lock();
2804		if (rt->rt6i_flags & RTF_CACHE) {
2805			rt6_remove_exception_rt(rt);
2806		} else {
2807			struct fib6_info *from;
2808			struct fib6_node *fn;
2809
2810			from = rcu_dereference(rt->from);
2811			if (from) {
2812				fn = rcu_dereference(from->fib6_node);
2813				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2814					WRITE_ONCE(fn->fn_sernum, -1);
2815			}
2816		}
2817		rcu_read_unlock();
2818	}
2819}
2820
2821static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2822{
2823	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2824		struct fib6_info *from;
2825
2826		rcu_read_lock();
2827		from = rcu_dereference(rt0->from);
2828		if (from)
2829			rt0->dst.expires = from->expires;
2830		rcu_read_unlock();
2831	}
2832
2833	dst_set_expires(&rt0->dst, timeout);
2834	rt0->rt6i_flags |= RTF_EXPIRES;
2835}
2836
2837static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2838{
2839	struct net *net = dev_net(rt->dst.dev);
2840
2841	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2842	rt->rt6i_flags |= RTF_MODIFIED;
2843	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2844}
2845
2846static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2847{
2848	return !(rt->rt6i_flags & RTF_CACHE) &&
2849		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2850}
2851
2852static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2853				 const struct ipv6hdr *iph, u32 mtu,
2854				 bool confirm_neigh)
2855{
2856	const struct in6_addr *daddr, *saddr;
2857	struct rt6_info *rt6 = dst_rt6_info(dst);
2858
2859	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
2860	 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
2861	 * [see also comment in rt6_mtu_change_route()]
2862	 */
2863
2864	if (iph) {
2865		daddr = &iph->daddr;
2866		saddr = &iph->saddr;
2867	} else if (sk) {
2868		daddr = &sk->sk_v6_daddr;
2869		saddr = &inet6_sk(sk)->saddr;
2870	} else {
2871		daddr = NULL;
2872		saddr = NULL;
2873	}
2874
2875	if (confirm_neigh)
2876		dst_confirm_neigh(dst, daddr);
2877
2878	if (mtu < IPV6_MIN_MTU)
2879		return;
2880	if (mtu >= dst_mtu(dst))
2881		return;
2882
2883	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2884		rt6_do_update_pmtu(rt6, mtu);
2885		/* update rt6_ex->stamp for cache */
2886		if (rt6->rt6i_flags & RTF_CACHE)
2887			rt6_update_exception_stamp_rt(rt6);
2888	} else if (daddr) {
2889		struct fib6_result res = {};
2890		struct rt6_info *nrt6;
2891
2892		rcu_read_lock();
2893		res.f6i = rcu_dereference(rt6->from);
2894		if (!res.f6i)
2895			goto out_unlock;
2896
2897		res.fib6_flags = res.f6i->fib6_flags;
2898		res.fib6_type = res.f6i->fib6_type;
2899
2900		if (res.f6i->nh) {
2901			struct fib6_nh_match_arg arg = {
2902				.dev = dst->dev,
2903				.gw = &rt6->rt6i_gateway,
2904			};
2905
2906			nexthop_for_each_fib6_nh(res.f6i->nh,
2907						 fib6_nh_find_match, &arg);
2908
2909			/* fib6_info uses a nexthop that does not have fib6_nh
2910			 * using the dst->dev + gw. Should be impossible.
2911			 */
2912			if (!arg.match)
2913				goto out_unlock;
2914
2915			res.nh = arg.match;
2916		} else {
2917			res.nh = res.f6i->fib6_nh;
2918		}
2919
2920		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
2921		if (nrt6) {
2922			rt6_do_update_pmtu(nrt6, mtu);
2923			if (rt6_insert_exception(nrt6, &res))
2924				dst_release_immediate(&nrt6->dst);
2925		}
2926out_unlock:
2927		rcu_read_unlock();
2928	}
2929}
2930
2931static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2932			       struct sk_buff *skb, u32 mtu,
2933			       bool confirm_neigh)
2934{
2935	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
2936			     confirm_neigh);
2937}
2938
2939void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2940		     int oif, u32 mark, kuid_t uid)
2941{
2942	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2943	struct dst_entry *dst;
2944	struct flowi6 fl6 = {
2945		.flowi6_oif = oif,
2946		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2947		.daddr = iph->daddr,
2948		.saddr = iph->saddr,
2949		.flowlabel = ip6_flowinfo(iph),
2950		.flowi6_uid = uid,
2951	};
2952
2953	dst = ip6_route_output(net, NULL, &fl6);
2954	if (!dst->error)
2955		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
2956	dst_release(dst);
2957}
2958EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2959
2960void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2961{
2962	int oif = sk->sk_bound_dev_if;
2963	struct dst_entry *dst;
2964
2965	if (!oif && skb->dev)
2966		oif = l3mdev_master_ifindex(skb->dev);
2967
2968	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
2969			sk->sk_uid);
2970
2971	dst = __sk_dst_get(sk);
2972	if (!dst || !dst->obsolete ||
2973	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2974		return;
2975
2976	bh_lock_sock(sk);
2977	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2978		ip6_datagram_dst_update(sk, false);
2979	bh_unlock_sock(sk);
2980}
2981EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2982
2983void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2984			   const struct flowi6 *fl6)
2985{
2986#ifdef CONFIG_IPV6_SUBTREES
2987	struct ipv6_pinfo *np = inet6_sk(sk);
2988#endif
2989
2990	ip6_dst_store(sk, dst,
2991		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2992		      &sk->sk_v6_daddr : NULL,
2993#ifdef CONFIG_IPV6_SUBTREES
2994		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2995		      &np->saddr :
2996#endif
2997		      NULL);
2998}
2999
3000static bool ip6_redirect_nh_match(const struct fib6_result *res,
3001				  struct flowi6 *fl6,
3002				  const struct in6_addr *gw,
3003				  struct rt6_info **ret)
3004{
3005	const struct fib6_nh *nh = res->nh;
3006
3007	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
3008	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
3009		return false;
3010
3011	/* rt_cache's gateway might be different from its 'parent'
3012	 * in the case of an ip redirect.
3013	 * So we keep searching in the exception table if the gateway
3014	 * is different.
3015	 */
3016	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
3017		struct rt6_info *rt_cache;
3018
3019		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
3020		if (rt_cache &&
3021		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
3022			*ret = rt_cache;
3023			return true;
3024		}
3025		return false;
3026	}
3027	return true;
3028}
3029
3030struct fib6_nh_rd_arg {
3031	struct fib6_result	*res;
3032	struct flowi6		*fl6;
3033	const struct in6_addr	*gw;
3034	struct rt6_info		**ret;
3035};
3036
3037static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
3038{
3039	struct fib6_nh_rd_arg *arg = _arg;
3040
3041	arg->res->nh = nh;
3042	return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
3043}
3044
3045/* Handle redirects */
3046struct ip6rd_flowi {
3047	struct flowi6 fl6;
3048	struct in6_addr gateway;
3049};
3050
3051INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
3052					     struct fib6_table *table,
3053					     struct flowi6 *fl6,
3054					     const struct sk_buff *skb,
3055					     int flags)
3056{
3057	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
3058	struct rt6_info *ret = NULL;
3059	struct fib6_result res = {};
3060	struct fib6_nh_rd_arg arg = {
3061		.res = &res,
3062		.fl6 = fl6,
3063		.gw  = &rdfl->gateway,
3064		.ret = &ret
3065	};
3066	struct fib6_info *rt;
3067	struct fib6_node *fn;
3068
 
 
 
 
 
 
3069	/* Get the "current" route for this destination and
3070	 * check if the redirect has come from appropriate router.
3071	 *
3072	 * RFC 4861 specifies that redirects should only be
3073	 * accepted if they come from the nexthop to the target.
3074	 * Due to the way the routes are chosen, this notion
3075	 * is a bit fuzzy and one might need to check all possible
3076	 * routes.
3077	 */
3078
3079	rcu_read_lock();
3080	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
3081restart:
3082	for_each_fib6_node_rt_rcu(fn) {
3083		res.f6i = rt;
3084		if (fib6_check_expired(rt))
3085			continue;
3086		if (rt->fib6_flags & RTF_REJECT)
3087			break;
3088		if (unlikely(rt->nh)) {
3089			if (nexthop_is_blackhole(rt->nh))
3090				continue;
3091			/* on match, res->nh is filled in and potentially ret */
3092			if (nexthop_for_each_fib6_nh(rt->nh,
3093						     fib6_nh_redirect_match,
3094						     &arg))
3095				goto out;
3096		} else {
3097			res.nh = rt->fib6_nh;
3098			if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
3099						  &ret))
3100				goto out;
3101		}
3102	}
3103
3104	if (!rt)
3105		rt = net->ipv6.fib6_null_entry;
3106	else if (rt->fib6_flags & RTF_REJECT) {
3107		ret = net->ipv6.ip6_null_entry;
3108		goto out;
3109	}
3110
3111	if (rt == net->ipv6.fib6_null_entry) {
3112		fn = fib6_backtrack(fn, &fl6->saddr);
3113		if (fn)
3114			goto restart;
3115	}
3116
3117	res.f6i = rt;
3118	res.nh = rt->fib6_nh;
3119out:
3120	if (ret) {
3121		ip6_hold_safe(net, &ret);
3122	} else {
3123		res.fib6_flags = res.f6i->fib6_flags;
3124		res.fib6_type = res.f6i->fib6_type;
3125		ret = ip6_create_rt_rcu(&res);
3126	}
3127
3128	rcu_read_unlock();
3129
3130	trace_fib6_table_lookup(net, &res, table, fl6);
3131	return ret;
3132};
3133
3134static struct dst_entry *ip6_route_redirect(struct net *net,
3135					    const struct flowi6 *fl6,
3136					    const struct sk_buff *skb,
3137					    const struct in6_addr *gateway)
3138{
3139	int flags = RT6_LOOKUP_F_HAS_SADDR;
3140	struct ip6rd_flowi rdfl;
3141
3142	rdfl.fl6 = *fl6;
3143	rdfl.gateway = *gateway;
3144
3145	return fib6_rule_lookup(net, &rdfl.fl6, skb,
3146				flags, __ip6_route_redirect);
3147}
3148
3149void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
3150		  kuid_t uid)
3151{
3152	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
3153	struct dst_entry *dst;
3154	struct flowi6 fl6 = {
3155		.flowi6_iif = LOOPBACK_IFINDEX,
3156		.flowi6_oif = oif,
3157		.flowi6_mark = mark,
3158		.daddr = iph->daddr,
3159		.saddr = iph->saddr,
3160		.flowlabel = ip6_flowinfo(iph),
3161		.flowi6_uid = uid,
3162	};
3163
3164	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
3165	rt6_do_redirect(dst, NULL, skb);
3166	dst_release(dst);
3167}
3168EXPORT_SYMBOL_GPL(ip6_redirect);
3169
3170void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
3171{
3172	const struct ipv6hdr *iph = ipv6_hdr(skb);
3173	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
3174	struct dst_entry *dst;
3175	struct flowi6 fl6 = {
3176		.flowi6_iif = LOOPBACK_IFINDEX,
3177		.flowi6_oif = oif,
3178		.daddr = msg->dest,
3179		.saddr = iph->daddr,
3180		.flowi6_uid = sock_net_uid(net, NULL),
3181	};
3182
3183	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
3184	rt6_do_redirect(dst, NULL, skb);
3185	dst_release(dst);
3186}
3187
3188void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
3189{
3190	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
3191		     READ_ONCE(sk->sk_mark), sk->sk_uid);
3192}
3193EXPORT_SYMBOL_GPL(ip6_sk_redirect);
3194
3195static unsigned int ip6_default_advmss(const struct dst_entry *dst)
3196{
3197	struct net_device *dev = dst->dev;
3198	unsigned int mtu = dst_mtu(dst);
3199	struct net *net;
3200
3201	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
3202
3203	rcu_read_lock();
3204
3205	net = dev_net_rcu(dev);
3206	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
3207		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
3208
3209	rcu_read_unlock();
3210
3211	/*
3212	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
3213	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
3214	 * IPV6_MAXPLEN is also valid and means: "any MSS,
3215	 * rely only on pmtu discovery"
3216	 */
3217	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
3218		mtu = IPV6_MAXPLEN;
3219	return mtu;
3220}
3221
3222INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
3223{
3224	return ip6_dst_mtu_maybe_forward(dst, false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3225}
3226EXPORT_INDIRECT_CALLABLE(ip6_mtu);
3227
3228/* MTU selection:
3229 * 1. mtu on route is locked - use it
3230 * 2. mtu from nexthop exception
3231 * 3. mtu from egress device
3232 *
3233 * based on ip6_dst_mtu_forward and exception logic of
3234 * rt6_find_cached_rt; called with rcu_read_lock
3235 */
3236u32 ip6_mtu_from_fib6(const struct fib6_result *res,
3237		      const struct in6_addr *daddr,
3238		      const struct in6_addr *saddr)
3239{
3240	const struct fib6_nh *nh = res->nh;
3241	struct fib6_info *f6i = res->f6i;
3242	struct inet6_dev *idev;
3243	struct rt6_info *rt;
3244	u32 mtu = 0;
3245
3246	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
3247		mtu = f6i->fib6_pmtu;
3248		if (mtu)
3249			goto out;
3250	}
3251
3252	rt = rt6_find_cached_rt(res, daddr, saddr);
3253	if (unlikely(rt)) {
3254		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
3255	} else {
3256		struct net_device *dev = nh->fib_nh_dev;
3257
3258		mtu = IPV6_MIN_MTU;
3259		idev = __in6_dev_get(dev);
3260		if (idev)
3261			mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6));
3262	}
3263
3264	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
3265out:
3266	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
3267}
3268
3269struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
3270				  struct flowi6 *fl6)
3271{
3272	struct dst_entry *dst;
3273	struct rt6_info *rt;
3274	struct inet6_dev *idev = in6_dev_get(dev);
3275	struct net *net = dev_net(dev);
3276
3277	if (unlikely(!idev))
3278		return ERR_PTR(-ENODEV);
3279
3280	rt = ip6_dst_alloc(net, dev, 0);
3281	if (unlikely(!rt)) {
3282		in6_dev_put(idev);
3283		dst = ERR_PTR(-ENOMEM);
3284		goto out;
3285	}
3286
 
3287	rt->dst.input = ip6_input;
3288	rt->dst.output  = ip6_output;
3289	rt->rt6i_gateway  = fl6->daddr;
3290	rt->rt6i_dst.addr = fl6->daddr;
3291	rt->rt6i_dst.plen = 128;
3292	rt->rt6i_idev     = idev;
3293	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
3294
3295	/* Add this dst into uncached_list so that rt6_disable_ip() can
3296	 * do proper release of the net_device
3297	 */
3298	rt6_uncached_list_add(rt);
 
3299
3300	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
3301
3302out:
3303	return dst;
3304}
3305
3306static void ip6_dst_gc(struct dst_ops *ops)
3307{
3308	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
3309	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
 
3310	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
3311	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
3312	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
3313	unsigned int val;
3314	int entries;
3315
3316	if (time_after(rt_last_gc + rt_min_interval, jiffies))
 
 
3317		goto out;
3318
3319	fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
 
3320	entries = dst_entries_get_slow(ops);
3321	if (entries < ops->gc_thresh)
3322		atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
3323out:
3324	val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
3325	atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
3326}
3327
3328static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
3329			       const struct in6_addr *gw_addr, u32 tbid,
3330			       int flags, struct fib6_result *res)
3331{
3332	struct flowi6 fl6 = {
3333		.flowi6_oif = cfg->fc_ifindex,
3334		.daddr = *gw_addr,
3335		.saddr = cfg->fc_prefsrc,
3336	};
3337	struct fib6_table *table;
3338	int err;
3339
3340	table = fib6_get_table(net, tbid);
3341	if (!table)
3342		return -EINVAL;
3343
3344	if (!ipv6_addr_any(&cfg->fc_prefsrc))
3345		flags |= RT6_LOOKUP_F_HAS_SADDR;
3346
3347	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
3348
3349	err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
3350	if (!err && res->f6i != net->ipv6.fib6_null_entry)
3351		fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
3352				 cfg->fc_ifindex != 0, NULL, flags);
3353
3354	return err;
3355}
3356
3357static int ip6_route_check_nh_onlink(struct net *net,
3358				     struct fib6_config *cfg,
3359				     const struct net_device *dev,
3360				     struct netlink_ext_ack *extack)
3361{
3362	u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
3363	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3364	struct fib6_result res = {};
3365	int err;
3366
3367	err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
3368	if (!err && !(res.fib6_flags & RTF_REJECT) &&
3369	    /* ignore match if it is the default route */
3370	    !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
3371	    (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
3372		NL_SET_ERR_MSG(extack,
3373			       "Nexthop has invalid gateway or device mismatch");
3374		err = -EINVAL;
3375	}
3376
3377	return err;
3378}
3379
/* Resolve the gateway of a non-onlink nexthop to an egress device.
 *
 * The gateway is first looked up in cfg->fc_table (when set); if that
 * does not yield a usable route, a full fib6_lookup() is tried. A usable
 * route is one that is not a reject route and does not itself go through
 * a gateway.
 *
 * If the caller already supplied a device in *@_dev it must match the
 * device of the resolved nexthop. Otherwise *@_dev and *@idev are filled
 * in from the nexthop, with references taken via netdev_hold() and
 * in6_dev_get().
 *
 * Returns 0 on success or -EHOSTUNREACH.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      netdevice_tracker *dev_tracker,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	int flags = RT6_LOOKUP_F_IFACE;
	struct fib6_result res = {};
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		err = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags, &res);
		/* gw_addr can not require a gateway or resolve to a reject
		 * route. If a device is given, it must match the result.
		 */
		if (err || res.fib6_flags & RTF_REJECT ||
		    res.nh->fib_nh_gw_family ||
		    (dev && dev != res.nh->fib_nh_dev))
			err = -EHOSTUNREACH;
	}

	/* No table given, or the table lookup was not usable: fall back to
	 * a regular lookup.
	 */
	if (err < 0) {
		struct flowi6 fl6 = {
			.flowi6_oif = cfg->fc_ifindex,
			.daddr = *gw_addr,
		};

		err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
		if (err || res.fib6_flags & RTF_REJECT ||
		    res.nh->fib_nh_gw_family)
			err = -EHOSTUNREACH;

		if (err)
			return err;

		fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
				 cfg->fc_ifindex != 0, NULL, flags);
	}

	err = 0;
	if (dev) {
		/* caller-supplied device must agree with the lookup result */
		if (dev != res.nh->fib_nh_dev)
			err = -EHOSTUNREACH;
	} else {
		/* hand the resolved device (and its idev) back to the caller */
		*_dev = dev = res.nh->fib_nh_dev;
		netdev_hold(dev, dev_tracker, GFP_ATOMIC);
		*idev = in6_dev_get(dev);
	}

	return err;
}
3434
/* Validate cfg->fc_gateway as the gateway of a new nexthop.
 *
 * The gateway must not be an address local to the egress device. A
 * gateway that is not a link-local unicast address must at least be
 * unicast or IPv4-mapped, and is additionally checked against the FIB
 * (ip6_route_check_nh_onlink() for RTNH_F_ONLINK, ip6_route_check_nh()
 * otherwise). The latter may resolve and store the egress device in
 * *@_dev / *@idev when the caller did not supply one.
 *
 * The final egress device must exist and must not be a loopback device.
 *
 * Returns 0 on success, -EINVAL or the error of the FIB check on
 * failure; an extack message is set for the -EINVAL cases raised here.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev,
			   netdevice_tracker *dev_tracker,
			   struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	/* restrict the local-address check to the device only for
	 * link-local gateways
	 */
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	/* without a device the local-address check is deferred until the
	 * device has been resolved below
	 */
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using non-link-local
		 * addresses as nexthop address.
		 * Otherwise, the router will not be able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		rcu_read_lock();

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
						 idev);

		rcu_read_unlock();

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}
3514
3515static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
3516{
3517	if ((flags & RTF_REJECT) ||
3518	    (dev && (dev->flags & IFF_LOOPBACK) &&
3519	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3520	     !(flags & (RTF_ANYCAST | RTF_LOCAL))))
3521		return true;
3522
3523	return false;
3524}
3525
3526int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
3527		 struct fib6_config *cfg, gfp_t gfp_flags,
3528		 struct netlink_ext_ack *extack)
3529{
3530	netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker;
3531	struct net_device *dev = NULL;
3532	struct inet6_dev *idev = NULL;
3533	int addr_type;
3534	int err;
3535
3536	fib6_nh->fib_nh_family = AF_INET6;
3537#ifdef CONFIG_IPV6_ROUTER_PREF
3538	fib6_nh->last_probe = jiffies;
3539#endif
3540	if (cfg->fc_is_fdb) {
3541		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3542		fib6_nh->fib_nh_gw_family = AF_INET6;
3543		return 0;
3544	}
3545
3546	err = -ENODEV;
3547	if (cfg->fc_ifindex) {
3548		dev = netdev_get_by_index(net, cfg->fc_ifindex,
3549					  dev_tracker, gfp_flags);
3550		if (!dev)
3551			goto out;
3552		idev = in6_dev_get(dev);
3553		if (!idev)
3554			goto out;
3555	}
3556
3557	if (cfg->fc_flags & RTNH_F_ONLINK) {
3558		if (!dev) {
3559			NL_SET_ERR_MSG(extack,
3560				       "Nexthop device required for onlink");
3561			goto out;
3562		}
3563
3564		if (!(dev->flags & IFF_UP)) {
3565			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3566			err = -ENETDOWN;
3567			goto out;
3568		}
3569
3570		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3571	}
3572
3573	fib6_nh->fib_nh_weight = 1;
3574
3575	/* We cannot add true routes via loopback here,
3576	 * they would result in kernel looping; promote them to reject routes
3577	 */
3578	addr_type = ipv6_addr_type(&cfg->fc_dst);
3579	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3580		/* hold loopback dev/idev if we haven't done so. */
3581		if (dev != net->loopback_dev) {
3582			if (dev) {
3583				netdev_put(dev, dev_tracker);
3584				in6_dev_put(idev);
3585			}
3586			dev = net->loopback_dev;
3587			netdev_hold(dev, dev_tracker, gfp_flags);
3588			idev = in6_dev_get(dev);
3589			if (!idev) {
3590				err = -ENODEV;
3591				goto out;
3592			}
3593		}
3594		goto pcpu_alloc;
3595	}
3596
3597	if (cfg->fc_flags & RTF_GATEWAY) {
3598		err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
3599				      &idev, extack);
3600		if (err)
3601			goto out;
3602
3603		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3604		fib6_nh->fib_nh_gw_family = AF_INET6;
3605	}
3606
3607	err = -ENODEV;
3608	if (!dev)
3609		goto out;
3610
3611	if (!idev || idev->cnf.disable_ipv6) {
3612		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3613		err = -EACCES;
3614		goto out;
3615	}
3616
3617	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3618		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3619		err = -ENETDOWN;
3620		goto out;
3621	}
3622
3623	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3624	    !netif_carrier_ok(dev))
3625		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3626
3627	err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
3628				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3629	if (err)
3630		goto out;
3631
3632pcpu_alloc:
3633	fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
3634	if (!fib6_nh->rt6i_pcpu) {
3635		err = -ENOMEM;
3636		goto out;
3637	}
3638
3639	fib6_nh->fib_nh_dev = dev;
3640	fib6_nh->fib_nh_oif = dev->ifindex;
3641	err = 0;
3642out:
3643	if (idev)
3644		in6_dev_put(idev);
3645
3646	if (err) {
3647		lwtstate_put(fib6_nh->fib_nh_lws);
3648		fib6_nh->fib_nh_lws = NULL;
3649		netdev_put(dev, dev_tracker);
 
3650	}
3651
3652	return err;
3653}
3654
3655void fib6_nh_release(struct fib6_nh *fib6_nh)
3656{
3657	struct rt6_exception_bucket *bucket;
3658
3659	rcu_read_lock();
3660
3661	fib6_nh_flush_exceptions(fib6_nh, NULL);
3662	bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
3663	if (bucket) {
3664		rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
3665		kfree(bucket);
3666	}
3667
3668	rcu_read_unlock();
3669
3670	fib6_nh_release_dsts(fib6_nh);
3671	free_percpu(fib6_nh->rt6i_pcpu);
3672
3673	fib_nh_common_release(&fib6_nh->nh_common);
3674}
3675
3676void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
3677{
3678	int cpu;
3679
3680	if (!fib6_nh->rt6i_pcpu)
3681		return;
3682
3683	for_each_possible_cpu(cpu) {
3684		struct rt6_info *pcpu_rt, **ppcpu_rt;
3685
3686		ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3687		pcpu_rt = xchg(ppcpu_rt, NULL);
3688		if (pcpu_rt) {
3689			dst_dev_put(&pcpu_rt->dst);
3690			dst_release(&pcpu_rt->dst);
3691		}
 
 
3692	}
 
 
3693}
3694
3695static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3696					      gfp_t gfp_flags,
3697					      struct netlink_ext_ack *extack)
3698{
3699	struct net *net = cfg->fc_nlinfo.nl_net;
3700	struct fib6_info *rt = NULL;
3701	struct nexthop *nh = NULL;
3702	struct fib6_table *table;
3703	struct fib6_nh *fib6_nh;
3704	int err = -EINVAL;
3705	int addr_type;
3706
3707	/* RTF_PCPU is an internal flag; can not be set by userspace */
3708	if (cfg->fc_flags & RTF_PCPU) {
3709		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3710		goto out;
3711	}
3712
3713	/* RTF_CACHE is an internal flag; can not be set by userspace */
3714	if (cfg->fc_flags & RTF_CACHE) {
3715		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3716		goto out;
3717	}
3718
3719	if (cfg->fc_type > RTN_MAX) {
3720		NL_SET_ERR_MSG(extack, "Invalid route type");
3721		goto out;
3722	}
3723
3724	if (cfg->fc_dst_len > 128) {
3725		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3726		goto out;
3727	}
3728	if (cfg->fc_src_len > 128) {
3729		NL_SET_ERR_MSG(extack, "Invalid source address length");
3730		goto out;
3731	}
3732#ifndef CONFIG_IPV6_SUBTREES
3733	if (cfg->fc_src_len) {
3734		NL_SET_ERR_MSG(extack,
3735			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3736		goto out;
3737	}
3738#endif
3739	if (cfg->fc_nh_id) {
3740		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
3741		if (!nh) {
3742			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
3743			goto out;
3744		}
3745		err = fib6_check_nexthop(nh, cfg, extack);
3746		if (err)
3747			goto out;
3748	}
3749
3750	err = -ENOBUFS;
3751	if (cfg->fc_nlinfo.nlh &&
3752	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3753		table = fib6_get_table(net, cfg->fc_table);
3754		if (!table) {
3755			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3756			table = fib6_new_table(net, cfg->fc_table);
3757		}
3758	} else {
3759		table = fib6_new_table(net, cfg->fc_table);
3760	}
3761
3762	if (!table)
3763		goto out;
3764
3765	err = -ENOMEM;
3766	rt = fib6_info_alloc(gfp_flags, !nh);
3767	if (!rt)
3768		goto out;
3769
3770	rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len,
3771					       extack);
3772	if (IS_ERR(rt->fib6_metrics)) {
3773		err = PTR_ERR(rt->fib6_metrics);
3774		/* Do not leave garbage there. */
3775		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3776		goto out_free;
3777	}
3778
3779	if (cfg->fc_flags & RTF_ADDRCONF)
3780		rt->dst_nocount = true;
3781
3782	if (cfg->fc_flags & RTF_EXPIRES)
3783		fib6_set_expires(rt, jiffies +
3784				clock_t_to_jiffies(cfg->fc_expires));
 
 
3785
3786	if (cfg->fc_protocol == RTPROT_UNSPEC)
3787		cfg->fc_protocol = RTPROT_BOOT;
3788	rt->fib6_protocol = cfg->fc_protocol;
3789
3790	rt->fib6_table = table;
3791	rt->fib6_metric = cfg->fc_metric;
3792	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3793	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3794
3795	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3796	rt->fib6_dst.plen = cfg->fc_dst_len;
 
 
3797
3798#ifdef CONFIG_IPV6_SUBTREES
3799	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3800	rt->fib6_src.plen = cfg->fc_src_len;
3801#endif
3802	if (nh) {
3803		if (rt->fib6_src.plen) {
3804			NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
3805			goto out_free;
3806		}
3807		if (!nexthop_get(nh)) {
3808			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
3809			goto out_free;
 
 
 
 
3810		}
3811		rt->nh = nh;
3812		fib6_nh = nexthop_fib6_nh(rt->nh);
3813	} else {
3814		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
3815		if (err)
3816			goto out;
3817
3818		fib6_nh = rt->fib6_nh;
3819
3820		/* We cannot add true routes via loopback here, they would
3821		 * result in kernel looping; promote them to reject routes
3822		 */
3823		addr_type = ipv6_addr_type(&cfg->fc_dst);
3824		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
3825				   addr_type))
3826			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3827	}
3828
3829	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3830		struct net_device *dev = fib6_nh->fib_nh_dev;
3831
3832		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3833			NL_SET_ERR_MSG(extack, "Invalid source address");
3834			err = -EINVAL;
3835			goto out;
3836		}
3837		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3838		rt->fib6_prefsrc.plen = 128;
3839	} else
3840		rt->fib6_prefsrc.plen = 0;
3841
3842	return rt;
3843out:
3844	fib6_info_release(rt);
3845	return ERR_PTR(err);
3846out_free:
3847	ip_fib_metrics_put(rt->fib6_metrics);
3848	kfree(rt);
3849	return ERR_PTR(err);
3850}
3851
3852int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3853		  struct netlink_ext_ack *extack)
3854{
3855	struct fib6_info *rt;
3856	int err;
3857
3858	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3859	if (IS_ERR(rt))
3860		return PTR_ERR(rt);
3861
3862	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3863	fib6_info_release(rt);
3864
3865	return err;
3866}
3867
3868static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3869{
3870	struct net *net = info->nl_net;
3871	struct fib6_table *table;
3872	int err;
3873
3874	if (rt == net->ipv6.fib6_null_entry) {
3875		err = -ENOENT;
3876		goto out;
3877	}
3878
3879	table = rt->fib6_table;
3880	spin_lock_bh(&table->tb6_lock);
3881	err = fib6_del(rt, info);
3882	spin_unlock_bh(&table->tb6_lock);
3883
3884out:
3885	fib6_info_release(rt);
3886	return err;
3887}
3888
3889int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
3890{
3891	struct nl_info info = {
3892		.nl_net = net,
3893		.skip_notify = skip_notify
3894	};
3895
3896	return __ip6_del_rt(rt, &info);
3897}
3898
/* Delete @rt and, when cfg->fc_delete_all_nh is set, all of its
 * multipath siblings.
 *
 * When siblings are deleted, a single RTM_DELROUTE message describing
 * the whole route is built up front and sent after the table lock has
 * been dropped; per-route notifications are then suppressed. If that
 * message cannot be built, the per-route notifications stay enabled.
 *
 * The caller's reference on @rt is always released. Returns -ENOENT for
 * the null entry, otherwise the result of the fib6_del() calls.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;
		struct fib6_node *fn;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* 'rt' points to the first sibling route. If it is not the
		 * leaf, then we do not need to send a notification. Otherwise,
		 * we need to check if the last sibling has a next route or not
		 * and emit a replace or delete notification, respectively.
		 */
		info->skip_notify_kernel = 1;
		fn = rcu_dereference_protected(rt->fib6_node,
					    lockdep_is_held(&table->tb6_lock));
		if (rcu_access_pointer(fn->leaf) == rt) {
			struct fib6_info *last_sibling, *replace_rt;

			last_sibling = list_last_entry(&rt->fib6_siblings,
						       struct fib6_info,
						       fib6_siblings);
			replace_rt = rcu_dereference_protected(
					    last_sibling->fib6_next,
					    lockdep_is_held(&table->tb6_lock));
			if (replace_rt)
				call_fib6_entry_notifiers_replace(net,
								  replace_rt);
			else
				call_fib6_multipath_entry_notifiers(net,
						       FIB_EVENT_ENTRY_DEL,
						       rt, rt->fib6_nsiblings,
						       NULL);
		}
		/* remove the siblings first; 'rt' itself is removed below */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	/* send the combined notification, if one was prepared */
	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}
3977
3978static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3979{
3980	int rc = -ESRCH;
3981
3982	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3983		goto out;
3984
3985	if (cfg->fc_flags & RTF_GATEWAY &&
3986	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3987		goto out;
3988
3989	rc = rt6_remove_exception_rt(rt);
3990out:
3991	return rc;
3992}
3993
3994static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
3995			     struct fib6_nh *nh)
3996{
3997	struct fib6_result res = {
3998		.f6i = rt,
3999		.nh = nh,
4000	};
4001	struct rt6_info *rt_cache;
4002
4003	rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
4004	if (rt_cache)
4005		return __ip6_del_cached_rt(rt_cache, cfg);
4006
4007	return 0;
4008}
4009
/* Argument bundle handed through nexthop_for_each_fib6_nh() to
 * fib6_nh_del_cached_rt().
 */
struct fib6_nh_del_cached_rt_arg {
	struct fib6_config *cfg;	/* delete request being processed */
	struct fib6_info *f6i;		/* route whose nexthops are walked */
};
4014
4015static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
4016{
4017	struct fib6_nh_del_cached_rt_arg *arg = _arg;
4018	int rc;
4019
4020	rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
4021	return rc != -ESRCH ? rc : 0;
4022}
4023
4024static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
4025{
4026	struct fib6_nh_del_cached_rt_arg arg = {
4027		.cfg = cfg,
4028		.f6i = f6i
4029	};
4030
4031	return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
4032}
4033
/* Delete the route (or cached route) described by @cfg.
 *
 * The FIB node for the destination/source prefix is located and its
 * routes are walked under RCU. With RTF_CACHE set, only cached
 * (exception) routes are candidates; otherwise the first route matching
 * the metric, protocol, nexthop id, interface and gateway selectors in
 * @cfg is deleted.
 *
 * Returns -ESRCH if the table does not exist or nothing matched,
 * otherwise the result of the delete operation.
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			/* a nexthop id, when given, must match the route's */
			if (rt->nh && cfg->fc_nh_id &&
			    rt->nh->id != cfg->fc_nh_id)
				continue;

			if (cfg->fc_flags & RTF_CACHE) {
				int rc = 0;

				if (rt->nh) {
					rc = ip6_del_cached_rt_nh(cfg, rt);
				} else if (cfg->fc_nh_id) {
					continue;
				} else {
					nh = rt->fib6_nh;
					rc = ip6_del_cached_rt(cfg, rt, nh);
				}
				/* -ESRCH: selectors did not match, keep looking */
				if (rc != -ESRCH) {
					rcu_read_unlock();
					return rc;
				}
				continue;
			}

			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol &&
			    cfg->fc_protocol != rt->fib6_protocol)
				continue;

			if (rt->nh) {
				/* take a reference before leaving the RCU section */
				if (!fib6_info_hold_safe(rt))
					continue;
				rcu_read_unlock();

				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
			}
			if (cfg->fc_nh_id)
				continue;

			nh = rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			/* take a reference before leaving the RCU section */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}
4120
/* Process a received ICMPv6 Redirect for @dst.
 *
 * The message in @skb is validated (length, non-multicast destination,
 * target either equal to the destination or a link-local unicast
 * address, redirects accepted and forwarding disabled on the receiving
 * interface, well-formed ND options). If it is accepted, the neighbour
 * entry for the target is updated and a cached exception route towards
 * the redirected destination is inserted, followed by a
 * NETEVENT_REDIRECT notification.
 *
 * Invalid or unacceptable redirects are dropped silently (some with a
 * rate-limited debug message).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct fib6_result res = {};
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	/* bytes of ND options following the fixed redirect header */
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* target == destination means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast address
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* ignore redirects when forwarding or when they are not accepted */
	if (READ_ONCE(in6_dev->cnf.forwarding) ||
	    !READ_ONCE(in6_dev->cnf.accept_redirects))
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = dst_rt6_info(dst);
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	/* the FIB entry this dst was created from, if it still exists */
	res.f6i = rcu_dereference(rt->from);
	if (!res.f6i)
		goto out;

	if (res.f6i->nh) {
		struct fib6_nh_match_arg arg = {
			.dev = dst->dev,
			.gw = &rt->rt6i_gateway,
		};

		nexthop_for_each_fib6_nh(res.f6i->nh,
					 fib6_nh_find_match, &arg);

		/* fib6_info uses a nexthop that does not have fib6_nh
		 * using the dst->dev. Should be impossible
		 */
		if (!arg.match)
			goto out;
		res.nh = arg.match;
	} else {
		res.nh = res.f6i->fib6_nh;
	}

	res.fib6_flags = res.f6i->fib6_flags;
	res.fib6_type = res.f6i->fib6_type;
	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	/* the redirect target becomes the gateway of the new route */
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* rt6_insert_exception() will take care of duplicated exceptions */
	if (rt6_insert_exception(nrt, &res)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	rcu_read_unlock();
	neigh_release(neigh);
}
4263
4264#ifdef CONFIG_IPV6_ROUTE_INFO
4265static struct fib6_info *rt6_get_route_info(struct net *net,
4266					   const struct in6_addr *prefix, int prefixlen,
4267					   const struct in6_addr *gwaddr,
4268					   struct net_device *dev)
4269{
4270	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4271	int ifindex = dev->ifindex;
4272	struct fib6_node *fn;
4273	struct fib6_info *rt = NULL;
4274	struct fib6_table *table;
4275
4276	table = fib6_get_table(net, tb_id);
4277	if (!table)
4278		return NULL;
4279
4280	rcu_read_lock();
4281	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
4282	if (!fn)
4283		goto out;
4284
4285	for_each_fib6_node_rt_rcu(fn) {
4286		/* these routes do not use nexthops */
4287		if (rt->nh)
4288			continue;
4289		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
4290			continue;
4291		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
4292		    !rt->fib6_nh->fib_nh_gw_family)
4293			continue;
4294		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
4295			continue;
4296		if (!fib6_info_hold_safe(rt))
4297			continue;
4298		break;
4299	}
4300out:
4301	rcu_read_unlock();
4302	return rt;
4303}
4304
4305static struct fib6_info *rt6_add_route_info(struct net *net,
4306					   const struct in6_addr *prefix, int prefixlen,
4307					   const struct in6_addr *gwaddr,
4308					   struct net_device *dev,
4309					   unsigned int pref)
4310{
4311	struct fib6_config cfg = {
4312		.fc_metric	= IP6_RT_PRIO_USER,
4313		.fc_ifindex	= dev->ifindex,
4314		.fc_dst_len	= prefixlen,
4315		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
4316				  RTF_UP | RTF_PREF(pref),
4317		.fc_protocol = RTPROT_RA,
4318		.fc_type = RTN_UNICAST,
4319		.fc_nlinfo.portid = 0,
4320		.fc_nlinfo.nlh = NULL,
4321		.fc_nlinfo.nl_net = net,
4322	};
4323
4324	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4325	cfg.fc_dst = *prefix;
4326	cfg.fc_gateway = *gwaddr;
4327
4328	/* We should treat it as a default route if prefix length is 0. */
4329	if (!prefixlen)
4330		cfg.fc_flags |= RTF_DEFAULT;
4331
4332	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
4333
4334	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
4335}
4336#endif
4337
4338struct fib6_info *rt6_get_dflt_router(struct net *net,
4339				     const struct in6_addr *addr,
4340				     struct net_device *dev)
4341{
4342	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
4343	struct fib6_info *rt;
4344	struct fib6_table *table;
4345
4346	table = fib6_get_table(net, tb_id);
4347	if (!table)
4348		return NULL;
4349
4350	rcu_read_lock();
4351	for_each_fib6_node_rt_rcu(&table->tb6_root) {
4352		struct fib6_nh *nh;
4353
4354		/* RA routes do not use nexthops */
4355		if (rt->nh)
4356			continue;
4357
4358		nh = rt->fib6_nh;
4359		if (dev == nh->fib_nh_dev &&
4360		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
4361		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
4362			break;
4363	}
4364	if (rt && !fib6_info_hold_safe(rt))
4365		rt = NULL;
4366	rcu_read_unlock();
4367	return rt;
4368}
4369
4370struct fib6_info *rt6_add_dflt_router(struct net *net,
4371				     const struct in6_addr *gwaddr,
4372				     struct net_device *dev,
4373				     unsigned int pref,
4374				     u32 defrtr_usr_metric,
4375				     int lifetime)
4376{
4377	struct fib6_config cfg = {
4378		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
4379		.fc_metric	= defrtr_usr_metric,
4380		.fc_ifindex	= dev->ifindex,
4381		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
4382				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
4383		.fc_protocol = RTPROT_RA,
4384		.fc_type = RTN_UNICAST,
4385		.fc_nlinfo.portid = 0,
4386		.fc_nlinfo.nlh = NULL,
4387		.fc_nlinfo.nl_net = net,
4388		.fc_expires = jiffies_to_clock_t(lifetime * HZ),
4389	};
4390
4391	cfg.fc_gateway = *gwaddr;
4392
4393	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
4394		struct fib6_table *table;
4395
4396		table = fib6_get_table(dev_net(dev), cfg.fc_table);
4397		if (table)
4398			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
4399	}
4400
4401	return rt6_get_dflt_router(net, gwaddr, dev);
4402}
4403
4404static void __rt6_purge_dflt_routers(struct net *net,
4405				     struct fib6_table *table)
4406{
4407	struct fib6_info *rt;
4408
4409restart:
4410	rcu_read_lock();
4411	for_each_fib6_node_rt_rcu(&table->tb6_root) {
4412		struct net_device *dev = fib6_info_nh_dev(rt);
4413		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
4414
4415		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
4416		    (!idev || idev->cnf.accept_ra != 2) &&
4417		    fib6_info_hold_safe(rt)) {
4418			rcu_read_unlock();
4419			ip6_del_rt(net, rt, false);
4420			goto restart;
4421		}
4422	}
4423	rcu_read_unlock();
4424
4425	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
4426}
4427
4428void rt6_purge_dflt_routers(struct net *net)
4429{
4430	struct fib6_table *table;
4431	struct hlist_head *head;
4432	unsigned int h;
4433
4434	rcu_read_lock();
4435
4436	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
4437		head = &net->ipv6.fib_table_hash[h];
4438		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
4439			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
4440				__rt6_purge_dflt_routers(net, table);
4441		}
4442	}
4443
4444	rcu_read_unlock();
4445}
4446
4447static void rtmsg_to_fib6_config(struct net *net,
4448				 struct in6_rtmsg *rtmsg,
4449				 struct fib6_config *cfg)
4450{
4451	*cfg = (struct fib6_config){
4452		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
4453			 : RT6_TABLE_MAIN,
4454		.fc_ifindex = rtmsg->rtmsg_ifindex,
4455		.fc_metric = rtmsg->rtmsg_metric,
4456		.fc_expires = rtmsg->rtmsg_info,
4457		.fc_dst_len = rtmsg->rtmsg_dst_len,
4458		.fc_src_len = rtmsg->rtmsg_src_len,
4459		.fc_flags = rtmsg->rtmsg_flags,
4460		.fc_type = rtmsg->rtmsg_type,
4461
4462		.fc_nlinfo.nl_net = net,
4463
4464		.fc_dst = rtmsg->rtmsg_dst,
4465		.fc_src = rtmsg->rtmsg_src,
4466		.fc_gateway = rtmsg->rtmsg_gateway,
4467	};
4468}
4469
4470int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
4471{
4472	struct fib6_config cfg;
 
4473	int err;
4474
4475	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
4476		return -EINVAL;
4477	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
4478		return -EPERM;
 
 
 
 
 
4479
4480	rtmsg_to_fib6_config(net, rtmsg, &cfg);
4481
4482	rtnl_lock();
4483	switch (cmd) {
4484	case SIOCADDRT:
4485		/* Only do the default setting of fc_metric in route adding */
4486		if (cfg.fc_metric == 0)
4487			cfg.fc_metric = IP6_RT_PRIO_USER;
4488		err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
4489		break;
4490	case SIOCDELRT:
4491		err = ip6_route_del(&cfg, NULL);
4492		break;
 
 
 
4493	}
4494	rtnl_unlock();
4495	return err;
4496}
4497
4498/*
4499 *	Drop the packet on the floor
4500 */
4501
4502static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
4503{
4504	struct dst_entry *dst = skb_dst(skb);
4505	struct net *net = dev_net(dst->dev);
4506	struct inet6_dev *idev;
4507	SKB_DR(reason);
4508	int type;
4509
4510	if (netif_is_l3_master(skb->dev) ||
4511	    dst->dev == net->loopback_dev)
4512		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
4513	else
4514		idev = ip6_dst_idev(dst);
4515
4516	switch (ipstats_mib_noroutes) {
4517	case IPSTATS_MIB_INNOROUTES:
4518		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
4519		if (type == IPV6_ADDR_ANY) {
4520			SKB_DR_SET(reason, IP_INADDRERRORS);
4521			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
4522			break;
4523		}
4524		SKB_DR_SET(reason, IP_INNOROUTES);
4525		fallthrough;
4526	case IPSTATS_MIB_OUTNOROUTES:
4527		SKB_DR_OR(reason, IP_OUTNOROUTES);
4528		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
4529		break;
4530	}
4531
4532	/* Start over by dropping the dst for l3mdev case */
4533	if (netif_is_l3_master(skb->dev))
4534		skb_dst_drop(skb);
4535
4536	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
4537	kfree_skb_reason(skb, reason);
4538	return 0;
4539}
4540
4541static int ip6_pkt_discard(struct sk_buff *skb)
4542{
4543	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
4544}
4545
4546static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4547{
4548	skb->dev = skb_dst(skb)->dev;
4549	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
4550}
4551
4552static int ip6_pkt_prohibit(struct sk_buff *skb)
4553{
4554	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
4555}
4556
4557static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4558{
4559	skb->dev = skb_dst(skb)->dev;
4560	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
4561}
4562
4563/*
4564 *	Allocate a dst for local (unicast / anycast) address.
4565 */
4566
4567struct fib6_info *addrconf_f6i_alloc(struct net *net,
4568				     struct inet6_dev *idev,
4569				     const struct in6_addr *addr,
4570				     bool anycast, gfp_t gfp_flags,
4571				     struct netlink_ext_ack *extack)
4572{
4573	struct fib6_config cfg = {
4574		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
4575		.fc_ifindex = idev->dev->ifindex,
4576		.fc_flags = RTF_UP | RTF_NONEXTHOP,
4577		.fc_dst = *addr,
4578		.fc_dst_len = 128,
4579		.fc_protocol = RTPROT_KERNEL,
4580		.fc_nlinfo.nl_net = net,
4581		.fc_ignore_dev_down = true,
4582	};
4583	struct fib6_info *f6i;
4584
4585	if (anycast) {
4586		cfg.fc_type = RTN_ANYCAST;
4587		cfg.fc_flags |= RTF_ANYCAST;
4588	} else {
4589		cfg.fc_type = RTN_LOCAL;
4590		cfg.fc_flags |= RTF_LOCAL;
4591	}
4592
4593	f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
4594	if (!IS_ERR(f6i)) {
4595		f6i->dst_nocount = true;
4596
4597		if (!anycast &&
4598		    (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
4599		     READ_ONCE(idev->cnf.disable_policy)))
4600			f6i->dst_nopolicy = true;
4601	}
4602
4603	return f6i;
4604}
4605
/* remove deleted ip from prefsrc entries */

/* Argument handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net	*net;	/* namespace being walked */
	struct in6_addr	*addr;	/* address that is going away */
};
4611
4612static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
4613{
 
4614	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
4615	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
4616
4617	if (!rt->nh &&
 
4618	    rt != net->ipv6.fib6_null_entry &&
4619	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) &&
4620	    !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) {
4621		spin_lock_bh(&rt6_exception_lock);
4622		/* remove prefsrc entry */
4623		rt->fib6_prefsrc.plen = 0;
4624		spin_unlock_bh(&rt6_exception_lock);
4625	}
4626	return 0;
4627}
4628
4629void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
4630{
4631	struct net *net = dev_net(ifp->idev->dev);
4632	struct arg_dev_net_ip adni = {
 
4633		.net = net,
4634		.addr = &ifp->addr,
4635	};
4636	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
4637}
4638
/* Flag pair tested as a whole (both bits set) to recognise RA routes. */
#define RTF_RA_ROUTER		(RTF_DEFAULT | RTF_ADDRCONF)
4640
4641/* Remove routers and update dst entries when gateway turn into host. */
4642static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
4643{
4644	struct in6_addr *gateway = (struct in6_addr *)arg;
4645	struct fib6_nh *nh;
4646
4647	/* RA routes do not use nexthops */
4648	if (rt->nh)
4649		return 0;
4650
4651	nh = rt->fib6_nh;
4652	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
4653	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
4654		return -1;
4655
4656	/* Further clean up cached routes in exception table.
4657	 * This is needed because cached route may have a different
4658	 * gateway than its 'parent' in the case of an ip redirect.
4659	 */
4660	fib6_nh_exceptions_clean_tohost(nh, gateway);
4661
4662	return 0;
4663}
4664
4665void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
4666{
4667	fib6_clean_all(net, fib6_clean_tohost, gateway);
4668}
4669
/* Argument for the device up/down walkers.  The union member in use
 * depends on the walker: fib6_ifup() reads nh_flags, fib6_ifdown() reads
 * event.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event is about */
	union {
		unsigned char nh_flags;	/* nexthop flags to clear */
		unsigned long event;	/* NETDEV_* event being handled */
	};
};
4677
4678static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
4679{
4680	struct fib6_info *iter;
4681	struct fib6_node *fn;
4682
4683	fn = rcu_dereference_protected(rt->fib6_node,
4684			lockdep_is_held(&rt->fib6_table->tb6_lock));
4685	iter = rcu_dereference_protected(fn->leaf,
4686			lockdep_is_held(&rt->fib6_table->tb6_lock));
4687	while (iter) {
4688		if (iter->fib6_metric == rt->fib6_metric &&
4689		    rt6_qualify_for_ecmp(iter))
4690			return iter;
4691		iter = rcu_dereference_protected(iter->fib6_next,
4692				lockdep_is_held(&rt->fib6_table->tb6_lock));
4693	}
4694
4695	return NULL;
4696}
4697
4698/* only called for fib entries with builtin fib6_nh */
4699static bool rt6_is_dead(const struct fib6_info *rt)
4700{
4701	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
4702	    (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
4703	     ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
4704		return true;
4705
4706	return false;
4707}
4708
4709static int rt6_multipath_total_weight(const struct fib6_info *rt)
4710{
4711	struct fib6_info *iter;
4712	int total = 0;
4713
4714	if (!rt6_is_dead(rt))
4715		total += rt->fib6_nh->fib_nh_weight;
4716
4717	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
4718		if (!rt6_is_dead(iter))
4719			total += iter->fib6_nh->fib_nh_weight;
4720	}
4721
4722	return total;
4723}
4724
4725static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
4726{
4727	int upper_bound = -1;
4728
4729	if (!rt6_is_dead(rt)) {
4730		*weight += rt->fib6_nh->fib_nh_weight;
4731		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
4732						    total) - 1;
4733	}
4734	atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
4735}
4736
4737static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
4738{
4739	struct fib6_info *iter;
4740	int weight = 0;
4741
4742	rt6_upper_bound_set(rt, &weight, total);
4743
4744	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4745		rt6_upper_bound_set(iter, &weight, total);
4746}
4747
4748void rt6_multipath_rebalance(struct fib6_info *rt)
4749{
4750	struct fib6_info *first;
4751	int total;
4752
4753	/* In case the entire multipath route was marked for flushing,
4754	 * then there is no need to rebalance upon the removal of every
4755	 * sibling route.
4756	 */
4757	if (!rt->fib6_nsiblings || rt->should_flush)
4758		return;
4759
4760	/* During lookup routes are evaluated in order, so we need to
4761	 * make sure upper bounds are assigned from the first sibling
4762	 * onwards.
4763	 */
4764	first = rt6_multipath_first_sibling(rt);
4765	if (WARN_ON_ONCE(!first))
4766		return;
4767
4768	total = rt6_multipath_total_weight(first);
4769	rt6_multipath_upper_bound_set(first, total);
4770}
4771
4772static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4773{
4774	const struct arg_netdev_event *arg = p_arg;
4775	struct net *net = dev_net(arg->dev);
4776
4777	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
4778	    rt->fib6_nh->fib_nh_dev == arg->dev) {
4779		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
4780		fib6_update_sernum_upto_root(net, rt);
4781		rt6_multipath_rebalance(rt);
4782	}
4783
4784	return 0;
4785}
4786
4787void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4788{
4789	struct arg_netdev_event arg = {
4790		.dev = dev,
4791		{
4792			.nh_flags = nh_flags,
4793		},
4794	};
4795
4796	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4797		arg.nh_flags |= RTNH_F_LINKDOWN;
4798
4799	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4800}
4801
4802/* only called for fib entries with inline fib6_nh */
4803static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4804				   const struct net_device *dev)
4805{
4806	struct fib6_info *iter;
4807
4808	if (rt->fib6_nh->fib_nh_dev == dev)
4809		return true;
4810	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4811		if (iter->fib6_nh->fib_nh_dev == dev)
4812			return true;
4813
4814	return false;
4815}
4816
4817static void rt6_multipath_flush(struct fib6_info *rt)
4818{
4819	struct fib6_info *iter;
4820
4821	rt->should_flush = 1;
4822	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4823		iter->should_flush = 1;
4824}
4825
4826static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4827					     const struct net_device *down_dev)
4828{
4829	struct fib6_info *iter;
4830	unsigned int dead = 0;
4831
4832	if (rt->fib6_nh->fib_nh_dev == down_dev ||
4833	    rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4834		dead++;
4835	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4836		if (iter->fib6_nh->fib_nh_dev == down_dev ||
4837		    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4838			dead++;
4839
4840	return dead;
4841}
4842
4843static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4844				       const struct net_device *dev,
4845				       unsigned char nh_flags)
4846{
4847	struct fib6_info *iter;
4848
4849	if (rt->fib6_nh->fib_nh_dev == dev)
4850		rt->fib6_nh->fib_nh_flags |= nh_flags;
4851	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4852		if (iter->fib6_nh->fib_nh_dev == dev)
4853			iter->fib6_nh->fib_nh_flags |= nh_flags;
4854}
4855
4856/* called with write lock held for table with rt */
4857static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4858{
4859	const struct arg_netdev_event *arg = p_arg;
4860	const struct net_device *dev = arg->dev;
4861	struct net *net = dev_net(dev);
4862
4863	if (rt == net->ipv6.fib6_null_entry || rt->nh)
4864		return 0;
4865
4866	switch (arg->event) {
4867	case NETDEV_UNREGISTER:
4868		return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4869	case NETDEV_DOWN:
4870		if (rt->should_flush)
4871			return -1;
4872		if (!rt->fib6_nsiblings)
4873			return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4874		if (rt6_multipath_uses_dev(rt, dev)) {
4875			unsigned int count;
4876
4877			count = rt6_multipath_dead_count(rt, dev);
4878			if (rt->fib6_nsiblings + 1 == count) {
4879				rt6_multipath_flush(rt);
4880				return -1;
4881			}
4882			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4883						   RTNH_F_LINKDOWN);
4884			fib6_update_sernum(net, rt);
4885			rt6_multipath_rebalance(rt);
4886		}
4887		return -2;
4888	case NETDEV_CHANGE:
4889		if (rt->fib6_nh->fib_nh_dev != dev ||
4890		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4891			break;
4892		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
4893		rt6_multipath_rebalance(rt);
4894		break;
4895	}
4896
4897	return 0;
4898}
4899
4900void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4901{
4902	struct arg_netdev_event arg = {
4903		.dev = dev,
4904		{
4905			.event = event,
4906		},
4907	};
4908	struct net *net = dev_net(dev);
4909
4910	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4911		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4912	else
4913		fib6_clean_all(net, fib6_ifdown, &arg);
4914}
4915
4916void rt6_disable_ip(struct net_device *dev, unsigned long event)
4917{
4918	rt6_sync_down_dev(dev, event);
4919	rt6_uncached_list_flush_dev(dev);
4920	neigh_ifdown(&nd_tbl, dev);
4921}
4922
/* Argument threaded through the MTU change walkers. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
	struct fib6_info *f6i;	/* route being visited; set per route by
				 * rt6_mtu_change_route()
				 */
};
4928
4929static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4930{
4931	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4932	struct fib6_info *f6i = arg->f6i;
4933
4934	/* For administrative MTU increase, there is no way to discover
4935	 * IPv6 PMTU increase, so PMTU increase should be updated here.
4936	 * Since RFC 1981 doesn't include administrative MTU increase
4937	 * update PMTU increase is a MUST. (i.e. jumbo frame)
4938	 */
4939	if (nh->fib_nh_dev == arg->dev) {
4940		struct inet6_dev *idev = __in6_dev_get(arg->dev);
4941		u32 mtu = f6i->fib6_pmtu;
4942
4943		if (mtu >= arg->mtu ||
4944		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4945			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4946
4947		spin_lock_bh(&rt6_exception_lock);
4948		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4949		spin_unlock_bh(&rt6_exception_lock);
4950	}
4951
4952	return 0;
4953}
4954
4955static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
4956{
4957	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4958	struct inet6_dev *idev;
4959
4960	/* In IPv6 pmtu discovery is not optional,
4961	   so that RTAX_MTU lock cannot disable it.
4962	   We still use this lock to block changes
4963	   caused by addrconf/ndisc.
4964	*/
4965
4966	idev = __in6_dev_get(arg->dev);
4967	if (!idev)
4968		return 0;
4969
4970	if (fib6_metric_locked(f6i, RTAX_MTU))
4971		return 0;
4972
4973	arg->f6i = f6i;
4974	if (f6i->nh) {
4975		/* fib6_nh_mtu_change only returns 0, so this is safe */
4976		return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
4977						arg);
4978	}
4979
4980	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
4981}
4982
4983void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4984{
4985	struct rt6_mtu_change_arg arg = {
4986		.dev = dev,
4987		.mtu = mtu,
4988	};
4989
4990	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4991}
4992
4993static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4994	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
4995	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4996	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4997	[RTA_OIF]               = { .type = NLA_U32 },
4998	[RTA_IIF]		= { .type = NLA_U32 },
4999	[RTA_PRIORITY]          = { .type = NLA_U32 },
5000	[RTA_METRICS]           = { .type = NLA_NESTED },
5001	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
5002	[RTA_PREF]              = { .type = NLA_U8 },
5003	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
5004	[RTA_ENCAP]		= { .type = NLA_NESTED },
5005	[RTA_EXPIRES]		= { .type = NLA_U32 },
5006	[RTA_UID]		= { .type = NLA_U32 },
5007	[RTA_MARK]		= { .type = NLA_U32 },
5008	[RTA_TABLE]		= { .type = NLA_U32 },
5009	[RTA_IP_PROTO]		= { .type = NLA_U8 },
5010	[RTA_SPORT]		= { .type = NLA_U16 },
5011	[RTA_DPORT]		= { .type = NLA_U16 },
5012	[RTA_NH_ID]		= { .type = NLA_U32 },
5013};
5014
5015static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
5016			      struct fib6_config *cfg,
5017			      struct netlink_ext_ack *extack)
5018{
5019	struct rtmsg *rtm;
5020	struct nlattr *tb[RTA_MAX+1];
5021	unsigned int pref;
5022	int err;
5023
5024	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5025				     rtm_ipv6_policy, extack);
5026	if (err < 0)
5027		goto errout;
5028
5029	err = -EINVAL;
5030	rtm = nlmsg_data(nlh);
5031
5032	if (rtm->rtm_tos) {
5033		NL_SET_ERR_MSG(extack,
5034			       "Invalid dsfield (tos): option not available for IPv6");
5035		goto errout;
5036	}
5037
5038	*cfg = (struct fib6_config){
5039		.fc_table = rtm->rtm_table,
5040		.fc_dst_len = rtm->rtm_dst_len,
5041		.fc_src_len = rtm->rtm_src_len,
5042		.fc_flags = RTF_UP,
5043		.fc_protocol = rtm->rtm_protocol,
5044		.fc_type = rtm->rtm_type,
5045
5046		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
5047		.fc_nlinfo.nlh = nlh,
5048		.fc_nlinfo.nl_net = sock_net(skb->sk),
5049	};
5050
5051	if (rtm->rtm_type == RTN_UNREACHABLE ||
5052	    rtm->rtm_type == RTN_BLACKHOLE ||
5053	    rtm->rtm_type == RTN_PROHIBIT ||
5054	    rtm->rtm_type == RTN_THROW)
5055		cfg->fc_flags |= RTF_REJECT;
5056
5057	if (rtm->rtm_type == RTN_LOCAL)
5058		cfg->fc_flags |= RTF_LOCAL;
5059
5060	if (rtm->rtm_flags & RTM_F_CLONED)
5061		cfg->fc_flags |= RTF_CACHE;
5062
5063	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
5064
5065	if (tb[RTA_NH_ID]) {
5066		if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
5067		    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
5068			NL_SET_ERR_MSG(extack,
5069				       "Nexthop specification and nexthop id are mutually exclusive");
5070			goto errout;
5071		}
5072		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
5073	}
5074
5075	if (tb[RTA_GATEWAY]) {
5076		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
5077		cfg->fc_flags |= RTF_GATEWAY;
5078	}
5079	if (tb[RTA_VIA]) {
5080		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
5081		goto errout;
5082	}
5083
5084	if (tb[RTA_DST]) {
5085		int plen = (rtm->rtm_dst_len + 7) >> 3;
5086
5087		if (nla_len(tb[RTA_DST]) < plen)
5088			goto errout;
5089
5090		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
5091	}
5092
5093	if (tb[RTA_SRC]) {
5094		int plen = (rtm->rtm_src_len + 7) >> 3;
5095
5096		if (nla_len(tb[RTA_SRC]) < plen)
5097			goto errout;
5098
5099		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
5100	}
5101
5102	if (tb[RTA_PREFSRC])
5103		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
5104
5105	if (tb[RTA_OIF])
5106		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
5107
5108	if (tb[RTA_PRIORITY])
5109		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
5110
5111	if (tb[RTA_METRICS]) {
5112		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
5113		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
5114	}
5115
5116	if (tb[RTA_TABLE])
5117		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
5118
5119	if (tb[RTA_MULTIPATH]) {
5120		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
5121		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
5122
5123		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
5124						     cfg->fc_mp_len, extack);
5125		if (err < 0)
5126			goto errout;
5127	}
5128
5129	if (tb[RTA_PREF]) {
5130		pref = nla_get_u8(tb[RTA_PREF]);
5131		if (pref != ICMPV6_ROUTER_PREF_LOW &&
5132		    pref != ICMPV6_ROUTER_PREF_HIGH)
5133			pref = ICMPV6_ROUTER_PREF_MEDIUM;
5134		cfg->fc_flags |= RTF_PREF(pref);
5135	}
5136
5137	if (tb[RTA_ENCAP])
5138		cfg->fc_encap = tb[RTA_ENCAP];
5139
5140	if (tb[RTA_ENCAP_TYPE]) {
5141		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
5142
5143		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
5144		if (err < 0)
5145			goto errout;
5146	}
5147
5148	if (tb[RTA_EXPIRES]) {
5149		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
5150
5151		if (addrconf_finite_timeout(timeout)) {
5152			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
5153			cfg->fc_flags |= RTF_EXPIRES;
5154		}
5155	}
5156
5157	err = 0;
5158errout:
5159	return err;
5160}
5161
5162struct rt6_nh {
5163	struct fib6_info *fib6_info;
5164	struct fib6_config r_cfg;
5165	struct list_head next;
5166};
5167
5168static int ip6_route_info_append(struct net *net,
5169				 struct list_head *rt6_nh_list,
5170				 struct fib6_info *rt,
5171				 struct fib6_config *r_cfg)
5172{
5173	struct rt6_nh *nh;
5174	int err = -EEXIST;
5175
5176	list_for_each_entry(nh, rt6_nh_list, next) {
5177		/* check if fib6_info already exists */
5178		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
5179			return err;
5180	}
5181
5182	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
5183	if (!nh)
5184		return -ENOMEM;
5185	nh->fib6_info = rt;
5186	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
5187	list_add_tail(&nh->next, rt6_nh_list);
5188
5189	return 0;
5190}
5191
5192static void ip6_route_mpath_notify(struct fib6_info *rt,
5193				   struct fib6_info *rt_last,
5194				   struct nl_info *info,
5195				   __u16 nlflags)
5196{
5197	/* if this is an APPEND route, then rt points to the first route
5198	 * inserted and rt_last points to last route inserted. Userspace
5199	 * wants a consistent dump of the route which starts at the first
5200	 * nexthop. Since sibling routes are always added at the end of
5201	 * the list, find the first sibling of the last route appended
5202	 */
5203	rcu_read_lock();
5204
5205	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
5206		rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
5207					    struct fib6_info,
5208					    fib6_siblings);
5209	}
5210
5211	if (rt)
5212		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
5213
5214	rcu_read_unlock();
5215}
5216
5217static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
5218{
5219	bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
5220	bool should_notify = false;
5221	struct fib6_info *leaf;
5222	struct fib6_node *fn;
5223
5224	rcu_read_lock();
5225	fn = rcu_dereference(rt->fib6_node);
5226	if (!fn)
5227		goto out;
5228
5229	leaf = rcu_dereference(fn->leaf);
5230	if (!leaf)
5231		goto out;
5232
5233	if (rt == leaf ||
5234	    (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
5235	     rt6_qualify_for_ecmp(leaf)))
5236		should_notify = true;
5237out:
5238	rcu_read_unlock();
5239
5240	return should_notify;
5241}
5242
5243static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
5244			     struct netlink_ext_ack *extack)
5245{
5246	if (nla_len(nla) < sizeof(*gw)) {
5247		NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
5248		return -EINVAL;
5249	}
5250
5251	*gw = nla_get_in6_addr(nla);
5252
5253	return 0;
5254}
5255
5256static int ip6_route_multipath_add(struct fib6_config *cfg,
5257				   struct netlink_ext_ack *extack)
5258{
5259	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
5260	struct nl_info *info = &cfg->fc_nlinfo;
 
5261	struct fib6_config r_cfg;
5262	struct rtnexthop *rtnh;
5263	struct fib6_info *rt;
5264	struct rt6_nh *err_nh;
5265	struct rt6_nh *nh, *nh_safe;
5266	__u16 nlflags;
5267	int remaining;
5268	int attrlen;
5269	int err = 1;
5270	int nhn = 0;
5271	int replace = (cfg->fc_nlinfo.nlh &&
5272		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
5273	LIST_HEAD(rt6_nh_list);
5274
5275	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
5276	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
5277		nlflags |= NLM_F_APPEND;
5278
5279	remaining = cfg->fc_mp_len;
5280	rtnh = (struct rtnexthop *)cfg->fc_mp;
5281
5282	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
5283	 * fib6_info structs per nexthop
5284	 */
5285	while (rtnh_ok(rtnh, remaining)) {
5286		memcpy(&r_cfg, cfg, sizeof(*cfg));
5287		if (rtnh->rtnh_ifindex)
5288			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5289
5290		attrlen = rtnh_attrlen(rtnh);
5291		if (attrlen > 0) {
5292			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5293
5294			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5295			if (nla) {
5296				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
5297							extack);
5298				if (err)
5299					goto cleanup;
5300
5301				r_cfg.fc_flags |= RTF_GATEWAY;
5302			}
5303			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
5304
5305			/* RTA_ENCAP_TYPE length checked in
5306			 * lwtunnel_valid_encap_type_attr
5307			 */
5308			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
5309			if (nla)
5310				r_cfg.fc_encap_type = nla_get_u16(nla);
5311		}
5312
5313		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
5314		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
5315		if (IS_ERR(rt)) {
5316			err = PTR_ERR(rt);
5317			rt = NULL;
5318			goto cleanup;
5319		}
5320		if (!rt6_qualify_for_ecmp(rt)) {
5321			err = -EINVAL;
5322			NL_SET_ERR_MSG(extack,
5323				       "Device only routes can not be added for IPv6 using the multipath API.");
5324			fib6_info_release(rt);
5325			goto cleanup;
5326		}
5327
5328		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
5329
5330		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
5331					    rt, &r_cfg);
5332		if (err) {
5333			fib6_info_release(rt);
5334			goto cleanup;
5335		}
5336
5337		rtnh = rtnh_next(rtnh, &remaining);
5338	}
5339
5340	if (list_empty(&rt6_nh_list)) {
5341		NL_SET_ERR_MSG(extack,
5342			       "Invalid nexthop configuration - no valid nexthops");
5343		return -EINVAL;
5344	}
5345
5346	/* for add and replace send one notification with all nexthops.
5347	 * Skip the notification in fib6_add_rt2node and send one with
5348	 * the full route when done
5349	 */
5350	info->skip_notify = 1;
5351
5352	/* For add and replace, send one notification with all nexthops. For
5353	 * append, send one notification with all appended nexthops.
5354	 */
5355	info->skip_notify_kernel = 1;
5356
5357	err_nh = NULL;
5358	list_for_each_entry(nh, &rt6_nh_list, next) {
5359		err = __ip6_ins_rt(nh->fib6_info, info, extack);
 
 
 
 
 
5360
 
 
 
 
 
 
 
5361		if (err) {
5362			if (replace && nhn)
5363				NL_SET_ERR_MSG_MOD(extack,
5364						   "multipath route replace failed (check consistency of installed routes)");
5365			err_nh = nh;
5366			goto add_errout;
5367		}
5368		/* save reference to last route successfully inserted */
5369		rt_last = nh->fib6_info;
5370
5371		/* save reference to first route for notification */
5372		if (!rt_notif)
5373			rt_notif = nh->fib6_info;
5374
5375		/* Because each route is added like a single route we remove
5376		 * these flags after the first nexthop: if there is a collision,
5377		 * we have already failed to add the first nexthop:
5378		 * fib6_add_rt2node() has rejected it; when replacing, old
5379		 * nexthops have been replaced by first new, the rest should
5380		 * be added to it.
5381		 */
5382		if (cfg->fc_nlinfo.nlh) {
5383			cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
5384							     NLM_F_REPLACE);
5385			cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
5386		}
5387		nhn++;
5388	}
5389
5390	/* An in-kernel notification should only be sent in case the new
5391	 * multipath route is added as the first route in the node, or if
5392	 * it was appended to it. We pass 'rt_notif' since it is the first
5393	 * sibling and might allow us to skip some checks in the replace case.
5394	 */
5395	if (ip6_route_mpath_should_notify(rt_notif)) {
5396		enum fib_event_type fib_event;
5397
5398		if (rt_notif->fib6_nsiblings != nhn - 1)
5399			fib_event = FIB_EVENT_ENTRY_APPEND;
5400		else
5401			fib_event = FIB_EVENT_ENTRY_REPLACE;
5402
5403		err = call_fib6_multipath_entry_notifiers(info->nl_net,
5404							  fib_event, rt_notif,
5405							  nhn - 1, extack);
5406		if (err) {
5407			/* Delete all the siblings that were just added */
5408			err_nh = NULL;
5409			goto add_errout;
5410		}
5411	}
5412
5413	/* success ... tell user about new route */
5414	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5415	goto cleanup;
5416
5417add_errout:
5418	/* send notification for routes that were added so that
5419	 * the delete notifications sent by ip6_route_del are
5420	 * coherent
5421	 */
5422	if (rt_notif)
5423		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5424
5425	/* Delete routes that were already added */
5426	list_for_each_entry(nh, &rt6_nh_list, next) {
5427		if (err_nh == nh)
5428			break;
5429		ip6_route_del(&nh->r_cfg, extack);
5430	}
5431
5432cleanup:
5433	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
5434		fib6_info_release(nh->fib6_info);
 
5435		list_del(&nh->next);
5436		kfree(nh);
5437	}
5438
5439	return err;
5440}
5441
5442static int ip6_route_multipath_del(struct fib6_config *cfg,
5443				   struct netlink_ext_ack *extack)
5444{
5445	struct fib6_config r_cfg;
5446	struct rtnexthop *rtnh;
5447	int last_err = 0;
5448	int remaining;
5449	int attrlen;
5450	int err;
5451
5452	remaining = cfg->fc_mp_len;
5453	rtnh = (struct rtnexthop *)cfg->fc_mp;
5454
5455	/* Parse a Multipath Entry */
5456	while (rtnh_ok(rtnh, remaining)) {
5457		memcpy(&r_cfg, cfg, sizeof(*cfg));
5458		if (rtnh->rtnh_ifindex)
5459			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5460
5461		attrlen = rtnh_attrlen(rtnh);
5462		if (attrlen > 0) {
5463			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5464
5465			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5466			if (nla) {
5467				err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
5468							extack);
5469				if (err) {
5470					last_err = err;
5471					goto next_rtnh;
5472				}
5473
5474				r_cfg.fc_flags |= RTF_GATEWAY;
5475			}
5476		}
5477		err = ip6_route_del(&r_cfg, extack);
5478		if (err)
5479			last_err = err;
5480
5481next_rtnh:
5482		rtnh = rtnh_next(rtnh, &remaining);
5483	}
5484
5485	return last_err;
5486}
5487
5488static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
5489			      struct netlink_ext_ack *extack)
5490{
5491	struct fib6_config cfg;
5492	int err;
5493
5494	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
5495	if (err < 0)
5496		return err;
5497
5498	if (cfg.fc_nh_id &&
5499	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
5500		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
5501		return -EINVAL;
5502	}
5503
5504	if (cfg.fc_mp)
5505		return ip6_route_multipath_del(&cfg, extack);
5506	else {
5507		cfg.fc_delete_all_nh = 1;
5508		return ip6_route_del(&cfg, extack);
5509	}
5510}
5511
5512static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
5513			      struct netlink_ext_ack *extack)
5514{
5515	struct fib6_config cfg;
5516	int err;
5517
5518	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
5519	if (err < 0)
5520		return err;
5521
5522	if (cfg.fc_metric == 0)
5523		cfg.fc_metric = IP6_RT_PRIO_USER;
5524
5525	if (cfg.fc_mp)
5526		return ip6_route_multipath_add(&cfg, extack);
5527	else
5528		return ip6_route_add(&cfg, GFP_KERNEL, extack);
5529}
5530
5531/* add the overhead of this fib6_nh to nexthop_len */
5532static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
5533{
5534	int *nexthop_len = arg;
5535
5536	*nexthop_len += nla_total_size(0)	 /* RTA_MULTIPATH */
5537		     + NLA_ALIGN(sizeof(struct rtnexthop))
5538		     + nla_total_size(16); /* RTA_GATEWAY */
5539
5540	if (nh->fib_nh_lws) {
5541		/* RTA_ENCAP_TYPE */
5542		*nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5543		/* RTA_ENCAP */
5544		*nexthop_len += nla_total_size(2);
5545	}
5546
5547	return 0;
5548}
5549
5550static size_t rt6_nlmsg_size(struct fib6_info *f6i)
5551{
5552	int nexthop_len;
5553
5554	if (f6i->nh) {
5555		nexthop_len = nla_total_size(4); /* RTA_NH_ID */
5556		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
5557					 &nexthop_len);
5558	} else {
5559		struct fib6_nh *nh = f6i->fib6_nh;
5560		struct fib6_info *sibling;
5561
5562		nexthop_len = 0;
5563		if (f6i->fib6_nsiblings) {
5564			rt6_nh_nlmsg_size(nh, &nexthop_len);
5565
5566			rcu_read_lock();
5567
5568			list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
5569						fib6_siblings) {
5570				rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
5571			}
5572
5573			rcu_read_unlock();
5574		}
5575		nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5576	}
5577
5578	return NLMSG_ALIGN(sizeof(struct rtmsg))
5579	       + nla_total_size(16) /* RTA_SRC */
5580	       + nla_total_size(16) /* RTA_DST */
5581	       + nla_total_size(16) /* RTA_GATEWAY */
5582	       + nla_total_size(16) /* RTA_PREFSRC */
5583	       + nla_total_size(4) /* RTA_TABLE */
5584	       + nla_total_size(4) /* RTA_IIF */
5585	       + nla_total_size(4) /* RTA_OIF */
5586	       + nla_total_size(4) /* RTA_PRIORITY */
5587	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
5588	       + nla_total_size(sizeof(struct rta_cacheinfo))
5589	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
5590	       + nla_total_size(1) /* RTA_PREF */
5591	       + nexthop_len;
5592}
5593
5594static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
5595				 unsigned char *flags)
5596{
5597	if (nexthop_is_multipath(nh)) {
5598		struct nlattr *mp;
5599
5600		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
5601		if (!mp)
5602			goto nla_put_failure;
5603
5604		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
5605			goto nla_put_failure;
5606
5607		nla_nest_end(skb, mp);
5608	} else {
5609		struct fib6_nh *fib6_nh;
5610
5611		fib6_nh = nexthop_fib6_nh(nh);
5612		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
5613				     flags, false) < 0)
5614			goto nla_put_failure;
5615	}
5616
5617	return 0;
5618
5619nla_put_failure:
5620	return -EMSGSIZE;
5621}
5622
5623static int rt6_fill_node(struct net *net, struct sk_buff *skb,
5624			 struct fib6_info *rt, struct dst_entry *dst,
5625			 struct in6_addr *dest, struct in6_addr *src,
5626			 int iif, int type, u32 portid, u32 seq,
5627			 unsigned int flags)
5628{
5629	struct rt6_info *rt6 = dst_rt6_info(dst);
5630	struct rt6key *rt6_dst, *rt6_src;
5631	u32 *pmetrics, table, rt6_flags;
5632	unsigned char nh_flags = 0;
5633	struct nlmsghdr *nlh;
5634	struct rtmsg *rtm;
5635	long expires = 0;
5636
5637	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
5638	if (!nlh)
5639		return -EMSGSIZE;
5640
5641	if (rt6) {
5642		rt6_dst = &rt6->rt6i_dst;
5643		rt6_src = &rt6->rt6i_src;
5644		rt6_flags = rt6->rt6i_flags;
5645	} else {
5646		rt6_dst = &rt->fib6_dst;
5647		rt6_src = &rt->fib6_src;
5648		rt6_flags = rt->fib6_flags;
5649	}
5650
5651	rtm = nlmsg_data(nlh);
5652	rtm->rtm_family = AF_INET6;
5653	rtm->rtm_dst_len = rt6_dst->plen;
5654	rtm->rtm_src_len = rt6_src->plen;
5655	rtm->rtm_tos = 0;
5656	if (rt->fib6_table)
5657		table = rt->fib6_table->tb6_id;
5658	else
5659		table = RT6_TABLE_UNSPEC;
5660	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
5661	if (nla_put_u32(skb, RTA_TABLE, table))
5662		goto nla_put_failure;
5663
5664	rtm->rtm_type = rt->fib6_type;
5665	rtm->rtm_flags = 0;
5666	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
5667	rtm->rtm_protocol = rt->fib6_protocol;
5668
5669	if (rt6_flags & RTF_CACHE)
5670		rtm->rtm_flags |= RTM_F_CLONED;
5671
5672	if (dest) {
5673		if (nla_put_in6_addr(skb, RTA_DST, dest))
5674			goto nla_put_failure;
5675		rtm->rtm_dst_len = 128;
5676	} else if (rtm->rtm_dst_len)
5677		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
5678			goto nla_put_failure;
5679#ifdef CONFIG_IPV6_SUBTREES
5680	if (src) {
5681		if (nla_put_in6_addr(skb, RTA_SRC, src))
5682			goto nla_put_failure;
5683		rtm->rtm_src_len = 128;
5684	} else if (rtm->rtm_src_len &&
5685		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
5686		goto nla_put_failure;
5687#endif
5688	if (iif) {
5689#ifdef CONFIG_IPV6_MROUTE
5690		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
5691			int err = ip6mr_get_route(net, skb, rtm, portid);
5692
5693			if (err == 0)
5694				return 0;
5695			if (err < 0)
5696				goto nla_put_failure;
5697		} else
5698#endif
5699			if (nla_put_u32(skb, RTA_IIF, iif))
5700				goto nla_put_failure;
5701	} else if (dest) {
5702		struct in6_addr saddr_buf;
5703		if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 &&
5704		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
5705			goto nla_put_failure;
5706	}
5707
5708	if (rt->fib6_prefsrc.plen) {
5709		struct in6_addr saddr_buf;
5710		saddr_buf = rt->fib6_prefsrc.addr;
5711		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
5712			goto nla_put_failure;
5713	}
5714
5715	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
5716	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
5717		goto nla_put_failure;
5718
5719	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
5720		goto nla_put_failure;
5721
5722	/* For multipath routes, walk the siblings list and add
5723	 * each as a nexthop within RTA_MULTIPATH.
5724	 */
5725	if (rt6) {
5726		if (rt6_flags & RTF_GATEWAY &&
5727		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
5728			goto nla_put_failure;
5729
5730		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
5731			goto nla_put_failure;
5732
5733		if (dst->lwtstate &&
5734		    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
5735			goto nla_put_failure;
5736	} else if (rt->fib6_nsiblings) {
5737		struct fib6_info *sibling;
5738		struct nlattr *mp;
5739
5740		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
5741		if (!mp)
5742			goto nla_put_failure;
5743
5744		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
5745				    rt->fib6_nh->fib_nh_weight, AF_INET6,
5746				    0) < 0)
5747			goto nla_put_failure;
5748
5749		rcu_read_lock();
5750
5751		list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
5752					fib6_siblings) {
5753			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
5754					    sibling->fib6_nh->fib_nh_weight,
5755					    AF_INET6, 0) < 0) {
5756				rcu_read_unlock();
5757
5758				goto nla_put_failure;
5759			}
5760		}
5761
5762		rcu_read_unlock();
5763
5764		nla_nest_end(skb, mp);
5765	} else if (rt->nh) {
5766		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
5767			goto nla_put_failure;
5768
5769		if (nexthop_is_blackhole(rt->nh))
5770			rtm->rtm_type = RTN_BLACKHOLE;
5771
5772		if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
5773		    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
5774			goto nla_put_failure;
5775
5776		rtm->rtm_flags |= nh_flags;
5777	} else {
5778		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
5779				     &nh_flags, false) < 0)
5780			goto nla_put_failure;
5781
5782		rtm->rtm_flags |= nh_flags;
5783	}
5784
5785	if (rt6_flags & RTF_EXPIRES) {
5786		expires = dst ? dst->expires : rt->expires;
5787		expires -= jiffies;
5788	}
5789
5790	if (!dst) {
5791		if (READ_ONCE(rt->offload))
5792			rtm->rtm_flags |= RTM_F_OFFLOAD;
5793		if (READ_ONCE(rt->trap))
5794			rtm->rtm_flags |= RTM_F_TRAP;
5795		if (READ_ONCE(rt->offload_failed))
5796			rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
5797	}
5798
5799	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
5800		goto nla_put_failure;
5801
5802	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
5803		goto nla_put_failure;
5804
5805
5806	nlmsg_end(skb, nlh);
5807	return 0;
5808
5809nla_put_failure:
5810	nlmsg_cancel(skb, nlh);
5811	return -EMSGSIZE;
5812}
5813
5814static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
5815{
5816	const struct net_device *dev = arg;
5817
5818	if (nh->fib_nh_dev == dev)
5819		return 1;
5820
5821	return 0;
5822}
5823
5824static bool fib6_info_uses_dev(const struct fib6_info *f6i,
5825			       const struct net_device *dev)
5826{
5827	if (f6i->nh) {
5828		struct net_device *_dev = (struct net_device *)dev;
5829
5830		return !!nexthop_for_each_fib6_nh(f6i->nh,
5831						  fib6_info_nh_uses_dev,
5832						  _dev);
5833	}
5834
5835	if (f6i->fib6_nh->fib_nh_dev == dev)
5836		return true;
5837
5838	if (f6i->fib6_nsiblings) {
5839		struct fib6_info *sibling, *next_sibling;
5840
5841		list_for_each_entry_safe(sibling, next_sibling,
5842					 &f6i->fib6_siblings, fib6_siblings) {
5843			if (sibling->fib6_nh->fib_nh_dev == dev)
5844				return true;
5845		}
5846	}
5847
5848	return false;
5849}
5850
/* State shared between rt6_dump_route() and rt6_nh_dump_exceptions()
 * while dumping the exception entries of one route.
 */
struct fib6_nh_exception_dump_walker {
	struct rt6_rtnl_dump_arg *dump;	/* dump context (net, skb, cb) */
	struct fib6_info *rt;		/* route the exceptions belong to */
	unsigned int flags;		/* nlmsg flags for each message */
	unsigned int skip;		/* entries still to be skipped */
	unsigned int count;		/* entries handled so far */
};
5858
5859static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
5860{
5861	struct fib6_nh_exception_dump_walker *w = arg;
5862	struct rt6_rtnl_dump_arg *dump = w->dump;
5863	struct rt6_exception_bucket *bucket;
5864	struct rt6_exception *rt6_ex;
5865	int i, err;
5866
5867	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
5868	if (!bucket)
5869		return 0;
5870
5871	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
5872		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
5873			if (w->skip) {
5874				w->skip--;
5875				continue;
5876			}
5877
5878			/* Expiration of entries doesn't bump sernum, insertion
5879			 * does. Removal is triggered by insertion, so we can
5880			 * rely on the fact that if entries change between two
5881			 * partial dumps, this node is scanned again completely,
5882			 * see rt6_insert_exception() and fib6_dump_table().
5883			 *
5884			 * Count expired entries we go through as handled
5885			 * entries that we'll skip next time, in case of partial
5886			 * node dump. Otherwise, if entries expire meanwhile,
5887			 * we'll skip the wrong amount.
5888			 */
5889			if (rt6_check_expired(rt6_ex->rt6i)) {
5890				w->count++;
5891				continue;
5892			}
5893
5894			err = rt6_fill_node(dump->net, dump->skb, w->rt,
5895					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
5896					    RTM_NEWROUTE,
5897					    NETLINK_CB(dump->cb->skb).portid,
5898					    dump->cb->nlh->nlmsg_seq, w->flags);
5899			if (err)
5900				return err;
5901
5902			w->count++;
5903		}
5904		bucket++;
5905	}
5906
5907	return 0;
5908}
5909
5910/* Return -1 if done with node, number of handled routes on partial dump */
5911int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
5912{
5913	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
5914	struct fib_dump_filter *filter = &arg->filter;
5915	unsigned int flags = NLM_F_MULTI;
5916	struct net *net = arg->net;
5917	int count = 0;
5918
5919	if (rt == net->ipv6.fib6_null_entry)
5920		return -1;
5921
5922	if ((filter->flags & RTM_F_PREFIX) &&
5923	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
5924		/* success since this is not a prefix route */
5925		return -1;
5926	}
5927	if (filter->filter_set &&
5928	    ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
5929	     (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
5930	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
5931		return -1;
5932	}
5933
5934	if (filter->filter_set ||
5935	    !filter->dump_routes || !filter->dump_exceptions) {
5936		flags |= NLM_F_DUMP_FILTERED;
5937	}
5938
5939	if (filter->dump_routes) {
5940		if (skip) {
5941			skip--;
5942		} else {
5943			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
5944					  0, RTM_NEWROUTE,
5945					  NETLINK_CB(arg->cb->skb).portid,
5946					  arg->cb->nlh->nlmsg_seq, flags)) {
5947				return 0;
5948			}
5949			count++;
5950		}
5951	}
5952
5953	if (filter->dump_exceptions) {
5954		struct fib6_nh_exception_dump_walker w = { .dump = arg,
5955							   .rt = rt,
5956							   .flags = flags,
5957							   .skip = skip,
5958							   .count = 0 };
5959		int err;
5960
5961		rcu_read_lock();
5962		if (rt->nh) {
5963			err = nexthop_for_each_fib6_nh(rt->nh,
5964						       rt6_nh_dump_exceptions,
5965						       &w);
5966		} else {
5967			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
5968		}
5969		rcu_read_unlock();
5970
5971		if (err)
5972			return count + w.count;
5973	}
5974
5975	return -1;
5976}
5977
5978static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
5979					const struct nlmsghdr *nlh,
5980					struct nlattr **tb,
5981					struct netlink_ext_ack *extack)
5982{
5983	struct rtmsg *rtm;
5984	int i, err;
5985
5986	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
5987		NL_SET_ERR_MSG_MOD(extack,
5988				   "Invalid header for get route request");
5989		return -EINVAL;
5990	}
5991
5992	if (!netlink_strict_get_check(skb))
5993		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5994					      rtm_ipv6_policy, extack);
5995
5996	rtm = nlmsg_data(nlh);
5997	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
5998	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
5999	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
6000	    rtm->rtm_type) {
6001		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
6002		return -EINVAL;
6003	}
6004	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
6005		NL_SET_ERR_MSG_MOD(extack,
6006				   "Invalid flags for get route request");
6007		return -EINVAL;
6008	}
6009
6010	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
6011					    rtm_ipv6_policy, extack);
6012	if (err)
6013		return err;
6014
6015	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
6016	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
6017		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
6018		return -EINVAL;
6019	}
6020
6021	for (i = 0; i <= RTA_MAX; i++) {
6022		if (!tb[i])
6023			continue;
6024
6025		switch (i) {
6026		case RTA_SRC:
6027		case RTA_DST:
6028		case RTA_IIF:
6029		case RTA_OIF:
6030		case RTA_MARK:
6031		case RTA_UID:
6032		case RTA_SPORT:
6033		case RTA_DPORT:
6034		case RTA_IP_PROTO:
6035			break;
6036		default:
6037			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
6038			return -EINVAL;
6039		}
6040	}
6041
6042	return 0;
6043}
6044
6045static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
6046			      struct netlink_ext_ack *extack)
6047{
6048	struct net *net = sock_net(in_skb->sk);
6049	struct nlattr *tb[RTA_MAX+1];
6050	int err, iif = 0, oif = 0;
6051	struct fib6_info *from;
6052	struct dst_entry *dst;
6053	struct rt6_info *rt;
6054	struct sk_buff *skb;
6055	struct rtmsg *rtm;
6056	struct flowi6 fl6 = {};
6057	bool fibmatch;
6058
6059	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
6060	if (err < 0)
6061		goto errout;
6062
6063	err = -EINVAL;
6064	rtm = nlmsg_data(nlh);
6065	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
6066	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
6067
6068	if (tb[RTA_SRC]) {
6069		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
6070			goto errout;
6071
6072		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
6073	}
6074
6075	if (tb[RTA_DST]) {
6076		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
6077			goto errout;
6078
6079		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
6080	}
6081
6082	if (tb[RTA_IIF])
6083		iif = nla_get_u32(tb[RTA_IIF]);
6084
6085	if (tb[RTA_OIF])
6086		oif = nla_get_u32(tb[RTA_OIF]);
6087
6088	if (tb[RTA_MARK])
6089		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
6090
6091	if (tb[RTA_UID])
6092		fl6.flowi6_uid = make_kuid(current_user_ns(),
6093					   nla_get_u32(tb[RTA_UID]));
6094	else
6095		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
6096
6097	if (tb[RTA_SPORT])
6098		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
6099
6100	if (tb[RTA_DPORT])
6101		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
6102
6103	if (tb[RTA_IP_PROTO]) {
6104		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
6105						  &fl6.flowi6_proto, AF_INET6,
6106						  extack);
6107		if (err)
6108			goto errout;
6109	}
6110
6111	if (iif) {
6112		struct net_device *dev;
6113		int flags = 0;
6114
6115		rcu_read_lock();
6116
6117		dev = dev_get_by_index_rcu(net, iif);
6118		if (!dev) {
6119			rcu_read_unlock();
6120			err = -ENODEV;
6121			goto errout;
6122		}
6123
6124		fl6.flowi6_iif = iif;
6125
6126		if (!ipv6_addr_any(&fl6.saddr))
6127			flags |= RT6_LOOKUP_F_HAS_SADDR;
6128
6129		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
6130
6131		rcu_read_unlock();
6132	} else {
6133		fl6.flowi6_oif = oif;
6134
6135		dst = ip6_route_output(net, NULL, &fl6);
6136	}
6137
6138
6139	rt = dst_rt6_info(dst);
6140	if (rt->dst.error) {
6141		err = rt->dst.error;
6142		ip6_rt_put(rt);
6143		goto errout;
6144	}
6145
6146	if (rt == net->ipv6.ip6_null_entry) {
6147		err = rt->dst.error;
6148		ip6_rt_put(rt);
6149		goto errout;
6150	}
6151
6152	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
6153	if (!skb) {
6154		ip6_rt_put(rt);
6155		err = -ENOBUFS;
6156		goto errout;
6157	}
6158
6159	skb_dst_set(skb, &rt->dst);
6160
6161	rcu_read_lock();
6162	from = rcu_dereference(rt->from);
6163	if (from) {
6164		if (fibmatch)
6165			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
6166					    iif, RTM_NEWROUTE,
6167					    NETLINK_CB(in_skb).portid,
6168					    nlh->nlmsg_seq, 0);
6169		else
6170			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
6171					    &fl6.saddr, iif, RTM_NEWROUTE,
6172					    NETLINK_CB(in_skb).portid,
6173					    nlh->nlmsg_seq, 0);
6174	} else {
6175		err = -ENETUNREACH;
6176	}
6177	rcu_read_unlock();
6178
6179	if (err < 0) {
6180		kfree_skb(skb);
6181		goto errout;
6182	}
6183
6184	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
6185errout:
6186	return err;
6187}
6188
6189void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
6190		     unsigned int nlm_flags)
6191{
6192	struct sk_buff *skb;
6193	struct net *net = info->nl_net;
6194	u32 seq;
6195	int err;
6196
6197	err = -ENOBUFS;
6198	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
6199
6200	skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
6201	if (!skb)
6202		goto errout;
6203
6204	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
6205			    event, info->portid, seq, nlm_flags);
6206	if (err < 0) {
6207		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6208		WARN_ON(err == -EMSGSIZE);
6209		kfree_skb(skb);
6210		goto errout;
6211	}
6212	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6213		    info->nlh, GFP_ATOMIC);
6214	return;
6215errout:
6216	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
 
6217}
6218
6219void fib6_rt_update(struct net *net, struct fib6_info *rt,
6220		    struct nl_info *info)
6221{
6222	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
6223	struct sk_buff *skb;
6224	int err = -ENOBUFS;
6225
 
 
 
 
 
6226	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6227	if (!skb)
6228		goto errout;
6229
6230	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
6231			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
6232	if (err < 0) {
6233		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6234		WARN_ON(err == -EMSGSIZE);
6235		kfree_skb(skb);
6236		goto errout;
6237	}
6238	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6239		    info->nlh, gfp_any());
6240	return;
6241errout:
6242	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6243}
6244
6245void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
6246			    bool offload, bool trap, bool offload_failed)
6247{
6248	struct sk_buff *skb;
6249	int err;
6250
6251	if (READ_ONCE(f6i->offload) == offload &&
6252	    READ_ONCE(f6i->trap) == trap &&
6253	    READ_ONCE(f6i->offload_failed) == offload_failed)
6254		return;
6255
6256	WRITE_ONCE(f6i->offload, offload);
6257	WRITE_ONCE(f6i->trap, trap);
6258
6259	/* 2 means send notifications only if offload_failed was changed. */
6260	if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
6261	    READ_ONCE(f6i->offload_failed) == offload_failed)
6262		return;
6263
6264	WRITE_ONCE(f6i->offload_failed, offload_failed);
6265
6266	if (!rcu_access_pointer(f6i->fib6_node))
6267		/* The route was removed from the tree, do not send
6268		 * notification.
6269		 */
6270		return;
6271
6272	if (!net->ipv6.sysctl.fib_notify_on_flag_change)
6273		return;
6274
6275	skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
6276	if (!skb) {
6277		err = -ENOBUFS;
6278		goto errout;
6279	}
6280
6281	err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
6282			    0, 0);
6283	if (err < 0) {
6284		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
6285		WARN_ON(err == -EMSGSIZE);
6286		kfree_skb(skb);
6287		goto errout;
6288	}
6289
6290	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
6291	return;
6292
6293errout:
6294	rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
6295}
6296EXPORT_SYMBOL(fib6_info_hw_flags_set);
6297
6298static int ip6_route_dev_notify(struct notifier_block *this,
6299				unsigned long event, void *ptr)
6300{
6301	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
6302	struct net *net = dev_net(dev);
6303
6304	if (!(dev->flags & IFF_LOOPBACK))
6305		return NOTIFY_OK;
6306
6307	if (event == NETDEV_REGISTER) {
6308		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
6309		net->ipv6.ip6_null_entry->dst.dev = dev;
6310		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
6311#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6312		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
6313		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
6314		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
6315		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
6316#endif
6317	 } else if (event == NETDEV_UNREGISTER &&
6318		    dev->reg_state != NETREG_UNREGISTERED) {
6319		/* NETDEV_UNREGISTER could be fired for multiple times by
6320		 * netdev_wait_allrefs(). Make sure we only call this once.
6321		 */
6322		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
6323#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6324		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
6325		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
6326#endif
6327	}
6328
6329	return NOTIFY_OK;
6330}
6331
6332/*
6333 *	/proc
6334 */
6335
6336#ifdef CONFIG_PROC_FS
6337static int rt6_stats_seq_show(struct seq_file *seq, void *v)
6338{
6339	struct net *net = (struct net *)seq->private;
6340	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
6341		   net->ipv6.rt6_stats->fib_nodes,
6342		   net->ipv6.rt6_stats->fib_route_nodes,
6343		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
6344		   net->ipv6.rt6_stats->fib_rt_entries,
6345		   net->ipv6.rt6_stats->fib_rt_cache,
6346		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
6347		   net->ipv6.rt6_stats->fib_discarded_routes);
6348
6349	return 0;
6350}
6351#endif	/* CONFIG_PROC_FS */
6352
6353#ifdef CONFIG_SYSCTL
6354
6355static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write,
6356			      void *buffer, size_t *lenp, loff_t *ppos)
 
6357{
6358	struct net *net;
6359	int delay;
6360	int ret;
6361	if (!write)
6362		return -EINVAL;
6363
 
 
6364	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
6365	if (ret)
6366		return ret;
6367
6368	net = (struct net *)ctl->extra1;
6369	delay = net->ipv6.sysctl.flush_delay;
6370	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
6371	return 0;
6372}
6373
6374static struct ctl_table ipv6_route_table_template[] = {
6375	{
6376		.procname	=	"max_size",
6377		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
6378		.maxlen		=	sizeof(int),
6379		.mode		=	0644,
6380		.proc_handler	=	proc_dointvec,
6381	},
6382	{
6383		.procname	=	"gc_thresh",
6384		.data		=	&ip6_dst_ops_template.gc_thresh,
6385		.maxlen		=	sizeof(int),
6386		.mode		=	0644,
6387		.proc_handler	=	proc_dointvec,
6388	},
6389	{
6390		.procname	=	"flush",
6391		.data		=	&init_net.ipv6.sysctl.flush_delay,
6392		.maxlen		=	sizeof(int),
6393		.mode		=	0200,
6394		.proc_handler	=	ipv6_sysctl_rtcache_flush
6395	},
6396	{
6397		.procname	=	"gc_min_interval",
6398		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
6399		.maxlen		=	sizeof(int),
6400		.mode		=	0644,
6401		.proc_handler	=	proc_dointvec_jiffies,
6402	},
6403	{
6404		.procname	=	"gc_timeout",
6405		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
6406		.maxlen		=	sizeof(int),
6407		.mode		=	0644,
6408		.proc_handler	=	proc_dointvec_jiffies,
6409	},
6410	{
6411		.procname	=	"gc_interval",
6412		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
6413		.maxlen		=	sizeof(int),
6414		.mode		=	0644,
6415		.proc_handler	=	proc_dointvec_jiffies,
6416	},
6417	{
6418		.procname	=	"gc_elasticity",
6419		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
6420		.maxlen		=	sizeof(int),
6421		.mode		=	0644,
6422		.proc_handler	=	proc_dointvec,
6423	},
6424	{
6425		.procname	=	"mtu_expires",
6426		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
6427		.maxlen		=	sizeof(int),
6428		.mode		=	0644,
6429		.proc_handler	=	proc_dointvec_jiffies,
6430	},
6431	{
6432		.procname	=	"min_adv_mss",
6433		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
6434		.maxlen		=	sizeof(int),
6435		.mode		=	0644,
6436		.proc_handler	=	proc_dointvec,
6437	},
6438	{
6439		.procname	=	"gc_min_interval_ms",
6440		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
6441		.maxlen		=	sizeof(int),
6442		.mode		=	0644,
6443		.proc_handler	=	proc_dointvec_ms_jiffies,
6444	},
6445	{
6446		.procname	=	"skip_notify_on_dev_down",
6447		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
6448		.maxlen		=	sizeof(u8),
6449		.mode		=	0644,
6450		.proc_handler	=	proc_dou8vec_minmax,
6451		.extra1		=	SYSCTL_ZERO,
6452		.extra2		=	SYSCTL_ONE,
6453	},
 
6454};
6455
6456struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
6457{
6458	struct ctl_table *table;
6459
6460	table = kmemdup(ipv6_route_table_template,
6461			sizeof(ipv6_route_table_template),
6462			GFP_KERNEL);
6463
6464	if (table) {
6465		table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
 
6466		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
6467		table[2].data = &net->ipv6.sysctl.flush_delay;
6468		table[2].extra1 = net;
6469		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6470		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
6471		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
6472		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
6473		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
6474		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
6475		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6476		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
 
 
 
 
6477	}
6478
6479	return table;
6480}
6481
6482size_t ipv6_route_sysctl_table_size(struct net *net)
6483{
6484	/* Don't export sysctls to unprivileged users */
6485	if (net->user_ns != &init_user_ns)
6486		return 1;
6487
6488	return ARRAY_SIZE(ipv6_route_table_template);
6489}
6490#endif
6491
6492static int __net_init ip6_route_net_init(struct net *net)
6493{
6494	int ret = -ENOMEM;
6495
6496	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
6497	       sizeof(net->ipv6.ip6_dst_ops));
6498
6499	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
6500		goto out_ip6_dst_ops;
6501
6502	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
6503	if (!net->ipv6.fib6_null_entry)
6504		goto out_ip6_dst_entries;
6505	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
6506	       sizeof(*net->ipv6.fib6_null_entry));
6507
6508	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
6509					   sizeof(*net->ipv6.ip6_null_entry),
6510					   GFP_KERNEL);
6511	if (!net->ipv6.ip6_null_entry)
6512		goto out_fib6_null_entry;
6513	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6514	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
6515			 ip6_template_metrics, true);
6516	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
6517
6518#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6519	net->ipv6.fib6_has_custom_rules = false;
6520	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
6521					       sizeof(*net->ipv6.ip6_prohibit_entry),
6522					       GFP_KERNEL);
6523	if (!net->ipv6.ip6_prohibit_entry)
6524		goto out_ip6_null_entry;
6525	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6526	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
6527			 ip6_template_metrics, true);
6528	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
6529
6530	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
6531					       sizeof(*net->ipv6.ip6_blk_hole_entry),
6532					       GFP_KERNEL);
6533	if (!net->ipv6.ip6_blk_hole_entry)
6534		goto out_ip6_prohibit_entry;
6535	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
6536	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
6537			 ip6_template_metrics, true);
6538	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
6539#ifdef CONFIG_IPV6_SUBTREES
6540	net->ipv6.fib6_routes_require_src = 0;
6541#endif
6542#endif
6543
6544	net->ipv6.sysctl.flush_delay = 0;
6545	net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
6546	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
6547	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
6548	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
6549	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
6550	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
6551	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
6552	net->ipv6.sysctl.skip_notify_on_dev_down = 0;
6553
6554	atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
6555
6556	ret = 0;
6557out:
6558	return ret;
6559
6560#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6561out_ip6_prohibit_entry:
6562	kfree(net->ipv6.ip6_prohibit_entry);
6563out_ip6_null_entry:
6564	kfree(net->ipv6.ip6_null_entry);
6565#endif
6566out_fib6_null_entry:
6567	kfree(net->ipv6.fib6_null_entry);
6568out_ip6_dst_entries:
6569	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
6570out_ip6_dst_ops:
6571	goto out;
6572}
6573
6574static void __net_exit ip6_route_net_exit(struct net *net)
6575{
6576	kfree(net->ipv6.fib6_null_entry);
6577	kfree(net->ipv6.ip6_null_entry);
6578#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6579	kfree(net->ipv6.ip6_prohibit_entry);
6580	kfree(net->ipv6.ip6_blk_hole_entry);
6581#endif
6582	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
6583}
6584
6585static int __net_init ip6_route_net_init_late(struct net *net)
6586{
6587#ifdef CONFIG_PROC_FS
6588	if (!proc_create_net("ipv6_route", 0, net->proc_net,
6589			     &ipv6_route_seq_ops,
6590			     sizeof(struct ipv6_route_iter)))
6591		return -ENOMEM;
6592
6593	if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
6594				    rt6_stats_seq_show, NULL)) {
6595		remove_proc_entry("ipv6_route", net->proc_net);
6596		return -ENOMEM;
6597	}
6598#endif
6599	return 0;
6600}
6601
/*
 * Late per-namespace exit hook: with CONFIG_PROC_FS, remove the
 * "ipv6_route" and "rt6_stats" entries from net->proc_net.
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
6609
/* Per-namespace hooks that set up and free the core IPv6 route state. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
6614
6615static int __net_init ipv6_inetpeer_init(struct net *net)
6616{
6617	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
6618
6619	if (!bp)
6620		return -ENOMEM;
6621	inet_peer_base_init(bp);
6622	net->ipv6.peers = bp;
6623	return 0;
6624}
6625
6626static void __net_exit ipv6_inetpeer_exit(struct net *net)
6627{
6628	struct inet_peer_base *bp = net->ipv6.peers;
6629
6630	net->ipv6.peers = NULL;
6631	inetpeer_invalidate_tree(bp);
6632	kfree(bp);
6633}
6634
/* Per-namespace hooks managing net->ipv6.peers. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};
6639
/* Per-namespace hooks for the late stage (procfs entries). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};
6644
/*
 * Netdevice notifier for the route code.  Its priority is set 10 below
 * ADDRCONF_NOTIFY_PRIORITY.
 * NOTE(review): presumably so that addrconf's notifier runs first -
 * confirm against the notifier chain ordering rules.
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
6649
6650void __init ip6_route_init_special_entries(void)
6651{
6652	/* Registering of the loopback is done before this portion of code,
6653	 * the loopback reference in rt6_info will not be taken, do it
6654	 * manually for init_net */
6655	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
6656	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
6657	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6658  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6659	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
6660	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6661	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
6662	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6663  #endif
6664}
6665
/*
 * BPF iterator target "ipv6_route".  Only built when IPv6 is built in
 * and both the BPF syscall and procfs are configured.
 */
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)

/* BTF id of struct fib6_info, filled into ctx_arg_info at registration. */
BTF_ID_LIST(btf_fib6_info_id)
BTF_ID(struct, fib6_info)

/* The iterator shares ipv6_route_seq_ops with the procfs "ipv6_route" file. */
static const struct bpf_iter_seq_info ipv6_route_seq_info = {
	.seq_ops		= &ipv6_route_seq_ops,
	.init_seq_private	= bpf_iter_init_seq_net,
	.fini_seq_private	= bpf_iter_fini_seq_net,
	.seq_priv_size		= sizeof(struct ipv6_route_iter),
};

/* One context argument: the 'rt' pointer, which may be NULL. */
static struct bpf_iter_reg ipv6_route_reg_info = {
	.target			= "ipv6_route",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__ipv6_route, rt),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.seq_info		= &ipv6_route_seq_info,
};

/* Fill in the BTF id of struct fib6_info and register the target. */
static int __init bpf_iter_register(void)
{
	ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id;
	return bpf_iter_reg_target(&ipv6_route_reg_info);
}

/* Unregister the "ipv6_route" iterator target. */
static void bpf_iter_unregister(void)
{
	bpf_iter_unreg_target(&ipv6_route_reg_info);
}
#endif
#endif
6702
/*
 * rtnetlink handlers for PF_INET6 route messages.  Only RTM_GETROUTE
 * carries RTNL_FLAG_DOIT_UNLOCKED.
 */
static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = {
	{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE,
	 .doit = inet6_rtm_newroute},
	{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE,
	 .doit = inet6_rtm_delroute},
	{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
	 .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
};
6711
6712int __init ip6_route_init(void)
6713{
6714	int ret;
6715	int cpu;
6716
6717	ret = -ENOMEM;
6718	ip6_dst_ops_template.kmem_cachep =
6719		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
6720				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
6721	if (!ip6_dst_ops_template.kmem_cachep)
6722		goto out;
6723
6724	ret = dst_entries_init(&ip6_dst_blackhole_ops);
6725	if (ret)
6726		goto out_kmem_cache;
6727
6728	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
6729	if (ret)
6730		goto out_dst_entries;
6731
6732	ret = register_pernet_subsys(&ip6_route_net_ops);
6733	if (ret)
6734		goto out_register_inetpeer;
6735
6736	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
6737
6738	ret = fib6_init();
6739	if (ret)
6740		goto out_register_subsys;
6741
6742	ret = xfrm6_init();
6743	if (ret)
6744		goto out_fib6_init;
6745
6746	ret = fib6_rules_init();
6747	if (ret)
6748		goto xfrm6_init;
6749
6750	ret = register_pernet_subsys(&ip6_route_net_late_ops);
6751	if (ret)
6752		goto fib6_rules_init;
6753
6754	ret = rtnl_register_many(ip6_route_rtnl_msg_handlers);
 
6755	if (ret < 0)
6756		goto out_register_late_subsys;
6757
6758	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
6759	if (ret)
 
 
 
 
 
 
 
6760		goto out_register_late_subsys;
6761
6762#if IS_BUILTIN(CONFIG_IPV6)
6763#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6764	ret = bpf_iter_register();
6765	if (ret)
6766		goto out_register_late_subsys;
6767#endif
6768#endif
6769
6770	for_each_possible_cpu(cpu) {
6771		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
6772
6773		INIT_LIST_HEAD(&ul->head);
6774		spin_lock_init(&ul->lock);
6775	}
6776
6777out:
6778	return ret;
6779
6780out_register_late_subsys:
6781	rtnl_unregister_all(PF_INET6);
6782	unregister_pernet_subsys(&ip6_route_net_late_ops);
6783fib6_rules_init:
6784	fib6_rules_cleanup();
6785xfrm6_init:
6786	xfrm6_fini();
6787out_fib6_init:
6788	fib6_gc_cleanup();
6789out_register_subsys:
6790	unregister_pernet_subsys(&ip6_route_net_ops);
6791out_register_inetpeer:
6792	unregister_pernet_subsys(&ipv6_inetpeer_ops);
6793out_dst_entries:
6794	dst_entries_destroy(&ip6_dst_blackhole_ops);
6795out_kmem_cache:
6796	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
6797	goto out;
6798}
6799
6800void ip6_route_cleanup(void)
6801{
6802#if IS_BUILTIN(CONFIG_IPV6)
6803#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
6804	bpf_iter_unregister();
6805#endif
6806#endif
6807	unregister_netdevice_notifier(&ip6_route_dev_notifier);
6808	unregister_pernet_subsys(&ip6_route_net_late_ops);
6809	fib6_rules_cleanup();
6810	xfrm6_fini();
6811	fib6_gc_cleanup();
6812	unregister_pernet_subsys(&ipv6_inetpeer_ops);
6813	unregister_pernet_subsys(&ip6_route_net_ops);
6814	dst_entries_destroy(&ip6_dst_blackhole_ops);
6815	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
6816}
v5.4
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *	Linux INET6 implementation
   4 *	FIB front-end.
   5 *
   6 *	Authors:
   7 *	Pedro Roque		<roque@di.fc.ul.pt>
   8 */
   9
  10/*	Changes:
  11 *
  12 *	YOSHIFUJI Hideaki @USAGI
  13 *		reworked default router selection.
  14 *		- respect outgoing interface
  15 *		- select from (probably) reachable routers (i.e.
  16 *		routers in REACHABLE, STALE, DELAY or PROBE states).
  17 *		- always select the same router if it is (probably)
  18 *		reachable.  otherwise, round-robin the list.
  19 *	Ville Nuorvala
  20 *		Fixed routing subtrees.
  21 */
  22
  23#define pr_fmt(fmt) "IPv6: " fmt
  24
  25#include <linux/capability.h>
  26#include <linux/errno.h>
  27#include <linux/export.h>
  28#include <linux/types.h>
  29#include <linux/times.h>
  30#include <linux/socket.h>
  31#include <linux/sockios.h>
  32#include <linux/net.h>
  33#include <linux/route.h>
  34#include <linux/netdevice.h>
  35#include <linux/in6.h>
  36#include <linux/mroute6.h>
  37#include <linux/init.h>
  38#include <linux/if_arp.h>
  39#include <linux/proc_fs.h>
  40#include <linux/seq_file.h>
  41#include <linux/nsproxy.h>
  42#include <linux/slab.h>
  43#include <linux/jhash.h>
 
  44#include <net/net_namespace.h>
  45#include <net/snmp.h>
  46#include <net/ipv6.h>
  47#include <net/ip6_fib.h>
  48#include <net/ip6_route.h>
  49#include <net/ndisc.h>
  50#include <net/addrconf.h>
  51#include <net/tcp.h>
  52#include <linux/rtnetlink.h>
  53#include <net/dst.h>
  54#include <net/dst_metadata.h>
  55#include <net/xfrm.h>
  56#include <net/netevent.h>
  57#include <net/netlink.h>
  58#include <net/rtnh.h>
  59#include <net/lwtunnel.h>
  60#include <net/ip_tunnels.h>
  61#include <net/l3mdev.h>
  62#include <net/ip.h>
  63#include <linux/uaccess.h>
 
  64
  65#ifdef CONFIG_SYSCTL
  66#include <linux/sysctl.h>
  67#endif
  68
/* Defined later in this file. */
static int ip6_rt_type_to_error(u8 fib6_type);

/*
 * Instantiate the fib6 tracepoints in this translation unit and export
 * the fib6_table_lookup tracepoint symbol (GPL-only).
 */
#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS
  75
/*
 * Result of the neighbour reachability check used while scoring
 * routes.  Failure codes are negative so callers can test "< 0".
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route must not be used */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour is in a failed state */
	RT6_NUD_FAIL_DO_RR = -1,	/* usable, but request round-robin */
	RT6_NUD_SUCCEED = 1
};
  82
/* Forward declarations: dst_ops callbacks for IPv6 dst entries. */
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

/* Input/output handlers used by the special reject entries. */
static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
/* Route scoring, netlink sizing/filling and cached-route lookup helpers. */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
			   int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr);
 112
/* Forward declarations only needed with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif
 124
/* A list of rt6_info entries (linked through rt6i_uncached) and its lock. */
struct uncached_list {
	spinlock_t		lock;	/* taken with _bh around every list update */
	struct list_head	head;
};

/* One uncached list per possible CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
 131
 132void rt6_uncached_list_add(struct rt6_info *rt)
 133{
 134	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
 135
 136	rt->rt6i_uncached_list = ul;
 137
 138	spin_lock_bh(&ul->lock);
 139	list_add_tail(&rt->rt6i_uncached, &ul->head);
 140	spin_unlock_bh(&ul->lock);
 141}
 142
 143void rt6_uncached_list_del(struct rt6_info *rt)
 144{
 145	if (!list_empty(&rt->rt6i_uncached)) {
 146		struct uncached_list *ul = rt->rt6i_uncached_list;
 147		struct net *net = dev_net(rt->dst.dev);
 148
 149		spin_lock_bh(&ul->lock);
 150		list_del(&rt->rt6i_uncached);
 151		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
 152		spin_unlock_bh(&ul->lock);
 153	}
 154}
 155
 156static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 157{
 158	struct net_device *loopback_dev = net->loopback_dev;
 159	int cpu;
 160
 161	if (dev == loopback_dev)
 162		return;
 163
 164	for_each_possible_cpu(cpu) {
 165		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
 166		struct rt6_info *rt;
 
 
 
 167
 168		spin_lock_bh(&ul->lock);
 169		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
 170			struct inet6_dev *rt_idev = rt->rt6i_idev;
 171			struct net_device *rt_dev = rt->dst.dev;
 
 172
 173			if (rt_idev->dev == dev) {
 174				rt->rt6i_idev = in6_dev_get(loopback_dev);
 175				in6_dev_put(rt_idev);
 
 176			}
 177
 178			if (rt_dev == dev) {
 179				rt->dst.dev = blackhole_netdev;
 180				dev_hold(rt->dst.dev);
 181				dev_put(rt_dev);
 
 
 182			}
 
 
 183		}
 184		spin_unlock_bh(&ul->lock);
 185	}
 186}
 187
 188static inline const void *choose_neigh_daddr(const struct in6_addr *p,
 189					     struct sk_buff *skb,
 190					     const void *daddr)
 191{
 192	if (!ipv6_addr_any(p))
 193		return (const void *) p;
 194	else if (skb)
 195		return &ipv6_hdr(skb)->daddr;
 196	return daddr;
 197}
 198
 199struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
 200				   struct net_device *dev,
 201				   struct sk_buff *skb,
 202				   const void *daddr)
 203{
 204	struct neighbour *n;
 205
 206	daddr = choose_neigh_daddr(gw, skb, daddr);
 207	n = __ipv6_neigh_lookup(dev, daddr);
 208	if (n)
 209		return n;
 210
 211	n = neigh_create(&nd_tbl, daddr, dev);
 212	return IS_ERR(n) ? NULL : n;
 213}
 214
 215static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
 216					      struct sk_buff *skb,
 217					      const void *daddr)
 218{
 219	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
 220
 221	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
 222				dst->dev, skb, daddr);
 223}
 224
 225static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 226{
 
 227	struct net_device *dev = dst->dev;
 228	struct rt6_info *rt = (struct rt6_info *)dst;
 229
 230	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
 231	if (!daddr)
 232		return;
 233	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
 234		return;
 235	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
 236		return;
 237	__ipv6_confirm_neigh(dev, daddr);
 238}
 239
/*
 * dst_ops for regular IPv6 routes.  The name and the use of
 * net->ipv6.ip6_dst_ops elsewhere in this file suggest it is the
 * template each namespace's dst_ops is copied from - TODO confirm in
 * ip6_route_net_init().
 */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};
 258
 259static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
 260{
 261	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 262
 263	return mtu ? : dst->dev->mtu;
 264}
 265
/* dst_ops->update_pmtu callback for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}
 270
/* dst_ops->redirect callback for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}
 275
/*
 * dst_ops for blackhole dsts: PMTU updates and redirects are no-ops,
 * and the MTU comes from ip6_blackhole_mtu().
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};
 287
/* Metrics for the special entries: all zero, hop limit spelled out explicitly. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
 291
/*
 * Template for the FIB "null" entry: an unreachable reject route
 * without a nexthop, carrying the largest possible metric and the
 * default dst metrics.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= REFCOUNT_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};
 300
/*
 * Template for the dst "null" entry: packets are handed to the
 * ip6_pkt_discard handlers and the dst reports -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};
 312
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template for the "prohibit" entry: packets are handed to the
 * ip6_pkt_prohibit handlers and the dst reports -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/*
 * Template for the "blackhole" entry: packets are handed to the
 * generic dst_discard handlers and the dst reports -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
 340
 341static void rt6_info_init(struct rt6_info *rt)
 342{
 343	struct dst_entry *dst = &rt->dst;
 344
 345	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
 346	INIT_LIST_HEAD(&rt->rt6i_uncached);
 347}
 348
 349/* allocate dst with ip6_dst_ops */
 350struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
 351			       int flags)
 352{
 353	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
 354					1, DST_OBSOLETE_FORCE_CHK, flags);
 355
 356	if (rt) {
 357		rt6_info_init(rt);
 358		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
 359	}
 360
 361	return rt;
 362}
 363EXPORT_SYMBOL(ip6_dst_alloc);
 364
 365static void ip6_dst_destroy(struct dst_entry *dst)
 366{
 367	struct rt6_info *rt = (struct rt6_info *)dst;
 368	struct fib6_info *from;
 369	struct inet6_dev *idev;
 370
 371	ip_dst_metrics_put(dst);
 372	rt6_uncached_list_del(rt);
 373
 374	idev = rt->rt6i_idev;
 375	if (idev) {
 376		rt->rt6i_idev = NULL;
 377		in6_dev_put(idev);
 378	}
 379
 380	from = xchg((__force struct fib6_info **)&rt->from, NULL);
 381	fib6_info_release(from);
 382}
 383
 384static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 385			   int how)
 386{
 387	struct rt6_info *rt = (struct rt6_info *)dst;
 388	struct inet6_dev *idev = rt->rt6i_idev;
 389	struct net_device *loopback_dev =
 390		dev_net(dev)->loopback_dev;
 391
 392	if (idev && idev->dev != loopback_dev) {
 393		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
 394		if (loopback_idev) {
 395			rt->rt6i_idev = loopback_idev;
 
 396			in6_dev_put(idev);
 397		}
 398	}
 
 
 399}
 400
 401static bool __rt6_check_expired(const struct rt6_info *rt)
 402{
 403	if (rt->rt6i_flags & RTF_EXPIRES)
 404		return time_after(jiffies, rt->dst.expires);
 405	else
 406		return false;
 407}
 408
 409static bool rt6_check_expired(const struct rt6_info *rt)
 410{
 411	struct fib6_info *from;
 412
 413	from = rcu_dereference(rt->from);
 414
 415	if (rt->rt6i_flags & RTF_EXPIRES) {
 416		if (time_after(jiffies, rt->dst.expires))
 417			return true;
 418	} else if (from) {
 419		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
 420			fib6_check_expired(from);
 421	}
 422	return false;
 423}
 424
 425void fib6_select_path(const struct net *net, struct fib6_result *res,
 426		      struct flowi6 *fl6, int oif, bool have_oif_match,
 427		      const struct sk_buff *skb, int strict)
 428{
 429	struct fib6_info *sibling, *next_sibling;
 430	struct fib6_info *match = res->f6i;
 
 431
 432	if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
 433		goto out;
 434
 
 
 
 
 
 
 435	/* We might have already computed the hash for ICMPv6 errors. In such
 436	 * case it will always be non-zero. Otherwise now is the time to do it.
 437	 */
 438	if (!fl6->mp_hash &&
 439	    (!match->nh || nexthop_is_multipath(match->nh)))
 440		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 441
 442	if (unlikely(match->nh)) {
 443		nexthop_path_fib6_result(res, fl6->mp_hash);
 444		return;
 445	}
 446
 447	if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
 448		goto out;
 449
 450	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
 451				 fib6_siblings) {
 452		const struct fib6_nh *nh = sibling->fib6_nh;
 453		int nh_upper_bound;
 454
 455		nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
 456		if (fl6->mp_hash > nh_upper_bound)
 457			continue;
 458		if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
 459			break;
 460		match = sibling;
 461		break;
 462	}
 463
 464out:
 465	res->f6i = match;
 466	res->nh = match->fib6_nh;
 467}
 468
 469/*
 470 *	Route lookup. rcu_read_lock() should be held.
 471 */
 472
 473static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
 474			       const struct in6_addr *saddr, int oif, int flags)
 475{
 476	const struct net_device *dev;
 477
 478	if (nh->fib_nh_flags & RTNH_F_DEAD)
 479		return false;
 480
 481	dev = nh->fib_nh_dev;
 482	if (oif) {
 483		if (dev->ifindex == oif)
 484			return true;
 485	} else {
 486		if (ipv6_chk_addr(net, saddr, dev,
 487				  flags & RT6_LOOKUP_F_IFACE))
 488			return true;
 489	}
 490
 491	return false;
 492}
 493
/*
 * Argument bundle passed through nexthop_for_each_fib6_nh() to
 * __rt6_nh_dev_match(): the lookup parameters plus, in @nh, the last
 * fib6_nh the callback was invoked on.
 */
struct fib6_nh_dm_arg {
	struct net		*net;
	const struct in6_addr	*saddr;
	int			oif;
	int			flags;
	struct fib6_nh		*nh;
};
 501
 502static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
 503{
 504	struct fib6_nh_dm_arg *arg = _arg;
 505
 506	arg->nh = nh;
 507	return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
 508				  arg->flags);
 509}
 510
 511/* returns fib6_nh from nexthop or NULL */
 512static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
 513					struct fib6_result *res,
 514					const struct in6_addr *saddr,
 515					int oif, int flags)
 516{
 517	struct fib6_nh_dm_arg arg = {
 518		.net   = net,
 519		.saddr = saddr,
 520		.oif   = oif,
 521		.flags = flags,
 522	};
 523
 524	if (nexthop_is_blackhole(nh))
 525		return NULL;
 526
 527	if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
 528		return arg.nh;
 529
 530	return NULL;
 531}
 532
 533static void rt6_device_match(struct net *net, struct fib6_result *res,
 534			     const struct in6_addr *saddr, int oif, int flags)
 535{
 536	struct fib6_info *f6i = res->f6i;
 537	struct fib6_info *spf6i;
 538	struct fib6_nh *nh;
 539
 540	if (!oif && ipv6_addr_any(saddr)) {
 541		if (unlikely(f6i->nh)) {
 542			nh = nexthop_fib6_nh(f6i->nh);
 543			if (nexthop_is_blackhole(f6i->nh))
 544				goto out_blackhole;
 545		} else {
 546			nh = f6i->fib6_nh;
 547		}
 548		if (!(nh->fib_nh_flags & RTNH_F_DEAD))
 549			goto out;
 550	}
 551
 552	for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
 553		bool matched = false;
 554
 555		if (unlikely(spf6i->nh)) {
 556			nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
 557					      oif, flags);
 558			if (nh)
 559				matched = true;
 560		} else {
 561			nh = spf6i->fib6_nh;
 562			if (__rt6_device_match(net, nh, saddr, oif, flags))
 563				matched = true;
 564		}
 565		if (matched) {
 566			res->f6i = spf6i;
 567			goto out;
 568		}
 569	}
 570
 571	if (oif && flags & RT6_LOOKUP_F_IFACE) {
 572		res->f6i = net->ipv6.fib6_null_entry;
 573		nh = res->f6i->fib6_nh;
 574		goto out;
 575	}
 576
 577	if (unlikely(f6i->nh)) {
 578		nh = nexthop_fib6_nh(f6i->nh);
 579		if (nexthop_is_blackhole(f6i->nh))
 580			goto out_blackhole;
 581	} else {
 582		nh = f6i->fib6_nh;
 583	}
 584
 585	if (nh->fib_nh_flags & RTNH_F_DEAD) {
 586		res->f6i = net->ipv6.fib6_null_entry;
 587		nh = res->f6i->fib6_nh;
 588	}
 589out:
 590	res->nh = nh;
 591	res->fib6_type = res->f6i->fib6_type;
 592	res->fib6_flags = res->f6i->fib6_flags;
 593	return;
 594
 595out_blackhole:
 596	res->fib6_flags |= RTF_REJECT;
 597	res->fib6_type = RTN_BLACKHOLE;
 598	res->nh = nh;
 599}
 600
 601#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred router probe: one neighbour solicitation for @target on @dev. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;		/* held until the work has run */
};
 608static void rt6_probe_deferred(struct work_struct *w)
 609{
 610	struct in6_addr mcaddr;
 611	struct __rt6_probe_work *work =
 612		container_of(w, struct __rt6_probe_work, work);
 613
 614	addrconf_addr_solict_mult(&work->target, &mcaddr);
 615	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
 616	dev_put(work->dev);
 617	kfree(work);
 618}
 619
 620static void rt6_probe(struct fib6_nh *fib6_nh)
 621{
 622	struct __rt6_probe_work *work = NULL;
 623	const struct in6_addr *nh_gw;
 624	unsigned long last_probe;
 625	struct neighbour *neigh;
 626	struct net_device *dev;
 627	struct inet6_dev *idev;
 628
 629	/*
 630	 * Okay, this does not seem to be appropriate
 631	 * for now, however, we need to check if it
 632	 * is really so; aka Router Reachability Probing.
 633	 *
 634	 * Router Reachability Probe MUST be rate-limited
 635	 * to no more than one per minute.
 636	 */
 637	if (!fib6_nh->fib_nh_gw_family)
 638		return;
 639
 640	nh_gw = &fib6_nh->fib_nh_gw6;
 641	dev = fib6_nh->fib_nh_dev;
 642	rcu_read_lock_bh();
 643	last_probe = READ_ONCE(fib6_nh->last_probe);
 644	idev = __in6_dev_get(dev);
 
 
 645	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
 646	if (neigh) {
 647		if (neigh->nud_state & NUD_VALID)
 648			goto out;
 649
 650		write_lock(&neigh->lock);
 651		if (!(neigh->nud_state & NUD_VALID) &&
 652		    time_after(jiffies,
 653			       neigh->updated + idev->cnf.rtr_probe_interval)) {
 
 654			work = kmalloc(sizeof(*work), GFP_ATOMIC);
 655			if (work)
 656				__neigh_set_probe_once(neigh);
 657		}
 658		write_unlock(&neigh->lock);
 659	} else if (time_after(jiffies, last_probe +
 660				       idev->cnf.rtr_probe_interval)) {
 661		work = kmalloc(sizeof(*work), GFP_ATOMIC);
 662	}
 663
 664	if (!work || cmpxchg(&fib6_nh->last_probe,
 665			     last_probe, jiffies) != last_probe) {
 666		kfree(work);
 667	} else {
 668		INIT_WORK(&work->work, rt6_probe_deferred);
 669		work->target = *nh_gw;
 670		dev_hold(dev);
 671		work->dev = dev;
 672		schedule_work(&work->work);
 673	}
 674
 675out:
 676	rcu_read_unlock_bh();
 677}
 678#else
 679static inline void rt6_probe(struct fib6_nh *fib6_nh)
 680{
 681}
 682#endif
 683
 684/*
 685 * Default Router Selection (RFC 2461 6.3.6)
 686 */
 687static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
 688{
 689	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 690	struct neighbour *neigh;
 691
 692	rcu_read_lock_bh();
 693	neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
 694					  &fib6_nh->fib_nh_gw6);
 695	if (neigh) {
 696		read_lock(&neigh->lock);
 697		if (neigh->nud_state & NUD_VALID)
 
 698			ret = RT6_NUD_SUCCEED;
 699#ifdef CONFIG_IPV6_ROUTER_PREF
 700		else if (!(neigh->nud_state & NUD_FAILED))
 701			ret = RT6_NUD_SUCCEED;
 702		else
 703			ret = RT6_NUD_FAIL_PROBE;
 704#endif
 705		read_unlock(&neigh->lock);
 706	} else {
 707		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
 708		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
 709	}
 710	rcu_read_unlock_bh();
 711
 712	return ret;
 713}
 714
 715static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
 716			   int strict)
 717{
 718	int m = 0;
 719
 720	if (!oif || nh->fib_nh_dev->ifindex == oif)
 721		m = 2;
 722
 723	if (!m && (strict & RT6_LOOKUP_F_IFACE))
 724		return RT6_NUD_FAIL_HARD;
 725#ifdef CONFIG_IPV6_ROUTER_PREF
 726	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
 727#endif
 728	if ((strict & RT6_LOOKUP_F_REACHABLE) &&
 729	    !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
 730		int n = rt6_check_neigh(nh);
 731		if (n < 0)
 732			return n;
 733	}
 734	return m;
 735}
 736
 737static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
 738		       int oif, int strict, int *mpri, bool *do_rr)
 739{
 740	bool match_do_rr = false;
 741	bool rc = false;
 742	int m;
 743
 744	if (nh->fib_nh_flags & RTNH_F_DEAD)
 745		goto out;
 746
 747	if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
 748	    nh->fib_nh_flags & RTNH_F_LINKDOWN &&
 749	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 750		goto out;
 751
 752	m = rt6_score_route(nh, fib6_flags, oif, strict);
 753	if (m == RT6_NUD_FAIL_DO_RR) {
 754		match_do_rr = true;
 755		m = 0; /* lowest valid score */
 756	} else if (m == RT6_NUD_FAIL_HARD) {
 757		goto out;
 758	}
 759
 760	if (strict & RT6_LOOKUP_F_REACHABLE)
 761		rt6_probe(nh);
 762
 763	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
 764	if (m > *mpri) {
 765		*do_rr = match_do_rr;
 766		*mpri = m;
 767		rc = true;
 768	}
 769out:
 770	return rc;
 771}
 772
/*
 * Argument bundle passed through nexthop_for_each_fib6_nh() to
 * rt6_nh_find_match(): the find_match() parameters plus, in @nh, the
 * last fib6_nh the callback was invoked on.
 */
struct fib6_nh_frl_arg {
	u32		flags;
	int		oif;
	int		strict;
	int		*mpri;
	bool		*do_rr;
	struct fib6_nh	*nh;
};
 781
 782static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
 783{
 784	struct fib6_nh_frl_arg *arg = _arg;
 785
 786	arg->nh = nh;
 787	return find_match(nh, arg->flags, arg->oif, arg->strict,
 788			  arg->mpri, arg->do_rr);
 789}
 790
 791static void __find_rr_leaf(struct fib6_info *f6i_start,
 792			   struct fib6_info *nomatch, u32 metric,
 793			   struct fib6_result *res, struct fib6_info **cont,
 794			   int oif, int strict, bool *do_rr, int *mpri)
 795{
 796	struct fib6_info *f6i;
 797
 798	for (f6i = f6i_start;
 799	     f6i && f6i != nomatch;
 800	     f6i = rcu_dereference(f6i->fib6_next)) {
 801		bool matched = false;
 802		struct fib6_nh *nh;
 803
 804		if (cont && f6i->fib6_metric != metric) {
 805			*cont = f6i;
 806			return;
 807		}
 808
 809		if (fib6_check_expired(f6i))
 810			continue;
 811
 812		if (unlikely(f6i->nh)) {
 813			struct fib6_nh_frl_arg arg = {
 814				.flags  = f6i->fib6_flags,
 815				.oif    = oif,
 816				.strict = strict,
 817				.mpri   = mpri,
 818				.do_rr  = do_rr
 819			};
 820
 821			if (nexthop_is_blackhole(f6i->nh)) {
 822				res->fib6_flags = RTF_REJECT;
 823				res->fib6_type = RTN_BLACKHOLE;
 824				res->f6i = f6i;
 825				res->nh = nexthop_fib6_nh(f6i->nh);
 826				return;
 827			}
 828			if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
 829						     &arg)) {
 830				matched = true;
 831				nh = arg.nh;
 832			}
 833		} else {
 834			nh = f6i->fib6_nh;
 835			if (find_match(nh, f6i->fib6_flags, oif, strict,
 836				       mpri, do_rr))
 837				matched = true;
 838		}
 839		if (matched) {
 840			res->f6i = f6i;
 841			res->nh = nh;
 842			res->fib6_flags = f6i->fib6_flags;
 843			res->fib6_type = f6i->fib6_type;
 844		}
 845	}
 846}
 847
 848static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
 849			 struct fib6_info *rr_head, int oif, int strict,
 850			 bool *do_rr, struct fib6_result *res)
 851{
 852	u32 metric = rr_head->fib6_metric;
 853	struct fib6_info *cont = NULL;
 854	int mpri = -1;
 855
 856	__find_rr_leaf(rr_head, NULL, metric, res, &cont,
 857		       oif, strict, do_rr, &mpri);
 858
 859	__find_rr_leaf(leaf, rr_head, metric, res, &cont,
 860		       oif, strict, do_rr, &mpri);
 861
 862	if (res->f6i || !cont)
 863		return;
 864
 865	__find_rr_leaf(cont, NULL, metric, res, NULL,
 866		       oif, strict, do_rr, &mpri);
 867}
 868
 869static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
 870		       struct fib6_result *res, int strict)
 871{
 872	struct fib6_info *leaf = rcu_dereference(fn->leaf);
 873	struct fib6_info *rt0;
 874	bool do_rr = false;
 875	int key_plen;
 876
 877	/* make sure this function or its helpers sets f6i */
 878	res->f6i = NULL;
 879
 880	if (!leaf || leaf == net->ipv6.fib6_null_entry)
 881		goto out;
 882
 883	rt0 = rcu_dereference(fn->rr_ptr);
 884	if (!rt0)
 885		rt0 = leaf;
 886
 887	/* Double check to make sure fn is not an intermediate node
 888	 * and fn->leaf does not points to its child's leaf
 889	 * (This might happen if all routes under fn are deleted from
 890	 * the tree and fib6_repair_tree() is called on the node.)
 891	 */
 892	key_plen = rt0->fib6_dst.plen;
 893#ifdef CONFIG_IPV6_SUBTREES
 894	if (rt0->fib6_src.plen)
 895		key_plen = rt0->fib6_src.plen;
 896#endif
 897	if (fn->fn_bit != key_plen)
 898		goto out;
 899
 900	find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
 901	if (do_rr) {
 902		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
 903
 904		/* no entries matched; do round-robin */
 905		if (!next || next->fib6_metric != rt0->fib6_metric)
 906			next = leaf;
 907
 908		if (next != rt0) {
 909			spin_lock_bh(&leaf->fib6_table->tb6_lock);
 910			/* make sure next is not being deleted from the tree */
 911			if (next->fib6_node)
 912				rcu_assign_pointer(fn->rr_ptr, next);
 913			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
 914		}
 915	}
 916
 917out:
 918	if (!res->f6i) {
 919		res->f6i = net->ipv6.fib6_null_entry;
 920		res->nh = res->f6i->fib6_nh;
 921		res->fib6_flags = res->f6i->fib6_flags;
 922		res->fib6_type = res->f6i->fib6_type;
 923	}
 924}
 925
 926static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
 927{
 928	return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
 929	       res->nh->fib_nh_gw_family;
 930}
 931
 932#ifdef CONFIG_IPV6_ROUTE_INFO
 933int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 934		  const struct in6_addr *gwaddr)
 935{
 936	struct net *net = dev_net(dev);
 937	struct route_info *rinfo = (struct route_info *) opt;
 938	struct in6_addr prefix_buf, *prefix;
 
 939	unsigned int pref;
 940	unsigned long lifetime;
 941	struct fib6_info *rt;
 942
 943	if (len < sizeof(struct route_info)) {
 944		return -EINVAL;
 945	}
 946
 947	/* Sanity check for prefix_len and length */
 948	if (rinfo->length > 3) {
 949		return -EINVAL;
 950	} else if (rinfo->prefix_len > 128) {
 951		return -EINVAL;
 952	} else if (rinfo->prefix_len > 64) {
 953		if (rinfo->length < 2) {
 954			return -EINVAL;
 955		}
 956	} else if (rinfo->prefix_len > 0) {
 957		if (rinfo->length < 1) {
 958			return -EINVAL;
 959		}
 960	}
 961
 962	pref = rinfo->route_pref;
 963	if (pref == ICMPV6_ROUTER_PREF_INVALID)
 964		return -EINVAL;
 965
 966	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 967
 968	if (rinfo->length == 3)
 969		prefix = (struct in6_addr *)rinfo->prefix;
 970	else {
 971		/* this function is safe */
 972		ipv6_addr_prefix(&prefix_buf,
 973				 (struct in6_addr *)rinfo->prefix,
 974				 rinfo->prefix_len);
 975		prefix = &prefix_buf;
 976	}
 977
 978	if (rinfo->prefix_len == 0)
 979		rt = rt6_get_dflt_router(net, gwaddr, dev);
 980	else
 981		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
 982					gwaddr, dev);
 983
 984	if (rt && !lifetime) {
 985		ip6_del_rt(net, rt);
 986		rt = NULL;
 987	}
 988
 989	if (!rt && lifetime)
 990		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 991					dev, pref);
 992	else if (rt)
 993		rt->fib6_flags = RTF_ROUTEINFO |
 994				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 995
 996	if (rt) {
 997		if (!addrconf_finite_timeout(lifetime))
 
 
 
 998			fib6_clean_expires(rt);
 999		else
 
1000			fib6_set_expires(rt, jiffies + HZ * lifetime);
 
 
 
 
1001
1002		fib6_info_release(rt);
1003	}
1004	return 0;
1005}
1006#endif
1007
1008/*
1009 *	Misc support functions
1010 */
1011
1012/* called with rcu_lock held */
1013static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
1014{
1015	struct net_device *dev = res->nh->fib_nh_dev;
1016
1017	if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
1018		/* for copies of local routes, dst->dev needs to be the
1019		 * device if it is a master device, the master device if
1020		 * device is enslaved, and the loopback as the default
1021		 */
1022		if (netif_is_l3_slave(dev) &&
1023		    !rt6_need_strict(&res->f6i->fib6_dst.addr))
1024			dev = l3mdev_master_dev_rcu(dev);
1025		else if (!netif_is_l3_master(dev))
1026			dev = dev_net(dev)->loopback_dev;
1027		/* last case is netif_is_l3_master(dev) is true in which
1028		 * case we want dev returned to be dev
1029		 */
1030	}
1031
1032	return dev;
1033}
1034
1035static const int fib6_prop[RTN_MAX + 1] = {
1036	[RTN_UNSPEC]	= 0,
1037	[RTN_UNICAST]	= 0,
1038	[RTN_LOCAL]	= 0,
1039	[RTN_BROADCAST]	= 0,
1040	[RTN_ANYCAST]	= 0,
1041	[RTN_MULTICAST]	= 0,
1042	[RTN_BLACKHOLE]	= -EINVAL,
1043	[RTN_UNREACHABLE] = -EHOSTUNREACH,
1044	[RTN_PROHIBIT]	= -EACCES,
1045	[RTN_THROW]	= -EAGAIN,
1046	[RTN_NAT]	= -EINVAL,
1047	[RTN_XRESOLVE]	= -EINVAL,
1048};
1049
1050static int ip6_rt_type_to_error(u8 fib6_type)
1051{
1052	return fib6_prop[fib6_type];
1053}
1054
1055static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
1056{
1057	unsigned short flags = 0;
1058
1059	if (rt->dst_nocount)
1060		flags |= DST_NOCOUNT;
1061	if (rt->dst_nopolicy)
1062		flags |= DST_NOPOLICY;
1063	if (rt->dst_host)
1064		flags |= DST_HOST;
1065
1066	return flags;
1067}
1068
1069static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
1070{
1071	rt->dst.error = ip6_rt_type_to_error(fib6_type);
1072
1073	switch (fib6_type) {
1074	case RTN_BLACKHOLE:
1075		rt->dst.output = dst_discard_out;
1076		rt->dst.input = dst_discard;
1077		break;
1078	case RTN_PROHIBIT:
1079		rt->dst.output = ip6_pkt_prohibit_out;
1080		rt->dst.input = ip6_pkt_prohibit;
1081		break;
1082	case RTN_THROW:
1083	case RTN_UNREACHABLE:
1084	default:
1085		rt->dst.output = ip6_pkt_discard_out;
1086		rt->dst.input = ip6_pkt_discard;
1087		break;
1088	}
1089}
1090
1091static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
1092{
1093	struct fib6_info *f6i = res->f6i;
1094
1095	if (res->fib6_flags & RTF_REJECT) {
1096		ip6_rt_init_dst_reject(rt, res->fib6_type);
1097		return;
1098	}
1099
1100	rt->dst.error = 0;
1101	rt->dst.output = ip6_output;
1102
1103	if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
1104		rt->dst.input = ip6_input;
1105	} else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
1106		rt->dst.input = ip6_mc_input;
1107	} else {
1108		rt->dst.input = ip6_forward;
1109	}
1110
1111	if (res->nh->fib_nh_lws) {
1112		rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
1113		lwtunnel_set_redirect(&rt->dst);
1114	}
1115
1116	rt->dst.lastuse = jiffies;
1117}
1118
1119/* Caller must already hold reference to @from */
1120static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
1121{
1122	rt->rt6i_flags &= ~RTF_EXPIRES;
1123	rcu_assign_pointer(rt->from, from);
1124	ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
1125}
1126
1127/* Caller must already hold reference to f6i in result */
1128static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
1129{
1130	const struct fib6_nh *nh = res->nh;
1131	const struct net_device *dev = nh->fib_nh_dev;
1132	struct fib6_info *f6i = res->f6i;
1133
1134	ip6_rt_init_dst(rt, res);
1135
1136	rt->rt6i_dst = f6i->fib6_dst;
1137	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1138	rt->rt6i_flags = res->fib6_flags;
1139	if (nh->fib_nh_gw_family) {
1140		rt->rt6i_gateway = nh->fib_nh_gw6;
1141		rt->rt6i_flags |= RTF_GATEWAY;
1142	}
1143	rt6_set_from(rt, f6i);
1144#ifdef CONFIG_IPV6_SUBTREES
1145	rt->rt6i_src = f6i->fib6_src;
1146#endif
1147}
1148
1149static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1150					struct in6_addr *saddr)
1151{
1152	struct fib6_node *pn, *sn;
1153	while (1) {
1154		if (fn->fn_flags & RTN_TL_ROOT)
1155			return NULL;
1156		pn = rcu_dereference(fn->parent);
1157		sn = FIB6_SUBTREE(pn);
1158		if (sn && sn != fn)
1159			fn = fib6_node_lookup(sn, NULL, saddr);
1160		else
1161			fn = pn;
1162		if (fn->fn_flags & RTN_RTINFO)
1163			return fn;
1164	}
1165}
1166
1167static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
1168{
1169	struct rt6_info *rt = *prt;
1170
1171	if (dst_hold_safe(&rt->dst))
1172		return true;
1173	if (net) {
1174		rt = net->ipv6.ip6_null_entry;
1175		dst_hold(&rt->dst);
1176	} else {
1177		rt = NULL;
1178	}
1179	*prt = rt;
1180	return false;
1181}
1182
1183/* called with rcu_lock held */
1184static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
1185{
1186	struct net_device *dev = res->nh->fib_nh_dev;
1187	struct fib6_info *f6i = res->f6i;
1188	unsigned short flags;
1189	struct rt6_info *nrt;
1190
1191	if (!fib6_info_hold_safe(f6i))
1192		goto fallback;
1193
1194	flags = fib6_info_dst_flags(f6i);
1195	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1196	if (!nrt) {
1197		fib6_info_release(f6i);
1198		goto fallback;
1199	}
1200
1201	ip6_rt_copy_init(nrt, res);
1202	return nrt;
1203
1204fallback:
1205	nrt = dev_net(dev)->ipv6.ip6_null_entry;
1206	dst_hold(&nrt->dst);
1207	return nrt;
1208}
1209
1210static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1211					     struct fib6_table *table,
1212					     struct flowi6 *fl6,
1213					     const struct sk_buff *skb,
1214					     int flags)
1215{
1216	struct fib6_result res = {};
1217	struct fib6_node *fn;
1218	struct rt6_info *rt;
1219
1220	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1221		flags &= ~RT6_LOOKUP_F_IFACE;
1222
1223	rcu_read_lock();
1224	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1225restart:
1226	res.f6i = rcu_dereference(fn->leaf);
1227	if (!res.f6i)
1228		res.f6i = net->ipv6.fib6_null_entry;
1229	else
1230		rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
1231				 flags);
1232
1233	if (res.f6i == net->ipv6.fib6_null_entry) {
1234		fn = fib6_backtrack(fn, &fl6->saddr);
1235		if (fn)
1236			goto restart;
1237
1238		rt = net->ipv6.ip6_null_entry;
1239		dst_hold(&rt->dst);
1240		goto out;
1241	} else if (res.fib6_flags & RTF_REJECT) {
1242		goto do_create;
1243	}
1244
1245	fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
1246			 fl6->flowi6_oif != 0, skb, flags);
1247
1248	/* Search through exception table */
1249	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
1250	if (rt) {
1251		if (ip6_hold_safe(net, &rt))
1252			dst_use_noref(&rt->dst, jiffies);
1253	} else {
1254do_create:
1255		rt = ip6_create_rt_rcu(&res);
1256	}
1257
1258out:
1259	trace_fib6_table_lookup(net, &res, table, fl6);
1260
1261	rcu_read_unlock();
1262
1263	return rt;
1264}
1265
1266struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1267				   const struct sk_buff *skb, int flags)
1268{
1269	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1270}
1271EXPORT_SYMBOL_GPL(ip6_route_lookup);
1272
1273struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1274			    const struct in6_addr *saddr, int oif,
1275			    const struct sk_buff *skb, int strict)
1276{
1277	struct flowi6 fl6 = {
1278		.flowi6_oif = oif,
1279		.daddr = *daddr,
1280	};
1281	struct dst_entry *dst;
1282	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1283
1284	if (saddr) {
1285		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1286		flags |= RT6_LOOKUP_F_HAS_SADDR;
1287	}
1288
1289	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1290	if (dst->error == 0)
1291		return (struct rt6_info *) dst;
1292
1293	dst_release(dst);
1294
1295	return NULL;
1296}
1297EXPORT_SYMBOL(rt6_lookup);
1298
1299/* ip6_ins_rt is called with FREE table->tb6_lock.
1300 * It takes new route entry, the addition fails by any reason the
1301 * route is released.
1302 * Caller must hold dst before calling it.
1303 */
1304
1305static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1306			struct netlink_ext_ack *extack)
1307{
1308	int err;
1309	struct fib6_table *table;
1310
1311	table = rt->fib6_table;
1312	spin_lock_bh(&table->tb6_lock);
1313	err = fib6_add(&table->tb6_root, rt, info, extack);
1314	spin_unlock_bh(&table->tb6_lock);
1315
1316	return err;
1317}
1318
1319int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1320{
1321	struct nl_info info = {	.nl_net = net, };
1322
1323	return __ip6_ins_rt(rt, &info, NULL);
1324}
1325
1326static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
1327					   const struct in6_addr *daddr,
1328					   const struct in6_addr *saddr)
1329{
1330	struct fib6_info *f6i = res->f6i;
1331	struct net_device *dev;
1332	struct rt6_info *rt;
1333
1334	/*
1335	 *	Clone the route.
1336	 */
1337
1338	if (!fib6_info_hold_safe(f6i))
1339		return NULL;
1340
1341	dev = ip6_rt_get_dev_rcu(res);
1342	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1343	if (!rt) {
1344		fib6_info_release(f6i);
1345		return NULL;
1346	}
1347
1348	ip6_rt_copy_init(rt, res);
1349	rt->rt6i_flags |= RTF_CACHE;
1350	rt->dst.flags |= DST_HOST;
1351	rt->rt6i_dst.addr = *daddr;
1352	rt->rt6i_dst.plen = 128;
1353
1354	if (!rt6_is_gw_or_nonexthop(res)) {
1355		if (f6i->fib6_dst.plen != 128 &&
1356		    ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
1357			rt->rt6i_flags |= RTF_ANYCAST;
1358#ifdef CONFIG_IPV6_SUBTREES
1359		if (rt->rt6i_src.plen && saddr) {
1360			rt->rt6i_src.addr = *saddr;
1361			rt->rt6i_src.plen = 128;
1362		}
1363#endif
1364	}
1365
1366	return rt;
1367}
1368
1369static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
1370{
1371	struct fib6_info *f6i = res->f6i;
1372	unsigned short flags = fib6_info_dst_flags(f6i);
1373	struct net_device *dev;
1374	struct rt6_info *pcpu_rt;
1375
1376	if (!fib6_info_hold_safe(f6i))
1377		return NULL;
1378
1379	rcu_read_lock();
1380	dev = ip6_rt_get_dev_rcu(res);
1381	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1382	rcu_read_unlock();
1383	if (!pcpu_rt) {
1384		fib6_info_release(f6i);
1385		return NULL;
1386	}
1387	ip6_rt_copy_init(pcpu_rt, res);
1388	pcpu_rt->rt6i_flags |= RTF_PCPU;
 
 
 
 
1389	return pcpu_rt;
1390}
1391
 
 
 
 
 
1392/* It should be called with rcu_read_lock() acquired */
1393static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
1394{
1395	struct rt6_info *pcpu_rt;
1396
1397	pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
1398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1399	return pcpu_rt;
1400}
1401
1402static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1403					    const struct fib6_result *res)
1404{
1405	struct rt6_info *pcpu_rt, *prev, **p;
1406
1407	pcpu_rt = ip6_rt_pcpu_alloc(res);
1408	if (!pcpu_rt)
1409		return NULL;
1410
1411	p = this_cpu_ptr(res->nh->rt6i_pcpu);
1412	prev = cmpxchg(p, NULL, pcpu_rt);
1413	BUG_ON(prev);
1414
1415	if (res->f6i->fib6_destroying) {
1416		struct fib6_info *from;
1417
1418		from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1419		fib6_info_release(from);
1420	}
1421
1422	return pcpu_rt;
1423}
1424
1425/* exception hash table implementation
1426 */
1427static DEFINE_SPINLOCK(rt6_exception_lock);
1428
1429/* Remove rt6_ex from hash table and free the memory
1430 * Caller must hold rt6_exception_lock
1431 */
1432static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1433				 struct rt6_exception *rt6_ex)
1434{
1435	struct fib6_info *from;
1436	struct net *net;
1437
1438	if (!bucket || !rt6_ex)
1439		return;
1440
1441	net = dev_net(rt6_ex->rt6i->dst.dev);
1442	net->ipv6.rt6_stats->fib_rt_cache--;
1443
1444	/* purge completely the exception to allow releasing the held resources:
1445	 * some [sk] cache may keep the dst around for unlimited time
1446	 */
1447	from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1448	fib6_info_release(from);
1449	dst_dev_put(&rt6_ex->rt6i->dst);
1450
1451	hlist_del_rcu(&rt6_ex->hlist);
1452	dst_release(&rt6_ex->rt6i->dst);
1453	kfree_rcu(rt6_ex, rcu);
1454	WARN_ON_ONCE(!bucket->depth);
1455	bucket->depth--;
1456}
1457
1458/* Remove oldest rt6_ex in bucket and free the memory
1459 * Caller must hold rt6_exception_lock
1460 */
1461static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1462{
1463	struct rt6_exception *rt6_ex, *oldest = NULL;
1464
1465	if (!bucket)
1466		return;
1467
1468	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1469		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1470			oldest = rt6_ex;
1471	}
1472	rt6_remove_exception(bucket, oldest);
1473}
1474
1475static u32 rt6_exception_hash(const struct in6_addr *dst,
1476			      const struct in6_addr *src)
1477{
1478	static u32 seed __read_mostly;
1479	u32 val;
 
 
 
 
 
 
1480
1481	net_get_random_once(&seed, sizeof(seed));
1482	val = jhash(dst, sizeof(*dst), seed);
1483
1484#ifdef CONFIG_IPV6_SUBTREES
1485	if (src)
1486		val = jhash(src, sizeof(*src), val);
1487#endif
1488	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
 
 
1489}
1490
1491/* Helper function to find the cached rt in the hash table
1492 * and update bucket pointer to point to the bucket for this
1493 * (daddr, saddr) pair
1494 * Caller must hold rt6_exception_lock
1495 */
1496static struct rt6_exception *
1497__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1498			      const struct in6_addr *daddr,
1499			      const struct in6_addr *saddr)
1500{
1501	struct rt6_exception *rt6_ex;
1502	u32 hval;
1503
1504	if (!(*bucket) || !daddr)
1505		return NULL;
1506
1507	hval = rt6_exception_hash(daddr, saddr);
1508	*bucket += hval;
1509
1510	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1511		struct rt6_info *rt6 = rt6_ex->rt6i;
1512		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1513
1514#ifdef CONFIG_IPV6_SUBTREES
1515		if (matched && saddr)
1516			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1517#endif
1518		if (matched)
1519			return rt6_ex;
1520	}
1521	return NULL;
1522}
1523
1524/* Helper function to find the cached rt in the hash table
1525 * and update bucket pointer to point to the bucket for this
1526 * (daddr, saddr) pair
1527 * Caller must hold rcu_read_lock()
1528 */
1529static struct rt6_exception *
1530__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1531			 const struct in6_addr *daddr,
1532			 const struct in6_addr *saddr)
1533{
1534	struct rt6_exception *rt6_ex;
1535	u32 hval;
1536
1537	WARN_ON_ONCE(!rcu_read_lock_held());
1538
1539	if (!(*bucket) || !daddr)
1540		return NULL;
1541
1542	hval = rt6_exception_hash(daddr, saddr);
1543	*bucket += hval;
1544
1545	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1546		struct rt6_info *rt6 = rt6_ex->rt6i;
1547		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1548
1549#ifdef CONFIG_IPV6_SUBTREES
1550		if (matched && saddr)
1551			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1552#endif
1553		if (matched)
1554			return rt6_ex;
1555	}
1556	return NULL;
1557}
1558
1559static unsigned int fib6_mtu(const struct fib6_result *res)
1560{
1561	const struct fib6_nh *nh = res->nh;
1562	unsigned int mtu;
1563
1564	if (res->f6i->fib6_pmtu) {
1565		mtu = res->f6i->fib6_pmtu;
1566	} else {
1567		struct net_device *dev = nh->fib_nh_dev;
1568		struct inet6_dev *idev;
1569
1570		rcu_read_lock();
1571		idev = __in6_dev_get(dev);
1572		mtu = idev->cnf.mtu6;
1573		rcu_read_unlock();
1574	}
1575
1576	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1577
1578	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
1579}
1580
1581#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL
1582
1583/* used when the flushed bit is not relevant, only access to the bucket
1584 * (ie., all bucket users except rt6_insert_exception);
1585 *
1586 * called under rcu lock; sometimes called with rt6_exception_lock held
1587 */
1588static
1589struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
1590						       spinlock_t *lock)
1591{
1592	struct rt6_exception_bucket *bucket;
1593
1594	if (lock)
1595		bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1596						   lockdep_is_held(lock));
1597	else
1598		bucket = rcu_dereference(nh->rt6i_exception_bucket);
1599
1600	/* remove bucket flushed bit if set */
1601	if (bucket) {
1602		unsigned long p = (unsigned long)bucket;
1603
1604		p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
1605		bucket = (struct rt6_exception_bucket *)p;
1606	}
1607
1608	return bucket;
1609}
1610
1611static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
1612{
1613	unsigned long p = (unsigned long)bucket;
1614
1615	return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
1616}
1617
1618/* called with rt6_exception_lock held */
1619static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
1620					      spinlock_t *lock)
1621{
1622	struct rt6_exception_bucket *bucket;
1623	unsigned long p;
1624
1625	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1626					   lockdep_is_held(lock));
1627
1628	p = (unsigned long)bucket;
1629	p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
1630	bucket = (struct rt6_exception_bucket *)p;
1631	rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1632}
1633
1634static int rt6_insert_exception(struct rt6_info *nrt,
1635				const struct fib6_result *res)
1636{
1637	struct net *net = dev_net(nrt->dst.dev);
1638	struct rt6_exception_bucket *bucket;
1639	struct fib6_info *f6i = res->f6i;
1640	struct in6_addr *src_key = NULL;
1641	struct rt6_exception *rt6_ex;
1642	struct fib6_nh *nh = res->nh;
 
1643	int err = 0;
1644
1645	spin_lock_bh(&rt6_exception_lock);
1646
1647	bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
1648					  lockdep_is_held(&rt6_exception_lock));
1649	if (!bucket) {
1650		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1651				 GFP_ATOMIC);
1652		if (!bucket) {
1653			err = -ENOMEM;
1654			goto out;
1655		}
1656		rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
1657	} else if (fib6_nh_excptn_bucket_flushed(bucket)) {
1658		err = -EINVAL;
1659		goto out;
1660	}
1661
1662#ifdef CONFIG_IPV6_SUBTREES
1663	/* fib6_src.plen != 0 indicates f6i is in subtree
1664	 * and exception table is indexed by a hash of
1665	 * both fib6_dst and fib6_src.
1666	 * Otherwise, the exception table is indexed by
1667	 * a hash of only fib6_dst.
1668	 */
1669	if (f6i->fib6_src.plen)
1670		src_key = &nrt->rt6i_src.addr;
1671#endif
1672	/* rt6_mtu_change() might lower mtu on f6i.
1673	 * Only insert this exception route if its mtu
1674	 * is less than f6i's mtu value.
1675	 */
1676	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
1677		err = -EINVAL;
1678		goto out;
1679	}
1680
1681	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1682					       src_key);
1683	if (rt6_ex)
1684		rt6_remove_exception(bucket, rt6_ex);
1685
1686	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1687	if (!rt6_ex) {
1688		err = -ENOMEM;
1689		goto out;
1690	}
1691	rt6_ex->rt6i = nrt;
1692	rt6_ex->stamp = jiffies;
1693	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1694	bucket->depth++;
1695	net->ipv6.rt6_stats->fib_rt_cache++;
1696
1697	if (bucket->depth > FIB6_MAX_DEPTH)
 
 
1698		rt6_exception_remove_oldest(bucket);
1699
1700out:
1701	spin_unlock_bh(&rt6_exception_lock);
1702
1703	/* Update fn->fn_sernum to invalidate all cached dst */
1704	if (!err) {
1705		spin_lock_bh(&f6i->fib6_table->tb6_lock);
1706		fib6_update_sernum(net, f6i);
1707		spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1708		fib6_force_start_gc(net);
1709	}
1710
1711	return err;
1712}
1713
1714static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
1715{
1716	struct rt6_exception_bucket *bucket;
1717	struct rt6_exception *rt6_ex;
1718	struct hlist_node *tmp;
1719	int i;
1720
1721	spin_lock_bh(&rt6_exception_lock);
1722
1723	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1724	if (!bucket)
1725		goto out;
1726
1727	/* Prevent rt6_insert_exception() to recreate the bucket list */
1728	if (!from)
1729		fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
1730
1731	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1732		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
1733			if (!from ||
1734			    rcu_access_pointer(rt6_ex->rt6i->from) == from)
1735				rt6_remove_exception(bucket, rt6_ex);
1736		}
1737		WARN_ON_ONCE(!from && bucket->depth);
1738		bucket++;
1739	}
1740out:
1741	spin_unlock_bh(&rt6_exception_lock);
1742}
1743
/* nexthop_for_each_fib6_nh() callback: flush @nh's exceptions belonging
 * to the fib entry passed in @arg.  Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
	struct fib6_info *owner = arg;

	fib6_nh_flush_exceptions(nh, owner);
	return 0;
}
1752
1753void rt6_flush_exceptions(struct fib6_info *f6i)
1754{
1755	if (f6i->nh)
1756		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions,
1757					 f6i);
1758	else
1759		fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
1760}
1761
1762/* Find cached rt in the hash table inside passed in rt
1763 * Caller has to hold rcu_read_lock()
1764 */
1765static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
1766					   const struct in6_addr *daddr,
1767					   const struct in6_addr *saddr)
1768{
1769	const struct in6_addr *src_key = NULL;
1770	struct rt6_exception_bucket *bucket;
1771	struct rt6_exception *rt6_ex;
1772	struct rt6_info *ret = NULL;
1773
1774#ifdef CONFIG_IPV6_SUBTREES
1775	/* fib6i_src.plen != 0 indicates f6i is in subtree
1776	 * and exception table is indexed by a hash of
1777	 * both fib6_dst and fib6_src.
1778	 * However, the src addr used to create the hash
1779	 * might not be exactly the passed in saddr which
1780	 * is a /128 addr from the flow.
1781	 * So we need to use f6i->fib6_src to redo lookup
1782	 * if the passed in saddr does not find anything.
1783	 * (See the logic in ip6_rt_cache_alloc() on how
1784	 * rt->rt6i_src is updated.)
1785	 */
1786	if (res->f6i->fib6_src.plen)
1787		src_key = saddr;
1788find_ex:
1789#endif
1790	bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
1791	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1792
1793	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1794		ret = rt6_ex->rt6i;
1795
1796#ifdef CONFIG_IPV6_SUBTREES
1797	/* Use fib6_src as src_key and redo lookup */
1798	if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
1799		src_key = &res->f6i->fib6_src.addr;
1800		goto find_ex;
1801	}
1802#endif
1803
1804	return ret;
1805}
1806
1807/* Remove the passed in cached rt from the hash table that contains it */
1808static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
1809				    const struct rt6_info *rt)
1810{
1811	const struct in6_addr *src_key = NULL;
1812	struct rt6_exception_bucket *bucket;
1813	struct rt6_exception *rt6_ex;
1814	int err;
1815
1816	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
1817		return -ENOENT;
1818
1819	spin_lock_bh(&rt6_exception_lock);
1820	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1821
1822#ifdef CONFIG_IPV6_SUBTREES
1823	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1824	 * and exception table is indexed by a hash of
1825	 * both rt6i_dst and rt6i_src.
1826	 * Otherwise, the exception table is indexed by
1827	 * a hash of only rt6i_dst.
1828	 */
1829	if (plen)
1830		src_key = &rt->rt6i_src.addr;
1831#endif
1832	rt6_ex = __rt6_find_exception_spinlock(&bucket,
1833					       &rt->rt6i_dst.addr,
1834					       src_key);
1835	if (rt6_ex) {
1836		rt6_remove_exception(bucket, rt6_ex);
1837		err = 0;
1838	} else {
1839		err = -ENOENT;
1840	}
1841
1842	spin_unlock_bh(&rt6_exception_lock);
1843	return err;
1844}
1845
/* Argument bundle handed to rt6_nh_remove_exception_rt() through
 * nexthop_for_each_fib6_nh().
 */
struct fib6_nh_excptn_arg {
	struct rt6_info	*rt;	/* cached route to remove */
	int		plen;	/* source prefix length of the fib entry */
};
1850
1851static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
1852{
1853	struct fib6_nh_excptn_arg *arg = _arg;
1854	int err;
1855
1856	err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
1857	if (err == 0)
1858		return 1;
1859
1860	return 0;
1861}
1862
1863static int rt6_remove_exception_rt(struct rt6_info *rt)
1864{
1865	struct fib6_info *from;
1866
1867	from = rcu_dereference(rt->from);
1868	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1869		return -EINVAL;
1870
1871	if (from->nh) {
1872		struct fib6_nh_excptn_arg arg = {
1873			.rt = rt,
1874			.plen = from->fib6_src.plen
1875		};
1876		int rc;
1877
1878		/* rc = 1 means an entry was found */
1879		rc = nexthop_for_each_fib6_nh(from->nh,
1880					      rt6_nh_remove_exception_rt,
1881					      &arg);
1882		return rc ? 0 : -ENOENT;
1883	}
1884
1885	return fib6_nh_remove_exception(from->fib6_nh,
1886					from->fib6_src.plen, rt);
1887}
1888
1889/* Find rt6_ex which contains the passed in rt cache and
1890 * refresh its stamp
1891 */
1892static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
1893				     const struct rt6_info *rt)
1894{
1895	const struct in6_addr *src_key = NULL;
1896	struct rt6_exception_bucket *bucket;
1897	struct rt6_exception *rt6_ex;
1898
1899	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
1900#ifdef CONFIG_IPV6_SUBTREES
1901	/* rt6i_src.plen != 0 indicates 'from' is in subtree
1902	 * and exception table is indexed by a hash of
1903	 * both rt6i_dst and rt6i_src.
1904	 * Otherwise, the exception table is indexed by
1905	 * a hash of only rt6i_dst.
1906	 */
1907	if (plen)
1908		src_key = &rt->rt6i_src.addr;
1909#endif
1910	rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
1911	if (rt6_ex)
1912		rt6_ex->stamp = jiffies;
1913}
1914
/* Search key and result slot for fib6_nh_find_match(). */
struct fib6_nh_match_arg {
	const struct net_device *dev;	/* device to match */
	const struct in6_addr	*gw;	/* gateway to match, NULL for none */
	struct fib6_nh		*match;	/* set to the matching nexthop */
};
1920
1921/* determine if fib6_nh has given device and gateway */
1922static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
1923{
1924	struct fib6_nh_match_arg *arg = _arg;
1925
1926	if (arg->dev != nh->fib_nh_dev ||
1927	    (arg->gw && !nh->fib_nh_gw_family) ||
1928	    (!arg->gw && nh->fib_nh_gw_family) ||
1929	    (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
1930		return 0;
1931
1932	arg->match = nh;
1933
1934	/* found a match, break the loop */
1935	return 1;
1936}
1937
1938static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1939{
1940	struct fib6_info *from;
1941	struct fib6_nh *fib6_nh;
1942
1943	rcu_read_lock();
1944
1945	from = rcu_dereference(rt->from);
1946	if (!from || !(rt->rt6i_flags & RTF_CACHE))
1947		goto unlock;
1948
1949	if (from->nh) {
1950		struct fib6_nh_match_arg arg = {
1951			.dev = rt->dst.dev,
1952			.gw = &rt->rt6i_gateway,
1953		};
1954
1955		nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
1956
1957		if (!arg.match)
1958			goto unlock;
1959		fib6_nh = arg.match;
1960	} else {
1961		fib6_nh = from->fib6_nh;
1962	}
1963	fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
1964unlock:
1965	rcu_read_unlock();
1966}
1967
1968static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1969					 struct rt6_info *rt, int mtu)
1970{
1971	/* If the new MTU is lower than the route PMTU, this new MTU will be the
1972	 * lowest MTU in the path: always allow updating the route PMTU to
1973	 * reflect PMTU decreases.
1974	 *
1975	 * If the new MTU is higher, and the route PMTU is equal to the local
1976	 * MTU, this means the old MTU is the lowest in the path, so allow
1977	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1978	 * handle this.
1979	 */
1980
1981	if (dst_mtu(&rt->dst) >= mtu)
1982		return true;
1983
1984	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1985		return true;
1986
1987	return false;
1988}
1989
1990static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1991				       const struct fib6_nh *nh, int mtu)
1992{
1993	struct rt6_exception_bucket *bucket;
1994	struct rt6_exception *rt6_ex;
1995	int i;
1996
1997	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
1998	if (!bucket)
1999		return;
2000
2001	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2002		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
2003			struct rt6_info *entry = rt6_ex->rt6i;
2004
2005			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
2006			 * route), the metrics of its rt->from have already
2007			 * been updated.
2008			 */
2009			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
2010			    rt6_mtu_change_route_allowed(idev, entry, mtu))
2011				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
2012		}
2013		bucket++;
2014	}
2015}
2016
2017#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)
2018
2019static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
2020					    const struct in6_addr *gateway)
2021{
2022	struct rt6_exception_bucket *bucket;
2023	struct rt6_exception *rt6_ex;
2024	struct hlist_node *tmp;
2025	int i;
2026
2027	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2028		return;
2029
2030	spin_lock_bh(&rt6_exception_lock);
2031	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2032	if (bucket) {
2033		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2034			hlist_for_each_entry_safe(rt6_ex, tmp,
2035						  &bucket->chain, hlist) {
2036				struct rt6_info *entry = rt6_ex->rt6i;
2037
2038				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
2039				    RTF_CACHE_GATEWAY &&
2040				    ipv6_addr_equal(gateway,
2041						    &entry->rt6i_gateway)) {
2042					rt6_remove_exception(bucket, rt6_ex);
2043				}
2044			}
2045			bucket++;
2046		}
2047	}
2048
2049	spin_unlock_bh(&rt6_exception_lock);
2050}
2051
2052static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
2053				      struct rt6_exception *rt6_ex,
2054				      struct fib6_gc_args *gc_args,
2055				      unsigned long now)
2056{
2057	struct rt6_info *rt = rt6_ex->rt6i;
2058
2059	/* we are pruning and obsoleting aged-out and non gateway exceptions
2060	 * even if others have still references to them, so that on next
2061	 * dst_check() such references can be dropped.
2062	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
2063	 * expired, independently from their aging, as per RFC 8201 section 4
2064	 */
2065	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
2066		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
2067			RT6_TRACE("aging clone %p\n", rt);
2068			rt6_remove_exception(bucket, rt6_ex);
2069			return;
2070		}
2071	} else if (time_after(jiffies, rt->dst.expires)) {
2072		RT6_TRACE("purging expired route %p\n", rt);
2073		rt6_remove_exception(bucket, rt6_ex);
2074		return;
2075	}
2076
2077	if (rt->rt6i_flags & RTF_GATEWAY) {
2078		struct neighbour *neigh;
2079		__u8 neigh_flags = 0;
2080
2081		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
2082		if (neigh)
2083			neigh_flags = neigh->flags;
2084
2085		if (!(neigh_flags & NTF_ROUTER)) {
2086			RT6_TRACE("purging route %p via non-router but gateway\n",
2087				  rt);
2088			rt6_remove_exception(bucket, rt6_ex);
2089			return;
2090		}
2091	}
2092
2093	gc_args->more++;
2094}
2095
2096static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
2097				   struct fib6_gc_args *gc_args,
2098				   unsigned long now)
2099{
2100	struct rt6_exception_bucket *bucket;
2101	struct rt6_exception *rt6_ex;
2102	struct hlist_node *tmp;
2103	int i;
2104
2105	if (!rcu_access_pointer(nh->rt6i_exception_bucket))
2106		return;
2107
2108	rcu_read_lock_bh();
2109	spin_lock(&rt6_exception_lock);
2110	bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
2111	if (bucket) {
2112		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
2113			hlist_for_each_entry_safe(rt6_ex, tmp,
2114						  &bucket->chain, hlist) {
2115				rt6_age_examine_exception(bucket, rt6_ex,
2116							  gc_args, now);
2117			}
2118			bucket++;
2119		}
2120	}
2121	spin_unlock(&rt6_exception_lock);
2122	rcu_read_unlock_bh();
2123}
2124
/* Argument bundle handed to rt6_nh_age_exceptions() through the void *
 * parameter of nexthop_for_each_fib6_nh().
 */
struct fib6_nh_age_excptn_arg {
	struct fib6_gc_args	*gc_args;	/* forwarded to fib6_nh_age_exceptions() */
	unsigned long		now;		/* timestamp forwarded likewise */
};
2129
2130static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
2131{
2132	struct fib6_nh_age_excptn_arg *arg = _arg;
2133
2134	fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
2135	return 0;
2136}
2137
2138void rt6_age_exceptions(struct fib6_info *f6i,
2139			struct fib6_gc_args *gc_args,
2140			unsigned long now)
2141{
2142	if (f6i->nh) {
2143		struct fib6_nh_age_excptn_arg arg = {
2144			.gc_args = gc_args,
2145			.now = now
2146		};
2147
2148		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
2149					 &arg);
2150	} else {
2151		fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
2152	}
2153}
2154
2155/* must be called with rcu lock held */
2156int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
2157		      struct flowi6 *fl6, struct fib6_result *res, int strict)
2158{
2159	struct fib6_node *fn, *saved_fn;
2160
2161	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2162	saved_fn = fn;
2163
2164	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2165		oif = 0;
2166
2167redo_rt6_select:
2168	rt6_select(net, fn, oif, res, strict);
2169	if (res->f6i == net->ipv6.fib6_null_entry) {
2170		fn = fib6_backtrack(fn, &fl6->saddr);
2171		if (fn)
2172			goto redo_rt6_select;
2173		else if (strict & RT6_LOOKUP_F_REACHABLE) {
2174			/* also consider unreachable route */
2175			strict &= ~RT6_LOOKUP_F_REACHABLE;
2176			fn = saved_fn;
2177			goto redo_rt6_select;
2178		}
2179	}
2180
2181	trace_fib6_table_lookup(net, res, table, fl6);
2182
2183	return 0;
2184}
2185
/*
 * ip6_pol_route - resolve @fl6 in @table to a struct rt6_info
 *
 * Performs the FIB lookup, picks a path, and then returns, in order of
 * preference: a matching cached exception, an uncached RTF_CACHE clone
 * (FLOWI_FLAG_KNOWN_NH on a nexthop without gateway), or the per-cpu copy
 * of the route.  If nothing is found or an allocation fails, the
 * namespace's ip6_null_entry is returned instead.
 *
 * Unless RT6_LOOKUP_F_DST_NOREF is set in @flags a reference is taken via
 * ip6_hold_safe() before returning.  The uncached-clone path is the
 * exception: it returns early with the reference obtained from
 * ip6_rt_cache_alloc() regardless of that flag.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_result res = {};
	struct rt6_info *rt = NULL;
	int strict = 0;

	/* A no-reference result is only usable inside an RCU read section */
	WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
		     !rcu_read_lock_held());

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* With forwarding disabled, ask for reachable routers first */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fib6_table_lookup(net, table, oif, fl6, &res, strict);
	if (res.f6i == net->ipv6.fib6_null_entry)
		goto out;

	fib6_select_path(net, &res, fl6, oif, false, skb, strict);

	/* Search through exception table */
	rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
	if (rt) {
		goto out;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !res.nh->fib_nh_gw_family)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

		if (rt) {
			/* 1 refcnt is taken during ip6_rt_cache_alloc().
			 * As rt6_uncached_list_add() does not consume refcnt,
			 * this refcnt is always returned to the caller even
			 * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
			 */
			rt6_uncached_list_add(rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
			rcu_read_unlock();

			return rt;
		}
		/* allocation failed: fall through to "out" with rt == NULL */
	} else {
		/* Get a percpu copy */
		local_bh_disable();
		rt = rt6_get_pcpu_route(&res);

		if (!rt)
			rt = rt6_make_pcpu_route(net, &res);

		local_bh_enable();
	}
out:
	/* No usable route: hand back the namespace's null entry */
	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	if (!(flags & RT6_LOOKUP_F_DST_NOREF))
		ip6_hold_safe(net, &rt);
	rcu_read_unlock();

	return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
2255
2256static struct rt6_info *ip6_pol_route_input(struct net *net,
2257					    struct fib6_table *table,
2258					    struct flowi6 *fl6,
2259					    const struct sk_buff *skb,
2260					    int flags)
2261{
2262	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
2263}
2264
2265struct dst_entry *ip6_route_input_lookup(struct net *net,
2266					 struct net_device *dev,
2267					 struct flowi6 *fl6,
2268					 const struct sk_buff *skb,
2269					 int flags)
2270{
2271	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
2272		flags |= RT6_LOOKUP_F_IFACE;
2273
2274	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
2275}
2276EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
2277
2278static void ip6_multipath_l3_keys(const struct sk_buff *skb,
2279				  struct flow_keys *keys,
2280				  struct flow_keys *flkeys)
2281{
2282	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
2283	const struct ipv6hdr *key_iph = outer_iph;
2284	struct flow_keys *_flkeys = flkeys;
2285	const struct ipv6hdr *inner_iph;
2286	const struct icmp6hdr *icmph;
2287	struct ipv6hdr _inner_iph;
2288	struct icmp6hdr _icmph;
2289
2290	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2291		goto out;
2292
2293	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2294				   sizeof(_icmph), &_icmph);
2295	if (!icmph)
2296		goto out;
2297
2298	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2299	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2300	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2301	    icmph->icmp6_type != ICMPV6_PARAMPROB)
2302		goto out;
2303
2304	inner_iph = skb_header_pointer(skb,
2305				       skb_transport_offset(skb) + sizeof(*icmph),
2306				       sizeof(_inner_iph), &_inner_iph);
2307	if (!inner_iph)
2308		goto out;
2309
2310	key_iph = inner_iph;
2311	_flkeys = NULL;
2312out:
2313	if (_flkeys) {
2314		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2315		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2316		keys->tags.flow_label = _flkeys->tags.flow_label;
2317		keys->basic.ip_proto = _flkeys->basic.ip_proto;
2318	} else {
2319		keys->addrs.v6addrs.src = key_iph->saddr;
2320		keys->addrs.v6addrs.dst = key_iph->daddr;
2321		keys->tags.flow_label = ip6_flowlabel(key_iph);
2322		keys->basic.ip_proto = key_iph->nexthdr;
2323	}
2324}
2325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2326/* if skb is set it will be used and fl6 can be NULL */
2327u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2328		       const struct sk_buff *skb, struct flow_keys *flkeys)
2329{
2330	struct flow_keys hash_keys;
2331	u32 mhash;
2332
2333	switch (ip6_multipath_hash_policy(net)) {
2334	case 0:
2335		memset(&hash_keys, 0, sizeof(hash_keys));
2336		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2337		if (skb) {
2338			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2339		} else {
2340			hash_keys.addrs.v6addrs.src = fl6->saddr;
2341			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2342			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2343			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2344		}
 
2345		break;
2346	case 1:
2347		if (skb) {
2348			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2349			struct flow_keys keys;
2350
2351			/* short-circuit if we already have L4 hash present */
2352			if (skb->l4_hash)
2353				return skb_get_hash_raw(skb) >> 1;
2354
2355			memset(&hash_keys, 0, sizeof(hash_keys));
2356
2357                        if (!flkeys) {
2358				skb_flow_dissect_flow_keys(skb, &keys, flag);
2359				flkeys = &keys;
2360			}
2361			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2362			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2363			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2364			hash_keys.ports.src = flkeys->ports.src;
2365			hash_keys.ports.dst = flkeys->ports.dst;
2366			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2367		} else {
2368			memset(&hash_keys, 0, sizeof(hash_keys));
2369			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2370			hash_keys.addrs.v6addrs.src = fl6->saddr;
2371			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2372			hash_keys.ports.src = fl6->fl6_sport;
2373			hash_keys.ports.dst = fl6->fl6_dport;
2374			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2375		}
 
2376		break;
2377	case 2:
2378		memset(&hash_keys, 0, sizeof(hash_keys));
2379		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2380		if (skb) {
2381			struct flow_keys keys;
2382
2383			if (!flkeys) {
2384				skb_flow_dissect_flow_keys(skb, &keys, 0);
2385				flkeys = &keys;
2386			}
2387
2388			/* Inner can be v4 or v6 */
2389			if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2390				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2391				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2392				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2393			} else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2394				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2395				hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2396				hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2397				hash_keys.tags.flow_label = flkeys->tags.flow_label;
2398				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2399			} else {
2400				/* Same as case 0 */
2401				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2402				ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2403			}
2404		} else {
2405			/* Same as case 0 */
2406			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2407			hash_keys.addrs.v6addrs.src = fl6->saddr;
2408			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2409			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2410			hash_keys.basic.ip_proto = fl6->flowi6_proto;
2411		}
 
 
 
 
 
 
 
2412		break;
2413	}
2414	mhash = flow_hash_from_keys(&hash_keys);
2415
2416	return mhash >> 1;
2417}
2418
2419/* Called with rcu held */
2420void ip6_route_input(struct sk_buff *skb)
2421{
2422	const struct ipv6hdr *iph = ipv6_hdr(skb);
2423	struct net *net = dev_net(skb->dev);
2424	int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
2425	struct ip_tunnel_info *tun_info;
2426	struct flowi6 fl6 = {
2427		.flowi6_iif = skb->dev->ifindex,
2428		.daddr = iph->daddr,
2429		.saddr = iph->saddr,
2430		.flowlabel = ip6_flowinfo(iph),
2431		.flowi6_mark = skb->mark,
2432		.flowi6_proto = iph->nexthdr,
2433	};
2434	struct flow_keys *flkeys = NULL, _flkeys;
2435
2436	tun_info = skb_tunnel_info(skb);
2437	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2438		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2439
2440	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2441		flkeys = &_flkeys;
2442
2443	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2444		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2445	skb_dst_drop(skb);
2446	skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
2447						      &fl6, skb, flags));
2448}
2449
2450static struct rt6_info *ip6_pol_route_output(struct net *net,
2451					     struct fib6_table *table,
2452					     struct flowi6 *fl6,
2453					     const struct sk_buff *skb,
2454					     int flags)
2455{
2456	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2457}
2458
2459struct dst_entry *ip6_route_output_flags_noref(struct net *net,
2460					       const struct sock *sk,
2461					       struct flowi6 *fl6, int flags)
 
2462{
2463	bool any_src;
2464
2465	if (ipv6_addr_type(&fl6->daddr) &
2466	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
2467		struct dst_entry *dst;
2468
2469		/* This function does not take refcnt on the dst */
2470		dst = l3mdev_link_scope_lookup(net, fl6);
2471		if (dst)
2472			return dst;
2473	}
2474
2475	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2476
2477	flags |= RT6_LOOKUP_F_DST_NOREF;
2478	any_src = ipv6_addr_any(&fl6->saddr);
2479	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2480	    (fl6->flowi6_oif && any_src))
2481		flags |= RT6_LOOKUP_F_IFACE;
2482
2483	if (!any_src)
2484		flags |= RT6_LOOKUP_F_HAS_SADDR;
2485	else if (sk)
2486		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2487
2488	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2489}
2490EXPORT_SYMBOL_GPL(ip6_route_output_flags_noref);
2491
2492struct dst_entry *ip6_route_output_flags(struct net *net,
2493					 const struct sock *sk,
2494					 struct flowi6 *fl6,
2495					 int flags)
2496{
2497        struct dst_entry *dst;
2498        struct rt6_info *rt6;
2499
2500        rcu_read_lock();
2501        dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
2502        rt6 = (struct rt6_info *)dst;
2503        /* For dst cached in uncached_list, refcnt is already taken. */
2504        if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
2505                dst = &net->ipv6.ip6_null_entry->dst;
2506                dst_hold(dst);
2507        }
2508        rcu_read_unlock();
2509
2510        return dst;
2511}
2512EXPORT_SYMBOL_GPL(ip6_route_output_flags);
2513
2514struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2515{
2516	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2517	struct net_device *loopback_dev = net->loopback_dev;
2518	struct dst_entry *new = NULL;
2519
2520	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2521		       DST_OBSOLETE_DEAD, 0);
2522	if (rt) {
2523		rt6_info_init(rt);
2524		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2525
2526		new = &rt->dst;
2527		new->__use = 1;
2528		new->input = dst_discard;
2529		new->output = dst_discard_out;
2530
2531		dst_copy_metrics(new, &ort->dst);
2532
2533		rt->rt6i_idev = in6_dev_get(loopback_dev);
2534		rt->rt6i_gateway = ort->rt6i_gateway;
2535		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2536
2537		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2538#ifdef CONFIG_IPV6_SUBTREES
2539		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2540#endif
2541	}
2542
2543	dst_release(dst_orig);
2544	return new ? new : ERR_PTR(-ENOMEM);
2545}
2546
2547/*
2548 *	Destination cache support functions
2549 */
2550
2551static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2552{
2553	u32 rt_cookie = 0;
2554
2555	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2556		return false;
2557
2558	if (fib6_check_expired(f6i))
2559		return false;
2560
2561	return true;
2562}
2563
2564static struct dst_entry *rt6_check(struct rt6_info *rt,
2565				   struct fib6_info *from,
2566				   u32 cookie)
2567{
2568	u32 rt_cookie = 0;
2569
2570	if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
2571	    rt_cookie != cookie)
2572		return NULL;
2573
2574	if (rt6_check_expired(rt))
2575		return NULL;
2576
2577	return &rt->dst;
2578}
2579
2580static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2581					    struct fib6_info *from,
2582					    u32 cookie)
2583{
2584	if (!__rt6_check_expired(rt) &&
2585	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2586	    fib6_check(from, cookie))
2587		return &rt->dst;
2588	else
2589		return NULL;
2590}
2591
2592static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 
2593{
2594	struct dst_entry *dst_ret;
2595	struct fib6_info *from;
2596	struct rt6_info *rt;
2597
2598	rt = container_of(dst, struct rt6_info, dst);
 
 
 
2599
2600	rcu_read_lock();
2601
2602	/* All IPV6 dsts are created with ->obsolete set to the value
2603	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2604	 * into this function always.
2605	 */
2606
2607	from = rcu_dereference(rt->from);
2608
2609	if (from && (rt->rt6i_flags & RTF_PCPU ||
2610	    unlikely(!list_empty(&rt->rt6i_uncached))))
2611		dst_ret = rt6_dst_from_check(rt, from, cookie);
2612	else
2613		dst_ret = rt6_check(rt, from, cookie);
2614
2615	rcu_read_unlock();
2616
2617	return dst_ret;
2618}
 
2619
2620static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 
2621{
2622	struct rt6_info *rt = (struct rt6_info *) dst;
2623
2624	if (rt) {
2625		if (rt->rt6i_flags & RTF_CACHE) {
2626			rcu_read_lock();
2627			if (rt6_check_expired(rt)) {
2628				rt6_remove_exception_rt(rt);
2629				dst = NULL;
2630			}
2631			rcu_read_unlock();
2632		} else {
2633			dst_release(dst);
2634			dst = NULL;
2635		}
 
 
2636	}
2637	return dst;
2638}
2639
2640static void ip6_link_failure(struct sk_buff *skb)
2641{
2642	struct rt6_info *rt;
2643
2644	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2645
2646	rt = (struct rt6_info *) skb_dst(skb);
2647	if (rt) {
2648		rcu_read_lock();
2649		if (rt->rt6i_flags & RTF_CACHE) {
2650			rt6_remove_exception_rt(rt);
2651		} else {
2652			struct fib6_info *from;
2653			struct fib6_node *fn;
2654
2655			from = rcu_dereference(rt->from);
2656			if (from) {
2657				fn = rcu_dereference(from->fib6_node);
2658				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2659					fn->fn_sernum = -1;
2660			}
2661		}
2662		rcu_read_unlock();
2663	}
2664}
2665
2666static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2667{
2668	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2669		struct fib6_info *from;
2670
2671		rcu_read_lock();
2672		from = rcu_dereference(rt0->from);
2673		if (from)
2674			rt0->dst.expires = from->expires;
2675		rcu_read_unlock();
2676	}
2677
2678	dst_set_expires(&rt0->dst, timeout);
2679	rt0->rt6i_flags |= RTF_EXPIRES;
2680}
2681
2682static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2683{
2684	struct net *net = dev_net(rt->dst.dev);
2685
2686	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2687	rt->rt6i_flags |= RTF_MODIFIED;
2688	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2689}
2690
2691static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2692{
2693	return !(rt->rt6i_flags & RTF_CACHE) &&
2694		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2695}
2696
/*
 * __ip6_rt_update_pmtu - apply a newly learned path MTU to a route
 * @dst: route the MTU applies to
 * @sk:  socket supplying the addresses when @iph is NULL; may be NULL
 * @iph: IPv6 header supplying the addresses; may be NULL
 * @mtu: the learned MTU
 *
 * Nothing happens when the route's MTU metric is locked or when @mtu
 * (raised to at least IPV6_MIN_MTU) is not smaller than the current
 * dst_mtu().  Otherwise the MTU is written either directly into @dst or
 * into a freshly allocated RTF_CACHE clone that is inserted into the
 * exception table, depending on rt6_cache_allowed_for_pmtu().
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	/* Addresses come from the header if given, else from the socket */
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	/* never go below the IPv6 minimum MTU */
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	/* only decreases are applied here */
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		/* update the route in place */
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		/* create an exception (RTF_CACHE clone) carrying the new MTU */
		struct fib6_result res = {};
		struct rt6_info *nrt6;

		rcu_read_lock();
		res.f6i = rcu_dereference(rt6->from);
		if (!res.f6i)
			goto out_unlock;

		res.fib6_flags = res.f6i->fib6_flags;
		res.fib6_type = res.f6i->fib6_type;

		if (res.f6i->nh) {
			/* nexthop object: find the fib6_nh matching this
			 * dst's device and gateway
			 */
			struct fib6_nh_match_arg arg = {
				.dev = dst->dev,
				.gw = &rt6->rt6i_gateway,
			};

			nexthop_for_each_fib6_nh(res.f6i->nh,
						 fib6_nh_find_match, &arg);

			/* fib6_info uses a nexthop that does not have fib6_nh
			 * using the dst->dev + gw. Should be impossible.
			 */
			if (!arg.match)
				goto out_unlock;

			res.nh = arg.match;
		} else {
			res.nh = res.f6i->fib6_nh;
		}

		nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* a non-zero return means the insert failed: free
			 * the clone right away
			 */
			if (rt6_insert_exception(nrt6, &res))
				dst_release_immediate(&nrt6->dst);
		}
out_unlock:
		rcu_read_unlock();
	}
}
2768
2769static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2770			       struct sk_buff *skb, u32 mtu)
 
2771{
2772	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
 
2773}
2774
2775void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2776		     int oif, u32 mark, kuid_t uid)
2777{
2778	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2779	struct dst_entry *dst;
2780	struct flowi6 fl6 = {
2781		.flowi6_oif = oif,
2782		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
2783		.daddr = iph->daddr,
2784		.saddr = iph->saddr,
2785		.flowlabel = ip6_flowinfo(iph),
2786		.flowi6_uid = uid,
2787	};
2788
2789	dst = ip6_route_output(net, NULL, &fl6);
2790	if (!dst->error)
2791		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2792	dst_release(dst);
2793}
2794EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2795
2796void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2797{
2798	int oif = sk->sk_bound_dev_if;
2799	struct dst_entry *dst;
2800
2801	if (!oif && skb->dev)
2802		oif = l3mdev_master_ifindex(skb->dev);
2803
2804	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
 
2805
2806	dst = __sk_dst_get(sk);
2807	if (!dst || !dst->obsolete ||
2808	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2809		return;
2810
2811	bh_lock_sock(sk);
2812	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2813		ip6_datagram_dst_update(sk, false);
2814	bh_unlock_sock(sk);
2815}
2816EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2817
2818void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2819			   const struct flowi6 *fl6)
2820{
2821#ifdef CONFIG_IPV6_SUBTREES
2822	struct ipv6_pinfo *np = inet6_sk(sk);
2823#endif
2824
2825	ip6_dst_store(sk, dst,
2826		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2827		      &sk->sk_v6_daddr : NULL,
2828#ifdef CONFIG_IPV6_SUBTREES
2829		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2830		      &np->saddr :
2831#endif
2832		      NULL);
2833}
2834
2835static bool ip6_redirect_nh_match(const struct fib6_result *res,
2836				  struct flowi6 *fl6,
2837				  const struct in6_addr *gw,
2838				  struct rt6_info **ret)
2839{
2840	const struct fib6_nh *nh = res->nh;
2841
2842	if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
2843	    fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
2844		return false;
2845
2846	/* rt_cache's gateway might be different from its 'parent'
2847	 * in the case of an ip redirect.
2848	 * So we keep searching in the exception table if the gateway
2849	 * is different.
2850	 */
2851	if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
2852		struct rt6_info *rt_cache;
2853
2854		rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
2855		if (rt_cache &&
2856		    ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
2857			*ret = rt_cache;
2858			return true;
2859		}
2860		return false;
2861	}
2862	return true;
2863}
2864
/* Argument bundle handed to fib6_nh_redirect_match() through the void *
 * parameter of nexthop_for_each_fib6_nh(); the members mirror the
 * parameters of ip6_redirect_nh_match().
 */
struct fib6_nh_rd_arg {
	struct fib6_result	*res;	/* res->nh is set to each visited nexthop */
	struct flowi6		*fl6;	/* flow the redirect refers to */
	const struct in6_addr	*gw;	/* gateway to match against */
	struct rt6_info		**ret;	/* receives a matching cached route */
};
2871
2872static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
2873{
2874	struct fib6_nh_rd_arg *arg = _arg;
2875
2876	arg->res->nh = nh;
2877	return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
2878}
2879
/* Handle redirects */

/* Flow key extended with the address of the router that sent the
 * redirect.  fl6 must remain the first member: __ip6_route_redirect()
 * receives a struct flowi6 pointer and casts it back to this type.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* the flow being redirected */
	struct in6_addr gateway;	/* router the redirect came from */
};
2885
2886static struct rt6_info *__ip6_route_redirect(struct net *net,
2887					     struct fib6_table *table,
2888					     struct flowi6 *fl6,
2889					     const struct sk_buff *skb,
2890					     int flags)
2891{
2892	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2893	struct rt6_info *ret = NULL;
2894	struct fib6_result res = {};
2895	struct fib6_nh_rd_arg arg = {
2896		.res = &res,
2897		.fl6 = fl6,
2898		.gw  = &rdfl->gateway,
2899		.ret = &ret
2900	};
2901	struct fib6_info *rt;
2902	struct fib6_node *fn;
2903
2904	/* l3mdev_update_flow overrides oif if the device is enslaved; in
2905	 * this case we must match on the real ingress device, so reset it
2906	 */
2907	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2908		fl6->flowi6_oif = skb->dev->ifindex;
2909
2910	/* Get the "current" route for this destination and
2911	 * check if the redirect has come from appropriate router.
2912	 *
2913	 * RFC 4861 specifies that redirects should only be
2914	 * accepted if they come from the nexthop to the target.
2915	 * Due to the way the routes are chosen, this notion
2916	 * is a bit fuzzy and one might need to check all possible
2917	 * routes.
2918	 */
2919
2920	rcu_read_lock();
2921	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2922restart:
2923	for_each_fib6_node_rt_rcu(fn) {
2924		res.f6i = rt;
2925		if (fib6_check_expired(rt))
2926			continue;
2927		if (rt->fib6_flags & RTF_REJECT)
2928			break;
2929		if (unlikely(rt->nh)) {
2930			if (nexthop_is_blackhole(rt->nh))
2931				continue;
2932			/* on match, res->nh is filled in and potentially ret */
2933			if (nexthop_for_each_fib6_nh(rt->nh,
2934						     fib6_nh_redirect_match,
2935						     &arg))
2936				goto out;
2937		} else {
2938			res.nh = rt->fib6_nh;
2939			if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
2940						  &ret))
2941				goto out;
2942		}
2943	}
2944
2945	if (!rt)
2946		rt = net->ipv6.fib6_null_entry;
2947	else if (rt->fib6_flags & RTF_REJECT) {
2948		ret = net->ipv6.ip6_null_entry;
2949		goto out;
2950	}
2951
2952	if (rt == net->ipv6.fib6_null_entry) {
2953		fn = fib6_backtrack(fn, &fl6->saddr);
2954		if (fn)
2955			goto restart;
2956	}
2957
2958	res.f6i = rt;
2959	res.nh = rt->fib6_nh;
2960out:
2961	if (ret) {
2962		ip6_hold_safe(net, &ret);
2963	} else {
2964		res.fib6_flags = res.f6i->fib6_flags;
2965		res.fib6_type = res.f6i->fib6_type;
2966		ret = ip6_create_rt_rcu(&res);
2967	}
2968
2969	rcu_read_unlock();
2970
2971	trace_fib6_table_lookup(net, &res, table, fl6);
2972	return ret;
2973};
2974
2975static struct dst_entry *ip6_route_redirect(struct net *net,
2976					    const struct flowi6 *fl6,
2977					    const struct sk_buff *skb,
2978					    const struct in6_addr *gateway)
2979{
2980	int flags = RT6_LOOKUP_F_HAS_SADDR;
2981	struct ip6rd_flowi rdfl;
2982
2983	rdfl.fl6 = *fl6;
2984	rdfl.gateway = *gateway;
2985
2986	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2987				flags, __ip6_route_redirect);
2988}
2989
2990void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2991		  kuid_t uid)
2992{
2993	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2994	struct dst_entry *dst;
2995	struct flowi6 fl6 = {
2996		.flowi6_iif = LOOPBACK_IFINDEX,
2997		.flowi6_oif = oif,
2998		.flowi6_mark = mark,
2999		.daddr = iph->daddr,
3000		.saddr = iph->saddr,
3001		.flowlabel = ip6_flowinfo(iph),
3002		.flowi6_uid = uid,
3003	};
3004
3005	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
3006	rt6_do_redirect(dst, NULL, skb);
3007	dst_release(dst);
3008}
3009EXPORT_SYMBOL_GPL(ip6_redirect);
3010
3011void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
3012{
3013	const struct ipv6hdr *iph = ipv6_hdr(skb);
3014	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
3015	struct dst_entry *dst;
3016	struct flowi6 fl6 = {
3017		.flowi6_iif = LOOPBACK_IFINDEX,
3018		.flowi6_oif = oif,
3019		.daddr = msg->dest,
3020		.saddr = iph->daddr,
3021		.flowi6_uid = sock_net_uid(net, NULL),
3022	};
3023
3024	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
3025	rt6_do_redirect(dst, NULL, skb);
3026	dst_release(dst);
3027}
3028
3029void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
3030{
3031	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
3032		     sk->sk_uid);
3033}
3034EXPORT_SYMBOL_GPL(ip6_sk_redirect);
3035
3036static unsigned int ip6_default_advmss(const struct dst_entry *dst)
3037{
3038	struct net_device *dev = dst->dev;
3039	unsigned int mtu = dst_mtu(dst);
3040	struct net *net = dev_net(dev);
3041
3042	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
3043
 
 
 
3044	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
3045		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
3046
 
 
3047	/*
3048	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
3049	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
3050	 * IPV6_MAXPLEN is also valid and means: "any MSS,
3051	 * rely only on pmtu discovery"
3052	 */
3053	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
3054		mtu = IPV6_MAXPLEN;
3055	return mtu;
3056}
3057
3058static unsigned int ip6_mtu(const struct dst_entry *dst)
3059{
3060	struct inet6_dev *idev;
3061	unsigned int mtu;
3062
3063	mtu = dst_metric_raw(dst, RTAX_MTU);
3064	if (mtu)
3065		goto out;
3066
3067	mtu = IPV6_MIN_MTU;
3068
3069	rcu_read_lock();
3070	idev = __in6_dev_get(dst->dev);
3071	if (idev)
3072		mtu = idev->cnf.mtu6;
3073	rcu_read_unlock();
3074
3075out:
3076	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
3077
3078	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
3079}
 
3080
3081/* MTU selection:
3082 * 1. mtu on route is locked - use it
3083 * 2. mtu from nexthop exception
3084 * 3. mtu from egress device
3085 *
3086 * based on ip6_dst_mtu_forward and exception logic of
3087 * rt6_find_cached_rt; called with rcu_read_lock
3088 */
3089u32 ip6_mtu_from_fib6(const struct fib6_result *res,
3090		      const struct in6_addr *daddr,
3091		      const struct in6_addr *saddr)
3092{
3093	const struct fib6_nh *nh = res->nh;
3094	struct fib6_info *f6i = res->f6i;
3095	struct inet6_dev *idev;
3096	struct rt6_info *rt;
3097	u32 mtu = 0;
3098
3099	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
3100		mtu = f6i->fib6_pmtu;
3101		if (mtu)
3102			goto out;
3103	}
3104
3105	rt = rt6_find_cached_rt(res, daddr, saddr);
3106	if (unlikely(rt)) {
3107		mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
3108	} else {
3109		struct net_device *dev = nh->fib_nh_dev;
3110
3111		mtu = IPV6_MIN_MTU;
3112		idev = __in6_dev_get(dev);
3113		if (idev && idev->cnf.mtu6 > mtu)
3114			mtu = idev->cnf.mtu6;
3115	}
3116
3117	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
3118out:
3119	return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
3120}
3121
3122struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
3123				  struct flowi6 *fl6)
3124{
3125	struct dst_entry *dst;
3126	struct rt6_info *rt;
3127	struct inet6_dev *idev = in6_dev_get(dev);
3128	struct net *net = dev_net(dev);
3129
3130	if (unlikely(!idev))
3131		return ERR_PTR(-ENODEV);
3132
3133	rt = ip6_dst_alloc(net, dev, 0);
3134	if (unlikely(!rt)) {
3135		in6_dev_put(idev);
3136		dst = ERR_PTR(-ENOMEM);
3137		goto out;
3138	}
3139
3140	rt->dst.flags |= DST_HOST;
3141	rt->dst.input = ip6_input;
3142	rt->dst.output  = ip6_output;
3143	rt->rt6i_gateway  = fl6->daddr;
3144	rt->rt6i_dst.addr = fl6->daddr;
3145	rt->rt6i_dst.plen = 128;
3146	rt->rt6i_idev     = idev;
3147	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
3148
3149	/* Add this dst into uncached_list so that rt6_disable_ip() can
3150	 * do proper release of the net_device
3151	 */
3152	rt6_uncached_list_add(rt);
3153	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
3154
3155	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
3156
3157out:
3158	return dst;
3159}
3160
3161static int ip6_dst_gc(struct dst_ops *ops)
3162{
3163	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
3164	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
3165	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
3166	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
3167	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
3168	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
 
3169	int entries;
3170
3171	entries = dst_entries_get_fast(ops);
3172	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
3173	    entries <= rt_max_size)
3174		goto out;
3175
3176	net->ipv6.ip6_rt_gc_expire++;
3177	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
3178	entries = dst_entries_get_slow(ops);
3179	if (entries < ops->gc_thresh)
3180		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
3181out:
3182	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
3183	return entries > rt_max_size;
3184}
3185
3186static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
3187			       const struct in6_addr *gw_addr, u32 tbid,
3188			       int flags, struct fib6_result *res)
3189{
3190	struct flowi6 fl6 = {
3191		.flowi6_oif = cfg->fc_ifindex,
3192		.daddr = *gw_addr,
3193		.saddr = cfg->fc_prefsrc,
3194	};
3195	struct fib6_table *table;
3196	int err;
3197
3198	table = fib6_get_table(net, tbid);
3199	if (!table)
3200		return -EINVAL;
3201
3202	if (!ipv6_addr_any(&cfg->fc_prefsrc))
3203		flags |= RT6_LOOKUP_F_HAS_SADDR;
3204
3205	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
3206
3207	err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
3208	if (!err && res->f6i != net->ipv6.fib6_null_entry)
3209		fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
3210				 cfg->fc_ifindex != 0, NULL, flags);
3211
3212	return err;
3213}
3214
3215static int ip6_route_check_nh_onlink(struct net *net,
3216				     struct fib6_config *cfg,
3217				     const struct net_device *dev,
3218				     struct netlink_ext_ack *extack)
3219{
3220	u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
3221	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3222	struct fib6_result res = {};
3223	int err;
3224
3225	err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
3226	if (!err && !(res.fib6_flags & RTF_REJECT) &&
3227	    /* ignore match if it is the default route */
3228	    !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
3229	    (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
3230		NL_SET_ERR_MSG(extack,
3231			       "Nexthop has invalid gateway or device mismatch");
3232		err = -EINVAL;
3233	}
3234
3235	return err;
3236}
3237
3238static int ip6_route_check_nh(struct net *net,
3239			      struct fib6_config *cfg,
3240			      struct net_device **_dev,
 
3241			      struct inet6_dev **idev)
3242{
3243	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3244	struct net_device *dev = _dev ? *_dev : NULL;
3245	int flags = RT6_LOOKUP_F_IFACE;
3246	struct fib6_result res = {};
3247	int err = -EHOSTUNREACH;
3248
3249	if (cfg->fc_table) {
3250		err = ip6_nh_lookup_table(net, cfg, gw_addr,
3251					  cfg->fc_table, flags, &res);
3252		/* gw_addr can not require a gateway or resolve to a reject
3253		 * route. If a device is given, it must match the result.
3254		 */
3255		if (err || res.fib6_flags & RTF_REJECT ||
3256		    res.nh->fib_nh_gw_family ||
3257		    (dev && dev != res.nh->fib_nh_dev))
3258			err = -EHOSTUNREACH;
3259	}
3260
3261	if (err < 0) {
3262		struct flowi6 fl6 = {
3263			.flowi6_oif = cfg->fc_ifindex,
3264			.daddr = *gw_addr,
3265		};
3266
3267		err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
3268		if (err || res.fib6_flags & RTF_REJECT ||
3269		    res.nh->fib_nh_gw_family)
3270			err = -EHOSTUNREACH;
3271
3272		if (err)
3273			return err;
3274
3275		fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
3276				 cfg->fc_ifindex != 0, NULL, flags);
3277	}
3278
3279	err = 0;
3280	if (dev) {
3281		if (dev != res.nh->fib_nh_dev)
3282			err = -EHOSTUNREACH;
3283	} else {
3284		*_dev = dev = res.nh->fib_nh_dev;
3285		dev_hold(dev);
3286		*idev = in6_dev_get(dev);
3287	}
3288
3289	return err;
3290}
3291
3292static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
3293			   struct net_device **_dev, struct inet6_dev **idev,
 
 
3294			   struct netlink_ext_ack *extack)
3295{
3296	const struct in6_addr *gw_addr = &cfg->fc_gateway;
3297	int gwa_type = ipv6_addr_type(gw_addr);
3298	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
3299	const struct net_device *dev = *_dev;
3300	bool need_addr_check = !dev;
3301	int err = -EINVAL;
3302
3303	/* if gw_addr is local we will fail to detect this in case
3304	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
3305	 * will return already-added prefix route via interface that
3306	 * prefix route was assigned to, which might be non-loopback.
3307	 */
3308	if (dev &&
3309	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3310		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3311		goto out;
3312	}
3313
3314	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
3315		/* IPv6 strictly inhibits using not link-local
3316		 * addresses as nexthop address.
3317		 * Otherwise, router will not able to send redirects.
3318		 * It is very good, but in some (rare!) circumstances
3319		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
3320		 * some exceptions. --ANK
3321		 * We allow IPv4-mapped nexthops to support RFC4798-type
3322		 * addressing
3323		 */
3324		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
3325			NL_SET_ERR_MSG(extack, "Invalid gateway address");
3326			goto out;
3327		}
3328
3329		rcu_read_lock();
3330
3331		if (cfg->fc_flags & RTNH_F_ONLINK)
3332			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
3333		else
3334			err = ip6_route_check_nh(net, cfg, _dev, idev);
 
3335
3336		rcu_read_unlock();
3337
3338		if (err)
3339			goto out;
3340	}
3341
3342	/* reload in case device was changed */
3343	dev = *_dev;
3344
3345	err = -EINVAL;
3346	if (!dev) {
3347		NL_SET_ERR_MSG(extack, "Egress device not specified");
3348		goto out;
3349	} else if (dev->flags & IFF_LOOPBACK) {
3350		NL_SET_ERR_MSG(extack,
3351			       "Egress device can not be loopback device for this route");
3352		goto out;
3353	}
3354
3355	/* if we did not check gw_addr above, do so now that the
3356	 * egress device has been resolved.
3357	 */
3358	if (need_addr_check &&
3359	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
3360		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3361		goto out;
3362	}
3363
3364	err = 0;
3365out:
3366	return err;
3367}
3368
3369static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
3370{
3371	if ((flags & RTF_REJECT) ||
3372	    (dev && (dev->flags & IFF_LOOPBACK) &&
3373	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
3374	     !(flags & RTF_LOCAL)))
3375		return true;
3376
3377	return false;
3378}
3379
3380int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
3381		 struct fib6_config *cfg, gfp_t gfp_flags,
3382		 struct netlink_ext_ack *extack)
3383{
 
3384	struct net_device *dev = NULL;
3385	struct inet6_dev *idev = NULL;
3386	int addr_type;
3387	int err;
3388
3389	fib6_nh->fib_nh_family = AF_INET6;
3390#ifdef CONFIG_IPV6_ROUTER_PREF
3391	fib6_nh->last_probe = jiffies;
3392#endif
 
 
 
 
 
3393
3394	err = -ENODEV;
3395	if (cfg->fc_ifindex) {
3396		dev = dev_get_by_index(net, cfg->fc_ifindex);
 
3397		if (!dev)
3398			goto out;
3399		idev = in6_dev_get(dev);
3400		if (!idev)
3401			goto out;
3402	}
3403
3404	if (cfg->fc_flags & RTNH_F_ONLINK) {
3405		if (!dev) {
3406			NL_SET_ERR_MSG(extack,
3407				       "Nexthop device required for onlink");
3408			goto out;
3409		}
3410
3411		if (!(dev->flags & IFF_UP)) {
3412			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3413			err = -ENETDOWN;
3414			goto out;
3415		}
3416
3417		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
3418	}
3419
3420	fib6_nh->fib_nh_weight = 1;
3421
3422	/* We cannot add true routes via loopback here,
3423	 * they would result in kernel looping; promote them to reject routes
3424	 */
3425	addr_type = ipv6_addr_type(&cfg->fc_dst);
3426	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
3427		/* hold loopback dev/idev if we haven't done so. */
3428		if (dev != net->loopback_dev) {
3429			if (dev) {
3430				dev_put(dev);
3431				in6_dev_put(idev);
3432			}
3433			dev = net->loopback_dev;
3434			dev_hold(dev);
3435			idev = in6_dev_get(dev);
3436			if (!idev) {
3437				err = -ENODEV;
3438				goto out;
3439			}
3440		}
3441		goto pcpu_alloc;
3442	}
3443
3444	if (cfg->fc_flags & RTF_GATEWAY) {
3445		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
 
3446		if (err)
3447			goto out;
3448
3449		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
3450		fib6_nh->fib_nh_gw_family = AF_INET6;
3451	}
3452
3453	err = -ENODEV;
3454	if (!dev)
3455		goto out;
3456
3457	if (idev->cnf.disable_ipv6) {
3458		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3459		err = -EACCES;
3460		goto out;
3461	}
3462
3463	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
3464		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3465		err = -ENETDOWN;
3466		goto out;
3467	}
3468
3469	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3470	    !netif_carrier_ok(dev))
3471		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
3472
3473	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
3474				 cfg->fc_encap_type, cfg, gfp_flags, extack);
3475	if (err)
3476		goto out;
3477
3478pcpu_alloc:
3479	fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
3480	if (!fib6_nh->rt6i_pcpu) {
3481		err = -ENOMEM;
3482		goto out;
3483	}
3484
3485	fib6_nh->fib_nh_dev = dev;
3486	fib6_nh->fib_nh_oif = dev->ifindex;
3487	err = 0;
3488out:
3489	if (idev)
3490		in6_dev_put(idev);
3491
3492	if (err) {
3493		lwtstate_put(fib6_nh->fib_nh_lws);
3494		fib6_nh->fib_nh_lws = NULL;
3495		if (dev)
3496			dev_put(dev);
3497	}
3498
3499	return err;
3500}
3501
3502void fib6_nh_release(struct fib6_nh *fib6_nh)
3503{
3504	struct rt6_exception_bucket *bucket;
3505
3506	rcu_read_lock();
3507
3508	fib6_nh_flush_exceptions(fib6_nh, NULL);
3509	bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
3510	if (bucket) {
3511		rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
3512		kfree(bucket);
3513	}
3514
3515	rcu_read_unlock();
3516
3517	if (fib6_nh->rt6i_pcpu) {
3518		int cpu;
 
 
 
3519
3520		for_each_possible_cpu(cpu) {
3521			struct rt6_info **ppcpu_rt;
3522			struct rt6_info *pcpu_rt;
3523
3524			ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
3525			pcpu_rt = *ppcpu_rt;
3526			if (pcpu_rt) {
3527				dst_dev_put(&pcpu_rt->dst);
3528				dst_release(&pcpu_rt->dst);
3529				*ppcpu_rt = NULL;
3530			}
 
 
 
 
3531		}
3532
3533		free_percpu(fib6_nh->rt6i_pcpu);
3534	}
3535
3536	fib_nh_common_release(&fib6_nh->nh_common);
3537}
3538
3539static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3540					      gfp_t gfp_flags,
3541					      struct netlink_ext_ack *extack)
3542{
3543	struct net *net = cfg->fc_nlinfo.nl_net;
3544	struct fib6_info *rt = NULL;
3545	struct nexthop *nh = NULL;
3546	struct fib6_table *table;
3547	struct fib6_nh *fib6_nh;
3548	int err = -EINVAL;
3549	int addr_type;
3550
3551	/* RTF_PCPU is an internal flag; can not be set by userspace */
3552	if (cfg->fc_flags & RTF_PCPU) {
3553		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3554		goto out;
3555	}
3556
3557	/* RTF_CACHE is an internal flag; can not be set by userspace */
3558	if (cfg->fc_flags & RTF_CACHE) {
3559		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3560		goto out;
3561	}
3562
3563	if (cfg->fc_type > RTN_MAX) {
3564		NL_SET_ERR_MSG(extack, "Invalid route type");
3565		goto out;
3566	}
3567
3568	if (cfg->fc_dst_len > 128) {
3569		NL_SET_ERR_MSG(extack, "Invalid prefix length");
3570		goto out;
3571	}
3572	if (cfg->fc_src_len > 128) {
3573		NL_SET_ERR_MSG(extack, "Invalid source address length");
3574		goto out;
3575	}
3576#ifndef CONFIG_IPV6_SUBTREES
3577	if (cfg->fc_src_len) {
3578		NL_SET_ERR_MSG(extack,
3579			       "Specifying source address requires IPV6_SUBTREES to be enabled");
3580		goto out;
3581	}
3582#endif
3583	if (cfg->fc_nh_id) {
3584		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
3585		if (!nh) {
3586			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
3587			goto out;
3588		}
3589		err = fib6_check_nexthop(nh, cfg, extack);
3590		if (err)
3591			goto out;
3592	}
3593
3594	err = -ENOBUFS;
3595	if (cfg->fc_nlinfo.nlh &&
3596	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3597		table = fib6_get_table(net, cfg->fc_table);
3598		if (!table) {
3599			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3600			table = fib6_new_table(net, cfg->fc_table);
3601		}
3602	} else {
3603		table = fib6_new_table(net, cfg->fc_table);
3604	}
3605
3606	if (!table)
3607		goto out;
3608
3609	err = -ENOMEM;
3610	rt = fib6_info_alloc(gfp_flags, !nh);
3611	if (!rt)
3612		goto out;
3613
3614	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
3615					       extack);
3616	if (IS_ERR(rt->fib6_metrics)) {
3617		err = PTR_ERR(rt->fib6_metrics);
3618		/* Do not leave garbage there. */
3619		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
3620		goto out;
3621	}
3622
3623	if (cfg->fc_flags & RTF_ADDRCONF)
3624		rt->dst_nocount = true;
3625
3626	if (cfg->fc_flags & RTF_EXPIRES)
3627		fib6_set_expires(rt, jiffies +
3628				clock_t_to_jiffies(cfg->fc_expires));
3629	else
3630		fib6_clean_expires(rt);
3631
3632	if (cfg->fc_protocol == RTPROT_UNSPEC)
3633		cfg->fc_protocol = RTPROT_BOOT;
3634	rt->fib6_protocol = cfg->fc_protocol;
3635
3636	rt->fib6_table = table;
3637	rt->fib6_metric = cfg->fc_metric;
3638	rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3639	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
3640
3641	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3642	rt->fib6_dst.plen = cfg->fc_dst_len;
3643	if (rt->fib6_dst.plen == 128)
3644		rt->dst_host = true;
3645
3646#ifdef CONFIG_IPV6_SUBTREES
3647	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3648	rt->fib6_src.plen = cfg->fc_src_len;
3649#endif
3650	if (nh) {
 
 
 
 
3651		if (!nexthop_get(nh)) {
3652			NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
3653			goto out;
3654		}
3655		if (rt->fib6_src.plen) {
3656			NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
3657			goto out;
3658		}
3659		rt->nh = nh;
3660		fib6_nh = nexthop_fib6_nh(rt->nh);
3661	} else {
3662		err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
3663		if (err)
3664			goto out;
3665
3666		fib6_nh = rt->fib6_nh;
3667
3668		/* We cannot add true routes via loopback here, they would
3669		 * result in kernel looping; promote them to reject routes
3670		 */
3671		addr_type = ipv6_addr_type(&cfg->fc_dst);
3672		if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
3673				   addr_type))
3674			rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
3675	}
3676
3677	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3678		struct net_device *dev = fib6_nh->fib_nh_dev;
3679
3680		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3681			NL_SET_ERR_MSG(extack, "Invalid source address");
3682			err = -EINVAL;
3683			goto out;
3684		}
3685		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3686		rt->fib6_prefsrc.plen = 128;
3687	} else
3688		rt->fib6_prefsrc.plen = 0;
3689
3690	return rt;
3691out:
3692	fib6_info_release(rt);
3693	return ERR_PTR(err);
 
 
 
 
3694}
3695
3696int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3697		  struct netlink_ext_ack *extack)
3698{
3699	struct fib6_info *rt;
3700	int err;
3701
3702	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3703	if (IS_ERR(rt))
3704		return PTR_ERR(rt);
3705
3706	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3707	fib6_info_release(rt);
3708
3709	return err;
3710}
3711
3712static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3713{
3714	struct net *net = info->nl_net;
3715	struct fib6_table *table;
3716	int err;
3717
3718	if (rt == net->ipv6.fib6_null_entry) {
3719		err = -ENOENT;
3720		goto out;
3721	}
3722
3723	table = rt->fib6_table;
3724	spin_lock_bh(&table->tb6_lock);
3725	err = fib6_del(rt, info);
3726	spin_unlock_bh(&table->tb6_lock);
3727
3728out:
3729	fib6_info_release(rt);
3730	return err;
3731}
3732
3733int ip6_del_rt(struct net *net, struct fib6_info *rt)
3734{
3735	struct nl_info info = { .nl_net = net };
 
 
 
3736
3737	return __ip6_del_rt(rt, &info);
3738}
3739
3740static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3741{
3742	struct nl_info *info = &cfg->fc_nlinfo;
3743	struct net *net = info->nl_net;
3744	struct sk_buff *skb = NULL;
3745	struct fib6_table *table;
3746	int err = -ENOENT;
3747
3748	if (rt == net->ipv6.fib6_null_entry)
3749		goto out_put;
3750	table = rt->fib6_table;
3751	spin_lock_bh(&table->tb6_lock);
3752
3753	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3754		struct fib6_info *sibling, *next_sibling;
 
3755
3756		/* prefer to send a single notification with all hops */
3757		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3758		if (skb) {
3759			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3760
3761			if (rt6_fill_node(net, skb, rt, NULL,
3762					  NULL, NULL, 0, RTM_DELROUTE,
3763					  info->portid, seq, 0) < 0) {
3764				kfree_skb(skb);
3765				skb = NULL;
3766			} else
3767				info->skip_notify = 1;
3768		}
3769
 
 
 
 
 
3770		info->skip_notify_kernel = 1;
3771		call_fib6_multipath_entry_notifiers(net,
3772						    FIB_EVENT_ENTRY_DEL,
3773						    rt,
3774						    rt->fib6_nsiblings,
3775						    NULL);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3776		list_for_each_entry_safe(sibling, next_sibling,
3777					 &rt->fib6_siblings,
3778					 fib6_siblings) {
3779			err = fib6_del(sibling, info);
3780			if (err)
3781				goto out_unlock;
3782		}
3783	}
3784
3785	err = fib6_del(rt, info);
3786out_unlock:
3787	spin_unlock_bh(&table->tb6_lock);
3788out_put:
3789	fib6_info_release(rt);
3790
3791	if (skb) {
3792		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3793			    info->nlh, gfp_any());
3794	}
3795	return err;
3796}
3797
3798static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3799{
3800	int rc = -ESRCH;
3801
3802	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3803		goto out;
3804
3805	if (cfg->fc_flags & RTF_GATEWAY &&
3806	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3807		goto out;
3808
3809	rc = rt6_remove_exception_rt(rt);
3810out:
3811	return rc;
3812}
3813
3814static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
3815			     struct fib6_nh *nh)
3816{
3817	struct fib6_result res = {
3818		.f6i = rt,
3819		.nh = nh,
3820	};
3821	struct rt6_info *rt_cache;
3822
3823	rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
3824	if (rt_cache)
3825		return __ip6_del_cached_rt(rt_cache, cfg);
3826
3827	return 0;
3828}
3829
3830struct fib6_nh_del_cached_rt_arg {
3831	struct fib6_config *cfg;
3832	struct fib6_info *f6i;
3833};
3834
3835static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
3836{
3837	struct fib6_nh_del_cached_rt_arg *arg = _arg;
3838	int rc;
3839
3840	rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
3841	return rc != -ESRCH ? rc : 0;
3842}
3843
3844static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
3845{
3846	struct fib6_nh_del_cached_rt_arg arg = {
3847		.cfg = cfg,
3848		.f6i = f6i
3849	};
3850
3851	return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
3852}
3853
3854static int ip6_route_del(struct fib6_config *cfg,
3855			 struct netlink_ext_ack *extack)
3856{
3857	struct fib6_table *table;
3858	struct fib6_info *rt;
3859	struct fib6_node *fn;
3860	int err = -ESRCH;
3861
3862	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3863	if (!table) {
3864		NL_SET_ERR_MSG(extack, "FIB table does not exist");
3865		return err;
3866	}
3867
3868	rcu_read_lock();
3869
3870	fn = fib6_locate(&table->tb6_root,
3871			 &cfg->fc_dst, cfg->fc_dst_len,
3872			 &cfg->fc_src, cfg->fc_src_len,
3873			 !(cfg->fc_flags & RTF_CACHE));
3874
3875	if (fn) {
3876		for_each_fib6_node_rt_rcu(fn) {
3877			struct fib6_nh *nh;
3878
3879			if (rt->nh && cfg->fc_nh_id &&
3880			    rt->nh->id != cfg->fc_nh_id)
3881				continue;
3882
3883			if (cfg->fc_flags & RTF_CACHE) {
3884				int rc = 0;
3885
3886				if (rt->nh) {
3887					rc = ip6_del_cached_rt_nh(cfg, rt);
3888				} else if (cfg->fc_nh_id) {
3889					continue;
3890				} else {
3891					nh = rt->fib6_nh;
3892					rc = ip6_del_cached_rt(cfg, rt, nh);
3893				}
3894				if (rc != -ESRCH) {
3895					rcu_read_unlock();
3896					return rc;
3897				}
3898				continue;
3899			}
3900
3901			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3902				continue;
3903			if (cfg->fc_protocol &&
3904			    cfg->fc_protocol != rt->fib6_protocol)
3905				continue;
3906
3907			if (rt->nh) {
3908				if (!fib6_info_hold_safe(rt))
3909					continue;
3910				rcu_read_unlock();
3911
3912				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3913			}
3914			if (cfg->fc_nh_id)
3915				continue;
3916
3917			nh = rt->fib6_nh;
3918			if (cfg->fc_ifindex &&
3919			    (!nh->fib_nh_dev ||
3920			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
3921				continue;
3922			if (cfg->fc_flags & RTF_GATEWAY &&
3923			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
3924				continue;
3925			if (!fib6_info_hold_safe(rt))
3926				continue;
3927			rcu_read_unlock();
3928
3929			/* if gateway was specified only delete the one hop */
3930			if (cfg->fc_flags & RTF_GATEWAY)
3931				return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3932
3933			return __ip6_del_rt_siblings(rt, cfg);
3934		}
3935	}
3936	rcu_read_unlock();
3937
3938	return err;
3939}
3940
3941static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3942{
3943	struct netevent_redirect netevent;
3944	struct rt6_info *rt, *nrt = NULL;
3945	struct fib6_result res = {};
3946	struct ndisc_options ndopts;
3947	struct inet6_dev *in6_dev;
3948	struct neighbour *neigh;
3949	struct rd_msg *msg;
3950	int optlen, on_link;
3951	u8 *lladdr;
3952
3953	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3954	optlen -= sizeof(*msg);
3955
3956	if (optlen < 0) {
3957		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3958		return;
3959	}
3960
3961	msg = (struct rd_msg *)icmp6_hdr(skb);
3962
3963	if (ipv6_addr_is_multicast(&msg->dest)) {
3964		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3965		return;
3966	}
3967
3968	on_link = 0;
3969	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3970		on_link = 1;
3971	} else if (ipv6_addr_type(&msg->target) !=
3972		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3973		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3974		return;
3975	}
3976
3977	in6_dev = __in6_dev_get(skb->dev);
3978	if (!in6_dev)
3979		return;
3980	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
 
3981		return;
3982
3983	/* RFC2461 8.1:
3984	 *	The IP source address of the Redirect MUST be the same as the current
3985	 *	first-hop router for the specified ICMP Destination Address.
3986	 */
3987
3988	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3989		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3990		return;
3991	}
3992
3993	lladdr = NULL;
3994	if (ndopts.nd_opts_tgt_lladdr) {
3995		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3996					     skb->dev);
3997		if (!lladdr) {
3998			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3999			return;
4000		}
4001	}
4002
4003	rt = (struct rt6_info *) dst;
4004	if (rt->rt6i_flags & RTF_REJECT) {
4005		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
4006		return;
4007	}
4008
4009	/* Redirect received -> path was valid.
4010	 * Look, redirects are sent only in response to data packets,
4011	 * so that this nexthop apparently is reachable. --ANK
4012	 */
4013	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
4014
4015	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
4016	if (!neigh)
4017		return;
4018
4019	/*
4020	 *	We have finally decided to accept it.
4021	 */
4022
4023	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
4024		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
4025		     NEIGH_UPDATE_F_OVERRIDE|
4026		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
4027				     NEIGH_UPDATE_F_ISROUTER)),
4028		     NDISC_REDIRECT, &ndopts);
4029
4030	rcu_read_lock();
4031	res.f6i = rcu_dereference(rt->from);
4032	if (!res.f6i)
4033		goto out;
4034
4035	if (res.f6i->nh) {
4036		struct fib6_nh_match_arg arg = {
4037			.dev = dst->dev,
4038			.gw = &rt->rt6i_gateway,
4039		};
4040
4041		nexthop_for_each_fib6_nh(res.f6i->nh,
4042					 fib6_nh_find_match, &arg);
4043
4044		/* fib6_info uses a nexthop that does not have fib6_nh
4045		 * using the dst->dev. Should be impossible
4046		 */
4047		if (!arg.match)
4048			goto out;
4049		res.nh = arg.match;
4050	} else {
4051		res.nh = res.f6i->fib6_nh;
4052	}
4053
4054	res.fib6_flags = res.f6i->fib6_flags;
4055	res.fib6_type = res.f6i->fib6_type;
4056	nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
4057	if (!nrt)
4058		goto out;
4059
4060	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
4061	if (on_link)
4062		nrt->rt6i_flags &= ~RTF_GATEWAY;
4063
4064	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
4065
4066	/* rt6_insert_exception() will take care of duplicated exceptions */
4067	if (rt6_insert_exception(nrt, &res)) {
4068		dst_release_immediate(&nrt->dst);
4069		goto out;
4070	}
4071
4072	netevent.old = &rt->dst;
4073	netevent.new = &nrt->dst;
4074	netevent.daddr = &msg->dest;
4075	netevent.neigh = neigh;
4076	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
4077
4078out:
4079	rcu_read_unlock();
4080	neigh_release(neigh);
4081}
4082
4083#ifdef CONFIG_IPV6_ROUTE_INFO
4084static struct fib6_info *rt6_get_route_info(struct net *net,
4085					   const struct in6_addr *prefix, int prefixlen,
4086					   const struct in6_addr *gwaddr,
4087					   struct net_device *dev)
4088{
4089	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
4090	int ifindex = dev->ifindex;
4091	struct fib6_node *fn;
4092	struct fib6_info *rt = NULL;
4093	struct fib6_table *table;
4094
4095	table = fib6_get_table(net, tb_id);
4096	if (!table)
4097		return NULL;
4098
4099	rcu_read_lock();
4100	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
4101	if (!fn)
4102		goto out;
4103
4104	for_each_fib6_node_rt_rcu(fn) {
4105		/* these routes do not use nexthops */
4106		if (rt->nh)
4107			continue;
4108		if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
4109			continue;
4110		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
4111		    !rt->fib6_nh->fib_nh_gw_family)
4112			continue;
4113		if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
4114			continue;
4115		if (!fib6_info_hold_safe(rt))
4116			continue;
4117		break;
4118	}
4119out:
4120	rcu_read_unlock();
4121	return rt;
4122}
4123
4124static struct fib6_info *rt6_add_route_info(struct net *net,
4125					   const struct in6_addr *prefix, int prefixlen,
4126					   const struct in6_addr *gwaddr,
4127					   struct net_device *dev,
4128					   unsigned int pref)
4129{
4130	struct fib6_config cfg = {
4131		.fc_metric	= IP6_RT_PRIO_USER,
4132		.fc_ifindex	= dev->ifindex,
4133		.fc_dst_len	= prefixlen,
4134		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
4135				  RTF_UP | RTF_PREF(pref),
4136		.fc_protocol = RTPROT_RA,
4137		.fc_type = RTN_UNICAST,
4138		.fc_nlinfo.portid = 0,
4139		.fc_nlinfo.nlh = NULL,
4140		.fc_nlinfo.nl_net = net,
4141	};
4142
4143	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
4144	cfg.fc_dst = *prefix;
4145	cfg.fc_gateway = *gwaddr;
4146
4147	/* We should treat it as a default route if prefix length is 0. */
4148	if (!prefixlen)
4149		cfg.fc_flags |= RTF_DEFAULT;
4150
4151	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
4152
4153	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
4154}
4155#endif
4156
4157struct fib6_info *rt6_get_dflt_router(struct net *net,
4158				     const struct in6_addr *addr,
4159				     struct net_device *dev)
4160{
4161	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
4162	struct fib6_info *rt;
4163	struct fib6_table *table;
4164
4165	table = fib6_get_table(net, tb_id);
4166	if (!table)
4167		return NULL;
4168
4169	rcu_read_lock();
4170	for_each_fib6_node_rt_rcu(&table->tb6_root) {
4171		struct fib6_nh *nh;
4172
4173		/* RA routes do not use nexthops */
4174		if (rt->nh)
4175			continue;
4176
4177		nh = rt->fib6_nh;
4178		if (dev == nh->fib_nh_dev &&
4179		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
4180		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
4181			break;
4182	}
4183	if (rt && !fib6_info_hold_safe(rt))
4184		rt = NULL;
4185	rcu_read_unlock();
4186	return rt;
4187}
4188
4189struct fib6_info *rt6_add_dflt_router(struct net *net,
4190				     const struct in6_addr *gwaddr,
4191				     struct net_device *dev,
4192				     unsigned int pref)
 
 
4193{
4194	struct fib6_config cfg = {
4195		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
4196		.fc_metric	= IP6_RT_PRIO_USER,
4197		.fc_ifindex	= dev->ifindex,
4198		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
4199				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
4200		.fc_protocol = RTPROT_RA,
4201		.fc_type = RTN_UNICAST,
4202		.fc_nlinfo.portid = 0,
4203		.fc_nlinfo.nlh = NULL,
4204		.fc_nlinfo.nl_net = net,
 
4205	};
4206
4207	cfg.fc_gateway = *gwaddr;
4208
4209	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
4210		struct fib6_table *table;
4211
4212		table = fib6_get_table(dev_net(dev), cfg.fc_table);
4213		if (table)
4214			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
4215	}
4216
4217	return rt6_get_dflt_router(net, gwaddr, dev);
4218}
4219
4220static void __rt6_purge_dflt_routers(struct net *net,
4221				     struct fib6_table *table)
4222{
4223	struct fib6_info *rt;
4224
4225restart:
4226	rcu_read_lock();
4227	for_each_fib6_node_rt_rcu(&table->tb6_root) {
4228		struct net_device *dev = fib6_info_nh_dev(rt);
4229		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
4230
4231		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
4232		    (!idev || idev->cnf.accept_ra != 2) &&
4233		    fib6_info_hold_safe(rt)) {
4234			rcu_read_unlock();
4235			ip6_del_rt(net, rt);
4236			goto restart;
4237		}
4238	}
4239	rcu_read_unlock();
4240
4241	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
4242}
4243
4244void rt6_purge_dflt_routers(struct net *net)
4245{
4246	struct fib6_table *table;
4247	struct hlist_head *head;
4248	unsigned int h;
4249
4250	rcu_read_lock();
4251
4252	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
4253		head = &net->ipv6.fib_table_hash[h];
4254		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
4255			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
4256				__rt6_purge_dflt_routers(net, table);
4257		}
4258	}
4259
4260	rcu_read_unlock();
4261}
4262
4263static void rtmsg_to_fib6_config(struct net *net,
4264				 struct in6_rtmsg *rtmsg,
4265				 struct fib6_config *cfg)
4266{
4267	*cfg = (struct fib6_config){
4268		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
4269			 : RT6_TABLE_MAIN,
4270		.fc_ifindex = rtmsg->rtmsg_ifindex,
4271		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
4272		.fc_expires = rtmsg->rtmsg_info,
4273		.fc_dst_len = rtmsg->rtmsg_dst_len,
4274		.fc_src_len = rtmsg->rtmsg_src_len,
4275		.fc_flags = rtmsg->rtmsg_flags,
4276		.fc_type = rtmsg->rtmsg_type,
4277
4278		.fc_nlinfo.nl_net = net,
4279
4280		.fc_dst = rtmsg->rtmsg_dst,
4281		.fc_src = rtmsg->rtmsg_src,
4282		.fc_gateway = rtmsg->rtmsg_gateway,
4283	};
4284}
4285
4286int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4287{
4288	struct fib6_config cfg;
4289	struct in6_rtmsg rtmsg;
4290	int err;
4291
4292	switch (cmd) {
4293	case SIOCADDRT:		/* Add a route */
4294	case SIOCDELRT:		/* Delete a route */
4295		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
4296			return -EPERM;
4297		err = copy_from_user(&rtmsg, arg,
4298				     sizeof(struct in6_rtmsg));
4299		if (err)
4300			return -EFAULT;
4301
4302		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
4303
4304		rtnl_lock();
4305		switch (cmd) {
4306		case SIOCADDRT:
4307			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
4308			break;
4309		case SIOCDELRT:
4310			err = ip6_route_del(&cfg, NULL);
4311			break;
4312		default:
4313			err = -EINVAL;
4314		}
4315		rtnl_unlock();
4316
4317		return err;
4318	}
4319
4320	return -EINVAL;
4321}
4322
4323/*
4324 *	Drop the packet on the floor
4325 */
4326
4327static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
4328{
4329	struct dst_entry *dst = skb_dst(skb);
4330	struct net *net = dev_net(dst->dev);
4331	struct inet6_dev *idev;
 
4332	int type;
4333
4334	if (netif_is_l3_master(skb->dev) &&
4335	    dst->dev == net->loopback_dev)
4336		idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
4337	else
4338		idev = ip6_dst_idev(dst);
4339
4340	switch (ipstats_mib_noroutes) {
4341	case IPSTATS_MIB_INNOROUTES:
4342		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
4343		if (type == IPV6_ADDR_ANY) {
 
4344			IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
4345			break;
4346		}
4347		/* FALLTHROUGH */
 
4348	case IPSTATS_MIB_OUTNOROUTES:
 
4349		IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
4350		break;
4351	}
4352
4353	/* Start over by dropping the dst for l3mdev case */
4354	if (netif_is_l3_master(skb->dev))
4355		skb_dst_drop(skb);
4356
4357	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
4358	kfree_skb(skb);
4359	return 0;
4360}
4361
4362static int ip6_pkt_discard(struct sk_buff *skb)
4363{
4364	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
4365}
4366
4367static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4368{
4369	skb->dev = skb_dst(skb)->dev;
4370	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
4371}
4372
4373static int ip6_pkt_prohibit(struct sk_buff *skb)
4374{
4375	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
4376}
4377
4378static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
4379{
4380	skb->dev = skb_dst(skb)->dev;
4381	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
4382}
4383
4384/*
4385 *	Allocate a dst for local (unicast / anycast) address.
4386 */
4387
4388struct fib6_info *addrconf_f6i_alloc(struct net *net,
4389				     struct inet6_dev *idev,
4390				     const struct in6_addr *addr,
4391				     bool anycast, gfp_t gfp_flags)
 
4392{
4393	struct fib6_config cfg = {
4394		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
4395		.fc_ifindex = idev->dev->ifindex,
4396		.fc_flags = RTF_UP | RTF_NONEXTHOP,
4397		.fc_dst = *addr,
4398		.fc_dst_len = 128,
4399		.fc_protocol = RTPROT_KERNEL,
4400		.fc_nlinfo.nl_net = net,
4401		.fc_ignore_dev_down = true,
4402	};
4403	struct fib6_info *f6i;
4404
4405	if (anycast) {
4406		cfg.fc_type = RTN_ANYCAST;
4407		cfg.fc_flags |= RTF_ANYCAST;
4408	} else {
4409		cfg.fc_type = RTN_LOCAL;
4410		cfg.fc_flags |= RTF_LOCAL;
4411	}
4412
4413	f6i = ip6_route_info_create(&cfg, gfp_flags, NULL);
4414	if (!IS_ERR(f6i))
4415		f6i->dst_nocount = true;
 
 
 
 
 
 
 
4416	return f6i;
4417}
4418
/*
 * Argument bundle for the tree walk that removes a deleted address
 * from the prefsrc of routes (see fib6_remove_prefsrc()).
 */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL matches any */
	struct net *net;		/* namespace, used to skip its null entry */
	struct in6_addr *addr;		/* the address being removed */
};
4425
4426static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
4427{
4428	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
4429	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
4430	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
4431
4432	if (!rt->nh &&
4433	    ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
4434	    rt != net->ipv6.fib6_null_entry &&
4435	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
 
4436		spin_lock_bh(&rt6_exception_lock);
4437		/* remove prefsrc entry */
4438		rt->fib6_prefsrc.plen = 0;
4439		spin_unlock_bh(&rt6_exception_lock);
4440	}
4441	return 0;
4442}
4443
4444void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
4445{
4446	struct net *net = dev_net(ifp->idev->dev);
4447	struct arg_dev_net_ip adni = {
4448		.dev = ifp->idev->dev,
4449		.net = net,
4450		.addr = &ifp->addr,
4451	};
4452	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
4453}
4454
4455#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)
4456
4457/* Remove routers and update dst entries when gateway turn into host. */
4458static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
4459{
4460	struct in6_addr *gateway = (struct in6_addr *)arg;
4461	struct fib6_nh *nh;
4462
4463	/* RA routes do not use nexthops */
4464	if (rt->nh)
4465		return 0;
4466
4467	nh = rt->fib6_nh;
4468	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
4469	    nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
4470		return -1;
4471
4472	/* Further clean up cached routes in exception table.
4473	 * This is needed because cached route may have a different
4474	 * gateway than its 'parent' in the case of an ip redirect.
4475	 */
4476	fib6_nh_exceptions_clean_tohost(nh, gateway);
4477
4478	return 0;
4479}
4480
4481void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
4482{
4483	fib6_clean_all(net, fib6_clean_tohost, gateway);
4484}
4485
/*
 * Argument for the device up/down tree walks. Only one union member is
 * meaningful per walk: fib6_ifup() reads nh_flags, fib6_ifdown() reads
 * event.
 */
struct arg_netdev_event {
	const struct net_device *dev;		/* device the event is about */
	union {
		unsigned char nh_flags;		/* nexthop flags to clear (up path) */
		unsigned long event;		/* NETDEV_* event (down path) */
	};
};
4493
4494static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
4495{
4496	struct fib6_info *iter;
4497	struct fib6_node *fn;
4498
4499	fn = rcu_dereference_protected(rt->fib6_node,
4500			lockdep_is_held(&rt->fib6_table->tb6_lock));
4501	iter = rcu_dereference_protected(fn->leaf,
4502			lockdep_is_held(&rt->fib6_table->tb6_lock));
4503	while (iter) {
4504		if (iter->fib6_metric == rt->fib6_metric &&
4505		    rt6_qualify_for_ecmp(iter))
4506			return iter;
4507		iter = rcu_dereference_protected(iter->fib6_next,
4508				lockdep_is_held(&rt->fib6_table->tb6_lock));
4509	}
4510
4511	return NULL;
4512}
4513
4514/* only called for fib entries with builtin fib6_nh */
4515static bool rt6_is_dead(const struct fib6_info *rt)
4516{
4517	if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
4518	    (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
4519	     ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
4520		return true;
4521
4522	return false;
4523}
4524
4525static int rt6_multipath_total_weight(const struct fib6_info *rt)
4526{
4527	struct fib6_info *iter;
4528	int total = 0;
4529
4530	if (!rt6_is_dead(rt))
4531		total += rt->fib6_nh->fib_nh_weight;
4532
4533	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
4534		if (!rt6_is_dead(iter))
4535			total += iter->fib6_nh->fib_nh_weight;
4536	}
4537
4538	return total;
4539}
4540
4541static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
4542{
4543	int upper_bound = -1;
4544
4545	if (!rt6_is_dead(rt)) {
4546		*weight += rt->fib6_nh->fib_nh_weight;
4547		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
4548						    total) - 1;
4549	}
4550	atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
4551}
4552
4553static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
4554{
4555	struct fib6_info *iter;
4556	int weight = 0;
4557
4558	rt6_upper_bound_set(rt, &weight, total);
4559
4560	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4561		rt6_upper_bound_set(iter, &weight, total);
4562}
4563
4564void rt6_multipath_rebalance(struct fib6_info *rt)
4565{
4566	struct fib6_info *first;
4567	int total;
4568
4569	/* In case the entire multipath route was marked for flushing,
4570	 * then there is no need to rebalance upon the removal of every
4571	 * sibling route.
4572	 */
4573	if (!rt->fib6_nsiblings || rt->should_flush)
4574		return;
4575
4576	/* During lookup routes are evaluated in order, so we need to
4577	 * make sure upper bounds are assigned from the first sibling
4578	 * onwards.
4579	 */
4580	first = rt6_multipath_first_sibling(rt);
4581	if (WARN_ON_ONCE(!first))
4582		return;
4583
4584	total = rt6_multipath_total_weight(first);
4585	rt6_multipath_upper_bound_set(first, total);
4586}
4587
4588static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4589{
4590	const struct arg_netdev_event *arg = p_arg;
4591	struct net *net = dev_net(arg->dev);
4592
4593	if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
4594	    rt->fib6_nh->fib_nh_dev == arg->dev) {
4595		rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
4596		fib6_update_sernum_upto_root(net, rt);
4597		rt6_multipath_rebalance(rt);
4598	}
4599
4600	return 0;
4601}
4602
4603void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
4604{
4605	struct arg_netdev_event arg = {
4606		.dev = dev,
4607		{
4608			.nh_flags = nh_flags,
4609		},
4610	};
4611
4612	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4613		arg.nh_flags |= RTNH_F_LINKDOWN;
4614
4615	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4616}
4617
4618/* only called for fib entries with inline fib6_nh */
4619static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4620				   const struct net_device *dev)
4621{
4622	struct fib6_info *iter;
4623
4624	if (rt->fib6_nh->fib_nh_dev == dev)
4625		return true;
4626	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4627		if (iter->fib6_nh->fib_nh_dev == dev)
4628			return true;
4629
4630	return false;
4631}
4632
4633static void rt6_multipath_flush(struct fib6_info *rt)
4634{
4635	struct fib6_info *iter;
4636
4637	rt->should_flush = 1;
4638	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4639		iter->should_flush = 1;
4640}
4641
4642static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4643					     const struct net_device *down_dev)
4644{
4645	struct fib6_info *iter;
4646	unsigned int dead = 0;
4647
4648	if (rt->fib6_nh->fib_nh_dev == down_dev ||
4649	    rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4650		dead++;
4651	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4652		if (iter->fib6_nh->fib_nh_dev == down_dev ||
4653		    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4654			dead++;
4655
4656	return dead;
4657}
4658
4659static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4660				       const struct net_device *dev,
4661				       unsigned char nh_flags)
4662{
4663	struct fib6_info *iter;
4664
4665	if (rt->fib6_nh->fib_nh_dev == dev)
4666		rt->fib6_nh->fib_nh_flags |= nh_flags;
4667	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4668		if (iter->fib6_nh->fib_nh_dev == dev)
4669			iter->fib6_nh->fib_nh_flags |= nh_flags;
4670}
4671
4672/* called with write lock held for table with rt */
4673static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4674{
4675	const struct arg_netdev_event *arg = p_arg;
4676	const struct net_device *dev = arg->dev;
4677	struct net *net = dev_net(dev);
4678
4679	if (rt == net->ipv6.fib6_null_entry || rt->nh)
4680		return 0;
4681
4682	switch (arg->event) {
4683	case NETDEV_UNREGISTER:
4684		return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4685	case NETDEV_DOWN:
4686		if (rt->should_flush)
4687			return -1;
4688		if (!rt->fib6_nsiblings)
4689			return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
4690		if (rt6_multipath_uses_dev(rt, dev)) {
4691			unsigned int count;
4692
4693			count = rt6_multipath_dead_count(rt, dev);
4694			if (rt->fib6_nsiblings + 1 == count) {
4695				rt6_multipath_flush(rt);
4696				return -1;
4697			}
4698			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4699						   RTNH_F_LINKDOWN);
4700			fib6_update_sernum(net, rt);
4701			rt6_multipath_rebalance(rt);
4702		}
4703		return -2;
4704	case NETDEV_CHANGE:
4705		if (rt->fib6_nh->fib_nh_dev != dev ||
4706		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4707			break;
4708		rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
4709		rt6_multipath_rebalance(rt);
4710		break;
4711	}
4712
4713	return 0;
4714}
4715
4716void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4717{
4718	struct arg_netdev_event arg = {
4719		.dev = dev,
4720		{
4721			.event = event,
4722		},
4723	};
4724	struct net *net = dev_net(dev);
4725
4726	if (net->ipv6.sysctl.skip_notify_on_dev_down)
4727		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
4728	else
4729		fib6_clean_all(net, fib6_ifdown, &arg);
4730}
4731
4732void rt6_disable_ip(struct net_device *dev, unsigned long event)
4733{
4734	rt6_sync_down_dev(dev, event);
4735	rt6_uncached_list_flush_dev(dev_net(dev), dev);
4736	neigh_ifdown(&nd_tbl, dev);
4737}
4738
/* Argument for the tree walk that propagates a device MTU change. */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
	struct fib6_info *f6i;		/* route being visited; set per route by the walk */
};
4744
4745static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
4746{
4747	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
4748	struct fib6_info *f6i = arg->f6i;
4749
4750	/* For administrative MTU increase, there is no way to discover
4751	 * IPv6 PMTU increase, so PMTU increase should be updated here.
4752	 * Since RFC 1981 doesn't include administrative MTU increase
4753	 * update PMTU increase is a MUST. (i.e. jumbo frame)
4754	 */
4755	if (nh->fib_nh_dev == arg->dev) {
4756		struct inet6_dev *idev = __in6_dev_get(arg->dev);
4757		u32 mtu = f6i->fib6_pmtu;
4758
4759		if (mtu >= arg->mtu ||
4760		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4761			fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
4762
4763		spin_lock_bh(&rt6_exception_lock);
4764		rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
4765		spin_unlock_bh(&rt6_exception_lock);
4766	}
4767
4768	return 0;
4769}
4770
4771static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
4772{
4773	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4774	struct inet6_dev *idev;
4775
4776	/* In IPv6 pmtu discovery is not optional,
4777	   so that RTAX_MTU lock cannot disable it.
4778	   We still use this lock to block changes
4779	   caused by addrconf/ndisc.
4780	*/
4781
4782	idev = __in6_dev_get(arg->dev);
4783	if (!idev)
4784		return 0;
4785
4786	if (fib6_metric_locked(f6i, RTAX_MTU))
4787		return 0;
4788
4789	arg->f6i = f6i;
4790	if (f6i->nh) {
4791		/* fib6_nh_mtu_change only returns 0, so this is safe */
4792		return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
4793						arg);
4794	}
4795
4796	return fib6_nh_mtu_change(f6i->fib6_nh, arg);
4797}
4798
4799void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4800{
4801	struct rt6_mtu_change_arg arg = {
4802		.dev = dev,
4803		.mtu = mtu,
4804	};
4805
4806	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4807}
4808
4809static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4810	[RTA_UNSPEC]		= { .strict_start_type = RTA_DPORT + 1 },
4811	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4812	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4813	[RTA_OIF]               = { .type = NLA_U32 },
4814	[RTA_IIF]		= { .type = NLA_U32 },
4815	[RTA_PRIORITY]          = { .type = NLA_U32 },
4816	[RTA_METRICS]           = { .type = NLA_NESTED },
4817	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4818	[RTA_PREF]              = { .type = NLA_U8 },
4819	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
4820	[RTA_ENCAP]		= { .type = NLA_NESTED },
4821	[RTA_EXPIRES]		= { .type = NLA_U32 },
4822	[RTA_UID]		= { .type = NLA_U32 },
4823	[RTA_MARK]		= { .type = NLA_U32 },
4824	[RTA_TABLE]		= { .type = NLA_U32 },
4825	[RTA_IP_PROTO]		= { .type = NLA_U8 },
4826	[RTA_SPORT]		= { .type = NLA_U16 },
4827	[RTA_DPORT]		= { .type = NLA_U16 },
4828	[RTA_NH_ID]		= { .type = NLA_U32 },
4829};
4830
4831static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4832			      struct fib6_config *cfg,
4833			      struct netlink_ext_ack *extack)
4834{
4835	struct rtmsg *rtm;
4836	struct nlattr *tb[RTA_MAX+1];
4837	unsigned int pref;
4838	int err;
4839
4840	err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
4841				     rtm_ipv6_policy, extack);
4842	if (err < 0)
4843		goto errout;
4844
4845	err = -EINVAL;
4846	rtm = nlmsg_data(nlh);
4847
 
 
 
 
 
 
4848	*cfg = (struct fib6_config){
4849		.fc_table = rtm->rtm_table,
4850		.fc_dst_len = rtm->rtm_dst_len,
4851		.fc_src_len = rtm->rtm_src_len,
4852		.fc_flags = RTF_UP,
4853		.fc_protocol = rtm->rtm_protocol,
4854		.fc_type = rtm->rtm_type,
4855
4856		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
4857		.fc_nlinfo.nlh = nlh,
4858		.fc_nlinfo.nl_net = sock_net(skb->sk),
4859	};
4860
4861	if (rtm->rtm_type == RTN_UNREACHABLE ||
4862	    rtm->rtm_type == RTN_BLACKHOLE ||
4863	    rtm->rtm_type == RTN_PROHIBIT ||
4864	    rtm->rtm_type == RTN_THROW)
4865		cfg->fc_flags |= RTF_REJECT;
4866
4867	if (rtm->rtm_type == RTN_LOCAL)
4868		cfg->fc_flags |= RTF_LOCAL;
4869
4870	if (rtm->rtm_flags & RTM_F_CLONED)
4871		cfg->fc_flags |= RTF_CACHE;
4872
4873	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4874
4875	if (tb[RTA_NH_ID]) {
4876		if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
4877		    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
4878			NL_SET_ERR_MSG(extack,
4879				       "Nexthop specification and nexthop id are mutually exclusive");
4880			goto errout;
4881		}
4882		cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
4883	}
4884
4885	if (tb[RTA_GATEWAY]) {
4886		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4887		cfg->fc_flags |= RTF_GATEWAY;
4888	}
4889	if (tb[RTA_VIA]) {
4890		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4891		goto errout;
4892	}
4893
4894	if (tb[RTA_DST]) {
4895		int plen = (rtm->rtm_dst_len + 7) >> 3;
4896
4897		if (nla_len(tb[RTA_DST]) < plen)
4898			goto errout;
4899
4900		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4901	}
4902
4903	if (tb[RTA_SRC]) {
4904		int plen = (rtm->rtm_src_len + 7) >> 3;
4905
4906		if (nla_len(tb[RTA_SRC]) < plen)
4907			goto errout;
4908
4909		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4910	}
4911
4912	if (tb[RTA_PREFSRC])
4913		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4914
4915	if (tb[RTA_OIF])
4916		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4917
4918	if (tb[RTA_PRIORITY])
4919		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4920
4921	if (tb[RTA_METRICS]) {
4922		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4923		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4924	}
4925
4926	if (tb[RTA_TABLE])
4927		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4928
4929	if (tb[RTA_MULTIPATH]) {
4930		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4931		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4932
4933		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4934						     cfg->fc_mp_len, extack);
4935		if (err < 0)
4936			goto errout;
4937	}
4938
4939	if (tb[RTA_PREF]) {
4940		pref = nla_get_u8(tb[RTA_PREF]);
4941		if (pref != ICMPV6_ROUTER_PREF_LOW &&
4942		    pref != ICMPV6_ROUTER_PREF_HIGH)
4943			pref = ICMPV6_ROUTER_PREF_MEDIUM;
4944		cfg->fc_flags |= RTF_PREF(pref);
4945	}
4946
4947	if (tb[RTA_ENCAP])
4948		cfg->fc_encap = tb[RTA_ENCAP];
4949
4950	if (tb[RTA_ENCAP_TYPE]) {
4951		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4952
4953		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4954		if (err < 0)
4955			goto errout;
4956	}
4957
4958	if (tb[RTA_EXPIRES]) {
4959		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4960
4961		if (addrconf_finite_timeout(timeout)) {
4962			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4963			cfg->fc_flags |= RTF_EXPIRES;
4964		}
4965	}
4966
4967	err = 0;
4968errout:
4969	return err;
4970}
4971
4972struct rt6_nh {
4973	struct fib6_info *fib6_info;
4974	struct fib6_config r_cfg;
4975	struct list_head next;
4976};
4977
4978static int ip6_route_info_append(struct net *net,
4979				 struct list_head *rt6_nh_list,
4980				 struct fib6_info *rt,
4981				 struct fib6_config *r_cfg)
4982{
4983	struct rt6_nh *nh;
4984	int err = -EEXIST;
4985
4986	list_for_each_entry(nh, rt6_nh_list, next) {
4987		/* check if fib6_info already exists */
4988		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4989			return err;
4990	}
4991
4992	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4993	if (!nh)
4994		return -ENOMEM;
4995	nh->fib6_info = rt;
4996	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4997	list_add_tail(&nh->next, rt6_nh_list);
4998
4999	return 0;
5000}
5001
5002static void ip6_route_mpath_notify(struct fib6_info *rt,
5003				   struct fib6_info *rt_last,
5004				   struct nl_info *info,
5005				   __u16 nlflags)
5006{
5007	/* if this is an APPEND route, then rt points to the first route
5008	 * inserted and rt_last points to last route inserted. Userspace
5009	 * wants a consistent dump of the route which starts at the first
5010	 * nexthop. Since sibling routes are always added at the end of
5011	 * the list, find the first sibling of the last route appended
5012	 */
 
 
5013	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
5014		rt = list_first_entry(&rt_last->fib6_siblings,
5015				      struct fib6_info,
5016				      fib6_siblings);
5017	}
5018
5019	if (rt)
5020		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5021}
5022
5023static int ip6_route_multipath_add(struct fib6_config *cfg,
5024				   struct netlink_ext_ack *extack)
5025{
5026	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
5027	struct nl_info *info = &cfg->fc_nlinfo;
5028	enum fib_event_type event_type;
5029	struct fib6_config r_cfg;
5030	struct rtnexthop *rtnh;
5031	struct fib6_info *rt;
5032	struct rt6_nh *err_nh;
5033	struct rt6_nh *nh, *nh_safe;
5034	__u16 nlflags;
5035	int remaining;
5036	int attrlen;
5037	int err = 1;
5038	int nhn = 0;
5039	int replace = (cfg->fc_nlinfo.nlh &&
5040		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
5041	LIST_HEAD(rt6_nh_list);
5042
5043	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
5044	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
5045		nlflags |= NLM_F_APPEND;
5046
5047	remaining = cfg->fc_mp_len;
5048	rtnh = (struct rtnexthop *)cfg->fc_mp;
5049
5050	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
5051	 * fib6_info structs per nexthop
5052	 */
5053	while (rtnh_ok(rtnh, remaining)) {
5054		memcpy(&r_cfg, cfg, sizeof(*cfg));
5055		if (rtnh->rtnh_ifindex)
5056			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5057
5058		attrlen = rtnh_attrlen(rtnh);
5059		if (attrlen > 0) {
5060			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5061
5062			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5063			if (nla) {
5064				r_cfg.fc_gateway = nla_get_in6_addr(nla);
 
 
 
 
5065				r_cfg.fc_flags |= RTF_GATEWAY;
5066			}
5067			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
 
 
 
 
5068			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
5069			if (nla)
5070				r_cfg.fc_encap_type = nla_get_u16(nla);
5071		}
5072
5073		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
5074		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
5075		if (IS_ERR(rt)) {
5076			err = PTR_ERR(rt);
5077			rt = NULL;
5078			goto cleanup;
5079		}
5080		if (!rt6_qualify_for_ecmp(rt)) {
5081			err = -EINVAL;
5082			NL_SET_ERR_MSG(extack,
5083				       "Device only routes can not be added for IPv6 using the multipath API.");
5084			fib6_info_release(rt);
5085			goto cleanup;
5086		}
5087
5088		rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
5089
5090		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
5091					    rt, &r_cfg);
5092		if (err) {
5093			fib6_info_release(rt);
5094			goto cleanup;
5095		}
5096
5097		rtnh = rtnh_next(rtnh, &remaining);
5098	}
5099
5100	if (list_empty(&rt6_nh_list)) {
5101		NL_SET_ERR_MSG(extack,
5102			       "Invalid nexthop configuration - no valid nexthops");
5103		return -EINVAL;
5104	}
5105
5106	/* for add and replace send one notification with all nexthops.
5107	 * Skip the notification in fib6_add_rt2node and send one with
5108	 * the full route when done
5109	 */
5110	info->skip_notify = 1;
5111
5112	/* For add and replace, send one notification with all nexthops. For
5113	 * append, send one notification with all appended nexthops.
5114	 */
5115	info->skip_notify_kernel = 1;
5116
5117	err_nh = NULL;
5118	list_for_each_entry(nh, &rt6_nh_list, next) {
5119		err = __ip6_ins_rt(nh->fib6_info, info, extack);
5120		fib6_info_release(nh->fib6_info);
5121
5122		if (!err) {
5123			/* save reference to last route successfully inserted */
5124			rt_last = nh->fib6_info;
5125
5126			/* save reference to first route for notification */
5127			if (!rt_notif)
5128				rt_notif = nh->fib6_info;
5129		}
5130
5131		/* nh->fib6_info is used or freed at this point, reset to NULL*/
5132		nh->fib6_info = NULL;
5133		if (err) {
5134			if (replace && nhn)
5135				NL_SET_ERR_MSG_MOD(extack,
5136						   "multipath route replace failed (check consistency of installed routes)");
5137			err_nh = nh;
5138			goto add_errout;
5139		}
 
 
 
 
 
 
5140
5141		/* Because each route is added like a single route we remove
5142		 * these flags after the first nexthop: if there is a collision,
5143		 * we have already failed to add the first nexthop:
5144		 * fib6_add_rt2node() has rejected it; when replacing, old
5145		 * nexthops have been replaced by first new, the rest should
5146		 * be added to it.
5147		 */
5148		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
5149						     NLM_F_REPLACE);
 
 
 
5150		nhn++;
5151	}
5152
5153	event_type = replace ? FIB_EVENT_ENTRY_REPLACE : FIB_EVENT_ENTRY_ADD;
5154	err = call_fib6_multipath_entry_notifiers(info->nl_net, event_type,
5155						  rt_notif, nhn - 1, extack);
5156	if (err) {
5157		/* Delete all the siblings that were just added */
5158		err_nh = NULL;
5159		goto add_errout;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5160	}
5161
5162	/* success ... tell user about new route */
5163	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5164	goto cleanup;
5165
5166add_errout:
5167	/* send notification for routes that were added so that
5168	 * the delete notifications sent by ip6_route_del are
5169	 * coherent
5170	 */
5171	if (rt_notif)
5172		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
5173
5174	/* Delete routes that were already added */
5175	list_for_each_entry(nh, &rt6_nh_list, next) {
5176		if (err_nh == nh)
5177			break;
5178		ip6_route_del(&nh->r_cfg, extack);
5179	}
5180
5181cleanup:
5182	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
5183		if (nh->fib6_info)
5184			fib6_info_release(nh->fib6_info);
5185		list_del(&nh->next);
5186		kfree(nh);
5187	}
5188
5189	return err;
5190}
5191
5192static int ip6_route_multipath_del(struct fib6_config *cfg,
5193				   struct netlink_ext_ack *extack)
5194{
5195	struct fib6_config r_cfg;
5196	struct rtnexthop *rtnh;
 
5197	int remaining;
5198	int attrlen;
5199	int err = 1, last_err = 0;
5200
5201	remaining = cfg->fc_mp_len;
5202	rtnh = (struct rtnexthop *)cfg->fc_mp;
5203
5204	/* Parse a Multipath Entry */
5205	while (rtnh_ok(rtnh, remaining)) {
5206		memcpy(&r_cfg, cfg, sizeof(*cfg));
5207		if (rtnh->rtnh_ifindex)
5208			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
5209
5210		attrlen = rtnh_attrlen(rtnh);
5211		if (attrlen > 0) {
5212			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
5213
5214			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
5215			if (nla) {
5216				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
 
 
 
 
 
 
5217				r_cfg.fc_flags |= RTF_GATEWAY;
5218			}
5219		}
5220		err = ip6_route_del(&r_cfg, extack);
5221		if (err)
5222			last_err = err;
5223
 
5224		rtnh = rtnh_next(rtnh, &remaining);
5225	}
5226
5227	return last_err;
5228}
5229
5230static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
5231			      struct netlink_ext_ack *extack)
5232{
5233	struct fib6_config cfg;
5234	int err;
5235
5236	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
5237	if (err < 0)
5238		return err;
5239
5240	if (cfg.fc_nh_id &&
5241	    !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
5242		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
5243		return -EINVAL;
5244	}
5245
5246	if (cfg.fc_mp)
5247		return ip6_route_multipath_del(&cfg, extack);
5248	else {
5249		cfg.fc_delete_all_nh = 1;
5250		return ip6_route_del(&cfg, extack);
5251	}
5252}
5253
5254static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
5255			      struct netlink_ext_ack *extack)
5256{
5257	struct fib6_config cfg;
5258	int err;
5259
5260	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
5261	if (err < 0)
5262		return err;
5263
5264	if (cfg.fc_metric == 0)
5265		cfg.fc_metric = IP6_RT_PRIO_USER;
5266
5267	if (cfg.fc_mp)
5268		return ip6_route_multipath_add(&cfg, extack);
5269	else
5270		return ip6_route_add(&cfg, GFP_KERNEL, extack);
5271}
5272
5273/* add the overhead of this fib6_nh to nexthop_len */
5274static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
5275{
5276	int *nexthop_len = arg;
5277
5278	*nexthop_len += nla_total_size(0)	 /* RTA_MULTIPATH */
5279		     + NLA_ALIGN(sizeof(struct rtnexthop))
5280		     + nla_total_size(16); /* RTA_GATEWAY */
5281
5282	if (nh->fib_nh_lws) {
5283		/* RTA_ENCAP_TYPE */
5284		*nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5285		/* RTA_ENCAP */
5286		*nexthop_len += nla_total_size(2);
5287	}
5288
5289	return 0;
5290}
5291
5292static size_t rt6_nlmsg_size(struct fib6_info *f6i)
5293{
5294	int nexthop_len;
5295
5296	if (f6i->nh) {
5297		nexthop_len = nla_total_size(4); /* RTA_NH_ID */
5298		nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
5299					 &nexthop_len);
5300	} else {
5301		struct fib6_nh *nh = f6i->fib6_nh;
 
5302
5303		nexthop_len = 0;
5304		if (f6i->fib6_nsiblings) {
5305			nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
5306				    + NLA_ALIGN(sizeof(struct rtnexthop))
5307				    + nla_total_size(16) /* RTA_GATEWAY */
5308				    + lwtunnel_get_encap_size(nh->fib_nh_lws);
 
 
 
 
5309
5310			nexthop_len *= f6i->fib6_nsiblings;
5311		}
5312		nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
5313	}
5314
5315	return NLMSG_ALIGN(sizeof(struct rtmsg))
5316	       + nla_total_size(16) /* RTA_SRC */
5317	       + nla_total_size(16) /* RTA_DST */
5318	       + nla_total_size(16) /* RTA_GATEWAY */
5319	       + nla_total_size(16) /* RTA_PREFSRC */
5320	       + nla_total_size(4) /* RTA_TABLE */
5321	       + nla_total_size(4) /* RTA_IIF */
5322	       + nla_total_size(4) /* RTA_OIF */
5323	       + nla_total_size(4) /* RTA_PRIORITY */
5324	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
5325	       + nla_total_size(sizeof(struct rta_cacheinfo))
5326	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
5327	       + nla_total_size(1) /* RTA_PREF */
5328	       + nexthop_len;
5329}
5330
5331static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
5332				 unsigned char *flags)
5333{
5334	if (nexthop_is_multipath(nh)) {
5335		struct nlattr *mp;
5336
5337		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
5338		if (!mp)
5339			goto nla_put_failure;
5340
5341		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
5342			goto nla_put_failure;
5343
5344		nla_nest_end(skb, mp);
5345	} else {
5346		struct fib6_nh *fib6_nh;
5347
5348		fib6_nh = nexthop_fib6_nh(nh);
5349		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
5350				     flags, false) < 0)
5351			goto nla_put_failure;
5352	}
5353
5354	return 0;
5355
5356nla_put_failure:
5357	return -EMSGSIZE;
5358}
5359
/*
 * Build one route netlink message of type @type in @skb.
 *
 * @rt:    FIB entry supplying table, type, protocol, metric, prefsrc and,
 *         when @dst is NULL, the destination/source keys and flags.
 * @dst:   optional dst entry; when set it is treated as a struct rt6_info
 *         and supplies the keys, flags, gateway, output device and metrics.
 * @dest:  optional address reported as RTA_DST with a prefix length of 128.
 * @src:   optional address reported as RTA_SRC with a prefix length of 128
 *         (only with CONFIG_IPV6_SUBTREES).
 * @iif:   input interface index, 0 if none.
 * @portid, @seq, @flags: passed to nlmsg_put() for the message header.
 *
 * Returns 0 on success.  On any attribute failure the partially built
 * message is cancelled and -EMSGSIZE is returned.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	unsigned char nh_flags = 0;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* Keys and flags come from the dst when one was given, else from @rt */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	/* Ids of 256 and above are reported as RT_TABLE_COMPAT in the header;
	 * the full id is always carried in RTA_TABLE.
	 */
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	/* An explicit @dest overrides the route key and is reported as /128 */
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			/* 0 returns immediately; a positive result falls
			 * through to the rest of the attributes.
			 */
			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		/* No input interface: report the source address selected
		 * for @dest, if one can be determined.
		 */
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* Preferred source configured on the route itself */
	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		/* dst given: gateway and output device come from the dst */
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		/* the route's own nexthop first, then every sibling */
		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
				    rt->fib6_nh->fib_nh_weight, AF_INET6) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
					    sibling->fib6_nh->fib_nh_weight,
					    AF_INET6) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else if (rt->nh) {
		/* route bound to a nexthop object */
		if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
			goto nla_put_failure;

		if (nexthop_is_blackhole(rt->nh))
			rtm->rtm_type = RTN_BLACKHOLE;

		if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	} else {
		/* plain single-nexthop route */
		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
				     &nh_flags, false) < 0)
			goto nla_put_failure;

		rtm->rtm_flags |= nh_flags;
	}

	/* Expiry is reported relative to the current jiffies value */
	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
5528
5529static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
5530{
5531	const struct net_device *dev = arg;
5532
5533	if (nh->fib_nh_dev == dev)
5534		return 1;
5535
5536	return 0;
5537}
5538
5539static bool fib6_info_uses_dev(const struct fib6_info *f6i,
5540			       const struct net_device *dev)
5541{
5542	if (f6i->nh) {
5543		struct net_device *_dev = (struct net_device *)dev;
5544
5545		return !!nexthop_for_each_fib6_nh(f6i->nh,
5546						  fib6_info_nh_uses_dev,
5547						  _dev);
5548	}
5549
5550	if (f6i->fib6_nh->fib_nh_dev == dev)
5551		return true;
5552
5553	if (f6i->fib6_nsiblings) {
5554		struct fib6_info *sibling, *next_sibling;
5555
5556		list_for_each_entry_safe(sibling, next_sibling,
5557					 &f6i->fib6_siblings, fib6_siblings) {
5558			if (sibling->fib6_nh->fib_nh_dev == dev)
5559				return true;
5560		}
5561	}
5562
5563	return false;
5564}
5565
/* State shared by rt6_dump_route() and rt6_nh_dump_exceptions() while the
 * exception entries of one route are dumped.
 */
struct fib6_nh_exception_dump_walker {
	struct rt6_rtnl_dump_arg *dump;	/* dump context (net, skb, cb) */
	struct fib6_info *rt;		/* route the exceptions belong to */
	unsigned int flags;		/* netlink flags given to rt6_fill_node() */
	unsigned int skip;		/* entries still to be skipped */
	unsigned int count;		/* entries handled (dumped or expired) */
};
5573
5574static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
5575{
5576	struct fib6_nh_exception_dump_walker *w = arg;
5577	struct rt6_rtnl_dump_arg *dump = w->dump;
5578	struct rt6_exception_bucket *bucket;
5579	struct rt6_exception *rt6_ex;
5580	int i, err;
5581
5582	bucket = fib6_nh_get_excptn_bucket(nh, NULL);
5583	if (!bucket)
5584		return 0;
5585
5586	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
5587		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
5588			if (w->skip) {
5589				w->skip--;
5590				continue;
5591			}
5592
5593			/* Expiration of entries doesn't bump sernum, insertion
5594			 * does. Removal is triggered by insertion, so we can
5595			 * rely on the fact that if entries change between two
5596			 * partial dumps, this node is scanned again completely,
5597			 * see rt6_insert_exception() and fib6_dump_table().
5598			 *
5599			 * Count expired entries we go through as handled
5600			 * entries that we'll skip next time, in case of partial
5601			 * node dump. Otherwise, if entries expire meanwhile,
5602			 * we'll skip the wrong amount.
5603			 */
5604			if (rt6_check_expired(rt6_ex->rt6i)) {
5605				w->count++;
5606				continue;
5607			}
5608
5609			err = rt6_fill_node(dump->net, dump->skb, w->rt,
5610					    &rt6_ex->rt6i->dst, NULL, NULL, 0,
5611					    RTM_NEWROUTE,
5612					    NETLINK_CB(dump->cb->skb).portid,
5613					    dump->cb->nlh->nlmsg_seq, w->flags);
5614			if (err)
5615				return err;
5616
5617			w->count++;
5618		}
5619		bucket++;
5620	}
5621
5622	return 0;
5623}
5624
5625/* Return -1 if done with node, number of handled routes on partial dump */
5626int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
5627{
5628	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
5629	struct fib_dump_filter *filter = &arg->filter;
5630	unsigned int flags = NLM_F_MULTI;
5631	struct net *net = arg->net;
5632	int count = 0;
5633
5634	if (rt == net->ipv6.fib6_null_entry)
5635		return -1;
5636
5637	if ((filter->flags & RTM_F_PREFIX) &&
5638	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
5639		/* success since this is not a prefix route */
5640		return -1;
5641	}
5642	if (filter->filter_set &&
5643	    ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
5644	     (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
5645	     (filter->protocol && rt->fib6_protocol != filter->protocol))) {
5646		return -1;
5647	}
5648
5649	if (filter->filter_set ||
5650	    !filter->dump_routes || !filter->dump_exceptions) {
5651		flags |= NLM_F_DUMP_FILTERED;
5652	}
5653
5654	if (filter->dump_routes) {
5655		if (skip) {
5656			skip--;
5657		} else {
5658			if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
5659					  0, RTM_NEWROUTE,
5660					  NETLINK_CB(arg->cb->skb).portid,
5661					  arg->cb->nlh->nlmsg_seq, flags)) {
5662				return 0;
5663			}
5664			count++;
5665		}
5666	}
5667
5668	if (filter->dump_exceptions) {
5669		struct fib6_nh_exception_dump_walker w = { .dump = arg,
5670							   .rt = rt,
5671							   .flags = flags,
5672							   .skip = skip,
5673							   .count = 0 };
5674		int err;
5675
5676		rcu_read_lock();
5677		if (rt->nh) {
5678			err = nexthop_for_each_fib6_nh(rt->nh,
5679						       rt6_nh_dump_exceptions,
5680						       &w);
5681		} else {
5682			err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
5683		}
5684		rcu_read_unlock();
5685
5686		if (err)
5687			return count += w.count;
5688	}
5689
5690	return -1;
5691}
5692
5693static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
5694					const struct nlmsghdr *nlh,
5695					struct nlattr **tb,
5696					struct netlink_ext_ack *extack)
5697{
5698	struct rtmsg *rtm;
5699	int i, err;
5700
5701	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
5702		NL_SET_ERR_MSG_MOD(extack,
5703				   "Invalid header for get route request");
5704		return -EINVAL;
5705	}
5706
5707	if (!netlink_strict_get_check(skb))
5708		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
5709					      rtm_ipv6_policy, extack);
5710
5711	rtm = nlmsg_data(nlh);
5712	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
5713	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
5714	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
5715	    rtm->rtm_type) {
5716		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
5717		return -EINVAL;
5718	}
5719	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
5720		NL_SET_ERR_MSG_MOD(extack,
5721				   "Invalid flags for get route request");
5722		return -EINVAL;
5723	}
5724
5725	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
5726					    rtm_ipv6_policy, extack);
5727	if (err)
5728		return err;
5729
5730	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
5731	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
5732		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
5733		return -EINVAL;
5734	}
5735
5736	for (i = 0; i <= RTA_MAX; i++) {
5737		if (!tb[i])
5738			continue;
5739
5740		switch (i) {
5741		case RTA_SRC:
5742		case RTA_DST:
5743		case RTA_IIF:
5744		case RTA_OIF:
5745		case RTA_MARK:
5746		case RTA_UID:
5747		case RTA_SPORT:
5748		case RTA_DPORT:
5749		case RTA_IP_PROTO:
5750			break;
5751		default:
5752			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
5753			return -EINVAL;
5754		}
5755	}
5756
5757	return 0;
5758}
5759
/*
 * RTM_GETROUTE handler: perform a route lookup for the flow described by
 * the request and unicast the result back to the requester.
 *
 * The flow is assembled from the optional RTA_SRC, RTA_DST, RTA_IIF,
 * RTA_OIF, RTA_MARK, RTA_UID, RTA_SPORT, RTA_DPORT and RTA_IP_PROTO
 * attributes.  With RTA_IIF an input lookup is done on that device,
 * otherwise an output lookup.  With RTM_F_FIB_MATCH the reply describes the
 * FIB entry the result was derived from rather than the dst itself.
 *
 * Returns the rtnl_unicast() result on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6 = {};
	bool fibmatch;

	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		goto errout;

	/* default error for the attribute length checks below */
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	/* Without RTA_UID: input lookups use INVALID_UID, output lookups
	 * the caller's uid.
	 */
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
		if (err)
			goto errout;
	}

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		/* dev is only looked up and used inside this RCU section */
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	/* From here on the lookup result must be released on every error
	 * path, until it is handed over to the reply skb.
	 */
	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Attach the dst to the skb; the error paths below free only the skb */
	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	if (from) {
		if (fibmatch)
			err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
					    iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
		else
			err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
					    &fl6.saddr, iif, RTM_NEWROUTE,
					    NETLINK_CB(in_skb).portid,
					    nlh->nlmsg_seq, 0);
	} else {
		/* the result has no originating FIB entry */
		err = -ENETUNREACH;
	}
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
5903
5904void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5905		     unsigned int nlm_flags)
5906{
5907	struct sk_buff *skb;
5908	struct net *net = info->nl_net;
5909	u32 seq;
5910	int err;
5911
5912	err = -ENOBUFS;
5913	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5914
5915	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5916	if (!skb)
5917		goto errout;
5918
5919	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5920			    event, info->portid, seq, nlm_flags);
5921	if (err < 0) {
5922		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5923		WARN_ON(err == -EMSGSIZE);
5924		kfree_skb(skb);
5925		goto errout;
5926	}
5927	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5928		    info->nlh, gfp_any());
5929	return;
5930errout:
5931	if (err < 0)
5932		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
5933}
5934
5935void fib6_rt_update(struct net *net, struct fib6_info *rt,
5936		    struct nl_info *info)
5937{
5938	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5939	struct sk_buff *skb;
5940	int err = -ENOBUFS;
5941
5942	/* call_fib6_entry_notifiers will be removed when in-kernel notifier
5943	 * is implemented and supported for nexthop objects
5944	 */
5945	call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, rt, NULL);
5946
5947	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5948	if (!skb)
5949		goto errout;
5950
5951	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
5952			    RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
5953	if (err < 0) {
5954		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
5955		WARN_ON(err == -EMSGSIZE);
5956		kfree_skb(skb);
5957		goto errout;
5958	}
5959	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5960		    info->nlh, gfp_any());
5961	return;
5962errout:
5963	if (err < 0)
5964		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5965}
 
5966
5967static int ip6_route_dev_notify(struct notifier_block *this,
5968				unsigned long event, void *ptr)
5969{
5970	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
5971	struct net *net = dev_net(dev);
5972
5973	if (!(dev->flags & IFF_LOOPBACK))
5974		return NOTIFY_OK;
5975
5976	if (event == NETDEV_REGISTER) {
5977		net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
5978		net->ipv6.ip6_null_entry->dst.dev = dev;
5979		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
5980#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5981		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
5982		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
5983		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
5984		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
5985#endif
5986	 } else if (event == NETDEV_UNREGISTER &&
5987		    dev->reg_state != NETREG_UNREGISTERED) {
5988		/* NETDEV_UNREGISTER could be fired for multiple times by
5989		 * netdev_wait_allrefs(). Make sure we only call this once.
5990		 */
5991		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
5992#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5993		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
5994		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
5995#endif
5996	}
5997
5998	return NOTIFY_OK;
5999}
6000
6001/*
6002 *	/proc
6003 */
6004
6005#ifdef CONFIG_PROC_FS
6006static int rt6_stats_seq_show(struct seq_file *seq, void *v)
6007{
6008	struct net *net = (struct net *)seq->private;
6009	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
6010		   net->ipv6.rt6_stats->fib_nodes,
6011		   net->ipv6.rt6_stats->fib_route_nodes,
6012		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
6013		   net->ipv6.rt6_stats->fib_rt_entries,
6014		   net->ipv6.rt6_stats->fib_rt_cache,
6015		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
6016		   net->ipv6.rt6_stats->fib_discarded_routes);
6017
6018	return 0;
6019}
6020#endif	/* CONFIG_PROC_FS */
6021
6022#ifdef CONFIG_SYSCTL
6023
6024static
6025int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
6026			      void __user *buffer, size_t *lenp, loff_t *ppos)
6027{
6028	struct net *net;
6029	int delay;
6030	int ret;
6031	if (!write)
6032		return -EINVAL;
6033
6034	net = (struct net *)ctl->extra1;
6035	delay = net->ipv6.sysctl.flush_delay;
6036	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
6037	if (ret)
6038		return ret;
6039
 
 
6040	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
6041	return 0;
6042}
6043
6044static struct ctl_table ipv6_route_table_template[] = {
6045	{
6046		.procname	=	"flush",
6047		.data		=	&init_net.ipv6.sysctl.flush_delay,
6048		.maxlen		=	sizeof(int),
6049		.mode		=	0200,
6050		.proc_handler	=	ipv6_sysctl_rtcache_flush
6051	},
6052	{
6053		.procname	=	"gc_thresh",
6054		.data		=	&ip6_dst_ops_template.gc_thresh,
6055		.maxlen		=	sizeof(int),
6056		.mode		=	0644,
6057		.proc_handler	=	proc_dointvec,
6058	},
6059	{
6060		.procname	=	"max_size",
6061		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
6062		.maxlen		=	sizeof(int),
6063		.mode		=	0644,
6064		.proc_handler	=	proc_dointvec,
6065	},
6066	{
6067		.procname	=	"gc_min_interval",
6068		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
6069		.maxlen		=	sizeof(int),
6070		.mode		=	0644,
6071		.proc_handler	=	proc_dointvec_jiffies,
6072	},
6073	{
6074		.procname	=	"gc_timeout",
6075		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
6076		.maxlen		=	sizeof(int),
6077		.mode		=	0644,
6078		.proc_handler	=	proc_dointvec_jiffies,
6079	},
6080	{
6081		.procname	=	"gc_interval",
6082		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
6083		.maxlen		=	sizeof(int),
6084		.mode		=	0644,
6085		.proc_handler	=	proc_dointvec_jiffies,
6086	},
6087	{
6088		.procname	=	"gc_elasticity",
6089		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
6090		.maxlen		=	sizeof(int),
6091		.mode		=	0644,
6092		.proc_handler	=	proc_dointvec,
6093	},
6094	{
6095		.procname	=	"mtu_expires",
6096		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
6097		.maxlen		=	sizeof(int),
6098		.mode		=	0644,
6099		.proc_handler	=	proc_dointvec_jiffies,
6100	},
6101	{
6102		.procname	=	"min_adv_mss",
6103		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
6104		.maxlen		=	sizeof(int),
6105		.mode		=	0644,
6106		.proc_handler	=	proc_dointvec,
6107	},
6108	{
6109		.procname	=	"gc_min_interval_ms",
6110		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
6111		.maxlen		=	sizeof(int),
6112		.mode		=	0644,
6113		.proc_handler	=	proc_dointvec_ms_jiffies,
6114	},
6115	{
6116		.procname	=	"skip_notify_on_dev_down",
6117		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
6118		.maxlen		=	sizeof(int),
6119		.mode		=	0644,
6120		.proc_handler	=	proc_dointvec_minmax,
6121		.extra1		=	SYSCTL_ZERO,
6122		.extra2		=	SYSCTL_ONE,
6123	},
6124	{ }
6125};
6126
6127struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
6128{
6129	struct ctl_table *table;
6130
6131	table = kmemdup(ipv6_route_table_template,
6132			sizeof(ipv6_route_table_template),
6133			GFP_KERNEL);
6134
6135	if (table) {
6136		table[0].data = &net->ipv6.sysctl.flush_delay;
6137		table[0].extra1 = net;
6138		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
6139		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
 
6140		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6141		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
6142		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
6143		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
6144		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
6145		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
6146		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
6147		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
6148
6149		/* Don't export sysctls to unprivileged users */
6150		if (net->user_ns != &init_user_ns)
6151			table[0].procname = NULL;
6152	}
6153
6154	return table;
6155}
 
 
 
 
 
 
 
 
 
6156#endif
6157
/*
 * Per-namespace setup of the IPv6 routing core.
 *
 * Copies the dst_ops template, allocates the namespace's special entries
 * (fib6 null entry, null dst entry and, with CONFIG_IPV6_MULTIPLE_TABLES,
 * the prohibit and blackhole dst entries) from their templates, and sets
 * the default values of the route sysctls.
 *
 * Returns 0 on success or -ENOMEM; on failure everything allocated so far
 * is released again.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;
	memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
	       sizeof(*net->ipv6.fib6_null_entry));

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	/* the copy must use this namespace's dst_ops, not the template's */
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
	INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
#endif

	/* sysctl defaults; time-based values are expressed in jiffies */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

	/* Error unwinding: each label releases one allocation and falls
	 * through to the next, in reverse order of setup; ret is still
	 * -ENOMEM when these are reached.
	 */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
6236
6237static void __net_exit ip6_route_net_exit(struct net *net)
6238{
6239	kfree(net->ipv6.fib6_null_entry);
6240	kfree(net->ipv6.ip6_null_entry);
6241#ifdef CONFIG_IPV6_MULTIPLE_TABLES
6242	kfree(net->ipv6.ip6_prohibit_entry);
6243	kfree(net->ipv6.ip6_blk_hole_entry);
6244#endif
6245	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
6246}
6247
6248static int __net_init ip6_route_net_init_late(struct net *net)
6249{
6250#ifdef CONFIG_PROC_FS
6251	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
6252			sizeof(struct ipv6_route_iter));
6253	proc_create_net_single("rt6_stats", 0444, net->proc_net,
6254			rt6_stats_seq_show, NULL);
 
 
 
 
 
 
6255#endif
6256	return 0;
6257}
6258
6259static void __net_exit ip6_route_net_exit_late(struct net *net)
6260{
6261#ifdef CONFIG_PROC_FS
6262	remove_proc_entry("ipv6_route", net->proc_net);
6263	remove_proc_entry("rt6_stats", net->proc_net);
6264#endif
6265}
6266
6267static struct pernet_operations ip6_route_net_ops = {
6268	.init = ip6_route_net_init,
6269	.exit = ip6_route_net_exit,
6270};
6271
6272static int __net_init ipv6_inetpeer_init(struct net *net)
6273{
6274	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
6275
6276	if (!bp)
6277		return -ENOMEM;
6278	inet_peer_base_init(bp);
6279	net->ipv6.peers = bp;
6280	return 0;
6281}
6282
6283static void __net_exit ipv6_inetpeer_exit(struct net *net)
6284{
6285	struct inet_peer_base *bp = net->ipv6.peers;
6286
6287	net->ipv6.peers = NULL;
6288	inetpeer_invalidate_tree(bp);
6289	kfree(bp);
6290}
6291
6292static struct pernet_operations ipv6_inetpeer_ops = {
6293	.init	=	ipv6_inetpeer_init,
6294	.exit	=	ipv6_inetpeer_exit,
6295};
6296
6297static struct pernet_operations ip6_route_net_late_ops = {
6298	.init = ip6_route_net_init_late,
6299	.exit = ip6_route_net_exit_late,
6300};
6301
6302static struct notifier_block ip6_route_dev_notifier = {
6303	.notifier_call = ip6_route_dev_notify,
6304	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
6305};
6306
6307void __init ip6_route_init_special_entries(void)
6308{
6309	/* Registering of the loopback is done before this portion of code,
6310	 * the loopback reference in rt6_info will not be taken, do it
6311	 * manually for init_net */
6312	init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
6313	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
6314	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6315  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
6316	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
6317	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6318	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
6319	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
6320  #endif
6321}
6322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * Boot-time initialisation of the IPv6 routing subsystem.
 *
 * Creates the dst slab cache, registers the per-namespace operations,
 * initialises the FIB, xfrm6 and policy rules, registers the
 * RTM_NEWROUTE/RTM_DELROUTE/RTM_GETROUTE handlers and the netdevice
 * notifier, and sets up the per-cpu uncached route lists.
 *
 * Returns 0 on success or a negative errno; on failure every step that
 * already succeeded is undone in reverse order.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts are allocated from the same slab cache */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* Error unwinding: each label undoes one setup step and falls
	 * through to the next, ending with the jump back to "out".
	 */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
6414
/*
 * Tear down what ip6_route_init() set up: the netdevice notifier, the
 * late per-namespace ops, policy rules, xfrm6, the FIB garbage collector,
 * the remaining per-namespace ops, the blackhole dst counter and finally
 * the dst slab cache.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	/* the slab cache goes last, after everything allocating from it */
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}