Linux Audio

Check our new training course

Loading...
v3.1
   1/*
   2 *	Linux INET6 implementation
   3 *	FIB front-end.
   4 *
   5 *	Authors:
   6 *	Pedro Roque		<roque@di.fc.ul.pt>
   7 *
   8 *	This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 */
  13
  14/*	Changes:
  15 *
  16 *	YOSHIFUJI Hideaki @USAGI
  17 *		reworked default router selection.
  18 *		- respect outgoing interface
  19 *		- select from (probably) reachable routers (i.e.
  20 *		routers in REACHABLE, STALE, DELAY or PROBE states).
  21 *		- always select the same router if it is (probably)
  22 *		reachable.  otherwise, round-robin the list.
  23 *	Ville Nuorvala
  24 *		Fixed routing subtrees.
  25 */
  26
  27#include <linux/capability.h>
  28#include <linux/errno.h>
  29#include <linux/types.h>
  30#include <linux/times.h>
  31#include <linux/socket.h>
  32#include <linux/sockios.h>
  33#include <linux/net.h>
  34#include <linux/route.h>
  35#include <linux/netdevice.h>
  36#include <linux/in6.h>
  37#include <linux/mroute6.h>
  38#include <linux/init.h>
  39#include <linux/if_arp.h>
  40#include <linux/proc_fs.h>
  41#include <linux/seq_file.h>
  42#include <linux/nsproxy.h>
  43#include <linux/slab.h>
  44#include <net/net_namespace.h>
  45#include <net/snmp.h>
  46#include <net/ipv6.h>
  47#include <net/ip6_fib.h>
  48#include <net/ip6_route.h>
  49#include <net/ndisc.h>
  50#include <net/addrconf.h>
  51#include <net/tcp.h>
  52#include <linux/rtnetlink.h>
  53#include <net/dst.h>
  54#include <net/xfrm.h>
  55#include <net/netevent.h>
  56#include <net/netlink.h>
  57
  58#include <asm/uaccess.h>
  59
  60#ifdef CONFIG_SYSCTL
  61#include <linux/sysctl.h>
  62#endif
  63
  64/* Set to 3 to get tracing. */
  65#define RT6_DEBUG 2
  66
  67#if RT6_DEBUG >= 3
  68#define RDBG(x) printk x
  69#define RT6_TRACE(x...) printk(KERN_DEBUG x)
  70#else
  71#define RDBG(x)
  72#define RT6_TRACE(x...) do { ; } while (0)
  73#endif
  74
  75static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
  76				    const struct in6_addr *dest);
  77static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
  78static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
  79static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
  80static struct dst_entry *ip6_negative_advice(struct dst_entry *);
  81static void		ip6_dst_destroy(struct dst_entry *);
  82static void		ip6_dst_ifdown(struct dst_entry *,
  83				       struct net_device *dev, int how);
  84static int		 ip6_dst_gc(struct dst_ops *ops);
  85
  86static int		ip6_pkt_discard(struct sk_buff *skb);
  87static int		ip6_pkt_discard_out(struct sk_buff *skb);
  88static void		ip6_link_failure(struct sk_buff *skb);
  89static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  90
  91#ifdef CONFIG_IPV6_ROUTE_INFO
  92static struct rt6_info *rt6_add_route_info(struct net *net,
  93					   const struct in6_addr *prefix, int prefixlen,
  94					   const struct in6_addr *gwaddr, int ifindex,
  95					   unsigned pref);
  96static struct rt6_info *rt6_get_route_info(struct net *net,
  97					   const struct in6_addr *prefix, int prefixlen,
  98					   const struct in6_addr *gwaddr, int ifindex);
  99#endif
 100
 101static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 102{
 103	struct rt6_info *rt = (struct rt6_info *) dst;
 104	struct inet_peer *peer;
 105	u32 *p = NULL;
 106
 107	if (!(rt->dst.flags & DST_HOST))
 108		return NULL;
 109
 110	if (!rt->rt6i_peer)
 111		rt6_bind_peer(rt, 1);
 112
 113	peer = rt->rt6i_peer;
 114	if (peer) {
 115		u32 *old_p = __DST_METRICS_PTR(old);
 116		unsigned long prev, new;
 117
 118		p = peer->metrics;
 119		if (inet_metrics_new(peer))
 120			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 121
 122		new = (unsigned long) p;
 123		prev = cmpxchg(&dst->_metrics, old, new);
 124
 125		if (prev != old) {
 126			p = __DST_METRICS_PTR(prev);
 127			if (prev & DST_METRICS_READ_ONLY)
 128				p = NULL;
 129		}
 130	}
 131	return p;
 132}
 133
 134static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
 135{
 136	return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
 137}
 138
 139static struct dst_ops ip6_dst_ops_template = {
 140	.family			=	AF_INET6,
 141	.protocol		=	cpu_to_be16(ETH_P_IPV6),
 142	.gc			=	ip6_dst_gc,
 143	.gc_thresh		=	1024,
 144	.check			=	ip6_dst_check,
 145	.default_advmss		=	ip6_default_advmss,
 146	.default_mtu		=	ip6_default_mtu,
 147	.cow_metrics		=	ipv6_cow_metrics,
 148	.destroy		=	ip6_dst_destroy,
 149	.ifdown			=	ip6_dst_ifdown,
 150	.negative_advice	=	ip6_negative_advice,
 151	.link_failure		=	ip6_link_failure,
 152	.update_pmtu		=	ip6_rt_update_pmtu,
 153	.local_out		=	__ip6_local_out,
 154	.neigh_lookup		=	ip6_neigh_lookup,
 155};
 156
 157static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
 158{
 159	return 0;
 160}
 161
 162static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 163{
 164}
 165
 166static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
 167					 unsigned long old)
 168{
 169	return NULL;
 170}
 171
 172static struct dst_ops ip6_dst_blackhole_ops = {
 173	.family			=	AF_INET6,
 174	.protocol		=	cpu_to_be16(ETH_P_IPV6),
 175	.destroy		=	ip6_dst_destroy,
 176	.check			=	ip6_dst_check,
 177	.default_mtu		=	ip6_blackhole_default_mtu,
 178	.default_advmss		=	ip6_default_advmss,
 179	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
 180	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
 181	.neigh_lookup		=	ip6_neigh_lookup,
 182};
 183
 184static const u32 ip6_template_metrics[RTAX_MAX] = {
 185	[RTAX_HOPLIMIT - 1] = 255,
 186};
 187
 188static struct rt6_info ip6_null_entry_template = {
 189	.dst = {
 190		.__refcnt	= ATOMIC_INIT(1),
 191		.__use		= 1,
 192		.obsolete	= -1,
 193		.error		= -ENETUNREACH,
 194		.input		= ip6_pkt_discard,
 195		.output		= ip6_pkt_discard_out,
 196	},
 197	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 198	.rt6i_protocol  = RTPROT_KERNEL,
 199	.rt6i_metric	= ~(u32) 0,
 200	.rt6i_ref	= ATOMIC_INIT(1),
 201};
 202
 203#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 204
 205static int ip6_pkt_prohibit(struct sk_buff *skb);
 206static int ip6_pkt_prohibit_out(struct sk_buff *skb);
 207
 208static struct rt6_info ip6_prohibit_entry_template = {
 209	.dst = {
 210		.__refcnt	= ATOMIC_INIT(1),
 211		.__use		= 1,
 212		.obsolete	= -1,
 213		.error		= -EACCES,
 214		.input		= ip6_pkt_prohibit,
 215		.output		= ip6_pkt_prohibit_out,
 216	},
 217	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 218	.rt6i_protocol  = RTPROT_KERNEL,
 219	.rt6i_metric	= ~(u32) 0,
 220	.rt6i_ref	= ATOMIC_INIT(1),
 221};
 222
 223static struct rt6_info ip6_blk_hole_entry_template = {
 224	.dst = {
 225		.__refcnt	= ATOMIC_INIT(1),
 226		.__use		= 1,
 227		.obsolete	= -1,
 228		.error		= -EINVAL,
 229		.input		= dst_discard,
 230		.output		= dst_discard,
 231	},
 232	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 233	.rt6i_protocol  = RTPROT_KERNEL,
 234	.rt6i_metric	= ~(u32) 0,
 235	.rt6i_ref	= ATOMIC_INIT(1),
 236};
 237
 238#endif
 239
 240/* allocate dst with ip6_dst_ops */
 241static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
 242					     struct net_device *dev,
 243					     int flags)
 244{
 245	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
 246
 247	if (rt != NULL)
 248		memset(&rt->rt6i_table, 0,
 249			sizeof(*rt) - sizeof(struct dst_entry));
 250
 251	return rt;
 252}
 253
 254static void ip6_dst_destroy(struct dst_entry *dst)
 255{
 256	struct rt6_info *rt = (struct rt6_info *)dst;
 257	struct inet6_dev *idev = rt->rt6i_idev;
 258	struct inet_peer *peer = rt->rt6i_peer;
 259
 260	if (!(rt->dst.flags & DST_HOST))
 261		dst_destroy_metrics_generic(dst);
 262
 263	if (idev != NULL) {
 264		rt->rt6i_idev = NULL;
 265		in6_dev_put(idev);
 266	}
 267	if (peer) {
 268		rt->rt6i_peer = NULL;
 269		inet_putpeer(peer);
 270	}
 271}
 272
 273static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
 274
 275static u32 rt6_peer_genid(void)
 276{
 277	return atomic_read(&__rt6_peer_genid);
 278}
 279
 280void rt6_bind_peer(struct rt6_info *rt, int create)
 281{
 282	struct inet_peer *peer;
 283
 284	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
 285	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
 286		inet_putpeer(peer);
 287	else
 288		rt->rt6i_peer_genid = rt6_peer_genid();
 289}
 290
 291static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 292			   int how)
 293{
 294	struct rt6_info *rt = (struct rt6_info *)dst;
 295	struct inet6_dev *idev = rt->rt6i_idev;
 296	struct net_device *loopback_dev =
 297		dev_net(dev)->loopback_dev;
 298
 299	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
 300		struct inet6_dev *loopback_idev =
 301			in6_dev_get(loopback_dev);
 302		if (loopback_idev != NULL) {
 303			rt->rt6i_idev = loopback_idev;
 304			in6_dev_put(idev);
 305		}
 306	}
 307}
 308
 309static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 310{
 311	return (rt->rt6i_flags & RTF_EXPIRES) &&
 312		time_after(jiffies, rt->rt6i_expires);
 313}
 314
 315static inline int rt6_need_strict(const struct in6_addr *daddr)
 316{
 317	return ipv6_addr_type(daddr) &
 318		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 319}
 320
 321/*
 322 *	Route lookup. Any table->tb6_lock is implied.
 323 */
 324
  /*
   * rt6_device_match - pick the route from a leaf chain that fits the
   * requested interface or source address.
   *
   * @net:   namespace, used for ipv6_chk_addr() and for ip6_null_entry
   * @rt:    head of the chain (linked through dst.rt6_next)
   * @saddr: source address; only consulted when @oif is 0
   * @oif:   requested interface index, 0 for none
   * @flags: RT6_LOOKUP_F_* bits; only RT6_LOOKUP_F_IFACE is examined here
   *
   * With neither @oif nor a specific @saddr, @rt is returned unchanged.
   * With @oif, the first route whose device has that ifindex wins; routes
   * on loopback devices are remembered as a fallback.  Without @oif, the
   * first route whose device passes ipv6_chk_addr() for @saddr wins.
   * If nothing matched: the loopback fallback, else ip6_null_entry when
   * RT6_LOOKUP_F_IFACE is set with @oif, else @rt.
   */
  325static inline struct rt6_info *rt6_device_match(struct net *net,
  326						    struct rt6_info *rt,
  327						    const struct in6_addr *saddr,
  328						    int oif,
  329						    int flags)
  330{
  331	struct rt6_info *local = NULL;
  332	struct rt6_info *sprt;
  333
	/* Nothing to match against: keep the head of the chain. */
  334	if (!oif && ipv6_addr_any(saddr))
  335		goto out;
  336
  337	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
  338		struct net_device *dev = sprt->rt6i_dev;
  339
  340		if (oif) {
			/* Exact interface match ends the search. */
  341			if (dev->ifindex == oif)
  342				return sprt;
  343			if (dev->flags & IFF_LOOPBACK) {
				/*
				 * Loopback route whose inet6_dev is not on @oif:
				 * skip it in strict mode, or when a fallback
				 * whose inet6_dev is on @oif is already held.
				 * NOTE(review): oif is known non-zero in this
				 * branch, so "&& oif" and "!oif" are redundant.
				 */
  344				if (sprt->rt6i_idev == NULL ||
  345				    sprt->rt6i_idev->dev->ifindex != oif) {
  346					if (flags & RT6_LOOKUP_F_IFACE && oif)
  347						continue;
  348					if (local && (!oif ||
  349						      local->rt6i_idev->dev->ifindex == oif))
  350						continue;
  351				}
  352				local = sprt;
  353			}
  354		} else {
			/* No interface given: match on the source address. */
  355			if (ipv6_chk_addr(net, saddr, dev,
  356					  flags & RT6_LOOKUP_F_IFACE))
  357				return sprt;
  358		}
  359	}
  360
  361	if (oif) {
  362		if (local)
  363			return local;
  364
  365		if (flags & RT6_LOOKUP_F_IFACE)
  366			return net->ipv6.ip6_null_entry;
  367	}
  368out:
  369	return rt;
  370}
 371
 372#ifdef CONFIG_IPV6_ROUTER_PREF
 373static void rt6_probe(struct rt6_info *rt)
 374{
 375	struct neighbour *neigh;
 376	/*
 377	 * Okay, this does not seem to be appropriate
 378	 * for now, however, we need to check if it
 379	 * is really so; aka Router Reachability Probing.
 380	 *
 381	 * Router Reachability Probe MUST be rate-limited
 382	 * to no more than one per minute.
 383	 */
 384	rcu_read_lock();
 385	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
 386	if (!neigh || (neigh->nud_state & NUD_VALID))
 387		goto out;
 388	read_lock_bh(&neigh->lock);
 389	if (!(neigh->nud_state & NUD_VALID) &&
 390	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 391		struct in6_addr mcaddr;
 392		struct in6_addr *target;
 393
 394		neigh->updated = jiffies;
 395		read_unlock_bh(&neigh->lock);
 396
 397		target = (struct in6_addr *)&neigh->primary_key;
 398		addrconf_addr_solict_mult(target, &mcaddr);
 399		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
 400	} else {
 401		read_unlock_bh(&neigh->lock);
 402	}
 403out:
 404	rcu_read_unlock();
 405}
 406#else
 407static inline void rt6_probe(struct rt6_info *rt)
 408{
 409}
 410#endif
 411
 412/*
 413 * Default Router Selection (RFC 2461 6.3.6)
 414 */
 415static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 416{
 417	struct net_device *dev = rt->rt6i_dev;
 418	if (!oif || dev->ifindex == oif)
 419		return 2;
 420	if ((dev->flags & IFF_LOOPBACK) &&
 421	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
 422		return 1;
 423	return 0;
 424}
 425
 426static inline int rt6_check_neigh(struct rt6_info *rt)
 427{
 428	struct neighbour *neigh;
 429	int m;
 430
 431	rcu_read_lock();
 432	neigh = dst_get_neighbour(&rt->dst);
 433	if (rt->rt6i_flags & RTF_NONEXTHOP ||
 434	    !(rt->rt6i_flags & RTF_GATEWAY))
 435		m = 1;
 436	else if (neigh) {
 437		read_lock_bh(&neigh->lock);
 438		if (neigh->nud_state & NUD_VALID)
 439			m = 2;
 440#ifdef CONFIG_IPV6_ROUTER_PREF
 441		else if (neigh->nud_state & NUD_FAILED)
 442			m = 0;
 443#endif
 444		else
 445			m = 1;
 446		read_unlock_bh(&neigh->lock);
 447	} else
 448		m = 0;
 449	rcu_read_unlock();
 450	return m;
 451}
 452
 453static int rt6_score_route(struct rt6_info *rt, int oif,
 454			   int strict)
 455{
 456	int m, n;
 457
 458	m = rt6_check_dev(rt, oif);
 459	if (!m && (strict & RT6_LOOKUP_F_IFACE))
 460		return -1;
 461#ifdef CONFIG_IPV6_ROUTER_PREF
 462	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 463#endif
 464	n = rt6_check_neigh(rt);
 465	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
 466		return -1;
 467	return m;
 468}
 469
 470static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 471				   int *mpri, struct rt6_info *match)
 472{
 473	int m;
 474
 475	if (rt6_check_expired(rt))
 476		goto out;
 477
 478	m = rt6_score_route(rt, oif, strict);
 479	if (m < 0)
 480		goto out;
 481
 482	if (m > *mpri) {
 483		if (strict & RT6_LOOKUP_F_REACHABLE)
 484			rt6_probe(match);
 485		*mpri = m;
 486		match = rt;
 487	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
 488		rt6_probe(rt);
 489	}
 490
 491out:
 492	return match;
 493}
 494
 495static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 496				     struct rt6_info *rr_head,
 497				     u32 metric, int oif, int strict)
 498{
 499	struct rt6_info *rt, *match;
 500	int mpri = -1;
 501
 502	match = NULL;
 503	for (rt = rr_head; rt && rt->rt6i_metric == metric;
 504	     rt = rt->dst.rt6_next)
 505		match = find_match(rt, oif, strict, &mpri, match);
 506	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
 507	     rt = rt->dst.rt6_next)
 508		match = find_match(rt, oif, strict, &mpri, match);
 509
 510	return match;
 511}
 512
 513static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 514{
 515	struct rt6_info *match, *rt0;
 516	struct net *net;
 517
 518	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
 519		  __func__, fn->leaf, oif);
 520
 521	rt0 = fn->rr_ptr;
 522	if (!rt0)
 523		fn->rr_ptr = rt0 = fn->leaf;
 524
 525	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 526
 527	if (!match &&
 528	    (strict & RT6_LOOKUP_F_REACHABLE)) {
 529		struct rt6_info *next = rt0->dst.rt6_next;
 530
 531		/* no entries matched; do round-robin */
 532		if (!next || next->rt6i_metric != rt0->rt6i_metric)
 533			next = fn->leaf;
 534
 535		if (next != rt0)
 536			fn->rr_ptr = next;
 537	}
 538
 539	RT6_TRACE("%s() => %p\n",
 540		  __func__, match);
 541
 542	net = dev_net(rt0->rt6i_dev);
 543	return match ? match : net->ipv6.ip6_null_entry;
 544}
 545
 546#ifdef CONFIG_IPV6_ROUTE_INFO
 547int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 548		  const struct in6_addr *gwaddr)
 549{
 550	struct net *net = dev_net(dev);
 551	struct route_info *rinfo = (struct route_info *) opt;
 552	struct in6_addr prefix_buf, *prefix;
 553	unsigned int pref;
 554	unsigned long lifetime;
 555	struct rt6_info *rt;
 556
 557	if (len < sizeof(struct route_info)) {
 558		return -EINVAL;
 559	}
 560
 561	/* Sanity check for prefix_len and length */
 562	if (rinfo->length > 3) {
 563		return -EINVAL;
 564	} else if (rinfo->prefix_len > 128) {
 565		return -EINVAL;
 566	} else if (rinfo->prefix_len > 64) {
 567		if (rinfo->length < 2) {
 568			return -EINVAL;
 569		}
 570	} else if (rinfo->prefix_len > 0) {
 571		if (rinfo->length < 1) {
 572			return -EINVAL;
 573		}
 574	}
 575
 576	pref = rinfo->route_pref;
 577	if (pref == ICMPV6_ROUTER_PREF_INVALID)
 578		return -EINVAL;
 579
 580	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 581
 582	if (rinfo->length == 3)
 583		prefix = (struct in6_addr *)rinfo->prefix;
 584	else {
 585		/* this function is safe */
 586		ipv6_addr_prefix(&prefix_buf,
 587				 (struct in6_addr *)rinfo->prefix,
 588				 rinfo->prefix_len);
 589		prefix = &prefix_buf;
 590	}
 591
 592	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 593				dev->ifindex);
 594
 595	if (rt && !lifetime) {
 596		ip6_del_rt(rt);
 597		rt = NULL;
 598	}
 599
 600	if (!rt && lifetime)
 601		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
 602					pref);
 603	else if (rt)
 604		rt->rt6i_flags = RTF_ROUTEINFO |
 605				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 606
 607	if (rt) {
 608		if (!addrconf_finite_timeout(lifetime)) {
 609			rt->rt6i_flags &= ~RTF_EXPIRES;
 610		} else {
 611			rt->rt6i_expires = jiffies + HZ * lifetime;
 612			rt->rt6i_flags |= RTF_EXPIRES;
 613		}
 614		dst_release(&rt->dst);
 615	}
 616	return 0;
 617}
 618#endif
 619
  /*
   * BACKTRACK - climb the FIB tree when the current node gave no route.
   *
   * Must be expanded inside a function that has local variables "rt"
   * (struct rt6_info *) and "fn" (struct fib6_node *) and labels "out" and
   * "restart".  Does nothing unless rt is __net's ip6_null_entry.
   * Otherwise it repeatedly:
   *   - jumps to "out" if fn is flagged RTN_TL_ROOT;
   *   - moves to fn's parent, or, when the parent has a subtree that is
   *     not fn itself, to the result of looking @saddr up in that subtree;
   *   - jumps to "restart" once the new fn is flagged RTN_RTINFO.
   */
  620#define BACKTRACK(__net, saddr)			\
  621do { \
  622	if (rt == __net->ipv6.ip6_null_entry) {	\
  623		struct fib6_node *pn; \
  624		while (1) { \
  625			if (fn->fn_flags & RTN_TL_ROOT) \
  626				goto out; \
  627			pn = fn->parent; \
  628			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
  629				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
  630			else \
  631				fn = pn; \
  632			if (fn->fn_flags & RTN_RTINFO) \
  633				goto restart; \
  634		} \
  635	} \
  636} while(0)
 637
  /*
   * ip6_pol_route_lookup - lookup callback passed to fib6_rule_lookup()
   * by rt6_lookup().
   *
   * Under the table's read lock: finds the node for the flow's
   * destination/source, picks a route from its leaf chain with
   * rt6_device_match(), and backtracks up the tree while the result is
   * the null entry.  The returned route has been passed through dst_use()
   * before the lock is dropped; it may be the namespace's ip6_null_entry.
   */
  638static struct rt6_info *ip6_pol_route_lookup(struct net *net,
  639					     struct fib6_table *table,
  640					     struct flowi6 *fl6, int flags)
  641{
  642	struct fib6_node *fn;
  643	struct rt6_info *rt;
  644
  645	read_lock_bh(&table->tb6_lock);
  646	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	/* BACKTRACK() jumps back here after moving fn up the tree. */
  647restart:
  648	rt = fn->leaf;
  649	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
  650	BACKTRACK(net, &fl6->saddr);
	/* Also reached from BACKTRACK() when the top-level root is hit. */
  651out:
  652	dst_use(&rt->dst, jiffies);
  653	read_unlock_bh(&table->tb6_lock);
  654	return rt;
  655
  656}
 657
 658struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 659			    const struct in6_addr *saddr, int oif, int strict)
 660{
 661	struct flowi6 fl6 = {
 662		.flowi6_oif = oif,
 663		.daddr = *daddr,
 664	};
 665	struct dst_entry *dst;
 666	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 667
 668	if (saddr) {
 669		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
 670		flags |= RT6_LOOKUP_F_HAS_SADDR;
 671	}
 672
 673	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
 674	if (dst->error == 0)
 675		return (struct rt6_info *) dst;
 676
 677	dst_release(dst);
 678
 679	return NULL;
 680}
 681
 682EXPORT_SYMBOL(rt6_lookup);
 683
  684/* ip6_ins_rt is called with table->tb6_lock NOT held.
  685   It takes a new route entry; if the addition fails for any reason,
  686   the route is freed. In any case, if the caller does not hold a
  687   reference to it, it may be destroyed.
  688 */
 689
 690static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
 691{
 692	int err;
 693	struct fib6_table *table;
 694
 695	table = rt->rt6i_table;
 696	write_lock_bh(&table->tb6_lock);
 697	err = fib6_add(&table->tb6_root, rt, info);
 698	write_unlock_bh(&table->tb6_lock);
 699
 700	return err;
 701}
 702
 703int ip6_ins_rt(struct rt6_info *rt)
 704{
 705	struct nl_info info = {
 706		.nl_net = dev_net(rt->rt6i_dev),
 707	};
 708	return __ip6_ins_rt(rt, &info);
 709}
 710
  /*
   * rt6_alloc_cow - clone @ort into an RTF_CACHE route for @daddr and bind
   * a neighbour entry to it.
   *
   * @ort:   route to copy
   * @daddr: destination the clone is created for
   * @saddr: source address; with CONFIG_IPV6_SUBTREES it becomes a /128
   *         source key when the route has a source prefix
   *
   * Returns the new route, or NULL if ip6_rt_copy() returned NULL or no
   * neighbour entry could be obtained (the clone is freed in that case).
   */
  711static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
  712				      const struct in6_addr *daddr,
  713				      const struct in6_addr *saddr)
  714{
  715	struct rt6_info *rt;
  716
  717	/*
  718	 *	Clone the route.
  719	 */
  720
  721	rt = ip6_rt_copy(ort, daddr);
  722
  723	if (rt) {
  724		struct neighbour *neigh;
		/* One GC-and-retry is allowed, but only outside softirq. */
  725		int attempts = !in_softirq();
  726
		/*
		 * Without a gateway the destination is its own next hop;
		 * RTF_ANYCAST is set when the clone's prefix is shorter than
		 * /128 and @daddr equals the original route's address.
		 */
  727		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
  728			if (rt->rt6i_dst.plen != 128 &&
  729			    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
  730				rt->rt6i_flags |= RTF_ANYCAST;
  731			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
  732		}
  733
  734		rt->rt6i_flags |= RTF_CACHE;
  735
  736#ifdef CONFIG_IPV6_SUBTREES
  737		if (rt->rt6i_src.plen && saddr) {
  738			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
  739			rt->rt6i_src.plen = 128;
  740		}
  741#endif
  742
  743	retry:
  744		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
  745		if (IS_ERR(neigh)) {
  746			struct net *net = dev_net(rt->rt6i_dev);
  747			int saved_rt_min_interval =
  748				net->ipv6.sysctl.ip6_rt_gc_min_interval;
  749			int saved_rt_elasticity =
  750				net->ipv6.sysctl.ip6_rt_gc_elasticity;
  751
			/*
			 * Run ip6_dst_gc() once with elasticity 1 and no
			 * minimum interval, restore the saved sysctl values,
			 * then retry the neighbour lookup.
			 * NOTE(review): the per-namespace sysctl values are
			 * changed here without any lock visible in this
			 * function - confirm concurrent readers tolerate it.
			 */
  752			if (attempts-- > 0) {
  753				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
  754				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
  755
  756				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
  757
  758				net->ipv6.sysctl.ip6_rt_gc_elasticity =
  759					saved_rt_elasticity;
  760				net->ipv6.sysctl.ip6_rt_gc_min_interval =
  761					saved_rt_min_interval;
  762				goto retry;
  763			}
  764
  765			if (net_ratelimit())
  766				printk(KERN_WARNING
  767				       "ipv6: Neighbour table overflow.\n");
  768			dst_free(&rt->dst);
  769			return NULL;
  770		}
  771		dst_set_neighbour(&rt->dst, neigh);
  772
  773	}
  774
  775	return rt;
  776}
 777
 778static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 779					const struct in6_addr *daddr)
 780{
 781	struct rt6_info *rt = ip6_rt_copy(ort, daddr);
 782
 783	if (rt) {
 784		rt->rt6i_flags |= RTF_CACHE;
 785		dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
 786	}
 787	return rt;
 788}
 789
  /*
   * ip6_pol_route - common route resolution for the input and output paths.
   *
   * @net:   namespace
   * @table: FIB table to search
   * @oif:   interface index to match (callers pass flowi6_iif or flowi6_oif)
   * @fl6:   flow being routed
   * @flags: RT6_LOOKUP_F_* bits; RT6_LOOKUP_F_IFACE is carried into the
   *         selection as "strict"
   *
   * Selects a route with rt6_select().  If the result is neither the null
   * entry nor already an RTF_CACHE route, a per-destination copy is made
   * (rt6_alloc_cow() or rt6_alloc_clone()) and inserted into the table; on
   * insertion failure the whole lookup is retried, up to 3 attempts.
   * Unless forwarding is enabled, the first pass requires a reachable next
   * hop and a second pass without that requirement follows if the first
   * ends at "out".
   *
   * Returns a route with a reference held and lastuse/__use updated; it may
   * be the namespace's ip6_null_entry.
   */
  790static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
  791				      struct flowi6 *fl6, int flags)
  792{
  793	struct fib6_node *fn;
  794	struct rt6_info *rt, *nrt;
  795	int strict = 0;
  796	int attempts = 3;
  797	int err;
  798	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
  799
  800	strict |= flags & RT6_LOOKUP_F_IFACE;
  801
  802relookup:
  803	read_lock_bh(&table->tb6_lock);
  804
  805restart_2:
  806	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
  807
  808restart:
  809	rt = rt6_select(fn, oif, strict | reachable);
  810
	/* May jump to "restart" (after moving fn up) or to "out". */
  811	BACKTRACK(net, &fl6->saddr);
  812	if (rt == net->ipv6.ip6_null_entry ||
  813	    rt->rt6i_flags & RTF_CACHE)
  814		goto out;
  815
	/* Pin the route, then drop the table lock before allocating a copy. */
  816	dst_hold(&rt->dst);
  817	read_unlock_bh(&table->tb6_lock);
  818
  819	if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
  820		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
  821	else if (!(rt->dst.flags & DST_HOST))
  822		nrt = rt6_alloc_clone(rt, &fl6->daddr);
  823	else
		/* No copy is made: return rt with the reference taken above. */
  824		goto out2;
  825
  826	dst_release(&rt->dst);
	/* Fall back to the null entry when the copy could not be allocated. */
  827	rt = nrt ? : net->ipv6.ip6_null_entry;
  828
  829	dst_hold(&rt->dst);
  830	if (nrt) {
  831		err = ip6_ins_rt(nrt);
  832		if (!err)
  833			goto out2;
  834	}
  835
  836	if (--attempts <= 0)
  837		goto out2;
  838
  839	/*
  840	 * Race condition! In the gap, when table->tb6_lock was
  841	 * released someone could insert this route.  Relookup.
  842	 */
  843	dst_release(&rt->dst);
  844	goto relookup;
  845
	/* Reached with the table read lock still held. */
  846out:
  847	if (reachable) {
		/* Retry once without the reachability requirement. */
  848		reachable = 0;
  849		goto restart_2;
  850	}
  851	dst_hold(&rt->dst);
  852	read_unlock_bh(&table->tb6_lock);
	/* Reached with the lock released and a reference held on rt. */
  853out2:
  854	rt->dst.lastuse = jiffies;
  855	rt->dst.__use++;
  856
  857	return rt;
  858}
 859
 860static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
 861					    struct flowi6 *fl6, int flags)
 862{
 863	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 864}
 865
 866void ip6_route_input(struct sk_buff *skb)
 867{
 868	const struct ipv6hdr *iph = ipv6_hdr(skb);
 869	struct net *net = dev_net(skb->dev);
 870	int flags = RT6_LOOKUP_F_HAS_SADDR;
 871	struct flowi6 fl6 = {
 872		.flowi6_iif = skb->dev->ifindex,
 873		.daddr = iph->daddr,
 874		.saddr = iph->saddr,
 875		.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
 876		.flowi6_mark = skb->mark,
 877		.flowi6_proto = iph->nexthdr,
 878	};
 879
 880	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
 881		flags |= RT6_LOOKUP_F_IFACE;
 882
 883	skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
 884}
 885
 886static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 887					     struct flowi6 *fl6, int flags)
 888{
 889	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
 890}
 891
 892struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
 893				    struct flowi6 *fl6)
 894{
 895	int flags = 0;
 896
 897	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
 898		flags |= RT6_LOOKUP_F_IFACE;
 899
 900	if (!ipv6_addr_any(&fl6->saddr))
 901		flags |= RT6_LOOKUP_F_HAS_SADDR;
 902	else if (sk)
 903		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
 904
 905	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
 906}
 907
 908EXPORT_SYMBOL(ip6_route_output);
 909
 910struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 911{
 912	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
 913	struct dst_entry *new = NULL;
 914
 915	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
 916	if (rt) {
 917		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
 918
 919		new = &rt->dst;
 920
 921		new->__use = 1;
 922		new->input = dst_discard;
 923		new->output = dst_discard;
 924
 925		if (dst_metrics_read_only(&ort->dst))
 926			new->_metrics = ort->dst._metrics;
 927		else
 928			dst_copy_metrics(new, &ort->dst);
 929		rt->rt6i_idev = ort->rt6i_idev;
 930		if (rt->rt6i_idev)
 931			in6_dev_hold(rt->rt6i_idev);
 932		rt->rt6i_expires = 0;
 933
 934		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
 935		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
 936		rt->rt6i_metric = 0;
 937
 938		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 939#ifdef CONFIG_IPV6_SUBTREES
 940		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
 941#endif
 942
 943		dst_free(new);
 944	}
 945
 946	dst_release(dst_orig);
 947	return new ? new : ERR_PTR(-ENOMEM);
 948}
 949
 950/*
 951 *	Destination cache support functions
 952 */
 953
 954static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 955{
 956	struct rt6_info *rt;
 957
 958	rt = (struct rt6_info *) dst;
 959
 960	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
 961		if (rt->rt6i_peer_genid != rt6_peer_genid()) {
 962			if (!rt->rt6i_peer)
 963				rt6_bind_peer(rt, 0);
 964			rt->rt6i_peer_genid = rt6_peer_genid();
 965		}
 966		return dst;
 967	}
 968	return NULL;
 969}
 970
 971static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 972{
 973	struct rt6_info *rt = (struct rt6_info *) dst;
 974
 975	if (rt) {
 976		if (rt->rt6i_flags & RTF_CACHE) {
 977			if (rt6_check_expired(rt)) {
 978				ip6_del_rt(rt);
 979				dst = NULL;
 980			}
 981		} else {
 982			dst_release(dst);
 983			dst = NULL;
 984		}
 985	}
 986	return dst;
 987}
 988
 989static void ip6_link_failure(struct sk_buff *skb)
 990{
 991	struct rt6_info *rt;
 992
 993	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
 994
 995	rt = (struct rt6_info *) skb_dst(skb);
 996	if (rt) {
 997		if (rt->rt6i_flags&RTF_CACHE) {
 998			dst_set_expires(&rt->dst, 0);
 999			rt->rt6i_flags |= RTF_EXPIRES;
1000		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1001			rt->rt6i_node->fn_sernum = -1;
1002	}
1003}
1004
1005static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1006{
1007	struct rt6_info *rt6 = (struct rt6_info*)dst;
1008
1009	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1010		rt6->rt6i_flags |= RTF_MODIFIED;
1011		if (mtu < IPV6_MIN_MTU) {
1012			u32 features = dst_metric(dst, RTAX_FEATURES);
1013			mtu = IPV6_MIN_MTU;
1014			features |= RTAX_FEATURE_ALLFRAG;
1015			dst_metric_set(dst, RTAX_FEATURES, features);
1016		}
1017		dst_metric_set(dst, RTAX_MTU, mtu);
1018	}
1019}
1020
1021static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1022{
1023	struct net_device *dev = dst->dev;
1024	unsigned int mtu = dst_mtu(dst);
1025	struct net *net = dev_net(dev);
1026
1027	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1028
1029	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1030		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1031
1032	/*
1033	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1034	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1035	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1036	 * rely only on pmtu discovery"
1037	 */
1038	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1039		mtu = IPV6_MAXPLEN;
1040	return mtu;
1041}
1042
1043static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1044{
1045	unsigned int mtu = IPV6_MIN_MTU;
1046	struct inet6_dev *idev;
1047
1048	rcu_read_lock();
1049	idev = __in6_dev_get(dst->dev);
1050	if (idev)
1051		mtu = idev->cnf.mtu6;
1052	rcu_read_unlock();
1053
1054	return mtu;
1055}
1056
1057static struct dst_entry *icmp6_dst_gc_list;
1058static DEFINE_SPINLOCK(icmp6_dst_lock);
1059
1060struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1061				  struct neighbour *neigh,
1062				  const struct in6_addr *addr)
1063{
1064	struct rt6_info *rt;
1065	struct inet6_dev *idev = in6_dev_get(dev);
1066	struct net *net = dev_net(dev);
1067
1068	if (unlikely(idev == NULL))
1069		return NULL;
1070
1071	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1072	if (unlikely(rt == NULL)) {
1073		in6_dev_put(idev);
1074		goto out;
1075	}
1076
1077	if (neigh)
1078		neigh_hold(neigh);
1079	else {
1080		neigh = ndisc_get_neigh(dev, addr);
1081		if (IS_ERR(neigh))
1082			neigh = NULL;
1083	}
1084
1085	rt->dst.flags |= DST_HOST;
1086	rt->dst.output  = ip6_output;
1087	dst_set_neighbour(&rt->dst, neigh);
1088	atomic_set(&rt->dst.__refcnt, 1);
1089	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1090
1091	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1092	rt->rt6i_dst.plen = 128;
1093	rt->rt6i_idev     = idev;
1094
1095	spin_lock_bh(&icmp6_dst_lock);
1096	rt->dst.next = icmp6_dst_gc_list;
1097	icmp6_dst_gc_list = &rt->dst;
1098	spin_unlock_bh(&icmp6_dst_lock);
1099
1100	fib6_force_start_gc(net);
1101
1102out:
1103	return &rt->dst;
1104}
1105
1106int icmp6_dst_gc(void)
1107{
1108	struct dst_entry *dst, **pprev;
1109	int more = 0;
1110
1111	spin_lock_bh(&icmp6_dst_lock);
1112	pprev = &icmp6_dst_gc_list;
1113
1114	while ((dst = *pprev) != NULL) {
1115		if (!atomic_read(&dst->__refcnt)) {
1116			*pprev = dst->next;
1117			dst_free(dst);
1118		} else {
1119			pprev = &dst->next;
1120			++more;
1121		}
1122	}
1123
1124	spin_unlock_bh(&icmp6_dst_lock);
1125
1126	return more;
1127}
1128
1129static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1130			    void *arg)
1131{
1132	struct dst_entry *dst, **pprev;
1133
1134	spin_lock_bh(&icmp6_dst_lock);
1135	pprev = &icmp6_dst_gc_list;
1136	while ((dst = *pprev) != NULL) {
1137		struct rt6_info *rt = (struct rt6_info *) dst;
1138		if (func(rt, arg)) {
1139			*pprev = dst->next;
1140			dst_free(dst);
1141		} else {
1142			pprev = &dst->next;
1143		}
1144	}
1145	spin_unlock_bh(&icmp6_dst_lock);
1146}
1147
1148static int ip6_dst_gc(struct dst_ops *ops)
1149{
1150	unsigned long now = jiffies;
1151	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1152	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1153	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1154	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1155	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1156	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1157	int entries;
1158
1159	entries = dst_entries_get_fast(ops);
1160	if (time_after(rt_last_gc + rt_min_interval, now) &&
1161	    entries <= rt_max_size)
1162		goto out;
1163
1164	net->ipv6.ip6_rt_gc_expire++;
1165	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1166	net->ipv6.ip6_rt_last_gc = now;
1167	entries = dst_entries_get_slow(ops);
1168	if (entries < ops->gc_thresh)
1169		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1170out:
1171	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1172	return entries > rt_max_size;
1173}
1174
1175/* Clean host part of a prefix. Not necessary in radix tree,
1176   but results in cleaner routing tables.
1177
1178   Remove it only when all the things will work!
1179 */
1180
1181int ip6_dst_hoplimit(struct dst_entry *dst)
1182{
1183	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1184	if (hoplimit == 0) {
1185		struct net_device *dev = dst->dev;
1186		struct inet6_dev *idev;
1187
1188		rcu_read_lock();
1189		idev = __in6_dev_get(dev);
1190		if (idev)
1191			hoplimit = idev->cnf.hop_limit;
1192		else
1193			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1194		rcu_read_unlock();
1195	}
1196	return hoplimit;
1197}
1198EXPORT_SYMBOL(ip6_dst_hoplimit);
1199
1200/*
1201 *
1202 */
1203
1204int ip6_route_add(struct fib6_config *cfg)
1205{
1206	int err;
1207	struct net *net = cfg->fc_nlinfo.nl_net;
1208	struct rt6_info *rt = NULL;
1209	struct net_device *dev = NULL;
1210	struct inet6_dev *idev = NULL;
1211	struct fib6_table *table;
1212	int addr_type;
1213
1214	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1215		return -EINVAL;
1216#ifndef CONFIG_IPV6_SUBTREES
1217	if (cfg->fc_src_len)
1218		return -EINVAL;
1219#endif
1220	if (cfg->fc_ifindex) {
1221		err = -ENODEV;
1222		dev = dev_get_by_index(net, cfg->fc_ifindex);
1223		if (!dev)
1224			goto out;
1225		idev = in6_dev_get(dev);
1226		if (!idev)
1227			goto out;
1228	}
1229
1230	if (cfg->fc_metric == 0)
1231		cfg->fc_metric = IP6_RT_PRIO_USER;
1232
1233	table = fib6_new_table(net, cfg->fc_table);
1234	if (table == NULL) {
1235		err = -ENOBUFS;
1236		goto out;
1237	}
1238
1239	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1240
1241	if (rt == NULL) {
1242		err = -ENOMEM;
1243		goto out;
1244	}
1245
1246	rt->dst.obsolete = -1;
1247	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1248				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1249				0;
1250
1251	if (cfg->fc_protocol == RTPROT_UNSPEC)
1252		cfg->fc_protocol = RTPROT_BOOT;
1253	rt->rt6i_protocol = cfg->fc_protocol;
1254
1255	addr_type = ipv6_addr_type(&cfg->fc_dst);
1256
1257	if (addr_type & IPV6_ADDR_MULTICAST)
1258		rt->dst.input = ip6_mc_input;
1259	else if (cfg->fc_flags & RTF_LOCAL)
1260		rt->dst.input = ip6_input;
1261	else
1262		rt->dst.input = ip6_forward;
1263
1264	rt->dst.output = ip6_output;
1265
1266	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1267	rt->rt6i_dst.plen = cfg->fc_dst_len;
1268	if (rt->rt6i_dst.plen == 128)
1269	       rt->dst.flags |= DST_HOST;
1270
1271	if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1272		u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1273		if (!metrics) {
1274			err = -ENOMEM;
1275			goto out;
1276		}
1277		dst_init_metrics(&rt->dst, metrics, 0);
1278	}
1279#ifdef CONFIG_IPV6_SUBTREES
1280	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1281	rt->rt6i_src.plen = cfg->fc_src_len;
1282#endif
1283
1284	rt->rt6i_metric = cfg->fc_metric;
1285
1286	/* We cannot add true routes via loopback here,
1287	   they would result in kernel looping; promote them to reject routes
1288	 */
1289	if ((cfg->fc_flags & RTF_REJECT) ||
1290	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1291					      && !(cfg->fc_flags&RTF_LOCAL))) {
1292		/* hold loopback dev/idev if we haven't done so. */
1293		if (dev != net->loopback_dev) {
1294			if (dev) {
1295				dev_put(dev);
1296				in6_dev_put(idev);
1297			}
1298			dev = net->loopback_dev;
1299			dev_hold(dev);
1300			idev = in6_dev_get(dev);
1301			if (!idev) {
1302				err = -ENODEV;
1303				goto out;
1304			}
1305		}
1306		rt->dst.output = ip6_pkt_discard_out;
1307		rt->dst.input = ip6_pkt_discard;
1308		rt->dst.error = -ENETUNREACH;
1309		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1310		goto install_route;
1311	}
1312
1313	if (cfg->fc_flags & RTF_GATEWAY) {
1314		const struct in6_addr *gw_addr;
1315		int gwa_type;
1316
1317		gw_addr = &cfg->fc_gateway;
1318		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1319		gwa_type = ipv6_addr_type(gw_addr);
1320
1321		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1322			struct rt6_info *grt;
1323
1324			/* IPv6 strictly inhibits using not link-local
1325			   addresses as nexthop address.
1326			   Otherwise, router will not able to send redirects.
1327			   It is very good, but in some (rare!) circumstances
1328			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1329			   some exceptions. --ANK
1330			 */
1331			err = -EINVAL;
1332			if (!(gwa_type&IPV6_ADDR_UNICAST))
1333				goto out;
1334
1335			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1336
1337			err = -EHOSTUNREACH;
1338			if (grt == NULL)
1339				goto out;
1340			if (dev) {
1341				if (dev != grt->rt6i_dev) {
1342					dst_release(&grt->dst);
1343					goto out;
1344				}
1345			} else {
1346				dev = grt->rt6i_dev;
1347				idev = grt->rt6i_idev;
1348				dev_hold(dev);
1349				in6_dev_hold(grt->rt6i_idev);
1350			}
1351			if (!(grt->rt6i_flags&RTF_GATEWAY))
1352				err = 0;
1353			dst_release(&grt->dst);
1354
1355			if (err)
1356				goto out;
1357		}
1358		err = -EINVAL;
1359		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1360			goto out;
1361	}
1362
1363	err = -ENODEV;
1364	if (dev == NULL)
1365		goto out;
1366
1367	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1368		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1369			err = -EINVAL;
1370			goto out;
1371		}
1372		ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1373		rt->rt6i_prefsrc.plen = 128;
1374	} else
1375		rt->rt6i_prefsrc.plen = 0;
1376
1377	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1378		struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1379		if (IS_ERR(n)) {
1380			err = PTR_ERR(n);
1381			goto out;
1382		}
1383		dst_set_neighbour(&rt->dst, n);
1384	}
1385
1386	rt->rt6i_flags = cfg->fc_flags;
1387
1388install_route:
1389	if (cfg->fc_mx) {
1390		struct nlattr *nla;
1391		int remaining;
1392
1393		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1394			int type = nla_type(nla);
1395
1396			if (type) {
1397				if (type > RTAX_MAX) {
1398					err = -EINVAL;
1399					goto out;
1400				}
1401
1402				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1403			}
1404		}
1405	}
1406
1407	rt->dst.dev = dev;
1408	rt->rt6i_idev = idev;
1409	rt->rt6i_table = table;
1410
1411	cfg->fc_nlinfo.nl_net = dev_net(dev);
1412
1413	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1414
1415out:
1416	if (dev)
1417		dev_put(dev);
1418	if (idev)
1419		in6_dev_put(idev);
1420	if (rt)
1421		dst_free(&rt->dst);
1422	return err;
1423}
1424
1425static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1426{
1427	int err;
1428	struct fib6_table *table;
1429	struct net *net = dev_net(rt->rt6i_dev);
1430
1431	if (rt == net->ipv6.ip6_null_entry)
1432		return -ENOENT;
1433
1434	table = rt->rt6i_table;
1435	write_lock_bh(&table->tb6_lock);
1436
1437	err = fib6_del(rt, info);
1438	dst_release(&rt->dst);
1439
1440	write_unlock_bh(&table->tb6_lock);
1441
1442	return err;
1443}
1444
1445int ip6_del_rt(struct rt6_info *rt)
1446{
1447	struct nl_info info = {
1448		.nl_net = dev_net(rt->rt6i_dev),
1449	};
1450	return __ip6_del_rt(rt, &info);
1451}
1452
1453static int ip6_route_del(struct fib6_config *cfg)
1454{
1455	struct fib6_table *table;
1456	struct fib6_node *fn;
1457	struct rt6_info *rt;
1458	int err = -ESRCH;
1459
1460	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1461	if (table == NULL)
1462		return err;
1463
1464	read_lock_bh(&table->tb6_lock);
1465
1466	fn = fib6_locate(&table->tb6_root,
1467			 &cfg->fc_dst, cfg->fc_dst_len,
1468			 &cfg->fc_src, cfg->fc_src_len);
1469
1470	if (fn) {
1471		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1472			if (cfg->fc_ifindex &&
1473			    (rt->rt6i_dev == NULL ||
1474			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1475				continue;
1476			if (cfg->fc_flags & RTF_GATEWAY &&
1477			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1478				continue;
1479			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1480				continue;
1481			dst_hold(&rt->dst);
1482			read_unlock_bh(&table->tb6_lock);
1483
1484			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1485		}
1486	}
1487	read_unlock_bh(&table->tb6_lock);
1488
1489	return err;
1490}
1491
1492/*
1493 *	Handle redirects
1494 */
1495struct ip6rd_flowi {
1496	struct flowi6 fl6;
1497	struct in6_addr gateway;
1498};
1499
1500static struct rt6_info *__ip6_route_redirect(struct net *net,
1501					     struct fib6_table *table,
1502					     struct flowi6 *fl6,
1503					     int flags)
1504{
1505	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1506	struct rt6_info *rt;
1507	struct fib6_node *fn;
1508
1509	/*
1510	 * Get the "current" route for this destination and
1511	 * check if the redirect has come from approriate router.
1512	 *
1513	 * RFC 2461 specifies that redirects should only be
1514	 * accepted if they come from the nexthop to the target.
1515	 * Due to the way the routes are chosen, this notion
1516	 * is a bit fuzzy and one might need to check all possible
1517	 * routes.
1518	 */
1519
1520	read_lock_bh(&table->tb6_lock);
1521	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1522restart:
1523	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1524		/*
1525		 * Current route is on-link; redirect is always invalid.
1526		 *
1527		 * Seems, previous statement is not true. It could
1528		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1529		 * But then router serving it might decide, that we should
1530		 * know truth 8)8) --ANK (980726).
1531		 */
1532		if (rt6_check_expired(rt))
1533			continue;
1534		if (!(rt->rt6i_flags & RTF_GATEWAY))
1535			continue;
1536		if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1537			continue;
1538		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1539			continue;
1540		break;
1541	}
1542
1543	if (!rt)
1544		rt = net->ipv6.ip6_null_entry;
1545	BACKTRACK(net, &fl6->saddr);
1546out:
1547	dst_hold(&rt->dst);
1548
1549	read_unlock_bh(&table->tb6_lock);
1550
1551	return rt;
1552};
1553
1554static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1555					   const struct in6_addr *src,
1556					   const struct in6_addr *gateway,
1557					   struct net_device *dev)
1558{
1559	int flags = RT6_LOOKUP_F_HAS_SADDR;
1560	struct net *net = dev_net(dev);
1561	struct ip6rd_flowi rdfl = {
1562		.fl6 = {
1563			.flowi6_oif = dev->ifindex,
1564			.daddr = *dest,
1565			.saddr = *src,
1566		},
1567	};
1568
1569	ipv6_addr_copy(&rdfl.gateway, gateway);
1570
1571	if (rt6_need_strict(dest))
1572		flags |= RT6_LOOKUP_F_IFACE;
1573
1574	return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1575						   flags, __ip6_route_redirect);
1576}
1577
1578void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1579		  const struct in6_addr *saddr,
1580		  struct neighbour *neigh, u8 *lladdr, int on_link)
1581{
1582	struct rt6_info *rt, *nrt = NULL;
1583	struct netevent_redirect netevent;
1584	struct net *net = dev_net(neigh->dev);
1585
1586	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1587
1588	if (rt == net->ipv6.ip6_null_entry) {
1589		if (net_ratelimit())
1590			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1591			       "for redirect target\n");
1592		goto out;
1593	}
1594
1595	/*
1596	 *	We have finally decided to accept it.
1597	 */
1598
1599	neigh_update(neigh, lladdr, NUD_STALE,
1600		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1601		     NEIGH_UPDATE_F_OVERRIDE|
1602		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1603				     NEIGH_UPDATE_F_ISROUTER))
1604		     );
1605
1606	/*
1607	 * Redirect received -> path was valid.
1608	 * Look, redirects are sent only in response to data packets,
1609	 * so that this nexthop apparently is reachable. --ANK
1610	 */
1611	dst_confirm(&rt->dst);
1612
1613	/* Duplicate redirect: silently ignore. */
1614	if (neigh == dst_get_neighbour_raw(&rt->dst))
1615		goto out;
1616
1617	nrt = ip6_rt_copy(rt, dest);
1618	if (nrt == NULL)
1619		goto out;
1620
1621	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1622	if (on_link)
1623		nrt->rt6i_flags &= ~RTF_GATEWAY;
1624
1625	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1626	dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1627
1628	if (ip6_ins_rt(nrt))
1629		goto out;
1630
1631	netevent.old = &rt->dst;
1632	netevent.new = &nrt->dst;
1633	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1634
1635	if (rt->rt6i_flags&RTF_CACHE) {
1636		ip6_del_rt(rt);
1637		return;
1638	}
1639
1640out:
1641	dst_release(&rt->dst);
1642}
1643
1644/*
1645 *	Handle ICMP "packet too big" messages
1646 *	i.e. Path MTU discovery
1647 */
1648
1649static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1650			     struct net *net, u32 pmtu, int ifindex)
1651{
1652	struct rt6_info *rt, *nrt;
1653	int allfrag = 0;
1654again:
1655	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1656	if (rt == NULL)
1657		return;
1658
1659	if (rt6_check_expired(rt)) {
1660		ip6_del_rt(rt);
1661		goto again;
1662	}
1663
1664	if (pmtu >= dst_mtu(&rt->dst))
1665		goto out;
1666
1667	if (pmtu < IPV6_MIN_MTU) {
1668		/*
1669		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1670		 * MTU (1280) and a fragment header should always be included
1671		 * after a node receiving Too Big message reporting PMTU is
1672		 * less than the IPv6 Minimum Link MTU.
1673		 */
1674		pmtu = IPV6_MIN_MTU;
1675		allfrag = 1;
1676	}
1677
1678	/* New mtu received -> path was valid.
1679	   They are sent only in response to data packets,
1680	   so that this nexthop apparently is reachable. --ANK
1681	 */
1682	dst_confirm(&rt->dst);
1683
1684	/* Host route. If it is static, it would be better
1685	   not to override it, but add new one, so that
1686	   when cache entry will expire old pmtu
1687	   would return automatically.
1688	 */
1689	if (rt->rt6i_flags & RTF_CACHE) {
1690		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1691		if (allfrag) {
1692			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1693			features |= RTAX_FEATURE_ALLFRAG;
1694			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1695		}
1696		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1697		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1698		goto out;
1699	}
1700
1701	/* Network route.
1702	   Two cases are possible:
1703	   1. It is connected route. Action: COW
1704	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1705	 */
1706	if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1707		nrt = rt6_alloc_cow(rt, daddr, saddr);
1708	else
1709		nrt = rt6_alloc_clone(rt, daddr);
1710
1711	if (nrt) {
1712		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1713		if (allfrag) {
1714			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1715			features |= RTAX_FEATURE_ALLFRAG;
1716			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1717		}
1718
1719		/* According to RFC 1981, detecting PMTU increase shouldn't be
1720		 * happened within 5 mins, the recommended timer is 10 mins.
1721		 * Here this route expiration time is set to ip6_rt_mtu_expires
1722		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1723		 * and detecting PMTU increase will be automatically happened.
1724		 */
1725		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1726		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1727
1728		ip6_ins_rt(nrt);
1729	}
1730out:
1731	dst_release(&rt->dst);
1732}
1733
1734void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1735			struct net_device *dev, u32 pmtu)
1736{
1737	struct net *net = dev_net(dev);
1738
1739	/*
1740	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1741	 * is sending along the path" that caused the Packet Too Big message.
1742	 * Since it's not possible in the general case to determine which
1743	 * interface was used to send the original packet, we update the MTU
1744	 * on the interface that will be used to send future packets. We also
1745	 * update the MTU on the interface that received the Packet Too Big in
1746	 * case the original packet was forced out that interface with
1747	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1748	 * correct behaviour, which would be to update the MTU on all
1749	 * interfaces.
1750	 */
1751	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1752	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1753}
1754
1755/*
1756 *	Misc support functions
1757 */
1758
1759static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1760				    const struct in6_addr *dest)
1761{
1762	struct net *net = dev_net(ort->rt6i_dev);
1763	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1764					    ort->dst.dev, 0);
1765
1766	if (rt) {
1767		rt->dst.input = ort->dst.input;
1768		rt->dst.output = ort->dst.output;
1769		rt->dst.flags |= DST_HOST;
1770
1771		ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1772		rt->rt6i_dst.plen = 128;
1773		dst_copy_metrics(&rt->dst, &ort->dst);
1774		rt->dst.error = ort->dst.error;
1775		rt->rt6i_idev = ort->rt6i_idev;
1776		if (rt->rt6i_idev)
1777			in6_dev_hold(rt->rt6i_idev);
1778		rt->dst.lastuse = jiffies;
1779		rt->rt6i_expires = 0;
1780
1781		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1782		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1783		rt->rt6i_metric = 0;
1784
1785#ifdef CONFIG_IPV6_SUBTREES
1786		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1787#endif
1788		memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1789		rt->rt6i_table = ort->rt6i_table;
1790	}
1791	return rt;
1792}
1793
1794#ifdef CONFIG_IPV6_ROUTE_INFO
1795static struct rt6_info *rt6_get_route_info(struct net *net,
1796					   const struct in6_addr *prefix, int prefixlen,
1797					   const struct in6_addr *gwaddr, int ifindex)
1798{
1799	struct fib6_node *fn;
1800	struct rt6_info *rt = NULL;
1801	struct fib6_table *table;
1802
1803	table = fib6_get_table(net, RT6_TABLE_INFO);
1804	if (table == NULL)
1805		return NULL;
1806
1807	write_lock_bh(&table->tb6_lock);
1808	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1809	if (!fn)
1810		goto out;
1811
1812	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1813		if (rt->rt6i_dev->ifindex != ifindex)
1814			continue;
1815		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1816			continue;
1817		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1818			continue;
1819		dst_hold(&rt->dst);
1820		break;
1821	}
1822out:
1823	write_unlock_bh(&table->tb6_lock);
1824	return rt;
1825}
1826
1827static struct rt6_info *rt6_add_route_info(struct net *net,
1828					   const struct in6_addr *prefix, int prefixlen,
1829					   const struct in6_addr *gwaddr, int ifindex,
1830					   unsigned pref)
1831{
1832	struct fib6_config cfg = {
1833		.fc_table	= RT6_TABLE_INFO,
1834		.fc_metric	= IP6_RT_PRIO_USER,
1835		.fc_ifindex	= ifindex,
1836		.fc_dst_len	= prefixlen,
1837		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1838				  RTF_UP | RTF_PREF(pref),
1839		.fc_nlinfo.pid = 0,
1840		.fc_nlinfo.nlh = NULL,
1841		.fc_nlinfo.nl_net = net,
1842	};
1843
1844	ipv6_addr_copy(&cfg.fc_dst, prefix);
1845	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1846
1847	/* We should treat it as a default route if prefix length is 0. */
1848	if (!prefixlen)
1849		cfg.fc_flags |= RTF_DEFAULT;
1850
1851	ip6_route_add(&cfg);
1852
1853	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1854}
1855#endif
1856
1857struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1858{
1859	struct rt6_info *rt;
1860	struct fib6_table *table;
1861
1862	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1863	if (table == NULL)
1864		return NULL;
1865
1866	write_lock_bh(&table->tb6_lock);
1867	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1868		if (dev == rt->rt6i_dev &&
1869		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1870		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1871			break;
1872	}
1873	if (rt)
1874		dst_hold(&rt->dst);
1875	write_unlock_bh(&table->tb6_lock);
1876	return rt;
1877}
1878
1879struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1880				     struct net_device *dev,
1881				     unsigned int pref)
1882{
1883	struct fib6_config cfg = {
1884		.fc_table	= RT6_TABLE_DFLT,
1885		.fc_metric	= IP6_RT_PRIO_USER,
1886		.fc_ifindex	= dev->ifindex,
1887		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1888				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1889		.fc_nlinfo.pid = 0,
1890		.fc_nlinfo.nlh = NULL,
1891		.fc_nlinfo.nl_net = dev_net(dev),
1892	};
1893
1894	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1895
1896	ip6_route_add(&cfg);
1897
1898	return rt6_get_dflt_router(gwaddr, dev);
1899}
1900
1901void rt6_purge_dflt_routers(struct net *net)
1902{
1903	struct rt6_info *rt;
1904	struct fib6_table *table;
1905
1906	/* NOTE: Keep consistent with rt6_get_dflt_router */
1907	table = fib6_get_table(net, RT6_TABLE_DFLT);
1908	if (table == NULL)
1909		return;
1910
1911restart:
1912	read_lock_bh(&table->tb6_lock);
1913	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1914		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1915			dst_hold(&rt->dst);
1916			read_unlock_bh(&table->tb6_lock);
1917			ip6_del_rt(rt);
1918			goto restart;
1919		}
1920	}
1921	read_unlock_bh(&table->tb6_lock);
1922}
1923
1924static void rtmsg_to_fib6_config(struct net *net,
1925				 struct in6_rtmsg *rtmsg,
1926				 struct fib6_config *cfg)
1927{
1928	memset(cfg, 0, sizeof(*cfg));
1929
1930	cfg->fc_table = RT6_TABLE_MAIN;
1931	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1932	cfg->fc_metric = rtmsg->rtmsg_metric;
1933	cfg->fc_expires = rtmsg->rtmsg_info;
1934	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1935	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1936	cfg->fc_flags = rtmsg->rtmsg_flags;
1937
1938	cfg->fc_nlinfo.nl_net = net;
1939
1940	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1941	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1942	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1943}
1944
1945int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1946{
1947	struct fib6_config cfg;
1948	struct in6_rtmsg rtmsg;
1949	int err;
1950
1951	switch(cmd) {
1952	case SIOCADDRT:		/* Add a route */
1953	case SIOCDELRT:		/* Delete a route */
1954		if (!capable(CAP_NET_ADMIN))
1955			return -EPERM;
1956		err = copy_from_user(&rtmsg, arg,
1957				     sizeof(struct in6_rtmsg));
1958		if (err)
1959			return -EFAULT;
1960
1961		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1962
1963		rtnl_lock();
1964		switch (cmd) {
1965		case SIOCADDRT:
1966			err = ip6_route_add(&cfg);
1967			break;
1968		case SIOCDELRT:
1969			err = ip6_route_del(&cfg);
1970			break;
1971		default:
1972			err = -EINVAL;
1973		}
1974		rtnl_unlock();
1975
1976		return err;
1977	}
1978
1979	return -EINVAL;
1980}
1981
1982/*
1983 *	Drop the packet on the floor
1984 */
1985
1986static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1987{
1988	int type;
1989	struct dst_entry *dst = skb_dst(skb);
1990	switch (ipstats_mib_noroutes) {
1991	case IPSTATS_MIB_INNOROUTES:
1992		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1993		if (type == IPV6_ADDR_ANY) {
1994			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1995				      IPSTATS_MIB_INADDRERRORS);
1996			break;
1997		}
1998		/* FALLTHROUGH */
1999	case IPSTATS_MIB_OUTNOROUTES:
2000		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2001			      ipstats_mib_noroutes);
2002		break;
2003	}
2004	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2005	kfree_skb(skb);
2006	return 0;
2007}
2008
2009static int ip6_pkt_discard(struct sk_buff *skb)
2010{
2011	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2012}
2013
2014static int ip6_pkt_discard_out(struct sk_buff *skb)
2015{
2016	skb->dev = skb_dst(skb)->dev;
2017	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2018}
2019
2020#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2021
2022static int ip6_pkt_prohibit(struct sk_buff *skb)
2023{
2024	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2025}
2026
2027static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2028{
2029	skb->dev = skb_dst(skb)->dev;
2030	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2031}
2032
2033#endif
2034
2035/*
2036 *	Allocate a dst for local (unicast / anycast) address.
2037 */
2038
2039struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2040				    const struct in6_addr *addr,
2041				    int anycast)
2042{
2043	struct net *net = dev_net(idev->dev);
2044	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2045					    net->loopback_dev, 0);
2046	struct neighbour *neigh;
2047
2048	if (rt == NULL) {
2049		if (net_ratelimit())
2050			pr_warning("IPv6:  Maximum number of routes reached,"
2051				   " consider increasing route/max_size.\n");
2052		return ERR_PTR(-ENOMEM);
2053	}
2054
2055	in6_dev_hold(idev);
2056
2057	rt->dst.flags |= DST_HOST;
2058	rt->dst.input = ip6_input;
2059	rt->dst.output = ip6_output;
2060	rt->rt6i_idev = idev;
2061	rt->dst.obsolete = -1;
2062
2063	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2064	if (anycast)
2065		rt->rt6i_flags |= RTF_ANYCAST;
2066	else
2067		rt->rt6i_flags |= RTF_LOCAL;
2068	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2069	if (IS_ERR(neigh)) {
2070		dst_free(&rt->dst);
2071
2072		return ERR_CAST(neigh);
2073	}
2074	dst_set_neighbour(&rt->dst, neigh);
2075
2076	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2077	rt->rt6i_dst.plen = 128;
2078	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2079
2080	atomic_set(&rt->dst.__refcnt, 1);
2081
2082	return rt;
2083}
2084
2085int ip6_route_get_saddr(struct net *net,
2086			struct rt6_info *rt,
2087			const struct in6_addr *daddr,
2088			unsigned int prefs,
2089			struct in6_addr *saddr)
2090{
2091	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2092	int err = 0;
2093	if (rt->rt6i_prefsrc.plen)
2094		ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2095	else
2096		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2097					 daddr, prefs, saddr);
2098	return err;
2099}
2100
2101/* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() by rt6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;		/* restrict to this device; NULL matches any */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address being removed */
};
2107
2108static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2109{
2110	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2111	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2112	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2113
2114	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2115	    rt != net->ipv6.ip6_null_entry &&
2116	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2117		/* remove prefsrc entry */
2118		rt->rt6i_prefsrc.plen = 0;
2119	}
2120	return 0;
2121}
2122
2123void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2124{
2125	struct net *net = dev_net(ifp->idev->dev);
2126	struct arg_dev_net_ip adni = {
2127		.dev = ifp->idev->dev,
2128		.net = net,
2129		.addr = &ifp->addr,
2130	};
2131	fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2132}
2133
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL matches any */
	struct net *net;		/* namespace whose null entry is kept */
};
2138
2139static int fib6_ifdown(struct rt6_info *rt, void *arg)
2140{
2141	const struct arg_dev_net *adn = arg;
2142	const struct net_device *dev = adn->dev;
2143
2144	if ((rt->rt6i_dev == dev || dev == NULL) &&
2145	    rt != adn->net->ipv6.ip6_null_entry) {
2146		RT6_TRACE("deleted by ifdown %p\n", rt);
2147		return -1;
2148	}
2149	return 0;
2150}
2151
2152void rt6_ifdown(struct net *net, struct net_device *dev)
2153{
2154	struct arg_dev_net adn = {
2155		.dev = dev,
2156		.net = net,
2157	};
2158
2159	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2160	icmp6_clean_all(fib6_ifdown, &adn);
2161}
2162
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};
2168
2169static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2170{
2171	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2172	struct inet6_dev *idev;
2173
2174	/* In IPv6 pmtu discovery is not optional,
2175	   so that RTAX_MTU lock cannot disable it.
2176	   We still use this lock to block changes
2177	   caused by addrconf/ndisc.
2178	*/
2179
2180	idev = __in6_dev_get(arg->dev);
2181	if (idev == NULL)
2182		return 0;
2183
2184	/* For administrative MTU increase, there is no way to discover
2185	   IPv6 PMTU increase, so PMTU increase should be updated here.
2186	   Since RFC 1981 doesn't include administrative MTU increase
2187	   update PMTU increase is a MUST. (i.e. jumbo frame)
2188	 */
2189	/*
2190	   If new MTU is less than route PMTU, this new MTU will be the
2191	   lowest MTU in the path, update the route PMTU to reflect PMTU
2192	   decreases; if new MTU is greater than route PMTU, and the
2193	   old MTU is the lowest MTU in the path, update the route PMTU
2194	   to reflect the increase. In this case if the other nodes' MTU
2195	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2196	   PMTU discouvery.
2197	 */
2198	if (rt->rt6i_dev == arg->dev &&
2199	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2200	    (dst_mtu(&rt->dst) >= arg->mtu ||
2201	     (dst_mtu(&rt->dst) < arg->mtu &&
2202	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2203		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2204	}
2205	return 0;
2206}
2207
2208void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2209{
2210	struct rt6_mtu_change_arg arg = {
2211		.dev = dev,
2212		.mtu = mtu,
2213	};
2214
2215	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2216}
2217
2218static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2219	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2220	[RTA_OIF]               = { .type = NLA_U32 },
2221	[RTA_IIF]		= { .type = NLA_U32 },
2222	[RTA_PRIORITY]          = { .type = NLA_U32 },
2223	[RTA_METRICS]           = { .type = NLA_NESTED },
2224};
2225
2226static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2227			      struct fib6_config *cfg)
2228{
2229	struct rtmsg *rtm;
2230	struct nlattr *tb[RTA_MAX+1];
2231	int err;
2232
2233	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2234	if (err < 0)
2235		goto errout;
2236
2237	err = -EINVAL;
2238	rtm = nlmsg_data(nlh);
2239	memset(cfg, 0, sizeof(*cfg));
2240
2241	cfg->fc_table = rtm->rtm_table;
2242	cfg->fc_dst_len = rtm->rtm_dst_len;
2243	cfg->fc_src_len = rtm->rtm_src_len;
2244	cfg->fc_flags = RTF_UP;
2245	cfg->fc_protocol = rtm->rtm_protocol;
2246
2247	if (rtm->rtm_type == RTN_UNREACHABLE)
2248		cfg->fc_flags |= RTF_REJECT;
2249
2250	if (rtm->rtm_type == RTN_LOCAL)
2251		cfg->fc_flags |= RTF_LOCAL;
2252
2253	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2254	cfg->fc_nlinfo.nlh = nlh;
2255	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2256
2257	if (tb[RTA_GATEWAY]) {
2258		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2259		cfg->fc_flags |= RTF_GATEWAY;
2260	}
2261
2262	if (tb[RTA_DST]) {
2263		int plen = (rtm->rtm_dst_len + 7) >> 3;
2264
2265		if (nla_len(tb[RTA_DST]) < plen)
2266			goto errout;
2267
2268		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2269	}
2270
2271	if (tb[RTA_SRC]) {
2272		int plen = (rtm->rtm_src_len + 7) >> 3;
2273
2274		if (nla_len(tb[RTA_SRC]) < plen)
2275			goto errout;
2276
2277		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2278	}
2279
2280	if (tb[RTA_PREFSRC])
2281		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2282
2283	if (tb[RTA_OIF])
2284		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2285
2286	if (tb[RTA_PRIORITY])
2287		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2288
2289	if (tb[RTA_METRICS]) {
2290		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2291		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2292	}
2293
2294	if (tb[RTA_TABLE])
2295		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2296
2297	err = 0;
2298errout:
2299	return err;
2300}
2301
2302static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2303{
2304	struct fib6_config cfg;
2305	int err;
2306
2307	err = rtm_to_fib6_config(skb, nlh, &cfg);
2308	if (err < 0)
2309		return err;
2310
2311	return ip6_route_del(&cfg);
2312}
2313
2314static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2315{
2316	struct fib6_config cfg;
2317	int err;
2318
2319	err = rtm_to_fib6_config(skb, nlh, &cfg);
2320	if (err < 0)
2321		return err;
2322
2323	return ip6_route_add(&cfg);
2324}
2325
2326static inline size_t rt6_nlmsg_size(void)
2327{
2328	return NLMSG_ALIGN(sizeof(struct rtmsg))
2329	       + nla_total_size(16) /* RTA_SRC */
2330	       + nla_total_size(16) /* RTA_DST */
2331	       + nla_total_size(16) /* RTA_GATEWAY */
2332	       + nla_total_size(16) /* RTA_PREFSRC */
2333	       + nla_total_size(4) /* RTA_TABLE */
2334	       + nla_total_size(4) /* RTA_IIF */
2335	       + nla_total_size(4) /* RTA_OIF */
2336	       + nla_total_size(4) /* RTA_PRIORITY */
2337	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2338	       + nla_total_size(sizeof(struct rta_cacheinfo));
2339}
2340
2341static int rt6_fill_node(struct net *net,
2342			 struct sk_buff *skb, struct rt6_info *rt,
2343			 struct in6_addr *dst, struct in6_addr *src,
2344			 int iif, int type, u32 pid, u32 seq,
2345			 int prefix, int nowait, unsigned int flags)
2346{
2347	struct rtmsg *rtm;
2348	struct nlmsghdr *nlh;
2349	long expires;
2350	u32 table;
2351	struct neighbour *n;
2352
2353	if (prefix) {	/* user wants prefix routes only */
2354		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2355			/* success since this is not a prefix route */
2356			return 1;
2357		}
2358	}
2359
2360	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2361	if (nlh == NULL)
2362		return -EMSGSIZE;
2363
2364	rtm = nlmsg_data(nlh);
2365	rtm->rtm_family = AF_INET6;
2366	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2367	rtm->rtm_src_len = rt->rt6i_src.plen;
2368	rtm->rtm_tos = 0;
2369	if (rt->rt6i_table)
2370		table = rt->rt6i_table->tb6_id;
2371	else
2372		table = RT6_TABLE_UNSPEC;
2373	rtm->rtm_table = table;
2374	NLA_PUT_U32(skb, RTA_TABLE, table);
2375	if (rt->rt6i_flags&RTF_REJECT)
2376		rtm->rtm_type = RTN_UNREACHABLE;
2377	else if (rt->rt6i_flags&RTF_LOCAL)
2378		rtm->rtm_type = RTN_LOCAL;
2379	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2380		rtm->rtm_type = RTN_LOCAL;
2381	else
2382		rtm->rtm_type = RTN_UNICAST;
2383	rtm->rtm_flags = 0;
2384	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2385	rtm->rtm_protocol = rt->rt6i_protocol;
2386	if (rt->rt6i_flags&RTF_DYNAMIC)
2387		rtm->rtm_protocol = RTPROT_REDIRECT;
2388	else if (rt->rt6i_flags & RTF_ADDRCONF)
2389		rtm->rtm_protocol = RTPROT_KERNEL;
2390	else if (rt->rt6i_flags&RTF_DEFAULT)
2391		rtm->rtm_protocol = RTPROT_RA;
2392
2393	if (rt->rt6i_flags&RTF_CACHE)
2394		rtm->rtm_flags |= RTM_F_CLONED;
2395
2396	if (dst) {
2397		NLA_PUT(skb, RTA_DST, 16, dst);
2398		rtm->rtm_dst_len = 128;
2399	} else if (rtm->rtm_dst_len)
2400		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2401#ifdef CONFIG_IPV6_SUBTREES
2402	if (src) {
2403		NLA_PUT(skb, RTA_SRC, 16, src);
2404		rtm->rtm_src_len = 128;
2405	} else if (rtm->rtm_src_len)
2406		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2407#endif
2408	if (iif) {
2409#ifdef CONFIG_IPV6_MROUTE
2410		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2411			int err = ip6mr_get_route(net, skb, rtm, nowait);
2412			if (err <= 0) {
2413				if (!nowait) {
2414					if (err == 0)
2415						return 0;
2416					goto nla_put_failure;
2417				} else {
2418					if (err == -EMSGSIZE)
2419						goto nla_put_failure;
2420				}
2421			}
2422		} else
2423#endif
2424			NLA_PUT_U32(skb, RTA_IIF, iif);
2425	} else if (dst) {
2426		struct in6_addr saddr_buf;
2427		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2428			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2429	}
2430
2431	if (rt->rt6i_prefsrc.plen) {
2432		struct in6_addr saddr_buf;
2433		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2434		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2435	}
2436
2437	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2438		goto nla_put_failure;
2439
2440	rcu_read_lock();
2441	n = dst_get_neighbour(&rt->dst);
2442	if (n)
2443		NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2444	rcu_read_unlock();
2445
2446	if (rt->dst.dev)
2447		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2448
2449	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2450
2451	if (!(rt->rt6i_flags & RTF_EXPIRES))
2452		expires = 0;
2453	else if (rt->rt6i_expires - jiffies < INT_MAX)
2454		expires = rt->rt6i_expires - jiffies;
2455	else
2456		expires = INT_MAX;
2457
2458	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2459			       expires, rt->dst.error) < 0)
2460		goto nla_put_failure;
2461
2462	return nlmsg_end(skb, nlh);
2463
2464nla_put_failure:
2465	nlmsg_cancel(skb, nlh);
2466	return -EMSGSIZE;
2467}
2468
2469int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2470{
2471	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2472	int prefix;
2473
2474	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2475		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2476		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2477	} else
2478		prefix = 0;
2479
2480	return rt6_fill_node(arg->net,
2481		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2482		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2483		     prefix, 0, NLM_F_MULTI);
2484}
2485
2486static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2487{
2488	struct net *net = sock_net(in_skb->sk);
2489	struct nlattr *tb[RTA_MAX+1];
2490	struct rt6_info *rt;
2491	struct sk_buff *skb;
2492	struct rtmsg *rtm;
2493	struct flowi6 fl6;
2494	int err, iif = 0;
2495
2496	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2497	if (err < 0)
2498		goto errout;
2499
2500	err = -EINVAL;
2501	memset(&fl6, 0, sizeof(fl6));
2502
2503	if (tb[RTA_SRC]) {
2504		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2505			goto errout;
2506
2507		ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2508	}
2509
2510	if (tb[RTA_DST]) {
2511		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2512			goto errout;
2513
2514		ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2515	}
2516
2517	if (tb[RTA_IIF])
2518		iif = nla_get_u32(tb[RTA_IIF]);
2519
2520	if (tb[RTA_OIF])
2521		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2522
2523	if (iif) {
2524		struct net_device *dev;
2525		dev = __dev_get_by_index(net, iif);
2526		if (!dev) {
2527			err = -ENODEV;
2528			goto errout;
2529		}
2530	}
2531
2532	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2533	if (skb == NULL) {
2534		err = -ENOBUFS;
2535		goto errout;
2536	}
2537
2538	/* Reserve room for dummy headers, this skb can pass
2539	   through good chunk of routing engine.
2540	 */
2541	skb_reset_mac_header(skb);
2542	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2543
2544	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2545	skb_dst_set(skb, &rt->dst);
2546
2547	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2548			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2549			    nlh->nlmsg_seq, 0, 0, 0);
2550	if (err < 0) {
2551		kfree_skb(skb);
2552		goto errout;
2553	}
2554
2555	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2556errout:
2557	return err;
2558}
2559
2560void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2561{
2562	struct sk_buff *skb;
2563	struct net *net = info->nl_net;
2564	u32 seq;
2565	int err;
2566
2567	err = -ENOBUFS;
2568	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2569
2570	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2571	if (skb == NULL)
2572		goto errout;
2573
2574	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2575				event, info->pid, seq, 0, 0, 0);
2576	if (err < 0) {
2577		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2578		WARN_ON(err == -EMSGSIZE);
2579		kfree_skb(skb);
2580		goto errout;
2581	}
2582	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2583		    info->nlh, gfp_any());
2584	return;
2585errout:
2586	if (err < 0)
2587		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2588}
2589
2590static int ip6_route_dev_notify(struct notifier_block *this,
2591				unsigned long event, void *data)
2592{
2593	struct net_device *dev = (struct net_device *)data;
2594	struct net *net = dev_net(dev);
2595
2596	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2597		net->ipv6.ip6_null_entry->dst.dev = dev;
2598		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2599#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2600		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2601		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2602		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2603		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2604#endif
2605	}
2606
2607	return NOTIFY_OK;
2608}
2609
2610/*
2611 *	/proc
2612 */
2613
2614#ifdef CONFIG_PROC_FS
2615
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in this section of the file references this
 * structure; confirm whether it is still used anywhere.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2624
2625static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2626{
2627	struct seq_file *m = p_arg;
2628	struct neighbour *n;
2629
2630	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2631
2632#ifdef CONFIG_IPV6_SUBTREES
2633	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2634#else
2635	seq_puts(m, "00000000000000000000000000000000 00 ");
2636#endif
2637	rcu_read_lock();
2638	n = dst_get_neighbour(&rt->dst);
2639	if (n) {
2640		seq_printf(m, "%pi6", n->primary_key);
2641	} else {
2642		seq_puts(m, "00000000000000000000000000000000");
2643	}
2644	rcu_read_unlock();
2645	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2646		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2647		   rt->dst.__use, rt->rt6i_flags,
2648		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2649	return 0;
2650}
2651
2652static int ipv6_route_show(struct seq_file *m, void *v)
2653{
2654	struct net *net = (struct net *)m->private;
2655	fib6_clean_all(net, rt6_info_route, 0, m);
2656	return 0;
2657}
2658
2659static int ipv6_route_open(struct inode *inode, struct file *file)
2660{
2661	return single_open_net(inode, file, ipv6_route_show);
2662}
2663
2664static const struct file_operations ipv6_route_proc_fops = {
2665	.owner		= THIS_MODULE,
2666	.open		= ipv6_route_open,
2667	.read		= seq_read,
2668	.llseek		= seq_lseek,
2669	.release	= single_release_net,
2670};
2671
2672static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2673{
2674	struct net *net = (struct net *)seq->private;
2675	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2676		   net->ipv6.rt6_stats->fib_nodes,
2677		   net->ipv6.rt6_stats->fib_route_nodes,
2678		   net->ipv6.rt6_stats->fib_rt_alloc,
2679		   net->ipv6.rt6_stats->fib_rt_entries,
2680		   net->ipv6.rt6_stats->fib_rt_cache,
2681		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2682		   net->ipv6.rt6_stats->fib_discarded_routes);
2683
2684	return 0;
2685}
2686
2687static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2688{
2689	return single_open_net(inode, file, rt6_stats_seq_show);
2690}
2691
2692static const struct file_operations rt6_stats_seq_fops = {
2693	.owner	 = THIS_MODULE,
2694	.open	 = rt6_stats_seq_open,
2695	.read	 = seq_read,
2696	.llseek	 = seq_lseek,
2697	.release = single_release_net,
2698};
2699#endif	/* CONFIG_PROC_FS */
2700
2701#ifdef CONFIG_SYSCTL
2702
2703static
2704int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2705			      void __user *buffer, size_t *lenp, loff_t *ppos)
2706{
2707	struct net *net;
2708	int delay;
2709	if (!write)
2710		return -EINVAL;
2711
2712	net = (struct net *)ctl->extra1;
2713	delay = net->ipv6.sysctl.flush_delay;
2714	proc_dointvec(ctl, write, buffer, lenp, ppos);
2715	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2716	return 0;
2717}
2718
2719ctl_table ipv6_route_table_template[] = {
2720	{
2721		.procname	=	"flush",
2722		.data		=	&init_net.ipv6.sysctl.flush_delay,
2723		.maxlen		=	sizeof(int),
2724		.mode		=	0200,
2725		.proc_handler	=	ipv6_sysctl_rtcache_flush
2726	},
2727	{
2728		.procname	=	"gc_thresh",
2729		.data		=	&ip6_dst_ops_template.gc_thresh,
2730		.maxlen		=	sizeof(int),
2731		.mode		=	0644,
2732		.proc_handler	=	proc_dointvec,
2733	},
2734	{
2735		.procname	=	"max_size",
2736		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
2737		.maxlen		=	sizeof(int),
2738		.mode		=	0644,
2739		.proc_handler	=	proc_dointvec,
2740	},
2741	{
2742		.procname	=	"gc_min_interval",
2743		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2744		.maxlen		=	sizeof(int),
2745		.mode		=	0644,
2746		.proc_handler	=	proc_dointvec_jiffies,
2747	},
2748	{
2749		.procname	=	"gc_timeout",
2750		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2751		.maxlen		=	sizeof(int),
2752		.mode		=	0644,
2753		.proc_handler	=	proc_dointvec_jiffies,
2754	},
2755	{
2756		.procname	=	"gc_interval",
2757		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
2758		.maxlen		=	sizeof(int),
2759		.mode		=	0644,
2760		.proc_handler	=	proc_dointvec_jiffies,
2761	},
2762	{
2763		.procname	=	"gc_elasticity",
2764		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2765		.maxlen		=	sizeof(int),
2766		.mode		=	0644,
2767		.proc_handler	=	proc_dointvec,
2768	},
2769	{
2770		.procname	=	"mtu_expires",
2771		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2772		.maxlen		=	sizeof(int),
2773		.mode		=	0644,
2774		.proc_handler	=	proc_dointvec_jiffies,
2775	},
2776	{
2777		.procname	=	"min_adv_mss",
2778		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
2779		.maxlen		=	sizeof(int),
2780		.mode		=	0644,
2781		.proc_handler	=	proc_dointvec,
2782	},
2783	{
2784		.procname	=	"gc_min_interval_ms",
2785		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2786		.maxlen		=	sizeof(int),
2787		.mode		=	0644,
2788		.proc_handler	=	proc_dointvec_ms_jiffies,
2789	},
2790	{ }
2791};
2792
2793struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2794{
2795	struct ctl_table *table;
2796
2797	table = kmemdup(ipv6_route_table_template,
2798			sizeof(ipv6_route_table_template),
2799			GFP_KERNEL);
2800
2801	if (table) {
2802		table[0].data = &net->ipv6.sysctl.flush_delay;
2803		table[0].extra1 = net;
2804		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2805		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2806		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2807		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2808		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2809		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2810		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2811		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2812		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2813	}
2814
2815	return table;
2816}
2817#endif
2818
2819static int __net_init ip6_route_net_init(struct net *net)
2820{
2821	int ret = -ENOMEM;
2822
2823	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2824	       sizeof(net->ipv6.ip6_dst_ops));
2825
2826	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2827		goto out_ip6_dst_ops;
2828
2829	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2830					   sizeof(*net->ipv6.ip6_null_entry),
2831					   GFP_KERNEL);
2832	if (!net->ipv6.ip6_null_entry)
2833		goto out_ip6_dst_entries;
2834	net->ipv6.ip6_null_entry->dst.path =
2835		(struct dst_entry *)net->ipv6.ip6_null_entry;
2836	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2837	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2838			 ip6_template_metrics, true);
2839
2840#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2841	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2842					       sizeof(*net->ipv6.ip6_prohibit_entry),
2843					       GFP_KERNEL);
2844	if (!net->ipv6.ip6_prohibit_entry)
2845		goto out_ip6_null_entry;
2846	net->ipv6.ip6_prohibit_entry->dst.path =
2847		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2848	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2849	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2850			 ip6_template_metrics, true);
2851
2852	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2853					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2854					       GFP_KERNEL);
2855	if (!net->ipv6.ip6_blk_hole_entry)
2856		goto out_ip6_prohibit_entry;
2857	net->ipv6.ip6_blk_hole_entry->dst.path =
2858		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2859	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2860	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2861			 ip6_template_metrics, true);
2862#endif
2863
2864	net->ipv6.sysctl.flush_delay = 0;
2865	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2866	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2867	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2868	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2869	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2870	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2871	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2872
2873#ifdef CONFIG_PROC_FS
2874	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2875	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2876#endif
2877	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2878
2879	ret = 0;
2880out:
2881	return ret;
2882
2883#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2884out_ip6_prohibit_entry:
2885	kfree(net->ipv6.ip6_prohibit_entry);
2886out_ip6_null_entry:
2887	kfree(net->ipv6.ip6_null_entry);
2888#endif
2889out_ip6_dst_entries:
2890	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2891out_ip6_dst_ops:
2892	goto out;
2893}
2894
2895static void __net_exit ip6_route_net_exit(struct net *net)
2896{
2897#ifdef CONFIG_PROC_FS
2898	proc_net_remove(net, "ipv6_route");
2899	proc_net_remove(net, "rt6_stats");
2900#endif
2901	kfree(net->ipv6.ip6_null_entry);
2902#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2903	kfree(net->ipv6.ip6_prohibit_entry);
2904	kfree(net->ipv6.ip6_blk_hole_entry);
2905#endif
2906	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2907}
2908
2909static struct pernet_operations ip6_route_net_ops = {
2910	.init = ip6_route_net_init,
2911	.exit = ip6_route_net_exit,
2912};
2913
2914static struct notifier_block ip6_route_dev_notifier = {
2915	.notifier_call = ip6_route_dev_notify,
2916	.priority = 0,
2917};
2918
2919int __init ip6_route_init(void)
2920{
2921	int ret;
2922
2923	ret = -ENOMEM;
2924	ip6_dst_ops_template.kmem_cachep =
2925		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2926				  SLAB_HWCACHE_ALIGN, NULL);
2927	if (!ip6_dst_ops_template.kmem_cachep)
2928		goto out;
2929
2930	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2931	if (ret)
2932		goto out_kmem_cache;
2933
2934	ret = register_pernet_subsys(&ip6_route_net_ops);
2935	if (ret)
2936		goto out_dst_entries;
2937
2938	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2939
2940	/* Registering of the loopback is done before this portion of code,
2941	 * the loopback reference in rt6_info will not be taken, do it
2942	 * manually for init_net */
2943	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2944	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2945  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2946	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2947	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2948	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2949	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2950  #endif
2951	ret = fib6_init();
2952	if (ret)
2953		goto out_register_subsys;
2954
2955	ret = xfrm6_init();
2956	if (ret)
2957		goto out_fib6_init;
2958
2959	ret = fib6_rules_init();
2960	if (ret)
2961		goto xfrm6_init;
2962
2963	ret = -ENOBUFS;
2964	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2965	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2966	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2967		goto fib6_rules_init;
2968
2969	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2970	if (ret)
2971		goto fib6_rules_init;
2972
2973out:
2974	return ret;
2975
2976fib6_rules_init:
2977	fib6_rules_cleanup();
2978xfrm6_init:
2979	xfrm6_fini();
2980out_fib6_init:
2981	fib6_gc_cleanup();
2982out_register_subsys:
2983	unregister_pernet_subsys(&ip6_route_net_ops);
2984out_dst_entries:
2985	dst_entries_destroy(&ip6_dst_blackhole_ops);
2986out_kmem_cache:
2987	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2988	goto out;
2989}
2990
2991void ip6_route_cleanup(void)
2992{
2993	unregister_netdevice_notifier(&ip6_route_dev_notifier);
2994	fib6_rules_cleanup();
2995	xfrm6_fini();
2996	fib6_gc_cleanup();
2997	unregister_pernet_subsys(&ip6_route_net_ops);
2998	dst_entries_destroy(&ip6_dst_blackhole_ops);
2999	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3000}
v3.1
   1/*
   2 *	Linux INET6 implementation
   3 *	FIB front-end.
   4 *
   5 *	Authors:
   6 *	Pedro Roque		<roque@di.fc.ul.pt>
   7 *
   8 *	This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 */
  13
  14/*	Changes:
  15 *
  16 *	YOSHIFUJI Hideaki @USAGI
  17 *		reworked default router selection.
  18 *		- respect outgoing interface
  19 *		- select from (probably) reachable routers (i.e.
  20 *		routers in REACHABLE, STALE, DELAY or PROBE states).
  21 *		- always select the same router if it is (probably)
  22 *		reachable.  otherwise, round-robin the list.
  23 *	Ville Nuorvala
  24 *		Fixed routing subtrees.
  25 */
  26
  27#include <linux/capability.h>
  28#include <linux/errno.h>
  29#include <linux/types.h>
  30#include <linux/times.h>
  31#include <linux/socket.h>
  32#include <linux/sockios.h>
  33#include <linux/net.h>
  34#include <linux/route.h>
  35#include <linux/netdevice.h>
  36#include <linux/in6.h>
  37#include <linux/mroute6.h>
  38#include <linux/init.h>
  39#include <linux/if_arp.h>
  40#include <linux/proc_fs.h>
  41#include <linux/seq_file.h>
  42#include <linux/nsproxy.h>
  43#include <linux/slab.h>
  44#include <net/net_namespace.h>
  45#include <net/snmp.h>
  46#include <net/ipv6.h>
  47#include <net/ip6_fib.h>
  48#include <net/ip6_route.h>
  49#include <net/ndisc.h>
  50#include <net/addrconf.h>
  51#include <net/tcp.h>
  52#include <linux/rtnetlink.h>
  53#include <net/dst.h>
  54#include <net/xfrm.h>
  55#include <net/netevent.h>
  56#include <net/netlink.h>
  57
  58#include <asm/uaccess.h>
  59
  60#ifdef CONFIG_SYSCTL
  61#include <linux/sysctl.h>
  62#endif
  63
  64/* Set to 3 to get tracing. */
  65#define RT6_DEBUG 2
  66
  67#if RT6_DEBUG >= 3
  68#define RDBG(x) printk x
  69#define RT6_TRACE(x...) printk(KERN_DEBUG x)
  70#else
  71#define RDBG(x)
  72#define RT6_TRACE(x...) do { ; } while (0)
  73#endif
  74
  75static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
  76				    const struct in6_addr *dest);
  77static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
  78static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
  79static unsigned int	 ip6_default_mtu(const struct dst_entry *dst);
  80static struct dst_entry *ip6_negative_advice(struct dst_entry *);
  81static void		ip6_dst_destroy(struct dst_entry *);
  82static void		ip6_dst_ifdown(struct dst_entry *,
  83				       struct net_device *dev, int how);
  84static int		 ip6_dst_gc(struct dst_ops *ops);
  85
  86static int		ip6_pkt_discard(struct sk_buff *skb);
  87static int		ip6_pkt_discard_out(struct sk_buff *skb);
  88static void		ip6_link_failure(struct sk_buff *skb);
  89static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  90
  91#ifdef CONFIG_IPV6_ROUTE_INFO
  92static struct rt6_info *rt6_add_route_info(struct net *net,
  93					   const struct in6_addr *prefix, int prefixlen,
  94					   const struct in6_addr *gwaddr, int ifindex,
  95					   unsigned pref);
  96static struct rt6_info *rt6_get_route_info(struct net *net,
  97					   const struct in6_addr *prefix, int prefixlen,
  98					   const struct in6_addr *gwaddr, int ifindex);
  99#endif
 100
 101static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 102{
 103	struct rt6_info *rt = (struct rt6_info *) dst;
 104	struct inet_peer *peer;
 105	u32 *p = NULL;
 106
 107	if (!(rt->dst.flags & DST_HOST))
 108		return NULL;
 109
 110	if (!rt->rt6i_peer)
 111		rt6_bind_peer(rt, 1);
 112
 113	peer = rt->rt6i_peer;
 114	if (peer) {
 115		u32 *old_p = __DST_METRICS_PTR(old);
 116		unsigned long prev, new;
 117
 118		p = peer->metrics;
 119		if (inet_metrics_new(peer))
 120			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 121
 122		new = (unsigned long) p;
 123		prev = cmpxchg(&dst->_metrics, old, new);
 124
 125		if (prev != old) {
 126			p = __DST_METRICS_PTR(prev);
 127			if (prev & DST_METRICS_READ_ONLY)
 128				p = NULL;
 129		}
 130	}
 131	return p;
 132}
 133
 134static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
 135{
 136	return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
 137}
 138
 139static struct dst_ops ip6_dst_ops_template = {
 140	.family			=	AF_INET6,
 141	.protocol		=	cpu_to_be16(ETH_P_IPV6),
 142	.gc			=	ip6_dst_gc,
 143	.gc_thresh		=	1024,
 144	.check			=	ip6_dst_check,
 145	.default_advmss		=	ip6_default_advmss,
 146	.default_mtu		=	ip6_default_mtu,
 147	.cow_metrics		=	ipv6_cow_metrics,
 148	.destroy		=	ip6_dst_destroy,
 149	.ifdown			=	ip6_dst_ifdown,
 150	.negative_advice	=	ip6_negative_advice,
 151	.link_failure		=	ip6_link_failure,
 152	.update_pmtu		=	ip6_rt_update_pmtu,
 153	.local_out		=	__ip6_local_out,
 154	.neigh_lookup		=	ip6_neigh_lookup,
 155};
 156
 157static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
 158{
 159	return 0;
 160}
 161
 162static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 163{
 164}
 165
 166static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
 167					 unsigned long old)
 168{
 169	return NULL;
 170}
 171
 172static struct dst_ops ip6_dst_blackhole_ops = {
 173	.family			=	AF_INET6,
 174	.protocol		=	cpu_to_be16(ETH_P_IPV6),
 175	.destroy		=	ip6_dst_destroy,
 176	.check			=	ip6_dst_check,
 177	.default_mtu		=	ip6_blackhole_default_mtu,
 178	.default_advmss		=	ip6_default_advmss,
 179	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
 180	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
 181	.neigh_lookup		=	ip6_neigh_lookup,
 182};
 183
 184static const u32 ip6_template_metrics[RTAX_MAX] = {
 185	[RTAX_HOPLIMIT - 1] = 255,
 186};
 187
 188static struct rt6_info ip6_null_entry_template = {
 189	.dst = {
 190		.__refcnt	= ATOMIC_INIT(1),
 191		.__use		= 1,
 192		.obsolete	= -1,
 193		.error		= -ENETUNREACH,
 194		.input		= ip6_pkt_discard,
 195		.output		= ip6_pkt_discard_out,
 196	},
 197	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 198	.rt6i_protocol  = RTPROT_KERNEL,
 199	.rt6i_metric	= ~(u32) 0,
 200	.rt6i_ref	= ATOMIC_INIT(1),
 201};
 202
 203#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 204
 205static int ip6_pkt_prohibit(struct sk_buff *skb);
 206static int ip6_pkt_prohibit_out(struct sk_buff *skb);
 207
 208static struct rt6_info ip6_prohibit_entry_template = {
 209	.dst = {
 210		.__refcnt	= ATOMIC_INIT(1),
 211		.__use		= 1,
 212		.obsolete	= -1,
 213		.error		= -EACCES,
 214		.input		= ip6_pkt_prohibit,
 215		.output		= ip6_pkt_prohibit_out,
 216	},
 217	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 218	.rt6i_protocol  = RTPROT_KERNEL,
 219	.rt6i_metric	= ~(u32) 0,
 220	.rt6i_ref	= ATOMIC_INIT(1),
 221};
 222
 223static struct rt6_info ip6_blk_hole_entry_template = {
 224	.dst = {
 225		.__refcnt	= ATOMIC_INIT(1),
 226		.__use		= 1,
 227		.obsolete	= -1,
 228		.error		= -EINVAL,
 229		.input		= dst_discard,
 230		.output		= dst_discard,
 231	},
 232	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
 233	.rt6i_protocol  = RTPROT_KERNEL,
 234	.rt6i_metric	= ~(u32) 0,
 235	.rt6i_ref	= ATOMIC_INIT(1),
 236};
 237
 238#endif
 239
 240/* allocate dst with ip6_dst_ops */
 241static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
 242					     struct net_device *dev,
 243					     int flags)
 244{
 245	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
 246
 247	if (rt != NULL)
 248		memset(&rt->rt6i_table, 0,
 249			sizeof(*rt) - sizeof(struct dst_entry));
 250
 251	return rt;
 252}
 253
 254static void ip6_dst_destroy(struct dst_entry *dst)
 255{
 256	struct rt6_info *rt = (struct rt6_info *)dst;
 257	struct inet6_dev *idev = rt->rt6i_idev;
 258	struct inet_peer *peer = rt->rt6i_peer;
 259
 260	if (!(rt->dst.flags & DST_HOST))
 261		dst_destroy_metrics_generic(dst);
 262
 263	if (idev != NULL) {
 264		rt->rt6i_idev = NULL;
 265		in6_dev_put(idev);
 266	}
 267	if (peer) {
 268		rt->rt6i_peer = NULL;
 269		inet_putpeer(peer);
 270	}
 271}
 272
 273static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
 274
 275static u32 rt6_peer_genid(void)
 276{
 277	return atomic_read(&__rt6_peer_genid);
 278}
 279
 280void rt6_bind_peer(struct rt6_info *rt, int create)
 281{
 282	struct inet_peer *peer;
 283
 284	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
 285	if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
 286		inet_putpeer(peer);
 287	else
 288		rt->rt6i_peer_genid = rt6_peer_genid();
 289}
 290
 291static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 292			   int how)
 293{
 294	struct rt6_info *rt = (struct rt6_info *)dst;
 295	struct inet6_dev *idev = rt->rt6i_idev;
 296	struct net_device *loopback_dev =
 297		dev_net(dev)->loopback_dev;
 298
 299	if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
 300		struct inet6_dev *loopback_idev =
 301			in6_dev_get(loopback_dev);
 302		if (loopback_idev != NULL) {
 303			rt->rt6i_idev = loopback_idev;
 304			in6_dev_put(idev);
 305		}
 306	}
 307}
 308
 309static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 310{
 311	return (rt->rt6i_flags & RTF_EXPIRES) &&
 312		time_after(jiffies, rt->rt6i_expires);
 313}
 314
 315static inline int rt6_need_strict(const struct in6_addr *daddr)
 316{
 317	return ipv6_addr_type(daddr) &
 318		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 319}
 320
 321/*
 322 *	Route lookup. Any table->tb6_lock is implied.
 323 */
 324
 325static inline struct rt6_info *rt6_device_match(struct net *net,
 326						    struct rt6_info *rt,
 327						    const struct in6_addr *saddr,
 328						    int oif,
 329						    int flags)
 330{
 331	struct rt6_info *local = NULL;
 332	struct rt6_info *sprt;
 333
 334	if (!oif && ipv6_addr_any(saddr))
 335		goto out;
 336
 337	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
 338		struct net_device *dev = sprt->rt6i_dev;
 339
 340		if (oif) {
 341			if (dev->ifindex == oif)
 342				return sprt;
 343			if (dev->flags & IFF_LOOPBACK) {
 344				if (sprt->rt6i_idev == NULL ||
 345				    sprt->rt6i_idev->dev->ifindex != oif) {
 346					if (flags & RT6_LOOKUP_F_IFACE && oif)
 347						continue;
 348					if (local && (!oif ||
 349						      local->rt6i_idev->dev->ifindex == oif))
 350						continue;
 351				}
 352				local = sprt;
 353			}
 354		} else {
 355			if (ipv6_chk_addr(net, saddr, dev,
 356					  flags & RT6_LOOKUP_F_IFACE))
 357				return sprt;
 358		}
 359	}
 360
 361	if (oif) {
 362		if (local)
 363			return local;
 364
 365		if (flags & RT6_LOOKUP_F_IFACE)
 366			return net->ipv6.ip6_null_entry;
 367	}
 368out:
 369	return rt;
 370}
 371
 372#ifdef CONFIG_IPV6_ROUTER_PREF
 373static void rt6_probe(struct rt6_info *rt)
 374{
 375	struct neighbour *neigh;
 376	/*
 377	 * Okay, this does not seem to be appropriate
 378	 * for now, however, we need to check if it
 379	 * is really so; aka Router Reachability Probing.
 380	 *
 381	 * Router Reachability Probe MUST be rate-limited
 382	 * to no more than one per minute.
 383	 */
 384	rcu_read_lock();
 385	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
 386	if (!neigh || (neigh->nud_state & NUD_VALID))
 387		goto out;
 388	read_lock_bh(&neigh->lock);
 389	if (!(neigh->nud_state & NUD_VALID) &&
 390	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 391		struct in6_addr mcaddr;
 392		struct in6_addr *target;
 393
 394		neigh->updated = jiffies;
 395		read_unlock_bh(&neigh->lock);
 396
 397		target = (struct in6_addr *)&neigh->primary_key;
 398		addrconf_addr_solict_mult(target, &mcaddr);
 399		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
 400	} else {
 401		read_unlock_bh(&neigh->lock);
 402	}
 403out:
 404	rcu_read_unlock();
 405}
 406#else
 407static inline void rt6_probe(struct rt6_info *rt)
 408{
 409}
 410#endif
 411
 412/*
 413 * Default Router Selection (RFC 2461 6.3.6)
 414 */
 415static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 416{
 417	struct net_device *dev = rt->rt6i_dev;
 418	if (!oif || dev->ifindex == oif)
 419		return 2;
 420	if ((dev->flags & IFF_LOOPBACK) &&
 421	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
 422		return 1;
 423	return 0;
 424}
 425
 426static inline int rt6_check_neigh(struct rt6_info *rt)
 427{
 428	struct neighbour *neigh;
 429	int m;
 430
 431	rcu_read_lock();
 432	neigh = dst_get_neighbour(&rt->dst);
 433	if (rt->rt6i_flags & RTF_NONEXTHOP ||
 434	    !(rt->rt6i_flags & RTF_GATEWAY))
 435		m = 1;
 436	else if (neigh) {
 437		read_lock_bh(&neigh->lock);
 438		if (neigh->nud_state & NUD_VALID)
 439			m = 2;
 440#ifdef CONFIG_IPV6_ROUTER_PREF
 441		else if (neigh->nud_state & NUD_FAILED)
 442			m = 0;
 443#endif
 444		else
 445			m = 1;
 446		read_unlock_bh(&neigh->lock);
 447	} else
 448		m = 0;
 449	rcu_read_unlock();
 450	return m;
 451}
 452
 453static int rt6_score_route(struct rt6_info *rt, int oif,
 454			   int strict)
 455{
 456	int m, n;
 457
 458	m = rt6_check_dev(rt, oif);
 459	if (!m && (strict & RT6_LOOKUP_F_IFACE))
 460		return -1;
 461#ifdef CONFIG_IPV6_ROUTER_PREF
 462	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 463#endif
 464	n = rt6_check_neigh(rt);
 465	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
 466		return -1;
 467	return m;
 468}
 469
 470static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 471				   int *mpri, struct rt6_info *match)
 472{
 473	int m;
 474
 475	if (rt6_check_expired(rt))
 476		goto out;
 477
 478	m = rt6_score_route(rt, oif, strict);
 479	if (m < 0)
 480		goto out;
 481
 482	if (m > *mpri) {
 483		if (strict & RT6_LOOKUP_F_REACHABLE)
 484			rt6_probe(match);
 485		*mpri = m;
 486		match = rt;
 487	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
 488		rt6_probe(rt);
 489	}
 490
 491out:
 492	return match;
 493}
 494
 495static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 496				     struct rt6_info *rr_head,
 497				     u32 metric, int oif, int strict)
 498{
 499	struct rt6_info *rt, *match;
 500	int mpri = -1;
 501
 502	match = NULL;
 503	for (rt = rr_head; rt && rt->rt6i_metric == metric;
 504	     rt = rt->dst.rt6_next)
 505		match = find_match(rt, oif, strict, &mpri, match);
 506	for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
 507	     rt = rt->dst.rt6_next)
 508		match = find_match(rt, oif, strict, &mpri, match);
 509
 510	return match;
 511}
 512
 513static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 514{
 515	struct rt6_info *match, *rt0;
 516	struct net *net;
 517
 518	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
 519		  __func__, fn->leaf, oif);
 520
 521	rt0 = fn->rr_ptr;
 522	if (!rt0)
 523		fn->rr_ptr = rt0 = fn->leaf;
 524
 525	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 526
 527	if (!match &&
 528	    (strict & RT6_LOOKUP_F_REACHABLE)) {
 529		struct rt6_info *next = rt0->dst.rt6_next;
 530
 531		/* no entries matched; do round-robin */
 532		if (!next || next->rt6i_metric != rt0->rt6i_metric)
 533			next = fn->leaf;
 534
 535		if (next != rt0)
 536			fn->rr_ptr = next;
 537	}
 538
 539	RT6_TRACE("%s() => %p\n",
 540		  __func__, match);
 541
 542	net = dev_net(rt0->rt6i_dev);
 543	return match ? match : net->ipv6.ip6_null_entry;
 544}
 545
 546#ifdef CONFIG_IPV6_ROUTE_INFO
 547int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 548		  const struct in6_addr *gwaddr)
 549{
 550	struct net *net = dev_net(dev);
 551	struct route_info *rinfo = (struct route_info *) opt;
 552	struct in6_addr prefix_buf, *prefix;
 553	unsigned int pref;
 554	unsigned long lifetime;
 555	struct rt6_info *rt;
 556
 557	if (len < sizeof(struct route_info)) {
 558		return -EINVAL;
 559	}
 560
 561	/* Sanity check for prefix_len and length */
 562	if (rinfo->length > 3) {
 563		return -EINVAL;
 564	} else if (rinfo->prefix_len > 128) {
 565		return -EINVAL;
 566	} else if (rinfo->prefix_len > 64) {
 567		if (rinfo->length < 2) {
 568			return -EINVAL;
 569		}
 570	} else if (rinfo->prefix_len > 0) {
 571		if (rinfo->length < 1) {
 572			return -EINVAL;
 573		}
 574	}
 575
 576	pref = rinfo->route_pref;
 577	if (pref == ICMPV6_ROUTER_PREF_INVALID)
 578		return -EINVAL;
 579
 580	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 581
 582	if (rinfo->length == 3)
 583		prefix = (struct in6_addr *)rinfo->prefix;
 584	else {
 585		/* this function is safe */
 586		ipv6_addr_prefix(&prefix_buf,
 587				 (struct in6_addr *)rinfo->prefix,
 588				 rinfo->prefix_len);
 589		prefix = &prefix_buf;
 590	}
 591
 592	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 593				dev->ifindex);
 594
 595	if (rt && !lifetime) {
 596		ip6_del_rt(rt);
 597		rt = NULL;
 598	}
 599
 600	if (!rt && lifetime)
 601		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
 602					pref);
 603	else if (rt)
 604		rt->rt6i_flags = RTF_ROUTEINFO |
 605				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 606
 607	if (rt) {
 608		if (!addrconf_finite_timeout(lifetime)) {
 609			rt->rt6i_flags &= ~RTF_EXPIRES;
 610		} else {
 611			rt->rt6i_expires = jiffies + HZ * lifetime;
 612			rt->rt6i_flags |= RTF_EXPIRES;
 613		}
 614		dst_release(&rt->dst);
 615	}
 616	return 0;
 617}
 618#endif
 619
 620#define BACKTRACK(__net, saddr)			\
 621do { \
 622	if (rt == __net->ipv6.ip6_null_entry) {	\
 623		struct fib6_node *pn; \
 624		while (1) { \
 625			if (fn->fn_flags & RTN_TL_ROOT) \
 626				goto out; \
 627			pn = fn->parent; \
 628			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
 629				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
 630			else \
 631				fn = pn; \
 632			if (fn->fn_flags & RTN_RTINFO) \
 633				goto restart; \
 634		} \
 635	} \
 636} while(0)
 637
 638static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 639					     struct fib6_table *table,
 640					     struct flowi6 *fl6, int flags)
 641{
 642	struct fib6_node *fn;
 643	struct rt6_info *rt;
 644
 645	read_lock_bh(&table->tb6_lock);
 646	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 647restart:
 648	rt = fn->leaf;
 649	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 650	BACKTRACK(net, &fl6->saddr);
 651out:
 652	dst_use(&rt->dst, jiffies);
 653	read_unlock_bh(&table->tb6_lock);
 654	return rt;
 655
 656}
 657
 658struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 659			    const struct in6_addr *saddr, int oif, int strict)
 660{
 661	struct flowi6 fl6 = {
 662		.flowi6_oif = oif,
 663		.daddr = *daddr,
 664	};
 665	struct dst_entry *dst;
 666	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 667
 668	if (saddr) {
 669		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
 670		flags |= RT6_LOOKUP_F_HAS_SADDR;
 671	}
 672
 673	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
 674	if (dst->error == 0)
 675		return (struct rt6_info *) dst;
 676
 677	dst_release(dst);
 678
 679	return NULL;
 680}
 681
 682EXPORT_SYMBOL(rt6_lookup);
 683
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold a
   reference to the route, it may be destroyed.
 */
 689
 690static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
 691{
 692	int err;
 693	struct fib6_table *table;
 694
 695	table = rt->rt6i_table;
 696	write_lock_bh(&table->tb6_lock);
 697	err = fib6_add(&table->tb6_root, rt, info);
 698	write_unlock_bh(&table->tb6_lock);
 699
 700	return err;
 701}
 702
 703int ip6_ins_rt(struct rt6_info *rt)
 704{
 705	struct nl_info info = {
 706		.nl_net = dev_net(rt->rt6i_dev),
 707	};
 708	return __ip6_ins_rt(rt, &info);
 709}
 710
 711static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
 712				      const struct in6_addr *daddr,
 713				      const struct in6_addr *saddr)
 714{
 715	struct rt6_info *rt;
 716
 717	/*
 718	 *	Clone the route.
 719	 */
 720
 721	rt = ip6_rt_copy(ort, daddr);
 722
 723	if (rt) {
 724		struct neighbour *neigh;
 725		int attempts = !in_softirq();
 726
 727		if (!(rt->rt6i_flags&RTF_GATEWAY)) {
 728			if (rt->rt6i_dst.plen != 128 &&
 729			    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
 730				rt->rt6i_flags |= RTF_ANYCAST;
 731			ipv6_addr_copy(&rt->rt6i_gateway, daddr);
 732		}
 733
 734		rt->rt6i_flags |= RTF_CACHE;
 735
 736#ifdef CONFIG_IPV6_SUBTREES
 737		if (rt->rt6i_src.plen && saddr) {
 738			ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
 739			rt->rt6i_src.plen = 128;
 740		}
 741#endif
 742
 743	retry:
 744		neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
 745		if (IS_ERR(neigh)) {
 746			struct net *net = dev_net(rt->rt6i_dev);
 747			int saved_rt_min_interval =
 748				net->ipv6.sysctl.ip6_rt_gc_min_interval;
 749			int saved_rt_elasticity =
 750				net->ipv6.sysctl.ip6_rt_gc_elasticity;
 751
 752			if (attempts-- > 0) {
 753				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
 754				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
 755
 756				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
 757
 758				net->ipv6.sysctl.ip6_rt_gc_elasticity =
 759					saved_rt_elasticity;
 760				net->ipv6.sysctl.ip6_rt_gc_min_interval =
 761					saved_rt_min_interval;
 762				goto retry;
 763			}
 764
 765			if (net_ratelimit())
 766				printk(KERN_WARNING
 767				       "ipv6: Neighbour table overflow.\n");
 768			dst_free(&rt->dst);
 769			return NULL;
 770		}
 771		dst_set_neighbour(&rt->dst, neigh);
 772
 773	}
 774
 775	return rt;
 776}
 777
 778static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 779					const struct in6_addr *daddr)
 780{
 781	struct rt6_info *rt = ip6_rt_copy(ort, daddr);
 782
 783	if (rt) {
 784		rt->rt6i_flags |= RTF_CACHE;
 785		dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
 786	}
 787	return rt;
 788}
 789
 790static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 791				      struct flowi6 *fl6, int flags)
 792{
 793	struct fib6_node *fn;
 794	struct rt6_info *rt, *nrt;
 795	int strict = 0;
 796	int attempts = 3;
 797	int err;
 798	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 799
 800	strict |= flags & RT6_LOOKUP_F_IFACE;
 801
 802relookup:
 803	read_lock_bh(&table->tb6_lock);
 804
 805restart_2:
 806	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 807
 808restart:
 809	rt = rt6_select(fn, oif, strict | reachable);
 810
 811	BACKTRACK(net, &fl6->saddr);
 812	if (rt == net->ipv6.ip6_null_entry ||
 813	    rt->rt6i_flags & RTF_CACHE)
 814		goto out;
 815
 816	dst_hold(&rt->dst);
 817	read_unlock_bh(&table->tb6_lock);
 818
 819	if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
 820		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
 821	else if (!(rt->dst.flags & DST_HOST))
 822		nrt = rt6_alloc_clone(rt, &fl6->daddr);
 823	else
 824		goto out2;
 825
 826	dst_release(&rt->dst);
 827	rt = nrt ? : net->ipv6.ip6_null_entry;
 828
 829	dst_hold(&rt->dst);
 830	if (nrt) {
 831		err = ip6_ins_rt(nrt);
 832		if (!err)
 833			goto out2;
 834	}
 835
 836	if (--attempts <= 0)
 837		goto out2;
 838
 839	/*
 840	 * Race condition! In the gap, when table->tb6_lock was
 841	 * released someone could insert this route.  Relookup.
 842	 */
 843	dst_release(&rt->dst);
 844	goto relookup;
 845
 846out:
 847	if (reachable) {
 848		reachable = 0;
 849		goto restart_2;
 850	}
 851	dst_hold(&rt->dst);
 852	read_unlock_bh(&table->tb6_lock);
 853out2:
 854	rt->dst.lastuse = jiffies;
 855	rt->dst.__use++;
 856
 857	return rt;
 858}
 859
 860static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
 861					    struct flowi6 *fl6, int flags)
 862{
 863	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 864}
 865
 866void ip6_route_input(struct sk_buff *skb)
 867{
 868	const struct ipv6hdr *iph = ipv6_hdr(skb);
 869	struct net *net = dev_net(skb->dev);
 870	int flags = RT6_LOOKUP_F_HAS_SADDR;
 871	struct flowi6 fl6 = {
 872		.flowi6_iif = skb->dev->ifindex,
 873		.daddr = iph->daddr,
 874		.saddr = iph->saddr,
 875		.flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
 876		.flowi6_mark = skb->mark,
 877		.flowi6_proto = iph->nexthdr,
 878	};
 879
 880	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
 881		flags |= RT6_LOOKUP_F_IFACE;
 882
 883	skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
 884}
 885
 886static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 887					     struct flowi6 *fl6, int flags)
 888{
 889	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
 890}
 891
 892struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
 893				    struct flowi6 *fl6)
 894{
 895	int flags = 0;
 896
 897	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
 898		flags |= RT6_LOOKUP_F_IFACE;
 899
 900	if (!ipv6_addr_any(&fl6->saddr))
 901		flags |= RT6_LOOKUP_F_HAS_SADDR;
 902	else if (sk)
 903		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
 904
 905	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
 906}
 907
 908EXPORT_SYMBOL(ip6_route_output);
 909
 910struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 911{
 912	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
 913	struct dst_entry *new = NULL;
 914
 915	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
 916	if (rt) {
 917		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
 918
 919		new = &rt->dst;
 920
 921		new->__use = 1;
 922		new->input = dst_discard;
 923		new->output = dst_discard;
 924
 925		if (dst_metrics_read_only(&ort->dst))
 926			new->_metrics = ort->dst._metrics;
 927		else
 928			dst_copy_metrics(new, &ort->dst);
 929		rt->rt6i_idev = ort->rt6i_idev;
 930		if (rt->rt6i_idev)
 931			in6_dev_hold(rt->rt6i_idev);
 932		rt->rt6i_expires = 0;
 933
 934		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
 935		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
 936		rt->rt6i_metric = 0;
 937
 938		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 939#ifdef CONFIG_IPV6_SUBTREES
 940		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
 941#endif
 942
 943		dst_free(new);
 944	}
 945
 946	dst_release(dst_orig);
 947	return new ? new : ERR_PTR(-ENOMEM);
 948}
 949
 950/*
 951 *	Destination cache support functions
 952 */
 953
 954static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 955{
 956	struct rt6_info *rt;
 957
 958	rt = (struct rt6_info *) dst;
 959
 960	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
 961		if (rt->rt6i_peer_genid != rt6_peer_genid()) {
 962			if (!rt->rt6i_peer)
 963				rt6_bind_peer(rt, 0);
 964			rt->rt6i_peer_genid = rt6_peer_genid();
 965		}
 966		return dst;
 967	}
 968	return NULL;
 969}
 970
 971static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 972{
 973	struct rt6_info *rt = (struct rt6_info *) dst;
 974
 975	if (rt) {
 976		if (rt->rt6i_flags & RTF_CACHE) {
 977			if (rt6_check_expired(rt)) {
 978				ip6_del_rt(rt);
 979				dst = NULL;
 980			}
 981		} else {
 982			dst_release(dst);
 983			dst = NULL;
 984		}
 985	}
 986	return dst;
 987}
 988
 989static void ip6_link_failure(struct sk_buff *skb)
 990{
 991	struct rt6_info *rt;
 992
 993	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
 994
 995	rt = (struct rt6_info *) skb_dst(skb);
 996	if (rt) {
 997		if (rt->rt6i_flags&RTF_CACHE) {
 998			dst_set_expires(&rt->dst, 0);
 999			rt->rt6i_flags |= RTF_EXPIRES;
1000		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1001			rt->rt6i_node->fn_sernum = -1;
1002	}
1003}
1004
1005static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1006{
1007	struct rt6_info *rt6 = (struct rt6_info*)dst;
1008
1009	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1010		rt6->rt6i_flags |= RTF_MODIFIED;
1011		if (mtu < IPV6_MIN_MTU) {
1012			u32 features = dst_metric(dst, RTAX_FEATURES);
1013			mtu = IPV6_MIN_MTU;
1014			features |= RTAX_FEATURE_ALLFRAG;
1015			dst_metric_set(dst, RTAX_FEATURES, features);
1016		}
1017		dst_metric_set(dst, RTAX_MTU, mtu);
1018	}
1019}
1020
1021static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1022{
1023	struct net_device *dev = dst->dev;
1024	unsigned int mtu = dst_mtu(dst);
1025	struct net *net = dev_net(dev);
1026
1027	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1028
1029	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1030		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1031
1032	/*
1033	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1034	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1035	 * IPV6_MAXPLEN is also valid and means: "any MSS,
1036	 * rely only on pmtu discovery"
1037	 */
1038	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1039		mtu = IPV6_MAXPLEN;
1040	return mtu;
1041}
1042
1043static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1044{
1045	unsigned int mtu = IPV6_MIN_MTU;
1046	struct inet6_dev *idev;
1047
1048	rcu_read_lock();
1049	idev = __in6_dev_get(dst->dev);
1050	if (idev)
1051		mtu = idev->cnf.mtu6;
1052	rcu_read_unlock();
1053
1054	return mtu;
1055}
1056
1057static struct dst_entry *icmp6_dst_gc_list;
1058static DEFINE_SPINLOCK(icmp6_dst_lock);
1059
1060struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1061				  struct neighbour *neigh,
1062				  const struct in6_addr *addr)
1063{
1064	struct rt6_info *rt;
1065	struct inet6_dev *idev = in6_dev_get(dev);
1066	struct net *net = dev_net(dev);
1067
1068	if (unlikely(idev == NULL))
1069		return NULL;
1070
1071	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1072	if (unlikely(rt == NULL)) {
1073		in6_dev_put(idev);
1074		goto out;
1075	}
1076
1077	if (neigh)
1078		neigh_hold(neigh);
1079	else {
1080		neigh = ndisc_get_neigh(dev, addr);
1081		if (IS_ERR(neigh))
1082			neigh = NULL;
1083	}
1084
1085	rt->dst.flags |= DST_HOST;
1086	rt->dst.output  = ip6_output;
1087	dst_set_neighbour(&rt->dst, neigh);
1088	atomic_set(&rt->dst.__refcnt, 1);
1089	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1090
1091	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1092	rt->rt6i_dst.plen = 128;
1093	rt->rt6i_idev     = idev;
1094
1095	spin_lock_bh(&icmp6_dst_lock);
1096	rt->dst.next = icmp6_dst_gc_list;
1097	icmp6_dst_gc_list = &rt->dst;
1098	spin_unlock_bh(&icmp6_dst_lock);
1099
1100	fib6_force_start_gc(net);
1101
1102out:
1103	return &rt->dst;
1104}
1105
1106int icmp6_dst_gc(void)
1107{
1108	struct dst_entry *dst, **pprev;
1109	int more = 0;
1110
1111	spin_lock_bh(&icmp6_dst_lock);
1112	pprev = &icmp6_dst_gc_list;
1113
1114	while ((dst = *pprev) != NULL) {
1115		if (!atomic_read(&dst->__refcnt)) {
1116			*pprev = dst->next;
1117			dst_free(dst);
1118		} else {
1119			pprev = &dst->next;
1120			++more;
1121		}
1122	}
1123
1124	spin_unlock_bh(&icmp6_dst_lock);
1125
1126	return more;
1127}
1128
1129static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1130			    void *arg)
1131{
1132	struct dst_entry *dst, **pprev;
1133
1134	spin_lock_bh(&icmp6_dst_lock);
1135	pprev = &icmp6_dst_gc_list;
1136	while ((dst = *pprev) != NULL) {
1137		struct rt6_info *rt = (struct rt6_info *) dst;
1138		if (func(rt, arg)) {
1139			*pprev = dst->next;
1140			dst_free(dst);
1141		} else {
1142			pprev = &dst->next;
1143		}
1144	}
1145	spin_unlock_bh(&icmp6_dst_lock);
1146}
1147
1148static int ip6_dst_gc(struct dst_ops *ops)
1149{
1150	unsigned long now = jiffies;
1151	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1152	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1153	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1154	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1155	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1156	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1157	int entries;
1158
1159	entries = dst_entries_get_fast(ops);
1160	if (time_after(rt_last_gc + rt_min_interval, now) &&
1161	    entries <= rt_max_size)
1162		goto out;
1163
1164	net->ipv6.ip6_rt_gc_expire++;
1165	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1166	net->ipv6.ip6_rt_last_gc = now;
1167	entries = dst_entries_get_slow(ops);
1168	if (entries < ops->gc_thresh)
1169		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1170out:
1171	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1172	return entries > rt_max_size;
1173}
1174
1175/* Clean host part of a prefix. Not necessary in radix tree,
1176   but results in cleaner routing tables.
1177
1178   Remove it only when all the things will work!
1179 */
1180
1181int ip6_dst_hoplimit(struct dst_entry *dst)
1182{
1183	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1184	if (hoplimit == 0) {
1185		struct net_device *dev = dst->dev;
1186		struct inet6_dev *idev;
1187
1188		rcu_read_lock();
1189		idev = __in6_dev_get(dev);
1190		if (idev)
1191			hoplimit = idev->cnf.hop_limit;
1192		else
1193			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1194		rcu_read_unlock();
1195	}
1196	return hoplimit;
1197}
1198EXPORT_SYMBOL(ip6_dst_hoplimit);
1199
1200/*
1201 *
1202 */
1203
1204int ip6_route_add(struct fib6_config *cfg)
1205{
1206	int err;
1207	struct net *net = cfg->fc_nlinfo.nl_net;
1208	struct rt6_info *rt = NULL;
1209	struct net_device *dev = NULL;
1210	struct inet6_dev *idev = NULL;
1211	struct fib6_table *table;
1212	int addr_type;
1213
1214	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1215		return -EINVAL;
1216#ifndef CONFIG_IPV6_SUBTREES
1217	if (cfg->fc_src_len)
1218		return -EINVAL;
1219#endif
1220	if (cfg->fc_ifindex) {
1221		err = -ENODEV;
1222		dev = dev_get_by_index(net, cfg->fc_ifindex);
1223		if (!dev)
1224			goto out;
1225		idev = in6_dev_get(dev);
1226		if (!idev)
1227			goto out;
1228	}
1229
1230	if (cfg->fc_metric == 0)
1231		cfg->fc_metric = IP6_RT_PRIO_USER;
1232
1233	table = fib6_new_table(net, cfg->fc_table);
1234	if (table == NULL) {
1235		err = -ENOBUFS;
1236		goto out;
1237	}
1238
1239	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1240
1241	if (rt == NULL) {
1242		err = -ENOMEM;
1243		goto out;
1244	}
1245
1246	rt->dst.obsolete = -1;
1247	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1248				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1249				0;
1250
1251	if (cfg->fc_protocol == RTPROT_UNSPEC)
1252		cfg->fc_protocol = RTPROT_BOOT;
1253	rt->rt6i_protocol = cfg->fc_protocol;
1254
1255	addr_type = ipv6_addr_type(&cfg->fc_dst);
1256
1257	if (addr_type & IPV6_ADDR_MULTICAST)
1258		rt->dst.input = ip6_mc_input;
1259	else if (cfg->fc_flags & RTF_LOCAL)
1260		rt->dst.input = ip6_input;
1261	else
1262		rt->dst.input = ip6_forward;
1263
1264	rt->dst.output = ip6_output;
1265
1266	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1267	rt->rt6i_dst.plen = cfg->fc_dst_len;
1268	if (rt->rt6i_dst.plen == 128)
1269	       rt->dst.flags |= DST_HOST;
1270
1271	if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1272		u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1273		if (!metrics) {
1274			err = -ENOMEM;
1275			goto out;
1276		}
1277		dst_init_metrics(&rt->dst, metrics, 0);
1278	}
1279#ifdef CONFIG_IPV6_SUBTREES
1280	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1281	rt->rt6i_src.plen = cfg->fc_src_len;
1282#endif
1283
1284	rt->rt6i_metric = cfg->fc_metric;
1285
1286	/* We cannot add true routes via loopback here,
1287	   they would result in kernel looping; promote them to reject routes
1288	 */
1289	if ((cfg->fc_flags & RTF_REJECT) ||
1290	    (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1291					      && !(cfg->fc_flags&RTF_LOCAL))) {
1292		/* hold loopback dev/idev if we haven't done so. */
1293		if (dev != net->loopback_dev) {
1294			if (dev) {
1295				dev_put(dev);
1296				in6_dev_put(idev);
1297			}
1298			dev = net->loopback_dev;
1299			dev_hold(dev);
1300			idev = in6_dev_get(dev);
1301			if (!idev) {
1302				err = -ENODEV;
1303				goto out;
1304			}
1305		}
1306		rt->dst.output = ip6_pkt_discard_out;
1307		rt->dst.input = ip6_pkt_discard;
1308		rt->dst.error = -ENETUNREACH;
1309		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1310		goto install_route;
1311	}
1312
1313	if (cfg->fc_flags & RTF_GATEWAY) {
1314		const struct in6_addr *gw_addr;
1315		int gwa_type;
1316
1317		gw_addr = &cfg->fc_gateway;
1318		ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1319		gwa_type = ipv6_addr_type(gw_addr);
1320
1321		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1322			struct rt6_info *grt;
1323
1324			/* IPv6 strictly inhibits using not link-local
1325			   addresses as nexthop address.
1326			   Otherwise, router will not able to send redirects.
1327			   It is very good, but in some (rare!) circumstances
1328			   (SIT, PtP, NBMA NOARP links) it is handy to allow
1329			   some exceptions. --ANK
1330			 */
1331			err = -EINVAL;
1332			if (!(gwa_type&IPV6_ADDR_UNICAST))
1333				goto out;
1334
1335			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1336
1337			err = -EHOSTUNREACH;
1338			if (grt == NULL)
1339				goto out;
1340			if (dev) {
1341				if (dev != grt->rt6i_dev) {
1342					dst_release(&grt->dst);
1343					goto out;
1344				}
1345			} else {
1346				dev = grt->rt6i_dev;
1347				idev = grt->rt6i_idev;
1348				dev_hold(dev);
1349				in6_dev_hold(grt->rt6i_idev);
1350			}
1351			if (!(grt->rt6i_flags&RTF_GATEWAY))
1352				err = 0;
1353			dst_release(&grt->dst);
1354
1355			if (err)
1356				goto out;
1357		}
1358		err = -EINVAL;
1359		if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1360			goto out;
1361	}
1362
1363	err = -ENODEV;
1364	if (dev == NULL)
1365		goto out;
1366
1367	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1368		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1369			err = -EINVAL;
1370			goto out;
1371		}
1372		ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1373		rt->rt6i_prefsrc.plen = 128;
1374	} else
1375		rt->rt6i_prefsrc.plen = 0;
1376
1377	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1378		struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1379		if (IS_ERR(n)) {
1380			err = PTR_ERR(n);
1381			goto out;
1382		}
1383		dst_set_neighbour(&rt->dst, n);
1384	}
1385
1386	rt->rt6i_flags = cfg->fc_flags;
1387
1388install_route:
1389	if (cfg->fc_mx) {
1390		struct nlattr *nla;
1391		int remaining;
1392
1393		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1394			int type = nla_type(nla);
1395
1396			if (type) {
1397				if (type > RTAX_MAX) {
1398					err = -EINVAL;
1399					goto out;
1400				}
1401
1402				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1403			}
1404		}
1405	}
1406
1407	rt->dst.dev = dev;
1408	rt->rt6i_idev = idev;
1409	rt->rt6i_table = table;
1410
1411	cfg->fc_nlinfo.nl_net = dev_net(dev);
1412
1413	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1414
1415out:
1416	if (dev)
1417		dev_put(dev);
1418	if (idev)
1419		in6_dev_put(idev);
1420	if (rt)
1421		dst_free(&rt->dst);
1422	return err;
1423}
1424
1425static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1426{
1427	int err;
1428	struct fib6_table *table;
1429	struct net *net = dev_net(rt->rt6i_dev);
1430
1431	if (rt == net->ipv6.ip6_null_entry)
1432		return -ENOENT;
1433
1434	table = rt->rt6i_table;
1435	write_lock_bh(&table->tb6_lock);
1436
1437	err = fib6_del(rt, info);
1438	dst_release(&rt->dst);
1439
1440	write_unlock_bh(&table->tb6_lock);
1441
1442	return err;
1443}
1444
1445int ip6_del_rt(struct rt6_info *rt)
1446{
1447	struct nl_info info = {
1448		.nl_net = dev_net(rt->rt6i_dev),
1449	};
1450	return __ip6_del_rt(rt, &info);
1451}
1452
1453static int ip6_route_del(struct fib6_config *cfg)
1454{
1455	struct fib6_table *table;
1456	struct fib6_node *fn;
1457	struct rt6_info *rt;
1458	int err = -ESRCH;
1459
1460	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1461	if (table == NULL)
1462		return err;
1463
1464	read_lock_bh(&table->tb6_lock);
1465
1466	fn = fib6_locate(&table->tb6_root,
1467			 &cfg->fc_dst, cfg->fc_dst_len,
1468			 &cfg->fc_src, cfg->fc_src_len);
1469
1470	if (fn) {
1471		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1472			if (cfg->fc_ifindex &&
1473			    (rt->rt6i_dev == NULL ||
1474			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1475				continue;
1476			if (cfg->fc_flags & RTF_GATEWAY &&
1477			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1478				continue;
1479			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1480				continue;
1481			dst_hold(&rt->dst);
1482			read_unlock_bh(&table->tb6_lock);
1483
1484			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1485		}
1486	}
1487	read_unlock_bh(&table->tb6_lock);
1488
1489	return err;
1490}
1491
1492/*
1493 *	Handle redirects
1494 */
1495struct ip6rd_flowi {
1496	struct flowi6 fl6;
1497	struct in6_addr gateway;
1498};
1499
1500static struct rt6_info *__ip6_route_redirect(struct net *net,
1501					     struct fib6_table *table,
1502					     struct flowi6 *fl6,
1503					     int flags)
1504{
1505	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1506	struct rt6_info *rt;
1507	struct fib6_node *fn;
1508
1509	/*
1510	 * Get the "current" route for this destination and
1511	 * check if the redirect has come from approriate router.
1512	 *
1513	 * RFC 2461 specifies that redirects should only be
1514	 * accepted if they come from the nexthop to the target.
1515	 * Due to the way the routes are chosen, this notion
1516	 * is a bit fuzzy and one might need to check all possible
1517	 * routes.
1518	 */
1519
1520	read_lock_bh(&table->tb6_lock);
1521	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1522restart:
1523	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1524		/*
1525		 * Current route is on-link; redirect is always invalid.
1526		 *
1527		 * Seems, previous statement is not true. It could
1528		 * be node, which looks for us as on-link (f.e. proxy ndisc)
1529		 * But then router serving it might decide, that we should
1530		 * know truth 8)8) --ANK (980726).
1531		 */
1532		if (rt6_check_expired(rt))
1533			continue;
1534		if (!(rt->rt6i_flags & RTF_GATEWAY))
1535			continue;
1536		if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1537			continue;
1538		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1539			continue;
1540		break;
1541	}
1542
1543	if (!rt)
1544		rt = net->ipv6.ip6_null_entry;
1545	BACKTRACK(net, &fl6->saddr);
1546out:
1547	dst_hold(&rt->dst);
1548
1549	read_unlock_bh(&table->tb6_lock);
1550
1551	return rt;
1552};
1553
1554static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1555					   const struct in6_addr *src,
1556					   const struct in6_addr *gateway,
1557					   struct net_device *dev)
1558{
1559	int flags = RT6_LOOKUP_F_HAS_SADDR;
1560	struct net *net = dev_net(dev);
1561	struct ip6rd_flowi rdfl = {
1562		.fl6 = {
1563			.flowi6_oif = dev->ifindex,
1564			.daddr = *dest,
1565			.saddr = *src,
1566		},
1567	};
1568
1569	ipv6_addr_copy(&rdfl.gateway, gateway);
1570
1571	if (rt6_need_strict(dest))
1572		flags |= RT6_LOOKUP_F_IFACE;
1573
1574	return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1575						   flags, __ip6_route_redirect);
1576}
1577
1578void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1579		  const struct in6_addr *saddr,
1580		  struct neighbour *neigh, u8 *lladdr, int on_link)
1581{
1582	struct rt6_info *rt, *nrt = NULL;
1583	struct netevent_redirect netevent;
1584	struct net *net = dev_net(neigh->dev);
1585
1586	rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1587
1588	if (rt == net->ipv6.ip6_null_entry) {
1589		if (net_ratelimit())
1590			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1591			       "for redirect target\n");
1592		goto out;
1593	}
1594
1595	/*
1596	 *	We have finally decided to accept it.
1597	 */
1598
1599	neigh_update(neigh, lladdr, NUD_STALE,
1600		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1601		     NEIGH_UPDATE_F_OVERRIDE|
1602		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1603				     NEIGH_UPDATE_F_ISROUTER))
1604		     );
1605
1606	/*
1607	 * Redirect received -> path was valid.
1608	 * Look, redirects are sent only in response to data packets,
1609	 * so that this nexthop apparently is reachable. --ANK
1610	 */
1611	dst_confirm(&rt->dst);
1612
1613	/* Duplicate redirect: silently ignore. */
1614	if (neigh == dst_get_neighbour_raw(&rt->dst))
1615		goto out;
1616
1617	nrt = ip6_rt_copy(rt, dest);
1618	if (nrt == NULL)
1619		goto out;
1620
1621	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1622	if (on_link)
1623		nrt->rt6i_flags &= ~RTF_GATEWAY;
1624
1625	ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1626	dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1627
1628	if (ip6_ins_rt(nrt))
1629		goto out;
1630
1631	netevent.old = &rt->dst;
1632	netevent.new = &nrt->dst;
1633	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1634
1635	if (rt->rt6i_flags&RTF_CACHE) {
1636		ip6_del_rt(rt);
1637		return;
1638	}
1639
1640out:
1641	dst_release(&rt->dst);
1642}
1643
1644/*
1645 *	Handle ICMP "packet too big" messages
1646 *	i.e. Path MTU discovery
1647 */
1648
1649static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1650			     struct net *net, u32 pmtu, int ifindex)
1651{
1652	struct rt6_info *rt, *nrt;
1653	int allfrag = 0;
1654again:
1655	rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1656	if (rt == NULL)
1657		return;
1658
1659	if (rt6_check_expired(rt)) {
1660		ip6_del_rt(rt);
1661		goto again;
1662	}
1663
1664	if (pmtu >= dst_mtu(&rt->dst))
1665		goto out;
1666
1667	if (pmtu < IPV6_MIN_MTU) {
1668		/*
1669		 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1670		 * MTU (1280) and a fragment header should always be included
1671		 * after a node receiving Too Big message reporting PMTU is
1672		 * less than the IPv6 Minimum Link MTU.
1673		 */
1674		pmtu = IPV6_MIN_MTU;
1675		allfrag = 1;
1676	}
1677
1678	/* New mtu received -> path was valid.
1679	   They are sent only in response to data packets,
1680	   so that this nexthop apparently is reachable. --ANK
1681	 */
1682	dst_confirm(&rt->dst);
1683
1684	/* Host route. If it is static, it would be better
1685	   not to override it, but add new one, so that
1686	   when cache entry will expire old pmtu
1687	   would return automatically.
1688	 */
1689	if (rt->rt6i_flags & RTF_CACHE) {
1690		dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1691		if (allfrag) {
1692			u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1693			features |= RTAX_FEATURE_ALLFRAG;
1694			dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1695		}
1696		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1697		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1698		goto out;
1699	}
1700
1701	/* Network route.
1702	   Two cases are possible:
1703	   1. It is connected route. Action: COW
1704	   2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1705	 */
1706	if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1707		nrt = rt6_alloc_cow(rt, daddr, saddr);
1708	else
1709		nrt = rt6_alloc_clone(rt, daddr);
1710
1711	if (nrt) {
1712		dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1713		if (allfrag) {
1714			u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1715			features |= RTAX_FEATURE_ALLFRAG;
1716			dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1717		}
1718
1719		/* According to RFC 1981, detecting PMTU increase shouldn't be
1720		 * happened within 5 mins, the recommended timer is 10 mins.
1721		 * Here this route expiration time is set to ip6_rt_mtu_expires
1722		 * which is 10 mins. After 10 mins the decreased pmtu is expired
1723		 * and detecting PMTU increase will be automatically happened.
1724		 */
1725		dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1726		nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1727
1728		ip6_ins_rt(nrt);
1729	}
1730out:
1731	dst_release(&rt->dst);
1732}
1733
1734void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1735			struct net_device *dev, u32 pmtu)
1736{
1737	struct net *net = dev_net(dev);
1738
1739	/*
1740	 * RFC 1981 states that a node "MUST reduce the size of the packets it
1741	 * is sending along the path" that caused the Packet Too Big message.
1742	 * Since it's not possible in the general case to determine which
1743	 * interface was used to send the original packet, we update the MTU
1744	 * on the interface that will be used to send future packets. We also
1745	 * update the MTU on the interface that received the Packet Too Big in
1746	 * case the original packet was forced out that interface with
1747	 * SO_BINDTODEVICE or similar. This is the next best thing to the
1748	 * correct behaviour, which would be to update the MTU on all
1749	 * interfaces.
1750	 */
1751	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1752	rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1753}
1754
1755/*
1756 *	Misc support functions
1757 */
1758
1759static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1760				    const struct in6_addr *dest)
1761{
1762	struct net *net = dev_net(ort->rt6i_dev);
1763	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1764					    ort->dst.dev, 0);
1765
1766	if (rt) {
1767		rt->dst.input = ort->dst.input;
1768		rt->dst.output = ort->dst.output;
1769		rt->dst.flags |= DST_HOST;
1770
1771		ipv6_addr_copy(&rt->rt6i_dst.addr, dest);
1772		rt->rt6i_dst.plen = 128;
1773		dst_copy_metrics(&rt->dst, &ort->dst);
1774		rt->dst.error = ort->dst.error;
1775		rt->rt6i_idev = ort->rt6i_idev;
1776		if (rt->rt6i_idev)
1777			in6_dev_hold(rt->rt6i_idev);
1778		rt->dst.lastuse = jiffies;
1779		rt->rt6i_expires = 0;
1780
1781		ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1782		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1783		rt->rt6i_metric = 0;
1784
1785#ifdef CONFIG_IPV6_SUBTREES
1786		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1787#endif
1788		memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1789		rt->rt6i_table = ort->rt6i_table;
1790	}
1791	return rt;
1792}
1793
1794#ifdef CONFIG_IPV6_ROUTE_INFO
1795static struct rt6_info *rt6_get_route_info(struct net *net,
1796					   const struct in6_addr *prefix, int prefixlen,
1797					   const struct in6_addr *gwaddr, int ifindex)
1798{
1799	struct fib6_node *fn;
1800	struct rt6_info *rt = NULL;
1801	struct fib6_table *table;
1802
1803	table = fib6_get_table(net, RT6_TABLE_INFO);
1804	if (table == NULL)
1805		return NULL;
1806
1807	write_lock_bh(&table->tb6_lock);
1808	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1809	if (!fn)
1810		goto out;
1811
1812	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1813		if (rt->rt6i_dev->ifindex != ifindex)
1814			continue;
1815		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1816			continue;
1817		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1818			continue;
1819		dst_hold(&rt->dst);
1820		break;
1821	}
1822out:
1823	write_unlock_bh(&table->tb6_lock);
1824	return rt;
1825}
1826
1827static struct rt6_info *rt6_add_route_info(struct net *net,
1828					   const struct in6_addr *prefix, int prefixlen,
1829					   const struct in6_addr *gwaddr, int ifindex,
1830					   unsigned pref)
1831{
1832	struct fib6_config cfg = {
1833		.fc_table	= RT6_TABLE_INFO,
1834		.fc_metric	= IP6_RT_PRIO_USER,
1835		.fc_ifindex	= ifindex,
1836		.fc_dst_len	= prefixlen,
1837		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1838				  RTF_UP | RTF_PREF(pref),
1839		.fc_nlinfo.pid = 0,
1840		.fc_nlinfo.nlh = NULL,
1841		.fc_nlinfo.nl_net = net,
1842	};
1843
1844	ipv6_addr_copy(&cfg.fc_dst, prefix);
1845	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1846
1847	/* We should treat it as a default route if prefix length is 0. */
1848	if (!prefixlen)
1849		cfg.fc_flags |= RTF_DEFAULT;
1850
1851	ip6_route_add(&cfg);
1852
1853	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1854}
1855#endif
1856
1857struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1858{
1859	struct rt6_info *rt;
1860	struct fib6_table *table;
1861
1862	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1863	if (table == NULL)
1864		return NULL;
1865
1866	write_lock_bh(&table->tb6_lock);
1867	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1868		if (dev == rt->rt6i_dev &&
1869		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1870		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1871			break;
1872	}
1873	if (rt)
1874		dst_hold(&rt->dst);
1875	write_unlock_bh(&table->tb6_lock);
1876	return rt;
1877}
1878
1879struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1880				     struct net_device *dev,
1881				     unsigned int pref)
1882{
1883	struct fib6_config cfg = {
1884		.fc_table	= RT6_TABLE_DFLT,
1885		.fc_metric	= IP6_RT_PRIO_USER,
1886		.fc_ifindex	= dev->ifindex,
1887		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1888				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1889		.fc_nlinfo.pid = 0,
1890		.fc_nlinfo.nlh = NULL,
1891		.fc_nlinfo.nl_net = dev_net(dev),
1892	};
1893
1894	ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1895
1896	ip6_route_add(&cfg);
1897
1898	return rt6_get_dflt_router(gwaddr, dev);
1899}
1900
1901void rt6_purge_dflt_routers(struct net *net)
1902{
1903	struct rt6_info *rt;
1904	struct fib6_table *table;
1905
1906	/* NOTE: Keep consistent with rt6_get_dflt_router */
1907	table = fib6_get_table(net, RT6_TABLE_DFLT);
1908	if (table == NULL)
1909		return;
1910
1911restart:
1912	read_lock_bh(&table->tb6_lock);
1913	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1914		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1915			dst_hold(&rt->dst);
1916			read_unlock_bh(&table->tb6_lock);
1917			ip6_del_rt(rt);
1918			goto restart;
1919		}
1920	}
1921	read_unlock_bh(&table->tb6_lock);
1922}
1923
1924static void rtmsg_to_fib6_config(struct net *net,
1925				 struct in6_rtmsg *rtmsg,
1926				 struct fib6_config *cfg)
1927{
1928	memset(cfg, 0, sizeof(*cfg));
1929
1930	cfg->fc_table = RT6_TABLE_MAIN;
1931	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1932	cfg->fc_metric = rtmsg->rtmsg_metric;
1933	cfg->fc_expires = rtmsg->rtmsg_info;
1934	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1935	cfg->fc_src_len = rtmsg->rtmsg_src_len;
1936	cfg->fc_flags = rtmsg->rtmsg_flags;
1937
1938	cfg->fc_nlinfo.nl_net = net;
1939
1940	ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1941	ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1942	ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1943}
1944
1945int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1946{
1947	struct fib6_config cfg;
1948	struct in6_rtmsg rtmsg;
1949	int err;
1950
1951	switch(cmd) {
1952	case SIOCADDRT:		/* Add a route */
1953	case SIOCDELRT:		/* Delete a route */
1954		if (!capable(CAP_NET_ADMIN))
1955			return -EPERM;
1956		err = copy_from_user(&rtmsg, arg,
1957				     sizeof(struct in6_rtmsg));
1958		if (err)
1959			return -EFAULT;
1960
1961		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1962
1963		rtnl_lock();
1964		switch (cmd) {
1965		case SIOCADDRT:
1966			err = ip6_route_add(&cfg);
1967			break;
1968		case SIOCDELRT:
1969			err = ip6_route_del(&cfg);
1970			break;
1971		default:
1972			err = -EINVAL;
1973		}
1974		rtnl_unlock();
1975
1976		return err;
1977	}
1978
1979	return -EINVAL;
1980}
1981
1982/*
1983 *	Drop the packet on the floor
1984 */
1985
1986static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1987{
1988	int type;
1989	struct dst_entry *dst = skb_dst(skb);
1990	switch (ipstats_mib_noroutes) {
1991	case IPSTATS_MIB_INNOROUTES:
1992		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1993		if (type == IPV6_ADDR_ANY) {
1994			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1995				      IPSTATS_MIB_INADDRERRORS);
1996			break;
1997		}
1998		/* FALLTHROUGH */
1999	case IPSTATS_MIB_OUTNOROUTES:
2000		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2001			      ipstats_mib_noroutes);
2002		break;
2003	}
2004	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2005	kfree_skb(skb);
2006	return 0;
2007}
2008
2009static int ip6_pkt_discard(struct sk_buff *skb)
2010{
2011	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2012}
2013
2014static int ip6_pkt_discard_out(struct sk_buff *skb)
2015{
2016	skb->dev = skb_dst(skb)->dev;
2017	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2018}
2019
2020#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2021
2022static int ip6_pkt_prohibit(struct sk_buff *skb)
2023{
2024	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2025}
2026
2027static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2028{
2029	skb->dev = skb_dst(skb)->dev;
2030	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2031}
2032
2033#endif
2034
2035/*
2036 *	Allocate a dst for local (unicast / anycast) address.
2037 */
2038
2039struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2040				    const struct in6_addr *addr,
2041				    int anycast)
2042{
2043	struct net *net = dev_net(idev->dev);
2044	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2045					    net->loopback_dev, 0);
2046	struct neighbour *neigh;
2047
2048	if (rt == NULL) {
2049		if (net_ratelimit())
2050			pr_warning("IPv6:  Maximum number of routes reached,"
2051				   " consider increasing route/max_size.\n");
2052		return ERR_PTR(-ENOMEM);
2053	}
2054
2055	in6_dev_hold(idev);
2056
2057	rt->dst.flags |= DST_HOST;
2058	rt->dst.input = ip6_input;
2059	rt->dst.output = ip6_output;
2060	rt->rt6i_idev = idev;
2061	rt->dst.obsolete = -1;
2062
2063	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2064	if (anycast)
2065		rt->rt6i_flags |= RTF_ANYCAST;
2066	else
2067		rt->rt6i_flags |= RTF_LOCAL;
2068	neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2069	if (IS_ERR(neigh)) {
2070		dst_free(&rt->dst);
2071
2072		return ERR_CAST(neigh);
2073	}
2074	dst_set_neighbour(&rt->dst, neigh);
2075
2076	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2077	rt->rt6i_dst.plen = 128;
2078	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2079
2080	atomic_set(&rt->dst.__refcnt, 1);
2081
2082	return rt;
2083}
2084
2085int ip6_route_get_saddr(struct net *net,
2086			struct rt6_info *rt,
2087			const struct in6_addr *daddr,
2088			unsigned int prefs,
2089			struct in6_addr *saddr)
2090{
2091	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2092	int err = 0;
2093	if (rt->rt6i_prefsrc.plen)
2094		ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2095	else
2096		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2097					 daddr, prefs, saddr);
2098	return err;
2099}
2100
/*
 * Argument bundle for fib6_remove_prefsrc(), used to remove a deleted
 * address from the prefsrc entries of routes.
 */
struct arg_dev_net_ip {
	struct net_device	*dev;	/* only routes on this device; NULL matches any */
	struct net		*net;	/* namespace whose null entry is skipped */
	struct in6_addr		*addr;	/* address compared against rt6i_prefsrc */
};
2107
2108static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2109{
2110	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2111	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2112	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2113
2114	if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2115	    rt != net->ipv6.ip6_null_entry &&
2116	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2117		/* remove prefsrc entry */
2118		rt->rt6i_prefsrc.plen = 0;
2119	}
2120	return 0;
2121}
2122
2123void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2124{
2125	struct net *net = dev_net(ifp->idev->dev);
2126	struct arg_dev_net_ip adni = {
2127		.dev = ifp->idev->dev,
2128		.net = net,
2129		.addr = &ifp->addr,
2130	};
2131	fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2132}
2133
/* Argument bundle for fib6_ifdown(). */
struct arg_dev_net {
	struct net_device	*dev;	/* device to match; NULL matches any */
	struct net		*net;	/* namespace whose null entry is skipped */
};
2138
2139static int fib6_ifdown(struct rt6_info *rt, void *arg)
2140{
2141	const struct arg_dev_net *adn = arg;
2142	const struct net_device *dev = adn->dev;
2143
2144	if ((rt->rt6i_dev == dev || dev == NULL) &&
2145	    rt != adn->net->ipv6.ip6_null_entry) {
2146		RT6_TRACE("deleted by ifdown %p\n", rt);
2147		return -1;
2148	}
2149	return 0;
2150}
2151
2152void rt6_ifdown(struct net *net, struct net_device *dev)
2153{
2154	struct arg_dev_net adn = {
2155		.dev = dev,
2156		.net = net,
2157	};
2158
2159	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2160	icmp6_clean_all(fib6_ifdown, &adn);
2161}
2162
/* Argument bundle for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device	*dev;	/* device whose MTU changed */
	unsigned int		mtu;	/* the new MTU */
};
2168
2169static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2170{
2171	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2172	struct inet6_dev *idev;
2173
2174	/* In IPv6 pmtu discovery is not optional,
2175	   so that RTAX_MTU lock cannot disable it.
2176	   We still use this lock to block changes
2177	   caused by addrconf/ndisc.
2178	*/
2179
2180	idev = __in6_dev_get(arg->dev);
2181	if (idev == NULL)
2182		return 0;
2183
2184	/* For administrative MTU increase, there is no way to discover
2185	   IPv6 PMTU increase, so PMTU increase should be updated here.
2186	   Since RFC 1981 doesn't include administrative MTU increase
2187	   update PMTU increase is a MUST. (i.e. jumbo frame)
2188	 */
2189	/*
2190	   If new MTU is less than route PMTU, this new MTU will be the
2191	   lowest MTU in the path, update the route PMTU to reflect PMTU
2192	   decreases; if new MTU is greater than route PMTU, and the
2193	   old MTU is the lowest MTU in the path, update the route PMTU
2194	   to reflect the increase. In this case if the other nodes' MTU
2195	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
2196	   PMTU discouvery.
2197	 */
2198	if (rt->rt6i_dev == arg->dev &&
2199	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2200	    (dst_mtu(&rt->dst) >= arg->mtu ||
2201	     (dst_mtu(&rt->dst) < arg->mtu &&
2202	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2203		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2204	}
2205	return 0;
2206}
2207
2208void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2209{
2210	struct rt6_mtu_change_arg arg = {
2211		.dev = dev,
2212		.mtu = mtu,
2213	};
2214
2215	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2216}
2217
2218static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2219	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2220	[RTA_OIF]               = { .type = NLA_U32 },
2221	[RTA_IIF]		= { .type = NLA_U32 },
2222	[RTA_PRIORITY]          = { .type = NLA_U32 },
2223	[RTA_METRICS]           = { .type = NLA_NESTED },
2224};
2225
2226static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2227			      struct fib6_config *cfg)
2228{
2229	struct rtmsg *rtm;
2230	struct nlattr *tb[RTA_MAX+1];
2231	int err;
2232
2233	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2234	if (err < 0)
2235		goto errout;
2236
2237	err = -EINVAL;
2238	rtm = nlmsg_data(nlh);
2239	memset(cfg, 0, sizeof(*cfg));
2240
2241	cfg->fc_table = rtm->rtm_table;
2242	cfg->fc_dst_len = rtm->rtm_dst_len;
2243	cfg->fc_src_len = rtm->rtm_src_len;
2244	cfg->fc_flags = RTF_UP;
2245	cfg->fc_protocol = rtm->rtm_protocol;
2246
2247	if (rtm->rtm_type == RTN_UNREACHABLE)
2248		cfg->fc_flags |= RTF_REJECT;
2249
2250	if (rtm->rtm_type == RTN_LOCAL)
2251		cfg->fc_flags |= RTF_LOCAL;
2252
2253	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2254	cfg->fc_nlinfo.nlh = nlh;
2255	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2256
2257	if (tb[RTA_GATEWAY]) {
2258		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2259		cfg->fc_flags |= RTF_GATEWAY;
2260	}
2261
2262	if (tb[RTA_DST]) {
2263		int plen = (rtm->rtm_dst_len + 7) >> 3;
2264
2265		if (nla_len(tb[RTA_DST]) < plen)
2266			goto errout;
2267
2268		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2269	}
2270
2271	if (tb[RTA_SRC]) {
2272		int plen = (rtm->rtm_src_len + 7) >> 3;
2273
2274		if (nla_len(tb[RTA_SRC]) < plen)
2275			goto errout;
2276
2277		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2278	}
2279
2280	if (tb[RTA_PREFSRC])
2281		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2282
2283	if (tb[RTA_OIF])
2284		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2285
2286	if (tb[RTA_PRIORITY])
2287		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2288
2289	if (tb[RTA_METRICS]) {
2290		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2291		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2292	}
2293
2294	if (tb[RTA_TABLE])
2295		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2296
2297	err = 0;
2298errout:
2299	return err;
2300}
2301
2302static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2303{
2304	struct fib6_config cfg;
2305	int err;
2306
2307	err = rtm_to_fib6_config(skb, nlh, &cfg);
2308	if (err < 0)
2309		return err;
2310
2311	return ip6_route_del(&cfg);
2312}
2313
2314static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2315{
2316	struct fib6_config cfg;
2317	int err;
2318
2319	err = rtm_to_fib6_config(skb, nlh, &cfg);
2320	if (err < 0)
2321		return err;
2322
2323	return ip6_route_add(&cfg);
2324}
2325
2326static inline size_t rt6_nlmsg_size(void)
2327{
2328	return NLMSG_ALIGN(sizeof(struct rtmsg))
2329	       + nla_total_size(16) /* RTA_SRC */
2330	       + nla_total_size(16) /* RTA_DST */
2331	       + nla_total_size(16) /* RTA_GATEWAY */
2332	       + nla_total_size(16) /* RTA_PREFSRC */
2333	       + nla_total_size(4) /* RTA_TABLE */
2334	       + nla_total_size(4) /* RTA_IIF */
2335	       + nla_total_size(4) /* RTA_OIF */
2336	       + nla_total_size(4) /* RTA_PRIORITY */
2337	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2338	       + nla_total_size(sizeof(struct rta_cacheinfo));
2339}
2340
2341static int rt6_fill_node(struct net *net,
2342			 struct sk_buff *skb, struct rt6_info *rt,
2343			 struct in6_addr *dst, struct in6_addr *src,
2344			 int iif, int type, u32 pid, u32 seq,
2345			 int prefix, int nowait, unsigned int flags)
2346{
2347	struct rtmsg *rtm;
2348	struct nlmsghdr *nlh;
2349	long expires;
2350	u32 table;
2351	struct neighbour *n;
2352
2353	if (prefix) {	/* user wants prefix routes only */
2354		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2355			/* success since this is not a prefix route */
2356			return 1;
2357		}
2358	}
2359
2360	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2361	if (nlh == NULL)
2362		return -EMSGSIZE;
2363
2364	rtm = nlmsg_data(nlh);
2365	rtm->rtm_family = AF_INET6;
2366	rtm->rtm_dst_len = rt->rt6i_dst.plen;
2367	rtm->rtm_src_len = rt->rt6i_src.plen;
2368	rtm->rtm_tos = 0;
2369	if (rt->rt6i_table)
2370		table = rt->rt6i_table->tb6_id;
2371	else
2372		table = RT6_TABLE_UNSPEC;
2373	rtm->rtm_table = table;
2374	NLA_PUT_U32(skb, RTA_TABLE, table);
2375	if (rt->rt6i_flags&RTF_REJECT)
2376		rtm->rtm_type = RTN_UNREACHABLE;
2377	else if (rt->rt6i_flags&RTF_LOCAL)
2378		rtm->rtm_type = RTN_LOCAL;
2379	else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2380		rtm->rtm_type = RTN_LOCAL;
2381	else
2382		rtm->rtm_type = RTN_UNICAST;
2383	rtm->rtm_flags = 0;
2384	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2385	rtm->rtm_protocol = rt->rt6i_protocol;
2386	if (rt->rt6i_flags&RTF_DYNAMIC)
2387		rtm->rtm_protocol = RTPROT_REDIRECT;
2388	else if (rt->rt6i_flags & RTF_ADDRCONF)
2389		rtm->rtm_protocol = RTPROT_KERNEL;
2390	else if (rt->rt6i_flags&RTF_DEFAULT)
2391		rtm->rtm_protocol = RTPROT_RA;
2392
2393	if (rt->rt6i_flags&RTF_CACHE)
2394		rtm->rtm_flags |= RTM_F_CLONED;
2395
2396	if (dst) {
2397		NLA_PUT(skb, RTA_DST, 16, dst);
2398		rtm->rtm_dst_len = 128;
2399	} else if (rtm->rtm_dst_len)
2400		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2401#ifdef CONFIG_IPV6_SUBTREES
2402	if (src) {
2403		NLA_PUT(skb, RTA_SRC, 16, src);
2404		rtm->rtm_src_len = 128;
2405	} else if (rtm->rtm_src_len)
2406		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2407#endif
2408	if (iif) {
2409#ifdef CONFIG_IPV6_MROUTE
2410		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2411			int err = ip6mr_get_route(net, skb, rtm, nowait);
2412			if (err <= 0) {
2413				if (!nowait) {
2414					if (err == 0)
2415						return 0;
2416					goto nla_put_failure;
2417				} else {
2418					if (err == -EMSGSIZE)
2419						goto nla_put_failure;
2420				}
2421			}
2422		} else
2423#endif
2424			NLA_PUT_U32(skb, RTA_IIF, iif);
2425	} else if (dst) {
2426		struct in6_addr saddr_buf;
2427		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2428			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2429	}
2430
2431	if (rt->rt6i_prefsrc.plen) {
2432		struct in6_addr saddr_buf;
2433		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2434		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2435	}
2436
2437	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2438		goto nla_put_failure;
2439
2440	rcu_read_lock();
2441	n = dst_get_neighbour(&rt->dst);
2442	if (n)
2443		NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2444	rcu_read_unlock();
2445
2446	if (rt->dst.dev)
2447		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2448
2449	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2450
2451	if (!(rt->rt6i_flags & RTF_EXPIRES))
2452		expires = 0;
2453	else if (rt->rt6i_expires - jiffies < INT_MAX)
2454		expires = rt->rt6i_expires - jiffies;
2455	else
2456		expires = INT_MAX;
2457
2458	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2459			       expires, rt->dst.error) < 0)
2460		goto nla_put_failure;
2461
2462	return nlmsg_end(skb, nlh);
2463
2464nla_put_failure:
2465	nlmsg_cancel(skb, nlh);
2466	return -EMSGSIZE;
2467}
2468
2469int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2470{
2471	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2472	int prefix;
2473
2474	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2475		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2476		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2477	} else
2478		prefix = 0;
2479
2480	return rt6_fill_node(arg->net,
2481		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2482		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2483		     prefix, 0, NLM_F_MULTI);
2484}
2485
2486static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2487{
2488	struct net *net = sock_net(in_skb->sk);
2489	struct nlattr *tb[RTA_MAX+1];
2490	struct rt6_info *rt;
2491	struct sk_buff *skb;
2492	struct rtmsg *rtm;
2493	struct flowi6 fl6;
2494	int err, iif = 0;
2495
2496	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2497	if (err < 0)
2498		goto errout;
2499
2500	err = -EINVAL;
2501	memset(&fl6, 0, sizeof(fl6));
2502
2503	if (tb[RTA_SRC]) {
2504		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2505			goto errout;
2506
2507		ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2508	}
2509
2510	if (tb[RTA_DST]) {
2511		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2512			goto errout;
2513
2514		ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2515	}
2516
2517	if (tb[RTA_IIF])
2518		iif = nla_get_u32(tb[RTA_IIF]);
2519
2520	if (tb[RTA_OIF])
2521		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2522
2523	if (iif) {
2524		struct net_device *dev;
2525		dev = __dev_get_by_index(net, iif);
2526		if (!dev) {
2527			err = -ENODEV;
2528			goto errout;
2529		}
2530	}
2531
2532	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2533	if (skb == NULL) {
2534		err = -ENOBUFS;
2535		goto errout;
2536	}
2537
2538	/* Reserve room for dummy headers, this skb can pass
2539	   through good chunk of routing engine.
2540	 */
2541	skb_reset_mac_header(skb);
2542	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2543
2544	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2545	skb_dst_set(skb, &rt->dst);
2546
2547	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2548			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2549			    nlh->nlmsg_seq, 0, 0, 0);
2550	if (err < 0) {
2551		kfree_skb(skb);
2552		goto errout;
2553	}
2554
2555	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2556errout:
2557	return err;
2558}
2559
2560void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2561{
2562	struct sk_buff *skb;
2563	struct net *net = info->nl_net;
2564	u32 seq;
2565	int err;
2566
2567	err = -ENOBUFS;
2568	seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2569
2570	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2571	if (skb == NULL)
2572		goto errout;
2573
2574	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2575				event, info->pid, seq, 0, 0, 0);
2576	if (err < 0) {
2577		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2578		WARN_ON(err == -EMSGSIZE);
2579		kfree_skb(skb);
2580		goto errout;
2581	}
2582	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2583		    info->nlh, gfp_any());
2584	return;
2585errout:
2586	if (err < 0)
2587		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2588}
2589
2590static int ip6_route_dev_notify(struct notifier_block *this,
2591				unsigned long event, void *data)
2592{
2593	struct net_device *dev = (struct net_device *)data;
2594	struct net *net = dev_net(dev);
2595
2596	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2597		net->ipv6.ip6_null_entry->dst.dev = dev;
2598		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2599#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2600		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2601		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2602		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2603		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2604#endif
2605	}
2606
2607	return NOTIFY_OK;
2608}
2609
2610/*
2611 *	/proc
2612 */
2613
2614#ifdef CONFIG_PROC_FS
2615
/*
 * NOTE(review): no user of this structure is visible in this part of the
 * file (the /proc handlers below work on a seq_file) - confirm whether it
 * is still needed.
 */
struct rt6_proc_arg {
	char	*buffer;
	int	offset;
	int	length;
	int	skip;
	int	len;
};
2624
2625static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2626{
2627	struct seq_file *m = p_arg;
2628	struct neighbour *n;
2629
2630	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2631
2632#ifdef CONFIG_IPV6_SUBTREES
2633	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2634#else
2635	seq_puts(m, "00000000000000000000000000000000 00 ");
2636#endif
2637	rcu_read_lock();
2638	n = dst_get_neighbour(&rt->dst);
2639	if (n) {
2640		seq_printf(m, "%pi6", n->primary_key);
2641	} else {
2642		seq_puts(m, "00000000000000000000000000000000");
2643	}
2644	rcu_read_unlock();
2645	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2646		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2647		   rt->dst.__use, rt->rt6i_flags,
2648		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2649	return 0;
2650}
2651
2652static int ipv6_route_show(struct seq_file *m, void *v)
2653{
2654	struct net *net = (struct net *)m->private;
2655	fib6_clean_all(net, rt6_info_route, 0, m);
2656	return 0;
2657}
2658
/*
 * open() handler for the "ipv6_route" proc entry: hands the file to
 * single_open_net() with ipv6_route_show() as the show routine and
 * returns whatever single_open_net() returns.
 */
static int ipv6_route_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, ipv6_route_show);
}
2663
/*
 * File operations for the "ipv6_route" proc entry.  Opened through
 * ipv6_route_open(); reading and seeking are delegated to the seq_file
 * helpers and release to single_release_net().
 */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
2671
2672static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2673{
2674	struct net *net = (struct net *)seq->private;
2675	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2676		   net->ipv6.rt6_stats->fib_nodes,
2677		   net->ipv6.rt6_stats->fib_route_nodes,
2678		   net->ipv6.rt6_stats->fib_rt_alloc,
2679		   net->ipv6.rt6_stats->fib_rt_entries,
2680		   net->ipv6.rt6_stats->fib_rt_cache,
2681		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2682		   net->ipv6.rt6_stats->fib_discarded_routes);
2683
2684	return 0;
2685}
2686
/*
 * open() handler for the "rt6_stats" proc entry: hands the file to
 * single_open_net() with rt6_stats_seq_show() as the show routine and
 * returns whatever single_open_net() returns.
 */
static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}
2691
/*
 * File operations for the "rt6_stats" proc entry.  Opened through
 * rt6_stats_seq_open(); reading and seeking are delegated to the seq_file
 * helpers and release to single_release_net().
 */
static const struct file_operations rt6_stats_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
2699#endif	/* CONFIG_PROC_FS */
2700
2701#ifdef CONFIG_SYSCTL
2702
2703static
2704int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2705			      void __user *buffer, size_t *lenp, loff_t *ppos)
2706{
2707	struct net *net;
2708	int delay;
2709	if (!write)
2710		return -EINVAL;
2711
2712	net = (struct net *)ctl->extra1;
2713	delay = net->ipv6.sysctl.flush_delay;
2714	proc_dointvec(ctl, write, buffer, lenp, ppos);
2715	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2716	return 0;
2717}
2718
/*
 * Template for the IPv6 route sysctl table.
 *
 * The .data pointers here refer to init_net (or to ip6_dst_ops_template);
 * ipv6_route_sysctl_init() duplicates this table for each namespace and
 * rewrites .data BY ARRAY INDEX, so the order of the entries below must
 * stay in sync with that function.
 */
ctl_table ipv6_route_table_template[] = {
	/* [0] write-only; handled by ipv6_sysctl_rtcache_flush() */
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	/* [1] */
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	/* [2] */
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	/* [3] same variable as [9], exposed through proc_dointvec_jiffies */
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	/* [4] */
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	/* [5] */
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	/* [6] */
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	/* [7] */
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	/* [8] */
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	/* [9] same variable as [3], exposed through proc_dointvec_ms_jiffies */
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	/* empty terminating entry */
	{ }
};
2792
2793struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2794{
2795	struct ctl_table *table;
2796
2797	table = kmemdup(ipv6_route_table_template,
2798			sizeof(ipv6_route_table_template),
2799			GFP_KERNEL);
2800
2801	if (table) {
2802		table[0].data = &net->ipv6.sysctl.flush_delay;
2803		table[0].extra1 = net;
2804		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2805		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2806		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2807		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2808		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2809		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2810		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2811		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2812		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2813	}
2814
2815	return table;
2816}
2817#endif
2818
2819static int __net_init ip6_route_net_init(struct net *net)
2820{
2821	int ret = -ENOMEM;
2822
2823	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2824	       sizeof(net->ipv6.ip6_dst_ops));
2825
2826	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2827		goto out_ip6_dst_ops;
2828
2829	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2830					   sizeof(*net->ipv6.ip6_null_entry),
2831					   GFP_KERNEL);
2832	if (!net->ipv6.ip6_null_entry)
2833		goto out_ip6_dst_entries;
2834	net->ipv6.ip6_null_entry->dst.path =
2835		(struct dst_entry *)net->ipv6.ip6_null_entry;
2836	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2837	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2838			 ip6_template_metrics, true);
2839
2840#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2841	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2842					       sizeof(*net->ipv6.ip6_prohibit_entry),
2843					       GFP_KERNEL);
2844	if (!net->ipv6.ip6_prohibit_entry)
2845		goto out_ip6_null_entry;
2846	net->ipv6.ip6_prohibit_entry->dst.path =
2847		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2848	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2849	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2850			 ip6_template_metrics, true);
2851
2852	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2853					       sizeof(*net->ipv6.ip6_blk_hole_entry),
2854					       GFP_KERNEL);
2855	if (!net->ipv6.ip6_blk_hole_entry)
2856		goto out_ip6_prohibit_entry;
2857	net->ipv6.ip6_blk_hole_entry->dst.path =
2858		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2859	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2860	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2861			 ip6_template_metrics, true);
2862#endif
2863
2864	net->ipv6.sysctl.flush_delay = 0;
2865	net->ipv6.sysctl.ip6_rt_max_size = 4096;
2866	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2867	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2868	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2869	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2870	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2871	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2872
2873#ifdef CONFIG_PROC_FS
2874	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2875	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2876#endif
2877	net->ipv6.ip6_rt_gc_expire = 30*HZ;
2878
2879	ret = 0;
2880out:
2881	return ret;
2882
2883#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2884out_ip6_prohibit_entry:
2885	kfree(net->ipv6.ip6_prohibit_entry);
2886out_ip6_null_entry:
2887	kfree(net->ipv6.ip6_null_entry);
2888#endif
2889out_ip6_dst_entries:
2890	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2891out_ip6_dst_ops:
2892	goto out;
2893}
2894
/*
 * Per-namespace teardown, counterpart of ip6_route_net_init(): removes the
 * proc entries, frees the namespace's special route entries and destroys
 * the dst entry accounting of the namespace's dst ops.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
#endif
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
2908
/*
 * Per-namespace init/exit hooks; registered in ip6_route_init() and
 * unregistered in ip6_route_cleanup().
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
2913
/*
 * Netdevice notifier block that routes device events to
 * ip6_route_dev_notify(); registered in ip6_route_init() and unregistered
 * in ip6_route_cleanup().
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};
2918
2919int __init ip6_route_init(void)
2920{
2921	int ret;
2922
2923	ret = -ENOMEM;
2924	ip6_dst_ops_template.kmem_cachep =
2925		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2926				  SLAB_HWCACHE_ALIGN, NULL);
2927	if (!ip6_dst_ops_template.kmem_cachep)
2928		goto out;
2929
2930	ret = dst_entries_init(&ip6_dst_blackhole_ops);
2931	if (ret)
2932		goto out_kmem_cache;
2933
2934	ret = register_pernet_subsys(&ip6_route_net_ops);
2935	if (ret)
2936		goto out_dst_entries;
2937
2938	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2939
2940	/* Registering of the loopback is done before this portion of code,
2941	 * the loopback reference in rt6_info will not be taken, do it
2942	 * manually for init_net */
2943	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2944	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2945  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2946	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2947	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2948	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2949	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2950  #endif
2951	ret = fib6_init();
2952	if (ret)
2953		goto out_register_subsys;
2954
2955	ret = xfrm6_init();
2956	if (ret)
2957		goto out_fib6_init;
2958
2959	ret = fib6_rules_init();
2960	if (ret)
2961		goto xfrm6_init;
2962
2963	ret = -ENOBUFS;
2964	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2965	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2966	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2967		goto fib6_rules_init;
2968
2969	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2970	if (ret)
2971		goto fib6_rules_init;
2972
2973out:
2974	return ret;
2975
2976fib6_rules_init:
2977	fib6_rules_cleanup();
2978xfrm6_init:
2979	xfrm6_fini();
2980out_fib6_init:
2981	fib6_gc_cleanup();
2982out_register_subsys:
2983	unregister_pernet_subsys(&ip6_route_net_ops);
2984out_dst_entries:
2985	dst_entries_destroy(&ip6_dst_blackhole_ops);
2986out_kmem_cache:
2987	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2988	goto out;
2989}
2990
/*
 * Tear down what ip6_route_init() set up, in reverse order: netdevice
 * notifier, fib6 rules, xfrm6, fib6, per-namespace operations, blackhole
 * dst entry accounting and finally the rt6_info slab cache.
 *
 * NOTE(review): the rtnetlink handlers registered by ip6_route_init() are
 * not unregistered here; presumably the caller takes care of that - confirm.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}