v5.4
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		ROUTE - implementation of the IP router.
   8 *
   9 * Authors:	Ross Biro
  10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14 *
  15 * Fixes:
  16 *		Alan Cox	:	Verify area fixes.
  17 *		Alan Cox	:	cli() protects routing changes
  18 *		Rui Oliveira	:	ICMP routing table updates
  19 *		(rco@di.uminho.pt)	Routing table insertion and update
  20 *		Linus Torvalds	:	Rewrote bits to be sensible
  21 *		Alan Cox	:	Added BSD route gw semantics
  22 *		Alan Cox	:	Super /proc >4K
  23 *		Alan Cox	:	MTU in route table
  24 *		Alan Cox	: 	MSS actually. Also added the window
  25 *					clamper.
  26 *		Sam Lantinga	:	Fixed route matching in rt_del()
  27 *		Alan Cox	:	Routing cache support.
  28 *		Alan Cox	:	Removed compatibility cruft.
  29 *		Alan Cox	:	RTF_REJECT support.
  30 *		Alan Cox	:	TCP irtt support.
  31 *		Jonathan Naylor	:	Added Metric support.
  32 *	Miquel van Smoorenburg	:	BSD API fixes.
  33 *	Miquel van Smoorenburg	:	Metrics.
  34 *		Alan Cox	:	Use __u32 properly
  35 *		Alan Cox	:	Aligned routing errors more closely with BSD,
  36 *					though our system is still very different.
  37 *		Alan Cox	:	Faster /proc handling
  38 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  39 *					routing caches and better behaviour.
  40 *
  41 *		Olaf Erb	:	irtt wasn't being copied right.
  42 *		Bjorn Ekwall	:	Kerneld route support.
  43 *		Alan Cox	:	Multicast fixed (I hope)
  44 * 		Pavel Krauz	:	Limited broadcast fixed
  45 *		Mike McLagan	:	Routing by source
  46 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  47 *					route.c and rewritten from scratch.
  48 *		Andi Kleen	:	Load-limit warning messages.
  49 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  50 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  51 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  52 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  53 *		Marc Boucher	:	routing by fwmark
  54 *	Robert Olsson		:	Added rt_cache statistics
  55 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  56 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  57 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  58 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  59 */
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
  64#include <linux/uaccess.h>
  65#include <linux/bitops.h>
  66#include <linux/types.h>
  67#include <linux/kernel.h>
  68#include <linux/mm.h>
  69#include <linux/string.h>
  70#include <linux/socket.h>
  71#include <linux/sockios.h>
  72#include <linux/errno.h>
  73#include <linux/in.h>
  74#include <linux/inet.h>
  75#include <linux/netdevice.h>
  76#include <linux/proc_fs.h>
  77#include <linux/init.h>
  78#include <linux/skbuff.h>
  79#include <linux/inetdevice.h>
  80#include <linux/igmp.h>
  81#include <linux/pkt_sched.h>
  82#include <linux/mroute.h>
  83#include <linux/netfilter_ipv4.h>
  84#include <linux/random.h>
  85#include <linux/rcupdate.h>
  86#include <linux/times.h>
  87#include <linux/slab.h>
  88#include <linux/jhash.h>
  89#include <net/dst.h>
  90#include <net/dst_metadata.h>
  91#include <net/net_namespace.h>
  92#include <net/protocol.h>
  93#include <net/ip.h>
  94#include <net/route.h>
  95#include <net/inetpeer.h>
  96#include <net/sock.h>
  97#include <net/ip_fib.h>
  98#include <net/nexthop.h>
  99#include <net/arp.h>
 100#include <net/tcp.h>
 101#include <net/icmp.h>
 102#include <net/xfrm.h>
 103#include <net/lwtunnel.h>
 104#include <net/netevent.h>
 105#include <net/rtnetlink.h>
 106#ifdef CONFIG_SYSCTL
 107#include <linux/sysctl.h>
 108#endif
 109#include <net/secure_seq.h>
 110#include <net/ip_tunnels.h>
 111#include <net/l3mdev.h>
 112
 113#include "fib_lookup.h"
 114
 115#define RT_FL_TOS(oldflp4) \
 116	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 117
 118#define RT_GC_TIMEOUT (300*HZ)
 119
 120static int ip_rt_max_size;
 121static int ip_rt_redirect_number __read_mostly	= 9;
 122static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 123static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 124static int ip_rt_error_cost __read_mostly	= HZ;
 125static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 126static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 127static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 128static int ip_rt_min_advmss __read_mostly	= 256;
 129
 130static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 131
 132/*
 133 *	Interface to generic destination cache.
 134 */
 135
 136static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 137static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 138static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 139static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 140static void		 ipv4_link_failure(struct sk_buff *skb);
 141static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 142					   struct sk_buff *skb, u32 mtu);
 143static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 144					struct sk_buff *skb);
 145static void		ipv4_dst_destroy(struct dst_entry *dst);
 146
 147static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 148{
 149	WARN_ON(1);
 150	return NULL;
 151}
 152
 153static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 154					   struct sk_buff *skb,
 155					   const void *daddr);
 156static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 157
 158static struct dst_ops ipv4_dst_ops = {
 159	.family =		AF_INET,
 160	.check =		ipv4_dst_check,
 161	.default_advmss =	ipv4_default_advmss,
 162	.mtu =			ipv4_mtu,
 163	.cow_metrics =		ipv4_cow_metrics,
 164	.destroy =		ipv4_dst_destroy,
 165	.negative_advice =	ipv4_negative_advice,
 166	.link_failure =		ipv4_link_failure,
 167	.update_pmtu =		ip_rt_update_pmtu,
 168	.redirect =		ip_do_redirect,
 169	.local_out =		__ip_local_out,
 170	.neigh_lookup =		ipv4_neigh_lookup,
 171	.confirm_neigh =	ipv4_confirm_neigh,
 172};
 173
 174#define ECN_OR_COST(class)	TC_PRIO_##class
 175
 176const __u8 ip_tos2prio[16] = {
 177	TC_PRIO_BESTEFFORT,
 178	ECN_OR_COST(BESTEFFORT),
 179	TC_PRIO_BESTEFFORT,
 180	ECN_OR_COST(BESTEFFORT),
 181	TC_PRIO_BULK,
 182	ECN_OR_COST(BULK),
 183	TC_PRIO_BULK,
 184	ECN_OR_COST(BULK),
 185	TC_PRIO_INTERACTIVE,
 186	ECN_OR_COST(INTERACTIVE),
 187	TC_PRIO_INTERACTIVE,
 188	ECN_OR_COST(INTERACTIVE),
 189	TC_PRIO_INTERACTIVE_BULK,
 190	ECN_OR_COST(INTERACTIVE_BULK),
 191	TC_PRIO_INTERACTIVE_BULK,
 192	ECN_OR_COST(INTERACTIVE_BULK)
 193};
 194EXPORT_SYMBOL(ip_tos2prio);
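/*
 * Rough usage sketch (the helper that consumes this table is expected to
 * live in include/net/route.h, not in this file): the four TOS bits of
 * the IP header index the table, something like
 *
 *	prio = ip_tos2prio[IPTOS_TOS(iph->tos) >> 1];
 *
 * so each adjacent pair of entries maps to the same traffic class
 * regardless of the low-order (ECN) bit.
 */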
 195
 196static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 197#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 198
 199#ifdef CONFIG_PROC_FS
 200static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 201{
 202	if (*pos)
 203		return NULL;
 204	return SEQ_START_TOKEN;
 205}
 206
 207static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 208{
 209	++*pos;
 210	return NULL;
 211}
 212
 213static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 214{
 215}
 216
 217static int rt_cache_seq_show(struct seq_file *seq, void *v)
 218{
 219	if (v == SEQ_START_TOKEN)
 220		seq_printf(seq, "%-127s\n",
 221			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 222			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 223			   "HHUptod\tSpecDst");
 224	return 0;
 225}
 226
 227static const struct seq_operations rt_cache_seq_ops = {
 228	.start  = rt_cache_seq_start,
 229	.next   = rt_cache_seq_next,
 230	.stop   = rt_cache_seq_stop,
 231	.show   = rt_cache_seq_show,
 232};
 233
 234static int rt_cache_seq_open(struct inode *inode, struct file *file)
 235{
 236	return seq_open(file, &rt_cache_seq_ops);
 237}
 238
 239static const struct file_operations rt_cache_seq_fops = {
 240	.open	 = rt_cache_seq_open,
 241	.read	 = seq_read,
 242	.llseek	 = seq_lseek,
 243	.release = seq_release,
 244};
 245
 246
 247static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 248{
 249	int cpu;
 250
 251	if (*pos == 0)
 252		return SEQ_START_TOKEN;
 253
 254	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 255		if (!cpu_possible(cpu))
 256			continue;
 257		*pos = cpu+1;
 258		return &per_cpu(rt_cache_stat, cpu);
 259	}
 260	return NULL;
 261}
 262
 263static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 264{
 265	int cpu;
 266
 267	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 268		if (!cpu_possible(cpu))
 269			continue;
 270		*pos = cpu+1;
 271		return &per_cpu(rt_cache_stat, cpu);
 272	}
 273	return NULL;
 274
 275}
 276
 277static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 278{
 279
 280}
 281
 282static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 283{
 284	struct rt_cache_stat *st = v;
 285
 286	if (v == SEQ_START_TOKEN) {
 287		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 288		return 0;
 289	}
 290
 291	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 292		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 293		   dst_entries_get_slow(&ipv4_dst_ops),
 294		   0, /* st->in_hit */
 295		   st->in_slow_tot,
 296		   st->in_slow_mc,
 297		   st->in_no_route,
 298		   st->in_brd,
 299		   st->in_martian_dst,
 300		   st->in_martian_src,
 301
 302		   0, /* st->out_hit */
 303		   st->out_slow_tot,
 304		   st->out_slow_mc,
 305
 306		   0, /* st->gc_total */
 307		   0, /* st->gc_ignored */
 308		   0, /* st->gc_goal_miss */
 309		   0, /* st->gc_dst_overflow */
 310		   0, /* st->in_hlist_search */
 311		   0  /* st->out_hlist_search */
 312		);
 313	return 0;
 314}
 315
 316static const struct seq_operations rt_cpu_seq_ops = {
 317	.start  = rt_cpu_seq_start,
 318	.next   = rt_cpu_seq_next,
 319	.stop   = rt_cpu_seq_stop,
 320	.show   = rt_cpu_seq_show,
 321};
 322
 323
 324static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 325{
 326	return seq_open(file, &rt_cpu_seq_ops);
 327}
 328
 329static const struct file_operations rt_cpu_seq_fops = {
 330	.open	 = rt_cpu_seq_open,
 331	.read	 = seq_read,
 332	.llseek	 = seq_lseek,
 333	.release = seq_release,
 334};
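/*
 * Reading /proc/net/stat/rt_cache with the ops above prints the header
 * row from rt_cpu_seq_show() followed by one line of hex counters per
 * possible CPU; the old cache-hit and garbage-collection columns are
 * emitted as constant zeros, as the commented-out fields show.
 */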
 335
 336#ifdef CONFIG_IP_ROUTE_CLASSID
 337static int rt_acct_proc_show(struct seq_file *m, void *v)
 338{
 339	struct ip_rt_acct *dst, *src;
 340	unsigned int i, j;
 341
 342	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 343	if (!dst)
 344		return -ENOMEM;
 345
 346	for_each_possible_cpu(i) {
 347		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 348		for (j = 0; j < 256; j++) {
 349			dst[j].o_bytes   += src[j].o_bytes;
 350			dst[j].o_packets += src[j].o_packets;
 351			dst[j].i_bytes   += src[j].i_bytes;
 352			dst[j].i_packets += src[j].i_packets;
 353		}
 354	}
 355
 356	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 357	kfree(dst);
 358	return 0;
 359}
 360#endif
 361
 362static int __net_init ip_rt_do_proc_init(struct net *net)
 363{
 364	struct proc_dir_entry *pde;
 365
 366	pde = proc_create("rt_cache", 0444, net->proc_net,
 367			  &rt_cache_seq_fops);
 368	if (!pde)
 369		goto err1;
 370
 371	pde = proc_create("rt_cache", 0444,
 372			  net->proc_net_stat, &rt_cpu_seq_fops);
 373	if (!pde)
 374		goto err2;
 375
 376#ifdef CONFIG_IP_ROUTE_CLASSID
 377	pde = proc_create_single("rt_acct", 0, net->proc_net,
 378			rt_acct_proc_show);
 379	if (!pde)
 380		goto err3;
 381#endif
 382	return 0;
 383
 384#ifdef CONFIG_IP_ROUTE_CLASSID
 385err3:
 386	remove_proc_entry("rt_cache", net->proc_net_stat);
 387#endif
 388err2:
 389	remove_proc_entry("rt_cache", net->proc_net);
 390err1:
 391	return -ENOMEM;
 392}
 393
 394static void __net_exit ip_rt_do_proc_exit(struct net *net)
 395{
 396	remove_proc_entry("rt_cache", net->proc_net_stat);
 397	remove_proc_entry("rt_cache", net->proc_net);
 398#ifdef CONFIG_IP_ROUTE_CLASSID
 399	remove_proc_entry("rt_acct", net->proc_net);
 400#endif
 401}
 402
 403static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 404	.init = ip_rt_do_proc_init,
 405	.exit = ip_rt_do_proc_exit,
 406};
 407
 408static int __init ip_rt_proc_init(void)
 409{
 410	return register_pernet_subsys(&ip_rt_proc_ops);
 411}
 412
 413#else
 414static inline int ip_rt_proc_init(void)
 415{
 416	return 0;
 417}
 418#endif /* CONFIG_PROC_FS */
 419
 420static inline bool rt_is_expired(const struct rtable *rth)
 421{
 422	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 423}
 424
 425void rt_cache_flush(struct net *net)
 426{
 427	rt_genid_bump_ipv4(net);
 428}
 429
 430static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 431					   struct sk_buff *skb,
 432					   const void *daddr)
 433{
 434	const struct rtable *rt = container_of(dst, struct rtable, dst);
 435	struct net_device *dev = dst->dev;
 436	struct neighbour *n;
 437
 438	rcu_read_lock_bh();
 439
 440	if (likely(rt->rt_gw_family == AF_INET)) {
 441		n = ip_neigh_gw4(dev, rt->rt_gw4);
 442	} else if (rt->rt_gw_family == AF_INET6) {
 443		n = ip_neigh_gw6(dev, &rt->rt_gw6);
  444	} else {
 445		__be32 pkey;
 446
 447		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 448		n = ip_neigh_gw4(dev, pkey);
 449	}
 450
 451	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 452		n = NULL;
 453
 454	rcu_read_unlock_bh();
 455
 456	return n;
 457}
 458
 459static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 460{
 461	const struct rtable *rt = container_of(dst, struct rtable, dst);
 462	struct net_device *dev = dst->dev;
 463	const __be32 *pkey = daddr;
 464
 465	if (rt->rt_gw_family == AF_INET) {
 466		pkey = (const __be32 *)&rt->rt_gw4;
 467	} else if (rt->rt_gw_family == AF_INET6) {
 468		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 469	} else if (!daddr ||
 470		 (rt->rt_flags &
 471		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 472		return;
 473	}
 474	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 475}
 476
 477#define IP_IDENTS_SZ 2048u
 478
 479static atomic_t *ip_idents __read_mostly;
 480static u32 *ip_tstamps __read_mostly;
 481
 482/* In order to protect privacy, we add a perturbation to identifiers
  483 * if one generator is seldom used. This makes it hard for an attacker
 484 * to infer how many packets were sent between two points in time.
 485 */
 486u32 ip_idents_reserve(u32 hash, int segs)
 487{
 488	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 489	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 490	u32 old = READ_ONCE(*p_tstamp);
 491	u32 now = (u32)jiffies;
 492	u32 new, delta = 0;
 493
 494	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 495		delta = prandom_u32_max(now - old);
 496
 497	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
 498	do {
 499		old = (u32)atomic_read(p_id);
 500		new = old + delta + segs;
 501	} while (atomic_cmpxchg(p_id, old, new) != old);
 502
 503	return new - segs;
 504}
 505EXPORT_SYMBOL(ip_idents_reserve);
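/*
 * Usage sketch (see __ip_select_ident() below for the in-tree caller):
 *
 *	id = ip_idents_reserve(hash, segs);
 *	iph->id = htons(id);
 *
 * reserves "segs" consecutive IDs starting at "id".  The randomized
 * delta injected when a bucket has been idle makes reservations
 * non-contiguous across idle periods, so an observer cannot count how
 * many packets were sent in between.
 */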
 506
 507void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 508{
 509	u32 hash, id;
 510
 511	/* Note the following code is not safe, but this is okay. */
 512	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 513		get_random_bytes(&net->ipv4.ip_id_key,
 514				 sizeof(net->ipv4.ip_id_key));
 515
 516	hash = siphash_3u32((__force u32)iph->daddr,
 517			    (__force u32)iph->saddr,
 518			    iph->protocol,
 519			    &net->ipv4.ip_id_key);
 520	id = ip_idents_reserve(hash, segs);
 521	iph->id = htons(id);
 522}
 523EXPORT_SYMBOL(__ip_select_ident);
 524
 525static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 526			     const struct sock *sk,
 527			     const struct iphdr *iph,
 528			     int oif, u8 tos,
 529			     u8 prot, u32 mark, int flow_flags)
 530{
 531	if (sk) {
 532		const struct inet_sock *inet = inet_sk(sk);
 533
 534		oif = sk->sk_bound_dev_if;
 535		mark = sk->sk_mark;
 536		tos = RT_CONN_FLAGS(sk);
 537		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 538	}
 539	flowi4_init_output(fl4, oif, mark, tos,
 540			   RT_SCOPE_UNIVERSE, prot,
 541			   flow_flags,
 542			   iph->daddr, iph->saddr, 0, 0,
 543			   sock_net_uid(net, sk));
 544}
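/*
 * Note on the helper above: when a socket is supplied, its bound device,
 * mark, connection TOS and protocol override the values taken from the
 * packet header, so the resulting flow key matches what the socket
 * itself would generate on output.
 */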
 545
 546static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 547			       const struct sock *sk)
 548{
 549	const struct net *net = dev_net(skb->dev);
 550	const struct iphdr *iph = ip_hdr(skb);
 551	int oif = skb->dev->ifindex;
 552	u8 tos = RT_TOS(iph->tos);
 553	u8 prot = iph->protocol;
 554	u32 mark = skb->mark;
 555
 556	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 557}
 558
 559static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 560{
 561	const struct inet_sock *inet = inet_sk(sk);
 562	const struct ip_options_rcu *inet_opt;
 563	__be32 daddr = inet->inet_daddr;
 564
 565	rcu_read_lock();
 566	inet_opt = rcu_dereference(inet->inet_opt);
 567	if (inet_opt && inet_opt->opt.srr)
 568		daddr = inet_opt->opt.faddr;
 569	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 570			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 571			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 572			   inet_sk_flowi_flags(sk),
 573			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 574	rcu_read_unlock();
 575}
 576
 577static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 578				 const struct sk_buff *skb)
 579{
 580	if (skb)
 581		build_skb_flow_key(fl4, skb, sk);
 582	else
 583		build_sk_flow_key(fl4, sk);
 584}
 585
 586static DEFINE_SPINLOCK(fnhe_lock);
 587
 588static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 589{
 590	struct rtable *rt;
 591
 592	rt = rcu_dereference(fnhe->fnhe_rth_input);
 593	if (rt) {
 594		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 595		dst_dev_put(&rt->dst);
 596		dst_release(&rt->dst);
 597	}
 598	rt = rcu_dereference(fnhe->fnhe_rth_output);
 599	if (rt) {
 600		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 601		dst_dev_put(&rt->dst);
 602		dst_release(&rt->dst);
 603	}
 604}
 605
 606static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 607{
 608	struct fib_nh_exception *fnhe, *oldest;
 609
 610	oldest = rcu_dereference(hash->chain);
 611	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 612	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 613		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 614			oldest = fnhe;
 615	}
 616	fnhe_flush_routes(oldest);
 617	return oldest;
 618}
 619
 620static inline u32 fnhe_hashfun(__be32 daddr)
 621{
 622	static u32 fnhe_hashrnd __read_mostly;
 623	u32 hval;
 624
 625	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 626	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 627	return hash_32(hval, FNHE_HASH_SHIFT);
 628}
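/*
 * The value above selects one of the per-nexthop exception buckets
 * (FNHE_HASH_SHIFT and FNHE_HASH_SIZE come from the fib headers, not
 * this file); update_or_create_fnhe() and find_exception() below must
 * use the same function so an exception stored for a destination is
 * found again on later lookups.
 */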
 629
 630static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 631{
 632	rt->rt_pmtu = fnhe->fnhe_pmtu;
 633	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 634	rt->dst.expires = fnhe->fnhe_expires;
 635
 636	if (fnhe->fnhe_gw) {
 637		rt->rt_flags |= RTCF_REDIRECTED;
 638		rt->rt_uses_gateway = 1;
 639		rt->rt_gw_family = AF_INET;
 640		rt->rt_gw4 = fnhe->fnhe_gw;
 641	}
 642}
 643
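/*
 * Sketch of the flow below: look up (or lazily allocate) the nexthop's
 * exception hash, then either refresh the existing entry for daddr or
 * create a new one, reusing the oldest entry in the bucket once the
 * chain is deeper than FNHE_RECLAIM_DEPTH.  A newly created exception
 * also marks the nexthop's cached input and per-cpu output routes
 * DST_OBSOLETE_KILL so later lookups revalidate against it.
 */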
 644static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 645				  __be32 gw, u32 pmtu, bool lock,
 646				  unsigned long expires)
 647{
 648	struct fnhe_hash_bucket *hash;
 649	struct fib_nh_exception *fnhe;
 650	struct rtable *rt;
 651	u32 genid, hval;
 652	unsigned int i;
 653	int depth;
 654
 655	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 656	hval = fnhe_hashfun(daddr);
 657
 658	spin_lock_bh(&fnhe_lock);
 659
 660	hash = rcu_dereference(nhc->nhc_exceptions);
 661	if (!hash) {
 662		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 663		if (!hash)
 664			goto out_unlock;
 665		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 666	}
 667
 668	hash += hval;
 669
 670	depth = 0;
 671	for (fnhe = rcu_dereference(hash->chain); fnhe;
 672	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 673		if (fnhe->fnhe_daddr == daddr)
 674			break;
 675		depth++;
 676	}
 677
 678	if (fnhe) {
 679		if (fnhe->fnhe_genid != genid)
 680			fnhe->fnhe_genid = genid;
 681		if (gw)
 682			fnhe->fnhe_gw = gw;
 683		if (pmtu) {
 684			fnhe->fnhe_pmtu = pmtu;
 685			fnhe->fnhe_mtu_locked = lock;
 686		}
 687		fnhe->fnhe_expires = max(1UL, expires);
 688		/* Update all cached dsts too */
 689		rt = rcu_dereference(fnhe->fnhe_rth_input);
 690		if (rt)
 691			fill_route_from_fnhe(rt, fnhe);
 692		rt = rcu_dereference(fnhe->fnhe_rth_output);
 693		if (rt)
 694			fill_route_from_fnhe(rt, fnhe);
 695	} else {
 696		if (depth > FNHE_RECLAIM_DEPTH)
 697			fnhe = fnhe_oldest(hash);
 698		else {
 699			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 700			if (!fnhe)
 701				goto out_unlock;
 702
 703			fnhe->fnhe_next = hash->chain;
 704			rcu_assign_pointer(hash->chain, fnhe);
 705		}
 706		fnhe->fnhe_genid = genid;
 707		fnhe->fnhe_daddr = daddr;
 708		fnhe->fnhe_gw = gw;
 709		fnhe->fnhe_pmtu = pmtu;
 710		fnhe->fnhe_mtu_locked = lock;
 711		fnhe->fnhe_expires = max(1UL, expires);
 712
 713		/* Exception created; mark the cached routes for the nexthop
 714		 * stale, so anyone caching it rechecks if this exception
 715		 * applies to them.
 716		 */
 717		rt = rcu_dereference(nhc->nhc_rth_input);
 718		if (rt)
 719			rt->dst.obsolete = DST_OBSOLETE_KILL;
 720
 721		for_each_possible_cpu(i) {
 722			struct rtable __rcu **prt;
 723			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 724			rt = rcu_dereference(*prt);
 725			if (rt)
 726				rt->dst.obsolete = DST_OBSOLETE_KILL;
 727		}
 728	}
 729
 730	fnhe->fnhe_stamp = jiffies;
 731
 732out_unlock:
 733	spin_unlock_bh(&fnhe_lock);
 734}
 735
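/*
 * Sketch of the redirect handling below: the gateway advertised by an
 * ICMP redirect is accepted only if the message comes from the route's
 * current gateway, the new address passes basic sanity checks (not
 * multicast, broadcast or zeronet, and on-link unless shared media) and
 * it resolves to a valid neighbour; the result is then recorded as a
 * nexthop exception rather than by rewriting the FIB.
 */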
 736static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 737			     bool kill_route)
 738{
 739	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 740	__be32 old_gw = ip_hdr(skb)->saddr;
 741	struct net_device *dev = skb->dev;
 742	struct in_device *in_dev;
 743	struct fib_result res;
 744	struct neighbour *n;
 745	struct net *net;
 746
 747	switch (icmp_hdr(skb)->code & 7) {
 748	case ICMP_REDIR_NET:
 749	case ICMP_REDIR_NETTOS:
 750	case ICMP_REDIR_HOST:
 751	case ICMP_REDIR_HOSTTOS:
 752		break;
 753
 754	default:
 755		return;
 756	}
 757
 758	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 759		return;
 760
 761	in_dev = __in_dev_get_rcu(dev);
 762	if (!in_dev)
 763		return;
 764
 765	net = dev_net(dev);
 766	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 767	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 768	    ipv4_is_zeronet(new_gw))
 769		goto reject_redirect;
 770
 771	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 772		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 773			goto reject_redirect;
 774		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 775			goto reject_redirect;
 776	} else {
 777		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 778			goto reject_redirect;
 779	}
 780
 781	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 782	if (!n)
 783		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 784	if (!IS_ERR(n)) {
 785		if (!(n->nud_state & NUD_VALID)) {
 786			neigh_event_send(n, NULL);
 787		} else {
 788			if (fib_lookup(net, fl4, &res, 0) == 0) {
 789				struct fib_nh_common *nhc = FIB_RES_NHC(res);
 790
 791				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 792						0, false,
 793						jiffies + ip_rt_gc_timeout);
 794			}
 795			if (kill_route)
 796				rt->dst.obsolete = DST_OBSOLETE_KILL;
 797			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 798		}
 799		neigh_release(n);
 800	}
 801	return;
 802
 803reject_redirect:
 804#ifdef CONFIG_IP_ROUTE_VERBOSE
 805	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 806		const struct iphdr *iph = (const struct iphdr *) skb->data;
 807		__be32 daddr = iph->daddr;
 808		__be32 saddr = iph->saddr;
 809
 810		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 811				     "  Advised path = %pI4 -> %pI4\n",
 812				     &old_gw, dev->name, &new_gw,
 813				     &saddr, &daddr);
 814	}
 815#endif
 816	;
 817}
 818
 819static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 820{
 821	struct rtable *rt;
 822	struct flowi4 fl4;
 823	const struct iphdr *iph = (const struct iphdr *) skb->data;
 824	struct net *net = dev_net(skb->dev);
 825	int oif = skb->dev->ifindex;
 826	u8 tos = RT_TOS(iph->tos);
 827	u8 prot = iph->protocol;
 828	u32 mark = skb->mark;
 829
 830	rt = (struct rtable *) dst;
 831
 832	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 833	__ip_do_redirect(rt, skb, &fl4, true);
 834}
 835
 836static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 837{
 838	struct rtable *rt = (struct rtable *)dst;
 839	struct dst_entry *ret = dst;
 840
 841	if (rt) {
 842		if (dst->obsolete > 0) {
 843			ip_rt_put(rt);
 844			ret = NULL;
 845		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 846			   rt->dst.expires) {
 847			ip_rt_put(rt);
 848			ret = NULL;
 849		}
 850	}
 851	return ret;
 852}
 853
 854/*
 855 * Algorithm:
 856 *	1. The first ip_rt_redirect_number redirects are sent
 857 *	   with exponential backoff, then we stop sending them at all,
 858 *	   assuming that the host ignores our redirects.
 859 *	2. If we did not see packets requiring redirects
 860 *	   during ip_rt_redirect_silence, we assume that the host
  861 *	   forgot the redirected route and start to send redirects again.
 862 *
 863 * This algorithm is much cheaper and more intelligent than dumb load limiting
 864 * in icmp.c.
 865 *
 866 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 867 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 868 */
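/*
 * Worked example with the defaults defined near the top of this file
 * (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9,
 * ip_rt_redirect_silence = (HZ/50) << 10): successive redirects to the
 * same peer are spaced by roughly ip_rt_redirect_load << n_redirects
 * (40ms, 80ms, ... up to ~5s), and after nine ignored redirects we stop
 * entirely until the peer has triggered no redirects for about 20
 * seconds.
 */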
 869
 870void ip_rt_send_redirect(struct sk_buff *skb)
 871{
 872	struct rtable *rt = skb_rtable(skb);
 873	struct in_device *in_dev;
 874	struct inet_peer *peer;
 875	struct net *net;
 876	int log_martians;
 877	int vif;
 878
 879	rcu_read_lock();
 880	in_dev = __in_dev_get_rcu(rt->dst.dev);
 881	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 882		rcu_read_unlock();
 883		return;
 884	}
 885	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 886	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 887	rcu_read_unlock();
 888
 889	net = dev_net(rt->dst.dev);
 890	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 891	if (!peer) {
 892		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 893			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 894		return;
 895	}
 896
 897	/* No redirected packets during ip_rt_redirect_silence;
 898	 * reset the algorithm.
 899	 */
 900	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 901		peer->rate_tokens = 0;
 902		peer->n_redirects = 0;
 903	}
 904
  905	/* Too many ignored redirects; do not send anything.
  906	 * Set dst.rate_last to the last seen redirected packet.
 907	 */
 908	if (peer->n_redirects >= ip_rt_redirect_number) {
 909		peer->rate_last = jiffies;
 910		goto out_put_peer;
 911	}
 912
 913	/* Check for load limit; set rate_last to the latest sent
 914	 * redirect.
 915	 */
 916	if (peer->rate_tokens == 0 ||
 917	    time_after(jiffies,
 918		       (peer->rate_last +
 919			(ip_rt_redirect_load << peer->n_redirects)))) {
 920		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 921
 922		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 923		peer->rate_last = jiffies;
 924		++peer->n_redirects;
 925#ifdef CONFIG_IP_ROUTE_VERBOSE
 926		if (log_martians &&
 927		    peer->n_redirects == ip_rt_redirect_number)
 928			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 929					     &ip_hdr(skb)->saddr, inet_iif(skb),
 930					     &ip_hdr(skb)->daddr, &gw);
 931#endif
 932	}
 933out_put_peer:
 934	inet_putpeer(peer);
 935}
 936
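/*
 * The rate limit in ip_error() below is a small token bucket kept per
 * inet_peer: tokens accrue at one per jiffy, are capped at
 * ip_rt_error_burst (5*HZ by default) and every ICMP error sent costs
 * ip_rt_error_cost (HZ), i.e. roughly one error per second per source
 * address with bursts of up to five.
 */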
 937static int ip_error(struct sk_buff *skb)
 938{
 939	struct rtable *rt = skb_rtable(skb);
 940	struct net_device *dev = skb->dev;
 941	struct in_device *in_dev;
 942	struct inet_peer *peer;
 943	unsigned long now;
 944	struct net *net;
 945	bool send;
 946	int code;
 947
 948	if (netif_is_l3_master(skb->dev)) {
 949		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 950		if (!dev)
 951			goto out;
 952	}
 953
 954	in_dev = __in_dev_get_rcu(dev);
 955
 956	/* IP on this device is disabled. */
 957	if (!in_dev)
 958		goto out;
 959
 960	net = dev_net(rt->dst.dev);
 961	if (!IN_DEV_FORWARD(in_dev)) {
 962		switch (rt->dst.error) {
 963		case EHOSTUNREACH:
 964			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 965			break;
 966
 967		case ENETUNREACH:
 968			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 969			break;
 970		}
 971		goto out;
 972	}
 973
 974	switch (rt->dst.error) {
 975	case EINVAL:
 976	default:
 977		goto out;
 978	case EHOSTUNREACH:
 979		code = ICMP_HOST_UNREACH;
 980		break;
 981	case ENETUNREACH:
 982		code = ICMP_NET_UNREACH;
 983		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 984		break;
 985	case EACCES:
 986		code = ICMP_PKT_FILTERED;
 987		break;
 988	}
 989
 990	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 991			       l3mdev_master_ifindex(skb->dev), 1);
 992
 993	send = true;
 994	if (peer) {
 995		now = jiffies;
 996		peer->rate_tokens += now - peer->rate_last;
 997		if (peer->rate_tokens > ip_rt_error_burst)
 998			peer->rate_tokens = ip_rt_error_burst;
 999		peer->rate_last = now;
1000		if (peer->rate_tokens >= ip_rt_error_cost)
1001			peer->rate_tokens -= ip_rt_error_cost;
1002		else
1003			send = false;
1004		inet_putpeer(peer);
1005	}
1006	if (send)
1007		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1008
1009out:	kfree_skb(skb);
1010	return 0;
1011}
1012
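/*
 * The PMTU update below never raises the path MTU and never goes under
 * ip_rt_min_pmtu (552 by default, i.e. 512 + 20 + 20): a smaller report
 * clamps the value to min(old_mtu, 552) and locks it, so even smaller
 * reports are ignored from then on.
 */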
1013static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014{
1015	struct dst_entry *dst = &rt->dst;
1016	u32 old_mtu = ipv4_mtu(dst);
1017	struct fib_result res;
1018	bool lock = false;
1019
1020	if (ip_mtu_locked(dst))
1021		return;
1022
1023	if (old_mtu < mtu)
1024		return;
1025
1026	if (mtu < ip_rt_min_pmtu) {
1027		lock = true;
1028		mtu = min(old_mtu, ip_rt_min_pmtu);
1029	}
1030
1031	if (rt->rt_pmtu == mtu && !lock &&
1032	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1033		return;
1034
1035	rcu_read_lock();
1036	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1037		struct fib_nh_common *nhc = FIB_RES_NHC(res);
1038
1039		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1040				      jiffies + ip_rt_mtu_expires);
1041	}
1042	rcu_read_unlock();
1043}
1044
1045static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1046			      struct sk_buff *skb, u32 mtu)
1047{
1048	struct rtable *rt = (struct rtable *) dst;
1049	struct flowi4 fl4;
1050
1051	ip_rt_build_flow_key(&fl4, sk, skb);
1052	__ip_rt_update_pmtu(rt, &fl4, mtu);
1053}
1054
1055void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1056		      int oif, u8 protocol)
1057{
1058	const struct iphdr *iph = (const struct iphdr *) skb->data;
1059	struct flowi4 fl4;
1060	struct rtable *rt;
1061	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063	__build_flow_key(net, &fl4, NULL, iph, oif,
1064			 RT_TOS(iph->tos), protocol, mark, 0);
1065	rt = __ip_route_output_key(net, &fl4);
1066	if (!IS_ERR(rt)) {
1067		__ip_rt_update_pmtu(rt, &fl4, mtu);
1068		ip_rt_put(rt);
1069	}
1070}
1071EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1072
1073static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074{
1075	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076	struct flowi4 fl4;
1077	struct rtable *rt;
1078
1079	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081	if (!fl4.flowi4_mark)
1082		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084	rt = __ip_route_output_key(sock_net(sk), &fl4);
1085	if (!IS_ERR(rt)) {
1086		__ip_rt_update_pmtu(rt, &fl4, mtu);
1087		ip_rt_put(rt);
1088	}
1089}
1090
1091void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093	const struct iphdr *iph = (const struct iphdr *) skb->data;
1094	struct flowi4 fl4;
1095	struct rtable *rt;
1096	struct dst_entry *odst = NULL;
1097	bool new = false;
1098	struct net *net = sock_net(sk);
1099
1100	bh_lock_sock(sk);
1101
1102	if (!ip_sk_accept_pmtu(sk))
1103		goto out;
1104
1105	odst = sk_dst_get(sk);
1106
1107	if (sock_owned_by_user(sk) || !odst) {
1108		__ipv4_sk_update_pmtu(skb, sk, mtu);
1109		goto out;
1110	}
1111
1112	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114	rt = (struct rtable *)odst;
1115	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117		if (IS_ERR(rt))
1118			goto out;
1119
1120		new = true;
1121	}
1122
1123	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125	if (!dst_check(&rt->dst, 0)) {
1126		if (new)
1127			dst_release(&rt->dst);
1128
1129		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130		if (IS_ERR(rt))
1131			goto out;
1132
1133		new = true;
1134	}
1135
1136	if (new)
1137		sk_dst_set(sk, &rt->dst);
1138
1139out:
1140	bh_unlock_sock(sk);
1141	dst_release(odst);
1142}
1143EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146		   int oif, u8 protocol)
1147{
1148	const struct iphdr *iph = (const struct iphdr *) skb->data;
1149	struct flowi4 fl4;
1150	struct rtable *rt;
1151
1152	__build_flow_key(net, &fl4, NULL, iph, oif,
1153			 RT_TOS(iph->tos), protocol, 0, 0);
1154	rt = __ip_route_output_key(net, &fl4);
1155	if (!IS_ERR(rt)) {
1156		__ip_do_redirect(rt, skb, &fl4, false);
1157		ip_rt_put(rt);
1158	}
1159}
1160EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163{
1164	const struct iphdr *iph = (const struct iphdr *) skb->data;
1165	struct flowi4 fl4;
1166	struct rtable *rt;
1167	struct net *net = sock_net(sk);
1168
1169	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170	rt = __ip_route_output_key(net, &fl4);
1171	if (!IS_ERR(rt)) {
1172		__ip_do_redirect(rt, skb, &fl4, false);
1173		ip_rt_put(rt);
1174	}
1175}
1176EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179{
1180	struct rtable *rt = (struct rtable *) dst;
1181
1182	/* All IPV4 dsts are created with ->obsolete set to the value
1183	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184	 * into this function always.
1185	 *
1186	 * When a PMTU/redirect information update invalidates a route,
1187	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188	 * DST_OBSOLETE_DEAD.
1189	 */
1190	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191		return NULL;
1192	return dst;
1193}
1194
1195static void ipv4_send_dest_unreach(struct sk_buff *skb)
1196{
1197	struct ip_options opt;
1198	int res;
1199
1200	/* Recompile ip options since IPCB may not be valid anymore.
1201	 * Also check we have a reasonable ipv4 header.
1202	 */
1203	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1204	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1205		return;
1206
1207	memset(&opt, 0, sizeof(opt));
1208	if (ip_hdr(skb)->ihl > 5) {
1209		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1210			return;
1211		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1212
1213		rcu_read_lock();
1214		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1215		rcu_read_unlock();
1216
1217		if (res)
1218			return;
1219	}
1220	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1221}
1222
1223static void ipv4_link_failure(struct sk_buff *skb)
1224{
1225	struct rtable *rt;
1226
1227	ipv4_send_dest_unreach(skb);
1228
1229	rt = skb_rtable(skb);
1230	if (rt)
1231		dst_set_expires(&rt->dst, 0);
1232}
1233
1234static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1235{
1236	pr_debug("%s: %pI4 -> %pI4, %s\n",
1237		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1238		 skb->dev ? skb->dev->name : "?");
1239	kfree_skb(skb);
1240	WARN_ON(1);
1241	return 0;
1242}
1243
1244/*
 1245   We do not cache the source address of the outgoing interface,
 1246   because it is used only by the IP RR, TS and SRR options,
 1247   so it is out of the fast path.
 1248
 1249   BTW remember: "addr" is allowed to be unaligned
 1250   in IP options!
1251 */
1252
1253void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1254{
1255	__be32 src;
1256
1257	if (rt_is_output_route(rt))
1258		src = ip_hdr(skb)->saddr;
1259	else {
1260		struct fib_result res;
1261		struct iphdr *iph = ip_hdr(skb);
1262		struct flowi4 fl4 = {
1263			.daddr = iph->daddr,
1264			.saddr = iph->saddr,
1265			.flowi4_tos = RT_TOS(iph->tos),
1266			.flowi4_oif = rt->dst.dev->ifindex,
1267			.flowi4_iif = skb->dev->ifindex,
1268			.flowi4_mark = skb->mark,
1269		};
1270
1271		rcu_read_lock();
1272		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1273			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1274		else
1275			src = inet_select_addr(rt->dst.dev,
1276					       rt_nexthop(rt, iph->daddr),
1277					       RT_SCOPE_UNIVERSE);
1278		rcu_read_unlock();
1279	}
1280	memcpy(addr, &src, 4);
1281}
1282
1283#ifdef CONFIG_IP_ROUTE_CLASSID
1284static void set_class_tag(struct rtable *rt, u32 tag)
1285{
1286	if (!(rt->dst.tclassid & 0xFFFF))
1287		rt->dst.tclassid |= tag & 0xFFFF;
1288	if (!(rt->dst.tclassid & 0xFFFF0000))
1289		rt->dst.tclassid |= tag & 0xFFFF0000;
1290}
1291#endif
1292
1293static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1294{
1295	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1296	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1297				    ip_rt_min_advmss);
1298
1299	return min(advmss, IPV4_MAX_PMTU - header_size);
1300}
1301
1302static unsigned int ipv4_mtu(const struct dst_entry *dst)
1303{
1304	const struct rtable *rt = (const struct rtable *) dst;
1305	unsigned int mtu = rt->rt_pmtu;
1306
1307	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1308		mtu = dst_metric_raw(dst, RTAX_MTU);
1309
1310	if (mtu)
1311		return mtu;
1312
1313	mtu = READ_ONCE(dst->dev->mtu);
1314
1315	if (unlikely(ip_mtu_locked(dst))) {
1316		if (rt->rt_uses_gateway && mtu > 576)
1317			mtu = 576;
1318	}
1319
1320	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1321
1322	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1323}
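/*
 * Selection order in ipv4_mtu() above: a still-valid per-route PMTU
 * (rt_pmtu) wins, then an explicit RTAX_MTU metric, then the device MTU,
 * which is additionally clamped to 576 for locked routes via a gateway
 * and reduced by any lwtunnel encapsulation headroom.
 */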
1324
1325static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1326{
1327	struct fnhe_hash_bucket *hash;
1328	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1329	u32 hval = fnhe_hashfun(daddr);
1330
1331	spin_lock_bh(&fnhe_lock);
1332
1333	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1334					 lockdep_is_held(&fnhe_lock));
1335	hash += hval;
1336
1337	fnhe_p = &hash->chain;
1338	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1339	while (fnhe) {
1340		if (fnhe->fnhe_daddr == daddr) {
1341			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1342				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1343			/* set fnhe_daddr to 0 to ensure it won't bind with
1344			 * new dsts in rt_bind_exception().
1345			 */
1346			fnhe->fnhe_daddr = 0;
1347			fnhe_flush_routes(fnhe);
1348			kfree_rcu(fnhe, rcu);
1349			break;
1350		}
1351		fnhe_p = &fnhe->fnhe_next;
1352		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1353						 lockdep_is_held(&fnhe_lock));
1354	}
1355
1356	spin_unlock_bh(&fnhe_lock);
1357}
1358
1359static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1360					       __be32 daddr)
1361{
1362	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1363	struct fib_nh_exception *fnhe;
1364	u32 hval;
1365
1366	if (!hash)
1367		return NULL;
1368
1369	hval = fnhe_hashfun(daddr);
1370
1371	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1372	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1373		if (fnhe->fnhe_daddr == daddr) {
1374			if (fnhe->fnhe_expires &&
1375			    time_after(jiffies, fnhe->fnhe_expires)) {
1376				ip_del_fnhe(nhc, daddr);
1377				break;
1378			}
1379			return fnhe;
1380		}
1381	}
1382	return NULL;
1383}
1384
1385/* MTU selection:
1386 * 1. mtu on route is locked - use it
1387 * 2. mtu from nexthop exception
1388 * 3. mtu from egress device
1389 */
1390
1391u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1392{
1393	struct fib_nh_common *nhc = res->nhc;
1394	struct net_device *dev = nhc->nhc_dev;
1395	struct fib_info *fi = res->fi;
1396	u32 mtu = 0;
1397
1398	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1399	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1400		mtu = fi->fib_mtu;
1401
1402	if (likely(!mtu)) {
1403		struct fib_nh_exception *fnhe;
1404
1405		fnhe = find_exception(nhc, daddr);
1406		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1407			mtu = fnhe->fnhe_pmtu;
1408	}
1409
1410	if (likely(!mtu))
1411		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1412
1413	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1414}
1415
1416static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1417			      __be32 daddr, const bool do_cache)
1418{
1419	bool ret = false;
1420
1421	spin_lock_bh(&fnhe_lock);
1422
1423	if (daddr == fnhe->fnhe_daddr) {
1424		struct rtable __rcu **porig;
1425		struct rtable *orig;
1426		int genid = fnhe_genid(dev_net(rt->dst.dev));
1427
1428		if (rt_is_input_route(rt))
1429			porig = &fnhe->fnhe_rth_input;
1430		else
1431			porig = &fnhe->fnhe_rth_output;
1432		orig = rcu_dereference(*porig);
1433
1434		if (fnhe->fnhe_genid != genid) {
1435			fnhe->fnhe_genid = genid;
1436			fnhe->fnhe_gw = 0;
1437			fnhe->fnhe_pmtu = 0;
1438			fnhe->fnhe_expires = 0;
1439			fnhe->fnhe_mtu_locked = false;
1440			fnhe_flush_routes(fnhe);
1441			orig = NULL;
1442		}
1443		fill_route_from_fnhe(rt, fnhe);
1444		if (!rt->rt_gw4) {
1445			rt->rt_gw4 = daddr;
1446			rt->rt_gw_family = AF_INET;
1447		}
1448
1449		if (do_cache) {
1450			dst_hold(&rt->dst);
1451			rcu_assign_pointer(*porig, rt);
1452			if (orig) {
1453				dst_dev_put(&orig->dst);
1454				dst_release(&orig->dst);
1455			}
1456			ret = true;
1457		}
1458
1459		fnhe->fnhe_stamp = jiffies;
1460	}
1461	spin_unlock_bh(&fnhe_lock);
1462
1463	return ret;
1464}
1465
1466static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1467{
1468	struct rtable *orig, *prev, **p;
1469	bool ret = true;
1470
1471	if (rt_is_input_route(rt)) {
1472		p = (struct rtable **)&nhc->nhc_rth_input;
1473	} else {
1474		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1475	}
1476	orig = *p;
1477
1478	/* hold dst before doing cmpxchg() to avoid race condition
1479	 * on this dst
1480	 */
1481	dst_hold(&rt->dst);
1482	prev = cmpxchg(p, orig, rt);
1483	if (prev == orig) {
1484		if (orig) {
1485			rt_add_uncached_list(orig);
1486			dst_release(&orig->dst);
1487		}
1488	} else {
1489		dst_release(&rt->dst);
1490		ret = false;
1491	}
1492
1493	return ret;
1494}
1495
1496struct uncached_list {
1497	spinlock_t		lock;
1498	struct list_head	head;
1499};
1500
1501static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1502
1503void rt_add_uncached_list(struct rtable *rt)
1504{
1505	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1506
1507	rt->rt_uncached_list = ul;
1508
1509	spin_lock_bh(&ul->lock);
1510	list_add_tail(&rt->rt_uncached, &ul->head);
1511	spin_unlock_bh(&ul->lock);
1512}
1513
1514void rt_del_uncached_list(struct rtable *rt)
1515{
1516	if (!list_empty(&rt->rt_uncached)) {
1517		struct uncached_list *ul = rt->rt_uncached_list;
1518
1519		spin_lock_bh(&ul->lock);
1520		list_del(&rt->rt_uncached);
1521		spin_unlock_bh(&ul->lock);
1522	}
1523}
1524
1525static void ipv4_dst_destroy(struct dst_entry *dst)
1526{
1527	struct rtable *rt = (struct rtable *)dst;
1528
1529	ip_dst_metrics_put(dst);
1530	rt_del_uncached_list(rt);
1531}
1532
1533void rt_flush_dev(struct net_device *dev)
1534{
1535	struct rtable *rt;
1536	int cpu;
1537
1538	for_each_possible_cpu(cpu) {
1539		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1540
1541		spin_lock_bh(&ul->lock);
1542		list_for_each_entry(rt, &ul->head, rt_uncached) {
1543			if (rt->dst.dev != dev)
1544				continue;
1545			rt->dst.dev = blackhole_netdev;
1546			dev_hold(rt->dst.dev);
1547			dev_put(dev);
1548		}
1549		spin_unlock_bh(&ul->lock);
1550	}
1551}
1552
1553static bool rt_cache_valid(const struct rtable *rt)
1554{
1555	return	rt &&
1556		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557		!rt_is_expired(rt);
1558}
1559
1560static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561			   const struct fib_result *res,
1562			   struct fib_nh_exception *fnhe,
1563			   struct fib_info *fi, u16 type, u32 itag,
1564			   const bool do_cache)
1565{
1566	bool cached = false;
1567
1568	if (fi) {
1569		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572			rt->rt_uses_gateway = 1;
1573			rt->rt_gw_family = nhc->nhc_gw_family;
1574			/* only INET and INET6 are supported */
1575			if (likely(nhc->nhc_gw_family == AF_INET))
1576				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1577			else
1578				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1579		}
1580
1581		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1582
1583#ifdef CONFIG_IP_ROUTE_CLASSID
1584		if (nhc->nhc_family == AF_INET) {
1585			struct fib_nh *nh;
1586
1587			nh = container_of(nhc, struct fib_nh, nh_common);
1588			rt->dst.tclassid = nh->nh_tclassid;
1589		}
1590#endif
1591		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1592		if (unlikely(fnhe))
1593			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1594		else if (do_cache)
1595			cached = rt_cache_route(nhc, rt);
1596		if (unlikely(!cached)) {
1597			/* Routes we intend to cache in nexthop exception or
1598			 * FIB nexthop have the DST_NOCACHE bit clear.
1599			 * However, if we are unsuccessful at storing this
1600			 * route into the cache we really need to set it.
1601			 */
1602			if (!rt->rt_gw4) {
1603				rt->rt_gw_family = AF_INET;
1604				rt->rt_gw4 = daddr;
1605			}
1606			rt_add_uncached_list(rt);
1607		}
1608	} else
1609		rt_add_uncached_list(rt);
1610
1611#ifdef CONFIG_IP_ROUTE_CLASSID
1612#ifdef CONFIG_IP_MULTIPLE_TABLES
1613	set_class_tag(rt, res->tclassid);
1614#endif
1615	set_class_tag(rt, itag);
1616#endif
1617}
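/*
 * rt_set_nexthop() above ties a freshly allocated rtable to the FIB
 * result: it copies the gateway, metrics, classid and lwtunnel state
 * from the nexthop, then tries to cache the route either in a matching
 * exception (rt_bind_exception) or in the nexthop's input/per-cpu output
 * slot (rt_cache_route), falling back to the uncached list when neither
 * succeeds.
 */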
1618
1619struct rtable *rt_dst_alloc(struct net_device *dev,
1620			    unsigned int flags, u16 type,
1621			    bool nopolicy, bool noxfrm, bool will_cache)
1622{
1623	struct rtable *rt;
1624
1625	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1626		       (will_cache ? 0 : DST_HOST) |
1627		       (nopolicy ? DST_NOPOLICY : 0) |
1628		       (noxfrm ? DST_NOXFRM : 0));
1629
1630	if (rt) {
1631		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1632		rt->rt_flags = flags;
1633		rt->rt_type = type;
1634		rt->rt_is_input = 0;
1635		rt->rt_iif = 0;
1636		rt->rt_pmtu = 0;
1637		rt->rt_mtu_locked = 0;
1638		rt->rt_uses_gateway = 0;
1639		rt->rt_gw_family = 0;
1640		rt->rt_gw4 = 0;
1641		INIT_LIST_HEAD(&rt->rt_uncached);
1642
1643		rt->dst.output = ip_output;
1644		if (flags & RTCF_LOCAL)
1645			rt->dst.input = ip_local_deliver;
1646	}
1647
1648	return rt;
1649}
1650EXPORT_SYMBOL(rt_dst_alloc);
1651
1652struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1653{
1654	struct rtable *new_rt;
1655
1656	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1657			   rt->dst.flags);
1658
1659	if (new_rt) {
1660		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1661		new_rt->rt_flags = rt->rt_flags;
1662		new_rt->rt_type = rt->rt_type;
1663		new_rt->rt_is_input = rt->rt_is_input;
1664		new_rt->rt_iif = rt->rt_iif;
1665		new_rt->rt_pmtu = rt->rt_pmtu;
1666		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1667		new_rt->rt_gw_family = rt->rt_gw_family;
1668		if (rt->rt_gw_family == AF_INET)
1669			new_rt->rt_gw4 = rt->rt_gw4;
1670		else if (rt->rt_gw_family == AF_INET6)
1671			new_rt->rt_gw6 = rt->rt_gw6;
1672		INIT_LIST_HEAD(&new_rt->rt_uncached);
1673
1674		new_rt->dst.flags |= DST_HOST;
1675		new_rt->dst.input = rt->dst.input;
1676		new_rt->dst.output = rt->dst.output;
1677		new_rt->dst.error = rt->dst.error;
1678		new_rt->dst.lastuse = jiffies;
1679		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1680	}
1681	return new_rt;
1682}
1683EXPORT_SYMBOL(rt_dst_clone);
1684
1685/* called in rcu_read_lock() section */
1686int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1687			  u8 tos, struct net_device *dev,
1688			  struct in_device *in_dev, u32 *itag)
1689{
1690	int err;
1691
1692	/* Primary sanity checks. */
1693	if (!in_dev)
1694		return -EINVAL;
1695
1696	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1697	    skb->protocol != htons(ETH_P_IP))
1698		return -EINVAL;
1699
1700	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1701		return -EINVAL;
1702
1703	if (ipv4_is_zeronet(saddr)) {
1704		if (!ipv4_is_local_multicast(daddr) &&
1705		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1706			return -EINVAL;
1707	} else {
1708		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1709					  in_dev, itag);
1710		if (err < 0)
1711			return err;
1712	}
1713	return 0;
1714}
1715
1716/* called in rcu_read_lock() section */
1717static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1718			     u8 tos, struct net_device *dev, int our)
1719{
1720	struct in_device *in_dev = __in_dev_get_rcu(dev);
1721	unsigned int flags = RTCF_MULTICAST;
1722	struct rtable *rth;
1723	u32 itag = 0;
1724	int err;
1725
1726	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1727	if (err)
1728		return err;
1729
1730	if (our)
1731		flags |= RTCF_LOCAL;
1732
1733	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1734			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1735	if (!rth)
1736		return -ENOBUFS;
1737
1738#ifdef CONFIG_IP_ROUTE_CLASSID
1739	rth->dst.tclassid = itag;
1740#endif
1741	rth->dst.output = ip_rt_bug;
 1742	rth->rt_is_input = 1;
1743
1744#ifdef CONFIG_IP_MROUTE
1745	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1746		rth->dst.input = ip_mr_input;
1747#endif
1748	RT_CACHE_STAT_INC(in_slow_mc);
1749
1750	skb_dst_set(skb, &rth->dst);
1751	return 0;
1752}
1753
1754
1755static void ip_handle_martian_source(struct net_device *dev,
1756				     struct in_device *in_dev,
1757				     struct sk_buff *skb,
1758				     __be32 daddr,
1759				     __be32 saddr)
1760{
1761	RT_CACHE_STAT_INC(in_martian_src);
1762#ifdef CONFIG_IP_ROUTE_VERBOSE
1763	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1764		/*
 1765		 *	RFC1812 recommendation: if the source is martian,
 1766		 *	the only hint is the MAC header.
1767		 */
1768		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1769			&daddr, &saddr, dev->name);
1770		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1771			print_hex_dump(KERN_WARNING, "ll header: ",
1772				       DUMP_PREFIX_OFFSET, 16, 1,
1773				       skb_mac_header(skb),
1774				       dev->hard_header_len, false);
1775		}
1776	}
1777#endif
1778}
1779
1780/* called in rcu_read_lock() section */
1781static int __mkroute_input(struct sk_buff *skb,
1782			   const struct fib_result *res,
1783			   struct in_device *in_dev,
1784			   __be32 daddr, __be32 saddr, u32 tos)
1785{
1786	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1787	struct net_device *dev = nhc->nhc_dev;
1788	struct fib_nh_exception *fnhe;
1789	struct rtable *rth;
1790	int err;
1791	struct in_device *out_dev;
1792	bool do_cache;
1793	u32 itag = 0;
1794
1795	/* get a working reference to the output device */
1796	out_dev = __in_dev_get_rcu(dev);
1797	if (!out_dev) {
1798		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1799		return -EINVAL;
1800	}
1801
1802	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1803				  in_dev->dev, in_dev, &itag);
1804	if (err < 0) {
1805		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1806					 saddr);
1807
1808		goto cleanup;
1809	}
1810
1811	do_cache = res->fi && !itag;
1812	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1813	    skb->protocol == htons(ETH_P_IP)) {
1814		__be32 gw;
1815
1816		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1817		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1818		    inet_addr_onlink(out_dev, saddr, gw))
1819			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1820	}
1821
1822	if (skb->protocol != htons(ETH_P_IP)) {
 1823		/* Not IP (i.e. ARP). Do not create a route if it is
 1824		 * invalid for proxy arp. DNAT routes are always valid.
 1825		 *
 1826		 * The proxy arp feature has been extended to allow ARP
 1827		 * replies back to the same interface, to support
 1828		 * Private VLAN switch technologies. See arp.c.
1829		 */
1830		if (out_dev == in_dev &&
1831		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1832			err = -EINVAL;
1833			goto cleanup;
1834		}
1835	}
1836
1837	fnhe = find_exception(nhc, daddr);
1838	if (do_cache) {
1839		if (fnhe)
1840			rth = rcu_dereference(fnhe->fnhe_rth_input);
1841		else
1842			rth = rcu_dereference(nhc->nhc_rth_input);
1843		if (rt_cache_valid(rth)) {
1844			skb_dst_set_noref(skb, &rth->dst);
1845			goto out;
1846		}
1847	}
1848
1849	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1850			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1851			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1852	if (!rth) {
1853		err = -ENOBUFS;
1854		goto cleanup;
1855	}
1856
1857	rth->rt_is_input = 1;
1858	RT_CACHE_STAT_INC(in_slow_tot);
1859
1860	rth->dst.input = ip_forward;
1861
1862	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1863		       do_cache);
1864	lwtunnel_set_redirect(&rth->dst);
1865	skb_dst_set(skb, &rth->dst);
1866out:
1867	err = 0;
1868 cleanup:
1869	return err;
1870}
1871
1872#ifdef CONFIG_IP_ROUTE_MULTIPATH
1873/* To make ICMP packets follow the right flow, the multipath hash is
1874 * calculated from the inner IP addresses.
1875 */
1876static void ip_multipath_l3_keys(const struct sk_buff *skb,
1877				 struct flow_keys *hash_keys)
1878{
1879	const struct iphdr *outer_iph = ip_hdr(skb);
1880	const struct iphdr *key_iph = outer_iph;
1881	const struct iphdr *inner_iph;
1882	const struct icmphdr *icmph;
1883	struct iphdr _inner_iph;
1884	struct icmphdr _icmph;
1885
1886	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1887		goto out;
1888
1889	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1890		goto out;
1891
1892	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1893				   &_icmph);
1894	if (!icmph)
1895		goto out;
1896
1897	if (icmph->type != ICMP_DEST_UNREACH &&
1898	    icmph->type != ICMP_REDIRECT &&
1899	    icmph->type != ICMP_TIME_EXCEEDED &&
1900	    icmph->type != ICMP_PARAMETERPROB)
1901		goto out;
1902
1903	inner_iph = skb_header_pointer(skb,
1904				       outer_iph->ihl * 4 + sizeof(_icmph),
1905				       sizeof(_inner_iph), &_inner_iph);
1906	if (!inner_iph)
1907		goto out;
1908
1909	key_iph = inner_iph;
1910out:
1911	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1912	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1913}
1914
1915/* if skb is set it will be used and fl4 can be NULL */
1916int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1917		       const struct sk_buff *skb, struct flow_keys *flkeys)
1918{
1919	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1920	struct flow_keys hash_keys;
1921	u32 mhash;
1922
1923	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1924	case 0:
1925		memset(&hash_keys, 0, sizeof(hash_keys));
1926		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1927		if (skb) {
1928			ip_multipath_l3_keys(skb, &hash_keys);
1929		} else {
1930			hash_keys.addrs.v4addrs.src = fl4->saddr;
1931			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1932		}
1933		break;
1934	case 1:
1935		/* skb is currently provided only when forwarding */
1936		if (skb) {
1937			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1938			struct flow_keys keys;
1939
1940			/* short-circuit if we already have L4 hash present */
1941			if (skb->l4_hash)
1942				return skb_get_hash_raw(skb) >> 1;
1943
1944			memset(&hash_keys, 0, sizeof(hash_keys));
1945
1946			if (!flkeys) {
1947				skb_flow_dissect_flow_keys(skb, &keys, flag);
1948				flkeys = &keys;
1949			}
1950
1951			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1952			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1953			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1954			hash_keys.ports.src = flkeys->ports.src;
1955			hash_keys.ports.dst = flkeys->ports.dst;
1956			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1957		} else {
1958			memset(&hash_keys, 0, sizeof(hash_keys));
1959			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1960			hash_keys.addrs.v4addrs.src = fl4->saddr;
1961			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1962			hash_keys.ports.src = fl4->fl4_sport;
1963			hash_keys.ports.dst = fl4->fl4_dport;
1964			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1965		}
1966		break;
1967	case 2:
1968		memset(&hash_keys, 0, sizeof(hash_keys));
1969		/* skb is currently provided only when forwarding */
1970		if (skb) {
1971			struct flow_keys keys;
1972
1973			skb_flow_dissect_flow_keys(skb, &keys, 0);
1974			/* Inner can be v4 or v6 */
1975			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1978				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1979			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1980				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1981				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1982				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1983				hash_keys.tags.flow_label = keys.tags.flow_label;
1984				hash_keys.basic.ip_proto = keys.basic.ip_proto;
1985			} else {
1986				/* Same as case 0 */
1987				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1988				ip_multipath_l3_keys(skb, &hash_keys);
1989			}
1990		} else {
1991			/* Same as case 0 */
1992			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1993			hash_keys.addrs.v4addrs.src = fl4->saddr;
1994			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1995		}
1996		break;
1997	}
1998	mhash = flow_hash_from_keys(&hash_keys);
1999
2000	if (multipath_hash)
2001		mhash = jhash_2words(mhash, multipath_hash, 0);
2002
2003	return mhash >> 1;
2004}
2005#endif /* CONFIG_IP_ROUTE_MULTIPATH */
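To get a feel for what policy 0 above reduces to, here is a rough, self-contained user-space sketch (not the kernel's exact math): it mixes the source and destination addresses much as flow_hash_from_keys() would and maps the result onto one of N equal-cost paths; the modulo pick merely stands in for the nexthop selection done by fib_select_multipath().

#include <stdint.h>
#include <stdio.h>

/* Toy 32-bit mixer; a stand-in for flow_hash_from_keys()/jhash. */
static uint32_t mix32(uint32_t a, uint32_t b)
{
	uint32_t h = a * 0x9e3779b1u ^ b;

	h ^= h >> 16;
	h *= 0x85ebca6bu;
	h ^= h >> 13;
	return h;
}

/* Policy-0 analogue: hash only the L3 addresses, then pick a path. */
static int pick_path(uint32_t saddr, uint32_t daddr, int num_paths)
{
	return (int)(mix32(saddr, daddr) % (uint32_t)num_paths);
}

int main(void)
{
	uint32_t src = 0xc0000201u;	/* 192.0.2.1 */
	uint32_t dst = 0xc6336407u;	/* 198.51.100.7 */

	printf("chosen path index: %d\n", pick_path(src, dst, 4));
	return 0;
}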
2006
2007static int ip_mkroute_input(struct sk_buff *skb,
2008			    struct fib_result *res,
 
2009			    struct in_device *in_dev,
2010			    __be32 daddr, __be32 saddr, u32 tos,
2011			    struct flow_keys *hkeys)
2012{
2013#ifdef CONFIG_IP_ROUTE_MULTIPATH
2014	if (res->fi && fib_info_num_path(res->fi) > 1) {
2015		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2016
2017		fib_select_multipath(res, h);
2018	}
 
2019#endif
2020
2021	/* create a routing cache entry */
2022	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2023}
2024
2025/*
2026 *	NOTE. We drop all packets that have local source
2027 *	addresses, because every properly looped-back packet
2028 *	must already have the correct destination attached by the output routine.
2029 *
2030 *	This approach solves two big problems:
2031 *	1. Non-simplex devices are handled properly.
2032 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2033 *	Called with rcu_read_lock().
2034 */
2035
2036static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2037			       u8 tos, struct net_device *dev,
2038			       struct fib_result *res)
2039{
 
2040	struct in_device *in_dev = __in_dev_get_rcu(dev);
2041	struct flow_keys *flkeys = NULL, _flkeys;
2042	struct net    *net = dev_net(dev);
2043	struct ip_tunnel_info *tun_info;
2044	int		err = -EINVAL;
2045	unsigned int	flags = 0;
2046	u32		itag = 0;
2047	struct rtable	*rth;
2048	struct flowi4	fl4;
2049	bool do_cache = true;
2050
2051	/* IP on this device is disabled. */
2052
2053	if (!in_dev)
2054		goto out;
2055
2056	/* Check for the weirdest martians, which may not be detected
2057	   by fib_lookup.
2058	 */
2059
2060	tun_info = skb_tunnel_info(skb);
2061	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2062		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2063	else
2064		fl4.flowi4_tun_key.tun_id = 0;
2065	skb_dst_drop(skb);
2066
2067	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2068		goto martian_source;
2069
2070	res->fi = NULL;
2071	res->table = NULL;
2072	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2073		goto brd_input;
2074
2075	/* Accept zero addresses only for limited broadcast;
2076	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2077	 */
2078	if (ipv4_is_zeronet(saddr))
2079		goto martian_source;
2080
2081	if (ipv4_is_zeronet(daddr))
2082		goto martian_destination;
2083
2084	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2085	 * and calls it at most once when daddr and/or saddr is a loopback address.
2086	 */
2087	if (ipv4_is_loopback(daddr)) {
2088		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2089			goto martian_destination;
2090	} else if (ipv4_is_loopback(saddr)) {
2091		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2092			goto martian_source;
2093	}
2094
2095	/*
2096	 *	Now we are ready to route the packet.
2097	 */
2098	fl4.flowi4_oif = 0;
2099	fl4.flowi4_iif = dev->ifindex;
2100	fl4.flowi4_mark = skb->mark;
2101	fl4.flowi4_tos = tos;
2102	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2103	fl4.flowi4_flags = 0;
2104	fl4.daddr = daddr;
2105	fl4.saddr = saddr;
2106	fl4.flowi4_uid = sock_net_uid(net, NULL);
2107
2108	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2109		flkeys = &_flkeys;
2110	} else {
2111		fl4.flowi4_proto = 0;
2112		fl4.fl4_sport = 0;
2113		fl4.fl4_dport = 0;
2114	}
2115
2116	err = fib_lookup(net, &fl4, res, 0);
2117	if (err != 0) {
2118		if (!IN_DEV_FORWARD(in_dev))
2119			err = -EHOSTUNREACH;
2120		goto no_route;
2121	}
2122
2123	if (res->type == RTN_BROADCAST) {
2124		if (IN_DEV_BFORWARD(in_dev))
2125			goto make_route;
2126		/* do not cache if bc_forwarding is enabled */
2127		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2128			do_cache = false;
2129		goto brd_input;
2130	}
2131
2132	if (res->type == RTN_LOCAL) {
2133		err = fib_validate_source(skb, saddr, daddr, tos,
2134					  0, dev, in_dev, &itag);
 
2135		if (err < 0)
2136			goto martian_source;
2137		goto local_input;
2138	}
2139
2140	if (!IN_DEV_FORWARD(in_dev)) {
2141		err = -EHOSTUNREACH;
2142		goto no_route;
2143	}
2144	if (res->type != RTN_UNICAST)
2145		goto martian_destination;
2146
2147make_route:
2148	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2149out:	return err;
2150
2151brd_input:
2152	if (skb->protocol != htons(ETH_P_IP))
2153		goto e_inval;
2154
2155	if (!ipv4_is_zeronet(saddr)) {
2156		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2157					  in_dev, &itag);
 
 
2158		if (err < 0)
2159			goto martian_source;
 
 
2160	}
2161	flags |= RTCF_BROADCAST;
2162	res->type = RTN_BROADCAST;
2163	RT_CACHE_STAT_INC(in_brd);
2164
2165local_input:
2166	do_cache &= res->fi && !itag;
2167	if (do_cache) {
2168		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2169
2170		rth = rcu_dereference(nhc->nhc_rth_input);
2171		if (rt_cache_valid(rth)) {
2172			skb_dst_set_noref(skb, &rth->dst);
2173			err = 0;
2174			goto out;
2175		}
2176	}
2177
2178	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2179			   flags | RTCF_LOCAL, res->type,
2180			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2181	if (!rth)
2182		goto e_nobufs;
2183
 
2184	rth->dst.output= ip_rt_bug;
2185#ifdef CONFIG_IP_ROUTE_CLASSID
2186	rth->dst.tclassid = itag;
2187#endif
2188	rth->rt_is_input = 1;
2189
2190	RT_CACHE_STAT_INC(in_slow_tot);
2191	if (res->type == RTN_UNREACHABLE) {
2192		rth->dst.input= ip_error;
2193		rth->dst.error= -err;
2194		rth->rt_flags 	&= ~RTCF_LOCAL;
2195	}
2196
2197	if (do_cache) {
2198		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2199
2200		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2201		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2202			WARN_ON(rth->dst.input == lwtunnel_input);
2203			rth->dst.lwtstate->orig_input = rth->dst.input;
2204			rth->dst.input = lwtunnel_input;
2205		}
2206
2207		if (unlikely(!rt_cache_route(nhc, rth)))
2208			rt_add_uncached_list(rth);
2209	}
2210	skb_dst_set(skb, &rth->dst);
2211	err = 0;
 
 
2212	goto out;
2213
2214no_route:
2215	RT_CACHE_STAT_INC(in_no_route);
2216	res->type = RTN_UNREACHABLE;
2217	res->fi = NULL;
2218	res->table = NULL;
 
2219	goto local_input;
2220
2221	/*
2222	 *	Do not cache martian addresses: they should be logged (RFC1812)
2223	 */
2224martian_destination:
2225	RT_CACHE_STAT_INC(in_martian_dst);
2226#ifdef CONFIG_IP_ROUTE_VERBOSE
2227	if (IN_DEV_LOG_MARTIANS(in_dev))
2228		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2229				     &daddr, &saddr, dev->name);
2230#endif
2231
2232e_inval:
2233	err = -EINVAL;
2234	goto out;
2235
2236e_nobufs:
2237	err = -ENOBUFS;
2238	goto out;
2239
2240martian_source:
 
 
2241	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2242	goto out;
2243}
2244
2245int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2246			 u8 tos, struct net_device *dev)
2247{
2248	struct fib_result res;
2249	int err;
2250
2251	tos &= IPTOS_RT_MASK;
2252	rcu_read_lock();
2253	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2254	rcu_read_unlock();
2255
2256	return err;
2257}
2258EXPORT_SYMBOL(ip_route_input_noref);
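A minimal sketch of a caller of the function exported above, assuming kernel context; example_route_incoming() is a hypothetical helper, but the call matches the signature shown here.

#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/route.h>

/* Hypothetical caller: resolve the input route for an IPv4 skb.
 * On success, skb_dst(skb) points at the route chosen by the resolver.
 */
static int example_route_incoming(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	return ip_route_input_noref(skb, iph->daddr, iph->saddr,
				    iph->tos, dev);
}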
2259
2260/* called with rcu_read_lock held */
2261int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2262		       u8 tos, struct net_device *dev, struct fib_result *res)
2263{
2264	/* Multicast recognition logic was moved from the route cache to here.
2265	   The problem was that too many Ethernet cards have broken/missing
2266	   hardware multicast filters :-( As a result, a host on a multicast
2267	   network acquires a lot of useless route cache entries, rather like
2268	   SDR messages from all over the world. Now we try to get rid of them.
2269	   Really, provided the software IP multicast filter is organized
2270	   reasonably (at least, hashed), it does not result in a slowdown
2271	   compared with route cache reject entries.
2272	   Note that multicast routers are not affected, because a
2273	   route cache entry is created eventually.
2274	 */
2275	if (ipv4_is_multicast(daddr)) {
2276		struct in_device *in_dev = __in_dev_get_rcu(dev);
2277		int our = 0;
2278		int err = -EINVAL;
2279
2280		if (!in_dev)
2281			return err;
2282		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2283				      ip_hdr(skb)->protocol);
2284
2285		/* check l3 master if no match yet */
2286		if (!our && netif_is_l3_slave(dev)) {
2287			struct in_device *l3_in_dev;
2288
2289			l3_in_dev = __in_dev_get_rcu(skb->dev);
2290			if (l3_in_dev)
2291				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2292						      ip_hdr(skb)->protocol);
2293		}
2294
2295		if (our
2296#ifdef CONFIG_IP_MROUTE
2297			||
2298		    (!ipv4_is_local_multicast(daddr) &&
2299		     IN_DEV_MFORWARD(in_dev))
2300#endif
2301		   ) {
2302			err = ip_route_input_mc(skb, daddr, saddr,
2303						tos, dev, our);
2304		}
2305		return err;
 
2306	}
2307
2308	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
 
2309}
 
2310
2311/* called with rcu_read_lock() */
2312static struct rtable *__mkroute_output(const struct fib_result *res,
2313				       const struct flowi4 *fl4, int orig_oif,
2314				       struct net_device *dev_out,
 
2315				       unsigned int flags)
2316{
2317	struct fib_info *fi = res->fi;
2318	struct fib_nh_exception *fnhe;
2319	struct in_device *in_dev;
2320	u16 type = res->type;
2321	struct rtable *rth;
2322	bool do_cache;
2323
2324	in_dev = __in_dev_get_rcu(dev_out);
2325	if (!in_dev)
2326		return ERR_PTR(-EINVAL);
2327
2328	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2329		if (ipv4_is_loopback(fl4->saddr) &&
2330		    !(dev_out->flags & IFF_LOOPBACK) &&
2331		    !netif_is_l3_master(dev_out))
2332			return ERR_PTR(-EINVAL);
2333
2334	if (ipv4_is_lbcast(fl4->daddr))
2335		type = RTN_BROADCAST;
2336	else if (ipv4_is_multicast(fl4->daddr))
2337		type = RTN_MULTICAST;
2338	else if (ipv4_is_zeronet(fl4->daddr))
2339		return ERR_PTR(-EINVAL);
2340
2341	if (dev_out->flags & IFF_LOOPBACK)
2342		flags |= RTCF_LOCAL;
2343
2344	do_cache = true;
2345	if (type == RTN_BROADCAST) {
2346		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2347		fi = NULL;
2348	} else if (type == RTN_MULTICAST) {
2349		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2350		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2351				     fl4->flowi4_proto))
2352			flags &= ~RTCF_LOCAL;
2353		else
2354			do_cache = false;
2355		/* If a multicast route does not exist, use the
2356		 * default one, but do not gateway in this case.
2357		 * Yes, it is a hack.
2358		 */
2359		if (fi && res->prefixlen < 4)
2360			fi = NULL;
2361	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2362		   (orig_oif != dev_out->ifindex)) {
2363		/* For local routes that require a particular output interface
2364		 * we do not want to cache the result.  Caching the result
2365		 * causes incorrect behaviour when there are multiple source
2366		 * addresses on the interface, the end result being that if the
2367		 * intended recipient is waiting on that interface for the
2368		 * packet he won't receive it because it will be delivered on
2369		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2370		 * be set to the loopback interface as well.
2371		 */
2372		do_cache = false;
2373	}
2374
2375	fnhe = NULL;
2376	do_cache &= fi != NULL;
2377	if (fi) {
2378		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2379		struct rtable __rcu **prth;
2380
2381		fnhe = find_exception(nhc, fl4->daddr);
2382		if (!do_cache)
2383			goto add;
2384		if (fnhe) {
2385			prth = &fnhe->fnhe_rth_output;
2386		} else {
2387			if (unlikely(fl4->flowi4_flags &
2388				     FLOWI_FLAG_KNOWN_NH &&
2389				     !(nhc->nhc_gw_family &&
2390				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2391				do_cache = false;
2392				goto add;
2393			}
2394			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2395		}
2396		rth = rcu_dereference(*prth);
2397		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2398			return rth;
2399	}
2400
2401add:
2402	rth = rt_dst_alloc(dev_out, flags, type,
2403			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2404			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2405			   do_cache);
2406	if (!rth)
2407		return ERR_PTR(-ENOBUFS);
2408
2409	rth->rt_iif = orig_oif;
2410
2411	RT_CACHE_STAT_INC(out_slow_tot);
2412
2413	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
 
2414		if (flags & RTCF_LOCAL &&
2415		    !(dev_out->flags & IFF_LOOPBACK)) {
2416			rth->dst.output = ip_mc_output;
2417			RT_CACHE_STAT_INC(out_slow_mc);
2418		}
2419#ifdef CONFIG_IP_MROUTE
2420		if (type == RTN_MULTICAST) {
2421			if (IN_DEV_MFORWARD(in_dev) &&
2422			    !ipv4_is_local_multicast(fl4->daddr)) {
2423				rth->dst.input = ip_mr_input;
2424				rth->dst.output = ip_mc_output;
2425			}
2426		}
2427#endif
2428	}
2429
2430	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2431	lwtunnel_set_redirect(&rth->dst);
2432
2433	return rth;
2434}
2435
2436/*
2437 * Major route resolver routine.
 
2438 */
2439
2440struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2441					const struct sk_buff *skb)
2442{
2443	__u8 tos = RT_FL_TOS(fl4);
2444	struct fib_result res = {
2445		.type		= RTN_UNSPEC,
2446		.fi		= NULL,
2447		.table		= NULL,
2448		.tclassid	= 0,
2449	};
2450	struct rtable *rth;
2451
2452	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2453	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2454	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2455			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2456
2457	rcu_read_lock();
2458	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2459	rcu_read_unlock();
2460
2461	return rth;
2462}
2463EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2464
2465struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2466					    struct fib_result *res,
2467					    const struct sk_buff *skb)
2468{
2469	struct net_device *dev_out = NULL;
2470	int orig_oif = fl4->flowi4_oif;
2471	unsigned int flags = 0;
2472	struct rtable *rth;
2473	int err;
2474
2475	if (fl4->saddr) {
 
2476		if (ipv4_is_multicast(fl4->saddr) ||
2477		    ipv4_is_lbcast(fl4->saddr) ||
2478		    ipv4_is_zeronet(fl4->saddr)) {
2479			rth = ERR_PTR(-EINVAL);
2480			goto out;
2481		}
2482
2483		rth = ERR_PTR(-ENETUNREACH);
2484
2485		/* I removed check for oif == dev_out->oif here.
2486		   It was wrong for two reasons:
2487		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2488		      is assigned to multiple interfaces.
2489		   2. Moreover, we are allowed to send packets with saddr
2490		      of another iface. --ANK
2491		 */
2492
2493		if (fl4->flowi4_oif == 0 &&
2494		    (ipv4_is_multicast(fl4->daddr) ||
2495		     ipv4_is_lbcast(fl4->daddr))) {
2496			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2497			dev_out = __ip_dev_find(net, fl4->saddr, false);
2498			if (!dev_out)
2499				goto out;
2500
2501			/* Special hack: the user can direct multicasts
2502			   and limited broadcasts via the necessary interface
2503			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2504			   This hack is not just for fun, it allows
2505			   vic, vat and friends to work.
2506			   They bind a socket to loopback, set the ttl to zero
2507			   and expect that it will work.
2508			   From the viewpoint of the routing cache they are broken,
2509			   because we are not allowed to build a multicast path
2510			   with a loopback source addr (look, the routing cache
2511			   cannot know that the ttl is zero, so the packet
2512			   will not leave this host and the route is valid).
2513			   Luckily, this hack is a good workaround.
2514			 */
2515
2516			fl4->flowi4_oif = dev_out->ifindex;
2517			goto make_route;
2518		}
2519
2520		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2521			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2522			if (!__ip_dev_find(net, fl4->saddr, false))
2523				goto out;
2524		}
2525	}
2526
2527
2528	if (fl4->flowi4_oif) {
2529		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2530		rth = ERR_PTR(-ENODEV);
2531		if (!dev_out)
2532			goto out;
2533
2534		/* RACE: Check return value of inet_select_addr instead. */
2535		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2536			rth = ERR_PTR(-ENETUNREACH);
2537			goto out;
2538		}
2539		if (ipv4_is_local_multicast(fl4->daddr) ||
2540		    ipv4_is_lbcast(fl4->daddr) ||
2541		    fl4->flowi4_proto == IPPROTO_IGMP) {
2542			if (!fl4->saddr)
2543				fl4->saddr = inet_select_addr(dev_out, 0,
2544							      RT_SCOPE_LINK);
2545			goto make_route;
2546		}
2547		if (!fl4->saddr) {
2548			if (ipv4_is_multicast(fl4->daddr))
2549				fl4->saddr = inet_select_addr(dev_out, 0,
2550							      fl4->flowi4_scope);
2551			else if (!fl4->daddr)
2552				fl4->saddr = inet_select_addr(dev_out, 0,
2553							      RT_SCOPE_HOST);
2554		}
2555	}
2556
2557	if (!fl4->daddr) {
2558		fl4->daddr = fl4->saddr;
2559		if (!fl4->daddr)
2560			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2561		dev_out = net->loopback_dev;
2562		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2563		res->type = RTN_LOCAL;
2564		flags |= RTCF_LOCAL;
2565		goto make_route;
2566	}
2567
2568	err = fib_lookup(net, fl4, res, 0);
2569	if (err) {
2570		res->fi = NULL;
2571		res->table = NULL;
2572		if (fl4->flowi4_oif &&
2573		    (ipv4_is_multicast(fl4->daddr) ||
2574		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2575			/* Apparently, the routing tables are wrong. Assume
2576			   that the destination is on-link.
2577
2578			   WHY? DW.
2579			   Because we are allowed to send to an iface
2580			   even if it has NO routes and NO assigned
2581			   addresses. When oif is specified, the routing
2582			   tables are looked up with only one purpose:
2583			   to catch whether the destination is gatewayed rather
2584			   than direct. Moreover, if MSG_DONTROUTE is set,
2585			   we send the packet, ignoring both routing tables
2586			   and ifaddr state. --ANK
2587
2588
2589			   We could make it even if oif is unknown,
2590			   likely IPv6, but we do not.
2591			 */
2592
2593			if (fl4->saddr == 0)
2594				fl4->saddr = inet_select_addr(dev_out, 0,
2595							      RT_SCOPE_LINK);
2596			res->type = RTN_UNICAST;
2597			goto make_route;
2598		}
2599		rth = ERR_PTR(err);
2600		goto out;
2601	}
2602
2603	if (res->type == RTN_LOCAL) {
2604		if (!fl4->saddr) {
2605			if (res->fi->fib_prefsrc)
2606				fl4->saddr = res->fi->fib_prefsrc;
2607			else
2608				fl4->saddr = fl4->daddr;
2609		}
2610
2611		/* L3 master device is the loopback for that domain */
2612		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2613			net->loopback_dev;
2614
2615		/* make sure orig_oif points to fib result device even
2616		 * though packet rx/tx happens over loopback or l3mdev
2617		 */
2618		orig_oif = FIB_RES_OIF(*res);
2619
2620		fl4->flowi4_oif = dev_out->ifindex;
 
2621		flags |= RTCF_LOCAL;
2622		goto make_route;
2623	}
2624
2625	fib_select_path(net, res, fl4, skb);
2626
2627	dev_out = FIB_RES_DEV(*res);
2628	fl4->flowi4_oif = dev_out->ifindex;
2629
2630
2631make_route:
2632	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2633
2634out:
 
2635	return rth;
2636}
2637
2638static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2639{
2640	return NULL;
2641}
 
2642
2643static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2644{
2645	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2646
2647	return mtu ? : dst->dev->mtu;
2648}
2649
2650static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2651					  struct sk_buff *skb, u32 mtu)
2652{
 
2653}
2654
2655static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2656				       struct sk_buff *skb)
2657{
2658}
2659
2660static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2661					  unsigned long old)
2662{
2663	return NULL;
2664}
2665
2666static struct dst_ops ipv4_dst_blackhole_ops = {
2667	.family			=	AF_INET,
 
 
2668	.check			=	ipv4_blackhole_dst_check,
2669	.mtu			=	ipv4_blackhole_mtu,
2670	.default_advmss		=	ipv4_default_advmss,
2671	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2672	.redirect		=	ipv4_rt_blackhole_redirect,
2673	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2674	.neigh_lookup		=	ipv4_neigh_lookup,
2675};
2676
2677struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2678{
 
2679	struct rtable *ort = (struct rtable *) dst_orig;
2680	struct rtable *rt;
2681
2682	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2683	if (rt) {
2684		struct dst_entry *new = &rt->dst;
2685
2686		new->__use = 1;
2687		new->input = dst_discard;
2688		new->output = dst_discard_out;
 
2689
2690		new->dev = net->loopback_dev;
2691		if (new->dev)
2692			dev_hold(new->dev);
2693
2694		rt->rt_is_input = ort->rt_is_input;
2695		rt->rt_iif = ort->rt_iif;
2696		rt->rt_pmtu = ort->rt_pmtu;
2697		rt->rt_mtu_locked = ort->rt_mtu_locked;
2698
2699		rt->rt_genid = rt_genid_ipv4(net);
2700		rt->rt_flags = ort->rt_flags;
2701		rt->rt_type = ort->rt_type;
2702		rt->rt_uses_gateway = ort->rt_uses_gateway;
2703		rt->rt_gw_family = ort->rt_gw_family;
2704		if (rt->rt_gw_family == AF_INET)
2705			rt->rt_gw4 = ort->rt_gw4;
2706		else if (rt->rt_gw_family == AF_INET6)
2707			rt->rt_gw6 = ort->rt_gw6;
2708
2709		INIT_LIST_HEAD(&rt->rt_uncached);
2710	}
2711
2712	dst_release(dst_orig);
2713
2714	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2715}
2716
2717struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2718				    const struct sock *sk)
2719{
2720	struct rtable *rt = __ip_route_output_key(net, flp4);
2721
2722	if (IS_ERR(rt))
2723		return rt;
2724
2725	if (flp4->flowi4_proto)
2726		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2727							flowi4_to_flowi(flp4),
2728							sk, 0);
2729
2730	return rt;
2731}
2732EXPORT_SYMBOL_GPL(ip_route_output_flow);
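For the output side, a comparable hedged kernel-context sketch rather than a drop-in implementation: fill a struct flowi4 with the fields the resolver keys on, call ip_route_output_flow(), and drop the reference with ip_rt_put() when finished. example_route_outgoing() is hypothetical and illustrative only; real callers also set flowi4_mark, flowi4_oif, or the ports, exactly as the resolver above reads them.

#include <linux/err.h>
#include <linux/in.h>
#include <net/flow.h>
#include <net/route.h>

/* Hypothetical caller: look up an output route for a UDP flow. */
static int example_route_outgoing(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {};
	struct rtable *rt;

	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* rt->dst.dev is the egress device picked by the resolver. */
	ip_rt_put(rt);
	return 0;
}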
2733
2734/* called with rcu_read_lock held */
2735static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2736			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2737			struct sk_buff *skb, u32 portid, u32 seq,
2738			unsigned int flags)
2739{
 
2740	struct rtmsg *r;
2741	struct nlmsghdr *nlh;
2742	unsigned long expires = 0;
2743	u32 error;
2744	u32 metrics[RTAX_MAX];
2745
2746	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2747	if (!nlh)
2748		return -EMSGSIZE;
2749
2750	r = nlmsg_data(nlh);
2751	r->rtm_family	 = AF_INET;
2752	r->rtm_dst_len	= 32;
2753	r->rtm_src_len	= 0;
2754	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2755	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2756	if (nla_put_u32(skb, RTA_TABLE, table_id))
2757		goto nla_put_failure;
2758	r->rtm_type	= rt->rt_type;
2759	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2760	r->rtm_protocol = RTPROT_UNSPEC;
2761	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2762	if (rt->rt_flags & RTCF_NOTIFY)
2763		r->rtm_flags |= RTM_F_NOTIFY;
2764	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2765		r->rtm_flags |= RTCF_DOREDIRECT;
2766
2767	if (nla_put_in_addr(skb, RTA_DST, dst))
2768		goto nla_put_failure;
2769	if (src) {
2770		r->rtm_src_len = 32;
2771		if (nla_put_in_addr(skb, RTA_SRC, src))
2772			goto nla_put_failure;
2773	}
2774	if (rt->dst.dev &&
2775	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2776		goto nla_put_failure;
2777#ifdef CONFIG_IP_ROUTE_CLASSID
2778	if (rt->dst.tclassid &&
2779	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2780		goto nla_put_failure;
2781#endif
2782	if (fl4 && !rt_is_input_route(rt) &&
2783	    fl4->saddr != src) {
2784		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2785			goto nla_put_failure;
2786	}
2787	if (rt->rt_uses_gateway) {
2788		if (rt->rt_gw_family == AF_INET &&
2789		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2790			goto nla_put_failure;
2791		} else if (rt->rt_gw_family == AF_INET6) {
2792			int alen = sizeof(struct in6_addr);
2793			struct nlattr *nla;
2794			struct rtvia *via;
2795
2796			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2797			if (!nla)
2798				goto nla_put_failure;
2799
2800			via = nla_data(nla);
2801			via->rtvia_family = AF_INET6;
2802			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2803		}
2804	}
2805
2806	expires = rt->dst.expires;
2807	if (expires) {
2808		unsigned long now = jiffies;
2809
2810		if (time_before(now, expires))
2811			expires -= now;
2812		else
2813			expires = 0;
2814	}
2815
2816	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2817	if (rt->rt_pmtu && expires)
2818		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2819	if (rt->rt_mtu_locked && expires)
2820		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2821	if (rtnetlink_put_metrics(skb, metrics) < 0)
2822		goto nla_put_failure;
2823
2824	if (fl4) {
2825		if (fl4->flowi4_mark &&
2826		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2827			goto nla_put_failure;
2828
2829		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2830		    nla_put_u32(skb, RTA_UID,
2831				from_kuid_munged(current_user_ns(),
2832						 fl4->flowi4_uid)))
2833			goto nla_put_failure;
2834
2835		if (rt_is_input_route(rt)) {
2836#ifdef CONFIG_IP_MROUTE
2837			if (ipv4_is_multicast(dst) &&
2838			    !ipv4_is_local_multicast(dst) &&
2839			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2840				int err = ipmr_get_route(net, skb,
2841							 fl4->saddr, fl4->daddr,
2842							 r, portid);
2843
2844				if (err <= 0) {
2845					if (err == 0)
2846						return 0;
2847					goto nla_put_failure;
2848				}
2849			} else
 
2850#endif
2851				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2852					goto nla_put_failure;
2853		}
2854	}
2855
2856	error = rt->dst.error;
2857
2858	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2859		goto nla_put_failure;
2860
2861	nlmsg_end(skb, nlh);
2862	return 0;
2863
2864nla_put_failure:
2865	nlmsg_cancel(skb, nlh);
2866	return -EMSGSIZE;
2867}
2868
2869static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2870			    struct netlink_callback *cb, u32 table_id,
2871			    struct fnhe_hash_bucket *bucket, int genid,
2872			    int *fa_index, int fa_start, unsigned int flags)
2873{
2874	int i;
2875
2876	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2877		struct fib_nh_exception *fnhe;
2878
2879		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2880		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2881			struct rtable *rt;
2882			int err;
2883
2884			if (*fa_index < fa_start)
2885				goto next;
2886
2887			if (fnhe->fnhe_genid != genid)
2888				goto next;
2889
2890			if (fnhe->fnhe_expires &&
2891			    time_after(jiffies, fnhe->fnhe_expires))
2892				goto next;
2893
2894			rt = rcu_dereference(fnhe->fnhe_rth_input);
2895			if (!rt)
2896				rt = rcu_dereference(fnhe->fnhe_rth_output);
2897			if (!rt)
2898				goto next;
2899
2900			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2901					   table_id, NULL, skb,
2902					   NETLINK_CB(cb->skb).portid,
2903					   cb->nlh->nlmsg_seq, flags);
2904			if (err)
2905				return err;
2906next:
2907			(*fa_index)++;
2908		}
2909	}
2910
2911	return 0;
2912}
2913
2914int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2915		       u32 table_id, struct fib_info *fi,
2916		       int *fa_index, int fa_start, unsigned int flags)
2917{
2918	struct net *net = sock_net(cb->skb->sk);
2919	int nhsel, genid = fnhe_genid(net);
2920
2921	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2922		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2923		struct fnhe_hash_bucket *bucket;
2924		int err;
2925
2926		if (nhc->nhc_flags & RTNH_F_DEAD)
2927			continue;
2928
2929		rcu_read_lock();
2930		bucket = rcu_dereference(nhc->nhc_exceptions);
2931		err = 0;
2932		if (bucket)
2933			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2934					       genid, fa_index, fa_start,
2935					       flags);
2936		rcu_read_unlock();
2937		if (err)
2938			return err;
2939	}
2940
2941	return 0;
2942}
2943
2944static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2945						   u8 ip_proto, __be16 sport,
2946						   __be16 dport)
2947{
2948	struct sk_buff *skb;
2949	struct iphdr *iph;
2950
2951	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2952	if (!skb)
2953		return NULL;
2954
2955	/* Reserve room for dummy headers; this skb can pass
2956	 * through a good chunk of the routing engine.
2957	 */
2958	skb_reset_mac_header(skb);
2959	skb_reset_network_header(skb);
2960	skb->protocol = htons(ETH_P_IP);
2961	iph = skb_put(skb, sizeof(struct iphdr));
2962	iph->protocol = ip_proto;
2963	iph->saddr = src;
2964	iph->daddr = dst;
2965	iph->version = 0x4;
2966	iph->frag_off = 0;
2967	iph->ihl = 0x5;
2968	skb_set_transport_header(skb, skb->len);
2969
2970	switch (iph->protocol) {
2971	case IPPROTO_UDP: {
2972		struct udphdr *udph;
2973
2974		udph = skb_put_zero(skb, sizeof(struct udphdr));
2975		udph->source = sport;
2976		udph->dest = dport;
2977		udph->len = sizeof(struct udphdr);
2978		udph->check = 0;
2979		break;
2980	}
2981	case IPPROTO_TCP: {
2982		struct tcphdr *tcph;
2983
2984		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2985		tcph->source	= sport;
2986		tcph->dest	= dport;
2987		tcph->doff	= sizeof(struct tcphdr) / 4;
2988		tcph->rst = 1;
2989		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2990					    src, dst, 0);
2991		break;
2992	}
2993	case IPPROTO_ICMP: {
2994		struct icmphdr *icmph;
2995
2996		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2997		icmph->type = ICMP_ECHO;
2998		icmph->code = 0;
2999	}
3000	}
3001
3002	return skb;
3003}
3004
3005static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3006				       const struct nlmsghdr *nlh,
3007				       struct nlattr **tb,
3008				       struct netlink_ext_ack *extack)
3009{
3010	struct rtmsg *rtm;
3011	int i, err;
3012
3013	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3014		NL_SET_ERR_MSG(extack,
3015			       "ipv4: Invalid header for route get request");
3016		return -EINVAL;
3017	}
3018
3019	if (!netlink_strict_get_check(skb))
3020		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3021					      rtm_ipv4_policy, extack);
3022
3023	rtm = nlmsg_data(nlh);
3024	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3025	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3026	    rtm->rtm_table || rtm->rtm_protocol ||
3027	    rtm->rtm_scope || rtm->rtm_type) {
3028		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3029		return -EINVAL;
3030	}
3031
3032	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3033			       RTM_F_LOOKUP_TABLE |
3034			       RTM_F_FIB_MATCH)) {
3035		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3036		return -EINVAL;
3037	}
3038
3039	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3040					    rtm_ipv4_policy, extack);
3041	if (err)
3042		return err;
3043
3044	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3045	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3046		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3047		return -EINVAL;
3048	}
3049
3050	for (i = 0; i <= RTA_MAX; i++) {
3051		if (!tb[i])
3052			continue;
3053
3054		switch (i) {
3055		case RTA_IIF:
3056		case RTA_OIF:
3057		case RTA_SRC:
3058		case RTA_DST:
3059		case RTA_IP_PROTO:
3060		case RTA_SPORT:
3061		case RTA_DPORT:
3062		case RTA_MARK:
3063		case RTA_UID:
3064			break;
3065		default:
3066			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3067			return -EINVAL;
3068		}
3069	}
3070
3071	return 0;
3072}
3073
3074static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3075			     struct netlink_ext_ack *extack)
3076{
3077	struct net *net = sock_net(in_skb->sk);
 
3078	struct nlattr *tb[RTA_MAX+1];
3079	u32 table_id = RT_TABLE_MAIN;
3080	__be16 sport = 0, dport = 0;
3081	struct fib_result res = {};
3082	u8 ip_proto = IPPROTO_UDP;
3083	struct rtable *rt = NULL;
3084	struct sk_buff *skb;
3085	struct rtmsg *rtm;
3086	struct flowi4 fl4 = {};
3087	__be32 dst = 0;
3088	__be32 src = 0;
3089	kuid_t uid;
3090	u32 iif;
3091	int err;
3092	int mark;
 
3093
3094	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3095	if (err < 0)
3096		return err;
3097
3098	rtm = nlmsg_data(nlh);
3099	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3100	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3101	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3102	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3103	if (tb[RTA_UID])
3104		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3105	else
3106		uid = (iif ? INVALID_UID : current_uid());
3107
3108	if (tb[RTA_IP_PROTO]) {
3109		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3110						  &ip_proto, AF_INET, extack);
3111		if (err)
3112			return err;
3113	}
3114
3115	if (tb[RTA_SPORT])
3116		sport = nla_get_be16(tb[RTA_SPORT]);
3117
3118	if (tb[RTA_DPORT])
3119		dport = nla_get_be16(tb[RTA_DPORT]);
3120
3121	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3122	if (!skb)
3123		return -ENOBUFS;
3124
3125	fl4.daddr = dst;
3126	fl4.saddr = src;
3127	fl4.flowi4_tos = rtm->rtm_tos;
3128	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3129	fl4.flowi4_mark = mark;
3130	fl4.flowi4_uid = uid;
3131	if (sport)
3132		fl4.fl4_sport = sport;
3133	if (dport)
3134		fl4.fl4_dport = dport;
3135	fl4.flowi4_proto = ip_proto;
3136
3137	rcu_read_lock();
3138
3139	if (iif) {
3140		struct net_device *dev;
3141
3142		dev = dev_get_by_index_rcu(net, iif);
3143		if (!dev) {
3144			err = -ENODEV;
3145			goto errout_rcu;
3146		}
3147
3148		fl4.flowi4_iif = iif; /* for rt_fill_info */
3149		skb->dev	= dev;
3150		skb->mark	= mark;
3151		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3152					 dev, &res);
 
3153
3154		rt = skb_rtable(skb);
3155		if (err == 0 && rt->dst.error)
3156			err = -rt->dst.error;
3157	} else {
3158		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3159		skb->dev = net->loopback_dev;
3160		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3161		err = 0;
3162		if (IS_ERR(rt))
3163			err = PTR_ERR(rt);
3164		else
3165			skb_dst_set(skb, &rt->dst);
3166	}
3167
3168	if (err)
3169		goto errout_rcu;
3170
 
3171	if (rtm->rtm_flags & RTM_F_NOTIFY)
3172		rt->rt_flags |= RTCF_NOTIFY;
3173
3174	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3175		table_id = res.table ? res.table->tb_id : 0;
 
 
3176
3177	/* reset skb for netlink reply msg */
3178	skb_trim(skb, 0);
3179	skb_reset_network_header(skb);
3180	skb_reset_transport_header(skb);
3181	skb_reset_mac_header(skb);
3182
3183	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3184		if (!res.fi) {
3185			err = fib_props[res.type].error;
3186			if (!err)
3187				err = -EHOSTUNREACH;
3188			goto errout_rcu;
3189		}
3190		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3191				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3192				    rt->rt_type, res.prefix, res.prefixlen,
3193				    fl4.flowi4_tos, res.fi, 0);
3194	} else {
3195		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3196				   NETLINK_CB(in_skb).portid,
3197				   nlh->nlmsg_seq, 0);
3198	}
3199	if (err < 0)
3200		goto errout_rcu;
3201
3202	rcu_read_unlock();
3203
3204	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3205
3206errout_free:
3207	return err;
3208errout_rcu:
3209	rcu_read_unlock();
3210	kfree_skb(skb);
3211	goto errout_free;
3212}
3213
3214void ip_rt_multicast_event(struct in_device *in_dev)
3215{
3216	rt_cache_flush(dev_net(in_dev->dev));
3217}
3218
3219#ifdef CONFIG_SYSCTL
3220static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3221static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3222static int ip_rt_gc_elasticity __read_mostly	= 8;
3223static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3224
3225static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3226					void __user *buffer,
3227					size_t *lenp, loff_t *ppos)
3228{
3229	struct net *net = (struct net *)__ctl->extra1;
3230
3231	if (write) {
3232		rt_cache_flush(net);
3233		fnhe_genid_bump(net);
3234		return 0;
3235	}
3236
3237	return -EINVAL;
3238}
3239
3240static struct ctl_table ipv4_route_table[] = {
3241	{
3242		.procname	= "gc_thresh",
3243		.data		= &ipv4_dst_ops.gc_thresh,
3244		.maxlen		= sizeof(int),
3245		.mode		= 0644,
3246		.proc_handler	= proc_dointvec,
3247	},
3248	{
3249		.procname	= "max_size",
3250		.data		= &ip_rt_max_size,
3251		.maxlen		= sizeof(int),
3252		.mode		= 0644,
3253		.proc_handler	= proc_dointvec,
3254	},
3255	{
3256		/*  Deprecated. Use gc_min_interval_ms */
3257
3258		.procname	= "gc_min_interval",
3259		.data		= &ip_rt_gc_min_interval,
3260		.maxlen		= sizeof(int),
3261		.mode		= 0644,
3262		.proc_handler	= proc_dointvec_jiffies,
3263	},
3264	{
3265		.procname	= "gc_min_interval_ms",
3266		.data		= &ip_rt_gc_min_interval,
3267		.maxlen		= sizeof(int),
3268		.mode		= 0644,
3269		.proc_handler	= proc_dointvec_ms_jiffies,
3270	},
3271	{
3272		.procname	= "gc_timeout",
3273		.data		= &ip_rt_gc_timeout,
3274		.maxlen		= sizeof(int),
3275		.mode		= 0644,
3276		.proc_handler	= proc_dointvec_jiffies,
3277	},
3278	{
3279		.procname	= "gc_interval",
3280		.data		= &ip_rt_gc_interval,
3281		.maxlen		= sizeof(int),
3282		.mode		= 0644,
3283		.proc_handler	= proc_dointvec_jiffies,
3284	},
3285	{
3286		.procname	= "redirect_load",
3287		.data		= &ip_rt_redirect_load,
3288		.maxlen		= sizeof(int),
3289		.mode		= 0644,
3290		.proc_handler	= proc_dointvec,
3291	},
3292	{
3293		.procname	= "redirect_number",
3294		.data		= &ip_rt_redirect_number,
3295		.maxlen		= sizeof(int),
3296		.mode		= 0644,
3297		.proc_handler	= proc_dointvec,
3298	},
3299	{
3300		.procname	= "redirect_silence",
3301		.data		= &ip_rt_redirect_silence,
3302		.maxlen		= sizeof(int),
3303		.mode		= 0644,
3304		.proc_handler	= proc_dointvec,
3305	},
3306	{
3307		.procname	= "error_cost",
3308		.data		= &ip_rt_error_cost,
3309		.maxlen		= sizeof(int),
3310		.mode		= 0644,
3311		.proc_handler	= proc_dointvec,
3312	},
3313	{
3314		.procname	= "error_burst",
3315		.data		= &ip_rt_error_burst,
3316		.maxlen		= sizeof(int),
3317		.mode		= 0644,
3318		.proc_handler	= proc_dointvec,
3319	},
3320	{
3321		.procname	= "gc_elasticity",
3322		.data		= &ip_rt_gc_elasticity,
3323		.maxlen		= sizeof(int),
3324		.mode		= 0644,
3325		.proc_handler	= proc_dointvec,
3326	},
3327	{
3328		.procname	= "mtu_expires",
3329		.data		= &ip_rt_mtu_expires,
3330		.maxlen		= sizeof(int),
3331		.mode		= 0644,
3332		.proc_handler	= proc_dointvec_jiffies,
3333	},
3334	{
3335		.procname	= "min_pmtu",
3336		.data		= &ip_rt_min_pmtu,
3337		.maxlen		= sizeof(int),
3338		.mode		= 0644,
3339		.proc_handler	= proc_dointvec_minmax,
3340		.extra1		= &ip_min_valid_pmtu,
3341	},
3342	{
3343		.procname	= "min_adv_mss",
3344		.data		= &ip_rt_min_advmss,
3345		.maxlen		= sizeof(int),
3346		.mode		= 0644,
3347		.proc_handler	= proc_dointvec,
3348	},
3349	{ }
3350};
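These entries surface under /proc/sys/net/ipv4/route/ once registered (see ip_static_sysctl_init() at the end of this file). As a quick self-contained check from user space, a sketch along these lines reads back the min_pmtu knob; the file name follows directly from the .procname fields above.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/route/min_pmtu", "r");
	int min_pmtu;

	if (!f) {
		perror("open min_pmtu");
		return 1;
	}
	if (fscanf(f, "%d", &min_pmtu) == 1)
		printf("ip_rt_min_pmtu = %d\n", min_pmtu);
	fclose(f);
	return 0;
}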
3351
3352static const char ipv4_route_flush_procname[] = "flush";
3353
3354static struct ctl_table ipv4_route_flush_table[] = {
3355	{
3356		.procname	= ipv4_route_flush_procname,
3357		.maxlen		= sizeof(int),
3358		.mode		= 0200,
3359		.proc_handler	= ipv4_sysctl_rtcache_flush,
3360	},
3361	{ },
3362};
3363
3364static __net_init int sysctl_route_net_init(struct net *net)
3365{
3366	struct ctl_table *tbl;
3367
3368	tbl = ipv4_route_flush_table;
3369	if (!net_eq(net, &init_net)) {
3370		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3371		if (!tbl)
3372			goto err_dup;
3373
3374		/* Don't export non-whitelisted sysctls to unprivileged users */
3375		if (net->user_ns != &init_user_ns) {
3376			if (tbl[0].procname != ipv4_route_flush_procname)
3377				tbl[0].procname = NULL;
3378		}
3379	}
3380	tbl[0].extra1 = net;
3381
3382	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3383	if (!net->ipv4.route_hdr)
 
3384		goto err_reg;
3385	return 0;
3386
3387err_reg:
3388	if (tbl != ipv4_route_flush_table)
3389		kfree(tbl);
3390err_dup:
3391	return -ENOMEM;
3392}
3393
3394static __net_exit void sysctl_route_net_exit(struct net *net)
3395{
3396	struct ctl_table *tbl;
3397
3398	tbl = net->ipv4.route_hdr->ctl_table_arg;
3399	unregister_net_sysctl_table(net->ipv4.route_hdr);
3400	BUG_ON(tbl == ipv4_route_flush_table);
3401	kfree(tbl);
3402}
3403
3404static __net_initdata struct pernet_operations sysctl_route_ops = {
3405	.init = sysctl_route_net_init,
3406	.exit = sysctl_route_net_exit,
3407};
3408#endif
3409
3410static __net_init int rt_genid_init(struct net *net)
3411{
3412	atomic_set(&net->ipv4.rt_genid, 0);
3413	atomic_set(&net->fnhe_genid, 0);
3414	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
 
3415	return 0;
3416}
3417
3418static __net_initdata struct pernet_operations rt_genid_ops = {
3419	.init = rt_genid_init,
3420};
3421
3422static int __net_init ipv4_inetpeer_init(struct net *net)
3423{
3424	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3425
3426	if (!bp)
3427		return -ENOMEM;
3428	inet_peer_base_init(bp);
3429	net->ipv4.peers = bp;
3430	return 0;
3431}
3432
3433static void __net_exit ipv4_inetpeer_exit(struct net *net)
3434{
3435	struct inet_peer_base *bp = net->ipv4.peers;
3436
3437	net->ipv4.peers = NULL;
3438	inetpeer_invalidate_tree(bp);
3439	kfree(bp);
3440}
3441
3442static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3443	.init	=	ipv4_inetpeer_init,
3444	.exit	=	ipv4_inetpeer_exit,
3445};
3446
3447#ifdef CONFIG_IP_ROUTE_CLASSID
3448struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3449#endif /* CONFIG_IP_ROUTE_CLASSID */
3450
3451int __init ip_rt_init(void)
 
3452{
3453	int cpu;
3454
3455	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3456				  GFP_KERNEL);
3457	if (!ip_idents)
3458		panic("IP: failed to allocate ip_idents\n");
3459
3460	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3461
3462	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3463	if (!ip_tstamps)
3464		panic("IP: failed to allocate ip_tstamps\n");
3465
3466	for_each_possible_cpu(cpu) {
3467		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
 
3468
3469		INIT_LIST_HEAD(&ul->head);
3470		spin_lock_init(&ul->lock);
3471	}
3472#ifdef CONFIG_IP_ROUTE_CLASSID
3473	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3474	if (!ip_rt_acct)
3475		panic("IP: failed to allocate ip_rt_acct\n");
3476#endif
3477
3478	ipv4_dst_ops.kmem_cachep =
3479		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3480				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3481
3482	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3483
3484	if (dst_entries_init(&ipv4_dst_ops) < 0)
3485		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3486
3487	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3488		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3489
3490	ipv4_dst_ops.gc_thresh = ~0;
3491	ip_rt_max_size = INT_MAX;
3492
3493	devinet_init();
3494	ip_fib_init();
3495
3496	if (ip_rt_proc_init())
3497		pr_err("Unable to create route proc files\n");
3498#ifdef CONFIG_XFRM
3499	xfrm_init();
3500	xfrm4_init();
3501#endif
3502	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3503		      RTNL_FLAG_DOIT_UNLOCKED);
3504
3505#ifdef CONFIG_SYSCTL
3506	register_pernet_subsys(&sysctl_route_ops);
3507#endif
3508	register_pernet_subsys(&rt_genid_ops);
3509	register_pernet_subsys(&ipv4_inetpeer_ops);
3510	return 0;
3511}
3512
3513#ifdef CONFIG_SYSCTL
3514/*
3515 * We really need to sanitize the damn ipv4 init order, then all
3516 * this nonsense will go away.
3517 */
3518void __init ip_static_sysctl_init(void)
3519{
3520	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3521}
3522#endif
  64
 
 
  65#include <linux/module.h>
  66#include <asm/uaccess.h>
  67#include <asm/system.h>
  68#include <linux/bitops.h>
  69#include <linux/types.h>
  70#include <linux/kernel.h>
  71#include <linux/mm.h>
  72#include <linux/bootmem.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/workqueue.h>
  83#include <linux/skbuff.h>
  84#include <linux/inetdevice.h>
  85#include <linux/igmp.h>
  86#include <linux/pkt_sched.h>
  87#include <linux/mroute.h>
  88#include <linux/netfilter_ipv4.h>
  89#include <linux/random.h>
  90#include <linux/jhash.h>
  91#include <linux/rcupdate.h>
  92#include <linux/times.h>
  93#include <linux/slab.h>
 
  94#include <net/dst.h>
 
  95#include <net/net_namespace.h>
  96#include <net/protocol.h>
  97#include <net/ip.h>
  98#include <net/route.h>
  99#include <net/inetpeer.h>
 100#include <net/sock.h>
 101#include <net/ip_fib.h>
 
 102#include <net/arp.h>
 103#include <net/tcp.h>
 104#include <net/icmp.h>
 105#include <net/xfrm.h>
 
 106#include <net/netevent.h>
 107#include <net/rtnetlink.h>
 108#ifdef CONFIG_SYSCTL
 109#include <linux/sysctl.h>
 110#endif
 111#include <net/atmclip.h>
 112#include <net/secure_seq.h>
 
 
 
 
 113
 114#define RT_FL_TOS(oldflp4) \
 115    ((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 116
 117#define IP_MAX_MTU	0xFFF0
 118
 119#define RT_GC_TIMEOUT (300*HZ)
 120
 121static int ip_rt_max_size;
 122static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 123static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
 124static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
 125static int ip_rt_redirect_number __read_mostly	= 9;
 126static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 127static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 128static int ip_rt_error_cost __read_mostly	= HZ;
 129static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 130static int ip_rt_gc_elasticity __read_mostly	= 8;
 131static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 132static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 133static int ip_rt_min_advmss __read_mostly	= 256;
 134static int rt_chain_length_max __read_mostly	= 20;
 
 135
 136/*
 137 *	Interface to generic destination cache.
 138 */
 139
 140static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 141static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 142static unsigned int	 ipv4_default_mtu(const struct dst_entry *dst);
 143static void		 ipv4_dst_destroy(struct dst_entry *dst);
 144static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145static void		 ipv4_link_failure(struct sk_buff *skb);
 146static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 147static int rt_garbage_collect(struct dst_ops *ops);
 148
 149static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 150			    int how)
 151{
 152}
 153
 154static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 155{
 156	struct rtable *rt = (struct rtable *) dst;
 157	struct inet_peer *peer;
 158	u32 *p = NULL;
 159
 160	if (!rt->peer)
 161		rt_bind_peer(rt, rt->rt_dst, 1);
 162
 163	peer = rt->peer;
 164	if (peer) {
 165		u32 *old_p = __DST_METRICS_PTR(old);
 166		unsigned long prev, new;
 167
 168		p = peer->metrics;
 169		if (inet_metrics_new(peer))
 170			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 171
 172		new = (unsigned long) p;
 173		prev = cmpxchg(&dst->_metrics, old, new);
 174
 175		if (prev != old) {
 176			p = __DST_METRICS_PTR(prev);
 177			if (prev & DST_METRICS_READ_ONLY)
 178				p = NULL;
 179		} else {
 180			if (rt->fi) {
 181				fib_info_put(rt->fi);
 182				rt->fi = NULL;
 183			}
 184		}
 185	}
 186	return p;
 187}
 188
 189static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 
 
 
 190
 191static struct dst_ops ipv4_dst_ops = {
 192	.family =		AF_INET,
 193	.protocol =		cpu_to_be16(ETH_P_IP),
 194	.gc =			rt_garbage_collect,
 195	.check =		ipv4_dst_check,
 196	.default_advmss =	ipv4_default_advmss,
 197	.default_mtu =		ipv4_default_mtu,
 198	.cow_metrics =		ipv4_cow_metrics,
 199	.destroy =		ipv4_dst_destroy,
 200	.ifdown =		ipv4_dst_ifdown,
 201	.negative_advice =	ipv4_negative_advice,
 202	.link_failure =		ipv4_link_failure,
 203	.update_pmtu =		ip_rt_update_pmtu,
 
 204	.local_out =		__ip_local_out,
 205	.neigh_lookup =		ipv4_neigh_lookup,
 
 206};
 207
 208#define ECN_OR_COST(class)	TC_PRIO_##class
 209
 210const __u8 ip_tos2prio[16] = {
 211	TC_PRIO_BESTEFFORT,
 212	ECN_OR_COST(BESTEFFORT),
 213	TC_PRIO_BESTEFFORT,
 214	ECN_OR_COST(BESTEFFORT),
 215	TC_PRIO_BULK,
 216	ECN_OR_COST(BULK),
 217	TC_PRIO_BULK,
 218	ECN_OR_COST(BULK),
 219	TC_PRIO_INTERACTIVE,
 220	ECN_OR_COST(INTERACTIVE),
 221	TC_PRIO_INTERACTIVE,
 222	ECN_OR_COST(INTERACTIVE),
 223	TC_PRIO_INTERACTIVE_BULK,
 224	ECN_OR_COST(INTERACTIVE_BULK),
 225	TC_PRIO_INTERACTIVE_BULK,
 226	ECN_OR_COST(INTERACTIVE_BULK)
 227};
 228
 229
 230/*
 231 * Route cache.
 232 */
 233
 234/* The locking scheme is rather straight forward:
 235 *
 236 * 1) Read-Copy Update protects the buckets of the central route hash.
 237 * 2) Only writers remove entries, and they hold the lock
 238 *    as they look at rtable reference counts.
 239 * 3) Only readers acquire references to rtable entries,
 240 *    they do so with atomic increments and with the
 241 *    lock held.
 242 */
 243
 244struct rt_hash_bucket {
 245	struct rtable __rcu	*chain;
 246};
 247
 248#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 249	defined(CONFIG_PROVE_LOCKING)
 250/*
  251 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
  252 * The size of this table is a power of two and depends on the number of CPUs.
 253 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 254 */
 255#ifdef CONFIG_LOCKDEP
 256# define RT_HASH_LOCK_SZ	256
 257#else
 258# if NR_CPUS >= 32
 259#  define RT_HASH_LOCK_SZ	4096
 260# elif NR_CPUS >= 16
 261#  define RT_HASH_LOCK_SZ	2048
 262# elif NR_CPUS >= 8
 263#  define RT_HASH_LOCK_SZ	1024
 264# elif NR_CPUS >= 4
 265#  define RT_HASH_LOCK_SZ	512
 266# else
 267#  define RT_HASH_LOCK_SZ	256
 268# endif
 269#endif
 270
 271static spinlock_t	*rt_hash_locks;
 272# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 273
 274static __init void rt_hash_lock_init(void)
 275{
 276	int i;
 277
 278	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 279			GFP_KERNEL);
 280	if (!rt_hash_locks)
 281		panic("IP: failed to allocate rt_hash_locks\n");
 282
 283	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 284		spin_lock_init(&rt_hash_locks[i]);
 285}
 286#else
 287# define rt_hash_lock_addr(slot) NULL
 288
 289static inline void rt_hash_lock_init(void)
 290{
 291}
 292#endif
 293
 294static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
 295static unsigned			rt_hash_mask __read_mostly;
 296static unsigned int		rt_hash_log  __read_mostly;
 297
 298static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 299#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 300
 301static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 302				   int genid)
 303{
 304	return jhash_3words((__force u32)daddr, (__force u32)saddr,
 305			    idx, genid)
 306		& rt_hash_mask;
 307}
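/*
 * Note (added for clarity): the cache key hashed here is only
 * (daddr, saddr, ifindex); the per-netns generation id is mixed in as the
 * jhash initval, so bumping rt_genid both fails the rt_is_expired() check
 * for old entries and moves new insertions to different buckets, without
 * ever walking the table.
 */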
 308
 309static inline int rt_genid(struct net *net)
 310{
 311	return atomic_read(&net->ipv4.rt_genid);
 312}
 313
 314#ifdef CONFIG_PROC_FS
 315struct rt_cache_iter_state {
 316	struct seq_net_private p;
 317	int bucket;
 318	int genid;
 319};
 320
 321static struct rtable *rt_cache_get_first(struct seq_file *seq)
 322{
 323	struct rt_cache_iter_state *st = seq->private;
 324	struct rtable *r = NULL;
 325
 326	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 327		if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
 328			continue;
 329		rcu_read_lock_bh();
 330		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 331		while (r) {
 332			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 333			    r->rt_genid == st->genid)
 334				return r;
 335			r = rcu_dereference_bh(r->dst.rt_next);
 336		}
 337		rcu_read_unlock_bh();
 338	}
 339	return r;
 340}
 341
 342static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 343					  struct rtable *r)
 344{
 345	struct rt_cache_iter_state *st = seq->private;
 346
 347	r = rcu_dereference_bh(r->dst.rt_next);
 348	while (!r) {
 349		rcu_read_unlock_bh();
 350		do {
 351			if (--st->bucket < 0)
 352				return NULL;
 353		} while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
 354		rcu_read_lock_bh();
 355		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 356	}
 357	return r;
 358}
 359
 360static struct rtable *rt_cache_get_next(struct seq_file *seq,
 361					struct rtable *r)
 362{
 363	struct rt_cache_iter_state *st = seq->private;
 364	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 365		if (dev_net(r->dst.dev) != seq_file_net(seq))
 366			continue;
 367		if (r->rt_genid == st->genid)
 368			break;
 369	}
 370	return r;
 371}
 372
 373static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 374{
 375	struct rtable *r = rt_cache_get_first(seq);
 376
 377	if (r)
 378		while (pos && (r = rt_cache_get_next(seq, r)))
 379			--pos;
 380	return pos ? NULL : r;
 381}
 382
 383static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 384{
 385	struct rt_cache_iter_state *st = seq->private;
 386	if (*pos)
 387		return rt_cache_get_idx(seq, *pos - 1);
 388	st->genid = rt_genid(seq_file_net(seq));
 389	return SEQ_START_TOKEN;
 390}
 391
 392static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 393{
 394	struct rtable *r;
 395
 396	if (v == SEQ_START_TOKEN)
 397		r = rt_cache_get_first(seq);
 398	else
 399		r = rt_cache_get_next(seq, v);
 400	++*pos;
 401	return r;
 402}
 403
 404static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 405{
 406	if (v && v != SEQ_START_TOKEN)
 407		rcu_read_unlock_bh();
 408}
 409
 410static int rt_cache_seq_show(struct seq_file *seq, void *v)
 411{
 412	if (v == SEQ_START_TOKEN)
 413		seq_printf(seq, "%-127s\n",
 414			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 415			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 416			   "HHUptod\tSpecDst");
 417	else {
 418		struct rtable *r = v;
 419		struct neighbour *n;
 420		int len;
 421
 422		n = dst_get_neighbour(&r->dst);
 423		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 424			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 425			r->dst.dev ? r->dst.dev->name : "*",
 426			(__force u32)r->rt_dst,
 427			(__force u32)r->rt_gateway,
 428			r->rt_flags, atomic_read(&r->dst.__refcnt),
 429			r->dst.__use, 0, (__force u32)r->rt_src,
 430			dst_metric_advmss(&r->dst) + 40,
 431			dst_metric(&r->dst, RTAX_WINDOW),
 432			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 433			      dst_metric(&r->dst, RTAX_RTTVAR)),
 434			r->rt_key_tos,
 435			-1,
 436			(n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0,
 437			r->rt_spec_dst, &len);
 438
 439		seq_printf(seq, "%*s\n", 127 - len, "");
 440	}
 441	return 0;
 442}
 443
 444static const struct seq_operations rt_cache_seq_ops = {
 445	.start  = rt_cache_seq_start,
 446	.next   = rt_cache_seq_next,
 447	.stop   = rt_cache_seq_stop,
 448	.show   = rt_cache_seq_show,
 449};
 450
 451static int rt_cache_seq_open(struct inode *inode, struct file *file)
 452{
 453	return seq_open_net(inode, file, &rt_cache_seq_ops,
 454			sizeof(struct rt_cache_iter_state));
 455}
 456
 457static const struct file_operations rt_cache_seq_fops = {
 458	.owner	 = THIS_MODULE,
 459	.open	 = rt_cache_seq_open,
 460	.read	 = seq_read,
 461	.llseek	 = seq_lseek,
 462	.release = seq_release_net,
 463};
 464
 465
 466static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 467{
 468	int cpu;
 469
 470	if (*pos == 0)
 471		return SEQ_START_TOKEN;
 472
 473	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 474		if (!cpu_possible(cpu))
 475			continue;
 476		*pos = cpu+1;
 477		return &per_cpu(rt_cache_stat, cpu);
 478	}
 479	return NULL;
 480}
 481
 482static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 483{
 484	int cpu;
 485
 486	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 487		if (!cpu_possible(cpu))
 488			continue;
 489		*pos = cpu+1;
 490		return &per_cpu(rt_cache_stat, cpu);
 491	}
 492	return NULL;
 493
 494}
 495
 496static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 497{
 498
 499}
 500
 501static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 502{
 503	struct rt_cache_stat *st = v;
 504
 505	if (v == SEQ_START_TOKEN) {
 506		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 507		return 0;
 508	}
 509
 510	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 511		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 512		   dst_entries_get_slow(&ipv4_dst_ops),
 513		   st->in_hit,
 514		   st->in_slow_tot,
 515		   st->in_slow_mc,
 516		   st->in_no_route,
 517		   st->in_brd,
 518		   st->in_martian_dst,
 519		   st->in_martian_src,
 520
 521		   st->out_hit,
 522		   st->out_slow_tot,
 523		   st->out_slow_mc,
 524
 525		   st->gc_total,
 526		   st->gc_ignored,
 527		   st->gc_goal_miss,
 528		   st->gc_dst_overflow,
 529		   st->in_hlist_search,
 530		   st->out_hlist_search
 531		);
 532	return 0;
 533}
 534
 535static const struct seq_operations rt_cpu_seq_ops = {
 536	.start  = rt_cpu_seq_start,
 537	.next   = rt_cpu_seq_next,
 538	.stop   = rt_cpu_seq_stop,
 539	.show   = rt_cpu_seq_show,
 540};
 541
 542
 543static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 544{
 545	return seq_open(file, &rt_cpu_seq_ops);
 546}
 547
 548static const struct file_operations rt_cpu_seq_fops = {
 549	.owner	 = THIS_MODULE,
 550	.open	 = rt_cpu_seq_open,
 551	.read	 = seq_read,
 552	.llseek	 = seq_lseek,
 553	.release = seq_release,
 554};
 555
 556#ifdef CONFIG_IP_ROUTE_CLASSID
 557static int rt_acct_proc_show(struct seq_file *m, void *v)
 558{
 559	struct ip_rt_acct *dst, *src;
 560	unsigned int i, j;
 561
 562	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 563	if (!dst)
 564		return -ENOMEM;
 565
 566	for_each_possible_cpu(i) {
 567		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 568		for (j = 0; j < 256; j++) {
 569			dst[j].o_bytes   += src[j].o_bytes;
 570			dst[j].o_packets += src[j].o_packets;
 571			dst[j].i_bytes   += src[j].i_bytes;
 572			dst[j].i_packets += src[j].i_packets;
 573		}
 574	}
 575
 576	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 577	kfree(dst);
 578	return 0;
 579}
 580
 581static int rt_acct_proc_open(struct inode *inode, struct file *file)
 582{
 583	return single_open(file, rt_acct_proc_show, NULL);
 584}
 585
 586static const struct file_operations rt_acct_proc_fops = {
 587	.owner		= THIS_MODULE,
 588	.open		= rt_acct_proc_open,
 589	.read		= seq_read,
 590	.llseek		= seq_lseek,
 591	.release	= single_release,
 592};
 593#endif
 594
 595static int __net_init ip_rt_do_proc_init(struct net *net)
 596{
 597	struct proc_dir_entry *pde;
 598
 599	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 600			&rt_cache_seq_fops);
 601	if (!pde)
 602		goto err1;
 603
 604	pde = proc_create("rt_cache", S_IRUGO,
 605			  net->proc_net_stat, &rt_cpu_seq_fops);
 606	if (!pde)
 607		goto err2;
 608
 609#ifdef CONFIG_IP_ROUTE_CLASSID
 610	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 611	if (!pde)
 612		goto err3;
 613#endif
 614	return 0;
 615
 616#ifdef CONFIG_IP_ROUTE_CLASSID
 617err3:
 618	remove_proc_entry("rt_cache", net->proc_net_stat);
 619#endif
 620err2:
 621	remove_proc_entry("rt_cache", net->proc_net);
 622err1:
 623	return -ENOMEM;
 624}
 625
 626static void __net_exit ip_rt_do_proc_exit(struct net *net)
 627{
 628	remove_proc_entry("rt_cache", net->proc_net_stat);
 629	remove_proc_entry("rt_cache", net->proc_net);
 630#ifdef CONFIG_IP_ROUTE_CLASSID
 631	remove_proc_entry("rt_acct", net->proc_net);
 632#endif
 633}
 634
 635static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 636	.init = ip_rt_do_proc_init,
 637	.exit = ip_rt_do_proc_exit,
 638};
 639
 640static int __init ip_rt_proc_init(void)
 641{
 642	return register_pernet_subsys(&ip_rt_proc_ops);
 643}
 644
 645#else
 646static inline int ip_rt_proc_init(void)
 647{
 648	return 0;
 649}
 650#endif /* CONFIG_PROC_FS */
 651
 652static inline void rt_free(struct rtable *rt)
 653{
 654	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 655}
 656
 657static inline void rt_drop(struct rtable *rt)
 658{
 659	ip_rt_put(rt);
 660	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 661}
 662
 663static inline int rt_fast_clean(struct rtable *rth)
 664{
  665	/* Kill broadcast/multicast entries very aggressively, if they
  666	   collide in the hash table with more useful entries */
 667	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 668		rt_is_input_route(rth) && rth->dst.rt_next;
 669}
 670
 671static inline int rt_valuable(struct rtable *rth)
 672{
 673	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 674		(rth->peer && rth->peer->pmtu_expires);
 675}
 676
 677static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 678{
 679	unsigned long age;
 680	int ret = 0;
 681
 682	if (atomic_read(&rth->dst.__refcnt))
 683		goto out;
 684
 685	age = jiffies - rth->dst.lastuse;
 686	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 687	    (age <= tmo2 && rt_valuable(rth)))
 688		goto out;
 689	ret = 1;
 690out:	return ret;
 691}
 692
 693/* Bits of score are:
 694 * 31: very valuable
 695 * 30: not quite useless
 696 * 29..0: usage counter
 697 */
 698static inline u32 rt_score(struct rtable *rt)
 699{
 700	u32 score = jiffies - rt->dst.lastuse;
 701
 702	score = ~score & ~(3<<30);
 703
 704	if (rt_valuable(rt))
 705		score |= (1<<31);
 706
 707	if (rt_is_output_route(rt) ||
 708	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 709		score |= (1<<30);
 710
 711	return score;
 712}
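/*
 * Worked example (illustrative): an output route last used 100 jiffies ago
 * that is neither redirected nor holding PMTU state scores
 * (~100 & ~(3<<30)) | (1<<30): bit 30 set ("not quite useless") and the low
 * bits near their maximum because the entry is recent.  Older entries yield
 * smaller low-bit values, so the lowest-scoring entry in a chain is the one
 * chosen for eviction in rt_intern_hash().
 */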
 713
 714static inline bool rt_caching(const struct net *net)
 715{
 716	return net->ipv4.current_rt_cache_rebuild_count <=
 717		net->ipv4.sysctl_rt_cache_rebuild_count;
 718}
 719
 720static inline bool compare_hash_inputs(const struct rtable *rt1,
 721				       const struct rtable *rt2)
 722{
 723	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 724		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 725		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 726}
 727
 728static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 729{
 730	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 731		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 732		(rt1->rt_mark ^ rt2->rt_mark) |
 733		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
 734		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
 735		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
 736}
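/*
 * Note (added for clarity): both compare helpers OR together the XOR of
 * every key field and test the result against zero, so a full-key match is
 * decided with a single branch instead of one branch per field.
 */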
 737
 738static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 739{
 740	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 741}
 742
 743static inline int rt_is_expired(struct rtable *rth)
 744{
 745	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 746}
 747
 748/*
 749 * Perform a full scan of hash table and free all entries.
 750 * Can be called by a softirq or a process.
  751 * In the latter case, we want to reschedule if necessary
 752 */
 753static void rt_do_flush(struct net *net, int process_context)
 754{
 755	unsigned int i;
 756	struct rtable *rth, *next;
 757
 758	for (i = 0; i <= rt_hash_mask; i++) {
 759		struct rtable __rcu **pprev;
 760		struct rtable *list;
 761
 762		if (process_context && need_resched())
 763			cond_resched();
 764		rth = rcu_dereference_raw(rt_hash_table[i].chain);
 765		if (!rth)
 766			continue;
 767
 768		spin_lock_bh(rt_hash_lock_addr(i));
 769
 770		list = NULL;
 771		pprev = &rt_hash_table[i].chain;
 772		rth = rcu_dereference_protected(*pprev,
 773			lockdep_is_held(rt_hash_lock_addr(i)));
 774
 775		while (rth) {
 776			next = rcu_dereference_protected(rth->dst.rt_next,
 777				lockdep_is_held(rt_hash_lock_addr(i)));
 778
 779			if (!net ||
 780			    net_eq(dev_net(rth->dst.dev), net)) {
 781				rcu_assign_pointer(*pprev, next);
 782				rcu_assign_pointer(rth->dst.rt_next, list);
 783				list = rth;
 784			} else {
 785				pprev = &rth->dst.rt_next;
 786			}
 787			rth = next;
 788		}
 789
 790		spin_unlock_bh(rt_hash_lock_addr(i));
 791
 792		for (; list; list = next) {
 793			next = rcu_dereference_protected(list->dst.rt_next, 1);
 794			rt_free(list);
 795		}
 796	}
 797}
 798
 799/*
 800 * While freeing expired entries, we compute average chain length
 801 * and standard deviation, using fixed-point arithmetic.
  802 * This is to obtain an estimate of rt_chain_length_max:
  803 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
  804 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 805 */
 806
 807#define FRACT_BITS 3
 808#define ONE (1UL << FRACT_BITS)
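/*
 * Illustrative example: with FRACT_BITS == 3, ONE == 8, so a fixed-point
 * value of 20 stands for 20/8 = 2.5 chain entries; slow_chain_length()
 * below accumulates ONE per entry that has no earlier alias and shifts
 * right by FRACT_BITS to return a whole-entry count.
 */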
 809
 810/*
 811 * Given a hash chain and an item in this hash chain,
 812 * find if a previous entry has the same hash_inputs
 813 * (but differs on tos, mark or oif)
 814 * Returns 0 if an alias is found.
 815 * Returns ONE if rth has no alias before itself.
 816 */
 817static int has_noalias(const struct rtable *head, const struct rtable *rth)
 818{
 819	const struct rtable *aux = head;
 820
 821	while (aux != rth) {
 822		if (compare_hash_inputs(aux, rth))
 823			return 0;
 824		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 825	}
 826	return ONE;
 827}
 828
 829/*
 830 * Perturbation of rt_genid by a small quantity [1..256]
  831 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
  832 * many times (2^24) without reusing a recent rt_genid.
  833 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 834 */
 835static void rt_cache_invalidate(struct net *net)
 836{
 837	unsigned char shuffle;
 838
 839	get_random_bytes(&shuffle, sizeof(shuffle));
 840	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 841}
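/*
 * Note (added for clarity): adding a random 1..256 to rt_genid makes every
 * existing entry fail the rt_is_expired() check and sends new entries to
 * different buckets (the genid seeds rt_hash()), so invalidation itself is
 * O(1); stale entries are then reaped lazily by lookups, rt_do_flush() and
 * the garbage collector.
 */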
 842
 843/*
 844 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 845 * delay >= 0 : invalidate & flush cache (can be long)
 846 */
 847void rt_cache_flush(struct net *net, int delay)
 848{
 849	rt_cache_invalidate(net);
 850	if (delay >= 0)
 851		rt_do_flush(net, !in_softirq());
 852}
 853
  854/* Flush previously invalidated entries from the cache */
 855void rt_cache_flush_batch(struct net *net)
 856{
 857	rt_do_flush(net, !in_softirq());
 858}
 859
 860static void rt_emergency_hash_rebuild(struct net *net)
 861{
 862	if (net_ratelimit())
 863		printk(KERN_WARNING "Route hash chain too long!\n");
 864	rt_cache_invalidate(net);
 865}
 866
 867/*
 868   Short description of GC goals.
 869
  870   We want to build an algorithm which keeps the routing cache
  871   at an equilibrium point, where the number of aged-off entries
  872   stays approximately equal to the number of newly generated ones.
  873
  874   The current expiration strength is the variable "expire".
  875   We try to adjust it dynamically, so that when the network
  876   is idle, expire is large enough to keep plenty of warm entries,
  877   and when load increases it shrinks to limit the cache size.
 878 */
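/*
 * Note (added for clarity): concretely, the collector first aims at
 * goal = entries - (ip_rt_gc_elasticity << rt_hash_log), i.e. an average of
 * ip_rt_gc_elasticity entries per hash bucket, and falls back to the
 * gc_thresh-based equilibrium when that goal is already met.
 */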
 879
 880static int rt_garbage_collect(struct dst_ops *ops)
 881{
 882	static unsigned long expire = RT_GC_TIMEOUT;
 883	static unsigned long last_gc;
 884	static int rover;
 885	static int equilibrium;
 886	struct rtable *rth;
 887	struct rtable __rcu **rthp;
 888	unsigned long now = jiffies;
 889	int goal;
 890	int entries = dst_entries_get_fast(&ipv4_dst_ops);
 891
 892	/*
 893	 * Garbage collection is pretty expensive,
 894	 * do not make it too frequently.
 895	 */
 896
 897	RT_CACHE_STAT_INC(gc_total);
 898
 899	if (now - last_gc < ip_rt_gc_min_interval &&
 900	    entries < ip_rt_max_size) {
 901		RT_CACHE_STAT_INC(gc_ignored);
 902		goto out;
 903	}
 904
 905	entries = dst_entries_get_slow(&ipv4_dst_ops);
 906	/* Calculate number of entries, which we want to expire now. */
 907	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 908	if (goal <= 0) {
 909		if (equilibrium < ipv4_dst_ops.gc_thresh)
 910			equilibrium = ipv4_dst_ops.gc_thresh;
 911		goal = entries - equilibrium;
 912		if (goal > 0) {
 913			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 914			goal = entries - equilibrium;
 915		}
 916	} else {
 917		/* We are in dangerous area. Try to reduce cache really
 918		 * aggressively.
 919		 */
 920		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 921		equilibrium = entries - goal;
 922	}
 923
 924	if (now - last_gc >= ip_rt_gc_min_interval)
 925		last_gc = now;
 926
 927	if (goal <= 0) {
 928		equilibrium += goal;
 929		goto work_done;
 930	}
 931
 932	do {
 933		int i, k;
 934
 935		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 936			unsigned long tmo = expire;
 937
 938			k = (k + 1) & rt_hash_mask;
 939			rthp = &rt_hash_table[k].chain;
 940			spin_lock_bh(rt_hash_lock_addr(k));
 941			while ((rth = rcu_dereference_protected(*rthp,
 942					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
 943				if (!rt_is_expired(rth) &&
 944					!rt_may_expire(rth, tmo, expire)) {
 945					tmo >>= 1;
 946					rthp = &rth->dst.rt_next;
 947					continue;
 948				}
 949				*rthp = rth->dst.rt_next;
 950				rt_free(rth);
 951				goal--;
 952			}
 953			spin_unlock_bh(rt_hash_lock_addr(k));
 954			if (goal <= 0)
 955				break;
 956		}
 957		rover = k;
 958
 959		if (goal <= 0)
 960			goto work_done;
 961
  962		/* Goal is not achieved. We stop the process if:
  963
  964		   - expire has been reduced to zero (otherwise expire is halved),
  965		   - the table is not full, or
  966		   - we are called from (soft)interrupt context.
  967		   The jiffies check is just a fallback/debug loop breaker;
  968		     we will not spin here for a long time in any case.
 969		 */
 970
 971		RT_CACHE_STAT_INC(gc_goal_miss);
 972
 973		if (expire == 0)
 974			break;
 975
 976		expire >>= 1;
 977
 978		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 979			goto out;
 980	} while (!in_softirq() && time_before_eq(jiffies, now));
 981
 982	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 983		goto out;
 984	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 985		goto out;
 986	if (net_ratelimit())
 987		printk(KERN_WARNING "dst cache overflow\n");
 988	RT_CACHE_STAT_INC(gc_dst_overflow);
 989	return 1;
 990
 991work_done:
 992	expire += ip_rt_gc_min_interval;
 993	if (expire > ip_rt_gc_timeout ||
 994	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
 995	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 996		expire = ip_rt_gc_timeout;
 997out:	return 0;
 998}
 999
1000/*
 1001 * Returns the number of entries in a hash chain that have different hash_inputs.
1002 */
1003static int slow_chain_length(const struct rtable *head)
1004{
1005	int length = 0;
1006	const struct rtable *rth = head;
1007
1008	while (rth) {
1009		length += has_noalias(head, rth);
1010		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1011	}
1012	return length >> FRACT_BITS;
1013}
1014
1015static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1016{
1017	struct neigh_table *tbl = &arp_tbl;
1018	static const __be32 inaddr_any = 0;
1019	struct net_device *dev = dst->dev;
1020	const __be32 *pkey = daddr;
1021	struct neighbour *n;
1022
1023#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
1024	if (dev->type == ARPHRD_ATM)
1025		tbl = clip_tbl_hook;
1026#endif
1027	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1028		pkey = &inaddr_any;
1029
1030	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
1031	if (n)
1032		return n;
1033	return neigh_create(tbl, pkey, dev);
1034}
1035
1036static int rt_bind_neighbour(struct rtable *rt)
1037{
1038	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1039	if (IS_ERR(n))
1040		return PTR_ERR(n);
1041	dst_set_neighbour(&rt->dst, n);
1042
1043	return 0;
1044}
1045
1046static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
1047				     struct sk_buff *skb, int ifindex)
1048{
1049	struct rtable	*rth, *cand;
1050	struct rtable __rcu **rthp, **candp;
1051	unsigned long	now;
1052	u32 		min_score;
1053	int		chain_length;
1054	int attempts = !in_softirq();
1055
1056restart:
1057	chain_length = 0;
1058	min_score = ~(u32)0;
1059	cand = NULL;
1060	candp = NULL;
1061	now = jiffies;
1062
1063	if (!rt_caching(dev_net(rt->dst.dev))) {
1064		/*
1065		 * If we're not caching, just tell the caller we
1066		 * were successful and don't touch the route.  The
 1067		 * caller holds the sole reference to the cache entry, and
 1068		 * it will be released when the caller is done with it.
 1069		 * If we drop it here, the callers have no way to resolve routes
 1070		 * when we're not caching.  Instead, just point *rp at rt, so
 1071		 * the caller gets a single use out of the route.
1072		 * Note that we do rt_free on this new route entry, so that
1073		 * once its refcount hits zero, we are still able to reap it
1074		 * (Thanks Alexey)
1075		 * Note: To avoid expensive rcu stuff for this uncached dst,
1076		 * we set DST_NOCACHE so that dst_release() can free dst without
1077		 * waiting a grace period.
1078		 */
1079
1080		rt->dst.flags |= DST_NOCACHE;
1081		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1082			int err = rt_bind_neighbour(rt);
1083			if (err) {
1084				if (net_ratelimit())
1085					printk(KERN_WARNING
1086					    "Neighbour table failure & not caching routes.\n");
1087				ip_rt_put(rt);
1088				return ERR_PTR(err);
1089			}
1090		}
1091
1092		goto skip_hashing;
1093	}
1094
1095	rthp = &rt_hash_table[hash].chain;
1096
1097	spin_lock_bh(rt_hash_lock_addr(hash));
1098	while ((rth = rcu_dereference_protected(*rthp,
1099			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1100		if (rt_is_expired(rth)) {
1101			*rthp = rth->dst.rt_next;
1102			rt_free(rth);
1103			continue;
1104		}
1105		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1106			/* Put it first */
1107			*rthp = rth->dst.rt_next;
1108			/*
1109			 * Since lookup is lockfree, the deletion
1110			 * must be visible to another weakly ordered CPU before
1111			 * the insertion at the start of the hash chain.
1112			 */
1113			rcu_assign_pointer(rth->dst.rt_next,
1114					   rt_hash_table[hash].chain);
1115			/*
1116			 * Since lookup is lockfree, the update writes
1117			 * must be ordered for consistency on SMP.
1118			 */
1119			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1120
1121			dst_use(&rth->dst, now);
1122			spin_unlock_bh(rt_hash_lock_addr(hash));
1123
1124			rt_drop(rt);
1125			if (skb)
1126				skb_dst_set(skb, &rth->dst);
1127			return rth;
1128		}
1129
1130		if (!atomic_read(&rth->dst.__refcnt)) {
1131			u32 score = rt_score(rth);
1132
1133			if (score <= min_score) {
1134				cand = rth;
1135				candp = rthp;
1136				min_score = score;
1137			}
1138		}
1139
1140		chain_length++;
1141
1142		rthp = &rth->dst.rt_next;
1143	}
1144
1145	if (cand) {
 1146		/* ip_rt_gc_elasticity used to be the average chain length;
 1147		 * when it is exceeded, gc becomes really aggressive.
1148		 *
1149		 * The second limit is less certain. At the moment it allows
1150		 * only 2 entries per bucket. We will see.
1151		 */
1152		if (chain_length > ip_rt_gc_elasticity) {
1153			*candp = cand->dst.rt_next;
1154			rt_free(cand);
1155		}
1156	} else {
1157		if (chain_length > rt_chain_length_max &&
1158		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1159			struct net *net = dev_net(rt->dst.dev);
1160			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1161			if (!rt_caching(net)) {
1162				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1163					rt->dst.dev->name, num);
1164			}
1165			rt_emergency_hash_rebuild(net);
1166			spin_unlock_bh(rt_hash_lock_addr(hash));
1167
1168			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1169					ifindex, rt_genid(net));
1170			goto restart;
1171		}
1172	}
1173
1174	/* Try to bind route to arp only if it is output
1175	   route or unicast forwarding path.
1176	 */
1177	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1178		int err = rt_bind_neighbour(rt);
1179		if (err) {
1180			spin_unlock_bh(rt_hash_lock_addr(hash));
1181
1182			if (err != -ENOBUFS) {
1183				rt_drop(rt);
1184				return ERR_PTR(err);
1185			}
1186
1187			/* Neighbour tables are full and nothing
 1188			   can be released. Try to shrink the route cache;
 1189			   it most likely holds some neighbour records.
1190			 */
1191			if (attempts-- > 0) {
1192				int saved_elasticity = ip_rt_gc_elasticity;
1193				int saved_int = ip_rt_gc_min_interval;
1194				ip_rt_gc_elasticity	= 1;
1195				ip_rt_gc_min_interval	= 0;
1196				rt_garbage_collect(&ipv4_dst_ops);
1197				ip_rt_gc_min_interval	= saved_int;
1198				ip_rt_gc_elasticity	= saved_elasticity;
1199				goto restart;
1200			}
1201
1202			if (net_ratelimit())
1203				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1204			rt_drop(rt);
1205			return ERR_PTR(-ENOBUFS);
1206		}
1207	}
1208
1209	rt->dst.rt_next = rt_hash_table[hash].chain;
1210
1211	/*
1212	 * Since lookup is lockfree, we must make sure
1213	 * previous writes to rt are committed to memory
1214	 * before making rt visible to other CPUS.
1215	 */
1216	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1217
1218	spin_unlock_bh(rt_hash_lock_addr(hash));
1219
1220skip_hashing:
1221	if (skb)
1222		skb_dst_set(skb, &rt->dst);
1223	return rt;
1224}
1225
1226static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1227
1228static u32 rt_peer_genid(void)
1229{
1230	return atomic_read(&__rt_peer_genid);
1231}
1232
1233void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1234{
1235	struct inet_peer *peer;
1236
1237	peer = inet_getpeer_v4(daddr, create);
1238
1239	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1240		inet_putpeer(peer);
1241	else
1242		rt->rt_peer_genid = rt_peer_genid();
1243}
1244
1245/*
1246 * Peer allocation may fail only in serious out-of-memory conditions.  However
 1247 * we can still generate some output.
 1248 * Random ID selection looks a bit dangerous because we have no chance of
 1249 * selecting an ID that is unique over a reasonable period of time.
 1250 * But a broken packet identifier may be better than no packet at all.
1251 */
1252static void ip_select_fb_ident(struct iphdr *iph)
1253{
1254	static DEFINE_SPINLOCK(ip_fb_id_lock);
1255	static u32 ip_fallback_id;
1256	u32 salt;
1257
1258	spin_lock_bh(&ip_fb_id_lock);
1259	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1260	iph->id = htons(salt & 0xFFFF);
1261	ip_fallback_id = salt;
1262	spin_unlock_bh(&ip_fb_id_lock);
1263}
1264
1265void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1266{
1267	struct rtable *rt = (struct rtable *) dst;
1268
1269	if (rt) {
1270		if (rt->peer == NULL)
1271			rt_bind_peer(rt, rt->rt_dst, 1);
1272
1273		/* If peer is attached to destination, it is never detached,
 1274		   so we need not grab a lock to dereference it.
1275		 */
1276		if (rt->peer) {
1277			iph->id = htons(inet_getid(rt->peer, more));
1278			return;
1279		}
1280	} else
1281		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1282		       __builtin_return_address(0));
1283
1284	ip_select_fb_ident(iph);
1285}
1286EXPORT_SYMBOL(__ip_select_ident);
1287
1288static void rt_del(unsigned hash, struct rtable *rt)
1289{
1290	struct rtable __rcu **rthp;
1291	struct rtable *aux;
1292
1293	rthp = &rt_hash_table[hash].chain;
1294	spin_lock_bh(rt_hash_lock_addr(hash));
1295	ip_rt_put(rt);
1296	while ((aux = rcu_dereference_protected(*rthp,
1297			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1298		if (aux == rt || rt_is_expired(aux)) {
1299			*rthp = aux->dst.rt_next;
1300			rt_free(aux);
1301			continue;
1302		}
1303		rthp = &aux->dst.rt_next;
1304	}
1305	spin_unlock_bh(rt_hash_lock_addr(hash));
1306}
1307
1308/* called in rcu_read_lock() section */
1309void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1310		    __be32 saddr, struct net_device *dev)
1311{
1312	struct in_device *in_dev = __in_dev_get_rcu(dev);
1313	struct inet_peer *peer;
1314	struct net *net;
1315
1316	if (!in_dev)
1317		return;
1318
1319	net = dev_net(dev);
1320	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1321	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1322	    ipv4_is_zeronet(new_gw))
1323		goto reject_redirect;
1324
1325	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1326		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1327			goto reject_redirect;
1328		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1329			goto reject_redirect;
1330	} else {
1331		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1332			goto reject_redirect;
1333	}
1334
1335	peer = inet_getpeer_v4(daddr, 1);
1336	if (peer) {
1337		peer->redirect_learned.a4 = new_gw;
1338
1339		inet_putpeer(peer);
1340
1341		atomic_inc(&__rt_peer_genid);
1342	}
1343	return;
1344
1345reject_redirect:
1346#ifdef CONFIG_IP_ROUTE_VERBOSE
1347	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1348		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1349			"  Advised path = %pI4 -> %pI4\n",
1350		       &old_gw, dev->name, &new_gw,
1351		       &saddr, &daddr);
1352#endif
1353	;
1354}
1355
1356static bool peer_pmtu_expired(struct inet_peer *peer)
1357{
1358	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1359
1360	return orig &&
1361	       time_after_eq(jiffies, orig) &&
1362	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1363}
1364
1365static bool peer_pmtu_cleaned(struct inet_peer *peer)
1366{
1367	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1368
1369	return orig &&
1370	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1371}
1372
1373static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1374{
1375	struct rtable *rt = (struct rtable *)dst;
1376	struct dst_entry *ret = dst;
1377
1378	if (rt) {
1379		if (dst->obsolete > 0) {
1380			ip_rt_put(rt);
1381			ret = NULL;
1382		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1383			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1384						rt->rt_oif,
1385						rt_genid(dev_net(dst->dev)));
1386			rt_del(hash, rt);
1387			ret = NULL;
1388		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1389			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1390		}
1391	}
1392	return ret;
1393}
1394
1395/*
1396 * Algorithm:
1397 *	1. The first ip_rt_redirect_number redirects are sent
1398 *	   with exponential backoff, then we stop sending them at all,
1399 *	   assuming that the host ignores our redirects.
1400 *	2. If we did not see packets requiring redirects
1401 *	   during ip_rt_redirect_silence, we assume that the host
 1402 *	   forgot the redirected route and we start sending redirects again.
1403 *
1404 * This algorithm is much cheaper and more intelligent than dumb load limiting
1405 * in icmp.c.
1406 *
1407 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1408 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1409 */
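/*
 * Note (added for clarity): the backoff in ip_rt_send_redirect() below is
 * implemented as a delay of (ip_rt_redirect_load << rate_tokens), so the
 * gap between successive redirects doubles with every ignored one until
 * ip_rt_redirect_number is reached, after which we stop sending entirely.
 */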
1410
1411void ip_rt_send_redirect(struct sk_buff *skb)
1412{
1413	struct rtable *rt = skb_rtable(skb);
1414	struct in_device *in_dev;
1415	struct inet_peer *peer;
1416	int log_martians;
1417
1418	rcu_read_lock();
1419	in_dev = __in_dev_get_rcu(rt->dst.dev);
1420	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1421		rcu_read_unlock();
1422		return;
1423	}
1424	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1425	rcu_read_unlock();
1426
1427	if (!rt->peer)
1428		rt_bind_peer(rt, rt->rt_dst, 1);
1429	peer = rt->peer;
1430	if (!peer) {
1431		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1432		return;
1433	}
1434
1435	/* No redirected packets during ip_rt_redirect_silence;
1436	 * reset the algorithm.
1437	 */
1438	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1439		peer->rate_tokens = 0;
1440
 1441	/* Too many ignored redirects; do not send anything,
 1442	 * just set dst.rate_last to the last seen redirected packet.
1443	 */
1444	if (peer->rate_tokens >= ip_rt_redirect_number) {
1445		peer->rate_last = jiffies;
1446		return;
1447	}
1448
1449	/* Check for load limit; set rate_last to the latest sent
1450	 * redirect.
1451	 */
1452	if (peer->rate_tokens == 0 ||
1453	    time_after(jiffies,
1454		       (peer->rate_last +
1455			(ip_rt_redirect_load << peer->rate_tokens)))) {
1456		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1457		peer->rate_last = jiffies;
1458		++peer->rate_tokens;
1459#ifdef CONFIG_IP_ROUTE_VERBOSE
1460		if (log_martians &&
1461		    peer->rate_tokens == ip_rt_redirect_number &&
1462		    net_ratelimit())
1463			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1464			       &ip_hdr(skb)->saddr, rt->rt_iif,
1465				&rt->rt_dst, &rt->rt_gateway);
1466#endif
1467	}
1468}
1469
1470static int ip_error(struct sk_buff *skb)
1471{
1472	struct rtable *rt = skb_rtable(skb);
1473	struct inet_peer *peer;
1474	unsigned long now;
1475	bool send;
1476	int code;
1477
1478	switch (rt->dst.error) {
1479	case EINVAL:
1480	default:
1481		goto out;
1482	case EHOSTUNREACH:
1483		code = ICMP_HOST_UNREACH;
1484		break;
1485	case ENETUNREACH:
1486		code = ICMP_NET_UNREACH;
1487		IP_INC_STATS_BH(dev_net(rt->dst.dev),
1488				IPSTATS_MIB_INNOROUTES);
1489		break;
1490	case EACCES:
1491		code = ICMP_PKT_FILTERED;
1492		break;
1493	}
1494
1495	if (!rt->peer)
1496		rt_bind_peer(rt, rt->rt_dst, 1);
1497	peer = rt->peer;
1498
1499	send = true;
1500	if (peer) {
1501		now = jiffies;
1502		peer->rate_tokens += now - peer->rate_last;
1503		if (peer->rate_tokens > ip_rt_error_burst)
1504			peer->rate_tokens = ip_rt_error_burst;
1505		peer->rate_last = now;
1506		if (peer->rate_tokens >= ip_rt_error_cost)
1507			peer->rate_tokens -= ip_rt_error_cost;
1508		else
1509			send = false;
1510	}
1511	if (send)
1512		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1513
1514out:	kfree_skb(skb);
1515	return 0;
1516}
1517
1518/*
1519 *	The last two values are not from the RFC but
1520 *	are needed for AMPRnet AX.25 paths.
1521 */
1522
1523static const unsigned short mtu_plateau[] =
1524{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1525
1526static inline unsigned short guess_mtu(unsigned short old_mtu)
1527{
1528	int i;
1529
1530	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1531		if (old_mtu > mtu_plateau[i])
1532			return mtu_plateau[i];
1533	return 68;
1534}
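/*
 * Illustrative example: when the ICMP-reported next-hop MTU is implausible
 * (zero, or not smaller than the original packet), ip_rt_frag_needed()
 * falls back to the plateau table above and picks the first value strictly
 * below the original datagram size, e.g. old_mtu 1500 -> 1492 and
 * old_mtu 1006 -> 576; 68 is the smallest MTU IPv4 permits.
 */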
1535
1536unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1537				 unsigned short new_mtu,
1538				 struct net_device *dev)
1539{
1540	unsigned short old_mtu = ntohs(iph->tot_len);
1541	unsigned short est_mtu = 0;
1542	struct inet_peer *peer;
1543
1544	peer = inet_getpeer_v4(iph->daddr, 1);
1545	if (peer) {
1546		unsigned short mtu = new_mtu;
1547
1548		if (new_mtu < 68 || new_mtu >= old_mtu) {
1549			/* BSD 4.2 derived systems incorrectly adjust
1550			 * tot_len by the IP header length, and report
1551			 * a zero MTU in the ICMP message.
1552			 */
1553			if (mtu == 0 &&
1554			    old_mtu >= 68 + (iph->ihl << 2))
1555				old_mtu -= iph->ihl << 2;
1556			mtu = guess_mtu(old_mtu);
1557		}
1558
1559		if (mtu < ip_rt_min_pmtu)
1560			mtu = ip_rt_min_pmtu;
1561		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1562			unsigned long pmtu_expires;
1563
1564			pmtu_expires = jiffies + ip_rt_mtu_expires;
1565			if (!pmtu_expires)
1566				pmtu_expires = 1UL;
1567
1568			est_mtu = mtu;
1569			peer->pmtu_learned = mtu;
1570			peer->pmtu_expires = pmtu_expires;
1571		}
1572
1573		inet_putpeer(peer);
1574
1575		atomic_inc(&__rt_peer_genid);
1576	}
1577	return est_mtu ? : new_mtu;
1578}
1579
1580static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1581{
1582	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1583
1584	if (!expires)
1585		return;
1586	if (time_before(jiffies, expires)) {
1587		u32 orig_dst_mtu = dst_mtu(dst);
1588		if (peer->pmtu_learned < orig_dst_mtu) {
1589			if (!peer->pmtu_orig)
1590				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1591			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1592		}
1593	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1594		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1595}
1596
1597static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1598{
1599	struct rtable *rt = (struct rtable *) dst;
1600	struct inet_peer *peer;
1601
1602	dst_confirm(dst);
1603
1604	if (!rt->peer)
1605		rt_bind_peer(rt, rt->rt_dst, 1);
1606	peer = rt->peer;
1607	if (peer) {
1608		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1609
1610		if (mtu < ip_rt_min_pmtu)
1611			mtu = ip_rt_min_pmtu;
1612		if (!pmtu_expires || mtu < peer->pmtu_learned) {
1613
1614			pmtu_expires = jiffies + ip_rt_mtu_expires;
1615			if (!pmtu_expires)
1616				pmtu_expires = 1UL;
1617
1618			peer->pmtu_learned = mtu;
1619			peer->pmtu_expires = pmtu_expires;
1620
1621			atomic_inc(&__rt_peer_genid);
1622			rt->rt_peer_genid = rt_peer_genid();
1623		}
1624		check_peer_pmtu(dst, peer);
1625	}
1626}
1627
1628static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1629{
1630	struct rtable *rt = (struct rtable *) dst;
1631	__be32 orig_gw = rt->rt_gateway;
1632	struct neighbour *n, *old_n;
1633
1634	dst_confirm(&rt->dst);
1635
1636	rt->rt_gateway = peer->redirect_learned.a4;
1637
1638	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1639	if (IS_ERR(n))
1640		return PTR_ERR(n);
1641	old_n = xchg(&rt->dst._neighbour, n);
1642	if (old_n)
1643		neigh_release(old_n);
1644	if (!n || !(n->nud_state & NUD_VALID)) {
1645		if (n)
1646			neigh_event_send(n, NULL);
1647		rt->rt_gateway = orig_gw;
1648		return -EAGAIN;
1649	} else {
1650		rt->rt_flags |= RTCF_REDIRECTED;
1651		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1652	}
1653	return 0;
1654}
1655
1656static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1657{
1658	struct rtable *rt = (struct rtable *) dst;
1659
1660	if (rt_is_expired(rt))
1661		return NULL;
1662	if (rt->rt_peer_genid != rt_peer_genid()) {
1663		struct inet_peer *peer;
1664
1665		if (!rt->peer)
1666			rt_bind_peer(rt, rt->rt_dst, 0);
1667
1668		peer = rt->peer;
1669		if (peer) {
1670			check_peer_pmtu(dst, peer);
1671
1672			if (peer->redirect_learned.a4 &&
1673			    peer->redirect_learned.a4 != rt->rt_gateway) {
1674				if (check_peer_redir(dst, peer))
1675					return NULL;
1676			}
1677		}
1678
1679		rt->rt_peer_genid = rt_peer_genid();
1680	}
1681	return dst;
1682}
1683
1684static void ipv4_dst_destroy(struct dst_entry *dst)
1685{
1686	struct rtable *rt = (struct rtable *) dst;
1687	struct inet_peer *peer = rt->peer;
1688
1689	if (rt->fi) {
1690		fib_info_put(rt->fi);
1691		rt->fi = NULL;
1692	}
1693	if (peer) {
1694		rt->peer = NULL;
1695		inet_putpeer(peer);
1696	}
1697}
1698
1699
1700static void ipv4_link_failure(struct sk_buff *skb)
1701{
1702	struct rtable *rt;
1703
1704	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1705
1706	rt = skb_rtable(skb);
1707	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1708		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1709}
1710
1711static int ip_rt_bug(struct sk_buff *skb)
1712{
1713	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1714		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1715		skb->dev ? skb->dev->name : "?");
1716	kfree_skb(skb);
1717	WARN_ON(1);
1718	return 0;
1719}
1720
1721/*
 1722   We do not cache the source address of the outgoing interface,
 1723   because it is used only by IP RR, TS and SRR options,
 1724   so it is out of the fast path.
 1725
 1726   BTW remember: "addr" is allowed to be unaligned
 1727   in IP options!
1728 */
1729
1730void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1731{
1732	__be32 src;
1733
1734	if (rt_is_output_route(rt))
1735		src = ip_hdr(skb)->saddr;
1736	else {
1737		struct fib_result res;
1738		struct flowi4 fl4;
1739		struct iphdr *iph;
1740
1741		iph = ip_hdr(skb);
1742
1743		memset(&fl4, 0, sizeof(fl4));
1744		fl4.daddr = iph->daddr;
1745		fl4.saddr = iph->saddr;
1746		fl4.flowi4_tos = RT_TOS(iph->tos);
1747		fl4.flowi4_oif = rt->dst.dev->ifindex;
1748		fl4.flowi4_iif = skb->dev->ifindex;
1749		fl4.flowi4_mark = skb->mark;
1750
1751		rcu_read_lock();
1752		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1753			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1754		else
1755			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1756					RT_SCOPE_UNIVERSE);
1757		rcu_read_unlock();
1758	}
1759	memcpy(addr, &src, 4);
1760}
1761
1762#ifdef CONFIG_IP_ROUTE_CLASSID
1763static void set_class_tag(struct rtable *rt, u32 tag)
1764{
1765	if (!(rt->dst.tclassid & 0xFFFF))
1766		rt->dst.tclassid |= tag & 0xFFFF;
1767	if (!(rt->dst.tclassid & 0xFFFF0000))
1768		rt->dst.tclassid |= tag & 0xFFFF0000;
1769}
1770#endif
1771
1772static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1773{
1774	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1775
1776	if (advmss == 0) {
1777		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1778			       ip_rt_min_advmss);
1779		if (advmss > 65535 - 40)
1780			advmss = 65535 - 40;
1781	}
1782	return advmss;
1783}
1784
1785static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1786{
1787	unsigned int mtu = dst->dev->mtu;
1788
1789	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1790		const struct rtable *rt = (const struct rtable *) dst;
1791
1792		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1793			mtu = 576;
1794	}
1795
1796	if (mtu > IP_MAX_MTU)
1797		mtu = IP_MAX_MTU;
1798
1799	return mtu;
1800}
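/*
 * Note (added for clarity): when the MTU metric is locked and the route has
 * a gateway (rt_gateway != rt_dst), the path may cross further hops whose
 * MTU we cannot know, so the value is conservatively capped at the classic
 * 576-byte default instead of trusting the local device MTU.
 */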
1801
1802static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1803			    struct fib_info *fi)
1804{
1805	struct inet_peer *peer;
1806	int create = 0;
1807
1808	/* If a peer entry exists for this destination, we must hook
1809	 * it up in order to get at cached metrics.
1810	 */
1811	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1812		create = 1;
1813
1814	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1815	if (peer) {
1816		rt->rt_peer_genid = rt_peer_genid();
1817		if (inet_metrics_new(peer))
1818			memcpy(peer->metrics, fi->fib_metrics,
1819			       sizeof(u32) * RTAX_MAX);
1820		dst_init_metrics(&rt->dst, peer->metrics, false);
1821
1822		check_peer_pmtu(&rt->dst, peer);
1823		if (peer->redirect_learned.a4 &&
1824		    peer->redirect_learned.a4 != rt->rt_gateway) {
1825			rt->rt_gateway = peer->redirect_learned.a4;
1826			rt->rt_flags |= RTCF_REDIRECTED;
1827		}
1828	} else {
1829		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1830			rt->fi = fi;
1831			atomic_inc(&fi->fib_clntref);
1832		}
1833		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1834	}
1835}
1836
1837static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1838			   const struct fib_result *res,
1839			   struct fib_info *fi, u16 type, u32 itag)
1840{
1841	struct dst_entry *dst = &rt->dst;
1842
1843	if (fi) {
1844		if (FIB_RES_GW(*res) &&
1845		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1846			rt->rt_gateway = FIB_RES_GW(*res);
1847		rt_init_metrics(rt, fl4, fi);
1848#ifdef CONFIG_IP_ROUTE_CLASSID
1849		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1850#endif
1851	}
1852
1853	if (dst_mtu(dst) > IP_MAX_MTU)
1854		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1855	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1856		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1857
1858#ifdef CONFIG_IP_ROUTE_CLASSID
1859#ifdef CONFIG_IP_MULTIPLE_TABLES
1860	set_class_tag(rt, fib_rules_tclass(res));
1861#endif
1862	set_class_tag(rt, itag);
1863#endif
1864}
1865
1866static struct rtable *rt_dst_alloc(struct net_device *dev,
1867				   bool nopolicy, bool noxfrm)
1868{
1869	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
1870			 DST_HOST |
1871			 (nopolicy ? DST_NOPOLICY : 0) |
1872			 (noxfrm ? DST_NOXFRM : 0));
1873}
1874
1875/* called in rcu_read_lock() section */
1876static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1877				u8 tos, struct net_device *dev, int our)
1878{
1879	unsigned int hash;
1880	struct rtable *rth;
1881	__be32 spec_dst;
1882	struct in_device *in_dev = __in_dev_get_rcu(dev);
1883	u32 itag = 0;
1884	int err;
1885
1886	/* Primary sanity checks. */
1887
1888	if (in_dev == NULL)
1889		return -EINVAL;
1890
1891	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1892	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1893		goto e_inval;
1894
1895	if (ipv4_is_zeronet(saddr)) {
1896		if (!ipv4_is_local_multicast(daddr))
1897			goto e_inval;
1898		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1899	} else {
1900		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
1901					  &itag);
1902		if (err < 0)
1903			goto e_err;
1904	}
1905	rth = rt_dst_alloc(init_net.loopback_dev,
1906			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1907	if (!rth)
1908		goto e_nobufs;
1909
1910#ifdef CONFIG_IP_ROUTE_CLASSID
1911	rth->dst.tclassid = itag;
1912#endif
1913	rth->dst.output = ip_rt_bug;
1914
1915	rth->rt_key_dst	= daddr;
1916	rth->rt_key_src	= saddr;
1917	rth->rt_genid	= rt_genid(dev_net(dev));
1918	rth->rt_flags	= RTCF_MULTICAST;
1919	rth->rt_type	= RTN_MULTICAST;
1920	rth->rt_key_tos	= tos;
1921	rth->rt_dst	= daddr;
1922	rth->rt_src	= saddr;
1923	rth->rt_route_iif = dev->ifindex;
1924	rth->rt_iif	= dev->ifindex;
1925	rth->rt_oif	= 0;
1926	rth->rt_mark    = skb->mark;
1927	rth->rt_gateway	= daddr;
1928	rth->rt_spec_dst= spec_dst;
1929	rth->rt_peer_genid = 0;
1930	rth->peer = NULL;
1931	rth->fi = NULL;
1932	if (our) {
1933		rth->dst.input= ip_local_deliver;
1934		rth->rt_flags |= RTCF_LOCAL;
1935	}
1936
1937#ifdef CONFIG_IP_MROUTE
1938	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1939		rth->dst.input = ip_mr_input;
1940#endif
1941	RT_CACHE_STAT_INC(in_slow_mc);
1942
1943	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1944	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
1945	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
1946
1947e_nobufs:
1948	return -ENOBUFS;
1949e_inval:
1950	return -EINVAL;
1951e_err:
1952	return err;
1953}
1954
1955
1956static void ip_handle_martian_source(struct net_device *dev,
1957				     struct in_device *in_dev,
1958				     struct sk_buff *skb,
1959				     __be32 daddr,
1960				     __be32 saddr)
1961{
1962	RT_CACHE_STAT_INC(in_martian_src);
1963#ifdef CONFIG_IP_ROUTE_VERBOSE
1964	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1965		/*
 1966		 *	RFC1812 recommendation: if the source is martian,
 1967		 *	the only hint is the MAC header.
1968		 */
1969		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1970			&daddr, &saddr, dev->name);
1971		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1972			int i;
1973			const unsigned char *p = skb_mac_header(skb);
1974			printk(KERN_WARNING "ll header: ");
1975			for (i = 0; i < dev->hard_header_len; i++, p++) {
1976				printk("%02x", *p);
1977				if (i < (dev->hard_header_len - 1))
1978					printk(":");
1979			}
1980			printk("\n");
1981		}
1982	}
1983#endif
1984}
1985
1986/* called in rcu_read_lock() section */
1987static int __mkroute_input(struct sk_buff *skb,
1988			   const struct fib_result *res,
1989			   struct in_device *in_dev,
1990			   __be32 daddr, __be32 saddr, u32 tos,
1991			   struct rtable **result)
1992{
1993	struct rtable *rth;
1994	int err;
1995	struct in_device *out_dev;
1996	unsigned int flags = 0;
1997	__be32 spec_dst;
1998	u32 itag;
1999
2000	/* get a working reference to the output device */
2001	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2002	if (out_dev == NULL) {
2003		if (net_ratelimit())
2004			printk(KERN_CRIT "Bug in ip_route_input" \
2005			       "_slow(). Please, report\n");
2006		return -EINVAL;
2007	}
2008
2009
2010	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2011				  in_dev->dev, &spec_dst, &itag);
2012	if (err < 0) {
2013		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2014					 saddr);
2015
2016		goto cleanup;
2017	}
2018
2019	if (err)
2020		flags |= RTCF_DIRECTSRC;
2021
2022	if (out_dev == in_dev && err &&
2023	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2024	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2025		flags |= RTCF_DOREDIRECT;
2026
2027	if (skb->protocol != htons(ETH_P_IP)) {
 2028		/* Not IP (i.e. ARP). Do not create a route if it is
 2029		 * invalid for proxy arp. DNAT routes are always valid.
 2030		 *
 2031		 * The proxy arp feature has been extended to allow ARP
 2032		 * replies back on the same interface, to support
 2033		 * Private VLAN switch technologies. See arp.c.
2034		 */
2035		if (out_dev == in_dev &&
2036		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2037			err = -EINVAL;
2038			goto cleanup;
2039		}
2040	}
2041
2042	rth = rt_dst_alloc(out_dev->dev,
2043			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2044			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2045	if (!rth) {
2046		err = -ENOBUFS;
2047		goto cleanup;
2048	}
2049
2050	rth->rt_key_dst	= daddr;
2051	rth->rt_key_src	= saddr;
2052	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2053	rth->rt_flags = flags;
2054	rth->rt_type = res->type;
2055	rth->rt_key_tos	= tos;
2056	rth->rt_dst	= daddr;
2057	rth->rt_src	= saddr;
2058	rth->rt_route_iif = in_dev->dev->ifindex;
2059	rth->rt_iif 	= in_dev->dev->ifindex;
2060	rth->rt_oif 	= 0;
2061	rth->rt_mark    = skb->mark;
2062	rth->rt_gateway	= daddr;
2063	rth->rt_spec_dst= spec_dst;
2064	rth->rt_peer_genid = 0;
2065	rth->peer = NULL;
2066	rth->fi = NULL;
2067
2068	rth->dst.input = ip_forward;
2069	rth->dst.output = ip_output;
2070
2071	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2072
2073	*result = rth;
2074	err = 0;
2075 cleanup:
2076	return err;
2077}
2078
2079static int ip_mkroute_input(struct sk_buff *skb,
2080			    struct fib_result *res,
2081			    const struct flowi4 *fl4,
2082			    struct in_device *in_dev,
2083			    __be32 daddr, __be32 saddr, u32 tos)
2084{
2085	struct rtable* rth = NULL;
2086	int err;
2087	unsigned hash;
2088
2089#ifdef CONFIG_IP_ROUTE_MULTIPATH
2090	if (res->fi && res->fi->fib_nhs > 1)
2091		fib_select_multipath(res);
2092#endif
2093
2094	/* create a routing cache entry */
2095	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2096	if (err)
2097		return err;
2098
2099	/* put it into the cache */
2100	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2101		       rt_genid(dev_net(rth->dst.dev)));
2102	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2103	if (IS_ERR(rth))
2104		return PTR_ERR(rth);
2105	return 0;
2106}
2107
2108/*
 2109 *	NOTE. We drop all packets that have local source
 2110 *	addresses, because every properly looped-back packet
 2111 *	must have the correct destination already attached by the output routine.
 2112 *
 2113 *	Such an approach solves two big problems:
 2114 *	1. Non-simplex devices are handled properly.
 2115 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 2116 *	Called with rcu_read_lock().
2117 */
2118
2119static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2120			       u8 tos, struct net_device *dev)
2121{
2122	struct fib_result res;
2123	struct in_device *in_dev = __in_dev_get_rcu(dev);
2124	struct flowi4	fl4;
2125	unsigned	flags = 0;
2126	u32		itag = 0;
2127	struct rtable * rth;
2128	unsigned	hash;
2129	__be32		spec_dst;
2130	int		err = -EINVAL;
2131	struct net    * net = dev_net(dev);
2132
2133	/* IP on this device is disabled. */
2134
2135	if (!in_dev)
2136		goto out;
2137
 2138	/* Check for the most weird martians, which cannot be detected
2139	   by fib_lookup.
2140	 */
2141
2142	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2143	    ipv4_is_loopback(saddr))
2144		goto martian_source;
2145
2146	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2147		goto brd_input;
2148
2149	/* Accept zero addresses only to limited broadcast;
 2150	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2151	 */
2152	if (ipv4_is_zeronet(saddr))
2153		goto martian_source;
2154
2155	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2156		goto martian_destination;
2157
2158	/*
 2159	 *	Now we are ready to route the packet.
2160	 */
2161	fl4.flowi4_oif = 0;
2162	fl4.flowi4_iif = dev->ifindex;
2163	fl4.flowi4_mark = skb->mark;
2164	fl4.flowi4_tos = tos;
2165	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2166	fl4.daddr = daddr;
2167	fl4.saddr = saddr;
2168	err = fib_lookup(net, &fl4, &res);
2169	if (err != 0) {
2170		if (!IN_DEV_FORWARD(in_dev))
2171			goto e_hostunreach;
2172		goto no_route;
2173	}
2174
2175	RT_CACHE_STAT_INC(in_slow_tot);
2176
2177	if (res.type == RTN_BROADCAST)
2178		goto brd_input;
2179
2180	if (res.type == RTN_LOCAL) {
2181		err = fib_validate_source(skb, saddr, daddr, tos,
2182					  net->loopback_dev->ifindex,
2183					  dev, &spec_dst, &itag);
2184		if (err < 0)
2185			goto martian_source_keep_err;
2186		if (err)
2187			flags |= RTCF_DIRECTSRC;
2188		spec_dst = daddr;
2189		goto local_input;
2190	}
2191
2192	if (!IN_DEV_FORWARD(in_dev))
2193		goto e_hostunreach;
2194	if (res.type != RTN_UNICAST)
2195		goto martian_destination;
2196
2197	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2198out:	return err;
2199
2200brd_input:
2201	if (skb->protocol != htons(ETH_P_IP))
2202		goto e_inval;
2203
2204	if (ipv4_is_zeronet(saddr))
2205		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2206	else {
2207		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2208					  &itag);
2209		if (err < 0)
2210			goto martian_source_keep_err;
2211		if (err)
2212			flags |= RTCF_DIRECTSRC;
2213	}
2214	flags |= RTCF_BROADCAST;
2215	res.type = RTN_BROADCAST;
2216	RT_CACHE_STAT_INC(in_brd);
2217
2218local_input:
2219	rth = rt_dst_alloc(net->loopback_dev,
2220			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2221	if (!rth)
2222		goto e_nobufs;
2223
2224	rth->dst.input = ip_local_deliver;
2225	rth->dst.output = ip_rt_bug;
2226#ifdef CONFIG_IP_ROUTE_CLASSID
2227	rth->dst.tclassid = itag;
2228#endif
2229
2230	rth->rt_key_dst	= daddr;
2231	rth->rt_key_src	= saddr;
2232	rth->rt_genid = rt_genid(net);
2233	rth->rt_flags 	= flags|RTCF_LOCAL;
2234	rth->rt_type	= res.type;
2235	rth->rt_key_tos	= tos;
2236	rth->rt_dst	= daddr;
2237	rth->rt_src	= saddr;
2238#ifdef CONFIG_IP_ROUTE_CLASSID
2239	rth->dst.tclassid = itag;
2240#endif
2241	rth->rt_route_iif = dev->ifindex;
2242	rth->rt_iif	= dev->ifindex;
2243	rth->rt_oif	= 0;
2244	rth->rt_mark    = skb->mark;
2245	rth->rt_gateway	= daddr;
2246	rth->rt_spec_dst = spec_dst;
2247	rth->rt_peer_genid = 0;
2248	rth->peer = NULL;
2249	rth->fi = NULL;
2250	if (res.type == RTN_UNREACHABLE) {
2251		rth->dst.input= ip_error;
2252		rth->dst.error= -err;
2253		rth->rt_flags 	&= ~RTCF_LOCAL;
2254	}
2255	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2256	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2257	err = 0;
2258	if (IS_ERR(rth))
2259		err = PTR_ERR(rth);
2260	goto out;
2261
2262no_route:
2263	RT_CACHE_STAT_INC(in_no_route);
2264	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2265	res.type = RTN_UNREACHABLE;
2266	if (err == -ESRCH)
2267		err = -ENETUNREACH;
2268	goto local_input;
2269
2270	/*
2271	 *	Do not cache martian addresses: they should be logged (RFC1812)
2272	 */
2273martian_destination:
2274	RT_CACHE_STAT_INC(in_martian_dst);
2275#ifdef CONFIG_IP_ROUTE_VERBOSE
2276	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2277		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2278			&daddr, &saddr, dev->name);
2279#endif
2280
2281e_hostunreach:
2282	err = -EHOSTUNREACH;
2283	goto out;
2284
2285e_inval:
2286	err = -EINVAL;
2287	goto out;
2288
2289e_nobufs:
2290	err = -ENOBUFS;
2291	goto out;
2292
2293martian_source:
2294	err = -EINVAL;
2295martian_source_keep_err:
2296	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2297	goto out;
2298}
2299
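/*
 * ip_route_input_common() is the input fast path: it probes the route cache
 * hash under RCU and only falls back to ip_route_input_slow() (or, for
 * multicast destinations, ip_route_input_mc()) on a miss.  With @noref the
 * cached dst is attached to the skb without taking a reference, so it is
 * only valid inside the current RCU read-side section.
 */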
2300int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2301			   u8 tos, struct net_device *dev, bool noref)
2302{
2303	struct rtable * rth;
2304	unsigned	hash;
2305	int iif = dev->ifindex;
2306	struct net *net;
2307	int res;
2308
2309	net = dev_net(dev);
2310
2311	rcu_read_lock();
2312
2313	if (!rt_caching(net))
2314		goto skip_cache;
2315
2316	tos &= IPTOS_RT_MASK;
2317	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2318
2319	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2320	     rth = rcu_dereference(rth->dst.rt_next)) {
2321		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2322		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2323		     (rth->rt_route_iif ^ iif) |
2324		     (rth->rt_key_tos ^ tos)) == 0 &&
2325		    rth->rt_mark == skb->mark &&
2326		    net_eq(dev_net(rth->dst.dev), net) &&
2327		    !rt_is_expired(rth)) {
2328			if (noref) {
2329				dst_use_noref(&rth->dst, jiffies);
2330				skb_dst_set_noref(skb, &rth->dst);
2331			} else {
2332				dst_use(&rth->dst, jiffies);
2333				skb_dst_set(skb, &rth->dst);
2334			}
2335			RT_CACHE_STAT_INC(in_hit);
2336			rcu_read_unlock();
2337			return 0;
2338		}
2339		RT_CACHE_STAT_INC(in_hlist_search);
2340	}
2341
2342skip_cache:
2343	/* Multicast recognition logic is moved from the route cache to here.
2344	   The problem was that too many Ethernet cards have broken/missing
2345	   hardware multicast filters :-( As a result, a host on a multicast
2346	   network acquires a lot of useless route cache entries, e.g. from
2347	   SDR messages from all over the world. Now we try to get rid of them.
2348	   Really, provided the software IP multicast filter is organized
2349	   reasonably (at least, hashed), it does not result in a slowdown
2350	   compared with route cache reject entries.
2351	   Note that multicast routers are not affected, because a
2352	   route cache entry is created eventually.
2353	 */
2354	if (ipv4_is_multicast(daddr)) {
2355		struct in_device *in_dev = __in_dev_get_rcu(dev);
2356
2357		if (in_dev) {
2358			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2359						  ip_hdr(skb)->protocol);
2360			if (our
2361#ifdef CONFIG_IP_MROUTE
2362				||
2363			    (!ipv4_is_local_multicast(daddr) &&
2364			     IN_DEV_MFORWARD(in_dev))
2365#endif
2366			   ) {
2367				int res = ip_route_input_mc(skb, daddr, saddr,
2368							    tos, dev, our);
2369				rcu_read_unlock();
2370				return res;
2371			}
2372		}
2373		rcu_read_unlock();
2374		return -EINVAL;
2375	}
2376	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2377	rcu_read_unlock();
2378	return res;
2379}
2380EXPORT_SYMBOL(ip_route_input_common);
2381
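/*
 * __mkroute_output() allocates and fills an output rtable for a FIB lookup
 * result: it classifies the destination (broadcast/multicast/local), wires
 * up the input/output handlers (ip_output, ip_local_deliver, ip_mc_output,
 * ip_mr_input) and lets rt_set_nexthop() copy metrics and next-hop data
 * from the fib_info.
 */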
2382/* called with rcu_read_lock() */
2383static struct rtable *__mkroute_output(const struct fib_result *res,
2384				       const struct flowi4 *fl4,
2385				       __be32 orig_daddr, __be32 orig_saddr,
2386				       int orig_oif, struct net_device *dev_out,
2387				       unsigned int flags)
2388{
2389	struct fib_info *fi = res->fi;
2390	u32 tos = RT_FL_TOS(fl4);
2391	struct in_device *in_dev;
2392	u16 type = res->type;
2393	struct rtable *rth;
2394
2395	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2396		return ERR_PTR(-EINVAL);
2397
2398	if (ipv4_is_lbcast(fl4->daddr))
2399		type = RTN_BROADCAST;
2400	else if (ipv4_is_multicast(fl4->daddr))
2401		type = RTN_MULTICAST;
2402	else if (ipv4_is_zeronet(fl4->daddr))
2403		return ERR_PTR(-EINVAL);
2404
2405	if (dev_out->flags & IFF_LOOPBACK)
2406		flags |= RTCF_LOCAL;
2407
2408	in_dev = __in_dev_get_rcu(dev_out);
2409	if (!in_dev)
2410		return ERR_PTR(-EINVAL);
2411
2412	if (type == RTN_BROADCAST) {
2413		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2414		fi = NULL;
2415	} else if (type == RTN_MULTICAST) {
2416		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2417		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2418				     fl4->flowi4_proto))
2419			flags &= ~RTCF_LOCAL;
2420		/* If the multicast route does not exist, use the
2421		 * default one, but do not gateway in this case.
2422		 * Yes, it is a hack.
2423		 */
2424		if (fi && res->prefixlen < 4)
2425			fi = NULL;
2426	}
2427
2428	rth = rt_dst_alloc(dev_out,
2429			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2430			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2431	if (!rth)
2432		return ERR_PTR(-ENOBUFS);
2433
2434	rth->dst.output = ip_output;
2435
2436	rth->rt_key_dst	= orig_daddr;
2437	rth->rt_key_src	= orig_saddr;
2438	rth->rt_genid = rt_genid(dev_net(dev_out));
2439	rth->rt_flags	= flags;
2440	rth->rt_type	= type;
2441	rth->rt_key_tos	= tos;
2442	rth->rt_dst	= fl4->daddr;
2443	rth->rt_src	= fl4->saddr;
2444	rth->rt_route_iif = 0;
2445	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2446	rth->rt_oif	= orig_oif;
2447	rth->rt_mark    = fl4->flowi4_mark;
2448	rth->rt_gateway = fl4->daddr;
2449	rth->rt_spec_dst= fl4->saddr;
2450	rth->rt_peer_genid = 0;
2451	rth->peer = NULL;
2452	rth->fi = NULL;
2453
2454	RT_CACHE_STAT_INC(out_slow_tot);
2455
2456	if (flags & RTCF_LOCAL) {
2457		rth->dst.input = ip_local_deliver;
2458		rth->rt_spec_dst = fl4->daddr;
2459	}
2460	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2461		rth->rt_spec_dst = fl4->saddr;
2462		if (flags & RTCF_LOCAL &&
2463		    !(dev_out->flags & IFF_LOOPBACK)) {
2464			rth->dst.output = ip_mc_output;
2465			RT_CACHE_STAT_INC(out_slow_mc);
2466		}
2467#ifdef CONFIG_IP_MROUTE
2468		if (type == RTN_MULTICAST) {
2469			if (IN_DEV_MFORWARD(in_dev) &&
2470			    !ipv4_is_local_multicast(fl4->daddr)) {
2471				rth->dst.input = ip_mr_input;
2472				rth->dst.output = ip_mc_output;
2473			}
2474		}
2475#endif
2476	}
2477
2478	rt_set_nexthop(rth, fl4, res, fi, type, 0);
2479
2480	return rth;
2481}
2482
2483/*
2484 * Major route resolver routine.
2485 * called with rcu_read_lock();
2486 */
2487
2488static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
2489{
2490	struct net_device *dev_out = NULL;
2491	u32 tos	= RT_FL_TOS(fl4);
2492	unsigned int flags = 0;
2493	struct fib_result res;
2494	struct rtable *rth;
2495	__be32 orig_daddr;
2496	__be32 orig_saddr;
2497	int orig_oif;
2498
2499	res.fi		= NULL;
2500#ifdef CONFIG_IP_MULTIPLE_TABLES
2501	res.r		= NULL;
2502#endif
2503
2504	orig_daddr = fl4->daddr;
2505	orig_saddr = fl4->saddr;
2506	orig_oif = fl4->flowi4_oif;
2507
2508	fl4->flowi4_iif = net->loopback_dev->ifindex;
2509	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2510	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2511			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2512
2513	rcu_read_lock();
2514	if (fl4->saddr) {
2515		rth = ERR_PTR(-EINVAL);
2516		if (ipv4_is_multicast(fl4->saddr) ||
2517		    ipv4_is_lbcast(fl4->saddr) ||
2518		    ipv4_is_zeronet(fl4->saddr))
2519			goto out;
2520
2521		/* I removed the check for oif == dev_out->oif here.
2522		   It was wrong for two reasons:
2523		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2524		      is assigned to multiple interfaces.
2525		   2. Moreover, we are allowed to send packets with the saddr
2526		      of another iface. --ANK
2527		 */
2528
2529		if (fl4->flowi4_oif == 0 &&
2530		    (ipv4_is_multicast(fl4->daddr) ||
2531		     ipv4_is_lbcast(fl4->daddr))) {
2532			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2533			dev_out = __ip_dev_find(net, fl4->saddr, false);
2534			if (dev_out == NULL)
2535				goto out;
2536
2537			/* Special hack: the user can direct multicasts
2538			   and limited broadcast via the necessary interface
2539			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2540			   This hack is not just for fun, it allows
2541			   vic, vat and friends to work.
2542			   They bind the socket to loopback, set the ttl to zero
2543			   and expect that it will work.
2544			   From the viewpoint of the routing cache they are broken,
2545			   because we are not allowed to build a multicast path
2546			   with a loopback source addr (look, the routing cache
2547			   cannot know that the ttl is zero, so the packet
2548			   will not leave this host and the route is valid).
2549			   Luckily, this hack is a good workaround.
2550			 */
2551
2552			fl4->flowi4_oif = dev_out->ifindex;
2553			goto make_route;
2554		}
2555
2556		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2557			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2558			if (!__ip_dev_find(net, fl4->saddr, false))
2559				goto out;
2560		}
2561	}
2562
2563
2564	if (fl4->flowi4_oif) {
2565		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2566		rth = ERR_PTR(-ENODEV);
2567		if (dev_out == NULL)
2568			goto out;
2569
2570		/* RACE: Check return value of inet_select_addr instead. */
2571		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2572			rth = ERR_PTR(-ENETUNREACH);
2573			goto out;
2574		}
2575		if (ipv4_is_local_multicast(fl4->daddr) ||
2576		    ipv4_is_lbcast(fl4->daddr)) {
2577			if (!fl4->saddr)
2578				fl4->saddr = inet_select_addr(dev_out, 0,
2579							      RT_SCOPE_LINK);
2580			goto make_route;
2581		}
2582		if (fl4->saddr) {
2583			if (ipv4_is_multicast(fl4->daddr))
2584				fl4->saddr = inet_select_addr(dev_out, 0,
2585							      fl4->flowi4_scope);
2586			else if (!fl4->daddr)
2587				fl4->saddr = inet_select_addr(dev_out, 0,
2588							      RT_SCOPE_HOST);
2589		}
2590	}
2591
2592	if (!fl4->daddr) {
2593		fl4->daddr = fl4->saddr;
2594		if (!fl4->daddr)
2595			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2596		dev_out = net->loopback_dev;
2597		fl4->flowi4_oif = net->loopback_dev->ifindex;
2598		res.type = RTN_LOCAL;
2599		flags |= RTCF_LOCAL;
2600		goto make_route;
2601	}
2602
2603	if (fib_lookup(net, fl4, &res)) {
2604		res.fi = NULL;
2605		if (fl4->flowi4_oif) {
2606			/* Apparently, the routing tables are wrong. Assume
2607			   that the destination is on-link.
2608
2609			   WHY? DW.
2610			   Because we are allowed to send to an iface
2611			   even if it has NO routes and NO assigned
2612			   addresses. When oif is specified, the routing
2613			   tables are looked up with only one purpose:
2614			   to catch whether the destination is gatewayed, rather than
2615			   direct. Moreover, if MSG_DONTROUTE is set,
2616			   we send the packet, ignoring both the routing tables
2617			   and the ifaddr state. --ANK
2618
2619
2620			   We could do this even if oif is unknown,
2621			   as IPv6 likely does, but we do not.
2622			 */
2623
2624			if (fl4->saddr == 0)
2625				fl4->saddr = inet_select_addr(dev_out, 0,
2626							      RT_SCOPE_LINK);
2627			res.type = RTN_UNICAST;
2628			goto make_route;
2629		}
2630		rth = ERR_PTR(-ENETUNREACH);
2631		goto out;
2632	}
2633
2634	if (res.type == RTN_LOCAL) {
2635		if (!fl4->saddr) {
2636			if (res.fi->fib_prefsrc)
2637				fl4->saddr = res.fi->fib_prefsrc;
2638			else
2639				fl4->saddr = fl4->daddr;
2640		}
2641		dev_out = net->loopback_dev;
2642		fl4->flowi4_oif = dev_out->ifindex;
2643		res.fi = NULL;
2644		flags |= RTCF_LOCAL;
2645		goto make_route;
2646	}
2647
2648#ifdef CONFIG_IP_ROUTE_MULTIPATH
2649	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2650		fib_select_multipath(&res);
2651	else
2652#endif
2653	if (!res.prefixlen &&
2654	    res.table->tb_num_default > 1 &&
2655	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2656		fib_select_default(&res);
2657
2658	if (!fl4->saddr)
2659		fl4->saddr = FIB_RES_PREFSRC(net, res);
2660
2661	dev_out = FIB_RES_DEV(res);
2662	fl4->flowi4_oif = dev_out->ifindex;
2663
2664
2665make_route:
2666	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2667			       dev_out, flags);
2668	if (!IS_ERR(rth)) {
2669		unsigned int hash;
2670
2671		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2672			       rt_genid(dev_net(dev_out)));
2673		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2674	}
2675
2676out:
2677	rcu_read_unlock();
2678	return rth;
2679}
2680
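/*
 * __ip_route_output_key() is the output fast path: it scans the cache chain
 * selected by rt_hash() for an entry matching daddr/saddr/oif/mark/tos and
 * falls back to ip_route_output_slow() when caching is disabled or nothing
 * matches.
 */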
2681struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2682{
2683	struct rtable *rth;
2684	unsigned int hash;
2685
2686	if (!rt_caching(net))
2687		goto slow_output;
2688
2689	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2690
2691	rcu_read_lock_bh();
2692	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2693		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2694		if (rth->rt_key_dst == flp4->daddr &&
2695		    rth->rt_key_src == flp4->saddr &&
2696		    rt_is_output_route(rth) &&
2697		    rth->rt_oif == flp4->flowi4_oif &&
2698		    rth->rt_mark == flp4->flowi4_mark &&
2699		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2700			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2701		    net_eq(dev_net(rth->dst.dev), net) &&
2702		    !rt_is_expired(rth)) {
2703			dst_use(&rth->dst, jiffies);
2704			RT_CACHE_STAT_INC(out_hit);
2705			rcu_read_unlock_bh();
2706			if (!flp4->saddr)
2707				flp4->saddr = rth->rt_src;
2708			if (!flp4->daddr)
2709				flp4->daddr = rth->rt_dst;
2710			return rth;
2711		}
2712		RT_CACHE_STAT_INC(out_hlist_search);
2713	}
2714	rcu_read_unlock_bh();
2715
2716slow_output:
2717	return ip_route_output_slow(net, flp4);
2718}
2719EXPORT_SYMBOL_GPL(__ip_route_output_key);
2720
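/*
 * The blackhole dst_ops below are deliberately inert: a blackhole route
 * copies the keys and metrics of an existing route but discards all traffic
 * (see ipv4_blackhole_route() further down).  It is presumably handed out
 * when a usable dst is required even though the real route cannot be used
 * yet, e.g. by the IPsec code while a bundle is still being resolved.
 */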
2721static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2722{
2723	return NULL;
2724}
2725
2726static unsigned int ipv4_blackhole_default_mtu(const struct dst_entry *dst)
2727{
2728	return 0;
2729}
2730
2731static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2732{
2733}
2734
2735static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2736					  unsigned long old)
2737{
2738	return NULL;
2739}
2740
2741static struct dst_ops ipv4_dst_blackhole_ops = {
2742	.family			=	AF_INET,
2743	.protocol		=	cpu_to_be16(ETH_P_IP),
2744	.destroy		=	ipv4_dst_destroy,
2745	.check			=	ipv4_blackhole_dst_check,
2746	.default_mtu		=	ipv4_blackhole_default_mtu,
2747	.default_advmss		=	ipv4_default_advmss,
2748	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2749	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2750	.neigh_lookup		=	ipv4_neigh_lookup,
2751};
2752
2753struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2754{
2755	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2756	struct rtable *ort = (struct rtable *) dst_orig;
2757
2758	if (rt) {
2759		struct dst_entry *new = &rt->dst;
2760
2761		new->__use = 1;
2762		new->input = dst_discard;
2763		new->output = dst_discard;
2764		dst_copy_metrics(new, &ort->dst);
2765
2766		new->dev = ort->dst.dev;
2767		if (new->dev)
2768			dev_hold(new->dev);
2769
2770		rt->rt_key_dst = ort->rt_key_dst;
2771		rt->rt_key_src = ort->rt_key_src;
2772		rt->rt_key_tos = ort->rt_key_tos;
2773		rt->rt_route_iif = ort->rt_route_iif;
2774		rt->rt_iif = ort->rt_iif;
2775		rt->rt_oif = ort->rt_oif;
2776		rt->rt_mark = ort->rt_mark;
2777
2778		rt->rt_genid = rt_genid(net);
2779		rt->rt_flags = ort->rt_flags;
2780		rt->rt_type = ort->rt_type;
2781		rt->rt_dst = ort->rt_dst;
2782		rt->rt_src = ort->rt_src;
2783		rt->rt_gateway = ort->rt_gateway;
2784		rt->rt_spec_dst = ort->rt_spec_dst;
2785		rt->peer = ort->peer;
2786		if (rt->peer)
2787			atomic_inc(&rt->peer->refcnt);
2788		rt->fi = ort->fi;
2789		if (rt->fi)
2790			atomic_inc(&rt->fi->fib_clntref);
2791
2792		dst_free(new);
2793	}
2794
2795	dst_release(dst_orig);
2796
2797	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2798}
2799
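/*
 * ip_route_output_flow() is the resolver used by the transport protocols:
 * it looks the flow up via __ip_route_output_key() and, when a protocol is
 * set in the flow, additionally runs the result through xfrm_lookup() so
 * that IPsec policy can transform or veto the route.
 */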
2800struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2801				    struct sock *sk)
2802{
2803	struct rtable *rt = __ip_route_output_key(net, flp4);
2804
2805	if (IS_ERR(rt))
2806		return rt;
2807
2808	if (flp4->flowi4_proto)
2809		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2810						   flowi4_to_flowi(flp4),
2811						   sk, 0);
2812
2813	return rt;
2814}
2815EXPORT_SYMBOL_GPL(ip_route_output_flow);
2816
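/*
 * rt_fill_info() serialises one cached route into an RTM_NEWROUTE netlink
 * message: route type, addresses, device, metrics, firewall mark and the
 * cache information (id, timestamps, PMTU expiry) taken from the inet_peer.
 */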
2817static int rt_fill_info(struct net *net,
2818			struct sk_buff *skb, u32 pid, u32 seq, int event,
2819			int nowait, unsigned int flags)
2820{
2821	struct rtable *rt = skb_rtable(skb);
2822	struct rtmsg *r;
2823	struct nlmsghdr *nlh;
2824	long expires = 0;
2825	const struct inet_peer *peer = rt->peer;
2826	u32 id = 0, ts = 0, tsage = 0, error;
2827
2828	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2829	if (nlh == NULL)
2830		return -EMSGSIZE;
2831
2832	r = nlmsg_data(nlh);
2833	r->rtm_family	 = AF_INET;
2834	r->rtm_dst_len	= 32;
2835	r->rtm_src_len	= 0;
2836	r->rtm_tos	= rt->rt_key_tos;
2837	r->rtm_table	= RT_TABLE_MAIN;
2838	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2839	r->rtm_type	= rt->rt_type;
2840	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2841	r->rtm_protocol = RTPROT_UNSPEC;
2842	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2843	if (rt->rt_flags & RTCF_NOTIFY)
2844		r->rtm_flags |= RTM_F_NOTIFY;
2845
2846	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2847
2848	if (rt->rt_key_src) {
2849		r->rtm_src_len = 32;
2850		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
2851	}
2852	if (rt->dst.dev)
2853		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2854#ifdef CONFIG_IP_ROUTE_CLASSID
2855	if (rt->dst.tclassid)
2856		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2857#endif
2858	if (rt_is_input_route(rt))
2859		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2860	else if (rt->rt_src != rt->rt_key_src)
2861		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2862
2863	if (rt->rt_dst != rt->rt_gateway)
2864		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2865
2866	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2867		goto nla_put_failure;
2868
2869	if (rt->rt_mark)
2870		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);
2871
2872	error = rt->dst.error;
2873	if (peer) {
2874		inet_peer_refcheck(rt->peer);
2875		id = atomic_read(&peer->ip_id_count) & 0xffff;
2876		if (peer->tcp_ts_stamp) {
2877			ts = peer->tcp_ts;
2878			tsage = get_seconds() - peer->tcp_ts_stamp;
2879		}
2880		expires = ACCESS_ONCE(peer->pmtu_expires);
2881		if (expires)
2882			expires -= jiffies;
2883	}
2884
2885	if (rt_is_input_route(rt)) {
2886#ifdef CONFIG_IP_MROUTE
2887		__be32 dst = rt->rt_dst;
2888
2889		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2890		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2891			int err = ipmr_get_route(net, skb,
2892						 rt->rt_src, rt->rt_dst,
2893						 r, nowait);
2894			if (err <= 0) {
2895				if (!nowait) {
2896					if (err == 0)
2897						return 0;
2898					goto nla_put_failure;
2899				} else {
2900					if (err == -EMSGSIZE)
2901						goto nla_put_failure;
2902					error = err;
2903				}
2904			}
2905		} else
2906#endif
2907			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
2908	}
2909
2910	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2911			       expires, error) < 0)
2912		goto nla_put_failure;
2913
2914	return nlmsg_end(skb, nlh);
2915
2916nla_put_failure:
2917	nlmsg_cancel(skb, nlh);
2918	return -EMSGSIZE;
2919}
2920
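/*
 * inet_rtm_getroute() implements RTM_GETROUTE: it builds a dummy skb from
 * the request attributes, resolves it through ip_route_input() (when an
 * input interface is given) or ip_route_output_key(), and replies to the
 * requester with the route encoded by rt_fill_info().
 */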
2921static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2922{
2923	struct net *net = sock_net(in_skb->sk);
2924	struct rtmsg *rtm;
2925	struct nlattr *tb[RTA_MAX+1];
2926	struct rtable *rt = NULL;
2927	__be32 dst = 0;
2928	__be32 src = 0;
2929	u32 iif;
2930	int err;
2931	int mark;
2932	struct sk_buff *skb;
2933
2934	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2935	if (err < 0)
2936		goto errout;
2937
2938	rtm = nlmsg_data(nlh);
2939
2940	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2941	if (skb == NULL) {
2942		err = -ENOBUFS;
2943		goto errout;
2944	}
2945
2946	/* Reserve room for dummy headers; this skb can pass
2947	   through a good chunk of the routing engine.
2948	 */
2949	skb_reset_mac_header(skb);
2950	skb_reset_network_header(skb);
2951
2952	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2953	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2954	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2955
2956	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2957	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2958	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2959	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2960
2961	if (iif) {
2962		struct net_device *dev;
2963
2964		dev = __dev_get_by_index(net, iif);
2965		if (dev == NULL) {
2966			err = -ENODEV;
2967			goto errout_free;
2968		}
2969
2970		skb->protocol	= htons(ETH_P_IP);
2971		skb->dev	= dev;
2972		skb->mark	= mark;
2973		local_bh_disable();
2974		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2975		local_bh_enable();
2976
2977		rt = skb_rtable(skb);
2978		if (err == 0 && rt->dst.error)
2979			err = -rt->dst.error;
2980	} else {
2981		struct flowi4 fl4 = {
2982			.daddr = dst,
2983			.saddr = src,
2984			.flowi4_tos = rtm->rtm_tos,
2985			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2986			.flowi4_mark = mark,
2987		};
2988		rt = ip_route_output_key(net, &fl4);
2989
2990		err = 0;
2991		if (IS_ERR(rt))
2992			err = PTR_ERR(rt);
2993	}
2994
2995	if (err)
2996		goto errout_free;
2997
2998	skb_dst_set(skb, &rt->dst);
2999	if (rtm->rtm_flags & RTM_F_NOTIFY)
3000		rt->rt_flags |= RTCF_NOTIFY;
3001
3002	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3003			   RTM_NEWROUTE, 0, 0);
3004	if (err <= 0)
3005		goto errout_free;
3006
3007	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3008errout:
3009	return err;
3010
3011errout_free:
3012	kfree_skb(skb);
3013	goto errout;
3014}
3015
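/*
 * ip_rt_dump() walks the whole route cache hash for a netlink dump request,
 * emitting one RTM_NEWROUTE message per non-expired entry and saving the
 * current bucket and index in cb->args so that the dump can be resumed.
 */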
3016int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3017{
3018	struct rtable *rt;
3019	int h, s_h;
3020	int idx, s_idx;
3021	struct net *net;
3022
3023	net = sock_net(skb->sk);
3024
3025	s_h = cb->args[0];
3026	if (s_h < 0)
3027		s_h = 0;
3028	s_idx = idx = cb->args[1];
3029	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3030		if (!rt_hash_table[h].chain)
3031			continue;
3032		rcu_read_lock_bh();
3033		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3034		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3035			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3036				continue;
3037			if (rt_is_expired(rt))
3038				continue;
3039			skb_dst_set_noref(skb, &rt->dst);
3040			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3041					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3042					 1, NLM_F_MULTI) <= 0) {
3043				skb_dst_drop(skb);
3044				rcu_read_unlock_bh();
3045				goto done;
3046			}
3047			skb_dst_drop(skb);
3048		}
3049		rcu_read_unlock_bh();
3050	}
3051
3052done:
3053	cb->args[0] = h;
3054	cb->args[1] = idx;
3055	return skb->len;
3056}
3057
3058void ip_rt_multicast_event(struct in_device *in_dev)
3059{
3060	rt_cache_flush(dev_net(in_dev->dev), 0);
3061}
3062
3063#ifdef CONFIG_SYSCTL
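/*
 * Writing to /proc/sys/net/ipv4/route/flush lands here: the written value is
 * parsed as a flush delay and passed to rt_cache_flush() for the namespace
 * stored in the table's ->extra1.  Reads are rejected with -EINVAL.
 */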
3064static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3065					void __user *buffer,
3066					size_t *lenp, loff_t *ppos)
3067{
3068	if (write) {
3069		int flush_delay;
3070		ctl_table ctl;
3071		struct net *net;
3072
3073		memcpy(&ctl, __ctl, sizeof(ctl));
3074		ctl.data = &flush_delay;
3075		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3076
3077		net = (struct net *)__ctl->extra1;
3078		rt_cache_flush(net, flush_delay);
3079		return 0;
3080	}
3081
3082	return -EINVAL;
3083}
3084
3085static ctl_table ipv4_route_table[] = {
3086	{
3087		.procname	= "gc_thresh",
3088		.data		= &ipv4_dst_ops.gc_thresh,
3089		.maxlen		= sizeof(int),
3090		.mode		= 0644,
3091		.proc_handler	= proc_dointvec,
3092	},
3093	{
3094		.procname	= "max_size",
3095		.data		= &ip_rt_max_size,
3096		.maxlen		= sizeof(int),
3097		.mode		= 0644,
3098		.proc_handler	= proc_dointvec,
3099	},
3100	{
3101		/*  Deprecated. Use gc_min_interval_ms */
3102
3103		.procname	= "gc_min_interval",
3104		.data		= &ip_rt_gc_min_interval,
3105		.maxlen		= sizeof(int),
3106		.mode		= 0644,
3107		.proc_handler	= proc_dointvec_jiffies,
3108	},
3109	{
3110		.procname	= "gc_min_interval_ms",
3111		.data		= &ip_rt_gc_min_interval,
3112		.maxlen		= sizeof(int),
3113		.mode		= 0644,
3114		.proc_handler	= proc_dointvec_ms_jiffies,
3115	},
3116	{
3117		.procname	= "gc_timeout",
3118		.data		= &ip_rt_gc_timeout,
3119		.maxlen		= sizeof(int),
3120		.mode		= 0644,
3121		.proc_handler	= proc_dointvec_jiffies,
3122	},
3123	{
3124		.procname	= "gc_interval",
3125		.data		= &ip_rt_gc_interval,
3126		.maxlen		= sizeof(int),
3127		.mode		= 0644,
3128		.proc_handler	= proc_dointvec_jiffies,
3129	},
3130	{
3131		.procname	= "redirect_load",
3132		.data		= &ip_rt_redirect_load,
3133		.maxlen		= sizeof(int),
3134		.mode		= 0644,
3135		.proc_handler	= proc_dointvec,
3136	},
3137	{
3138		.procname	= "redirect_number",
3139		.data		= &ip_rt_redirect_number,
3140		.maxlen		= sizeof(int),
3141		.mode		= 0644,
3142		.proc_handler	= proc_dointvec,
3143	},
3144	{
3145		.procname	= "redirect_silence",
3146		.data		= &ip_rt_redirect_silence,
3147		.maxlen		= sizeof(int),
3148		.mode		= 0644,
3149		.proc_handler	= proc_dointvec,
3150	},
3151	{
3152		.procname	= "error_cost",
3153		.data		= &ip_rt_error_cost,
3154		.maxlen		= sizeof(int),
3155		.mode		= 0644,
3156		.proc_handler	= proc_dointvec,
3157	},
3158	{
3159		.procname	= "error_burst",
3160		.data		= &ip_rt_error_burst,
3161		.maxlen		= sizeof(int),
3162		.mode		= 0644,
3163		.proc_handler	= proc_dointvec,
3164	},
3165	{
3166		.procname	= "gc_elasticity",
3167		.data		= &ip_rt_gc_elasticity,
3168		.maxlen		= sizeof(int),
3169		.mode		= 0644,
3170		.proc_handler	= proc_dointvec,
3171	},
3172	{
3173		.procname	= "mtu_expires",
3174		.data		= &ip_rt_mtu_expires,
3175		.maxlen		= sizeof(int),
3176		.mode		= 0644,
3177		.proc_handler	= proc_dointvec_jiffies,
3178	},
3179	{
3180		.procname	= "min_pmtu",
3181		.data		= &ip_rt_min_pmtu,
3182		.maxlen		= sizeof(int),
3183		.mode		= 0644,
3184		.proc_handler	= proc_dointvec,
3185	},
3186	{
3187		.procname	= "min_adv_mss",
3188		.data		= &ip_rt_min_advmss,
3189		.maxlen		= sizeof(int),
3190		.mode		= 0644,
3191		.proc_handler	= proc_dointvec,
3192	},
3193	{ }
3194};
3195
3196static struct ctl_table empty[1];
3197
3198static struct ctl_table ipv4_skeleton[] =
3199{
3200	{ .procname = "route", 
3201	  .mode = 0555, .child = ipv4_route_table},
3202	{ .procname = "neigh", 
3203	  .mode = 0555, .child = empty},
3204	{ }
3205};
3206
3207static __net_initdata struct ctl_path ipv4_path[] = {
3208	{ .procname = "net", },
3209	{ .procname = "ipv4", },
3210	{ },
3211};
3212
3213static struct ctl_table ipv4_route_flush_table[] = {
3214	{
3215		.procname	= "flush",
3216		.maxlen		= sizeof(int),
3217		.mode		= 0200,
3218		.proc_handler	= ipv4_sysctl_rtcache_flush,
3219	},
3220	{ },
3221};
3222
3223static __net_initdata struct ctl_path ipv4_route_path[] = {
3224	{ .procname = "net", },
3225	{ .procname = "ipv4", },
3226	{ .procname = "route", },
3227	{ },
3228};
3229
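/*
 * Per-namespace sysctl setup: every non-initial namespace gets its own copy
 * of ipv4_route_flush_table so that the flush handler can find the right
 * struct net through tbl[0].extra1.
 */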
3230static __net_init int sysctl_route_net_init(struct net *net)
3231{
3232	struct ctl_table *tbl;
3233
3234	tbl = ipv4_route_flush_table;
3235	if (!net_eq(net, &init_net)) {
3236		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3237		if (tbl == NULL)
3238			goto err_dup;
3239	}
3240	tbl[0].extra1 = net;
3241
3242	net->ipv4.route_hdr =
3243		register_net_sysctl_table(net, ipv4_route_path, tbl);
3244	if (net->ipv4.route_hdr == NULL)
3245		goto err_reg;
3246	return 0;
3247
3248err_reg:
3249	if (tbl != ipv4_route_flush_table)
3250		kfree(tbl);
3251err_dup:
3252	return -ENOMEM;
3253}
3254
3255static __net_exit void sysctl_route_net_exit(struct net *net)
3256{
3257	struct ctl_table *tbl;
3258
3259	tbl = net->ipv4.route_hdr->ctl_table_arg;
3260	unregister_net_sysctl_table(net->ipv4.route_hdr);
3261	BUG_ON(tbl == ipv4_route_flush_table);
3262	kfree(tbl);
3263}
3264
3265static __net_initdata struct pernet_operations sysctl_route_ops = {
3266	.init = sysctl_route_net_init,
3267	.exit = sysctl_route_net_exit,
3268};
3269#endif
3270
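/*
 * Every network namespace starts with a random rt_genid; rt_is_expired()
 * compares each cache entry's genid against this value, so bumping it later
 * invalidates all existing entries at once.
 */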
3271static __net_init int rt_genid_init(struct net *net)
3272{
3273	get_random_bytes(&net->ipv4.rt_genid,
3274			 sizeof(net->ipv4.rt_genid));
3275	get_random_bytes(&net->ipv4.dev_addr_genid,
3276			 sizeof(net->ipv4.dev_addr_genid));
3277	return 0;
3278}
3279
3280static __net_initdata struct pernet_operations rt_genid_ops = {
3281	.init = rt_genid_init,
3282};
3283
3284
3285#ifdef CONFIG_IP_ROUTE_CLASSID
3286struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3287#endif /* CONFIG_IP_ROUTE_CLASSID */
3288
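/*
 * "rhash_entries=N" on the kernel command line overrides the automatically
 * chosen size of the route cache hash table; the value is consumed by the
 * alloc_large_system_hash() call in ip_rt_init() below.
 * Example: booting with rhash_entries=65536.
 */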
3289static __initdata unsigned long rhash_entries;
3290static int __init set_rhash_entries(char *str)
3291{
3292	if (!str)
3293		return 0;
3294	rhash_entries = simple_strtoul(str, &str, 0);
3295	return 1;
3296}
3297__setup("rhash_entries=", set_rhash_entries);
3298
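/*
 * ip_rt_init() brings the IPv4 routing layer up at boot: it creates the dst
 * slab caches, allocates the route cache hash table (sized from available
 * memory or rhash_entries=), initialises devinet and the FIB, registers the
 * /proc and sysctl entries and hooks RTM_GETROUTE into rtnetlink.
 */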
3299int __init ip_rt_init(void)
3300{
3301	int rc = 0;
3302
3303#ifdef CONFIG_IP_ROUTE_CLASSID
3304	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3305	if (!ip_rt_acct)
3306		panic("IP: failed to allocate ip_rt_acct\n");
3307#endif
3308
3309	ipv4_dst_ops.kmem_cachep =
3310		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3311				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3312
3313	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3314
3315	if (dst_entries_init(&ipv4_dst_ops) < 0)
3316		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3317
3318	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3319		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3320
3321	rt_hash_table = (struct rt_hash_bucket *)
3322		alloc_large_system_hash("IP route cache",
3323					sizeof(struct rt_hash_bucket),
3324					rhash_entries,
3325					(totalram_pages >= 128 * 1024) ?
3326					15 : 17,
3327					0,
3328					&rt_hash_log,
3329					&rt_hash_mask,
3330					rhash_entries ? 0 : 512 * 1024);
3331	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3332	rt_hash_lock_init();
3333
3334	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3335	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3336
3337	devinet_init();
3338	ip_fib_init();
3339
3340	if (ip_rt_proc_init())
3341		printk(KERN_ERR "Unable to create route proc files\n");
3342#ifdef CONFIG_XFRM
3343	xfrm_init();
3344	xfrm4_init(ip_rt_max_size);
3345#endif
3346	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3347
3348#ifdef CONFIG_SYSCTL
3349	register_pernet_subsys(&sysctl_route_ops);
3350#endif
3351	register_pernet_subsys(&rt_genid_ops);
3352	return rc;
3353}
3354
3355#ifdef CONFIG_SYSCTL
3356/*
3357 * We really need to sanitize the damn ipv4 init order, then all
3358 * this nonsense will go away.
3359 */
3360void __init ip_static_sysctl_init(void)
3361{
3362	register_sysctl_paths(ipv4_path, ipv4_skeleton);
3363}
3364#endif