v4.17
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		ROUTE - implementation of the IP router.
   7 *
   8 * Authors:	Ross Biro
   9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *		Alan Cox	:	Verify area fixes.
  16 *		Alan Cox	:	cli() protects routing changes
  17 *		Rui Oliveira	:	ICMP routing table updates
  18 *		(rco@di.uminho.pt)	Routing table insertion and update
  19 *		Linus Torvalds	:	Rewrote bits to be sensible
  20 *		Alan Cox	:	Added BSD route gw semantics
  21 *		Alan Cox	:	Super /proc >4K
  22 *		Alan Cox	:	MTU in route table
  23 *		Alan Cox	: 	MSS actually. Also added the window
  24 *					clamper.
  25 *		Sam Lantinga	:	Fixed route matching in rt_del()
  26 *		Alan Cox	:	Routing cache support.
  27 *		Alan Cox	:	Removed compatibility cruft.
  28 *		Alan Cox	:	RTF_REJECT support.
  29 *		Alan Cox	:	TCP irtt support.
  30 *		Jonathan Naylor	:	Added Metric support.
  31 *	Miquel van Smoorenburg	:	BSD API fixes.
  32 *	Miquel van Smoorenburg	:	Metrics.
  33 *		Alan Cox	:	Use __u32 properly
  34 *		Alan Cox	:	Aligned routing errors more closely with BSD
  35 *					though our system is still very different.
  36 *		Alan Cox	:	Faster /proc handling
  37 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  38 *					routing caches and better behaviour.
  39 *
  40 *		Olaf Erb	:	irtt wasn't being copied right.
  41 *		Bjorn Ekwall	:	Kerneld route support.
  42 *		Alan Cox	:	Multicast fixed (I hope)
  43 * 		Pavel Krauz	:	Limited broadcast fixed
  44 *		Mike McLagan	:	Routing by source
  45 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  46 *					route.c and rewritten from scratch.
  47 *		Andi Kleen	:	Load-limit warning messages.
  48 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  49 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  50 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  51 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  52 *		Marc Boucher	:	routing by fwmark
  53 *	Robert Olsson		:	Added rt_cache statistics
  54 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  55 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  56 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  57 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  58 *
  59 *		This program is free software; you can redistribute it and/or
  60 *		modify it under the terms of the GNU General Public License
  61 *		as published by the Free Software Foundation; either version
  62 *		2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <linux/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <linux/jhash.h>
  93#include <net/dst.h>
  94#include <net/dst_metadata.h>
  95#include <net/net_namespace.h>
  96#include <net/protocol.h>
  97#include <net/ip.h>
  98#include <net/route.h>
  99#include <net/inetpeer.h>
 100#include <net/sock.h>
 101#include <net/ip_fib.h>
 102#include <net/arp.h>
 103#include <net/tcp.h>
 104#include <net/icmp.h>
 105#include <net/xfrm.h>
 106#include <net/lwtunnel.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 111#endif
 112#include <net/secure_seq.h>
 113#include <net/ip_tunnels.h>
 114#include <net/l3mdev.h>
 115
 116#include "fib_lookup.h"
 117
 118#define RT_FL_TOS(oldflp4) \
 119	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121#define RT_GC_TIMEOUT (300*HZ)
 122
 123static int ip_rt_max_size;
 124static int ip_rt_redirect_number __read_mostly	= 9;
 125static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 126static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 127static int ip_rt_error_cost __read_mostly	= HZ;
 128static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 129static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 130static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 131static int ip_rt_min_advmss __read_mostly	= 256;
 132
 133static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 134
 135/*
 136 *	Interface to generic destination cache.
 137 */
 138
 139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 141static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143static void		 ipv4_link_failure(struct sk_buff *skb);
 144static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145					   struct sk_buff *skb, u32 mtu);
 146static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147					struct sk_buff *skb);
 148static void		ipv4_dst_destroy(struct dst_entry *dst);
 149
 150static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151{
 152	WARN_ON(1);
 153	return NULL;
 154}
 155
 156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157					   struct sk_buff *skb,
 158					   const void *daddr);
 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161static struct dst_ops ipv4_dst_ops = {
 162	.family =		AF_INET,
 163	.check =		ipv4_dst_check,
 164	.default_advmss =	ipv4_default_advmss,
 165	.mtu =			ipv4_mtu,
 166	.cow_metrics =		ipv4_cow_metrics,
 167	.destroy =		ipv4_dst_destroy,
 168	.negative_advice =	ipv4_negative_advice,
 169	.link_failure =		ipv4_link_failure,
 170	.update_pmtu =		ip_rt_update_pmtu,
 171	.redirect =		ip_do_redirect,
 172	.local_out =		__ip_local_out,
 173	.neigh_lookup =		ipv4_neigh_lookup,
 174	.confirm_neigh =	ipv4_confirm_neigh,
 175};
 176
 177#define ECN_OR_COST(class)	TC_PRIO_##class
 178
 179const __u8 ip_tos2prio[16] = {
 180	TC_PRIO_BESTEFFORT,
 181	ECN_OR_COST(BESTEFFORT),
 182	TC_PRIO_BESTEFFORT,
 183	ECN_OR_COST(BESTEFFORT),
 184	TC_PRIO_BULK,
 185	ECN_OR_COST(BULK),
 186	TC_PRIO_BULK,
 187	ECN_OR_COST(BULK),
 188	TC_PRIO_INTERACTIVE,
 189	ECN_OR_COST(INTERACTIVE),
 190	TC_PRIO_INTERACTIVE,
 191	ECN_OR_COST(INTERACTIVE),
 192	TC_PRIO_INTERACTIVE_BULK,
 193	ECN_OR_COST(INTERACTIVE_BULK),
 194	TC_PRIO_INTERACTIVE_BULK,
 195	ECN_OR_COST(INTERACTIVE_BULK)
 196};
 197EXPORT_SYMBOL(ip_tos2prio);
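     /* ip_tos2prio[] maps the legacy 4-bit IPv4 TOS field to a traffic
      * control priority band; rt_tos2priority() in include/net/route.h
      * indexes it with IPTOS_TOS(tos) >> 1.  With ECN_OR_COST() defined as
      * above, the ECN / "minimize cost" variants land in the same band as
      * their base TOS value.
      */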
 198
 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202#ifdef CONFIG_PROC_FS
 203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204{
 205	if (*pos)
 206		return NULL;
 207	return SEQ_START_TOKEN;
 208}
 209
 210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211{
 212	++*pos;
 213	return NULL;
 214}
 215
 216static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217{
 218}
 219
 220static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221{
 222	if (v == SEQ_START_TOKEN)
 223		seq_printf(seq, "%-127s\n",
 224			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226			   "HHUptod\tSpecDst");
 227	return 0;
 228}
 229
 230static const struct seq_operations rt_cache_seq_ops = {
 231	.start  = rt_cache_seq_start,
 232	.next   = rt_cache_seq_next,
 233	.stop   = rt_cache_seq_stop,
 234	.show   = rt_cache_seq_show,
 235};
 236
 237static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238{
 239	return seq_open(file, &rt_cache_seq_ops);
 240}
 241
 242static const struct file_operations rt_cache_seq_fops = {
 243	.open	 = rt_cache_seq_open,
 244	.read	 = seq_read,
 245	.llseek	 = seq_lseek,
 246	.release = seq_release,
 247};
 248
 249
 250static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251{
 252	int cpu;
 253
 254	if (*pos == 0)
 255		return SEQ_START_TOKEN;
 256
 257	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258		if (!cpu_possible(cpu))
 259			continue;
 260		*pos = cpu+1;
 261		return &per_cpu(rt_cache_stat, cpu);
 262	}
 263	return NULL;
 264}
 265
 266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267{
 268	int cpu;
 269
 270	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271		if (!cpu_possible(cpu))
 272			continue;
 273		*pos = cpu+1;
 274		return &per_cpu(rt_cache_stat, cpu);
 275	}
 276	return NULL;
 277
 278}
 279
 280static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281{
 282
 283}
 284
 285static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286{
 287	struct rt_cache_stat *st = v;
 288
 289	if (v == SEQ_START_TOKEN) {
 290		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291		return 0;
 292	}
 293
 294	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296		   dst_entries_get_slow(&ipv4_dst_ops),
 297		   0, /* st->in_hit */
 298		   st->in_slow_tot,
 299		   st->in_slow_mc,
 300		   st->in_no_route,
 301		   st->in_brd,
 302		   st->in_martian_dst,
 303		   st->in_martian_src,
 304
 305		   0, /* st->out_hit */
 306		   st->out_slow_tot,
 307		   st->out_slow_mc,
 308
 309		   0, /* st->gc_total */
 310		   0, /* st->gc_ignored */
 311		   0, /* st->gc_goal_miss */
 312		   0, /* st->gc_dst_overflow */
 313		   0, /* st->in_hlist_search */
 314		   0  /* st->out_hlist_search */
 315		);
 316	return 0;
 317}
 318
 319static const struct seq_operations rt_cpu_seq_ops = {
 320	.start  = rt_cpu_seq_start,
 321	.next   = rt_cpu_seq_next,
 322	.stop   = rt_cpu_seq_stop,
 323	.show   = rt_cpu_seq_show,
 324};
 325
 326
 327static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328{
 329	return seq_open(file, &rt_cpu_seq_ops);
 330}
 331
 332static const struct file_operations rt_cpu_seq_fops = {
 333	.open	 = rt_cpu_seq_open,
 334	.read	 = seq_read,
 335	.llseek	 = seq_lseek,
 336	.release = seq_release,
 337};
 338
 339#ifdef CONFIG_IP_ROUTE_CLASSID
 340static int rt_acct_proc_show(struct seq_file *m, void *v)
 341{
 342	struct ip_rt_acct *dst, *src;
 343	unsigned int i, j;
 344
 345	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346	if (!dst)
 347		return -ENOMEM;
 348
 349	for_each_possible_cpu(i) {
 350		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351		for (j = 0; j < 256; j++) {
 352			dst[j].o_bytes   += src[j].o_bytes;
 353			dst[j].o_packets += src[j].o_packets;
 354			dst[j].i_bytes   += src[j].i_bytes;
 355			dst[j].i_packets += src[j].i_packets;
 356		}
 357	}
 358
 359	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360	kfree(dst);
 361	return 0;
 362}
 363
 364static int rt_acct_proc_open(struct inode *inode, struct file *file)
 365{
 366	return single_open(file, rt_acct_proc_show, NULL);
 367}
 368
 369static const struct file_operations rt_acct_proc_fops = {
 370	.open		= rt_acct_proc_open,
 371	.read		= seq_read,
 372	.llseek		= seq_lseek,
 373	.release	= single_release,
 374};
 375#endif
 376
 377static int __net_init ip_rt_do_proc_init(struct net *net)
 378{
 379	struct proc_dir_entry *pde;
 380
 381	pde = proc_create("rt_cache", 0444, net->proc_net,
 382			  &rt_cache_seq_fops);
 383	if (!pde)
 384		goto err1;
 385
 386	pde = proc_create("rt_cache", 0444,
 387			  net->proc_net_stat, &rt_cpu_seq_fops);
 388	if (!pde)
 389		goto err2;
 390
 391#ifdef CONFIG_IP_ROUTE_CLASSID
 392	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 393	if (!pde)
 394		goto err3;
 395#endif
 396	return 0;
 397
 398#ifdef CONFIG_IP_ROUTE_CLASSID
 399err3:
 400	remove_proc_entry("rt_cache", net->proc_net_stat);
 401#endif
 402err2:
 403	remove_proc_entry("rt_cache", net->proc_net);
 404err1:
 405	return -ENOMEM;
 406}
 407
 408static void __net_exit ip_rt_do_proc_exit(struct net *net)
 409{
 410	remove_proc_entry("rt_cache", net->proc_net_stat);
 411	remove_proc_entry("rt_cache", net->proc_net);
 412#ifdef CONFIG_IP_ROUTE_CLASSID
 413	remove_proc_entry("rt_acct", net->proc_net);
 414#endif
 415}
 416
 417static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 418	.init = ip_rt_do_proc_init,
 419	.exit = ip_rt_do_proc_exit,
 420};
 421
 422static int __init ip_rt_proc_init(void)
 423{
 424	return register_pernet_subsys(&ip_rt_proc_ops);
 425}
 426
 427#else
 428static inline int ip_rt_proc_init(void)
 429{
 430	return 0;
 431}
 432#endif /* CONFIG_PROC_FS */
 433
 434static inline bool rt_is_expired(const struct rtable *rth)
 435{
 436	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 437}
 438
 439void rt_cache_flush(struct net *net)
 440{
 441	rt_genid_bump_ipv4(net);
 442}
 443
 444static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 445					   struct sk_buff *skb,
 446					   const void *daddr)
 447{
 448	struct net_device *dev = dst->dev;
 449	const __be32 *pkey = daddr;
 450	const struct rtable *rt;
 451	struct neighbour *n;
 452
 453	rt = (const struct rtable *) dst;
 454	if (rt->rt_gateway)
 455		pkey = (const __be32 *) &rt->rt_gateway;
 456	else if (skb)
 457		pkey = &ip_hdr(skb)->daddr;
 458
 459	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 460	if (n)
 461		return n;
 462	return neigh_create(&arp_tbl, pkey, dev);
 463}
 464
 465static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 466{
 467	struct net_device *dev = dst->dev;
 468	const __be32 *pkey = daddr;
 469	const struct rtable *rt;
 470
 471	rt = (const struct rtable *)dst;
 472	if (rt->rt_gateway)
 473		pkey = (const __be32 *)&rt->rt_gateway;
 474	else if (!daddr ||
 475		 (rt->rt_flags &
 476		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 477		return;
 478
 479	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 480}
 481
 482#define IP_IDENTS_SZ 2048u
 483
 484static atomic_t *ip_idents __read_mostly;
 485static u32 *ip_tstamps __read_mostly;
 486
 487/* In order to protect privacy, we add a perturbation to identifiers
  488 * if one generator is seldom used. This makes it hard for an attacker
 489 * to infer how many packets were sent between two points in time.
 490 */
 491u32 ip_idents_reserve(u32 hash, int segs)
 492{
 493	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 494	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 495	u32 old = READ_ONCE(*p_tstamp);
 496	u32 now = (u32)jiffies;
 497	u32 new, delta = 0;
 498
 499	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 500		delta = prandom_u32_max(now - old);
 501
 502	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
 503	do {
 504		old = (u32)atomic_read(p_id);
 505		new = old + delta + segs;
 506	} while (atomic_cmpxchg(p_id, old, new) != old);
 507
 508	return new - segs;
 509}
 510EXPORT_SYMBOL(ip_idents_reserve);
 511
 512void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 513{
 514	static u32 ip_idents_hashrnd __read_mostly;
 515	u32 hash, id;
 516
 517	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 518
 519	hash = jhash_3words((__force u32)iph->daddr,
 520			    (__force u32)iph->saddr,
 521			    iph->protocol ^ net_hash_mix(net),
 522			    ip_idents_hashrnd);
 523	id = ip_idents_reserve(hash, segs);
 524	iph->id = htons(id);
 525}
 526EXPORT_SYMBOL(__ip_select_ident);
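     /* Callers normally reach this through ip_select_ident() and
      * ip_select_ident_segs() in include/net/ip.h: when DF is set (and
      * ignore_df is not) the IP ID comes from the socket's inet_id counter,
      * or is simply zero; only datagrams that may be fragmented fall back
      * to the hashed per-bucket generators above.
      */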
 527
 528static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 529			     const struct sock *sk,
 530			     const struct iphdr *iph,
 531			     int oif, u8 tos,
 532			     u8 prot, u32 mark, int flow_flags)
 533{
 534	if (sk) {
 535		const struct inet_sock *inet = inet_sk(sk);
 536
 537		oif = sk->sk_bound_dev_if;
 538		mark = sk->sk_mark;
 539		tos = RT_CONN_FLAGS(sk);
 540		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 541	}
 542	flowi4_init_output(fl4, oif, mark, tos,
 543			   RT_SCOPE_UNIVERSE, prot,
 544			   flow_flags,
 545			   iph->daddr, iph->saddr, 0, 0,
 546			   sock_net_uid(net, sk));
 547}
 548
 549static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550			       const struct sock *sk)
 551{
 552	const struct net *net = dev_net(skb->dev);
 553	const struct iphdr *iph = ip_hdr(skb);
 554	int oif = skb->dev->ifindex;
 555	u8 tos = RT_TOS(iph->tos);
 556	u8 prot = iph->protocol;
 557	u32 mark = skb->mark;
 558
 559	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 560}
 561
 562static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 563{
 564	const struct inet_sock *inet = inet_sk(sk);
 565	const struct ip_options_rcu *inet_opt;
 566	__be32 daddr = inet->inet_daddr;
 567
 568	rcu_read_lock();
 569	inet_opt = rcu_dereference(inet->inet_opt);
 570	if (inet_opt && inet_opt->opt.srr)
 571		daddr = inet_opt->opt.faddr;
 572	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 573			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 574			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 575			   inet_sk_flowi_flags(sk),
 576			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 577	rcu_read_unlock();
 578}
 579
 580static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 581				 const struct sk_buff *skb)
 582{
 583	if (skb)
 584		build_skb_flow_key(fl4, skb, sk);
 585	else
 586		build_sk_flow_key(fl4, sk);
 587}
 588
 589static DEFINE_SPINLOCK(fnhe_lock);
 590
 591static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 592{
 593	struct rtable *rt;
 594
 595	rt = rcu_dereference(fnhe->fnhe_rth_input);
 596	if (rt) {
 597		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 598		dst_dev_put(&rt->dst);
 599		dst_release(&rt->dst);
 600	}
 601	rt = rcu_dereference(fnhe->fnhe_rth_output);
 602	if (rt) {
 603		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 604		dst_dev_put(&rt->dst);
 605		dst_release(&rt->dst);
 606	}
 607}
 608
 609static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 610{
 611	struct fib_nh_exception *fnhe, *oldest;
 612
 613	oldest = rcu_dereference(hash->chain);
 614	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 615	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 616		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 617			oldest = fnhe;
 618	}
 619	fnhe_flush_routes(oldest);
 620	return oldest;
 621}
 622
 623static inline u32 fnhe_hashfun(__be32 daddr)
 624{
 625	static u32 fnhe_hashrnd __read_mostly;
 626	u32 hval;
 627
 628	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 629	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 630	return hash_32(hval, FNHE_HASH_SHIFT);
 631}
 632
 633static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 634{
 635	rt->rt_pmtu = fnhe->fnhe_pmtu;
 636	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 637	rt->dst.expires = fnhe->fnhe_expires;
 638
 639	if (fnhe->fnhe_gw) {
 640		rt->rt_flags |= RTCF_REDIRECTED;
 641		rt->rt_gateway = fnhe->fnhe_gw;
 642		rt->rt_uses_gateway = 1;
 643	}
 644}
 645
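     /* update_or_create_fnhe() records a per-destination exception (a learned
      * redirect gateway and/or path MTU) on the FIB nexthop.  Exceptions live
      * in a small per-nexthop hash table of FNHE_HASH_SIZE buckets and are
      * looked up again by find_exception() when new routes are built.
      */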
 646static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 647				  u32 pmtu, bool lock, unsigned long expires)
 648{
 649	struct fnhe_hash_bucket *hash;
 650	struct fib_nh_exception *fnhe;
 651	struct rtable *rt;
 652	u32 genid, hval;
 653	unsigned int i;
 654	int depth;
 655
 656	genid = fnhe_genid(dev_net(nh->nh_dev));
 657	hval = fnhe_hashfun(daddr);
 658
 659	spin_lock_bh(&fnhe_lock);
 660
 661	hash = rcu_dereference(nh->nh_exceptions);
 662	if (!hash) {
 663		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 664		if (!hash)
 665			goto out_unlock;
 666		rcu_assign_pointer(nh->nh_exceptions, hash);
 667	}
 668
 669	hash += hval;
 670
 671	depth = 0;
 672	for (fnhe = rcu_dereference(hash->chain); fnhe;
 673	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 674		if (fnhe->fnhe_daddr == daddr)
 675			break;
 676		depth++;
 677	}
 678
 679	if (fnhe) {
 680		if (fnhe->fnhe_genid != genid)
 681			fnhe->fnhe_genid = genid;
 682		if (gw)
 683			fnhe->fnhe_gw = gw;
 684		if (pmtu) {
 685			fnhe->fnhe_pmtu = pmtu;
 686			fnhe->fnhe_mtu_locked = lock;
 687		}
 688		fnhe->fnhe_expires = max(1UL, expires);
 689		/* Update all cached dsts too */
 690		rt = rcu_dereference(fnhe->fnhe_rth_input);
 691		if (rt)
 692			fill_route_from_fnhe(rt, fnhe);
 693		rt = rcu_dereference(fnhe->fnhe_rth_output);
 694		if (rt)
 695			fill_route_from_fnhe(rt, fnhe);
 696	} else {
 697		if (depth > FNHE_RECLAIM_DEPTH)
 698			fnhe = fnhe_oldest(hash);
 699		else {
 700			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 701			if (!fnhe)
 702				goto out_unlock;
 703
 704			fnhe->fnhe_next = hash->chain;
 705			rcu_assign_pointer(hash->chain, fnhe);
 706		}
 707		fnhe->fnhe_genid = genid;
 708		fnhe->fnhe_daddr = daddr;
 709		fnhe->fnhe_gw = gw;
 710		fnhe->fnhe_pmtu = pmtu;
 711		fnhe->fnhe_mtu_locked = lock;
 712		fnhe->fnhe_expires = max(1UL, expires);
 713
 714		/* Exception created; mark the cached routes for the nexthop
 715		 * stale, so anyone caching it rechecks if this exception
 716		 * applies to them.
 717		 */
 718		rt = rcu_dereference(nh->nh_rth_input);
 719		if (rt)
 720			rt->dst.obsolete = DST_OBSOLETE_KILL;
 721
 722		for_each_possible_cpu(i) {
 723			struct rtable __rcu **prt;
 724			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 725			rt = rcu_dereference(*prt);
 726			if (rt)
 727				rt->dst.obsolete = DST_OBSOLETE_KILL;
 728		}
 729	}
 730
 731	fnhe->fnhe_stamp = jiffies;
 732
 733out_unlock:
 734	spin_unlock_bh(&fnhe_lock);
 735}
 736
 737static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 738			     bool kill_route)
 739{
 740	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 741	__be32 old_gw = ip_hdr(skb)->saddr;
 742	struct net_device *dev = skb->dev;
 743	struct in_device *in_dev;
 744	struct fib_result res;
 745	struct neighbour *n;
 746	struct net *net;
 747
 748	switch (icmp_hdr(skb)->code & 7) {
 749	case ICMP_REDIR_NET:
 750	case ICMP_REDIR_NETTOS:
 751	case ICMP_REDIR_HOST:
 752	case ICMP_REDIR_HOSTTOS:
 753		break;
 754
 755	default:
 756		return;
 757	}
 758
 759	if (rt->rt_gateway != old_gw)
 760		return;
 761
 762	in_dev = __in_dev_get_rcu(dev);
 763	if (!in_dev)
 764		return;
 765
 766	net = dev_net(dev);
 767	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 768	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 769	    ipv4_is_zeronet(new_gw))
 770		goto reject_redirect;
 771
 772	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 773		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 774			goto reject_redirect;
 775		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 776			goto reject_redirect;
 777	} else {
 778		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 779			goto reject_redirect;
 780	}
 781
 782	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 783	if (!n)
 784		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 785	if (!IS_ERR(n)) {
 786		if (!(n->nud_state & NUD_VALID)) {
 787			neigh_event_send(n, NULL);
 788		} else {
 789			if (fib_lookup(net, fl4, &res, 0) == 0) {
 790				struct fib_nh *nh = &FIB_RES_NH(res);
 791
 792				update_or_create_fnhe(nh, fl4->daddr, new_gw,
 793						0, false,
 794						jiffies + ip_rt_gc_timeout);
 795			}
 796			if (kill_route)
 797				rt->dst.obsolete = DST_OBSOLETE_KILL;
 798			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 799		}
 800		neigh_release(n);
 801	}
 802	return;
 803
 804reject_redirect:
 805#ifdef CONFIG_IP_ROUTE_VERBOSE
 806	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 807		const struct iphdr *iph = (const struct iphdr *) skb->data;
 808		__be32 daddr = iph->daddr;
 809		__be32 saddr = iph->saddr;
 810
 811		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 812				     "  Advised path = %pI4 -> %pI4\n",
 813				     &old_gw, dev->name, &new_gw,
 814				     &saddr, &daddr);
 815	}
 816#endif
 817	;
 818}
 819
 820static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 821{
 822	struct rtable *rt;
 823	struct flowi4 fl4;
 824	const struct iphdr *iph = (const struct iphdr *) skb->data;
 825	struct net *net = dev_net(skb->dev);
 826	int oif = skb->dev->ifindex;
 827	u8 tos = RT_TOS(iph->tos);
 828	u8 prot = iph->protocol;
 829	u32 mark = skb->mark;
 830
 831	rt = (struct rtable *) dst;
 832
 833	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 834	__ip_do_redirect(rt, skb, &fl4, true);
 835}
 836
 837static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 838{
 839	struct rtable *rt = (struct rtable *)dst;
 840	struct dst_entry *ret = dst;
 841
 842	if (rt) {
 843		if (dst->obsolete > 0) {
 844			ip_rt_put(rt);
 845			ret = NULL;
 846		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 847			   rt->dst.expires) {
 848			ip_rt_put(rt);
 849			ret = NULL;
 850		}
 851	}
 852	return ret;
 853}
 854
 855/*
 856 * Algorithm:
 857 *	1. The first ip_rt_redirect_number redirects are sent
 858 *	   with exponential backoff, then we stop sending them at all,
 859 *	   assuming that the host ignores our redirects.
 860 *	2. If we did not see packets requiring redirects
 861 *	   during ip_rt_redirect_silence, we assume that the host
  862 *	   forgot the redirected route and start to send redirects again.
 863 *
 864 * This algorithm is much cheaper and more intelligent than dumb load limiting
 865 * in icmp.c.
 866 *
 867 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 868 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 869 */
 870
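     /* The knobs referenced above (ip_rt_redirect_number, ip_rt_redirect_load
      * and ip_rt_redirect_silence) are the defaults exposed through the
      * net.ipv4.route.{redirect_number,redirect_load,redirect_silence}
      * sysctls.
      */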
 871void ip_rt_send_redirect(struct sk_buff *skb)
 872{
 873	struct rtable *rt = skb_rtable(skb);
 874	struct in_device *in_dev;
 875	struct inet_peer *peer;
 876	struct net *net;
 877	int log_martians;
 878	int vif;
 879
 880	rcu_read_lock();
 881	in_dev = __in_dev_get_rcu(rt->dst.dev);
 882	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 883		rcu_read_unlock();
 884		return;
 885	}
 886	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 887	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 888	rcu_read_unlock();
 889
 890	net = dev_net(rt->dst.dev);
 891	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 892	if (!peer) {
 893		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 894			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 895		return;
 896	}
 897
 898	/* No redirected packets during ip_rt_redirect_silence;
 899	 * reset the algorithm.
 900	 */
 901	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 902		peer->rate_tokens = 0;
 903
  904	/* Too many ignored redirects; do not send anything and
  905	 * set peer->rate_last to the last seen redirected packet.
 906	 */
 907	if (peer->rate_tokens >= ip_rt_redirect_number) {
 908		peer->rate_last = jiffies;
 909		goto out_put_peer;
 910	}
 911
 912	/* Check for load limit; set rate_last to the latest sent
 913	 * redirect.
 914	 */
 915	if (peer->rate_tokens == 0 ||
 916	    time_after(jiffies,
 917		       (peer->rate_last +
 918			(ip_rt_redirect_load << peer->rate_tokens)))) {
 919		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 920
 921		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 922		peer->rate_last = jiffies;
 923		++peer->rate_tokens;
 924#ifdef CONFIG_IP_ROUTE_VERBOSE
 925		if (log_martians &&
 926		    peer->rate_tokens == ip_rt_redirect_number)
 927			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 928					     &ip_hdr(skb)->saddr, inet_iif(skb),
 929					     &ip_hdr(skb)->daddr, &gw);
 930#endif
 931	}
 932out_put_peer:
 933	inet_putpeer(peer);
 934}
 935
 936static int ip_error(struct sk_buff *skb)
 937{
 938	struct rtable *rt = skb_rtable(skb);
 939	struct net_device *dev = skb->dev;
 940	struct in_device *in_dev;
 941	struct inet_peer *peer;
 942	unsigned long now;
 943	struct net *net;
 944	bool send;
 945	int code;
 946
 947	if (netif_is_l3_master(skb->dev)) {
 948		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 949		if (!dev)
 950			goto out;
 951	}
 952
 953	in_dev = __in_dev_get_rcu(dev);
 954
 955	/* IP on this device is disabled. */
 956	if (!in_dev)
 957		goto out;
 958
 959	net = dev_net(rt->dst.dev);
 960	if (!IN_DEV_FORWARD(in_dev)) {
 961		switch (rt->dst.error) {
 962		case EHOSTUNREACH:
 963			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964			break;
 965
 966		case ENETUNREACH:
 967			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968			break;
 969		}
 970		goto out;
 971	}
 972
 973	switch (rt->dst.error) {
 974	case EINVAL:
 975	default:
 976		goto out;
 977	case EHOSTUNREACH:
 978		code = ICMP_HOST_UNREACH;
 979		break;
 980	case ENETUNREACH:
 981		code = ICMP_NET_UNREACH;
 982		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 983		break;
 984	case EACCES:
 985		code = ICMP_PKT_FILTERED;
 986		break;
 987	}
 988
 989	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990			       l3mdev_master_ifindex(skb->dev), 1);
 991
 992	send = true;
 993	if (peer) {
 994		now = jiffies;
 995		peer->rate_tokens += now - peer->rate_last;
 996		if (peer->rate_tokens > ip_rt_error_burst)
 997			peer->rate_tokens = ip_rt_error_burst;
 998		peer->rate_last = now;
 999		if (peer->rate_tokens >= ip_rt_error_cost)
1000			peer->rate_tokens -= ip_rt_error_cost;
1001		else
1002			send = false;
1003		inet_putpeer(peer);
1004	}
1005	if (send)
1006		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008out:	kfree_skb(skb);
1009	return 0;
1010}
1011
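     /* Learned path MTUs are stored as nexthop exceptions and expire after
      * ip_rt_mtu_expires (net.ipv4.route.mtu_expires); values below
      * ip_rt_min_pmtu (net.ipv4.route.min_pmtu) are clamped to it and the
      * route's MTU is locked against further reduction.
      */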
1012static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013{
1014	struct dst_entry *dst = &rt->dst;
1015	struct fib_result res;
1016	bool lock = false;
1017
1018	if (ip_mtu_locked(dst))
1019		return;
1020
1021	if (ipv4_mtu(dst) < mtu)
1022		return;
1023
1024	if (mtu < ip_rt_min_pmtu) {
1025		lock = true;
1026		mtu = ip_rt_min_pmtu;
1027	}
1028
1029	if (rt->rt_pmtu == mtu &&
1030	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1031		return;
1032
1033	rcu_read_lock();
1034	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1035		struct fib_nh *nh = &FIB_RES_NH(res);
1036
1037		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1038				      jiffies + ip_rt_mtu_expires);
1039	}
1040	rcu_read_unlock();
1041}
1042
1043static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044			      struct sk_buff *skb, u32 mtu)
1045{
1046	struct rtable *rt = (struct rtable *) dst;
1047	struct flowi4 fl4;
1048
1049	ip_rt_build_flow_key(&fl4, sk, skb);
1050	__ip_rt_update_pmtu(rt, &fl4, mtu);
1051}
1052
1053void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054		      int oif, u32 mark, u8 protocol, int flow_flags)
1055{
1056	const struct iphdr *iph = (const struct iphdr *) skb->data;
1057	struct flowi4 fl4;
1058	struct rtable *rt;
1059
1060	if (!mark)
1061		mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063	__build_flow_key(net, &fl4, NULL, iph, oif,
1064			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1065	rt = __ip_route_output_key(net, &fl4);
1066	if (!IS_ERR(rt)) {
1067		__ip_rt_update_pmtu(rt, &fl4, mtu);
1068		ip_rt_put(rt);
1069	}
1070}
1071EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
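     /* ipv4_update_pmtu() covers the no-socket case: the flow key is rebuilt
      * from the offending packet's IP header.  __ipv4_sk_update_pmtu() and
      * ipv4_sk_update_pmtu() below do the same for a specific socket and also
      * refresh the socket's cached dst when it has been invalidated.
      */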
1072
1073static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074{
1075	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076	struct flowi4 fl4;
1077	struct rtable *rt;
1078
1079	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081	if (!fl4.flowi4_mark)
1082		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084	rt = __ip_route_output_key(sock_net(sk), &fl4);
1085	if (!IS_ERR(rt)) {
1086		__ip_rt_update_pmtu(rt, &fl4, mtu);
1087		ip_rt_put(rt);
1088	}
1089}
1090
1091void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093	const struct iphdr *iph = (const struct iphdr *) skb->data;
1094	struct flowi4 fl4;
1095	struct rtable *rt;
1096	struct dst_entry *odst = NULL;
1097	bool new = false;
1098	struct net *net = sock_net(sk);
1099
1100	bh_lock_sock(sk);
1101
1102	if (!ip_sk_accept_pmtu(sk))
1103		goto out;
1104
1105	odst = sk_dst_get(sk);
1106
1107	if (sock_owned_by_user(sk) || !odst) {
1108		__ipv4_sk_update_pmtu(skb, sk, mtu);
1109		goto out;
1110	}
1111
1112	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114	rt = (struct rtable *)odst;
1115	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117		if (IS_ERR(rt))
1118			goto out;
1119
1120		new = true;
1121	}
1122
1123	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125	if (!dst_check(&rt->dst, 0)) {
1126		if (new)
1127			dst_release(&rt->dst);
1128
1129		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130		if (IS_ERR(rt))
1131			goto out;
1132
1133		new = true;
1134	}
1135
1136	if (new)
1137		sk_dst_set(sk, &rt->dst);
1138
1139out:
1140	bh_unlock_sock(sk);
1141	dst_release(odst);
1142}
1143EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146		   int oif, u32 mark, u8 protocol, int flow_flags)
1147{
1148	const struct iphdr *iph = (const struct iphdr *) skb->data;
1149	struct flowi4 fl4;
1150	struct rtable *rt;
1151
1152	__build_flow_key(net, &fl4, NULL, iph, oif,
1153			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1154	rt = __ip_route_output_key(net, &fl4);
1155	if (!IS_ERR(rt)) {
1156		__ip_do_redirect(rt, skb, &fl4, false);
1157		ip_rt_put(rt);
1158	}
1159}
1160EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163{
1164	const struct iphdr *iph = (const struct iphdr *) skb->data;
1165	struct flowi4 fl4;
1166	struct rtable *rt;
1167	struct net *net = sock_net(sk);
1168
1169	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170	rt = __ip_route_output_key(net, &fl4);
1171	if (!IS_ERR(rt)) {
1172		__ip_do_redirect(rt, skb, &fl4, false);
1173		ip_rt_put(rt);
1174	}
1175}
1176EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179{
1180	struct rtable *rt = (struct rtable *) dst;
1181
1182	/* All IPV4 dsts are created with ->obsolete set to the value
1183	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184	 * into this function always.
1185	 *
1186	 * When a PMTU/redirect information update invalidates a route,
1187	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188	 * DST_OBSOLETE_DEAD by dst_free().
1189	 */
1190	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191		return NULL;
1192	return dst;
1193}
1194
1195static void ipv4_link_failure(struct sk_buff *skb)
1196{
1197	struct rtable *rt;
1198
1199	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1200
1201	rt = skb_rtable(skb);
1202	if (rt)
1203		dst_set_expires(&rt->dst, 0);
1204}
1205
1206static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1207{
1208	pr_debug("%s: %pI4 -> %pI4, %s\n",
1209		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1210		 skb->dev ? skb->dev->name : "?");
1211	kfree_skb(skb);
1212	WARN_ON(1);
1213	return 0;
1214}
1215
1216/*
 1217   We do not cache the source address of the outgoing interface,
 1218   because it is used only by the IP RR, TS and SRR options,
 1219   so it is out of the fast path.
 1220
 1221   BTW remember: "addr" is allowed to be unaligned
 1222   in IP options!
1223 */
1224
1225void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1226{
1227	__be32 src;
1228
1229	if (rt_is_output_route(rt))
1230		src = ip_hdr(skb)->saddr;
1231	else {
1232		struct fib_result res;
1233		struct flowi4 fl4;
1234		struct iphdr *iph;
1235
1236		iph = ip_hdr(skb);
1237
1238		memset(&fl4, 0, sizeof(fl4));
1239		fl4.daddr = iph->daddr;
1240		fl4.saddr = iph->saddr;
1241		fl4.flowi4_tos = RT_TOS(iph->tos);
1242		fl4.flowi4_oif = rt->dst.dev->ifindex;
1243		fl4.flowi4_iif = skb->dev->ifindex;
1244		fl4.flowi4_mark = skb->mark;
1245
1246		rcu_read_lock();
1247		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1248			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1249		else
1250			src = inet_select_addr(rt->dst.dev,
1251					       rt_nexthop(rt, iph->daddr),
1252					       RT_SCOPE_UNIVERSE);
1253		rcu_read_unlock();
1254	}
1255	memcpy(addr, &src, 4);
1256}
1257
1258#ifdef CONFIG_IP_ROUTE_CLASSID
1259static void set_class_tag(struct rtable *rt, u32 tag)
1260{
1261	if (!(rt->dst.tclassid & 0xFFFF))
1262		rt->dst.tclassid |= tag & 0xFFFF;
1263	if (!(rt->dst.tclassid & 0xFFFF0000))
1264		rt->dst.tclassid |= tag & 0xFFFF0000;
1265}
1266#endif
1267
1268static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1269{
1270	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1271	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1272				    ip_rt_min_advmss);
1273
1274	return min(advmss, IPV4_MAX_PMTU - header_size);
1275}
1276
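     /* Path MTU reported for a route: the per-route exception value while it
      * is still valid, then the RTAX_MTU metric, then the device MTU (clamped
      * to 576 for locked routes via a gateway), capped at IP_MAX_MTU and
      * reduced by any lwtunnel encapsulation headroom.
      */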
1277static unsigned int ipv4_mtu(const struct dst_entry *dst)
1278{
1279	const struct rtable *rt = (const struct rtable *) dst;
1280	unsigned int mtu = rt->rt_pmtu;
1281
1282	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1283		mtu = dst_metric_raw(dst, RTAX_MTU);
1284
1285	if (mtu)
1286		return mtu;
1287
1288	mtu = READ_ONCE(dst->dev->mtu);
1289
1290	if (unlikely(ip_mtu_locked(dst))) {
1291		if (rt->rt_uses_gateway && mtu > 576)
1292			mtu = 576;
1293	}
1294
1295	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1296
1297	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1298}
1299
1300static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1301{
1302	struct fnhe_hash_bucket *hash;
1303	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1304	u32 hval = fnhe_hashfun(daddr);
1305
1306	spin_lock_bh(&fnhe_lock);
1307
1308	hash = rcu_dereference_protected(nh->nh_exceptions,
1309					 lockdep_is_held(&fnhe_lock));
1310	hash += hval;
1311
1312	fnhe_p = &hash->chain;
1313	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1314	while (fnhe) {
1315		if (fnhe->fnhe_daddr == daddr) {
1316			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1317				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1318			fnhe_flush_routes(fnhe);
1319			kfree_rcu(fnhe, rcu);
1320			break;
1321		}
1322		fnhe_p = &fnhe->fnhe_next;
1323		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1324						 lockdep_is_held(&fnhe_lock));
1325	}
1326
1327	spin_unlock_bh(&fnhe_lock);
1328}
1329
1330static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1331{
1332	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1333	struct fib_nh_exception *fnhe;
1334	u32 hval;
1335
1336	if (!hash)
1337		return NULL;
1338
1339	hval = fnhe_hashfun(daddr);
1340
1341	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1342	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1343		if (fnhe->fnhe_daddr == daddr) {
1344			if (fnhe->fnhe_expires &&
1345			    time_after(jiffies, fnhe->fnhe_expires)) {
1346				ip_del_fnhe(nh, daddr);
1347				break;
1348			}
1349			return fnhe;
1350		}
1351	}
1352	return NULL;
1353}
1354
1355static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1356			      __be32 daddr, const bool do_cache)
1357{
1358	bool ret = false;
1359
1360	spin_lock_bh(&fnhe_lock);
1361
1362	if (daddr == fnhe->fnhe_daddr) {
1363		struct rtable __rcu **porig;
1364		struct rtable *orig;
1365		int genid = fnhe_genid(dev_net(rt->dst.dev));
1366
1367		if (rt_is_input_route(rt))
1368			porig = &fnhe->fnhe_rth_input;
1369		else
1370			porig = &fnhe->fnhe_rth_output;
1371		orig = rcu_dereference(*porig);
1372
1373		if (fnhe->fnhe_genid != genid) {
1374			fnhe->fnhe_genid = genid;
1375			fnhe->fnhe_gw = 0;
1376			fnhe->fnhe_pmtu = 0;
1377			fnhe->fnhe_expires = 0;
1378			fnhe->fnhe_mtu_locked = false;
1379			fnhe_flush_routes(fnhe);
1380			orig = NULL;
1381		}
1382		fill_route_from_fnhe(rt, fnhe);
1383		if (!rt->rt_gateway)
1384			rt->rt_gateway = daddr;
1385
1386		if (do_cache) {
1387			dst_hold(&rt->dst);
1388			rcu_assign_pointer(*porig, rt);
1389			if (orig) {
1390				dst_dev_put(&orig->dst);
1391				dst_release(&orig->dst);
1392			}
1393			ret = true;
1394		}
1395
1396		fnhe->fnhe_stamp = jiffies;
1397	}
1398	spin_unlock_bh(&fnhe_lock);
1399
1400	return ret;
1401}
1402
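     /* rt_cache_route() publishes a freshly built rtable on the nexthop:
      * input routes go to nh->nh_rth_input, output routes to the per-CPU
      * nh_pcpu_rth_output slot.  cmpxchg() makes the swap race-free; if it
      * loses, the new route simply stays uncached and its extra reference
      * is dropped.
      */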
1403static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1404{
1405	struct rtable *orig, *prev, **p;
1406	bool ret = true;
1407
1408	if (rt_is_input_route(rt)) {
1409		p = (struct rtable **)&nh->nh_rth_input;
1410	} else {
1411		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1412	}
1413	orig = *p;
1414
1415	/* hold dst before doing cmpxchg() to avoid race condition
1416	 * on this dst
1417	 */
1418	dst_hold(&rt->dst);
1419	prev = cmpxchg(p, orig, rt);
1420	if (prev == orig) {
1421		if (orig) {
1422			dst_dev_put(&orig->dst);
1423			dst_release(&orig->dst);
1424		}
1425	} else {
1426		dst_release(&rt->dst);
1427		ret = false;
1428	}
1429
1430	return ret;
1431}
1432
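     /* Routes that could not be cached on a nexthop are kept on a per-CPU
      * "uncached" list so that rt_flush_dev() can still find them and
      * re-point them at the loopback device when their device goes away.
      */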
1433struct uncached_list {
1434	spinlock_t		lock;
1435	struct list_head	head;
1436};
1437
1438static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1439
1440void rt_add_uncached_list(struct rtable *rt)
1441{
1442	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1443
1444	rt->rt_uncached_list = ul;
1445
1446	spin_lock_bh(&ul->lock);
1447	list_add_tail(&rt->rt_uncached, &ul->head);
1448	spin_unlock_bh(&ul->lock);
1449}
1450
1451void rt_del_uncached_list(struct rtable *rt)
1452{
1453	if (!list_empty(&rt->rt_uncached)) {
1454		struct uncached_list *ul = rt->rt_uncached_list;
1455
1456		spin_lock_bh(&ul->lock);
1457		list_del(&rt->rt_uncached);
1458		spin_unlock_bh(&ul->lock);
1459	}
1460}
1461
1462static void ipv4_dst_destroy(struct dst_entry *dst)
1463{
1464	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1465	struct rtable *rt = (struct rtable *)dst;
1466
1467	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1468		kfree(p);
1469
1470	rt_del_uncached_list(rt);
1471}
1472
1473void rt_flush_dev(struct net_device *dev)
1474{
1475	struct net *net = dev_net(dev);
1476	struct rtable *rt;
1477	int cpu;
1478
1479	for_each_possible_cpu(cpu) {
1480		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1481
1482		spin_lock_bh(&ul->lock);
1483		list_for_each_entry(rt, &ul->head, rt_uncached) {
1484			if (rt->dst.dev != dev)
1485				continue;
1486			rt->dst.dev = net->loopback_dev;
1487			dev_hold(rt->dst.dev);
1488			dev_put(dev);
1489		}
1490		spin_unlock_bh(&ul->lock);
1491	}
1492}
1493
1494static bool rt_cache_valid(const struct rtable *rt)
1495{
1496	return	rt &&
1497		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498		!rt_is_expired(rt);
1499}
1500
1501static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1502			   const struct fib_result *res,
1503			   struct fib_nh_exception *fnhe,
1504			   struct fib_info *fi, u16 type, u32 itag,
1505			   const bool do_cache)
1506{
1507	bool cached = false;
1508
1509	if (fi) {
1510		struct fib_nh *nh = &FIB_RES_NH(*res);
1511
1512		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1513			rt->rt_gateway = nh->nh_gw;
1514			rt->rt_uses_gateway = 1;
1515		}
1516		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517		if (fi->fib_metrics != &dst_default_metrics) {
1518			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1519			refcount_inc(&fi->fib_metrics->refcnt);
1520		}
1521#ifdef CONFIG_IP_ROUTE_CLASSID
1522		rt->dst.tclassid = nh->nh_tclassid;
1523#endif
1524		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1525		if (unlikely(fnhe))
1526			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527		else if (do_cache)
1528			cached = rt_cache_route(nh, rt);
1529		if (unlikely(!cached)) {
1530			/* Routes we intend to cache in nexthop exception or
1531			 * FIB nexthop have the DST_NOCACHE bit clear.
1532			 * However, if we are unsuccessful at storing this
1533			 * route into the cache we really need to set it.
1534			 */
1535			if (!rt->rt_gateway)
1536				rt->rt_gateway = daddr;
1537			rt_add_uncached_list(rt);
1538		}
1539	} else
1540		rt_add_uncached_list(rt);
1541
1542#ifdef CONFIG_IP_ROUTE_CLASSID
1543#ifdef CONFIG_IP_MULTIPLE_TABLES
1544	set_class_tag(rt, res->tclassid);
1545#endif
1546	set_class_tag(rt, itag);
1547#endif
1548}
1549
1550struct rtable *rt_dst_alloc(struct net_device *dev,
1551			    unsigned int flags, u16 type,
1552			    bool nopolicy, bool noxfrm, bool will_cache)
1553{
1554	struct rtable *rt;
1555
1556	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1557		       (will_cache ? 0 : DST_HOST) |
1558		       (nopolicy ? DST_NOPOLICY : 0) |
1559		       (noxfrm ? DST_NOXFRM : 0));
1560
1561	if (rt) {
1562		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563		rt->rt_flags = flags;
1564		rt->rt_type = type;
1565		rt->rt_is_input = 0;
1566		rt->rt_iif = 0;
1567		rt->rt_pmtu = 0;
1568		rt->rt_mtu_locked = 0;
1569		rt->rt_gateway = 0;
1570		rt->rt_uses_gateway = 0;
1571		INIT_LIST_HEAD(&rt->rt_uncached);
1572
1573		rt->dst.output = ip_output;
1574		if (flags & RTCF_LOCAL)
1575			rt->dst.input = ip_local_deliver;
1576	}
1577
1578	return rt;
1579}
1580EXPORT_SYMBOL(rt_dst_alloc);
1581
1582/* called in rcu_read_lock() section */
1583int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1584			  u8 tos, struct net_device *dev,
1585			  struct in_device *in_dev, u32 *itag)
1586{
1587	int err;
1588
1589	/* Primary sanity checks. */
1590	if (!in_dev)
1591		return -EINVAL;
1592
1593	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1594	    skb->protocol != htons(ETH_P_IP))
1595		return -EINVAL;
1596
1597	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1598		return -EINVAL;
1599
1600	if (ipv4_is_zeronet(saddr)) {
1601		if (!ipv4_is_local_multicast(daddr))
1602			return -EINVAL;
1603	} else {
1604		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1605					  in_dev, itag);
1606		if (err < 0)
1607			return err;
1608	}
1609	return 0;
1610}
1611
1612/* called in rcu_read_lock() section */
1613static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1614			     u8 tos, struct net_device *dev, int our)
1615{
1616	struct in_device *in_dev = __in_dev_get_rcu(dev);
1617	unsigned int flags = RTCF_MULTICAST;
1618	struct rtable *rth;
1619	u32 itag = 0;
1620	int err;
1621
1622	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1623	if (err)
1624		return err;
1625
1626	if (our)
1627		flags |= RTCF_LOCAL;
1628
1629	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1630			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1631	if (!rth)
1632		return -ENOBUFS;
1633
1634#ifdef CONFIG_IP_ROUTE_CLASSID
1635	rth->dst.tclassid = itag;
1636#endif
1637	rth->dst.output = ip_rt_bug;
1638	rth->rt_is_input= 1;
1639
1640#ifdef CONFIG_IP_MROUTE
1641	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1642		rth->dst.input = ip_mr_input;
1643#endif
1644	RT_CACHE_STAT_INC(in_slow_mc);
1645
1646	skb_dst_set(skb, &rth->dst);
1647	return 0;
1648}
1649
1650
1651static void ip_handle_martian_source(struct net_device *dev,
1652				     struct in_device *in_dev,
1653				     struct sk_buff *skb,
1654				     __be32 daddr,
1655				     __be32 saddr)
1656{
1657	RT_CACHE_STAT_INC(in_martian_src);
1658#ifdef CONFIG_IP_ROUTE_VERBOSE
1659	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1660		/*
 1661		 *	RFC1812 recommendation: if the source is martian,
 1662		 *	the only hint is the MAC header.
1663		 */
1664		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1665			&daddr, &saddr, dev->name);
1666		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1667			print_hex_dump(KERN_WARNING, "ll header: ",
1668				       DUMP_PREFIX_OFFSET, 16, 1,
1669				       skb_mac_header(skb),
1670				       dev->hard_header_len, true);
1671		}
1672	}
1673#endif
1674}
1675
1676/* called in rcu_read_lock() section */
1677static int __mkroute_input(struct sk_buff *skb,
1678			   const struct fib_result *res,
1679			   struct in_device *in_dev,
1680			   __be32 daddr, __be32 saddr, u32 tos)
1681{
1682	struct fib_nh_exception *fnhe;
1683	struct rtable *rth;
1684	int err;
1685	struct in_device *out_dev;
1686	bool do_cache;
1687	u32 itag = 0;
1688
1689	/* get a working reference to the output device */
1690	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1691	if (!out_dev) {
1692		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1693		return -EINVAL;
1694	}
1695
1696	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1697				  in_dev->dev, in_dev, &itag);
1698	if (err < 0) {
1699		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1700					 saddr);
1701
1702		goto cleanup;
1703	}
1704
1705	do_cache = res->fi && !itag;
1706	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1707	    skb->protocol == htons(ETH_P_IP) &&
1708	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1709	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1710		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1711
1712	if (skb->protocol != htons(ETH_P_IP)) {
1713		/* Not IP (i.e. ARP). Do not create route, if it is
1714		 * invalid for proxy arp. DNAT routes are always valid.
1715		 *
 1716		 * The proxy arp feature has been extended to allow ARP
1717		 * replies back to the same interface, to support
1718		 * Private VLAN switch technologies. See arp.c.
1719		 */
1720		if (out_dev == in_dev &&
1721		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1722			err = -EINVAL;
1723			goto cleanup;
1724		}
1725	}
1726
1727	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1728	if (do_cache) {
1729		if (fnhe)
1730			rth = rcu_dereference(fnhe->fnhe_rth_input);
1731		else
1732			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1733		if (rt_cache_valid(rth)) {
1734			skb_dst_set_noref(skb, &rth->dst);
1735			goto out;
1736		}
1737	}
1738
1739	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1740			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1741			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1742	if (!rth) {
1743		err = -ENOBUFS;
1744		goto cleanup;
1745	}
1746
1747	rth->rt_is_input = 1;
1748	RT_CACHE_STAT_INC(in_slow_tot);
1749
1750	rth->dst.input = ip_forward;
1751
1752	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1753		       do_cache);
1754	lwtunnel_set_redirect(&rth->dst);
1755	skb_dst_set(skb, &rth->dst);
1756out:
1757	err = 0;
1758 cleanup:
1759	return err;
1760}
1761
1762#ifdef CONFIG_IP_ROUTE_MULTIPATH
1763/* To make ICMP packets follow the right flow, the multipath hash is
1764 * calculated from the inner IP addresses.
1765 */
1766static void ip_multipath_l3_keys(const struct sk_buff *skb,
1767				 struct flow_keys *hash_keys)
1768{
1769	const struct iphdr *outer_iph = ip_hdr(skb);
1770	const struct iphdr *key_iph = outer_iph;
1771	const struct iphdr *inner_iph;
1772	const struct icmphdr *icmph;
1773	struct iphdr _inner_iph;
1774	struct icmphdr _icmph;
1775
1776	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1777		goto out;
1778
1779	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1780		goto out;
1781
1782	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1783				   &_icmph);
1784	if (!icmph)
1785		goto out;
1786
1787	if (icmph->type != ICMP_DEST_UNREACH &&
1788	    icmph->type != ICMP_REDIRECT &&
1789	    icmph->type != ICMP_TIME_EXCEEDED &&
1790	    icmph->type != ICMP_PARAMETERPROB)
1791		goto out;
1792
1793	inner_iph = skb_header_pointer(skb,
1794				       outer_iph->ihl * 4 + sizeof(_icmph),
1795				       sizeof(_inner_iph), &_inner_iph);
1796	if (!inner_iph)
1797		goto out;
1798
1799	key_iph = inner_iph;
1800out:
1801	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1802	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1803}
1804
1805/* if skb is set it will be used and fl4 can be NULL */
1806int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1807		       const struct sk_buff *skb, struct flow_keys *flkeys)
1808{
1809	struct flow_keys hash_keys;
1810	u32 mhash;
1811
1812	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1813	case 0:
1814		memset(&hash_keys, 0, sizeof(hash_keys));
1815		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1816		if (skb) {
1817			ip_multipath_l3_keys(skb, &hash_keys);
1818		} else {
1819			hash_keys.addrs.v4addrs.src = fl4->saddr;
1820			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1821		}
1822		break;
1823	case 1:
1824		/* skb is currently provided only when forwarding */
1825		if (skb) {
1826			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1827			struct flow_keys keys;
1828
1829			/* short-circuit if we already have L4 hash present */
1830			if (skb->l4_hash)
1831				return skb_get_hash_raw(skb) >> 1;
1832
1833			memset(&hash_keys, 0, sizeof(hash_keys));
1834
1835			if (!flkeys) {
1836				skb_flow_dissect_flow_keys(skb, &keys, flag);
1837				flkeys = &keys;
1838			}
1839
1840			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1841			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1842			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1843			hash_keys.ports.src = flkeys->ports.src;
1844			hash_keys.ports.dst = flkeys->ports.dst;
1845			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1846		} else {
1847			memset(&hash_keys, 0, sizeof(hash_keys));
1848			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1849			hash_keys.addrs.v4addrs.src = fl4->saddr;
1850			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1851			hash_keys.ports.src = fl4->fl4_sport;
1852			hash_keys.ports.dst = fl4->fl4_dport;
1853			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1854		}
1855		break;
1856	}
1857	mhash = flow_hash_from_keys(&hash_keys);
1858
1859	return mhash >> 1;
1860}
1861#endif /* CONFIG_IP_ROUTE_MULTIPATH */
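     /* Which keys feed the hash is chosen by the
      * net.ipv4.fib_multipath_hash_policy sysctl: 0 hashes on the layer-3
      * addresses only (using the inner header for ICMP errors, see
      * ip_multipath_l3_keys()), 1 hashes on the layer-4 five-tuple.  The
      * result is shifted right by one bit so it always fits in 31 bits.
      */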
1862
1863static int ip_mkroute_input(struct sk_buff *skb,
1864			    struct fib_result *res,
1865			    struct in_device *in_dev,
1866			    __be32 daddr, __be32 saddr, u32 tos,
1867			    struct flow_keys *hkeys)
1868{
1869#ifdef CONFIG_IP_ROUTE_MULTIPATH
1870	if (res->fi && res->fi->fib_nhs > 1) {
1871		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1872
1873		fib_select_multipath(res, h);
1874	}
1875#endif
1876
1877	/* create a routing cache entry */
1878	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1879}
1880
1881/*
 1882 *	NOTE. We drop all packets that have local source
 1883 *	addresses, because every properly looped back packet
 1884 *	must already have the correct destination attached by the output routine.
 1885 *
 1886 *	This approach solves two big problems:
 1887 *	1. Non-simplex devices are handled properly.
 1888 *	2. IP spoofing attempts are filtered with a 100% guarantee.
1889 *	called with rcu_read_lock()
1890 */
1891
1892static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1893			       u8 tos, struct net_device *dev,
1894			       struct fib_result *res)
1895{
1896	struct in_device *in_dev = __in_dev_get_rcu(dev);
1897	struct flow_keys *flkeys = NULL, _flkeys;
1898	struct net    *net = dev_net(dev);
1899	struct ip_tunnel_info *tun_info;
1900	int		err = -EINVAL;
1901	unsigned int	flags = 0;
1902	u32		itag = 0;
1903	struct rtable	*rth;
1904	struct flowi4	fl4;
1905	bool do_cache;
1906
1907	/* IP on this device is disabled. */
1908
1909	if (!in_dev)
1910		goto out;
1911
 1912	/* Check for the most weird martians, which cannot be detected
 1913	   by fib_lookup.
1914	 */
1915
1916	tun_info = skb_tunnel_info(skb);
1917	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1918		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1919	else
1920		fl4.flowi4_tun_key.tun_id = 0;
1921	skb_dst_drop(skb);
1922
1923	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1924		goto martian_source;
1925
1926	res->fi = NULL;
1927	res->table = NULL;
1928	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1929		goto brd_input;
1930
1931	/* Accept zero addresses only for limited broadcast;
1932	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1933	 */
1934	if (ipv4_is_zeronet(saddr))
1935		goto martian_source;
1936
1937	if (ipv4_is_zeronet(daddr))
1938		goto martian_destination;
1939
1940	/* The following code avoids calling IN_DEV_NET_ROUTE_LOCALNET() more than
1941	 * once, calling it only when daddr and/or saddr is a loopback address.
1942	 */
1943	if (ipv4_is_loopback(daddr)) {
1944		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1945			goto martian_destination;
1946	} else if (ipv4_is_loopback(saddr)) {
1947		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1948			goto martian_source;
1949	}
1950
1951	/*
1952	 *	Now we are ready to route the packet.
1953	 */
1954	fl4.flowi4_oif = 0;
1955	fl4.flowi4_iif = dev->ifindex;
1956	fl4.flowi4_mark = skb->mark;
1957	fl4.flowi4_tos = tos;
1958	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1959	fl4.flowi4_flags = 0;
1960	fl4.daddr = daddr;
1961	fl4.saddr = saddr;
1962	fl4.flowi4_uid = sock_net_uid(net, NULL);
1963
1964	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1965		flkeys = &_flkeys;
1966	} else {
1967		fl4.flowi4_proto = 0;
1968		fl4.fl4_sport = 0;
1969		fl4.fl4_dport = 0;
1970	}
1971
1972	err = fib_lookup(net, &fl4, res, 0);
1973	if (err != 0) {
1974		if (!IN_DEV_FORWARD(in_dev))
1975			err = -EHOSTUNREACH;
1976		goto no_route;
1977	}
1978
1979	if (res->type == RTN_BROADCAST)
1980		goto brd_input;
1981
1982	if (res->type == RTN_LOCAL) {
1983		err = fib_validate_source(skb, saddr, daddr, tos,
1984					  0, dev, in_dev, &itag);
1985		if (err < 0)
1986			goto martian_source;
1987		goto local_input;
1988	}
1989
1990	if (!IN_DEV_FORWARD(in_dev)) {
1991		err = -EHOSTUNREACH;
1992		goto no_route;
1993	}
1994	if (res->type != RTN_UNICAST)
1995		goto martian_destination;
1996
1997	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1998out:	return err;
1999
2000brd_input:
2001	if (skb->protocol != htons(ETH_P_IP))
2002		goto e_inval;
2003
2004	if (!ipv4_is_zeronet(saddr)) {
2005		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006					  in_dev, &itag);
2007		if (err < 0)
2008			goto martian_source;
2009	}
2010	flags |= RTCF_BROADCAST;
2011	res->type = RTN_BROADCAST;
2012	RT_CACHE_STAT_INC(in_brd);
2013
2014local_input:
2015	do_cache = false;
2016	if (res->fi) {
2017		if (!itag) {
2018			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2019			if (rt_cache_valid(rth)) {
2020				skb_dst_set_noref(skb, &rth->dst);
2021				err = 0;
2022				goto out;
2023			}
2024			do_cache = true;
2025		}
2026	}
2027
2028	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2029			   flags | RTCF_LOCAL, res->type,
2030			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2031	if (!rth)
2032		goto e_nobufs;
2033
2034	rth->dst.output = ip_rt_bug;
2035#ifdef CONFIG_IP_ROUTE_CLASSID
2036	rth->dst.tclassid = itag;
2037#endif
2038	rth->rt_is_input = 1;
2039
2040	RT_CACHE_STAT_INC(in_slow_tot);
2041	if (res->type == RTN_UNREACHABLE) {
2042		rth->dst.input = ip_error;
2043		rth->dst.error = -err;
2044		rth->rt_flags &= ~RTCF_LOCAL;
2045	}
2046
2047	if (do_cache) {
2048		struct fib_nh *nh = &FIB_RES_NH(*res);
2049
2050		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2051		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2052			WARN_ON(rth->dst.input == lwtunnel_input);
2053			rth->dst.lwtstate->orig_input = rth->dst.input;
2054			rth->dst.input = lwtunnel_input;
2055		}
2056
2057		if (unlikely(!rt_cache_route(nh, rth)))
2058			rt_add_uncached_list(rth);
2059	}
2060	skb_dst_set(skb, &rth->dst);
2061	err = 0;
2062	goto out;
2063
2064no_route:
2065	RT_CACHE_STAT_INC(in_no_route);
2066	res->type = RTN_UNREACHABLE;
2067	res->fi = NULL;
2068	res->table = NULL;
2069	goto local_input;
2070
2071	/*
2072	 *	Do not cache martian addresses: they should be logged (RFC1812)
2073	 */
2074martian_destination:
2075	RT_CACHE_STAT_INC(in_martian_dst);
2076#ifdef CONFIG_IP_ROUTE_VERBOSE
2077	if (IN_DEV_LOG_MARTIANS(in_dev))
2078		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2079				     &daddr, &saddr, dev->name);
2080#endif
2081
2082e_inval:
2083	err = -EINVAL;
2084	goto out;
2085
2086e_nobufs:
2087	err = -ENOBUFS;
2088	goto out;
2089
2090martian_source:
2091	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2092	goto out;
2093}
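/* Summary of the slow path above: martian sources and destinations are
 * rejected (and optionally logged), broadcast and local destinations become
 * RTCF_LOCAL routes handled by the local input path, and everything else is
 * either forwarded via ip_mkroute_input() or reported as unreachable.
 */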
2094
2095int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096			 u8 tos, struct net_device *dev)
2097{
2098	struct fib_result res;
2099	int err;
2100
2101	tos &= IPTOS_RT_MASK;
2102	rcu_read_lock();
2103	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2104	rcu_read_unlock();
2105
2106	return err;
2107}
2108EXPORT_SYMBOL(ip_route_input_noref);
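/* Illustrative sketch (not part of the original file, excluded from the
 * build): how an ingress path might route a received packet once the IPv4
 * header has been validated.  The helper name is hypothetical; "dev" is the
 * ingress device.
 */
#if 0
static int example_route_rx(struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);

	return ip_route_input_noref(skb, iph->daddr, iph->saddr,
				    iph->tos, dev);
}
#endif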
2109
2110/* called with rcu_read_lock held */
2111int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2112		       u8 tos, struct net_device *dev, struct fib_result *res)
2113{
2114	/* Multicast recognition logic has been moved from the route cache to here.
2115	   The problem was that too many Ethernet cards have broken/missing
2116	   hardware multicast filters :-( As a result, a host on a multicast
2117	   network acquires a lot of useless route cache entries, e.g. from
2118	   SDR messages from all over the world. Now we try to get rid of them.
2119	   Really, provided the software IP multicast filter is organized
2120	   reasonably (at least hashed), this does not cause a slowdown
2121	   compared with route cache reject entries.
2122	   Note that multicast routers are not affected, because a
2123	   route cache entry is created for them eventually.
2124	 */
2125	if (ipv4_is_multicast(daddr)) {
2126		struct in_device *in_dev = __in_dev_get_rcu(dev);
2127		int our = 0;
2128		int err = -EINVAL;
2129
2130		if (in_dev)
2131			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2132					      ip_hdr(skb)->protocol);
2133
2134		/* check l3 master if no match yet */
2135		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2136			struct in_device *l3_in_dev;
2137
2138			l3_in_dev = __in_dev_get_rcu(skb->dev);
2139			if (l3_in_dev)
2140				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2141						      ip_hdr(skb)->protocol);
2142		}
2143
2144		if (our
2145#ifdef CONFIG_IP_MROUTE
2146			||
2147		    (!ipv4_is_local_multicast(daddr) &&
2148		     IN_DEV_MFORWARD(in_dev))
2149#endif
2150		   ) {
2151			err = ip_route_input_mc(skb, daddr, saddr,
2152						tos, dev, our);
2153		}
2154		return err;
2155	}
2156
2157	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2158}
2159
2160/* called with rcu_read_lock() */
2161static struct rtable *__mkroute_output(const struct fib_result *res,
2162				       const struct flowi4 *fl4, int orig_oif,
2163				       struct net_device *dev_out,
2164				       unsigned int flags)
2165{
2166	struct fib_info *fi = res->fi;
2167	struct fib_nh_exception *fnhe;
2168	struct in_device *in_dev;
2169	u16 type = res->type;
2170	struct rtable *rth;
2171	bool do_cache;
2172
2173	in_dev = __in_dev_get_rcu(dev_out);
2174	if (!in_dev)
2175		return ERR_PTR(-EINVAL);
2176
2177	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2178		if (ipv4_is_loopback(fl4->saddr) &&
2179		    !(dev_out->flags & IFF_LOOPBACK) &&
2180		    !netif_is_l3_master(dev_out))
2181			return ERR_PTR(-EINVAL);
2182
2183	if (ipv4_is_lbcast(fl4->daddr))
2184		type = RTN_BROADCAST;
2185	else if (ipv4_is_multicast(fl4->daddr))
2186		type = RTN_MULTICAST;
2187	else if (ipv4_is_zeronet(fl4->daddr))
2188		return ERR_PTR(-EINVAL);
2189
2190	if (dev_out->flags & IFF_LOOPBACK)
2191		flags |= RTCF_LOCAL;
2192
2193	do_cache = true;
2194	if (type == RTN_BROADCAST) {
2195		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2196		fi = NULL;
2197	} else if (type == RTN_MULTICAST) {
2198		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2199		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2200				     fl4->flowi4_proto))
2201			flags &= ~RTCF_LOCAL;
2202		else
2203			do_cache = false;
2204		/* If a multicast route does not exist, use
2205		 * the default one, but do not use a gateway in this case.
2206		 * Yes, it is a hack.
2207		 */
2208		if (fi && res->prefixlen < 4)
2209			fi = NULL;
2210	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2211		   (orig_oif != dev_out->ifindex)) {
2212		/* For local routes that require a particular output interface
2213		 * we do not want to cache the result.  Caching the result
2214		 * causes incorrect behaviour when there are multiple source
2215		 * addresses on the interface: if the intended recipient is
2216		 * waiting on that interface for the packet, it won't receive
2217		 * it, because the packet will be delivered on the loopback
2218		 * interface and the IP_PKTINFO ipi_ifindex will be set to the
2219		 * loopback interface as well.
2220		 */
2221		do_cache = false;
2222	}
2223
2224	fnhe = NULL;
2225	do_cache &= fi != NULL;
2226	if (fi) {
2227		struct rtable __rcu **prth;
2228		struct fib_nh *nh = &FIB_RES_NH(*res);
2229
2230		fnhe = find_exception(nh, fl4->daddr);
2231		if (!do_cache)
2232			goto add;
2233		if (fnhe) {
2234			prth = &fnhe->fnhe_rth_output;
2235		} else {
2236			if (unlikely(fl4->flowi4_flags &
2237				     FLOWI_FLAG_KNOWN_NH &&
2238				     !(nh->nh_gw &&
2239				       nh->nh_scope == RT_SCOPE_LINK))) {
2240				do_cache = false;
2241				goto add;
2242			}
2243			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2244		}
2245		rth = rcu_dereference(*prth);
2246		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2247			return rth;
2248	}
2249
2250add:
2251	rth = rt_dst_alloc(dev_out, flags, type,
2252			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2253			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2254			   do_cache);
2255	if (!rth)
2256		return ERR_PTR(-ENOBUFS);
2257
2258	rth->rt_iif = orig_oif;
2259
2260	RT_CACHE_STAT_INC(out_slow_tot);
2261
2262	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2263		if (flags & RTCF_LOCAL &&
2264		    !(dev_out->flags & IFF_LOOPBACK)) {
2265			rth->dst.output = ip_mc_output;
2266			RT_CACHE_STAT_INC(out_slow_mc);
2267		}
2268#ifdef CONFIG_IP_MROUTE
2269		if (type == RTN_MULTICAST) {
2270			if (IN_DEV_MFORWARD(in_dev) &&
2271			    !ipv4_is_local_multicast(fl4->daddr)) {
2272				rth->dst.input = ip_mr_input;
2273				rth->dst.output = ip_mc_output;
2274			}
2275		}
2276#endif
2277	}
2278
2279	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2280	lwtunnel_set_redirect(&rth->dst);
2281
2282	return rth;
2283}
2284
2285/*
2286 * Major route resolver routine.
2287 */
2288
2289struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2290					const struct sk_buff *skb)
2291{
2292	__u8 tos = RT_FL_TOS(fl4);
2293	struct fib_result res = {
2294		.type		= RTN_UNSPEC,
2295		.fi		= NULL,
2296		.table		= NULL,
2297		.tclassid	= 0,
2298	};
2299	struct rtable *rth;
2300
2301	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2302	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2303	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2304			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2305
2306	rcu_read_lock();
2307	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2308	rcu_read_unlock();
2309
2310	return rth;
2311}
2312EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
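/* Note: callers normally do not invoke ip_route_output_key_hash() directly;
 * they reach it through wrappers such as __ip_route_output_key() (used by
 * ip_route_output_flow() below), which supply a pre-built flowi4 key.
 */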
2313
2314struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2315					    struct fib_result *res,
2316					    const struct sk_buff *skb)
2317{
2318	struct net_device *dev_out = NULL;
2319	int orig_oif = fl4->flowi4_oif;
2320	unsigned int flags = 0;
2321	struct rtable *rth;
2322	int err = -ENETUNREACH;
2323
2324	if (fl4->saddr) {
2325		rth = ERR_PTR(-EINVAL);
2326		if (ipv4_is_multicast(fl4->saddr) ||
2327		    ipv4_is_lbcast(fl4->saddr) ||
2328		    ipv4_is_zeronet(fl4->saddr))
2329			goto out;
2330
2331		/* I removed the check for oif == dev_out->oif here.
2332		   It was wrong for two reasons:
2333		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2334		      is assigned to multiple interfaces.
2335		   2. Moreover, we are allowed to send packets with the saddr
2336		      of another iface. --ANK
2337		 */
2338
2339		if (fl4->flowi4_oif == 0 &&
2340		    (ipv4_is_multicast(fl4->daddr) ||
2341		     ipv4_is_lbcast(fl4->daddr))) {
2342			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2343			dev_out = __ip_dev_find(net, fl4->saddr, false);
2344			if (!dev_out)
2345				goto out;
2346
2347			/* Special hack: the user can direct multicasts
2348			   and limited broadcasts via the necessary interface
2349			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2350			   This hack is not just for fun, it allows
2351			   vic, vat and friends to work.
2352			   They bind the socket to loopback, set the ttl to zero
2353			   and expect that it will work.
2354			   From the viewpoint of the routing cache they are broken,
2355			   because we are not allowed to build a multicast path
2356			   with a loopback source addr (look, the routing cache
2357			   cannot know that the ttl is zero, so the packet
2358			   will never leave this host and the route is valid).
2359			   Luckily, this hack is a good workaround.
2360			 */
2361
2362			fl4->flowi4_oif = dev_out->ifindex;
2363			goto make_route;
2364		}
2365
2366		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2367			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2368			if (!__ip_dev_find(net, fl4->saddr, false))
2369				goto out;
2370		}
2371	}
2372
2373
2374	if (fl4->flowi4_oif) {
2375		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2376		rth = ERR_PTR(-ENODEV);
2377		if (!dev_out)
2378			goto out;
2379
2380		/* RACE: Check return value of inet_select_addr instead. */
2381		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2382			rth = ERR_PTR(-ENETUNREACH);
2383			goto out;
2384		}
2385		if (ipv4_is_local_multicast(fl4->daddr) ||
2386		    ipv4_is_lbcast(fl4->daddr) ||
2387		    fl4->flowi4_proto == IPPROTO_IGMP) {
2388			if (!fl4->saddr)
2389				fl4->saddr = inet_select_addr(dev_out, 0,
2390							      RT_SCOPE_LINK);
2391			goto make_route;
2392		}
2393		if (!fl4->saddr) {
2394			if (ipv4_is_multicast(fl4->daddr))
2395				fl4->saddr = inet_select_addr(dev_out, 0,
2396							      fl4->flowi4_scope);
2397			else if (!fl4->daddr)
2398				fl4->saddr = inet_select_addr(dev_out, 0,
2399							      RT_SCOPE_HOST);
2400		}
2401	}
2402
2403	if (!fl4->daddr) {
2404		fl4->daddr = fl4->saddr;
2405		if (!fl4->daddr)
2406			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2407		dev_out = net->loopback_dev;
2408		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2409		res->type = RTN_LOCAL;
2410		flags |= RTCF_LOCAL;
2411		goto make_route;
2412	}
2413
2414	err = fib_lookup(net, fl4, res, 0);
2415	if (err) {
2416		res->fi = NULL;
2417		res->table = NULL;
2418		if (fl4->flowi4_oif &&
2419		    (ipv4_is_multicast(fl4->daddr) ||
2420		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2421			/* Apparently, the routing tables are wrong. Assume
2422			   that the destination is on-link.
2423
2424			   WHY? DW.
2425			   Because we are allowed to send to an iface
2426			   even if it has NO routes and NO assigned
2427			   addresses. When oif is specified, the routing
2428			   tables are looked up with only one purpose:
2429			   to catch whether the destination is gatewayed rather
2430			   than direct. Moreover, if MSG_DONTROUTE is set,
2431			   we send the packet, ignoring both routing tables
2432			   and ifaddr state. --ANK
2433
2434
2435			   We could do this even if oif is unknown
2436			   (as IPv6 likely does), but we do not.
2437			 */
2438
2439			if (fl4->saddr == 0)
2440				fl4->saddr = inet_select_addr(dev_out, 0,
2441							      RT_SCOPE_LINK);
2442			res->type = RTN_UNICAST;
2443			goto make_route;
2444		}
2445		rth = ERR_PTR(err);
2446		goto out;
2447	}
2448
2449	if (res->type == RTN_LOCAL) {
2450		if (!fl4->saddr) {
2451			if (res->fi->fib_prefsrc)
2452				fl4->saddr = res->fi->fib_prefsrc;
2453			else
2454				fl4->saddr = fl4->daddr;
2455		}
2456
2457		/* L3 master device is the loopback for that domain */
2458		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2459			net->loopback_dev;
2460
2461		/* make sure orig_oif points to fib result device even
2462		 * though packet rx/tx happens over loopback or l3mdev
2463		 */
2464		orig_oif = FIB_RES_OIF(*res);
2465
2466		fl4->flowi4_oif = dev_out->ifindex;
2467		flags |= RTCF_LOCAL;
2468		goto make_route;
2469	}
2470
2471	fib_select_path(net, res, fl4, skb);
2472
2473	dev_out = FIB_RES_DEV(*res);
2474	fl4->flowi4_oif = dev_out->ifindex;
2475
2476
2477make_route:
2478	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2479
2480out:
2481	return rth;
2482}
2483
2484static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2485{
2486	return NULL;
2487}
2488
2489static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2490{
2491	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2492
2493	return mtu ? : dst->dev->mtu;
2494}
2495
2496static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2497					  struct sk_buff *skb, u32 mtu)
2498{
2499}
2500
2501static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2502				       struct sk_buff *skb)
2503{
2504}
2505
2506static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2507					  unsigned long old)
2508{
2509	return NULL;
2510}
2511
2512static struct dst_ops ipv4_dst_blackhole_ops = {
2513	.family			=	AF_INET,
2514	.check			=	ipv4_blackhole_dst_check,
2515	.mtu			=	ipv4_blackhole_mtu,
2516	.default_advmss		=	ipv4_default_advmss,
2517	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2518	.redirect		=	ipv4_rt_blackhole_redirect,
2519	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2520	.neigh_lookup		=	ipv4_neigh_lookup,
2521};
2522
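/* ipv4_blackhole_route() below clones an existing route into a dst that
 * silently discards traffic and ignores PMTU/redirect updates (see the
 * blackhole dst_ops above).  It is typically used by the xfrm code when a
 * flow must be handed a dst even though no usable output path exists yet.
 */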
2523struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2524{
2525	struct rtable *ort = (struct rtable *) dst_orig;
2526	struct rtable *rt;
2527
2528	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2529	if (rt) {
2530		struct dst_entry *new = &rt->dst;
2531
2532		new->__use = 1;
2533		new->input = dst_discard;
2534		new->output = dst_discard_out;
2535
2536		new->dev = net->loopback_dev;
2537		if (new->dev)
2538			dev_hold(new->dev);
2539
2540		rt->rt_is_input = ort->rt_is_input;
2541		rt->rt_iif = ort->rt_iif;
2542		rt->rt_pmtu = ort->rt_pmtu;
2543		rt->rt_mtu_locked = ort->rt_mtu_locked;
2544
2545		rt->rt_genid = rt_genid_ipv4(net);
2546		rt->rt_flags = ort->rt_flags;
2547		rt->rt_type = ort->rt_type;
2548		rt->rt_gateway = ort->rt_gateway;
2549		rt->rt_uses_gateway = ort->rt_uses_gateway;
2550
2551		INIT_LIST_HEAD(&rt->rt_uncached);
2552	}
2553
2554	dst_release(dst_orig);
2555
2556	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2557}
2558
2559struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2560				    const struct sock *sk)
2561{
2562	struct rtable *rt = __ip_route_output_key(net, flp4);
2563
2564	if (IS_ERR(rt))
2565		return rt;
2566
2567	if (flp4->flowi4_proto)
2568		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2569							flowi4_to_flowi(flp4),
2570							sk, 0);
2571
2572	return rt;
2573}
2574EXPORT_SYMBOL_GPL(ip_route_output_flow);
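/* Illustrative sketch (not part of the original file, excluded from the
 * build): resolving and then releasing an output route for a simple
 * UDP-style flow.  The helper name and parameters are hypothetical.
 */
#if 0
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* ... transmit using rt->dst ... */

	ip_rt_put(rt);
	return 0;
}
#endif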
2575
2576/* called with rcu_read_lock held */
2577static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2578			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2579			u32 seq)
2580{
2581	struct rtable *rt = skb_rtable(skb);
2582	struct rtmsg *r;
2583	struct nlmsghdr *nlh;
2584	unsigned long expires = 0;
2585	u32 error;
2586	u32 metrics[RTAX_MAX];
2587
2588	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2589	if (!nlh)
2590		return -EMSGSIZE;
2591
2592	r = nlmsg_data(nlh);
2593	r->rtm_family	 = AF_INET;
2594	r->rtm_dst_len	= 32;
2595	r->rtm_src_len	= 0;
2596	r->rtm_tos	= fl4->flowi4_tos;
2597	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2598	if (nla_put_u32(skb, RTA_TABLE, table_id))
2599		goto nla_put_failure;
2600	r->rtm_type	= rt->rt_type;
2601	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2602	r->rtm_protocol = RTPROT_UNSPEC;
2603	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604	if (rt->rt_flags & RTCF_NOTIFY)
2605		r->rtm_flags |= RTM_F_NOTIFY;
2606	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2607		r->rtm_flags |= RTCF_DOREDIRECT;
2608
2609	if (nla_put_in_addr(skb, RTA_DST, dst))
2610		goto nla_put_failure;
2611	if (src) {
2612		r->rtm_src_len = 32;
2613		if (nla_put_in_addr(skb, RTA_SRC, src))
2614			goto nla_put_failure;
2615	}
2616	if (rt->dst.dev &&
2617	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2618		goto nla_put_failure;
2619#ifdef CONFIG_IP_ROUTE_CLASSID
2620	if (rt->dst.tclassid &&
2621	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2622		goto nla_put_failure;
2623#endif
2624	if (!rt_is_input_route(rt) &&
2625	    fl4->saddr != src) {
2626		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2627			goto nla_put_failure;
2628	}
2629	if (rt->rt_uses_gateway &&
2630	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2631		goto nla_put_failure;
2632
2633	expires = rt->dst.expires;
2634	if (expires) {
2635		unsigned long now = jiffies;
2636
2637		if (time_before(now, expires))
2638			expires -= now;
2639		else
2640			expires = 0;
2641	}
2642
2643	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2644	if (rt->rt_pmtu && expires)
2645		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2646	if (rt->rt_mtu_locked && expires)
2647		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2648	if (rtnetlink_put_metrics(skb, metrics) < 0)
2649		goto nla_put_failure;
2650
2651	if (fl4->flowi4_mark &&
2652	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2653		goto nla_put_failure;
2654
2655	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2656	    nla_put_u32(skb, RTA_UID,
2657			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2658		goto nla_put_failure;
2659
2660	error = rt->dst.error;
2661
2662	if (rt_is_input_route(rt)) {
2663#ifdef CONFIG_IP_MROUTE
2664		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2666			int err = ipmr_get_route(net, skb,
2667						 fl4->saddr, fl4->daddr,
2668						 r, portid);
2669
2670			if (err <= 0) {
2671				if (err == 0)
2672					return 0;
2673				goto nla_put_failure;
2674			}
2675		} else
2676#endif
2677			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2678				goto nla_put_failure;
2679	}
2680
2681	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2682		goto nla_put_failure;
2683
2684	nlmsg_end(skb, nlh);
2685	return 0;
2686
2687nla_put_failure:
2688	nlmsg_cancel(skb, nlh);
2689	return -EMSGSIZE;
2690}
2691
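/* Netlink RTM_GETROUTE handler: this is what services "ip route get".  It
 * builds a dummy skb carrying the requested addresses, performs either an
 * input-path or output-path lookup depending on whether RTA_IIF was given,
 * and replies with the resulting route via rt_fill_info() (or via
 * fib_dump_info() when RTM_F_FIB_MATCH asks for the matching FIB entry).
 */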
2692static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2693			     struct netlink_ext_ack *extack)
2694{
2695	struct net *net = sock_net(in_skb->sk);
2696	struct rtmsg *rtm;
2697	struct nlattr *tb[RTA_MAX+1];
2698	struct fib_result res = {};
2699	struct rtable *rt = NULL;
2700	struct flowi4 fl4;
2701	__be32 dst = 0;
2702	__be32 src = 0;
2703	u32 iif;
2704	int err;
2705	int mark;
2706	struct sk_buff *skb;
2707	u32 table_id = RT_TABLE_MAIN;
2708	kuid_t uid;
2709
2710	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2711			  extack);
2712	if (err < 0)
2713		goto errout;
2714
2715	rtm = nlmsg_data(nlh);
2716
2717	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2718	if (!skb) {
2719		err = -ENOBUFS;
2720		goto errout;
2721	}
2722
2723	/* Reserve room for dummy headers; this skb can pass
2724	   through a good chunk of the routing engine.
2725	 */
2726	skb_reset_mac_header(skb);
2727	skb_reset_network_header(skb);
2728
2729	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2730	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2731	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2732	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2733	if (tb[RTA_UID])
2734		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2735	else
2736		uid = (iif ? INVALID_UID : current_uid());
2737
2738	/* Bugfix: need to give ip_route_input enough of an IP header to
2739	 * not gag.
2740	 */
2741	ip_hdr(skb)->protocol = IPPROTO_UDP;
2742	ip_hdr(skb)->saddr = src;
2743	ip_hdr(skb)->daddr = dst;
2744
2745	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2746
2747	memset(&fl4, 0, sizeof(fl4));
2748	fl4.daddr = dst;
2749	fl4.saddr = src;
2750	fl4.flowi4_tos = rtm->rtm_tos;
2751	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2752	fl4.flowi4_mark = mark;
2753	fl4.flowi4_uid = uid;
2754
2755	rcu_read_lock();
2756
2757	if (iif) {
2758		struct net_device *dev;
2759
2760		dev = dev_get_by_index_rcu(net, iif);
2761		if (!dev) {
2762			err = -ENODEV;
2763			goto errout_free;
2764		}
2765
2766		skb->protocol	= htons(ETH_P_IP);
2767		skb->dev	= dev;
2768		skb->mark	= mark;
2769		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2770					 dev, &res);
2771
2772		rt = skb_rtable(skb);
2773		if (err == 0 && rt->dst.error)
2774			err = -rt->dst.error;
2775	} else {
2776		fl4.flowi4_iif = LOOPBACK_IFINDEX;
2777		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2778		err = 0;
2779		if (IS_ERR(rt))
2780			err = PTR_ERR(rt);
2781		else
2782			skb_dst_set(skb, &rt->dst);
2783	}
2784
2785	if (err)
2786		goto errout_free;
2787
2788	if (rtm->rtm_flags & RTM_F_NOTIFY)
2789		rt->rt_flags |= RTCF_NOTIFY;
2790
2791	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2792		table_id = res.table ? res.table->tb_id : 0;
2793
2794	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2795		if (!res.fi) {
2796			err = fib_props[res.type].error;
2797			if (!err)
2798				err = -EHOSTUNREACH;
2799			goto errout_free;
2800		}
2801		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2802				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2803				    rt->rt_type, res.prefix, res.prefixlen,
2804				    fl4.flowi4_tos, res.fi, 0);
2805	} else {
2806		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2807				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2808	}
2809	if (err < 0)
2810		goto errout_free;
2811
2812	rcu_read_unlock();
2813
2814	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2815errout:
2816	return err;
2817
2818errout_free:
2819	rcu_read_unlock();
2820	kfree_skb(skb);
2821	goto errout;
2822}
2823
2824void ip_rt_multicast_event(struct in_device *in_dev)
2825{
2826	rt_cache_flush(dev_net(in_dev->dev));
2827}
2828
2829#ifdef CONFIG_SYSCTL
2830static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2831static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2832static int ip_rt_gc_elasticity __read_mostly	= 8;
2833static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
2834
2835static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2836					void __user *buffer,
2837					size_t *lenp, loff_t *ppos)
2838{
2839	struct net *net = (struct net *)__ctl->extra1;
2840
2841	if (write) {
2842		rt_cache_flush(net);
2843		fnhe_genid_bump(net);
2844		return 0;
2845	}
2846
2847	return -EINVAL;
2848}
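/* Writing any value to the (write-only) "flush" sysctl below triggers
 * rt_cache_flush() and bumps the per-netns fnhe genid, invalidating cached
 * next-hop exceptions; reads are rejected with -EINVAL.
 */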
2849
2850static struct ctl_table ipv4_route_table[] = {
2851	{
2852		.procname	= "gc_thresh",
2853		.data		= &ipv4_dst_ops.gc_thresh,
2854		.maxlen		= sizeof(int),
2855		.mode		= 0644,
2856		.proc_handler	= proc_dointvec,
2857	},
2858	{
2859		.procname	= "max_size",
2860		.data		= &ip_rt_max_size,
2861		.maxlen		= sizeof(int),
2862		.mode		= 0644,
2863		.proc_handler	= proc_dointvec,
2864	},
2865	{
2866		/*  Deprecated. Use gc_min_interval_ms */
2867
2868		.procname	= "gc_min_interval",
2869		.data		= &ip_rt_gc_min_interval,
2870		.maxlen		= sizeof(int),
2871		.mode		= 0644,
2872		.proc_handler	= proc_dointvec_jiffies,
2873	},
2874	{
2875		.procname	= "gc_min_interval_ms",
2876		.data		= &ip_rt_gc_min_interval,
2877		.maxlen		= sizeof(int),
2878		.mode		= 0644,
2879		.proc_handler	= proc_dointvec_ms_jiffies,
2880	},
2881	{
2882		.procname	= "gc_timeout",
2883		.data		= &ip_rt_gc_timeout,
2884		.maxlen		= sizeof(int),
2885		.mode		= 0644,
2886		.proc_handler	= proc_dointvec_jiffies,
2887	},
2888	{
2889		.procname	= "gc_interval",
2890		.data		= &ip_rt_gc_interval,
2891		.maxlen		= sizeof(int),
2892		.mode		= 0644,
2893		.proc_handler	= proc_dointvec_jiffies,
2894	},
2895	{
2896		.procname	= "redirect_load",
2897		.data		= &ip_rt_redirect_load,
2898		.maxlen		= sizeof(int),
2899		.mode		= 0644,
2900		.proc_handler	= proc_dointvec,
2901	},
2902	{
2903		.procname	= "redirect_number",
2904		.data		= &ip_rt_redirect_number,
2905		.maxlen		= sizeof(int),
2906		.mode		= 0644,
2907		.proc_handler	= proc_dointvec,
2908	},
2909	{
2910		.procname	= "redirect_silence",
2911		.data		= &ip_rt_redirect_silence,
2912		.maxlen		= sizeof(int),
2913		.mode		= 0644,
2914		.proc_handler	= proc_dointvec,
2915	},
2916	{
2917		.procname	= "error_cost",
2918		.data		= &ip_rt_error_cost,
2919		.maxlen		= sizeof(int),
2920		.mode		= 0644,
2921		.proc_handler	= proc_dointvec,
2922	},
2923	{
2924		.procname	= "error_burst",
2925		.data		= &ip_rt_error_burst,
2926		.maxlen		= sizeof(int),
2927		.mode		= 0644,
2928		.proc_handler	= proc_dointvec,
2929	},
2930	{
2931		.procname	= "gc_elasticity",
2932		.data		= &ip_rt_gc_elasticity,
2933		.maxlen		= sizeof(int),
2934		.mode		= 0644,
2935		.proc_handler	= proc_dointvec,
2936	},
2937	{
2938		.procname	= "mtu_expires",
2939		.data		= &ip_rt_mtu_expires,
2940		.maxlen		= sizeof(int),
2941		.mode		= 0644,
2942		.proc_handler	= proc_dointvec_jiffies,
2943	},
2944	{
2945		.procname	= "min_pmtu",
2946		.data		= &ip_rt_min_pmtu,
2947		.maxlen		= sizeof(int),
2948		.mode		= 0644,
2949		.proc_handler	= proc_dointvec_minmax,
2950		.extra1		= &ip_min_valid_pmtu,
2951	},
2952	{
2953		.procname	= "min_adv_mss",
2954		.data		= &ip_rt_min_advmss,
2955		.maxlen		= sizeof(int),
2956		.mode		= 0644,
2957		.proc_handler	= proc_dointvec,
2958	},
2959	{ }
2960};
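/* The table above is registered under "net/ipv4/route" (see
 * ip_static_sysctl_init() at the bottom of this file), so these knobs appear
 * as e.g. /proc/sys/net/ipv4/route/gc_thresh.
 */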
2961
2962static struct ctl_table ipv4_route_flush_table[] = {
2963	{
2964		.procname	= "flush",
2965		.maxlen		= sizeof(int),
2966		.mode		= 0200,
2967		.proc_handler	= ipv4_sysctl_rtcache_flush,
2968	},
2969	{ },
2970};
2971
2972static __net_init int sysctl_route_net_init(struct net *net)
2973{
2974	struct ctl_table *tbl;
2975
2976	tbl = ipv4_route_flush_table;
2977	if (!net_eq(net, &init_net)) {
2978		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2979		if (!tbl)
2980			goto err_dup;
2981
2982		/* Don't export sysctls to unprivileged users */
2983		if (net->user_ns != &init_user_ns)
2984			tbl[0].procname = NULL;
2985	}
2986	tbl[0].extra1 = net;
2987
2988	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2989	if (!net->ipv4.route_hdr)
2990		goto err_reg;
2991	return 0;
2992
2993err_reg:
2994	if (tbl != ipv4_route_flush_table)
2995		kfree(tbl);
2996err_dup:
2997	return -ENOMEM;
2998}
2999
3000static __net_exit void sysctl_route_net_exit(struct net *net)
3001{
3002	struct ctl_table *tbl;
3003
3004	tbl = net->ipv4.route_hdr->ctl_table_arg;
3005	unregister_net_sysctl_table(net->ipv4.route_hdr);
3006	BUG_ON(tbl == ipv4_route_flush_table);
3007	kfree(tbl);
3008}
3009
3010static __net_initdata struct pernet_operations sysctl_route_ops = {
3011	.init = sysctl_route_net_init,
3012	.exit = sysctl_route_net_exit,
3013};
3014#endif
3015
3016static __net_init int rt_genid_init(struct net *net)
3017{
3018	atomic_set(&net->ipv4.rt_genid, 0);
3019	atomic_set(&net->fnhe_genid, 0);
3020	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3021	return 0;
3022}
3023
3024static __net_initdata struct pernet_operations rt_genid_ops = {
3025	.init = rt_genid_init,
3026};
3027
3028static int __net_init ipv4_inetpeer_init(struct net *net)
3029{
3030	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3031
3032	if (!bp)
3033		return -ENOMEM;
3034	inet_peer_base_init(bp);
3035	net->ipv4.peers = bp;
3036	return 0;
3037}
3038
3039static void __net_exit ipv4_inetpeer_exit(struct net *net)
3040{
3041	struct inet_peer_base *bp = net->ipv4.peers;
3042
3043	net->ipv4.peers = NULL;
3044	inetpeer_invalidate_tree(bp);
3045	kfree(bp);
3046}
3047
3048static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3049	.init	=	ipv4_inetpeer_init,
3050	.exit	=	ipv4_inetpeer_exit,
3051};
3052
3053#ifdef CONFIG_IP_ROUTE_CLASSID
3054struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3055#endif /* CONFIG_IP_ROUTE_CLASSID */
3056
3057int __init ip_rt_init(void)
3058{
3059	int cpu;
3060
3061	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3062	if (!ip_idents)
3063		panic("IP: failed to allocate ip_idents\n");
3064
3065	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3066
3067	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3068	if (!ip_tstamps)
3069		panic("IP: failed to allocate ip_tstamps\n");
3070
3071	for_each_possible_cpu(cpu) {
3072		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3073
3074		INIT_LIST_HEAD(&ul->head);
3075		spin_lock_init(&ul->lock);
3076	}
3077#ifdef CONFIG_IP_ROUTE_CLASSID
3078	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3079	if (!ip_rt_acct)
3080		panic("IP: failed to allocate ip_rt_acct\n");
3081#endif
3082
3083	ipv4_dst_ops.kmem_cachep =
3084		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3085				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3086
3087	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3088
3089	if (dst_entries_init(&ipv4_dst_ops) < 0)
3090		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3091
3092	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3093		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3094
3095	ipv4_dst_ops.gc_thresh = ~0;
3096	ip_rt_max_size = INT_MAX;
3097
3098	devinet_init();
3099	ip_fib_init();
3100
3101	if (ip_rt_proc_init())
3102		pr_err("Unable to create route proc files\n");
3103#ifdef CONFIG_XFRM
3104	xfrm_init();
3105	xfrm4_init();
3106#endif
3107	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3108		      RTNL_FLAG_DOIT_UNLOCKED);
3109
3110#ifdef CONFIG_SYSCTL
3111	register_pernet_subsys(&sysctl_route_ops);
3112#endif
3113	register_pernet_subsys(&rt_genid_ops);
3114	register_pernet_subsys(&ipv4_inetpeer_ops);
3115	return 0;
3116}
3117
3118#ifdef CONFIG_SYSCTL
3119/*
3120 * We really need to sanitize the damn ipv4 init order, then all
3121 * this nonsense will go away.
3122 */
3123void __init ip_static_sysctl_init(void)
3124{
3125	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3126}
3127#endif
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
 
  92#include <net/dst.h>
 
  93#include <net/net_namespace.h>
  94#include <net/protocol.h>
  95#include <net/ip.h>
  96#include <net/route.h>
  97#include <net/inetpeer.h>
  98#include <net/sock.h>
  99#include <net/ip_fib.h>
 100#include <net/arp.h>
 101#include <net/tcp.h>
 102#include <net/icmp.h>
 103#include <net/xfrm.h>
 
 104#include <net/netevent.h>
 105#include <net/rtnetlink.h>
 106#ifdef CONFIG_SYSCTL
 107#include <linux/sysctl.h>
 108#include <linux/kmemleak.h>
 109#endif
 110#include <net/secure_seq.h>
 
 
 
 
 111
 112#define RT_FL_TOS(oldflp4) \
 113	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 114
 115#define RT_GC_TIMEOUT (300*HZ)
 116
 117static int ip_rt_max_size;
 118static int ip_rt_redirect_number __read_mostly	= 9;
 119static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 120static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 121static int ip_rt_error_cost __read_mostly	= HZ;
 122static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 123static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 124static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 125static int ip_rt_min_advmss __read_mostly	= 256;
 126
 
 
 127/*
 128 *	Interface to generic destination cache.
 129 */
 130
 131static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 132static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 133static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 134static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 135static void		 ipv4_link_failure(struct sk_buff *skb);
 136static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 137					   struct sk_buff *skb, u32 mtu);
 138static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 139					struct sk_buff *skb);
 140static void		ipv4_dst_destroy(struct dst_entry *dst);
 141
 142static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 143{
 144	WARN_ON(1);
 145	return NULL;
 146}
 147
 148static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 149					   struct sk_buff *skb,
 150					   const void *daddr);
 
 151
 152static struct dst_ops ipv4_dst_ops = {
 153	.family =		AF_INET,
 154	.protocol =		cpu_to_be16(ETH_P_IP),
 155	.check =		ipv4_dst_check,
 156	.default_advmss =	ipv4_default_advmss,
 157	.mtu =			ipv4_mtu,
 158	.cow_metrics =		ipv4_cow_metrics,
 159	.destroy =		ipv4_dst_destroy,
 160	.negative_advice =	ipv4_negative_advice,
 161	.link_failure =		ipv4_link_failure,
 162	.update_pmtu =		ip_rt_update_pmtu,
 163	.redirect =		ip_do_redirect,
 164	.local_out =		__ip_local_out,
 165	.neigh_lookup =		ipv4_neigh_lookup,
 
 166};
 167
 168#define ECN_OR_COST(class)	TC_PRIO_##class
 169
 170const __u8 ip_tos2prio[16] = {
 171	TC_PRIO_BESTEFFORT,
 172	ECN_OR_COST(BESTEFFORT),
 173	TC_PRIO_BESTEFFORT,
 174	ECN_OR_COST(BESTEFFORT),
 175	TC_PRIO_BULK,
 176	ECN_OR_COST(BULK),
 177	TC_PRIO_BULK,
 178	ECN_OR_COST(BULK),
 179	TC_PRIO_INTERACTIVE,
 180	ECN_OR_COST(INTERACTIVE),
 181	TC_PRIO_INTERACTIVE,
 182	ECN_OR_COST(INTERACTIVE),
 183	TC_PRIO_INTERACTIVE_BULK,
 184	ECN_OR_COST(INTERACTIVE_BULK),
 185	TC_PRIO_INTERACTIVE_BULK,
 186	ECN_OR_COST(INTERACTIVE_BULK)
 187};
 188EXPORT_SYMBOL(ip_tos2prio);
 189
 190static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 191#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 192
 193#ifdef CONFIG_PROC_FS
 194static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 195{
 196	if (*pos)
 197		return NULL;
 198	return SEQ_START_TOKEN;
 199}
 200
 201static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 202{
 203	++*pos;
 204	return NULL;
 205}
 206
 207static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 208{
 209}
 210
 211static int rt_cache_seq_show(struct seq_file *seq, void *v)
 212{
 213	if (v == SEQ_START_TOKEN)
 214		seq_printf(seq, "%-127s\n",
 215			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 216			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 217			   "HHUptod\tSpecDst");
 218	return 0;
 219}
 220
 221static const struct seq_operations rt_cache_seq_ops = {
 222	.start  = rt_cache_seq_start,
 223	.next   = rt_cache_seq_next,
 224	.stop   = rt_cache_seq_stop,
 225	.show   = rt_cache_seq_show,
 226};
 227
 228static int rt_cache_seq_open(struct inode *inode, struct file *file)
 229{
 230	return seq_open(file, &rt_cache_seq_ops);
 231}
 232
 233static const struct file_operations rt_cache_seq_fops = {
 234	.owner	 = THIS_MODULE,
 235	.open	 = rt_cache_seq_open,
 236	.read	 = seq_read,
 237	.llseek	 = seq_lseek,
 238	.release = seq_release,
 239};
 240
 241
 242static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 243{
 244	int cpu;
 245
 246	if (*pos == 0)
 247		return SEQ_START_TOKEN;
 248
 249	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 250		if (!cpu_possible(cpu))
 251			continue;
 252		*pos = cpu+1;
 253		return &per_cpu(rt_cache_stat, cpu);
 254	}
 255	return NULL;
 256}
 257
 258static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 259{
 260	int cpu;
 261
 262	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 263		if (!cpu_possible(cpu))
 264			continue;
 265		*pos = cpu+1;
 266		return &per_cpu(rt_cache_stat, cpu);
 267	}
 268	return NULL;
 269
 270}
 271
 272static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 273{
 274
 275}
 276
 277static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 278{
 279	struct rt_cache_stat *st = v;
 280
 281	if (v == SEQ_START_TOKEN) {
 282		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 283		return 0;
 284	}
 285
 286	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 287		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 288		   dst_entries_get_slow(&ipv4_dst_ops),
 289		   0, /* st->in_hit */
 290		   st->in_slow_tot,
 291		   st->in_slow_mc,
 292		   st->in_no_route,
 293		   st->in_brd,
 294		   st->in_martian_dst,
 295		   st->in_martian_src,
 296
 297		   0, /* st->out_hit */
 298		   st->out_slow_tot,
 299		   st->out_slow_mc,
 300
 301		   0, /* st->gc_total */
 302		   0, /* st->gc_ignored */
 303		   0, /* st->gc_goal_miss */
 304		   0, /* st->gc_dst_overflow */
 305		   0, /* st->in_hlist_search */
 306		   0  /* st->out_hlist_search */
 307		);
 308	return 0;
 309}
 310
 311static const struct seq_operations rt_cpu_seq_ops = {
 312	.start  = rt_cpu_seq_start,
 313	.next   = rt_cpu_seq_next,
 314	.stop   = rt_cpu_seq_stop,
 315	.show   = rt_cpu_seq_show,
 316};
 317
 318
 319static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 320{
 321	return seq_open(file, &rt_cpu_seq_ops);
 322}
 323
 324static const struct file_operations rt_cpu_seq_fops = {
 325	.owner	 = THIS_MODULE,
 326	.open	 = rt_cpu_seq_open,
 327	.read	 = seq_read,
 328	.llseek	 = seq_lseek,
 329	.release = seq_release,
 330};
 331
 332#ifdef CONFIG_IP_ROUTE_CLASSID
 333static int rt_acct_proc_show(struct seq_file *m, void *v)
 334{
 335	struct ip_rt_acct *dst, *src;
 336	unsigned int i, j;
 337
 338	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 339	if (!dst)
 340		return -ENOMEM;
 341
 342	for_each_possible_cpu(i) {
 343		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 344		for (j = 0; j < 256; j++) {
 345			dst[j].o_bytes   += src[j].o_bytes;
 346			dst[j].o_packets += src[j].o_packets;
 347			dst[j].i_bytes   += src[j].i_bytes;
 348			dst[j].i_packets += src[j].i_packets;
 349		}
 350	}
 351
 352	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 353	kfree(dst);
 354	return 0;
 355}
 356
 357static int rt_acct_proc_open(struct inode *inode, struct file *file)
 358{
 359	return single_open(file, rt_acct_proc_show, NULL);
 360}
 361
 362static const struct file_operations rt_acct_proc_fops = {
 363	.owner		= THIS_MODULE,
 364	.open		= rt_acct_proc_open,
 365	.read		= seq_read,
 366	.llseek		= seq_lseek,
 367	.release	= single_release,
 368};
 369#endif
 370
 371static int __net_init ip_rt_do_proc_init(struct net *net)
 372{
 373	struct proc_dir_entry *pde;
 374
 375	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 376			  &rt_cache_seq_fops);
 377	if (!pde)
 378		goto err1;
 379
 380	pde = proc_create("rt_cache", S_IRUGO,
 381			  net->proc_net_stat, &rt_cpu_seq_fops);
 382	if (!pde)
 383		goto err2;
 384
 385#ifdef CONFIG_IP_ROUTE_CLASSID
 386	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 387	if (!pde)
 388		goto err3;
 389#endif
 390	return 0;
 391
 392#ifdef CONFIG_IP_ROUTE_CLASSID
 393err3:
 394	remove_proc_entry("rt_cache", net->proc_net_stat);
 395#endif
 396err2:
 397	remove_proc_entry("rt_cache", net->proc_net);
 398err1:
 399	return -ENOMEM;
 400}
 401
 402static void __net_exit ip_rt_do_proc_exit(struct net *net)
 403{
 404	remove_proc_entry("rt_cache", net->proc_net_stat);
 405	remove_proc_entry("rt_cache", net->proc_net);
 406#ifdef CONFIG_IP_ROUTE_CLASSID
 407	remove_proc_entry("rt_acct", net->proc_net);
 408#endif
 409}
 410
 411static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 412	.init = ip_rt_do_proc_init,
 413	.exit = ip_rt_do_proc_exit,
 414};
 415
 416static int __init ip_rt_proc_init(void)
 417{
 418	return register_pernet_subsys(&ip_rt_proc_ops);
 419}
 420
 421#else
 422static inline int ip_rt_proc_init(void)
 423{
 424	return 0;
 425}
 426#endif /* CONFIG_PROC_FS */
 427
 428static inline bool rt_is_expired(const struct rtable *rth)
 429{
 430	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 431}
 432
 433void rt_cache_flush(struct net *net)
 434{
 435	rt_genid_bump_ipv4(net);
 436}
 437
 438static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 439					   struct sk_buff *skb,
 440					   const void *daddr)
 441{
 442	struct net_device *dev = dst->dev;
 443	const __be32 *pkey = daddr;
 444	const struct rtable *rt;
 445	struct neighbour *n;
 446
 447	rt = (const struct rtable *) dst;
 448	if (rt->rt_gateway)
 449		pkey = (const __be32 *) &rt->rt_gateway;
 450	else if (skb)
 451		pkey = &ip_hdr(skb)->daddr;
 452
 453	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 454	if (n)
 455		return n;
 456	return neigh_create(&arp_tbl, pkey, dev);
 457}
 458
 459/*
 460 * Peer allocation may fail only in serious out-of-memory conditions.  However
 461 * we still can generate some output.
 462 * Random ID selection looks a bit dangerous because we have no chances to
 463 * select ID being unique in a reasonable period of time.
 464 * But broken packet identifier may be better than no packet at all.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 465 */
 466static void ip_select_fb_ident(struct iphdr *iph)
 467{
 468	static DEFINE_SPINLOCK(ip_fb_id_lock);
 469	static u32 ip_fallback_id;
 470	u32 salt;
 
 
 
 
 
 
 
 
 
 
 
 471
 472	spin_lock_bh(&ip_fb_id_lock);
 473	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
 474	iph->id = htons(salt & 0xFFFF);
 475	ip_fallback_id = salt;
 476	spin_unlock_bh(&ip_fb_id_lock);
 477}
 
 478
 479void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 480{
 481	struct net *net = dev_net(dst->dev);
 482	struct inet_peer *peer;
 483
 484	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
 485	if (peer) {
 486		iph->id = htons(inet_getid(peer, more));
 487		inet_putpeer(peer);
 488		return;
 489	}
 490
 491	ip_select_fb_ident(iph);
 
 
 
 
 
 492}
 493EXPORT_SYMBOL(__ip_select_ident);
 494
 495static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 
 496			     const struct iphdr *iph,
 497			     int oif, u8 tos,
 498			     u8 prot, u32 mark, int flow_flags)
 499{
 500	if (sk) {
 501		const struct inet_sock *inet = inet_sk(sk);
 502
 503		oif = sk->sk_bound_dev_if;
 504		mark = sk->sk_mark;
 505		tos = RT_CONN_FLAGS(sk);
 506		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 507	}
 508	flowi4_init_output(fl4, oif, mark, tos,
 509			   RT_SCOPE_UNIVERSE, prot,
 510			   flow_flags,
 511			   iph->daddr, iph->saddr, 0, 0);
 
 512}
 513
 514static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 515			       const struct sock *sk)
 516{
 
 517	const struct iphdr *iph = ip_hdr(skb);
 518	int oif = skb->dev->ifindex;
 519	u8 tos = RT_TOS(iph->tos);
 520	u8 prot = iph->protocol;
 521	u32 mark = skb->mark;
 522
 523	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 524}
 525
 526static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 527{
 528	const struct inet_sock *inet = inet_sk(sk);
 529	const struct ip_options_rcu *inet_opt;
 530	__be32 daddr = inet->inet_daddr;
 531
 532	rcu_read_lock();
 533	inet_opt = rcu_dereference(inet->inet_opt);
 534	if (inet_opt && inet_opt->opt.srr)
 535		daddr = inet_opt->opt.faddr;
 536	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 537			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 538			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 539			   inet_sk_flowi_flags(sk),
 540			   daddr, inet->inet_saddr, 0, 0);
 541	rcu_read_unlock();
 542}
 543
 544static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 545				 const struct sk_buff *skb)
 546{
 547	if (skb)
 548		build_skb_flow_key(fl4, skb, sk);
 549	else
 550		build_sk_flow_key(fl4, sk);
 551}
 552
 553static inline void rt_free(struct rtable *rt)
 554{
 555	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 556}
 557
 558static DEFINE_SPINLOCK(fnhe_lock);
 559
 560static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 561{
 562	struct rtable *rt;
 563
 564	rt = rcu_dereference(fnhe->fnhe_rth_input);
 565	if (rt) {
 566		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 567		rt_free(rt);
 
 568	}
 569	rt = rcu_dereference(fnhe->fnhe_rth_output);
 570	if (rt) {
 571		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 572		rt_free(rt);
 
 573	}
 574}
 575
 576static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 577{
 578	struct fib_nh_exception *fnhe, *oldest;
 579
 580	oldest = rcu_dereference(hash->chain);
 581	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 582	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 583		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 584			oldest = fnhe;
 585	}
 586	fnhe_flush_routes(oldest);
 587	return oldest;
 588}
 589
 590static inline u32 fnhe_hashfun(__be32 daddr)
 591{
 
 592	u32 hval;
 593
 594	hval = (__force u32) daddr;
 595	hval ^= (hval >> 11) ^ (hval >> 22);
 596
 597	return hval & (FNHE_HASH_SIZE - 1);
 598}
 599
 600static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 601{
 602	rt->rt_pmtu = fnhe->fnhe_pmtu;
 
 603	rt->dst.expires = fnhe->fnhe_expires;
 604
 605	if (fnhe->fnhe_gw) {
 606		rt->rt_flags |= RTCF_REDIRECTED;
 607		rt->rt_gateway = fnhe->fnhe_gw;
 608		rt->rt_uses_gateway = 1;
 609	}
 610}
 611
 612static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 613				  u32 pmtu, unsigned long expires)
 614{
 615	struct fnhe_hash_bucket *hash;
 616	struct fib_nh_exception *fnhe;
 617	struct rtable *rt;
 
 618	unsigned int i;
 619	int depth;
 620	u32 hval = fnhe_hashfun(daddr);
 
 
 621
 622	spin_lock_bh(&fnhe_lock);
 623
 624	hash = nh->nh_exceptions;
 625	if (!hash) {
 626		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 627		if (!hash)
 628			goto out_unlock;
 629		nh->nh_exceptions = hash;
 630	}
 631
 632	hash += hval;
 633
 634	depth = 0;
 635	for (fnhe = rcu_dereference(hash->chain); fnhe;
 636	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 637		if (fnhe->fnhe_daddr == daddr)
 638			break;
 639		depth++;
 640	}
 641
 642	if (fnhe) {
 
 
 643		if (gw)
 644			fnhe->fnhe_gw = gw;
 645		if (pmtu) {
 646			fnhe->fnhe_pmtu = pmtu;
 647			fnhe->fnhe_expires = max(1UL, expires);
 648		}
 
 649		/* Update all cached dsts too */
 650		rt = rcu_dereference(fnhe->fnhe_rth_input);
 651		if (rt)
 652			fill_route_from_fnhe(rt, fnhe);
 653		rt = rcu_dereference(fnhe->fnhe_rth_output);
 654		if (rt)
 655			fill_route_from_fnhe(rt, fnhe);
 656	} else {
 657		if (depth > FNHE_RECLAIM_DEPTH)
 658			fnhe = fnhe_oldest(hash);
 659		else {
 660			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 661			if (!fnhe)
 662				goto out_unlock;
 663
 664			fnhe->fnhe_next = hash->chain;
 665			rcu_assign_pointer(hash->chain, fnhe);
 666		}
 667		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 668		fnhe->fnhe_daddr = daddr;
 669		fnhe->fnhe_gw = gw;
 670		fnhe->fnhe_pmtu = pmtu;
 671		fnhe->fnhe_expires = expires;
 
 672
 673		/* Exception created; mark the cached routes for the nexthop
 674		 * stale, so anyone caching it rechecks if this exception
 675		 * applies to them.
 676		 */
 677		rt = rcu_dereference(nh->nh_rth_input);
 678		if (rt)
 679			rt->dst.obsolete = DST_OBSOLETE_KILL;
 680
 681		for_each_possible_cpu(i) {
 682			struct rtable __rcu **prt;
 683			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 684			rt = rcu_dereference(*prt);
 685			if (rt)
 686				rt->dst.obsolete = DST_OBSOLETE_KILL;
 687		}
 688	}
 689
 690	fnhe->fnhe_stamp = jiffies;
 691
 692out_unlock:
 693	spin_unlock_bh(&fnhe_lock);
 694}
 695
 696static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 697			     bool kill_route)
 698{
 699	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 700	__be32 old_gw = ip_hdr(skb)->saddr;
 701	struct net_device *dev = skb->dev;
 702	struct in_device *in_dev;
 703	struct fib_result res;
 704	struct neighbour *n;
 705	struct net *net;
 706
 707	switch (icmp_hdr(skb)->code & 7) {
 708	case ICMP_REDIR_NET:
 709	case ICMP_REDIR_NETTOS:
 710	case ICMP_REDIR_HOST:
 711	case ICMP_REDIR_HOSTTOS:
 712		break;
 713
 714	default:
 715		return;
 716	}
 717
 718	if (rt->rt_gateway != old_gw)
 719		return;
 720
 721	in_dev = __in_dev_get_rcu(dev);
 722	if (!in_dev)
 723		return;
 724
 725	net = dev_net(dev);
 726	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 727	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 728	    ipv4_is_zeronet(new_gw))
 729		goto reject_redirect;
 730
 731	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 732		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 733			goto reject_redirect;
 734		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 735			goto reject_redirect;
 736	} else {
 737		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 738			goto reject_redirect;
 739	}
 740
 741	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 742	if (n) {
 
 
 743		if (!(n->nud_state & NUD_VALID)) {
 744			neigh_event_send(n, NULL);
 745		} else {
 746			if (fib_lookup(net, fl4, &res) == 0) {
 747				struct fib_nh *nh = &FIB_RES_NH(res);
 748
 749				update_or_create_fnhe(nh, fl4->daddr, new_gw,
 750						      0, 0);
 751			}
 752			if (kill_route)
 753				rt->dst.obsolete = DST_OBSOLETE_KILL;
 754			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 755		}
 756		neigh_release(n);
 757	}
 758	return;
 759
 760reject_redirect:
 761#ifdef CONFIG_IP_ROUTE_VERBOSE
 762	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 763		const struct iphdr *iph = (const struct iphdr *) skb->data;
 764		__be32 daddr = iph->daddr;
 765		__be32 saddr = iph->saddr;
 766
 767		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 768				     "  Advised path = %pI4 -> %pI4\n",
 769				     &old_gw, dev->name, &new_gw,
 770				     &saddr, &daddr);
 771	}
 772#endif
 773	;
 774}
 775
 776static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 777{
 778	struct rtable *rt;
 779	struct flowi4 fl4;
 780	const struct iphdr *iph = (const struct iphdr *) skb->data;
 781	int oif = skb->dev->ifindex;
 782	u8 tos = RT_TOS(iph->tos);
 783	u8 prot = iph->protocol;
 784	u32 mark = skb->mark;
 785
 786	rt = (struct rtable *) dst;
 787
 788	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 789	__ip_do_redirect(rt, skb, &fl4, true);
 790}
 791
 792static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 793{
 794	struct rtable *rt = (struct rtable *)dst;
 795	struct dst_entry *ret = dst;
 796
 797	if (rt) {
 798		if (dst->obsolete > 0) {
 799			ip_rt_put(rt);
 800			ret = NULL;
 801		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 802			   rt->dst.expires) {
 803			ip_rt_put(rt);
 804			ret = NULL;
 805		}
 806	}
 807	return ret;
 808}
 809
 810/*
 811 * Algorithm:
 812 *	1. The first ip_rt_redirect_number redirects are sent
 813 *	   with exponential backoff, then we stop sending them at all,
 814 *	   assuming that the host ignores our redirects.
 815 *	2. If we did not see packets requiring redirects
 816 *	   during ip_rt_redirect_silence, we assume that the host
 817 *	   forgot the redirected route and start to send redirects again.
 818 *
 819 * This algorithm is much cheaper and more intelligent than dumb load limiting
 820 * in icmp.c.
 821 *
 822 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 823 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 824 */
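/* Concretely: after the first redirect, the (n+1)-th one is sent only once
 * jiffies has advanced past rate_last + (ip_rt_redirect_load << n), where
 * n is the number of redirects already charged to this peer; the gap thus
 * doubles each time until ip_rt_redirect_number redirects have been sent.
 */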
 825
 826void ip_rt_send_redirect(struct sk_buff *skb)
 827{
 828	struct rtable *rt = skb_rtable(skb);
 829	struct in_device *in_dev;
 830	struct inet_peer *peer;
 831	struct net *net;
 832	int log_martians;
 833
 834	rcu_read_lock();
 835	in_dev = __in_dev_get_rcu(rt->dst.dev);
 836	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 837		rcu_read_unlock();
 838		return;
 839	}
 840	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 841	rcu_read_unlock();
 842
 843	net = dev_net(rt->dst.dev);
 844	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 845	if (!peer) {
 846		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 847			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 848		return;
 849	}
 850
 851	/* No redirected packets during ip_rt_redirect_silence;
 852	 * reset the algorithm.
 853	 */
 854	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 855		peer->rate_tokens = 0;
 856
 857	/* Too many ignored redirects; do not send anything.
 858	 * Set dst.rate_last to the last seen redirected packet.
 859	 */
 860	if (peer->rate_tokens >= ip_rt_redirect_number) {
 861		peer->rate_last = jiffies;
 862		goto out_put_peer;
 863	}
 864
 865	/* Check for load limit; set rate_last to the latest sent
 866	 * redirect.
 867	 */
 868	if (peer->rate_tokens == 0 ||
 869	    time_after(jiffies,
 870		       (peer->rate_last +
 871			(ip_rt_redirect_load << peer->rate_tokens)))) {
 872		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 873
 874		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 875		peer->rate_last = jiffies;
 876		++peer->rate_tokens;
 877#ifdef CONFIG_IP_ROUTE_VERBOSE
 878		if (log_martians &&
 879		    peer->rate_tokens == ip_rt_redirect_number)
 880			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 881					     &ip_hdr(skb)->saddr, inet_iif(skb),
 882					     &ip_hdr(skb)->daddr, &gw);
 883#endif
 884	}
 885out_put_peer:
 886	inet_putpeer(peer);
 887}
 888
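/* Generate an ICMP destination-unreachable error for a packet that hit an
 * error route.  Errors are rate limited per source address through the
 * inet_peer token bucket: tokens accrue with elapsed jiffies, capped at
 * ip_rt_error_burst, and each transmitted ICMP costs ip_rt_error_cost.
 * The skb is always consumed.
 */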
 889static int ip_error(struct sk_buff *skb)
 890{
 891	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 892	struct rtable *rt = skb_rtable(skb);
 893	struct inet_peer *peer;
 894	unsigned long now;
 895	struct net *net;
 896	bool send;
 897	int code;
 898
 899	net = dev_net(rt->dst.dev);
 900	if (!IN_DEV_FORWARD(in_dev)) {
 901		switch (rt->dst.error) {
 902		case EHOSTUNREACH:
 903			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 904			break;
 905
 906		case ENETUNREACH:
 907			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 908			break;
 909		}
 910		goto out;
 911	}
 912
 913	switch (rt->dst.error) {
 914	case EINVAL:
 915	default:
 916		goto out;
 917	case EHOSTUNREACH:
 918		code = ICMP_HOST_UNREACH;
 919		break;
 920	case ENETUNREACH:
 921		code = ICMP_NET_UNREACH;
 922		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 923		break;
 924	case EACCES:
 925		code = ICMP_PKT_FILTERED;
 926		break;
 927	}
 928
 929	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
 930
 931	send = true;
 932	if (peer) {
 933		now = jiffies;
 934		peer->rate_tokens += now - peer->rate_last;
 935		if (peer->rate_tokens > ip_rt_error_burst)
 936			peer->rate_tokens = ip_rt_error_burst;
 937		peer->rate_last = now;
 938		if (peer->rate_tokens >= ip_rt_error_cost)
 939			peer->rate_tokens -= ip_rt_error_cost;
 940		else
 941			send = false;
 942		inet_putpeer(peer);
 943	}
 944	if (send)
 945		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 946
 947out:	kfree_skb(skb);
 948	return 0;
 949}
 950
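/* Record a learned path MTU as a nexthop exception.  The update is skipped
 * when the RTAX_MTU metric is locked or the reported MTU exceeds the device
 * MTU; otherwise the value is clamped to at least ip_rt_min_pmtu and stored
 * with an expiry of jiffies + ip_rt_mtu_expires.
 */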
 951static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 952{
 953	struct dst_entry *dst = &rt->dst;
 954	struct fib_result res;
 955
 956	if (dst_metric_locked(dst, RTAX_MTU))
 957		return;
 958
 959	if (dst->dev->mtu < mtu)
 960		return;
 961
 962	if (mtu < ip_rt_min_pmtu)
 963		mtu = ip_rt_min_pmtu;
 964
 965	if (rt->rt_pmtu == mtu &&
 966	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
 967		return;
 968
 969	rcu_read_lock();
 970	if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
 971		struct fib_nh *nh = &FIB_RES_NH(res);
 972
 973		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 974				      jiffies + ip_rt_mtu_expires);
 975	}
 976	rcu_read_unlock();
 977}
 978
 979static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 980			      struct sk_buff *skb, u32 mtu)
 981{
 982	struct rtable *rt = (struct rtable *) dst;
 983	struct flowi4 fl4;
 984
 985	ip_rt_build_flow_key(&fl4, sk, skb);
 986	__ip_rt_update_pmtu(rt, &fl4, mtu);
 987}
 988
 989void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
 990		      int oif, u32 mark, u8 protocol, int flow_flags)
 991{
 992	const struct iphdr *iph = (const struct iphdr *) skb->data;
 993	struct flowi4 fl4;
 994	struct rtable *rt;
 995
 996	__build_flow_key(&fl4, NULL, iph, oif,
 997			 RT_TOS(iph->tos), protocol, mark, flow_flags);
 998	rt = __ip_route_output_key(net, &fl4);
 999	if (!IS_ERR(rt)) {
1000		__ip_rt_update_pmtu(rt, &fl4, mtu);
1001		ip_rt_put(rt);
1002	}
1003}
1004EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1005
1006static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1007{
1008	const struct iphdr *iph = (const struct iphdr *) skb->data;
1009	struct flowi4 fl4;
1010	struct rtable *rt;
1011
1012	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1013	rt = __ip_route_output_key(sock_net(sk), &fl4);
1014	if (!IS_ERR(rt)) {
1015		__ip_rt_update_pmtu(rt, &fl4, mtu);
1016		ip_rt_put(rt);
1017	}
1018}
1019
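/* Socket-aware PMTU update.  If the socket is owned by user context or has
 * no cached route we fall back to a plain route lookup; otherwise the
 * socket's cached dst is revalidated, the new MTU is applied to it, and a
 * fresh route is installed with __sk_dst_set() if the old one is no longer
 * valid.
 */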
1020void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1021{
1022	const struct iphdr *iph = (const struct iphdr *) skb->data;
1023	struct flowi4 fl4;
1024	struct rtable *rt;
1025	struct dst_entry *dst;
1026	bool new = false;
1027
1028	bh_lock_sock(sk);
1029
1030	if (!ip_sk_accept_pmtu(sk))
1031		goto out;
1032
1033	rt = (struct rtable *) __sk_dst_get(sk);
1034
1035	if (sock_owned_by_user(sk) || !rt) {
1036		__ipv4_sk_update_pmtu(skb, sk, mtu);
1037		goto out;
1038	}
1039
1040	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1041
1042	if (!__sk_dst_check(sk, 0)) {
1043		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1044		if (IS_ERR(rt))
1045			goto out;
1046
1047		new = true;
1048	}
1049
1050	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1051
1052	dst = dst_check(&rt->dst, 0);
1053	if (!dst) {
1054		if (new)
1055			dst_release(&rt->dst);
1056
1057		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1058		if (IS_ERR(rt))
1059			goto out;
1060
1061		new = true;
1062	}
1063
1064	if (new)
1065		__sk_dst_set(sk, &rt->dst);
1066
1067out:
1068	bh_unlock_sock(sk);
1069}
1070EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1071
1072void ipv4_redirect(struct sk_buff *skb, struct net *net,
1073		   int oif, u32 mark, u8 protocol, int flow_flags)
1074{
1075	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076	struct flowi4 fl4;
1077	struct rtable *rt;
1078
1079	__build_flow_key(&fl4, NULL, iph, oif,
1080			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1081	rt = __ip_route_output_key(net, &fl4);
1082	if (!IS_ERR(rt)) {
1083		__ip_do_redirect(rt, skb, &fl4, false);
1084		ip_rt_put(rt);
1085	}
1086}
1087EXPORT_SYMBOL_GPL(ipv4_redirect);
1088
1089void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1090{
1091	const struct iphdr *iph = (const struct iphdr *) skb->data;
1092	struct flowi4 fl4;
1093	struct rtable *rt;
1094
1095	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1096	rt = __ip_route_output_key(sock_net(sk), &fl4);
1097	if (!IS_ERR(rt)) {
1098		__ip_do_redirect(rt, skb, &fl4, false);
1099		ip_rt_put(rt);
1100	}
1101}
1102EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1103
1104static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1105{
1106	struct rtable *rt = (struct rtable *) dst;
1107
1108	/* All IPV4 dsts are created with ->obsolete set to the value
1109	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1110	 * into this function always.
1111	 *
1112	 * When a PMTU/redirect information update invalidates a route,
1113	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1114	 * DST_OBSOLETE_DEAD by dst_free().
1115	 */
1116	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1117		return NULL;
1118	return dst;
1119}
1120
1121static void ipv4_link_failure(struct sk_buff *skb)
1122{
1123	struct rtable *rt;
1124
1125	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1126
1127	rt = skb_rtable(skb);
1128	if (rt)
1129		dst_set_expires(&rt->dst, 0);
1130}
1131
1132static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
1133{
1134	pr_debug("%s: %pI4 -> %pI4, %s\n",
1135		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1136		 skb->dev ? skb->dev->name : "?");
1137	kfree_skb(skb);
1138	WARN_ON(1);
1139	return 0;
1140}
1141
1142/*
1143   We do not cache the source address of the outgoing interface,
1144   because it is used only by the IP RR, TS and SRR options,
1145   so it is out of the fast path.
1146
1147   BTW remember: "addr" is allowed to be unaligned
1148   in IP options!
1149 */
1150
1151void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1152{
1153	__be32 src;
1154
1155	if (rt_is_output_route(rt))
1156		src = ip_hdr(skb)->saddr;
1157	else {
1158		struct fib_result res;
1159		struct flowi4 fl4;
1160		struct iphdr *iph;
1161
1162		iph = ip_hdr(skb);
1163
1164		memset(&fl4, 0, sizeof(fl4));
1165		fl4.daddr = iph->daddr;
1166		fl4.saddr = iph->saddr;
1167		fl4.flowi4_tos = RT_TOS(iph->tos);
1168		fl4.flowi4_oif = rt->dst.dev->ifindex;
1169		fl4.flowi4_iif = skb->dev->ifindex;
1170		fl4.flowi4_mark = skb->mark;
1171
1172		rcu_read_lock();
1173		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1174			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1175		else
1176			src = inet_select_addr(rt->dst.dev,
1177					       rt_nexthop(rt, iph->daddr),
1178					       RT_SCOPE_UNIVERSE);
1179		rcu_read_unlock();
1180	}
1181	memcpy(addr, &src, 4);
1182}
1183
1184#ifdef CONFIG_IP_ROUTE_CLASSID
1185static void set_class_tag(struct rtable *rt, u32 tag)
1186{
1187	if (!(rt->dst.tclassid & 0xFFFF))
1188		rt->dst.tclassid |= tag & 0xFFFF;
1189	if (!(rt->dst.tclassid & 0xFFFF0000))
1190		rt->dst.tclassid |= tag & 0xFFFF0000;
1191}
1192#endif
1193
1194static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1195{
1196	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1197
1198	if (advmss == 0) {
1199		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1200			       ip_rt_min_advmss);
1201		if (advmss > 65535 - 40)
1202			advmss = 65535 - 40;
1203	}
1204	return advmss;
1205}
1206
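/* Effective MTU for a route: a non-expired per-route PMTU wins, then the
 * RTAX_MTU metric, then the device MTU (clamped to 576 for gatewayed routes
 * whose MTU metric is locked), capped at IP_MAX_MTU.
 */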
1207static unsigned int ipv4_mtu(const struct dst_entry *dst)
1208{
1209	const struct rtable *rt = (const struct rtable *) dst;
1210	unsigned int mtu = rt->rt_pmtu;
1211
1212	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1213		mtu = dst_metric_raw(dst, RTAX_MTU);
1214
1215	if (mtu)
1216		return mtu;
1217
1218	mtu = dst->dev->mtu;
1219
1220	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1221		if (rt->rt_uses_gateway && mtu > 576)
1222			mtu = 576;
1223	}
1224
1225	return min_t(unsigned int, mtu, IP_MAX_MTU);
1226}
1227
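/* Look up the nexthop exception entry for daddr in this nexthop's fnhe
 * hash table, if any.  Runs under rcu_read_lock().
 */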
1228static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1229{
1230	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1231	struct fib_nh_exception *fnhe;
1232	u32 hval;
1233
1234	if (!hash)
1235		return NULL;
1236
1237	hval = fnhe_hashfun(daddr);
1238
1239	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1240	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1241		if (fnhe->fnhe_daddr == daddr)
1242			return fnhe;
1243	}
1244	return NULL;
1245}
1246
1247static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1248			      __be32 daddr)
1249{
1250	bool ret = false;
1251
1252	spin_lock_bh(&fnhe_lock);
1253
1254	if (daddr == fnhe->fnhe_daddr) {
1255		struct rtable __rcu **porig;
1256		struct rtable *orig;
1257		int genid = fnhe_genid(dev_net(rt->dst.dev));
1258
1259		if (rt_is_input_route(rt))
1260			porig = &fnhe->fnhe_rth_input;
1261		else
1262			porig = &fnhe->fnhe_rth_output;
1263		orig = rcu_dereference(*porig);
1264
1265		if (fnhe->fnhe_genid != genid) {
1266			fnhe->fnhe_genid = genid;
1267			fnhe->fnhe_gw = 0;
1268			fnhe->fnhe_pmtu = 0;
1269			fnhe->fnhe_expires = 0;
1270			fnhe_flush_routes(fnhe);
1271			orig = NULL;
1272		}
1273		fill_route_from_fnhe(rt, fnhe);
1274		if (!rt->rt_gateway)
1275			rt->rt_gateway = daddr;
1276
1277		if (!(rt->dst.flags & DST_NOCACHE)) {
1278			rcu_assign_pointer(*porig, rt);
1279			if (orig)
1280				rt_free(orig);
1281			ret = true;
1282		}
1283
1284		fnhe->fnhe_stamp = jiffies;
1285	}
1286	spin_unlock_bh(&fnhe_lock);
1287
1288	return ret;
1289}
1290
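/* Try to install rt as the cached route for its nexthop: input routes go
 * into the single nh_rth_input slot, output routes into the per-CPU
 * nh_pcpu_rth_output slot.  Returns false if the cmpxchg() lost a race with
 * another updater, in which case the caller treats the route as uncached.
 */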
1291static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1292{
1293	struct rtable *orig, *prev, **p;
1294	bool ret = true;
1295
1296	if (rt_is_input_route(rt)) {
1297		p = (struct rtable **)&nh->nh_rth_input;
1298	} else {
1299		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1300	}
1301	orig = *p;
1302
1303	prev = cmpxchg(p, orig, rt);
1304	if (prev == orig) {
1305		if (orig)
1306			rt_free(orig);
1307	} else
1308		ret = false;
1309
1310	return ret;
1311}
1312
1313static DEFINE_SPINLOCK(rt_uncached_lock);
1314static LIST_HEAD(rt_uncached_list);
1315
1316static void rt_add_uncached_list(struct rtable *rt)
1317{
1318	spin_lock_bh(&rt_uncached_lock);
1319	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1320	spin_unlock_bh(&rt_uncached_lock);
1321}
1322
1323static void ipv4_dst_destroy(struct dst_entry *dst)
1324{
1325	struct rtable *rt = (struct rtable *) dst;
1326
1327	if (!list_empty(&rt->rt_uncached)) {
1328		spin_lock_bh(&rt_uncached_lock);
1329		list_del(&rt->rt_uncached);
1330		spin_unlock_bh(&rt_uncached_lock);
1331	}
1332}
1333
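/* A device is going away: point any uncached routes that still reference it
 * at the loopback device, so that their dst can safely outlive the device.
 */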
1334void rt_flush_dev(struct net_device *dev)
1335{
1336	if (!list_empty(&rt_uncached_list)) {
1337		struct net *net = dev_net(dev);
1338		struct rtable *rt;
1339
1340		spin_lock_bh(&rt_uncached_lock);
1341		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1342			if (rt->dst.dev != dev)
1343				continue;
1344			rt->dst.dev = net->loopback_dev;
1345			dev_hold(rt->dst.dev);
1346			dev_put(dev);
1347		}
1348		spin_unlock_bh(&rt_uncached_lock);
1349	}
1350}
1351
1352static bool rt_cache_valid(const struct rtable *rt)
1353{
1354	return	rt &&
1355		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1356		!rt_is_expired(rt);
1357}
1358
1359static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1360			   const struct fib_result *res,
1361			   struct fib_nh_exception *fnhe,
1362			   struct fib_info *fi, u16 type, u32 itag)
1363{
1364	bool cached = false;
1365
1366	if (fi) {
1367		struct fib_nh *nh = &FIB_RES_NH(*res);
1368
1369		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1370			rt->rt_gateway = nh->nh_gw;
1371			rt->rt_uses_gateway = 1;
1372		}
1373		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1374#ifdef CONFIG_IP_ROUTE_CLASSID
1375		rt->dst.tclassid = nh->nh_tclassid;
1376#endif
1377		if (unlikely(fnhe))
1378			cached = rt_bind_exception(rt, fnhe, daddr);
1379		else if (!(rt->dst.flags & DST_NOCACHE))
1380			cached = rt_cache_route(nh, rt);
1381		if (unlikely(!cached)) {
1382			/* Routes we intend to cache in nexthop exception or
1383			 * FIB nexthop have the DST_NOCACHE bit clear.
1384			 * However, if we are unsuccessful at storing this
1385			 * route into the cache we really need to set it.
1386			 */
1387			rt->dst.flags |= DST_NOCACHE;
1388			if (!rt->rt_gateway)
1389				rt->rt_gateway = daddr;
1390			rt_add_uncached_list(rt);
1391		}
1392	} else
1393		rt_add_uncached_list(rt);
1394
1395#ifdef CONFIG_IP_ROUTE_CLASSID
1396#ifdef CONFIG_IP_MULTIPLE_TABLES
1397	set_class_tag(rt, res->tclassid);
1398#endif
1399	set_class_tag(rt, itag);
1400#endif
1401}
1402
1403static struct rtable *rt_dst_alloc(struct net_device *dev,
1404				   bool nopolicy, bool noxfrm, bool will_cache)
1405{
1406	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1407			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1408			 (nopolicy ? DST_NOPOLICY : 0) |
1409			 (noxfrm ? DST_NOXFRM : 0));
1410}
1411
1412/* called in rcu_read_lock() section */
1413static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1414				u8 tos, struct net_device *dev, int our)
1415{
1416	struct rtable *rth;
1417	struct in_device *in_dev = __in_dev_get_rcu(dev);
1418	u32 itag = 0;
1419	int err;
1420
1421	/* Primary sanity checks. */
1422
1423	if (in_dev == NULL)
1424		return -EINVAL;
1425
1426	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1427	    skb->protocol != htons(ETH_P_IP))
1428		goto e_inval;
1429
1430	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1431		if (ipv4_is_loopback(saddr))
1432			goto e_inval;
1433
1434	if (ipv4_is_zeronet(saddr)) {
1435		if (!ipv4_is_local_multicast(daddr))
1436			goto e_inval;
1437	} else {
1438		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1439					  in_dev, &itag);
1440		if (err < 0)
1441			goto e_err;
1442	}
1443	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1444			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1445	if (!rth)
1446		goto e_nobufs;
1447
1448#ifdef CONFIG_IP_ROUTE_CLASSID
1449	rth->dst.tclassid = itag;
1450#endif
1451	rth->dst.output = ip_rt_bug;
1452
1453	rth->rt_genid	= rt_genid_ipv4(dev_net(dev));
1454	rth->rt_flags	= RTCF_MULTICAST;
1455	rth->rt_type	= RTN_MULTICAST;
1456	rth->rt_is_input= 1;
1457	rth->rt_iif	= 0;
1458	rth->rt_pmtu	= 0;
1459	rth->rt_gateway	= 0;
1460	rth->rt_uses_gateway = 0;
1461	INIT_LIST_HEAD(&rth->rt_uncached);
1462	if (our) {
1463		rth->dst.input= ip_local_deliver;
1464		rth->rt_flags |= RTCF_LOCAL;
1465	}
1466
1467#ifdef CONFIG_IP_MROUTE
1468	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1469		rth->dst.input = ip_mr_input;
1470#endif
1471	RT_CACHE_STAT_INC(in_slow_mc);
1472
1473	skb_dst_set(skb, &rth->dst);
1474	return 0;
1475
1476e_nobufs:
1477	return -ENOBUFS;
1478e_inval:
1479	return -EINVAL;
1480e_err:
1481	return err;
1482}
1483
1484
1485static void ip_handle_martian_source(struct net_device *dev,
1486				     struct in_device *in_dev,
1487				     struct sk_buff *skb,
1488				     __be32 daddr,
1489				     __be32 saddr)
1490{
1491	RT_CACHE_STAT_INC(in_martian_src);
1492#ifdef CONFIG_IP_ROUTE_VERBOSE
1493	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1494		/*
1495		 *	RFC1812 recommendation, if source is martian,
1496		 *	the only hint is MAC header.
1497		 */
1498		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1499			&daddr, &saddr, dev->name);
1500		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1501			print_hex_dump(KERN_WARNING, "ll header: ",
1502				       DUMP_PREFIX_OFFSET, 16, 1,
1503				       skb_mac_header(skb),
1504				       dev->hard_header_len, true);
1505		}
1506	}
1507#endif
1508}
1509
1510/* called in rcu_read_lock() section */
1511static int __mkroute_input(struct sk_buff *skb,
1512			   const struct fib_result *res,
1513			   struct in_device *in_dev,
1514			   __be32 daddr, __be32 saddr, u32 tos)
1515{
1516	struct fib_nh_exception *fnhe;
1517	struct rtable *rth;
1518	int err;
1519	struct in_device *out_dev;
1520	unsigned int flags = 0;
1521	bool do_cache;
1522	u32 itag = 0;
1523
1524	/* get a working reference to the output device */
1525	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1526	if (out_dev == NULL) {
1527		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1528		return -EINVAL;
1529	}
1530
1531	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1532				  in_dev->dev, in_dev, &itag);
1533	if (err < 0) {
1534		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1535					 saddr);
1536
1537		goto cleanup;
1538	}
1539
1540	do_cache = res->fi && !itag;
1541	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1542	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1543	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1544		flags |= RTCF_DOREDIRECT;
1545		do_cache = false;
1546	}
1547
1548	if (skb->protocol != htons(ETH_P_IP)) {
1549		/* Not IP (i.e. ARP). Do not create a route if it is
1550		 * invalid for proxy arp. DNAT routes are always valid.
1551		 *
1552		 * The proxy arp feature has been extended to allow ARP
1553		 * replies back to the same interface, to support
1554		 * Private VLAN switch technologies. See arp.c.
1555		 */
1556		if (out_dev == in_dev &&
1557		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1558			err = -EINVAL;
1559			goto cleanup;
1560		}
1561	}
1562
1563	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1564	if (do_cache) {
1565		if (fnhe != NULL)
1566			rth = rcu_dereference(fnhe->fnhe_rth_input);
1567		else
1568			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1569
1570		if (rt_cache_valid(rth)) {
1571			skb_dst_set_noref(skb, &rth->dst);
1572			goto out;
1573		}
1574	}
1575
1576	rth = rt_dst_alloc(out_dev->dev,
1577			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1578			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1579	if (!rth) {
1580		err = -ENOBUFS;
1581		goto cleanup;
1582	}
1583
1584	rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1585	rth->rt_flags = flags;
1586	rth->rt_type = res->type;
1587	rth->rt_is_input = 1;
1588	rth->rt_iif 	= 0;
1589	rth->rt_pmtu	= 0;
1590	rth->rt_gateway	= 0;
1591	rth->rt_uses_gateway = 0;
1592	INIT_LIST_HEAD(&rth->rt_uncached);
1593	RT_CACHE_STAT_INC(in_slow_tot);
1594
1595	rth->dst.input = ip_forward;
1596	rth->dst.output = ip_output;
1597
1598	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1599	skb_dst_set(skb, &rth->dst);
1600out:
1601	err = 0;
1602 cleanup:
1603	return err;
1604}
1605
1606static int ip_mkroute_input(struct sk_buff *skb,
1607			    struct fib_result *res,
1608			    const struct flowi4 *fl4,
1609			    struct in_device *in_dev,
1610			    __be32 daddr, __be32 saddr, u32 tos)
1611{
1612#ifdef CONFIG_IP_ROUTE_MULTIPATH
1613	if (res->fi && res->fi->fib_nhs > 1)
1614		fib_select_multipath(res);
1615#endif
1616
1617	/* create a routing cache entry */
1618	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1619}
1620
1621/*
1622 *	NOTE. We drop all packets that have local source
1623 *	addresses, because every properly looped-back packet
1624 *	must already have the correct destination attached by the output routine.
1625 *
1626 *	Such an approach solves two big problems:
1627 *	1. Non-simplex devices are handled properly.
1628 *	2. IP spoofing attempts are filtered with a 100% guarantee.
1629 *	called with rcu_read_lock()
1630 */
1631
1632static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1633			       u8 tos, struct net_device *dev)
1634{
1635	struct fib_result res;
1636	struct in_device *in_dev = __in_dev_get_rcu(dev);
1637	struct flowi4	fl4;
1638	unsigned int	flags = 0;
1639	u32		itag = 0;
1640	struct rtable	*rth;
1641	int		err = -EINVAL;
1642	struct net    *net = dev_net(dev);
1643	bool do_cache;
1644
1645	/* IP on this device is disabled. */
1646
1647	if (!in_dev)
1648		goto out;
1649
1650	/* Check for the weirdest martians, which cannot be detected
1651	   by fib_lookup.
1652	 */
1653
1654	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1655		goto martian_source;
1656
1657	res.fi = NULL;
1658	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1659		goto brd_input;
1660
1661	/* Accept zero addresses only to limited broadcast;
1662	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1663	 */
1664	if (ipv4_is_zeronet(saddr))
1665		goto martian_source;
1666
1667	if (ipv4_is_zeronet(daddr))
1668		goto martian_destination;
1669
1670	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1671	 * and calls it only once if daddr and/or saddr are loopback addresses
1672	 */
1673	if (ipv4_is_loopback(daddr)) {
1674		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1675			goto martian_destination;
1676	} else if (ipv4_is_loopback(saddr)) {
1677		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1678			goto martian_source;
1679	}
1680
1681	/*
1682	 *	Now we are ready to route packet.
1683	 */
1684	fl4.flowi4_oif = 0;
1685	fl4.flowi4_iif = dev->ifindex;
1686	fl4.flowi4_mark = skb->mark;
1687	fl4.flowi4_tos = tos;
1688	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1689	fl4.daddr = daddr;
1690	fl4.saddr = saddr;
1691	err = fib_lookup(net, &fl4, &res);
1692	if (err != 0) {
1693		if (!IN_DEV_FORWARD(in_dev))
1694			err = -EHOSTUNREACH;
1695		goto no_route;
1696	}
1697
1698	if (res.type == RTN_BROADCAST)
1699		goto brd_input;
1700
1701	if (res.type == RTN_LOCAL) {
1702		err = fib_validate_source(skb, saddr, daddr, tos,
1703					  0, dev, in_dev, &itag);
1704		if (err < 0)
1705			goto martian_source_keep_err;
1706		goto local_input;
1707	}
1708
1709	if (!IN_DEV_FORWARD(in_dev)) {
1710		err = -EHOSTUNREACH;
1711		goto no_route;
1712	}
1713	if (res.type != RTN_UNICAST)
1714		goto martian_destination;
1715
1716	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1717out:	return err;
1718
1719brd_input:
1720	if (skb->protocol != htons(ETH_P_IP))
1721		goto e_inval;
1722
1723	if (!ipv4_is_zeronet(saddr)) {
1724		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1725					  in_dev, &itag);
1726		if (err < 0)
1727			goto martian_source_keep_err;
1728	}
1729	flags |= RTCF_BROADCAST;
1730	res.type = RTN_BROADCAST;
1731	RT_CACHE_STAT_INC(in_brd);
1732
1733local_input:
1734	do_cache = false;
1735	if (res.fi) {
1736		if (!itag) {
1737			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1738			if (rt_cache_valid(rth)) {
1739				skb_dst_set_noref(skb, &rth->dst);
1740				err = 0;
1741				goto out;
1742			}
1743			do_cache = true;
1744		}
1745	}
1746
1747	rth = rt_dst_alloc(net->loopback_dev,
1748			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1749	if (!rth)
1750		goto e_nobufs;
1751
1752	rth->dst.input= ip_local_deliver;
1753	rth->dst.output= ip_rt_bug;
1754#ifdef CONFIG_IP_ROUTE_CLASSID
1755	rth->dst.tclassid = itag;
1756#endif
1757
1758	rth->rt_genid = rt_genid_ipv4(net);
1759	rth->rt_flags 	= flags|RTCF_LOCAL;
1760	rth->rt_type	= res.type;
1761	rth->rt_is_input = 1;
1762	rth->rt_iif	= 0;
1763	rth->rt_pmtu	= 0;
1764	rth->rt_gateway	= 0;
1765	rth->rt_uses_gateway = 0;
1766	INIT_LIST_HEAD(&rth->rt_uncached);
1767	RT_CACHE_STAT_INC(in_slow_tot);
1768	if (res.type == RTN_UNREACHABLE) {
1769		rth->dst.input= ip_error;
1770		rth->dst.error= -err;
1771		rth->rt_flags 	&= ~RTCF_LOCAL;
1772	}
1773	if (do_cache) {
1774		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1775			rth->dst.flags |= DST_NOCACHE;
1776			rt_add_uncached_list(rth);
1777		}
1778	}
1779	skb_dst_set(skb, &rth->dst);
1780	err = 0;
1781	goto out;
1782
1783no_route:
1784	RT_CACHE_STAT_INC(in_no_route);
1785	res.type = RTN_UNREACHABLE;
1786	if (err == -ESRCH)
1787		err = -ENETUNREACH;
1788	goto local_input;
1789
1790	/*
1791	 *	Do not cache martian addresses: they should be logged (RFC1812)
1792	 */
1793martian_destination:
1794	RT_CACHE_STAT_INC(in_martian_dst);
1795#ifdef CONFIG_IP_ROUTE_VERBOSE
1796	if (IN_DEV_LOG_MARTIANS(in_dev))
1797		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1798				     &daddr, &saddr, dev->name);
1799#endif
1800
1801e_inval:
1802	err = -EINVAL;
1803	goto out;
1804
1805e_nobufs:
1806	err = -ENOBUFS;
1807	goto out;
1808
1809martian_source:
1810	err = -EINVAL;
1811martian_source_keep_err:
1812	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1813	goto out;
1814}
1815
1816int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1817			 u8 tos, struct net_device *dev)
1818{
1819	int res;
1820
1821	rcu_read_lock();
1822
1823	/* Multicast recognition logic was moved from the route cache to here.
1824	   The problem was that too many Ethernet cards have broken/missing
1825	   hardware multicast filters :-( As a result, a host on a multicast
1826	   network acquires a lot of useless route cache entries, sort of
1827	   SDR messages from all over the world. Now we try to get rid of them.
1828	   Really, provided the software IP multicast filter is organized
1829	   reasonably (at least, hashed), it does not result in a slowdown
1830	   compared with route cache reject entries.
1831	   Note that multicast routers are not affected, because
1832	   a route cache entry is created eventually.
1833	 */
1834	if (ipv4_is_multicast(daddr)) {
1835		struct in_device *in_dev = __in_dev_get_rcu(dev);
1836
1837		if (in_dev) {
1838			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1839						  ip_hdr(skb)->protocol);
1840			if (our
1841#ifdef CONFIG_IP_MROUTE
1842				||
1843			    (!ipv4_is_local_multicast(daddr) &&
1844			     IN_DEV_MFORWARD(in_dev))
1845#endif
1846			   ) {
1847				int res = ip_route_input_mc(skb, daddr, saddr,
1848							    tos, dev, our);
1849				rcu_read_unlock();
1850				return res;
1851			}
1852		}
1853		rcu_read_unlock();
1854		return -EINVAL;
1855	}
1856	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1857	rcu_read_unlock();
1858	return res;
1859}
1860EXPORT_SYMBOL(ip_route_input_noref);
1861
1862/* called with rcu_read_lock() */
1863static struct rtable *__mkroute_output(const struct fib_result *res,
1864				       const struct flowi4 *fl4, int orig_oif,
1865				       struct net_device *dev_out,
1866				       unsigned int flags)
1867{
1868	struct fib_info *fi = res->fi;
1869	struct fib_nh_exception *fnhe;
1870	struct in_device *in_dev;
1871	u16 type = res->type;
1872	struct rtable *rth;
1873	bool do_cache;
1874
1875	in_dev = __in_dev_get_rcu(dev_out);
1876	if (!in_dev)
1877		return ERR_PTR(-EINVAL);
1878
1879	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1880		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1881			return ERR_PTR(-EINVAL);
1882
1883	if (ipv4_is_lbcast(fl4->daddr))
1884		type = RTN_BROADCAST;
1885	else if (ipv4_is_multicast(fl4->daddr))
1886		type = RTN_MULTICAST;
1887	else if (ipv4_is_zeronet(fl4->daddr))
1888		return ERR_PTR(-EINVAL);
1889
1890	if (dev_out->flags & IFF_LOOPBACK)
1891		flags |= RTCF_LOCAL;
1892
1893	do_cache = true;
1894	if (type == RTN_BROADCAST) {
1895		flags |= RTCF_BROADCAST | RTCF_LOCAL;
1896		fi = NULL;
1897	} else if (type == RTN_MULTICAST) {
1898		flags |= RTCF_MULTICAST | RTCF_LOCAL;
1899		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1900				     fl4->flowi4_proto))
1901			flags &= ~RTCF_LOCAL;
1902		else
1903			do_cache = false;
1904		/* If a multicast route does not exist, use
1905		 * the default one, but do not gateway in this case.
1906		 * Yes, it is a hack.
1907		 */
1908		if (fi && res->prefixlen < 4)
1909			fi = NULL;
1910	}
1911
1912	fnhe = NULL;
1913	do_cache &= fi != NULL;
1914	if (do_cache) {
1915		struct rtable __rcu **prth;
1916		struct fib_nh *nh = &FIB_RES_NH(*res);
1917
1918		fnhe = find_exception(nh, fl4->daddr);
1919		if (fnhe)
1920			prth = &fnhe->fnhe_rth_output;
1921		else {
1922			if (unlikely(fl4->flowi4_flags &
1923				     FLOWI_FLAG_KNOWN_NH &&
1924				     !(nh->nh_gw &&
1925				       nh->nh_scope == RT_SCOPE_LINK))) {
1926				do_cache = false;
1927				goto add;
1928			}
1929			prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1930		}
1931		rth = rcu_dereference(*prth);
1932		if (rt_cache_valid(rth)) {
1933			dst_hold(&rth->dst);
1934			return rth;
1935		}
1936	}
1937
1938add:
1939	rth = rt_dst_alloc(dev_out,
1940			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1941			   IN_DEV_CONF_GET(in_dev, NOXFRM),
1942			   do_cache);
1943	if (!rth)
1944		return ERR_PTR(-ENOBUFS);
1945
1946	rth->dst.output = ip_output;
1947
1948	rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1949	rth->rt_flags	= flags;
1950	rth->rt_type	= type;
1951	rth->rt_is_input = 0;
1952	rth->rt_iif	= orig_oif ? : 0;
1953	rth->rt_pmtu	= 0;
1954	rth->rt_gateway = 0;
1955	rth->rt_uses_gateway = 0;
1956	INIT_LIST_HEAD(&rth->rt_uncached);
1957
1958	RT_CACHE_STAT_INC(out_slow_tot);
1959
1960	if (flags & RTCF_LOCAL)
1961		rth->dst.input = ip_local_deliver;
1962	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1963		if (flags & RTCF_LOCAL &&
1964		    !(dev_out->flags & IFF_LOOPBACK)) {
1965			rth->dst.output = ip_mc_output;
1966			RT_CACHE_STAT_INC(out_slow_mc);
1967		}
1968#ifdef CONFIG_IP_MROUTE
1969		if (type == RTN_MULTICAST) {
1970			if (IN_DEV_MFORWARD(in_dev) &&
1971			    !ipv4_is_local_multicast(fl4->daddr)) {
1972				rth->dst.input = ip_mr_input;
1973				rth->dst.output = ip_mc_output;
1974			}
1975		}
1976#endif
1977	}
1978
1979	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1980
1981	return rth;
1982}
1983
1984/*
1985 * Major route resolver routine.
1986 */
1987
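/* Roughly: validate any caller-supplied source address, resolve flowi4_oif
 * to a device (choosing a source address when necessary), handle the
 * multicast/limited-broadcast and empty-destination special cases, consult
 * the FIB (assuming an on-link destination if the lookup fails but an oif
 * was given), select among multipath/default routes, and finally build the
 * dst with __mkroute_output().
 */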
1988struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1989{
1990	struct net_device *dev_out = NULL;
1991	__u8 tos = RT_FL_TOS(fl4);
1992	unsigned int flags = 0;
1993	struct fib_result res;
1994	struct rtable *rth;
1995	int orig_oif;
1996
1997	res.tclassid	= 0;
1998	res.fi		= NULL;
1999	res.table	= NULL;
2000
2001	orig_oif = fl4->flowi4_oif;
2002
2003	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2004	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2005	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2006			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2007
2008	rcu_read_lock();
2009	if (fl4->saddr) {
2010		rth = ERR_PTR(-EINVAL);
2011		if (ipv4_is_multicast(fl4->saddr) ||
2012		    ipv4_is_lbcast(fl4->saddr) ||
2013		    ipv4_is_zeronet(fl4->saddr))
2014			goto out;
2015
2016		/* I removed the check for oif == dev_out->oif here.
2017		   It was wrong for two reasons:
2018		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2019		      is assigned to multiple interfaces.
2020		   2. Moreover, we are allowed to send packets with saddr
2021		      of another iface. --ANK
2022		 */
2023
2024		if (fl4->flowi4_oif == 0 &&
2025		    (ipv4_is_multicast(fl4->daddr) ||
2026		     ipv4_is_lbcast(fl4->daddr))) {
2027			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2028			dev_out = __ip_dev_find(net, fl4->saddr, false);
2029			if (dev_out == NULL)
2030				goto out;
2031
2032			/* Special hack: the user can direct multicasts
2033			   and limited broadcast via the necessary interface
2034			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2035			   This hack is not just for fun, it allows
2036			   vic, vat and friends to work.
2037			   They bind the socket to loopback, set ttl to zero
2038			   and expect that it will work.
2039			   From the viewpoint of the routing cache they are broken,
2040			   because we are not allowed to build a multicast path
2041			   with a loopback source addr (look, the routing cache
2042			   cannot know that ttl is zero, so that the packet
2043			   will not leave this host and the route is valid).
2044			   Luckily, this hack is a good workaround.
2045			 */
2046
2047			fl4->flowi4_oif = dev_out->ifindex;
2048			goto make_route;
2049		}
2050
2051		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2052			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2053			if (!__ip_dev_find(net, fl4->saddr, false))
2054				goto out;
2055		}
2056	}
2057
2058
2059	if (fl4->flowi4_oif) {
2060		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2061		rth = ERR_PTR(-ENODEV);
2062		if (dev_out == NULL)
2063			goto out;
2064
2065		/* RACE: Check return value of inet_select_addr instead. */
2066		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2067			rth = ERR_PTR(-ENETUNREACH);
2068			goto out;
2069		}
2070		if (ipv4_is_local_multicast(fl4->daddr) ||
2071		    ipv4_is_lbcast(fl4->daddr)) {
2072			if (!fl4->saddr)
2073				fl4->saddr = inet_select_addr(dev_out, 0,
2074							      RT_SCOPE_LINK);
2075			goto make_route;
2076		}
2077		if (!fl4->saddr) {
2078			if (ipv4_is_multicast(fl4->daddr))
2079				fl4->saddr = inet_select_addr(dev_out, 0,
2080							      fl4->flowi4_scope);
2081			else if (!fl4->daddr)
2082				fl4->saddr = inet_select_addr(dev_out, 0,
2083							      RT_SCOPE_HOST);
2084		}
2085	}
2086
2087	if (!fl4->daddr) {
2088		fl4->daddr = fl4->saddr;
2089		if (!fl4->daddr)
2090			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2091		dev_out = net->loopback_dev;
2092		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2093		res.type = RTN_LOCAL;
2094		flags |= RTCF_LOCAL;
2095		goto make_route;
2096	}
2097
2098	if (fib_lookup(net, fl4, &res)) {
2099		res.fi = NULL;
2100		res.table = NULL;
2101		if (fl4->flowi4_oif) {
2102			/* Apparently, the routing tables are wrong. Assume
2103			   that the destination is on link.
2104
2105			   WHY? DW.
2106			   Because we are allowed to send to iface
2107			   even if it has NO routes and NO assigned
2108			   addresses. When oif is specified, routing
2109			   tables are looked up with only one purpose:
2110			   to catch if destination is gatewayed, rather than
2111			   direct. Moreover, if MSG_DONTROUTE is set,
2112			   we send packet, ignoring both routing tables
2113			   and ifaddr state. --ANK
2114
2115
2116			   We could make it even if oif is unknown,
2117			   likely IPv6, but we do not.
2118			 */
2119
2120			if (fl4->saddr == 0)
2121				fl4->saddr = inet_select_addr(dev_out, 0,
2122							      RT_SCOPE_LINK);
2123			res.type = RTN_UNICAST;
2124			goto make_route;
2125		}
2126		rth = ERR_PTR(-ENETUNREACH);
2127		goto out;
2128	}
2129
2130	if (res.type == RTN_LOCAL) {
2131		if (!fl4->saddr) {
2132			if (res.fi->fib_prefsrc)
2133				fl4->saddr = res.fi->fib_prefsrc;
2134			else
2135				fl4->saddr = fl4->daddr;
2136		}
2137		dev_out = net->loopback_dev;
2138		fl4->flowi4_oif = dev_out->ifindex;
2139		flags |= RTCF_LOCAL;
2140		goto make_route;
2141	}
2142
2143#ifdef CONFIG_IP_ROUTE_MULTIPATH
2144	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2145		fib_select_multipath(&res);
2146	else
2147#endif
2148	if (!res.prefixlen &&
2149	    res.table->tb_num_default > 1 &&
2150	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2151		fib_select_default(&res);
2152
2153	if (!fl4->saddr)
2154		fl4->saddr = FIB_RES_PREFSRC(net, res);
2155
2156	dev_out = FIB_RES_DEV(res);
2157	fl4->flowi4_oif = dev_out->ifindex;
2158
2159
2160make_route:
2161	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2162
2163out:
2164	rcu_read_unlock();
2165	return rth;
2166}
2167EXPORT_SYMBOL_GPL(__ip_route_output_key);
2168
2169static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2170{
2171	return NULL;
2172}
2173
2174static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2175{
2176	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2177
2178	return mtu ? : dst->dev->mtu;
2179}
2180
2181static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2182					  struct sk_buff *skb, u32 mtu)
2183{
2184}
2185
2186static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2187				       struct sk_buff *skb)
2188{
2189}
2190
2191static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2192					  unsigned long old)
2193{
2194	return NULL;
2195}
2196
2197static struct dst_ops ipv4_dst_blackhole_ops = {
2198	.family			=	AF_INET,
2199	.protocol		=	cpu_to_be16(ETH_P_IP),
2200	.check			=	ipv4_blackhole_dst_check,
2201	.mtu			=	ipv4_blackhole_mtu,
2202	.default_advmss		=	ipv4_default_advmss,
2203	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2204	.redirect		=	ipv4_rt_blackhole_redirect,
2205	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2206	.neigh_lookup		=	ipv4_neigh_lookup,
2207};
2208
2209struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2210{
2211	struct rtable *ort = (struct rtable *) dst_orig;
2212	struct rtable *rt;
2213
2214	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2215	if (rt) {
2216		struct dst_entry *new = &rt->dst;
2217
2218		new->__use = 1;
2219		new->input = dst_discard;
2220		new->output = dst_discard_sk;
2221
2222		new->dev = ort->dst.dev;
2223		if (new->dev)
2224			dev_hold(new->dev);
2225
2226		rt->rt_is_input = ort->rt_is_input;
2227		rt->rt_iif = ort->rt_iif;
2228		rt->rt_pmtu = ort->rt_pmtu;
2229
2230		rt->rt_genid = rt_genid_ipv4(net);
2231		rt->rt_flags = ort->rt_flags;
2232		rt->rt_type = ort->rt_type;
2233		rt->rt_gateway = ort->rt_gateway;
2234		rt->rt_uses_gateway = ort->rt_uses_gateway;
2235
2236		INIT_LIST_HEAD(&rt->rt_uncached);
2237
2238		dst_free(new);
2239	}
2240
2241	dst_release(dst_orig);
2242
2243	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2244}
2245
2246struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2247				    struct sock *sk)
2248{
2249	struct rtable *rt = __ip_route_output_key(net, flp4);
2250
2251	if (IS_ERR(rt))
2252		return rt;
2253
2254	if (flp4->flowi4_proto)
2255		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2256						   flowi4_to_flowi(flp4),
2257						   sk, 0);
2258
2259	return rt;
2260}
2261EXPORT_SYMBOL_GPL(ip_route_output_flow);
2262
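/* Build an RTM_NEWROUTE netlink message describing the route attached to
 * skb: destination, source, oif, gateway, metrics (with RTAX_MTU overridden
 * by an unexpired path MTU), the firewall mark, the input interface (or an
 * ipmr lookup for forwarded multicast) and the cache info.
 */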
2263static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2264			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2265			u32 seq, int event, int nowait, unsigned int flags)
2266{
2267	struct rtable *rt = skb_rtable(skb);
2268	struct rtmsg *r;
2269	struct nlmsghdr *nlh;
2270	unsigned long expires = 0;
2271	u32 error;
2272	u32 metrics[RTAX_MAX];
2273
2274	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2275	if (nlh == NULL)
2276		return -EMSGSIZE;
2277
2278	r = nlmsg_data(nlh);
2279	r->rtm_family	 = AF_INET;
2280	r->rtm_dst_len	= 32;
2281	r->rtm_src_len	= 0;
2282	r->rtm_tos	= fl4->flowi4_tos;
2283	r->rtm_table	= RT_TABLE_MAIN;
2284	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2285		goto nla_put_failure;
2286	r->rtm_type	= rt->rt_type;
2287	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2288	r->rtm_protocol = RTPROT_UNSPEC;
2289	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2290	if (rt->rt_flags & RTCF_NOTIFY)
2291		r->rtm_flags |= RTM_F_NOTIFY;
2292
2293	if (nla_put_be32(skb, RTA_DST, dst))
2294		goto nla_put_failure;
2295	if (src) {
2296		r->rtm_src_len = 32;
2297		if (nla_put_be32(skb, RTA_SRC, src))
2298			goto nla_put_failure;
2299	}
2300	if (rt->dst.dev &&
2301	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2302		goto nla_put_failure;
2303#ifdef CONFIG_IP_ROUTE_CLASSID
2304	if (rt->dst.tclassid &&
2305	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2306		goto nla_put_failure;
2307#endif
2308	if (!rt_is_input_route(rt) &&
2309	    fl4->saddr != src) {
2310		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2311			goto nla_put_failure;
2312	}
2313	if (rt->rt_uses_gateway &&
2314	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2315		goto nla_put_failure;
2316
2317	expires = rt->dst.expires;
2318	if (expires) {
2319		unsigned long now = jiffies;
2320
2321		if (time_before(now, expires))
2322			expires -= now;
2323		else
2324			expires = 0;
2325	}
2326
2327	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2328	if (rt->rt_pmtu && expires)
2329		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2330	if (rtnetlink_put_metrics(skb, metrics) < 0)
2331		goto nla_put_failure;
2332
2333	if (fl4->flowi4_mark &&
2334	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2335		goto nla_put_failure;
2336
2337	error = rt->dst.error;
2338
2339	if (rt_is_input_route(rt)) {
2340#ifdef CONFIG_IP_MROUTE
2341		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2342		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2343			int err = ipmr_get_route(net, skb,
2344						 fl4->saddr, fl4->daddr,
2345						 r, nowait);
2346			if (err <= 0) {
2347				if (!nowait) {
2348					if (err == 0)
2349						return 0;
2350					goto nla_put_failure;
2351				} else {
2352					if (err == -EMSGSIZE)
2353						goto nla_put_failure;
2354					error = err;
2355				}
2356			}
2357		} else
2358#endif
2359			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2360				goto nla_put_failure;
2361	}
2362
2363	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2364		goto nla_put_failure;
2365
2366	return nlmsg_end(skb, nlh);
2367
2368nla_put_failure:
2369	nlmsg_cancel(skb, nlh);
2370	return -EMSGSIZE;
2371}
2372
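/* RTM_GETROUTE handler: build a dummy skb, resolve the requested route with
 * ip_route_input() when RTA_IIF is given or ip_route_output_key() otherwise,
 * and unicast the reply back to the requester via rt_fill_info().
 */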
2373static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2374{
2375	struct net *net = sock_net(in_skb->sk);
2376	struct rtmsg *rtm;
2377	struct nlattr *tb[RTA_MAX+1];
2378	struct rtable *rt = NULL;
2379	struct flowi4 fl4;
2380	__be32 dst = 0;
2381	__be32 src = 0;
2382	u32 iif;
2383	int err;
2384	int mark;
2385	struct sk_buff *skb;
2386
2387	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2388	if (err < 0)
2389		goto errout;
2390
2391	rtm = nlmsg_data(nlh);
2392
2393	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2394	if (skb == NULL) {
2395		err = -ENOBUFS;
2396		goto errout;
2397	}
2398
2399	/* Reserve room for dummy headers; this skb can pass
2400	   through a good chunk of the routing engine.
2401	 */
2402	skb_reset_mac_header(skb);
2403	skb_reset_network_header(skb);
2404
2405	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2406	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2407	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2408
2409	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2410	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2411	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2412	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2413
2414	memset(&fl4, 0, sizeof(fl4));
2415	fl4.daddr = dst;
2416	fl4.saddr = src;
2417	fl4.flowi4_tos = rtm->rtm_tos;
2418	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2419	fl4.flowi4_mark = mark;
2420
2421	if (iif) {
2422		struct net_device *dev;
2423
2424		dev = __dev_get_by_index(net, iif);
2425		if (dev == NULL) {
2426			err = -ENODEV;
2427			goto errout_free;
2428		}
2429
2430		skb->protocol	= htons(ETH_P_IP);
2431		skb->dev	= dev;
2432		skb->mark	= mark;
2433		local_bh_disable();
2434		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2435		local_bh_enable();
2436
2437		rt = skb_rtable(skb);
2438		if (err == 0 && rt->dst.error)
2439			err = -rt->dst.error;
2440	} else {
2441		rt = ip_route_output_key(net, &fl4);
2442
2443		err = 0;
2444		if (IS_ERR(rt))
2445			err = PTR_ERR(rt);
2446	}
2447
2448	if (err)
2449		goto errout_free;
2450
2451	skb_dst_set(skb, &rt->dst);
2452	if (rtm->rtm_flags & RTM_F_NOTIFY)
2453		rt->rt_flags |= RTCF_NOTIFY;
2454
2455	err = rt_fill_info(net, dst, src, &fl4, skb,
2456			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2457			   RTM_NEWROUTE, 0, 0);
2458	if (err <= 0)
2459		goto errout_free;
2460
2461	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2462errout:
2463	return err;
2464
2465errout_free:
2466	kfree_skb(skb);
2467	goto errout;
2468}
2469
2470void ip_rt_multicast_event(struct in_device *in_dev)
2471{
2472	rt_cache_flush(dev_net(in_dev->dev));
2473}
2474
2475#ifdef CONFIG_SYSCTL
2476static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
2477static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2478static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2479static int ip_rt_gc_elasticity __read_mostly	= 8;
2480
2481static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2482					void __user *buffer,
2483					size_t *lenp, loff_t *ppos)
2484{
2485	struct net *net = (struct net *)__ctl->extra1;
2486
2487	if (write) {
2488		rt_cache_flush(net);
2489		fnhe_genid_bump(net);
2490		return 0;
2491	}
2492
2493	return -EINVAL;
2494}
2495
2496static struct ctl_table ipv4_route_table[] = {
2497	{
2498		.procname	= "gc_thresh",
2499		.data		= &ipv4_dst_ops.gc_thresh,
2500		.maxlen		= sizeof(int),
2501		.mode		= 0644,
2502		.proc_handler	= proc_dointvec,
2503	},
2504	{
2505		.procname	= "max_size",
2506		.data		= &ip_rt_max_size,
2507		.maxlen		= sizeof(int),
2508		.mode		= 0644,
2509		.proc_handler	= proc_dointvec,
2510	},
2511	{
2512		/*  Deprecated. Use gc_min_interval_ms */
2513
2514		.procname	= "gc_min_interval",
2515		.data		= &ip_rt_gc_min_interval,
2516		.maxlen		= sizeof(int),
2517		.mode		= 0644,
2518		.proc_handler	= proc_dointvec_jiffies,
2519	},
2520	{
2521		.procname	= "gc_min_interval_ms",
2522		.data		= &ip_rt_gc_min_interval,
2523		.maxlen		= sizeof(int),
2524		.mode		= 0644,
2525		.proc_handler	= proc_dointvec_ms_jiffies,
2526	},
2527	{
2528		.procname	= "gc_timeout",
2529		.data		= &ip_rt_gc_timeout,
2530		.maxlen		= sizeof(int),
2531		.mode		= 0644,
2532		.proc_handler	= proc_dointvec_jiffies,
2533	},
2534	{
2535		.procname	= "gc_interval",
2536		.data		= &ip_rt_gc_interval,
2537		.maxlen		= sizeof(int),
2538		.mode		= 0644,
2539		.proc_handler	= proc_dointvec_jiffies,
2540	},
2541	{
2542		.procname	= "redirect_load",
2543		.data		= &ip_rt_redirect_load,
2544		.maxlen		= sizeof(int),
2545		.mode		= 0644,
2546		.proc_handler	= proc_dointvec,
2547	},
2548	{
2549		.procname	= "redirect_number",
2550		.data		= &ip_rt_redirect_number,
2551		.maxlen		= sizeof(int),
2552		.mode		= 0644,
2553		.proc_handler	= proc_dointvec,
2554	},
2555	{
2556		.procname	= "redirect_silence",
2557		.data		= &ip_rt_redirect_silence,
2558		.maxlen		= sizeof(int),
2559		.mode		= 0644,
2560		.proc_handler	= proc_dointvec,
2561	},
2562	{
2563		.procname	= "error_cost",
2564		.data		= &ip_rt_error_cost,
2565		.maxlen		= sizeof(int),
2566		.mode		= 0644,
2567		.proc_handler	= proc_dointvec,
2568	},
2569	{
2570		.procname	= "error_burst",
2571		.data		= &ip_rt_error_burst,
2572		.maxlen		= sizeof(int),
2573		.mode		= 0644,
2574		.proc_handler	= proc_dointvec,
2575	},
2576	{
2577		.procname	= "gc_elasticity",
2578		.data		= &ip_rt_gc_elasticity,
2579		.maxlen		= sizeof(int),
2580		.mode		= 0644,
2581		.proc_handler	= proc_dointvec,
2582	},
2583	{
2584		.procname	= "mtu_expires",
2585		.data		= &ip_rt_mtu_expires,
2586		.maxlen		= sizeof(int),
2587		.mode		= 0644,
2588		.proc_handler	= proc_dointvec_jiffies,
2589	},
2590	{
2591		.procname	= "min_pmtu",
2592		.data		= &ip_rt_min_pmtu,
2593		.maxlen		= sizeof(int),
2594		.mode		= 0644,
2595		.proc_handler	= proc_dointvec,
2596	},
2597	{
2598		.procname	= "min_adv_mss",
2599		.data		= &ip_rt_min_advmss,
2600		.maxlen		= sizeof(int),
2601		.mode		= 0644,
2602		.proc_handler	= proc_dointvec,
2603	},
2604	{ }
2605};
2606
2607static struct ctl_table ipv4_route_flush_table[] = {
2608	{
2609		.procname	= "flush",
2610		.maxlen		= sizeof(int),
2611		.mode		= 0200,
2612		.proc_handler	= ipv4_sysctl_rtcache_flush,
2613	},
2614	{ },
2615};
2616
2617static __net_init int sysctl_route_net_init(struct net *net)
2618{
2619	struct ctl_table *tbl;
2620
2621	tbl = ipv4_route_flush_table;
2622	if (!net_eq(net, &init_net)) {
2623		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2624		if (tbl == NULL)
2625			goto err_dup;
2626
2627		/* Don't export sysctls to unprivileged users */
2628		if (net->user_ns != &init_user_ns)
2629			tbl[0].procname = NULL;
2630	}
2631	tbl[0].extra1 = net;
2632
2633	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2634	if (net->ipv4.route_hdr == NULL)
2635		goto err_reg;
2636	return 0;
2637
2638err_reg:
2639	if (tbl != ipv4_route_flush_table)
2640		kfree(tbl);
2641err_dup:
2642	return -ENOMEM;
2643}
2644
2645static __net_exit void sysctl_route_net_exit(struct net *net)
2646{
2647	struct ctl_table *tbl;
2648
2649	tbl = net->ipv4.route_hdr->ctl_table_arg;
2650	unregister_net_sysctl_table(net->ipv4.route_hdr);
2651	BUG_ON(tbl == ipv4_route_flush_table);
2652	kfree(tbl);
2653}
2654
2655static __net_initdata struct pernet_operations sysctl_route_ops = {
2656	.init = sysctl_route_net_init,
2657	.exit = sysctl_route_net_exit,
2658};
2659#endif
2660
2661static __net_init int rt_genid_init(struct net *net)
2662{
2663	atomic_set(&net->ipv4.rt_genid, 0);
2664	atomic_set(&net->fnhe_genid, 0);
2665	get_random_bytes(&net->ipv4.dev_addr_genid,
2666			 sizeof(net->ipv4.dev_addr_genid));
2667	return 0;
2668}
2669
2670static __net_initdata struct pernet_operations rt_genid_ops = {
2671	.init = rt_genid_init,
2672};
2673
2674static int __net_init ipv4_inetpeer_init(struct net *net)
2675{
2676	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2677
2678	if (!bp)
2679		return -ENOMEM;
2680	inet_peer_base_init(bp);
2681	net->ipv4.peers = bp;
2682	return 0;
2683}
2684
2685static void __net_exit ipv4_inetpeer_exit(struct net *net)
2686{
2687	struct inet_peer_base *bp = net->ipv4.peers;
2688
2689	net->ipv4.peers = NULL;
2690	inetpeer_invalidate_tree(bp);
2691	kfree(bp);
2692}
2693
2694static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2695	.init	=	ipv4_inetpeer_init,
2696	.exit	=	ipv4_inetpeer_exit,
2697};
2698
2699#ifdef CONFIG_IP_ROUTE_CLASSID
2700struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2701#endif /* CONFIG_IP_ROUTE_CLASSID */
2702
2703int __init ip_rt_init(void)
2704{
2705	int rc = 0;
2706
2707#ifdef CONFIG_IP_ROUTE_CLASSID
2708	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2709	if (!ip_rt_acct)
2710		panic("IP: failed to allocate ip_rt_acct\n");
2711#endif
2712
2713	ipv4_dst_ops.kmem_cachep =
2714		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2715				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2716
2717	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2718
2719	if (dst_entries_init(&ipv4_dst_ops) < 0)
2720		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2721
2722	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2723		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2724
2725	ipv4_dst_ops.gc_thresh = ~0;
2726	ip_rt_max_size = INT_MAX;
2727
2728	devinet_init();
2729	ip_fib_init();
2730
2731	if (ip_rt_proc_init())
2732		pr_err("Unable to create route proc files\n");
2733#ifdef CONFIG_XFRM
2734	xfrm_init();
2735	xfrm4_init();
2736#endif
2737	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2738
2739#ifdef CONFIG_SYSCTL
2740	register_pernet_subsys(&sysctl_route_ops);
2741#endif
2742	register_pernet_subsys(&rt_genid_ops);
2743	register_pernet_subsys(&ipv4_inetpeer_ops);
2744	return rc;
2745}
2746
2747#ifdef CONFIG_SYSCTL
2748/*
2749 * We really need to sanitize the damn ipv4 init order, then all
2750 * this nonsense will go away.
2751 */
2752void __init ip_static_sysctl_init(void)
2753{
2754	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2755}
2756#endif