route.c - net/ipv4/route.c - Linux diff v4.17 - Bootlin Elixir Cross Referencer

   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		ROUTE - implementation of the IP router.
   7 *
   8 * Authors:	Ross Biro
   9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *		Alan Cox	:	Verify area fixes.
  16 *		Alan Cox	:	cli() protects routing changes
  17 *		Rui Oliveira	:	ICMP routing table updates
  18 *		(rco@di.uminho.pt)	Routing table insertion and update
  19 *		Linus Torvalds	:	Rewrote bits to be sensible
  20 *		Alan Cox	:	Added BSD route gw semantics
  21 *		Alan Cox	:	Super /proc >4K
  22 *		Alan Cox	:	MTU in route table
  23 *		Alan Cox	: 	MSS actually. Also added the window
  24 *					clamper.
  25 *		Sam Lantinga	:	Fixed route matching in rt_del()
  26 *		Alan Cox	:	Routing cache support.
  27 *		Alan Cox	:	Removed compatibility cruft.
  28 *		Alan Cox	:	RTF_REJECT support.
  29 *		Alan Cox	:	TCP irtt support.
  30 *		Jonathan Naylor	:	Added Metric support.
  31 *	Miquel van Smoorenburg	:	BSD API fixes.
  32 *	Miquel van Smoorenburg	:	Metrics.
  33 *		Alan Cox	:	Use __u32 properly
  34 *		Alan Cox	:	Aligned routing errors more closely with BSD
  35 *					our system is still very different.
  36 *		Alan Cox	:	Faster /proc handling
  37 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  38 *					routing caches and better behaviour.
  39 *
  40 *		Olaf Erb	:	irtt wasn't being copied right.
  41 *		Bjorn Ekwall	:	Kerneld route support.
  42 *		Alan Cox	:	Multicast fixed (I hope)
  43 * 		Pavel Krauz	:	Limited broadcast fixed
  44 *		Mike McLagan	:	Routing by source
  45 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  46 *					route.c and rewritten from scratch.
  47 *		Andi Kleen	:	Load-limit warning messages.
  48 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  49 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  50 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  51 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  52 *		Marc Boucher	:	routing by fwmark
  53 *	Robert Olsson		:	Added rt_cache statistics
  54 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  55 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  56 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  57 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  58 *
  59 *		This program is free software; you can redistribute it and/or
  60 *		modify it under the terms of the GNU General Public License
  61 *		as published by the Free Software Foundation; either version
  62 *		2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <linux/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
 
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
 
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
 
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <linux/jhash.h>
  93#include <net/dst.h>
  94#include <net/dst_metadata.h>
  95#include <net/net_namespace.h>
  96#include <net/protocol.h>
  97#include <net/ip.h>
  98#include <net/route.h>
  99#include <net/inetpeer.h>
 100#include <net/sock.h>
 101#include <net/ip_fib.h>
 102#include <net/arp.h>
 103#include <net/tcp.h>
 104#include <net/icmp.h>
 105#include <net/xfrm.h>
 106#include <net/lwtunnel.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 
 111#endif
 112#include <net/secure_seq.h>
 113#include <net/ip_tunnels.h>
 114#include <net/l3mdev.h>
 115
 116#include "fib_lookup.h"
 117
 118#define RT_FL_TOS(oldflp4) \
 119	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 
 
 121#define RT_GC_TIMEOUT (300*HZ)
 122
 123static int ip_rt_max_size;
 
 
 
 124static int ip_rt_redirect_number __read_mostly	= 9;
 125static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 126static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 127static int ip_rt_error_cost __read_mostly	= HZ;
 128static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 
 129static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 130static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 131static int ip_rt_min_advmss __read_mostly	= 256;
 
 132
 133static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 
 134
 135/*
 136 *	Interface to generic destination cache.
 137 */
 138
 139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 141static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 
 142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143static void		 ipv4_link_failure(struct sk_buff *skb);
 144static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145					   struct sk_buff *skb, u32 mtu);
 146static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147					struct sk_buff *skb);
 148static void		ipv4_dst_destroy(struct dst_entry *dst);
 
 
 149
 150static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151{
 152	WARN_ON(1);
 153	return NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 154}
 155
 156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157					   struct sk_buff *skb,
 158					   const void *daddr);
 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161static struct dst_ops ipv4_dst_ops = {
 162	.family =		AF_INET,
 
 
 163	.check =		ipv4_dst_check,
 164	.default_advmss =	ipv4_default_advmss,
 165	.mtu =			ipv4_mtu,
 166	.cow_metrics =		ipv4_cow_metrics,
 167	.destroy =		ipv4_dst_destroy,
 
 168	.negative_advice =	ipv4_negative_advice,
 169	.link_failure =		ipv4_link_failure,
 170	.update_pmtu =		ip_rt_update_pmtu,
 171	.redirect =		ip_do_redirect,
 172	.local_out =		__ip_local_out,
 173	.neigh_lookup =		ipv4_neigh_lookup,
 174	.confirm_neigh =	ipv4_confirm_neigh,
 175};
 176
 177#define ECN_OR_COST(class)	TC_PRIO_##class
 178
 179const __u8 ip_tos2prio[16] = {
 180	TC_PRIO_BESTEFFORT,
 181	ECN_OR_COST(BESTEFFORT),
 182	TC_PRIO_BESTEFFORT,
 183	ECN_OR_COST(BESTEFFORT),
 184	TC_PRIO_BULK,
 185	ECN_OR_COST(BULK),
 186	TC_PRIO_BULK,
 187	ECN_OR_COST(BULK),
 188	TC_PRIO_INTERACTIVE,
 189	ECN_OR_COST(INTERACTIVE),
 190	TC_PRIO_INTERACTIVE,
 191	ECN_OR_COST(INTERACTIVE),
 192	TC_PRIO_INTERACTIVE_BULK,
 193	ECN_OR_COST(INTERACTIVE_BULK),
 194	TC_PRIO_INTERACTIVE_BULK,
 195	ECN_OR_COST(INTERACTIVE_BULK)
 196};
 197EXPORT_SYMBOL(ip_tos2prio);
 198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 
 
 
 
 
 
 
 
 
 
 
 
 
 201
 202#ifdef CONFIG_PROC_FS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204{
 
 205	if (*pos)
 206		return NULL;
 
 207	return SEQ_START_TOKEN;
 208}
 209
 210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211{
 
 
 
 
 
 
 212	++*pos;
 213	return NULL;
 214}
 215
 216static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217{
 
 
 218}
 219
 220static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221{
 222	if (v == SEQ_START_TOKEN)
 223		seq_printf(seq, "%-127s\n",
 224			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226			   "HHUptod\tSpecDst");
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 227	return 0;
 228}
 229
 230static const struct seq_operations rt_cache_seq_ops = {
 231	.start  = rt_cache_seq_start,
 232	.next   = rt_cache_seq_next,
 233	.stop   = rt_cache_seq_stop,
 234	.show   = rt_cache_seq_show,
 235};
 236
 237static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238{
 239	return seq_open(file, &rt_cache_seq_ops);
 
 240}
 241
 242static const struct file_operations rt_cache_seq_fops = {
 
 243	.open	 = rt_cache_seq_open,
 244	.read	 = seq_read,
 245	.llseek	 = seq_lseek,
 246	.release = seq_release,
 247};
 248
 249
 250static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251{
 252	int cpu;
 253
 254	if (*pos == 0)
 255		return SEQ_START_TOKEN;
 256
 257	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258		if (!cpu_possible(cpu))
 259			continue;
 260		*pos = cpu+1;
 261		return &per_cpu(rt_cache_stat, cpu);
 262	}
 263	return NULL;
 264}
 265
 266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267{
 268	int cpu;
 269
 270	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271		if (!cpu_possible(cpu))
 272			continue;
 273		*pos = cpu+1;
 274		return &per_cpu(rt_cache_stat, cpu);
 275	}
 276	return NULL;
 277
 278}
 279
 280static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281{
 282
 283}
 284
 285static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286{
 287	struct rt_cache_stat *st = v;
 288
 289	if (v == SEQ_START_TOKEN) {
 290		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291		return 0;
 292	}
 293
 294	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296		   dst_entries_get_slow(&ipv4_dst_ops),
 297		   0, /* st->in_hit */
 298		   st->in_slow_tot,
 299		   st->in_slow_mc,
 300		   st->in_no_route,
 301		   st->in_brd,
 302		   st->in_martian_dst,
 303		   st->in_martian_src,
 304
 305		   0, /* st->out_hit */
 306		   st->out_slow_tot,
 307		   st->out_slow_mc,
 308
 309		   0, /* st->gc_total */
 310		   0, /* st->gc_ignored */
 311		   0, /* st->gc_goal_miss */
 312		   0, /* st->gc_dst_overflow */
 313		   0, /* st->in_hlist_search */
 314		   0  /* st->out_hlist_search */
 315		);
 316	return 0;
 317}
 318
 319static const struct seq_operations rt_cpu_seq_ops = {
 320	.start  = rt_cpu_seq_start,
 321	.next   = rt_cpu_seq_next,
 322	.stop   = rt_cpu_seq_stop,
 323	.show   = rt_cpu_seq_show,
 324};
 325
 326
 327static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328{
 329	return seq_open(file, &rt_cpu_seq_ops);
 330}
 331
 332static const struct file_operations rt_cpu_seq_fops = {
 
 333	.open	 = rt_cpu_seq_open,
 334	.read	 = seq_read,
 335	.llseek	 = seq_lseek,
 336	.release = seq_release,
 337};
 338
 339#ifdef CONFIG_IP_ROUTE_CLASSID
 340static int rt_acct_proc_show(struct seq_file *m, void *v)
 341{
 342	struct ip_rt_acct *dst, *src;
 343	unsigned int i, j;
 344
 345	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346	if (!dst)
 347		return -ENOMEM;
 348
 349	for_each_possible_cpu(i) {
 350		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351		for (j = 0; j < 256; j++) {
 352			dst[j].o_bytes   += src[j].o_bytes;
 353			dst[j].o_packets += src[j].o_packets;
 354			dst[j].i_bytes   += src[j].i_bytes;
 355			dst[j].i_packets += src[j].i_packets;
 356		}
 357	}
 358
 359	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360	kfree(dst);
 361	return 0;
 362}
 363
 364static int rt_acct_proc_open(struct inode *inode, struct file *file)
 365{
 366	return single_open(file, rt_acct_proc_show, NULL);
 367}
 368
 369static const struct file_operations rt_acct_proc_fops = {
 
 370	.open		= rt_acct_proc_open,
 371	.read		= seq_read,
 372	.llseek		= seq_lseek,
 373	.release	= single_release,
 374};
 375#endif
 376
 377static int __net_init ip_rt_do_proc_init(struct net *net)
 378{
 379	struct proc_dir_entry *pde;
 380
 381	pde = proc_create("rt_cache", 0444, net->proc_net,
 382			  &rt_cache_seq_fops);
 383	if (!pde)
 384		goto err1;
 385
 386	pde = proc_create("rt_cache", 0444,
 387			  net->proc_net_stat, &rt_cpu_seq_fops);
 388	if (!pde)
 389		goto err2;
 390
 391#ifdef CONFIG_IP_ROUTE_CLASSID
 392	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 393	if (!pde)
 394		goto err3;
 395#endif
 396	return 0;
 397
 398#ifdef CONFIG_IP_ROUTE_CLASSID
 399err3:
 400	remove_proc_entry("rt_cache", net->proc_net_stat);
 401#endif
 402err2:
 403	remove_proc_entry("rt_cache", net->proc_net);
 404err1:
 405	return -ENOMEM;
 406}
 407
 408static void __net_exit ip_rt_do_proc_exit(struct net *net)
 409{
 410	remove_proc_entry("rt_cache", net->proc_net_stat);
 411	remove_proc_entry("rt_cache", net->proc_net);
 412#ifdef CONFIG_IP_ROUTE_CLASSID
 413	remove_proc_entry("rt_acct", net->proc_net);
 414#endif
 415}
 416
 417static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 418	.init = ip_rt_do_proc_init,
 419	.exit = ip_rt_do_proc_exit,
 420};
 421
 422static int __init ip_rt_proc_init(void)
 423{
 424	return register_pernet_subsys(&ip_rt_proc_ops);
 425}
 426
 427#else
 428static inline int ip_rt_proc_init(void)
 429{
 430	return 0;
 431}
 432#endif /* CONFIG_PROC_FS */
 433
 434static inline bool rt_is_expired(const struct rtable *rth)
 435{
 436	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 437}
 438
 439void rt_cache_flush(struct net *net)
 440{
 441	rt_genid_bump_ipv4(net);
 
 442}
 443
 444static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 445					   struct sk_buff *skb,
 446					   const void *daddr)
 447{
 448	struct net_device *dev = dst->dev;
 449	const __be32 *pkey = daddr;
 450	const struct rtable *rt;
 451	struct neighbour *n;
 
 452
 453	rt = (const struct rtable *) dst;
 454	if (rt->rt_gateway)
 455		pkey = (const __be32 *) &rt->rt_gateway;
 456	else if (skb)
 457		pkey = &ip_hdr(skb)->daddr;
 458
 459	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 460	if (n)
 461		return n;
 462	return neigh_create(&arp_tbl, pkey, dev);
 
 
 
 
 
 
 
 
 
 
 463}
 464
 465static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 
 
 
 
 
 466{
 467	struct net_device *dev = dst->dev;
 468	const __be32 *pkey = daddr;
 469	const struct rtable *rt;
 470
 471	rt = (const struct rtable *)dst;
 472	if (rt->rt_gateway)
 473		pkey = (const __be32 *)&rt->rt_gateway;
 474	else if (!daddr ||
 475		 (rt->rt_flags &
 476		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 477		return;
 
 
 
 
 
 
 
 
 
 
 478
 479	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 
 
 
 
 
 480}
 481
 482#define IP_IDENTS_SZ 2048u
 
 
 
 
 
 
 
 
 483
 484static atomic_t *ip_idents __read_mostly;
 485static u32 *ip_tstamps __read_mostly;
 
 
 486
 487/* In order to protect privacy, we add a perturbation to identifiers
 488 * if one generator is seldom used. This makes hard for an attacker
 489 * to infer how many packets were sent between two points in time.
 
 
 
 
 
 
 490 */
 491u32 ip_idents_reserve(u32 hash, int segs)
 492{
 493	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 494	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 495	u32 old = READ_ONCE(*p_tstamp);
 496	u32 now = (u32)jiffies;
 497	u32 new, delta = 0;
 498
 499	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 500		delta = prandom_u32_max(now - old);
 
 
 
 
 
 
 
 501
 502	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
 503	do {
 504		old = (u32)atomic_read(p_id);
 505		new = old + delta + segs;
 506	} while (atomic_cmpxchg(p_id, old, new) != old);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 507
 508	return new - segs;
 
 
 
 
 
 
 509}
 510EXPORT_SYMBOL(ip_idents_reserve);
 511
 512void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 513{
 514	static u32 ip_idents_hashrnd __read_mostly;
 515	u32 hash, id;
 
 
 
 516
 517	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 
 518
 519	hash = jhash_3words((__force u32)iph->daddr,
 520			    (__force u32)iph->saddr,
 521			    iph->protocol ^ net_hash_mix(net),
 522			    ip_idents_hashrnd);
 523	id = ip_idents_reserve(hash, segs);
 524	iph->id = htons(id);
 
 
 
 
 
 
 
 
 
 
 
 525}
 526EXPORT_SYMBOL(__ip_select_ident);
 527
 528static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 529			     const struct sock *sk,
 530			     const struct iphdr *iph,
 531			     int oif, u8 tos,
 532			     u8 prot, u32 mark, int flow_flags)
 533{
 534	if (sk) {
 535		const struct inet_sock *inet = inet_sk(sk);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 536
 537		oif = sk->sk_bound_dev_if;
 538		mark = sk->sk_mark;
 539		tos = RT_CONN_FLAGS(sk);
 540		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 541	}
 542	flowi4_init_output(fl4, oif, mark, tos,
 543			   RT_SCOPE_UNIVERSE, prot,
 544			   flow_flags,
 545			   iph->daddr, iph->saddr, 0, 0,
 546			   sock_net_uid(net, sk));
 547}
 548
 549static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550			       const struct sock *sk)
 
 
 
 551{
 552	const struct net *net = dev_net(skb->dev);
 553	const struct iphdr *iph = ip_hdr(skb);
 554	int oif = skb->dev->ifindex;
 555	u8 tos = RT_TOS(iph->tos);
 556	u8 prot = iph->protocol;
 557	u32 mark = skb->mark;
 558
 559	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 
 
 
 
 
 
 
 
 
 
 
 
 560}
 561
 562static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 
 
 
 
 563{
 564	const struct inet_sock *inet = inet_sk(sk);
 565	const struct ip_options_rcu *inet_opt;
 566	__be32 daddr = inet->inet_daddr;
 
 567
 568	rcu_read_lock();
 569	inet_opt = rcu_dereference(inet->inet_opt);
 570	if (inet_opt && inet_opt->opt.srr)
 571		daddr = inet_opt->opt.faddr;
 572	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 573			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 574			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 575			   inet_sk_flowi_flags(sk),
 576			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 577	rcu_read_unlock();
 578}
 579
 580static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 581				 const struct sk_buff *skb)
 582{
 583	if (skb)
 584		build_skb_flow_key(fl4, skb, sk);
 585	else
 586		build_sk_flow_key(fl4, sk);
 587}
 588
 589static DEFINE_SPINLOCK(fnhe_lock);
 
 590
 591static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 
 
 
 
 
 
 
 
 
 
 592{
 593	struct rtable *rt;
 
 
 
 
 
 
 
 
 594
 595	rt = rcu_dereference(fnhe->fnhe_rth_input);
 596	if (rt) {
 597		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 598		dst_dev_put(&rt->dst);
 599		dst_release(&rt->dst);
 
 
 
 
 
 
 600	}
 601	rt = rcu_dereference(fnhe->fnhe_rth_output);
 602	if (rt) {
 603		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 604		dst_dev_put(&rt->dst);
 605		dst_release(&rt->dst);
 
 
 
 
 
 
 
 
 
 
 
 
 
 606	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 607}
 608
 609static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 
 
 
 610{
 611	struct fib_nh_exception *fnhe, *oldest;
 
 612
 613	oldest = rcu_dereference(hash->chain);
 614	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 615	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 616		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 617			oldest = fnhe;
 618	}
 619	fnhe_flush_routes(oldest);
 620	return oldest;
 621}
 622
 623static inline u32 fnhe_hashfun(__be32 daddr)
 624{
 625	static u32 fnhe_hashrnd __read_mostly;
 626	u32 hval;
 
 
 
 627
 628	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 629	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 630	return hash_32(hval, FNHE_HASH_SHIFT);
 
 
 
 
 
 
 
 
 631}
 632
 633static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 634{
 635	rt->rt_pmtu = fnhe->fnhe_pmtu;
 636	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 637	rt->dst.expires = fnhe->fnhe_expires;
 
 638
 639	if (fnhe->fnhe_gw) {
 640		rt->rt_flags |= RTCF_REDIRECTED;
 641		rt->rt_gateway = fnhe->fnhe_gw;
 642		rt->rt_uses_gateway = 1;
 643	}
 644}
 645
 646static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 647				  u32 pmtu, bool lock, unsigned long expires)
 648{
 649	struct fnhe_hash_bucket *hash;
 650	struct fib_nh_exception *fnhe;
 651	struct rtable *rt;
 652	u32 genid, hval;
 653	unsigned int i;
 654	int depth;
 
 
 
 
 
 
 
 655
 656	genid = fnhe_genid(dev_net(nh->nh_dev));
 657	hval = fnhe_hashfun(daddr);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 658
 659	spin_lock_bh(&fnhe_lock);
 
 
 
 
 
 
 
 
 660
 661	hash = rcu_dereference(nh->nh_exceptions);
 662	if (!hash) {
 663		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 664		if (!hash)
 665			goto out_unlock;
 666		rcu_assign_pointer(nh->nh_exceptions, hash);
 667	}
 668
 669	hash += hval;
 670
 671	depth = 0;
 672	for (fnhe = rcu_dereference(hash->chain); fnhe;
 673	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 674		if (fnhe->fnhe_daddr == daddr)
 675			break;
 676		depth++;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 677	}
 678
 679	if (fnhe) {
 680		if (fnhe->fnhe_genid != genid)
 681			fnhe->fnhe_genid = genid;
 682		if (gw)
 683			fnhe->fnhe_gw = gw;
 684		if (pmtu) {
 685			fnhe->fnhe_pmtu = pmtu;
 686			fnhe->fnhe_mtu_locked = lock;
 687		}
 688		fnhe->fnhe_expires = max(1UL, expires);
 689		/* Update all cached dsts too */
 690		rt = rcu_dereference(fnhe->fnhe_rth_input);
 691		if (rt)
 692			fill_route_from_fnhe(rt, fnhe);
 693		rt = rcu_dereference(fnhe->fnhe_rth_output);
 694		if (rt)
 695			fill_route_from_fnhe(rt, fnhe);
 696	} else {
 697		if (depth > FNHE_RECLAIM_DEPTH)
 698			fnhe = fnhe_oldest(hash);
 699		else {
 700			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 701			if (!fnhe)
 702				goto out_unlock;
 703
 704			fnhe->fnhe_next = hash->chain;
 705			rcu_assign_pointer(hash->chain, fnhe);
 706		}
 707		fnhe->fnhe_genid = genid;
 708		fnhe->fnhe_daddr = daddr;
 709		fnhe->fnhe_gw = gw;
 710		fnhe->fnhe_pmtu = pmtu;
 711		fnhe->fnhe_mtu_locked = lock;
 712		fnhe->fnhe_expires = max(1UL, expires);
 713
 714		/* Exception created; mark the cached routes for the nexthop
 715		 * stale, so anyone caching it rechecks if this exception
 716		 * applies to them.
 717		 */
 718		rt = rcu_dereference(nh->nh_rth_input);
 719		if (rt)
 720			rt->dst.obsolete = DST_OBSOLETE_KILL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 721
 722		for_each_possible_cpu(i) {
 723			struct rtable __rcu **prt;
 724			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 725			rt = rcu_dereference(*prt);
 726			if (rt)
 727				rt->dst.obsolete = DST_OBSOLETE_KILL;
 728		}
 729	}
 730
 731	fnhe->fnhe_stamp = jiffies;
 732
 733out_unlock:
 734	spin_unlock_bh(&fnhe_lock);
 
 
 
 
 
 
 
 
 
 
 
 735}
 736
 737static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 738			     bool kill_route)
 
 739{
 740	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 741	__be32 old_gw = ip_hdr(skb)->saddr;
 742	struct net_device *dev = skb->dev;
 743	struct in_device *in_dev;
 744	struct fib_result res;
 745	struct neighbour *n;
 746	struct net *net;
 747
 748	switch (icmp_hdr(skb)->code & 7) {
 749	case ICMP_REDIR_NET:
 750	case ICMP_REDIR_NETTOS:
 751	case ICMP_REDIR_HOST:
 752	case ICMP_REDIR_HOSTTOS:
 753		break;
 754
 755	default:
 756		return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 757	}
 
 
 758
 759	if (rt->rt_gateway != old_gw)
 
 
 
 
 
 
 
 
 
 
 
 
 760		return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 761
 762	in_dev = __in_dev_get_rcu(dev);
 763	if (!in_dev)
 764		return;
 765
 766	net = dev_net(dev);
 767	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 768	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 769	    ipv4_is_zeronet(new_gw))
 770		goto reject_redirect;
 771
 772	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 773		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 774			goto reject_redirect;
 775		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 776			goto reject_redirect;
 777	} else {
 778		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 779			goto reject_redirect;
 780	}
 781
 782	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 783	if (!n)
 784		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 785	if (!IS_ERR(n)) {
 786		if (!(n->nud_state & NUD_VALID)) {
 787			neigh_event_send(n, NULL);
 788		} else {
 789			if (fib_lookup(net, fl4, &res, 0) == 0) {
 790				struct fib_nh *nh = &FIB_RES_NH(res);
 791
 792				update_or_create_fnhe(nh, fl4->daddr, new_gw,
 793						0, false,
 794						jiffies + ip_rt_gc_timeout);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 795			}
 796			if (kill_route)
 797				rt->dst.obsolete = DST_OBSOLETE_KILL;
 798			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 799		}
 800		neigh_release(n);
 801	}
 802	return;
 803
 804reject_redirect:
 805#ifdef CONFIG_IP_ROUTE_VERBOSE
 806	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 807		const struct iphdr *iph = (const struct iphdr *) skb->data;
 808		__be32 daddr = iph->daddr;
 809		__be32 saddr = iph->saddr;
 810
 811		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 812				     "  Advised path = %pI4 -> %pI4\n",
 813				     &old_gw, dev->name, &new_gw,
 814				     &saddr, &daddr);
 815	}
 816#endif
 817	;
 818}
 819
 820static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 821{
 822	struct rtable *rt;
 823	struct flowi4 fl4;
 824	const struct iphdr *iph = (const struct iphdr *) skb->data;
 825	struct net *net = dev_net(skb->dev);
 826	int oif = skb->dev->ifindex;
 827	u8 tos = RT_TOS(iph->tos);
 828	u8 prot = iph->protocol;
 829	u32 mark = skb->mark;
 830
 831	rt = (struct rtable *) dst;
 
 
 
 832
 833	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 834	__ip_do_redirect(rt, skb, &fl4, true);
 
 
 
 
 835}
 836
 837static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 838{
 839	struct rtable *rt = (struct rtable *)dst;
 840	struct dst_entry *ret = dst;
 841
 842	if (rt) {
 843		if (dst->obsolete > 0) {
 844			ip_rt_put(rt);
 845			ret = NULL;
 846		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 847			   rt->dst.expires) {
 848			ip_rt_put(rt);
 
 
 849			ret = NULL;
 
 
 850		}
 851	}
 852	return ret;
 853}
 854
 855/*
 856 * Algorithm:
 857 *	1. The first ip_rt_redirect_number redirects are sent
 858 *	   with exponential backoff, then we stop sending them at all,
 859 *	   assuming that the host ignores our redirects.
 860 *	2. If we did not see packets requiring redirects
 861 *	   during ip_rt_redirect_silence, we assume that the host
 862 *	   forgot redirected route and start to send redirects again.
 863 *
 864 * This algorithm is much cheaper and more intelligent than dumb load limiting
 865 * in icmp.c.
 866 *
 867 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 868 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 869 */
 870
 871void ip_rt_send_redirect(struct sk_buff *skb)
 872{
 873	struct rtable *rt = skb_rtable(skb);
 874	struct in_device *in_dev;
 875	struct inet_peer *peer;
 876	struct net *net;
 877	int log_martians;
 878	int vif;
 879
 880	rcu_read_lock();
 881	in_dev = __in_dev_get_rcu(rt->dst.dev);
 882	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 883		rcu_read_unlock();
 884		return;
 885	}
 886	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 887	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 888	rcu_read_unlock();
 889
 890	net = dev_net(rt->dst.dev);
 891	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 
 892	if (!peer) {
 893		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 894			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 895		return;
 896	}
 897
 898	/* No redirected packets during ip_rt_redirect_silence;
 899	 * reset the algorithm.
 900	 */
 901	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 902		peer->rate_tokens = 0;
 903
 904	/* Too many ignored redirects; do not send anything
 905	 * set dst.rate_last to the last seen redirected packet.
 906	 */
 907	if (peer->rate_tokens >= ip_rt_redirect_number) {
 908		peer->rate_last = jiffies;
 909		goto out_put_peer;
 910	}
 911
 912	/* Check for load limit; set rate_last to the latest sent
 913	 * redirect.
 914	 */
 915	if (peer->rate_tokens == 0 ||
 916	    time_after(jiffies,
 917		       (peer->rate_last +
 918			(ip_rt_redirect_load << peer->rate_tokens)))) {
 919		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 920
 921		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 922		peer->rate_last = jiffies;
 923		++peer->rate_tokens;
 924#ifdef CONFIG_IP_ROUTE_VERBOSE
 925		if (log_martians &&
 926		    peer->rate_tokens == ip_rt_redirect_number)
 927			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 928					     &ip_hdr(skb)->saddr, inet_iif(skb),
 929					     &ip_hdr(skb)->daddr, &gw);
 930#endif
 931	}
 932out_put_peer:
 933	inet_putpeer(peer);
 934}
 935
 936static int ip_error(struct sk_buff *skb)
 937{
 938	struct rtable *rt = skb_rtable(skb);
 939	struct net_device *dev = skb->dev;
 940	struct in_device *in_dev;
 941	struct inet_peer *peer;
 942	unsigned long now;
 943	struct net *net;
 944	bool send;
 945	int code;
 946
 947	if (netif_is_l3_master(skb->dev)) {
 948		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 949		if (!dev)
 950			goto out;
 951	}
 952
 953	in_dev = __in_dev_get_rcu(dev);
 954
 955	/* IP on this device is disabled. */
 956	if (!in_dev)
 957		goto out;
 958
 959	net = dev_net(rt->dst.dev);
 960	if (!IN_DEV_FORWARD(in_dev)) {
 961		switch (rt->dst.error) {
 962		case EHOSTUNREACH:
 963			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964			break;
 965
 966		case ENETUNREACH:
 967			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968			break;
 969		}
 970		goto out;
 971	}
 972
 973	switch (rt->dst.error) {
 974	case EINVAL:
 975	default:
 976		goto out;
 977	case EHOSTUNREACH:
 978		code = ICMP_HOST_UNREACH;
 979		break;
 980	case ENETUNREACH:
 981		code = ICMP_NET_UNREACH;
 982		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 
 983		break;
 984	case EACCES:
 985		code = ICMP_PKT_FILTERED;
 986		break;
 987	}
 988
 989	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990			       l3mdev_master_ifindex(skb->dev), 1);
 
 991
 992	send = true;
 993	if (peer) {
 994		now = jiffies;
 995		peer->rate_tokens += now - peer->rate_last;
 996		if (peer->rate_tokens > ip_rt_error_burst)
 997			peer->rate_tokens = ip_rt_error_burst;
 998		peer->rate_last = now;
 999		if (peer->rate_tokens >= ip_rt_error_cost)
1000			peer->rate_tokens -= ip_rt_error_cost;
1001		else
1002			send = false;
1003		inet_putpeer(peer);
1004	}
1005	if (send)
1006		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008out:	kfree_skb(skb);
1009	return 0;
1010}
1011
1012static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013{
1014	struct dst_entry *dst = &rt->dst;
1015	struct fib_result res;
1016	bool lock = false;
1017
1018	if (ip_mtu_locked(dst))
1019		return;
1020
1021	if (ipv4_mtu(dst) < mtu)
1022		return;
1023
1024	if (mtu < ip_rt_min_pmtu) {
1025		lock = true;
1026		mtu = ip_rt_min_pmtu;
1027	}
1028
1029	if (rt->rt_pmtu == mtu &&
1030	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1031		return;
1032
1033	rcu_read_lock();
1034	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1035		struct fib_nh *nh = &FIB_RES_NH(res);
1036
1037		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1038				      jiffies + ip_rt_mtu_expires);
1039	}
1040	rcu_read_unlock();
1041}
1042
1043static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044			      struct sk_buff *skb, u32 mtu)
1045{
1046	struct rtable *rt = (struct rtable *) dst;
1047	struct flowi4 fl4;
1048
1049	ip_rt_build_flow_key(&fl4, sk, skb);
1050	__ip_rt_update_pmtu(rt, &fl4, mtu);
 
 
1051}
1052
1053void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054		      int oif, u32 mark, u8 protocol, int flow_flags)
 
1055{
1056	const struct iphdr *iph = (const struct iphdr *) skb->data;
1057	struct flowi4 fl4;
1058	struct rtable *rt;
1059
1060	if (!mark)
1061		mark = IP4_REPLY_MARK(net, skb->mark);
 
1062
1063	__build_flow_key(net, &fl4, NULL, iph, oif,
1064			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1065	rt = __ip_route_output_key(net, &fl4);
1066	if (!IS_ERR(rt)) {
1067		__ip_rt_update_pmtu(rt, &fl4, mtu);
1068		ip_rt_put(rt);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1069	}
 
1070}
1071EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1072
1073static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074{
1075	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076	struct flowi4 fl4;
1077	struct rtable *rt;
1078
1079	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081	if (!fl4.flowi4_mark)
1082		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084	rt = __ip_route_output_key(sock_net(sk), &fl4);
1085	if (!IS_ERR(rt)) {
1086		__ip_rt_update_pmtu(rt, &fl4, mtu);
1087		ip_rt_put(rt);
1088	}
 
 
 
 
 
 
1089}
1090
1091void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093	const struct iphdr *iph = (const struct iphdr *) skb->data;
1094	struct flowi4 fl4;
1095	struct rtable *rt;
1096	struct dst_entry *odst = NULL;
1097	bool new = false;
1098	struct net *net = sock_net(sk);
1099
1100	bh_lock_sock(sk);
1101
1102	if (!ip_sk_accept_pmtu(sk))
1103		goto out;
1104
1105	odst = sk_dst_get(sk);
 
 
 
 
1106
1107	if (sock_owned_by_user(sk) || !odst) {
1108		__ipv4_sk_update_pmtu(skb, sk, mtu);
1109		goto out;
1110	}
1111
1112	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
 
 
1113
1114	rt = (struct rtable *)odst;
1115	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117		if (IS_ERR(rt))
1118			goto out;
1119
1120		new = true;
 
 
 
1121	}
 
1122
1123	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125	if (!dst_check(&rt->dst, 0)) {
1126		if (new)
1127			dst_release(&rt->dst);
1128
1129		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130		if (IS_ERR(rt))
1131			goto out;
1132
1133		new = true;
1134	}
 
 
1135
1136	if (new)
1137		sk_dst_set(sk, &rt->dst);
1138
1139out:
1140	bh_unlock_sock(sk);
1141	dst_release(odst);
1142}
1143EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146		   int oif, u32 mark, u8 protocol, int flow_flags)
1147{
1148	const struct iphdr *iph = (const struct iphdr *) skb->data;
1149	struct flowi4 fl4;
1150	struct rtable *rt;
1151
1152	__build_flow_key(net, &fl4, NULL, iph, oif,
1153			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1154	rt = __ip_route_output_key(net, &fl4);
1155	if (!IS_ERR(rt)) {
1156		__ip_do_redirect(rt, skb, &fl4, false);
1157		ip_rt_put(rt);
1158	}
1159}
1160EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163{
1164	const struct iphdr *iph = (const struct iphdr *) skb->data;
1165	struct flowi4 fl4;
1166	struct rtable *rt;
1167	struct net *net = sock_net(sk);
1168
1169	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170	rt = __ip_route_output_key(net, &fl4);
1171	if (!IS_ERR(rt)) {
1172		__ip_do_redirect(rt, skb, &fl4, false);
1173		ip_rt_put(rt);
1174	}
1175}
1176EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179{
1180	struct rtable *rt = (struct rtable *) dst;
 
1181
1182	/* All IPV4 dsts are created with ->obsolete set to the value
1183	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184	 * into this function always.
1185	 *
1186	 * When a PMTU/redirect information update invalidates a route,
1187	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188	 * DST_OBSOLETE_DEAD by dst_free().
1189	 */
1190	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191		return NULL;
1192	return dst;
1193}
1194
 
1195static void ipv4_link_failure(struct sk_buff *skb)
1196{
1197	struct rtable *rt;
1198
1199	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1200
1201	rt = skb_rtable(skb);
1202	if (rt)
1203		dst_set_expires(&rt->dst, 0);
1204}
1205
1206static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1207{
1208	pr_debug("%s: %pI4 -> %pI4, %s\n",
1209		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1210		 skb->dev ? skb->dev->name : "?");
1211	kfree_skb(skb);
1212	WARN_ON(1);
1213	return 0;
1214}
1215
1216/*
1217   We do not cache source address of outgoing interface,
1218   because it is used only by IP RR, TS and SRR options,
1219   so that it out of fast path.
1220
1221   BTW remember: "addr" is allowed to be not aligned
1222   in IP options!
1223 */
1224
1225void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1226{
1227	__be32 src;
1228
1229	if (rt_is_output_route(rt))
1230		src = ip_hdr(skb)->saddr;
1231	else {
1232		struct fib_result res;
1233		struct flowi4 fl4;
1234		struct iphdr *iph;
1235
1236		iph = ip_hdr(skb);
1237
1238		memset(&fl4, 0, sizeof(fl4));
1239		fl4.daddr = iph->daddr;
1240		fl4.saddr = iph->saddr;
1241		fl4.flowi4_tos = RT_TOS(iph->tos);
1242		fl4.flowi4_oif = rt->dst.dev->ifindex;
1243		fl4.flowi4_iif = skb->dev->ifindex;
1244		fl4.flowi4_mark = skb->mark;
1245
1246		rcu_read_lock();
1247		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1248			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1249		else
1250			src = inet_select_addr(rt->dst.dev,
1251					       rt_nexthop(rt, iph->daddr),
1252					       RT_SCOPE_UNIVERSE);
1253		rcu_read_unlock();
1254	}
1255	memcpy(addr, &src, 4);
1256}
1257
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and high 16-bit halves are
 * handled independently; a half is only filled in when it is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1267
1268static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1269{
1270	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1271	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1272				    ip_rt_min_advmss);
1273
1274	return min(advmss, IPV4_MAX_PMTU - header_size);
 
 
 
 
 
 
1275}
1276
1277static unsigned int ipv4_mtu(const struct dst_entry *dst)
1278{
1279	const struct rtable *rt = (const struct rtable *) dst;
1280	unsigned int mtu = rt->rt_pmtu;
1281
1282	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1283		mtu = dst_metric_raw(dst, RTAX_MTU);
1284
1285	if (mtu)
1286		return mtu;
1287
1288	mtu = READ_ONCE(dst->dev->mtu);
1289
1290	if (unlikely(ip_mtu_locked(dst))) {
1291		if (rt->rt_uses_gateway && mtu > 576)
1292			mtu = 576;
1293	}
1294
1295	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1296
1297	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1298}
1299
1300static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1301{
1302	struct fnhe_hash_bucket *hash;
1303	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1304	u32 hval = fnhe_hashfun(daddr);
1305
1306	spin_lock_bh(&fnhe_lock);
1307
1308	hash = rcu_dereference_protected(nh->nh_exceptions,
1309					 lockdep_is_held(&fnhe_lock));
1310	hash += hval;
1311
1312	fnhe_p = &hash->chain;
1313	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1314	while (fnhe) {
1315		if (fnhe->fnhe_daddr == daddr) {
1316			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1317				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1318			fnhe_flush_routes(fnhe);
1319			kfree_rcu(fnhe, rcu);
1320			break;
1321		}
1322		fnhe_p = &fnhe->fnhe_next;
1323		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1324						 lockdep_is_held(&fnhe_lock));
1325	}
1326
1327	spin_unlock_bh(&fnhe_lock);
1328}
1329
1330static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1331{
1332	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1333	struct fib_nh_exception *fnhe;
1334	u32 hval;
1335
1336	if (!hash)
1337		return NULL;
1338
1339	hval = fnhe_hashfun(daddr);
1340
1341	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1342	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1343		if (fnhe->fnhe_daddr == daddr) {
1344			if (fnhe->fnhe_expires &&
1345			    time_after(jiffies, fnhe->fnhe_expires)) {
1346				ip_del_fnhe(nh, daddr);
1347				break;
1348			}
1349			return fnhe;
1350		}
1351	}
1352	return NULL;
1353}
1354
1355static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1356			      __be32 daddr, const bool do_cache)
1357{
1358	bool ret = false;
1359
1360	spin_lock_bh(&fnhe_lock);
1361
1362	if (daddr == fnhe->fnhe_daddr) {
1363		struct rtable __rcu **porig;
1364		struct rtable *orig;
1365		int genid = fnhe_genid(dev_net(rt->dst.dev));
1366
1367		if (rt_is_input_route(rt))
1368			porig = &fnhe->fnhe_rth_input;
1369		else
1370			porig = &fnhe->fnhe_rth_output;
1371		orig = rcu_dereference(*porig);
1372
1373		if (fnhe->fnhe_genid != genid) {
1374			fnhe->fnhe_genid = genid;
1375			fnhe->fnhe_gw = 0;
1376			fnhe->fnhe_pmtu = 0;
1377			fnhe->fnhe_expires = 0;
1378			fnhe->fnhe_mtu_locked = false;
1379			fnhe_flush_routes(fnhe);
1380			orig = NULL;
1381		}
1382		fill_route_from_fnhe(rt, fnhe);
1383		if (!rt->rt_gateway)
1384			rt->rt_gateway = daddr;
1385
1386		if (do_cache) {
1387			dst_hold(&rt->dst);
1388			rcu_assign_pointer(*porig, rt);
1389			if (orig) {
1390				dst_dev_put(&orig->dst);
1391				dst_release(&orig->dst);
1392			}
1393			ret = true;
1394		}
1395
1396		fnhe->fnhe_stamp = jiffies;
1397	}
1398	spin_unlock_bh(&fnhe_lock);
1399
1400	return ret;
1401}
1402
1403static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
 
1404{
1405	struct rtable *orig, *prev, **p;
1406	bool ret = true;
1407
1408	if (rt_is_input_route(rt)) {
1409		p = (struct rtable **)&nh->nh_rth_input;
1410	} else {
1411		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1412	}
1413	orig = *p;
1414
1415	/* hold dst before doing cmpxchg() to avoid race condition
1416	 * on this dst
1417	 */
1418	dst_hold(&rt->dst);
1419	prev = cmpxchg(p, orig, rt);
1420	if (prev == orig) {
1421		if (orig) {
1422			dst_dev_put(&orig->dst);
1423			dst_release(&orig->dst);
 
 
 
 
 
 
 
 
 
 
 
1424		}
1425	} else {
1426		dst_release(&rt->dst);
1427		ret = false;
1428	}
1429
1430	return ret;
1431}
1432
1433struct uncached_list {
1434	spinlock_t		lock;
1435	struct list_head	head;
1436};
1437
1438static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1439
1440void rt_add_uncached_list(struct rtable *rt)
1441{
1442	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1443
1444	rt->rt_uncached_list = ul;
1445
1446	spin_lock_bh(&ul->lock);
1447	list_add_tail(&rt->rt_uncached, &ul->head);
1448	spin_unlock_bh(&ul->lock);
1449}
1450
1451void rt_del_uncached_list(struct rtable *rt)
1452{
1453	if (!list_empty(&rt->rt_uncached)) {
1454		struct uncached_list *ul = rt->rt_uncached_list;
1455
1456		spin_lock_bh(&ul->lock);
1457		list_del(&rt->rt_uncached);
1458		spin_unlock_bh(&ul->lock);
1459	}
1460}
1461
1462static void ipv4_dst_destroy(struct dst_entry *dst)
1463{
1464	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1465	struct rtable *rt = (struct rtable *)dst;
1466
1467	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1468		kfree(p);
1469
1470	rt_del_uncached_list(rt);
1471}
1472
1473void rt_flush_dev(struct net_device *dev)
1474{
1475	struct net *net = dev_net(dev);
1476	struct rtable *rt;
1477	int cpu;
1478
1479	for_each_possible_cpu(cpu) {
1480		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1481
1482		spin_lock_bh(&ul->lock);
1483		list_for_each_entry(rt, &ul->head, rt_uncached) {
1484			if (rt->dst.dev != dev)
1485				continue;
1486			rt->dst.dev = net->loopback_dev;
1487			dev_hold(rt->dst.dev);
1488			dev_put(dev);
1489		}
1490		spin_unlock_bh(&ul->lock);
1491	}
1492}
1493
1494static bool rt_cache_valid(const struct rtable *rt)
1495{
1496	return	rt &&
1497		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498		!rt_is_expired(rt);
1499}
1500
1501static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1502			   const struct fib_result *res,
1503			   struct fib_nh_exception *fnhe,
1504			   struct fib_info *fi, u16 type, u32 itag,
1505			   const bool do_cache)
1506{
1507	bool cached = false;
1508
1509	if (fi) {
1510		struct fib_nh *nh = &FIB_RES_NH(*res);
1511
1512		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1513			rt->rt_gateway = nh->nh_gw;
1514			rt->rt_uses_gateway = 1;
1515		}
1516		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517		if (fi->fib_metrics != &dst_default_metrics) {
1518			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1519			refcount_inc(&fi->fib_metrics->refcnt);
1520		}
1521#ifdef CONFIG_IP_ROUTE_CLASSID
1522		rt->dst.tclassid = nh->nh_tclassid;
1523#endif
1524		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1525		if (unlikely(fnhe))
1526			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527		else if (do_cache)
1528			cached = rt_cache_route(nh, rt);
1529		if (unlikely(!cached)) {
1530			/* Routes we intend to cache in nexthop exception or
1531			 * FIB nexthop have the DST_NOCACHE bit clear.
1532			 * However, if we are unsuccessful at storing this
1533			 * route into the cache we really need to set it.
1534			 */
1535			if (!rt->rt_gateway)
1536				rt->rt_gateway = daddr;
1537			rt_add_uncached_list(rt);
1538		}
1539	} else
1540		rt_add_uncached_list(rt);
1541
1542#ifdef CONFIG_IP_ROUTE_CLASSID
1543#ifdef CONFIG_IP_MULTIPLE_TABLES
1544	set_class_tag(rt, res->tclassid);
1545#endif
1546	set_class_tag(rt, itag);
1547#endif
1548}
1549
1550struct rtable *rt_dst_alloc(struct net_device *dev,
1551			    unsigned int flags, u16 type,
1552			    bool nopolicy, bool noxfrm, bool will_cache)
1553{
1554	struct rtable *rt;
1555
1556	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1557		       (will_cache ? 0 : DST_HOST) |
1558		       (nopolicy ? DST_NOPOLICY : 0) |
1559		       (noxfrm ? DST_NOXFRM : 0));
1560
1561	if (rt) {
1562		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563		rt->rt_flags = flags;
1564		rt->rt_type = type;
1565		rt->rt_is_input = 0;
1566		rt->rt_iif = 0;
1567		rt->rt_pmtu = 0;
1568		rt->rt_mtu_locked = 0;
1569		rt->rt_gateway = 0;
1570		rt->rt_uses_gateway = 0;
1571		INIT_LIST_HEAD(&rt->rt_uncached);
1572
1573		rt->dst.output = ip_output;
1574		if (flags & RTCF_LOCAL)
1575			rt->dst.input = ip_local_deliver;
1576	}
1577
1578	return rt;
1579}
1580EXPORT_SYMBOL(rt_dst_alloc);
1581
1582/* called in rcu_read_lock() section */
1583int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1584			  u8 tos, struct net_device *dev,
1585			  struct in_device *in_dev, u32 *itag)
1586{
 
 
 
 
 
1587	int err;
1588
1589	/* Primary sanity checks. */
1590	if (!in_dev)
1591		return -EINVAL;
1592
1593	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1594	    skb->protocol != htons(ETH_P_IP))
1595		return -EINVAL;
1596
1597	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1598		return -EINVAL;
 
1599
1600	if (ipv4_is_zeronet(saddr)) {
1601		if (!ipv4_is_local_multicast(daddr))
1602			return -EINVAL;
 
1603	} else {
1604		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1605					  in_dev, itag);
1606		if (err < 0)
1607			return err;
1608	}
1609	return 0;
1610}
1611
1612/* called in rcu_read_lock() section */
1613static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1614			     u8 tos, struct net_device *dev, int our)
1615{
1616	struct in_device *in_dev = __in_dev_get_rcu(dev);
1617	unsigned int flags = RTCF_MULTICAST;
1618	struct rtable *rth;
1619	u32 itag = 0;
1620	int err;
1621
1622	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1623	if (err)
1624		return err;
1625
1626	if (our)
1627		flags |= RTCF_LOCAL;
1628
1629	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1630			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1631	if (!rth)
1632		return -ENOBUFS;
1633
1634#ifdef CONFIG_IP_ROUTE_CLASSID
1635	rth->dst.tclassid = itag;
1636#endif
1637	rth->dst.output = ip_rt_bug;
1638	rth->rt_is_input= 1;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1639
1640#ifdef CONFIG_IP_MROUTE
1641	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1642		rth->dst.input = ip_mr_input;
1643#endif
1644	RT_CACHE_STAT_INC(in_slow_mc);
1645
1646	skb_dst_set(skb, &rth->dst);
1647	return 0;
 
 
 
 
 
 
 
 
1648}
1649
1650
1651static void ip_handle_martian_source(struct net_device *dev,
1652				     struct in_device *in_dev,
1653				     struct sk_buff *skb,
1654				     __be32 daddr,
1655				     __be32 saddr)
1656{
1657	RT_CACHE_STAT_INC(in_martian_src);
1658#ifdef CONFIG_IP_ROUTE_VERBOSE
1659	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1660		/*
1661		 *	RFC1812 recommendation, if source is martian,
1662		 *	the only hint is MAC header.
1663		 */
1664		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1665			&daddr, &saddr, dev->name);
1666		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1667			print_hex_dump(KERN_WARNING, "ll header: ",
1668				       DUMP_PREFIX_OFFSET, 16, 1,
1669				       skb_mac_header(skb),
1670				       dev->hard_header_len, true);
1671		}
1672	}
1673#endif
1674}
1675
1676/* called in rcu_read_lock() section */
1677static int __mkroute_input(struct sk_buff *skb,
1678			   const struct fib_result *res,
1679			   struct in_device *in_dev,
1680			   __be32 daddr, __be32 saddr, u32 tos)
 
1681{
1682	struct fib_nh_exception *fnhe;
1683	struct rtable *rth;
1684	int err;
1685	struct in_device *out_dev;
1686	bool do_cache;
1687	u32 itag = 0;
 
1688
1689	/* get a working reference to the output device */
1690	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1691	if (!out_dev) {
1692		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1693		return -EINVAL;
1694	}
1695
 
1696	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1697				  in_dev->dev, in_dev, &itag);
1698	if (err < 0) {
1699		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1700					 saddr);
1701
1702		goto cleanup;
1703	}
1704
1705	do_cache = res->fi && !itag;
1706	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1707	    skb->protocol == htons(ETH_P_IP) &&
 
1708	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1709	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1710		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1711
1712	if (skb->protocol != htons(ETH_P_IP)) {
1713		/* Not IP (i.e. ARP). Do not create route, if it is
1714		 * invalid for proxy arp. DNAT routes are always valid.
1715		 *
1716		 * Proxy arp feature have been extended to allow, ARP
1717		 * replies back to the same interface, to support
1718		 * Private VLAN switch technologies. See arp.c.
1719		 */
1720		if (out_dev == in_dev &&
1721		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1722			err = -EINVAL;
1723			goto cleanup;
1724		}
1725	}
1726
1727	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1728	if (do_cache) {
1729		if (fnhe)
1730			rth = rcu_dereference(fnhe->fnhe_rth_input);
1731		else
1732			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1733		if (rt_cache_valid(rth)) {
1734			skb_dst_set_noref(skb, &rth->dst);
1735			goto out;
1736		}
1737	}
1738
1739	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1740			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1741			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1742	if (!rth) {
1743		err = -ENOBUFS;
1744		goto cleanup;
1745	}
1746
1747	rth->rt_is_input = 1;
1748	RT_CACHE_STAT_INC(in_slow_tot);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1749
1750	rth->dst.input = ip_forward;
 
1751
1752	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1753		       do_cache);
1754	lwtunnel_set_redirect(&rth->dst);
1755	skb_dst_set(skb, &rth->dst);
1756out:
1757	err = 0;
1758 cleanup:
1759	return err;
1760}
1761
1762#ifdef CONFIG_IP_ROUTE_MULTIPATH
1763/* To make ICMP packets follow the right flow, the multipath hash is
1764 * calculated from the inner IP addresses.
1765 */
1766static void ip_multipath_l3_keys(const struct sk_buff *skb,
1767				 struct flow_keys *hash_keys)
1768{
1769	const struct iphdr *outer_iph = ip_hdr(skb);
1770	const struct iphdr *key_iph = outer_iph;
1771	const struct iphdr *inner_iph;
1772	const struct icmphdr *icmph;
1773	struct iphdr _inner_iph;
1774	struct icmphdr _icmph;
1775
1776	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1777		goto out;
1778
1779	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1780		goto out;
1781
1782	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1783				   &_icmph);
1784	if (!icmph)
1785		goto out;
1786
1787	if (icmph->type != ICMP_DEST_UNREACH &&
1788	    icmph->type != ICMP_REDIRECT &&
1789	    icmph->type != ICMP_TIME_EXCEEDED &&
1790	    icmph->type != ICMP_PARAMETERPROB)
1791		goto out;
1792
1793	inner_iph = skb_header_pointer(skb,
1794				       outer_iph->ihl * 4 + sizeof(_icmph),
1795				       sizeof(_inner_iph), &_inner_iph);
1796	if (!inner_iph)
1797		goto out;
1798
1799	key_iph = inner_iph;
1800out:
1801	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1802	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1803}
1804
1805/* if skb is set it will be used and fl4 can be NULL */
1806int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1807		       const struct sk_buff *skb, struct flow_keys *flkeys)
1808{
1809	struct flow_keys hash_keys;
1810	u32 mhash;
1811
1812	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1813	case 0:
1814		memset(&hash_keys, 0, sizeof(hash_keys));
1815		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1816		if (skb) {
1817			ip_multipath_l3_keys(skb, &hash_keys);
1818		} else {
1819			hash_keys.addrs.v4addrs.src = fl4->saddr;
1820			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1821		}
1822		break;
1823	case 1:
1824		/* skb is currently provided only when forwarding */
1825		if (skb) {
1826			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1827			struct flow_keys keys;
1828
1829			/* short-circuit if we already have L4 hash present */
1830			if (skb->l4_hash)
1831				return skb_get_hash_raw(skb) >> 1;
1832
1833			memset(&hash_keys, 0, sizeof(hash_keys));
1834
1835			if (!flkeys) {
1836				skb_flow_dissect_flow_keys(skb, &keys, flag);
1837				flkeys = &keys;
1838			}
1839
1840			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1841			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1842			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1843			hash_keys.ports.src = flkeys->ports.src;
1844			hash_keys.ports.dst = flkeys->ports.dst;
1845			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1846		} else {
1847			memset(&hash_keys, 0, sizeof(hash_keys));
1848			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1849			hash_keys.addrs.v4addrs.src = fl4->saddr;
1850			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1851			hash_keys.ports.src = fl4->fl4_sport;
1852			hash_keys.ports.dst = fl4->fl4_dport;
1853			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1854		}
1855		break;
1856	}
1857	mhash = flow_hash_from_keys(&hash_keys);
1858
1859	return mhash >> 1;
1860}
1861#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1862
1863static int ip_mkroute_input(struct sk_buff *skb,
1864			    struct fib_result *res,
 
1865			    struct in_device *in_dev,
1866			    __be32 daddr, __be32 saddr, u32 tos,
1867			    struct flow_keys *hkeys)
1868{
1869#ifdef CONFIG_IP_ROUTE_MULTIPATH
1870	if (res->fi && res->fi->fib_nhs > 1) {
1871		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1872
1873		fib_select_multipath(res, h);
1874	}
 
1875#endif
1876
1877	/* create a routing cache entry */
1878	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 
 
 
 
 
 
 
 
 
 
1879}
1880
1881/*
1882 *	NOTE. We drop all the packets that has local source
1883 *	addresses, because every properly looped back packet
1884 *	must have correct destination already attached by output routine.
1885 *
1886 *	Such approach solves two big problems:
1887 *	1. Not simplex devices are handled properly.
1888 *	2. IP spoofing attempts are filtered with 100% of guarantee.
1889 *	called with rcu_read_lock()
1890 */
1891
1892static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1893			       u8 tos, struct net_device *dev,
1894			       struct fib_result *res)
1895{
 
1896	struct in_device *in_dev = __in_dev_get_rcu(dev);
1897	struct flow_keys *flkeys = NULL, _flkeys;
1898	struct net    *net = dev_net(dev);
1899	struct ip_tunnel_info *tun_info;
1900	int		err = -EINVAL;
1901	unsigned int	flags = 0;
1902	u32		itag = 0;
1903	struct rtable	*rth;
1904	struct flowi4	fl4;
1905	bool do_cache;
 
 
1906
1907	/* IP on this device is disabled. */
1908
1909	if (!in_dev)
1910		goto out;
1911
1912	/* Check for the most weird martians, which can be not detected
1913	   by fib_lookup.
1914	 */
1915
1916	tun_info = skb_tunnel_info(skb);
1917	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1918		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1919	else
1920		fl4.flowi4_tun_key.tun_id = 0;
1921	skb_dst_drop(skb);
1922
1923	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1924		goto martian_source;
1925
1926	res->fi = NULL;
1927	res->table = NULL;
1928	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1929		goto brd_input;
1930
1931	/* Accept zero addresses only to limited broadcast;
1932	 * I even do not know to fix it or not. Waiting for complains :-)
1933	 */
1934	if (ipv4_is_zeronet(saddr))
1935		goto martian_source;
1936
1937	if (ipv4_is_zeronet(daddr))
1938		goto martian_destination;
1939
1940	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1941	 * and call it once if daddr or/and saddr are loopback addresses
1942	 */
1943	if (ipv4_is_loopback(daddr)) {
1944		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1945			goto martian_destination;
1946	} else if (ipv4_is_loopback(saddr)) {
1947		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1948			goto martian_source;
1949	}
1950
1951	/*
1952	 *	Now we are ready to route packet.
1953	 */
1954	fl4.flowi4_oif = 0;
1955	fl4.flowi4_iif = dev->ifindex;
1956	fl4.flowi4_mark = skb->mark;
1957	fl4.flowi4_tos = tos;
1958	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1959	fl4.flowi4_flags = 0;
1960	fl4.daddr = daddr;
1961	fl4.saddr = saddr;
1962	fl4.flowi4_uid = sock_net_uid(net, NULL);
1963
1964	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1965		flkeys = &_flkeys;
1966	} else {
1967		fl4.flowi4_proto = 0;
1968		fl4.fl4_sport = 0;
1969		fl4.fl4_dport = 0;
1970	}
1971
1972	err = fib_lookup(net, &fl4, res, 0);
1973	if (err != 0) {
1974		if (!IN_DEV_FORWARD(in_dev))
1975			err = -EHOSTUNREACH;
1976		goto no_route;
1977	}
1978
1979	if (res->type == RTN_BROADCAST)
 
 
1980		goto brd_input;
1981
1982	if (res->type == RTN_LOCAL) {
1983		err = fib_validate_source(skb, saddr, daddr, tos,
1984					  0, dev, in_dev, &itag);
 
1985		if (err < 0)
1986			goto martian_source;
 
 
 
1987		goto local_input;
1988	}
1989
1990	if (!IN_DEV_FORWARD(in_dev)) {
1991		err = -EHOSTUNREACH;
1992		goto no_route;
1993	}
1994	if (res->type != RTN_UNICAST)
1995		goto martian_destination;
1996
1997	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1998out:	return err;
1999
2000brd_input:
2001	if (skb->protocol != htons(ETH_P_IP))
2002		goto e_inval;
2003
2004	if (!ipv4_is_zeronet(saddr)) {
2005		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006					  in_dev, &itag);
 
 
2007		if (err < 0)
2008			goto martian_source;
 
 
2009	}
2010	flags |= RTCF_BROADCAST;
2011	res->type = RTN_BROADCAST;
2012	RT_CACHE_STAT_INC(in_brd);
2013
2014local_input:
2015	do_cache = false;
2016	if (res->fi) {
2017		if (!itag) {
2018			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2019			if (rt_cache_valid(rth)) {
2020				skb_dst_set_noref(skb, &rth->dst);
2021				err = 0;
2022				goto out;
2023			}
2024			do_cache = true;
2025		}
2026	}
2027
2028	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2029			   flags | RTCF_LOCAL, res->type,
2030			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2031	if (!rth)
2032		goto e_nobufs;
2033
 
2034	rth->dst.output= ip_rt_bug;
2035#ifdef CONFIG_IP_ROUTE_CLASSID
2036	rth->dst.tclassid = itag;
2037#endif
2038	rth->rt_is_input = 1;
2039
2040	RT_CACHE_STAT_INC(in_slow_tot);
2041	if (res->type == RTN_UNREACHABLE) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2042		rth->dst.input= ip_error;
2043		rth->dst.error= -err;
2044		rth->rt_flags 	&= ~RTCF_LOCAL;
2045	}
2046
2047	if (do_cache) {
2048		struct fib_nh *nh = &FIB_RES_NH(*res);
2049
2050		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2051		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2052			WARN_ON(rth->dst.input == lwtunnel_input);
2053			rth->dst.lwtstate->orig_input = rth->dst.input;
2054			rth->dst.input = lwtunnel_input;
2055		}
2056
2057		if (unlikely(!rt_cache_route(nh, rth)))
2058			rt_add_uncached_list(rth);
2059	}
2060	skb_dst_set(skb, &rth->dst);
2061	err = 0;
 
 
2062	goto out;
2063
2064no_route:
2065	RT_CACHE_STAT_INC(in_no_route);
2066	res->type = RTN_UNREACHABLE;
2067	res->fi = NULL;
2068	res->table = NULL;
 
2069	goto local_input;
2070
2071	/*
2072	 *	Do not cache martian addresses: they should be logged (RFC1812)
2073	 */
2074martian_destination:
2075	RT_CACHE_STAT_INC(in_martian_dst);
2076#ifdef CONFIG_IP_ROUTE_VERBOSE
2077	if (IN_DEV_LOG_MARTIANS(in_dev))
2078		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2079				     &daddr, &saddr, dev->name);
2080#endif
2081
 
 
 
 
2082e_inval:
2083	err = -EINVAL;
2084	goto out;
2085
2086e_nobufs:
2087	err = -ENOBUFS;
2088	goto out;
2089
2090martian_source:
 
 
2091	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2092	goto out;
2093}
2094
2095int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096			 u8 tos, struct net_device *dev)
2097{
2098	struct fib_result res;
2099	int err;
 
 
 
 
 
2100
2101	tos &= IPTOS_RT_MASK;
2102	rcu_read_lock();
2103	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2104	rcu_read_unlock();
2105
2106	return err;
2107}
2108EXPORT_SYMBOL(ip_route_input_noref);
2109
2110/* called with rcu_read_lock held */
2111int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2112		       u8 tos, struct net_device *dev, struct fib_result *res)
2113{
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2114	/* Multicast recognition logic is moved from route cache to here.
2115	   The problem was that too many Ethernet cards have broken/missing
2116	   hardware multicast filters :-( As result the host on multicasting
2117	   network acquires a lot of useless route cache entries, sort of
2118	   SDR messages from all the world. Now we try to get rid of them.
2119	   Really, provided software IP multicast filter is organized
2120	   reasonably (at least, hashed), it does not result in a slowdown
2121	   comparing with route cache reject entries.
2122	   Note, that multicast routers are not affected, because
2123	   route cache entry is created eventually.
2124	 */
2125	if (ipv4_is_multicast(daddr)) {
2126		struct in_device *in_dev = __in_dev_get_rcu(dev);
2127		int our = 0;
2128		int err = -EINVAL;
2129
2130		if (in_dev)
2131			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2132					      ip_hdr(skb)->protocol);
2133
2134		/* check l3 master if no match yet */
2135		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2136			struct in_device *l3_in_dev;
2137
2138			l3_in_dev = __in_dev_get_rcu(skb->dev);
2139			if (l3_in_dev)
2140				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2141						      ip_hdr(skb)->protocol);
2142		}
2143
2144		if (our
 
 
 
2145#ifdef CONFIG_IP_MROUTE
2146			||
2147		    (!ipv4_is_local_multicast(daddr) &&
2148		     IN_DEV_MFORWARD(in_dev))
2149#endif
2150		   ) {
2151			err = ip_route_input_mc(skb, daddr, saddr,
2152						tos, dev, our);
 
 
 
2153		}
2154		return err;
 
2155	}
2156
2157	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
 
2158}
 
2159
/*
 * __mkroute_output - build (or reuse) the output route for a resolved flow.
 * @res:      FIB lookup result; res->fi may be NULL
 * @fl4:      flow key; saddr, daddr, flowi4_proto and flowi4_flags are read
 * @orig_oif: recorded in the new route's rt_iif; also used to decide
 *            whether a local route may be cached
 * @dev_out:  device the route leaves through
 * @flags:    initial RTCF_* flags; more are OR-ed in below
 *
 * Returns a cached route when one is valid and a reference could be taken,
 * otherwise a freshly allocated one.  Fails with ERR_PTR(-EINVAL) (no
 * in_device, rejected loopback source, zeronet destination) or
 * ERR_PTR(-ENOBUFS) (allocation failure).
 */
2160/* called with rcu_read_lock() */
2161static struct rtable *__mkroute_output(const struct fib_result *res,
2162				       const struct flowi4 *fl4, int orig_oif,
 
 
2163				       struct net_device *dev_out,
2164				       unsigned int flags)
2165{
2166	struct fib_info *fi = res->fi;
2167	struct fib_nh_exception *fnhe;
2168	struct in_device *in_dev;
2169	u16 type = res->type;
2170	struct rtable *rth;
2171	bool do_cache;
2172
2173	in_dev = __in_dev_get_rcu(dev_out);
2174	if (!in_dev)
2175		return ERR_PTR(-EINVAL);
2176
	/* When IN_DEV_ROUTE_LOCALNET() is not set for the device, reject a
	 * loopback source address unless dev_out is a loopback or L3 master
	 * device.
	 */
2177	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2178		if (ipv4_is_loopback(fl4->saddr) &&
2179		    !(dev_out->flags & IFF_LOOPBACK) &&
2180		    !netif_is_l3_master(dev_out))
2181			return ERR_PTR(-EINVAL);
2182
	/* The destination address overrides the FIB result type for limited
	 * broadcast and multicast; a zeronet destination is rejected.
	 */
2183	if (ipv4_is_lbcast(fl4->daddr))
2184		type = RTN_BROADCAST;
2185	else if (ipv4_is_multicast(fl4->daddr))
2186		type = RTN_MULTICAST;
2187	else if (ipv4_is_zeronet(fl4->daddr))
2188		return ERR_PTR(-EINVAL);
2189
2190	if (dev_out->flags & IFF_LOOPBACK)
2191		flags |= RTCF_LOCAL;
2192
2193	do_cache = true;
 
 
 
2194	if (type == RTN_BROADCAST) {
2195		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2196		fi = NULL;
2197	} else if (type == RTN_MULTICAST) {
2198		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2199		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2200				     fl4->flowi4_proto))
2201			flags &= ~RTCF_LOCAL;
2202		else
2203			do_cache = false;
2204		/* If multicast route do not exist use
2205		 * default one, but do not gateway in this case.
2206		 * Yes, it is hack.
2207		 */
2208		if (fi && res->prefixlen < 4)
2209			fi = NULL;
2210	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2211		   (orig_oif != dev_out->ifindex)) {
2212		/* For local routes that require a particular output interface
2213		 * we do not want to cache the result.  Caching the result
2214		 * causes incorrect behaviour when there are multiple source
2215		 * addresses on the interface, the end result being that if the
2216		 * intended recipient is waiting on that interface for the
2217		 * packet he won't receive it because it will be delivered on
2218		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2219		 * be set to the loopback interface as well.
2220		 */
2221		do_cache = false;
2222	}
2223
2224	fnhe = NULL;
	/* Both cache slots used below hang off the nexthop, which only
	 * exists when there is a fib_info.
	 */
2225	do_cache &= fi != NULL;
2226	if (fi) {
2227		struct rtable __rcu **prth;
2228		struct fib_nh *nh = &FIB_RES_NH(*res);
2229
2230		fnhe = find_exception(nh, fl4->daddr);
2231		if (!do_cache)
2232			goto add;
2233		if (fnhe) {
2234			prth = &fnhe->fnhe_rth_output;
2235		} else {
2236			if (unlikely(fl4->flowi4_flags &
2237				     FLOWI_FLAG_KNOWN_NH &&
2238				     !(nh->nh_gw &&
2239				       nh->nh_scope == RT_SCOPE_LINK))) {
2240				do_cache = false;
2241				goto add;
2242			}
2243			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2244		}
		/* Hand out the cached route if it is still valid and a
		 * reference can be taken on it.
		 */
2245		rth = rcu_dereference(*prth);
2246		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2247			return rth;
2248	}
2249
2250add:
2251	rth = rt_dst_alloc(dev_out, flags, type,
2252			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2253			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2254			   do_cache);
2255	if (!rth)
2256		return ERR_PTR(-ENOBUFS);
2257
2258	rth->rt_iif = orig_oif;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2259
2260	RT_CACHE_STAT_INC(out_slow_tot);
2261
 
 
 
 
	/* Broadcast/multicast routes that are also local on a non-loopback
	 * device are sent through ip_mc_output.
	 */
2262	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
 
2263		if (flags & RTCF_LOCAL &&
2264		    !(dev_out->flags & IFF_LOOPBACK)) {
2265			rth->dst.output = ip_mc_output;
2266			RT_CACHE_STAT_INC(out_slow_mc);
2267		}
2268#ifdef CONFIG_IP_MROUTE
2269		if (type == RTN_MULTICAST) {
2270			if (IN_DEV_MFORWARD(in_dev) &&
2271			    !ipv4_is_local_multicast(fl4->daddr)) {
2272				rth->dst.input = ip_mr_input;
2273				rth->dst.output = ip_mc_output;
2274			}
2275		}
2276#endif
2277	}
2278
2279	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2280	lwtunnel_set_redirect(&rth->dst);
2281
2282	return rth;
2283}
2284
2285/*
2286 * Major route resolver routine.
 
2287 */
2288
2289struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2290					const struct sk_buff *skb)
2291{
 
2292	__u8 tos = RT_FL_TOS(fl4);
2293	struct fib_result res = {
2294		.type		= RTN_UNSPEC,
2295		.fi		= NULL,
2296		.table		= NULL,
2297		.tclassid	= 0,
2298	};
2299	struct rtable *rth;
 
 
 
 
 
 
 
 
 
 
 
 
2300
2301	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2302	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2303	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2304			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2305
2306	rcu_read_lock();
2307	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2308	rcu_read_unlock();
2309
2310	return rth;
2311}
2312EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2313
/*
 * ip_route_output_key_hash_rcu - resolve an output route for a flow.
 * @net: network namespace of the lookup
 * @fl4: flow key; saddr, daddr and flowi4_oif may be filled in or
 *       overwritten on the way
 * @res: FIB result storage, filled by fib_lookup() or set up by hand for
 *       the shortcut cases below
 * @skb: only passed through to fib_select_path()
 *
 * Returns the route built by __mkroute_output(), or an ERR_PTR() with
 * -EINVAL, -ENODEV, -ENETUNREACH or the error reported by fib_lookup().
 * Both callers visible in this file invoke it under rcu_read_lock().
 */
2314struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2315					    struct fib_result *res,
2316					    const struct sk_buff *skb)
2317{
2318	struct net_device *dev_out = NULL;
2319	int orig_oif = fl4->flowi4_oif;
2320	unsigned int flags = 0;
2321	struct rtable *rth;
2322	int err = -ENETUNREACH;
2323
	/* A caller-supplied source address must be a sane unicast address. */
2324	if (fl4->saddr) {
2325		rth = ERR_PTR(-EINVAL);
2326		if (ipv4_is_multicast(fl4->saddr) ||
2327		    ipv4_is_lbcast(fl4->saddr) ||
2328		    ipv4_is_zeronet(fl4->saddr))
2329			goto out;
2330
2331		/* I removed check for oif == dev_out->oif here.
2332		   It was wrong for two reasons:
2333		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2334		      is assigned to multiple interfaces.
2335		   2. Moreover, we are allowed to send packets with saddr
2336		      of another iface. --ANK
2337		 */
2338
2339		if (fl4->flowi4_oif == 0 &&
2340		    (ipv4_is_multicast(fl4->daddr) ||
2341		     ipv4_is_lbcast(fl4->daddr))) {
2342			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2343			dev_out = __ip_dev_find(net, fl4->saddr, false);
2344			if (!dev_out)
2345				goto out;
2346
2347			/* Special hack: user can direct multicasts
2348			   and limited broadcast via necessary interface
2349			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2350			   This hack is not just for fun, it allows
2351			   vic,vat and friends to work.
2352			   They bind socket to loopback, set ttl to zero
2353			   and expect that it will work.
2354			   From the viewpoint of routing cache they are broken,
2355			   because we are not allowed to build multicast path
2356			   with loopback source addr (look, routing cache
2357			   cannot know, that ttl is zero, so that packet
2358			   will not leave this host and route is valid).
2359			   Luckily, this hack is good workaround.
2360			 */
2361
2362			fl4->flowi4_oif = dev_out->ifindex;
2363			goto make_route;
2364		}
2365
2366		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2367			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2368			if (!__ip_dev_find(net, fl4->saddr, false))
2369				goto out;
2370		}
2371	}
2372
2373
	/* An output interface was requested: it must exist, be up and have
	 * an in_device.
	 */
2374	if (fl4->flowi4_oif) {
2375		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2376		rth = ERR_PTR(-ENODEV);
2377		if (!dev_out)
2378			goto out;
2379
2380		/* RACE: Check return value of inet_select_addr instead. */
2381		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2382			rth = ERR_PTR(-ENETUNREACH);
2383			goto out;
2384		}
		/* Link-local multicast, limited broadcast and IGMP go straight
		 * out of the requested device without a FIB lookup.
		 */
2385		if (ipv4_is_local_multicast(fl4->daddr) ||
2386		    ipv4_is_lbcast(fl4->daddr) ||
2387		    fl4->flowi4_proto == IPPROTO_IGMP) {
2388			if (!fl4->saddr)
2389				fl4->saddr = inet_select_addr(dev_out, 0,
2390							      RT_SCOPE_LINK);
2391			goto make_route;
2392		}
2393		if (!fl4->saddr) {
2394			if (ipv4_is_multicast(fl4->daddr))
2395				fl4->saddr = inet_select_addr(dev_out, 0,
2396							      fl4->flowi4_scope);
2397			else if (!fl4->daddr)
2398				fl4->saddr = inet_select_addr(dev_out, 0,
2399							      RT_SCOPE_HOST);
2400		}
2401	}
2402
	/* No destination: build a local route over the loopback device,
	 * towards saddr if one is known, otherwise 127.0.0.1.
	 */
2403	if (!fl4->daddr) {
2404		fl4->daddr = fl4->saddr;
2405		if (!fl4->daddr)
2406			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2407		dev_out = net->loopback_dev;
2408		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2409		res->type = RTN_LOCAL;
2410		flags |= RTCF_LOCAL;
2411		goto make_route;
2412	}
2413
2414	err = fib_lookup(net, fl4, res, 0);
2415	if (err) {
2416		res->fi = NULL;
2417		res->table = NULL;
2418		if (fl4->flowi4_oif &&
2419		    (ipv4_is_multicast(fl4->daddr) ||
2420		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2421			/* Apparently, routing tables are wrong. Assume,
2422			   that the destination is on link.
2423
2424			   WHY? DW.
2425			   Because we are allowed to send to iface
2426			   even if it has NO routes and NO assigned
2427			   addresses. When oif is specified, routing
2428			   tables are looked up with only one purpose:
2429			   to catch if destination is gatewayed, rather than
2430			   direct. Moreover, if MSG_DONTROUTE is set,
2431			   we send packet, ignoring both routing tables
2432			   and ifaddr state. --ANK
2433
2434
2435			   We could make it even if oif is unknown,
2436			   likely IPv6, but we do not.
2437			 */
2438
2439			if (fl4->saddr == 0)
2440				fl4->saddr = inet_select_addr(dev_out, 0,
2441							      RT_SCOPE_LINK);
2442			res->type = RTN_UNICAST;
2443			goto make_route;
2444		}
2445		rth = ERR_PTR(err);
2446		goto out;
2447	}
2448
2449	if (res->type == RTN_LOCAL) {
2450		if (!fl4->saddr) {
2451			if (res->fi->fib_prefsrc)
2452				fl4->saddr = res->fi->fib_prefsrc;
2453			else
2454				fl4->saddr = fl4->daddr;
2455		}
2456
2457		/* L3 master device is the loopback for that domain */
2458		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2459			net->loopback_dev;
2460
2461		/* make sure orig_oif points to fib result device even
2462		 * though packet rx/tx happens over loopback or l3mdev
2463		 */
2464		orig_oif = FIB_RES_OIF(*res);
2465
2466		fl4->flowi4_oif = dev_out->ifindex;
 
2467		flags |= RTCF_LOCAL;
2468		goto make_route;
2469	}
2470
	/* Non-local result: let fib_select_path() choose the path, then
	 * route out of the device of the resulting nexthop.
	 */
2471	fib_select_path(net, res, fl4, skb);
 
 
 
 
 
 
 
 
 
 
 
2472
2473	dev_out = FIB_RES_DEV(*res);
2474	fl4->flowi4_oif = dev_out->ifindex;
2475
2476
2477make_route:
2478	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
 
 
 
 
 
 
 
 
2479
2480out:
 
2481	return rth;
2482}
2483
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/*
 * .check callback of ipv4_dst_blackhole_ops: always returns NULL.
 * NOTE(review): presumably this tells the caller the dst is never
 * revalidated - confirm against the dst_ops.check contract.
 */
2484static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2485{
2486	return NULL;
2487}
2488
2489static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2490{
2491	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2492
2493	return mtu ? : dst->dev->mtu;
2494}
2495
/* .update_pmtu callback of ipv4_dst_blackhole_ops: deliberately a no-op. */
2496static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2497					  struct sk_buff *skb, u32 mtu)
2498{
2499}
2500
/* .redirect callback of ipv4_dst_blackhole_ops: deliberately a no-op. */
2501static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2502				       struct sk_buff *skb)
2503{
2504}
2505
/*
 * .cow_metrics callback of ipv4_dst_blackhole_ops: always returns NULL,
 * i.e. never hands out a metrics array.
 */
2506static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2507					  unsigned long old)
2508{
2509	return NULL;
2510}
2511
/*
 * dst_ops used for the routes created by ipv4_blackhole_route().  The
 * check/update_pmtu/redirect/cow_metrics hooks are the stubs defined
 * above; default_advmss and neigh_lookup reuse the regular IPv4 helpers.
 * Its kmem_cachep is set in ip_rt_init().
 */
2512static struct dst_ops ipv4_dst_blackhole_ops = {
2513	.family			=	AF_INET,
 
 
2514	.check			=	ipv4_blackhole_dst_check,
2515	.mtu			=	ipv4_blackhole_mtu,
2516	.default_advmss		=	ipv4_default_advmss,
2517	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2518	.redirect		=	ipv4_rt_blackhole_redirect,
2519	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2520	.neigh_lookup		=	ipv4_neigh_lookup,
2521};
2522
2523struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2524{
 
2525	struct rtable *ort = (struct rtable *) dst_orig;
2526	struct rtable *rt;
2527
2528	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2529	if (rt) {
2530		struct dst_entry *new = &rt->dst;
2531
2532		new->__use = 1;
2533		new->input = dst_discard;
2534		new->output = dst_discard_out;
 
2535
2536		new->dev = net->loopback_dev;
2537		if (new->dev)
2538			dev_hold(new->dev);
2539
2540		rt->rt_is_input = ort->rt_is_input;
 
 
 
2541		rt->rt_iif = ort->rt_iif;
2542		rt->rt_pmtu = ort->rt_pmtu;
2543		rt->rt_mtu_locked = ort->rt_mtu_locked;
2544
2545		rt->rt_genid = rt_genid_ipv4(net);
2546		rt->rt_flags = ort->rt_flags;
2547		rt->rt_type = ort->rt_type;
 
 
2548		rt->rt_gateway = ort->rt_gateway;
2549		rt->rt_uses_gateway = ort->rt_uses_gateway;
 
 
 
 
 
 
2550
2551		INIT_LIST_HEAD(&rt->rt_uncached);
2552	}
2553
2554	dst_release(dst_orig);
2555
2556	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2557}
2558
2559struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2560				    const struct sock *sk)
2561{
2562	struct rtable *rt = __ip_route_output_key(net, flp4);
2563
2564	if (IS_ERR(rt))
2565		return rt;
2566
2567	if (flp4->flowi4_proto)
2568		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2569							flowi4_to_flowi(flp4),
2570							sk, 0);
2571
2572	return rt;
2573}
2574EXPORT_SYMBOL_GPL(ip_route_output_flow);
2575
/*
 * rt_fill_info - append an RTM_NEWROUTE message describing skb's route.
 * @net:      namespace, used for the multicast-forwarding check
 * @dst:      reported as RTA_DST
 * @src:      reported as RTA_SRC when non-zero
 * @table_id: reported as RTA_TABLE; rtm_table carries it only when it is
 *            below 256, otherwise RT_TABLE_COMPAT
 * @fl4:      flow used for the lookup; tos, saddr, mark and uid come from it
 * @skb:      message buffer; the route described is skb_rtable(skb)
 * @portid:   netlink port id placed in the message header
 * @seq:      netlink sequence number placed in the message header
 *
 * Returns 0 on success and -EMSGSIZE when the message or one of its
 * attributes does not fit; in that case the partial message is cancelled.
 */
2576/* called with rcu_read_lock held */
2577static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2578			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2579			u32 seq)
2580{
2581	struct rtable *rt = skb_rtable(skb);
2582	struct rtmsg *r;
2583	struct nlmsghdr *nlh;
2584	unsigned long expires = 0;
2585	u32 error;
2586	u32 metrics[RTAX_MAX];
2587
2588	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2589	if (!nlh)
2590		return -EMSGSIZE;
2591
2592	r = nlmsg_data(nlh);
2593	r->rtm_family	 = AF_INET;
2594	r->rtm_dst_len	= 32;
2595	r->rtm_src_len	= 0;
2596	r->rtm_tos	= fl4->flowi4_tos;
2597	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2598	if (nla_put_u32(skb, RTA_TABLE, table_id))
2599		goto nla_put_failure;
2600	r->rtm_type	= rt->rt_type;
2601	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2602	r->rtm_protocol = RTPROT_UNSPEC;
2603	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604	if (rt->rt_flags & RTCF_NOTIFY)
2605		r->rtm_flags |= RTM_F_NOTIFY;
2606	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2607		r->rtm_flags |= RTCF_DOREDIRECT;
2608
2609	if (nla_put_in_addr(skb, RTA_DST, dst))
2610		goto nla_put_failure;
2611	if (src) {
2612		r->rtm_src_len = 32;
2613		if (nla_put_in_addr(skb, RTA_SRC, src))
2614			goto nla_put_failure;
2615	}
2616	if (rt->dst.dev &&
2617	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2618		goto nla_put_failure;
2619#ifdef CONFIG_IP_ROUTE_CLASSID
2620	if (rt->dst.tclassid &&
2621	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2622		goto nla_put_failure;
2623#endif
	/* For output routes, report the source address picked by the lookup
	 * when it differs from the one the caller asked about.
	 */
2624	if (!rt_is_input_route(rt) &&
2625	    fl4->saddr != src) {
2626		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
 
 
2627			goto nla_put_failure;
2628	}
2629	if (rt->rt_uses_gateway &&
2630	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2631		goto nla_put_failure;
2632
	/* Turn the absolute expiry into a remaining time; 0 if not set or
	 * already in the past.
	 */
2633	expires = rt->dst.expires;
2634	if (expires) {
2635		unsigned long now = jiffies;
2636
2637		if (time_before(now, expires))
2638			expires -= now;
2639		else
2640			expires = 0;
2641	}
2642
	/* Report a copy of the metrics, overlaid with the route's PMTU and
	 * MTU lock while the expiry is still pending.
	 */
2643	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2644	if (rt->rt_pmtu && expires)
2645		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2646	if (rt->rt_mtu_locked && expires)
2647		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2648	if (rtnetlink_put_metrics(skb, metrics) < 0)
2649		goto nla_put_failure;
2650
2651	if (fl4->flowi4_mark &&
2652	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2653		goto nla_put_failure;
2654
2655	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2656	    nla_put_u32(skb, RTA_UID,
2657			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2658		goto nla_put_failure;
2659
2660	error = rt->dst.error;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2661
2662	if (rt_is_input_route(rt)) {
2663#ifdef CONFIG_IP_MROUTE
 
 
2664		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2666			int err = ipmr_get_route(net, skb,
2667						 fl4->saddr, fl4->daddr,
2668						 r, portid);
2669
			/* NOTE(review): on err == 0 this returns without
			 * nlmsg_end(); presumably the multicast routing code
			 * has taken over the reply - confirm against
			 * ipmr_get_route().
			 */
2670			if (err <= 0) {
2671				if (err == 0)
2672					return 0;
2673				goto nla_put_failure;
 
 
 
 
 
 
2674			}
2675		} else
2676#endif
2677			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2678				goto nla_put_failure;
2679	}
2680
2681	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
 
2682		goto nla_put_failure;
2683
2684	nlmsg_end(skb, nlh);
2685	return 0;
2686
2687nla_put_failure:
2688	nlmsg_cancel(skb, nlh);
2689	return -EMSGSIZE;
2690}
2691
/*
 * inet_rtm_getroute - RTM_GETROUTE handler (registered in ip_rt_init()).
 * @in_skb: request skb; its socket gives the namespace and its netlink
 *          control block the port id the answer is sent to
 * @nlh:    request header, parsed against rtm_ipv4_policy
 * @extack: extended ack passed to nlmsg_parse()
 *
 * Performs an input lookup when RTA_IIF is given and an output lookup
 * otherwise, encodes the result with fib_dump_info() (RTM_F_FIB_MATCH) or
 * rt_fill_info(), and unicasts it to the requester.
 *
 * Returns the rtnl_unicast() result on success or a negative errno; the
 * reply skb is freed on every failure after its allocation.
 */
2692static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2693			     struct netlink_ext_ack *extack)
2694{
2695	struct net *net = sock_net(in_skb->sk);
2696	struct rtmsg *rtm;
2697	struct nlattr *tb[RTA_MAX+1];
2698	struct fib_result res = {};
2699	struct rtable *rt = NULL;
2700	struct flowi4 fl4;
2701	__be32 dst = 0;
2702	__be32 src = 0;
2703	u32 iif;
2704	int err;
2705	int mark;
2706	struct sk_buff *skb;
2707	u32 table_id = RT_TABLE_MAIN;
2708	kuid_t uid;
2709
2710	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2711			  extack);
2712	if (err < 0)
2713		goto errout;
2714
2715	rtm = nlmsg_data(nlh);
2716
2717	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2718	if (!skb) {
2719		err = -ENOBUFS;
2720		goto errout;
2721	}
2722
2723	/* Reserve room for dummy headers, this skb can pass
2724	   through good chunk of routing engine.
2725	 */
2726	skb_reset_mac_header(skb);
2727	skb_reset_network_header(skb);
2728
2729	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2730	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2731	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2732	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	/* Without RTA_UID, output lookups use the caller's uid and input
	 * lookups use INVALID_UID.
	 */
2733	if (tb[RTA_UID])
2734		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2735	else
2736		uid = (iif ? INVALID_UID : current_uid());
2737
2738	/* Bugfix: need to give ip_route_input enough of an IP header to
2739	 * not gag.
2740	 */
2741	ip_hdr(skb)->protocol = IPPROTO_UDP;
2742	ip_hdr(skb)->saddr = src;
2743	ip_hdr(skb)->daddr = dst;
2744
2745	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2746
2747	memset(&fl4, 0, sizeof(fl4));
2748	fl4.daddr = dst;
2749	fl4.saddr = src;
2750	fl4.flowi4_tos = rtm->rtm_tos;
2751	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2752	fl4.flowi4_mark = mark;
2753	fl4.flowi4_uid = uid;
2754
2755	rcu_read_lock();
2756
	/* With RTA_IIF the request is resolved as an input lookup on that
	 * device; otherwise it is an output lookup.
	 */
2757	if (iif) {
2758		struct net_device *dev;
2759
2760		dev = dev_get_by_index_rcu(net, iif);
2761		if (!dev) {
2762			err = -ENODEV;
2763			goto errout_free;
2764		}
2765
2766		skb->protocol	= htons(ETH_P_IP);
2767		skb->dev	= dev;
2768		skb->mark	= mark;
2769		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2770					 dev, &res);
 
2771
		/* A successful lookup can still yield an error route. */
2772		rt = skb_rtable(skb);
2773		if (err == 0 && rt->dst.error)
2774			err = -rt->dst.error;
2775	} else {
2776		fl4.flowi4_iif = LOOPBACK_IFINDEX;
2777		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
 
 
 
 
 
 
 
2778		err = 0;
2779		if (IS_ERR(rt))
2780			err = PTR_ERR(rt);
2781		else
2782			skb_dst_set(skb, &rt->dst);
2783	}
2784
2785	if (err)
2786		goto errout_free;
2787
 
2788	if (rtm->rtm_flags & RTM_F_NOTIFY)
2789		rt->rt_flags |= RTCF_NOTIFY;
2790
2791	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2792		table_id = res.table ? res.table->tb_id : 0;
2793
2794	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		/* No fib_info to dump: report the error tied to the result
		 * type, or -EHOSTUNREACH if that type carries none.
		 */
2795		if (!res.fi) {
2796			err = fib_props[res.type].error;
2797			if (!err)
2798				err = -EHOSTUNREACH;
2799			goto errout_free;
2800		}
2801		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2802				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2803				    rt->rt_type, res.prefix, res.prefixlen,
2804				    fl4.flowi4_tos, res.fi, 0);
2805	} else {
2806		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2807				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2808	}
2809	if (err < 0)
2810		goto errout_free;
2811
2812	rcu_read_unlock();
2813
2814	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2815errout:
2816	return err;
2817
	/* Failure after rcu_read_lock(): drop the lock and free the reply. */
2818errout_free:
2819	rcu_read_unlock();
2820	kfree_skb(skb);
2821	goto errout;
2822}
2823
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2824void ip_rt_multicast_event(struct in_device *in_dev)
2825{
2826	rt_cache_flush(dev_net(in_dev->dev));
2827}
2828
2829#ifdef CONFIG_SYSCTL
/*
 * Backing storage for entries of ipv4_route_table below: gc_interval,
 * gc_min_interval / gc_min_interval_ms and gc_elasticity.
 * ip_min_valid_pmtu is the lower bound (extra1) enforced on min_pmtu.
 */
2830static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2831static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2832static int ip_rt_gc_elasticity __read_mostly	= 8;
2833static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
2834
2835static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2836					void __user *buffer,
2837					size_t *lenp, loff_t *ppos)
2838{
2839	struct net *net = (struct net *)__ctl->extra1;
2840
2841	if (write) {
2842		rt_cache_flush(net);
2843		fnhe_genid_bump(net);
 
 
 
 
 
 
 
 
2844		return 0;
2845	}
2846
2847	return -EINVAL;
2848}
2849
/*
 * Sysctl entries registered under "net/ipv4/route" for init_net by
 * ip_static_sysctl_init().  Every entry is a mode 0644 int.  The handler
 * decides the unit seen by user space: proc_dointvec exposes the raw
 * value, while the *_jiffies handlers convert to and from jiffies.
 * gc_min_interval and gc_min_interval_ms share one variable.
 */
2850static struct ctl_table ipv4_route_table[] = {
2851	{
2852		.procname	= "gc_thresh",
2853		.data		= &ipv4_dst_ops.gc_thresh,
2854		.maxlen		= sizeof(int),
2855		.mode		= 0644,
2856		.proc_handler	= proc_dointvec,
2857	},
2858	{
2859		.procname	= "max_size",
2860		.data		= &ip_rt_max_size,
2861		.maxlen		= sizeof(int),
2862		.mode		= 0644,
2863		.proc_handler	= proc_dointvec,
2864	},
2865	{
2866		/*  Deprecated. Use gc_min_interval_ms */
2867
2868		.procname	= "gc_min_interval",
2869		.data		= &ip_rt_gc_min_interval,
2870		.maxlen		= sizeof(int),
2871		.mode		= 0644,
2872		.proc_handler	= proc_dointvec_jiffies,
2873	},
2874	{
2875		.procname	= "gc_min_interval_ms",
2876		.data		= &ip_rt_gc_min_interval,
2877		.maxlen		= sizeof(int),
2878		.mode		= 0644,
2879		.proc_handler	= proc_dointvec_ms_jiffies,
2880	},
2881	{
2882		.procname	= "gc_timeout",
2883		.data		= &ip_rt_gc_timeout,
2884		.maxlen		= sizeof(int),
2885		.mode		= 0644,
2886		.proc_handler	= proc_dointvec_jiffies,
2887	},
2888	{
2889		.procname	= "gc_interval",
2890		.data		= &ip_rt_gc_interval,
2891		.maxlen		= sizeof(int),
2892		.mode		= 0644,
2893		.proc_handler	= proc_dointvec_jiffies,
2894	},
2895	{
2896		.procname	= "redirect_load",
2897		.data		= &ip_rt_redirect_load,
2898		.maxlen		= sizeof(int),
2899		.mode		= 0644,
2900		.proc_handler	= proc_dointvec,
2901	},
2902	{
2903		.procname	= "redirect_number",
2904		.data		= &ip_rt_redirect_number,
2905		.maxlen		= sizeof(int),
2906		.mode		= 0644,
2907		.proc_handler	= proc_dointvec,
2908	},
2909	{
2910		.procname	= "redirect_silence",
2911		.data		= &ip_rt_redirect_silence,
2912		.maxlen		= sizeof(int),
2913		.mode		= 0644,
2914		.proc_handler	= proc_dointvec,
2915	},
2916	{
2917		.procname	= "error_cost",
2918		.data		= &ip_rt_error_cost,
2919		.maxlen		= sizeof(int),
2920		.mode		= 0644,
2921		.proc_handler	= proc_dointvec,
2922	},
2923	{
2924		.procname	= "error_burst",
2925		.data		= &ip_rt_error_burst,
2926		.maxlen		= sizeof(int),
2927		.mode		= 0644,
2928		.proc_handler	= proc_dointvec,
2929	},
2930	{
2931		.procname	= "gc_elasticity",
2932		.data		= &ip_rt_gc_elasticity,
2933		.maxlen		= sizeof(int),
2934		.mode		= 0644,
2935		.proc_handler	= proc_dointvec,
2936	},
2937	{
2938		.procname	= "mtu_expires",
2939		.data		= &ip_rt_mtu_expires,
2940		.maxlen		= sizeof(int),
2941		.mode		= 0644,
2942		.proc_handler	= proc_dointvec_jiffies,
2943	},
2944	{
		/* Writes below ip_min_valid_pmtu are refused via extra1. */
2945		.procname	= "min_pmtu",
2946		.data		= &ip_rt_min_pmtu,
2947		.maxlen		= sizeof(int),
2948		.mode		= 0644,
2949		.proc_handler	= proc_dointvec_minmax,
2950		.extra1		= &ip_min_valid_pmtu,
2951	},
2952	{
2953		.procname	= "min_adv_mss",
2954		.data		= &ip_rt_min_advmss,
2955		.maxlen		= sizeof(int),
2956		.mode		= 0644,
2957		.proc_handler	= proc_dointvec,
2958	},
2959	{ }
2960};
2961
/*
 * Template for the per-namespace, write-only (0200) "flush" sysctl.
 * sysctl_route_net_init() uses this table directly for init_net and a
 * kmemdup()ed copy for every other namespace, storing the namespace in
 * extra1 of the first entry.
 */
2962static struct ctl_table ipv4_route_flush_table[] = {
2963	{
2964		.procname	= "flush",
2965		.maxlen		= sizeof(int),
2966		.mode		= 0200,
2967		.proc_handler	= ipv4_sysctl_rtcache_flush,
2968	},
2969	{ },
2970};
2971
2972static __net_init int sysctl_route_net_init(struct net *net)
2973{
2974	struct ctl_table *tbl;
2975
2976	tbl = ipv4_route_flush_table;
2977	if (!net_eq(net, &init_net)) {
2978		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2979		if (!tbl)
2980			goto err_dup;
2981
2982		/* Don't export sysctls to unprivileged users */
2983		if (net->user_ns != &init_user_ns)
2984			tbl[0].procname = NULL;
2985	}
2986	tbl[0].extra1 = net;
2987
2988	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2989	if (!net->ipv4.route_hdr)
2990		goto err_reg;
2991	return 0;
2992
2993err_reg:
2994	if (tbl != ipv4_route_flush_table)
2995		kfree(tbl);
2996err_dup:
2997	return -ENOMEM;
2998}
2999
3000static __net_exit void sysctl_route_net_exit(struct net *net)
3001{
3002	struct ctl_table *tbl;
3003
3004	tbl = net->ipv4.route_hdr->ctl_table_arg;
3005	unregister_net_sysctl_table(net->ipv4.route_hdr);
3006	BUG_ON(tbl == ipv4_route_flush_table);
3007	kfree(tbl);
3008}
3009
/*
 * Per-namespace hooks for the "flush" sysctl; registered from
 * ip_rt_init() when CONFIG_SYSCTL is set.
 */
3010static __net_initdata struct pernet_operations sysctl_route_ops = {
3011	.init = sysctl_route_net_init,
3012	.exit = sysctl_route_net_exit,
3013};
3014#endif
3015
/*
 * Per-namespace init of the generation counters: the route and
 * nexthop-exception generations start at 0, the device-address
 * generation at a random value.  Always returns 0.
 */
3016static __net_init int rt_genid_init(struct net *net)
3017{
3018	atomic_set(&net->ipv4.rt_genid, 0);
3019	atomic_set(&net->fnhe_genid, 0);
3020	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
 
3021	return 0;
3022}
3023
/* Per-namespace hook for rt_genid_init(); there is nothing to undo on exit. */
3024static __net_initdata struct pernet_operations rt_genid_ops = {
3025	.init = rt_genid_init,
3026};
3027
3028static int __net_init ipv4_inetpeer_init(struct net *net)
3029{
3030	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3031
3032	if (!bp)
3033		return -ENOMEM;
3034	inet_peer_base_init(bp);
3035	net->ipv4.peers = bp;
3036	return 0;
3037}
3038
3039static void __net_exit ipv4_inetpeer_exit(struct net *net)
3040{
3041	struct inet_peer_base *bp = net->ipv4.peers;
3042
3043	net->ipv4.peers = NULL;
3044	inetpeer_invalidate_tree(bp);
3045	kfree(bp);
3046}
3047
/* Per-namespace hooks managing net->ipv4.peers; registered from ip_rt_init(). */
3048static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3049	.init	=	ipv4_inetpeer_init,
3050	.exit	=	ipv4_inetpeer_exit,
3051};
3052
/*
 * Per-cpu accounting area, allocated in ip_rt_init() as 256 entries of
 * struct ip_rt_acct per cpu.
 */
3053#ifdef CONFIG_IP_ROUTE_CLASSID
3054struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3055#endif /* CONFIG_IP_ROUTE_CLASSID */
3056
/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing code.
 *
 * Allocates the IP identifier and timestamp arrays, initialises the
 * per-cpu uncached-route lists, creates the dst slab cache (shared with
 * the blackhole ops), then initialises devinet, the FIB, the proc files
 * and (with CONFIG_XFRM) xfrm, and finally registers the RTM_GETROUTE
 * handler and the per-namespace operations.
 *
 * Allocation failures panic; a proc failure is only logged.  Always
 * returns 0.
 * NOTE(review): the return values of register_pernet_subsys() are ignored.
 */
3057int __init ip_rt_init(void)
 
3058{
3059	int cpu;
3060
3061	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3062	if (!ip_idents)
3063		panic("IP: failed to allocate ip_idents\n");
3064
	/* Start the identifiers from random values. */
3065	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
 
 
3066
3067	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3068	if (!ip_tstamps)
3069		panic("IP: failed to allocate ip_tstamps\n");
3070
3071	for_each_possible_cpu(cpu) {
3072		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
 
3073
3074		INIT_LIST_HEAD(&ul->head);
3075		spin_lock_init(&ul->lock);
3076	}
3077#ifdef CONFIG_IP_ROUTE_CLASSID
3078	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3079	if (!ip_rt_acct)
3080		panic("IP: failed to allocate ip_rt_acct\n");
3081#endif
3082
	/* SLAB_PANIC: cache creation failure panics, so no NULL check. */
3083	ipv4_dst_ops.kmem_cachep =
3084		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3085				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3086
	/* Blackhole routes come from the same slab cache. */
3087	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3088
3089	if (dst_entries_init(&ipv4_dst_ops) < 0)
3090		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3091
3092	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3093		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3094
	/* Defaults for the gc_thresh and max_size sysctls: the largest values. */
3095	ipv4_dst_ops.gc_thresh = ~0;
3096	ip_rt_max_size = INT_MAX;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3097
3098	devinet_init();
3099	ip_fib_init();
3100
 
 
 
 
 
3101	if (ip_rt_proc_init())
3102		pr_err("Unable to create route proc files\n");
3103#ifdef CONFIG_XFRM
3104	xfrm_init();
3105	xfrm4_init();
3106#endif
3107	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3108		      RTNL_FLAG_DOIT_UNLOCKED);
3109
3110#ifdef CONFIG_SYSCTL
3111	register_pernet_subsys(&sysctl_route_ops);
3112#endif
3113	register_pernet_subsys(&rt_genid_ops);
3114	register_pernet_subsys(&ipv4_inetpeer_ops);
3115	return 0;
3116}
3117
3118#ifdef CONFIG_SYSCTL
3119/*
3120 * We really need to sanitize the damn ipv4 init order, then all
3121 * this nonsense will go away.
3122 */
/*
 * Registers ipv4_route_table under "net/ipv4/route" for init_net.  The
 * returned sysctl header is not kept.
 */
3123void __init ip_static_sysctl_init(void)
3124{
3125	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3126}
3127#endif

   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		ROUTE - implementation of the IP router.
   7 *
   8 * Authors:	Ross Biro
   9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *		Alan Cox	:	Verify area fixes.
  16 *		Alan Cox	:	cli() protects routing changes
  17 *		Rui Oliveira	:	ICMP routing table updates
  18 *		(rco@di.uminho.pt)	Routing table insertion and update
  19 *		Linus Torvalds	:	Rewrote bits to be sensible
  20 *		Alan Cox	:	Added BSD route gw semantics
  21 *		Alan Cox	:	Super /proc >4K
  22 *		Alan Cox	:	MTU in route table
  23 *		Alan Cox	: 	MSS actually. Also added the window
  24 *					clamper.
  25 *		Sam Lantinga	:	Fixed route matching in rt_del()
  26 *		Alan Cox	:	Routing cache support.
  27 *		Alan Cox	:	Removed compatibility cruft.
  28 *		Alan Cox	:	RTF_REJECT support.
  29 *		Alan Cox	:	TCP irtt support.
  30 *		Jonathan Naylor	:	Added Metric support.
  31 *	Miquel van Smoorenburg	:	BSD API fixes.
  32 *	Miquel van Smoorenburg	:	Metrics.
  33 *		Alan Cox	:	Use __u32 properly
  34 *		Alan Cox	:	Aligned routing errors more closely with BSD
  35 *					our system is still very different.
  36 *		Alan Cox	:	Faster /proc handling
  37 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  38 *					routing caches and better behaviour.
  39 *
  40 *		Olaf Erb	:	irtt wasn't being copied right.
  41 *		Bjorn Ekwall	:	Kerneld route support.
  42 *		Alan Cox	:	Multicast fixed (I hope)
  43 * 		Pavel Krauz	:	Limited broadcast fixed
  44 *		Mike McLagan	:	Routing by source
  45 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  46 *					route.c and rewritten from scratch.
  47 *		Andi Kleen	:	Load-limit warning messages.
  48 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  49 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  50 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  51 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  52 *		Marc Boucher	:	routing by fwmark
  53 *	Robert Olsson		:	Added rt_cache statistics
  54 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  55 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  56 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  57 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  58 *
  59 *		This program is free software; you can redistribute it and/or
  60 *		modify it under the terms of the GNU General Public License
  61 *		as published by the Free Software Foundation; either version
  62 *		2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/bootmem.h>
  74#include <linux/string.h>
  75#include <linux/socket.h>
  76#include <linux/sockios.h>
  77#include <linux/errno.h>
  78#include <linux/in.h>
  79#include <linux/inet.h>
  80#include <linux/netdevice.h>
  81#include <linux/proc_fs.h>
  82#include <linux/init.h>
  83#include <linux/workqueue.h>
  84#include <linux/skbuff.h>
  85#include <linux/inetdevice.h>
  86#include <linux/igmp.h>
  87#include <linux/pkt_sched.h>
  88#include <linux/mroute.h>
  89#include <linux/netfilter_ipv4.h>
  90#include <linux/random.h>
  91#include <linux/jhash.h>
  92#include <linux/rcupdate.h>
  93#include <linux/times.h>
  94#include <linux/slab.h>
  95#include <linux/prefetch.h>
  96#include <net/dst.h>
 
  97#include <net/net_namespace.h>
  98#include <net/protocol.h>
  99#include <net/ip.h>
 100#include <net/route.h>
 101#include <net/inetpeer.h>
 102#include <net/sock.h>
 103#include <net/ip_fib.h>
 104#include <net/arp.h>
 105#include <net/tcp.h>
 106#include <net/icmp.h>
 107#include <net/xfrm.h>
 
 108#include <net/netevent.h>
 109#include <net/rtnetlink.h>
 110#ifdef CONFIG_SYSCTL
 111#include <linux/sysctl.h>
 112#include <linux/kmemleak.h>
 113#endif
 114#include <net/secure_seq.h>
 
 
 
 
 115
 116#define RT_FL_TOS(oldflp4) \
 117	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119#define IP_MAX_MTU	0xFFF0
 120
 121#define RT_GC_TIMEOUT (300*HZ)
 122
 123static int ip_rt_max_size;
 124static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 125static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
 127static int ip_rt_redirect_number __read_mostly	= 9;
 128static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 129static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 130static int ip_rt_error_cost __read_mostly	= HZ;
 131static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 132static int ip_rt_gc_elasticity __read_mostly	= 8;
 133static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 134static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 135static int ip_rt_min_advmss __read_mostly	= 256;
 136static int rt_chain_length_max __read_mostly	= 20;
 137
 138static struct delayed_work expires_work;
 139static unsigned long expires_ljiffies;
 140
 141/*
 142 *	Interface to generic destination cache.
 143 */
 144
 145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 147static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 148static void		 ipv4_dst_destroy(struct dst_entry *dst);
 149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150static void		 ipv4_link_failure(struct sk_buff *skb);
 151static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152static int rt_garbage_collect(struct dst_ops *ops);
 153
 154static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155			    int how)
 156{
 157}
 158
 159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160{
 161	struct rtable *rt = (struct rtable *) dst;
 162	struct inet_peer *peer;
 163	u32 *p = NULL;
 164
 165	if (!rt->peer)
 166		rt_bind_peer(rt, rt->rt_dst, 1);
 167
 168	peer = rt->peer;
 169	if (peer) {
 170		u32 *old_p = __DST_METRICS_PTR(old);
 171		unsigned long prev, new;
 172
 173		p = peer->metrics;
 174		if (inet_metrics_new(peer))
 175			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 176
 177		new = (unsigned long) p;
 178		prev = cmpxchg(&dst->_metrics, old, new);
 179
 180		if (prev != old) {
 181			p = __DST_METRICS_PTR(prev);
 182			if (prev & DST_METRICS_READ_ONLY)
 183				p = NULL;
 184		} else {
 185			if (rt->fi) {
 186				fib_info_put(rt->fi);
 187				rt->fi = NULL;
 188			}
 189		}
 190	}
 191	return p;
 192}
 193
 194static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 
 
 
 195
 196static struct dst_ops ipv4_dst_ops = {
 197	.family =		AF_INET,
 198	.protocol =		cpu_to_be16(ETH_P_IP),
 199	.gc =			rt_garbage_collect,
 200	.check =		ipv4_dst_check,
 201	.default_advmss =	ipv4_default_advmss,
 202	.mtu =			ipv4_mtu,
 203	.cow_metrics =		ipv4_cow_metrics,
 204	.destroy =		ipv4_dst_destroy,
 205	.ifdown =		ipv4_dst_ifdown,
 206	.negative_advice =	ipv4_negative_advice,
 207	.link_failure =		ipv4_link_failure,
 208	.update_pmtu =		ip_rt_update_pmtu,
 
 209	.local_out =		__ip_local_out,
 210	.neigh_lookup =		ipv4_neigh_lookup,
 
 211};
 212
 213#define ECN_OR_COST(class)	TC_PRIO_##class
 214
 215const __u8 ip_tos2prio[16] = {
 216	TC_PRIO_BESTEFFORT,
 217	ECN_OR_COST(BESTEFFORT),
 218	TC_PRIO_BESTEFFORT,
 219	ECN_OR_COST(BESTEFFORT),
 220	TC_PRIO_BULK,
 221	ECN_OR_COST(BULK),
 222	TC_PRIO_BULK,
 223	ECN_OR_COST(BULK),
 224	TC_PRIO_INTERACTIVE,
 225	ECN_OR_COST(INTERACTIVE),
 226	TC_PRIO_INTERACTIVE,
 227	ECN_OR_COST(INTERACTIVE),
 228	TC_PRIO_INTERACTIVE_BULK,
 229	ECN_OR_COST(INTERACTIVE_BULK),
 230	TC_PRIO_INTERACTIVE_BULK,
 231	ECN_OR_COST(INTERACTIVE_BULK)
 232};
 233EXPORT_SYMBOL(ip_tos2prio);
 234
 235/*
 236 * Route cache.
 237 */
 238
 239/* The locking scheme is rather straight forward:
 240 *
 241 * 1) Read-Copy Update protects the buckets of the central route hash.
 242 * 2) Only writers remove entries, and they hold the lock
 243 *    as they look at rtable reference counts.
 244 * 3) Only readers acquire references to rtable entries,
 245 *    they do so with atomic increments and with the
 246 *    lock held.
 247 */
 248
 249struct rt_hash_bucket {
 250	struct rtable __rcu	*chain;
 251};
 252
 253#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 254	defined(CONFIG_PROVE_LOCKING)
 255/*
 256 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 257 * The size of this table is a power of two and depends on the number of CPUS.
 258 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 259 */
 260#ifdef CONFIG_LOCKDEP
 261# define RT_HASH_LOCK_SZ	256
 262#else
 263# if NR_CPUS >= 32
 264#  define RT_HASH_LOCK_SZ	4096
 265# elif NR_CPUS >= 16
 266#  define RT_HASH_LOCK_SZ	2048
 267# elif NR_CPUS >= 8
 268#  define RT_HASH_LOCK_SZ	1024
 269# elif NR_CPUS >= 4
 270#  define RT_HASH_LOCK_SZ	512
 271# else
 272#  define RT_HASH_LOCK_SZ	256
 273# endif
 274#endif
 275
 276static spinlock_t	*rt_hash_locks;
 277# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 278
 279static __init void rt_hash_lock_init(void)
 280{
 281	int i;
 282
 283	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 284			GFP_KERNEL);
 285	if (!rt_hash_locks)
 286		panic("IP: failed to allocate rt_hash_locks\n");
 287
 288	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 289		spin_lock_init(&rt_hash_locks[i]);
 290}
 291#else
 292# define rt_hash_lock_addr(slot) NULL
 293
 294static inline void rt_hash_lock_init(void)
 295{
 296}
 297#endif
 298
 299static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
 300static unsigned int		rt_hash_mask __read_mostly;
 301static unsigned int		rt_hash_log  __read_mostly;
 302
 303static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 304#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 305
 306static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 307				   int genid)
 308{
 309	return jhash_3words((__force u32)daddr, (__force u32)saddr,
 310			    idx, genid)
 311		& rt_hash_mask;
 312}
 313
 314static inline int rt_genid(struct net *net)
 315{
 316	return atomic_read(&net->ipv4.rt_genid);
 317}
 318
 319#ifdef CONFIG_PROC_FS
 320struct rt_cache_iter_state {
 321	struct seq_net_private p;
 322	int bucket;
 323	int genid;
 324};
 325
 326static struct rtable *rt_cache_get_first(struct seq_file *seq)
 327{
 328	struct rt_cache_iter_state *st = seq->private;
 329	struct rtable *r = NULL;
 330
 331	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 332		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 333			continue;
 334		rcu_read_lock_bh();
 335		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 336		while (r) {
 337			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 338			    r->rt_genid == st->genid)
 339				return r;
 340			r = rcu_dereference_bh(r->dst.rt_next);
 341		}
 342		rcu_read_unlock_bh();
 343	}
 344	return r;
 345}
 346
 347static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 348					  struct rtable *r)
 349{
 350	struct rt_cache_iter_state *st = seq->private;
 351
 352	r = rcu_dereference_bh(r->dst.rt_next);
 353	while (!r) {
 354		rcu_read_unlock_bh();
 355		do {
 356			if (--st->bucket < 0)
 357				return NULL;
 358		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 359		rcu_read_lock_bh();
 360		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 361	}
 362	return r;
 363}
 364
 365static struct rtable *rt_cache_get_next(struct seq_file *seq,
 366					struct rtable *r)
 367{
 368	struct rt_cache_iter_state *st = seq->private;
 369	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 370		if (dev_net(r->dst.dev) != seq_file_net(seq))
 371			continue;
 372		if (r->rt_genid == st->genid)
 373			break;
 374	}
 375	return r;
 376}
 377
 378static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 379{
 380	struct rtable *r = rt_cache_get_first(seq);
 381
 382	if (r)
 383		while (pos && (r = rt_cache_get_next(seq, r)))
 384			--pos;
 385	return pos ? NULL : r;
 386}
 387
 388static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 389{
 390	struct rt_cache_iter_state *st = seq->private;
 391	if (*pos)
 392		return rt_cache_get_idx(seq, *pos - 1);
 393	st->genid = rt_genid(seq_file_net(seq));
 394	return SEQ_START_TOKEN;
 395}
 396
 397static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 398{
 399	struct rtable *r;
 400
 401	if (v == SEQ_START_TOKEN)
 402		r = rt_cache_get_first(seq);
 403	else
 404		r = rt_cache_get_next(seq, v);
 405	++*pos;
 406	return r;
 407}
 408
 409static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 410{
 411	if (v && v != SEQ_START_TOKEN)
 412		rcu_read_unlock_bh();
 413}
 414
 415static int rt_cache_seq_show(struct seq_file *seq, void *v)
 416{
 417	if (v == SEQ_START_TOKEN)
 418		seq_printf(seq, "%-127s\n",
 419			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 420			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 421			   "HHUptod\tSpecDst");
 422	else {
 423		struct rtable *r = v;
 424		struct neighbour *n;
 425		int len, HHUptod;
 426
 427		rcu_read_lock();
 428		n = dst_get_neighbour_noref(&r->dst);
 429		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 430		rcu_read_unlock();
 431
 432		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 433			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 434			r->dst.dev ? r->dst.dev->name : "*",
 435			(__force u32)r->rt_dst,
 436			(__force u32)r->rt_gateway,
 437			r->rt_flags, atomic_read(&r->dst.__refcnt),
 438			r->dst.__use, 0, (__force u32)r->rt_src,
 439			dst_metric_advmss(&r->dst) + 40,
 440			dst_metric(&r->dst, RTAX_WINDOW),
 441			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 442			      dst_metric(&r->dst, RTAX_RTTVAR)),
 443			r->rt_key_tos,
 444			-1,
 445			HHUptod,
 446			r->rt_spec_dst, &len);
 447
 448		seq_printf(seq, "%*s\n", 127 - len, "");
 449	}
 450	return 0;
 451}
 452
 453static const struct seq_operations rt_cache_seq_ops = {
 454	.start  = rt_cache_seq_start,
 455	.next   = rt_cache_seq_next,
 456	.stop   = rt_cache_seq_stop,
 457	.show   = rt_cache_seq_show,
 458};
 459
 460static int rt_cache_seq_open(struct inode *inode, struct file *file)
 461{
 462	return seq_open_net(inode, file, &rt_cache_seq_ops,
 463			sizeof(struct rt_cache_iter_state));
 464}
 465
 466static const struct file_operations rt_cache_seq_fops = {
 467	.owner	 = THIS_MODULE,
 468	.open	 = rt_cache_seq_open,
 469	.read	 = seq_read,
 470	.llseek	 = seq_lseek,
 471	.release = seq_release_net,
 472};
 473
 474
 475static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 476{
 477	int cpu;
 478
 479	if (*pos == 0)
 480		return SEQ_START_TOKEN;
 481
 482	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 483		if (!cpu_possible(cpu))
 484			continue;
 485		*pos = cpu+1;
 486		return &per_cpu(rt_cache_stat, cpu);
 487	}
 488	return NULL;
 489}
 490
 491static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 492{
 493	int cpu;
 494
 495	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 496		if (!cpu_possible(cpu))
 497			continue;
 498		*pos = cpu+1;
 499		return &per_cpu(rt_cache_stat, cpu);
 500	}
 501	return NULL;
 502
 503}
 504
 505static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 506{
 507
 508}
 509
 510static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 511{
 512	struct rt_cache_stat *st = v;
 513
 514	if (v == SEQ_START_TOKEN) {
 515		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 516		return 0;
 517	}
 518
 519	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 520		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 521		   dst_entries_get_slow(&ipv4_dst_ops),
 522		   st->in_hit,
 523		   st->in_slow_tot,
 524		   st->in_slow_mc,
 525		   st->in_no_route,
 526		   st->in_brd,
 527		   st->in_martian_dst,
 528		   st->in_martian_src,
 529
 530		   st->out_hit,
 531		   st->out_slow_tot,
 532		   st->out_slow_mc,
 533
 534		   st->gc_total,
 535		   st->gc_ignored,
 536		   st->gc_goal_miss,
 537		   st->gc_dst_overflow,
 538		   st->in_hlist_search,
 539		   st->out_hlist_search
 540		);
 541	return 0;
 542}
 543
 544static const struct seq_operations rt_cpu_seq_ops = {
 545	.start  = rt_cpu_seq_start,
 546	.next   = rt_cpu_seq_next,
 547	.stop   = rt_cpu_seq_stop,
 548	.show   = rt_cpu_seq_show,
 549};
 550
 551
 552static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 553{
 554	return seq_open(file, &rt_cpu_seq_ops);
 555}
 556
 557static const struct file_operations rt_cpu_seq_fops = {
 558	.owner	 = THIS_MODULE,
 559	.open	 = rt_cpu_seq_open,
 560	.read	 = seq_read,
 561	.llseek	 = seq_lseek,
 562	.release = seq_release,
 563};
 564
 565#ifdef CONFIG_IP_ROUTE_CLASSID
 566static int rt_acct_proc_show(struct seq_file *m, void *v)
 567{
 568	struct ip_rt_acct *dst, *src;
 569	unsigned int i, j;
 570
 571	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 572	if (!dst)
 573		return -ENOMEM;
 574
 575	for_each_possible_cpu(i) {
 576		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 577		for (j = 0; j < 256; j++) {
 578			dst[j].o_bytes   += src[j].o_bytes;
 579			dst[j].o_packets += src[j].o_packets;
 580			dst[j].i_bytes   += src[j].i_bytes;
 581			dst[j].i_packets += src[j].i_packets;
 582		}
 583	}
 584
 585	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 586	kfree(dst);
 587	return 0;
 588}
 589
 590static int rt_acct_proc_open(struct inode *inode, struct file *file)
 591{
 592	return single_open(file, rt_acct_proc_show, NULL);
 593}
 594
 595static const struct file_operations rt_acct_proc_fops = {
 596	.owner		= THIS_MODULE,
 597	.open		= rt_acct_proc_open,
 598	.read		= seq_read,
 599	.llseek		= seq_lseek,
 600	.release	= single_release,
 601};
 602#endif
 603
 604static int __net_init ip_rt_do_proc_init(struct net *net)
 605{
 606	struct proc_dir_entry *pde;
 607
 608	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 609			&rt_cache_seq_fops);
 610	if (!pde)
 611		goto err1;
 612
 613	pde = proc_create("rt_cache", S_IRUGO,
 614			  net->proc_net_stat, &rt_cpu_seq_fops);
 615	if (!pde)
 616		goto err2;
 617
 618#ifdef CONFIG_IP_ROUTE_CLASSID
 619	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 620	if (!pde)
 621		goto err3;
 622#endif
 623	return 0;
 624
 625#ifdef CONFIG_IP_ROUTE_CLASSID
 626err3:
 627	remove_proc_entry("rt_cache", net->proc_net_stat);
 628#endif
 629err2:
 630	remove_proc_entry("rt_cache", net->proc_net);
 631err1:
 632	return -ENOMEM;
 633}
 634
 635static void __net_exit ip_rt_do_proc_exit(struct net *net)
 636{
 637	remove_proc_entry("rt_cache", net->proc_net_stat);
 638	remove_proc_entry("rt_cache", net->proc_net);
 639#ifdef CONFIG_IP_ROUTE_CLASSID
 640	remove_proc_entry("rt_acct", net->proc_net);
 641#endif
 642}
 643
 644static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 645	.init = ip_rt_do_proc_init,
 646	.exit = ip_rt_do_proc_exit,
 647};
 648
 649static int __init ip_rt_proc_init(void)
 650{
 651	return register_pernet_subsys(&ip_rt_proc_ops);
 652}
 653
 654#else
 655static inline int ip_rt_proc_init(void)
 656{
 657	return 0;
 658}
 659#endif /* CONFIG_PROC_FS */
 660
 661static inline void rt_free(struct rtable *rt)
 662{
 663	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 664}
 665
 666static inline void rt_drop(struct rtable *rt)
 667{
 668	ip_rt_put(rt);
 669	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 670}
 671
 672static inline int rt_fast_clean(struct rtable *rth)
 
 
 673{
 674	/* Kill broadcast/multicast entries very aggresively, if they
 675	   collide in hash table with more useful entries */
 676	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 677		rt_is_input_route(rth) && rth->dst.rt_next;
 678}
 679
 680static inline int rt_valuable(struct rtable *rth)
 681{
 682	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 683		(rth->peer && rth->peer->pmtu_expires);
 684}
 685
 686static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 687{
 688	unsigned long age;
 689	int ret = 0;
 690
 691	if (atomic_read(&rth->dst.__refcnt))
 692		goto out;
 693
 694	age = jiffies - rth->dst.lastuse;
 695	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 696	    (age <= tmo2 && rt_valuable(rth)))
 697		goto out;
 698	ret = 1;
 699out:	return ret;
 700}
 701
 702/* Bits of score are:
 703 * 31: very valuable
 704 * 30: not quite useless
 705 * 29..0: usage counter
 706 */
 707static inline u32 rt_score(struct rtable *rt)
 708{
 709	u32 score = jiffies - rt->dst.lastuse;
 
 
 710
 711	score = ~score & ~(3<<30);
 712
 713	if (rt_valuable(rt))
 714		score |= (1<<31);
 715
 716	if (rt_is_output_route(rt) ||
 717	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 718		score |= (1<<30);
 719
 720	return score;
 721}
 722
 723static inline bool rt_caching(const struct net *net)
 724{
 725	return net->ipv4.current_rt_cache_rebuild_count <=
 726		net->ipv4.sysctl_rt_cache_rebuild_count;
 727}
 728
 729static inline bool compare_hash_inputs(const struct rtable *rt1,
 730				       const struct rtable *rt2)
 731{
 732	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 733		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 734		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 735}
 736
 737static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 738{
 739	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 740		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 741		(rt1->rt_mark ^ rt2->rt_mark) |
 742		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
 743		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
 744		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
 745}
 746
 747static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 748{
 749	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 750}
 751
 752static inline int rt_is_expired(struct rtable *rth)
 753{
 754	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 755}
 756
 757/*
 758 * Perform a full scan of hash table and free all entries.
 759 * Can be called by a softirq or a process.
 760 * In the later case, we want to be reschedule if necessary
 761 */
 762static void rt_do_flush(struct net *net, int process_context)
 763{
 764	unsigned int i;
 765	struct rtable *rth, *next;
 
 
 
 766
 767	for (i = 0; i <= rt_hash_mask; i++) {
 768		struct rtable __rcu **pprev;
 769		struct rtable *list;
 770
 771		if (process_context && need_resched())
 772			cond_resched();
 773		rth = rcu_access_pointer(rt_hash_table[i].chain);
 774		if (!rth)
 775			continue;
 776
 777		spin_lock_bh(rt_hash_lock_addr(i));
 778
 779		list = NULL;
 780		pprev = &rt_hash_table[i].chain;
 781		rth = rcu_dereference_protected(*pprev,
 782			lockdep_is_held(rt_hash_lock_addr(i)));
 783
 784		while (rth) {
 785			next = rcu_dereference_protected(rth->dst.rt_next,
 786				lockdep_is_held(rt_hash_lock_addr(i)));
 787
 788			if (!net ||
 789			    net_eq(dev_net(rth->dst.dev), net)) {
 790				rcu_assign_pointer(*pprev, next);
 791				rcu_assign_pointer(rth->dst.rt_next, list);
 792				list = rth;
 793			} else {
 794				pprev = &rth->dst.rt_next;
 795			}
 796			rth = next;
 797		}
 798
 799		spin_unlock_bh(rt_hash_lock_addr(i));
 800
 801		for (; list; list = next) {
 802			next = rcu_dereference_protected(list->dst.rt_next, 1);
 803			rt_free(list);
 804		}
 805	}
 806}
 
 807
 808/*
 809 * While freeing expired entries, we compute average chain length
 810 * and standard deviation, using fixed-point arithmetic.
 811 * This to have an estimation of rt_chain_length_max
 812 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
 813 * We use 3 bits for frational part, and 29 (or 61) for magnitude.
 814 */
 815
 816#define FRACT_BITS 3
 817#define ONE (1UL << FRACT_BITS)
 818
 819/*
 820 * Given a hash chain and an item in this hash chain,
 821 * find if a previous entry has the same hash_inputs
 822 * (but differs on tos, mark or oif)
 823 * Returns 0 if an alias is found.
 824 * Returns ONE if rth has no alias before itself.
 825 */
 826static int has_noalias(const struct rtable *head, const struct rtable *rth)
 827{
 828	const struct rtable *aux = head;
 829
 830	while (aux != rth) {
 831		if (compare_hash_inputs(aux, rth))
 832			return 0;
 833		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 834	}
 835	return ONE;
 836}
 
 837
 838static void rt_check_expire(void)
 
 
 
 
 839{
 840	static unsigned int rover;
 841	unsigned int i = rover, goal;
 842	struct rtable *rth;
 843	struct rtable __rcu **rthp;
 844	unsigned long samples = 0;
 845	unsigned long sum = 0, sum2 = 0;
 846	unsigned long delta;
 847	u64 mult;
 848
 849	delta = jiffies - expires_ljiffies;
 850	expires_ljiffies = jiffies;
 851	mult = ((u64)delta) << rt_hash_log;
 852	if (ip_rt_gc_timeout > 1)
 853		do_div(mult, ip_rt_gc_timeout);
 854	goal = (unsigned int)mult;
 855	if (goal > rt_hash_mask)
 856		goal = rt_hash_mask + 1;
 857	for (; goal > 0; goal--) {
 858		unsigned long tmo = ip_rt_gc_timeout;
 859		unsigned long length;
 860
 861		i = (i + 1) & rt_hash_mask;
 862		rthp = &rt_hash_table[i].chain;
 863
 864		if (need_resched())
 865			cond_resched();
 866
 867		samples++;
 868
 869		if (rcu_dereference_raw(*rthp) == NULL)
 870			continue;
 871		length = 0;
 872		spin_lock_bh(rt_hash_lock_addr(i));
 873		while ((rth = rcu_dereference_protected(*rthp,
 874					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 875			prefetch(rth->dst.rt_next);
 876			if (rt_is_expired(rth)) {
 877				*rthp = rth->dst.rt_next;
 878				rt_free(rth);
 879				continue;
 880			}
 881			if (rth->dst.expires) {
 882				/* Entry is expired even if it is in use */
 883				if (time_before_eq(jiffies, rth->dst.expires)) {
 884nofree:
 885					tmo >>= 1;
 886					rthp = &rth->dst.rt_next;
 887					/*
 888					 * We only count entries on
 889					 * a chain with equal hash inputs once
 890					 * so that entries for different QOS
 891					 * levels, and other non-hash input
 892					 * attributes don't unfairly skew
 893					 * the length computation
 894					 */
 895					length += has_noalias(rt_hash_table[i].chain, rth);
 896					continue;
 897				}
 898			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 899				goto nofree;
 900
 901			/* Cleanup aged off entries. */
 902			*rthp = rth->dst.rt_next;
 903			rt_free(rth);
 904		}
 905		spin_unlock_bh(rt_hash_lock_addr(i));
 906		sum += length;
 907		sum2 += length*length;
 908	}
 909	if (samples) {
 910		unsigned long avg = sum / samples;
 911		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 912		rt_chain_length_max = max_t(unsigned long,
 913					ip_rt_gc_elasticity,
 914					(avg + 4*sd) >> FRACT_BITS);
 915	}
 916	rover = i;
 
 
 
 
 917}
 918
 919/*
 920 * rt_worker_func() is run in process context.
 921 * we call rt_check_expire() to scan part of the hash table
 922 */
 923static void rt_worker_func(struct work_struct *work)
 924{
 925	rt_check_expire();
 926	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 927}
 
 
 
 928
 929/*
 930 * Perturbation of rt_genid by a small quantity [1..256]
 931 * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
 932 * many times (2^24) without giving recent rt_genid.
 933 * Jenkins hash is strong enough that litle changes of rt_genid are OK.
 934 */
 935static void rt_cache_invalidate(struct net *net)
 936{
 937	unsigned char shuffle;
 938
 939	get_random_bytes(&shuffle, sizeof(shuffle));
 940	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 941	inetpeer_invalidate_tree(AF_INET);
 942}
 943
 944/*
 945 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 946 * delay >= 0 : invalidate & flush cache (can be long)
 947 */
 948void rt_cache_flush(struct net *net, int delay)
 949{
 950	rt_cache_invalidate(net);
 951	if (delay >= 0)
 952		rt_do_flush(net, !in_softirq());
 953}
 954
 955/* Flush previous cache invalidated entries from the cache */
 956void rt_cache_flush_batch(struct net *net)
 957{
 958	rt_do_flush(net, !in_softirq());
 
 
 
 
 
 
 959}
 960
 961static void rt_emergency_hash_rebuild(struct net *net)
 
 962{
 963	net_warn_ratelimited("Route hash chain too long!\n");
 964	rt_cache_invalidate(net);
 
 
 965}
 966
 967/*
 968   Short description of GC goals.
 969
 970   We want to build algorithm, which will keep routing cache
 971   at some equilibrium point, when number of aged off entries
 972   is kept approximately equal to newly generated ones.
 973
 974   Current expiration strength is variable "expire".
 975   We try to adjust it dynamically, so that if networking
 976   is idle expires is large enough to keep enough of warm entries,
 977   and when load increases it reduces to limit cache size.
 978 */
 979
 980static int rt_garbage_collect(struct dst_ops *ops)
 981{
 982	static unsigned long expire = RT_GC_TIMEOUT;
 983	static unsigned long last_gc;
 984	static int rover;
 985	static int equilibrium;
 986	struct rtable *rth;
 987	struct rtable __rcu **rthp;
 988	unsigned long now = jiffies;
 989	int goal;
 990	int entries = dst_entries_get_fast(&ipv4_dst_ops);
 991
 992	/*
 993	 * Garbage collection is pretty expensive,
 994	 * do not make it too frequently.
 995	 */
 996
 997	RT_CACHE_STAT_INC(gc_total);
 998
 999	if (now - last_gc < ip_rt_gc_min_interval &&
1000	    entries < ip_rt_max_size) {
1001		RT_CACHE_STAT_INC(gc_ignored);
1002		goto out;
1003	}
1004
1005	entries = dst_entries_get_slow(&ipv4_dst_ops);
1006	/* Calculate number of entries, which we want to expire now. */
1007	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008	if (goal <= 0) {
1009		if (equilibrium < ipv4_dst_ops.gc_thresh)
1010			equilibrium = ipv4_dst_ops.gc_thresh;
1011		goal = entries - equilibrium;
1012		if (goal > 0) {
1013			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014			goal = entries - equilibrium;
1015		}
1016	} else {
1017		/* We are in dangerous area. Try to reduce cache really
1018		 * aggressively.
1019		 */
1020		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021		equilibrium = entries - goal;
1022	}
1023
1024	if (now - last_gc >= ip_rt_gc_min_interval)
1025		last_gc = now;
1026
1027	if (goal <= 0) {
1028		equilibrium += goal;
1029		goto work_done;
1030	}
1031
1032	do {
1033		int i, k;
1034
1035		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036			unsigned long tmo = expire;
1037
1038			k = (k + 1) & rt_hash_mask;
1039			rthp = &rt_hash_table[k].chain;
1040			spin_lock_bh(rt_hash_lock_addr(k));
1041			while ((rth = rcu_dereference_protected(*rthp,
1042					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043				if (!rt_is_expired(rth) &&
1044					!rt_may_expire(rth, tmo, expire)) {
1045					tmo >>= 1;
1046					rthp = &rth->dst.rt_next;
1047					continue;
1048				}
1049				*rthp = rth->dst.rt_next;
1050				rt_free(rth);
1051				goal--;
1052			}
1053			spin_unlock_bh(rt_hash_lock_addr(k));
1054			if (goal <= 0)
1055				break;
1056		}
1057		rover = k;
1058
1059		if (goal <= 0)
1060			goto work_done;
1061
1062		/* Goal is not achieved. We stop process if:
1063
1064		   - if expire reduced to zero. Otherwise, expire is halfed.
1065		   - if table is not full.
1066		   - if we are called from interrupt.
1067		   - jiffies check is just fallback/debug loop breaker.
1068		     We will not spin here for long time in any case.
1069		 */
1070
1071		RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073		if (expire == 0)
1074			break;
1075
1076		expire >>= 1;
1077
1078		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079			goto out;
1080	} while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083		goto out;
1084	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085		goto out;
1086	net_warn_ratelimited("dst cache overflow\n");
1087	RT_CACHE_STAT_INC(gc_dst_overflow);
1088	return 1;
1089
1090work_done:
1091	expire += ip_rt_gc_min_interval;
1092	if (expire > ip_rt_gc_timeout ||
1093	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095		expire = ip_rt_gc_timeout;
1096out:	return 0;
1097}
1098
1099/*
1100 * Returns number of entries in a hash chain that have different hash_inputs
1101 */
1102static int slow_chain_length(const struct rtable *head)
1103{
1104	int length = 0;
1105	const struct rtable *rth = head;
1106
1107	while (rth) {
1108		length += has_noalias(head, rth);
1109		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
 
 
1110	}
1111	return length >> FRACT_BITS;
 
1112}
1113
1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115{
1116	static const __be32 inaddr_any = 0;
1117	struct net_device *dev = dst->dev;
1118	const __be32 *pkey = daddr;
1119	const struct rtable *rt;
1120	struct neighbour *n;
1121
1122	rt = (const struct rtable *) dst;
1123
1124	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125		pkey = &inaddr_any;
1126	else if (rt->rt_gateway)
1127		pkey = (const __be32 *) &rt->rt_gateway;
1128
1129	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130	if (n)
1131		return n;
1132	return neigh_create(&arp_tbl, pkey, dev);
1133}
1134
1135static int rt_bind_neighbour(struct rtable *rt)
1136{
1137	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138	if (IS_ERR(n))
1139		return PTR_ERR(n);
1140	dst_set_neighbour(&rt->dst, n);
1141
1142	return 0;
 
 
 
 
1143}
1144
/*
 * rt_intern_hash - insert a freshly built route into the route cache.
 * @hash:    bucket index into rt_hash_table for @rt
 * @rt:      the new cache entry to insert
 * @skb:     optional packet; when non-NULL its dst is set to the route
 *           that is returned
 * @ifindex: interface index passed to rt_hash() when the bucket has to be
 *           recomputed after an emergency rebuild
 *
 * Returns the route that ends up being used: @rt itself, or an already
 * cached entry whose keys and netns match (in that case @rt is dropped).
 * If binding a neighbour fails, @rt is released and an ERR_PTR() is
 * returned instead.
 */
1145static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1146				     struct sk_buff *skb, int ifindex)
1147{
1148	struct rtable	*rth, *cand;
1149	struct rtable __rcu **rthp, **candp;
1150	unsigned long	now;
1151	u32 		min_score;
1152	int		chain_length;
	/* One forced-GC retry is allowed, and none when called in softirq. */
1153	int attempts = !in_softirq();
1154
1155restart:
1156	chain_length = 0;
1157	min_score = ~(u32)0;
1158	cand = NULL;
1159	candp = NULL;
1160	now = jiffies;
1161
1162	if (!rt_caching(dev_net(rt->dst.dev))) {
1163		/*
1164		 * If we're not caching, just tell the caller we
1165		 * were successful and don't touch the route.  The
1166		 * caller holds the sole reference to the cache entry, and
1167		 * it will be released when the caller is done with it.
1168		 * If we drop it here, the callers have no way to resolve routes
1169		 * when we're not caching.  Instead, just hand rt back, so
1170		 * the caller gets a single use out of the route
1171		 * Note that we do rt_free on this new route entry, so that
1172		 * once its refcount hits zero, we are still able to reap it
1173		 * (Thanks Alexey)
1174		 * Note: To avoid expensive rcu stuff for this uncached dst,
1175		 * we set DST_NOCACHE so that dst_release() can free dst without
1176		 * waiting a grace period.
1177		 */
1178
1179		rt->dst.flags |= DST_NOCACHE;
1180		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181			int err = rt_bind_neighbour(rt);
1182			if (err) {
1183				net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1184				ip_rt_put(rt);
1185				return ERR_PTR(err);
1186			}
1187		}
1188
1189		goto skip_hashing;
 
 
 
 
 
1190	}
1191
1192	rthp = &rt_hash_table[hash].chain;
1193
	/* Walk the bucket's chain with its lock held. */
1194	spin_lock_bh(rt_hash_lock_addr(hash));
1195	while ((rth = rcu_dereference_protected(*rthp,
1196			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		/* Unlink and free expired entries as they are encountered. */
1197		if (rt_is_expired(rth)) {
1198			*rthp = rth->dst.rt_next;
1199			rt_free(rth);
1200			continue;
1201		}
		/* An equivalent entry is already cached: reuse it, drop rt. */
1202		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1203			/* Put it first */
1204			*rthp = rth->dst.rt_next;
1205			/*
1206			 * Since lookup is lockfree, the deletion
1207			 * must be visible to another weakly ordered CPU before
1208			 * the insertion at the start of the hash chain.
1209			 */
1210			rcu_assign_pointer(rth->dst.rt_next,
1211					   rt_hash_table[hash].chain);
1212			/*
1213			 * Since lookup is lockfree, the update writes
1214			 * must be ordered for consistency on SMP.
1215			 */
1216			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1217
1218			dst_use(&rth->dst, now);
1219			spin_unlock_bh(rt_hash_lock_addr(hash));
1220
1221			rt_drop(rt);
1222			if (skb)
1223				skb_dst_set(skb, &rth->dst);
1224			return rth;
1225		}
1226
		/*
		 * Remember the unreferenced entry with the lowest score
		 * (and the link pointing at it) as the eviction candidate.
		 */
1227		if (!atomic_read(&rth->dst.__refcnt)) {
1228			u32 score = rt_score(rth);
1229
1230			if (score <= min_score) {
1231				cand = rth;
1232				candp = rthp;
1233				min_score = score;
1234			}
1235		}
1236
1237		chain_length++;
1238
1239		rthp = &rth->dst.rt_next;
1240	}
1241
1242	if (cand) {
1243		/* ip_rt_gc_elasticity used to be average length of chain
1244		 * length, when exceeded gc becomes really aggressive.
1245		 *
1246		 * The second limit is less certain. At the moment it allows
1247		 * only 2 entries per bucket. We will see.
1248		 */
1249		if (chain_length > ip_rt_gc_elasticity) {
1250			*candp = cand->dst.rt_next;
1251			rt_free(cand);
1252		}
1253	} else {
		/*
		 * Nothing can be evicted and the chain is too long: count a
		 * rebuild, trigger an emergency rebuild, recompute the bucket
		 * with the current genid and start over.
		 */
1254		if (chain_length > rt_chain_length_max &&
1255		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1256			struct net *net = dev_net(rt->dst.dev);
1257			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1258			if (!rt_caching(net)) {
1259				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1260					rt->dst.dev->name, num);
1261			}
1262			rt_emergency_hash_rebuild(net);
1263			spin_unlock_bh(rt_hash_lock_addr(hash));
1264
1265			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1266					ifindex, rt_genid(net));
1267			goto restart;
1268		}
1269	}
1270
1271	/* Try to bind route to arp only if it is output
1272	   route or unicast forwarding path.
1273	 */
1274	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1275		int err = rt_bind_neighbour(rt);
1276		if (err) {
1277			spin_unlock_bh(rt_hash_lock_addr(hash));
1278
1279			if (err != -ENOBUFS) {
1280				rt_drop(rt);
1281				return ERR_PTR(err);
1282			}
1283
1284			/* Neighbour tables are full and nothing
1285			   can be released. Try to shrink route cache,
1286			   it is most likely it holds some neighbour records.
1287			 */
1288			if (attempts-- > 0) {
				/*
				 * Temporarily override the GC tunables for
				 * this one rt_garbage_collect() call, then
				 * restore the saved values.
				 */
1289				int saved_elasticity = ip_rt_gc_elasticity;
1290				int saved_int = ip_rt_gc_min_interval;
1291				ip_rt_gc_elasticity	= 1;
1292				ip_rt_gc_min_interval	= 0;
1293				rt_garbage_collect(&ipv4_dst_ops);
1294				ip_rt_gc_min_interval	= saved_int;
1295				ip_rt_gc_elasticity	= saved_elasticity;
1296				goto restart;
1297			}
1298
1299			net_warn_ratelimited("Neighbour table overflow\n");
1300			rt_drop(rt);
1301			return ERR_PTR(-ENOBUFS);
 
 
 
1302		}
1303	}
1304
	/* Link the new entry in at the head of the chain. */
1305	rt->dst.rt_next = rt_hash_table[hash].chain;
1306
1307	/*
1308	 * Since lookup is lockfree, we must make sure
1309	 * previous writes to rt are committed to memory
1310	 * before making rt visible to other CPUS.
1311	 */
1312	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1313
1314	spin_unlock_bh(rt_hash_lock_addr(hash));
1315
1316skip_hashing:
1317	if (skb)
1318		skb_dst_set(skb, &rt->dst);
1319	return rt;
1320}
1321
1322static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1323
1324static u32 rt_peer_genid(void)
1325{
1326	return atomic_read(&__rt_peer_genid);
1327}
 
 
 
 
 
1328
1329void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1330{
1331	struct inet_peer *peer;
 
 
 
1332
1333	peer = inet_getpeer_v4(daddr, create);
1334
1335	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1336		inet_putpeer(peer);
1337	else
1338		rt->rt_peer_genid = rt_peer_genid();
1339}
1340
1341/*
1342 * Peer allocation may fail only in serious out-of-memory conditions.  However
1343 * we still can generate some output.
1344 * Random ID selection looks a bit dangerous because we have no chances to
1345 * select ID being unique in a reasonable period of time.
1346 * But broken packet identifier may be better than no packet at all.
1347 */
1348static void ip_select_fb_ident(struct iphdr *iph)
1349{
1350	static DEFINE_SPINLOCK(ip_fb_id_lock);
1351	static u32 ip_fallback_id;
1352	u32 salt;
1353
1354	spin_lock_bh(&ip_fb_id_lock);
1355	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1356	iph->id = htons(salt & 0xFFFF);
1357	ip_fallback_id = salt;
1358	spin_unlock_bh(&ip_fb_id_lock);
1359}
1360
1361void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1362{
1363	struct rtable *rt = (struct rtable *) dst;
1364
1365	if (rt && !(rt->dst.flags & DST_NOPEER)) {
1366		if (rt->peer == NULL)
1367			rt_bind_peer(rt, rt->rt_dst, 1);
1368
1369		/* If peer is attached to destination, it is never detached,
1370		   so that we need not to grab a lock to dereference it.
1371		 */
1372		if (rt->peer) {
1373			iph->id = htons(inet_getid(rt->peer, more));
1374			return;
1375		}
1376	} else if (!rt)
1377		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1378
1379	ip_select_fb_ident(iph);
1380}
1381EXPORT_SYMBOL(__ip_select_ident);
1382
1383static void rt_del(unsigned int hash, struct rtable *rt)
1384{
1385	struct rtable __rcu **rthp;
1386	struct rtable *aux;
1387
1388	rthp = &rt_hash_table[hash].chain;
1389	spin_lock_bh(rt_hash_lock_addr(hash));
1390	ip_rt_put(rt);
1391	while ((aux = rcu_dereference_protected(*rthp,
1392			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1393		if (aux == rt || rt_is_expired(aux)) {
1394			*rthp = aux->dst.rt_next;
1395			rt_free(aux);
1396			continue;
1397		}
1398		rthp = &aux->dst.rt_next;
1399	}
1400	spin_unlock_bh(rt_hash_lock_addr(hash));
1401}
1402
1403static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1404{
1405	struct rtable *rt = (struct rtable *) dst;
1406	__be32 orig_gw = rt->rt_gateway;
1407	struct neighbour *n, *old_n;
1408
1409	dst_confirm(&rt->dst);
1410
1411	rt->rt_gateway = peer->redirect_learned.a4;
1412
1413	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1414	if (IS_ERR(n)) {
1415		rt->rt_gateway = orig_gw;
1416		return;
1417	}
1418	old_n = xchg(&rt->dst._neighbour, n);
1419	if (old_n)
1420		neigh_release(old_n);
1421	if (!(n->nud_state & NUD_VALID)) {
1422		neigh_event_send(n, NULL);
1423	} else {
1424		rt->rt_flags |= RTCF_REDIRECTED;
1425		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1426	}
1427}
1428
1429/* called in rcu_read_lock() section */
1430void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1431		    __be32 saddr, struct net_device *dev)
1432{
1433	int s, i;
1434	struct in_device *in_dev = __in_dev_get_rcu(dev);
1435	__be32 skeys[2] = { saddr, 0 };
1436	int    ikeys[2] = { dev->ifindex, 0 };
1437	struct inet_peer *peer;
1438	struct net *net;
1439
 
1440	if (!in_dev)
1441		return;
1442
1443	net = dev_net(dev);
1444	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1445	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1446	    ipv4_is_zeronet(new_gw))
1447		goto reject_redirect;
1448
1449	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1450		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1451			goto reject_redirect;
1452		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1453			goto reject_redirect;
1454	} else {
1455		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1456			goto reject_redirect;
1457	}
1458
1459	for (s = 0; s < 2; s++) {
1460		for (i = 0; i < 2; i++) {
1461			unsigned int hash;
1462			struct rtable __rcu **rthp;
1463			struct rtable *rt;
1464
1465			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1466
1467			rthp = &rt_hash_table[hash].chain;
1468
1469			while ((rt = rcu_dereference(*rthp)) != NULL) {
1470				rthp = &rt->dst.rt_next;
1471
1472				if (rt->rt_key_dst != daddr ||
1473				    rt->rt_key_src != skeys[s] ||
1474				    rt->rt_oif != ikeys[i] ||
1475				    rt_is_input_route(rt) ||
1476				    rt_is_expired(rt) ||
1477				    !net_eq(dev_net(rt->dst.dev), net) ||
1478				    rt->dst.error ||
1479				    rt->dst.dev != dev ||
1480				    rt->rt_gateway != old_gw)
1481					continue;
1482
1483				if (!rt->peer)
1484					rt_bind_peer(rt, rt->rt_dst, 1);
1485
1486				peer = rt->peer;
1487				if (peer) {
1488					if (peer->redirect_learned.a4 != new_gw) {
1489						peer->redirect_learned.a4 = new_gw;
1490						atomic_inc(&__rt_peer_genid);
1491					}
1492					check_peer_redir(&rt->dst, peer);
1493				}
1494			}
 
 
 
1495		}
 
1496	}
1497	return;
1498
1499reject_redirect:
1500#ifdef CONFIG_IP_ROUTE_VERBOSE
1501	if (IN_DEV_LOG_MARTIANS(in_dev))
 
 
 
 
1502		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1503				     "  Advised path = %pI4 -> %pI4\n",
1504				     &old_gw, dev->name, &new_gw,
1505				     &saddr, &daddr);
 
1506#endif
1507	;
1508}
1509
1510static bool peer_pmtu_expired(struct inet_peer *peer)
1511{
1512	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
 
 
 
 
 
 
 
1513
1514	return orig &&
1515	       time_after_eq(jiffies, orig) &&
1516	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1517}
1518
1519static bool peer_pmtu_cleaned(struct inet_peer *peer)
1520{
1521	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1522
1523	return orig &&
1524	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525}
1526
1527static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1528{
1529	struct rtable *rt = (struct rtable *)dst;
1530	struct dst_entry *ret = dst;
1531
1532	if (rt) {
1533		if (dst->obsolete > 0) {
1534			ip_rt_put(rt);
1535			ret = NULL;
1536		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1537			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1538						rt->rt_oif,
1539						rt_genid(dev_net(dst->dev)));
1540			rt_del(hash, rt);
1541			ret = NULL;
1542		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1543			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1544		}
1545	}
1546	return ret;
1547}
1548
1549/*
1550 * Algorithm:
1551 *	1. The first ip_rt_redirect_number redirects are sent
1552 *	   with exponential backoff, then we stop sending them at all,
1553 *	   assuming that the host ignores our redirects.
1554 *	2. If we did not see packets requiring redirects
1555 *	   during ip_rt_redirect_silence, we assume that the host
1556 *	   forgot redirected route and start to send redirects again.
1557 *
1558 * This algorithm is much cheaper and more intelligent than dumb load limiting
1559 * in icmp.c.
1560 *
1561 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1562 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1563 */
1564
1565void ip_rt_send_redirect(struct sk_buff *skb)
1566{
1567	struct rtable *rt = skb_rtable(skb);
1568	struct in_device *in_dev;
1569	struct inet_peer *peer;
 
1570	int log_martians;
 
1571
1572	rcu_read_lock();
1573	in_dev = __in_dev_get_rcu(rt->dst.dev);
1574	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1575		rcu_read_unlock();
1576		return;
1577	}
1578	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 
1579	rcu_read_unlock();
1580
1581	if (!rt->peer)
1582		rt_bind_peer(rt, rt->rt_dst, 1);
1583	peer = rt->peer;
1584	if (!peer) {
1585		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 
1586		return;
1587	}
1588
1589	/* No redirected packets during ip_rt_redirect_silence;
1590	 * reset the algorithm.
1591	 */
1592	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1593		peer->rate_tokens = 0;
1594
1595	/* Too many ignored redirects; do not send anything
1596	 * set dst.rate_last to the last seen redirected packet.
1597	 */
1598	if (peer->rate_tokens >= ip_rt_redirect_number) {
1599		peer->rate_last = jiffies;
1600		return;
1601	}
1602
1603	/* Check for load limit; set rate_last to the latest sent
1604	 * redirect.
1605	 */
1606	if (peer->rate_tokens == 0 ||
1607	    time_after(jiffies,
1608		       (peer->rate_last +
1609			(ip_rt_redirect_load << peer->rate_tokens)))) {
1610		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
 
 
1611		peer->rate_last = jiffies;
1612		++peer->rate_tokens;
1613#ifdef CONFIG_IP_ROUTE_VERBOSE
1614		if (log_martians &&
1615		    peer->rate_tokens == ip_rt_redirect_number)
1616			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1617					     &ip_hdr(skb)->saddr, rt->rt_iif,
1618					     &rt->rt_dst, &rt->rt_gateway);
1619#endif
1620	}
 
 
1621}
1622
1623static int ip_error(struct sk_buff *skb)
1624{
1625	struct rtable *rt = skb_rtable(skb);
 
 
1626	struct inet_peer *peer;
1627	unsigned long now;
 
1628	bool send;
1629	int code;
1630
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1631	switch (rt->dst.error) {
1632	case EINVAL:
1633	default:
1634		goto out;
1635	case EHOSTUNREACH:
1636		code = ICMP_HOST_UNREACH;
1637		break;
1638	case ENETUNREACH:
1639		code = ICMP_NET_UNREACH;
1640		IP_INC_STATS_BH(dev_net(rt->dst.dev),
1641				IPSTATS_MIB_INNOROUTES);
1642		break;
1643	case EACCES:
1644		code = ICMP_PKT_FILTERED;
1645		break;
1646	}
1647
1648	if (!rt->peer)
1649		rt_bind_peer(rt, rt->rt_dst, 1);
1650	peer = rt->peer;
1651
1652	send = true;
1653	if (peer) {
1654		now = jiffies;
1655		peer->rate_tokens += now - peer->rate_last;
1656		if (peer->rate_tokens > ip_rt_error_burst)
1657			peer->rate_tokens = ip_rt_error_burst;
1658		peer->rate_last = now;
1659		if (peer->rate_tokens >= ip_rt_error_cost)
1660			peer->rate_tokens -= ip_rt_error_cost;
1661		else
1662			send = false;
 
1663	}
1664	if (send)
1665		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1666
1667out:	kfree_skb(skb);
1668	return 0;
1669}
1670
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */
static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when
 * @old_mtu is not above any table entry.
 */
static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *step = mtu_plateau;
	const unsigned short *end = mtu_plateau + ARRAY_SIZE(mtu_plateau);

	for (; step < end; step++) {
		if (*step < old_mtu)
			return *step;
	}

	return 68;
}
1688
1689unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1690				 unsigned short new_mtu,
1691				 struct net_device *dev)
1692{
1693	unsigned short old_mtu = ntohs(iph->tot_len);
1694	unsigned short est_mtu = 0;
1695	struct inet_peer *peer;
1696
1697	peer = inet_getpeer_v4(iph->daddr, 1);
1698	if (peer) {
1699		unsigned short mtu = new_mtu;
1700
1701		if (new_mtu < 68 || new_mtu >= old_mtu) {
1702			/* BSD 4.2 derived systems incorrectly adjust
1703			 * tot_len by the IP header length, and report
1704			 * a zero MTU in the ICMP message.
1705			 */
1706			if (mtu == 0 &&
1707			    old_mtu >= 68 + (iph->ihl << 2))
1708				old_mtu -= iph->ihl << 2;
1709			mtu = guess_mtu(old_mtu);
1710		}
1711
1712		if (mtu < ip_rt_min_pmtu)
1713			mtu = ip_rt_min_pmtu;
1714		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715			unsigned long pmtu_expires;
1716
1717			pmtu_expires = jiffies + ip_rt_mtu_expires;
1718			if (!pmtu_expires)
1719				pmtu_expires = 1UL;
1720
1721			est_mtu = mtu;
1722			peer->pmtu_learned = mtu;
1723			peer->pmtu_expires = pmtu_expires;
1724			atomic_inc(&__rt_peer_genid);
1725		}
1726
1727		inet_putpeer(peer);
1728	}
1729	return est_mtu ? : new_mtu;
1730}
 
1731
1732static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1733{
1734	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
 
 
 
 
 
 
 
1735
1736	if (!expires)
1737		return;
1738	if (time_before(jiffies, expires)) {
1739		u32 orig_dst_mtu = dst_mtu(dst);
1740		if (peer->pmtu_learned < orig_dst_mtu) {
1741			if (!peer->pmtu_orig)
1742				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1743			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744		}
1745	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747}
1748
1749static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1750{
1751	struct rtable *rt = (struct rtable *) dst;
1752	struct inet_peer *peer;
 
 
 
 
 
 
1753
1754	dst_confirm(dst);
 
1755
1756	if (!rt->peer)
1757		rt_bind_peer(rt, rt->rt_dst, 1);
1758	peer = rt->peer;
1759	if (peer) {
1760		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762		if (mtu < ip_rt_min_pmtu)
1763			mtu = ip_rt_min_pmtu;
1764		if (!pmtu_expires || mtu < peer->pmtu_learned) {
 
1765
1766			pmtu_expires = jiffies + ip_rt_mtu_expires;
1767			if (!pmtu_expires)
1768				pmtu_expires = 1UL;
1769
1770			peer->pmtu_learned = mtu;
1771			peer->pmtu_expires = pmtu_expires;
 
 
 
1772
1773			atomic_inc(&__rt_peer_genid);
1774			rt->rt_peer_genid = rt_peer_genid();
1775		}
1776		check_peer_pmtu(dst, peer);
1777	}
1778}
1779
 
 
 
 
 
 
 
 
 
1780
1781static void ipv4_validate_peer(struct rtable *rt)
1782{
1783	if (rt->rt_peer_genid != rt_peer_genid()) {
1784		struct inet_peer *peer;
1785
1786		if (!rt->peer)
1787			rt_bind_peer(rt, rt->rt_dst, 0);
1788
1789		peer = rt->peer;
1790		if (peer) {
1791			check_peer_pmtu(&rt->dst, peer);
 
 
1792
1793			if (peer->redirect_learned.a4 &&
1794			    peer->redirect_learned.a4 != rt->rt_gateway)
1795				check_peer_redir(&rt->dst, peer);
1796		}
 
 
1797
1798		rt->rt_peer_genid = rt_peer_genid();
 
 
 
 
 
1799	}
1800}
 
1801
1802static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803{
1804	struct rtable *rt = (struct rtable *) dst;
 
 
 
1805
1806	if (rt_is_expired(rt))
1807		return NULL;
1808	ipv4_validate_peer(rt);
1809	return dst;
 
 
1810}
 
1811
1812static void ipv4_dst_destroy(struct dst_entry *dst)
1813{
1814	struct rtable *rt = (struct rtable *) dst;
1815	struct inet_peer *peer = rt->peer;
1816
1817	if (rt->fi) {
1818		fib_info_put(rt->fi);
1819		rt->fi = NULL;
1820	}
1821	if (peer) {
1822		rt->peer = NULL;
1823		inet_putpeer(peer);
1824	}
 
 
 
1825}
1826
1827
1828static void ipv4_link_failure(struct sk_buff *skb)
1829{
1830	struct rtable *rt;
1831
1832	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833
1834	rt = skb_rtable(skb);
1835	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1836		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1837}
1838
1839static int ip_rt_bug(struct sk_buff *skb)
1840{
1841	pr_debug("%s: %pI4 -> %pI4, %s\n",
1842		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1843		 skb->dev ? skb->dev->name : "?");
1844	kfree_skb(skb);
1845	WARN_ON(1);
1846	return 0;
1847}
1848
1849/*
1850   We do not cache source address of outgoing interface,
1851   because it is used only by IP RR, TS and SRR options,
1852   so that it out of fast path.
1853
1854   BTW remember: "addr" is allowed to be not aligned
1855   in IP options!
1856 */
1857
1858void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1859{
1860	__be32 src;
1861
1862	if (rt_is_output_route(rt))
1863		src = ip_hdr(skb)->saddr;
1864	else {
1865		struct fib_result res;
1866		struct flowi4 fl4;
1867		struct iphdr *iph;
1868
1869		iph = ip_hdr(skb);
1870
1871		memset(&fl4, 0, sizeof(fl4));
1872		fl4.daddr = iph->daddr;
1873		fl4.saddr = iph->saddr;
1874		fl4.flowi4_tos = RT_TOS(iph->tos);
1875		fl4.flowi4_oif = rt->dst.dev->ifindex;
1876		fl4.flowi4_iif = skb->dev->ifindex;
1877		fl4.flowi4_mark = skb->mark;
1878
1879		rcu_read_lock();
1880		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882		else
1883			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1884					RT_SCOPE_UNIVERSE);
 
1885		rcu_read_unlock();
1886	}
1887	memcpy(addr, &src, 4);
1888}
1889
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in the halves of the route's tclassid that are still zero from
 * @tag; a half that is already set is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	u32 low_half = tag & 0xFFFF;
	u32 high_half = tag & 0xFFFF0000;

	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= low_half;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= high_half;
}
#endif
1899
1900static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901{
1902	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
 
 
1903
1904	if (advmss == 0) {
1905		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906			       ip_rt_min_advmss);
1907		if (advmss > 65535 - 40)
1908			advmss = 65535 - 40;
1909	}
1910	return advmss;
1911}
1912
1913static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914{
1915	const struct rtable *rt = (const struct rtable *) dst;
1916	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 
 
 
1917
1918	if (mtu && rt_is_output_route(rt))
1919		return mtu;
1920
1921	mtu = dst->dev->mtu;
 
 
 
 
 
1922
1923	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924
1925		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926			mtu = 576;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1927	}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1928
1929	if (mtu > IP_MAX_MTU)
1930		mtu = IP_MAX_MTU;
 
1931
1932	return mtu;
1933}
1934
1935static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936			    struct fib_info *fi)
1937{
1938	struct inet_peer *peer;
1939	int create = 0;
 
 
 
 
 
 
 
1940
1941	/* If a peer entry exists for this destination, we must hook
1942	 * it up in order to get at cached metrics.
1943	 */
1944	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945		create = 1;
1946
1947	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948	if (peer) {
1949		rt->rt_peer_genid = rt_peer_genid();
1950		if (inet_metrics_new(peer))
1951			memcpy(peer->metrics, fi->fib_metrics,
1952			       sizeof(u32) * RTAX_MAX);
1953		dst_init_metrics(&rt->dst, peer->metrics, false);
1954
1955		check_peer_pmtu(&rt->dst, peer);
1956
1957		if (peer->redirect_learned.a4 &&
1958		    peer->redirect_learned.a4 != rt->rt_gateway) {
1959			rt->rt_gateway = peer->redirect_learned.a4;
1960			rt->rt_flags |= RTCF_REDIRECTED;
1961		}
1962	} else {
1963		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964			rt->fi = fi;
1965			atomic_inc(&fi->fib_clntref);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1966		}
1967		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968	}
1969}
1970
1971static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 
 
 
 
 
 
 
1972			   const struct fib_result *res,
1973			   struct fib_info *fi, u16 type, u32 itag)
 
 
1974{
1975	struct dst_entry *dst = &rt->dst;
1976
1977	if (fi) {
1978		if (FIB_RES_GW(*res) &&
1979		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980			rt->rt_gateway = FIB_RES_GW(*res);
1981		rt_init_metrics(rt, fl4, fi);
 
 
 
 
 
 
 
1982#ifdef CONFIG_IP_ROUTE_CLASSID
1983		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1984#endif
1985	}
1986
1987	if (dst_mtu(dst) > IP_MAX_MTU)
1988		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
 
 
 
 
 
 
 
 
 
 
 
1991
1992#ifdef CONFIG_IP_ROUTE_CLASSID
1993#ifdef CONFIG_IP_MULTIPLE_TABLES
1994	set_class_tag(rt, fib_rules_tclass(res));
1995#endif
1996	set_class_tag(rt, itag);
1997#endif
1998}
1999
2000static struct rtable *rt_dst_alloc(struct net_device *dev,
2001				   bool nopolicy, bool noxfrm)
 
2002{
2003	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004			 DST_HOST |
2005			 (nopolicy ? DST_NOPOLICY : 0) |
2006			 (noxfrm ? DST_NOXFRM : 0));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2007}
 
2008
2009/* called in rcu_read_lock() section */
2010static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011				u8 tos, struct net_device *dev, int our)
 
2012{
2013	unsigned int hash;
2014	struct rtable *rth;
2015	__be32 spec_dst;
2016	struct in_device *in_dev = __in_dev_get_rcu(dev);
2017	u32 itag = 0;
2018	int err;
2019
2020	/* Primary sanity checks. */
 
 
2021
2022	if (in_dev == NULL)
 
2023		return -EINVAL;
2024
2025	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2027		goto e_inval;
2028
2029	if (ipv4_is_zeronet(saddr)) {
2030		if (!ipv4_is_local_multicast(daddr))
2031			goto e_inval;
2032		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033	} else {
2034		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2035					  &itag);
2036		if (err < 0)
2037			goto e_err;
2038	}
2039	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2040			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2041	if (!rth)
2042		goto e_nobufs;
2043
2044#ifdef CONFIG_IP_ROUTE_CLASSID
2045	rth->dst.tclassid = itag;
2046#endif
2047	rth->dst.output = ip_rt_bug;
2048
2049	rth->rt_key_dst	= daddr;
2050	rth->rt_key_src	= saddr;
2051	rth->rt_genid	= rt_genid(dev_net(dev));
2052	rth->rt_flags	= RTCF_MULTICAST;
2053	rth->rt_type	= RTN_MULTICAST;
2054	rth->rt_key_tos	= tos;
2055	rth->rt_dst	= daddr;
2056	rth->rt_src	= saddr;
2057	rth->rt_route_iif = dev->ifindex;
2058	rth->rt_iif	= dev->ifindex;
2059	rth->rt_oif	= 0;
2060	rth->rt_mark    = skb->mark;
2061	rth->rt_gateway	= daddr;
2062	rth->rt_spec_dst= spec_dst;
2063	rth->rt_peer_genid = 0;
2064	rth->peer = NULL;
2065	rth->fi = NULL;
2066	if (our) {
2067		rth->dst.input= ip_local_deliver;
2068		rth->rt_flags |= RTCF_LOCAL;
2069	}
2070
2071#ifdef CONFIG_IP_MROUTE
2072	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2073		rth->dst.input = ip_mr_input;
2074#endif
2075	RT_CACHE_STAT_INC(in_slow_mc);
2076
2077	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2078	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2079	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080
2081e_nobufs:
2082	return -ENOBUFS;
2083e_inval:
2084	return -EINVAL;
2085e_err:
2086	return err;
2087}
2088
2089
2090static void ip_handle_martian_source(struct net_device *dev,
2091				     struct in_device *in_dev,
2092				     struct sk_buff *skb,
2093				     __be32 daddr,
2094				     __be32 saddr)
2095{
2096	RT_CACHE_STAT_INC(in_martian_src);
2097#ifdef CONFIG_IP_ROUTE_VERBOSE
2098	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099		/*
2100		 *	RFC1812 recommendation, if source is martian,
2101		 *	the only hint is MAC header.
2102		 */
2103		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2104			&daddr, &saddr, dev->name);
2105		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2106			print_hex_dump(KERN_WARNING, "ll header: ",
2107				       DUMP_PREFIX_OFFSET, 16, 1,
2108				       skb_mac_header(skb),
2109				       dev->hard_header_len, true);
2110		}
2111	}
2112#endif
2113}
2114
2115/* called in rcu_read_lock() section */
2116static int __mkroute_input(struct sk_buff *skb,
2117			   const struct fib_result *res,
2118			   struct in_device *in_dev,
2119			   __be32 daddr, __be32 saddr, u32 tos,
2120			   struct rtable **result)
2121{
 
2122	struct rtable *rth;
2123	int err;
2124	struct in_device *out_dev;
2125	unsigned int flags = 0;
2126	__be32 spec_dst;
2127	u32 itag;
2128
2129	/* get a working reference to the output device */
2130	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2131	if (out_dev == NULL) {
2132		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2133		return -EINVAL;
2134	}
2135
2136
2137	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2138				  in_dev->dev, &spec_dst, &itag);
2139	if (err < 0) {
2140		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2141					 saddr);
2142
2143		goto cleanup;
2144	}
2145
2146	if (err)
2147		flags |= RTCF_DIRECTSRC;
2148
2149	if (out_dev == in_dev && err &&
2150	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2151	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2152		flags |= RTCF_DOREDIRECT;
2153
2154	if (skb->protocol != htons(ETH_P_IP)) {
2155		/* Not IP (i.e. ARP). Do not create route, if it is
2156		 * invalid for proxy arp. DNAT routes are always valid.
2157		 *
2158		 * Proxy arp feature have been extended to allow, ARP
2159		 * replies back to the same interface, to support
2160		 * Private VLAN switch technologies. See arp.c.
2161		 */
2162		if (out_dev == in_dev &&
2163		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2164			err = -EINVAL;
2165			goto cleanup;
2166		}
2167	}
2168
2169	rth = rt_dst_alloc(out_dev->dev,
 
 
 
 
 
 
 
 
 
 
 
 
2170			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2171			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2172	if (!rth) {
2173		err = -ENOBUFS;
2174		goto cleanup;
2175	}
2176
2177	rth->rt_key_dst	= daddr;
2178	rth->rt_key_src	= saddr;
2179	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2180	rth->rt_flags = flags;
2181	rth->rt_type = res->type;
2182	rth->rt_key_tos	= tos;
2183	rth->rt_dst	= daddr;
2184	rth->rt_src	= saddr;
2185	rth->rt_route_iif = in_dev->dev->ifindex;
2186	rth->rt_iif 	= in_dev->dev->ifindex;
2187	rth->rt_oif 	= 0;
2188	rth->rt_mark    = skb->mark;
2189	rth->rt_gateway	= daddr;
2190	rth->rt_spec_dst= spec_dst;
2191	rth->rt_peer_genid = 0;
2192	rth->peer = NULL;
2193	rth->fi = NULL;
2194
2195	rth->dst.input = ip_forward;
2196	rth->dst.output = ip_output;
2197
2198	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2199
2200	*result = rth;
 
 
2201	err = 0;
2202 cleanup:
2203	return err;
2204}
2205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2206static int ip_mkroute_input(struct sk_buff *skb,
2207			    struct fib_result *res,
2208			    const struct flowi4 *fl4,
2209			    struct in_device *in_dev,
2210			    __be32 daddr, __be32 saddr, u32 tos)
 
2211{
2212	struct rtable *rth = NULL;
2213	int err;
2214	unsigned int hash;
2215
2216#ifdef CONFIG_IP_ROUTE_MULTIPATH
2217	if (res->fi && res->fi->fib_nhs > 1)
2218		fib_select_multipath(res);
2219#endif
2220
2221	/* create a routing cache entry */
2222	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2223	if (err)
2224		return err;
2225
2226	/* put it into the cache */
2227	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2228		       rt_genid(dev_net(rth->dst.dev)));
2229	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2230	if (IS_ERR(rth))
2231		return PTR_ERR(rth);
2232	return 0;
2233}
2234
2235/*
2236 *	NOTE. We drop all the packets that has local source
2237 *	addresses, because every properly looped back packet
2238 *	must have correct destination already attached by output routine.
2239 *
2240 *	Such approach solves two big problems:
2241 *	1. Not simplex devices are handled properly.
2242 *	2. IP spoofing attempts are filtered with 100% of guarantee.
2243 *	called with rcu_read_lock()
2244 */
2245
2246static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2247			       u8 tos, struct net_device *dev)
 
2248{
2249	struct fib_result res;
2250	struct in_device *in_dev = __in_dev_get_rcu(dev);
2251	struct flowi4	fl4;
 
 
 
2252	unsigned int	flags = 0;
2253	u32		itag = 0;
2254	struct rtable	*rth;
2255	unsigned int	hash;
2256	__be32		spec_dst;
2257	int		err = -EINVAL;
2258	struct net    *net = dev_net(dev);
2259
2260	/* IP on this device is disabled. */
2261
2262	if (!in_dev)
2263		goto out;
2264
2265	/* Check for the most weird martians, which can be not detected
2266	   by fib_lookup.
2267	 */
2268
2269	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2270	    ipv4_is_loopback(saddr))
 
 
 
 
 
 
2271		goto martian_source;
2272
 
 
2273	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2274		goto brd_input;
2275
2276	/* Accept zero addresses only to limited broadcast;
2277	 * I even do not know to fix it or not. Waiting for complains :-)
2278	 */
2279	if (ipv4_is_zeronet(saddr))
2280		goto martian_source;
2281
2282	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2283		goto martian_destination;
2284
 
 
 
 
 
 
 
 
 
 
 
2285	/*
2286	 *	Now we are ready to route packet.
2287	 */
2288	fl4.flowi4_oif = 0;
2289	fl4.flowi4_iif = dev->ifindex;
2290	fl4.flowi4_mark = skb->mark;
2291	fl4.flowi4_tos = tos;
2292	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 
2293	fl4.daddr = daddr;
2294	fl4.saddr = saddr;
2295	err = fib_lookup(net, &fl4, &res);
 
 
 
 
 
 
 
 
 
 
2296	if (err != 0) {
2297		if (!IN_DEV_FORWARD(in_dev))
2298			goto e_hostunreach;
2299		goto no_route;
2300	}
2301
2302	RT_CACHE_STAT_INC(in_slow_tot);
2303
2304	if (res.type == RTN_BROADCAST)
2305		goto brd_input;
2306
2307	if (res.type == RTN_LOCAL) {
2308		err = fib_validate_source(skb, saddr, daddr, tos,
2309					  net->loopback_dev->ifindex,
2310					  dev, &spec_dst, &itag);
2311		if (err < 0)
2312			goto martian_source_keep_err;
2313		if (err)
2314			flags |= RTCF_DIRECTSRC;
2315		spec_dst = daddr;
2316		goto local_input;
2317	}
2318
2319	if (!IN_DEV_FORWARD(in_dev))
2320		goto e_hostunreach;
2321	if (res.type != RTN_UNICAST)
 
 
2322		goto martian_destination;
2323
2324	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
2325out:	return err;
2326
2327brd_input:
2328	if (skb->protocol != htons(ETH_P_IP))
2329		goto e_inval;
2330
2331	if (ipv4_is_zeronet(saddr))
2332		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2333	else {
2334		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2335					  &itag);
2336		if (err < 0)
2337			goto martian_source_keep_err;
2338		if (err)
2339			flags |= RTCF_DIRECTSRC;
2340	}
2341	flags |= RTCF_BROADCAST;
2342	res.type = RTN_BROADCAST;
2343	RT_CACHE_STAT_INC(in_brd);
2344
2345local_input:
2346	rth = rt_dst_alloc(net->loopback_dev,
2347			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2348	if (!rth)
2349		goto e_nobufs;
2350
2351	rth->dst.input= ip_local_deliver;
2352	rth->dst.output= ip_rt_bug;
2353#ifdef CONFIG_IP_ROUTE_CLASSID
2354	rth->dst.tclassid = itag;
2355#endif
 
2356
2357	rth->rt_key_dst	= daddr;
2358	rth->rt_key_src	= saddr;
2359	rth->rt_genid = rt_genid(net);
2360	rth->rt_flags 	= flags|RTCF_LOCAL;
2361	rth->rt_type	= res.type;
2362	rth->rt_key_tos	= tos;
2363	rth->rt_dst	= daddr;
2364	rth->rt_src	= saddr;
2365#ifdef CONFIG_IP_ROUTE_CLASSID
2366	rth->dst.tclassid = itag;
2367#endif
2368	rth->rt_route_iif = dev->ifindex;
2369	rth->rt_iif	= dev->ifindex;
2370	rth->rt_oif	= 0;
2371	rth->rt_mark    = skb->mark;
2372	rth->rt_gateway	= daddr;
2373	rth->rt_spec_dst= spec_dst;
2374	rth->rt_peer_genid = 0;
2375	rth->peer = NULL;
2376	rth->fi = NULL;
2377	if (res.type == RTN_UNREACHABLE) {
2378		rth->dst.input= ip_error;
2379		rth->dst.error= -err;
2380		rth->rt_flags 	&= ~RTCF_LOCAL;
2381	}
2382	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2383	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
 
 
 
 
 
 
 
 
 
 
 
 
 
2384	err = 0;
2385	if (IS_ERR(rth))
2386		err = PTR_ERR(rth);
2387	goto out;
2388
2389no_route:
2390	RT_CACHE_STAT_INC(in_no_route);
2391	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2392	res.type = RTN_UNREACHABLE;
2393	if (err == -ESRCH)
2394		err = -ENETUNREACH;
2395	goto local_input;
2396
2397	/*
2398	 *	Do not cache martian addresses: they should be logged (RFC1812)
2399	 */
2400martian_destination:
2401	RT_CACHE_STAT_INC(in_martian_dst);
2402#ifdef CONFIG_IP_ROUTE_VERBOSE
2403	if (IN_DEV_LOG_MARTIANS(in_dev))
2404		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2405				     &daddr, &saddr, dev->name);
2406#endif
2407
2408e_hostunreach:
2409	err = -EHOSTUNREACH;
2410	goto out;
2411
2412e_inval:
2413	err = -EINVAL;
2414	goto out;
2415
2416e_nobufs:
2417	err = -ENOBUFS;
2418	goto out;
2419
2420martian_source:
2421	err = -EINVAL;
2422martian_source_keep_err:
2423	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2424	goto out;
2425}
2426
2427int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2428			   u8 tos, struct net_device *dev, bool noref)
2429{
2430	struct rtable	*rth;
2431	unsigned int	hash;
2432	int iif = dev->ifindex;
2433	struct net *net;
2434	int res;
2435
2436	net = dev_net(dev);
2437
 
2438	rcu_read_lock();
 
 
2439
2440	if (!rt_caching(net))
2441		goto skip_cache;
 
2442
2443	tos &= IPTOS_RT_MASK;
2444	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2445
2446	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2447	     rth = rcu_dereference(rth->dst.rt_next)) {
2448		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2449		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2450		     (rth->rt_route_iif ^ iif) |
2451		     (rth->rt_key_tos ^ tos)) == 0 &&
2452		    rth->rt_mark == skb->mark &&
2453		    net_eq(dev_net(rth->dst.dev), net) &&
2454		    !rt_is_expired(rth)) {
2455			ipv4_validate_peer(rth);
2456			if (noref) {
2457				dst_use_noref(&rth->dst, jiffies);
2458				skb_dst_set_noref(skb, &rth->dst);
2459			} else {
2460				dst_use(&rth->dst, jiffies);
2461				skb_dst_set(skb, &rth->dst);
2462			}
2463			RT_CACHE_STAT_INC(in_hit);
2464			rcu_read_unlock();
2465			return 0;
2466		}
2467		RT_CACHE_STAT_INC(in_hlist_search);
2468	}
2469
2470skip_cache:
2471	/* Multicast recognition logic is moved from route cache to here.
2472	   The problem was that too many Ethernet cards have broken/missing
2473	   hardware multicast filters :-( As result the host on multicasting
2474	   network acquires a lot of useless route cache entries, sort of
2475	   SDR messages from all the world. Now we try to get rid of them.
2476	   Really, provided software IP multicast filter is organized
2477	   reasonably (at least, hashed), it does not result in a slowdown
2478	   comparing with route cache reject entries.
2479	   Note, that multicast routers are not affected, because
2480	   route cache entry is created eventually.
2481	 */
2482	if (ipv4_is_multicast(daddr)) {
2483		struct in_device *in_dev = __in_dev_get_rcu(dev);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2484
2485		if (in_dev) {
2486			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2487						  ip_hdr(skb)->protocol);
2488			if (our
2489#ifdef CONFIG_IP_MROUTE
2490				||
2491			    (!ipv4_is_local_multicast(daddr) &&
2492			     IN_DEV_MFORWARD(in_dev))
2493#endif
2494			   ) {
2495				int res = ip_route_input_mc(skb, daddr, saddr,
2496							    tos, dev, our);
2497				rcu_read_unlock();
2498				return res;
2499			}
2500		}
2501		rcu_read_unlock();
2502		return -EINVAL;
2503	}
2504	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2505	rcu_read_unlock();
2506	return res;
2507}
2508EXPORT_SYMBOL(ip_route_input_common);
2509
2510/* called with rcu_read_lock() */
2511static struct rtable *__mkroute_output(const struct fib_result *res,
2512				       const struct flowi4 *fl4,
2513				       __be32 orig_daddr, __be32 orig_saddr,
2514				       int orig_oif, __u8 orig_rtos,
2515				       struct net_device *dev_out,
2516				       unsigned int flags)
2517{
2518	struct fib_info *fi = res->fi;
 
2519	struct in_device *in_dev;
2520	u16 type = res->type;
2521	struct rtable *rth;
 
2522
2523	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
 
2524		return ERR_PTR(-EINVAL);
2525
 
 
 
 
 
 
2526	if (ipv4_is_lbcast(fl4->daddr))
2527		type = RTN_BROADCAST;
2528	else if (ipv4_is_multicast(fl4->daddr))
2529		type = RTN_MULTICAST;
2530	else if (ipv4_is_zeronet(fl4->daddr))
2531		return ERR_PTR(-EINVAL);
2532
2533	if (dev_out->flags & IFF_LOOPBACK)
2534		flags |= RTCF_LOCAL;
2535
2536	in_dev = __in_dev_get_rcu(dev_out);
2537	if (!in_dev)
2538		return ERR_PTR(-EINVAL);
2539
2540	if (type == RTN_BROADCAST) {
2541		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2542		fi = NULL;
2543	} else if (type == RTN_MULTICAST) {
2544		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2545		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2546				     fl4->flowi4_proto))
2547			flags &= ~RTCF_LOCAL;
 
 
2548		/* If multicast route do not exist use
2549		 * default one, but do not gateway in this case.
2550		 * Yes, it is hack.
2551		 */
2552		if (fi && res->prefixlen < 4)
2553			fi = NULL;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2554	}
2555
2556	rth = rt_dst_alloc(dev_out,
 
2557			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2558			   IN_DEV_CONF_GET(in_dev, NOXFRM));
 
2559	if (!rth)
2560		return ERR_PTR(-ENOBUFS);
2561
2562	rth->dst.output = ip_output;
2563
2564	rth->rt_key_dst	= orig_daddr;
2565	rth->rt_key_src	= orig_saddr;
2566	rth->rt_genid = rt_genid(dev_net(dev_out));
2567	rth->rt_flags	= flags;
2568	rth->rt_type	= type;
2569	rth->rt_key_tos	= orig_rtos;
2570	rth->rt_dst	= fl4->daddr;
2571	rth->rt_src	= fl4->saddr;
2572	rth->rt_route_iif = 0;
2573	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2574	rth->rt_oif	= orig_oif;
2575	rth->rt_mark    = fl4->flowi4_mark;
2576	rth->rt_gateway = fl4->daddr;
2577	rth->rt_spec_dst= fl4->saddr;
2578	rth->rt_peer_genid = 0;
2579	rth->peer = NULL;
2580	rth->fi = NULL;
2581
2582	RT_CACHE_STAT_INC(out_slow_tot);
2583
2584	if (flags & RTCF_LOCAL) {
2585		rth->dst.input = ip_local_deliver;
2586		rth->rt_spec_dst = fl4->daddr;
2587	}
2588	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2589		rth->rt_spec_dst = fl4->saddr;
2590		if (flags & RTCF_LOCAL &&
2591		    !(dev_out->flags & IFF_LOOPBACK)) {
2592			rth->dst.output = ip_mc_output;
2593			RT_CACHE_STAT_INC(out_slow_mc);
2594		}
2595#ifdef CONFIG_IP_MROUTE
2596		if (type == RTN_MULTICAST) {
2597			if (IN_DEV_MFORWARD(in_dev) &&
2598			    !ipv4_is_local_multicast(fl4->daddr)) {
2599				rth->dst.input = ip_mr_input;
2600				rth->dst.output = ip_mc_output;
2601			}
2602		}
2603#endif
2604	}
2605
2606	rt_set_nexthop(rth, fl4, res, fi, type, 0);
 
2607
2608	return rth;
2609}
2610
2611/*
2612 * Major route resolver routine.
2613 * called with rcu_read_lock();
2614 */
2615
2616static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 
2617{
2618	struct net_device *dev_out = NULL;
2619	__u8 tos = RT_FL_TOS(fl4);
2620	unsigned int flags = 0;
2621	struct fib_result res;
 
 
 
 
2622	struct rtable *rth;
2623	__be32 orig_daddr;
2624	__be32 orig_saddr;
2625	int orig_oif;
2626
2627	res.fi		= NULL;
2628#ifdef CONFIG_IP_MULTIPLE_TABLES
2629	res.r		= NULL;
2630#endif
2631
2632	orig_daddr = fl4->daddr;
2633	orig_saddr = fl4->saddr;
2634	orig_oif = fl4->flowi4_oif;
2635
2636	fl4->flowi4_iif = net->loopback_dev->ifindex;
2637	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2638	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2639			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2640
2641	rcu_read_lock();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2642	if (fl4->saddr) {
2643		rth = ERR_PTR(-EINVAL);
2644		if (ipv4_is_multicast(fl4->saddr) ||
2645		    ipv4_is_lbcast(fl4->saddr) ||
2646		    ipv4_is_zeronet(fl4->saddr))
2647			goto out;
2648
2649		/* I removed check for oif == dev_out->oif here.
2650		   It was wrong for two reasons:
2651		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2652		      is assigned to multiple interfaces.
2653		   2. Moreover, we are allowed to send packets with saddr
2654		      of another iface. --ANK
2655		 */
2656
2657		if (fl4->flowi4_oif == 0 &&
2658		    (ipv4_is_multicast(fl4->daddr) ||
2659		     ipv4_is_lbcast(fl4->daddr))) {
2660			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2661			dev_out = __ip_dev_find(net, fl4->saddr, false);
2662			if (dev_out == NULL)
2663				goto out;
2664
2665			/* Special hack: user can direct multicasts
2666			   and limited broadcast via necessary interface
2667			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2668			   This hack is not just for fun, it allows
2669			   vic,vat and friends to work.
2670			   They bind socket to loopback, set ttl to zero
2671			   and expect that it will work.
2672			   From the viewpoint of routing cache they are broken,
2673			   because we are not allowed to build multicast path
2674			   with loopback source addr (look, routing cache
2675			   cannot know, that ttl is zero, so that packet
2676			   will not leave this host and route is valid).
2677			   Luckily, this hack is good workaround.
2678			 */
2679
2680			fl4->flowi4_oif = dev_out->ifindex;
2681			goto make_route;
2682		}
2683
2684		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2685			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2686			if (!__ip_dev_find(net, fl4->saddr, false))
2687				goto out;
2688		}
2689	}
2690
2691
2692	if (fl4->flowi4_oif) {
2693		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2694		rth = ERR_PTR(-ENODEV);
2695		if (dev_out == NULL)
2696			goto out;
2697
2698		/* RACE: Check return value of inet_select_addr instead. */
2699		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2700			rth = ERR_PTR(-ENETUNREACH);
2701			goto out;
2702		}
2703		if (ipv4_is_local_multicast(fl4->daddr) ||
2704		    ipv4_is_lbcast(fl4->daddr)) {
 
2705			if (!fl4->saddr)
2706				fl4->saddr = inet_select_addr(dev_out, 0,
2707							      RT_SCOPE_LINK);
2708			goto make_route;
2709		}
2710		if (fl4->saddr) {
2711			if (ipv4_is_multicast(fl4->daddr))
2712				fl4->saddr = inet_select_addr(dev_out, 0,
2713							      fl4->flowi4_scope);
2714			else if (!fl4->daddr)
2715				fl4->saddr = inet_select_addr(dev_out, 0,
2716							      RT_SCOPE_HOST);
2717		}
2718	}
2719
2720	if (!fl4->daddr) {
2721		fl4->daddr = fl4->saddr;
2722		if (!fl4->daddr)
2723			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2724		dev_out = net->loopback_dev;
2725		fl4->flowi4_oif = net->loopback_dev->ifindex;
2726		res.type = RTN_LOCAL;
2727		flags |= RTCF_LOCAL;
2728		goto make_route;
2729	}
2730
2731	if (fib_lookup(net, fl4, &res)) {
2732		res.fi = NULL;
2733		if (fl4->flowi4_oif) {
 
 
 
 
2734			/* Apparently, routing tables are wrong. Assume,
2735			   that the destination is on link.
2736
2737			   WHY? DW.
2738			   Because we are allowed to send to iface
2739			   even if it has NO routes and NO assigned
2740			   addresses. When oif is specified, routing
2741			   tables are looked up with only one purpose:
2742			   to catch if destination is gatewayed, rather than
2743			   direct. Moreover, if MSG_DONTROUTE is set,
2744			   we send packet, ignoring both routing tables
2745			   and ifaddr state. --ANK
2746
2747
2748			   We could make it even if oif is unknown,
2749			   likely IPv6, but we do not.
2750			 */
2751
2752			if (fl4->saddr == 0)
2753				fl4->saddr = inet_select_addr(dev_out, 0,
2754							      RT_SCOPE_LINK);
2755			res.type = RTN_UNICAST;
2756			goto make_route;
2757		}
2758		rth = ERR_PTR(-ENETUNREACH);
2759		goto out;
2760	}
2761
2762	if (res.type == RTN_LOCAL) {
2763		if (!fl4->saddr) {
2764			if (res.fi->fib_prefsrc)
2765				fl4->saddr = res.fi->fib_prefsrc;
2766			else
2767				fl4->saddr = fl4->daddr;
2768		}
2769		dev_out = net->loopback_dev;
 
 
 
 
 
 
 
 
 
2770		fl4->flowi4_oif = dev_out->ifindex;
2771		res.fi = NULL;
2772		flags |= RTCF_LOCAL;
2773		goto make_route;
2774	}
2775
2776#ifdef CONFIG_IP_ROUTE_MULTIPATH
2777	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2778		fib_select_multipath(&res);
2779	else
2780#endif
2781	if (!res.prefixlen &&
2782	    res.table->tb_num_default > 1 &&
2783	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2784		fib_select_default(&res);
2785
2786	if (!fl4->saddr)
2787		fl4->saddr = FIB_RES_PREFSRC(net, res);
2788
2789	dev_out = FIB_RES_DEV(res);
2790	fl4->flowi4_oif = dev_out->ifindex;
2791
2792
2793make_route:
2794	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2795			       tos, dev_out, flags);
2796	if (!IS_ERR(rth)) {
2797		unsigned int hash;
2798
2799		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2800			       rt_genid(dev_net(dev_out)));
2801		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2802	}
2803
2804out:
2805	rcu_read_unlock();
2806	return rth;
2807}
2808
2809struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2810{
2811	struct rtable *rth;
2812	unsigned int hash;
2813
2814	if (!rt_caching(net))
2815		goto slow_output;
2816
2817	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2818
2819	rcu_read_lock_bh();
2820	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2821		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2822		if (rth->rt_key_dst == flp4->daddr &&
2823		    rth->rt_key_src == flp4->saddr &&
2824		    rt_is_output_route(rth) &&
2825		    rth->rt_oif == flp4->flowi4_oif &&
2826		    rth->rt_mark == flp4->flowi4_mark &&
2827		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2828			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2829		    net_eq(dev_net(rth->dst.dev), net) &&
2830		    !rt_is_expired(rth)) {
2831			ipv4_validate_peer(rth);
2832			dst_use(&rth->dst, jiffies);
2833			RT_CACHE_STAT_INC(out_hit);
2834			rcu_read_unlock_bh();
2835			if (!flp4->saddr)
2836				flp4->saddr = rth->rt_src;
2837			if (!flp4->daddr)
2838				flp4->daddr = rth->rt_dst;
2839			return rth;
2840		}
2841		RT_CACHE_STAT_INC(out_hlist_search);
2842	}
2843	rcu_read_unlock_bh();
2844
2845slow_output:
2846	return ip_route_output_slow(net, flp4);
2847}
2848EXPORT_SYMBOL_GPL(__ip_route_output_key);
2849
2850static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2851{
2852	return NULL;
2853}
2854
2855static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2856{
2857	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2858
2859	return mtu ? : dst->dev->mtu;
2860}
2861
2862static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 
 
 
 
 
 
2863{
2864}
2865
2866static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2867					  unsigned long old)
2868{
2869	return NULL;
2870}
2871
/*
 * dst_ops used for the entries created by ipv4_blackhole_route().
 * The check, mtu, update_pmtu and cow_metrics hooks are the blackhole
 * variants defined just above; destroy, default_advmss and neigh_lookup
 * reuse the regular IPv4 handlers.
 */
2872static struct dst_ops ipv4_dst_blackhole_ops = {
2873	.family			=	AF_INET,
2874	.protocol		=	cpu_to_be16(ETH_P_IP),
2875	.destroy		=	ipv4_dst_destroy,
2876	.check			=	ipv4_blackhole_dst_check,
2877	.mtu			=	ipv4_blackhole_mtu,
2878	.default_advmss		=	ipv4_default_advmss,
2879	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
 
2880	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2881	.neigh_lookup		=	ipv4_neigh_lookup,
2882};
2883
2884struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2885{
2886	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2887	struct rtable *ort = (struct rtable *) dst_orig;
 
2888
 
2889	if (rt) {
2890		struct dst_entry *new = &rt->dst;
2891
2892		new->__use = 1;
2893		new->input = dst_discard;
2894		new->output = dst_discard;
2895		dst_copy_metrics(new, &ort->dst);
2896
2897		new->dev = ort->dst.dev;
2898		if (new->dev)
2899			dev_hold(new->dev);
2900
2901		rt->rt_key_dst = ort->rt_key_dst;
2902		rt->rt_key_src = ort->rt_key_src;
2903		rt->rt_key_tos = ort->rt_key_tos;
2904		rt->rt_route_iif = ort->rt_route_iif;
2905		rt->rt_iif = ort->rt_iif;
2906		rt->rt_oif = ort->rt_oif;
2907		rt->rt_mark = ort->rt_mark;
2908
2909		rt->rt_genid = rt_genid(net);
2910		rt->rt_flags = ort->rt_flags;
2911		rt->rt_type = ort->rt_type;
2912		rt->rt_dst = ort->rt_dst;
2913		rt->rt_src = ort->rt_src;
2914		rt->rt_gateway = ort->rt_gateway;
2915		rt->rt_spec_dst = ort->rt_spec_dst;
2916		rt->peer = ort->peer;
2917		if (rt->peer)
2918			atomic_inc(&rt->peer->refcnt);
2919		rt->fi = ort->fi;
2920		if (rt->fi)
2921			atomic_inc(&rt->fi->fib_clntref);
2922
2923		dst_free(new);
2924	}
2925
2926	dst_release(dst_orig);
2927
2928	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2929}
2930
2931struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2932				    struct sock *sk)
2933{
2934	struct rtable *rt = __ip_route_output_key(net, flp4);
2935
2936	if (IS_ERR(rt))
2937		return rt;
2938
2939	if (flp4->flowi4_proto)
2940		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2941						   flowi4_to_flowi(flp4),
2942						   sk, 0);
2943
2944	return rt;
2945}
2946EXPORT_SYMBOL_GPL(ip_route_output_flow);
2947
2948static int rt_fill_info(struct net *net,
2949			struct sk_buff *skb, u32 pid, u32 seq, int event,
2950			int nowait, unsigned int flags)
 
2951{
2952	struct rtable *rt = skb_rtable(skb);
2953	struct rtmsg *r;
2954	struct nlmsghdr *nlh;
2955	unsigned long expires = 0;
2956	const struct inet_peer *peer = rt->peer;
2957	u32 id = 0, ts = 0, tsage = 0, error;
2958
2959	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2960	if (nlh == NULL)
2961		return -EMSGSIZE;
2962
2963	r = nlmsg_data(nlh);
2964	r->rtm_family	 = AF_INET;
2965	r->rtm_dst_len	= 32;
2966	r->rtm_src_len	= 0;
2967	r->rtm_tos	= rt->rt_key_tos;
2968	r->rtm_table	= RT_TABLE_MAIN;
2969	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2970		goto nla_put_failure;
2971	r->rtm_type	= rt->rt_type;
2972	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2973	r->rtm_protocol = RTPROT_UNSPEC;
2974	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2975	if (rt->rt_flags & RTCF_NOTIFY)
2976		r->rtm_flags |= RTM_F_NOTIFY;
 
 
2977
2978	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2979		goto nla_put_failure;
2980	if (rt->rt_key_src) {
2981		r->rtm_src_len = 32;
2982		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2983			goto nla_put_failure;
2984	}
2985	if (rt->dst.dev &&
2986	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2987		goto nla_put_failure;
2988#ifdef CONFIG_IP_ROUTE_CLASSID
2989	if (rt->dst.tclassid &&
2990	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2991		goto nla_put_failure;
2992#endif
2993	if (rt_is_input_route(rt)) {
2994		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
2995			goto nla_put_failure;
2996	} else if (rt->rt_src != rt->rt_key_src) {
2997		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2998			goto nla_put_failure;
2999	}
3000	if (rt->rt_dst != rt->rt_gateway &&
3001	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3002		goto nla_put_failure;
3003
3004	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
 
3005		goto nla_put_failure;
3006
3007	if (rt->rt_mark &&
3008	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
 
3009		goto nla_put_failure;
3010
3011	error = rt->dst.error;
3012	if (peer) {
3013		inet_peer_refcheck(rt->peer);
3014		id = atomic_read(&peer->ip_id_count) & 0xffff;
3015		if (peer->tcp_ts_stamp) {
3016			ts = peer->tcp_ts;
3017			tsage = get_seconds() - peer->tcp_ts_stamp;
3018		}
3019		expires = ACCESS_ONCE(peer->pmtu_expires);
3020		if (expires) {
3021			if (time_before(jiffies, expires))
3022				expires -= jiffies;
3023			else
3024				expires = 0;
3025		}
3026	}
3027
3028	if (rt_is_input_route(rt)) {
3029#ifdef CONFIG_IP_MROUTE
3030		__be32 dst = rt->rt_dst;
3031
3032		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3033		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3034			int err = ipmr_get_route(net, skb,
3035						 rt->rt_src, rt->rt_dst,
3036						 r, nowait);
 
3037			if (err <= 0) {
3038				if (!nowait) {
3039					if (err == 0)
3040						return 0;
3041					goto nla_put_failure;
3042				} else {
3043					if (err == -EMSGSIZE)
3044						goto nla_put_failure;
3045					error = err;
3046				}
3047			}
3048		} else
3049#endif
3050			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3051				goto nla_put_failure;
3052	}
3053
3054	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3055			       expires, error) < 0)
3056		goto nla_put_failure;
3057
3058	return nlmsg_end(skb, nlh);
 
3059
3060nla_put_failure:
3061	nlmsg_cancel(skb, nlh);
3062	return -EMSGSIZE;
3063}
3064
3065static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
 
3066{
3067	struct net *net = sock_net(in_skb->sk);
3068	struct rtmsg *rtm;
3069	struct nlattr *tb[RTA_MAX+1];
 
3070	struct rtable *rt = NULL;
 
3071	__be32 dst = 0;
3072	__be32 src = 0;
3073	u32 iif;
3074	int err;
3075	int mark;
3076	struct sk_buff *skb;
 
 
3077
3078	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
 
3079	if (err < 0)
3080		goto errout;
3081
3082	rtm = nlmsg_data(nlh);
3083
3084	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3085	if (skb == NULL) {
3086		err = -ENOBUFS;
3087		goto errout;
3088	}
3089
3090	/* Reserve room for dummy headers, this skb can pass
3091	   through good chunk of routing engine.
3092	 */
3093	skb_reset_mac_header(skb);
3094	skb_reset_network_header(skb);
3095
3096	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3097	ip_hdr(skb)->protocol = IPPROTO_ICMP;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3098	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3099
3100	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3101	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3102	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3103	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
 
 
 
 
3104
3105	if (iif) {
3106		struct net_device *dev;
3107
3108		dev = __dev_get_by_index(net, iif);
3109		if (dev == NULL) {
3110			err = -ENODEV;
3111			goto errout_free;
3112		}
3113
3114		skb->protocol	= htons(ETH_P_IP);
3115		skb->dev	= dev;
3116		skb->mark	= mark;
3117		local_bh_disable();
3118		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3119		local_bh_enable();
3120
3121		rt = skb_rtable(skb);
3122		if (err == 0 && rt->dst.error)
3123			err = -rt->dst.error;
3124	} else {
3125		struct flowi4 fl4 = {
3126			.daddr = dst,
3127			.saddr = src,
3128			.flowi4_tos = rtm->rtm_tos,
3129			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3130			.flowi4_mark = mark,
3131		};
3132		rt = ip_route_output_key(net, &fl4);
3133
3134		err = 0;
3135		if (IS_ERR(rt))
3136			err = PTR_ERR(rt);
 
 
3137	}
3138
3139	if (err)
3140		goto errout_free;
3141
3142	skb_dst_set(skb, &rt->dst);
3143	if (rtm->rtm_flags & RTM_F_NOTIFY)
3144		rt->rt_flags |= RTCF_NOTIFY;
3145
3146	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3147			   RTM_NEWROUTE, 0, 0);
3148	if (err <= 0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3149		goto errout_free;
3150
3151	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
 
 
3152errout:
3153	return err;
3154
3155errout_free:
 
3156	kfree_skb(skb);
3157	goto errout;
3158}
3159
3160int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3161{
3162	struct rtable *rt;
3163	int h, s_h;
3164	int idx, s_idx;
3165	struct net *net;
3166
3167	net = sock_net(skb->sk);
3168
3169	s_h = cb->args[0];
3170	if (s_h < 0)
3171		s_h = 0;
3172	s_idx = idx = cb->args[1];
3173	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3174		if (!rt_hash_table[h].chain)
3175			continue;
3176		rcu_read_lock_bh();
3177		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3178		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3179			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3180				continue;
3181			if (rt_is_expired(rt))
3182				continue;
3183			skb_dst_set_noref(skb, &rt->dst);
3184			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3185					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3186					 1, NLM_F_MULTI) <= 0) {
3187				skb_dst_drop(skb);
3188				rcu_read_unlock_bh();
3189				goto done;
3190			}
3191			skb_dst_drop(skb);
3192		}
3193		rcu_read_unlock_bh();
3194	}
3195
3196done:
3197	cb->args[0] = h;
3198	cb->args[1] = idx;
3199	return skb->len;
3200}
3201
3202void ip_rt_multicast_event(struct in_device *in_dev)
3203{
3204	rt_cache_flush(dev_net(in_dev->dev), 0);
3205}
3206
3207#ifdef CONFIG_SYSCTL
3208static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 
 
 
 
 
3209					void __user *buffer,
3210					size_t *lenp, loff_t *ppos)
3211{
 
 
3212	if (write) {
3213		int flush_delay;
3214		ctl_table ctl;
3215		struct net *net;
3216
3217		memcpy(&ctl, __ctl, sizeof(ctl));
3218		ctl.data = &flush_delay;
3219		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3220
3221		net = (struct net *)__ctl->extra1;
3222		rt_cache_flush(net, flush_delay);
3223		return 0;
3224	}
3225
3226	return -EINVAL;
3227}
3228
/*
 * Sysctl entries exposing the route cache tunables.  Every entry is an
 * int with mode 0644; the proc_handler selects how the value is presented
 * (plain integer, or converted to/from jiffies).
 *
 * Note that "gc_min_interval" and "gc_min_interval_ms" are two views of
 * the same variable (ip_rt_gc_min_interval), differing only in handler.
 */
3229static ctl_table ipv4_route_table[] = {
3230	{
3231		.procname	= "gc_thresh",
3232		.data		= &ipv4_dst_ops.gc_thresh,
3233		.maxlen		= sizeof(int),
3234		.mode		= 0644,
3235		.proc_handler	= proc_dointvec,
3236	},
3237	{
3238		.procname	= "max_size",
3239		.data		= &ip_rt_max_size,
3240		.maxlen		= sizeof(int),
3241		.mode		= 0644,
3242		.proc_handler	= proc_dointvec,
3243	},
3244	{
3245		/*  Deprecated. Use gc_min_interval_ms */
3246
3247		.procname	= "gc_min_interval",
3248		.data		= &ip_rt_gc_min_interval,
3249		.maxlen		= sizeof(int),
3250		.mode		= 0644,
3251		.proc_handler	= proc_dointvec_jiffies,
3252	},
3253	{
3254		.procname	= "gc_min_interval_ms",
3255		.data		= &ip_rt_gc_min_interval,
3256		.maxlen		= sizeof(int),
3257		.mode		= 0644,
3258		.proc_handler	= proc_dointvec_ms_jiffies,
3259	},
3260	{
3261		.procname	= "gc_timeout",
3262		.data		= &ip_rt_gc_timeout,
3263		.maxlen		= sizeof(int),
3264		.mode		= 0644,
3265		.proc_handler	= proc_dointvec_jiffies,
3266	},
3267	{
3268		.procname	= "gc_interval",
3269		.data		= &ip_rt_gc_interval,
3270		.maxlen		= sizeof(int),
3271		.mode		= 0644,
3272		.proc_handler	= proc_dointvec_jiffies,
3273	},
3274	{
3275		.procname	= "redirect_load",
3276		.data		= &ip_rt_redirect_load,
3277		.maxlen		= sizeof(int),
3278		.mode		= 0644,
3279		.proc_handler	= proc_dointvec,
3280	},
3281	{
3282		.procname	= "redirect_number",
3283		.data		= &ip_rt_redirect_number,
3284		.maxlen		= sizeof(int),
3285		.mode		= 0644,
3286		.proc_handler	= proc_dointvec,
3287	},
3288	{
3289		.procname	= "redirect_silence",
3290		.data		= &ip_rt_redirect_silence,
3291		.maxlen		= sizeof(int),
3292		.mode		= 0644,
3293		.proc_handler	= proc_dointvec,
3294	},
3295	{
3296		.procname	= "error_cost",
3297		.data		= &ip_rt_error_cost,
3298		.maxlen		= sizeof(int),
3299		.mode		= 0644,
3300		.proc_handler	= proc_dointvec,
3301	},
3302	{
3303		.procname	= "error_burst",
3304		.data		= &ip_rt_error_burst,
3305		.maxlen		= sizeof(int),
3306		.mode		= 0644,
3307		.proc_handler	= proc_dointvec,
3308	},
3309	{
3310		.procname	= "gc_elasticity",
3311		.data		= &ip_rt_gc_elasticity,
3312		.maxlen		= sizeof(int),
3313		.mode		= 0644,
3314		.proc_handler	= proc_dointvec,
3315	},
3316	{
3317		.procname	= "mtu_expires",
3318		.data		= &ip_rt_mtu_expires,
3319		.maxlen		= sizeof(int),
3320		.mode		= 0644,
3321		.proc_handler	= proc_dointvec_jiffies,
3322	},
3323	{
3324		.procname	= "min_pmtu",
3325		.data		= &ip_rt_min_pmtu,
3326		.maxlen		= sizeof(int),
3327		.mode		= 0644,
3328		.proc_handler	= proc_dointvec,
 
3329	},
3330	{
3331		.procname	= "min_adv_mss",
3332		.data		= &ip_rt_min_advmss,
3333		.maxlen		= sizeof(int),
3334		.mode		= 0644,
3335		.proc_handler	= proc_dointvec,
3336	},
	/* Empty terminating entry. */
3337	{ }
3338};
3339
3340static struct ctl_table ipv4_route_flush_table[] = {
3341	{
3342		.procname	= "flush",
3343		.maxlen		= sizeof(int),
3344		.mode		= 0200,
3345		.proc_handler	= ipv4_sysctl_rtcache_flush,
3346	},
3347	{ },
3348};
3349
3350static __net_init int sysctl_route_net_init(struct net *net)
3351{
3352	struct ctl_table *tbl;
3353
3354	tbl = ipv4_route_flush_table;
3355	if (!net_eq(net, &init_net)) {
3356		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3357		if (tbl == NULL)
3358			goto err_dup;
 
 
 
 
3359	}
3360	tbl[0].extra1 = net;
3361
3362	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3363	if (net->ipv4.route_hdr == NULL)
3364		goto err_reg;
3365	return 0;
3366
3367err_reg:
3368	if (tbl != ipv4_route_flush_table)
3369		kfree(tbl);
3370err_dup:
3371	return -ENOMEM;
3372}
3373
3374static __net_exit void sysctl_route_net_exit(struct net *net)
3375{
3376	struct ctl_table *tbl;
3377
3378	tbl = net->ipv4.route_hdr->ctl_table_arg;
3379	unregister_net_sysctl_table(net->ipv4.route_hdr);
3380	BUG_ON(tbl == ipv4_route_flush_table);
3381	kfree(tbl);
3382}
3383
3384static __net_initdata struct pernet_operations sysctl_route_ops = {
3385	.init = sysctl_route_net_init,
3386	.exit = sysctl_route_net_exit,
3387};
3388#endif
3389
3390static __net_init int rt_genid_init(struct net *net)
3391{
3392	get_random_bytes(&net->ipv4.rt_genid,
3393			 sizeof(net->ipv4.rt_genid));
3394	get_random_bytes(&net->ipv4.dev_addr_genid,
3395			 sizeof(net->ipv4.dev_addr_genid));
3396	return 0;
3397}
3398
3399static __net_initdata struct pernet_operations rt_genid_ops = {
3400	.init = rt_genid_init,
3401};
3402
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3403
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting array; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3407
3408static __initdata unsigned long rhash_entries;
3409static int __init set_rhash_entries(char *str)
3410{
3411	ssize_t ret;
3412
3413	if (!str)
3414		return 0;
 
3415
3416	ret = kstrtoul(str, 0, &rhash_entries);
3417	if (ret)
3418		return 0;
3419
3420	return 1;
3421}
3422__setup("rhash_entries=", set_rhash_entries);
3423
3424int __init ip_rt_init(void)
3425{
3426	int rc = 0;
3427
 
 
 
3428#ifdef CONFIG_IP_ROUTE_CLASSID
3429	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3430	if (!ip_rt_acct)
3431		panic("IP: failed to allocate ip_rt_acct\n");
3432#endif
3433
3434	ipv4_dst_ops.kmem_cachep =
3435		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3436				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3437
3438	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3439
3440	if (dst_entries_init(&ipv4_dst_ops) < 0)
3441		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3442
3443	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3444		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3445
3446	rt_hash_table = (struct rt_hash_bucket *)
3447		alloc_large_system_hash("IP route cache",
3448					sizeof(struct rt_hash_bucket),
3449					rhash_entries,
3450					(totalram_pages >= 128 * 1024) ?
3451					15 : 17,
3452					0,
3453					&rt_hash_log,
3454					&rt_hash_mask,
3455					0,
3456					rhash_entries ? 0 : 512 * 1024);
3457	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3458	rt_hash_lock_init();
3459
3460	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3461	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3462
3463	devinet_init();
3464	ip_fib_init();
3465
3466	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3467	expires_ljiffies = jiffies;
3468	schedule_delayed_work(&expires_work,
3469		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3470
3471	if (ip_rt_proc_init())
3472		pr_err("Unable to create route proc files\n");
3473#ifdef CONFIG_XFRM
3474	xfrm_init();
3475	xfrm4_init(ip_rt_max_size);
3476#endif
3477	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
 
3478
3479#ifdef CONFIG_SYSCTL
3480	register_pernet_subsys(&sysctl_route_ops);
3481#endif
3482	register_pernet_subsys(&rt_genid_ops);
3483	return rc;
 
3484}
3485
#ifdef CONFIG_SYSCTL
/*
 * Register ipv4_route_table for init_net under net/ipv4/route.
 *
 * This lives in its own init function only because of the current ipv4
 * initialisation order; once that order is cleaned up, this workaround
 * can go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif /* CONFIG_SYSCTL */