   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		ROUTE - implementation of the IP router.
   8 *
   9 * Authors:	Ross Biro
  10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14 *
  15 * Fixes:
  16 *		Alan Cox	:	Verify area fixes.
  17 *		Alan Cox	:	cli() protects routing changes
  18 *		Rui Oliveira	:	ICMP routing table updates
  19 *		(rco@di.uminho.pt)	Routing table insertion and update
  20 *		Linus Torvalds	:	Rewrote bits to be sensible
  21 *		Alan Cox	:	Added BSD route gw semantics
  22 *		Alan Cox	:	Super /proc >4K
  23 *		Alan Cox	:	MTU in route table
  24 *		Alan Cox	: 	MSS actually. Also added the window
  25 *					clamper.
  26 *		Sam Lantinga	:	Fixed route matching in rt_del()
  27 *		Alan Cox	:	Routing cache support.
  28 *		Alan Cox	:	Removed compatibility cruft.
  29 *		Alan Cox	:	RTF_REJECT support.
  30 *		Alan Cox	:	TCP irtt support.
  31 *		Jonathan Naylor	:	Added Metric support.
  32 *	Miquel van Smoorenburg	:	BSD API fixes.
  33 *	Miquel van Smoorenburg	:	Metrics.
  34 *		Alan Cox	:	Use __u32 properly
  35 *		Alan Cox	:	Aligned routing errors more closely with BSD
  36 *					our system is still very different.
  37 *		Alan Cox	:	Faster /proc handling
  38 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  39 *					routing caches and better behaviour.
  40 *
  41 *		Olaf Erb	:	irtt wasn't being copied right.
  42 *		Bjorn Ekwall	:	Kerneld route support.
  43 *		Alan Cox	:	Multicast fixed (I hope)
  44 * 		Pavel Krauz	:	Limited broadcast fixed
  45 *		Mike McLagan	:	Routing by source
  46 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  47 *					route.c and rewritten from scratch.
  48 *		Andi Kleen	:	Load-limit warning messages.
  49 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  50 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  51 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  52 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  53 *		Marc Boucher	:	routing by fwmark
  54 *	Robert Olsson		:	Added rt_cache statistics
  55 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  56 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  57 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  58 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  59 */
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
  64#include <linux/uaccess.h>
  65#include <linux/bitops.h>
  66#include <linux/types.h>
  67#include <linux/kernel.h>
  68#include <linux/mm.h>
  69#include <linux/string.h>
  70#include <linux/socket.h>
  71#include <linux/sockios.h>
  72#include <linux/errno.h>
  73#include <linux/in.h>
  74#include <linux/inet.h>
  75#include <linux/netdevice.h>
  76#include <linux/proc_fs.h>
  77#include <linux/init.h>
  78#include <linux/skbuff.h>
  79#include <linux/inetdevice.h>
  80#include <linux/igmp.h>
  81#include <linux/pkt_sched.h>
  82#include <linux/mroute.h>
  83#include <linux/netfilter_ipv4.h>
  84#include <linux/random.h>
  85#include <linux/rcupdate.h>
  86#include <linux/times.h>
  87#include <linux/slab.h>
  88#include <linux/jhash.h>
  89#include <net/dst.h>
  90#include <net/dst_metadata.h>
  91#include <net/net_namespace.h>
  92#include <net/protocol.h>
  93#include <net/ip.h>
  94#include <net/route.h>
  95#include <net/inetpeer.h>
  96#include <net/sock.h>
  97#include <net/ip_fib.h>
  98#include <net/nexthop.h>
  99#include <net/arp.h>
 100#include <net/tcp.h>
 101#include <net/icmp.h>
 102#include <net/xfrm.h>
 103#include <net/lwtunnel.h>
 104#include <net/netevent.h>
 105#include <net/rtnetlink.h>
 106#ifdef CONFIG_SYSCTL
 107#include <linux/sysctl.h>
 108#endif
 109#include <net/secure_seq.h>
 110#include <net/ip_tunnels.h>
 111#include <net/l3mdev.h>
 112
 113#include "fib_lookup.h"
 114
 115#define RT_FL_TOS(oldflp4) \
 116	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 117
 118#define RT_GC_TIMEOUT (300*HZ)
 119
 120static int ip_rt_max_size;
 121static int ip_rt_redirect_number __read_mostly	= 9;
 122static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 123static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 124static int ip_rt_error_cost __read_mostly	= HZ;
 125static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 126static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 127static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 128static int ip_rt_min_advmss __read_mostly	= 256;
 129
 130static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 131
 132/*
 133 *	Interface to generic destination cache.
 134 */
 135
 136static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 137static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 138static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 139static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 140static void		 ipv4_link_failure(struct sk_buff *skb);
 141static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 142					   struct sk_buff *skb, u32 mtu,
 143					   bool confirm_neigh);
 144static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145					struct sk_buff *skb);
 146static void		ipv4_dst_destroy(struct dst_entry *dst);
 147
 148static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 149{
 150	WARN_ON(1);
 151	return NULL;
 152}
 153
 154static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 155					   struct sk_buff *skb,
 156					   const void *daddr);
 157static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 158
 159static struct dst_ops ipv4_dst_ops = {
 160	.family =		AF_INET,
 161	.check =		ipv4_dst_check,
 162	.default_advmss =	ipv4_default_advmss,
 163	.mtu =			ipv4_mtu,
 164	.cow_metrics =		ipv4_cow_metrics,
 165	.destroy =		ipv4_dst_destroy,
 166	.negative_advice =	ipv4_negative_advice,
 167	.link_failure =		ipv4_link_failure,
 168	.update_pmtu =		ip_rt_update_pmtu,
 169	.redirect =		ip_do_redirect,
 170	.local_out =		__ip_local_out,
 171	.neigh_lookup =		ipv4_neigh_lookup,
 172	.confirm_neigh =	ipv4_confirm_neigh,
 173};
 174
 175#define ECN_OR_COST(class)	TC_PRIO_##class
 176
 177const __u8 ip_tos2prio[16] = {
 178	TC_PRIO_BESTEFFORT,
 179	ECN_OR_COST(BESTEFFORT),
 180	TC_PRIO_BESTEFFORT,
 181	ECN_OR_COST(BESTEFFORT),
 182	TC_PRIO_BULK,
 183	ECN_OR_COST(BULK),
 184	TC_PRIO_BULK,
 185	ECN_OR_COST(BULK),
 186	TC_PRIO_INTERACTIVE,
 187	ECN_OR_COST(INTERACTIVE),
 188	TC_PRIO_INTERACTIVE,
 189	ECN_OR_COST(INTERACTIVE),
 190	TC_PRIO_INTERACTIVE_BULK,
 191	ECN_OR_COST(INTERACTIVE_BULK),
 192	TC_PRIO_INTERACTIVE_BULK,
 193	ECN_OR_COST(INTERACTIVE_BULK)
 194};
 195EXPORT_SYMBOL(ip_tos2prio);
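/*
 * Annotation (not part of the original file): a minimal sketch of how the
 * ip_tos2prio[] table above is consumed.  The in-tree helper is
 * rt_tos2priority() in include/net/route.h; it drops the low (ECN) bit of
 * the TOS byte and indexes the table with the remaining four TOS bits.
 * The "example_" name below is ours, used only for illustration.
 */
static inline char example_tos2priority(u8 tos)
{
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}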
 196
 197static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 198#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 199
 200#ifdef CONFIG_PROC_FS
 201static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 202{
 203	if (*pos)
 204		return NULL;
 205	return SEQ_START_TOKEN;
 206}
 207
 208static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 209{
 210	++*pos;
 211	return NULL;
 212}
 213
 214static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 215{
 216}
 217
 218static int rt_cache_seq_show(struct seq_file *seq, void *v)
 219{
 220	if (v == SEQ_START_TOKEN)
 221		seq_printf(seq, "%-127s\n",
 222			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 223			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 224			   "HHUptod\tSpecDst");
 225	return 0;
 226}
 227
 228static const struct seq_operations rt_cache_seq_ops = {
 229	.start  = rt_cache_seq_start,
 230	.next   = rt_cache_seq_next,
 231	.stop   = rt_cache_seq_stop,
 232	.show   = rt_cache_seq_show,
 233};
 234
 235static int rt_cache_seq_open(struct inode *inode, struct file *file)
 236{
 237	return seq_open(file, &rt_cache_seq_ops);
 238}
 239
 240static const struct proc_ops rt_cache_proc_ops = {
 241	.proc_open	= rt_cache_seq_open,
 242	.proc_read	= seq_read,
 243	.proc_lseek	= seq_lseek,
 244	.proc_release	= seq_release,
 245};
 246
 247
 248static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 249{
 250	int cpu;
 251
 252	if (*pos == 0)
 253		return SEQ_START_TOKEN;
 254
 255	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 256		if (!cpu_possible(cpu))
 257			continue;
 258		*pos = cpu+1;
 259		return &per_cpu(rt_cache_stat, cpu);
 260	}
 261	return NULL;
 262}
 263
 264static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 265{
 266	int cpu;
 267
 268	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 269		if (!cpu_possible(cpu))
 270			continue;
 271		*pos = cpu+1;
 272		return &per_cpu(rt_cache_stat, cpu);
 273	}
 274	(*pos)++;
 275	return NULL;
 276
 277}
 278
 279static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 280{
 281
 282}
 283
 284static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 285{
 286	struct rt_cache_stat *st = v;
 287
 288	if (v == SEQ_START_TOKEN) {
 289		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 290		return 0;
 291	}
 292
 293	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 294		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 295		   dst_entries_get_slow(&ipv4_dst_ops),
 296		   0, /* st->in_hit */
 297		   st->in_slow_tot,
 298		   st->in_slow_mc,
 299		   st->in_no_route,
 300		   st->in_brd,
 301		   st->in_martian_dst,
 302		   st->in_martian_src,
 303
 304		   0, /* st->out_hit */
 305		   st->out_slow_tot,
 306		   st->out_slow_mc,
 307
 308		   0, /* st->gc_total */
 309		   0, /* st->gc_ignored */
 310		   0, /* st->gc_goal_miss */
 311		   0, /* st->gc_dst_overflow */
 312		   0, /* st->in_hlist_search */
 313		   0  /* st->out_hlist_search */
 314		);
 315	return 0;
 316}
 317
 318static const struct seq_operations rt_cpu_seq_ops = {
 319	.start  = rt_cpu_seq_start,
 320	.next   = rt_cpu_seq_next,
 321	.stop   = rt_cpu_seq_stop,
 322	.show   = rt_cpu_seq_show,
 323};
 324
 325
 326static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 327{
 328	return seq_open(file, &rt_cpu_seq_ops);
 329}
 330
 331static const struct proc_ops rt_cpu_proc_ops = {
 332	.proc_open	= rt_cpu_seq_open,
 333	.proc_read	= seq_read,
 334	.proc_lseek	= seq_lseek,
 335	.proc_release	= seq_release,
 336};
 337
 338#ifdef CONFIG_IP_ROUTE_CLASSID
 339static int rt_acct_proc_show(struct seq_file *m, void *v)
 340{
 341	struct ip_rt_acct *dst, *src;
 342	unsigned int i, j;
 343
 344	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 345	if (!dst)
 346		return -ENOMEM;
 347
 348	for_each_possible_cpu(i) {
 349		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 350		for (j = 0; j < 256; j++) {
 351			dst[j].o_bytes   += src[j].o_bytes;
 352			dst[j].o_packets += src[j].o_packets;
 353			dst[j].i_bytes   += src[j].i_bytes;
 354			dst[j].i_packets += src[j].i_packets;
 355		}
 356	}
 357
 358	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 359	kfree(dst);
 360	return 0;
 361}
 362#endif
 363
 364static int __net_init ip_rt_do_proc_init(struct net *net)
 365{
 366	struct proc_dir_entry *pde;
 367
 368	pde = proc_create("rt_cache", 0444, net->proc_net,
 369			  &rt_cache_proc_ops);
 370	if (!pde)
 371		goto err1;
 372
 373	pde = proc_create("rt_cache", 0444,
 374			  net->proc_net_stat, &rt_cpu_proc_ops);
 375	if (!pde)
 376		goto err2;
 377
 378#ifdef CONFIG_IP_ROUTE_CLASSID
 379	pde = proc_create_single("rt_acct", 0, net->proc_net,
 380			rt_acct_proc_show);
 381	if (!pde)
 382		goto err3;
 383#endif
 384	return 0;
 385
 386#ifdef CONFIG_IP_ROUTE_CLASSID
 387err3:
 388	remove_proc_entry("rt_cache", net->proc_net_stat);
 389#endif
 390err2:
 391	remove_proc_entry("rt_cache", net->proc_net);
 392err1:
 393	return -ENOMEM;
 394}
 395
 396static void __net_exit ip_rt_do_proc_exit(struct net *net)
 397{
 398	remove_proc_entry("rt_cache", net->proc_net_stat);
 399	remove_proc_entry("rt_cache", net->proc_net);
 400#ifdef CONFIG_IP_ROUTE_CLASSID
 401	remove_proc_entry("rt_acct", net->proc_net);
 402#endif
 403}
 404
 405static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 406	.init = ip_rt_do_proc_init,
 407	.exit = ip_rt_do_proc_exit,
 408};
 409
 410static int __init ip_rt_proc_init(void)
 411{
 412	return register_pernet_subsys(&ip_rt_proc_ops);
 413}
 414
 415#else
 416static inline int ip_rt_proc_init(void)
 417{
 418	return 0;
 419}
 420#endif /* CONFIG_PROC_FS */
 421
 422static inline bool rt_is_expired(const struct rtable *rth)
 423{
 424	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 425}
 426
 427void rt_cache_flush(struct net *net)
 428{
 429	rt_genid_bump_ipv4(net);
 430}
 431
 432static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 433					   struct sk_buff *skb,
 434					   const void *daddr)
 435{
 436	const struct rtable *rt = container_of(dst, struct rtable, dst);
 437	struct net_device *dev = dst->dev;
 438	struct neighbour *n;
 439
 440	rcu_read_lock_bh();
 441
 442	if (likely(rt->rt_gw_family == AF_INET)) {
 443		n = ip_neigh_gw4(dev, rt->rt_gw4);
 444	} else if (rt->rt_gw_family == AF_INET6) {
 445		n = ip_neigh_gw6(dev, &rt->rt_gw6);
 446        } else {
 447		__be32 pkey;
 448
 449		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 450		n = ip_neigh_gw4(dev, pkey);
 451	}
 452
 453	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 454		n = NULL;
 455
 456	rcu_read_unlock_bh();
 457
 458	return n;
 459}
 460
 461static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 462{
 463	const struct rtable *rt = container_of(dst, struct rtable, dst);
 464	struct net_device *dev = dst->dev;
 465	const __be32 *pkey = daddr;
 466
 467	if (rt->rt_gw_family == AF_INET) {
 468		pkey = (const __be32 *)&rt->rt_gw4;
 469	} else if (rt->rt_gw_family == AF_INET6) {
 470		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 471	} else if (!daddr ||
 472		 (rt->rt_flags &
 473		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 474		return;
 475	}
 476	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 477}
 478
 479#define IP_IDENTS_SZ 2048u
 480
 481static atomic_t *ip_idents __read_mostly;
 482static u32 *ip_tstamps __read_mostly;
 483
 484/* In order to protect privacy, we add a perturbation to identifiers
 485 * if one generator is seldom used. This makes hard for an attacker
 486 * to infer how many packets were sent between two points in time.
 487 */
 488u32 ip_idents_reserve(u32 hash, int segs)
 489{
 490	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 491	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 492	u32 old = READ_ONCE(*p_tstamp);
 493	u32 now = (u32)jiffies;
 494	u32 delta = 0;
 495
 496	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 497		delta = prandom_u32_max(now - old);
 498
 499	/* If UBSAN reports an error there, please make sure your compiler
 500	 * supports -fno-strict-overflow before reporting it that was a bug
 501	 * in UBSAN, and it has been fixed in GCC-8.
 502	 */
 503	return atomic_add_return(segs + delta, p_id) - segs;
 504}
 505EXPORT_SYMBOL(ip_idents_reserve);
 506
 507void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 508{
 509	u32 hash, id;
 510
 511	/* Note the following code is not safe, but this is okay. */
 512	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 513		get_random_bytes(&net->ipv4.ip_id_key,
 514				 sizeof(net->ipv4.ip_id_key));
 515
 516	hash = siphash_3u32((__force u32)iph->daddr,
 517			    (__force u32)iph->saddr,
 518			    iph->protocol,
 519			    &net->ipv4.ip_id_key);
 520	id = ip_idents_reserve(hash, segs);
 521	iph->id = htons(id);
 522}
 523EXPORT_SYMBOL(__ip_select_ident);
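/*
 * Annotation (not part of the original file): a simplified sketch of the
 * caller side.  The real helpers live in include/net/ip.h
 * (ip_select_ident()/ip_select_ident_segs()) and also reuse a per-socket
 * counter for connected sockets; treat the version below as an
 * illustration under those assumptions, not the kernel's exact code.
 * Atomic (DF, non-fragmentable) packets do not need a unique ID, so only
 * fragmentable packets fall back to the hashed generator above.
 */
static inline void example_select_ident(struct net *net, struct sk_buff *skb,
					int segs)
{
	struct iphdr *iph = ip_hdr(skb);

	if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df)
		iph->id = 0;	/* ID unused when the packet cannot fragment */
	else
		__ip_select_ident(net, iph, segs);
}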
 524
 525static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 526			     const struct sock *sk,
 527			     const struct iphdr *iph,
 528			     int oif, u8 tos,
 529			     u8 prot, u32 mark, int flow_flags)
 530{
 531	if (sk) {
 532		const struct inet_sock *inet = inet_sk(sk);
 533
 534		oif = sk->sk_bound_dev_if;
 535		mark = sk->sk_mark;
 536		tos = RT_CONN_FLAGS(sk);
 537		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 538	}
 539	flowi4_init_output(fl4, oif, mark, tos,
 540			   RT_SCOPE_UNIVERSE, prot,
 541			   flow_flags,
 542			   iph->daddr, iph->saddr, 0, 0,
 543			   sock_net_uid(net, sk));
 544}
 545
 546static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 547			       const struct sock *sk)
 548{
 549	const struct net *net = dev_net(skb->dev);
 550	const struct iphdr *iph = ip_hdr(skb);
 551	int oif = skb->dev->ifindex;
 552	u8 tos = RT_TOS(iph->tos);
 553	u8 prot = iph->protocol;
 554	u32 mark = skb->mark;
 555
 556	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 557}
 558
 559static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 560{
 561	const struct inet_sock *inet = inet_sk(sk);
 562	const struct ip_options_rcu *inet_opt;
 563	__be32 daddr = inet->inet_daddr;
 564
 565	rcu_read_lock();
 566	inet_opt = rcu_dereference(inet->inet_opt);
 567	if (inet_opt && inet_opt->opt.srr)
 568		daddr = inet_opt->opt.faddr;
 569	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 570			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 571			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 572			   inet_sk_flowi_flags(sk),
 573			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 574	rcu_read_unlock();
 575}
 576
 577static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 578				 const struct sk_buff *skb)
 579{
 580	if (skb)
 581		build_skb_flow_key(fl4, skb, sk);
 582	else
 583		build_sk_flow_key(fl4, sk);
 584}
 585
 586static DEFINE_SPINLOCK(fnhe_lock);
 587
 588static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 589{
 590	struct rtable *rt;
 591
 592	rt = rcu_dereference(fnhe->fnhe_rth_input);
 593	if (rt) {
 594		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 595		dst_dev_put(&rt->dst);
 596		dst_release(&rt->dst);
 597	}
 598	rt = rcu_dereference(fnhe->fnhe_rth_output);
 599	if (rt) {
 600		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 601		dst_dev_put(&rt->dst);
 602		dst_release(&rt->dst);
 603	}
 604}
 605
 606static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 607{
 608	struct fib_nh_exception *fnhe, *oldest;
 609
 610	oldest = rcu_dereference(hash->chain);
 611	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 612	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 613		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 614			oldest = fnhe;
 615	}
 616	fnhe_flush_routes(oldest);
 617	return oldest;
 618}
 619
 620static inline u32 fnhe_hashfun(__be32 daddr)
 621{
 622	static u32 fnhe_hashrnd __read_mostly;
 623	u32 hval;
 624
 625	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 626	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 627	return hash_32(hval, FNHE_HASH_SHIFT);
 628}
 629
 630static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 631{
 632	rt->rt_pmtu = fnhe->fnhe_pmtu;
 633	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 634	rt->dst.expires = fnhe->fnhe_expires;
 635
 636	if (fnhe->fnhe_gw) {
 637		rt->rt_flags |= RTCF_REDIRECTED;
 638		rt->rt_uses_gateway = 1;
 639		rt->rt_gw_family = AF_INET;
 640		rt->rt_gw4 = fnhe->fnhe_gw;
 641	}
 642}
 643
 644static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 645				  __be32 gw, u32 pmtu, bool lock,
 646				  unsigned long expires)
 647{
 648	struct fnhe_hash_bucket *hash;
 649	struct fib_nh_exception *fnhe;
 650	struct rtable *rt;
 651	u32 genid, hval;
 652	unsigned int i;
 653	int depth;
 654
 655	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 656	hval = fnhe_hashfun(daddr);
 657
 658	spin_lock_bh(&fnhe_lock);
 659
 660	hash = rcu_dereference(nhc->nhc_exceptions);
 661	if (!hash) {
 662		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 663		if (!hash)
 664			goto out_unlock;
 665		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 666	}
 667
 668	hash += hval;
 669
 670	depth = 0;
 671	for (fnhe = rcu_dereference(hash->chain); fnhe;
 672	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 673		if (fnhe->fnhe_daddr == daddr)
 674			break;
 675		depth++;
 676	}
 677
 678	if (fnhe) {
 679		if (fnhe->fnhe_genid != genid)
 680			fnhe->fnhe_genid = genid;
 681		if (gw)
 682			fnhe->fnhe_gw = gw;
 683		if (pmtu) {
 684			fnhe->fnhe_pmtu = pmtu;
 685			fnhe->fnhe_mtu_locked = lock;
 686		}
 687		fnhe->fnhe_expires = max(1UL, expires);
 688		/* Update all cached dsts too */
 689		rt = rcu_dereference(fnhe->fnhe_rth_input);
 690		if (rt)
 691			fill_route_from_fnhe(rt, fnhe);
 692		rt = rcu_dereference(fnhe->fnhe_rth_output);
 693		if (rt)
 694			fill_route_from_fnhe(rt, fnhe);
 695	} else {
 696		if (depth > FNHE_RECLAIM_DEPTH)
 697			fnhe = fnhe_oldest(hash);
 698		else {
 699			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 700			if (!fnhe)
 701				goto out_unlock;
 702
 703			fnhe->fnhe_next = hash->chain;
 704			rcu_assign_pointer(hash->chain, fnhe);
 705		}
 706		fnhe->fnhe_genid = genid;
 707		fnhe->fnhe_daddr = daddr;
 708		fnhe->fnhe_gw = gw;
 709		fnhe->fnhe_pmtu = pmtu;
 710		fnhe->fnhe_mtu_locked = lock;
 711		fnhe->fnhe_expires = max(1UL, expires);
 712
 713		/* Exception created; mark the cached routes for the nexthop
 714		 * stale, so anyone caching it rechecks if this exception
 715		 * applies to them.
 716		 */
 717		rt = rcu_dereference(nhc->nhc_rth_input);
 718		if (rt)
 719			rt->dst.obsolete = DST_OBSOLETE_KILL;
 720
 721		for_each_possible_cpu(i) {
 722			struct rtable __rcu **prt;
 723			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 724			rt = rcu_dereference(*prt);
 725			if (rt)
 726				rt->dst.obsolete = DST_OBSOLETE_KILL;
 727		}
 728	}
 729
 730	fnhe->fnhe_stamp = jiffies;
 731
 732out_unlock:
 733	spin_unlock_bh(&fnhe_lock);
 734}
 735
 736static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 737			     bool kill_route)
 738{
 739	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 740	__be32 old_gw = ip_hdr(skb)->saddr;
 741	struct net_device *dev = skb->dev;
 742	struct in_device *in_dev;
 743	struct fib_result res;
 744	struct neighbour *n;
 745	struct net *net;
 746
 747	switch (icmp_hdr(skb)->code & 7) {
 748	case ICMP_REDIR_NET:
 749	case ICMP_REDIR_NETTOS:
 750	case ICMP_REDIR_HOST:
 751	case ICMP_REDIR_HOSTTOS:
 752		break;
 753
 754	default:
 755		return;
 756	}
 757
 758	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 759		return;
 760
 761	in_dev = __in_dev_get_rcu(dev);
 762	if (!in_dev)
 763		return;
 764
 765	net = dev_net(dev);
 766	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 767	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 768	    ipv4_is_zeronet(new_gw))
 769		goto reject_redirect;
 770
 771	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 772		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 773			goto reject_redirect;
 774		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 775			goto reject_redirect;
 776	} else {
 777		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 778			goto reject_redirect;
 779	}
 780
 781	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 782	if (!n)
 783		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 784	if (!IS_ERR(n)) {
 785		if (!(n->nud_state & NUD_VALID)) {
 786			neigh_event_send(n, NULL);
 787		} else {
 788			if (fib_lookup(net, fl4, &res, 0) == 0) {
 789				struct fib_nh_common *nhc;
 790
 791				fib_select_path(net, &res, fl4, skb);
 792				nhc = FIB_RES_NHC(res);
 793				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 794						0, false,
 795						jiffies + ip_rt_gc_timeout);
 796			}
 797			if (kill_route)
 798				rt->dst.obsolete = DST_OBSOLETE_KILL;
 799			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 800		}
 801		neigh_release(n);
 802	}
 803	return;
 804
 805reject_redirect:
 806#ifdef CONFIG_IP_ROUTE_VERBOSE
 807	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 808		const struct iphdr *iph = (const struct iphdr *) skb->data;
 809		__be32 daddr = iph->daddr;
 810		__be32 saddr = iph->saddr;
 811
 812		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 813				     "  Advised path = %pI4 -> %pI4\n",
 814				     &old_gw, dev->name, &new_gw,
 815				     &saddr, &daddr);
 816	}
 817#endif
 818	;
 819}
 820
 821static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 822{
 823	struct rtable *rt;
 824	struct flowi4 fl4;
 825	const struct iphdr *iph = (const struct iphdr *) skb->data;
 826	struct net *net = dev_net(skb->dev);
 827	int oif = skb->dev->ifindex;
 828	u8 tos = RT_TOS(iph->tos);
 829	u8 prot = iph->protocol;
 830	u32 mark = skb->mark;
 831
 832	rt = (struct rtable *) dst;
 833
 834	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 835	__ip_do_redirect(rt, skb, &fl4, true);
 836}
 837
 838static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 839{
 840	struct rtable *rt = (struct rtable *)dst;
 841	struct dst_entry *ret = dst;
 842
 843	if (rt) {
 844		if (dst->obsolete > 0) {
 845			ip_rt_put(rt);
 846			ret = NULL;
 847		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 848			   rt->dst.expires) {
 849			ip_rt_put(rt);
 850			ret = NULL;
 851		}
 852	}
 853	return ret;
 854}
 855
 856/*
 857 * Algorithm:
 858 *	1. The first ip_rt_redirect_number redirects are sent
 859 *	   with exponential backoff, then we stop sending them at all,
 860 *	   assuming that the host ignores our redirects.
 861 *	2. If we did not see packets requiring redirects
 862 *	   during ip_rt_redirect_silence, we assume that the host
 863 *	   forgot redirected route and start to send redirects again.
 864 *
 865 * This algorithm is much cheaper and more intelligent than dumb load limiting
 866 * in icmp.c.
 867 *
 868 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 869 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 870 */
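/*
 * Annotation (not part of the original file): worked numbers for the
 * defaults declared near the top of this file, assuming HZ = 1000.
 * ip_rt_redirect_load = HZ/50 = 20 jiffies, so the n-th redirect to a
 * peer is only sent once 20ms << n has elapsed since the previous one
 * (40ms, 80ms, ... ~5.1s before the 9th).  After ip_rt_redirect_number
 * (9) unanswered redirects we stop entirely, and only resume once the
 * peer has triggered no redirects for ip_rt_redirect_silence =
 * (HZ/50) << 10 ~= 20.5 seconds, which resets the counters below.
 */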
 871
 872void ip_rt_send_redirect(struct sk_buff *skb)
 873{
 874	struct rtable *rt = skb_rtable(skb);
 875	struct in_device *in_dev;
 876	struct inet_peer *peer;
 877	struct net *net;
 878	int log_martians;
 879	int vif;
 880
 881	rcu_read_lock();
 882	in_dev = __in_dev_get_rcu(rt->dst.dev);
 883	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 884		rcu_read_unlock();
 885		return;
 886	}
 887	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 888	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 889	rcu_read_unlock();
 890
 891	net = dev_net(rt->dst.dev);
 892	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 893	if (!peer) {
 894		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 895			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 896		return;
 897	}
 898
 899	/* No redirected packets during ip_rt_redirect_silence;
 900	 * reset the algorithm.
 901	 */
 902	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 903		peer->rate_tokens = 0;
 904		peer->n_redirects = 0;
 905	}
 906
 907	/* Too many ignored redirects; do not send anything
 908	 * set dst.rate_last to the last seen redirected packet.
 909	 */
 910	if (peer->n_redirects >= ip_rt_redirect_number) {
 911		peer->rate_last = jiffies;
 912		goto out_put_peer;
 913	}
 914
 915	/* Check for load limit; set rate_last to the latest sent
 916	 * redirect.
 917	 */
 918	if (peer->n_redirects == 0 ||
 919	    time_after(jiffies,
 920		       (peer->rate_last +
 921			(ip_rt_redirect_load << peer->n_redirects)))) {
 922		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 923
 924		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 925		peer->rate_last = jiffies;
 926		++peer->n_redirects;
 927#ifdef CONFIG_IP_ROUTE_VERBOSE
 928		if (log_martians &&
 929		    peer->n_redirects == ip_rt_redirect_number)
 930			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 931					     &ip_hdr(skb)->saddr, inet_iif(skb),
 932					     &ip_hdr(skb)->daddr, &gw);
 933#endif
 934	}
 935out_put_peer:
 936	inet_putpeer(peer);
 937}
 938
 939static int ip_error(struct sk_buff *skb)
 940{
 941	struct rtable *rt = skb_rtable(skb);
 942	struct net_device *dev = skb->dev;
 943	struct in_device *in_dev;
 944	struct inet_peer *peer;
 945	unsigned long now;
 946	struct net *net;
 947	bool send;
 948	int code;
 949
 950	if (netif_is_l3_master(skb->dev)) {
 951		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 952		if (!dev)
 953			goto out;
 954	}
 955
 956	in_dev = __in_dev_get_rcu(dev);
 957
 958	/* IP on this device is disabled. */
 959	if (!in_dev)
 960		goto out;
 961
 962	net = dev_net(rt->dst.dev);
 963	if (!IN_DEV_FORWARD(in_dev)) {
 964		switch (rt->dst.error) {
 965		case EHOSTUNREACH:
 966			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 967			break;
 968
 969		case ENETUNREACH:
 970			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 971			break;
 972		}
 973		goto out;
 974	}
 975
 976	switch (rt->dst.error) {
 977	case EINVAL:
 978	default:
 979		goto out;
 980	case EHOSTUNREACH:
 981		code = ICMP_HOST_UNREACH;
 982		break;
 983	case ENETUNREACH:
 984		code = ICMP_NET_UNREACH;
 985		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 986		break;
 987	case EACCES:
 988		code = ICMP_PKT_FILTERED;
 989		break;
 990	}
 991
 992	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 993			       l3mdev_master_ifindex(skb->dev), 1);
 994
 995	send = true;
 996	if (peer) {
 997		now = jiffies;
 998		peer->rate_tokens += now - peer->rate_last;
 999		if (peer->rate_tokens > ip_rt_error_burst)
1000			peer->rate_tokens = ip_rt_error_burst;
1001		peer->rate_last = now;
1002		if (peer->rate_tokens >= ip_rt_error_cost)
1003			peer->rate_tokens -= ip_rt_error_cost;
1004		else
1005			send = false;
1006		inet_putpeer(peer);
1007	}
1008	if (send)
1009		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1010
1011out:	kfree_skb(skb);
1012	return 0;
1013}
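/*
 * Annotation (not part of the original file): the peer block above is a
 * simple token bucket built from the defaults near the top of this file.
 * Tokens accrue at one per jiffy (the "now - rate_last" term), are capped
 * at ip_rt_error_burst = 5 * HZ, and each ICMP error sent costs
 * ip_rt_error_cost = HZ tokens -- i.e. at most one destination-unreachable
 * per second per peer in steady state, with bursts of up to five.
 */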
1014
1015static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1016{
1017	struct dst_entry *dst = &rt->dst;
1018	struct net *net = dev_net(dst->dev);
1019	u32 old_mtu = ipv4_mtu(dst);
1020	struct fib_result res;
1021	bool lock = false;
1022
1023	if (ip_mtu_locked(dst))
1024		return;
1025
1026	if (old_mtu < mtu)
1027		return;
1028
1029	if (mtu < ip_rt_min_pmtu) {
1030		lock = true;
1031		mtu = min(old_mtu, ip_rt_min_pmtu);
1032	}
1033
1034	if (rt->rt_pmtu == mtu && !lock &&
1035	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1036		return;
1037
1038	rcu_read_lock();
1039	if (fib_lookup(net, fl4, &res, 0) == 0) {
1040		struct fib_nh_common *nhc;
1041
1042		fib_select_path(net, &res, fl4, NULL);
1043		nhc = FIB_RES_NHC(res);
1044		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1045				      jiffies + ip_rt_mtu_expires);
1046	}
1047	rcu_read_unlock();
1048}
1049
1050static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1051			      struct sk_buff *skb, u32 mtu,
1052			      bool confirm_neigh)
1053{
1054	struct rtable *rt = (struct rtable *) dst;
1055	struct flowi4 fl4;
1056
1057	ip_rt_build_flow_key(&fl4, sk, skb);
1058
1059	/* Don't make lookup fail for bridged encapsulations */
1060	if (skb && netif_is_any_bridge_port(skb->dev))
1061		fl4.flowi4_oif = 0;
1062
1063	__ip_rt_update_pmtu(rt, &fl4, mtu);
1064}
1065
1066void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1067		      int oif, u8 protocol)
1068{
1069	const struct iphdr *iph = (const struct iphdr *) skb->data;
1070	struct flowi4 fl4;
1071	struct rtable *rt;
1072	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1073
1074	__build_flow_key(net, &fl4, NULL, iph, oif,
1075			 RT_TOS(iph->tos), protocol, mark, 0);
1076	rt = __ip_route_output_key(net, &fl4);
1077	if (!IS_ERR(rt)) {
1078		__ip_rt_update_pmtu(rt, &fl4, mtu);
1079		ip_rt_put(rt);
1080	}
1081}
1082EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1083
1084static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1085{
1086	const struct iphdr *iph = (const struct iphdr *) skb->data;
1087	struct flowi4 fl4;
1088	struct rtable *rt;
1089
1090	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1091
1092	if (!fl4.flowi4_mark)
1093		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1094
1095	rt = __ip_route_output_key(sock_net(sk), &fl4);
1096	if (!IS_ERR(rt)) {
1097		__ip_rt_update_pmtu(rt, &fl4, mtu);
1098		ip_rt_put(rt);
1099	}
1100}
1101
1102void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1103{
1104	const struct iphdr *iph = (const struct iphdr *) skb->data;
1105	struct flowi4 fl4;
1106	struct rtable *rt;
1107	struct dst_entry *odst = NULL;
1108	bool new = false;
1109	struct net *net = sock_net(sk);
1110
1111	bh_lock_sock(sk);
1112
1113	if (!ip_sk_accept_pmtu(sk))
1114		goto out;
1115
1116	odst = sk_dst_get(sk);
1117
1118	if (sock_owned_by_user(sk) || !odst) {
1119		__ipv4_sk_update_pmtu(skb, sk, mtu);
1120		goto out;
1121	}
1122
1123	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1124
1125	rt = (struct rtable *)odst;
1126	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1127		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1128		if (IS_ERR(rt))
1129			goto out;
1130
1131		new = true;
1132	}
1133
1134	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1135
1136	if (!dst_check(&rt->dst, 0)) {
1137		if (new)
1138			dst_release(&rt->dst);
1139
1140		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1141		if (IS_ERR(rt))
1142			goto out;
1143
1144		new = true;
1145	}
1146
1147	if (new)
1148		sk_dst_set(sk, &rt->dst);
1149
1150out:
1151	bh_unlock_sock(sk);
1152	dst_release(odst);
1153}
1154EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1155
1156void ipv4_redirect(struct sk_buff *skb, struct net *net,
1157		   int oif, u8 protocol)
1158{
1159	const struct iphdr *iph = (const struct iphdr *) skb->data;
1160	struct flowi4 fl4;
1161	struct rtable *rt;
1162
1163	__build_flow_key(net, &fl4, NULL, iph, oif,
1164			 RT_TOS(iph->tos), protocol, 0, 0);
1165	rt = __ip_route_output_key(net, &fl4);
1166	if (!IS_ERR(rt)) {
1167		__ip_do_redirect(rt, skb, &fl4, false);
1168		ip_rt_put(rt);
1169	}
1170}
1171EXPORT_SYMBOL_GPL(ipv4_redirect);
1172
1173void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1174{
1175	const struct iphdr *iph = (const struct iphdr *) skb->data;
1176	struct flowi4 fl4;
1177	struct rtable *rt;
1178	struct net *net = sock_net(sk);
1179
1180	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1181	rt = __ip_route_output_key(net, &fl4);
1182	if (!IS_ERR(rt)) {
1183		__ip_do_redirect(rt, skb, &fl4, false);
1184		ip_rt_put(rt);
1185	}
1186}
1187EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1188
1189static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1190{
1191	struct rtable *rt = (struct rtable *) dst;
1192
1193	/* All IPV4 dsts are created with ->obsolete set to the value
1194	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1195	 * into this function always.
1196	 *
1197	 * When a PMTU/redirect information update invalidates a route,
1198	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1199	 * DST_OBSOLETE_DEAD.
1200	 */
1201	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1202		return NULL;
1203	return dst;
1204}
1205
1206static void ipv4_send_dest_unreach(struct sk_buff *skb)
1207{
1208	struct ip_options opt;
1209	int res;
1210
1211	/* Recompile ip options since IPCB may not be valid anymore.
1212	 * Also check we have a reasonable ipv4 header.
1213	 */
1214	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1215	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1216		return;
1217
1218	memset(&opt, 0, sizeof(opt));
1219	if (ip_hdr(skb)->ihl > 5) {
1220		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1221			return;
1222		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1223
1224		rcu_read_lock();
1225		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1226		rcu_read_unlock();
1227
1228		if (res)
1229			return;
1230	}
1231	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1232}
1233
1234static void ipv4_link_failure(struct sk_buff *skb)
1235{
1236	struct rtable *rt;
1237
1238	ipv4_send_dest_unreach(skb);
1239
1240	rt = skb_rtable(skb);
1241	if (rt)
1242		dst_set_expires(&rt->dst, 0);
1243}
1244
1245static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1246{
1247	pr_debug("%s: %pI4 -> %pI4, %s\n",
1248		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1249		 skb->dev ? skb->dev->name : "?");
1250	kfree_skb(skb);
1251	WARN_ON(1);
1252	return 0;
1253}
1254
1255/*
1256   We do not cache source address of outgoing interface,
1257   because it is used only by IP RR, TS and SRR options,
1258   so that it out of fast path.
1259
1260   BTW remember: "addr" is allowed to be not aligned
1261   in IP options!
1262 */
1263
1264void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1265{
1266	__be32 src;
1267
1268	if (rt_is_output_route(rt))
1269		src = ip_hdr(skb)->saddr;
1270	else {
1271		struct fib_result res;
1272		struct iphdr *iph = ip_hdr(skb);
1273		struct flowi4 fl4 = {
1274			.daddr = iph->daddr,
1275			.saddr = iph->saddr,
1276			.flowi4_tos = RT_TOS(iph->tos),
1277			.flowi4_oif = rt->dst.dev->ifindex,
1278			.flowi4_iif = skb->dev->ifindex,
1279			.flowi4_mark = skb->mark,
1280		};
1281
1282		rcu_read_lock();
1283		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1284			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1285		else
1286			src = inet_select_addr(rt->dst.dev,
1287					       rt_nexthop(rt, iph->daddr),
1288					       RT_SCOPE_UNIVERSE);
1289		rcu_read_unlock();
1290	}
1291	memcpy(addr, &src, 4);
1292}
1293
1294#ifdef CONFIG_IP_ROUTE_CLASSID
1295static void set_class_tag(struct rtable *rt, u32 tag)
1296{
1297	if (!(rt->dst.tclassid & 0xFFFF))
1298		rt->dst.tclassid |= tag & 0xFFFF;
1299	if (!(rt->dst.tclassid & 0xFFFF0000))
1300		rt->dst.tclassid |= tag & 0xFFFF0000;
1301}
1302#endif
1303
1304static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1305{
1306	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1307	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1308				    ip_rt_min_advmss);
1309
1310	return min(advmss, IPV4_MAX_PMTU - header_size);
1311}
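/*
 * Annotation (not part of the original file): a worked example of the
 * calculation above.  header_size = 20 (IP) + 20 (TCP) = 40 bytes, so a
 * route over a standard 1500-byte Ethernet MTU advertises an MSS of
 * 1460, never less than ip_rt_min_advmss (256) and never more than
 * IPV4_MAX_PMTU - 40 = 65495.
 */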
1312
1313static unsigned int ipv4_mtu(const struct dst_entry *dst)
1314{
1315	const struct rtable *rt = (const struct rtable *) dst;
1316	unsigned int mtu = rt->rt_pmtu;
1317
1318	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1319		mtu = dst_metric_raw(dst, RTAX_MTU);
1320
1321	if (mtu)
1322		return mtu;
1323
1324	mtu = READ_ONCE(dst->dev->mtu);
1325
1326	if (unlikely(ip_mtu_locked(dst))) {
1327		if (rt->rt_uses_gateway && mtu > 576)
1328			mtu = 576;
1329	}
1330
1331	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1332
1333	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1334}
1335
1336static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1337{
1338	struct fnhe_hash_bucket *hash;
1339	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1340	u32 hval = fnhe_hashfun(daddr);
1341
1342	spin_lock_bh(&fnhe_lock);
1343
1344	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1345					 lockdep_is_held(&fnhe_lock));
1346	hash += hval;
1347
1348	fnhe_p = &hash->chain;
1349	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1350	while (fnhe) {
1351		if (fnhe->fnhe_daddr == daddr) {
1352			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1353				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1354			/* set fnhe_daddr to 0 to ensure it won't bind with
1355			 * new dsts in rt_bind_exception().
1356			 */
1357			fnhe->fnhe_daddr = 0;
1358			fnhe_flush_routes(fnhe);
1359			kfree_rcu(fnhe, rcu);
1360			break;
1361		}
1362		fnhe_p = &fnhe->fnhe_next;
1363		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1364						 lockdep_is_held(&fnhe_lock));
1365	}
1366
1367	spin_unlock_bh(&fnhe_lock);
1368}
1369
1370static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1371					       __be32 daddr)
1372{
1373	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1374	struct fib_nh_exception *fnhe;
1375	u32 hval;
1376
1377	if (!hash)
1378		return NULL;
1379
1380	hval = fnhe_hashfun(daddr);
1381
1382	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1383	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1384		if (fnhe->fnhe_daddr == daddr) {
1385			if (fnhe->fnhe_expires &&
1386			    time_after(jiffies, fnhe->fnhe_expires)) {
1387				ip_del_fnhe(nhc, daddr);
1388				break;
1389			}
1390			return fnhe;
1391		}
1392	}
1393	return NULL;
1394}
1395
1396/* MTU selection:
1397 * 1. mtu on route is locked - use it
1398 * 2. mtu from nexthop exception
1399 * 3. mtu from egress device
1400 */
1401
1402u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1403{
1404	struct fib_nh_common *nhc = res->nhc;
1405	struct net_device *dev = nhc->nhc_dev;
1406	struct fib_info *fi = res->fi;
1407	u32 mtu = 0;
1408
1409	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1410	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1411		mtu = fi->fib_mtu;
1412
1413	if (likely(!mtu)) {
1414		struct fib_nh_exception *fnhe;
1415
1416		fnhe = find_exception(nhc, daddr);
1417		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1418			mtu = fnhe->fnhe_pmtu;
1419	}
1420
1421	if (likely(!mtu))
1422		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1423
1424	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1425}
1426
1427static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1428			      __be32 daddr, const bool do_cache)
1429{
1430	bool ret = false;
1431
1432	spin_lock_bh(&fnhe_lock);
1433
1434	if (daddr == fnhe->fnhe_daddr) {
1435		struct rtable __rcu **porig;
1436		struct rtable *orig;
1437		int genid = fnhe_genid(dev_net(rt->dst.dev));
1438
1439		if (rt_is_input_route(rt))
1440			porig = &fnhe->fnhe_rth_input;
1441		else
1442			porig = &fnhe->fnhe_rth_output;
1443		orig = rcu_dereference(*porig);
1444
1445		if (fnhe->fnhe_genid != genid) {
1446			fnhe->fnhe_genid = genid;
1447			fnhe->fnhe_gw = 0;
1448			fnhe->fnhe_pmtu = 0;
1449			fnhe->fnhe_expires = 0;
1450			fnhe->fnhe_mtu_locked = false;
1451			fnhe_flush_routes(fnhe);
1452			orig = NULL;
1453		}
1454		fill_route_from_fnhe(rt, fnhe);
1455		if (!rt->rt_gw4) {
1456			rt->rt_gw4 = daddr;
1457			rt->rt_gw_family = AF_INET;
1458		}
1459
1460		if (do_cache) {
1461			dst_hold(&rt->dst);
1462			rcu_assign_pointer(*porig, rt);
1463			if (orig) {
1464				dst_dev_put(&orig->dst);
1465				dst_release(&orig->dst);
1466			}
1467			ret = true;
1468		}
1469
1470		fnhe->fnhe_stamp = jiffies;
1471	}
1472	spin_unlock_bh(&fnhe_lock);
1473
1474	return ret;
1475}
1476
1477static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1478{
1479	struct rtable *orig, *prev, **p;
1480	bool ret = true;
1481
1482	if (rt_is_input_route(rt)) {
1483		p = (struct rtable **)&nhc->nhc_rth_input;
1484	} else {
1485		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1486	}
1487	orig = *p;
1488
1489	/* hold dst before doing cmpxchg() to avoid race condition
1490	 * on this dst
1491	 */
1492	dst_hold(&rt->dst);
1493	prev = cmpxchg(p, orig, rt);
1494	if (prev == orig) {
1495		if (orig) {
1496			rt_add_uncached_list(orig);
1497			dst_release(&orig->dst);
1498		}
1499	} else {
1500		dst_release(&rt->dst);
1501		ret = false;
1502	}
1503
1504	return ret;
1505}
1506
1507struct uncached_list {
1508	spinlock_t		lock;
1509	struct list_head	head;
1510};
1511
1512static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1513
1514void rt_add_uncached_list(struct rtable *rt)
1515{
1516	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1517
1518	rt->rt_uncached_list = ul;
1519
1520	spin_lock_bh(&ul->lock);
1521	list_add_tail(&rt->rt_uncached, &ul->head);
1522	spin_unlock_bh(&ul->lock);
1523}
1524
1525void rt_del_uncached_list(struct rtable *rt)
1526{
1527	if (!list_empty(&rt->rt_uncached)) {
1528		struct uncached_list *ul = rt->rt_uncached_list;
1529
1530		spin_lock_bh(&ul->lock);
1531		list_del(&rt->rt_uncached);
1532		spin_unlock_bh(&ul->lock);
1533	}
1534}
1535
1536static void ipv4_dst_destroy(struct dst_entry *dst)
1537{
1538	struct rtable *rt = (struct rtable *)dst;
1539
1540	ip_dst_metrics_put(dst);
1541	rt_del_uncached_list(rt);
1542}
1543
1544void rt_flush_dev(struct net_device *dev)
1545{
1546	struct rtable *rt;
1547	int cpu;
1548
1549	for_each_possible_cpu(cpu) {
1550		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1551
1552		spin_lock_bh(&ul->lock);
1553		list_for_each_entry(rt, &ul->head, rt_uncached) {
1554			if (rt->dst.dev != dev)
1555				continue;
1556			rt->dst.dev = blackhole_netdev;
1557			dev_hold(rt->dst.dev);
1558			dev_put(dev);
1559		}
1560		spin_unlock_bh(&ul->lock);
1561	}
1562}
1563
1564static bool rt_cache_valid(const struct rtable *rt)
1565{
1566	return	rt &&
1567		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1568		!rt_is_expired(rt);
1569}
1570
1571static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1572			   const struct fib_result *res,
1573			   struct fib_nh_exception *fnhe,
1574			   struct fib_info *fi, u16 type, u32 itag,
1575			   const bool do_cache)
1576{
1577	bool cached = false;
1578
1579	if (fi) {
1580		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1581
1582		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1583			rt->rt_uses_gateway = 1;
1584			rt->rt_gw_family = nhc->nhc_gw_family;
1585			/* only INET and INET6 are supported */
1586			if (likely(nhc->nhc_gw_family == AF_INET))
1587				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1588			else
1589				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1590		}
1591
1592		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1593
1594#ifdef CONFIG_IP_ROUTE_CLASSID
1595		if (nhc->nhc_family == AF_INET) {
1596			struct fib_nh *nh;
1597
1598			nh = container_of(nhc, struct fib_nh, nh_common);
1599			rt->dst.tclassid = nh->nh_tclassid;
1600		}
1601#endif
1602		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1603		if (unlikely(fnhe))
1604			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1605		else if (do_cache)
1606			cached = rt_cache_route(nhc, rt);
1607		if (unlikely(!cached)) {
1608			/* Routes we intend to cache in nexthop exception or
1609			 * FIB nexthop have the DST_NOCACHE bit clear.
1610			 * However, if we are unsuccessful at storing this
1611			 * route into the cache we really need to set it.
1612			 */
1613			if (!rt->rt_gw4) {
1614				rt->rt_gw_family = AF_INET;
1615				rt->rt_gw4 = daddr;
1616			}
1617			rt_add_uncached_list(rt);
1618		}
1619	} else
1620		rt_add_uncached_list(rt);
1621
1622#ifdef CONFIG_IP_ROUTE_CLASSID
1623#ifdef CONFIG_IP_MULTIPLE_TABLES
1624	set_class_tag(rt, res->tclassid);
1625#endif
1626	set_class_tag(rt, itag);
1627#endif
1628}
1629
1630struct rtable *rt_dst_alloc(struct net_device *dev,
1631			    unsigned int flags, u16 type,
1632			    bool nopolicy, bool noxfrm)
1633{
1634	struct rtable *rt;
1635
1636	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1637		       (nopolicy ? DST_NOPOLICY : 0) |
1638		       (noxfrm ? DST_NOXFRM : 0));
1639
1640	if (rt) {
1641		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1642		rt->rt_flags = flags;
1643		rt->rt_type = type;
1644		rt->rt_is_input = 0;
1645		rt->rt_iif = 0;
1646		rt->rt_pmtu = 0;
1647		rt->rt_mtu_locked = 0;
1648		rt->rt_uses_gateway = 0;
1649		rt->rt_gw_family = 0;
1650		rt->rt_gw4 = 0;
1651		INIT_LIST_HEAD(&rt->rt_uncached);
1652
1653		rt->dst.output = ip_output;
1654		if (flags & RTCF_LOCAL)
1655			rt->dst.input = ip_local_deliver;
1656	}
1657
1658	return rt;
1659}
1660EXPORT_SYMBOL(rt_dst_alloc);
1661
1662struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1663{
1664	struct rtable *new_rt;
1665
1666	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1667			   rt->dst.flags);
1668
1669	if (new_rt) {
1670		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1671		new_rt->rt_flags = rt->rt_flags;
1672		new_rt->rt_type = rt->rt_type;
1673		new_rt->rt_is_input = rt->rt_is_input;
1674		new_rt->rt_iif = rt->rt_iif;
1675		new_rt->rt_pmtu = rt->rt_pmtu;
1676		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1677		new_rt->rt_gw_family = rt->rt_gw_family;
1678		if (rt->rt_gw_family == AF_INET)
1679			new_rt->rt_gw4 = rt->rt_gw4;
1680		else if (rt->rt_gw_family == AF_INET6)
1681			new_rt->rt_gw6 = rt->rt_gw6;
1682		INIT_LIST_HEAD(&new_rt->rt_uncached);
1683
1684		new_rt->dst.input = rt->dst.input;
1685		new_rt->dst.output = rt->dst.output;
1686		new_rt->dst.error = rt->dst.error;
1687		new_rt->dst.lastuse = jiffies;
1688		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1689	}
1690	return new_rt;
1691}
1692EXPORT_SYMBOL(rt_dst_clone);
1693
1694/* called in rcu_read_lock() section */
1695int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1696			  u8 tos, struct net_device *dev,
1697			  struct in_device *in_dev, u32 *itag)
1698{
1699	int err;
1700
1701	/* Primary sanity checks. */
1702	if (!in_dev)
1703		return -EINVAL;
1704
1705	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1706	    skb->protocol != htons(ETH_P_IP))
1707		return -EINVAL;
1708
1709	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1710		return -EINVAL;
1711
1712	if (ipv4_is_zeronet(saddr)) {
1713		if (!ipv4_is_local_multicast(daddr) &&
1714		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1715			return -EINVAL;
1716	} else {
1717		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1718					  in_dev, itag);
1719		if (err < 0)
1720			return err;
1721	}
1722	return 0;
1723}
1724
1725/* called in rcu_read_lock() section */
1726static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1727			     u8 tos, struct net_device *dev, int our)
1728{
1729	struct in_device *in_dev = __in_dev_get_rcu(dev);
1730	unsigned int flags = RTCF_MULTICAST;
1731	struct rtable *rth;
1732	u32 itag = 0;
1733	int err;
1734
1735	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1736	if (err)
1737		return err;
1738
1739	if (our)
1740		flags |= RTCF_LOCAL;
1741
1742	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1743			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1744	if (!rth)
1745		return -ENOBUFS;
1746
1747#ifdef CONFIG_IP_ROUTE_CLASSID
1748	rth->dst.tclassid = itag;
1749#endif
1750	rth->dst.output = ip_rt_bug;
1751	rth->rt_is_input= 1;
1752
1753#ifdef CONFIG_IP_MROUTE
1754	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1755		rth->dst.input = ip_mr_input;
1756#endif
1757	RT_CACHE_STAT_INC(in_slow_mc);
1758
1759	skb_dst_set(skb, &rth->dst);
1760	return 0;
1761}
1762
1763
1764static void ip_handle_martian_source(struct net_device *dev,
1765				     struct in_device *in_dev,
1766				     struct sk_buff *skb,
1767				     __be32 daddr,
1768				     __be32 saddr)
1769{
1770	RT_CACHE_STAT_INC(in_martian_src);
1771#ifdef CONFIG_IP_ROUTE_VERBOSE
1772	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1773		/*
1774		 *	RFC1812 recommendation, if source is martian,
1775		 *	the only hint is MAC header.
1776		 */
1777		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1778			&daddr, &saddr, dev->name);
1779		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1780			print_hex_dump(KERN_WARNING, "ll header: ",
1781				       DUMP_PREFIX_OFFSET, 16, 1,
1782				       skb_mac_header(skb),
1783				       dev->hard_header_len, false);
1784		}
1785	}
1786#endif
1787}
1788
1789/* called in rcu_read_lock() section */
1790static int __mkroute_input(struct sk_buff *skb,
1791			   const struct fib_result *res,
1792			   struct in_device *in_dev,
1793			   __be32 daddr, __be32 saddr, u32 tos)
1794{
1795	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1796	struct net_device *dev = nhc->nhc_dev;
1797	struct fib_nh_exception *fnhe;
1798	struct rtable *rth;
1799	int err;
1800	struct in_device *out_dev;
1801	bool do_cache;
1802	u32 itag = 0;
1803
1804	/* get a working reference to the output device */
1805	out_dev = __in_dev_get_rcu(dev);
1806	if (!out_dev) {
1807		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1808		return -EINVAL;
1809	}
1810
1811	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1812				  in_dev->dev, in_dev, &itag);
1813	if (err < 0) {
1814		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1815					 saddr);
1816
1817		goto cleanup;
1818	}
1819
1820	do_cache = res->fi && !itag;
1821	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1822	    skb->protocol == htons(ETH_P_IP)) {
1823		__be32 gw;
1824
1825		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1826		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1827		    inet_addr_onlink(out_dev, saddr, gw))
1828			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1829	}
1830
1831	if (skb->protocol != htons(ETH_P_IP)) {
1832		/* Not IP (i.e. ARP). Do not create route, if it is
1833		 * invalid for proxy arp. DNAT routes are always valid.
1834		 *
1835		 * Proxy arp feature have been extended to allow, ARP
1836		 * replies back to the same interface, to support
1837		 * Private VLAN switch technologies. See arp.c.
1838		 */
1839		if (out_dev == in_dev &&
1840		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1841			err = -EINVAL;
1842			goto cleanup;
1843		}
1844	}
1845
1846	fnhe = find_exception(nhc, daddr);
1847	if (do_cache) {
1848		if (fnhe)
1849			rth = rcu_dereference(fnhe->fnhe_rth_input);
1850		else
1851			rth = rcu_dereference(nhc->nhc_rth_input);
1852		if (rt_cache_valid(rth)) {
1853			skb_dst_set_noref(skb, &rth->dst);
1854			goto out;
1855		}
1856	}
1857
1858	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1859			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1860			   IN_DEV_CONF_GET(out_dev, NOXFRM));
1861	if (!rth) {
1862		err = -ENOBUFS;
1863		goto cleanup;
1864	}
1865
1866	rth->rt_is_input = 1;
1867	RT_CACHE_STAT_INC(in_slow_tot);
1868
1869	rth->dst.input = ip_forward;
1870
1871	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1872		       do_cache);
1873	lwtunnel_set_redirect(&rth->dst);
1874	skb_dst_set(skb, &rth->dst);
1875out:
1876	err = 0;
1877 cleanup:
1878	return err;
1879}
1880
1881#ifdef CONFIG_IP_ROUTE_MULTIPATH
1882/* To make ICMP packets follow the right flow, the multipath hash is
1883 * calculated from the inner IP addresses.
1884 */
1885static void ip_multipath_l3_keys(const struct sk_buff *skb,
1886				 struct flow_keys *hash_keys)
1887{
1888	const struct iphdr *outer_iph = ip_hdr(skb);
1889	const struct iphdr *key_iph = outer_iph;
1890	const struct iphdr *inner_iph;
1891	const struct icmphdr *icmph;
1892	struct iphdr _inner_iph;
1893	struct icmphdr _icmph;
1894
1895	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1896		goto out;
1897
1898	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1899		goto out;
1900
1901	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1902				   &_icmph);
1903	if (!icmph)
1904		goto out;
1905
1906	if (!icmp_is_err(icmph->type))
1907		goto out;
1908
1909	inner_iph = skb_header_pointer(skb,
1910				       outer_iph->ihl * 4 + sizeof(_icmph),
1911				       sizeof(_inner_iph), &_inner_iph);
1912	if (!inner_iph)
1913		goto out;
1914
1915	key_iph = inner_iph;
1916out:
1917	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1918	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1919}
1920
1921/* if skb is set it will be used and fl4 can be NULL */
1922int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1923		       const struct sk_buff *skb, struct flow_keys *flkeys)
1924{
1925	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1926	struct flow_keys hash_keys;
1927	u32 mhash;
1928
1929	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1930	case 0:
1931		memset(&hash_keys, 0, sizeof(hash_keys));
1932		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1933		if (skb) {
1934			ip_multipath_l3_keys(skb, &hash_keys);
1935		} else {
1936			hash_keys.addrs.v4addrs.src = fl4->saddr;
1937			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1938		}
1939		break;
1940	case 1:
1941		/* skb is currently provided only when forwarding */
1942		if (skb) {
1943			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1944			struct flow_keys keys;
1945
1946			/* short-circuit if we already have L4 hash present */
1947			if (skb->l4_hash)
1948				return skb_get_hash_raw(skb) >> 1;
1949
1950			memset(&hash_keys, 0, sizeof(hash_keys));
1951
1952			if (!flkeys) {
1953				skb_flow_dissect_flow_keys(skb, &keys, flag);
1954				flkeys = &keys;
1955			}
1956
1957			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1959			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1960			hash_keys.ports.src = flkeys->ports.src;
1961			hash_keys.ports.dst = flkeys->ports.dst;
1962			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1963		} else {
1964			memset(&hash_keys, 0, sizeof(hash_keys));
1965			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1966			hash_keys.addrs.v4addrs.src = fl4->saddr;
1967			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1968			hash_keys.ports.src = fl4->fl4_sport;
1969			hash_keys.ports.dst = fl4->fl4_dport;
1970			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1971		}
1972		break;
1973	case 2:
1974		memset(&hash_keys, 0, sizeof(hash_keys));
1975		/* skb is currently provided only when forwarding */
1976		if (skb) {
1977			struct flow_keys keys;
1978
1979			skb_flow_dissect_flow_keys(skb, &keys, 0);
1980			/* Inner can be v4 or v6 */
1981			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1982				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1983				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1984				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1985			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1986				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1987				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1988				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1989				hash_keys.tags.flow_label = keys.tags.flow_label;
1990				hash_keys.basic.ip_proto = keys.basic.ip_proto;
1991			} else {
1992				/* Same as case 0 */
1993				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1994				ip_multipath_l3_keys(skb, &hash_keys);
1995			}
1996		} else {
1997			/* Same as case 0 */
1998			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1999			hash_keys.addrs.v4addrs.src = fl4->saddr;
2000			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2001		}
2002		break;
2003	}
2004	mhash = flow_hash_from_keys(&hash_keys);
2005
2006	if (multipath_hash)
2007		mhash = jhash_2words(mhash, multipath_hash, 0);
2008
2009	return mhash >> 1;
2010}
2011#endif /* CONFIG_IP_ROUTE_MULTIPATH */
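/* For reference, a minimal sketch of how the hash policy used above is
 * selected from user space (assuming the usual sysctl name for this knob,
 * net.ipv4.fib_multipath_hash_policy; the values map onto the switch cases
 * in fib_multipath_hash()):
 *
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=0   # L3 addresses only
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=1   # L4 five-tuple
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=2   # innermost L3 found by the flow dissector
 */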
2012
2013static int ip_mkroute_input(struct sk_buff *skb,
2014			    struct fib_result *res,
 
2015			    struct in_device *in_dev,
2016			    __be32 daddr, __be32 saddr, u32 tos,
2017			    struct flow_keys *hkeys)
2018{
2019#ifdef CONFIG_IP_ROUTE_MULTIPATH
2020	if (res->fi && fib_info_num_path(res->fi) > 1) {
2021		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2022
2023		fib_select_multipath(res, h);
2024	}
 
2025#endif
2026
2027	/* create a routing cache entry */
2028	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2029}
2030
2031/* Implements all the saddr-related checks as ip_route_input_slow(),
2032 * assuming daddr is valid and the destination is not a local broadcast one.
2033 * Uses the provided hint instead of performing a route lookup.
2034 */
2035int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2036		      u8 tos, struct net_device *dev,
2037		      const struct sk_buff *hint)
2038{
2039	struct in_device *in_dev = __in_dev_get_rcu(dev);
2040	struct rtable *rt = skb_rtable(hint);
2041	struct net *net = dev_net(dev);
2042	int err = -EINVAL;
2043	u32 tag = 0;
2044
2045	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2046		goto martian_source;
2047
2048	if (ipv4_is_zeronet(saddr))
2049		goto martian_source;
2050
2051	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2052		goto martian_source;
2053
2054	if (rt->rt_type != RTN_LOCAL)
2055		goto skip_validate_source;
2056
2057	tos &= IPTOS_RT_MASK;
2058	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2059	if (err < 0)
2060		goto martian_source;
2061
2062skip_validate_source:
2063	skb_dst_copy(skb, hint);
2064	return 0;
2065
2066martian_source:
2067	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2068	return err;
2069}
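/* Usage note (hedged): the "hint" is an earlier skb from the same receive
 * batch whose route was already resolved for the same destination and device,
 * so this helper can copy its dst and repeat only the source-address checks
 * instead of doing a full fib_lookup().
 */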
2070
2071/*
2072 *	NOTE. We drop all packets that have local source
2073 *	addresses, because every properly looped-back packet
2074 *	must already have the correct destination attached by the output routine.
2075 *	Changes in the enforced policies must also be applied to
2076 *	ip_route_use_hint().
2077 *
2078 *	Such an approach solves two big problems:
2079 *	1. Non-simplex devices are handled properly.
2080 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2081 *	called with rcu_read_lock()
2082 */
2083
2084static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2085			       u8 tos, struct net_device *dev,
2086			       struct fib_result *res)
2087{
 
2088	struct in_device *in_dev = __in_dev_get_rcu(dev);
2089	struct flow_keys *flkeys = NULL, _flkeys;
2090	struct net    *net = dev_net(dev);
2091	struct ip_tunnel_info *tun_info;
2092	int		err = -EINVAL;
2093	unsigned int	flags = 0;
2094	u32		itag = 0;
2095	struct rtable	*rth;
2096	struct flowi4	fl4;
2097	bool do_cache = true;
 
 
2098
2099	/* IP on this device is disabled. */
2100
2101	if (!in_dev)
2102		goto out;
2103
2104	/* Check for the weirdest martians, which cannot be detected
2105	   by fib_lookup.
2106	 */
2107
2108	tun_info = skb_tunnel_info(skb);
2109	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2110		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2111	else
2112		fl4.flowi4_tun_key.tun_id = 0;
2113	skb_dst_drop(skb);
2114
2115	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2116		goto martian_source;
2117
2118	res->fi = NULL;
2119	res->table = NULL;
2120	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2121		goto brd_input;
2122
2123	/* Accept zero addresses only for the limited broadcast;
2124	 * it is not clear whether this should be fixed. Waiting for complaints :-)
2125	 */
2126	if (ipv4_is_zeronet(saddr))
2127		goto martian_source;
2128
2129	if (ipv4_is_zeronet(daddr))
2130		goto martian_destination;
2131
2132	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2133	 * more than once when daddr and/or saddr are loopback addresses.
2134	 */
2135	if (ipv4_is_loopback(daddr)) {
2136		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2137			goto martian_destination;
2138	} else if (ipv4_is_loopback(saddr)) {
2139		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2140			goto martian_source;
2141	}
2142
2143	/*
2144	 *	Now we are ready to route packet.
2145	 */
2146	fl4.flowi4_oif = 0;
2147	fl4.flowi4_iif = dev->ifindex;
2148	fl4.flowi4_mark = skb->mark;
2149	fl4.flowi4_tos = tos;
2150	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2151	fl4.flowi4_flags = 0;
2152	fl4.daddr = daddr;
2153	fl4.saddr = saddr;
2154	fl4.flowi4_uid = sock_net_uid(net, NULL);
2155	fl4.flowi4_multipath_hash = 0;
2156
2157	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2158		flkeys = &_flkeys;
2159	} else {
2160		fl4.flowi4_proto = 0;
2161		fl4.fl4_sport = 0;
2162		fl4.fl4_dport = 0;
2163	}
2164
2165	err = fib_lookup(net, &fl4, res, 0);
2166	if (err != 0) {
2167		if (!IN_DEV_FORWARD(in_dev))
2168			err = -EHOSTUNREACH;
2169		goto no_route;
2170	}
2171
2172	if (res->type == RTN_BROADCAST) {
2173		if (IN_DEV_BFORWARD(in_dev))
2174			goto make_route;
2175		/* do not cache if bc_forwarding is enabled */
2176		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2177			do_cache = false;
2178		goto brd_input;
2179	}
2180
2181	if (res->type == RTN_LOCAL) {
2182		err = fib_validate_source(skb, saddr, daddr, tos,
2183					  0, dev, in_dev, &itag);
 
2184		if (err < 0)
2185			goto martian_source;
 
 
 
2186		goto local_input;
2187	}
2188
2189	if (!IN_DEV_FORWARD(in_dev)) {
2190		err = -EHOSTUNREACH;
2191		goto no_route;
2192	}
2193	if (res->type != RTN_UNICAST)
2194		goto martian_destination;
2195
2196make_route:
2197	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2198out:	return err;
2199
2200brd_input:
2201	if (skb->protocol != htons(ETH_P_IP))
2202		goto e_inval;
2203
2204	if (!ipv4_is_zeronet(saddr)) {
2205		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2206					  in_dev, &itag);
 
 
2207		if (err < 0)
2208			goto martian_source;
 
 
2209	}
2210	flags |= RTCF_BROADCAST;
2211	res->type = RTN_BROADCAST;
2212	RT_CACHE_STAT_INC(in_brd);
2213
2214local_input:
2215	do_cache &= res->fi && !itag;
2216	if (do_cache) {
2217		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2218
2219		rth = rcu_dereference(nhc->nhc_rth_input);
2220		if (rt_cache_valid(rth)) {
2221			skb_dst_set_noref(skb, &rth->dst);
2222			err = 0;
2223			goto out;
2224		}
2225	}
2226
2227	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2228			   flags | RTCF_LOCAL, res->type,
2229			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2230	if (!rth)
2231		goto e_nobufs;
2232
 
2233	rth->dst.output = ip_rt_bug;
2234#ifdef CONFIG_IP_ROUTE_CLASSID
2235	rth->dst.tclassid = itag;
2236#endif
2237	rth->rt_is_input = 1;
2238
2239	RT_CACHE_STAT_INC(in_slow_tot);
2240	if (res->type == RTN_UNREACHABLE) {
2241		rth->dst.input = ip_error;
2242		rth->dst.error = -err;
2243		rth->rt_flags &= ~RTCF_LOCAL;
2244	}
2245
2246	if (do_cache) {
2247		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2248
2249		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2250		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2251			WARN_ON(rth->dst.input == lwtunnel_input);
2252			rth->dst.lwtstate->orig_input = rth->dst.input;
2253			rth->dst.input = lwtunnel_input;
2254		}
2255
2256		if (unlikely(!rt_cache_route(nhc, rth)))
2257			rt_add_uncached_list(rth);
2258	}
2259	skb_dst_set(skb, &rth->dst);
2260	err = 0;
 
 
2261	goto out;
2262
2263no_route:
2264	RT_CACHE_STAT_INC(in_no_route);
2265	res->type = RTN_UNREACHABLE;
2266	res->fi = NULL;
2267	res->table = NULL;
 
2268	goto local_input;
2269
2270	/*
2271	 *	Do not cache martian addresses: they should be logged (RFC1812)
2272	 */
2273martian_destination:
2274	RT_CACHE_STAT_INC(in_martian_dst);
2275#ifdef CONFIG_IP_ROUTE_VERBOSE
2276	if (IN_DEV_LOG_MARTIANS(in_dev))
2277		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2278				     &daddr, &saddr, dev->name);
2279#endif
2280
2281e_inval:
2282	err = -EINVAL;
2283	goto out;
2284
2285e_nobufs:
2286	err = -ENOBUFS;
2287	goto out;
2288
2289martian_source:
 
 
2290	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2291	goto out;
2292}
2293
2294int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2295			 u8 tos, struct net_device *dev)
2296{
2297	struct fib_result res;
2298	int err;
2299
2300	tos &= IPTOS_RT_MASK;
2301	rcu_read_lock();
2302	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2303	rcu_read_unlock();
2304
2305	return err;
2306}
2307EXPORT_SYMBOL(ip_route_input_noref);
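/* A minimal caller sketch (hedged; example_rcv() is illustrative and not a
 * kernel function): resolve the input route for an skb whose IP header has
 * already been validated, without taking an extra dst reference.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		const struct iphdr *iph = ip_hdr(skb);
 *
 *		return ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *					    iph->tos, dev);
 *	}
 */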
2308
2309/* called with rcu_read_lock held */
2310int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2311		       u8 tos, struct net_device *dev, struct fib_result *res)
2312{
2313	/* Multicast recognition logic is moved from the route cache to here.
2314	   The problem was that too many Ethernet cards have broken/missing
2315	   hardware multicast filters :-( As a result a host on a multicast
2316	   network acquires a lot of useless route cache entries, e.g. for
2317	   SDR messages from all over the world. Now we try to get rid of them.
2318	   Really, provided the software IP multicast filter is organized
2319	   reasonably (at least, hashed), it does not result in a slowdown
2320	   compared with route cache reject entries.
2321	   Note that multicast routers are not affected, because
2322	   a route cache entry is created eventually.
2323	 */
2324	if (ipv4_is_multicast(daddr)) {
2325		struct in_device *in_dev = __in_dev_get_rcu(dev);
2326		int our = 0;
2327		int err = -EINVAL;
2328
2329		if (!in_dev)
2330			return err;
2331		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2332				      ip_hdr(skb)->protocol);
2333
2334		/* check l3 master if no match yet */
2335		if (!our && netif_is_l3_slave(dev)) {
2336			struct in_device *l3_in_dev;
2337
2338			l3_in_dev = __in_dev_get_rcu(skb->dev);
2339			if (l3_in_dev)
2340				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2341						      ip_hdr(skb)->protocol);
2342		}
2343
2344		if (our
2345#ifdef CONFIG_IP_MROUTE
2346			||
2347		    (!ipv4_is_local_multicast(daddr) &&
2348		     IN_DEV_MFORWARD(in_dev))
2349#endif
2350		   ) {
2351			err = ip_route_input_mc(skb, daddr, saddr,
2352						tos, dev, our);
2353		}
2354		return err;
 
2355	}
2356
2357	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
 
2358}
 
2359
2360/* called with rcu_read_lock() */
2361static struct rtable *__mkroute_output(const struct fib_result *res,
2362				       const struct flowi4 *fl4, int orig_oif,
 
 
2363				       struct net_device *dev_out,
2364				       unsigned int flags)
2365{
2366	struct fib_info *fi = res->fi;
2367	struct fib_nh_exception *fnhe;
2368	struct in_device *in_dev;
2369	u16 type = res->type;
2370	struct rtable *rth;
2371	bool do_cache;
2372
2373	in_dev = __in_dev_get_rcu(dev_out);
2374	if (!in_dev)
2375		return ERR_PTR(-EINVAL);
2376
2377	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2378		if (ipv4_is_loopback(fl4->saddr) &&
2379		    !(dev_out->flags & IFF_LOOPBACK) &&
2380		    !netif_is_l3_master(dev_out))
2381			return ERR_PTR(-EINVAL);
2382
2383	if (ipv4_is_lbcast(fl4->daddr))
2384		type = RTN_BROADCAST;
2385	else if (ipv4_is_multicast(fl4->daddr))
2386		type = RTN_MULTICAST;
2387	else if (ipv4_is_zeronet(fl4->daddr))
2388		return ERR_PTR(-EINVAL);
2389
2390	if (dev_out->flags & IFF_LOOPBACK)
2391		flags |= RTCF_LOCAL;
2392
2393	do_cache = true;
 
 
 
2394	if (type == RTN_BROADCAST) {
2395		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2396		fi = NULL;
2397	} else if (type == RTN_MULTICAST) {
2398		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2399		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2400				     fl4->flowi4_proto))
2401			flags &= ~RTCF_LOCAL;
2402		else
2403			do_cache = false;
2404		/* If a multicast route does not exist, use the
2405		 * default one, but do not use a gateway in this case.
2406		 * Yes, it is a hack.
2407		 */
2408		if (fi && res->prefixlen < 4)
2409			fi = NULL;
2410	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2411		   (orig_oif != dev_out->ifindex)) {
2412		/* For local routes that require a particular output interface
2413		 * we do not want to cache the result.  Caching the result
2414		 * causes incorrect behaviour when there are multiple source
2415		 * addresses on the interface; the end result is that if the
2416		 * intended recipient is waiting on that interface for the
2417		 * packet, it won't be received, because it will be delivered on
2418		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2419		 * be set to the loopback interface as well.
2420		 */
2421		do_cache = false;
2422	}
2423
2424	fnhe = NULL;
2425	do_cache &= fi != NULL;
2426	if (fi) {
2427		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2428		struct rtable __rcu **prth;
2429
2430		fnhe = find_exception(nhc, fl4->daddr);
2431		if (!do_cache)
2432			goto add;
2433		if (fnhe) {
2434			prth = &fnhe->fnhe_rth_output;
2435		} else {
2436			if (unlikely(fl4->flowi4_flags &
2437				     FLOWI_FLAG_KNOWN_NH &&
2438				     !(nhc->nhc_gw_family &&
2439				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2440				do_cache = false;
2441				goto add;
2442			}
2443			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2444		}
2445		rth = rcu_dereference(*prth);
2446		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2447			return rth;
2448	}
2449
2450add:
2451	rth = rt_dst_alloc(dev_out, flags, type,
2452			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2453			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2454	if (!rth)
2455		return ERR_PTR(-ENOBUFS);
2456
2457	rth->rt_iif = orig_oif;
2458
2459	RT_CACHE_STAT_INC(out_slow_tot);
2460
2461	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
 
2462		if (flags & RTCF_LOCAL &&
2463		    !(dev_out->flags & IFF_LOOPBACK)) {
2464			rth->dst.output = ip_mc_output;
2465			RT_CACHE_STAT_INC(out_slow_mc);
2466		}
2467#ifdef CONFIG_IP_MROUTE
2468		if (type == RTN_MULTICAST) {
2469			if (IN_DEV_MFORWARD(in_dev) &&
2470			    !ipv4_is_local_multicast(fl4->daddr)) {
2471				rth->dst.input = ip_mr_input;
2472				rth->dst.output = ip_mc_output;
2473			}
2474		}
2475#endif
2476	}
2477
2478	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2479	lwtunnel_set_redirect(&rth->dst);
2480
2481	return rth;
2482}
2483
2484/*
2485 * Major route resolver routine.
2486 */
2487
2488struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2489					const struct sk_buff *skb)
2490{
 
2491	__u8 tos = RT_FL_TOS(fl4);
2492	struct fib_result res = {
2493		.type		= RTN_UNSPEC,
2494		.fi		= NULL,
2495		.table		= NULL,
2496		.tclassid	= 0,
2497	};
2498	struct rtable *rth;
2499
2500	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2501	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2502	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2503			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2504
2505	rcu_read_lock();
2506	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2507	rcu_read_unlock();
2508
2509	return rth;
2510}
2511EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
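/* A minimal output-lookup sketch (hedged; the flow values are illustrative).
 * ip_route_output_key() is the usual wrapper that ends up in this resolver;
 * the returned rtable holds a reference that must be dropped with ip_rt_put():
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_tos	= RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */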
2512
2513struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2514					    struct fib_result *res,
2515					    const struct sk_buff *skb)
2516{
2517	struct net_device *dev_out = NULL;
2518	int orig_oif = fl4->flowi4_oif;
2519	unsigned int flags = 0;
2520	struct rtable *rth;
2521	int err;
2522
2523	if (fl4->saddr) {
 
2524		if (ipv4_is_multicast(fl4->saddr) ||
2525		    ipv4_is_lbcast(fl4->saddr) ||
2526		    ipv4_is_zeronet(fl4->saddr)) {
2527			rth = ERR_PTR(-EINVAL);
2528			goto out;
2529		}
2530
2531		rth = ERR_PTR(-ENETUNREACH);
2532
2533		/* I removed the check for oif == dev_out->oif here.
2534		   It was wrong for two reasons:
2535		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2536		      is assigned to multiple interfaces.
2537		   2. Moreover, we are allowed to send packets with the saddr
2538		      of another iface. --ANK
2539		 */
2540
2541		if (fl4->flowi4_oif == 0 &&
2542		    (ipv4_is_multicast(fl4->daddr) ||
2543		     ipv4_is_lbcast(fl4->daddr))) {
2544			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2545			dev_out = __ip_dev_find(net, fl4->saddr, false);
2546			if (!dev_out)
2547				goto out;
2548
2549			/* Special hack: the user can direct multicasts
2550			   and limited broadcasts via the necessary interface
2551			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2552			   This hack is not just for fun, it allows
2553			   vic, vat and friends to work.
2554			   They bind a socket to loopback, set ttl to zero
2555			   and expect that it will work.
2556			   From the viewpoint of the routing cache they are broken,
2557			   because we are not allowed to build a multicast path
2558			   with a loopback source addr (the routing cache
2559			   cannot know that ttl is zero, so the packet
2560			   will not leave this host and the route is valid).
2561			   Luckily, this hack is a good workaround.
2562			 */
2563
2564			fl4->flowi4_oif = dev_out->ifindex;
2565			goto make_route;
2566		}
2567
2568		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2569			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2570			if (!__ip_dev_find(net, fl4->saddr, false))
2571				goto out;
2572		}
2573	}
2574
2575
2576	if (fl4->flowi4_oif) {
2577		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2578		rth = ERR_PTR(-ENODEV);
2579		if (!dev_out)
2580			goto out;
2581
2582		/* RACE: Check return value of inet_select_addr instead. */
2583		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2584			rth = ERR_PTR(-ENETUNREACH);
2585			goto out;
2586		}
2587		if (ipv4_is_local_multicast(fl4->daddr) ||
2588		    ipv4_is_lbcast(fl4->daddr) ||
2589		    fl4->flowi4_proto == IPPROTO_IGMP) {
2590			if (!fl4->saddr)
2591				fl4->saddr = inet_select_addr(dev_out, 0,
2592							      RT_SCOPE_LINK);
2593			goto make_route;
2594		}
2595		if (!fl4->saddr) {
2596			if (ipv4_is_multicast(fl4->daddr))
2597				fl4->saddr = inet_select_addr(dev_out, 0,
2598							      fl4->flowi4_scope);
2599			else if (!fl4->daddr)
2600				fl4->saddr = inet_select_addr(dev_out, 0,
2601							      RT_SCOPE_HOST);
2602		}
2603	}
2604
2605	if (!fl4->daddr) {
2606		fl4->daddr = fl4->saddr;
2607		if (!fl4->daddr)
2608			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2609		dev_out = net->loopback_dev;
2610		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2611		res->type = RTN_LOCAL;
2612		flags |= RTCF_LOCAL;
2613		goto make_route;
2614	}
2615
2616	err = fib_lookup(net, fl4, res, 0);
2617	if (err) {
2618		res->fi = NULL;
2619		res->table = NULL;
2620		if (fl4->flowi4_oif &&
2621		    (ipv4_is_multicast(fl4->daddr) ||
2622		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2623			/* Apparently, the routing tables are wrong. Assume
2624			   that the destination is on-link.
2625
2626			   WHY? DW.
2627			   Because we are allowed to send to an iface
2628			   even if it has NO routes and NO assigned
2629			   addresses. When oif is specified, the routing
2630			   tables are looked up with only one purpose:
2631			   to catch whether the destination is gatewayed,
2632			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2633			   we send the packet, ignoring both routing tables
2634			   and ifaddr state. --ANK
2635
2636
2637			   We could do this even if oif is unknown,
2638			   likely as IPv6 does, but we do not.
2639			 */
2640
2641			if (fl4->saddr == 0)
2642				fl4->saddr = inet_select_addr(dev_out, 0,
2643							      RT_SCOPE_LINK);
2644			res->type = RTN_UNICAST;
2645			goto make_route;
2646		}
2647		rth = ERR_PTR(err);
2648		goto out;
2649	}
2650
2651	if (res->type == RTN_LOCAL) {
2652		if (!fl4->saddr) {
2653			if (res->fi->fib_prefsrc)
2654				fl4->saddr = res->fi->fib_prefsrc;
2655			else
2656				fl4->saddr = fl4->daddr;
2657		}
2658
2659		/* L3 master device is the loopback for that domain */
2660		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2661			net->loopback_dev;
2662
2663		/* make sure orig_oif points to fib result device even
2664		 * though packet rx/tx happens over loopback or l3mdev
2665		 */
2666		orig_oif = FIB_RES_OIF(*res);
2667
2668		fl4->flowi4_oif = dev_out->ifindex;
 
2669		flags |= RTCF_LOCAL;
2670		goto make_route;
2671	}
2672
2673	fib_select_path(net, res, fl4, skb);
2674
2675	dev_out = FIB_RES_DEV(*res);
2676
2677make_route:
2678	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2679
2680out:
 
2681	return rth;
2682}
2683
2684static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2685{
2686	return NULL;
2687}
2688
2689static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2690{
2691	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2692
2693	return mtu ? : dst->dev->mtu;
2694}
2695
2696static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2697					  struct sk_buff *skb, u32 mtu,
2698					  bool confirm_neigh)
2699{
2700}
2701
2702static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2703				       struct sk_buff *skb)
2704{
2705}
2706
2707static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2708					  unsigned long old)
2709{
2710	return NULL;
2711}
2712
2713static struct dst_ops ipv4_dst_blackhole_ops = {
2714	.family			=	AF_INET,
 
 
2715	.check			=	ipv4_blackhole_dst_check,
2716	.mtu			=	ipv4_blackhole_mtu,
2717	.default_advmss		=	ipv4_default_advmss,
2718	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2719	.redirect		=	ipv4_rt_blackhole_redirect,
2720	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2721	.neigh_lookup		=	ipv4_neigh_lookup,
2722};
2723
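/* Clone dst_orig into a "blackhole" route: the copy silently discards all
 * packets (dst_discard/dst_discard_out), never revalidates from the cache
 * (ipv4_blackhole_dst_check() returns NULL) and carries over just enough of
 * the original's fields for existing users of the route to keep working.
 */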
2724struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2725{
 
2726	struct rtable *ort = (struct rtable *) dst_orig;
2727	struct rtable *rt;
2728
2729	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2730	if (rt) {
2731		struct dst_entry *new = &rt->dst;
2732
2733		new->__use = 1;
2734		new->input = dst_discard;
2735		new->output = dst_discard_out;
 
2736
2737		new->dev = net->loopback_dev;
2738		if (new->dev)
2739			dev_hold(new->dev);
2740
2741		rt->rt_is_input = ort->rt_is_input;
 
 
 
2742		rt->rt_iif = ort->rt_iif;
2743		rt->rt_pmtu = ort->rt_pmtu;
2744		rt->rt_mtu_locked = ort->rt_mtu_locked;
2745
2746		rt->rt_genid = rt_genid_ipv4(net);
2747		rt->rt_flags = ort->rt_flags;
2748		rt->rt_type = ort->rt_type;
2749		rt->rt_uses_gateway = ort->rt_uses_gateway;
2750		rt->rt_gw_family = ort->rt_gw_family;
2751		if (rt->rt_gw_family == AF_INET)
2752			rt->rt_gw4 = ort->rt_gw4;
2753		else if (rt->rt_gw_family == AF_INET6)
2754			rt->rt_gw6 = ort->rt_gw6;
 
 
 
 
2755
2756		INIT_LIST_HEAD(&rt->rt_uncached);
2757	}
2758
2759	dst_release(dst_orig);
2760
2761	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2762}
2763
2764struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2765				    const struct sock *sk)
2766{
2767	struct rtable *rt = __ip_route_output_key(net, flp4);
2768
2769	if (IS_ERR(rt))
2770		return rt;
2771
2772	if (flp4->flowi4_proto)
2773		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2774							flowi4_to_flowi(flp4),
2775							sk, 0);
2776
2777	return rt;
2778}
2779EXPORT_SYMBOL_GPL(ip_route_output_flow);
2780
2781struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2782				      struct net_device *dev,
2783				      struct net *net, __be32 *saddr,
2784				      const struct ip_tunnel_info *info,
2785				      u8 protocol, bool use_cache)
2786{
2787#ifdef CONFIG_DST_CACHE
2788	struct dst_cache *dst_cache;
2789#endif
2790	struct rtable *rt = NULL;
2791	struct flowi4 fl4;
2792	__u8 tos;
2793
2794#ifdef CONFIG_DST_CACHE
2795	dst_cache = (struct dst_cache *)&info->dst_cache;
2796	if (use_cache) {
2797		rt = dst_cache_get_ip4(dst_cache, saddr);
2798		if (rt)
2799			return rt;
2800	}
2801#endif
2802	memset(&fl4, 0, sizeof(fl4));
2803	fl4.flowi4_mark = skb->mark;
2804	fl4.flowi4_proto = protocol;
2805	fl4.daddr = info->key.u.ipv4.dst;
2806	fl4.saddr = info->key.u.ipv4.src;
2807	tos = info->key.tos;
2808	fl4.flowi4_tos = RT_TOS(tos);
2809
2810	rt = ip_route_output_key(net, &fl4);
2811	if (IS_ERR(rt)) {
2812		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2813		return ERR_PTR(-ENETUNREACH);
2814	}
2815	if (rt->dst.dev == dev) { /* is this necessary? */
2816		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2817		ip_rt_put(rt);
2818		return ERR_PTR(-ELOOP);
2819	}
2820#ifdef CONFIG_DST_CACHE
2821	if (use_cache)
2822		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2823#endif
2824	*saddr = fl4.saddr;
2825	return rt;
2826}
2827EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
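/* A hedged usage sketch for tunnel transmit paths (names are illustrative;
 * collect_md style drivers typically do something like this with the
 * ip_tunnel_info attached to the skb):
 *
 *	__be32 saddr;
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_tunnel(skb, dev, dev_net(dev), &saddr,
 *				    info, IPPROTO_UDP, use_cache);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	(rt->dst is the underlay route; *saddr is the chosen local address.)
 */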
2828
2829/* called with rcu_read_lock held */
2830static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2831			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2832			struct sk_buff *skb, u32 portid, u32 seq,
2833			unsigned int flags)
2834{
 
2835	struct rtmsg *r;
2836	struct nlmsghdr *nlh;
2837	unsigned long expires = 0;
2838	u32 error;
2839	u32 metrics[RTAX_MAX];
2840
2841	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2842	if (!nlh)
2843		return -EMSGSIZE;
2844
2845	r = nlmsg_data(nlh);
2846	r->rtm_family	 = AF_INET;
2847	r->rtm_dst_len	= 32;
2848	r->rtm_src_len	= 0;
2849	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2850	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2851	if (nla_put_u32(skb, RTA_TABLE, table_id))
2852		goto nla_put_failure;
2853	r->rtm_type	= rt->rt_type;
2854	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2855	r->rtm_protocol = RTPROT_UNSPEC;
2856	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2857	if (rt->rt_flags & RTCF_NOTIFY)
2858		r->rtm_flags |= RTM_F_NOTIFY;
2859	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2860		r->rtm_flags |= RTCF_DOREDIRECT;
2861
2862	if (nla_put_in_addr(skb, RTA_DST, dst))
2863		goto nla_put_failure;
2864	if (src) {
2865		r->rtm_src_len = 32;
2866		if (nla_put_in_addr(skb, RTA_SRC, src))
2867			goto nla_put_failure;
2868	}
2869	if (rt->dst.dev &&
2870	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2871		goto nla_put_failure;
2872#ifdef CONFIG_IP_ROUTE_CLASSID
2873	if (rt->dst.tclassid &&
2874	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2875		goto nla_put_failure;
2876#endif
2877	if (fl4 && !rt_is_input_route(rt) &&
2878	    fl4->saddr != src) {
2879		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2880			goto nla_put_failure;
2881	}
2882	if (rt->rt_uses_gateway) {
2883		if (rt->rt_gw_family == AF_INET &&
2884		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2885			goto nla_put_failure;
2886		} else if (rt->rt_gw_family == AF_INET6) {
2887			int alen = sizeof(struct in6_addr);
2888			struct nlattr *nla;
2889			struct rtvia *via;
2890
2891			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2892			if (!nla)
2893				goto nla_put_failure;
2894
2895			via = nla_data(nla);
2896			via->rtvia_family = AF_INET6;
2897			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2898		}
2899	}
2900
2901	expires = rt->dst.expires;
2902	if (expires) {
2903		unsigned long now = jiffies;
2904
2905		if (time_before(now, expires))
2906			expires -= now;
2907		else
2908			expires = 0;
2909	}
 
 
 
2910
2911	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2912	if (rt->rt_pmtu && expires)
2913		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2914	if (rt->rt_mtu_locked && expires)
2915		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2916	if (rtnetlink_put_metrics(skb, metrics) < 0)
2917		goto nla_put_failure;
2918
2919	if (fl4) {
2920		if (fl4->flowi4_mark &&
2921		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2922			goto nla_put_failure;
2923
2924		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2925		    nla_put_u32(skb, RTA_UID,
2926				from_kuid_munged(current_user_ns(),
2927						 fl4->flowi4_uid)))
2928			goto nla_put_failure;
2929
2930		if (rt_is_input_route(rt)) {
2931#ifdef CONFIG_IP_MROUTE
2932			if (ipv4_is_multicast(dst) &&
2933			    !ipv4_is_local_multicast(dst) &&
2934			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2935				int err = ipmr_get_route(net, skb,
2936							 fl4->saddr, fl4->daddr,
2937							 r, portid);
2938
2939				if (err <= 0) {
2940					if (err == 0)
2941						return 0;
2942					goto nla_put_failure;
2943				}
2944			} else
 
2945#endif
2946				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2947					goto nla_put_failure;
2948		}
2949	}
2950
2951	error = rt->dst.error;
2952
2953	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2954		goto nla_put_failure;
2955
2956	nlmsg_end(skb, nlh);
2957	return 0;
2958
2959nla_put_failure:
2960	nlmsg_cancel(skb, nlh);
2961	return -EMSGSIZE;
2962}
2963
2964static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2965			    struct netlink_callback *cb, u32 table_id,
2966			    struct fnhe_hash_bucket *bucket, int genid,
2967			    int *fa_index, int fa_start, unsigned int flags)
2968{
2969	int i;
2970
2971	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2972		struct fib_nh_exception *fnhe;
2973
2974		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2975		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2976			struct rtable *rt;
2977			int err;
2978
2979			if (*fa_index < fa_start)
2980				goto next;
2981
2982			if (fnhe->fnhe_genid != genid)
2983				goto next;
2984
2985			if (fnhe->fnhe_expires &&
2986			    time_after(jiffies, fnhe->fnhe_expires))
2987				goto next;
2988
2989			rt = rcu_dereference(fnhe->fnhe_rth_input);
2990			if (!rt)
2991				rt = rcu_dereference(fnhe->fnhe_rth_output);
2992			if (!rt)
2993				goto next;
2994
2995			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2996					   table_id, NULL, skb,
2997					   NETLINK_CB(cb->skb).portid,
2998					   cb->nlh->nlmsg_seq, flags);
2999			if (err)
3000				return err;
3001next:
3002			(*fa_index)++;
3003		}
3004	}
3005
3006	return 0;
3007}
3008
3009int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3010		       u32 table_id, struct fib_info *fi,
3011		       int *fa_index, int fa_start, unsigned int flags)
3012{
3013	struct net *net = sock_net(cb->skb->sk);
3014	int nhsel, genid = fnhe_genid(net);
3015
3016	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3017		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3018		struct fnhe_hash_bucket *bucket;
3019		int err;
3020
3021		if (nhc->nhc_flags & RTNH_F_DEAD)
3022			continue;
3023
3024		rcu_read_lock();
3025		bucket = rcu_dereference(nhc->nhc_exceptions);
3026		err = 0;
3027		if (bucket)
3028			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3029					       genid, fa_index, fa_start,
3030					       flags);
3031		rcu_read_unlock();
3032		if (err)
3033			return err;
3034	}
3035
3036	return 0;
3037}
3038
3039static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3040						   u8 ip_proto, __be16 sport,
3041						   __be16 dport)
3042{
3043	struct sk_buff *skb;
3044	struct iphdr *iph;
3045
3046	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3047	if (!skb)
3048		return NULL;
3049
3050	/* Reserve room for dummy headers; this skb can pass
3051	 * through a good chunk of the routing engine.
3052	 */
3053	skb_reset_mac_header(skb);
3054	skb_reset_network_header(skb);
3055	skb->protocol = htons(ETH_P_IP);
3056	iph = skb_put(skb, sizeof(struct iphdr));
3057	iph->protocol = ip_proto;
3058	iph->saddr = src;
3059	iph->daddr = dst;
3060	iph->version = 0x4;
3061	iph->frag_off = 0;
3062	iph->ihl = 0x5;
3063	skb_set_transport_header(skb, skb->len);
3064
3065	switch (iph->protocol) {
3066	case IPPROTO_UDP: {
3067		struct udphdr *udph;
3068
3069		udph = skb_put_zero(skb, sizeof(struct udphdr));
3070		udph->source = sport;
3071		udph->dest = dport;
3072		udph->len = sizeof(struct udphdr);
3073		udph->check = 0;
3074		break;
3075	}
3076	case IPPROTO_TCP: {
3077		struct tcphdr *tcph;
3078
3079		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3080		tcph->source	= sport;
3081		tcph->dest	= dport;
3082		tcph->doff	= sizeof(struct tcphdr) / 4;
3083		tcph->rst = 1;
3084		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3085					    src, dst, 0);
3086		break;
3087	}
3088	case IPPROTO_ICMP: {
3089		struct icmphdr *icmph;
3090
3091		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3092		icmph->type = ICMP_ECHO;
3093		icmph->code = 0;
3094	}
3095	}
3096
3097	return skb;
3098}
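/* The dummy IP and transport headers built above only need to be complete
 * enough for the route lookup and for RTA_IP_PROTO/RTA_SPORT/RTA_DPORT
 * matching; the skb is never transmitted and is later trimmed and reused as
 * the netlink reply buffer in inet_rtm_getroute().
 */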
3099
3100static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3101				       const struct nlmsghdr *nlh,
3102				       struct nlattr **tb,
3103				       struct netlink_ext_ack *extack)
3104{
3105	struct rtmsg *rtm;
3106	int i, err;
3107
3108	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3109		NL_SET_ERR_MSG(extack,
3110			       "ipv4: Invalid header for route get request");
3111		return -EINVAL;
3112	}
3113
3114	if (!netlink_strict_get_check(skb))
3115		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3116					      rtm_ipv4_policy, extack);
3117
3118	rtm = nlmsg_data(nlh);
3119	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3120	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3121	    rtm->rtm_table || rtm->rtm_protocol ||
3122	    rtm->rtm_scope || rtm->rtm_type) {
3123		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3124		return -EINVAL;
3125	}
3126
3127	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3128			       RTM_F_LOOKUP_TABLE |
3129			       RTM_F_FIB_MATCH)) {
3130		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3131		return -EINVAL;
3132	}
3133
3134	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3135					    rtm_ipv4_policy, extack);
3136	if (err)
3137		return err;
3138
3139	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3140	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3141		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3142		return -EINVAL;
3143	}
3144
3145	for (i = 0; i <= RTA_MAX; i++) {
3146		if (!tb[i])
3147			continue;
3148
3149		switch (i) {
3150		case RTA_IIF:
3151		case RTA_OIF:
3152		case RTA_SRC:
3153		case RTA_DST:
3154		case RTA_IP_PROTO:
3155		case RTA_SPORT:
3156		case RTA_DPORT:
3157		case RTA_MARK:
3158		case RTA_UID:
3159			break;
3160		default:
3161			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3162			return -EINVAL;
3163		}
3164	}
3165
3166	return 0;
3167}
3168
3169static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3170			     struct netlink_ext_ack *extack)
3171{
3172	struct net *net = sock_net(in_skb->sk);
 
3173	struct nlattr *tb[RTA_MAX+1];
3174	u32 table_id = RT_TABLE_MAIN;
3175	__be16 sport = 0, dport = 0;
3176	struct fib_result res = {};
3177	u8 ip_proto = IPPROTO_UDP;
3178	struct rtable *rt = NULL;
3179	struct sk_buff *skb;
3180	struct rtmsg *rtm;
3181	struct flowi4 fl4 = {};
3182	__be32 dst = 0;
3183	__be32 src = 0;
3184	kuid_t uid;
3185	u32 iif;
3186	int err;
3187	int mark;
 
3188
3189	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3190	if (err < 0)
3191		return err;
3192
3193	rtm = nlmsg_data(nlh);
3194	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3195	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3196	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3197	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3198	if (tb[RTA_UID])
3199		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3200	else
3201		uid = (iif ? INVALID_UID : current_uid());
3202
3203	if (tb[RTA_IP_PROTO]) {
3204		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3205						  &ip_proto, AF_INET, extack);
3206		if (err)
3207			return err;
3208	}
3209
3210	if (tb[RTA_SPORT])
3211		sport = nla_get_be16(tb[RTA_SPORT]);
3212
3213	if (tb[RTA_DPORT])
3214		dport = nla_get_be16(tb[RTA_DPORT]);
3215
3216	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3217	if (!skb)
3218		return -ENOBUFS;
3219
3220	fl4.daddr = dst;
3221	fl4.saddr = src;
3222	fl4.flowi4_tos = rtm->rtm_tos;
3223	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3224	fl4.flowi4_mark = mark;
3225	fl4.flowi4_uid = uid;
3226	if (sport)
3227		fl4.fl4_sport = sport;
3228	if (dport)
3229		fl4.fl4_dport = dport;
3230	fl4.flowi4_proto = ip_proto;
3231
3232	rcu_read_lock();
 
 
 
3233
3234	if (iif) {
3235		struct net_device *dev;
3236
3237		dev = dev_get_by_index_rcu(net, iif);
3238		if (!dev) {
3239			err = -ENODEV;
3240			goto errout_rcu;
3241		}
3242
3243		fl4.flowi4_iif = iif; /* for rt_fill_info */
3244		skb->dev	= dev;
3245		skb->mark	= mark;
3246		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3247					 dev, &res);
 
3248
3249		rt = skb_rtable(skb);
3250		if (err == 0 && rt->dst.error)
3251			err = -rt->dst.error;
3252	} else {
3253		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3254		skb->dev = net->loopback_dev;
3255		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3256		err = 0;
3257		if (IS_ERR(rt))
3258			err = PTR_ERR(rt);
3259		else
3260			skb_dst_set(skb, &rt->dst);
3261	}
3262
3263	if (err)
3264		goto errout_rcu;
3265
 
3266	if (rtm->rtm_flags & RTM_F_NOTIFY)
3267		rt->rt_flags |= RTCF_NOTIFY;
3268
3269	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3270		table_id = res.table ? res.table->tb_id : 0;
 
 
3271
3272	/* reset skb for netlink reply msg */
3273	skb_trim(skb, 0);
3274	skb_reset_network_header(skb);
3275	skb_reset_transport_header(skb);
3276	skb_reset_mac_header(skb);
3277
3278	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3279		struct fib_rt_info fri;
 
 
3280
3281		if (!res.fi) {
3282			err = fib_props[res.type].error;
3283			if (!err)
3284				err = -EHOSTUNREACH;
3285			goto errout_rcu;
3286		}
3287		fri.fi = res.fi;
3288		fri.tb_id = table_id;
3289		fri.dst = res.prefix;
3290		fri.dst_len = res.prefixlen;
3291		fri.tos = fl4.flowi4_tos;
3292		fri.type = rt->rt_type;
3293		fri.offload = 0;
3294		fri.trap = 0;
3295		if (res.fa_head) {
3296			struct fib_alias *fa;
3297
3298			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3299				u8 slen = 32 - fri.dst_len;
3300
3301				if (fa->fa_slen == slen &&
3302				    fa->tb_id == fri.tb_id &&
3303				    fa->fa_tos == fri.tos &&
3304				    fa->fa_info == res.fi &&
3305				    fa->fa_type == fri.type) {
3306					fri.offload = fa->offload;
3307					fri.trap = fa->trap;
3308					break;
3309				}
 
3310			}
 
3311		}
3312		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3313				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3314	} else {
3315		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3316				   NETLINK_CB(in_skb).portid,
3317				   nlh->nlmsg_seq, 0);
3318	}
3319	if (err < 0)
3320		goto errout_rcu;
3321
3322	rcu_read_unlock();
3323
3324	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3325
3326errout_free:
3327	return err;
3328errout_rcu:
3329	rcu_read_unlock();
3330	kfree_skb(skb);
3331	goto errout_free;
3332}
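/* From user space this handler is exercised by RTM_GETROUTE requests, e.g.
 * (illustrative invocations):
 *
 *	$ ip route get 192.0.2.1
 *	$ ip route get 192.0.2.1 from 198.51.100.7 iif eth0
 *	$ ip route get 192.0.2.1 fibmatch	(sets RTM_F_FIB_MATCH)
 */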
3333
3334void ip_rt_multicast_event(struct in_device *in_dev)
3335{
3336	rt_cache_flush(dev_net(in_dev->dev));
3337}
3338
3339#ifdef CONFIG_SYSCTL
3340static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3341static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3342static int ip_rt_gc_elasticity __read_mostly	= 8;
3343static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3344
3345static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3346		void *buffer, size_t *lenp, loff_t *ppos)
3347{
3348	struct net *net = (struct net *)__ctl->extra1;
3349
3350	if (write) {
3351		rt_cache_flush(net);
3352		fnhe_genid_bump(net);
3353		return 0;
3354	}
3355
3356	return -EINVAL;
3357}
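/* Writing any value to the (write-only) "flush" sysctl below invokes this
 * handler, flushing the cached routes and bumping the fnhe genid, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */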
3358
3359static struct ctl_table ipv4_route_table[] = {
3360	{
3361		.procname	= "gc_thresh",
3362		.data		= &ipv4_dst_ops.gc_thresh,
3363		.maxlen		= sizeof(int),
3364		.mode		= 0644,
3365		.proc_handler	= proc_dointvec,
3366	},
3367	{
3368		.procname	= "max_size",
3369		.data		= &ip_rt_max_size,
3370		.maxlen		= sizeof(int),
3371		.mode		= 0644,
3372		.proc_handler	= proc_dointvec,
3373	},
3374	{
3375		/*  Deprecated. Use gc_min_interval_ms */
3376
3377		.procname	= "gc_min_interval",
3378		.data		= &ip_rt_gc_min_interval,
3379		.maxlen		= sizeof(int),
3380		.mode		= 0644,
3381		.proc_handler	= proc_dointvec_jiffies,
3382	},
3383	{
3384		.procname	= "gc_min_interval_ms",
3385		.data		= &ip_rt_gc_min_interval,
3386		.maxlen		= sizeof(int),
3387		.mode		= 0644,
3388		.proc_handler	= proc_dointvec_ms_jiffies,
3389	},
3390	{
3391		.procname	= "gc_timeout",
3392		.data		= &ip_rt_gc_timeout,
3393		.maxlen		= sizeof(int),
3394		.mode		= 0644,
3395		.proc_handler	= proc_dointvec_jiffies,
3396	},
3397	{
3398		.procname	= "gc_interval",
3399		.data		= &ip_rt_gc_interval,
3400		.maxlen		= sizeof(int),
3401		.mode		= 0644,
3402		.proc_handler	= proc_dointvec_jiffies,
3403	},
3404	{
3405		.procname	= "redirect_load",
3406		.data		= &ip_rt_redirect_load,
3407		.maxlen		= sizeof(int),
3408		.mode		= 0644,
3409		.proc_handler	= proc_dointvec,
3410	},
3411	{
3412		.procname	= "redirect_number",
3413		.data		= &ip_rt_redirect_number,
3414		.maxlen		= sizeof(int),
3415		.mode		= 0644,
3416		.proc_handler	= proc_dointvec,
3417	},
3418	{
3419		.procname	= "redirect_silence",
3420		.data		= &ip_rt_redirect_silence,
3421		.maxlen		= sizeof(int),
3422		.mode		= 0644,
3423		.proc_handler	= proc_dointvec,
3424	},
3425	{
3426		.procname	= "error_cost",
3427		.data		= &ip_rt_error_cost,
3428		.maxlen		= sizeof(int),
3429		.mode		= 0644,
3430		.proc_handler	= proc_dointvec,
3431	},
3432	{
3433		.procname	= "error_burst",
3434		.data		= &ip_rt_error_burst,
3435		.maxlen		= sizeof(int),
3436		.mode		= 0644,
3437		.proc_handler	= proc_dointvec,
3438	},
3439	{
3440		.procname	= "gc_elasticity",
3441		.data		= &ip_rt_gc_elasticity,
3442		.maxlen		= sizeof(int),
3443		.mode		= 0644,
3444		.proc_handler	= proc_dointvec,
3445	},
3446	{
3447		.procname	= "mtu_expires",
3448		.data		= &ip_rt_mtu_expires,
3449		.maxlen		= sizeof(int),
3450		.mode		= 0644,
3451		.proc_handler	= proc_dointvec_jiffies,
3452	},
3453	{
3454		.procname	= "min_pmtu",
3455		.data		= &ip_rt_min_pmtu,
3456		.maxlen		= sizeof(int),
3457		.mode		= 0644,
3458		.proc_handler	= proc_dointvec_minmax,
3459		.extra1		= &ip_min_valid_pmtu,
3460	},
3461	{
3462		.procname	= "min_adv_mss",
3463		.data		= &ip_rt_min_advmss,
3464		.maxlen		= sizeof(int),
3465		.mode		= 0644,
3466		.proc_handler	= proc_dointvec,
3467	},
3468	{ }
3469};
3470
3471static const char ipv4_route_flush_procname[] = "flush";
3472
3473static struct ctl_table ipv4_route_flush_table[] = {
3474	{
3475		.procname	= ipv4_route_flush_procname,
3476		.maxlen		= sizeof(int),
3477		.mode		= 0200,
3478		.proc_handler	= ipv4_sysctl_rtcache_flush,
3479	},
3480	{ },
3481};
3482
3483static __net_init int sysctl_route_net_init(struct net *net)
3484{
3485	struct ctl_table *tbl;
3486
3487	tbl = ipv4_route_flush_table;
3488	if (!net_eq(net, &init_net)) {
3489		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3490		if (!tbl)
3491			goto err_dup;
3492
3493		/* Don't export non-whitelisted sysctls to unprivileged users */
3494		if (net->user_ns != &init_user_ns) {
3495			if (tbl[0].procname != ipv4_route_flush_procname)
3496				tbl[0].procname = NULL;
3497		}
3498	}
3499	tbl[0].extra1 = net;
3500
3501	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3502	if (!net->ipv4.route_hdr)
3503		goto err_reg;
3504	return 0;
3505
3506err_reg:
3507	if (tbl != ipv4_route_flush_table)
3508		kfree(tbl);
3509err_dup:
3510	return -ENOMEM;
3511}
3512
3513static __net_exit void sysctl_route_net_exit(struct net *net)
3514{
3515	struct ctl_table *tbl;
3516
3517	tbl = net->ipv4.route_hdr->ctl_table_arg;
3518	unregister_net_sysctl_table(net->ipv4.route_hdr);
3519	BUG_ON(tbl == ipv4_route_flush_table);
3520	kfree(tbl);
3521}
3522
3523static __net_initdata struct pernet_operations sysctl_route_ops = {
3524	.init = sysctl_route_net_init,
3525	.exit = sysctl_route_net_exit,
3526};
3527#endif
3528
3529static __net_init int rt_genid_init(struct net *net)
3530{
3531	atomic_set(&net->ipv4.rt_genid, 0);
3532	atomic_set(&net->fnhe_genid, 0);
3533	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
 
3534	return 0;
3535}
3536
3537static __net_initdata struct pernet_operations rt_genid_ops = {
3538	.init = rt_genid_init,
3539};
3540
3541static int __net_init ipv4_inetpeer_init(struct net *net)
3542{
3543	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3544
3545	if (!bp)
3546		return -ENOMEM;
3547	inet_peer_base_init(bp);
3548	net->ipv4.peers = bp;
3549	return 0;
3550}
3551
3552static void __net_exit ipv4_inetpeer_exit(struct net *net)
 
3553{
3554	struct inet_peer_base *bp = net->ipv4.peers;
3555
3556	net->ipv4.peers = NULL;
3557	inetpeer_invalidate_tree(bp);
3558	kfree(bp);
3559}
3560
3561static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3562	.init	=	ipv4_inetpeer_init,
3563	.exit	=	ipv4_inetpeer_exit,
3564};
3565
3566#ifdef CONFIG_IP_ROUTE_CLASSID
3567struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3568#endif /* CONFIG_IP_ROUTE_CLASSID */
3569
3570int __init ip_rt_init(void)
3571{
3572	int cpu;
3573
3574	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3575				  GFP_KERNEL);
3576	if (!ip_idents)
3577		panic("IP: failed to allocate ip_idents\n");
3578
3579	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3580
3581	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3582	if (!ip_tstamps)
3583		panic("IP: failed to allocate ip_tstamps\n");
3584
3585	for_each_possible_cpu(cpu) {
3586		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3587
3588		INIT_LIST_HEAD(&ul->head);
3589		spin_lock_init(&ul->lock);
3590	}
3591#ifdef CONFIG_IP_ROUTE_CLASSID
3592	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3593	if (!ip_rt_acct)
3594		panic("IP: failed to allocate ip_rt_acct\n");
3595#endif
3596
3597	ipv4_dst_ops.kmem_cachep =
3598		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3599				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3600
3601	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3602
3603	if (dst_entries_init(&ipv4_dst_ops) < 0)
3604		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3605
3606	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3607		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3608
3609	ipv4_dst_ops.gc_thresh = ~0;
3610	ip_rt_max_size = INT_MAX;
3611
3612	devinet_init();
3613	ip_fib_init();
3614
3615	if (ip_rt_proc_init())
3616		pr_err("Unable to create route proc files\n");
3617#ifdef CONFIG_XFRM
3618	xfrm_init();
3619	xfrm4_init();
3620#endif
3621	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3622		      RTNL_FLAG_DOIT_UNLOCKED);
3623
3624#ifdef CONFIG_SYSCTL
3625	register_pernet_subsys(&sysctl_route_ops);
3626#endif
3627	register_pernet_subsys(&rt_genid_ops);
3628	register_pernet_subsys(&ipv4_inetpeer_ops);
3629	return 0;
3630}
3631
3632#ifdef CONFIG_SYSCTL
3633/*
3634 * We really need to sanitize the damn ipv4 init order, then all
3635 * this nonsense will go away.
3636 */
3637void __init ip_static_sysctl_init(void)
3638{
3639	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3640}
3641#endif
v3.5.6
 
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		ROUTE - implementation of the IP router.
   7 *
   8 * Authors:	Ross Biro
   9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *		Alan Cox	:	Verify area fixes.
  16 *		Alan Cox	:	cli() protects routing changes
  17 *		Rui Oliveira	:	ICMP routing table updates
  18 *		(rco@di.uminho.pt)	Routing table insertion and update
  19 *		Linus Torvalds	:	Rewrote bits to be sensible
  20 *		Alan Cox	:	Added BSD route gw semantics
  21 *		Alan Cox	:	Super /proc >4K
  22 *		Alan Cox	:	MTU in route table
  23 *		Alan Cox	: 	MSS actually. Also added the window
  24 *					clamper.
  25 *		Sam Lantinga	:	Fixed route matching in rt_del()
  26 *		Alan Cox	:	Routing cache support.
  27 *		Alan Cox	:	Removed compatibility cruft.
  28 *		Alan Cox	:	RTF_REJECT support.
  29 *		Alan Cox	:	TCP irtt support.
  30 *		Jonathan Naylor	:	Added Metric support.
  31 *	Miquel van Smoorenburg	:	BSD API fixes.
  32 *	Miquel van Smoorenburg	:	Metrics.
  33 *		Alan Cox	:	Use __u32 properly
  34 *		Alan Cox	:	Aligned routing errors more closely with BSD
  35 *					our system is still very different.
  36 *		Alan Cox	:	Faster /proc handling
  37 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  38 *					routing caches and better behaviour.
  39 *
  40 *		Olaf Erb	:	irtt wasn't being copied right.
  41 *		Bjorn Ekwall	:	Kerneld route support.
  42 *		Alan Cox	:	Multicast fixed (I hope)
  43 * 		Pavel Krauz	:	Limited broadcast fixed
  44 *		Mike McLagan	:	Routing by source
  45 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  46 *					route.c and rewritten from scratch.
  47 *		Andi Kleen	:	Load-limit warning messages.
  48 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  49 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  50 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  51 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  52 *		Marc Boucher	:	routing by fwmark
  53 *	Robert Olsson		:	Added rt_cache statistics
  54 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  55 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  56 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  57 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  58 *
  59 *		This program is free software; you can redistribute it and/or
  60 *		modify it under the terms of the GNU General Public License
  61 *		as published by the Free Software Foundation; either version
  62 *		2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/bootmem.h>
  74#include <linux/string.h>
  75#include <linux/socket.h>
  76#include <linux/sockios.h>
  77#include <linux/errno.h>
  78#include <linux/in.h>
  79#include <linux/inet.h>
  80#include <linux/netdevice.h>
  81#include <linux/proc_fs.h>
  82#include <linux/init.h>
  83#include <linux/workqueue.h>
  84#include <linux/skbuff.h>
  85#include <linux/inetdevice.h>
  86#include <linux/igmp.h>
  87#include <linux/pkt_sched.h>
  88#include <linux/mroute.h>
  89#include <linux/netfilter_ipv4.h>
  90#include <linux/random.h>
  91#include <linux/jhash.h>
  92#include <linux/rcupdate.h>
  93#include <linux/times.h>
  94#include <linux/slab.h>
  95#include <linux/prefetch.h>
  96#include <net/dst.h>
 
  97#include <net/net_namespace.h>
  98#include <net/protocol.h>
  99#include <net/ip.h>
 100#include <net/route.h>
 101#include <net/inetpeer.h>
 102#include <net/sock.h>
 103#include <net/ip_fib.h>
 
 104#include <net/arp.h>
 105#include <net/tcp.h>
 106#include <net/icmp.h>
 107#include <net/xfrm.h>
 
 108#include <net/netevent.h>
 109#include <net/rtnetlink.h>
 110#ifdef CONFIG_SYSCTL
 111#include <linux/sysctl.h>
 112#include <linux/kmemleak.h>
 113#endif
 114#include <net/secure_seq.h>
 
 
 
 
 115
 116#define RT_FL_TOS(oldflp4) \
 117	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119#define IP_MAX_MTU	0xFFF0
 120
 121#define RT_GC_TIMEOUT (300*HZ)
 122
 123static int ip_rt_max_size;
 124static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 125static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
 126static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
 127static int ip_rt_redirect_number __read_mostly	= 9;
 128static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 129static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 130static int ip_rt_error_cost __read_mostly	= HZ;
 131static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 132static int ip_rt_gc_elasticity __read_mostly	= 8;
 133static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 134static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 135static int ip_rt_min_advmss __read_mostly	= 256;
 136static int rt_chain_length_max __read_mostly	= 20;
 137
 138static struct delayed_work expires_work;
 139static unsigned long expires_ljiffies;
 140
 141/*
 142 *	Interface to generic destination cache.
 143 */
 144
 145static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 146static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 147static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 148static void		 ipv4_dst_destroy(struct dst_entry *dst);
 149static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 150static void		 ipv4_link_failure(struct sk_buff *skb);
 151static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 152static int rt_garbage_collect(struct dst_ops *ops);
 153
 154static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 155			    int how)
 156{
 157}
 158
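/* Copy-on-write of dst metrics: the writable metrics array lives in the
 * route's inet_peer.  On the first write the shared, read-only values are
 * copied into peer->metrics and dst->_metrics is switched over with
 * cmpxchg(), so racing CPUs converge on a single owner; a fib_info pinned
 * only for its metrics can then be dropped.
 */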
 159static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 160{
 161	struct rtable *rt = (struct rtable *) dst;
 162	struct inet_peer *peer;
 163	u32 *p = NULL;
 164
 165	if (!rt->peer)
 166		rt_bind_peer(rt, rt->rt_dst, 1);
 167
 168	peer = rt->peer;
 169	if (peer) {
 170		u32 *old_p = __DST_METRICS_PTR(old);
 171		unsigned long prev, new;
 172
 173		p = peer->metrics;
 174		if (inet_metrics_new(peer))
 175			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 176
 177		new = (unsigned long) p;
 178		prev = cmpxchg(&dst->_metrics, old, new);
 179
 180		if (prev != old) {
 181			p = __DST_METRICS_PTR(prev);
 182			if (prev & DST_METRICS_READ_ONLY)
 183				p = NULL;
 184		} else {
 185			if (rt->fi) {
 186				fib_info_put(rt->fi);
 187				rt->fi = NULL;
 188			}
 189		}
 190	}
 191	return p;
 192}
 193
 194static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);
 195
 196static struct dst_ops ipv4_dst_ops = {
 197	.family =		AF_INET,
 198	.protocol =		cpu_to_be16(ETH_P_IP),
 199	.gc =			rt_garbage_collect,
 200	.check =		ipv4_dst_check,
 201	.default_advmss =	ipv4_default_advmss,
 202	.mtu =			ipv4_mtu,
 203	.cow_metrics =		ipv4_cow_metrics,
 204	.destroy =		ipv4_dst_destroy,
 205	.ifdown =		ipv4_dst_ifdown,
 206	.negative_advice =	ipv4_negative_advice,
 207	.link_failure =		ipv4_link_failure,
 208	.update_pmtu =		ip_rt_update_pmtu,
 209	.local_out =		__ip_local_out,
 210	.neigh_lookup =		ipv4_neigh_lookup,
 211};
 212
 213#define ECN_OR_COST(class)	TC_PRIO_##class
 214
 215const __u8 ip_tos2prio[16] = {
 216	TC_PRIO_BESTEFFORT,
 217	ECN_OR_COST(BESTEFFORT),
 218	TC_PRIO_BESTEFFORT,
 219	ECN_OR_COST(BESTEFFORT),
 220	TC_PRIO_BULK,
 221	ECN_OR_COST(BULK),
 222	TC_PRIO_BULK,
 223	ECN_OR_COST(BULK),
 224	TC_PRIO_INTERACTIVE,
 225	ECN_OR_COST(INTERACTIVE),
 226	TC_PRIO_INTERACTIVE,
 227	ECN_OR_COST(INTERACTIVE),
 228	TC_PRIO_INTERACTIVE_BULK,
 229	ECN_OR_COST(INTERACTIVE_BULK),
 230	TC_PRIO_INTERACTIVE_BULK,
 231	ECN_OR_COST(INTERACTIVE_BULK)
 232};
 233EXPORT_SYMBOL(ip_tos2prio);
 234
 235/*
 236 * Route cache.
 237 */
 238
  239/* The locking scheme is rather straightforward:
 240 *
 241 * 1) Read-Copy Update protects the buckets of the central route hash.
 242 * 2) Only writers remove entries, and they hold the lock
 243 *    as they look at rtable reference counts.
 244 * 3) Only readers acquire references to rtable entries,
 245 *    they do so with atomic increments and with the
 246 *    lock held.
 247 */
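/* For example, the /proc iterators further down walk a bucket chain under
 * rcu_read_lock_bh() without taking any spinlock, while writers such as
 * rt_intern_hash(), rt_del() and rt_do_flush() unlink entries only while
 * holding the per-bucket lock returned by rt_hash_lock_addr().
 */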
 248
 249struct rt_hash_bucket {
 250	struct rtable __rcu	*chain;
 251};
 252
 253#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 254	defined(CONFIG_PROVE_LOCKING)
 255/*
  256 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
  257 * The size of this table is a power of two and depends on the number of CPUs.
 258 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 259 */
 260#ifdef CONFIG_LOCKDEP
 261# define RT_HASH_LOCK_SZ	256
 262#else
 263# if NR_CPUS >= 32
 264#  define RT_HASH_LOCK_SZ	4096
 265# elif NR_CPUS >= 16
 266#  define RT_HASH_LOCK_SZ	2048
 267# elif NR_CPUS >= 8
 268#  define RT_HASH_LOCK_SZ	1024
 269# elif NR_CPUS >= 4
 270#  define RT_HASH_LOCK_SZ	512
 271# else
 272#  define RT_HASH_LOCK_SZ	256
 273# endif
 274#endif
 275
 276static spinlock_t	*rt_hash_locks;
 277# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 278
 279static __init void rt_hash_lock_init(void)
 280{
 281	int i;
 282
 283	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
 284			GFP_KERNEL);
 285	if (!rt_hash_locks)
 286		panic("IP: failed to allocate rt_hash_locks\n");
 287
 288	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
 289		spin_lock_init(&rt_hash_locks[i]);
 290}
 291#else
 292# define rt_hash_lock_addr(slot) NULL
 293
 294static inline void rt_hash_lock_init(void)
 295{
 296}
 297#endif
 298
 299static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
 300static unsigned int		rt_hash_mask __read_mostly;
 301static unsigned int		rt_hash_log  __read_mostly;
 302
 303static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 304#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 305
 306static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
 307				   int genid)
 308{
 309	return jhash_3words((__force u32)daddr, (__force u32)saddr,
 310			    idx, genid)
 311		& rt_hash_mask;
 312}
 313
 314static inline int rt_genid(struct net *net)
 315{
 316	return atomic_read(&net->ipv4.rt_genid);
 317}
 318
 319#ifdef CONFIG_PROC_FS
 320struct rt_cache_iter_state {
 321	struct seq_net_private p;
 322	int bucket;
 323	int genid;
 324};
 325
 326static struct rtable *rt_cache_get_first(struct seq_file *seq)
 327{
 328	struct rt_cache_iter_state *st = seq->private;
 329	struct rtable *r = NULL;
 330
 331	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 332		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
 333			continue;
 334		rcu_read_lock_bh();
 335		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 336		while (r) {
 337			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
 338			    r->rt_genid == st->genid)
 339				return r;
 340			r = rcu_dereference_bh(r->dst.rt_next);
 341		}
 342		rcu_read_unlock_bh();
 343	}
 344	return r;
 345}
 346
 347static struct rtable *__rt_cache_get_next(struct seq_file *seq,
 348					  struct rtable *r)
 349{
 350	struct rt_cache_iter_state *st = seq->private;
 351
 352	r = rcu_dereference_bh(r->dst.rt_next);
 353	while (!r) {
 354		rcu_read_unlock_bh();
 355		do {
 356			if (--st->bucket < 0)
 357				return NULL;
 358		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
 359		rcu_read_lock_bh();
 360		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
 361	}
 362	return r;
 363}
 364
 365static struct rtable *rt_cache_get_next(struct seq_file *seq,
 366					struct rtable *r)
 367{
 368	struct rt_cache_iter_state *st = seq->private;
 369	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
 370		if (dev_net(r->dst.dev) != seq_file_net(seq))
 371			continue;
 372		if (r->rt_genid == st->genid)
 373			break;
 374	}
 375	return r;
 376}
 377
 378static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 379{
 380	struct rtable *r = rt_cache_get_first(seq);
 381
 382	if (r)
 383		while (pos && (r = rt_cache_get_next(seq, r)))
 384			--pos;
 385	return pos ? NULL : r;
 386}
 387
 388static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 389{
 390	struct rt_cache_iter_state *st = seq->private;
 391	if (*pos)
 392		return rt_cache_get_idx(seq, *pos - 1);
 393	st->genid = rt_genid(seq_file_net(seq));
 394	return SEQ_START_TOKEN;
 395}
 396
 397static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 398{
 399	struct rtable *r;
 400
 401	if (v == SEQ_START_TOKEN)
 402		r = rt_cache_get_first(seq);
 403	else
 404		r = rt_cache_get_next(seq, v);
 405	++*pos;
 406	return r;
 407}
 408
 409static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 410{
 411	if (v && v != SEQ_START_TOKEN)
 412		rcu_read_unlock_bh();
 413}
 414
 415static int rt_cache_seq_show(struct seq_file *seq, void *v)
 416{
 417	if (v == SEQ_START_TOKEN)
 418		seq_printf(seq, "%-127s\n",
 419			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 420			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 421			   "HHUptod\tSpecDst");
 422	else {
 423		struct rtable *r = v;
 424		struct neighbour *n;
 425		int len, HHUptod;
 426
 427		rcu_read_lock();
 428		n = dst_get_neighbour_noref(&r->dst);
 429		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
 430		rcu_read_unlock();
 431
 432		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
 433			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 434			r->dst.dev ? r->dst.dev->name : "*",
 435			(__force u32)r->rt_dst,
 436			(__force u32)r->rt_gateway,
 437			r->rt_flags, atomic_read(&r->dst.__refcnt),
 438			r->dst.__use, 0, (__force u32)r->rt_src,
 439			dst_metric_advmss(&r->dst) + 40,
 440			dst_metric(&r->dst, RTAX_WINDOW),
 441			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
 442			      dst_metric(&r->dst, RTAX_RTTVAR)),
 443			r->rt_key_tos,
 444			-1,
 445			HHUptod,
 446			r->rt_spec_dst, &len);
 447
 448		seq_printf(seq, "%*s\n", 127 - len, "");
 449	}
 450	return 0;
 451}
 452
 453static const struct seq_operations rt_cache_seq_ops = {
 454	.start  = rt_cache_seq_start,
 455	.next   = rt_cache_seq_next,
 456	.stop   = rt_cache_seq_stop,
 457	.show   = rt_cache_seq_show,
 458};
 459
 460static int rt_cache_seq_open(struct inode *inode, struct file *file)
 461{
 462	return seq_open_net(inode, file, &rt_cache_seq_ops,
 463			sizeof(struct rt_cache_iter_state));
 464}
 465
 466static const struct file_operations rt_cache_seq_fops = {
 467	.owner	 = THIS_MODULE,
 468	.open	 = rt_cache_seq_open,
 469	.read	 = seq_read,
 470	.llseek	 = seq_lseek,
 471	.release = seq_release_net,
 472};
 473
 474
 475static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 476{
 477	int cpu;
 478
 479	if (*pos == 0)
 480		return SEQ_START_TOKEN;
 481
 482	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 483		if (!cpu_possible(cpu))
 484			continue;
 485		*pos = cpu+1;
 486		return &per_cpu(rt_cache_stat, cpu);
 487	}
 488	return NULL;
 489}
 490
 491static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 492{
 493	int cpu;
 494
 495	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 496		if (!cpu_possible(cpu))
 497			continue;
 498		*pos = cpu+1;
 499		return &per_cpu(rt_cache_stat, cpu);
 500	}
 501	return NULL;
 502
 503}
 504
 505static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 506{
 507
 508}
 509
 510static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 511{
 512	struct rt_cache_stat *st = v;
 513
 514	if (v == SEQ_START_TOKEN) {
 515		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 516		return 0;
 517	}
 518
 519	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 520		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 521		   dst_entries_get_slow(&ipv4_dst_ops),
 522		   st->in_hit,
 523		   st->in_slow_tot,
 524		   st->in_slow_mc,
 525		   st->in_no_route,
 526		   st->in_brd,
 527		   st->in_martian_dst,
 528		   st->in_martian_src,
 529
 530		   st->out_hit,
 531		   st->out_slow_tot,
 532		   st->out_slow_mc,
 533
 534		   st->gc_total,
 535		   st->gc_ignored,
 536		   st->gc_goal_miss,
 537		   st->gc_dst_overflow,
 538		   st->in_hlist_search,
 539		   st->out_hlist_search
 540		);
 541	return 0;
 542}
 543
 544static const struct seq_operations rt_cpu_seq_ops = {
 545	.start  = rt_cpu_seq_start,
 546	.next   = rt_cpu_seq_next,
 547	.stop   = rt_cpu_seq_stop,
 548	.show   = rt_cpu_seq_show,
 549};
 550
 551
 552static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 553{
 554	return seq_open(file, &rt_cpu_seq_ops);
 555}
 556
 557static const struct file_operations rt_cpu_seq_fops = {
 558	.owner	 = THIS_MODULE,
 559	.open	 = rt_cpu_seq_open,
 560	.read	 = seq_read,
 561	.llseek	 = seq_lseek,
 562	.release = seq_release,
 563};
 564
 565#ifdef CONFIG_IP_ROUTE_CLASSID
 566static int rt_acct_proc_show(struct seq_file *m, void *v)
 567{
 568	struct ip_rt_acct *dst, *src;
 569	unsigned int i, j;
 570
 571	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 572	if (!dst)
 573		return -ENOMEM;
 574
 575	for_each_possible_cpu(i) {
 576		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 577		for (j = 0; j < 256; j++) {
 578			dst[j].o_bytes   += src[j].o_bytes;
 579			dst[j].o_packets += src[j].o_packets;
 580			dst[j].i_bytes   += src[j].i_bytes;
 581			dst[j].i_packets += src[j].i_packets;
 582		}
 583	}
 584
 585	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 586	kfree(dst);
 587	return 0;
 588}
 589
 590static int rt_acct_proc_open(struct inode *inode, struct file *file)
 591{
 592	return single_open(file, rt_acct_proc_show, NULL);
 593}
 594
 595static const struct file_operations rt_acct_proc_fops = {
 596	.owner		= THIS_MODULE,
 597	.open		= rt_acct_proc_open,
 598	.read		= seq_read,
 599	.llseek		= seq_lseek,
 600	.release	= single_release,
 601};
 602#endif
 603
 604static int __net_init ip_rt_do_proc_init(struct net *net)
 605{
 606	struct proc_dir_entry *pde;
 607
 608	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
 609			&rt_cache_seq_fops);
 610	if (!pde)
 611		goto err1;
 612
 613	pde = proc_create("rt_cache", S_IRUGO,
 614			  net->proc_net_stat, &rt_cpu_seq_fops);
 615	if (!pde)
 616		goto err2;
 617
 618#ifdef CONFIG_IP_ROUTE_CLASSID
 619	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 620	if (!pde)
 621		goto err3;
 622#endif
 623	return 0;
 624
 625#ifdef CONFIG_IP_ROUTE_CLASSID
 626err3:
 627	remove_proc_entry("rt_cache", net->proc_net_stat);
 628#endif
 629err2:
 630	remove_proc_entry("rt_cache", net->proc_net);
 631err1:
 632	return -ENOMEM;
 633}
 634
 635static void __net_exit ip_rt_do_proc_exit(struct net *net)
 636{
 637	remove_proc_entry("rt_cache", net->proc_net_stat);
 638	remove_proc_entry("rt_cache", net->proc_net);
 639#ifdef CONFIG_IP_ROUTE_CLASSID
 640	remove_proc_entry("rt_acct", net->proc_net);
 641#endif
 642}
 643
 644static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 645	.init = ip_rt_do_proc_init,
 646	.exit = ip_rt_do_proc_exit,
 647};
 648
 649static int __init ip_rt_proc_init(void)
 650{
 651	return register_pernet_subsys(&ip_rt_proc_ops);
 652}
 653
 654#else
 655static inline int ip_rt_proc_init(void)
 656{
 657	return 0;
 658}
 659#endif /* CONFIG_PROC_FS */
 660
 661static inline void rt_free(struct rtable *rt)
 662{
 663	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 664}
 665
 666static inline void rt_drop(struct rtable *rt)
 667{
 668	ip_rt_put(rt);
 669	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 670}
 671
 672static inline int rt_fast_clean(struct rtable *rth)
 673{
  674	/* Kill broadcast/multicast entries very aggressively, if they
  675	   collide in the hash table with more useful entries */
 676	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 677		rt_is_input_route(rth) && rth->dst.rt_next;
 678}
 679
 680static inline int rt_valuable(struct rtable *rth)
 681{
 682	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 683		(rth->peer && rth->peer->pmtu_expires);
 684}
 685
 686static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 687{
 688	unsigned long age;
 689	int ret = 0;
 690
 691	if (atomic_read(&rth->dst.__refcnt))
 692		goto out;
 693
 694	age = jiffies - rth->dst.lastuse;
 695	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 696	    (age <= tmo2 && rt_valuable(rth)))
 697		goto out;
 698	ret = 1;
 699out:	return ret;
 700}
 701
 702/* Bits of score are:
 703 * 31: very valuable
 704 * 30: not quite useless
 705 * 29..0: usage counter
 706 */
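/* For example, among unreferenced entries an output route that was used a
 * few jiffies ago keeps bit 30 set and a high usage counter, and a
 * redirected route (see rt_valuable()) also sets bit 31; rt_intern_hash()
 * evicts the entry with the minimum score, so such routes are the last
 * candidates to go.
 */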
 707static inline u32 rt_score(struct rtable *rt)
 708{
 709	u32 score = jiffies - rt->dst.lastuse;
 710
 711	score = ~score & ~(3<<30);
 712
 713	if (rt_valuable(rt))
 714		score |= (1<<31);
 715
 716	if (rt_is_output_route(rt) ||
 717	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 718		score |= (1<<30);
 719
 720	return score;
 721}
 722
 723static inline bool rt_caching(const struct net *net)
 724{
 725	return net->ipv4.current_rt_cache_rebuild_count <=
 726		net->ipv4.sysctl_rt_cache_rebuild_count;
 727}
 728
 729static inline bool compare_hash_inputs(const struct rtable *rt1,
 730				       const struct rtable *rt2)
 731{
 732	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 733		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 734		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
 735}
 736
 737static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
 738{
 739	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
 740		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
 741		(rt1->rt_mark ^ rt2->rt_mark) |
 742		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
 743		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
 744		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
 745}
 746
 747static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
 748{
 749	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
 750}
 751
 752static inline int rt_is_expired(struct rtable *rth)
 753{
 754	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 755}
 756
 757/*
  758 * Perform a full scan of the hash table and free all entries.
  759 * Can be called by a softirq or a process.
  760 * In the latter case, we want to reschedule if necessary.
 761 */
 762static void rt_do_flush(struct net *net, int process_context)
 763{
 764	unsigned int i;
 765	struct rtable *rth, *next;
 766
 767	for (i = 0; i <= rt_hash_mask; i++) {
 768		struct rtable __rcu **pprev;
 769		struct rtable *list;
 770
 771		if (process_context && need_resched())
 772			cond_resched();
 773		rth = rcu_access_pointer(rt_hash_table[i].chain);
 774		if (!rth)
 775			continue;
 776
 777		spin_lock_bh(rt_hash_lock_addr(i));
 778
 779		list = NULL;
 780		pprev = &rt_hash_table[i].chain;
 781		rth = rcu_dereference_protected(*pprev,
 782			lockdep_is_held(rt_hash_lock_addr(i)));
 783
 784		while (rth) {
 785			next = rcu_dereference_protected(rth->dst.rt_next,
 786				lockdep_is_held(rt_hash_lock_addr(i)));
 787
 788			if (!net ||
 789			    net_eq(dev_net(rth->dst.dev), net)) {
 790				rcu_assign_pointer(*pprev, next);
 791				rcu_assign_pointer(rth->dst.rt_next, list);
 792				list = rth;
 793			} else {
 794				pprev = &rth->dst.rt_next;
 795			}
 796			rth = next;
 797		}
 798
 799		spin_unlock_bh(rt_hash_lock_addr(i));
 800
 801		for (; list; list = next) {
 802			next = rcu_dereference_protected(list->dst.rt_next, 1);
 803			rt_free(list);
 804		}
 805	}
 806}
 807
 808/*
 809 * While freeing expired entries, we compute average chain length
 810 * and standard deviation, using fixed-point arithmetic.
  811 * This is to have an estimation of rt_chain_length_max:
  812 *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
  813 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 814 */
 815
 816#define FRACT_BITS 3
 817#define ONE (1UL << FRACT_BITS)
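/* Worked example: with FRACT_BITS = 3, ONE = 8, so every unaliased entry
 * counted by has_noalias() adds 8 to a bucket's length.  If scanning four
 * buckets yields sum = 40 and sum2 = 464, then avg = 10, sd = 4 and
 * rt_chain_length_max = max(ip_rt_gc_elasticity, (10 + 4*4) >> 3)
 *                     = max(8, 3) = 8 with the default elasticity of 8.
 */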
 818
 819/*
 820 * Given a hash chain and an item in this hash chain,
 821 * find if a previous entry has the same hash_inputs
 822 * (but differs on tos, mark or oif)
 823 * Returns 0 if an alias is found.
 824 * Returns ONE if rth has no alias before itself.
 825 */
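/* For example, two cached routes that share daddr, saddr and input
 * interface but differ only in TOS hash to the same bucket and compare
 * equal in compare_hash_inputs(); only the first of them contributes ONE
 * to the chain-length estimate used by rt_check_expire().
 */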
 826static int has_noalias(const struct rtable *head, const struct rtable *rth)
 827{
 828	const struct rtable *aux = head;
 829
 830	while (aux != rth) {
 831		if (compare_hash_inputs(aux, rth))
 832			return 0;
 833		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
 834	}
 835	return ONE;
 836}
 837
 838static void rt_check_expire(void)
 839{
 840	static unsigned int rover;
 841	unsigned int i = rover, goal;
 842	struct rtable *rth;
 843	struct rtable __rcu **rthp;
 844	unsigned long samples = 0;
 845	unsigned long sum = 0, sum2 = 0;
 846	unsigned long delta;
 847	u64 mult;
 848
 849	delta = jiffies - expires_ljiffies;
 850	expires_ljiffies = jiffies;
 851	mult = ((u64)delta) << rt_hash_log;
 852	if (ip_rt_gc_timeout > 1)
 853		do_div(mult, ip_rt_gc_timeout);
 854	goal = (unsigned int)mult;
 855	if (goal > rt_hash_mask)
 856		goal = rt_hash_mask + 1;
 857	for (; goal > 0; goal--) {
 858		unsigned long tmo = ip_rt_gc_timeout;
 859		unsigned long length;
 860
 861		i = (i + 1) & rt_hash_mask;
 862		rthp = &rt_hash_table[i].chain;
 863
 864		if (need_resched())
 865			cond_resched();
 866
 867		samples++;
 868
 869		if (rcu_dereference_raw(*rthp) == NULL)
 870			continue;
 871		length = 0;
 872		spin_lock_bh(rt_hash_lock_addr(i));
 873		while ((rth = rcu_dereference_protected(*rthp,
 874					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
 875			prefetch(rth->dst.rt_next);
 876			if (rt_is_expired(rth)) {
 877				*rthp = rth->dst.rt_next;
 878				rt_free(rth);
 879				continue;
 880			}
 881			if (rth->dst.expires) {
 882				/* Entry is expired even if it is in use */
 883				if (time_before_eq(jiffies, rth->dst.expires)) {
 884nofree:
 885					tmo >>= 1;
 886					rthp = &rth->dst.rt_next;
 887					/*
 888					 * We only count entries on
 889					 * a chain with equal hash inputs once
 890					 * so that entries for different QOS
 891					 * levels, and other non-hash input
 892					 * attributes don't unfairly skew
 893					 * the length computation
 894					 */
 895					length += has_noalias(rt_hash_table[i].chain, rth);
 896					continue;
 897				}
 898			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
 899				goto nofree;
 900
 901			/* Cleanup aged off entries. */
 902			*rthp = rth->dst.rt_next;
 903			rt_free(rth);
 904		}
 905		spin_unlock_bh(rt_hash_lock_addr(i));
 906		sum += length;
 907		sum2 += length*length;
 908	}
 909	if (samples) {
 910		unsigned long avg = sum / samples;
 911		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
 912		rt_chain_length_max = max_t(unsigned long,
 913					ip_rt_gc_elasticity,
 914					(avg + 4*sd) >> FRACT_BITS);
 915	}
 916	rover = i;
 917}
 918
 919/*
 920 * rt_worker_func() is run in process context.
  921 * We call rt_check_expire() to scan part of the hash table.
 922 */
 923static void rt_worker_func(struct work_struct *work)
 924{
 925	rt_check_expire();
 926	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 927}
 928
 929/*
 930 * Perturbation of rt_genid by a small quantity [1..256]
  931 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
  932 * many times (2^24) without reusing a recent rt_genid.
  933 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 934 */
 935static void rt_cache_invalidate(struct net *net)
 936{
 937	unsigned char shuffle;
 938
 939	get_random_bytes(&shuffle, sizeof(shuffle));
 940	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 941	inetpeer_invalidate_tree(AF_INET);
 942}
 943
 944/*
 945 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 946 * delay >= 0 : invalidate & flush cache (can be long)
 947 */
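/* For example, rt_cache_flush(net, -1) only bumps the generation counter,
 * so stale entries die lazily once rt_is_expired() sees the new rt_genid,
 * whereas rt_cache_flush(net, 0) additionally walks every hash bucket in
 * rt_do_flush() and frees matching entries right away.
 */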
 948void rt_cache_flush(struct net *net, int delay)
 949{
 950	rt_cache_invalidate(net);
 951	if (delay >= 0)
 952		rt_do_flush(net, !in_softirq());
 953}
 954
 955/* Flush previous cache invalidated entries from the cache */
 956void rt_cache_flush_batch(struct net *net)
 957{
 958	rt_do_flush(net, !in_softirq());
 959}
 960
 961static void rt_emergency_hash_rebuild(struct net *net)
 962{
 963	net_warn_ratelimited("Route hash chain too long!\n");
 964	rt_cache_invalidate(net);
 965}
 966
 967/*
 968   Short description of GC goals.
 969
  970   We want to build an algorithm which keeps the routing cache
  971   at an equilibrium point, where the number of aged-off entries
  972   is kept approximately equal to the number of newly generated ones.
  973
  974   The current expiration strength is the variable "expire".
  975   We try to adjust it dynamically, so that if the network
  976   is idle, expire is large enough to keep enough warm entries,
  977   and when load increases it shrinks to limit the cache size.
 978 */
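/* For example, with the default ip_rt_gc_elasticity of 8 and a hash table
 * of 2^rt_hash_log buckets, the initial goal below is
 *     goal = entries - (8 << rt_hash_log),
 * so the aggressive branch only kicks in once the cache holds more than
 * about eight entries per bucket; below that, the goal is derived from
 * gc_thresh instead.
 */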
 979
 980static int rt_garbage_collect(struct dst_ops *ops)
 981{
 982	static unsigned long expire = RT_GC_TIMEOUT;
 983	static unsigned long last_gc;
 984	static int rover;
 985	static int equilibrium;
 986	struct rtable *rth;
 987	struct rtable __rcu **rthp;
 988	unsigned long now = jiffies;
 989	int goal;
 990	int entries = dst_entries_get_fast(&ipv4_dst_ops);
 991
 992	/*
 993	 * Garbage collection is pretty expensive,
 994	 * do not make it too frequently.
 995	 */
 996
 997	RT_CACHE_STAT_INC(gc_total);
 998
 999	if (now - last_gc < ip_rt_gc_min_interval &&
1000	    entries < ip_rt_max_size) {
1001		RT_CACHE_STAT_INC(gc_ignored);
1002		goto out;
1003	}
1004
1005	entries = dst_entries_get_slow(&ipv4_dst_ops);
1006	/* Calculate number of entries, which we want to expire now. */
1007	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1008	if (goal <= 0) {
1009		if (equilibrium < ipv4_dst_ops.gc_thresh)
1010			equilibrium = ipv4_dst_ops.gc_thresh;
1011		goal = entries - equilibrium;
1012		if (goal > 0) {
1013			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1014			goal = entries - equilibrium;
1015		}
1016	} else {
1017		/* We are in dangerous area. Try to reduce cache really
1018		 * aggressively.
1019		 */
1020		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1021		equilibrium = entries - goal;
1022	}
1023
1024	if (now - last_gc >= ip_rt_gc_min_interval)
1025		last_gc = now;
1026
1027	if (goal <= 0) {
1028		equilibrium += goal;
1029		goto work_done;
1030	}
1031
1032	do {
1033		int i, k;
1034
1035		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1036			unsigned long tmo = expire;
1037
1038			k = (k + 1) & rt_hash_mask;
1039			rthp = &rt_hash_table[k].chain;
1040			spin_lock_bh(rt_hash_lock_addr(k));
1041			while ((rth = rcu_dereference_protected(*rthp,
1042					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1043				if (!rt_is_expired(rth) &&
1044					!rt_may_expire(rth, tmo, expire)) {
1045					tmo >>= 1;
1046					rthp = &rth->dst.rt_next;
1047					continue;
1048				}
1049				*rthp = rth->dst.rt_next;
1050				rt_free(rth);
1051				goal--;
1052			}
1053			spin_unlock_bh(rt_hash_lock_addr(k));
1054			if (goal <= 0)
1055				break;
1056		}
1057		rover = k;
1058
1059		if (goal <= 0)
1060			goto work_done;
1061
 1062		/* Goal is not achieved. We stop the process if:
 1063
 1064		   - expire has been reduced to zero; otherwise expire is halved.
 1065		   - the table is not full.
 1066		   - we are called from interrupt context.
 1067		   - the jiffies check is just a fallback/debug loop breaker;
 1068		     we will not spin here for a long time in any case.
1069		 */
1070
1071		RT_CACHE_STAT_INC(gc_goal_miss);
1072
1073		if (expire == 0)
1074			break;
1075
1076		expire >>= 1;
1077
1078		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1079			goto out;
1080	} while (!in_softirq() && time_before_eq(jiffies, now));
1081
1082	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1083		goto out;
1084	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1085		goto out;
1086	net_warn_ratelimited("dst cache overflow\n");
1087	RT_CACHE_STAT_INC(gc_dst_overflow);
1088	return 1;
1089
1090work_done:
1091	expire += ip_rt_gc_min_interval;
1092	if (expire > ip_rt_gc_timeout ||
1093	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1094	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1095		expire = ip_rt_gc_timeout;
1096out:	return 0;
1097}
1098
1099/*
1100 * Returns number of entries in a hash chain that have different hash_inputs
1101 */
1102static int slow_chain_length(const struct rtable *head)
1103{
1104	int length = 0;
1105	const struct rtable *rth = head;
1106
1107	while (rth) {
1108		length += has_noalias(head, rth);
1109		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1110	}
1111	return length >> FRACT_BITS;
1112}
1113
1114static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
1115{
1116	static const __be32 inaddr_any = 0;
1117	struct net_device *dev = dst->dev;
1118	const __be32 *pkey = daddr;
1119	const struct rtable *rt;
1120	struct neighbour *n;
1121
1122	rt = (const struct rtable *) dst;
1123
1124	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
1125		pkey = &inaddr_any;
1126	else if (rt->rt_gateway)
1127		pkey = (const __be32 *) &rt->rt_gateway;
1128
1129	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
1130	if (n)
1131		return n;
1132	return neigh_create(&arp_tbl, pkey, dev);
1133}
1134
1135static int rt_bind_neighbour(struct rtable *rt)
1136{
1137	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1138	if (IS_ERR(n))
1139		return PTR_ERR(n);
1140	dst_set_neighbour(&rt->dst, n);
1141
1142	return 0;
1143}
1144
1145static struct rtable *rt_intern_hash(unsigned int hash, struct rtable *rt,
1146				     struct sk_buff *skb, int ifindex)
1147{
1148	struct rtable	*rth, *cand;
1149	struct rtable __rcu **rthp, **candp;
1150	unsigned long	now;
1151	u32 		min_score;
1152	int		chain_length;
1153	int attempts = !in_softirq();
1154
1155restart:
1156	chain_length = 0;
1157	min_score = ~(u32)0;
1158	cand = NULL;
1159	candp = NULL;
1160	now = jiffies;
1161
1162	if (!rt_caching(dev_net(rt->dst.dev))) {
1163		/*
1164		 * If we're not caching, just tell the caller we
1165		 * were successful and don't touch the route.  The
1166		 * caller hold the sole reference to the cache entry, and
 1167		 * caller holds the sole reference to the cache entry, and
1168		 * If we drop it here, the callers have no way to resolve routes
1169		 * when we're not caching.  Instead, just point *rp at rt, so
1170		 * the caller gets a single use out of the route
1171		 * Note that we do rt_free on this new route entry, so that
1172		 * once its refcount hits zero, we are still able to reap it
1173		 * (Thanks Alexey)
1174		 * Note: To avoid expensive rcu stuff for this uncached dst,
1175		 * we set DST_NOCACHE so that dst_release() can free dst without
1176		 * waiting a grace period.
1177		 */
1178
1179		rt->dst.flags |= DST_NOCACHE;
1180		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1181			int err = rt_bind_neighbour(rt);
1182			if (err) {
1183				net_warn_ratelimited("Neighbour table failure & not caching routes\n");
1184				ip_rt_put(rt);
1185				return ERR_PTR(err);
1186			}
1187		}
1188
1189		goto skip_hashing;
1190	}
1191
1192	rthp = &rt_hash_table[hash].chain;
1193
1194	spin_lock_bh(rt_hash_lock_addr(hash));
1195	while ((rth = rcu_dereference_protected(*rthp,
1196			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1197		if (rt_is_expired(rth)) {
1198			*rthp = rth->dst.rt_next;
1199			rt_free(rth);
1200			continue;
1201		}
1202		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
1203			/* Put it first */
1204			*rthp = rth->dst.rt_next;
1205			/*
1206			 * Since lookup is lockfree, the deletion
1207			 * must be visible to another weakly ordered CPU before
1208			 * the insertion at the start of the hash chain.
1209			 */
1210			rcu_assign_pointer(rth->dst.rt_next,
1211					   rt_hash_table[hash].chain);
1212			/*
1213			 * Since lookup is lockfree, the update writes
1214			 * must be ordered for consistency on SMP.
1215			 */
1216			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1217
1218			dst_use(&rth->dst, now);
1219			spin_unlock_bh(rt_hash_lock_addr(hash));
1220
1221			rt_drop(rt);
1222			if (skb)
1223				skb_dst_set(skb, &rth->dst);
1224			return rth;
1225		}
1226
1227		if (!atomic_read(&rth->dst.__refcnt)) {
1228			u32 score = rt_score(rth);
1229
1230			if (score <= min_score) {
1231				cand = rth;
1232				candp = rthp;
1233				min_score = score;
1234			}
1235		}
1236
1237		chain_length++;
1238
1239		rthp = &rth->dst.rt_next;
1240	}
1241
1242	if (cand) {
 1243		/* ip_rt_gc_elasticity used to be the average chain
 1244		 * length; when it is exceeded, gc becomes really aggressive.
1245		 *
1246		 * The second limit is less certain. At the moment it allows
1247		 * only 2 entries per bucket. We will see.
1248		 */
1249		if (chain_length > ip_rt_gc_elasticity) {
1250			*candp = cand->dst.rt_next;
1251			rt_free(cand);
1252		}
1253	} else {
1254		if (chain_length > rt_chain_length_max &&
1255		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1256			struct net *net = dev_net(rt->dst.dev);
1257			int num = ++net->ipv4.current_rt_cache_rebuild_count;
1258			if (!rt_caching(net)) {
1259				pr_warn("%s: %d rebuilds is over limit, route caching disabled\n",
1260					rt->dst.dev->name, num);
1261			}
1262			rt_emergency_hash_rebuild(net);
1263			spin_unlock_bh(rt_hash_lock_addr(hash));
1264
1265			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1266					ifindex, rt_genid(net));
1267			goto restart;
1268		}
1269	}
1270
 1271	/* Try to bind the route to arp only if it is an output
 1272	   route or a unicast forwarding path.
1273	 */
1274	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1275		int err = rt_bind_neighbour(rt);
1276		if (err) {
1277			spin_unlock_bh(rt_hash_lock_addr(hash));
1278
1279			if (err != -ENOBUFS) {
1280				rt_drop(rt);
1281				return ERR_PTR(err);
1282			}
1283
1284			/* Neighbour tables are full and nothing
 1285			   can be released. Try to shrink the route cache;
 1286			   it most likely holds some neighbour records.
1287			 */
1288			if (attempts-- > 0) {
1289				int saved_elasticity = ip_rt_gc_elasticity;
1290				int saved_int = ip_rt_gc_min_interval;
1291				ip_rt_gc_elasticity	= 1;
1292				ip_rt_gc_min_interval	= 0;
1293				rt_garbage_collect(&ipv4_dst_ops);
1294				ip_rt_gc_min_interval	= saved_int;
1295				ip_rt_gc_elasticity	= saved_elasticity;
1296				goto restart;
1297			}
1298
1299			net_warn_ratelimited("Neighbour table overflow\n");
1300			rt_drop(rt);
1301			return ERR_PTR(-ENOBUFS);
1302		}
1303	}
1304
1305	rt->dst.rt_next = rt_hash_table[hash].chain;
1306
1307	/*
1308	 * Since lookup is lockfree, we must make sure
1309	 * previous writes to rt are committed to memory
1310	 * before making rt visible to other CPUS.
1311	 */
1312	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1313
1314	spin_unlock_bh(rt_hash_lock_addr(hash));
1315
1316skip_hashing:
1317	if (skb)
1318		skb_dst_set(skb, &rt->dst);
1319	return rt;
1320}
1321
1322static atomic_t __rt_peer_genid = ATOMIC_INIT(0);
1323
1324static u32 rt_peer_genid(void)
1325{
1326	return atomic_read(&__rt_peer_genid);
1327}
1328
1329void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
1330{
1331	struct inet_peer *peer;
1332
1333	peer = inet_getpeer_v4(daddr, create);
1334
1335	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1336		inet_putpeer(peer);
1337	else
1338		rt->rt_peer_genid = rt_peer_genid();
1339}
1340
1341/*
1342 * Peer allocation may fail only in serious out-of-memory conditions.  However
 1343 * we can still generate some output.
 1344 * Random ID selection looks a bit dangerous because we have no way to
 1345 * select an ID that is unique within a reasonable period of time.
 1346 * But a broken packet identifier may be better than no packet at all.
1347 */
1348static void ip_select_fb_ident(struct iphdr *iph)
1349{
1350	static DEFINE_SPINLOCK(ip_fb_id_lock);
1351	static u32 ip_fallback_id;
1352	u32 salt;
1353
1354	spin_lock_bh(&ip_fb_id_lock);
1355	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1356	iph->id = htons(salt & 0xFFFF);
1357	ip_fallback_id = salt;
1358	spin_unlock_bh(&ip_fb_id_lock);
1359}
1360
1361void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1362{
1363	struct rtable *rt = (struct rtable *) dst;
1364
1365	if (rt && !(rt->dst.flags & DST_NOPEER)) {
1366		if (rt->peer == NULL)
1367			rt_bind_peer(rt, rt->rt_dst, 1);
1368
1369		/* If peer is attached to destination, it is never detached,
 1370		   so that we need not grab a lock to dereference it.
1371		 */
1372		if (rt->peer) {
1373			iph->id = htons(inet_getid(rt->peer, more));
1374			return;
1375		}
1376	} else if (!rt)
1377		pr_debug("rt_bind_peer(0) @%p\n", __builtin_return_address(0));
1378
1379	ip_select_fb_ident(iph);
1380}
1381EXPORT_SYMBOL(__ip_select_ident);
1382
1383static void rt_del(unsigned int hash, struct rtable *rt)
1384{
1385	struct rtable __rcu **rthp;
1386	struct rtable *aux;
1387
1388	rthp = &rt_hash_table[hash].chain;
1389	spin_lock_bh(rt_hash_lock_addr(hash));
1390	ip_rt_put(rt);
1391	while ((aux = rcu_dereference_protected(*rthp,
1392			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1393		if (aux == rt || rt_is_expired(aux)) {
1394			*rthp = aux->dst.rt_next;
1395			rt_free(aux);
1396			continue;
1397		}
1398		rthp = &aux->dst.rt_next;
1399	}
1400	spin_unlock_bh(rt_hash_lock_addr(hash));
1401}
1402
1403static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
1404{
1405	struct rtable *rt = (struct rtable *) dst;
1406	__be32 orig_gw = rt->rt_gateway;
1407	struct neighbour *n, *old_n;
1408
1409	dst_confirm(&rt->dst);
1410
1411	rt->rt_gateway = peer->redirect_learned.a4;
1412
1413	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
1414	if (IS_ERR(n)) {
1415		rt->rt_gateway = orig_gw;
1416		return;
1417	}
1418	old_n = xchg(&rt->dst._neighbour, n);
1419	if (old_n)
1420		neigh_release(old_n);
1421	if (!(n->nud_state & NUD_VALID)) {
1422		neigh_event_send(n, NULL);
1423	} else {
1424		rt->rt_flags |= RTCF_REDIRECTED;
1425		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
1426	}
1427}
1428
1429/* called in rcu_read_lock() section */
1430void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1431		    __be32 saddr, struct net_device *dev)
1432{
1433	int s, i;
1434	struct in_device *in_dev = __in_dev_get_rcu(dev);
1435	__be32 skeys[2] = { saddr, 0 };
1436	int    ikeys[2] = { dev->ifindex, 0 };
1437	struct inet_peer *peer;
1438	struct net *net;
1439
1440	if (!in_dev)
1441		return;
1442
1443	net = dev_net(dev);
1444	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1445	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1446	    ipv4_is_zeronet(new_gw))
1447		goto reject_redirect;
1448
1449	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1450		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1451			goto reject_redirect;
1452		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1453			goto reject_redirect;
1454	} else {
1455		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1456			goto reject_redirect;
1457	}
1458
1459	for (s = 0; s < 2; s++) {
1460		for (i = 0; i < 2; i++) {
1461			unsigned int hash;
1462			struct rtable __rcu **rthp;
1463			struct rtable *rt;
1464
1465			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));
1466
1467			rthp = &rt_hash_table[hash].chain;
1468
1469			while ((rt = rcu_dereference(*rthp)) != NULL) {
1470				rthp = &rt->dst.rt_next;
1471
1472				if (rt->rt_key_dst != daddr ||
1473				    rt->rt_key_src != skeys[s] ||
1474				    rt->rt_oif != ikeys[i] ||
1475				    rt_is_input_route(rt) ||
1476				    rt_is_expired(rt) ||
1477				    !net_eq(dev_net(rt->dst.dev), net) ||
1478				    rt->dst.error ||
1479				    rt->dst.dev != dev ||
1480				    rt->rt_gateway != old_gw)
1481					continue;
1482
1483				if (!rt->peer)
1484					rt_bind_peer(rt, rt->rt_dst, 1);
1485
1486				peer = rt->peer;
1487				if (peer) {
1488					if (peer->redirect_learned.a4 != new_gw) {
1489						peer->redirect_learned.a4 = new_gw;
1490						atomic_inc(&__rt_peer_genid);
1491					}
1492					check_peer_redir(&rt->dst, peer);
1493				}
1494			}
1495		}
1496	}
1497	return;
1498
1499reject_redirect:
1500#ifdef CONFIG_IP_ROUTE_VERBOSE
1501	if (IN_DEV_LOG_MARTIANS(in_dev))
1502		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
1503				     "  Advised path = %pI4 -> %pI4\n",
1504				     &old_gw, dev->name, &new_gw,
1505				     &saddr, &daddr);
1506#endif
1507	;
1508}
1509
1510static bool peer_pmtu_expired(struct inet_peer *peer)
1511{
1512	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1513
1514	return orig &&
1515	       time_after_eq(jiffies, orig) &&
1516	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1517}
1518
1519static bool peer_pmtu_cleaned(struct inet_peer *peer)
1520{
1521	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);
1522
1523	return orig &&
1524	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
1525}
1526
1527static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1528{
1529	struct rtable *rt = (struct rtable *)dst;
1530	struct dst_entry *ret = dst;
1531
1532	if (rt) {
1533		if (dst->obsolete > 0) {
1534			ip_rt_put(rt);
1535			ret = NULL;
1536		} else if (rt->rt_flags & RTCF_REDIRECTED) {
1537			unsigned int hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
1538						rt->rt_oif,
1539						rt_genid(dev_net(dst->dev)));
1540			rt_del(hash, rt);
1541			ret = NULL;
1542		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
1543			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
1544		}
1545	}
1546	return ret;
1547}
1548
1549/*
1550 * Algorithm:
1551 *	1. The first ip_rt_redirect_number redirects are sent
1552 *	   with exponential backoff, then we stop sending them at all,
1553 *	   assuming that the host ignores our redirects.
1554 *	2. If we did not see packets requiring redirects
1555 *	   during ip_rt_redirect_silence, we assume that the host
 1556 *	   forgot the redirected route and start to send redirects again.
1557 *
1558 * This algorithm is much cheaper and more intelligent than dumb load limiting
1559 * in icmp.c.
1560 *
1561 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1562 * and "frag. need" (breaks PMTU discovery) in icmp.c.
1563 */
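/* With the defaults above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9, ip_rt_redirect_silence = (HZ/50) << 10),
 * the next redirect to a peer is only sent once
 *     rate_last + (ip_rt_redirect_load << rate_tokens)
 * has passed, so the gap roughly doubles each time (40 ms, 80 ms, ... ~5 s);
 * after nine unanswered redirects we stay silent until about 20 s after
 * rate_last, when rate_tokens is reset.
 */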
1564
1565void ip_rt_send_redirect(struct sk_buff *skb)
1566{
1567	struct rtable *rt = skb_rtable(skb);
1568	struct in_device *in_dev;
1569	struct inet_peer *peer;
1570	int log_martians;
1571
1572	rcu_read_lock();
1573	in_dev = __in_dev_get_rcu(rt->dst.dev);
1574	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1575		rcu_read_unlock();
1576		return;
1577	}
1578	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1579	rcu_read_unlock();
1580
1581	if (!rt->peer)
1582		rt_bind_peer(rt, rt->rt_dst, 1);
1583	peer = rt->peer;
1584	if (!peer) {
1585		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1586		return;
1587	}
1588
1589	/* No redirected packets during ip_rt_redirect_silence;
1590	 * reset the algorithm.
1591	 */
1592	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
1593		peer->rate_tokens = 0;
1594
 1595	/* Too many ignored redirects; do not send anything and
 1596	 * set dst.rate_last to the last seen redirected packet.
1597	 */
1598	if (peer->rate_tokens >= ip_rt_redirect_number) {
1599		peer->rate_last = jiffies;
1600		return;
1601	}
1602
1603	/* Check for load limit; set rate_last to the latest sent
1604	 * redirect.
1605	 */
1606	if (peer->rate_tokens == 0 ||
1607	    time_after(jiffies,
1608		       (peer->rate_last +
1609			(ip_rt_redirect_load << peer->rate_tokens)))) {
1610		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1611		peer->rate_last = jiffies;
1612		++peer->rate_tokens;
1613#ifdef CONFIG_IP_ROUTE_VERBOSE
1614		if (log_martians &&
1615		    peer->rate_tokens == ip_rt_redirect_number)
1616			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
1617					     &ip_hdr(skb)->saddr, rt->rt_iif,
1618					     &rt->rt_dst, &rt->rt_gateway);
1619#endif
1620	}
1621}
1622
1623static int ip_error(struct sk_buff *skb)
1624{
1625	struct rtable *rt = skb_rtable(skb);
1626	struct inet_peer *peer;
1627	unsigned long now;
1628	bool send;
1629	int code;
1630
1631	switch (rt->dst.error) {
1632	case EINVAL:
1633	default:
1634		goto out;
1635	case EHOSTUNREACH:
1636		code = ICMP_HOST_UNREACH;
1637		break;
1638	case ENETUNREACH:
1639		code = ICMP_NET_UNREACH;
1640		IP_INC_STATS_BH(dev_net(rt->dst.dev),
1641				IPSTATS_MIB_INNOROUTES);
1642		break;
1643	case EACCES:
1644		code = ICMP_PKT_FILTERED;
1645		break;
1646	}
1647
1648	if (!rt->peer)
1649		rt_bind_peer(rt, rt->rt_dst, 1);
1650	peer = rt->peer;
1651
1652	send = true;
1653	if (peer) {
1654		now = jiffies;
1655		peer->rate_tokens += now - peer->rate_last;
1656		if (peer->rate_tokens > ip_rt_error_burst)
1657			peer->rate_tokens = ip_rt_error_burst;
1658		peer->rate_last = now;
1659		if (peer->rate_tokens >= ip_rt_error_cost)
1660			peer->rate_tokens -= ip_rt_error_cost;
1661		else
1662			send = false;
1663	}
1664	if (send)
1665		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1666
1667out:	kfree_skb(skb);
1668	return 0;
1669}
1670
1671/*
1672 *	The last two values are not from the RFC but
1673 *	are needed for AMPRnet AX.25 paths.
1674 */
1675
1676static const unsigned short mtu_plateau[] =
1677{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1678
1679static inline unsigned short guess_mtu(unsigned short old_mtu)
1680{
1681	int i;
1682
1683	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1684		if (old_mtu > mtu_plateau[i])
1685			return mtu_plateau[i];
1686	return 68;
1687}
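/* For example, a "fragmentation needed" ICMP whose advertised MTU is not
 * smaller than the original 1500-byte datagram is distrusted, and
 * guess_mtu(1500) above picks the next lower plateau, 1492; a zero MTU
 * from a BSD 4.2 derived system first has the IP header length subtracted
 * (1500 -> 1480) and then maps to the 576 plateau.  Anything below all
 * plateaus collapses to the 68-byte IPv4 minimum.
 */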
1688
1689unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
1690				 unsigned short new_mtu,
1691				 struct net_device *dev)
1692{
1693	unsigned short old_mtu = ntohs(iph->tot_len);
1694	unsigned short est_mtu = 0;
1695	struct inet_peer *peer;
1696
1697	peer = inet_getpeer_v4(iph->daddr, 1);
1698	if (peer) {
1699		unsigned short mtu = new_mtu;
1700
1701		if (new_mtu < 68 || new_mtu >= old_mtu) {
1702			/* BSD 4.2 derived systems incorrectly adjust
1703			 * tot_len by the IP header length, and report
1704			 * a zero MTU in the ICMP message.
1705			 */
1706			if (mtu == 0 &&
1707			    old_mtu >= 68 + (iph->ihl << 2))
1708				old_mtu -= iph->ihl << 2;
1709			mtu = guess_mtu(old_mtu);
1710		}
1711
1712		if (mtu < ip_rt_min_pmtu)
1713			mtu = ip_rt_min_pmtu;
1714		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
1715			unsigned long pmtu_expires;
1716
1717			pmtu_expires = jiffies + ip_rt_mtu_expires;
1718			if (!pmtu_expires)
1719				pmtu_expires = 1UL;
1720
1721			est_mtu = mtu;
1722			peer->pmtu_learned = mtu;
1723			peer->pmtu_expires = pmtu_expires;
1724			atomic_inc(&__rt_peer_genid);
1725		}
1726
1727		inet_putpeer(peer);
1728	}
1729	return est_mtu ? : new_mtu;
1730}
1731
1732static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
1733{
1734	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);
1735
1736	if (!expires)
1737		return;
1738	if (time_before(jiffies, expires)) {
1739		u32 orig_dst_mtu = dst_mtu(dst);
1740		if (peer->pmtu_learned < orig_dst_mtu) {
1741			if (!peer->pmtu_orig)
1742				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
1743			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
1744		}
1745	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
1746		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
1747}
1748
1749static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1750{
1751	struct rtable *rt = (struct rtable *) dst;
1752	struct inet_peer *peer;
1753
1754	dst_confirm(dst);
1755
1756	if (!rt->peer)
1757		rt_bind_peer(rt, rt->rt_dst, 1);
1758	peer = rt->peer;
1759	if (peer) {
1760		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);
1761
1762		if (mtu < ip_rt_min_pmtu)
1763			mtu = ip_rt_min_pmtu;
1764		if (!pmtu_expires || mtu < peer->pmtu_learned) {
1765
1766			pmtu_expires = jiffies + ip_rt_mtu_expires;
1767			if (!pmtu_expires)
1768				pmtu_expires = 1UL;
1769
1770			peer->pmtu_learned = mtu;
1771			peer->pmtu_expires = pmtu_expires;
1772
1773			atomic_inc(&__rt_peer_genid);
1774			rt->rt_peer_genid = rt_peer_genid();
1775		}
1776		check_peer_pmtu(dst, peer);
1777	}
1778}
1779
1780
1781static void ipv4_validate_peer(struct rtable *rt)
1782{
1783	if (rt->rt_peer_genid != rt_peer_genid()) {
1784		struct inet_peer *peer;
1785
1786		if (!rt->peer)
1787			rt_bind_peer(rt, rt->rt_dst, 0);
1788
1789		peer = rt->peer;
1790		if (peer) {
1791			check_peer_pmtu(&rt->dst, peer);
1792
1793			if (peer->redirect_learned.a4 &&
1794			    peer->redirect_learned.a4 != rt->rt_gateway)
1795				check_peer_redir(&rt->dst, peer);
1796		}
1797
1798		rt->rt_peer_genid = rt_peer_genid();
1799	}
1800}
1801
1802static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1803{
1804	struct rtable *rt = (struct rtable *) dst;
1805
1806	if (rt_is_expired(rt))
1807		return NULL;
1808	ipv4_validate_peer(rt);
1809	return dst;
1810}
1811
1812static void ipv4_dst_destroy(struct dst_entry *dst)
1813{
1814	struct rtable *rt = (struct rtable *) dst;
1815	struct inet_peer *peer = rt->peer;
1816
1817	if (rt->fi) {
1818		fib_info_put(rt->fi);
1819		rt->fi = NULL;
1820	}
1821	if (peer) {
1822		rt->peer = NULL;
1823		inet_putpeer(peer);
1824	}
1825}
1826
1827
1828static void ipv4_link_failure(struct sk_buff *skb)
1829{
1830	struct rtable *rt;
1831
1832	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1833
1834	rt = skb_rtable(skb);
1835	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
1836		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
1837}
1838
1839static int ip_rt_bug(struct sk_buff *skb)
1840{
1841	pr_debug("%s: %pI4 -> %pI4, %s\n",
1842		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1843		 skb->dev ? skb->dev->name : "?");
1844	kfree_skb(skb);
1845	WARN_ON(1);
1846	return 0;
1847}
1848
1849/*
 1850   We do not cache the source address of the outgoing interface,
 1851   because it is used only by IP RR, TS and SRR options,
 1852   so it is out of the fast path.
 1853
 1854   BTW remember: "addr" is allowed to be unaligned
 1855   in IP options!
1856 */
1857
1858void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1859{
1860	__be32 src;
1861
1862	if (rt_is_output_route(rt))
1863		src = ip_hdr(skb)->saddr;
1864	else {
1865		struct fib_result res;
1866		struct flowi4 fl4;
1867		struct iphdr *iph;
1868
1869		iph = ip_hdr(skb);
1870
1871		memset(&fl4, 0, sizeof(fl4));
1872		fl4.daddr = iph->daddr;
1873		fl4.saddr = iph->saddr;
1874		fl4.flowi4_tos = RT_TOS(iph->tos);
1875		fl4.flowi4_oif = rt->dst.dev->ifindex;
1876		fl4.flowi4_iif = skb->dev->ifindex;
1877		fl4.flowi4_mark = skb->mark;
1878
1879		rcu_read_lock();
1880		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1881			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1882		else
1883			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1884					RT_SCOPE_UNIVERSE);
1885		rcu_read_unlock();
1886	}
1887	memcpy(addr, &src, 4);
1888}
1889
1890#ifdef CONFIG_IP_ROUTE_CLASSID
1891static void set_class_tag(struct rtable *rt, u32 tag)
1892{
1893	if (!(rt->dst.tclassid & 0xFFFF))
1894		rt->dst.tclassid |= tag & 0xFFFF;
1895	if (!(rt->dst.tclassid & 0xFFFF0000))
1896		rt->dst.tclassid |= tag & 0xFFFF0000;
1897}
1898#endif
1899
1900static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1901{
1902	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1903
1904	if (advmss == 0) {
1905		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1906			       ip_rt_min_advmss);
1907		if (advmss > 65535 - 40)
1908			advmss = 65535 - 40;
1909	}
1910	return advmss;
1911}
1912
1913static unsigned int ipv4_mtu(const struct dst_entry *dst)
1914{
1915	const struct rtable *rt = (const struct rtable *) dst;
1916	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1917
1918	if (mtu && rt_is_output_route(rt))
1919		return mtu;
1920
1921	mtu = dst->dev->mtu;
1922
1923	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1924
1925		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1926			mtu = 576;
1927	}
1928
1929	if (mtu > IP_MAX_MTU)
1930		mtu = IP_MAX_MTU;
1931
1932	return mtu;
1933}
1934
1935static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
1936			    struct fib_info *fi)
1937{
1938	struct inet_peer *peer;
1939	int create = 0;
1940
1941	/* If a peer entry exists for this destination, we must hook
1942	 * it up in order to get at cached metrics.
1943	 */
1944	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
1945		create = 1;
1946
1947	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
1948	if (peer) {
1949		rt->rt_peer_genid = rt_peer_genid();
1950		if (inet_metrics_new(peer))
1951			memcpy(peer->metrics, fi->fib_metrics,
1952			       sizeof(u32) * RTAX_MAX);
1953		dst_init_metrics(&rt->dst, peer->metrics, false);
1954
1955		check_peer_pmtu(&rt->dst, peer);
1956
1957		if (peer->redirect_learned.a4 &&
1958		    peer->redirect_learned.a4 != rt->rt_gateway) {
1959			rt->rt_gateway = peer->redirect_learned.a4;
1960			rt->rt_flags |= RTCF_REDIRECTED;
1961		}
1962	} else {
1963		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
1964			rt->fi = fi;
1965			atomic_inc(&fi->fib_clntref);
1966		}
1967		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1968	}
1969}
1970
1971static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
1972			   const struct fib_result *res,
1973			   struct fib_info *fi, u16 type, u32 itag)
1974{
1975	struct dst_entry *dst = &rt->dst;
1976
1977	if (fi) {
1978		if (FIB_RES_GW(*res) &&
1979		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1980			rt->rt_gateway = FIB_RES_GW(*res);
1981		rt_init_metrics(rt, fl4, fi);
1982#ifdef CONFIG_IP_ROUTE_CLASSID
1983		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1984#endif
1985	}
1986
1987	if (dst_mtu(dst) > IP_MAX_MTU)
1988		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1989	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1990		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1991
1992#ifdef CONFIG_IP_ROUTE_CLASSID
1993#ifdef CONFIG_IP_MULTIPLE_TABLES
1994	set_class_tag(rt, fib_rules_tclass(res));
1995#endif
1996	set_class_tag(rt, itag);
1997#endif
1998}
1999
2000static struct rtable *rt_dst_alloc(struct net_device *dev,
2001				   bool nopolicy, bool noxfrm)
2002{
2003	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
2004			 DST_HOST |
2005			 (nopolicy ? DST_NOPOLICY : 0) |
2006			 (noxfrm ? DST_NOXFRM : 0));
2007}
2008
2009/* called in rcu_read_lock() section */
2010static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2011				u8 tos, struct net_device *dev, int our)
2012{
2013	unsigned int hash;
2014	struct rtable *rth;
2015	__be32 spec_dst;
2016	struct in_device *in_dev = __in_dev_get_rcu(dev);
2017	u32 itag = 0;
2018	int err;
2019
2020	/* Primary sanity checks. */
2021
2022	if (in_dev == NULL)
2023		return -EINVAL;
2024
2025	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2026	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
2027		goto e_inval;
2028
2029	if (ipv4_is_zeronet(saddr)) {
2030		if (!ipv4_is_local_multicast(daddr))
2031			goto e_inval;
2032		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2033	} else {
2034		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2035					  &itag);
2036		if (err < 0)
2037			goto e_err;
2038	}
2039	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
2040			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2041	if (!rth)
2042		goto e_nobufs;
2043
2044#ifdef CONFIG_IP_ROUTE_CLASSID
2045	rth->dst.tclassid = itag;
2046#endif
2047	rth->dst.output = ip_rt_bug;
2048
2049	rth->rt_key_dst	= daddr;
2050	rth->rt_key_src	= saddr;
2051	rth->rt_genid	= rt_genid(dev_net(dev));
2052	rth->rt_flags	= RTCF_MULTICAST;
2053	rth->rt_type	= RTN_MULTICAST;
2054	rth->rt_key_tos	= tos;
2055	rth->rt_dst	= daddr;
2056	rth->rt_src	= saddr;
2057	rth->rt_route_iif = dev->ifindex;
2058	rth->rt_iif	= dev->ifindex;
2059	rth->rt_oif	= 0;
2060	rth->rt_mark    = skb->mark;
2061	rth->rt_gateway	= daddr;
2062	rth->rt_spec_dst= spec_dst;
2063	rth->rt_peer_genid = 0;
2064	rth->peer = NULL;
2065	rth->fi = NULL;
2066	if (our) {
2067		rth->dst.input= ip_local_deliver;
2068		rth->rt_flags |= RTCF_LOCAL;
2069	}
2070
2071#ifdef CONFIG_IP_MROUTE
2072	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
2073		rth->dst.input = ip_mr_input;
2074#endif
2075	RT_CACHE_STAT_INC(in_slow_mc);
2076
2077	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
2078	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
2079	return IS_ERR(rth) ? PTR_ERR(rth) : 0;
2080
2081e_nobufs:
2082	return -ENOBUFS;
2083e_inval:
2084	return -EINVAL;
2085e_err:
2086	return err;
2087}
2088
2089
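/*
 * ip_handle_martian_source - account and (optionally) log a martian source.
 *
 * Bumps the in_martian_src counter and, when CONFIG_IP_ROUTE_VERBOSE is
 * set and the device has log_martians enabled, prints a rate-limited
 * warning together with a hex dump of the link-layer header.
 */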
2090static void ip_handle_martian_source(struct net_device *dev,
2091				     struct in_device *in_dev,
2092				     struct sk_buff *skb,
2093				     __be32 daddr,
2094				     __be32 saddr)
2095{
2096	RT_CACHE_STAT_INC(in_martian_src);
2097#ifdef CONFIG_IP_ROUTE_VERBOSE
2098	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
2099		/*
2100		 *	RFC1812 recommendation: if the source is martian,
2101		 *	the only hint is the MAC header.
2102		 */
2103		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
2104			&daddr, &saddr, dev->name);
2105		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
2106			print_hex_dump(KERN_WARNING, "ll header: ",
2107				       DUMP_PREFIX_OFFSET, 16, 1,
2108				       skb_mac_header(skb),
2109				       dev->hard_header_len, true);
2110		}
2111	}
2112#endif
2113}
2114
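/*
 * __mkroute_input - create the forwarding cache entry for an input route.
 *
 * Validates the source against the FIB result (flagging RTCF_DIRECTSRC
 * and, on shared media, RTCF_DOREDIRECT), rejects non-IP frames that are
 * invalid for proxy arp, then allocates an rtable whose dst hooks are
 * ip_forward / ip_output and whose nexthop comes from rt_set_nexthop().
 * The new entry is handed back through @result.
 */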
2115/* called in rcu_read_lock() section */
2116static int __mkroute_input(struct sk_buff *skb,
2117			   const struct fib_result *res,
2118			   struct in_device *in_dev,
2119			   __be32 daddr, __be32 saddr, u32 tos,
2120			   struct rtable **result)
2121{
2122	struct rtable *rth;
2123	int err;
2124	struct in_device *out_dev;
2125	unsigned int flags = 0;
2126	__be32 spec_dst;
2127	u32 itag;
2128
2129	/* get a working reference to the output device */
2130	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2131	if (out_dev == NULL) {
2132		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
2133		return -EINVAL;
2134	}
2135
2136
2137	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
2138				  in_dev->dev, &spec_dst, &itag);
2139	if (err < 0) {
2140		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2141					 saddr);
2142
2143		goto cleanup;
2144	}
2145
2146	if (err)
2147		flags |= RTCF_DIRECTSRC;
2148
2149	if (out_dev == in_dev && err &&
2150	    (IN_DEV_SHARED_MEDIA(out_dev) ||
2151	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2152		flags |= RTCF_DOREDIRECT;
2153
2154	if (skb->protocol != htons(ETH_P_IP)) {
2155		/* Not IP (i.e. ARP). Do not create a route if it is
2156		 * invalid for proxy arp. DNAT routes are always valid.
2157		 *
2158		 * The proxy arp feature has been extended to allow ARP
2159		 * replies back on the same interface, to support
2160		 * Private VLAN switch technologies. See arp.c.
2161		 */
2162		if (out_dev == in_dev &&
2163		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2164			err = -EINVAL;
2165			goto cleanup;
2166		}
2167	}
2168
2169	rth = rt_dst_alloc(out_dev->dev,
2170			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2171			   IN_DEV_CONF_GET(out_dev, NOXFRM));
2172	if (!rth) {
2173		err = -ENOBUFS;
2174		goto cleanup;
2175	}
2176
2177	rth->rt_key_dst	= daddr;
2178	rth->rt_key_src	= saddr;
2179	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2180	rth->rt_flags = flags;
2181	rth->rt_type = res->type;
2182	rth->rt_key_tos	= tos;
2183	rth->rt_dst	= daddr;
2184	rth->rt_src	= saddr;
2185	rth->rt_route_iif = in_dev->dev->ifindex;
2186	rth->rt_iif 	= in_dev->dev->ifindex;
2187	rth->rt_oif 	= 0;
2188	rth->rt_mark    = skb->mark;
2189	rth->rt_gateway	= daddr;
2190	rth->rt_spec_dst= spec_dst;
2191	rth->rt_peer_genid = 0;
2192	rth->peer = NULL;
2193	rth->fi = NULL;
2194
2195	rth->dst.input = ip_forward;
2196	rth->dst.output = ip_output;
2197
2198	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
2199
2200	*result = rth;
2201	err = 0;
2202 cleanup:
2203	return err;
2204}
2205
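/*
 * ip_mkroute_input - resolve the nexthop (multipath-aware) and cache it.
 *
 * Picks one of the available nexthops when multipath routing is
 * configured, builds the entry via __mkroute_input() and hashes it into
 * the route cache keyed on (daddr, saddr, iif).
 */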
2206static int ip_mkroute_input(struct sk_buff *skb,
2207			    struct fib_result *res,
2208			    const struct flowi4 *fl4,
2209			    struct in_device *in_dev,
2210			    __be32 daddr, __be32 saddr, u32 tos)
 
2211{
2212	struct rtable *rth = NULL;
2213	int err;
2214	unsigned int hash;
2215
2216#ifdef CONFIG_IP_ROUTE_MULTIPATH
2217	if (res->fi && res->fi->fib_nhs > 1)
2218		fib_select_multipath(res);
2219#endif
2220
2221	/* create a routing cache entry */
2222	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2223	if (err)
2224		return err;
2225
2226	/* put it into the cache */
2227	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
2228		       rt_genid(dev_net(rth->dst.dev)));
2229	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
2230	if (IS_ERR(rth))
2231		return PTR_ERR(rth);
2232	return 0;
2233}
2234
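/*
 * ip_route_input_slow - slow-path resolution for incoming packets.
 *
 * After weeding out martian sources and destinations the flow is looked
 * up in the FIB; broadcast and local destinations get a loopback-bound
 * entry delivered by ip_local_deliver, failed lookups fall through to an
 * ip_error entry, and everything else is forwarded via ip_mkroute_input().
 */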
2235/*
2236 *	NOTE. We drop all packets that have local source
2237 *	addresses, because every properly looped-back packet
2238 *	must already have the correct destination attached by the output routine.
2239 *
2240 *	Such an approach solves two big problems:
2241 *	1. Non-simplex devices are handled properly.
2242 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2243 *	called with rcu_read_lock()
2244 */
2245
2246static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2247			       u8 tos, struct net_device *dev)
 
2248{
2249	struct fib_result res;
2250	struct in_device *in_dev = __in_dev_get_rcu(dev);
2251	struct flowi4	fl4;
2252	unsigned int	flags = 0;
2253	u32		itag = 0;
2254	struct rtable	*rth;
2255	unsigned int	hash;
2256	__be32		spec_dst;
2257	int		err = -EINVAL;
2258	struct net    *net = dev_net(dev);
2259
2260	/* IP on this device is disabled. */
2261
2262	if (!in_dev)
2263		goto out;
2264
2265	/* Check for the most weird martians, which cannot be detected
2266	   by fib_lookup.
2267	 */
2268
2269	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2270	    ipv4_is_loopback(saddr))
2271		goto martian_source;
2272
2273	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2274		goto brd_input;
2275
2276	/* Accept zero addresses only for limited broadcast;
2277	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2278	 */
2279	if (ipv4_is_zeronet(saddr))
2280		goto martian_source;
2281
2282	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2283		goto martian_destination;
2284
2285	/*
2286	 *	Now we are ready to route packet.
2287	 */
2288	fl4.flowi4_oif = 0;
2289	fl4.flowi4_iif = dev->ifindex;
2290	fl4.flowi4_mark = skb->mark;
2291	fl4.flowi4_tos = tos;
2292	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
 
2293	fl4.daddr = daddr;
2294	fl4.saddr = saddr;
2295	err = fib_lookup(net, &fl4, &res);
2296	if (err != 0) {
2297		if (!IN_DEV_FORWARD(in_dev))
2298			goto e_hostunreach;
2299		goto no_route;
2300	}
2301
2302	RT_CACHE_STAT_INC(in_slow_tot);
2303
2304	if (res.type == RTN_BROADCAST)
2305		goto brd_input;
 
2306
2307	if (res.type == RTN_LOCAL) {
2308		err = fib_validate_source(skb, saddr, daddr, tos,
2309					  net->loopback_dev->ifindex,
2310					  dev, &spec_dst, &itag);
2311		if (err < 0)
2312			goto martian_source_keep_err;
2313		if (err)
2314			flags |= RTCF_DIRECTSRC;
2315		spec_dst = daddr;
2316		goto local_input;
2317	}
2318
2319	if (!IN_DEV_FORWARD(in_dev))
2320		goto e_hostunreach;
2321	if (res.type != RTN_UNICAST)
2322		goto martian_destination;
2323
2324	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
 
2325out:	return err;
2326
2327brd_input:
2328	if (skb->protocol != htons(ETH_P_IP))
2329		goto e_inval;
2330
2331	if (ipv4_is_zeronet(saddr))
2332		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2333	else {
2334		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
2335					  &itag);
2336		if (err < 0)
2337			goto martian_source_keep_err;
2338		if (err)
2339			flags |= RTCF_DIRECTSRC;
2340	}
2341	flags |= RTCF_BROADCAST;
2342	res.type = RTN_BROADCAST;
2343	RT_CACHE_STAT_INC(in_brd);
2344
2345local_input:
2346	rth = rt_dst_alloc(net->loopback_dev,
2347			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2348	if (!rth)
2349		goto e_nobufs;
2350
2351	rth->dst.input= ip_local_deliver;
2352	rth->dst.output= ip_rt_bug;
2353#ifdef CONFIG_IP_ROUTE_CLASSID
2354	rth->dst.tclassid = itag;
2355#endif
 
2356
2357	rth->rt_key_dst	= daddr;
2358	rth->rt_key_src	= saddr;
2359	rth->rt_genid = rt_genid(net);
2360	rth->rt_flags 	= flags|RTCF_LOCAL;
2361	rth->rt_type	= res.type;
2362	rth->rt_key_tos	= tos;
2363	rth->rt_dst	= daddr;
2364	rth->rt_src	= saddr;
2365#ifdef CONFIG_IP_ROUTE_CLASSID
2366	rth->dst.tclassid = itag;
2367#endif
2368	rth->rt_route_iif = dev->ifindex;
2369	rth->rt_iif	= dev->ifindex;
2370	rth->rt_oif	= 0;
2371	rth->rt_mark    = skb->mark;
2372	rth->rt_gateway	= daddr;
2373	rth->rt_spec_dst= spec_dst;
2374	rth->rt_peer_genid = 0;
2375	rth->peer = NULL;
2376	rth->fi = NULL;
2377	if (res.type == RTN_UNREACHABLE) {
2378		rth->dst.input= ip_error;
2379		rth->dst.error= -err;
2380		rth->rt_flags 	&= ~RTCF_LOCAL;
2381	}
2382	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
2383	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
2384	err = 0;
2385	if (IS_ERR(rth))
2386		err = PTR_ERR(rth);
2387	goto out;
2388
2389no_route:
2390	RT_CACHE_STAT_INC(in_no_route);
2391	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2392	res.type = RTN_UNREACHABLE;
2393	if (err == -ESRCH)
2394		err = -ENETUNREACH;
2395	goto local_input;
2396
2397	/*
2398	 *	Do not cache martian addresses: they should be logged (RFC1812)
2399	 */
2400martian_destination:
2401	RT_CACHE_STAT_INC(in_martian_dst);
2402#ifdef CONFIG_IP_ROUTE_VERBOSE
2403	if (IN_DEV_LOG_MARTIANS(in_dev))
2404		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2405				     &daddr, &saddr, dev->name);
2406#endif
2407
2408e_hostunreach:
2409	err = -EHOSTUNREACH;
2410	goto out;
2411
2412e_inval:
2413	err = -EINVAL;
2414	goto out;
2415
2416e_nobufs:
2417	err = -ENOBUFS;
2418	goto out;
2419
2420martian_source:
2421	err = -EINVAL;
2422martian_source_keep_err:
2423	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2424	goto out;
2425}
2426
2427int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2428			   u8 tos, struct net_device *dev, bool noref)
2429{
2430	struct rtable	*rth;
2431	unsigned int	hash;
2432	int iif = dev->ifindex;
2433	struct net *net;
2434	int res;
2435
2436	net = dev_net(dev);
2437
 
2438	rcu_read_lock();
2439
2440	if (!rt_caching(net))
2441		goto skip_cache;
 
2442
2443	tos &= IPTOS_RT_MASK;
2444	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2445
2446	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2447	     rth = rcu_dereference(rth->dst.rt_next)) {
2448		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
2449		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
2450		     (rth->rt_route_iif ^ iif) |
2451		     (rth->rt_key_tos ^ tos)) == 0 &&
2452		    rth->rt_mark == skb->mark &&
2453		    net_eq(dev_net(rth->dst.dev), net) &&
2454		    !rt_is_expired(rth)) {
2455			ipv4_validate_peer(rth);
2456			if (noref) {
2457				dst_use_noref(&rth->dst, jiffies);
2458				skb_dst_set_noref(skb, &rth->dst);
2459			} else {
2460				dst_use(&rth->dst, jiffies);
2461				skb_dst_set(skb, &rth->dst);
2462			}
2463			RT_CACHE_STAT_INC(in_hit);
2464			rcu_read_unlock();
2465			return 0;
2466		}
2467		RT_CACHE_STAT_INC(in_hlist_search);
2468	}
2469
2470skip_cache:
2471	/* Multicast recognition logic is moved from the route cache to here.
2472	   The problem was that too many Ethernet cards have broken/missing
2473	   hardware multicast filters :-( As a result, a host on a multicast
2474	   network acquires a lot of useless route cache entries, sort of
2475	   SDR messages from all over the world. Now we try to get rid of them.
2476	   Really, provided the software IP multicast filter is organized
2477	   reasonably (at least, hashed), it does not result in a slowdown
2478	   compared with route cache reject entries.
2479	   Note that multicast routers are not affected, because a
2480	   route cache entry is created eventually.
2481	 */
2482	if (ipv4_is_multicast(daddr)) {
2483		struct in_device *in_dev = __in_dev_get_rcu(dev);
2484
2485		if (in_dev) {
2486			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2487						  ip_hdr(skb)->protocol);
2488			if (our
2489#ifdef CONFIG_IP_MROUTE
2490				||
2491			    (!ipv4_is_local_multicast(daddr) &&
2492			     IN_DEV_MFORWARD(in_dev))
2493#endif
2494			   ) {
2495				int res = ip_route_input_mc(skb, daddr, saddr,
2496							    tos, dev, our);
2497				rcu_read_unlock();
2498				return res;
2499			}
2500		}
2501		rcu_read_unlock();
2502		return -EINVAL;
2503	}
2504	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2505	rcu_read_unlock();
2506	return res;
2507}
2508EXPORT_SYMBOL(ip_route_input_common);
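/*
 * Minimal usage sketch (illustrative only, not part of this file): a
 * receive path that wants an input route attached to an skb would
 * typically do something like
 *
 *	err = ip_route_input(skb, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
 *			     ip_hdr(skb)->tos, skb->dev);
 *	if (err)
 *		goto drop;
 *	return dst_input(skb);
 *
 * i.e. resolve (or reuse) the cache entry and then let the attached dst
 * decide between local delivery and forwarding.
 */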
2509
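/*
 * __mkroute_output - build the cache entry for an output route.
 *
 * Classifies the destination (broadcast / multicast / unicast), applies
 * the RTCF_* flags, allocates the rtable with ip_output as its output
 * hook (ip_mc_output for locally originated multicast/broadcast,
 * ip_mr_input / ip_local_deliver on the input side where appropriate)
 * and finishes with rt_set_nexthop().
 */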
2510/* called with rcu_read_lock() */
2511static struct rtable *__mkroute_output(const struct fib_result *res,
2512				       const struct flowi4 *fl4,
2513				       __be32 orig_daddr, __be32 orig_saddr,
2514				       int orig_oif, __u8 orig_rtos,
2515				       struct net_device *dev_out,
2516				       unsigned int flags)
2517{
2518	struct fib_info *fi = res->fi;
 
2519	struct in_device *in_dev;
2520	u16 type = res->type;
2521	struct rtable *rth;
 
2522
2523	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
 
2524		return ERR_PTR(-EINVAL);
2525
2526	if (ipv4_is_lbcast(fl4->daddr))
2527		type = RTN_BROADCAST;
2528	else if (ipv4_is_multicast(fl4->daddr))
2529		type = RTN_MULTICAST;
2530	else if (ipv4_is_zeronet(fl4->daddr))
2531		return ERR_PTR(-EINVAL);
2532
2533	if (dev_out->flags & IFF_LOOPBACK)
2534		flags |= RTCF_LOCAL;
2535
2536	in_dev = __in_dev_get_rcu(dev_out);
2537	if (!in_dev)
2538		return ERR_PTR(-EINVAL);
2539
2540	if (type == RTN_BROADCAST) {
2541		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2542		fi = NULL;
2543	} else if (type == RTN_MULTICAST) {
2544		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2545		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2546				     fl4->flowi4_proto))
2547			flags &= ~RTCF_LOCAL;
2548		/* If a multicast route does not exist, use the
2549		 * default one, but do not gateway in this case.
2550		 * Yes, it is a hack.
2551		 */
2552		if (fi && res->prefixlen < 4)
2553			fi = NULL;
2554	}
2555
2556	rth = rt_dst_alloc(dev_out,
2557			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2558			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2559	if (!rth)
2560		return ERR_PTR(-ENOBUFS);
2561
2562	rth->dst.output = ip_output;
2563
2564	rth->rt_key_dst	= orig_daddr;
2565	rth->rt_key_src	= orig_saddr;
2566	rth->rt_genid = rt_genid(dev_net(dev_out));
2567	rth->rt_flags	= flags;
2568	rth->rt_type	= type;
2569	rth->rt_key_tos	= orig_rtos;
2570	rth->rt_dst	= fl4->daddr;
2571	rth->rt_src	= fl4->saddr;
2572	rth->rt_route_iif = 0;
2573	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
2574	rth->rt_oif	= orig_oif;
2575	rth->rt_mark    = fl4->flowi4_mark;
2576	rth->rt_gateway = fl4->daddr;
2577	rth->rt_spec_dst= fl4->saddr;
2578	rth->rt_peer_genid = 0;
2579	rth->peer = NULL;
2580	rth->fi = NULL;
2581
2582	RT_CACHE_STAT_INC(out_slow_tot);
2583
2584	if (flags & RTCF_LOCAL) {
2585		rth->dst.input = ip_local_deliver;
2586		rth->rt_spec_dst = fl4->daddr;
2587	}
2588	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2589		rth->rt_spec_dst = fl4->saddr;
2590		if (flags & RTCF_LOCAL &&
2591		    !(dev_out->flags & IFF_LOOPBACK)) {
2592			rth->dst.output = ip_mc_output;
2593			RT_CACHE_STAT_INC(out_slow_mc);
2594		}
2595#ifdef CONFIG_IP_MROUTE
2596		if (type == RTN_MULTICAST) {
2597			if (IN_DEV_MFORWARD(in_dev) &&
2598			    !ipv4_is_local_multicast(fl4->daddr)) {
2599				rth->dst.input = ip_mr_input;
2600				rth->dst.output = ip_mc_output;
2601			}
2602		}
2603#endif
2604	}
2605
2606	rt_set_nexthop(rth, fl4, res, fi, type, 0);
 
2607
2608	return rth;
2609}
2610
2611/*
2612 * Major route resolver routine.
2613 * called with rcu_read_lock();
2614 */
2615
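/*
 * Resolution order: sanity-check any requested source address, honour an
 * explicit output interface (selecting a source address from it when
 * needed), default to loopback when no destination is given, consult the
 * FIB (falling back to an on-link assumption when an oif was forced),
 * pick a multipath/default nexthop and finally hand off to
 * __mkroute_output() before inserting the result into the cache.
 */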
2616static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
 
2617{
2618	struct net_device *dev_out = NULL;
2619	__u8 tos = RT_FL_TOS(fl4);
2620	unsigned int flags = 0;
2621	struct fib_result res;
2622	struct rtable *rth;
2623	__be32 orig_daddr;
2624	__be32 orig_saddr;
2625	int orig_oif;
2626
2627	res.fi		= NULL;
2628#ifdef CONFIG_IP_MULTIPLE_TABLES
2629	res.r		= NULL;
2630#endif
2631
2632	orig_daddr = fl4->daddr;
2633	orig_saddr = fl4->saddr;
2634	orig_oif = fl4->flowi4_oif;
2635
2636	fl4->flowi4_iif = net->loopback_dev->ifindex;
2637	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2638	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2639			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2640
2641	rcu_read_lock();
2642	if (fl4->saddr) {
2643		rth = ERR_PTR(-EINVAL);
2644		if (ipv4_is_multicast(fl4->saddr) ||
2645		    ipv4_is_lbcast(fl4->saddr) ||
2646		    ipv4_is_zeronet(fl4->saddr))
 
2647			goto out;
2648
2649		/* I removed the check for oif == dev_out->oif here.
2650		   It was wrong for two reasons:
2651		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2652		      is assigned to multiple interfaces.
2653		   2. Moreover, we are allowed to send packets with the saddr
2654		      of another iface. --ANK
2655		 */
2656
2657		if (fl4->flowi4_oif == 0 &&
2658		    (ipv4_is_multicast(fl4->daddr) ||
2659		     ipv4_is_lbcast(fl4->daddr))) {
2660			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2661			dev_out = __ip_dev_find(net, fl4->saddr, false);
2662			if (dev_out == NULL)
2663				goto out;
2664
2665			/* Special hack: the user can direct multicasts
2666			   and limited broadcast via the necessary interface
2667			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2668			   This hack is not just for fun, it allows
2669			   vic, vat and friends to work.
2670			   They bind the socket to loopback, set the ttl to zero
2671			   and expect that it will work.
2672			   From the viewpoint of the routing cache they are broken,
2673			   because we are not allowed to build a multicast path
2674			   with a loopback source addr (look, the routing cache
2675			   cannot know that the ttl is zero, so the packet
2676			   will not leave this host and the route is valid).
2677			   Luckily, this hack is a good workaround.
2678			 */
2679
2680			fl4->flowi4_oif = dev_out->ifindex;
2681			goto make_route;
2682		}
2683
2684		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2685			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2686			if (!__ip_dev_find(net, fl4->saddr, false))
2687				goto out;
2688		}
2689	}
2690
2691
2692	if (fl4->flowi4_oif) {
2693		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2694		rth = ERR_PTR(-ENODEV);
2695		if (dev_out == NULL)
2696			goto out;
2697
2698		/* RACE: Check return value of inet_select_addr instead. */
2699		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2700			rth = ERR_PTR(-ENETUNREACH);
2701			goto out;
2702		}
2703		if (ipv4_is_local_multicast(fl4->daddr) ||
2704		    ipv4_is_lbcast(fl4->daddr)) {
 
2705			if (!fl4->saddr)
2706				fl4->saddr = inet_select_addr(dev_out, 0,
2707							      RT_SCOPE_LINK);
2708			goto make_route;
2709		}
2710		if (fl4->saddr) {
2711			if (ipv4_is_multicast(fl4->daddr))
2712				fl4->saddr = inet_select_addr(dev_out, 0,
2713							      fl4->flowi4_scope);
2714			else if (!fl4->daddr)
2715				fl4->saddr = inet_select_addr(dev_out, 0,
2716							      RT_SCOPE_HOST);
2717		}
2718	}
2719
2720	if (!fl4->daddr) {
2721		fl4->daddr = fl4->saddr;
2722		if (!fl4->daddr)
2723			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2724		dev_out = net->loopback_dev;
2725		fl4->flowi4_oif = net->loopback_dev->ifindex;
2726		res.type = RTN_LOCAL;
2727		flags |= RTCF_LOCAL;
2728		goto make_route;
2729	}
2730
2731	if (fib_lookup(net, fl4, &res)) {
2732		res.fi = NULL;
2733		if (fl4->flowi4_oif) {
2734			/* Apparently, the routing tables are wrong. Assume
2735			   that the destination is on-link.
2736
2737			   WHY? DW.
2738			   Because we are allowed to send to an iface
2739			   even if it has NO routes and NO assigned
2740			   addresses. When an oif is specified, the routing
2741			   tables are looked up with only one purpose:
2742			   to catch whether the destination is gatewayed, rather
2743			   than direct. Moreover, if MSG_DONTROUTE is set,
2744			   we send the packet, ignoring both routing tables
2745			   and ifaddr state. --ANK
2746
2747
2748			   We could do it even if the oif is unknown,
2749			   likely IPv6, but we do not.
2750			 */
2751
2752			if (fl4->saddr == 0)
2753				fl4->saddr = inet_select_addr(dev_out, 0,
2754							      RT_SCOPE_LINK);
2755			res.type = RTN_UNICAST;
2756			goto make_route;
2757		}
2758		rth = ERR_PTR(-ENETUNREACH);
2759		goto out;
2760	}
2761
2762	if (res.type == RTN_LOCAL) {
2763		if (!fl4->saddr) {
2764			if (res.fi->fib_prefsrc)
2765				fl4->saddr = res.fi->fib_prefsrc;
2766			else
2767				fl4->saddr = fl4->daddr;
2768		}
2769		dev_out = net->loopback_dev;
2770		fl4->flowi4_oif = dev_out->ifindex;
2771		res.fi = NULL;
2772		flags |= RTCF_LOCAL;
2773		goto make_route;
2774	}
2775
2776#ifdef CONFIG_IP_ROUTE_MULTIPATH
2777	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2778		fib_select_multipath(&res);
2779	else
2780#endif
2781	if (!res.prefixlen &&
2782	    res.table->tb_num_default > 1 &&
2783	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
2784		fib_select_default(&res);
2785
2786	if (!fl4->saddr)
2787		fl4->saddr = FIB_RES_PREFSRC(net, res);
2788
2789	dev_out = FIB_RES_DEV(res);
2790	fl4->flowi4_oif = dev_out->ifindex;
2791
 
2792
2793make_route:
2794	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
2795			       tos, dev_out, flags);
2796	if (!IS_ERR(rth)) {
2797		unsigned int hash;
2798
2799		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
2800			       rt_genid(dev_net(dev_out)));
2801		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
2802	}
2803
2804out:
2805	rcu_read_unlock();
2806	return rth;
2807}
2808
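/*
 * Fast path for output lookups: probe the route cache hash under
 * rcu_read_lock_bh(), matching on keys, oif, mark and TOS, and fall back
 * to ip_route_output_slow() on a miss or when caching is disabled.
 */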
2809struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
2810{
2811	struct rtable *rth;
2812	unsigned int hash;
2813
2814	if (!rt_caching(net))
2815		goto slow_output;
2816
2817	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));
2818
2819	rcu_read_lock_bh();
2820	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2821		rth = rcu_dereference_bh(rth->dst.rt_next)) {
2822		if (rth->rt_key_dst == flp4->daddr &&
2823		    rth->rt_key_src == flp4->saddr &&
2824		    rt_is_output_route(rth) &&
2825		    rth->rt_oif == flp4->flowi4_oif &&
2826		    rth->rt_mark == flp4->flowi4_mark &&
2827		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
2828			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
2829		    net_eq(dev_net(rth->dst.dev), net) &&
2830		    !rt_is_expired(rth)) {
2831			ipv4_validate_peer(rth);
2832			dst_use(&rth->dst, jiffies);
2833			RT_CACHE_STAT_INC(out_hit);
2834			rcu_read_unlock_bh();
2835			if (!flp4->saddr)
2836				flp4->saddr = rth->rt_src;
2837			if (!flp4->daddr)
2838				flp4->daddr = rth->rt_dst;
2839			return rth;
2840		}
2841		RT_CACHE_STAT_INC(out_hlist_search);
2842	}
2843	rcu_read_unlock_bh();
2844
2845slow_output:
2846	return ip_route_output_slow(net, flp4);
2847}
2848EXPORT_SYMBOL_GPL(__ip_route_output_key);
2849
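/*
 * "Blackhole" dst_ops: entries built on these ops never validate
 * (dst_check returns NULL), ignore PMTU updates and refuse metric
 * copy-on-write.  ipv4_blackhole_route() below clones an existing rtable
 * onto them with dst_discard as both the input and output handler.
 */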
2850static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2851{
2852	return NULL;
2853}
2854
2855static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2856{
2857	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2858
2859	return mtu ? : dst->dev->mtu;
2860}
2861
2862static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2863{
2864}
2865
2866static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2867					  unsigned long old)
2868{
2869	return NULL;
2870}
2871
2872static struct dst_ops ipv4_dst_blackhole_ops = {
2873	.family			=	AF_INET,
2874	.protocol		=	cpu_to_be16(ETH_P_IP),
2875	.destroy		=	ipv4_dst_destroy,
2876	.check			=	ipv4_blackhole_dst_check,
2877	.mtu			=	ipv4_blackhole_mtu,
2878	.default_advmss		=	ipv4_default_advmss,
2879	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
 
2880	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2881	.neigh_lookup		=	ipv4_neigh_lookup,
2882};
2883
2884struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2885{
2886	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
2887	struct rtable *ort = (struct rtable *) dst_orig;
 
2888
 
2889	if (rt) {
2890		struct dst_entry *new = &rt->dst;
2891
2892		new->__use = 1;
2893		new->input = dst_discard;
2894		new->output = dst_discard;
2895		dst_copy_metrics(new, &ort->dst);
2896
2897		new->dev = ort->dst.dev;
2898		if (new->dev)
2899			dev_hold(new->dev);
2900
2901		rt->rt_key_dst = ort->rt_key_dst;
2902		rt->rt_key_src = ort->rt_key_src;
2903		rt->rt_key_tos = ort->rt_key_tos;
2904		rt->rt_route_iif = ort->rt_route_iif;
2905		rt->rt_iif = ort->rt_iif;
2906		rt->rt_oif = ort->rt_oif;
2907		rt->rt_mark = ort->rt_mark;
2908
2909		rt->rt_genid = rt_genid(net);
2910		rt->rt_flags = ort->rt_flags;
2911		rt->rt_type = ort->rt_type;
2912		rt->rt_dst = ort->rt_dst;
2913		rt->rt_src = ort->rt_src;
2914		rt->rt_gateway = ort->rt_gateway;
2915		rt->rt_spec_dst = ort->rt_spec_dst;
2916		rt->peer = ort->peer;
2917		if (rt->peer)
2918			atomic_inc(&rt->peer->refcnt);
2919		rt->fi = ort->fi;
2920		if (rt->fi)
2921			atomic_inc(&rt->fi->fib_clntref);
2922
2923		dst_free(new);
2924	}
2925
2926	dst_release(dst_orig);
2927
2928	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2929}
2930
2931struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2932				    struct sock *sk)
2933{
2934	struct rtable *rt = __ip_route_output_key(net, flp4);
2935
2936	if (IS_ERR(rt))
2937		return rt;
2938
2939	if (flp4->flowi4_proto)
2940		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2941						   flowi4_to_flowi(flp4),
2942						   sk, 0);
2943
2944	return rt;
2945}
2946EXPORT_SYMBOL_GPL(ip_route_output_flow);
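/*
 * Minimal usage sketch (illustrative only; dst_ip, src_ip, oif, net and
 * sk are placeholder names): a caller needing an output route for a
 * UDP flow might do
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= dst_ip,
 *		.saddr		= src_ip,
 *		.flowi4_oif	= oif,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * When flowi4_proto is set the returned route may have been passed
 * through xfrm_lookup(), as above.
 */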
2947
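/*
 * rt_fill_info - dump one cache entry as an RTM_NEWROUTE netlink message.
 *
 * Emits the rtmsg header plus RTA_DST/SRC/OIF/PREFSRC/GATEWAY/FLOW/MARK
 * attributes, the dst metrics, peer-derived id / timestamp / PMTU expiry
 * data and, for input routes, either the multicast route resolved via
 * ipmr_get_route() or RTA_IIF.
 */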
2948static int rt_fill_info(struct net *net,
2949			struct sk_buff *skb, u32 pid, u32 seq, int event,
2950			int nowait, unsigned int flags)
2951{
2952	struct rtable *rt = skb_rtable(skb);
2953	struct rtmsg *r;
2954	struct nlmsghdr *nlh;
2955	unsigned long expires = 0;
2956	const struct inet_peer *peer = rt->peer;
2957	u32 id = 0, ts = 0, tsage = 0, error;
2958
2959	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2960	if (nlh == NULL)
2961		return -EMSGSIZE;
2962
2963	r = nlmsg_data(nlh);
2964	r->rtm_family	 = AF_INET;
2965	r->rtm_dst_len	= 32;
2966	r->rtm_src_len	= 0;
2967	r->rtm_tos	= rt->rt_key_tos;
2968	r->rtm_table	= RT_TABLE_MAIN;
2969	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2970		goto nla_put_failure;
2971	r->rtm_type	= rt->rt_type;
2972	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2973	r->rtm_protocol = RTPROT_UNSPEC;
2974	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2975	if (rt->rt_flags & RTCF_NOTIFY)
2976		r->rtm_flags |= RTM_F_NOTIFY;
2977
2978	if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
2979		goto nla_put_failure;
2980	if (rt->rt_key_src) {
2981		r->rtm_src_len = 32;
2982		if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
2983			goto nla_put_failure;
2984	}
2985	if (rt->dst.dev &&
2986	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2987		goto nla_put_failure;
2988#ifdef CONFIG_IP_ROUTE_CLASSID
2989	if (rt->dst.tclassid &&
2990	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2991		goto nla_put_failure;
2992#endif
2993	if (rt_is_input_route(rt)) {
2994		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_spec_dst))
 
2995			goto nla_put_failure;
2996	} else if (rt->rt_src != rt->rt_key_src) {
2997		if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
2998			goto nla_put_failure;
2999	}
3000	if (rt->rt_dst != rt->rt_gateway &&
3001	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
3002		goto nla_put_failure;
3003
3004	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
3005		goto nla_put_failure;
3006
3007	if (rt->rt_mark &&
3008	    nla_put_be32(skb, RTA_MARK, rt->rt_mark))
3009		goto nla_put_failure;
 
3010
3011	error = rt->dst.error;
3012	if (peer) {
3013		inet_peer_refcheck(rt->peer);
3014		id = atomic_read(&peer->ip_id_count) & 0xffff;
3015		if (peer->tcp_ts_stamp) {
3016			ts = peer->tcp_ts;
3017			tsage = get_seconds() - peer->tcp_ts_stamp;
3018		}
3019		expires = ACCESS_ONCE(peer->pmtu_expires);
3020		if (expires) {
3021			if (time_before(jiffies, expires))
3022				expires -= jiffies;
3023			else
3024				expires = 0;
3025		}
3026	}
3027
3028	if (rt_is_input_route(rt)) {
3029#ifdef CONFIG_IP_MROUTE
3030		__be32 dst = rt->rt_dst;
3031
3032		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
3033		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3034			int err = ipmr_get_route(net, skb,
3035						 rt->rt_src, rt->rt_dst,
3036						 r, nowait);
3037			if (err <= 0) {
3038				if (!nowait) {
3039					if (err == 0)
3040						return 0;
3041					goto nla_put_failure;
3042				} else {
3043					if (err == -EMSGSIZE)
3044						goto nla_put_failure;
3045					error = err;
3046				}
3047			}
3048		} else
3049#endif
3050			if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
3051				goto nla_put_failure;
 
3052	}
3053
3054	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
3055			       expires, error) < 0)
 
3056		goto nla_put_failure;
3057
3058	return nlmsg_end(skb, nlh);
 
3059
3060nla_put_failure:
3061	nlmsg_cancel(skb, nlh);
3062	return -EMSGSIZE;
3063}
3064
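/*
 * inet_rtm_getroute - handle RTM_GETROUTE requests.
 *
 * Builds a dummy skb, resolves the requested route either through
 * ip_route_input() (when RTA_IIF is given) or ip_route_output_key(),
 * encodes the result with rt_fill_info() and unicasts the answer back
 * to the requester.
 */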
3065static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
3066{
3067	struct net *net = sock_net(in_skb->sk);
3068	struct rtmsg *rtm;
3069	struct nlattr *tb[RTA_MAX+1];
3070	struct rtable *rt = NULL;
3071	__be32 dst = 0;
3072	__be32 src = 0;
 
3073	u32 iif;
3074	int err;
3075	int mark;
3076	struct sk_buff *skb;
3077
3078	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
3079	if (err < 0)
3080		goto errout;
3081
3082	rtm = nlmsg_data(nlh);
3083
3084	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3085	if (skb == NULL) {
3086		err = -ENOBUFS;
3087		goto errout;
 
3088	}
3089
3090	/* Reserve room for dummy headers; this skb can pass
3091	   through a good chunk of the routing engine.
3092	 */
3093	skb_reset_mac_header(skb);
3094	skb_reset_network_header(skb);
3095
3096	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
3097	ip_hdr(skb)->protocol = IPPROTO_ICMP;
3098	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
3099
3100	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
3101	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
3102	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3103	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3104
3105	if (iif) {
3106		struct net_device *dev;
3107
3108		dev = __dev_get_by_index(net, iif);
3109		if (dev == NULL) {
3110			err = -ENODEV;
3111			goto errout_free;
3112		}
3113
3114		skb->protocol	= htons(ETH_P_IP);
3115		skb->dev	= dev;
3116		skb->mark	= mark;
3117		local_bh_disable();
3118		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
3119		local_bh_enable();
3120
3121		rt = skb_rtable(skb);
3122		if (err == 0 && rt->dst.error)
3123			err = -rt->dst.error;
3124	} else {
3125		struct flowi4 fl4 = {
3126			.daddr = dst,
3127			.saddr = src,
3128			.flowi4_tos = rtm->rtm_tos,
3129			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3130			.flowi4_mark = mark,
3131		};
3132		rt = ip_route_output_key(net, &fl4);
3133
3134		err = 0;
3135		if (IS_ERR(rt))
3136			err = PTR_ERR(rt);
3137	}
3138
3139	if (err)
3140		goto errout_free;
3141
3142	skb_dst_set(skb, &rt->dst);
3143	if (rtm->rtm_flags & RTM_F_NOTIFY)
3144		rt->rt_flags |= RTCF_NOTIFY;
3145
3146	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3147			   RTM_NEWROUTE, 0, 0);
3148	if (err <= 0)
3149		goto errout_free;
3150
3151	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3152errout:
3153	return err;
3154
3155errout_free:
3156	kfree_skb(skb);
3157	goto errout;
3158}
3159
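/*
 * ip_rt_dump - netlink dump callback for the route cache.
 *
 * Walks every hash chain under rcu_read_lock_bh(), skipping expired and
 * foreign-namespace entries, and emits one RTM_NEWROUTE message per
 * entry, resuming from cb->args[] across calls.
 */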
3160int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3161{
3162	struct rtable *rt;
3163	int h, s_h;
3164	int idx, s_idx;
3165	struct net *net;
3166
3167	net = sock_net(skb->sk);
3168
3169	s_h = cb->args[0];
3170	if (s_h < 0)
3171		s_h = 0;
3172	s_idx = idx = cb->args[1];
3173	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3174		if (!rt_hash_table[h].chain)
3175			continue;
3176		rcu_read_lock_bh();
3177		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3178		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3179			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3180				continue;
3181			if (rt_is_expired(rt))
3182				continue;
3183			skb_dst_set_noref(skb, &rt->dst);
3184			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3185					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3186					 1, NLM_F_MULTI) <= 0) {
3187				skb_dst_drop(skb);
3188				rcu_read_unlock_bh();
3189				goto done;
3190			}
3191			skb_dst_drop(skb);
3192		}
3193		rcu_read_unlock_bh();
3194	}
3195
3196done:
3197	cb->args[0] = h;
3198	cb->args[1] = idx;
3199	return skb->len;
3200}
3201
3202void ip_rt_multicast_event(struct in_device *in_dev)
3203{
3204	rt_cache_flush(dev_net(in_dev->dev), 0);
3205}
3206
3207#ifdef CONFIG_SYSCTL
3208static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3209					void __user *buffer,
3210					size_t *lenp, loff_t *ppos)
3211{
3212	if (write) {
3213		int flush_delay;
3214		ctl_table ctl;
3215		struct net *net;
3216
3217		memcpy(&ctl, __ctl, sizeof(ctl));
3218		ctl.data = &flush_delay;
3219		proc_dointvec(&ctl, write, buffer, lenp, ppos);
3220
3221		net = (struct net *)__ctl->extra1;
3222		rt_cache_flush(net, flush_delay);
3223		return 0;
3224	}
3225
3226	return -EINVAL;
3227}
3228
3229static ctl_table ipv4_route_table[] = {
3230	{
3231		.procname	= "gc_thresh",
3232		.data		= &ipv4_dst_ops.gc_thresh,
3233		.maxlen		= sizeof(int),
3234		.mode		= 0644,
3235		.proc_handler	= proc_dointvec,
3236	},
3237	{
3238		.procname	= "max_size",
3239		.data		= &ip_rt_max_size,
3240		.maxlen		= sizeof(int),
3241		.mode		= 0644,
3242		.proc_handler	= proc_dointvec,
3243	},
3244	{
3245		/*  Deprecated. Use gc_min_interval_ms */
3246
3247		.procname	= "gc_min_interval",
3248		.data		= &ip_rt_gc_min_interval,
3249		.maxlen		= sizeof(int),
3250		.mode		= 0644,
3251		.proc_handler	= proc_dointvec_jiffies,
3252	},
3253	{
3254		.procname	= "gc_min_interval_ms",
3255		.data		= &ip_rt_gc_min_interval,
3256		.maxlen		= sizeof(int),
3257		.mode		= 0644,
3258		.proc_handler	= proc_dointvec_ms_jiffies,
3259	},
3260	{
3261		.procname	= "gc_timeout",
3262		.data		= &ip_rt_gc_timeout,
3263		.maxlen		= sizeof(int),
3264		.mode		= 0644,
3265		.proc_handler	= proc_dointvec_jiffies,
3266	},
3267	{
3268		.procname	= "gc_interval",
3269		.data		= &ip_rt_gc_interval,
3270		.maxlen		= sizeof(int),
3271		.mode		= 0644,
3272		.proc_handler	= proc_dointvec_jiffies,
3273	},
3274	{
3275		.procname	= "redirect_load",
3276		.data		= &ip_rt_redirect_load,
3277		.maxlen		= sizeof(int),
3278		.mode		= 0644,
3279		.proc_handler	= proc_dointvec,
3280	},
3281	{
3282		.procname	= "redirect_number",
3283		.data		= &ip_rt_redirect_number,
3284		.maxlen		= sizeof(int),
3285		.mode		= 0644,
3286		.proc_handler	= proc_dointvec,
3287	},
3288	{
3289		.procname	= "redirect_silence",
3290		.data		= &ip_rt_redirect_silence,
3291		.maxlen		= sizeof(int),
3292		.mode		= 0644,
3293		.proc_handler	= proc_dointvec,
3294	},
3295	{
3296		.procname	= "error_cost",
3297		.data		= &ip_rt_error_cost,
3298		.maxlen		= sizeof(int),
3299		.mode		= 0644,
3300		.proc_handler	= proc_dointvec,
3301	},
3302	{
3303		.procname	= "error_burst",
3304		.data		= &ip_rt_error_burst,
3305		.maxlen		= sizeof(int),
3306		.mode		= 0644,
3307		.proc_handler	= proc_dointvec,
3308	},
3309	{
3310		.procname	= "gc_elasticity",
3311		.data		= &ip_rt_gc_elasticity,
3312		.maxlen		= sizeof(int),
3313		.mode		= 0644,
3314		.proc_handler	= proc_dointvec,
3315	},
3316	{
3317		.procname	= "mtu_expires",
3318		.data		= &ip_rt_mtu_expires,
3319		.maxlen		= sizeof(int),
3320		.mode		= 0644,
3321		.proc_handler	= proc_dointvec_jiffies,
3322	},
3323	{
3324		.procname	= "min_pmtu",
3325		.data		= &ip_rt_min_pmtu,
3326		.maxlen		= sizeof(int),
3327		.mode		= 0644,
3328		.proc_handler	= proc_dointvec,
 
3329	},
3330	{
3331		.procname	= "min_adv_mss",
3332		.data		= &ip_rt_min_advmss,
3333		.maxlen		= sizeof(int),
3334		.mode		= 0644,
3335		.proc_handler	= proc_dointvec,
3336	},
3337	{ }
3338};
3339
3340static struct ctl_table ipv4_route_flush_table[] = {
3341	{
3342		.procname	= "flush",
3343		.maxlen		= sizeof(int),
3344		.mode		= 0200,
3345		.proc_handler	= ipv4_sysctl_rtcache_flush,
3346	},
3347	{ },
3348};
3349
3350static __net_init int sysctl_route_net_init(struct net *net)
3351{
3352	struct ctl_table *tbl;
3353
3354	tbl = ipv4_route_flush_table;
3355	if (!net_eq(net, &init_net)) {
3356		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3357		if (tbl == NULL)
3358			goto err_dup;
3359	}
3360	tbl[0].extra1 = net;
3361
3362	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3363	if (net->ipv4.route_hdr == NULL)
3364		goto err_reg;
3365	return 0;
3366
3367err_reg:
3368	if (tbl != ipv4_route_flush_table)
3369		kfree(tbl);
3370err_dup:
3371	return -ENOMEM;
3372}
3373
3374static __net_exit void sysctl_route_net_exit(struct net *net)
3375{
3376	struct ctl_table *tbl;
3377
3378	tbl = net->ipv4.route_hdr->ctl_table_arg;
3379	unregister_net_sysctl_table(net->ipv4.route_hdr);
3380	BUG_ON(tbl == ipv4_route_flush_table);
3381	kfree(tbl);
3382}
3383
3384static __net_initdata struct pernet_operations sysctl_route_ops = {
3385	.init = sysctl_route_net_init,
3386	.exit = sysctl_route_net_exit,
3387};
3388#endif
3389
3390static __net_init int rt_genid_init(struct net *net)
3391{
3392	get_random_bytes(&net->ipv4.rt_genid,
3393			 sizeof(net->ipv4.rt_genid));
3394	get_random_bytes(&net->ipv4.dev_addr_genid,
3395			 sizeof(net->ipv4.dev_addr_genid));
3396	return 0;
3397}
3398
3399static __net_initdata struct pernet_operations rt_genid_ops = {
3400	.init = rt_genid_init,
3401};
3402
3403
3404#ifdef CONFIG_IP_ROUTE_CLASSID
3405struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3406#endif /* CONFIG_IP_ROUTE_CLASSID */
3407
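/* Optional "rhash_entries=N" boot parameter: overrides the route cache
 * hash table size that ip_rt_init() would otherwise derive from the
 * amount of memory.
 */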
3408static __initdata unsigned long rhash_entries;
3409static int __init set_rhash_entries(char *str)
3410{
3411	ssize_t ret;
3412
3413	if (!str)
3414		return 0;
3415
3416	ret = kstrtoul(str, 0, &rhash_entries);
3417	if (ret)
3418		return 0;
 
3419
3420	return 1;
3421}
3422__setup("rhash_entries=", set_rhash_entries);
3423
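/*
 * ip_rt_init - boot-time initialisation of the IPv4 routing subsystem.
 *
 * Sets up the per-cpu classid accounting area, the dst slab cache and
 * entry counters, sizes and allocates the route cache hash table,
 * initialises devinet and the FIB, schedules the periodic expiry worker,
 * creates the /proc files, hooks up xfrm and the RTM_GETROUTE handler
 * and registers the per-namespace sysctl/genid operations.
 */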
3424int __init ip_rt_init(void)
3425{
3426	int rc = 0;
3427
3428#ifdef CONFIG_IP_ROUTE_CLASSID
3429	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3430	if (!ip_rt_acct)
3431		panic("IP: failed to allocate ip_rt_acct\n");
3432#endif
3433
3434	ipv4_dst_ops.kmem_cachep =
3435		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3436				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3437
3438	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3439
3440	if (dst_entries_init(&ipv4_dst_ops) < 0)
3441		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3442
3443	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3444		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3445
3446	rt_hash_table = (struct rt_hash_bucket *)
3447		alloc_large_system_hash("IP route cache",
3448					sizeof(struct rt_hash_bucket),
3449					rhash_entries,
3450					(totalram_pages >= 128 * 1024) ?
3451					15 : 17,
3452					0,
3453					&rt_hash_log,
3454					&rt_hash_mask,
3455					0,
3456					rhash_entries ? 0 : 512 * 1024);
3457	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3458	rt_hash_lock_init();
3459
3460	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3461	ip_rt_max_size = (rt_hash_mask + 1) * 16;
3462
3463	devinet_init();
3464	ip_fib_init();
3465
3466	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3467	expires_ljiffies = jiffies;
3468	schedule_delayed_work(&expires_work,
3469		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3470
3471	if (ip_rt_proc_init())
3472		pr_err("Unable to create route proc files\n");
3473#ifdef CONFIG_XFRM
3474	xfrm_init();
3475	xfrm4_init(ip_rt_max_size);
3476#endif
3477	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
 
3478
3479#ifdef CONFIG_SYSCTL
3480	register_pernet_subsys(&sysctl_route_ops);
3481#endif
3482	register_pernet_subsys(&rt_genid_ops);
3483	return rc;
 
3484}
3485
3486#ifdef CONFIG_SYSCTL
3487/*
3488 * We really need to sanitize the damn ipv4 init order, then all
3489 * this nonsense will go away.
3490 */
3491void __init ip_static_sysctl_init(void)
3492{
3493	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3494}
3495#endif