v4.6
 
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		ROUTE - implementation of the IP router.
   7 *
   8 * Authors:	Ross Biro
   9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *		Alan Cox	:	Verify area fixes.
  16 *		Alan Cox	:	cli() protects routing changes
  17 *		Rui Oliveira	:	ICMP routing table updates
  18 *		(rco@di.uminho.pt)	Routing table insertion and update
  19 *		Linus Torvalds	:	Rewrote bits to be sensible
  20 *		Alan Cox	:	Added BSD route gw semantics
  21 *		Alan Cox	:	Super /proc >4K
  22 *		Alan Cox	:	MTU in route table
  23 *		Alan Cox	: 	MSS actually. Also added the window
  24 *					clamper.
  25 *		Sam Lantinga	:	Fixed route matching in rt_del()
  26 *		Alan Cox	:	Routing cache support.
  27 *		Alan Cox	:	Removed compatibility cruft.
  28 *		Alan Cox	:	RTF_REJECT support.
  29 *		Alan Cox	:	TCP irtt support.
  30 *		Jonathan Naylor	:	Added Metric support.
  31 *	Miquel van Smoorenburg	:	BSD API fixes.
  32 *	Miquel van Smoorenburg	:	Metrics.
  33 *		Alan Cox	:	Use __u32 properly
  34 *		Alan Cox	:	Aligned routing errors more closely with BSD
  35 *					our system is still very different.
  36 *		Alan Cox	:	Faster /proc handling
  37 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  38 *					routing caches and better behaviour.
  39 *
  40 *		Olaf Erb	:	irtt wasn't being copied right.
  41 *		Bjorn Ekwall	:	Kerneld route support.
  42 *		Alan Cox	:	Multicast fixed (I hope)
  43 * 		Pavel Krauz	:	Limited broadcast fixed
  44 *		Mike McLagan	:	Routing by source
  45 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  46 *					route.c and rewritten from scratch.
  47 *		Andi Kleen	:	Load-limit warning messages.
  48 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  49 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  50 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  51 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  52 *		Marc Boucher	:	routing by fwmark
  53 *	Robert Olsson		:	Added rt_cache statistics
  54 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  55 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  56 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  57 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  58 *
  59 *		This program is free software; you can redistribute it and/or
  60 *		modify it under the terms of the GNU General Public License
  61 *		as published by the Free Software Foundation; either version
  62 *		2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <asm/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <linux/jhash.h>
  93#include <net/dst.h>
  94#include <net/dst_metadata.h>
 
  95#include <net/net_namespace.h>
  96#include <net/protocol.h>
  97#include <net/ip.h>
  98#include <net/route.h>
  99#include <net/inetpeer.h>
 100#include <net/sock.h>
 101#include <net/ip_fib.h>
 102#include <net/arp.h>
 103#include <net/tcp.h>
 104#include <net/icmp.h>
 105#include <net/xfrm.h>
 106#include <net/lwtunnel.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 111#include <linux/kmemleak.h>
 112#endif
 113#include <net/secure_seq.h>
 114#include <net/ip_tunnels.h>
 115#include <net/l3mdev.h>
 
 116
 117#define RT_FL_TOS(oldflp4) \
 118	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 119
 120#define RT_GC_TIMEOUT (300*HZ)
 121
 122static int ip_rt_max_size;
 123static int ip_rt_redirect_number __read_mostly	= 9;
 124static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 125static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 126static int ip_rt_error_cost __read_mostly	= HZ;
 127static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 128static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 129static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 130static int ip_rt_min_advmss __read_mostly	= 256;
 131
 132static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 
 133/*
 134 *	Interface to generic destination cache.
 135 */
 136
 137static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 
 138static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 139static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 140static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 
 
 141static void		 ipv4_link_failure(struct sk_buff *skb);
 142static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 143					   struct sk_buff *skb, u32 mtu);
 
 144static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145					struct sk_buff *skb);
 146static void		ipv4_dst_destroy(struct dst_entry *dst);
 147
 148static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 149{
 150	WARN_ON(1);
 151	return NULL;
 152}
 153
 154static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 155					   struct sk_buff *skb,
 156					   const void *daddr);
 
 157
 158static struct dst_ops ipv4_dst_ops = {
 159	.family =		AF_INET,
 160	.check =		ipv4_dst_check,
 161	.default_advmss =	ipv4_default_advmss,
 162	.mtu =			ipv4_mtu,
 163	.cow_metrics =		ipv4_cow_metrics,
 164	.destroy =		ipv4_dst_destroy,
 165	.negative_advice =	ipv4_negative_advice,
 166	.link_failure =		ipv4_link_failure,
 167	.update_pmtu =		ip_rt_update_pmtu,
 168	.redirect =		ip_do_redirect,
 169	.local_out =		__ip_local_out,
 170	.neigh_lookup =		ipv4_neigh_lookup,
 
 171};
 172
 173#define ECN_OR_COST(class)	TC_PRIO_##class
 174
 175const __u8 ip_tos2prio[16] = {
 176	TC_PRIO_BESTEFFORT,
 177	ECN_OR_COST(BESTEFFORT),
 178	TC_PRIO_BESTEFFORT,
 179	ECN_OR_COST(BESTEFFORT),
 180	TC_PRIO_BULK,
 181	ECN_OR_COST(BULK),
 182	TC_PRIO_BULK,
 183	ECN_OR_COST(BULK),
 184	TC_PRIO_INTERACTIVE,
 185	ECN_OR_COST(INTERACTIVE),
 186	TC_PRIO_INTERACTIVE,
 187	ECN_OR_COST(INTERACTIVE),
 188	TC_PRIO_INTERACTIVE_BULK,
 189	ECN_OR_COST(INTERACTIVE_BULK),
 190	TC_PRIO_INTERACTIVE_BULK,
 191	ECN_OR_COST(INTERACTIVE_BULK)
 192};
 193EXPORT_SYMBOL(ip_tos2prio);
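
/*
 * Illustrative sketch (standalone C, not kernel code): the table above is
 * consumed as ip_tos2prio[IPTOS_TOS(tos) >> 1], i.e. indexed by the TOS
 * bits (tos & 0x1e) >> 1, to pick an skb priority band.  A minimal
 * userspace model of that lookup; the names simply mirror the TC_PRIO_*
 * classes used in the table.
 */
#include <stdio.h>

static const char *tos2prio_name[16] = {
	"BESTEFFORT", "BESTEFFORT", "BESTEFFORT", "BESTEFFORT",
	"BULK", "BULK", "BULK", "BULK",
	"INTERACTIVE", "INTERACTIVE", "INTERACTIVE", "INTERACTIVE",
	"INTERACTIVE_BULK", "INTERACTIVE_BULK",
	"INTERACTIVE_BULK", "INTERACTIVE_BULK"
};

int main(void)
{
	unsigned char tos = 0x10;		/* IPTOS_LOWDELAY */
	unsigned int idx = (tos & 0x1e) >> 1;	/* IPTOS_TOS(tos) >> 1 */

	printf("tos 0x%02x -> ip_tos2prio[%u] = TC_PRIO_%s\n",
	       tos, idx, tos2prio_name[idx]);
	return 0;
}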
 194
 195static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 196#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 197
 198#ifdef CONFIG_PROC_FS
 199static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 200{
 201	if (*pos)
 202		return NULL;
 203	return SEQ_START_TOKEN;
 204}
 205
 206static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 207{
 208	++*pos;
 209	return NULL;
 210}
 211
 212static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 213{
 214}
 215
 216static int rt_cache_seq_show(struct seq_file *seq, void *v)
 217{
 218	if (v == SEQ_START_TOKEN)
 219		seq_printf(seq, "%-127s\n",
 220			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 221			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 222			   "HHUptod\tSpecDst");
 223	return 0;
 224}
 225
 226static const struct seq_operations rt_cache_seq_ops = {
 227	.start  = rt_cache_seq_start,
 228	.next   = rt_cache_seq_next,
 229	.stop   = rt_cache_seq_stop,
 230	.show   = rt_cache_seq_show,
 231};
 232
 233static int rt_cache_seq_open(struct inode *inode, struct file *file)
 234{
 235	return seq_open(file, &rt_cache_seq_ops);
 236}
 237
 238static const struct file_operations rt_cache_seq_fops = {
 239	.owner	 = THIS_MODULE,
 240	.open	 = rt_cache_seq_open,
 241	.read	 = seq_read,
 242	.llseek	 = seq_lseek,
 243	.release = seq_release,
 244};
 245
 246
 247static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 248{
 249	int cpu;
 250
 251	if (*pos == 0)
 252		return SEQ_START_TOKEN;
 253
 254	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 255		if (!cpu_possible(cpu))
 256			continue;
 257		*pos = cpu+1;
 258		return &per_cpu(rt_cache_stat, cpu);
 259	}
 260	return NULL;
 261}
 262
 263static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 264{
 265	int cpu;
 266
 267	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 268		if (!cpu_possible(cpu))
 269			continue;
 270		*pos = cpu+1;
 271		return &per_cpu(rt_cache_stat, cpu);
 272	}
 
 273	return NULL;
 274
 275}
 276
 277static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 278{
 279
 280}
 281
 282static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 283{
 284	struct rt_cache_stat *st = v;
 285
 286	if (v == SEQ_START_TOKEN) {
 287		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 288		return 0;
 289	}
 290
 291	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 292		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 
 293		   dst_entries_get_slow(&ipv4_dst_ops),
 294		   0, /* st->in_hit */
 295		   st->in_slow_tot,
 296		   st->in_slow_mc,
 297		   st->in_no_route,
 298		   st->in_brd,
 299		   st->in_martian_dst,
 300		   st->in_martian_src,
 301
 302		   0, /* st->out_hit */
 303		   st->out_slow_tot,
 304		   st->out_slow_mc,
 305
 306		   0, /* st->gc_total */
 307		   0, /* st->gc_ignored */
 308		   0, /* st->gc_goal_miss */
 309		   0, /* st->gc_dst_overflow */
 310		   0, /* st->in_hlist_search */
 311		   0  /* st->out_hlist_search */
 312		);
 313	return 0;
 314}
 315
 316static const struct seq_operations rt_cpu_seq_ops = {
 317	.start  = rt_cpu_seq_start,
 318	.next   = rt_cpu_seq_next,
 319	.stop   = rt_cpu_seq_stop,
 320	.show   = rt_cpu_seq_show,
 321};
 322
 323
 324static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 325{
 326	return seq_open(file, &rt_cpu_seq_ops);
 327}
 328
 329static const struct file_operations rt_cpu_seq_fops = {
 330	.owner	 = THIS_MODULE,
 331	.open	 = rt_cpu_seq_open,
 332	.read	 = seq_read,
 333	.llseek	 = seq_lseek,
 334	.release = seq_release,
 335};
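
/*
 * Usage sketch (standalone C, not kernel code): the rt_cpu_seq_* ops above
 * back /proc/net/stat/rt_cache, which prints the header line followed by
 * one hex-formatted line per possible CPU.  Reading it is ordinary file
 * I/O:
 */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/net/stat/rt_cache", "r");

	if (!f) {
		perror("fopen /proc/net/stat/rt_cache");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}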
 336
 337#ifdef CONFIG_IP_ROUTE_CLASSID
 338static int rt_acct_proc_show(struct seq_file *m, void *v)
 339{
 340	struct ip_rt_acct *dst, *src;
 341	unsigned int i, j;
 342
 343	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 344	if (!dst)
 345		return -ENOMEM;
 346
 347	for_each_possible_cpu(i) {
 348		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 349		for (j = 0; j < 256; j++) {
 350			dst[j].o_bytes   += src[j].o_bytes;
 351			dst[j].o_packets += src[j].o_packets;
 352			dst[j].i_bytes   += src[j].i_bytes;
 353			dst[j].i_packets += src[j].i_packets;
 354		}
 355	}
 356
 357	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 358	kfree(dst);
 359	return 0;
 360}
 361
 362static int rt_acct_proc_open(struct inode *inode, struct file *file)
 363{
 364	return single_open(file, rt_acct_proc_show, NULL);
 365}
 366
 367static const struct file_operations rt_acct_proc_fops = {
 368	.owner		= THIS_MODULE,
 369	.open		= rt_acct_proc_open,
 370	.read		= seq_read,
 371	.llseek		= seq_lseek,
 372	.release	= single_release,
 373};
 374#endif
 375
 376static int __net_init ip_rt_do_proc_init(struct net *net)
 377{
 378	struct proc_dir_entry *pde;
 379
 380	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 381			  &rt_cache_seq_fops);
 382	if (!pde)
 383		goto err1;
 384
 385	pde = proc_create("rt_cache", S_IRUGO,
 386			  net->proc_net_stat, &rt_cpu_seq_fops);
 387	if (!pde)
 388		goto err2;
 389
 390#ifdef CONFIG_IP_ROUTE_CLASSID
 391	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 
 392	if (!pde)
 393		goto err3;
 394#endif
 395	return 0;
 396
 397#ifdef CONFIG_IP_ROUTE_CLASSID
 398err3:
 399	remove_proc_entry("rt_cache", net->proc_net_stat);
 400#endif
 401err2:
 402	remove_proc_entry("rt_cache", net->proc_net);
 403err1:
 404	return -ENOMEM;
 405}
 406
 407static void __net_exit ip_rt_do_proc_exit(struct net *net)
 408{
 409	remove_proc_entry("rt_cache", net->proc_net_stat);
 410	remove_proc_entry("rt_cache", net->proc_net);
 411#ifdef CONFIG_IP_ROUTE_CLASSID
 412	remove_proc_entry("rt_acct", net->proc_net);
 413#endif
 414}
 415
 416static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 417	.init = ip_rt_do_proc_init,
 418	.exit = ip_rt_do_proc_exit,
 419};
 420
 421static int __init ip_rt_proc_init(void)
 422{
 423	return register_pernet_subsys(&ip_rt_proc_ops);
 424}
 425
 426#else
 427static inline int ip_rt_proc_init(void)
 428{
 429	return 0;
 430}
 431#endif /* CONFIG_PROC_FS */
 432
 433static inline bool rt_is_expired(const struct rtable *rth)
 434{
 435	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 436}
 437
 438void rt_cache_flush(struct net *net)
 439{
 440	rt_genid_bump_ipv4(net);
 441}
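
/*
 * Illustrative sketch (standalone C, not kernel code): rt_cache_flush()
 * does not walk or free anything; it only bumps the per-namespace
 * generation counter.  Every cached rtable remembers the generation it was
 * created under, and rt_is_expired() simply compares the two, so a flush
 * invalidates all cached routes at once.  A minimal model of the pattern:
 */
#include <stdio.h>

static unsigned int genid;			/* stands in for the net's rt_genid */

struct demo_rt { unsigned int rt_genid; };

static int demo_rt_is_expired(const struct demo_rt *rt)
{
	return rt->rt_genid != genid;
}

int main(void)
{
	struct demo_rt rt = { .rt_genid = genid };	/* freshly cached route */

	printf("expired before flush: %d\n", demo_rt_is_expired(&rt));
	genid++;					/* rt_cache_flush() */
	printf("expired after flush:  %d\n", demo_rt_is_expired(&rt));
	return 0;
}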
 442
 443static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 444					   struct sk_buff *skb,
 445					   const void *daddr)
 446{
 
 447	struct net_device *dev = dst->dev;
 448	const __be32 *pkey = daddr;
 449	const struct rtable *rt;
 450	struct neighbour *n;
 451
 452	rt = (const struct rtable *) dst;
 453	if (rt->rt_gateway)
 454		pkey = (const __be32 *) &rt->rt_gateway;
 455	else if (skb)
 456		pkey = &ip_hdr(skb)->daddr;
 457
 458	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 459	if (n)
 460		return n;
 461	return neigh_create(&arp_tbl, pkey, dev);
 462}
 463
 464#define IP_IDENTS_SZ 2048u
 465
 466static atomic_t *ip_idents __read_mostly;
 467static u32 *ip_tstamps __read_mostly;
 468
 469/* In order to protect privacy, we add a perturbation to identifiers
 470 * if one generator is seldom used. This makes hard for an attacker
 471 * to infer how many packets were sent between two points in time.
 472 */
 473u32 ip_idents_reserve(u32 hash, int segs)
 474{
 475	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 476	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 477	u32 old = ACCESS_ONCE(*p_tstamp);
 478	u32 now = (u32)jiffies;
 479	u32 delta = 0;
 480
 481	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 482		delta = prandom_u32_max(now - old);
 483
 484	return atomic_add_return(segs + delta, p_id) - segs;
 485}
 486EXPORT_SYMBOL(ip_idents_reserve);
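
/*
 * Simplified userspace model (not the kernel API): each of the IP_IDENTS_SZ
 * buckets keeps a counter plus the jiffies value of its last use.  When a
 * bucket has been idle, a random offset bounded by the idle time is added
 * before the counter is handed out, so consecutive IP IDs no longer reveal
 * how many packets were sent in between.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define DEMO_IDENTS_SZ 2048u

static uint32_t demo_ids[DEMO_IDENTS_SZ];
static uint32_t demo_tstamps[DEMO_IDENTS_SZ];

static uint32_t demo_idents_reserve(uint32_t hash, int segs, uint32_t now)
{
	uint32_t slot = hash % DEMO_IDENTS_SZ;
	uint32_t old = demo_tstamps[slot];
	uint32_t delta = 0;

	if (old != now) {
		demo_tstamps[slot] = now;
		delta = (uint32_t)rand() % (now - old);	/* like prandom_u32_max() */
	}
	demo_ids[slot] += segs + delta;
	return demo_ids[slot] - segs;	/* first id of the reserved range */
}

int main(void)
{
	printf("id at t=100: %u\n", demo_idents_reserve(42, 1, 100));
	printf("id at t=900: %u\n", demo_idents_reserve(42, 1, 900));	/* idle gap -> random jump */
	return 0;
}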
 487
 488void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 489{
 490	static u32 ip_idents_hashrnd __read_mostly;
 491	u32 hash, id;
 492
 493	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 494
 495	hash = jhash_3words((__force u32)iph->daddr,
 496			    (__force u32)iph->saddr,
 497			    iph->protocol ^ net_hash_mix(net),
 498			    ip_idents_hashrnd);
 499	id = ip_idents_reserve(hash, segs);
 500	iph->id = htons(id);
 501}
 502EXPORT_SYMBOL(__ip_select_ident);
 503
 504static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 505			     const struct iphdr *iph,
 506			     int oif, u8 tos,
 507			     u8 prot, u32 mark, int flow_flags)
 508{
 509	if (sk) {
 510		const struct inet_sock *inet = inet_sk(sk);
 511
 512		oif = sk->sk_bound_dev_if;
 513		mark = sk->sk_mark;
 514		tos = RT_CONN_FLAGS(sk);
 515		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 516	}
 517	flowi4_init_output(fl4, oif, mark, tos,
 518			   RT_SCOPE_UNIVERSE, prot,
 519			   flow_flags,
 520			   iph->daddr, iph->saddr, 0, 0);
 
 
 521}
 522
 523static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 524			       const struct sock *sk)
 525{
 
 526	const struct iphdr *iph = ip_hdr(skb);
 527	int oif = skb->dev->ifindex;
 528	u8 tos = RT_TOS(iph->tos);
 529	u8 prot = iph->protocol;
 530	u32 mark = skb->mark;
 
 531
 532	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 533}
 534
 535static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 536{
 537	const struct inet_sock *inet = inet_sk(sk);
 538	const struct ip_options_rcu *inet_opt;
 539	__be32 daddr = inet->inet_daddr;
 540
 541	rcu_read_lock();
 542	inet_opt = rcu_dereference(inet->inet_opt);
 543	if (inet_opt && inet_opt->opt.srr)
 544		daddr = inet_opt->opt.faddr;
 545	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 546			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 547			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 
 
 548			   inet_sk_flowi_flags(sk),
 549			   daddr, inet->inet_saddr, 0, 0);
 550	rcu_read_unlock();
 551}
 552
 553static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 554				 const struct sk_buff *skb)
 555{
 556	if (skb)
 557		build_skb_flow_key(fl4, skb, sk);
 558	else
 559		build_sk_flow_key(fl4, sk);
 560}
 561
 562static inline void rt_free(struct rtable *rt)
 563{
 564	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 565}
 566
 567static DEFINE_SPINLOCK(fnhe_lock);
 568
 569static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 570{
 571	struct rtable *rt;
 572
 573	rt = rcu_dereference(fnhe->fnhe_rth_input);
 574	if (rt) {
 575		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 576		rt_free(rt);
 
 577	}
 578	rt = rcu_dereference(fnhe->fnhe_rth_output);
 579	if (rt) {
 580		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 581		rt_free(rt);
 
 582	}
 583}
 584
 585static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 586{
 587	struct fib_nh_exception *fnhe, *oldest;
 
 588
 589	oldest = rcu_dereference(hash->chain);
 590	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 591	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 592		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 593			oldest = fnhe;
 
 
 594	}
 595	fnhe_flush_routes(oldest);
 596	return oldest;
 
 597}
 598
 599static inline u32 fnhe_hashfun(__be32 daddr)
 600{
 601	static u32 fnhe_hashrnd __read_mostly;
 602	u32 hval;
 603
 604	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 605	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 606	return hash_32(hval, FNHE_HASH_SHIFT);
 607}
 608
 609static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 610{
 611	rt->rt_pmtu = fnhe->fnhe_pmtu;
 
 612	rt->dst.expires = fnhe->fnhe_expires;
 613
 614	if (fnhe->fnhe_gw) {
 615		rt->rt_flags |= RTCF_REDIRECTED;
 616		rt->rt_gateway = fnhe->fnhe_gw;
 617		rt->rt_uses_gateway = 1;
 
 
 618	}
 619}
 620
 621static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 622				  u32 pmtu, unsigned long expires)
 
 623{
 624	struct fnhe_hash_bucket *hash;
 625	struct fib_nh_exception *fnhe;
 626	struct rtable *rt;
 
 627	unsigned int i;
 628	int depth;
 629	u32 hval = fnhe_hashfun(daddr);
 
 
 630
 631	spin_lock_bh(&fnhe_lock);
 632
 633	hash = rcu_dereference(nh->nh_exceptions);
 634	if (!hash) {
 635		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 636		if (!hash)
 637			goto out_unlock;
 638		rcu_assign_pointer(nh->nh_exceptions, hash);
 639	}
 640
 641	hash += hval;
 642
 643	depth = 0;
 644	for (fnhe = rcu_dereference(hash->chain); fnhe;
 645	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 646		if (fnhe->fnhe_daddr == daddr)
 647			break;
 648		depth++;
 649	}
 650
 651	if (fnhe) {
 
 
 652		if (gw)
 653			fnhe->fnhe_gw = gw;
 654		if (pmtu) {
 655			fnhe->fnhe_pmtu = pmtu;
 656			fnhe->fnhe_expires = max(1UL, expires);
 657		}
 
 658		/* Update all cached dsts too */
 659		rt = rcu_dereference(fnhe->fnhe_rth_input);
 660		if (rt)
 661			fill_route_from_fnhe(rt, fnhe);
 662		rt = rcu_dereference(fnhe->fnhe_rth_output);
 663		if (rt)
 664			fill_route_from_fnhe(rt, fnhe);
 665	} else {
 666		if (depth > FNHE_RECLAIM_DEPTH)
 667			fnhe = fnhe_oldest(hash);
 668		else {
 669			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 670			if (!fnhe)
 671				goto out_unlock;
 672
 673			fnhe->fnhe_next = hash->chain;
 674			rcu_assign_pointer(hash->chain, fnhe);
 
 675		}
 676		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
 677		fnhe->fnhe_daddr = daddr;
 678		fnhe->fnhe_gw = gw;
 679		fnhe->fnhe_pmtu = pmtu;
 680		fnhe->fnhe_expires = expires;
 681
 682		/* Exception created; mark the cached routes for the nexthop
 683		 * stale, so anyone caching it rechecks if this exception
 684		 * applies to them.
 685		 */
 686		rt = rcu_dereference(nh->nh_rth_input);
 687		if (rt)
 688			rt->dst.obsolete = DST_OBSOLETE_KILL;
 689
 690		for_each_possible_cpu(i) {
 691			struct rtable __rcu **prt;
 692			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 
 693			rt = rcu_dereference(*prt);
 694			if (rt)
 695				rt->dst.obsolete = DST_OBSOLETE_KILL;
 696		}
 697	}
 698
 699	fnhe->fnhe_stamp = jiffies;
 700
 701out_unlock:
 702	spin_unlock_bh(&fnhe_lock);
 703}
 704
 705static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 706			     bool kill_route)
 707{
 708	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 709	__be32 old_gw = ip_hdr(skb)->saddr;
 710	struct net_device *dev = skb->dev;
 711	struct in_device *in_dev;
 712	struct fib_result res;
 713	struct neighbour *n;
 714	struct net *net;
 715
 716	switch (icmp_hdr(skb)->code & 7) {
 717	case ICMP_REDIR_NET:
 718	case ICMP_REDIR_NETTOS:
 719	case ICMP_REDIR_HOST:
 720	case ICMP_REDIR_HOSTTOS:
 721		break;
 722
 723	default:
 724		return;
 725	}
 726
 727	if (rt->rt_gateway != old_gw)
 728		return;
 729
 730	in_dev = __in_dev_get_rcu(dev);
 731	if (!in_dev)
 732		return;
 733
 734	net = dev_net(dev);
 735	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 736	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 737	    ipv4_is_zeronet(new_gw))
 738		goto reject_redirect;
 739
 740	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 741		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 742			goto reject_redirect;
 743		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 744			goto reject_redirect;
 745	} else {
 746		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 747			goto reject_redirect;
 748	}
 749
 750	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
 
 
 751	if (!IS_ERR(n)) {
 752		if (!(n->nud_state & NUD_VALID)) {
 753			neigh_event_send(n, NULL);
 754		} else {
 755			if (fib_lookup(net, fl4, &res, 0) == 0) {
 756				struct fib_nh *nh = &FIB_RES_NH(res);
 757
 758				update_or_create_fnhe(nh, fl4->daddr, new_gw,
 759						0, jiffies + ip_rt_gc_timeout);
 760			}
 761			if (kill_route)
 762				rt->dst.obsolete = DST_OBSOLETE_KILL;
 763			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 764		}
 765		neigh_release(n);
 766	}
 767	return;
 768
 769reject_redirect:
 770#ifdef CONFIG_IP_ROUTE_VERBOSE
 771	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 772		const struct iphdr *iph = (const struct iphdr *) skb->data;
 773		__be32 daddr = iph->daddr;
 774		__be32 saddr = iph->saddr;
 775
 776		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 777				     "  Advised path = %pI4 -> %pI4\n",
 778				     &old_gw, dev->name, &new_gw,
 779				     &saddr, &daddr);
 780	}
 781#endif
 782	;
 783}
 784
 785static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 786{
 787	struct rtable *rt;
 788	struct flowi4 fl4;
 789	const struct iphdr *iph = (const struct iphdr *) skb->data;
 
 790	int oif = skb->dev->ifindex;
 791	u8 tos = RT_TOS(iph->tos);
 792	u8 prot = iph->protocol;
 793	u32 mark = skb->mark;
 
 794
 795	rt = (struct rtable *) dst;
 796
 797	__build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 798	__ip_do_redirect(rt, skb, &fl4, true);
 799}
 800
 801static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 
 802{
 803	struct rtable *rt = (struct rtable *)dst;
 804	struct dst_entry *ret = dst;
 805
 806	if (rt) {
 807		if (dst->obsolete > 0) {
 808			ip_rt_put(rt);
 809			ret = NULL;
 810		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 811			   rt->dst.expires) {
 812			ip_rt_put(rt);
 813			ret = NULL;
 814		}
 815	}
 816	return ret;
 817}
 818
 819/*
 820 * Algorithm:
 821 *	1. The first ip_rt_redirect_number redirects are sent
 822 *	   with exponential backoff, then we stop sending them at all,
 823 *	   assuming that the host ignores our redirects.
 824 *	2. If we did not see packets requiring redirects
 825 *	   during ip_rt_redirect_silence, we assume that the host
 826 *	   forgot redirected route and start to send redirects again.
 827 *
 828 * This algorithm is much cheaper and more intelligent than dumb load limiting
 829 * in icmp.c.
 830 *
 831 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 832 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 833 */
 834
 835void ip_rt_send_redirect(struct sk_buff *skb)
 836{
 837	struct rtable *rt = skb_rtable(skb);
 838	struct in_device *in_dev;
 839	struct inet_peer *peer;
 840	struct net *net;
 841	int log_martians;
 842	int vif;
 843
 844	rcu_read_lock();
 845	in_dev = __in_dev_get_rcu(rt->dst.dev);
 846	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 847		rcu_read_unlock();
 848		return;
 849	}
 850	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 851	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 852	rcu_read_unlock();
 853
 854	net = dev_net(rt->dst.dev);
 855	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 856	if (!peer) {
 857		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 858			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 859		return;
 860	}
 861
 862	/* No redirected packets during ip_rt_redirect_silence;
 863	 * reset the algorithm.
 864	 */
 865	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 866		peer->rate_tokens = 0;
 
 
 867
 868	/* Too many ignored redirects; do not send anything
 869	 * set dst.rate_last to the last seen redirected packet.
 870	 */
 871	if (peer->rate_tokens >= ip_rt_redirect_number) {
 872		peer->rate_last = jiffies;
 873		goto out_put_peer;
 874	}
 875
 876	/* Check for load limit; set rate_last to the latest sent
 877	 * redirect.
 878	 */
 879	if (peer->rate_tokens == 0 ||
 880	    time_after(jiffies,
 881		       (peer->rate_last +
 882			(ip_rt_redirect_load << peer->rate_tokens)))) {
 883		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 884
 885		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 886		peer->rate_last = jiffies;
 887		++peer->rate_tokens;
 888#ifdef CONFIG_IP_ROUTE_VERBOSE
 889		if (log_martians &&
 890		    peer->rate_tokens == ip_rt_redirect_number)
 891			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 892					     &ip_hdr(skb)->saddr, inet_iif(skb),
 893					     &ip_hdr(skb)->daddr, &gw);
 894#endif
 895	}
 896out_put_peer:
 897	inet_putpeer(peer);
 898}
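
/*
 * Illustrative arithmetic (standalone C, not kernel code), assuming HZ=1000
 * and the default sysctls above: redirect i (0-based) is only emitted once
 * jiffies has passed rate_last + (ip_rt_redirect_load << i), so the gap
 * doubles each time; after ip_rt_redirect_number unanswered redirects
 * nothing more is sent until the peer stays quiet for
 * ip_rt_redirect_silence.  A compressed model of that decision:
 */
#include <stdio.h>

#define DEMO_HZ			1000
#define REDIRECT_NUMBER		9
#define REDIRECT_LOAD		(DEMO_HZ / 50)
#define REDIRECT_SILENCE	((DEMO_HZ / 50) << (9 + 1))

static unsigned long rate_last;
static unsigned int rate_tokens;

static int demo_should_send_redirect(unsigned long now)
{
	if (now - rate_last > REDIRECT_SILENCE)
		rate_tokens = 0;		/* peer forgot us, start over */

	if (rate_tokens >= REDIRECT_NUMBER) {
		rate_last = now;		/* keep refreshing the silence window */
		return 0;
	}
	if (rate_tokens == 0 ||
	    now > rate_last + ((unsigned long)REDIRECT_LOAD << rate_tokens)) {
		rate_last = now;
		rate_tokens++;
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long now;

	for (now = 0; now < 30UL * DEMO_HZ; now += DEMO_HZ / 10)
		if (demo_should_send_redirect(now))
			printf("redirect #%u sent at t=%lu ms\n", rate_tokens, now);
	return 0;
}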
 899
 900static int ip_error(struct sk_buff *skb)
 901{
 902	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 903	struct rtable *rt = skb_rtable(skb);
 
 
 904	struct inet_peer *peer;
 905	unsigned long now;
 906	struct net *net;
 
 907	bool send;
 908	int code;
 909
 910	/* IP on this device is disabled. */
 911	if (!in_dev)
 912		goto out;
 913
 914	net = dev_net(rt->dst.dev);
 915	if (!IN_DEV_FORWARD(in_dev)) {
 916		switch (rt->dst.error) {
 917		case EHOSTUNREACH:
 918			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 
 919			break;
 920
 921		case ENETUNREACH:
 922			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 
 923			break;
 924		}
 925		goto out;
 926	}
 927
 928	switch (rt->dst.error) {
 929	case EINVAL:
 930	default:
 931		goto out;
 932	case EHOSTUNREACH:
 933		code = ICMP_HOST_UNREACH;
 934		break;
 935	case ENETUNREACH:
 936		code = ICMP_NET_UNREACH;
 937		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 
 938		break;
 939	case EACCES:
 940		code = ICMP_PKT_FILTERED;
 941		break;
 942	}
 943
 944	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 945			       l3mdev_master_ifindex(skb->dev), 1);
 946
 947	send = true;
 948	if (peer) {
 949		now = jiffies;
 950		peer->rate_tokens += now - peer->rate_last;
 951		if (peer->rate_tokens > ip_rt_error_burst)
 952			peer->rate_tokens = ip_rt_error_burst;
 953		peer->rate_last = now;
 954		if (peer->rate_tokens >= ip_rt_error_cost)
 955			peer->rate_tokens -= ip_rt_error_cost;
 956		else
 957			send = false;
 958		inet_putpeer(peer);
 959	}
 960	if (send)
 961		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 962
 963out:	kfree_skb(skb);
 964	return 0;
 965}
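
/*
 * Simplified model (standalone C, not kernel code), assuming HZ=1000 and
 * the defaults above: ip_error() rate-limits ICMP errors per source with a
 * token bucket counted in jiffies.  Tokens accrue with the time since the
 * last bad packet, are capped at ip_rt_error_burst (5*HZ), and each ICMP
 * sent costs ip_rt_error_cost (HZ): roughly one error per second sustained,
 * with a burst ceiling of five.
 */
#include <stdio.h>

#define DEMO_HZ		1000
#define ERROR_COST	DEMO_HZ
#define ERROR_BURST	(5 * DEMO_HZ)

static unsigned long tokens, last;

static int demo_may_send_icmp_error(unsigned long now)
{
	tokens += now - last;
	if (tokens > ERROR_BURST)
		tokens = ERROR_BURST;
	last = now;

	if (tokens >= ERROR_COST) {
		tokens -= ERROR_COST;
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long now;
	int sent = 0;

	/* a flood of unroutable packets, one every 10 ms for two seconds */
	for (now = 0; now < 2000; now += 10)
		sent += demo_may_send_icmp_error(now);
	printf("ICMP errors sent for 200 offending packets: %d\n", sent);
	return 0;
}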
 966
 967static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 968{
 969	struct dst_entry *dst = &rt->dst;
 
 970	struct fib_result res;
 
 
 971
 972	if (dst_metric_locked(dst, RTAX_MTU))
 973		return;
 974
 975	if (ipv4_mtu(dst) < mtu)
 
 976		return;
 977
 978	if (mtu < ip_rt_min_pmtu)
 979		mtu = ip_rt_min_pmtu;
 
 
 980
 981	if (rt->rt_pmtu == mtu &&
 982	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
 983		return;
 984
 985	rcu_read_lock();
 986	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 987		struct fib_nh *nh = &FIB_RES_NH(res);
 988
 989		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
 990				      jiffies + ip_rt_mtu_expires);
 
 
 991	}
 992	rcu_read_unlock();
 993}
 994
 995static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 996			      struct sk_buff *skb, u32 mtu)
 
 997{
 998	struct rtable *rt = (struct rtable *) dst;
 999	struct flowi4 fl4;
1000
1001	ip_rt_build_flow_key(&fl4, sk, skb);
1002	__ip_rt_update_pmtu(rt, &fl4, mtu);
1003}
1004
1005void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1006		      int oif, u32 mark, u8 protocol, int flow_flags)
1007{
1008	const struct iphdr *iph = (const struct iphdr *) skb->data;
1009	struct flowi4 fl4;
1010	struct rtable *rt;
 
1011
1012	if (!mark)
1013		mark = IP4_REPLY_MARK(net, skb->mark);
1014
1015	__build_flow_key(&fl4, NULL, iph, oif,
1016			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1017	rt = __ip_route_output_key(net, &fl4);
1018	if (!IS_ERR(rt)) {
1019		__ip_rt_update_pmtu(rt, &fl4, mtu);
1020		ip_rt_put(rt);
1021	}
1022}
1023EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1024
1025static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1026{
1027	const struct iphdr *iph = (const struct iphdr *) skb->data;
1028	struct flowi4 fl4;
1029	struct rtable *rt;
1030
1031	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1032
1033	if (!fl4.flowi4_mark)
1034		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1035
1036	rt = __ip_route_output_key(sock_net(sk), &fl4);
1037	if (!IS_ERR(rt)) {
1038		__ip_rt_update_pmtu(rt, &fl4, mtu);
1039		ip_rt_put(rt);
1040	}
1041}
1042
1043void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1044{
1045	const struct iphdr *iph = (const struct iphdr *) skb->data;
1046	struct flowi4 fl4;
1047	struct rtable *rt;
1048	struct dst_entry *odst = NULL;
1049	bool new = false;
 
1050
1051	bh_lock_sock(sk);
1052
1053	if (!ip_sk_accept_pmtu(sk))
1054		goto out;
1055
1056	odst = sk_dst_get(sk);
1057
1058	if (sock_owned_by_user(sk) || !odst) {
1059		__ipv4_sk_update_pmtu(skb, sk, mtu);
1060		goto out;
1061	}
1062
1063	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1064
1065	rt = (struct rtable *)odst;
1066	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1067		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1068		if (IS_ERR(rt))
1069			goto out;
1070
1071		new = true;
1072	}
1073
1074	__ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1075
1076	if (!dst_check(&rt->dst, 0)) {
1077		if (new)
1078			dst_release(&rt->dst);
1079
1080		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1081		if (IS_ERR(rt))
1082			goto out;
1083
1084		new = true;
1085	}
1086
1087	if (new)
1088		sk_dst_set(sk, &rt->dst);
1089
1090out:
1091	bh_unlock_sock(sk);
1092	dst_release(odst);
1093}
1094EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1095
1096void ipv4_redirect(struct sk_buff *skb, struct net *net,
1097		   int oif, u32 mark, u8 protocol, int flow_flags)
1098{
1099	const struct iphdr *iph = (const struct iphdr *) skb->data;
1100	struct flowi4 fl4;
1101	struct rtable *rt;
1102
1103	__build_flow_key(&fl4, NULL, iph, oif,
1104			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1105	rt = __ip_route_output_key(net, &fl4);
1106	if (!IS_ERR(rt)) {
1107		__ip_do_redirect(rt, skb, &fl4, false);
1108		ip_rt_put(rt);
1109	}
1110}
1111EXPORT_SYMBOL_GPL(ipv4_redirect);
1112
1113void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1114{
1115	const struct iphdr *iph = (const struct iphdr *) skb->data;
1116	struct flowi4 fl4;
1117	struct rtable *rt;
 
1118
1119	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1120	rt = __ip_route_output_key(sock_net(sk), &fl4);
1121	if (!IS_ERR(rt)) {
1122		__ip_do_redirect(rt, skb, &fl4, false);
1123		ip_rt_put(rt);
1124	}
1125}
1126EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1127
1128static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 
1129{
1130	struct rtable *rt = (struct rtable *) dst;
1131
1132	/* All IPV4 dsts are created with ->obsolete set to the value
1133	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1134	 * into this function always.
1135	 *
1136	 * When a PMTU/redirect information update invalidates a route,
1137	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1138	 * DST_OBSOLETE_DEAD by dst_free().
1139	 */
1140	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1141		return NULL;
1142	return dst;
1143}
1144
1145static void ipv4_link_failure(struct sk_buff *skb)
1146{
1147	struct rtable *rt;
1148
1149	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1150
1151	rt = skb_rtable(skb);
1152	if (rt)
1153		dst_set_expires(&rt->dst, 0);
1154}
1155
1156static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1157{
1158	pr_debug("%s: %pI4 -> %pI4, %s\n",
1159		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1160		 skb->dev ? skb->dev->name : "?");
1161	kfree_skb(skb);
1162	WARN_ON(1);
1163	return 0;
1164}
1165
1166/*
1167   We do not cache the source address of the outgoing interface,
1168   because it is used only by the IP RR, TS and SRR options,
1169   so it stays out of the fast path.
1170
1171   BTW remember: "addr" is allowed to be not aligned
1172   in IP options!
1173 */
1174
1175void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1176{
1177	__be32 src;
1178
1179	if (rt_is_output_route(rt))
1180		src = ip_hdr(skb)->saddr;
1181	else {
1182		struct fib_result res;
1183		struct flowi4 fl4;
1184		struct iphdr *iph;
1185
1186		iph = ip_hdr(skb);
1187
1188		memset(&fl4, 0, sizeof(fl4));
1189		fl4.daddr = iph->daddr;
1190		fl4.saddr = iph->saddr;
1191		fl4.flowi4_tos = RT_TOS(iph->tos);
1192		fl4.flowi4_oif = rt->dst.dev->ifindex;
1193		fl4.flowi4_iif = skb->dev->ifindex;
1194		fl4.flowi4_mark = skb->mark;
1195
1196		rcu_read_lock();
1197		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1198			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1199		else
1200			src = inet_select_addr(rt->dst.dev,
1201					       rt_nexthop(rt, iph->daddr),
1202					       RT_SCOPE_UNIVERSE);
1203		rcu_read_unlock();
1204	}
1205	memcpy(addr, &src, 4);
1206}
1207
1208#ifdef CONFIG_IP_ROUTE_CLASSID
1209static void set_class_tag(struct rtable *rt, u32 tag)
1210{
1211	if (!(rt->dst.tclassid & 0xFFFF))
1212		rt->dst.tclassid |= tag & 0xFFFF;
1213	if (!(rt->dst.tclassid & 0xFFFF0000))
1214		rt->dst.tclassid |= tag & 0xFFFF0000;
1215}
1216#endif
1217
1218static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1219{
1220	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1221
1222	if (advmss == 0) {
1223		advmss = max_t(unsigned int, dst->dev->mtu - 40,
1224			       ip_rt_min_advmss);
1225		if (advmss > 65535 - 40)
1226			advmss = 65535 - 40;
1227	}
1228	return advmss;
1229}
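
/*
 * Worked example (standalone C, not kernel code): with no RTAX_ADVMSS
 * metric set, the advertised MSS is derived from the device MTU minus 40
 * bytes of IPv4 + TCP headers, floored at ip_rt_min_advmss (256 by
 * default) and capped at 65535 - 40.  A 1500-byte Ethernet MTU therefore
 * advertises an MSS of 1460.
 */
#include <stdio.h>

static unsigned int demo_default_advmss(unsigned int dev_mtu,
					unsigned int min_advmss)
{
	unsigned int advmss = dev_mtu - 40;

	if (advmss < min_advmss)
		advmss = min_advmss;
	if (advmss > 65535 - 40)
		advmss = 65535 - 40;
	return advmss;
}

int main(void)
{
	printf("mtu 1500 -> advmss %u\n", demo_default_advmss(1500, 256));
	printf("mtu  256 -> advmss %u\n", demo_default_advmss(256, 256));
	return 0;
}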
1230
1231static unsigned int ipv4_mtu(const struct dst_entry *dst)
1232{
1233	const struct rtable *rt = (const struct rtable *) dst;
1234	unsigned int mtu = rt->rt_pmtu;
 
1235
1236	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1237		mtu = dst_metric_raw(dst, RTAX_MTU);
1238
1239	if (mtu)
1240		return mtu;
1241
1242	mtu = dst->dev->mtu;
 
 
1243
1244	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1245		if (rt->rt_uses_gateway && mtu > 576)
1246			mtu = 576;
1247	}
1248
1249	return min_t(unsigned int, mtu, IP_MAX_MTU);
1250}
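
/*
 * Simplified sketch (standalone C, none of the dst/metric machinery) of the
 * selection order above: a still-valid learned rt_pmtu wins, then an
 * explicit RTAX_MTU metric, then the device MTU, which on a locked-metric
 * route via a gateway is clamped to the classic 576 bytes; the device
 * fallback is bounded by IP_MAX_MTU (assumed 0xFFF0 here).
 */
#include <stdio.h>

#define DEMO_IP_MAX_MTU	0xFFF0U		/* assumed value of IP_MAX_MTU */

static unsigned int demo_ipv4_mtu(unsigned int rt_pmtu, int pmtu_still_valid,
				  unsigned int metric_mtu, int mtu_locked,
				  int uses_gateway, unsigned int dev_mtu)
{
	unsigned int mtu = pmtu_still_valid ? rt_pmtu : 0;

	if (!mtu)
		mtu = metric_mtu;
	if (mtu)
		return mtu;

	mtu = dev_mtu;
	if (mtu_locked && uses_gateway && mtu > 576)
		mtu = 576;
	return mtu < DEMO_IP_MAX_MTU ? mtu : DEMO_IP_MAX_MTU;
}

int main(void)
{
	/* a learned PMTU of 1400 wins over the 1500-byte device MTU */
	printf("%u\n", demo_ipv4_mtu(1400, 1, 0, 0, 0, 1500));
	/* expired PMTU and no metric: fall back to the device MTU */
	printf("%u\n", demo_ipv4_mtu(1400, 0, 0, 0, 0, 1500));
	return 0;
}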
1251
1252static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 
1253{
1254	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1255	struct fib_nh_exception *fnhe;
1256	u32 hval;
1257
1258	if (!hash)
1259		return NULL;
1260
1261	hval = fnhe_hashfun(daddr);
1262
1263	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1264	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1265		if (fnhe->fnhe_daddr == daddr)
1266			return fnhe;
 
1267	}
1268	return NULL;
1269}
1270
1271static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1272			      __be32 daddr)
1273{
1274	bool ret = false;
1275
1276	spin_lock_bh(&fnhe_lock);
1277
1278	if (daddr == fnhe->fnhe_daddr) {
1279		struct rtable __rcu **porig;
1280		struct rtable *orig;
1281		int genid = fnhe_genid(dev_net(rt->dst.dev));
1282
1283		if (rt_is_input_route(rt))
1284			porig = &fnhe->fnhe_rth_input;
1285		else
1286			porig = &fnhe->fnhe_rth_output;
1287		orig = rcu_dereference(*porig);
1288
1289		if (fnhe->fnhe_genid != genid) {
1290			fnhe->fnhe_genid = genid;
1291			fnhe->fnhe_gw = 0;
1292			fnhe->fnhe_pmtu = 0;
1293			fnhe->fnhe_expires = 0;
 
1294			fnhe_flush_routes(fnhe);
1295			orig = NULL;
1296		}
1297		fill_route_from_fnhe(rt, fnhe);
1298		if (!rt->rt_gateway)
1299			rt->rt_gateway = daddr;
 
 
1300
1301		if (!(rt->dst.flags & DST_NOCACHE)) {
 
1302			rcu_assign_pointer(*porig, rt);
1303			if (orig)
1304				rt_free(orig);
 
 
1305			ret = true;
1306		}
1307
1308		fnhe->fnhe_stamp = jiffies;
1309	}
1310	spin_unlock_bh(&fnhe_lock);
1311
1312	return ret;
1313}
1314
1315static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1316{
1317	struct rtable *orig, *prev, **p;
1318	bool ret = true;
1319
1320	if (rt_is_input_route(rt)) {
1321		p = (struct rtable **)&nh->nh_rth_input;
1322	} else {
1323		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1324	}
1325	orig = *p;
1326
1327	prev = cmpxchg(p, orig, rt);
1328	if (prev == orig) {
1329		if (orig)
1330			rt_free(orig);
1331	} else
1332		ret = false;
 
1333
1334	return ret;
1335}
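
/*
 * Illustrative sketch (userspace C11 atomics instead of the kernel's
 * cmpxchg/rt_free): caching the new route in the nexthop is a single-slot,
 * lock-free publish.  If another CPU won the race and the slot no longer
 * holds the value we read, the compare-and-swap fails and the caller keeps
 * its route uncached (DST_NOCACHE) instead of retrying.
 */
#include <stdio.h>
#include <stdatomic.h>

struct demo_rt { int id; };

static _Atomic(struct demo_rt *) cache_slot;

static int demo_cache_route(struct demo_rt *rt)
{
	struct demo_rt *orig = atomic_load(&cache_slot);

	/* succeeds only if the slot still holds exactly what we read */
	if (atomic_compare_exchange_strong(&cache_slot, &orig, rt)) {
		/* the kernel would rt_free(orig) here, after an RCU grace period */
		return 1;
	}
	return 0;	/* lost the race: caller marks the route DST_NOCACHE */
}

int main(void)
{
	struct demo_rt a = { 1 }, b = { 2 };

	printf("cache a: %d\n", demo_cache_route(&a));
	printf("cache b: %d\n", demo_cache_route(&b));	/* replaces (and would free) a */
	return 0;
}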
1336
1337struct uncached_list {
1338	spinlock_t		lock;
1339	struct list_head	head;
 
1340};
1341
1342static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1343
1344static void rt_add_uncached_list(struct rtable *rt)
1345{
1346	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1347
1348	rt->rt_uncached_list = ul;
1349
1350	spin_lock_bh(&ul->lock);
1351	list_add_tail(&rt->rt_uncached, &ul->head);
1352	spin_unlock_bh(&ul->lock);
1353}
1354
1355static void ipv4_dst_destroy(struct dst_entry *dst)
1356{
1357	struct rtable *rt = (struct rtable *) dst;
1358
1359	if (!list_empty(&rt->rt_uncached)) {
1360		struct uncached_list *ul = rt->rt_uncached_list;
1361
1362		spin_lock_bh(&ul->lock);
1363		list_del(&rt->rt_uncached);
1364		spin_unlock_bh(&ul->lock);
1365	}
1366}
1367
1368void rt_flush_dev(struct net_device *dev)
1369{
1370	struct net *net = dev_net(dev);
1371	struct rtable *rt;
1372	int cpu;
1373
1374	for_each_possible_cpu(cpu) {
1375		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1376
1377		spin_lock_bh(&ul->lock);
1378		list_for_each_entry(rt, &ul->head, rt_uncached) {
1379			if (rt->dst.dev != dev)
1380				continue;
1381			rt->dst.dev = net->loopback_dev;
1382			dev_hold(rt->dst.dev);
1383			dev_put(dev);
 
1384		}
1385		spin_unlock_bh(&ul->lock);
1386	}
1387}
1388
1389static bool rt_cache_valid(const struct rtable *rt)
1390{
1391	return	rt &&
1392		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1393		!rt_is_expired(rt);
1394}
1395
1396static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1397			   const struct fib_result *res,
1398			   struct fib_nh_exception *fnhe,
1399			   struct fib_info *fi, u16 type, u32 itag)
 
1400{
1401	bool cached = false;
1402
1403	if (fi) {
1404		struct fib_nh *nh = &FIB_RES_NH(*res);
1405
1406		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1407			rt->rt_gateway = nh->nh_gw;
1408			rt->rt_uses_gateway = 1;
1409		}
1410		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 
 
1411#ifdef CONFIG_IP_ROUTE_CLASSID
1412		rt->dst.tclassid = nh->nh_tclassid;
1413#endif
1414		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1415		if (unlikely(fnhe))
1416			cached = rt_bind_exception(rt, fnhe, daddr);
1417		else if (!(rt->dst.flags & DST_NOCACHE))
1418			cached = rt_cache_route(nh, rt);
1419		if (unlikely(!cached)) {
1420			/* Routes we intend to cache in nexthop exception or
1421			 * FIB nexthop have the DST_NOCACHE bit clear.
1422			 * However, if we are unsuccessful at storing this
1423			 * route into the cache we really need to set it.
1424			 */
1425			rt->dst.flags |= DST_NOCACHE;
1426			if (!rt->rt_gateway)
1427				rt->rt_gateway = daddr;
 
1428			rt_add_uncached_list(rt);
1429		}
1430	} else
1431		rt_add_uncached_list(rt);
1432
1433#ifdef CONFIG_IP_ROUTE_CLASSID
1434#ifdef CONFIG_IP_MULTIPLE_TABLES
1435	set_class_tag(rt, res->tclassid);
1436#endif
1437	set_class_tag(rt, itag);
1438#endif
1439}
1440
1441struct rtable *rt_dst_alloc(struct net_device *dev,
1442			    unsigned int flags, u16 type,
1443			    bool nopolicy, bool noxfrm, bool will_cache)
1444{
1445	struct rtable *rt;
1446
1447	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1448		       (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1449		       (nopolicy ? DST_NOPOLICY : 0) |
1450		       (noxfrm ? DST_NOXFRM : 0));
1451
1452	if (rt) {
1453		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1454		rt->rt_flags = flags;
1455		rt->rt_type = type;
1456		rt->rt_is_input = 0;
1457		rt->rt_iif = 0;
1458		rt->rt_pmtu = 0;
1459		rt->rt_gateway = 0;
1460		rt->rt_uses_gateway = 0;
1461		rt->rt_table_id = 0;
1462		INIT_LIST_HEAD(&rt->rt_uncached);
1463
1464		rt->dst.output = ip_output;
1465		if (flags & RTCF_LOCAL)
1466			rt->dst.input = ip_local_deliver;
1467	}
1468
1469	return rt;
1470}
1471EXPORT_SYMBOL(rt_dst_alloc);
1472
1473/* called in rcu_read_lock() section */
1474static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1475				u8 tos, struct net_device *dev, int our)
 
1476{
1477	struct rtable *rth;
1478	struct in_device *in_dev = __in_dev_get_rcu(dev);
1479	unsigned int flags = RTCF_MULTICAST;
1480	u32 itag = 0;
1481	int err;
1482
1483	/* Primary sanity checks. */
1484
1485	if (!in_dev)
1486		return -EINVAL;
1487
1488	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1489	    skb->protocol != htons(ETH_P_IP))
1490		goto e_inval;
1491
1492	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1493		goto e_inval;
1494
1495	if (ipv4_is_zeronet(saddr)) {
1496		if (!ipv4_is_local_multicast(daddr))
1497			goto e_inval;
 
1498	} else {
1499		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1500					  in_dev, &itag);
1501		if (err < 0)
1502			goto e_err;
1503	}
1504	if (our)
1505		flags |= RTCF_LOCAL;
1506
1507	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1508			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1509	if (!rth)
1510		goto e_nobufs;
1511
1512#ifdef CONFIG_IP_ROUTE_CLASSID
1513	rth->dst.tclassid = itag;
1514#endif
1515	rth->dst.output = ip_rt_bug;
1516	rth->rt_is_input= 1;
1517
1518#ifdef CONFIG_IP_MROUTE
1519	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1520		rth->dst.input = ip_mr_input;
1521#endif
1522	RT_CACHE_STAT_INC(in_slow_mc);
1523
 
1524	skb_dst_set(skb, &rth->dst);
1525	return 0;
1526
1527e_nobufs:
1528	return -ENOBUFS;
1529e_inval:
1530	return -EINVAL;
1531e_err:
1532	return err;
1533}
1534
1535
1536static void ip_handle_martian_source(struct net_device *dev,
1537				     struct in_device *in_dev,
1538				     struct sk_buff *skb,
1539				     __be32 daddr,
1540				     __be32 saddr)
1541{
1542	RT_CACHE_STAT_INC(in_martian_src);
1543#ifdef CONFIG_IP_ROUTE_VERBOSE
1544	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1545		/*
1546		 *	RFC1812 recommendation, if source is martian,
1547		 *	the only hint is MAC header.
1548		 */
1549		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1550			&daddr, &saddr, dev->name);
1551		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1552			print_hex_dump(KERN_WARNING, "ll header: ",
1553				       DUMP_PREFIX_OFFSET, 16, 1,
1554				       skb_mac_header(skb),
1555				       dev->hard_header_len, true);
1556		}
1557	}
1558#endif
1559}
1560
1561static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1562{
1563	struct fnhe_hash_bucket *hash;
1564	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1565	u32 hval = fnhe_hashfun(daddr);
1566
1567	spin_lock_bh(&fnhe_lock);
1568
1569	hash = rcu_dereference_protected(nh->nh_exceptions,
1570					 lockdep_is_held(&fnhe_lock));
1571	hash += hval;
1572
1573	fnhe_p = &hash->chain;
1574	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1575	while (fnhe) {
1576		if (fnhe->fnhe_daddr == daddr) {
1577			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1578				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1579			fnhe_flush_routes(fnhe);
1580			kfree_rcu(fnhe, rcu);
1581			break;
1582		}
1583		fnhe_p = &fnhe->fnhe_next;
1584		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1585						 lockdep_is_held(&fnhe_lock));
1586	}
1587
1588	spin_unlock_bh(&fnhe_lock);
1589}
1590
1591/* called in rcu_read_lock() section */
1592static int __mkroute_input(struct sk_buff *skb,
1593			   const struct fib_result *res,
1594			   struct in_device *in_dev,
1595			   __be32 daddr, __be32 saddr, u32 tos)
1596{
 
 
1597	struct fib_nh_exception *fnhe;
1598	struct rtable *rth;
1599	int err;
1600	struct in_device *out_dev;
1601	bool do_cache;
1602	u32 itag = 0;
1603
1604	/* get a working reference to the output device */
1605	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1606	if (!out_dev) {
1607		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1608		return -EINVAL;
1609	}
1610
1611	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1612				  in_dev->dev, in_dev, &itag);
1613	if (err < 0) {
1614		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1615					 saddr);
1616
1617		goto cleanup;
1618	}
1619
1620	do_cache = res->fi && !itag;
1621	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1622	    skb->protocol == htons(ETH_P_IP) &&
1623	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1624	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1625		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1626
1627	if (skb->protocol != htons(ETH_P_IP)) {
1628		/* Not IP (i.e. ARP). Do not create route, if it is
1629		 * invalid for proxy arp. DNAT routes are always valid.
1630		 *
1631		 * Proxy arp feature have been extended to allow, ARP
1632		 * replies back to the same interface, to support
1633		 * Private VLAN switch technologies. See arp.c.
1634		 */
1635		if (out_dev == in_dev &&
1636		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1637			err = -EINVAL;
1638			goto cleanup;
1639		}
1640	}
1641
1642	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1643	if (do_cache) {
1644		if (fnhe) {
1645			rth = rcu_dereference(fnhe->fnhe_rth_input);
1646			if (rth && rth->dst.expires &&
1647			    time_after(jiffies, rth->dst.expires)) {
1648				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1649				fnhe = NULL;
1650			} else {
1651				goto rt_cache;
1652			}
1653		}
1654
1655		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1656
1657rt_cache:
1658		if (rt_cache_valid(rth)) {
1659			skb_dst_set_noref(skb, &rth->dst);
1660			goto out;
1661		}
1662	}
1663
1664	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1665			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1666			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1667	if (!rth) {
1668		err = -ENOBUFS;
1669		goto cleanup;
1670	}
1671
1672	rth->rt_is_input = 1;
1673	if (res->table)
1674		rth->rt_table_id = res->table->tb_id;
1675	RT_CACHE_STAT_INC(in_slow_tot);
1676
1677	rth->dst.input = ip_forward;
1678
1679	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1680	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1681		rth->dst.lwtstate->orig_output = rth->dst.output;
1682		rth->dst.output = lwtunnel_output;
1683	}
1684	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1685		rth->dst.lwtstate->orig_input = rth->dst.input;
1686		rth->dst.input = lwtunnel_input;
1687	}
1688	skb_dst_set(skb, &rth->dst);
1689out:
1690	err = 0;
1691 cleanup:
1692	return err;
1693}
1694
1695#ifdef CONFIG_IP_ROUTE_MULTIPATH
1696
1697/* To make ICMP packets follow the right flow, the multipath hash is
1698 * calculated from the inner IP addresses in reverse order.
1699 */
1700static int ip_multipath_icmp_hash(struct sk_buff *skb)
 
1701{
1702	const struct iphdr *outer_iph = ip_hdr(skb);
1703	struct icmphdr _icmph;
 
1704	const struct icmphdr *icmph;
1705	struct iphdr _inner_iph;
1706	const struct iphdr *inner_iph;
1707
1708	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1709		goto standard_hash;
1710
1711	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1712				   &_icmph);
1713	if (!icmph)
1714		goto standard_hash;
1715
1716	if (icmph->type != ICMP_DEST_UNREACH &&
1717	    icmph->type != ICMP_REDIRECT &&
1718	    icmph->type != ICMP_TIME_EXCEEDED &&
1719	    icmph->type != ICMP_PARAMETERPROB) {
1720		goto standard_hash;
1721	}
1722
1723	inner_iph = skb_header_pointer(skb,
1724				       outer_iph->ihl * 4 + sizeof(_icmph),
1725				       sizeof(_inner_iph), &_inner_iph);
1726	if (!inner_iph)
1727		goto standard_hash;
1728
1729	return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1730
1731standard_hash:
1732	return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1733}
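
/*
 * Simplified sketch (standalone C, toy hash instead of fib_multipath_hash):
 * an ICMP error is forwarded in the reverse direction of the flow that
 * triggered it, so hashing the embedded (inner) header with its addresses
 * swapped reproduces the hash of ordinary traffic heading back to the
 * original sender, and the error takes the same nexthop as that traffic.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t demo_flow_hash(uint32_t a, uint32_t b)
{
	return (a * 2654435761u) ^ (b * 2246822519u);	/* stand-in only */
}

int main(void)
{
	uint32_t host = 0x0a000001;	/* 10.0.0.1, original sender      */
	uint32_t server = 0x08080808;	/* 8.8.8.8, original destination  */
	uint32_t router = 0xc0a80101;	/* middlebox emitting the error   */

	/* ordinary traffic flowing back toward the host */
	uint32_t flow = demo_flow_hash(server, host);

	/* the ICMP error's outer header is router -> host, but its payload
	 * embeds the original host -> server header; hashing those inner
	 * addresses in reverse order matches the flow hash above */
	uint32_t by_outer = demo_flow_hash(router, host);
	uint32_t by_inner_reversed = demo_flow_hash(server, host);

	printf("outer-header hash matches the flow:   %s\n",
	       flow == by_outer ? "yes" : "no");
	printf("reversed-inner hash matches the flow: %s\n",
	       flow == by_inner_reversed ? "yes" : "no");
	return 0;
}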
1734
1735#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1736
1737static int ip_mkroute_input(struct sk_buff *skb,
1738			    struct fib_result *res,
1739			    const struct flowi4 *fl4,
1740			    struct in_device *in_dev,
1741			    __be32 daddr, __be32 saddr, u32 tos)
 
1742{
1743#ifdef CONFIG_IP_ROUTE_MULTIPATH
1744	if (res->fi && res->fi->fib_nhs > 1) {
1745		int h;
1746
1747		if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1748			h = ip_multipath_icmp_hash(skb);
1749		else
1750			h = fib_multipath_hash(saddr, daddr);
1751		fib_select_multipath(res, h);
 
1752	}
1753#endif
1754
1755	/* create a routing cache entry */
1756	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1757}
1758
1759/*
1760 *	NOTE. We drop all packets that have a local source
1761 *	address, because every properly looped-back packet
1762 *	must already have the correct destination attached by the output routine.
1763 *
1764 *	This approach solves two big problems:
1765 *	1. Non-simplex devices are handled properly.
1766 *	2. IP spoofing attempts are filtered with a 100% guarantee.
1767 *	called with rcu_read_lock()
1768 */
1769
1770static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1771			       u8 tos, struct net_device *dev)
 
1772{
1773	struct fib_result res;
1774	struct in_device *in_dev = __in_dev_get_rcu(dev);
 
 
1775	struct ip_tunnel_info *tun_info;
1776	struct flowi4	fl4;
1777	unsigned int	flags = 0;
1778	u32		itag = 0;
1779	struct rtable	*rth;
1780	int		err = -EINVAL;
1781	struct net    *net = dev_net(dev);
1782	bool do_cache;
1783
1784	/* IP on this device is disabled. */
1785
1786	if (!in_dev)
1787		goto out;
1788
1789	/* Check for the weirdest martians, which may not be detected
1790	   by fib_lookup.
1791	 */
1792
1793	tun_info = skb_tunnel_info(skb);
1794	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1795		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1796	else
1797		fl4.flowi4_tun_key.tun_id = 0;
1798	skb_dst_drop(skb);
1799
1800	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1801		goto martian_source;
1802
1803	res.fi = NULL;
1804	res.table = NULL;
1805	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1806		goto brd_input;
1807
1808	/* Accept zero addresses only for limited broadcast;
1809	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1810	 */
1811	if (ipv4_is_zeronet(saddr))
1812		goto martian_source;
1813
1814	if (ipv4_is_zeronet(daddr))
1815		goto martian_destination;
1816
1817	/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1818	 * and call it once if daddr or/and saddr are loopback addresses
1819	 */
1820	if (ipv4_is_loopback(daddr)) {
1821		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1822			goto martian_destination;
1823	} else if (ipv4_is_loopback(saddr)) {
1824		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1825			goto martian_source;
1826	}
1827
1828	/*
1829	 *	Now we are ready to route packet.
1830	 */
 
1831	fl4.flowi4_oif = 0;
1832	fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1833	fl4.flowi4_mark = skb->mark;
1834	fl4.flowi4_tos = tos;
1835	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1836	fl4.flowi4_flags = 0;
1837	fl4.daddr = daddr;
1838	fl4.saddr = saddr;
1839	err = fib_lookup(net, &fl4, &res, 0);
1840	if (err != 0) {
1841		if (!IN_DEV_FORWARD(in_dev))
1842			err = -EHOSTUNREACH;
1843		goto no_route;
1844	}
1845
1846	if (res.type == RTN_BROADCAST)
1847		goto brd_input;
 
1848
1849	if (res.type == RTN_LOCAL) {
1850		err = fib_validate_source(skb, saddr, daddr, tos,
1851					  0, dev, in_dev, &itag);
1852		if (err < 0)
1853			goto martian_source;
1854		goto local_input;
1855	}
1856
1857	if (!IN_DEV_FORWARD(in_dev)) {
1858		err = -EHOSTUNREACH;
1859		goto no_route;
1860	}
1861	if (res.type != RTN_UNICAST)
1862		goto martian_destination;
1863
1864	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
 
1865out:	return err;
1866
1867brd_input:
1868	if (skb->protocol != htons(ETH_P_IP))
1869		goto e_inval;
1870
1871	if (!ipv4_is_zeronet(saddr)) {
1872		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1873					  in_dev, &itag);
1874		if (err < 0)
1875			goto martian_source;
1876	}
1877	flags |= RTCF_BROADCAST;
1878	res.type = RTN_BROADCAST;
1879	RT_CACHE_STAT_INC(in_brd);
1880
1881local_input:
1882	do_cache = false;
1883	if (res.fi) {
1884		if (!itag) {
1885			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1886			if (rt_cache_valid(rth)) {
1887				skb_dst_set_noref(skb, &rth->dst);
1888				err = 0;
1889				goto out;
1890			}
1891			do_cache = true;
 
 
1892		}
1893	}
1894
1895	rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1896			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1897	if (!rth)
1898		goto e_nobufs;
1899
1900	rth->dst.output= ip_rt_bug;
1901#ifdef CONFIG_IP_ROUTE_CLASSID
1902	rth->dst.tclassid = itag;
1903#endif
1904	rth->rt_is_input = 1;
1905	if (res.table)
1906		rth->rt_table_id = res.table->tb_id;
1907
1908	RT_CACHE_STAT_INC(in_slow_tot);
1909	if (res.type == RTN_UNREACHABLE) {
1910		rth->dst.input= ip_error;
1911		rth->dst.error= -err;
1912		rth->rt_flags 	&= ~RTCF_LOCAL;
1913	}
 
1914	if (do_cache) {
1915		if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1916			rth->dst.flags |= DST_NOCACHE;
1917			rt_add_uncached_list(rth);
1918		}
1919	}
1920	skb_dst_set(skb, &rth->dst);
1921	err = 0;
1922	goto out;
1923
1924no_route:
1925	RT_CACHE_STAT_INC(in_no_route);
1926	res.type = RTN_UNREACHABLE;
1927	res.fi = NULL;
1928	res.table = NULL;
1929	goto local_input;
1930
1931	/*
1932	 *	Do not cache martian addresses: they should be logged (RFC1812)
1933	 */
1934martian_destination:
1935	RT_CACHE_STAT_INC(in_martian_dst);
1936#ifdef CONFIG_IP_ROUTE_VERBOSE
1937	if (IN_DEV_LOG_MARTIANS(in_dev))
1938		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1939				     &daddr, &saddr, dev->name);
1940#endif
1941
1942e_inval:
1943	err = -EINVAL;
1944	goto out;
1945
1946e_nobufs:
1947	err = -ENOBUFS;
1948	goto out;
1949
1950martian_source:
1951	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1952	goto out;
1953}
1954
1955int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1956			 u8 tos, struct net_device *dev)
 
1957{
1958	int res;
1959
1960	rcu_read_lock();
1961
1962	/* Multicast recognition logic is moved from the route cache to here.
1963	   The problem was that too many Ethernet cards have broken/missing
1964	   hardware multicast filters :-( As a result, a host on a multicast
1965	   network acquires a lot of useless route cache entries, e.g. from
1966	   SDR messages from all over the world. Now we try to get rid of them.
1967	   Really, provided the software IP multicast filter is organized
1968	   reasonably (at least, hashed), it does not result in a slowdown
1969	   compared with route cache reject entries.
1970	   Note that multicast routers are not affected, because a
1971	   route cache entry is created eventually.
1972	 */
1973	if (ipv4_is_multicast(daddr)) {
1974		struct in_device *in_dev = __in_dev_get_rcu(dev);
1975
1976		if (in_dev) {
1977			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1978						  ip_hdr(skb)->protocol);
1979			if (our
1980#ifdef CONFIG_IP_MROUTE
1981				||
1982			    (!ipv4_is_local_multicast(daddr) &&
1983			     IN_DEV_MFORWARD(in_dev))
1984#endif
1985			   ) {
1986				int res = ip_route_input_mc(skb, daddr, saddr,
1987							    tos, dev, our);
1988				rcu_read_unlock();
1989				return res;
1990			}
1991		}
1992		rcu_read_unlock();
1993		return -EINVAL;
1994	}
1995	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1996	rcu_read_unlock();
1997	return res;
 
1998}
1999EXPORT_SYMBOL(ip_route_input_noref);
2000
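/* __mkroute_output() builds (and, when the nexthop allows it, caches) an
 * output rtable for a fib_result that the caller has already resolved.
 * A minimal sketch of how the resolver below uses it, mirroring
 * __ip_route_output_key_hash():
 *
 *	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
 *	if (IS_ERR(rth))
 *		return rth;
 */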
2001/* called with rcu_read_lock() */
2002static struct rtable *__mkroute_output(const struct fib_result *res,
2003				       const struct flowi4 *fl4, int orig_oif,
2004				       struct net_device *dev_out,
2005				       unsigned int flags)
2006{
2007	struct fib_info *fi = res->fi;
2008	struct fib_nh_exception *fnhe;
2009	struct in_device *in_dev;
2010	u16 type = res->type;
2011	struct rtable *rth;
2012	bool do_cache;
2013
2014	in_dev = __in_dev_get_rcu(dev_out);
2015	if (!in_dev)
2016		return ERR_PTR(-EINVAL);
2017
2018	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2019		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2020			return ERR_PTR(-EINVAL);
2021
2022	if (ipv4_is_lbcast(fl4->daddr))
2023		type = RTN_BROADCAST;
2024	else if (ipv4_is_multicast(fl4->daddr))
2025		type = RTN_MULTICAST;
2026	else if (ipv4_is_zeronet(fl4->daddr))
2027		return ERR_PTR(-EINVAL);
2028
2029	if (dev_out->flags & IFF_LOOPBACK)
2030		flags |= RTCF_LOCAL;
2031
2032	do_cache = true;
2033	if (type == RTN_BROADCAST) {
2034		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2035		fi = NULL;
2036	} else if (type == RTN_MULTICAST) {
2037		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2038		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2039				     fl4->flowi4_proto))
2040			flags &= ~RTCF_LOCAL;
2041		else
2042			do_cache = false;
2043		/* If a multicast route does not exist, use
2044		 * the default one, but do not use a gateway in this case.
2045		 * Yes, it is a hack.
2046		 */
2047		if (fi && res->prefixlen < 4)
2048			fi = NULL;
2049	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2050		   (orig_oif != dev_out->ifindex)) {
2051		/* For local routes that require a particular output interface
2052		 * we do not want to cache the result.  Caching the result
2053		 * causes incorrect behaviour when there are multiple source
2054		 * addresses on the interface: if the intended recipient is
2055		 * waiting on that interface for the packet, it will never be
2056		 * received, because it will be delivered on the loopback
2057		 * interface and the IP_PKTINFO ipi_ifindex will be set to
2058		 * the loopback interface as well.
2059		 */
2060		fi = NULL;
2061	}
2062
2063	fnhe = NULL;
2064	do_cache &= fi != NULL;
2065	if (do_cache) {
 
2066		struct rtable __rcu **prth;
2067		struct fib_nh *nh = &FIB_RES_NH(*res);
2068
2069		fnhe = find_exception(nh, fl4->daddr);
 
 
2070		if (fnhe) {
2071			prth = &fnhe->fnhe_rth_output;
2072			rth = rcu_dereference(*prth);
2073			if (rth && rth->dst.expires &&
2074			    time_after(jiffies, rth->dst.expires)) {
2075				ip_del_fnhe(nh, fl4->daddr);
2076				fnhe = NULL;
2077			} else {
2078				goto rt_cache;
2079			}
 
2080		}
2081
2082		if (unlikely(fl4->flowi4_flags &
2083			     FLOWI_FLAG_KNOWN_NH &&
2084			     !(nh->nh_gw &&
2085			       nh->nh_scope == RT_SCOPE_LINK))) {
2086			do_cache = false;
2087			goto add;
2088		}
2089		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2090		rth = rcu_dereference(*prth);
2091
2092rt_cache:
2093		if (rt_cache_valid(rth)) {
2094			dst_hold(&rth->dst);
2095			return rth;
2096		}
2097	}
2098
2099add:
2100	rth = rt_dst_alloc(dev_out, flags, type,
2101			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2102			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2103			   do_cache);
2104	if (!rth)
2105		return ERR_PTR(-ENOBUFS);
2106
2107	rth->rt_iif	= orig_oif ? : 0;
2108	if (res->table)
2109		rth->rt_table_id = res->table->tb_id;
2110
2111	RT_CACHE_STAT_INC(out_slow_tot);
2112
2113	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2114		if (flags & RTCF_LOCAL &&
2115		    !(dev_out->flags & IFF_LOOPBACK)) {
2116			rth->dst.output = ip_mc_output;
2117			RT_CACHE_STAT_INC(out_slow_mc);
2118		}
2119#ifdef CONFIG_IP_MROUTE
2120		if (type == RTN_MULTICAST) {
2121			if (IN_DEV_MFORWARD(in_dev) &&
2122			    !ipv4_is_local_multicast(fl4->daddr)) {
2123				rth->dst.input = ip_mr_input;
2124				rth->dst.output = ip_mc_output;
2125			}
2126		}
2127#endif
2128	}
2129
2130	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2131	if (lwtunnel_output_redirect(rth->dst.lwtstate))
2132		rth->dst.output = lwtunnel_output;
2133
2134	return rth;
2135}
2136
2137/*
2138 * Major route resolver routine.
2139 */
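/*
 * Most callers reach this through the ip_route_output_key() /
 * ip_route_output_flow() wrappers.  A minimal sketch, assuming a caller
 * that only cares about the destination address and output interface:
 *
 *	struct flowi4 fl4 = {
 *		.daddr	    = daddr,
 *		.flowi4_oif = oif,
 *	};
 *	struct rtable *rt = __ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */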
2140
2141struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2142					  int mp_hash)
2143{
2144	struct net_device *dev_out = NULL;
2145	__u8 tos = RT_FL_TOS(fl4);
2146	unsigned int flags = 0;
2147	struct fib_result res;
 
 
2148	struct rtable *rth;
2149	int orig_oif;
2150	int err = -ENETUNREACH;
2151
2152	res.tclassid	= 0;
2153	res.fi		= NULL;
2154	res.table	= NULL;
2155
2156	orig_oif = fl4->flowi4_oif;
2157
2158	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2159	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2160	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2161			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2162
2163	rcu_read_lock();
2164	if (fl4->saddr) {
2165		rth = ERR_PTR(-EINVAL);
2166		if (ipv4_is_multicast(fl4->saddr) ||
2167		    ipv4_is_lbcast(fl4->saddr) ||
2168		    ipv4_is_zeronet(fl4->saddr))
2169			goto out;
2170
2171		/* I removed the check for oif == dev_out->oif here.
2172		   It was wrong for two reasons:
2173		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2174		      is assigned to multiple interfaces.
2175		   2. Moreover, we are allowed to send packets with the saddr
2176		      of another iface. --ANK
2177		 */
2178
2179		if (fl4->flowi4_oif == 0 &&
2180		    (ipv4_is_multicast(fl4->daddr) ||
2181		     ipv4_is_lbcast(fl4->daddr))) {
2182			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2183			dev_out = __ip_dev_find(net, fl4->saddr, false);
2184			if (!dev_out)
2185				goto out;
2186
2187			/* Special hack: the user can direct multicasts
2188			   and limited broadcasts via the necessary interface
2189			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2190			   This hack is not just for fun, it allows
2191			   vic, vat and friends to work.
2192			   They bind the socket to loopback, set the ttl to zero
2193			   and expect that it will work.
2194			   From the viewpoint of the routing cache they are broken,
2195			   because we are not allowed to build a multicast path
2196			   with a loopback source address (the routing cache
2197			   cannot know that the ttl is zero, so that the packet
2198			   will not leave this host and the route is valid).
2199			   Luckily, this hack is a good workaround.
2200			 */
2201
2202			fl4->flowi4_oif = dev_out->ifindex;
2203			goto make_route;
2204		}
2205
2206		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2207			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2208			if (!__ip_dev_find(net, fl4->saddr, false))
2209				goto out;
2210		}
2211	}
2212
2213
2214	if (fl4->flowi4_oif) {
2215		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2216		rth = ERR_PTR(-ENODEV);
2217		if (!dev_out)
2218			goto out;
2219
2220		/* RACE: Check return value of inet_select_addr instead. */
2221		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2222			rth = ERR_PTR(-ENETUNREACH);
2223			goto out;
2224		}
2225		if (ipv4_is_local_multicast(fl4->daddr) ||
2226		    ipv4_is_lbcast(fl4->daddr) ||
2227		    fl4->flowi4_proto == IPPROTO_IGMP) {
2228			if (!fl4->saddr)
2229				fl4->saddr = inet_select_addr(dev_out, 0,
2230							      RT_SCOPE_LINK);
2231			goto make_route;
2232		}
2233		if (!fl4->saddr) {
2234			if (ipv4_is_multicast(fl4->daddr))
2235				fl4->saddr = inet_select_addr(dev_out, 0,
2236							      fl4->flowi4_scope);
2237			else if (!fl4->daddr)
2238				fl4->saddr = inet_select_addr(dev_out, 0,
2239							      RT_SCOPE_HOST);
2240		}
2241
2242		rth = l3mdev_get_rtable(dev_out, fl4);
2243		if (rth)
2244			goto out;
2245	}
2246
2247	if (!fl4->daddr) {
2248		fl4->daddr = fl4->saddr;
2249		if (!fl4->daddr)
2250			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2251		dev_out = net->loopback_dev;
2252		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2253		res.type = RTN_LOCAL;
2254		flags |= RTCF_LOCAL;
2255		goto make_route;
2256	}
2257
2258	err = fib_lookup(net, fl4, &res, 0);
2259	if (err) {
2260		res.fi = NULL;
2261		res.table = NULL;
2262		if (fl4->flowi4_oif &&
2263		    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2264			/* Apparently, the routing tables are wrong. Assume
2265			   that the destination is on-link.
2266
2267			   WHY? DW.
2268			   Because we are allowed to send to an iface
2269			   even if it has NO routes and NO assigned
2270			   addresses. When oif is specified, the routing
2271			   tables are looked up with only one purpose:
2272			   to check whether the destination is gatewayed,
2273			   rather than direct. Moreover, if MSG_DONTROUTE is set,
2274			   we send the packet, ignoring both the routing tables
2275			   and the ifaddr state. --ANK
2276
2277
2278			   We could do this even if oif is unknown,
2279			   as IPv6 likely does, but we do not.
2280			 */
2281
2282			if (fl4->saddr == 0)
2283				fl4->saddr = inet_select_addr(dev_out, 0,
2284							      RT_SCOPE_LINK);
2285			res.type = RTN_UNICAST;
2286			goto make_route;
2287		}
2288		rth = ERR_PTR(err);
2289		goto out;
2290	}
2291
2292	if (res.type == RTN_LOCAL) {
2293		if (!fl4->saddr) {
2294			if (res.fi->fib_prefsrc)
2295				fl4->saddr = res.fi->fib_prefsrc;
2296			else
2297				fl4->saddr = fl4->daddr;
2298		}
2299		dev_out = net->loopback_dev;
2300		fl4->flowi4_oif = dev_out->ifindex;
2301		flags |= RTCF_LOCAL;
2302		goto make_route;
2303	}
2304
2305	fib_select_path(net, &res, fl4, mp_hash);
2306
2307	dev_out = FIB_RES_DEV(res);
2308	fl4->flowi4_oif = dev_out->ifindex;
2309
 
2310
2311make_route:
2312	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2313
2314out:
2315	rcu_read_unlock();
2316	return rth;
2317}
2318EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2319
2320static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2321{
2322	return NULL;
2323}
2324
2325static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2326{
2327	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2328
2329	return mtu ? : dst->dev->mtu;
2330}
2331
2332static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2333					  struct sk_buff *skb, u32 mtu)
2334{
2335}
2336
2337static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2338				       struct sk_buff *skb)
2339{
2340}
2341
2342static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2343					  unsigned long old)
2344{
2345	return NULL;
2346}
2347
2348static struct dst_ops ipv4_dst_blackhole_ops = {
2349	.family			=	AF_INET,
2350	.check			=	ipv4_blackhole_dst_check,
2351	.mtu			=	ipv4_blackhole_mtu,
2352	.default_advmss		=	ipv4_default_advmss,
2353	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2354	.redirect		=	ipv4_rt_blackhole_redirect,
2355	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2356	.neigh_lookup		=	ipv4_neigh_lookup,
2357};
2358
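/* Clone an existing route into a "blackhole" dst: the copy keeps the device
 * and routing metadata of the original, but its input/output handlers simply
 * discard packets and the ops above ignore PMTU updates and redirects.  The
 * xfrm lookup path uses this when it must return a dst without blocking.
 */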
2359struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2360{
2361	struct rtable *ort = (struct rtable *) dst_orig;
2362	struct rtable *rt;
2363
2364	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2365	if (rt) {
2366		struct dst_entry *new = &rt->dst;
2367
2368		new->__use = 1;
2369		new->input = dst_discard;
2370		new->output = dst_discard_out;
2371
2372		new->dev = ort->dst.dev;
2373		if (new->dev)
2374			dev_hold(new->dev);
2375
2376		rt->rt_is_input = ort->rt_is_input;
2377		rt->rt_iif = ort->rt_iif;
2378		rt->rt_pmtu = ort->rt_pmtu;
 
2379
2380		rt->rt_genid = rt_genid_ipv4(net);
2381		rt->rt_flags = ort->rt_flags;
2382		rt->rt_type = ort->rt_type;
2383		rt->rt_gateway = ort->rt_gateway;
2384		rt->rt_uses_gateway = ort->rt_uses_gateway;
2385
2386		INIT_LIST_HEAD(&rt->rt_uncached);
2387		dst_free(new);
 
 
2388	}
2389
2390	dst_release(dst_orig);
2391
2392	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2393}
2394
2395struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2396				    const struct sock *sk)
2397{
2398	struct rtable *rt = __ip_route_output_key(net, flp4);
2399
2400	if (IS_ERR(rt))
2401		return rt;
2402
2403	if (flp4->flowi4_proto)
2404		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2405							flowi4_to_flowi(flp4),
2406							sk, 0);
 
 
2407
2408	return rt;
2409}
2410EXPORT_SYMBOL_GPL(ip_route_output_flow);
2411
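/* Fill one RTM_NEWROUTE netlink message describing the route attached to
 * @skb: destination/source, table, oif/iif, gateway, mark, metrics and
 * cache info, in the layout consumed by rtnetlink clients such as
 * "ip route get".
 */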
2412static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2413			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2414			u32 seq, int event, int nowait, unsigned int flags)
 
 
2415{
2416	struct rtable *rt = skb_rtable(skb);
2417	struct rtmsg *r;
2418	struct nlmsghdr *nlh;
2419	unsigned long expires = 0;
2420	u32 error;
2421	u32 metrics[RTAX_MAX];
2422
2423	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2424	if (!nlh)
2425		return -EMSGSIZE;
2426
2427	r = nlmsg_data(nlh);
2428	r->rtm_family	 = AF_INET;
2429	r->rtm_dst_len	= 32;
2430	r->rtm_src_len	= 0;
2431	r->rtm_tos	= fl4->flowi4_tos;
2432	r->rtm_table	= table_id;
2433	if (nla_put_u32(skb, RTA_TABLE, table_id))
2434		goto nla_put_failure;
2435	r->rtm_type	= rt->rt_type;
2436	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2437	r->rtm_protocol = RTPROT_UNSPEC;
2438	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2439	if (rt->rt_flags & RTCF_NOTIFY)
2440		r->rtm_flags |= RTM_F_NOTIFY;
2441	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2442		r->rtm_flags |= RTCF_DOREDIRECT;
2443
2444	if (nla_put_in_addr(skb, RTA_DST, dst))
2445		goto nla_put_failure;
2446	if (src) {
2447		r->rtm_src_len = 32;
2448		if (nla_put_in_addr(skb, RTA_SRC, src))
2449			goto nla_put_failure;
2450	}
2451	if (rt->dst.dev &&
2452	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2453		goto nla_put_failure;
2454#ifdef CONFIG_IP_ROUTE_CLASSID
2455	if (rt->dst.tclassid &&
2456	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2457		goto nla_put_failure;
2458#endif
2459	if (!rt_is_input_route(rt) &&
2460	    fl4->saddr != src) {
2461		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2462			goto nla_put_failure;
2463	}
2464	if (rt->rt_uses_gateway &&
2465	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2466		goto nla_put_failure;
2467
2468	expires = rt->dst.expires;
2469	if (expires) {
2470		unsigned long now = jiffies;
2471
2472		if (time_before(now, expires))
2473			expires -= now;
2474		else
2475			expires = 0;
2476	}
2477
2478	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2479	if (rt->rt_pmtu && expires)
2480		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
 
 
2481	if (rtnetlink_put_metrics(skb, metrics) < 0)
2482		goto nla_put_failure;
2483
2484	if (fl4->flowi4_mark &&
2485	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2486		goto nla_put_failure;
 
2487
2488	error = rt->dst.error;
2489
2490	if (rt_is_input_route(rt)) {
2491#ifdef CONFIG_IP_MROUTE
2492		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2493		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2494			int err = ipmr_get_route(net, skb,
2495						 fl4->saddr, fl4->daddr,
2496						 r, nowait);
2497			if (err <= 0) {
2498				if (!nowait) {
 
2499					if (err == 0)
2500						return 0;
2501					goto nla_put_failure;
2502				} else {
2503					if (err == -EMSGSIZE)
2504						goto nla_put_failure;
2505					error = err;
2506				}
2507			}
2508		} else
2509#endif
2510			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2511				goto nla_put_failure;
 
2512	}
2513
 
 
2514	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2515		goto nla_put_failure;
2516
2517	nlmsg_end(skb, nlh);
2518	return 0;
2519
2520nla_put_failure:
2521	nlmsg_cancel(skb, nlh);
2522	return -EMSGSIZE;
2523}
2524
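/* Handle RTM_GETROUTE requests (e.g. "ip route get 8.8.8.8" from iproute2):
 * when RTA_IIF is supplied the route is resolved as if the packet had been
 * received on that interface (ip_route_input), otherwise an output lookup
 * is done (ip_route_output_key), and the result is reported back to the
 * requester via rt_fill_info().
 */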
2525static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2526{
2527	struct net *net = sock_net(in_skb->sk);
2528	struct rtmsg *rtm;
2529	struct nlattr *tb[RTA_MAX+1];
2530	struct rtable *rt = NULL;
2531	struct flowi4 fl4;
 
 
2532	__be32 dst = 0;
2533	__be32 src = 0;
 
2534	u32 iif;
2535	int err;
2536	int mark;
2537	struct sk_buff *skb;
2538	u32 table_id = RT_TABLE_MAIN;
2539
2540	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2541	if (err < 0)
2542		goto errout;
2543
2544	rtm = nlmsg_data(nlh);
2545
2546	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2547	if (!skb) {
2548		err = -ENOBUFS;
2549		goto errout;
 
2550	}
2551
2552	/* Reserve room for dummy headers; this skb can pass
2553	   through a good chunk of the routing engine.
2554	 */
2555	skb_reset_mac_header(skb);
2556	skb_reset_network_header(skb);
2557
2558	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2559	ip_hdr(skb)->protocol = IPPROTO_ICMP;
2560	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2561
2562	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2563	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2564	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2565	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2566
2567	memset(&fl4, 0, sizeof(fl4));
2568	fl4.daddr = dst;
2569	fl4.saddr = src;
2570	fl4.flowi4_tos = rtm->rtm_tos;
2571	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2572	fl4.flowi4_mark = mark;
2573
2574	if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2575		fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2576
2577	if (iif) {
2578		struct net_device *dev;
2579
2580		dev = __dev_get_by_index(net, iif);
2581		if (!dev) {
2582			err = -ENODEV;
2583			goto errout_free;
2584		}
2585
2586		skb->protocol	= htons(ETH_P_IP);
2587		skb->dev	= dev;
2588		skb->mark	= mark;
2589		local_bh_disable();
2590		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2591		local_bh_enable();
2592
2593		rt = skb_rtable(skb);
2594		if (err == 0 && rt->dst.error)
2595			err = -rt->dst.error;
2596	} else {
2597		rt = ip_route_output_key(net, &fl4);
2598
 
2599		err = 0;
2600		if (IS_ERR(rt))
2601			err = PTR_ERR(rt);
 
 
2602	}
2603
2604	if (err)
2605		goto errout_free;
2606
2607	skb_dst_set(skb, &rt->dst);
2608	if (rtm->rtm_flags & RTM_F_NOTIFY)
2609		rt->rt_flags |= RTCF_NOTIFY;
2610
2611	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2612		table_id = rt->rt_table_id;
2613
2614	err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2615			   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2616			   RTM_NEWROUTE, 0, 0);
2617	if (err < 0)
2618		goto errout_free;
 
 
2619
2620	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2621errout:
2622	return err;
2623
2624errout_free:
2625	kfree_skb(skb);
2626	goto errout;
2627}
2628
2629void ip_rt_multicast_event(struct in_device *in_dev)
2630{
2631	rt_cache_flush(dev_net(in_dev->dev));
2632}
2633
2634#ifdef CONFIG_SYSCTL
2635static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2636static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2637static int ip_rt_gc_elasticity __read_mostly	= 8;
 
2638
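/* Backs the write-only /proc/sys/net/ipv4/route/flush file: writing any
 * value (e.g. "echo 1 > /proc/sys/net/ipv4/route/flush") invalidates all
 * cached routes and nexthop exceptions by bumping the per-netns generation
 * counters; reading the file is not supported.
 */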
2639static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2640					void __user *buffer,
2641					size_t *lenp, loff_t *ppos)
2642{
2643	struct net *net = (struct net *)__ctl->extra1;
2644
2645	if (write) {
2646		rt_cache_flush(net);
2647		fnhe_genid_bump(net);
2648		return 0;
2649	}
2650
2651	return -EINVAL;
2652}
2653
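/* These tunables are exported under /proc/sys/net/ipv4/route/ (see the
 * register_net_sysctl() calls below and in ip_static_sysctl_init()).
 */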
2654static struct ctl_table ipv4_route_table[] = {
2655	{
2656		.procname	= "gc_thresh",
2657		.data		= &ipv4_dst_ops.gc_thresh,
2658		.maxlen		= sizeof(int),
2659		.mode		= 0644,
2660		.proc_handler	= proc_dointvec,
2661	},
2662	{
2663		.procname	= "max_size",
2664		.data		= &ip_rt_max_size,
2665		.maxlen		= sizeof(int),
2666		.mode		= 0644,
2667		.proc_handler	= proc_dointvec,
2668	},
2669	{
2670		/*  Deprecated. Use gc_min_interval_ms */
2671
2672		.procname	= "gc_min_interval",
2673		.data		= &ip_rt_gc_min_interval,
2674		.maxlen		= sizeof(int),
2675		.mode		= 0644,
2676		.proc_handler	= proc_dointvec_jiffies,
2677	},
2678	{
2679		.procname	= "gc_min_interval_ms",
2680		.data		= &ip_rt_gc_min_interval,
2681		.maxlen		= sizeof(int),
2682		.mode		= 0644,
2683		.proc_handler	= proc_dointvec_ms_jiffies,
2684	},
2685	{
2686		.procname	= "gc_timeout",
2687		.data		= &ip_rt_gc_timeout,
2688		.maxlen		= sizeof(int),
2689		.mode		= 0644,
2690		.proc_handler	= proc_dointvec_jiffies,
2691	},
2692	{
2693		.procname	= "gc_interval",
2694		.data		= &ip_rt_gc_interval,
2695		.maxlen		= sizeof(int),
2696		.mode		= 0644,
2697		.proc_handler	= proc_dointvec_jiffies,
2698	},
2699	{
2700		.procname	= "redirect_load",
2701		.data		= &ip_rt_redirect_load,
2702		.maxlen		= sizeof(int),
2703		.mode		= 0644,
2704		.proc_handler	= proc_dointvec,
2705	},
2706	{
2707		.procname	= "redirect_number",
2708		.data		= &ip_rt_redirect_number,
2709		.maxlen		= sizeof(int),
2710		.mode		= 0644,
2711		.proc_handler	= proc_dointvec,
2712	},
2713	{
2714		.procname	= "redirect_silence",
2715		.data		= &ip_rt_redirect_silence,
2716		.maxlen		= sizeof(int),
2717		.mode		= 0644,
2718		.proc_handler	= proc_dointvec,
2719	},
2720	{
2721		.procname	= "error_cost",
2722		.data		= &ip_rt_error_cost,
2723		.maxlen		= sizeof(int),
2724		.mode		= 0644,
2725		.proc_handler	= proc_dointvec,
2726	},
2727	{
2728		.procname	= "error_burst",
2729		.data		= &ip_rt_error_burst,
2730		.maxlen		= sizeof(int),
2731		.mode		= 0644,
2732		.proc_handler	= proc_dointvec,
2733	},
2734	{
2735		.procname	= "gc_elasticity",
2736		.data		= &ip_rt_gc_elasticity,
2737		.maxlen		= sizeof(int),
2738		.mode		= 0644,
2739		.proc_handler	= proc_dointvec,
2740	},
2741	{
2742		.procname	= "mtu_expires",
2743		.data		= &ip_rt_mtu_expires,
2744		.maxlen		= sizeof(int),
2745		.mode		= 0644,
2746		.proc_handler	= proc_dointvec_jiffies,
2747	},
2748	{
2749		.procname	= "min_pmtu",
2750		.data		= &ip_rt_min_pmtu,
2751		.maxlen		= sizeof(int),
2752		.mode		= 0644,
2753		.proc_handler	= proc_dointvec,
 
2754	},
2755	{
2756		.procname	= "min_adv_mss",
2757		.data		= &ip_rt_min_advmss,
2758		.maxlen		= sizeof(int),
2759		.mode		= 0644,
2760		.proc_handler	= proc_dointvec,
2761	},
2762	{ }
2763};
2764
2765static struct ctl_table ipv4_route_flush_table[] = {
2766	{
2767		.procname	= "flush",
2768		.maxlen		= sizeof(int),
2769		.mode		= 0200,
2770		.proc_handler	= ipv4_sysctl_rtcache_flush,
 
2771	},
2772	{ },
2773};
2774
2775static __net_init int sysctl_route_net_init(struct net *net)
2776{
2777	struct ctl_table *tbl;
 
2778
2779	tbl = ipv4_route_flush_table;
2780	if (!net_eq(net, &init_net)) {
2781		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
 
 
2782		if (!tbl)
2783			goto err_dup;
2784
2785		/* Don't export sysctls to unprivileged users */
2786		if (net->user_ns != &init_user_ns)
2787			tbl[0].procname = NULL;
2788	}
2789	tbl[0].extra1 = net;
2790
2791	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
 
2792	if (!net->ipv4.route_hdr)
2793		goto err_reg;
2794	return 0;
2795
2796err_reg:
2797	if (tbl != ipv4_route_flush_table)
2798		kfree(tbl);
2799err_dup:
2800	return -ENOMEM;
2801}
2802
2803static __net_exit void sysctl_route_net_exit(struct net *net)
2804{
2805	struct ctl_table *tbl;
2806
2807	tbl = net->ipv4.route_hdr->ctl_table_arg;
2808	unregister_net_sysctl_table(net->ipv4.route_hdr);
2809	BUG_ON(tbl == ipv4_route_flush_table);
2810	kfree(tbl);
2811}
2812
2813static __net_initdata struct pernet_operations sysctl_route_ops = {
2814	.init = sysctl_route_net_init,
2815	.exit = sysctl_route_net_exit,
2816};
2817#endif
2818
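/* Per-netns generation counters: rt_genid guards cached routes and
 * fnhe_genid guards nexthop exceptions.  Invalidation is lazy: flushing
 * merely bumps a counter, and stale entries are rejected later when their
 * recorded genid no longer matches (see rt_is_expired()).
 */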
2819static __net_init int rt_genid_init(struct net *net)
2820{
2821	atomic_set(&net->ipv4.rt_genid, 0);
2822	atomic_set(&net->fnhe_genid, 0);
2823	get_random_bytes(&net->ipv4.dev_addr_genid,
2824			 sizeof(net->ipv4.dev_addr_genid));
2825	return 0;
2826}
2827
2828static __net_initdata struct pernet_operations rt_genid_ops = {
2829	.init = rt_genid_init,
2830};
2831
2832static int __net_init ipv4_inetpeer_init(struct net *net)
2833{
2834	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2835
2836	if (!bp)
2837		return -ENOMEM;
2838	inet_peer_base_init(bp);
2839	net->ipv4.peers = bp;
2840	return 0;
2841}
2842
2843static void __net_exit ipv4_inetpeer_exit(struct net *net)
2844{
2845	struct inet_peer_base *bp = net->ipv4.peers;
2846
2847	net->ipv4.peers = NULL;
2848	inetpeer_invalidate_tree(bp);
2849	kfree(bp);
2850}
2851
2852static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2853	.init	=	ipv4_inetpeer_init,
2854	.exit	=	ipv4_inetpeer_exit,
2855};
2856
2857#ifdef CONFIG_IP_ROUTE_CLASSID
2858struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2859#endif /* CONFIG_IP_ROUTE_CLASSID */
2860
2861int __init ip_rt_init(void)
2862{
2863	int rc = 0;
2864	int cpu;
2865
2866	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
2867	if (!ip_idents)
2868		panic("IP: failed to allocate ip_idents\n");
2869
2870	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
2871
2872	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
2873	if (!ip_tstamps)
2874		panic("IP: failed to allocate ip_tstamps\n");
2875
2876	for_each_possible_cpu(cpu) {
2877		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2878
2879		INIT_LIST_HEAD(&ul->head);
 
2880		spin_lock_init(&ul->lock);
2881	}
2882#ifdef CONFIG_IP_ROUTE_CLASSID
2883	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2884	if (!ip_rt_acct)
2885		panic("IP: failed to allocate ip_rt_acct\n");
2886#endif
2887
2888	ipv4_dst_ops.kmem_cachep =
2889		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2890				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2891
2892	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2893
2894	if (dst_entries_init(&ipv4_dst_ops) < 0)
2895		panic("IP: failed to allocate ipv4_dst_ops counter\n");
2896
2897	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2898		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2899
2900	ipv4_dst_ops.gc_thresh = ~0;
2901	ip_rt_max_size = INT_MAX;
2902
2903	devinet_init();
2904	ip_fib_init();
2905
2906	if (ip_rt_proc_init())
2907		pr_err("Unable to create route proc files\n");
2908#ifdef CONFIG_XFRM
2909	xfrm_init();
2910	xfrm4_init();
2911#endif
2912	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
 
2913
2914#ifdef CONFIG_SYSCTL
2915	register_pernet_subsys(&sysctl_route_ops);
2916#endif
 
2917	register_pernet_subsys(&rt_genid_ops);
2918	register_pernet_subsys(&ipv4_inetpeer_ops);
2919	return rc;
2920}
2921
2922#ifdef CONFIG_SYSCTL
2923/*
2924 * We really need to sanitize the damn ipv4 init order, then all
2925 * this nonsense will go away.
2926 */
2927void __init ip_static_sysctl_init(void)
2928{
2929	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2930}
2931#endif
v6.9.4
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		ROUTE - implementation of the IP router.
   8 *
   9 * Authors:	Ross Biro
  10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14 *
  15 * Fixes:
  16 *		Alan Cox	:	Verify area fixes.
  17 *		Alan Cox	:	cli() protects routing changes
  18 *		Rui Oliveira	:	ICMP routing table updates
  19 *		(rco@di.uminho.pt)	Routing table insertion and update
  20 *		Linus Torvalds	:	Rewrote bits to be sensible
  21 *		Alan Cox	:	Added BSD route gw semantics
  22 *		Alan Cox	:	Super /proc >4K
  23 *		Alan Cox	:	MTU in route table
  24 *		Alan Cox	:	MSS actually. Also added the window
  25 *					clamper.
  26 *		Sam Lantinga	:	Fixed route matching in rt_del()
  27 *		Alan Cox	:	Routing cache support.
  28 *		Alan Cox	:	Removed compatibility cruft.
  29 *		Alan Cox	:	RTF_REJECT support.
  30 *		Alan Cox	:	TCP irtt support.
  31 *		Jonathan Naylor	:	Added Metric support.
  32 *	Miquel van Smoorenburg	:	BSD API fixes.
  33 *	Miquel van Smoorenburg	:	Metrics.
  34 *		Alan Cox	:	Use __u32 properly
  35 *		Alan Cox	:	Aligned routing errors more closely with BSD
  36 *					our system is still very different.
  37 *		Alan Cox	:	Faster /proc handling
  38 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  39 *					routing caches and better behaviour.
  40 *
  41 *		Olaf Erb	:	irtt wasn't being copied right.
  42 *		Bjorn Ekwall	:	Kerneld route support.
  43 *		Alan Cox	:	Multicast fixed (I hope)
  44 *		Pavel Krauz	:	Limited broadcast fixed
  45 *		Mike McLagan	:	Routing by source
  46 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  47 *					route.c and rewritten from scratch.
  48 *		Andi Kleen	:	Load-limit warning messages.
  49 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  50 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  51 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  52 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  53 *		Marc Boucher	:	routing by fwmark
  54 *	Robert Olsson		:	Added rt_cache statistics
  55 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  56 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  57 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  58 *	Ilia Sotnikov		:	Removed TOS from hash calculations
  59 */
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
 
  64#include <linux/bitops.h>
 
  65#include <linux/kernel.h>
  66#include <linux/mm.h>
  67#include <linux/memblock.h>
  68#include <linux/socket.h>
 
  69#include <linux/errno.h>
  70#include <linux/in.h>
  71#include <linux/inet.h>
  72#include <linux/netdevice.h>
  73#include <linux/proc_fs.h>
  74#include <linux/init.h>
  75#include <linux/skbuff.h>
  76#include <linux/inetdevice.h>
  77#include <linux/igmp.h>
  78#include <linux/pkt_sched.h>
  79#include <linux/mroute.h>
  80#include <linux/netfilter_ipv4.h>
  81#include <linux/random.h>
  82#include <linux/rcupdate.h>
 
  83#include <linux/slab.h>
  84#include <linux/jhash.h>
  85#include <net/dst.h>
  86#include <net/dst_metadata.h>
  87#include <net/inet_dscp.h>
  88#include <net/net_namespace.h>
 
  89#include <net/ip.h>
  90#include <net/route.h>
  91#include <net/inetpeer.h>
  92#include <net/sock.h>
  93#include <net/ip_fib.h>
  94#include <net/nexthop.h>
  95#include <net/tcp.h>
  96#include <net/icmp.h>
  97#include <net/xfrm.h>
  98#include <net/lwtunnel.h>
  99#include <net/netevent.h>
 100#include <net/rtnetlink.h>
 101#ifdef CONFIG_SYSCTL
 102#include <linux/sysctl.h>
 
 103#endif
 104#include <net/secure_seq.h>
 105#include <net/ip_tunnels.h>
 106
 107#include "fib_lookup.h"
 108
 109#define RT_FL_TOS(oldflp4) \
 110	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 111
 112#define RT_GC_TIMEOUT (300*HZ)
 113
 114#define DEFAULT_MIN_PMTU (512 + 20 + 20)
 115#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
 116#define DEFAULT_MIN_ADVMSS 256
 117static int ip_rt_max_size;
 118static int ip_rt_redirect_number __read_mostly	= 9;
 119static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 120static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 121static int ip_rt_error_cost __read_mostly	= HZ;
 122static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 123
 124static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 125
 126/*
 127 *	Interface to generic destination cache.
 128 */
 129
 130INDIRECT_CALLABLE_SCOPE
 131struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 132static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 133INDIRECT_CALLABLE_SCOPE
 134unsigned int		ipv4_mtu(const struct dst_entry *dst);
 135static void		ipv4_negative_advice(struct sock *sk,
 136					     struct dst_entry *dst);
 137static void		 ipv4_link_failure(struct sk_buff *skb);
 138static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 139					   struct sk_buff *skb, u32 mtu,
 140					   bool confirm_neigh);
 141static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 142					struct sk_buff *skb);
 143static void		ipv4_dst_destroy(struct dst_entry *dst);
 144
 145static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 146{
 147	WARN_ON(1);
 148	return NULL;
 149}
 150
 151static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 152					   struct sk_buff *skb,
 153					   const void *daddr);
 154static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 155
 156static struct dst_ops ipv4_dst_ops = {
 157	.family =		AF_INET,
 158	.check =		ipv4_dst_check,
 159	.default_advmss =	ipv4_default_advmss,
 160	.mtu =			ipv4_mtu,
 161	.cow_metrics =		ipv4_cow_metrics,
 162	.destroy =		ipv4_dst_destroy,
 163	.negative_advice =	ipv4_negative_advice,
 164	.link_failure =		ipv4_link_failure,
 165	.update_pmtu =		ip_rt_update_pmtu,
 166	.redirect =		ip_do_redirect,
 167	.local_out =		__ip_local_out,
 168	.neigh_lookup =		ipv4_neigh_lookup,
 169	.confirm_neigh =	ipv4_confirm_neigh,
 170};
 171
 172#define ECN_OR_COST(class)	TC_PRIO_##class
 173
 174const __u8 ip_tos2prio[16] = {
 175	TC_PRIO_BESTEFFORT,
 176	ECN_OR_COST(BESTEFFORT),
 177	TC_PRIO_BESTEFFORT,
 178	ECN_OR_COST(BESTEFFORT),
 179	TC_PRIO_BULK,
 180	ECN_OR_COST(BULK),
 181	TC_PRIO_BULK,
 182	ECN_OR_COST(BULK),
 183	TC_PRIO_INTERACTIVE,
 184	ECN_OR_COST(INTERACTIVE),
 185	TC_PRIO_INTERACTIVE,
 186	ECN_OR_COST(INTERACTIVE),
 187	TC_PRIO_INTERACTIVE_BULK,
 188	ECN_OR_COST(INTERACTIVE_BULK),
 189	TC_PRIO_INTERACTIVE_BULK,
 190	ECN_OR_COST(INTERACTIVE_BULK)
 191};
 192EXPORT_SYMBOL(ip_tos2prio);
 193
 194static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 195#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 196
 197#ifdef CONFIG_PROC_FS
 198static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 199{
 200	if (*pos)
 201		return NULL;
 202	return SEQ_START_TOKEN;
 203}
 204
 205static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 206{
 207	++*pos;
 208	return NULL;
 209}
 210
 211static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 212{
 213}
 214
 215static int rt_cache_seq_show(struct seq_file *seq, void *v)
 216{
 217	if (v == SEQ_START_TOKEN)
 218		seq_printf(seq, "%-127s\n",
 219			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 220			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 221			   "HHUptod\tSpecDst");
 222	return 0;
 223}
 224
 225static const struct seq_operations rt_cache_seq_ops = {
 226	.start  = rt_cache_seq_start,
 227	.next   = rt_cache_seq_next,
 228	.stop   = rt_cache_seq_stop,
 229	.show   = rt_cache_seq_show,
 230};
 231
 232static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 233{
 234	int cpu;
 235
 236	if (*pos == 0)
 237		return SEQ_START_TOKEN;
 238
 239	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 240		if (!cpu_possible(cpu))
 241			continue;
 242		*pos = cpu+1;
 243		return &per_cpu(rt_cache_stat, cpu);
 244	}
 245	return NULL;
 246}
 247
 248static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 249{
 250	int cpu;
 251
 252	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 253		if (!cpu_possible(cpu))
 254			continue;
 255		*pos = cpu+1;
 256		return &per_cpu(rt_cache_stat, cpu);
 257	}
 258	(*pos)++;
 259	return NULL;
 260
 261}
 262
 263static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 264{
 265
 266}
 267
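/* One line of /proc/net/stat/rt_cache per possible CPU, all fields in hex.
 * Counters that belonged to the long-removed routing cache are printed as
 * constant zeros, presumably to keep the format stable for old tools.
 */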
 268static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 269{
 270	struct rt_cache_stat *st = v;
 271
 272	if (v == SEQ_START_TOKEN) {
 273		seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 274		return 0;
 275	}
 276
 277	seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
 278			"%08x       %08x %08x     %08x    %08x %08x   "
 279			"%08x     %08x        %08x        %08x\n",
 280		   dst_entries_get_slow(&ipv4_dst_ops),
 281		   0, /* st->in_hit */
 282		   st->in_slow_tot,
 283		   st->in_slow_mc,
 284		   st->in_no_route,
 285		   st->in_brd,
 286		   st->in_martian_dst,
 287		   st->in_martian_src,
 288
 289		   0, /* st->out_hit */
 290		   st->out_slow_tot,
 291		   st->out_slow_mc,
 292
 293		   0, /* st->gc_total */
 294		   0, /* st->gc_ignored */
 295		   0, /* st->gc_goal_miss */
 296		   0, /* st->gc_dst_overflow */
 297		   0, /* st->in_hlist_search */
 298		   0  /* st->out_hlist_search */
 299		);
 300	return 0;
 301}
 302
 303static const struct seq_operations rt_cpu_seq_ops = {
 304	.start  = rt_cpu_seq_start,
 305	.next   = rt_cpu_seq_next,
 306	.stop   = rt_cpu_seq_stop,
 307	.show   = rt_cpu_seq_show,
 308};
 309
 310#ifdef CONFIG_IP_ROUTE_CLASSID
 311static int rt_acct_proc_show(struct seq_file *m, void *v)
 312{
 313	struct ip_rt_acct *dst, *src;
 314	unsigned int i, j;
 315
 316	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 317	if (!dst)
 318		return -ENOMEM;
 319
 320	for_each_possible_cpu(i) {
 321		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 322		for (j = 0; j < 256; j++) {
 323			dst[j].o_bytes   += src[j].o_bytes;
 324			dst[j].o_packets += src[j].o_packets;
 325			dst[j].i_bytes   += src[j].i_bytes;
 326			dst[j].i_packets += src[j].i_packets;
 327		}
 328	}
 329
 330	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 331	kfree(dst);
 332	return 0;
 333}
 334#endif
 335
 336static int __net_init ip_rt_do_proc_init(struct net *net)
 337{
 338	struct proc_dir_entry *pde;
 339
 340	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
 341			      &rt_cache_seq_ops);
 342	if (!pde)
 343		goto err1;
 344
 345	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
 346			      &rt_cpu_seq_ops);
 347	if (!pde)
 348		goto err2;
 349
 350#ifdef CONFIG_IP_ROUTE_CLASSID
 351	pde = proc_create_single("rt_acct", 0, net->proc_net,
 352			rt_acct_proc_show);
 353	if (!pde)
 354		goto err3;
 355#endif
 356	return 0;
 357
 358#ifdef CONFIG_IP_ROUTE_CLASSID
 359err3:
 360	remove_proc_entry("rt_cache", net->proc_net_stat);
 361#endif
 362err2:
 363	remove_proc_entry("rt_cache", net->proc_net);
 364err1:
 365	return -ENOMEM;
 366}
 367
 368static void __net_exit ip_rt_do_proc_exit(struct net *net)
 369{
 370	remove_proc_entry("rt_cache", net->proc_net_stat);
 371	remove_proc_entry("rt_cache", net->proc_net);
 372#ifdef CONFIG_IP_ROUTE_CLASSID
 373	remove_proc_entry("rt_acct", net->proc_net);
 374#endif
 375}
 376
 377static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 378	.init = ip_rt_do_proc_init,
 379	.exit = ip_rt_do_proc_exit,
 380};
 381
 382static int __init ip_rt_proc_init(void)
 383{
 384	return register_pernet_subsys(&ip_rt_proc_ops);
 385}
 386
 387#else
 388static inline int ip_rt_proc_init(void)
 389{
 390	return 0;
 391}
 392#endif /* CONFIG_PROC_FS */
 393
 394static inline bool rt_is_expired(const struct rtable *rth)
 395{
 396	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 397}
 398
 399void rt_cache_flush(struct net *net)
 400{
 401	rt_genid_bump_ipv4(net);
 402}
 403
 404static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 405					   struct sk_buff *skb,
 406					   const void *daddr)
 407{
 408	const struct rtable *rt = container_of(dst, struct rtable, dst);
 409	struct net_device *dev = dst->dev;
 
 
 410	struct neighbour *n;
 411
 412	rcu_read_lock();
 413
 414	if (likely(rt->rt_gw_family == AF_INET)) {
 415		n = ip_neigh_gw4(dev, rt->rt_gw4);
 416	} else if (rt->rt_gw_family == AF_INET6) {
 417		n = ip_neigh_gw6(dev, &rt->rt_gw6);
 418        } else {
 419		__be32 pkey;
 420
 421		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 422		n = ip_neigh_gw4(dev, pkey);
 423	}
 424
 425	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 426		n = NULL;
 427
 428	rcu_read_unlock();
 429
 430	return n;
 431}
 432
 433static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 434{
 435	const struct rtable *rt = container_of(dst, struct rtable, dst);
 436	struct net_device *dev = dst->dev;
 437	const __be32 *pkey = daddr;
 438
 439	if (rt->rt_gw_family == AF_INET) {
 440		pkey = (const __be32 *)&rt->rt_gw4;
 441	} else if (rt->rt_gw_family == AF_INET6) {
 442		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 443	} else if (!daddr ||
 444		 (rt->rt_flags &
 445		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 446		return;
 447	}
 448	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 449}
 450
 451/* Hash tables of size 2048..262144 depending on RAM size.
 452 * Each bucket uses 8 bytes.
 453 */
 454static u32 ip_idents_mask __read_mostly;
 455static atomic_t *ip_idents __read_mostly;
 456static u32 *ip_tstamps __read_mostly;
 457
 458/* In order to protect privacy, we add a perturbation to identifiers
 459 * if one generator is seldom used. This makes it hard for an attacker
 460 * to infer how many packets were sent between two points in time.
 461 */
 462static u32 ip_idents_reserve(u32 hash, int segs)
 463{
 464	u32 bucket, old, now = (u32)jiffies;
 465	atomic_t *p_id;
 466	u32 *p_tstamp;
 
 467	u32 delta = 0;
 468
 469	bucket = hash & ip_idents_mask;
 470	p_tstamp = ip_tstamps + bucket;
 471	p_id = ip_idents + bucket;
 472	old = READ_ONCE(*p_tstamp);
 473
 474	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 475		delta = get_random_u32_below(now - old);
 476
 477	/* If UBSAN reports an error here, please make sure your compiler
 478	 * supports -fno-strict-overflow before reporting it; that was a bug
 479	 * in UBSAN, and it has been fixed in GCC-8.
 480	 */
 481	return atomic_add_return(segs + delta, p_id) - segs;
 482}
 
 483
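/* Pick an IP ID for a datagram (or a train of @segs GSO segments): the
 * (daddr, saddr, protocol) triple is hashed with a per-netns siphash key to
 * choose one of the shared counters above, which is then advanced by @segs.
 */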
 484void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 485{
 
 486	u32 hash, id;
 487
 488	/* Note the following code is not safe, but this is okay. */
 489	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 490		get_random_bytes(&net->ipv4.ip_id_key,
 491				 sizeof(net->ipv4.ip_id_key));
 492
 493	hash = siphash_3u32((__force u32)iph->daddr,
 494			    (__force u32)iph->saddr,
 495			    iph->protocol,
 496			    &net->ipv4.ip_id_key);
 497	id = ip_idents_reserve(hash, segs);
 498	iph->id = htons(id);
 499}
 500EXPORT_SYMBOL(__ip_select_ident);
 501
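/* Fold the legacy RTO_ONLINK bit out of flowi4_tos: keep only the
 * IPTOS_RT_MASK bits and translate RTO_ONLINK into link scope.
 */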
 502static void ip_rt_fix_tos(struct flowi4 *fl4)
 503{
 504	__u8 tos = RT_FL_TOS(fl4);
 505
 506	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
 507	if (tos & RTO_ONLINK)
 508		fl4->flowi4_scope = RT_SCOPE_LINK;
 509}
 510
 511static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 512			     const struct sock *sk, const struct iphdr *iph,
 513			     int oif, __u8 tos, u8 prot, u32 mark,
 514			     int flow_flags)
 515{
 516	__u8 scope = RT_SCOPE_UNIVERSE;
 517
 518	if (sk) {
 519		oif = sk->sk_bound_dev_if;
 520		mark = READ_ONCE(sk->sk_mark);
 521		tos = ip_sock_rt_tos(sk);
 522		scope = ip_sock_rt_scope(sk);
 523		prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
 524						    sk->sk_protocol;
 525	}
 526
 527	flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
 528			   prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
 529			   sock_net_uid(net, sk));
 530}
 531
 532static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 533			       const struct sock *sk)
 534{
 535	const struct net *net = dev_net(skb->dev);
 536	const struct iphdr *iph = ip_hdr(skb);
 537	int oif = skb->dev->ifindex;
 
 538	u8 prot = iph->protocol;
 539	u32 mark = skb->mark;
 540	__u8 tos = iph->tos;
 541
 542	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 543}
 544
 545static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 546{
 547	const struct inet_sock *inet = inet_sk(sk);
 548	const struct ip_options_rcu *inet_opt;
 549	__be32 daddr = inet->inet_daddr;
 550
 551	rcu_read_lock();
 552	inet_opt = rcu_dereference(inet->inet_opt);
 553	if (inet_opt && inet_opt->opt.srr)
 554		daddr = inet_opt->opt.faddr;
 555	flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
 556			   ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
 557			   ip_sock_rt_scope(sk),
 558			   inet_test_bit(HDRINCL, sk) ?
 559				IPPROTO_RAW : sk->sk_protocol,
 560			   inet_sk_flowi_flags(sk),
 561			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 562	rcu_read_unlock();
 563}
 564
 565static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 566				 const struct sk_buff *skb)
 567{
 568	if (skb)
 569		build_skb_flow_key(fl4, skb, sk);
 570	else
 571		build_sk_flow_key(fl4, sk);
 572}
 573
 574static DEFINE_SPINLOCK(fnhe_lock);
 575
 576static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 577{
 578	struct rtable *rt;
 579
 580	rt = rcu_dereference(fnhe->fnhe_rth_input);
 581	if (rt) {
 582		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 583		dst_dev_put(&rt->dst);
 584		dst_release(&rt->dst);
 585	}
 586	rt = rcu_dereference(fnhe->fnhe_rth_output);
 587	if (rt) {
 588		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 589		dst_dev_put(&rt->dst);
 590		dst_release(&rt->dst);
 591	}
 592}
 593
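/* Called under fnhe_lock when a hash chain grows past its randomized depth
 * limit: find the exception with the oldest fnhe_stamp, flush its cached
 * routes, unlink it and free it after an RCU grace period.
 */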
 594static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 595{
 596	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 597	struct fib_nh_exception *fnhe, *oldest = NULL;
 598
 599	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 600		fnhe = rcu_dereference_protected(*fnhe_p,
 601						 lockdep_is_held(&fnhe_lock));
 602		if (!fnhe)
 603			break;
 604		if (!oldest ||
 605		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 606			oldest = fnhe;
 607			oldest_p = fnhe_p;
 608		}
 609	}
 610	fnhe_flush_routes(oldest);
 611	*oldest_p = oldest->fnhe_next;
 612	kfree_rcu(oldest, rcu);
 613}
 614
 615static u32 fnhe_hashfun(__be32 daddr)
 616{
 617	static siphash_aligned_key_t fnhe_hash_key;
 618	u64 hval;
 619
 620	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
 621	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
 622	return hash_64(hval, FNHE_HASH_SHIFT);
 623}
 624
 625static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 626{
 627	rt->rt_pmtu = fnhe->fnhe_pmtu;
 628	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 629	rt->dst.expires = fnhe->fnhe_expires;
 630
 631	if (fnhe->fnhe_gw) {
 632		rt->rt_flags |= RTCF_REDIRECTED;
 
 633		rt->rt_uses_gateway = 1;
 634		rt->rt_gw_family = AF_INET;
 635		rt->rt_gw4 = fnhe->fnhe_gw;
 636	}
 637}
 638
 639static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 640				  __be32 gw, u32 pmtu, bool lock,
 641				  unsigned long expires)
 642{
 643	struct fnhe_hash_bucket *hash;
 644	struct fib_nh_exception *fnhe;
 645	struct rtable *rt;
 646	u32 genid, hval;
 647	unsigned int i;
 648	int depth;
 649
 650	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 651	hval = fnhe_hashfun(daddr);
 652
 653	spin_lock_bh(&fnhe_lock);
 654
 655	hash = rcu_dereference(nhc->nhc_exceptions);
 656	if (!hash) {
 657		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 658		if (!hash)
 659			goto out_unlock;
 660		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 661	}
 662
 663	hash += hval;
 664
 665	depth = 0;
 666	for (fnhe = rcu_dereference(hash->chain); fnhe;
 667	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 668		if (fnhe->fnhe_daddr == daddr)
 669			break;
 670		depth++;
 671	}
 672
 673	if (fnhe) {
 674		if (fnhe->fnhe_genid != genid)
 675			fnhe->fnhe_genid = genid;
 676		if (gw)
 677			fnhe->fnhe_gw = gw;
 678		if (pmtu) {
 679			fnhe->fnhe_pmtu = pmtu;
 680			fnhe->fnhe_mtu_locked = lock;
 681		}
 682		fnhe->fnhe_expires = max(1UL, expires);
 683		/* Update all cached dsts too */
 684		rt = rcu_dereference(fnhe->fnhe_rth_input);
 685		if (rt)
 686			fill_route_from_fnhe(rt, fnhe);
 687		rt = rcu_dereference(fnhe->fnhe_rth_output);
 688		if (rt)
 689			fill_route_from_fnhe(rt, fnhe);
 690	} else {
 691		/* Randomize the max depth to avoid some side-channel attacks. */
 692		int max_depth = FNHE_RECLAIM_DEPTH +
 693				get_random_u32_below(FNHE_RECLAIM_DEPTH);
 694
 695		while (depth > max_depth) {
 696			fnhe_remove_oldest(hash);
 697			depth--;
 698		}
 699
 700		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 701		if (!fnhe)
 702			goto out_unlock;
 703
 704		fnhe->fnhe_next = hash->chain;
 705
 706		fnhe->fnhe_genid = genid;
 707		fnhe->fnhe_daddr = daddr;
 708		fnhe->fnhe_gw = gw;
 709		fnhe->fnhe_pmtu = pmtu;
 710		fnhe->fnhe_mtu_locked = lock;
 711		fnhe->fnhe_expires = max(1UL, expires);
 712
 713		rcu_assign_pointer(hash->chain, fnhe);
 714
 715		/* Exception created; mark the nexthop's cached routes stale,
 716		 * so anyone caching them rechecks whether this exception
 717		 * applies to them.
 718		 */
 719		rt = rcu_dereference(nhc->nhc_rth_input);
 720		if (rt)
 721			rt->dst.obsolete = DST_OBSOLETE_KILL;
 722
 723		for_each_possible_cpu(i) {
 724			struct rtable __rcu **prt;
 725
 726			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 727			rt = rcu_dereference(*prt);
 728			if (rt)
 729				rt->dst.obsolete = DST_OBSOLETE_KILL;
 730		}
 731	}
 732
 733	fnhe->fnhe_stamp = jiffies;
 734
 735out_unlock:
 736	spin_unlock_bh(&fnhe_lock);
 737}
 738
 739static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 740			     bool kill_route)
 741{
 742	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 743	__be32 old_gw = ip_hdr(skb)->saddr;
 744	struct net_device *dev = skb->dev;
 745	struct in_device *in_dev;
 746	struct fib_result res;
 747	struct neighbour *n;
 748	struct net *net;
 749
 750	switch (icmp_hdr(skb)->code & 7) {
 751	case ICMP_REDIR_NET:
 752	case ICMP_REDIR_NETTOS:
 753	case ICMP_REDIR_HOST:
 754	case ICMP_REDIR_HOSTTOS:
 755		break;
 756
 757	default:
 758		return;
 759	}
 760
 761	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 762		return;
 763
 764	in_dev = __in_dev_get_rcu(dev);
 765	if (!in_dev)
 766		return;
 767
 768	net = dev_net(dev);
 769	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 770	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 771	    ipv4_is_zeronet(new_gw))
 772		goto reject_redirect;
 773
 774	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 775		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 776			goto reject_redirect;
 777		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 778			goto reject_redirect;
 779	} else {
 780		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 781			goto reject_redirect;
 782	}
 783
 784	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
 785	if (!n)
 786		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 787	if (!IS_ERR(n)) {
 788		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 789			neigh_event_send(n, NULL);
 790		} else {
 791			if (fib_lookup(net, fl4, &res, 0) == 0) {
 792				struct fib_nh_common *nhc;
 793
 794				fib_select_path(net, &res, fl4, skb);
 795				nhc = FIB_RES_NHC(res);
 796				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 797						0, false,
 798						jiffies + ip_rt_gc_timeout);
 799			}
 800			if (kill_route)
 801				rt->dst.obsolete = DST_OBSOLETE_KILL;
 802			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 803		}
 804		neigh_release(n);
 805	}
 806	return;
 807
 808reject_redirect:
 809#ifdef CONFIG_IP_ROUTE_VERBOSE
 810	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 811		const struct iphdr *iph = (const struct iphdr *) skb->data;
 812		__be32 daddr = iph->daddr;
 813		__be32 saddr = iph->saddr;
 814
 815		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 816				     "  Advised path = %pI4 -> %pI4\n",
 817				     &old_gw, dev->name, &new_gw,
 818				     &saddr, &daddr);
 819	}
 820#endif
 821	;
 822}
 823
 824static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 825{
 826	struct rtable *rt;
 827	struct flowi4 fl4;
 828	const struct iphdr *iph = (const struct iphdr *) skb->data;
 829	struct net *net = dev_net(skb->dev);
 830	int oif = skb->dev->ifindex;
 
 831	u8 prot = iph->protocol;
 832	u32 mark = skb->mark;
 833	__u8 tos = iph->tos;
 834
 835	rt = dst_rtable(dst);
 836
 837	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 838	__ip_do_redirect(rt, skb, &fl4, true);
 839}
 840
 841static void ipv4_negative_advice(struct sock *sk,
 842				 struct dst_entry *dst)
 843{
 844	struct rtable *rt = dst_rtable(dst);
 
 845
 846	if ((dst->obsolete > 0) ||
 847	    (rt->rt_flags & RTCF_REDIRECTED) ||
 848	    rt->dst.expires)
 849		sk_dst_reset(sk);
 850}
 851
 852/*
 853 * Algorithm:
 854 *	1. The first ip_rt_redirect_number redirects are sent
 855 *	   with exponential backoff, then we stop sending them at all,
 856 *	   assuming that the host ignores our redirects.
 857 *	2. If we did not see packets requiring redirects
 858 *	   during ip_rt_redirect_silence, we assume that the host
 859 *	   forgot the redirected route and start sending redirects again.
 860 *
 861 * This algorithm is much cheaper and more intelligent than dumb load limiting
 862 * in icmp.c.
 863 *
 864 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 865 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 866 */
 867
 868void ip_rt_send_redirect(struct sk_buff *skb)
 869{
 870	struct rtable *rt = skb_rtable(skb);
 871	struct in_device *in_dev;
 872	struct inet_peer *peer;
 873	struct net *net;
 874	int log_martians;
 875	int vif;
 876
 877	rcu_read_lock();
 878	in_dev = __in_dev_get_rcu(rt->dst.dev);
 879	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 880		rcu_read_unlock();
 881		return;
 882	}
 883	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 884	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 885	rcu_read_unlock();
 886
 887	net = dev_net(rt->dst.dev);
 888	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 889	if (!peer) {
 890		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 891			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 892		return;
 893	}
 894
 895	/* No redirected packets during ip_rt_redirect_silence;
 896	 * reset the algorithm.
 897	 */
 898	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 899		peer->rate_tokens = 0;
 900		peer->n_redirects = 0;
 901	}
 902
 903	/* Too many ignored redirects; do not send anything.
 904	 * Set peer->rate_last to the time of the last seen redirected packet.
 905	 */
 906	if (peer->n_redirects >= ip_rt_redirect_number) {
 907		peer->rate_last = jiffies;
 908		goto out_put_peer;
 909	}
 910
 911	/* Check for load limit; set rate_last to the latest sent
 912	 * redirect.
 913	 */
 914	if (peer->n_redirects == 0 ||
 915	    time_after(jiffies,
 916		       (peer->rate_last +
 917			(ip_rt_redirect_load << peer->n_redirects)))) {
 918		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 919
 920		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 921		peer->rate_last = jiffies;
 922		++peer->n_redirects;
 923		if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
 924		    peer->n_redirects == ip_rt_redirect_number)
 925			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 926					     &ip_hdr(skb)->saddr, inet_iif(skb),
 927					     &ip_hdr(skb)->daddr, &gw);
 928	}
 929out_put_peer:
 930	inet_putpeer(peer);
 931}
 932
 933static int ip_error(struct sk_buff *skb)
 934{
 935	struct rtable *rt = skb_rtable(skb);
 936	struct net_device *dev = skb->dev;
 937	struct in_device *in_dev;
 938	struct inet_peer *peer;
 939	unsigned long now;
 940	struct net *net;
 941	SKB_DR(reason);
 942	bool send;
 943	int code;
 944
 945	if (netif_is_l3_master(skb->dev)) {
 946		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 947		if (!dev)
 948			goto out;
 949	}
 950
 951	in_dev = __in_dev_get_rcu(dev);
 952
 953	/* IP on this device is disabled. */
 954	if (!in_dev)
 955		goto out;
 956
 957	net = dev_net(rt->dst.dev);
 958	if (!IN_DEV_FORWARD(in_dev)) {
 959		switch (rt->dst.error) {
 960		case EHOSTUNREACH:
 961			SKB_DR_SET(reason, IP_INADDRERRORS);
 962			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 963			break;
 964
 965		case ENETUNREACH:
 966			SKB_DR_SET(reason, IP_INNOROUTES);
 967			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968			break;
 969		}
 970		goto out;
 971	}
 972
 973	switch (rt->dst.error) {
 974	case EINVAL:
 975	default:
 976		goto out;
 977	case EHOSTUNREACH:
 978		code = ICMP_HOST_UNREACH;
 979		break;
 980	case ENETUNREACH:
 981		code = ICMP_NET_UNREACH;
 982		SKB_DR_SET(reason, IP_INNOROUTES);
 983		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 984		break;
 985	case EACCES:
 986		code = ICMP_PKT_FILTERED;
 987		break;
 988	}
 989
 990	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 991			       l3mdev_master_ifindex(skb->dev), 1);
 992
 993	send = true;
 994	if (peer) {
 995		now = jiffies;
 996		peer->rate_tokens += now - peer->rate_last;
 997		if (peer->rate_tokens > ip_rt_error_burst)
 998			peer->rate_tokens = ip_rt_error_burst;
 999		peer->rate_last = now;
1000		if (peer->rate_tokens >= ip_rt_error_cost)
1001			peer->rate_tokens -= ip_rt_error_cost;
1002		else
1003			send = false;
1004		inet_putpeer(peer);
1005	}
1006	if (send)
1007		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1008
1009out:	kfree_skb_reason(skb, reason);
1010	return 0;
1011}
1012
1013static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014{
1015	struct dst_entry *dst = &rt->dst;
1016	struct net *net = dev_net(dst->dev);
1017	struct fib_result res;
1018	bool lock = false;
1019	u32 old_mtu;
1020
1021	if (ip_mtu_locked(dst))
1022		return;
1023
1024	old_mtu = ipv4_mtu(dst);
1025	if (old_mtu < mtu)
1026		return;
1027
1028	if (mtu < net->ipv4.ip_rt_min_pmtu) {
1029		lock = true;
1030		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
1031	}
1032
1033	if (rt->rt_pmtu == mtu && !lock &&
1034	    time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
1035		return;
1036
1037	rcu_read_lock();
1038	if (fib_lookup(net, fl4, &res, 0) == 0) {
1039		struct fib_nh_common *nhc;
1040
1041		fib_select_path(net, &res, fl4, NULL);
1042		nhc = FIB_RES_NHC(res);
1043		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1044				      jiffies + net->ipv4.ip_rt_mtu_expires);
1045	}
1046	rcu_read_unlock();
1047}
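/* Editorial note (an assumption, not from the original source): the learned
 * PMTU is not stored on the dst itself but as a per-nexthop exception (fnhe)
 * keyed by daddr, created or refreshed by update_or_create_fnhe() above; it
 * ages out after ip_rt_mtu_expires (a sysctl, commonly about 10 minutes),
 * after which the path reverts to the route/device MTU.
 */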
1048
1049static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1050			      struct sk_buff *skb, u32 mtu,
1051			      bool confirm_neigh)
1052{
1053	struct rtable *rt = dst_rtable(dst);
1054	struct flowi4 fl4;
1055
1056	ip_rt_build_flow_key(&fl4, sk, skb);
1057
1058	/* Don't make lookup fail for bridged encapsulations */
1059	if (skb && netif_is_any_bridge_port(skb->dev))
1060		fl4.flowi4_oif = 0;
1061
1062	__ip_rt_update_pmtu(rt, &fl4, mtu);
1063}
1064
1065void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1066		      int oif, u8 protocol)
1067{
1068	const struct iphdr *iph = (const struct iphdr *)skb->data;
1069	struct flowi4 fl4;
1070	struct rtable *rt;
1071	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1072
1073	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
1074			 0);
1075	rt = __ip_route_output_key(net, &fl4);
1076	if (!IS_ERR(rt)) {
1077		__ip_rt_update_pmtu(rt, &fl4, mtu);
1078		ip_rt_put(rt);
1079	}
1080}
1081EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
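/* Illustrative usage sketch (hypothetical caller, not part of this file):
 * a tunnel driver's ICMP error handler might propagate a "fragmentation
 * needed" report for the outer header roughly like this, assuming it has
 * the offending skb, the reported MTU in 'info' and its own protocol:
 *
 *	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH &&
 *	    icmp_hdr(skb)->code == ICMP_FRAG_NEEDED)
 *		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *				 0, IPPROTO_IPIP);
 */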
1082
1083static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1084{
1085	const struct iphdr *iph = (const struct iphdr *)skb->data;
1086	struct flowi4 fl4;
1087	struct rtable *rt;
1088
1089	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1090
1091	if (!fl4.flowi4_mark)
1092		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1093
1094	rt = __ip_route_output_key(sock_net(sk), &fl4);
1095	if (!IS_ERR(rt)) {
1096		__ip_rt_update_pmtu(rt, &fl4, mtu);
1097		ip_rt_put(rt);
1098	}
1099}
1100
1101void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1102{
1103	const struct iphdr *iph = (const struct iphdr *)skb->data;
1104	struct flowi4 fl4;
1105	struct rtable *rt;
1106	struct dst_entry *odst = NULL;
1107	bool new = false;
1108	struct net *net = sock_net(sk);
1109
1110	bh_lock_sock(sk);
1111
1112	if (!ip_sk_accept_pmtu(sk))
1113		goto out;
1114
1115	odst = sk_dst_get(sk);
1116
1117	if (sock_owned_by_user(sk) || !odst) {
1118		__ipv4_sk_update_pmtu(skb, sk, mtu);
1119		goto out;
1120	}
1121
1122	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1123
1124	rt = dst_rtable(odst);
1125	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1126		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1127		if (IS_ERR(rt))
1128			goto out;
1129
1130		new = true;
1131	}
1132
1133	__ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);
1134
1135	if (!dst_check(&rt->dst, 0)) {
1136		if (new)
1137			dst_release(&rt->dst);
1138
1139		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1140		if (IS_ERR(rt))
1141			goto out;
1142
1143		new = true;
1144	}
1145
1146	if (new)
1147		sk_dst_set(sk, &rt->dst);
1148
1149out:
1150	bh_unlock_sock(sk);
1151	dst_release(odst);
1152}
1153EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1154
1155void ipv4_redirect(struct sk_buff *skb, struct net *net,
1156		   int oif, u8 protocol)
1157{
1158	const struct iphdr *iph = (const struct iphdr *)skb->data;
1159	struct flowi4 fl4;
1160	struct rtable *rt;
1161
1162	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
1163	rt = __ip_route_output_key(net, &fl4);
1164	if (!IS_ERR(rt)) {
1165		__ip_do_redirect(rt, skb, &fl4, false);
1166		ip_rt_put(rt);
1167	}
1168}
1169EXPORT_SYMBOL_GPL(ipv4_redirect);
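/* Illustrative usage sketch (hypothetical caller, not part of this file):
 * the same kind of error handler could react to an ICMP redirect for an
 * encapsulated packet by updating the cached route, e.g.:
 *
 *	if (icmp_hdr(skb)->type == ICMP_REDIRECT)
 *		ipv4_redirect(skb, dev_net(skb->dev), 0, IPPROTO_IPIP);
 */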
1170
1171void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1172{
1173	const struct iphdr *iph = (const struct iphdr *)skb->data;
1174	struct flowi4 fl4;
1175	struct rtable *rt;
1176	struct net *net = sock_net(sk);
1177
1178	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1179	rt = __ip_route_output_key(net, &fl4);
1180	if (!IS_ERR(rt)) {
1181		__ip_do_redirect(rt, skb, &fl4, false);
1182		ip_rt_put(rt);
1183	}
1184}
1185EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1186
1187INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1188							 u32 cookie)
1189{
1190	struct rtable *rt = dst_rtable(dst);
1191
1192	/* All IPV4 dsts are created with ->obsolete set to the value
1193	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1194	 * into this function always.
1195	 *
1196	 * When a PMTU/redirect information update invalidates a route,
1197	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1198	 * DST_OBSOLETE_DEAD.
1199	 */
1200	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1201		return NULL;
1202	return dst;
1203}
1204EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
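/* Editorial note (illustrative assumption about typical callers): this is
 * normally reached via dst_check()/__sk_dst_check() rather than being called
 * directly; a NULL return tells the caller the cached entry is stale and a
 * fresh route lookup is needed, e.g.:
 *
 *	if (!dst_check(&rt->dst, 0)) {
 *		ip_rt_put(rt);
 *		rt = NULL;	then redo the route lookup
 *	}
 */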
1205
1206static void ipv4_send_dest_unreach(struct sk_buff *skb)
1207{
1208	struct net_device *dev;
1209	struct ip_options opt;
1210	int res;
1211
1212	/* Recompile ip options since IPCB may not be valid anymore.
1213	 * Also check we have a reasonable ipv4 header.
1214	 */
1215	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1216	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1217		return;
1218
1219	memset(&opt, 0, sizeof(opt));
1220	if (ip_hdr(skb)->ihl > 5) {
1221		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1222			return;
1223		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1224
1225		rcu_read_lock();
1226		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1227		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1228		rcu_read_unlock();
1229
1230		if (res)
1231			return;
1232	}
1233	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1234}
1235
1236static void ipv4_link_failure(struct sk_buff *skb)
1237{
1238	struct rtable *rt;
1239
1240	ipv4_send_dest_unreach(skb);
1241
1242	rt = skb_rtable(skb);
1243	if (rt)
1244		dst_set_expires(&rt->dst, 0);
1245}
1246
1247static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1248{
1249	pr_debug("%s: %pI4 -> %pI4, %s\n",
1250		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1251		 skb->dev ? skb->dev->name : "?");
1252	kfree_skb(skb);
1253	WARN_ON(1);
1254	return 0;
1255}
1256
1257/*
1258 * We do not cache source address of outgoing interface,
1259 * because it is used only by IP RR, TS and SRR options,
 1260 * so it stays out of the fast path.
 1261 *
 1262 * BTW remember: "addr" may be unaligned
1263 * in IP options!
1264 */
1265
1266void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1267{
1268	__be32 src;
1269
1270	if (rt_is_output_route(rt))
1271		src = ip_hdr(skb)->saddr;
1272	else {
1273		struct fib_result res;
1274		struct iphdr *iph = ip_hdr(skb);
1275		struct flowi4 fl4 = {
1276			.daddr = iph->daddr,
1277			.saddr = iph->saddr,
1278			.flowi4_tos = RT_TOS(iph->tos),
1279			.flowi4_oif = rt->dst.dev->ifindex,
1280			.flowi4_iif = skb->dev->ifindex,
1281			.flowi4_mark = skb->mark,
1282		};
1283
1284		rcu_read_lock();
1285		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1286			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1287		else
1288			src = inet_select_addr(rt->dst.dev,
1289					       rt_nexthop(rt, iph->daddr),
1290					       RT_SCOPE_UNIVERSE);
1291		rcu_read_unlock();
1292	}
1293	memcpy(addr, &src, 4);
1294}
1295
1296#ifdef CONFIG_IP_ROUTE_CLASSID
1297static void set_class_tag(struct rtable *rt, u32 tag)
1298{
1299	if (!(rt->dst.tclassid & 0xFFFF))
1300		rt->dst.tclassid |= tag & 0xFFFF;
1301	if (!(rt->dst.tclassid & 0xFFFF0000))
1302		rt->dst.tclassid |= tag & 0xFFFF0000;
1303}
1304#endif
1305
1306static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1307{
1308	struct net *net = dev_net(dst->dev);
1309	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1310	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1311				    net->ipv4.ip_rt_min_advmss);
1312
1313	return min(advmss, IPV4_MAX_PMTU - header_size);
1314}
1315
1316INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1317{
1318	return ip_dst_mtu_maybe_forward(dst, false);
1319}
1320EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
1321
1322static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1323{
1324	struct fnhe_hash_bucket *hash;
1325	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1326	u32 hval = fnhe_hashfun(daddr);
1327
1328	spin_lock_bh(&fnhe_lock);
1329
1330	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1331					 lockdep_is_held(&fnhe_lock));
1332	hash += hval;
1333
1334	fnhe_p = &hash->chain;
1335	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1336	while (fnhe) {
1337		if (fnhe->fnhe_daddr == daddr) {
1338			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1339				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1340			/* set fnhe_daddr to 0 to ensure it won't bind with
1341			 * new dsts in rt_bind_exception().
1342			 */
1343			fnhe->fnhe_daddr = 0;
1344			fnhe_flush_routes(fnhe);
1345			kfree_rcu(fnhe, rcu);
1346			break;
1347		}
1348		fnhe_p = &fnhe->fnhe_next;
1349		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1350						 lockdep_is_held(&fnhe_lock));
1351	}
1352
1353	spin_unlock_bh(&fnhe_lock);
1354}
1355
1356static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1357					       __be32 daddr)
1358{
1359	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1360	struct fib_nh_exception *fnhe;
1361	u32 hval;
1362
1363	if (!hash)
1364		return NULL;
1365
1366	hval = fnhe_hashfun(daddr);
1367
1368	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1369	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1370		if (fnhe->fnhe_daddr == daddr) {
1371			if (fnhe->fnhe_expires &&
1372			    time_after(jiffies, fnhe->fnhe_expires)) {
1373				ip_del_fnhe(nhc, daddr);
1374				break;
1375			}
1376			return fnhe;
1377		}
1378	}
1379	return NULL;
1380}
1381
1382/* MTU selection:
1383 * 1. mtu on route is locked - use it
1384 * 2. mtu from nexthop exception
1385 * 3. mtu from egress device
1386 */
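/* Editorial worked example (assumption, describing the function below):
 * with ip_fwd_use_pmtu disabled and no locked route MTU metric, an unexpired
 * nexthop exception recording a PMTU of 1300 wins over a device MTU of 1500;
 * once the exception expires, the device MTU (capped at IP_MAX_MTU) is used
 * again, minus any lwtunnel encapsulation headroom.
 */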
1387
1388u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1389{
1390	struct fib_nh_common *nhc = res->nhc;
1391	struct net_device *dev = nhc->nhc_dev;
1392	struct fib_info *fi = res->fi;
1393	u32 mtu = 0;
1394
1395	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1396	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1397		mtu = fi->fib_mtu;
1398
1399	if (likely(!mtu)) {
1400		struct fib_nh_exception *fnhe;
1401
1402		fnhe = find_exception(nhc, daddr);
1403		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1404			mtu = fnhe->fnhe_pmtu;
1405	}
1406
1407	if (likely(!mtu))
1408		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1409
1410	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1411}
1412
1413static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1414			      __be32 daddr, const bool do_cache)
1415{
1416	bool ret = false;
1417
1418	spin_lock_bh(&fnhe_lock);
1419
1420	if (daddr == fnhe->fnhe_daddr) {
1421		struct rtable __rcu **porig;
1422		struct rtable *orig;
1423		int genid = fnhe_genid(dev_net(rt->dst.dev));
1424
1425		if (rt_is_input_route(rt))
1426			porig = &fnhe->fnhe_rth_input;
1427		else
1428			porig = &fnhe->fnhe_rth_output;
1429		orig = rcu_dereference(*porig);
1430
1431		if (fnhe->fnhe_genid != genid) {
1432			fnhe->fnhe_genid = genid;
1433			fnhe->fnhe_gw = 0;
1434			fnhe->fnhe_pmtu = 0;
1435			fnhe->fnhe_expires = 0;
1436			fnhe->fnhe_mtu_locked = false;
1437			fnhe_flush_routes(fnhe);
1438			orig = NULL;
1439		}
1440		fill_route_from_fnhe(rt, fnhe);
1441		if (!rt->rt_gw4) {
1442			rt->rt_gw4 = daddr;
1443			rt->rt_gw_family = AF_INET;
1444		}
1445
1446		if (do_cache) {
1447			dst_hold(&rt->dst);
1448			rcu_assign_pointer(*porig, rt);
1449			if (orig) {
1450				dst_dev_put(&orig->dst);
1451				dst_release(&orig->dst);
1452			}
1453			ret = true;
1454		}
1455
1456		fnhe->fnhe_stamp = jiffies;
1457	}
1458	spin_unlock_bh(&fnhe_lock);
1459
1460	return ret;
1461}
1462
1463static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1464{
1465	struct rtable *orig, *prev, **p;
1466	bool ret = true;
1467
1468	if (rt_is_input_route(rt)) {
1469		p = (struct rtable **)&nhc->nhc_rth_input;
1470	} else {
1471		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1472	}
1473	orig = *p;
1474
1475	/* hold dst before doing cmpxchg() to avoid race condition
1476	 * on this dst
1477	 */
1478	dst_hold(&rt->dst);
1479	prev = cmpxchg(p, orig, rt);
1480	if (prev == orig) {
1481		if (orig) {
1482			rt_add_uncached_list(orig);
1483			dst_release(&orig->dst);
1484		}
1485	} else {
1486		dst_release(&rt->dst);
1487		ret = false;
1488	}
1489
1490	return ret;
1491}
1492
1493struct uncached_list {
1494	spinlock_t		lock;
1495	struct list_head	head;
1496	struct list_head	quarantine;
1497};
1498
1499static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1500
1501void rt_add_uncached_list(struct rtable *rt)
1502{
1503	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1504
1505	rt->dst.rt_uncached_list = ul;
1506
1507	spin_lock_bh(&ul->lock);
1508	list_add_tail(&rt->dst.rt_uncached, &ul->head);
1509	spin_unlock_bh(&ul->lock);
1510}
1511
1512void rt_del_uncached_list(struct rtable *rt)
1513{
1514	if (!list_empty(&rt->dst.rt_uncached)) {
1515		struct uncached_list *ul = rt->dst.rt_uncached_list;
1516
1517		spin_lock_bh(&ul->lock);
1518		list_del_init(&rt->dst.rt_uncached);
1519		spin_unlock_bh(&ul->lock);
1520	}
1521}
1522
1523static void ipv4_dst_destroy(struct dst_entry *dst)
1524{
1525	ip_dst_metrics_put(dst);
1526	rt_del_uncached_list(dst_rtable(dst));
1527}
1528
1529void rt_flush_dev(struct net_device *dev)
1530{
1531	struct rtable *rt, *safe;
1532	int cpu;
1533
1534	for_each_possible_cpu(cpu) {
1535		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1536
1537		if (list_empty(&ul->head))
1538			continue;
1539
1540		spin_lock_bh(&ul->lock);
1541		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
1542			if (rt->dst.dev != dev)
1543				continue;
1544			rt->dst.dev = blackhole_netdev;
1545			netdev_ref_replace(dev, blackhole_netdev,
1546					   &rt->dst.dev_tracker, GFP_ATOMIC);
1547			list_move(&rt->dst.rt_uncached, &ul->quarantine);
1548		}
1549		spin_unlock_bh(&ul->lock);
1550	}
1551}
1552
1553static bool rt_cache_valid(const struct rtable *rt)
1554{
1555	return	rt &&
1556		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557		!rt_is_expired(rt);
1558}
1559
1560static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561			   const struct fib_result *res,
1562			   struct fib_nh_exception *fnhe,
1563			   struct fib_info *fi, u16 type, u32 itag,
1564			   const bool do_cache)
1565{
1566	bool cached = false;
1567
1568	if (fi) {
1569		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572			rt->rt_uses_gateway = 1;
1573			rt->rt_gw_family = nhc->nhc_gw_family;
1574			/* only INET and INET6 are supported */
1575			if (likely(nhc->nhc_gw_family == AF_INET))
1576				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1577			else
1578				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1579		}
1580
1581		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1582
1583#ifdef CONFIG_IP_ROUTE_CLASSID
1584		if (nhc->nhc_family == AF_INET) {
1585			struct fib_nh *nh;
1586
1587			nh = container_of(nhc, struct fib_nh, nh_common);
1588			rt->dst.tclassid = nh->nh_tclassid;
1589		}
1590#endif
1591		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1592		if (unlikely(fnhe))
1593			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1594		else if (do_cache)
1595			cached = rt_cache_route(nhc, rt);
1596		if (unlikely(!cached)) {
1597			/* Routes we intend to cache in nexthop exception or
1598			 * FIB nexthop have the DST_NOCACHE bit clear.
1599			 * However, if we are unsuccessful at storing this
1600			 * route into the cache we really need to set it.
1601			 */
1602			if (!rt->rt_gw4) {
1603				rt->rt_gw_family = AF_INET;
1604				rt->rt_gw4 = daddr;
1605			}
1606			rt_add_uncached_list(rt);
1607		}
1608	} else
1609		rt_add_uncached_list(rt);
1610
1611#ifdef CONFIG_IP_ROUTE_CLASSID
1612#ifdef CONFIG_IP_MULTIPLE_TABLES
1613	set_class_tag(rt, res->tclassid);
1614#endif
1615	set_class_tag(rt, itag);
1616#endif
1617}
1618
1619struct rtable *rt_dst_alloc(struct net_device *dev,
1620			    unsigned int flags, u16 type,
1621			    bool noxfrm)
1622{
1623	struct rtable *rt;
1624
1625	rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
1626		       (noxfrm ? DST_NOXFRM : 0));
1627
1628	if (rt) {
1629		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1630		rt->rt_flags = flags;
1631		rt->rt_type = type;
1632		rt->rt_is_input = 0;
1633		rt->rt_iif = 0;
1634		rt->rt_pmtu = 0;
1635		rt->rt_mtu_locked = 0;
1636		rt->rt_uses_gateway = 0;
1637		rt->rt_gw_family = 0;
1638		rt->rt_gw4 = 0;
1639
1640		rt->dst.output = ip_output;
1641		if (flags & RTCF_LOCAL)
1642			rt->dst.input = ip_local_deliver;
1643	}
1644
1645	return rt;
1646}
1647EXPORT_SYMBOL(rt_dst_alloc);
1648
1649struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1650{
1651	struct rtable *new_rt;
1652
1653	new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
1654			   rt->dst.flags);
1655
1656	if (new_rt) {
1657		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1658		new_rt->rt_flags = rt->rt_flags;
1659		new_rt->rt_type = rt->rt_type;
1660		new_rt->rt_is_input = rt->rt_is_input;
1661		new_rt->rt_iif = rt->rt_iif;
1662		new_rt->rt_pmtu = rt->rt_pmtu;
1663		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1664		new_rt->rt_gw_family = rt->rt_gw_family;
1665		if (rt->rt_gw_family == AF_INET)
1666			new_rt->rt_gw4 = rt->rt_gw4;
1667		else if (rt->rt_gw_family == AF_INET6)
1668			new_rt->rt_gw6 = rt->rt_gw6;
1669
1670		new_rt->dst.input = rt->dst.input;
1671		new_rt->dst.output = rt->dst.output;
1672		new_rt->dst.error = rt->dst.error;
1673		new_rt->dst.lastuse = jiffies;
1674		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1675	}
1676	return new_rt;
1677}
1678EXPORT_SYMBOL(rt_dst_clone);
1679
1680/* called in rcu_read_lock() section */
1681int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1682			  u8 tos, struct net_device *dev,
1683			  struct in_device *in_dev, u32 *itag)
1684{
1685	int err;
1686
1687	/* Primary sanity checks. */
1688	if (!in_dev)
1689		return -EINVAL;
1690
1691	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1692	    skb->protocol != htons(ETH_P_IP))
1693		return -EINVAL;
1694
1695	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1696		return -EINVAL;
1697
1698	if (ipv4_is_zeronet(saddr)) {
1699		if (!ipv4_is_local_multicast(daddr) &&
1700		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1701			return -EINVAL;
1702	} else {
1703		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1704					  in_dev, itag);
1705		if (err < 0)
1706			return err;
1707	}
1708	return 0;
1709}
1710
1711/* called in rcu_read_lock() section */
1712static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1713			     u8 tos, struct net_device *dev, int our)
1714{
1715	struct in_device *in_dev = __in_dev_get_rcu(dev);
1716	unsigned int flags = RTCF_MULTICAST;
1717	struct rtable *rth;
1718	u32 itag = 0;
1719	int err;
1720
1721	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1722	if (err)
1723		return err;
1724
1725	if (our)
1726		flags |= RTCF_LOCAL;
1727
1728	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1729		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1730
1731	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1732			   false);
1733	if (!rth)
1734		return -ENOBUFS;
1735
1736#ifdef CONFIG_IP_ROUTE_CLASSID
1737	rth->dst.tclassid = itag;
1738#endif
1739	rth->dst.output = ip_rt_bug;
 1740	rth->rt_is_input = 1;
1741
1742#ifdef CONFIG_IP_MROUTE
1743	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1744		rth->dst.input = ip_mr_input;
1745#endif
1746	RT_CACHE_STAT_INC(in_slow_mc);
1747
1748	skb_dst_drop(skb);
1749	skb_dst_set(skb, &rth->dst);
1750	return 0;
1751}
1752
1753
1754static void ip_handle_martian_source(struct net_device *dev,
1755				     struct in_device *in_dev,
1756				     struct sk_buff *skb,
1757				     __be32 daddr,
1758				     __be32 saddr)
1759{
1760	RT_CACHE_STAT_INC(in_martian_src);
1761#ifdef CONFIG_IP_ROUTE_VERBOSE
1762	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1763		/*
 1764		 *	Per the RFC1812 recommendation: if the source is martian,
 1765		 *	the only hint we can log is the MAC header.
1766		 */
1767		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1768			&daddr, &saddr, dev->name);
1769		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1770			print_hex_dump(KERN_WARNING, "ll header: ",
1771				       DUMP_PREFIX_OFFSET, 16, 1,
1772				       skb_mac_header(skb),
1773				       dev->hard_header_len, false);
1774		}
1775	}
1776#endif
1777}
1778
1779/* called in rcu_read_lock() section */
1780static int __mkroute_input(struct sk_buff *skb,
1781			   const struct fib_result *res,
1782			   struct in_device *in_dev,
1783			   __be32 daddr, __be32 saddr, u32 tos)
1784{
1785	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1786	struct net_device *dev = nhc->nhc_dev;
1787	struct fib_nh_exception *fnhe;
1788	struct rtable *rth;
1789	int err;
1790	struct in_device *out_dev;
1791	bool do_cache;
1792	u32 itag = 0;
1793
1794	/* get a working reference to the output device */
1795	out_dev = __in_dev_get_rcu(dev);
1796	if (!out_dev) {
1797		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1798		return -EINVAL;
1799	}
1800
1801	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1802				  in_dev->dev, in_dev, &itag);
1803	if (err < 0) {
1804		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1805					 saddr);
1806
1807		goto cleanup;
1808	}
1809
1810	do_cache = res->fi && !itag;
1811	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1812	    skb->protocol == htons(ETH_P_IP)) {
1813		__be32 gw;
1814
1815		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1816		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1817		    inet_addr_onlink(out_dev, saddr, gw))
1818			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1819	}
1820
1821	if (skb->protocol != htons(ETH_P_IP)) {
 1822		/* Not IP (i.e. ARP). Do not create a route if it is
 1823		 * invalid for proxy arp. DNAT routes are always valid.
 1824		 *
 1825		 * The proxy arp feature has been extended to allow ARP
 1826		 * replies back to the same interface, to support
 1827		 * Private VLAN switch technologies. See arp.c.
1828		 */
1829		if (out_dev == in_dev &&
1830		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1831			err = -EINVAL;
1832			goto cleanup;
1833		}
1834	}
1835
1836	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1837		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1838
1839	fnhe = find_exception(nhc, daddr);
1840	if (do_cache) {
1841		if (fnhe)
1842			rth = rcu_dereference(fnhe->fnhe_rth_input);
1843		else
1844			rth = rcu_dereference(nhc->nhc_rth_input);
1845		if (rt_cache_valid(rth)) {
1846			skb_dst_set_noref(skb, &rth->dst);
1847			goto out;
1848		}
1849	}
1850
1851	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1852			   IN_DEV_ORCONF(out_dev, NOXFRM));
1853	if (!rth) {
1854		err = -ENOBUFS;
1855		goto cleanup;
1856	}
1857
1858	rth->rt_is_input = 1;
1859	RT_CACHE_STAT_INC(in_slow_tot);
1860
1861	rth->dst.input = ip_forward;
1862
1863	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1864		       do_cache);
1865	lwtunnel_set_redirect(&rth->dst);
1866	skb_dst_set(skb, &rth->dst);
1867out:
1868	err = 0;
1869 cleanup:
1870	return err;
1871}
1872
1873#ifdef CONFIG_IP_ROUTE_MULTIPATH
1874/* To make ICMP packets follow the right flow, the multipath hash is
1875 * calculated from the inner IP addresses.
1876 */
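/* Editorial example (assumption about the intent): an ICMP error that embeds
 * a packet from some flow is hashed on that embedded (inner) source/destination
 * pair rather than on the ICMP header's own reversed addresses, so at an ECMP
 * decision point it is spread across nexthops consistently with the flow it
 * refers to (useful, e.g., so PMTUD errors reach the same host behind an
 * ECMP-balanced address).
 */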
1877static void ip_multipath_l3_keys(const struct sk_buff *skb,
1878				 struct flow_keys *hash_keys)
1879{
1880	const struct iphdr *outer_iph = ip_hdr(skb);
1881	const struct iphdr *key_iph = outer_iph;
1882	const struct iphdr *inner_iph;
1883	const struct icmphdr *icmph;
1884	struct iphdr _inner_iph;
1885	struct icmphdr _icmph;
1886
1887	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1888		goto out;
1889
1890	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1891		goto out;
1892
1893	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1894				   &_icmph);
1895	if (!icmph)
1896		goto out;
1897
1898	if (!icmp_is_err(icmph->type))
1899		goto out;
1900
1901	inner_iph = skb_header_pointer(skb,
1902				       outer_iph->ihl * 4 + sizeof(_icmph),
1903				       sizeof(_inner_iph), &_inner_iph);
1904	if (!inner_iph)
1905		goto out;
1906
1907	key_iph = inner_iph;
1908out:
1909	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1910	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1911}
1912
1913static u32 fib_multipath_custom_hash_outer(const struct net *net,
1914					   const struct sk_buff *skb,
1915					   bool *p_has_inner)
1916{
1917	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1918	struct flow_keys keys, hash_keys;
1919
1920	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1921		return 0;
1922
1923	memset(&hash_keys, 0, sizeof(hash_keys));
1924	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1925
1926	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1927	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1928		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1929	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1930		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1931	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1932		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1933	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1934		hash_keys.ports.src = keys.ports.src;
1935	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1936		hash_keys.ports.dst = keys.ports.dst;
1937
1938	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1939	return flow_hash_from_keys(&hash_keys);
1940}
1941
1942static u32 fib_multipath_custom_hash_inner(const struct net *net,
1943					   const struct sk_buff *skb,
1944					   bool has_inner)
1945{
1946	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1947	struct flow_keys keys, hash_keys;
1948
1949	/* We assume the packet carries an encapsulation, but if none was
1950	 * encountered during dissection of the outer flow, then there is no
1951	 * point in calling the flow dissector again.
1952	 */
1953	if (!has_inner)
1954		return 0;
1955
1956	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1957		return 0;
1958
1959	memset(&hash_keys, 0, sizeof(hash_keys));
1960	skb_flow_dissect_flow_keys(skb, &keys, 0);
1961
1962	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1963		return 0;
1964
1965	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1966		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1967		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1968			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1969		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1970			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1971	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1972		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1973		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1974			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1975		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1976			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1977		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1978			hash_keys.tags.flow_label = keys.tags.flow_label;
1979	}
1980
1981	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
1982		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1983	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
1984		hash_keys.ports.src = keys.ports.src;
1985	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
1986		hash_keys.ports.dst = keys.ports.dst;
1987
1988	return flow_hash_from_keys(&hash_keys);
1989}
1990
1991static u32 fib_multipath_custom_hash_skb(const struct net *net,
1992					 const struct sk_buff *skb)
1993{
1994	u32 mhash, mhash_inner;
1995	bool has_inner = true;
1996
1997	mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
1998	mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
1999
2000	return jhash_2words(mhash, mhash_inner, 0);
2001}
2002
2003static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2004					 const struct flowi4 *fl4)
2005{
2006	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
2007	struct flow_keys hash_keys;
2008
2009	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2010		return 0;
2011
2012	memset(&hash_keys, 0, sizeof(hash_keys));
2013	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2014	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2015		hash_keys.addrs.v4addrs.src = fl4->saddr;
2016	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2017		hash_keys.addrs.v4addrs.dst = fl4->daddr;
2018	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2019		hash_keys.basic.ip_proto = fl4->flowi4_proto;
2020	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2021		hash_keys.ports.src = fl4->fl4_sport;
2022	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2023		hash_keys.ports.dst = fl4->fl4_dport;
2024
2025	return flow_hash_from_keys(&hash_keys);
2026}
2027
2028/* if skb is set it will be used and fl4 can be NULL */
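/* Editorial summary of the switch below (assumption, paraphrasing the
 * fib_multipath_hash_policy values): 0 selects outer L3 (saddr/daddr),
 * 1 selects the L4 five-tuple, 2 selects the inner L3 when an encapsulation
 * is present (otherwise outer L3), and 3 uses the custom field set from
 * fib_multipath_hash_fields. The result is always folded to 31 bits.
 */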
2029int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2030		       const struct sk_buff *skb, struct flow_keys *flkeys)
2031{
2032	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2033	struct flow_keys hash_keys;
2034	u32 mhash = 0;
2035
2036	switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
2037	case 0:
2038		memset(&hash_keys, 0, sizeof(hash_keys));
2039		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2040		if (skb) {
2041			ip_multipath_l3_keys(skb, &hash_keys);
2042		} else {
2043			hash_keys.addrs.v4addrs.src = fl4->saddr;
2044			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2045		}
2046		mhash = flow_hash_from_keys(&hash_keys);
2047		break;
2048	case 1:
2049		/* skb is currently provided only when forwarding */
2050		if (skb) {
2051			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2052			struct flow_keys keys;
2053
2054			/* short-circuit if we already have L4 hash present */
2055			if (skb->l4_hash)
2056				return skb_get_hash_raw(skb) >> 1;
2057
2058			memset(&hash_keys, 0, sizeof(hash_keys));
2059
2060			if (!flkeys) {
2061				skb_flow_dissect_flow_keys(skb, &keys, flag);
2062				flkeys = &keys;
2063			}
2064
2065			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2066			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2067			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2068			hash_keys.ports.src = flkeys->ports.src;
2069			hash_keys.ports.dst = flkeys->ports.dst;
2070			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2071		} else {
2072			memset(&hash_keys, 0, sizeof(hash_keys));
2073			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2074			hash_keys.addrs.v4addrs.src = fl4->saddr;
2075			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2076			hash_keys.ports.src = fl4->fl4_sport;
2077			hash_keys.ports.dst = fl4->fl4_dport;
2078			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2079		}
2080		mhash = flow_hash_from_keys(&hash_keys);
2081		break;
2082	case 2:
2083		memset(&hash_keys, 0, sizeof(hash_keys));
2084		/* skb is currently provided only when forwarding */
2085		if (skb) {
2086			struct flow_keys keys;
2087
2088			skb_flow_dissect_flow_keys(skb, &keys, 0);
2089			/* Inner can be v4 or v6 */
2090			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2091				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2092				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2093				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2094			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2095				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2096				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2097				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2098				hash_keys.tags.flow_label = keys.tags.flow_label;
2099				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2100			} else {
2101				/* Same as case 0 */
2102				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2103				ip_multipath_l3_keys(skb, &hash_keys);
2104			}
2105		} else {
2106			/* Same as case 0 */
2107			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2108			hash_keys.addrs.v4addrs.src = fl4->saddr;
2109			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2110		}
2111		mhash = flow_hash_from_keys(&hash_keys);
2112		break;
2113	case 3:
2114		if (skb)
2115			mhash = fib_multipath_custom_hash_skb(net, skb);
2116		else
2117			mhash = fib_multipath_custom_hash_fl4(net, fl4);
2118		break;
2119	}
2120
2121	if (multipath_hash)
2122		mhash = jhash_2words(mhash, multipath_hash, 0);
2123
2124	return mhash >> 1;
2125}
2126#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2127
2128static int ip_mkroute_input(struct sk_buff *skb,
2129			    struct fib_result *res,
2130			    struct in_device *in_dev,
2131			    __be32 daddr, __be32 saddr, u32 tos,
2132			    struct flow_keys *hkeys)
2133{
2134#ifdef CONFIG_IP_ROUTE_MULTIPATH
2135	if (res->fi && fib_info_num_path(res->fi) > 1) {
2136		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2137
2138		fib_select_multipath(res, h);
2139		IPCB(skb)->flags |= IPSKB_MULTIPATH;
2140	}
2141#endif
2142
2143	/* create a routing cache entry */
2144	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2145}
2146
 2147 /* Implements the same saddr-related checks as ip_route_input_slow(),
2148 * assuming daddr is valid and the destination is not a local broadcast one.
2149 * Uses the provided hint instead of performing a route lookup.
2150 */
2151int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2152		      u8 tos, struct net_device *dev,
2153		      const struct sk_buff *hint)
2154{
2155	struct in_device *in_dev = __in_dev_get_rcu(dev);
2156	struct rtable *rt = skb_rtable(hint);
2157	struct net *net = dev_net(dev);
2158	int err = -EINVAL;
2159	u32 tag = 0;
2160
2161	if (!in_dev)
2162		return -EINVAL;
2163
2164	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2165		goto martian_source;
2166
2167	if (ipv4_is_zeronet(saddr))
2168		goto martian_source;
2169
2170	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2171		goto martian_source;
2172
2173	if (rt->rt_type != RTN_LOCAL)
2174		goto skip_validate_source;
2175
2176	tos &= IPTOS_RT_MASK;
2177	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2178	if (err < 0)
2179		goto martian_source;
2180
2181skip_validate_source:
2182	skb_dst_copy(skb, hint);
2183	return 0;
2184
2185martian_source:
2186	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2187	return err;
2188}
2189
2190/* get device for dst_alloc with local routes */
2191static struct net_device *ip_rt_get_dev(struct net *net,
2192					const struct fib_result *res)
2193{
2194	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2195	struct net_device *dev = NULL;
2196
2197	if (nhc)
2198		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2199
2200	return dev ? : net->loopback_dev;
2201}
2202
2203/*
 2204 *	NOTE. We drop all packets that have local source
 2205 *	addresses, because every properly looped back packet
 2206 *	must already have the correct destination attached by the output routine.
 2207 *	Changes in the enforced policies must also be applied to
 2208 *	ip_route_use_hint().
 2209 *
 2210 *	This approach solves two big problems:
 2211 *	1. Non-simplex devices are handled properly.
 2212 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2213 *	called with rcu_read_lock()
2214 */
2215
2216static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2217			       u8 tos, struct net_device *dev,
2218			       struct fib_result *res)
2219{
2220	struct in_device *in_dev = __in_dev_get_rcu(dev);
2221	struct flow_keys *flkeys = NULL, _flkeys;
2222	struct net    *net = dev_net(dev);
2223	struct ip_tunnel_info *tun_info;
2224	int		err = -EINVAL;
2225	unsigned int	flags = 0;
2226	u32		itag = 0;
2227	struct rtable	*rth;
2228	struct flowi4	fl4;
2229	bool do_cache = true;
2230
2231	/* IP on this device is disabled. */
2232
2233	if (!in_dev)
2234		goto out;
2235
 2236	/* Check for the most weird martians, which may not be detected
 2237	 * by fib_lookup.
2238	 */
2239
2240	tun_info = skb_tunnel_info(skb);
2241	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2242		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2243	else
2244		fl4.flowi4_tun_key.tun_id = 0;
2245	skb_dst_drop(skb);
2246
2247	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2248		goto martian_source;
2249
2250	res->fi = NULL;
2251	res->table = NULL;
2252	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2253		goto brd_input;
2254
 2255	/* Accept zero source addresses only for the limited broadcast destination;
 2256	 * we do not even know whether to fix this or not. Waiting for complaints :-)
2257	 */
2258	if (ipv4_is_zeronet(saddr))
2259		goto martian_source;
2260
2261	if (ipv4_is_zeronet(daddr))
2262		goto martian_destination;
2263
 2264	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
 2265	 * and calls it at most once when daddr and/or saddr is a loopback address.
2266	 */
2267	if (ipv4_is_loopback(daddr)) {
2268		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2269			goto martian_destination;
2270	} else if (ipv4_is_loopback(saddr)) {
2271		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2272			goto martian_source;
2273	}
2274
2275	/*
 2276	 *	Now we are ready to route the packet.
2277	 */
2278	fl4.flowi4_l3mdev = 0;
2279	fl4.flowi4_oif = 0;
2280	fl4.flowi4_iif = dev->ifindex;
2281	fl4.flowi4_mark = skb->mark;
2282	fl4.flowi4_tos = tos;
2283	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2284	fl4.flowi4_flags = 0;
2285	fl4.daddr = daddr;
2286	fl4.saddr = saddr;
2287	fl4.flowi4_uid = sock_net_uid(net, NULL);
2288	fl4.flowi4_multipath_hash = 0;
2289
2290	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2291		flkeys = &_flkeys;
2292	} else {
2293		fl4.flowi4_proto = 0;
2294		fl4.fl4_sport = 0;
2295		fl4.fl4_dport = 0;
2296	}
2297
2298	err = fib_lookup(net, &fl4, res, 0);
2299	if (err != 0) {
2300		if (!IN_DEV_FORWARD(in_dev))
2301			err = -EHOSTUNREACH;
2302		goto no_route;
2303	}
2304
2305	if (res->type == RTN_BROADCAST) {
2306		if (IN_DEV_BFORWARD(in_dev))
2307			goto make_route;
 2308		/* do not cache if bc_forwarding is enabled */
2309		if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
2310			do_cache = false;
2311		goto brd_input;
2312	}
2313
2314	if (res->type == RTN_LOCAL) {
2315		err = fib_validate_source(skb, saddr, daddr, tos,
2316					  0, dev, in_dev, &itag);
2317		if (err < 0)
2318			goto martian_source;
2319		goto local_input;
2320	}
2321
2322	if (!IN_DEV_FORWARD(in_dev)) {
2323		err = -EHOSTUNREACH;
2324		goto no_route;
2325	}
2326	if (res->type != RTN_UNICAST)
2327		goto martian_destination;
2328
2329make_route:
2330	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2331out:	return err;
2332
2333brd_input:
2334	if (skb->protocol != htons(ETH_P_IP))
2335		goto e_inval;
2336
2337	if (!ipv4_is_zeronet(saddr)) {
2338		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2339					  in_dev, &itag);
2340		if (err < 0)
2341			goto martian_source;
2342	}
2343	flags |= RTCF_BROADCAST;
2344	res->type = RTN_BROADCAST;
2345	RT_CACHE_STAT_INC(in_brd);
2346
2347local_input:
2348	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
2349		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2350
2351	do_cache &= res->fi && !itag;
2352	if (do_cache) {
2353		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2354
2355		rth = rcu_dereference(nhc->nhc_rth_input);
2356		if (rt_cache_valid(rth)) {
2357			skb_dst_set_noref(skb, &rth->dst);
2358			err = 0;
2359			goto out;
2360		}
2361	}
2362
2363	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2364			   flags | RTCF_LOCAL, res->type, false);
2365	if (!rth)
2366		goto e_nobufs;
2367
 2368	rth->dst.output = ip_rt_bug;
2369#ifdef CONFIG_IP_ROUTE_CLASSID
2370	rth->dst.tclassid = itag;
2371#endif
2372	rth->rt_is_input = 1;
2373
2374	RT_CACHE_STAT_INC(in_slow_tot);
2375	if (res->type == RTN_UNREACHABLE) {
 2376		rth->dst.input = ip_error;
 2377		rth->dst.error = -err;
2378		rth->rt_flags	&= ~RTCF_LOCAL;
2379	}
2380
2381	if (do_cache) {
2382		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2383
2384		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2385		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2386			WARN_ON(rth->dst.input == lwtunnel_input);
2387			rth->dst.lwtstate->orig_input = rth->dst.input;
2388			rth->dst.input = lwtunnel_input;
2389		}
2390
2391		if (unlikely(!rt_cache_route(nhc, rth)))
2392			rt_add_uncached_list(rth);
2393	}
2394	skb_dst_set(skb, &rth->dst);
2395	err = 0;
2396	goto out;
2397
2398no_route:
2399	RT_CACHE_STAT_INC(in_no_route);
2400	res->type = RTN_UNREACHABLE;
2401	res->fi = NULL;
2402	res->table = NULL;
2403	goto local_input;
2404
2405	/*
2406	 *	Do not cache martian addresses: they should be logged (RFC1812)
2407	 */
2408martian_destination:
2409	RT_CACHE_STAT_INC(in_martian_dst);
2410#ifdef CONFIG_IP_ROUTE_VERBOSE
2411	if (IN_DEV_LOG_MARTIANS(in_dev))
2412		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2413				     &daddr, &saddr, dev->name);
2414#endif
2415
2416e_inval:
2417	err = -EINVAL;
2418	goto out;
2419
2420e_nobufs:
2421	err = -ENOBUFS;
2422	goto out;
2423
2424martian_source:
2425	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2426	goto out;
2427}
2428
2429/* called with rcu_read_lock held */
2430static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2431			      u8 tos, struct net_device *dev, struct fib_result *res)
2432{
 2433	/* Multicast recognition logic is moved from route cache to here.
 2434	 * The problem was that too many Ethernet cards have broken/missing
 2435	 * hardware multicast filters :-( As a result, a host on a multicast
 2436	 * network acquires a lot of useless route cache entries, e.g. for
 2437	 * SDR messages from all over the world. Now we try to get rid of them.
 2438	 * Really, provided the software IP multicast filter is organized
 2439	 * reasonably (at least, hashed), it does not cause a slowdown
 2440	 * compared with route cache reject entries.
 2441	 * Note that multicast routers are not affected, because a
 2442	 * route cache entry is created eventually.
 2443	 */
2444	if (ipv4_is_multicast(daddr)) {
2445		struct in_device *in_dev = __in_dev_get_rcu(dev);
2446		int our = 0;
2447		int err = -EINVAL;
2448
2449		if (!in_dev)
2450			return err;
2451		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2452				      ip_hdr(skb)->protocol);
2453
2454		/* check l3 master if no match yet */
2455		if (!our && netif_is_l3_slave(dev)) {
2456			struct in_device *l3_in_dev;
2457
2458			l3_in_dev = __in_dev_get_rcu(skb->dev);
2459			if (l3_in_dev)
2460				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2461						      ip_hdr(skb)->protocol);
2462		}
2463
2464		if (our
2465#ifdef CONFIG_IP_MROUTE
2466			||
2467		    (!ipv4_is_local_multicast(daddr) &&
2468		     IN_DEV_MFORWARD(in_dev))
2469#endif
2470		   ) {
2471			err = ip_route_input_mc(skb, daddr, saddr,
2472						tos, dev, our);
2473		}
2474		return err;
2475	}
2476
2477	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2478}
2479
2480int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2481			 u8 tos, struct net_device *dev)
2482{
2483	struct fib_result res;
2484	int err;
2485
2486	tos &= IPTOS_RT_MASK;
2487	rcu_read_lock();
2488	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2489	rcu_read_unlock();
2490
2491	return err;
2492}
2493EXPORT_SYMBOL(ip_route_input_noref);
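/* Illustrative usage sketch (an assumption, mirroring how the input path
 * typically routes a freshly received packet):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 */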
2494
2495/* called with rcu_read_lock() */
2496static struct rtable *__mkroute_output(const struct fib_result *res,
2497				       const struct flowi4 *fl4, int orig_oif,
2498				       struct net_device *dev_out,
2499				       unsigned int flags)
2500{
2501	struct fib_info *fi = res->fi;
2502	struct fib_nh_exception *fnhe;
2503	struct in_device *in_dev;
2504	u16 type = res->type;
2505	struct rtable *rth;
2506	bool do_cache;
2507
2508	in_dev = __in_dev_get_rcu(dev_out);
2509	if (!in_dev)
2510		return ERR_PTR(-EINVAL);
2511
2512	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2513		if (ipv4_is_loopback(fl4->saddr) &&
2514		    !(dev_out->flags & IFF_LOOPBACK) &&
2515		    !netif_is_l3_master(dev_out))
2516			return ERR_PTR(-EINVAL);
2517
2518	if (ipv4_is_lbcast(fl4->daddr))
2519		type = RTN_BROADCAST;
2520	else if (ipv4_is_multicast(fl4->daddr))
2521		type = RTN_MULTICAST;
2522	else if (ipv4_is_zeronet(fl4->daddr))
2523		return ERR_PTR(-EINVAL);
2524
2525	if (dev_out->flags & IFF_LOOPBACK)
2526		flags |= RTCF_LOCAL;
2527
2528	do_cache = true;
2529	if (type == RTN_BROADCAST) {
2530		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2531		fi = NULL;
2532	} else if (type == RTN_MULTICAST) {
2533		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2534		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2535				     fl4->flowi4_proto))
2536			flags &= ~RTCF_LOCAL;
2537		else
2538			do_cache = false;
 2539		/* If a multicast route does not exist, use the
 2540		 * default one, but do not use a gateway in this case.
 2541		 * Yes, it is a hack.
2542		 */
2543		if (fi && res->prefixlen < 4)
2544			fi = NULL;
2545	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2546		   (orig_oif != dev_out->ifindex)) {
2547		/* For local routes that require a particular output interface
2548		 * we do not want to cache the result.  Caching the result
2549		 * causes incorrect behaviour when there are multiple source
2550		 * addresses on the interface, the end result being that if the
2551		 * intended recipient is waiting on that interface for the
2552		 * packet he won't receive it because it will be delivered on
2553		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2554		 * be set to the loopback interface as well.
2555		 */
2556		do_cache = false;
2557	}
2558
2559	fnhe = NULL;
2560	do_cache &= fi != NULL;
2561	if (fi) {
2562		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2563		struct rtable __rcu **prth;
2564
2565		fnhe = find_exception(nhc, fl4->daddr);
2566		if (!do_cache)
2567			goto add;
2568		if (fnhe) {
2569			prth = &fnhe->fnhe_rth_output;
2570		} else {
2571			if (unlikely(fl4->flowi4_flags &
2572				     FLOWI_FLAG_KNOWN_NH &&
2573				     !(nhc->nhc_gw_family &&
2574				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2575				do_cache = false;
2576				goto add;
2577			}
2578			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2579		}
2580		rth = rcu_dereference(*prth);
2581		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2582			return rth;
2583	}
2584
2585add:
2586	rth = rt_dst_alloc(dev_out, flags, type,
2587			   IN_DEV_ORCONF(in_dev, NOXFRM));
2588	if (!rth)
2589		return ERR_PTR(-ENOBUFS);
2590
2591	rth->rt_iif = orig_oif;
2592
2593	RT_CACHE_STAT_INC(out_slow_tot);
2594
2595	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2596		if (flags & RTCF_LOCAL &&
2597		    !(dev_out->flags & IFF_LOOPBACK)) {
2598			rth->dst.output = ip_mc_output;
2599			RT_CACHE_STAT_INC(out_slow_mc);
2600		}
2601#ifdef CONFIG_IP_MROUTE
2602		if (type == RTN_MULTICAST) {
2603			if (IN_DEV_MFORWARD(in_dev) &&
2604			    !ipv4_is_local_multicast(fl4->daddr)) {
2605				rth->dst.input = ip_mr_input;
2606				rth->dst.output = ip_mc_output;
2607			}
2608		}
2609#endif
2610	}
2611
2612	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2613	lwtunnel_set_redirect(&rth->dst);
2614
2615	return rth;
2616}
2617
2618/*
2619 * Major route resolver routine.
2620 */
2621
2622struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2623					const struct sk_buff *skb)
2624{
2625	struct fib_result res = {
2626		.type		= RTN_UNSPEC,
2627		.fi		= NULL,
2628		.table		= NULL,
2629		.tclassid	= 0,
2630	};
2631	struct rtable *rth;
2632
2633	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2634	ip_rt_fix_tos(fl4);
2635
2636	rcu_read_lock();
2637	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2638	rcu_read_unlock();
2639
2640	return rth;
2641}
2642EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2643
2644struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2645					    struct fib_result *res,
2646					    const struct sk_buff *skb)
2647{
2648	struct net_device *dev_out = NULL;
2649	int orig_oif = fl4->flowi4_oif;
2650	unsigned int flags = 0;
2651	struct rtable *rth;
2652	int err;
2653
2654	if (fl4->saddr) {
2655		if (ipv4_is_multicast(fl4->saddr) ||
2656		    ipv4_is_lbcast(fl4->saddr) ||
2657		    ipv4_is_zeronet(fl4->saddr)) {
2658			rth = ERR_PTR(-EINVAL);
2659			goto out;
2660		}
2661
2662		rth = ERR_PTR(-ENETUNREACH);
2663
2664		/* I removed check for oif == dev_out->oif here.
2665		 * It was wrong for two reasons:
 2666		 * 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
 2667		 *    is assigned to multiple interfaces.
 2668		 * 2. Moreover, we are allowed to send packets with the saddr
 2669		 *    of another iface. --ANK
2670		 */
2671
2672		if (fl4->flowi4_oif == 0 &&
2673		    (ipv4_is_multicast(fl4->daddr) ||
2674		     ipv4_is_lbcast(fl4->daddr))) {
2675			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2676			dev_out = __ip_dev_find(net, fl4->saddr, false);
2677			if (!dev_out)
2678				goto out;
2679
 2680			/* Special hack: the user can direct multicasts
 2681			 * and limited broadcast via the necessary interface
 2682			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
 2683			 * This hack is not just for fun, it allows
 2684			 * vic, vat and friends to work.
 2685			 * They bind the socket to loopback, set the ttl to zero
 2686			 * and expect that it will work.
 2687			 * From the viewpoint of the routing cache they are broken,
 2688			 * because we are not allowed to build a multicast path
 2689			 * with a loopback source addr (look, the routing cache
 2690			 * cannot know that the ttl is zero, so the packet
 2691			 * will not leave this host and the route is valid).
 2692			 * Luckily, this hack is a good workaround.
2693			 */
2694
2695			fl4->flowi4_oif = dev_out->ifindex;
2696			goto make_route;
2697		}
2698
2699		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2700			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2701			if (!__ip_dev_find(net, fl4->saddr, false))
2702				goto out;
2703		}
2704	}
2705
2706
2707	if (fl4->flowi4_oif) {
2708		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2709		rth = ERR_PTR(-ENODEV);
2710		if (!dev_out)
2711			goto out;
2712
2713		/* RACE: Check return value of inet_select_addr instead. */
2714		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2715			rth = ERR_PTR(-ENETUNREACH);
2716			goto out;
2717		}
2718		if (ipv4_is_local_multicast(fl4->daddr) ||
2719		    ipv4_is_lbcast(fl4->daddr) ||
2720		    fl4->flowi4_proto == IPPROTO_IGMP) {
2721			if (!fl4->saddr)
2722				fl4->saddr = inet_select_addr(dev_out, 0,
2723							      RT_SCOPE_LINK);
2724			goto make_route;
2725		}
2726		if (!fl4->saddr) {
2727			if (ipv4_is_multicast(fl4->daddr))
2728				fl4->saddr = inet_select_addr(dev_out, 0,
2729							      fl4->flowi4_scope);
2730			else if (!fl4->daddr)
2731				fl4->saddr = inet_select_addr(dev_out, 0,
2732							      RT_SCOPE_HOST);
2733		}
2734	}
2735
2736	if (!fl4->daddr) {
2737		fl4->daddr = fl4->saddr;
2738		if (!fl4->daddr)
2739			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2740		dev_out = net->loopback_dev;
2741		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2742		res->type = RTN_LOCAL;
2743		flags |= RTCF_LOCAL;
2744		goto make_route;
2745	}
2746
2747	err = fib_lookup(net, fl4, res, 0);
2748	if (err) {
2749		res->fi = NULL;
2750		res->table = NULL;
2751		if (fl4->flowi4_oif &&
2752		    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
 2753			/* Apparently, the routing tables are wrong. Assume
 2754			 * that the destination is on-link.
 2755			 *
 2756			 * WHY? DW.
 2757			 * Because we are allowed to send to an iface
 2758			 * even if it has NO routes and NO assigned
 2759			 * addresses. When oif is specified, routing
 2760			 * tables are looked up with only one purpose:
 2761			 * to check whether the destination is gatewayed rather
 2762			 * than direct. Moreover, if MSG_DONTROUTE is set,
 2763			 * we send the packet, ignoring both routing tables
 2764			 * and ifaddr state. --ANK
 2765			 *
 2766			 *
 2767			 * We could do this even if oif is unknown,
 2768			 * likely IPv6, but we do not.
2769			 */
2770
2771			if (fl4->saddr == 0)
2772				fl4->saddr = inet_select_addr(dev_out, 0,
2773							      RT_SCOPE_LINK);
2774			res->type = RTN_UNICAST;
2775			goto make_route;
2776		}
2777		rth = ERR_PTR(err);
2778		goto out;
2779	}
2780
2781	if (res->type == RTN_LOCAL) {
2782		if (!fl4->saddr) {
2783			if (res->fi->fib_prefsrc)
2784				fl4->saddr = res->fi->fib_prefsrc;
2785			else
2786				fl4->saddr = fl4->daddr;
2787		}
2788
2789		/* L3 master device is the loopback for that domain */
2790		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2791			net->loopback_dev;
2792
2793		/* make sure orig_oif points to fib result device even
2794		 * though packet rx/tx happens over loopback or l3mdev
2795		 */
2796		orig_oif = FIB_RES_OIF(*res);
2797
2798		fl4->flowi4_oif = dev_out->ifindex;
2799		flags |= RTCF_LOCAL;
2800		goto make_route;
2801	}
2802
2803	fib_select_path(net, res, fl4, skb);
2804
2805	dev_out = FIB_RES_DEV(*res);
2806
2807make_route:
2808	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2809
2810out:
2811	return rth;
2812}
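/*
 * The function ending above is the tail of the IPv4 output-route slow
 * path (apparently ip_route_output_key_hash_rcu(), going by the caller
 * in inet_rtm_getroute() further down).  Roughly, under RCU it:
 *
 *   1. validates fl4->saddr and derives dev_out from fl4->flowi4_oif,
 *      picking a source address with inet_select_addr() when needed;
 *   2. short-circuits to a loopback RTN_LOCAL route when no daddr was
 *      given;
 *   3. otherwise calls fib_lookup(), falling back to an on-link
 *      RTN_UNICAST guess when an oif was supplied but no route matched;
 *   4. ends at make_route, where __mkroute_output() builds the struct
 *      rtable that is returned (or an ERR_PTR on failure).
 */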
2813
2814static struct dst_ops ipv4_dst_blackhole_ops = {
2815	.family			= AF_INET,
2816	.default_advmss		= ipv4_default_advmss,
2817	.neigh_lookup		= ipv4_neigh_lookup,
2818	.check			= dst_blackhole_check,
2819	.cow_metrics		= dst_blackhole_cow_metrics,
2820	.update_pmtu		= dst_blackhole_update_pmtu,
2821	.redirect		= dst_blackhole_redirect,
2822	.mtu			= dst_blackhole_mtu,
2823};
2824
2825struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2826{
2827	struct rtable *ort = dst_rtable(dst_orig);
2828	struct rtable *rt;
2829
2830	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
2831	if (rt) {
2832		struct dst_entry *new = &rt->dst;
2833
2834		new->__use = 1;
2835		new->input = dst_discard;
2836		new->output = dst_discard_out;
2837
2838		new->dev = net->loopback_dev;
2839		netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
2840
2841		rt->rt_is_input = ort->rt_is_input;
2842		rt->rt_iif = ort->rt_iif;
2843		rt->rt_pmtu = ort->rt_pmtu;
2844		rt->rt_mtu_locked = ort->rt_mtu_locked;
2845
2846		rt->rt_genid = rt_genid_ipv4(net);
2847		rt->rt_flags = ort->rt_flags;
2848		rt->rt_type = ort->rt_type;
2849		rt->rt_uses_gateway = ort->rt_uses_gateway;
2850		rt->rt_gw_family = ort->rt_gw_family;
2851		if (rt->rt_gw_family == AF_INET)
2852			rt->rt_gw4 = ort->rt_gw4;
2853		else if (rt->rt_gw_family == AF_INET6)
2854			rt->rt_gw6 = ort->rt_gw6;
2855	}
2856
2857	dst_release(dst_orig);
2858
2859	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2860}
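/*
 * The blackhole route built above is a minimal clone of dst_orig whose
 * input/output handlers simply discard packets and whose dst_ops are all
 * no-ops, so PMTU updates, redirects and metric writes on it have no
 * effect.  It is used by the xfrm code as IPv4's blackhole route, i.e. a
 * stand-in dst returned while packets must be silently dropped (for
 * instance while IPsec state resolution is still pending).  Note that
 * dst_orig is released even when the allocation fails.
 */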
2861
2862struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2863				    const struct sock *sk)
2864{
2865	struct rtable *rt = __ip_route_output_key(net, flp4);
2866
2867	if (IS_ERR(rt))
2868		return rt;
2869
2870	if (flp4->flowi4_proto) {
2871		flp4->flowi4_oif = rt->dst.dev->ifindex;
2872		rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
2873						  flowi4_to_flowi(flp4),
2874						  sk, 0));
2875	}
2876
2877	return rt;
2878}
2879EXPORT_SYMBOL_GPL(ip_route_output_flow);
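/*
 * An illustrative sketch (not taken from this file) of a typical
 * in-kernel caller: fill a flowi4 key, then let ip_route_output_flow()
 * run the routing lookup and, because flowi4_proto is set, the xfrm
 * policy lookup as well.  daddr/saddr below are placeholders and error
 * handling is abbreviated.
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_oif	= sk->sk_bound_dev_if,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	ip_rt_put(rt);
 *
 * On success fl4.saddr is filled in with the source address that the
 * lookup selected.
 */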
2880
2881/* called with rcu_read_lock held */
2882static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2883			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2884			struct sk_buff *skb, u32 portid, u32 seq,
2885			unsigned int flags)
2886{
2887	struct rtmsg *r;
2888	struct nlmsghdr *nlh;
2889	unsigned long expires = 0;
2890	u32 error;
2891	u32 metrics[RTAX_MAX];
2892
2893	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2894	if (!nlh)
2895		return -EMSGSIZE;
2896
2897	r = nlmsg_data(nlh);
2898	r->rtm_family	 = AF_INET;
2899	r->rtm_dst_len	= 32;
2900	r->rtm_src_len	= 0;
2901	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2902	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2903	if (nla_put_u32(skb, RTA_TABLE, table_id))
2904		goto nla_put_failure;
2905	r->rtm_type	= rt->rt_type;
2906	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2907	r->rtm_protocol = RTPROT_UNSPEC;
2908	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2909	if (rt->rt_flags & RTCF_NOTIFY)
2910		r->rtm_flags |= RTM_F_NOTIFY;
2911	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2912		r->rtm_flags |= RTCF_DOREDIRECT;
2913
2914	if (nla_put_in_addr(skb, RTA_DST, dst))
2915		goto nla_put_failure;
2916	if (src) {
2917		r->rtm_src_len = 32;
2918		if (nla_put_in_addr(skb, RTA_SRC, src))
2919			goto nla_put_failure;
2920	}
2921	if (rt->dst.dev &&
2922	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2923		goto nla_put_failure;
2924	if (rt->dst.lwtstate &&
2925	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2926		goto nla_put_failure;
2927#ifdef CONFIG_IP_ROUTE_CLASSID
2928	if (rt->dst.tclassid &&
2929	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2930		goto nla_put_failure;
2931#endif
2932	if (fl4 && !rt_is_input_route(rt) &&
2933	    fl4->saddr != src) {
2934		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2935			goto nla_put_failure;
2936	}
2937	if (rt->rt_uses_gateway) {
2938		if (rt->rt_gw_family == AF_INET &&
2939		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2940			goto nla_put_failure;
2941		} else if (rt->rt_gw_family == AF_INET6) {
2942			int alen = sizeof(struct in6_addr);
2943			struct nlattr *nla;
2944			struct rtvia *via;
2945
2946			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2947			if (!nla)
2948				goto nla_put_failure;
2949
2950			via = nla_data(nla);
2951			via->rtvia_family = AF_INET6;
2952			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2953		}
2954	}
2955
2956	expires = rt->dst.expires;
2957	if (expires) {
2958		unsigned long now = jiffies;
2959
2960		if (time_before(now, expires))
2961			expires -= now;
2962		else
2963			expires = 0;
2964	}
2965
2966	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2967	if (rt->rt_pmtu && expires)
2968		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2969	if (rt->rt_mtu_locked && expires)
2970		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2971	if (rtnetlink_put_metrics(skb, metrics) < 0)
2972		goto nla_put_failure;
2973
2974	if (fl4) {
2975		if (fl4->flowi4_mark &&
2976		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2977			goto nla_put_failure;
2978
2979		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2980		    nla_put_u32(skb, RTA_UID,
2981				from_kuid_munged(current_user_ns(),
2982						 fl4->flowi4_uid)))
2983			goto nla_put_failure;
2984
2985		if (rt_is_input_route(rt)) {
2986#ifdef CONFIG_IP_MROUTE
2987			if (ipv4_is_multicast(dst) &&
2988			    !ipv4_is_local_multicast(dst) &&
2989			    IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
2990				int err = ipmr_get_route(net, skb,
2991							 fl4->saddr, fl4->daddr,
2992							 r, portid);
2993
2994				if (err <= 0) {
2995					if (err == 0)
2996						return 0;
2997					goto nla_put_failure;
2998				}
2999			} else
3000#endif
3001				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3002					goto nla_put_failure;
3003		}
3004	}
3005
3006	error = rt->dst.error;
3007
3008	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3009		goto nla_put_failure;
3010
3011	nlmsg_end(skb, nlh);
3012	return 0;
3013
3014nla_put_failure:
3015	nlmsg_cancel(skb, nlh);
3016	return -EMSGSIZE;
3017}
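/*
 * The reply built above is a single RTM_NEWROUTE message: a struct rtmsg
 * header followed by RTA_* attributes (RTA_TABLE, RTA_DST, optionally
 * RTA_SRC, RTA_OIF, RTA_ENCAP, RTA_FLOW, RTA_PREFSRC, RTA_GATEWAY or
 * RTA_VIA, the metrics, RTA_MARK, RTA_UID and, for input routes,
 * RTA_IIF) plus the cache info from rtnl_put_cacheinfo().  Every failed
 * nla_put*() jumps to nla_put_failure, which cancels the partially built
 * message and reports -EMSGSIZE to the caller.
 */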
3018
3019static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3020			    struct netlink_callback *cb, u32 table_id,
3021			    struct fnhe_hash_bucket *bucket, int genid,
3022			    int *fa_index, int fa_start, unsigned int flags)
3023{
3024	int i;
3025
3026	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3027		struct fib_nh_exception *fnhe;
3028
3029		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3030		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3031			struct rtable *rt;
3032			int err;
3033
3034			if (*fa_index < fa_start)
3035				goto next;
3036
3037			if (fnhe->fnhe_genid != genid)
3038				goto next;
3039
3040			if (fnhe->fnhe_expires &&
3041			    time_after(jiffies, fnhe->fnhe_expires))
3042				goto next;
3043
3044			rt = rcu_dereference(fnhe->fnhe_rth_input);
3045			if (!rt)
3046				rt = rcu_dereference(fnhe->fnhe_rth_output);
3047			if (!rt)
3048				goto next;
3049
3050			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3051					   table_id, NULL, skb,
3052					   NETLINK_CB(cb->skb).portid,
3053					   cb->nlh->nlmsg_seq, flags);
3054			if (err)
3055				return err;
3056next:
3057			(*fa_index)++;
3058		}
3059	}
3060
3061	return 0;
3062}
3063
3064int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3065		       u32 table_id, struct fib_info *fi,
3066		       int *fa_index, int fa_start, unsigned int flags)
3067{
3068	struct net *net = sock_net(cb->skb->sk);
3069	int nhsel, genid = fnhe_genid(net);
3070
3071	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3072		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3073		struct fnhe_hash_bucket *bucket;
3074		int err;
3075
3076		if (nhc->nhc_flags & RTNH_F_DEAD)
3077			continue;
3078
3079		rcu_read_lock();
3080		bucket = rcu_dereference(nhc->nhc_exceptions);
3081		err = 0;
3082		if (bucket)
3083			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3084					       genid, fa_index, fa_start,
3085					       flags);
3086		rcu_read_unlock();
3087		if (err)
3088			return err;
3089	}
3090
3091	return 0;
3092}
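/*
 * fib_nh_exception entries hold per-nexthop state learned at run time
 * (PMTU values, redirected gateways) and hang off each nexthop in a
 * small hash of FNHE_HASH_SIZE buckets.  The two helpers above walk
 * every bucket of every live nexthop and emit one RTM_NEWROUTE per
 * current-generation, non-expired exception, using *fa_index/fa_start
 * so an interrupted netlink dump can resume where it left off.
 */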
3093
3094static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3095						   u8 ip_proto, __be16 sport,
3096						   __be16 dport)
3097{
3098	struct sk_buff *skb;
3099	struct iphdr *iph;
3100
3101	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3102	if (!skb)
3103		return NULL;
3104
3105	/* Reserve room for dummy headers; this skb can pass
3106	 * through a good chunk of the routing engine.
3107	 */
3108	skb_reset_mac_header(skb);
3109	skb_reset_network_header(skb);
3110	skb->protocol = htons(ETH_P_IP);
3111	iph = skb_put(skb, sizeof(struct iphdr));
3112	iph->protocol = ip_proto;
3113	iph->saddr = src;
3114	iph->daddr = dst;
3115	iph->version = 0x4;
3116	iph->frag_off = 0;
3117	iph->ihl = 0x5;
3118	skb_set_transport_header(skb, skb->len);
3119
3120	switch (iph->protocol) {
3121	case IPPROTO_UDP: {
3122		struct udphdr *udph;
3123
3124		udph = skb_put_zero(skb, sizeof(struct udphdr));
3125		udph->source = sport;
3126		udph->dest = dport;
3127		udph->len = htons(sizeof(struct udphdr));
3128		udph->check = 0;
3129		break;
3130	}
3131	case IPPROTO_TCP: {
3132		struct tcphdr *tcph;
3133
3134		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3135		tcph->source	= sport;
3136		tcph->dest	= dport;
3137		tcph->doff	= sizeof(struct tcphdr) / 4;
3138		tcph->rst = 1;
3139		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3140					    src, dst, 0);
3141		break;
3142	}
3143	case IPPROTO_ICMP: {
3144		struct icmphdr *icmph;
3145
3146		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3147		icmph->type = ICMP_ECHO;
3148		icmph->code = 0;
3149	}
3150	}
3151
3152	return skb;
3153}
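/*
 * The helper above fabricates a minimal dummy packet (an IPv4 header
 * plus a bare UDP, TCP or ICMP echo header) so that an RTM_GETROUTE
 * request naming an input interface can be pushed through the real
 * input path in inet_rtm_getroute() below.  Carrying the requested
 * protocol and ports lets the lookup see the same keys a genuine packet
 * would, which matters, for example, for L4-based multipath hashing and
 * for fib rules that match on protocol or port.
 */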
3154
3155static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3156				       const struct nlmsghdr *nlh,
3157				       struct nlattr **tb,
3158				       struct netlink_ext_ack *extack)
3159{
3160	struct rtmsg *rtm;
3161	int i, err;
3162
3163	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3164		NL_SET_ERR_MSG(extack,
3165			       "ipv4: Invalid header for route get request");
3166		return -EINVAL;
3167	}
3168
3169	if (!netlink_strict_get_check(skb))
3170		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3171					      rtm_ipv4_policy, extack);
3172
3173	rtm = nlmsg_data(nlh);
3174	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3175	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3176	    rtm->rtm_table || rtm->rtm_protocol ||
3177	    rtm->rtm_scope || rtm->rtm_type) {
3178		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3179		return -EINVAL;
3180	}
3181
3182	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3183			       RTM_F_LOOKUP_TABLE |
3184			       RTM_F_FIB_MATCH)) {
3185		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3186		return -EINVAL;
3187	}
3188
3189	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3190					    rtm_ipv4_policy, extack);
3191	if (err)
3192		return err;
3193
3194	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3195	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3196		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3197		return -EINVAL;
3198	}
3199
3200	for (i = 0; i <= RTA_MAX; i++) {
3201		if (!tb[i])
3202			continue;
3203
3204		switch (i) {
3205		case RTA_IIF:
3206		case RTA_OIF:
3207		case RTA_SRC:
3208		case RTA_DST:
3209		case RTA_IP_PROTO:
3210		case RTA_SPORT:
3211		case RTA_DPORT:
3212		case RTA_MARK:
3213		case RTA_UID:
3214			break;
3215		default:
3216			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3217			return -EINVAL;
3218		}
3219	}
3220
3221	return 0;
3222}
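/*
 * Request validation is two-tiered: sockets that did not opt in to
 * strict checking get the old lenient nlmsg_parse_deprecated() pass,
 * while strict requesters must send a clean rtmsg header (src/dst
 * lengths of 0 or 32, no table/protocol/scope/type), only the
 * RTM_F_NOTIFY, RTM_F_LOOKUP_TABLE and RTM_F_FIB_MATCH flags, and only
 * the attributes listed in the switch above (RTA_IIF, RTA_OIF, RTA_SRC,
 * RTA_DST, RTA_IP_PROTO, RTA_SPORT, RTA_DPORT, RTA_MARK, RTA_UID).
 */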
3223
3224static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3225			     struct netlink_ext_ack *extack)
3226{
3227	struct net *net = sock_net(in_skb->sk);
3228	struct nlattr *tb[RTA_MAX+1];
3229	u32 table_id = RT_TABLE_MAIN;
3230	__be16 sport = 0, dport = 0;
3231	struct fib_result res = {};
3232	u8 ip_proto = IPPROTO_UDP;
3233	struct rtable *rt = NULL;
3234	struct sk_buff *skb;
3235	struct rtmsg *rtm;
3236	struct flowi4 fl4 = {};
3237	__be32 dst = 0;
3238	__be32 src = 0;
3239	kuid_t uid;
3240	u32 iif;
3241	int err;
3242	int mark;
3243
3244	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3245	if (err < 0)
3246		return err;
3247
3248	rtm = nlmsg_data(nlh);
3249	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3250	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3251	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3252	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3253	if (tb[RTA_UID])
3254		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3255	else
3256		uid = (iif ? INVALID_UID : current_uid());
3257
3258	if (tb[RTA_IP_PROTO]) {
3259		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3260						  &ip_proto, AF_INET, extack);
3261		if (err)
3262			return err;
3263	}
3264
3265	if (tb[RTA_SPORT])
3266		sport = nla_get_be16(tb[RTA_SPORT]);
3267
3268	if (tb[RTA_DPORT])
3269		dport = nla_get_be16(tb[RTA_DPORT]);
3270
3271	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3272	if (!skb)
3273		return -ENOBUFS;
3274
3275	fl4.daddr = dst;
3276	fl4.saddr = src;
3277	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3278	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3279	fl4.flowi4_mark = mark;
3280	fl4.flowi4_uid = uid;
3281	if (sport)
3282		fl4.fl4_sport = sport;
3283	if (dport)
3284		fl4.fl4_dport = dport;
3285	fl4.flowi4_proto = ip_proto;
3286
3287	rcu_read_lock();
3288
3289	if (iif) {
3290		struct net_device *dev;
3291
3292		dev = dev_get_by_index_rcu(net, iif);
3293		if (!dev) {
3294			err = -ENODEV;
3295			goto errout_rcu;
3296		}
3297
3298		fl4.flowi4_iif = iif; /* for rt_fill_info */
3299		skb->dev	= dev;
3300		skb->mark	= mark;
3301		err = ip_route_input_rcu(skb, dst, src,
3302					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3303					 &res);
3304
3305		rt = skb_rtable(skb);
3306		if (err == 0 && rt->dst.error)
3307			err = -rt->dst.error;
3308	} else {
3309		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3310		skb->dev = net->loopback_dev;
3311		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3312		err = 0;
3313		if (IS_ERR(rt))
3314			err = PTR_ERR(rt);
3315		else
3316			skb_dst_set(skb, &rt->dst);
3317	}
3318
3319	if (err)
3320		goto errout_rcu;
3321
3322	if (rtm->rtm_flags & RTM_F_NOTIFY)
3323		rt->rt_flags |= RTCF_NOTIFY;
3324
3325	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3326		table_id = res.table ? res.table->tb_id : 0;
3327
3328	/* reset skb for netlink reply msg */
3329	skb_trim(skb, 0);
3330	skb_reset_network_header(skb);
3331	skb_reset_transport_header(skb);
3332	skb_reset_mac_header(skb);
3333
3334	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3335		struct fib_rt_info fri;
3336
3337		if (!res.fi) {
3338			err = fib_props[res.type].error;
3339			if (!err)
3340				err = -EHOSTUNREACH;
3341			goto errout_rcu;
3342		}
3343		fri.fi = res.fi;
3344		fri.tb_id = table_id;
3345		fri.dst = res.prefix;
3346		fri.dst_len = res.prefixlen;
3347		fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
3348		fri.type = rt->rt_type;
3349		fri.offload = 0;
3350		fri.trap = 0;
3351		fri.offload_failed = 0;
3352		if (res.fa_head) {
3353			struct fib_alias *fa;
3354
3355			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3356				u8 slen = 32 - fri.dst_len;
3357
3358				if (fa->fa_slen == slen &&
3359				    fa->tb_id == fri.tb_id &&
3360				    fa->fa_dscp == fri.dscp &&
3361				    fa->fa_info == res.fi &&
3362				    fa->fa_type == fri.type) {
3363					fri.offload = READ_ONCE(fa->offload);
3364					fri.trap = READ_ONCE(fa->trap);
3365					fri.offload_failed =
3366						READ_ONCE(fa->offload_failed);
3367					break;
3368				}
3369			}
3370		}
3371		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3372				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3373	} else {
3374		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3375				   NETLINK_CB(in_skb).portid,
3376				   nlh->nlmsg_seq, 0);
3377	}
3378	if (err < 0)
3379		goto errout_rcu;
3380
3381	rcu_read_unlock();
3382
3383	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3384
3385errout_free:
3386	return err;
3387errout_rcu:
3388	rcu_read_unlock();
3389	kfree_skb(skb);
3390	goto errout_free;
3391}
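/*
 * This is the handler behind "ip route get".  Roughly, iproute2 options
 * map onto the attributes parsed above (addresses and the device name
 * here are only examples):
 *
 *	$ ip route get 192.0.2.1                      -> RTA_DST
 *	$ ip route get 192.0.2.1 mark 0x10 uid 1000   -> + RTA_MARK, RTA_UID
 *	$ ip route get 192.0.2.1 fibmatch             -> + RTM_F_FIB_MATCH
 *	$ ip route get 192.0.2.1 from 198.51.100.7 iif eth0
 *	                                              -> + RTA_SRC, RTA_IIF
 *
 * With RTA_IIF present the dummy skb from inet_rtm_getroute_build_skb()
 * is run through ip_route_input_rcu(); otherwise an output lookup is
 * done with ip_route_output_key_hash_rcu(), and the result is sent back
 * to the requester as a unicast RTM_NEWROUTE.
 */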
3392
3393void ip_rt_multicast_event(struct in_device *in_dev)
3394{
3395	rt_cache_flush(dev_net(in_dev->dev));
3396}
3397
3398#ifdef CONFIG_SYSCTL
3399static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3400static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3401static int ip_rt_gc_elasticity __read_mostly	= 8;
3402static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3403
3404static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3405		void *buffer, size_t *lenp, loff_t *ppos)
3406{
3407	struct net *net = (struct net *)__ctl->extra1;
3408
3409	if (write) {
3410		rt_cache_flush(net);
3411		fnhe_genid_bump(net);
3412		return 0;
3413	}
3414
3415	return -EINVAL;
3416}
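/*
 * Any write to this sysctl flushes the route cache and bumps the fnhe
 * generation id; the value written is ignored and reads fail with
 * -EINVAL.  Since the netns table below is registered under
 * "net/ipv4/route", the usual way to trigger it is, for example:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */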
3417
3418static struct ctl_table ipv4_route_table[] = {
3419	{
3420		.procname	= "gc_thresh",
3421		.data		= &ipv4_dst_ops.gc_thresh,
3422		.maxlen		= sizeof(int),
3423		.mode		= 0644,
3424		.proc_handler	= proc_dointvec,
3425	},
3426	{
3427		.procname	= "max_size",
3428		.data		= &ip_rt_max_size,
3429		.maxlen		= sizeof(int),
3430		.mode		= 0644,
3431		.proc_handler	= proc_dointvec,
3432	},
3433	{
3434		/*  Deprecated. Use gc_min_interval_ms */
3435
3436		.procname	= "gc_min_interval",
3437		.data		= &ip_rt_gc_min_interval,
3438		.maxlen		= sizeof(int),
3439		.mode		= 0644,
3440		.proc_handler	= proc_dointvec_jiffies,
3441	},
3442	{
3443		.procname	= "gc_min_interval_ms",
3444		.data		= &ip_rt_gc_min_interval,
3445		.maxlen		= sizeof(int),
3446		.mode		= 0644,
3447		.proc_handler	= proc_dointvec_ms_jiffies,
3448	},
3449	{
3450		.procname	= "gc_timeout",
3451		.data		= &ip_rt_gc_timeout,
3452		.maxlen		= sizeof(int),
3453		.mode		= 0644,
3454		.proc_handler	= proc_dointvec_jiffies,
3455	},
3456	{
3457		.procname	= "gc_interval",
3458		.data		= &ip_rt_gc_interval,
3459		.maxlen		= sizeof(int),
3460		.mode		= 0644,
3461		.proc_handler	= proc_dointvec_jiffies,
3462	},
3463	{
3464		.procname	= "redirect_load",
3465		.data		= &ip_rt_redirect_load,
3466		.maxlen		= sizeof(int),
3467		.mode		= 0644,
3468		.proc_handler	= proc_dointvec,
3469	},
3470	{
3471		.procname	= "redirect_number",
3472		.data		= &ip_rt_redirect_number,
3473		.maxlen		= sizeof(int),
3474		.mode		= 0644,
3475		.proc_handler	= proc_dointvec,
3476	},
3477	{
3478		.procname	= "redirect_silence",
3479		.data		= &ip_rt_redirect_silence,
3480		.maxlen		= sizeof(int),
3481		.mode		= 0644,
3482		.proc_handler	= proc_dointvec,
3483	},
3484	{
3485		.procname	= "error_cost",
3486		.data		= &ip_rt_error_cost,
3487		.maxlen		= sizeof(int),
3488		.mode		= 0644,
3489		.proc_handler	= proc_dointvec,
3490	},
3491	{
3492		.procname	= "error_burst",
3493		.data		= &ip_rt_error_burst,
3494		.maxlen		= sizeof(int),
3495		.mode		= 0644,
3496		.proc_handler	= proc_dointvec,
3497	},
3498	{
3499		.procname	= "gc_elasticity",
3500		.data		= &ip_rt_gc_elasticity,
3501		.maxlen		= sizeof(int),
3502		.mode		= 0644,
3503		.proc_handler	= proc_dointvec,
3504	},
3505	{ }
3506};
3507
3508static const char ipv4_route_flush_procname[] = "flush";
3509
3510static struct ctl_table ipv4_route_netns_table[] = {
3511	{
3512		.procname	= ipv4_route_flush_procname,
3513		.maxlen		= sizeof(int),
3514		.mode		= 0200,
3515		.proc_handler	= ipv4_sysctl_rtcache_flush,
3516	},
3517	{
3518		.procname       = "min_pmtu",
3519		.data           = &init_net.ipv4.ip_rt_min_pmtu,
3520		.maxlen         = sizeof(int),
3521		.mode           = 0644,
3522		.proc_handler   = proc_dointvec_minmax,
3523		.extra1         = &ip_min_valid_pmtu,
3524	},
3525	{
3526		.procname       = "mtu_expires",
3527		.data           = &init_net.ipv4.ip_rt_mtu_expires,
3528		.maxlen         = sizeof(int),
3529		.mode           = 0644,
3530		.proc_handler   = proc_dointvec_jiffies,
3531	},
3532	{
3533		.procname   = "min_adv_mss",
3534		.data       = &init_net.ipv4.ip_rt_min_advmss,
3535		.maxlen     = sizeof(int),
3536		.mode       = 0644,
3537		.proc_handler   = proc_dointvec,
3538	},
3539	{ },
3540};
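/*
 * ipv4_route_table above carries the global tunables (plain global
 * variables, registered only for the initial namespace via
 * ip_static_sysctl_init() at the end of this file), while
 * ipv4_route_netns_table is the per network namespace set: its .data
 * pointers are rebased onto each struct net in sysctl_route_net_init()
 * below, so every namespace gets its own flush trigger, min_pmtu,
 * mtu_expires and min_adv_mss under /proc/sys/net/ipv4/route/.
 */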
3541
3542static __net_init int sysctl_route_net_init(struct net *net)
3543{
3544	struct ctl_table *tbl;
3545	size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
3546
3547	tbl = ipv4_route_netns_table;
3548	if (!net_eq(net, &init_net)) {
3549		int i;
3550
3551		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
3552		if (!tbl)
3553			goto err_dup;
3554
3555		/* Don't export non-whitelisted sysctls to unprivileged users */
3556		if (net->user_ns != &init_user_ns) {
3557			if (tbl[0].procname != ipv4_route_flush_procname) {
3558				tbl[0].procname = NULL;
3559				table_size = 0;
3560			}
3561		}
3562
3563		/* Update the variables to point into the current struct net
3564		 * except for the first element, flush
3565		 */
3566		for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
3567			tbl[i].data += (void *)net - (void *)&init_net;
3568	}
3569	tbl[0].extra1 = net;
3570
3571	net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
3572						     tbl, table_size);
3573	if (!net->ipv4.route_hdr)
3574		goto err_reg;
3575	return 0;
3576
3577err_reg:
3578	if (tbl != ipv4_route_netns_table)
3579		kfree(tbl);
3580err_dup:
3581	return -ENOMEM;
3582}
3583
3584static __net_exit void sysctl_route_net_exit(struct net *net)
3585{
3586	struct ctl_table *tbl;
3587
3588	tbl = net->ipv4.route_hdr->ctl_table_arg;
3589	unregister_net_sysctl_table(net->ipv4.route_hdr);
3590	BUG_ON(tbl == ipv4_route_netns_table);
3591	kfree(tbl);
3592}
3593
3594static __net_initdata struct pernet_operations sysctl_route_ops = {
3595	.init = sysctl_route_net_init,
3596	.exit = sysctl_route_net_exit,
3597};
3598#endif
3599
3600static __net_init int netns_ip_rt_init(struct net *net)
3601{
3602	/* Set default value for namespaceified sysctls */
3603	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
3604	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
3605	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
3606	return 0;
3607}
3608
3609static struct pernet_operations __net_initdata ip_rt_ops = {
3610	.init = netns_ip_rt_init,
3611};
3612
3613static __net_init int rt_genid_init(struct net *net)
3614{
3615	atomic_set(&net->ipv4.rt_genid, 0);
3616	atomic_set(&net->fnhe_genid, 0);
3617	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
3618	return 0;
3619}
3620
3621static __net_initdata struct pernet_operations rt_genid_ops = {
3622	.init = rt_genid_init,
3623};
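/*
 * rt_genid and fnhe_genid are per-namespace generation counters:
 * rt_cache_flush() (earlier in this file) invalidates cached routes by
 * bumping rt_genid, so any dst carrying an older generation fails its
 * validity check and is rebuilt, and fnhe_genid_bump() likewise retires
 * the accumulated nexthop exceptions (compare the genid test in
 * fnhe_dump_bucket() above).  dev_addr_genid simply starts from a
 * random value rather than zero.
 */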
3624
3625static int __net_init ipv4_inetpeer_init(struct net *net)
3626{
3627	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3628
3629	if (!bp)
3630		return -ENOMEM;
3631	inet_peer_base_init(bp);
3632	net->ipv4.peers = bp;
3633	return 0;
3634}
3635
3636static void __net_exit ipv4_inetpeer_exit(struct net *net)
3637{
3638	struct inet_peer_base *bp = net->ipv4.peers;
3639
3640	net->ipv4.peers = NULL;
3641	inetpeer_invalidate_tree(bp);
3642	kfree(bp);
3643}
3644
3645static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3646	.init	=	ipv4_inetpeer_init,
3647	.exit	=	ipv4_inetpeer_exit,
3648};
3649
3650#ifdef CONFIG_IP_ROUTE_CLASSID
3651struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3652#endif /* CONFIG_IP_ROUTE_CLASSID */
3653
3654int __init ip_rt_init(void)
3655{
3656	void *idents_hash;
3657	int cpu;
3658
3659	/* For modern hosts, this will use 2 MB of memory */
3660	idents_hash = alloc_large_system_hash("IP idents",
3661					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3662					      0,
3663					      16, /* one bucket per 64 KB */
3664					      HASH_ZERO,
3665					      NULL,
3666					      &ip_idents_mask,
3667					      2048,
3668					      256*1024);
3669
3670	ip_idents = idents_hash;
3671
3672	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3673
3674	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3675
3676	for_each_possible_cpu(cpu) {
3677		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3678
3679		INIT_LIST_HEAD(&ul->head);
3680		INIT_LIST_HEAD(&ul->quarantine);
3681		spin_lock_init(&ul->lock);
3682	}
3683#ifdef CONFIG_IP_ROUTE_CLASSID
3684	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3685	if (!ip_rt_acct)
3686		panic("IP: failed to allocate ip_rt_acct\n");
3687#endif
3688
3689	ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
3690					      SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3691
3692	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3693
3694	if (dst_entries_init(&ipv4_dst_ops) < 0)
3695		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3696
3697	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3698		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3699
3700	ipv4_dst_ops.gc_thresh = ~0;
3701	ip_rt_max_size = INT_MAX;
3702
3703	devinet_init();
3704	ip_fib_init();
3705
3706	if (ip_rt_proc_init())
3707		pr_err("Unable to create route proc files\n");
3708#ifdef CONFIG_XFRM
3709	xfrm_init();
3710	xfrm4_init();
3711#endif
3712	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3713		      RTNL_FLAG_DOIT_UNLOCKED);
3714
3715#ifdef CONFIG_SYSCTL
3716	register_pernet_subsys(&sysctl_route_ops);
3717#endif
3718	register_pernet_subsys(&ip_rt_ops);
3719	register_pernet_subsys(&rt_genid_ops);
3720	register_pernet_subsys(&ipv4_inetpeer_ops);
3721	return 0;
3722}
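/*
 * ip_rt_init() in short: allocate the IP identification hash
 * (ip_idents/ip_tstamps), initialise the per-CPU uncached route lists,
 * create the rtable slab cache and the dst entry counters, bring up
 * devinet and the FIB, register the /proc files and xfrm hooks, install
 * the RTM_GETROUTE handler, and finally register the pernet subsystems
 * for sysctls, per-netns route defaults, generation ids and the
 * inetpeer base.
 */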
3723
3724#ifdef CONFIG_SYSCTL
3725/*
3726 * We really need to sanitize the damn ipv4 init order, then all
3727 * this nonsense will go away.
3728 */
3729void __init ip_static_sysctl_init(void)
3730{
3731	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3732}
3733#endif