v4.17
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		ROUTE - implementation of the IP router.
   7 *
   8 * Authors:	Ross Biro
   9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *		Alan Cox	:	Verify area fixes.
  16 *		Alan Cox	:	cli() protects routing changes
  17 *		Rui Oliveira	:	ICMP routing table updates
  18 *		(rco@di.uminho.pt)	Routing table insertion and update
  19 *		Linus Torvalds	:	Rewrote bits to be sensible
  20 *		Alan Cox	:	Added BSD route gw semantics
  21 *		Alan Cox	:	Super /proc >4K
  22 *		Alan Cox	:	MTU in route table
  23 *		Alan Cox	: 	MSS actually. Also added the window
  24 *					clamper.
  25 *		Sam Lantinga	:	Fixed route matching in rt_del()
  26 *		Alan Cox	:	Routing cache support.
  27 *		Alan Cox	:	Removed compatibility cruft.
  28 *		Alan Cox	:	RTF_REJECT support.
  29 *		Alan Cox	:	TCP irtt support.
  30 *		Jonathan Naylor	:	Added Metric support.
  31 *	Miquel van Smoorenburg	:	BSD API fixes.
  32 *	Miquel van Smoorenburg	:	Metrics.
  33 *		Alan Cox	:	Use __u32 properly
  34 *		Alan Cox	:	Aligned routing errors more closely with BSD,
  35 *					though our system is still very different.
  36 *		Alan Cox	:	Faster /proc handling
  37 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  38 *					routing caches and better behaviour.
  39 *
  40 *		Olaf Erb	:	irtt wasn't being copied right.
  41 *		Bjorn Ekwall	:	Kerneld route support.
  42 *		Alan Cox	:	Multicast fixed (I hope)
  43 * 		Pavel Krauz	:	Limited broadcast fixed
  44 *		Mike McLagan	:	Routing by source
  45 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  46 *					route.c and rewritten from scratch.
  47 *		Andi Kleen	:	Load-limit warning messages.
  48 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  49 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  50 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  51 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  52 *		Marc Boucher	:	routing by fwmark
  53 *	Robert Olsson		:	Added rt_cache statistics
  54 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  55 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  56 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  57 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  58 *
  59 *		This program is free software; you can redistribute it and/or
  60 *		modify it under the terms of the GNU General Public License
  61 *		as published by the Free Software Foundation; either version
  62 *		2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <linux/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <linux/jhash.h>
  93#include <net/dst.h>
  94#include <net/dst_metadata.h>
  95#include <net/net_namespace.h>
  96#include <net/protocol.h>
  97#include <net/ip.h>
  98#include <net/route.h>
  99#include <net/inetpeer.h>
 100#include <net/sock.h>
 101#include <net/ip_fib.h>
 102#include <net/arp.h>
 103#include <net/tcp.h>
 104#include <net/icmp.h>
 105#include <net/xfrm.h>
 106#include <net/lwtunnel.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 111#endif
 112#include <net/secure_seq.h>
 113#include <net/ip_tunnels.h>
 114#include <net/l3mdev.h>
 115
 116#include "fib_lookup.h"
 117
 118#define RT_FL_TOS(oldflp4) \
 119	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121#define RT_GC_TIMEOUT (300*HZ)
 122
 123static int ip_rt_max_size;
 124static int ip_rt_redirect_number __read_mostly	= 9;
 125static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 126static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 127static int ip_rt_error_cost __read_mostly	= HZ;
 128static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 129static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 130static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 131static int ip_rt_min_advmss __read_mostly	= 256;
 132
 133static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 134
 135/*
 136 *	Interface to generic destination cache.
 137 */
 138
 139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 141static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143static void		 ipv4_link_failure(struct sk_buff *skb);
 144static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145					   struct sk_buff *skb, u32 mtu);
 146static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147					struct sk_buff *skb);
 148static void		ipv4_dst_destroy(struct dst_entry *dst);
 149
 150static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151{
 152	WARN_ON(1);
 153	return NULL;
 154}
 155
 156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157					   struct sk_buff *skb,
 158					   const void *daddr);
 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161static struct dst_ops ipv4_dst_ops = {
 162	.family =		AF_INET,
 163	.check =		ipv4_dst_check,
 164	.default_advmss =	ipv4_default_advmss,
 165	.mtu =			ipv4_mtu,
 166	.cow_metrics =		ipv4_cow_metrics,
 167	.destroy =		ipv4_dst_destroy,
 168	.negative_advice =	ipv4_negative_advice,
 169	.link_failure =		ipv4_link_failure,
 170	.update_pmtu =		ip_rt_update_pmtu,
 171	.redirect =		ip_do_redirect,
 172	.local_out =		__ip_local_out,
 173	.neigh_lookup =		ipv4_neigh_lookup,
 174	.confirm_neigh =	ipv4_confirm_neigh,
 175};
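/* Editor's note (illustrative, not part of the original file): these
 * callbacks are reached through the generic dst layer rather than called
 * directly; e.g. the inline dst_mtu() helper in include/net/dst.h
 * dispatches into the ->mtu() entry above:
 *
 *	u32 mtu = dst_mtu(&rt->dst);	/- ends up in ipv4_mtu() below -/
 *
 * so all IPv4 routes share this single ops table and keep their per-route
 * state in struct rtable itself.
 */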
 176
 177#define ECN_OR_COST(class)	TC_PRIO_##class
 178
 179const __u8 ip_tos2prio[16] = {
 180	TC_PRIO_BESTEFFORT,
 181	ECN_OR_COST(BESTEFFORT),
 182	TC_PRIO_BESTEFFORT,
 183	ECN_OR_COST(BESTEFFORT),
 184	TC_PRIO_BULK,
 185	ECN_OR_COST(BULK),
 186	TC_PRIO_BULK,
 187	ECN_OR_COST(BULK),
 188	TC_PRIO_INTERACTIVE,
 189	ECN_OR_COST(INTERACTIVE),
 190	TC_PRIO_INTERACTIVE,
 191	ECN_OR_COST(INTERACTIVE),
 192	TC_PRIO_INTERACTIVE_BULK,
 193	ECN_OR_COST(INTERACTIVE_BULK),
 194	TC_PRIO_INTERACTIVE_BULK,
 195	ECN_OR_COST(INTERACTIVE_BULK)
 196};
 197EXPORT_SYMBOL(ip_tos2prio);
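/* Editor's note (illustrative sketch, not part of the original file): the
 * table above is indexed by the four TOS bits of the IPv4 TOS byte
 * (IPTOS_TOS(tos) >> 1), so each pair of entries is a base priority and
 * its ECN_OR_COST() variant.  This mirrors the rt_tos2priority() helper
 * in include/net/route.h:
 *
 *	static inline char example_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. a TOS of IPTOS_LOWDELAY (0x10) maps to TC_PRIO_INTERACTIVE.
 */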
 198
 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202#ifdef CONFIG_PROC_FS
 203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204{
 205	if (*pos)
 206		return NULL;
 207	return SEQ_START_TOKEN;
 208}
 209
 210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211{
 212	++*pos;
 213	return NULL;
 214}
 215
 216static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217{
 218}
 219
 220static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221{
 222	if (v == SEQ_START_TOKEN)
 223		seq_printf(seq, "%-127s\n",
 224			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226			   "HHUptod\tSpecDst");
 227	return 0;
 228}
 229
 230static const struct seq_operations rt_cache_seq_ops = {
 231	.start  = rt_cache_seq_start,
 232	.next   = rt_cache_seq_next,
 233	.stop   = rt_cache_seq_stop,
 234	.show   = rt_cache_seq_show,
 235};
 236
 237static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238{
 239	return seq_open(file, &rt_cache_seq_ops);
 240}
 241
 242static const struct file_operations rt_cache_seq_fops = {
 243	.open	 = rt_cache_seq_open,
 244	.read	 = seq_read,
 245	.llseek	 = seq_lseek,
 246	.release = seq_release,
 247};
 248
 249
 250static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251{
 252	int cpu;
 253
 254	if (*pos == 0)
 255		return SEQ_START_TOKEN;
 256
 257	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258		if (!cpu_possible(cpu))
 259			continue;
 260		*pos = cpu+1;
 261		return &per_cpu(rt_cache_stat, cpu);
 262	}
 263	return NULL;
 264}
 265
 266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267{
 268	int cpu;
 269
 270	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271		if (!cpu_possible(cpu))
 272			continue;
 273		*pos = cpu+1;
 274		return &per_cpu(rt_cache_stat, cpu);
 275	}
 276	return NULL;
 277
 278}
 279
 280static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281{
 282
 283}
 284
 285static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286{
 287	struct rt_cache_stat *st = v;
 288
 289	if (v == SEQ_START_TOKEN) {
 290		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291		return 0;
 292	}
 293
 294	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296		   dst_entries_get_slow(&ipv4_dst_ops),
 297		   0, /* st->in_hit */
 298		   st->in_slow_tot,
 299		   st->in_slow_mc,
 300		   st->in_no_route,
 301		   st->in_brd,
 302		   st->in_martian_dst,
 303		   st->in_martian_src,
 304
 305		   0, /* st->out_hit */
 306		   st->out_slow_tot,
 307		   st->out_slow_mc,
 308
 309		   0, /* st->gc_total */
 310		   0, /* st->gc_ignored */
 311		   0, /* st->gc_goal_miss */
 312		   0, /* st->gc_dst_overflow */
 313		   0, /* st->in_hlist_search */
 314		   0  /* st->out_hlist_search */
 315		);
 316	return 0;
 317}
 318
 319static const struct seq_operations rt_cpu_seq_ops = {
 320	.start  = rt_cpu_seq_start,
 321	.next   = rt_cpu_seq_next,
 322	.stop   = rt_cpu_seq_stop,
 323	.show   = rt_cpu_seq_show,
 324};
 325
 326
 327static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328{
 329	return seq_open(file, &rt_cpu_seq_ops);
 330}
 331
 332static const struct file_operations rt_cpu_seq_fops = {
 333	.open	 = rt_cpu_seq_open,
 334	.read	 = seq_read,
 335	.llseek	 = seq_lseek,
 336	.release = seq_release,
 337};
 338
 339#ifdef CONFIG_IP_ROUTE_CLASSID
 340static int rt_acct_proc_show(struct seq_file *m, void *v)
 341{
 342	struct ip_rt_acct *dst, *src;
 343	unsigned int i, j;
 344
 345	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346	if (!dst)
 347		return -ENOMEM;
 348
 349	for_each_possible_cpu(i) {
 350		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351		for (j = 0; j < 256; j++) {
 352			dst[j].o_bytes   += src[j].o_bytes;
 353			dst[j].o_packets += src[j].o_packets;
 354			dst[j].i_bytes   += src[j].i_bytes;
 355			dst[j].i_packets += src[j].i_packets;
 356		}
 357	}
 358
 359	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360	kfree(dst);
 361	return 0;
 362}
 363
 364static int rt_acct_proc_open(struct inode *inode, struct file *file)
 365{
 366	return single_open(file, rt_acct_proc_show, NULL);
 367}
 368
 369static const struct file_operations rt_acct_proc_fops = {
 370	.open		= rt_acct_proc_open,
 371	.read		= seq_read,
 372	.llseek		= seq_lseek,
 373	.release	= single_release,
 374};
 375#endif
 376
 377static int __net_init ip_rt_do_proc_init(struct net *net)
 378{
 379	struct proc_dir_entry *pde;
 380
 381	pde = proc_create("rt_cache", 0444, net->proc_net,
 382			  &rt_cache_seq_fops);
 383	if (!pde)
 384		goto err1;
 385
 386	pde = proc_create("rt_cache", 0444,
 387			  net->proc_net_stat, &rt_cpu_seq_fops);
 388	if (!pde)
 389		goto err2;
 390
 391#ifdef CONFIG_IP_ROUTE_CLASSID
 392	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 393	if (!pde)
 394		goto err3;
 395#endif
 396	return 0;
 397
 398#ifdef CONFIG_IP_ROUTE_CLASSID
 399err3:
 400	remove_proc_entry("rt_cache", net->proc_net_stat);
 401#endif
 402err2:
 403	remove_proc_entry("rt_cache", net->proc_net);
 404err1:
 405	return -ENOMEM;
 406}
 407
 408static void __net_exit ip_rt_do_proc_exit(struct net *net)
 409{
 410	remove_proc_entry("rt_cache", net->proc_net_stat);
 411	remove_proc_entry("rt_cache", net->proc_net);
 412#ifdef CONFIG_IP_ROUTE_CLASSID
 413	remove_proc_entry("rt_acct", net->proc_net);
 414#endif
 415}
 416
 417static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 418	.init = ip_rt_do_proc_init,
 419	.exit = ip_rt_do_proc_exit,
 420};
 421
 422static int __init ip_rt_proc_init(void)
 423{
 424	return register_pernet_subsys(&ip_rt_proc_ops);
 425}
 426
 427#else
 428static inline int ip_rt_proc_init(void)
 429{
 430	return 0;
 431}
 432#endif /* CONFIG_PROC_FS */
 433
 434static inline bool rt_is_expired(const struct rtable *rth)
 435{
 436	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 437}
 438
 439void rt_cache_flush(struct net *net)
 440{
 441	rt_genid_bump_ipv4(net);
 442}
 443
 444static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 445					   struct sk_buff *skb,
 446					   const void *daddr)
 447{
 448	struct net_device *dev = dst->dev;
 449	const __be32 *pkey = daddr;
 450	const struct rtable *rt;
 451	struct neighbour *n;
 452
 453	rt = (const struct rtable *) dst;
 454	if (rt->rt_gateway)
 455		pkey = (const __be32 *) &rt->rt_gateway;
 456	else if (skb)
 457		pkey = &ip_hdr(skb)->daddr;
 458
 459	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 460	if (n)
 461		return n;
 462	return neigh_create(&arp_tbl, pkey, dev);
 463}
 464
 465static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 466{
 467	struct net_device *dev = dst->dev;
 468	const __be32 *pkey = daddr;
 469	const struct rtable *rt;
 470
 471	rt = (const struct rtable *)dst;
 472	if (rt->rt_gateway)
 473		pkey = (const __be32 *)&rt->rt_gateway;
 474	else if (!daddr ||
 475		 (rt->rt_flags &
 476		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 477		return;
 478
 479	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 480}
 481
 482#define IP_IDENTS_SZ 2048u
 483
 484static atomic_t *ip_idents __read_mostly;
 485static u32 *ip_tstamps __read_mostly;
 486
 487/* In order to protect privacy, we add a perturbation to identifiers
  488 * if one generator is seldom used. This makes it hard for an attacker
 489 * to infer how many packets were sent between two points in time.
 490 */
 491u32 ip_idents_reserve(u32 hash, int segs)
 492{
 493	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 494	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 495	u32 old = READ_ONCE(*p_tstamp);
 496	u32 now = (u32)jiffies;
 497	u32 new, delta = 0;
 498
 499	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 500		delta = prandom_u32_max(now - old);
 501
 502	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
 503	do {
 504		old = (u32)atomic_read(p_id);
 505		new = old + delta + segs;
 506	} while (atomic_cmpxchg(p_id, old, new) != old);
 507
 508	return new - segs;
 509}
 510EXPORT_SYMBOL(ip_idents_reserve);
 511
 512void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 513{
 514	static u32 ip_idents_hashrnd __read_mostly;
 515	u32 hash, id;
 516
 517	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 518
 519	hash = jhash_3words((__force u32)iph->daddr,
 520			    (__force u32)iph->saddr,
 521			    iph->protocol ^ net_hash_mix(net),
 522			    ip_idents_hashrnd);
 523	id = ip_idents_reserve(hash, segs);
 524	iph->id = htons(id);
 525}
 526EXPORT_SYMBOL(__ip_select_ident);
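/* Editor's note (illustrative usage, not part of the original file): a
 * transmit path that has just built an IPv4 header lets this helper pick
 * the datagram ID; for a GSO packet it reserves one ID per segment, e.g.:
 *
 *	struct iphdr *iph = ip_hdr(skb);
 *	int segs = skb_shinfo(skb)->gso_segs ?: 1;
 *
 *	__ip_select_ident(net, iph, segs);
 *
 * ip_idents_reserve() then returns a block of "segs" consecutive IDs from
 * the bucket chosen by hashing (daddr, saddr, protocol), with the random
 * perturbation applied when that bucket has been idle.
 */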
 527
 528static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 529			     const struct sock *sk,
 530			     const struct iphdr *iph,
 531			     int oif, u8 tos,
 532			     u8 prot, u32 mark, int flow_flags)
 533{
 534	if (sk) {
 535		const struct inet_sock *inet = inet_sk(sk);
 536
 537		oif = sk->sk_bound_dev_if;
 538		mark = sk->sk_mark;
 539		tos = RT_CONN_FLAGS(sk);
 540		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 541	}
 542	flowi4_init_output(fl4, oif, mark, tos,
 543			   RT_SCOPE_UNIVERSE, prot,
 544			   flow_flags,
 545			   iph->daddr, iph->saddr, 0, 0,
 546			   sock_net_uid(net, sk));
 547}
 548
 549static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550			       const struct sock *sk)
 551{
 552	const struct net *net = dev_net(skb->dev);
 553	const struct iphdr *iph = ip_hdr(skb);
 554	int oif = skb->dev->ifindex;
 555	u8 tos = RT_TOS(iph->tos);
 556	u8 prot = iph->protocol;
 557	u32 mark = skb->mark;
 558
 559	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 560}
 561
 562static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 563{
 564	const struct inet_sock *inet = inet_sk(sk);
 565	const struct ip_options_rcu *inet_opt;
 566	__be32 daddr = inet->inet_daddr;
 567
 568	rcu_read_lock();
 569	inet_opt = rcu_dereference(inet->inet_opt);
 570	if (inet_opt && inet_opt->opt.srr)
 571		daddr = inet_opt->opt.faddr;
 572	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 573			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 574			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 575			   inet_sk_flowi_flags(sk),
 576			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 577	rcu_read_unlock();
 578}
 579
 580static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 581				 const struct sk_buff *skb)
 582{
 583	if (skb)
 584		build_skb_flow_key(fl4, skb, sk);
 585	else
 586		build_sk_flow_key(fl4, sk);
 587}
 588
 589static DEFINE_SPINLOCK(fnhe_lock);
 590
 591static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 592{
 593	struct rtable *rt;
 594
 595	rt = rcu_dereference(fnhe->fnhe_rth_input);
 596	if (rt) {
 597		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 598		dst_dev_put(&rt->dst);
 599		dst_release(&rt->dst);
 600	}
 601	rt = rcu_dereference(fnhe->fnhe_rth_output);
 602	if (rt) {
 603		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 604		dst_dev_put(&rt->dst);
 605		dst_release(&rt->dst);
 606	}
 607}
 608
 609static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 610{
 611	struct fib_nh_exception *fnhe, *oldest;
 612
 613	oldest = rcu_dereference(hash->chain);
 614	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 615	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 616		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 617			oldest = fnhe;
 618	}
 619	fnhe_flush_routes(oldest);
 620	return oldest;
 621}
 622
 623static inline u32 fnhe_hashfun(__be32 daddr)
 624{
 625	static u32 fnhe_hashrnd __read_mostly;
 626	u32 hval;
 627
 628	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 629	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 630	return hash_32(hval, FNHE_HASH_SHIFT);
 631}
 632
 633static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 634{
 635	rt->rt_pmtu = fnhe->fnhe_pmtu;
 636	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 637	rt->dst.expires = fnhe->fnhe_expires;
 638
 639	if (fnhe->fnhe_gw) {
 640		rt->rt_flags |= RTCF_REDIRECTED;
 641		rt->rt_gateway = fnhe->fnhe_gw;
 642		rt->rt_uses_gateway = 1;
 643	}
 644}
 645
 646static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 647				  u32 pmtu, bool lock, unsigned long expires)
 648{
 649	struct fnhe_hash_bucket *hash;
 650	struct fib_nh_exception *fnhe;
 651	struct rtable *rt;
 652	u32 genid, hval;
 653	unsigned int i;
 654	int depth;
 655
 656	genid = fnhe_genid(dev_net(nh->nh_dev));
 657	hval = fnhe_hashfun(daddr);
 658
 659	spin_lock_bh(&fnhe_lock);
 660
 661	hash = rcu_dereference(nh->nh_exceptions);
 662	if (!hash) {
 663		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 664		if (!hash)
 665			goto out_unlock;
 666		rcu_assign_pointer(nh->nh_exceptions, hash);
 667	}
 668
 669	hash += hval;
 670
 671	depth = 0;
 672	for (fnhe = rcu_dereference(hash->chain); fnhe;
 673	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 674		if (fnhe->fnhe_daddr == daddr)
 675			break;
 676		depth++;
 677	}
 678
 679	if (fnhe) {
 680		if (fnhe->fnhe_genid != genid)
 681			fnhe->fnhe_genid = genid;
 682		if (gw)
 683			fnhe->fnhe_gw = gw;
 684		if (pmtu) {
 685			fnhe->fnhe_pmtu = pmtu;
 686			fnhe->fnhe_mtu_locked = lock;
 687		}
 688		fnhe->fnhe_expires = max(1UL, expires);
 689		/* Update all cached dsts too */
 690		rt = rcu_dereference(fnhe->fnhe_rth_input);
 691		if (rt)
 692			fill_route_from_fnhe(rt, fnhe);
 693		rt = rcu_dereference(fnhe->fnhe_rth_output);
 694		if (rt)
 695			fill_route_from_fnhe(rt, fnhe);
 696	} else {
 697		if (depth > FNHE_RECLAIM_DEPTH)
 698			fnhe = fnhe_oldest(hash);
 699		else {
 700			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 701			if (!fnhe)
 702				goto out_unlock;
 703
 704			fnhe->fnhe_next = hash->chain;
 705			rcu_assign_pointer(hash->chain, fnhe);
 706		}
 707		fnhe->fnhe_genid = genid;
 708		fnhe->fnhe_daddr = daddr;
 709		fnhe->fnhe_gw = gw;
 710		fnhe->fnhe_pmtu = pmtu;
 711		fnhe->fnhe_mtu_locked = lock;
 712		fnhe->fnhe_expires = max(1UL, expires);
 713
  714		/* Exception created; mark the cached routes for the nexthop
  715		 * stale, so that anyone caching them rechecks whether this
  716		 * exception applies.
  717		 */
 718		rt = rcu_dereference(nh->nh_rth_input);
 719		if (rt)
 720			rt->dst.obsolete = DST_OBSOLETE_KILL;
 721
 722		for_each_possible_cpu(i) {
 723			struct rtable __rcu **prt;
 724			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 725			rt = rcu_dereference(*prt);
 726			if (rt)
 727				rt->dst.obsolete = DST_OBSOLETE_KILL;
 728		}
 729	}
 730
 731	fnhe->fnhe_stamp = jiffies;
 732
 733out_unlock:
 734	spin_unlock_bh(&fnhe_lock);
 735}
 736
 737static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 738			     bool kill_route)
 739{
 740	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 741	__be32 old_gw = ip_hdr(skb)->saddr;
 742	struct net_device *dev = skb->dev;
 743	struct in_device *in_dev;
 744	struct fib_result res;
 745	struct neighbour *n;
 746	struct net *net;
 747
 748	switch (icmp_hdr(skb)->code & 7) {
 749	case ICMP_REDIR_NET:
 750	case ICMP_REDIR_NETTOS:
 751	case ICMP_REDIR_HOST:
 752	case ICMP_REDIR_HOSTTOS:
 753		break;
 754
 755	default:
 756		return;
 757	}
 758
 759	if (rt->rt_gateway != old_gw)
 760		return;
 761
 762	in_dev = __in_dev_get_rcu(dev);
 763	if (!in_dev)
 764		return;
 765
 766	net = dev_net(dev);
 767	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 768	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 769	    ipv4_is_zeronet(new_gw))
 770		goto reject_redirect;
 771
 772	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 773		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 774			goto reject_redirect;
 775		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 776			goto reject_redirect;
 777	} else {
 778		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 779			goto reject_redirect;
 780	}
 781
 782	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 783	if (!n)
 784		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 785	if (!IS_ERR(n)) {
 786		if (!(n->nud_state & NUD_VALID)) {
 787			neigh_event_send(n, NULL);
 788		} else {
 789			if (fib_lookup(net, fl4, &res, 0) == 0) {
 790				struct fib_nh *nh = &FIB_RES_NH(res);
 791
 792				update_or_create_fnhe(nh, fl4->daddr, new_gw,
 793						0, false,
 794						jiffies + ip_rt_gc_timeout);
 795			}
 796			if (kill_route)
 797				rt->dst.obsolete = DST_OBSOLETE_KILL;
 798			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 799		}
 800		neigh_release(n);
 801	}
 802	return;
 803
 804reject_redirect:
 805#ifdef CONFIG_IP_ROUTE_VERBOSE
 806	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 807		const struct iphdr *iph = (const struct iphdr *) skb->data;
 808		__be32 daddr = iph->daddr;
 809		__be32 saddr = iph->saddr;
 810
 811		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 812				     "  Advised path = %pI4 -> %pI4\n",
 813				     &old_gw, dev->name, &new_gw,
 814				     &saddr, &daddr);
 815	}
 816#endif
 817	;
 818}
 819
 820static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 821{
 822	struct rtable *rt;
 823	struct flowi4 fl4;
 824	const struct iphdr *iph = (const struct iphdr *) skb->data;
 825	struct net *net = dev_net(skb->dev);
 826	int oif = skb->dev->ifindex;
 827	u8 tos = RT_TOS(iph->tos);
 828	u8 prot = iph->protocol;
 829	u32 mark = skb->mark;
 830
 831	rt = (struct rtable *) dst;
 832
 833	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 834	__ip_do_redirect(rt, skb, &fl4, true);
 835}
 836
 837static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 838{
 839	struct rtable *rt = (struct rtable *)dst;
 840	struct dst_entry *ret = dst;
 841
 842	if (rt) {
 843		if (dst->obsolete > 0) {
 844			ip_rt_put(rt);
 845			ret = NULL;
 846		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 847			   rt->dst.expires) {
 848			ip_rt_put(rt);
 849			ret = NULL;
 850		}
 851	}
 852	return ret;
 853}
 854
 855/*
 856 * Algorithm:
 857 *	1. The first ip_rt_redirect_number redirects are sent
  858 *	   with exponential backoff, then we stop sending them altogether,
  859 *	   assuming that the host ignores our redirects.
  860 *	2. If we did not see packets requiring redirects
  861 *	   during ip_rt_redirect_silence, we assume that the host has
  862 *	   forgotten the redirected route and start sending redirects again.
 863 *
 864 * This algorithm is much cheaper and more intelligent than dumb load limiting
 865 * in icmp.c.
 866 *
 867 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 868 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 869 */
 870
 871void ip_rt_send_redirect(struct sk_buff *skb)
 872{
 873	struct rtable *rt = skb_rtable(skb);
 874	struct in_device *in_dev;
 875	struct inet_peer *peer;
 876	struct net *net;
 877	int log_martians;
 878	int vif;
 879
 880	rcu_read_lock();
 881	in_dev = __in_dev_get_rcu(rt->dst.dev);
 882	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 883		rcu_read_unlock();
 884		return;
 885	}
 886	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 887	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 888	rcu_read_unlock();
 889
 890	net = dev_net(rt->dst.dev);
 891	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 892	if (!peer) {
 893		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 894			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 895		return;
 896	}
 897
 898	/* No redirected packets during ip_rt_redirect_silence;
 899	 * reset the algorithm.
 900	 */
 901	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 902		peer->rate_tokens = 0;
 903
  904	/* Too many ignored redirects; do not send anything.
  905	 * Set peer->rate_last to the last seen redirected packet.
  906	 */
 907	if (peer->rate_tokens >= ip_rt_redirect_number) {
 908		peer->rate_last = jiffies;
 909		goto out_put_peer;
 910	}
 911
 912	/* Check for load limit; set rate_last to the latest sent
 913	 * redirect.
 914	 */
 915	if (peer->rate_tokens == 0 ||
 916	    time_after(jiffies,
 917		       (peer->rate_last +
 918			(ip_rt_redirect_load << peer->rate_tokens)))) {
 919		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 920
 921		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 922		peer->rate_last = jiffies;
 923		++peer->rate_tokens;
 924#ifdef CONFIG_IP_ROUTE_VERBOSE
 925		if (log_martians &&
 926		    peer->rate_tokens == ip_rt_redirect_number)
 927			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 928					     &ip_hdr(skb)->saddr, inet_iif(skb),
 929					     &ip_hdr(skb)->daddr, &gw);
 930#endif
 931	}
 932out_put_peer:
 933	inet_putpeer(peer);
 934}
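/* Editor's note (illustrative, not part of the original file): with the
 * defaults above (ip_rt_redirect_number == 9, ip_rt_redirect_load == HZ/50,
 * ip_rt_redirect_silence == (HZ/50) << 10), the minimum gap between
 * successive redirects to one peer doubles each time -- HZ/50 << 1,
 * << 2, ... -- until nine have been sent, after which the peer is assumed
 * to ignore us until it has been quiet for the silence interval.  The
 * gating test, pulled out as a hypothetical helper:
 *
 *	static bool example_can_send_redirect(const struct inet_peer *peer)
 *	{
 *		if (peer->rate_tokens >= ip_rt_redirect_number)
 *			return false;
 *		return peer->rate_tokens == 0 ||
 *		       time_after(jiffies, peer->rate_last +
 *				  (ip_rt_redirect_load << peer->rate_tokens));
 *	}
 */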
 935
 936static int ip_error(struct sk_buff *skb)
 937{
 938	struct rtable *rt = skb_rtable(skb);
 939	struct net_device *dev = skb->dev;
 940	struct in_device *in_dev;
 941	struct inet_peer *peer;
 942	unsigned long now;
 943	struct net *net;
 944	bool send;
 945	int code;
 946
 947	if (netif_is_l3_master(skb->dev)) {
 948		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 949		if (!dev)
 950			goto out;
 951	}
 952
 953	in_dev = __in_dev_get_rcu(dev);
 954
 955	/* IP on this device is disabled. */
 956	if (!in_dev)
 957		goto out;
 958
 959	net = dev_net(rt->dst.dev);
 960	if (!IN_DEV_FORWARD(in_dev)) {
 961		switch (rt->dst.error) {
 962		case EHOSTUNREACH:
 963			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964			break;
 965
 966		case ENETUNREACH:
 967			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968			break;
 969		}
 970		goto out;
 971	}
 972
 973	switch (rt->dst.error) {
 974	case EINVAL:
 975	default:
 976		goto out;
 977	case EHOSTUNREACH:
 978		code = ICMP_HOST_UNREACH;
 979		break;
 980	case ENETUNREACH:
 981		code = ICMP_NET_UNREACH;
 982		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 983		break;
 984	case EACCES:
 985		code = ICMP_PKT_FILTERED;
 986		break;
 987	}
 988
 989	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990			       l3mdev_master_ifindex(skb->dev), 1);
 991
 992	send = true;
 993	if (peer) {
 994		now = jiffies;
 995		peer->rate_tokens += now - peer->rate_last;
 996		if (peer->rate_tokens > ip_rt_error_burst)
 997			peer->rate_tokens = ip_rt_error_burst;
 998		peer->rate_last = now;
 999		if (peer->rate_tokens >= ip_rt_error_cost)
1000			peer->rate_tokens -= ip_rt_error_cost;
1001		else
1002			send = false;
1003		inet_putpeer(peer);
1004	}
1005	if (send)
1006		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008out:	kfree_skb(skb);
1009	return 0;
1010}
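/* Editor's note (illustrative, not part of the original file): the
 * peer->rate_tokens accounting above is a simple token bucket -- tokens
 * accrue at one per jiffy of elapsed time, are capped at ip_rt_error_burst
 * (5 * HZ), and each ICMP error sent costs ip_rt_error_cost (HZ) tokens.
 * With the defaults that works out to roughly one destination-unreachable
 * per second per source, with a burst allowance of about five.
 */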
1011
1012static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013{
1014	struct dst_entry *dst = &rt->dst;
1015	struct fib_result res;
1016	bool lock = false;
1017
1018	if (ip_mtu_locked(dst))
1019		return;
1020
1021	if (ipv4_mtu(dst) < mtu)
1022		return;
1023
1024	if (mtu < ip_rt_min_pmtu) {
1025		lock = true;
1026		mtu = ip_rt_min_pmtu;
1027	}
1028
1029	if (rt->rt_pmtu == mtu &&
1030	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1031		return;
1032
1033	rcu_read_lock();
1034	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1035		struct fib_nh *nh = &FIB_RES_NH(res);
1036
1037		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1038				      jiffies + ip_rt_mtu_expires);
1039	}
1040	rcu_read_unlock();
1041}
1042
1043static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044			      struct sk_buff *skb, u32 mtu)
1045{
1046	struct rtable *rt = (struct rtable *) dst;
1047	struct flowi4 fl4;
1048
1049	ip_rt_build_flow_key(&fl4, sk, skb);
1050	__ip_rt_update_pmtu(rt, &fl4, mtu);
1051}
1052
1053void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054		      int oif, u32 mark, u8 protocol, int flow_flags)
1055{
1056	const struct iphdr *iph = (const struct iphdr *) skb->data;
1057	struct flowi4 fl4;
1058	struct rtable *rt;
1059
1060	if (!mark)
1061		mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063	__build_flow_key(net, &fl4, NULL, iph, oif,
1064			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1065	rt = __ip_route_output_key(net, &fl4);
1066	if (!IS_ERR(rt)) {
1067		__ip_rt_update_pmtu(rt, &fl4, mtu);
1068		ip_rt_put(rt);
1069	}
1070}
1071EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
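/* Editor's note (illustrative caller, not part of the original file):
 * protocol and tunnel error handlers typically call this when an ICMP
 * fragmentation-needed message arrives for a flow they own; a hedged
 * sketch (iph here is assumed to be the embedded inner header):
 *
 *	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH &&
 *	    icmp_hdr(skb)->code == ICMP_FRAG_NEEDED)
 *		ipv4_update_pmtu(skb, dev_net(skb->dev),
 *				 ntohs(icmp_hdr(skb)->un.frag.mtu),
 *				 0, 0, iph->protocol, 0);
 *
 * which funnels into __ip_rt_update_pmtu() above and records the new MTU
 * as a nexthop exception via update_or_create_fnhe().
 */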
1072
1073static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074{
1075	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076	struct flowi4 fl4;
1077	struct rtable *rt;
1078
1079	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081	if (!fl4.flowi4_mark)
1082		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084	rt = __ip_route_output_key(sock_net(sk), &fl4);
1085	if (!IS_ERR(rt)) {
1086		__ip_rt_update_pmtu(rt, &fl4, mtu);
1087		ip_rt_put(rt);
1088	}
1089}
1090
1091void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093	const struct iphdr *iph = (const struct iphdr *) skb->data;
1094	struct flowi4 fl4;
1095	struct rtable *rt;
1096	struct dst_entry *odst = NULL;
1097	bool new = false;
1098	struct net *net = sock_net(sk);
1099
1100	bh_lock_sock(sk);
1101
1102	if (!ip_sk_accept_pmtu(sk))
1103		goto out;
1104
1105	odst = sk_dst_get(sk);
1106
1107	if (sock_owned_by_user(sk) || !odst) {
1108		__ipv4_sk_update_pmtu(skb, sk, mtu);
1109		goto out;
1110	}
1111
1112	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114	rt = (struct rtable *)odst;
1115	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117		if (IS_ERR(rt))
1118			goto out;
1119
1120		new = true;
1121	}
1122
1123	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125	if (!dst_check(&rt->dst, 0)) {
1126		if (new)
1127			dst_release(&rt->dst);
1128
1129		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130		if (IS_ERR(rt))
1131			goto out;
1132
1133		new = true;
1134	}
1135
1136	if (new)
1137		sk_dst_set(sk, &rt->dst);
1138
1139out:
1140	bh_unlock_sock(sk);
1141	dst_release(odst);
1142}
1143EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146		   int oif, u32 mark, u8 protocol, int flow_flags)
1147{
1148	const struct iphdr *iph = (const struct iphdr *) skb->data;
1149	struct flowi4 fl4;
1150	struct rtable *rt;
1151
1152	__build_flow_key(net, &fl4, NULL, iph, oif,
1153			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1154	rt = __ip_route_output_key(net, &fl4);
1155	if (!IS_ERR(rt)) {
1156		__ip_do_redirect(rt, skb, &fl4, false);
1157		ip_rt_put(rt);
1158	}
1159}
1160EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163{
1164	const struct iphdr *iph = (const struct iphdr *) skb->data;
1165	struct flowi4 fl4;
1166	struct rtable *rt;
1167	struct net *net = sock_net(sk);
1168
1169	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170	rt = __ip_route_output_key(net, &fl4);
1171	if (!IS_ERR(rt)) {
1172		__ip_do_redirect(rt, skb, &fl4, false);
1173		ip_rt_put(rt);
1174	}
1175}
1176EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179{
1180	struct rtable *rt = (struct rtable *) dst;
1181
1182	/* All IPV4 dsts are created with ->obsolete set to the value
1183	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184	 * into this function always.
1185	 *
1186	 * When a PMTU/redirect information update invalidates a route,
1187	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188	 * DST_OBSOLETE_DEAD by dst_free().
1189	 */
1190	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191		return NULL;
1192	return dst;
1193}
1194
1195static void ipv4_link_failure(struct sk_buff *skb)
1196{
1197	struct rtable *rt;
1198
1199	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1200
1201	rt = skb_rtable(skb);
1202	if (rt)
1203		dst_set_expires(&rt->dst, 0);
1204}
1205
1206static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1207{
1208	pr_debug("%s: %pI4 -> %pI4, %s\n",
1209		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1210		 skb->dev ? skb->dev->name : "?");
1211	kfree_skb(skb);
1212	WARN_ON(1);
1213	return 0;
1214}
1215
1216/*
 1217   We do not cache the source address of the outgoing interface,
 1218   because it is used only by the IP RR, TS and SRR options,
 1219   so it is out of the fast path.
 1220
 1221   BTW remember: "addr" is allowed to be unaligned
 1222   in IP options!
1223 */
1224
1225void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1226{
1227	__be32 src;
1228
1229	if (rt_is_output_route(rt))
1230		src = ip_hdr(skb)->saddr;
1231	else {
1232		struct fib_result res;
1233		struct flowi4 fl4;
1234		struct iphdr *iph;
1235
1236		iph = ip_hdr(skb);
1237
1238		memset(&fl4, 0, sizeof(fl4));
1239		fl4.daddr = iph->daddr;
1240		fl4.saddr = iph->saddr;
1241		fl4.flowi4_tos = RT_TOS(iph->tos);
1242		fl4.flowi4_oif = rt->dst.dev->ifindex;
1243		fl4.flowi4_iif = skb->dev->ifindex;
1244		fl4.flowi4_mark = skb->mark;
1245
1246		rcu_read_lock();
1247		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1248			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1249		else
1250			src = inet_select_addr(rt->dst.dev,
1251					       rt_nexthop(rt, iph->daddr),
1252					       RT_SCOPE_UNIVERSE);
1253		rcu_read_unlock();
1254	}
1255	memcpy(addr, &src, 4);
1256}
1257
1258#ifdef CONFIG_IP_ROUTE_CLASSID
1259static void set_class_tag(struct rtable *rt, u32 tag)
1260{
1261	if (!(rt->dst.tclassid & 0xFFFF))
1262		rt->dst.tclassid |= tag & 0xFFFF;
1263	if (!(rt->dst.tclassid & 0xFFFF0000))
1264		rt->dst.tclassid |= tag & 0xFFFF0000;
1265}
1266#endif
1267
1268static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1269{
1270	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1271	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1272				    ip_rt_min_advmss);
1273
1274	return min(advmss, IPV4_MAX_PMTU - header_size);
1275}
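/* Editor's note (illustrative, not part of the original file): the
 * advertised MSS is simply the path MTU minus the 40 bytes of IPv4 + TCP
 * headers, floored at ip_rt_min_advmss (256) and capped so it fits the
 * maximum IPv4 packet.  For a plain 1500-byte Ethernet route:
 *
 *	advmss = min(max(1500 - 40, 256), 65535 - 40) = 1460
 */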
1276
1277static unsigned int ipv4_mtu(const struct dst_entry *dst)
1278{
1279	const struct rtable *rt = (const struct rtable *) dst;
1280	unsigned int mtu = rt->rt_pmtu;
1281
1282	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1283		mtu = dst_metric_raw(dst, RTAX_MTU);
1284
1285	if (mtu)
1286		return mtu;
1287
1288	mtu = READ_ONCE(dst->dev->mtu);
1289
1290	if (unlikely(ip_mtu_locked(dst))) {
1291		if (rt->rt_uses_gateway && mtu > 576)
1292			mtu = 576;
1293	}
1294
1295	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1296
1297	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1298}
1299
1300static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1301{
1302	struct fnhe_hash_bucket *hash;
1303	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1304	u32 hval = fnhe_hashfun(daddr);
1305
1306	spin_lock_bh(&fnhe_lock);
1307
1308	hash = rcu_dereference_protected(nh->nh_exceptions,
1309					 lockdep_is_held(&fnhe_lock));
1310	hash += hval;
1311
1312	fnhe_p = &hash->chain;
1313	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1314	while (fnhe) {
1315		if (fnhe->fnhe_daddr == daddr) {
1316			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1317				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1318			fnhe_flush_routes(fnhe);
1319			kfree_rcu(fnhe, rcu);
1320			break;
1321		}
1322		fnhe_p = &fnhe->fnhe_next;
1323		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1324						 lockdep_is_held(&fnhe_lock));
1325	}
1326
1327	spin_unlock_bh(&fnhe_lock);
1328}
1329
1330static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1331{
1332	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1333	struct fib_nh_exception *fnhe;
1334	u32 hval;
1335
1336	if (!hash)
1337		return NULL;
1338
1339	hval = fnhe_hashfun(daddr);
1340
1341	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1342	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1343		if (fnhe->fnhe_daddr == daddr) {
1344			if (fnhe->fnhe_expires &&
1345			    time_after(jiffies, fnhe->fnhe_expires)) {
1346				ip_del_fnhe(nh, daddr);
1347				break;
1348			}
1349			return fnhe;
1350		}
1351	}
1352	return NULL;
1353}
1354
1355static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1356			      __be32 daddr, const bool do_cache)
1357{
1358	bool ret = false;
1359
1360	spin_lock_bh(&fnhe_lock);
1361
1362	if (daddr == fnhe->fnhe_daddr) {
1363		struct rtable __rcu **porig;
1364		struct rtable *orig;
1365		int genid = fnhe_genid(dev_net(rt->dst.dev));
1366
1367		if (rt_is_input_route(rt))
1368			porig = &fnhe->fnhe_rth_input;
1369		else
1370			porig = &fnhe->fnhe_rth_output;
1371		orig = rcu_dereference(*porig);
1372
1373		if (fnhe->fnhe_genid != genid) {
1374			fnhe->fnhe_genid = genid;
1375			fnhe->fnhe_gw = 0;
1376			fnhe->fnhe_pmtu = 0;
1377			fnhe->fnhe_expires = 0;
1378			fnhe->fnhe_mtu_locked = false;
1379			fnhe_flush_routes(fnhe);
1380			orig = NULL;
1381		}
1382		fill_route_from_fnhe(rt, fnhe);
1383		if (!rt->rt_gateway)
1384			rt->rt_gateway = daddr;
1385
1386		if (do_cache) {
1387			dst_hold(&rt->dst);
1388			rcu_assign_pointer(*porig, rt);
1389			if (orig) {
1390				dst_dev_put(&orig->dst);
1391				dst_release(&orig->dst);
1392			}
1393			ret = true;
1394		}
1395
1396		fnhe->fnhe_stamp = jiffies;
1397	}
1398	spin_unlock_bh(&fnhe_lock);
1399
1400	return ret;
1401}
1402
1403static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1404{
1405	struct rtable *orig, *prev, **p;
1406	bool ret = true;
1407
1408	if (rt_is_input_route(rt)) {
1409		p = (struct rtable **)&nh->nh_rth_input;
1410	} else {
1411		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1412	}
1413	orig = *p;
1414
1415	/* hold dst before doing cmpxchg() to avoid race condition
1416	 * on this dst
1417	 */
1418	dst_hold(&rt->dst);
1419	prev = cmpxchg(p, orig, rt);
1420	if (prev == orig) {
1421		if (orig) {
1422			dst_dev_put(&orig->dst);
1423			dst_release(&orig->dst);
1424		}
1425	} else {
1426		dst_release(&rt->dst);
1427		ret = false;
1428	}
1429
1430	return ret;
1431}
1432
1433struct uncached_list {
1434	spinlock_t		lock;
1435	struct list_head	head;
1436};
1437
1438static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1439
1440void rt_add_uncached_list(struct rtable *rt)
1441{
1442	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1443
1444	rt->rt_uncached_list = ul;
1445
1446	spin_lock_bh(&ul->lock);
1447	list_add_tail(&rt->rt_uncached, &ul->head);
1448	spin_unlock_bh(&ul->lock);
1449}
1450
1451void rt_del_uncached_list(struct rtable *rt)
1452{
1453	if (!list_empty(&rt->rt_uncached)) {
1454		struct uncached_list *ul = rt->rt_uncached_list;
1455
1456		spin_lock_bh(&ul->lock);
1457		list_del(&rt->rt_uncached);
1458		spin_unlock_bh(&ul->lock);
1459	}
1460}
1461
1462static void ipv4_dst_destroy(struct dst_entry *dst)
1463{
1464	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1465	struct rtable *rt = (struct rtable *)dst;
1466
1467	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1468		kfree(p);
1469
1470	rt_del_uncached_list(rt);
1471}
1472
1473void rt_flush_dev(struct net_device *dev)
1474{
1475	struct net *net = dev_net(dev);
1476	struct rtable *rt;
1477	int cpu;
1478
1479	for_each_possible_cpu(cpu) {
1480		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1481
1482		spin_lock_bh(&ul->lock);
1483		list_for_each_entry(rt, &ul->head, rt_uncached) {
1484			if (rt->dst.dev != dev)
1485				continue;
1486			rt->dst.dev = net->loopback_dev;
1487			dev_hold(rt->dst.dev);
1488			dev_put(dev);
1489		}
1490		spin_unlock_bh(&ul->lock);
1491	}
1492}
1493
1494static bool rt_cache_valid(const struct rtable *rt)
1495{
1496	return	rt &&
1497		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498		!rt_is_expired(rt);
1499}
1500
1501static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1502			   const struct fib_result *res,
1503			   struct fib_nh_exception *fnhe,
1504			   struct fib_info *fi, u16 type, u32 itag,
1505			   const bool do_cache)
1506{
1507	bool cached = false;
1508
1509	if (fi) {
1510		struct fib_nh *nh = &FIB_RES_NH(*res);
1511
1512		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1513			rt->rt_gateway = nh->nh_gw;
1514			rt->rt_uses_gateway = 1;
1515		}
1516		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517		if (fi->fib_metrics != &dst_default_metrics) {
1518			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1519			refcount_inc(&fi->fib_metrics->refcnt);
1520		}
1521#ifdef CONFIG_IP_ROUTE_CLASSID
1522		rt->dst.tclassid = nh->nh_tclassid;
1523#endif
1524		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1525		if (unlikely(fnhe))
1526			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527		else if (do_cache)
1528			cached = rt_cache_route(nh, rt);
1529		if (unlikely(!cached)) {
1530			/* Routes we intend to cache in nexthop exception or
1531			 * FIB nexthop have the DST_NOCACHE bit clear.
1532			 * However, if we are unsuccessful at storing this
1533			 * route into the cache we really need to set it.
1534			 */
1535			if (!rt->rt_gateway)
1536				rt->rt_gateway = daddr;
1537			rt_add_uncached_list(rt);
1538		}
1539	} else
1540		rt_add_uncached_list(rt);
1541
1542#ifdef CONFIG_IP_ROUTE_CLASSID
1543#ifdef CONFIG_IP_MULTIPLE_TABLES
1544	set_class_tag(rt, res->tclassid);
1545#endif
1546	set_class_tag(rt, itag);
1547#endif
1548}
1549
1550struct rtable *rt_dst_alloc(struct net_device *dev,
1551			    unsigned int flags, u16 type,
1552			    bool nopolicy, bool noxfrm, bool will_cache)
1553{
1554	struct rtable *rt;
1555
1556	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1557		       (will_cache ? 0 : DST_HOST) |
1558		       (nopolicy ? DST_NOPOLICY : 0) |
1559		       (noxfrm ? DST_NOXFRM : 0));
1560
1561	if (rt) {
1562		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563		rt->rt_flags = flags;
1564		rt->rt_type = type;
1565		rt->rt_is_input = 0;
1566		rt->rt_iif = 0;
1567		rt->rt_pmtu = 0;
1568		rt->rt_mtu_locked = 0;
1569		rt->rt_gateway = 0;
1570		rt->rt_uses_gateway = 0;
1571		INIT_LIST_HEAD(&rt->rt_uncached);
1572
1573		rt->dst.output = ip_output;
1574		if (flags & RTCF_LOCAL)
1575			rt->dst.input = ip_local_deliver;
1576	}
1577
1578	return rt;
1579}
1580EXPORT_SYMBOL(rt_dst_alloc);
1581
1582/* called in rcu_read_lock() section */
1583int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1584			  u8 tos, struct net_device *dev,
1585			  struct in_device *in_dev, u32 *itag)
1586{
1587	int err;
1588
1589	/* Primary sanity checks. */
1590	if (!in_dev)
1591		return -EINVAL;
1592
1593	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1594	    skb->protocol != htons(ETH_P_IP))
1595		return -EINVAL;
1596
1597	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1598		return -EINVAL;
1599
1600	if (ipv4_is_zeronet(saddr)) {
1601		if (!ipv4_is_local_multicast(daddr))
1602			return -EINVAL;
1603	} else {
1604		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1605					  in_dev, itag);
1606		if (err < 0)
1607			return err;
1608	}
1609	return 0;
1610}
1611
1612/* called in rcu_read_lock() section */
1613static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1614			     u8 tos, struct net_device *dev, int our)
1615{
1616	struct in_device *in_dev = __in_dev_get_rcu(dev);
1617	unsigned int flags = RTCF_MULTICAST;
1618	struct rtable *rth;
1619	u32 itag = 0;
1620	int err;
1621
1622	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1623	if (err)
1624		return err;
1625
1626	if (our)
1627		flags |= RTCF_LOCAL;
1628
1629	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1630			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1631	if (!rth)
1632		return -ENOBUFS;
1633
1634#ifdef CONFIG_IP_ROUTE_CLASSID
1635	rth->dst.tclassid = itag;
1636#endif
1637	rth->dst.output = ip_rt_bug;
1638	rth->rt_is_input= 1;
1639
1640#ifdef CONFIG_IP_MROUTE
1641	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1642		rth->dst.input = ip_mr_input;
1643#endif
1644	RT_CACHE_STAT_INC(in_slow_mc);
1645
1646	skb_dst_set(skb, &rth->dst);
1647	return 0;
1648}
1649
1650
1651static void ip_handle_martian_source(struct net_device *dev,
1652				     struct in_device *in_dev,
1653				     struct sk_buff *skb,
1654				     __be32 daddr,
1655				     __be32 saddr)
1656{
1657	RT_CACHE_STAT_INC(in_martian_src);
1658#ifdef CONFIG_IP_ROUTE_VERBOSE
1659	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1660		/*
 1661		 *	RFC1812 recommendation: if the source is martian,
 1662		 *	the only hint is the MAC header.
1663		 */
1664		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1665			&daddr, &saddr, dev->name);
1666		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1667			print_hex_dump(KERN_WARNING, "ll header: ",
1668				       DUMP_PREFIX_OFFSET, 16, 1,
1669				       skb_mac_header(skb),
1670				       dev->hard_header_len, true);
1671		}
1672	}
1673#endif
1674}
1675
1676/* called in rcu_read_lock() section */
1677static int __mkroute_input(struct sk_buff *skb,
1678			   const struct fib_result *res,
1679			   struct in_device *in_dev,
1680			   __be32 daddr, __be32 saddr, u32 tos)
1681{
1682	struct fib_nh_exception *fnhe;
1683	struct rtable *rth;
1684	int err;
1685	struct in_device *out_dev;
1686	bool do_cache;
1687	u32 itag = 0;
1688
1689	/* get a working reference to the output device */
1690	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1691	if (!out_dev) {
1692		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1693		return -EINVAL;
1694	}
1695
1696	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1697				  in_dev->dev, in_dev, &itag);
1698	if (err < 0) {
1699		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1700					 saddr);
1701
1702		goto cleanup;
1703	}
1704
1705	do_cache = res->fi && !itag;
1706	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1707	    skb->protocol == htons(ETH_P_IP) &&
1708	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1709	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1710		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1711
1712	if (skb->protocol != htons(ETH_P_IP)) {
 1713		/* Not IP (i.e. ARP). Do not create a route if it is
 1714		 * invalid for proxy ARP. DNAT routes are always valid.
 1715		 *
 1716		 * The proxy ARP feature has been extended to allow ARP
 1717		 * replies back on the same interface, to support
 1718		 * Private VLAN switch technologies. See arp.c.
1719		 */
1720		if (out_dev == in_dev &&
1721		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1722			err = -EINVAL;
1723			goto cleanup;
1724		}
1725	}
1726
1727	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1728	if (do_cache) {
1729		if (fnhe)
1730			rth = rcu_dereference(fnhe->fnhe_rth_input);
1731		else
1732			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1733		if (rt_cache_valid(rth)) {
1734			skb_dst_set_noref(skb, &rth->dst);
1735			goto out;
1736		}
1737	}
1738
1739	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1740			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1741			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1742	if (!rth) {
1743		err = -ENOBUFS;
1744		goto cleanup;
1745	}
1746
1747	rth->rt_is_input = 1;
1748	RT_CACHE_STAT_INC(in_slow_tot);
1749
1750	rth->dst.input = ip_forward;
1751
1752	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1753		       do_cache);
1754	lwtunnel_set_redirect(&rth->dst);
1755	skb_dst_set(skb, &rth->dst);
1756out:
1757	err = 0;
1758 cleanup:
1759	return err;
1760}
1761
1762#ifdef CONFIG_IP_ROUTE_MULTIPATH
1763/* To make ICMP packets follow the right flow, the multipath hash is
1764 * calculated from the inner IP addresses.
1765 */
1766static void ip_multipath_l3_keys(const struct sk_buff *skb,
1767				 struct flow_keys *hash_keys)
1768{
1769	const struct iphdr *outer_iph = ip_hdr(skb);
1770	const struct iphdr *key_iph = outer_iph;
1771	const struct iphdr *inner_iph;
1772	const struct icmphdr *icmph;
1773	struct iphdr _inner_iph;
1774	struct icmphdr _icmph;
1775
1776	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1777		goto out;
1778
1779	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1780		goto out;
1781
1782	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1783				   &_icmph);
1784	if (!icmph)
1785		goto out;
1786
1787	if (icmph->type != ICMP_DEST_UNREACH &&
1788	    icmph->type != ICMP_REDIRECT &&
1789	    icmph->type != ICMP_TIME_EXCEEDED &&
1790	    icmph->type != ICMP_PARAMETERPROB)
1791		goto out;
1792
1793	inner_iph = skb_header_pointer(skb,
1794				       outer_iph->ihl * 4 + sizeof(_icmph),
1795				       sizeof(_inner_iph), &_inner_iph);
1796	if (!inner_iph)
1797		goto out;
1798
1799	key_iph = inner_iph;
1800out:
1801	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1802	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1803}
1804
1805/* if skb is set it will be used and fl4 can be NULL */
1806int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1807		       const struct sk_buff *skb, struct flow_keys *flkeys)
1808{
1809	struct flow_keys hash_keys;
1810	u32 mhash;
1811
1812	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1813	case 0:
1814		memset(&hash_keys, 0, sizeof(hash_keys));
1815		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1816		if (skb) {
1817			ip_multipath_l3_keys(skb, &hash_keys);
1818		} else {
1819			hash_keys.addrs.v4addrs.src = fl4->saddr;
1820			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1821		}
1822		break;
1823	case 1:
1824		/* skb is currently provided only when forwarding */
1825		if (skb) {
1826			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1827			struct flow_keys keys;
1828
1829			/* short-circuit if we already have L4 hash present */
1830			if (skb->l4_hash)
1831				return skb_get_hash_raw(skb) >> 1;
1832
1833			memset(&hash_keys, 0, sizeof(hash_keys));
1834
1835			if (!flkeys) {
1836				skb_flow_dissect_flow_keys(skb, &keys, flag);
1837				flkeys = &keys;
1838			}
1839
1840			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1841			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1842			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1843			hash_keys.ports.src = flkeys->ports.src;
1844			hash_keys.ports.dst = flkeys->ports.dst;
1845			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1846		} else {
1847			memset(&hash_keys, 0, sizeof(hash_keys));
1848			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1849			hash_keys.addrs.v4addrs.src = fl4->saddr;
1850			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1851			hash_keys.ports.src = fl4->fl4_sport;
1852			hash_keys.ports.dst = fl4->fl4_dport;
1853			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1854		}
1855		break;
1856	}
1857	mhash = flow_hash_from_keys(&hash_keys);
1858
1859	return mhash >> 1;
1860}
1861#endif /* CONFIG_IP_ROUTE_MULTIPATH */
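/* Editor's note (illustrative, not part of the original file): the policy
 * switch in fib_multipath_hash() is driven by the
 * net.ipv4.fib_multipath_hash_policy sysctl -- 0 selects the L3 hash
 * (addresses only, with ICMP errors hashed on the inner header), 1 selects
 * the L4 five-tuple hash.  From userspace:
 *
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=1
 *
 * The resulting hash is handed to fib_select_multipath(), as
 * ip_mkroute_input() below demonstrates.
 */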
1862
1863static int ip_mkroute_input(struct sk_buff *skb,
1864			    struct fib_result *res,
1865			    struct in_device *in_dev,
1866			    __be32 daddr, __be32 saddr, u32 tos,
1867			    struct flow_keys *hkeys)
1868{
1869#ifdef CONFIG_IP_ROUTE_MULTIPATH
1870	if (res->fi && res->fi->fib_nhs > 1) {
1871		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1872
1873		fib_select_multipath(res, h);
1874	}
1875#endif
1876
1877	/* create a routing cache entry */
1878	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1879}
1880
 1881/*
 1882 *	NOTE. We drop all packets that have a local source
 1883 *	address, because every properly looped-back packet
 1884 *	must already have the correct destination attached by the output routine.
 1885 *
 1886 *	This approach solves two big problems:
 1887 *	1. Non-simplex devices are handled properly.
 1888 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 1889 *	Called with rcu_read_lock().
 1890 */
1891
1892static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1893			       u8 tos, struct net_device *dev,
1894			       struct fib_result *res)
1895{
1896	struct in_device *in_dev = __in_dev_get_rcu(dev);
1897	struct flow_keys *flkeys = NULL, _flkeys;
1898	struct net    *net = dev_net(dev);
1899	struct ip_tunnel_info *tun_info;
1900	int		err = -EINVAL;
1901	unsigned int	flags = 0;
1902	u32		itag = 0;
1903	struct rtable	*rth;
1904	struct flowi4	fl4;
1905	bool do_cache;
1906
1907	/* IP on this device is disabled. */
1908
1909	if (!in_dev)
1910		goto out;
1911
1912	/* Check for the weirdest martians, which cannot be detected
1913	   by fib_lookup.
1914	 */
1915
1916	tun_info = skb_tunnel_info(skb);
1917	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1918		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1919	else
1920		fl4.flowi4_tun_key.tun_id = 0;
1921	skb_dst_drop(skb);
1922
1923	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1924		goto martian_source;
1925
1926	res->fi = NULL;
1927	res->table = NULL;
1928	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1929		goto brd_input;
1930
1931	/* Accept zero addresses only for limited broadcast;
1932	 * I do not even know whether to fix this or not. Waiting for complaints :-)
1933	 */
1934	if (ipv4_is_zeronet(saddr))
1935		goto martian_source;
1936
1937	if (ipv4_is_zeronet(daddr))
1938		goto martian_destination;
1939
1940	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1941	 * and calls it only once if daddr and/or saddr are loopback addresses.
1942	 */
1943	if (ipv4_is_loopback(daddr)) {
1944		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1945			goto martian_destination;
1946	} else if (ipv4_is_loopback(saddr)) {
1947		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1948			goto martian_source;
1949	}
1950
1951	/*
1952	 *	Now we are ready to route packet.
1953	 */
1954	fl4.flowi4_oif = 0;
1955	fl4.flowi4_iif = dev->ifindex;
1956	fl4.flowi4_mark = skb->mark;
1957	fl4.flowi4_tos = tos;
1958	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1959	fl4.flowi4_flags = 0;
1960	fl4.daddr = daddr;
1961	fl4.saddr = saddr;
1962	fl4.flowi4_uid = sock_net_uid(net, NULL);
 
1963
1964	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1965		flkeys = &_flkeys;
1966	} else {
1967		fl4.flowi4_proto = 0;
1968		fl4.fl4_sport = 0;
1969		fl4.fl4_dport = 0;
1970	}
1971
1972	err = fib_lookup(net, &fl4, res, 0);
1973	if (err != 0) {
1974		if (!IN_DEV_FORWARD(in_dev))
1975			err = -EHOSTUNREACH;
1976		goto no_route;
1977	}
1978
1979	if (res->type == RTN_BROADCAST)
1980		goto brd_input;
 
1981
1982	if (res->type == RTN_LOCAL) {
1983		err = fib_validate_source(skb, saddr, daddr, tos,
1984					  0, dev, in_dev, &itag);
1985		if (err < 0)
1986			goto martian_source;
1987		goto local_input;
1988	}
1989
1990	if (!IN_DEV_FORWARD(in_dev)) {
1991		err = -EHOSTUNREACH;
1992		goto no_route;
1993	}
1994	if (res->type != RTN_UNICAST)
1995		goto martian_destination;
1996
 
1997	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1998out:	return err;
1999
2000brd_input:
2001	if (skb->protocol != htons(ETH_P_IP))
2002		goto e_inval;
2003
2004	if (!ipv4_is_zeronet(saddr)) {
2005		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006					  in_dev, &itag);
2007		if (err < 0)
2008			goto martian_source;
2009	}
2010	flags |= RTCF_BROADCAST;
2011	res->type = RTN_BROADCAST;
2012	RT_CACHE_STAT_INC(in_brd);
2013
2014local_input:
2015	do_cache = false;
2016	if (res->fi) {
2017		if (!itag) {
2018			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2019			if (rt_cache_valid(rth)) {
2020				skb_dst_set_noref(skb, &rth->dst);
2021				err = 0;
2022				goto out;
2023			}
2024			do_cache = true;
2025		}
2026	}
2027
2028	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2029			   flags | RTCF_LOCAL, res->type,
2030			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2031	if (!rth)
2032		goto e_nobufs;
2033
2034	rth->dst.output= ip_rt_bug;
2035#ifdef CONFIG_IP_ROUTE_CLASSID
2036	rth->dst.tclassid = itag;
2037#endif
2038	rth->rt_is_input = 1;
2039
2040	RT_CACHE_STAT_INC(in_slow_tot);
2041	if (res->type == RTN_UNREACHABLE) {
2042		rth->dst.input= ip_error;
2043		rth->dst.error= -err;
2044		rth->rt_flags 	&= ~RTCF_LOCAL;
2045	}
2046
2047	if (do_cache) {
2048		struct fib_nh *nh = &FIB_RES_NH(*res);
2049
2050		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2051		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2052			WARN_ON(rth->dst.input == lwtunnel_input);
2053			rth->dst.lwtstate->orig_input = rth->dst.input;
2054			rth->dst.input = lwtunnel_input;
2055		}
2056
2057		if (unlikely(!rt_cache_route(nh, rth)))
2058			rt_add_uncached_list(rth);
2059	}
2060	skb_dst_set(skb, &rth->dst);
2061	err = 0;
2062	goto out;
2063
2064no_route:
2065	RT_CACHE_STAT_INC(in_no_route);
2066	res->type = RTN_UNREACHABLE;
2067	res->fi = NULL;
2068	res->table = NULL;
2069	goto local_input;
2070
2071	/*
2072	 *	Do not cache martian addresses: they should be logged (RFC1812)
2073	 */
2074martian_destination:
2075	RT_CACHE_STAT_INC(in_martian_dst);
2076#ifdef CONFIG_IP_ROUTE_VERBOSE
2077	if (IN_DEV_LOG_MARTIANS(in_dev))
2078		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2079				     &daddr, &saddr, dev->name);
2080#endif
2081
2082e_inval:
2083	err = -EINVAL;
2084	goto out;
2085
2086e_nobufs:
2087	err = -ENOBUFS;
2088	goto out;
2089
2090martian_source:
2091	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2092	goto out;
2093}
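/*
 * Descriptive note (added for clarity): ip_route_input_slow() is the slow
 * path for input route resolution.  In outline it (1) rejects martian
 * sources and destinations, (2) builds a flowi4 key and calls fib_lookup(),
 * and (3) dispatches on the result type: RTN_BROADCAST and RTN_LOCAL are
 * delivered locally via local_input, RTN_UNICAST is handed to
 * ip_mkroute_input() for forwarding, and anything else falls through to the
 * martian/no_route handling above.
 */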
2094
2095int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096			 u8 tos, struct net_device *dev)
2097{
2098	struct fib_result res;
2099	int err;
2100
2101	tos &= IPTOS_RT_MASK;
2102	rcu_read_lock();
2103	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2104	rcu_read_unlock();
2105
2106	return err;
2107}
2108EXPORT_SYMBOL(ip_route_input_noref);
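/*
 * For illustration only (a hedged sketch, not from the original file): a
 * typical call site in the IPv4 receive path looks roughly like this, with
 * the local variable names being purely illustrative:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 *
 * On success a dst has been attached to the skb without taking a reference
 * (hence "noref"), which is why the lookup itself runs under rcu_read_lock().
 */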
2109
2110/* called with rcu_read_lock held */
2111int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2112		       u8 tos, struct net_device *dev, struct fib_result *res)
2113{
2114	/* Multicast recognition logic is moved from the route cache to here.
2115	   The problem was that too many Ethernet cards have broken/missing
2116	   hardware multicast filters :-( As a result, a host on a multicast
2117	   network acquires a lot of useless route cache entries, e.g. for
2118	   SDR messages from all over the world. Now we try to get rid of them.
2119	   Really, provided the software IP multicast filter is organized
2120	   reasonably (at least, hashed), it does not result in a slowdown
2121	   compared with route cache reject entries.
2122	   Note that multicast routers are not affected, because a
2123	   route cache entry is created eventually.
2124	 */
2125	if (ipv4_is_multicast(daddr)) {
2126		struct in_device *in_dev = __in_dev_get_rcu(dev);
2127		int our = 0;
2128		int err = -EINVAL;
2129
2130		if (in_dev)
2131			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2132					      ip_hdr(skb)->protocol);
 
2133
2134		/* check l3 master if no match yet */
2135		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2136			struct in_device *l3_in_dev;
2137
2138			l3_in_dev = __in_dev_get_rcu(skb->dev);
2139			if (l3_in_dev)
2140				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2141						      ip_hdr(skb)->protocol);
2142		}
2143
2144		if (our
2145#ifdef CONFIG_IP_MROUTE
2146			||
2147		    (!ipv4_is_local_multicast(daddr) &&
2148		     IN_DEV_MFORWARD(in_dev))
2149#endif
2150		   ) {
2151			err = ip_route_input_mc(skb, daddr, saddr,
2152						tos, dev, our);
2153		}
2154		return err;
2155	}
2156
2157	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2158}
2159
2160/* called with rcu_read_lock() */
2161static struct rtable *__mkroute_output(const struct fib_result *res,
2162				       const struct flowi4 *fl4, int orig_oif,
2163				       struct net_device *dev_out,
2164				       unsigned int flags)
2165{
2166	struct fib_info *fi = res->fi;
2167	struct fib_nh_exception *fnhe;
2168	struct in_device *in_dev;
2169	u16 type = res->type;
2170	struct rtable *rth;
2171	bool do_cache;
2172
2173	in_dev = __in_dev_get_rcu(dev_out);
2174	if (!in_dev)
2175		return ERR_PTR(-EINVAL);
2176
2177	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2178		if (ipv4_is_loopback(fl4->saddr) &&
2179		    !(dev_out->flags & IFF_LOOPBACK) &&
2180		    !netif_is_l3_master(dev_out))
2181			return ERR_PTR(-EINVAL);
2182
2183	if (ipv4_is_lbcast(fl4->daddr))
2184		type = RTN_BROADCAST;
2185	else if (ipv4_is_multicast(fl4->daddr))
2186		type = RTN_MULTICAST;
2187	else if (ipv4_is_zeronet(fl4->daddr))
2188		return ERR_PTR(-EINVAL);
2189
2190	if (dev_out->flags & IFF_LOOPBACK)
2191		flags |= RTCF_LOCAL;
2192
2193	do_cache = true;
2194	if (type == RTN_BROADCAST) {
2195		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2196		fi = NULL;
2197	} else if (type == RTN_MULTICAST) {
2198		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2199		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2200				     fl4->flowi4_proto))
2201			flags &= ~RTCF_LOCAL;
2202		else
2203			do_cache = false;
2204		/* If a multicast route does not exist, use the
2205		 * default one, but do not use a gateway in this case.
2206		 * Yes, it is a hack.
2207		 */
2208		if (fi && res->prefixlen < 4)
2209			fi = NULL;
2210	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2211		   (orig_oif != dev_out->ifindex)) {
2212		/* For local routes that require a particular output interface
2213		 * we do not want to cache the result.  Caching the result
2214		 * causes incorrect behaviour when there are multiple source
2215		 * addresses on the interface, the end result being that if the
2216		 * intended recipient is waiting on that interface for the
2217		 * packet he won't receive it because it will be delivered on
2218		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2219		 * be set to the loopback interface as well.
2220		 */
2221		do_cache = false;
2222	}
2223
2224	fnhe = NULL;
2225	do_cache &= fi != NULL;
2226	if (fi) {
 
2227		struct rtable __rcu **prth;
2228		struct fib_nh *nh = &FIB_RES_NH(*res);
2229
2230		fnhe = find_exception(nh, fl4->daddr);
2231		if (!do_cache)
2232			goto add;
2233		if (fnhe) {
2234			prth = &fnhe->fnhe_rth_output;
2235		} else {
2236			if (unlikely(fl4->flowi4_flags &
2237				     FLOWI_FLAG_KNOWN_NH &&
2238				     !(nh->nh_gw &&
2239				       nh->nh_scope == RT_SCOPE_LINK))) {
2240				do_cache = false;
2241				goto add;
2242			}
2243			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2244		}
2245		rth = rcu_dereference(*prth);
2246		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2247			return rth;
2248	}
2249
2250add:
2251	rth = rt_dst_alloc(dev_out, flags, type,
2252			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2253			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2254			   do_cache);
2255	if (!rth)
2256		return ERR_PTR(-ENOBUFS);
2257
2258	rth->rt_iif = orig_oif;
2259
2260	RT_CACHE_STAT_INC(out_slow_tot);
2261
2262	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2263		if (flags & RTCF_LOCAL &&
2264		    !(dev_out->flags & IFF_LOOPBACK)) {
2265			rth->dst.output = ip_mc_output;
2266			RT_CACHE_STAT_INC(out_slow_mc);
2267		}
2268#ifdef CONFIG_IP_MROUTE
2269		if (type == RTN_MULTICAST) {
2270			if (IN_DEV_MFORWARD(in_dev) &&
2271			    !ipv4_is_local_multicast(fl4->daddr)) {
2272				rth->dst.input = ip_mr_input;
2273				rth->dst.output = ip_mc_output;
2274			}
2275		}
2276#endif
2277	}
2278
2279	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2280	lwtunnel_set_redirect(&rth->dst);
2281
2282	return rth;
2283}
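/*
 * Descriptive note (added for clarity): for cacheable unicast output routes,
 * __mkroute_output() first consults a per-destination exception via
 * find_exception(), then the nexthop's per-CPU cached rtable
 * (nh_pcpu_rth_output); only when neither yields a valid entry does it
 * allocate a fresh rtable and, if do_cache is still set, install it through
 * rt_set_nexthop().
 */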
2284
2285/*
2286 * Major route resolver routine.
2287 */
2288
2289struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2290					const struct sk_buff *skb)
2291{
2292	__u8 tos = RT_FL_TOS(fl4);
2293	struct fib_result res = {
2294		.type		= RTN_UNSPEC,
2295		.fi		= NULL,
2296		.table		= NULL,
2297		.tclassid	= 0,
2298	};
2299	struct rtable *rth;
2300
2301	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2302	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2303	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2304			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2305
2306	rcu_read_lock();
2307	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2308	rcu_read_unlock();
2309
2310	return rth;
2311}
2312EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
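/*
 * For illustration only (a hedged sketch, not from the original file): a
 * minimal output lookup through this entry point; the flowi4 values are
 * placeholders:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_key_hash(net, &fl4, NULL);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * Note that the lookup may rewrite fl4 (e.g. fill in saddr and flowi4_oif)
 * as a side effect.
 */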
2313
2314struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2315					    struct fib_result *res,
2316					    const struct sk_buff *skb)
2317{
2318	struct net_device *dev_out = NULL;
2319	int orig_oif = fl4->flowi4_oif;
2320	unsigned int flags = 0;
2321	struct rtable *rth;
2322	int err = -ENETUNREACH;
2323
2324	if (fl4->saddr) {
2325		rth = ERR_PTR(-EINVAL);
2326		if (ipv4_is_multicast(fl4->saddr) ||
2327		    ipv4_is_lbcast(fl4->saddr) ||
2328		    ipv4_is_zeronet(fl4->saddr))
 
2329			goto out;
2330
2331		/* I removed the check for oif == dev_out->oif here.
2332		   It was wrong for two reasons:
2333		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2334		      is assigned to multiple interfaces.
2335		   2. Moreover, we are allowed to send packets with the saddr
2336		      of another iface. --ANK
2337		 */
2338
2339		if (fl4->flowi4_oif == 0 &&
2340		    (ipv4_is_multicast(fl4->daddr) ||
2341		     ipv4_is_lbcast(fl4->daddr))) {
2342			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2343			dev_out = __ip_dev_find(net, fl4->saddr, false);
2344			if (!dev_out)
2345				goto out;
2346
2347			/* Special hack: the user can direct multicasts
2348			   and limited broadcast via the necessary interface
2349			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2350			   This hack is not just for fun, it allows
2351			   vic, vat and friends to work.
2352			   They bind the socket to loopback, set ttl to zero
2353			   and expect that it will work.
2354			   From the viewpoint of the routing cache they are broken,
2355			   because we are not allowed to build a multicast path
2356			   with a loopback source addr (look, the routing cache
2357			   cannot know that ttl is zero, so that the packet
2358			   will not leave this host and the route is valid).
2359			   Luckily, this hack is a good workaround.
2360			 */
2361
2362			fl4->flowi4_oif = dev_out->ifindex;
2363			goto make_route;
2364		}
2365
2366		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2367			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2368			if (!__ip_dev_find(net, fl4->saddr, false))
2369				goto out;
2370		}
2371	}
2372
2373
2374	if (fl4->flowi4_oif) {
2375		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2376		rth = ERR_PTR(-ENODEV);
2377		if (!dev_out)
2378			goto out;
2379
2380		/* RACE: Check return value of inet_select_addr instead. */
2381		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2382			rth = ERR_PTR(-ENETUNREACH);
2383			goto out;
2384		}
2385		if (ipv4_is_local_multicast(fl4->daddr) ||
2386		    ipv4_is_lbcast(fl4->daddr) ||
2387		    fl4->flowi4_proto == IPPROTO_IGMP) {
2388			if (!fl4->saddr)
2389				fl4->saddr = inet_select_addr(dev_out, 0,
2390							      RT_SCOPE_LINK);
2391			goto make_route;
2392		}
2393		if (!fl4->saddr) {
2394			if (ipv4_is_multicast(fl4->daddr))
2395				fl4->saddr = inet_select_addr(dev_out, 0,
2396							      fl4->flowi4_scope);
2397			else if (!fl4->daddr)
2398				fl4->saddr = inet_select_addr(dev_out, 0,
2399							      RT_SCOPE_HOST);
2400		}
2401	}
2402
2403	if (!fl4->daddr) {
2404		fl4->daddr = fl4->saddr;
2405		if (!fl4->daddr)
2406			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2407		dev_out = net->loopback_dev;
2408		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2409		res->type = RTN_LOCAL;
2410		flags |= RTCF_LOCAL;
2411		goto make_route;
2412	}
2413
2414	err = fib_lookup(net, fl4, res, 0);
2415	if (err) {
2416		res->fi = NULL;
2417		res->table = NULL;
2418		if (fl4->flowi4_oif &&
2419		    (ipv4_is_multicast(fl4->daddr) ||
2420		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2421			/* Apparently, the routing tables are wrong. Assume
2422			   that the destination is on-link.
2423
2424			   WHY? DW.
2425			   Because we are allowed to send to an iface
2426			   even if it has NO routes and NO assigned
2427			   addresses. When oif is specified, the routing
2428			   tables are looked up with only one purpose:
2429			   to check whether the destination is gatewayed, rather
2430			   than direct. Moreover, if MSG_DONTROUTE is set,
2431			   we send the packet, ignoring both routing tables
2432			   and ifaddr state. --ANK
2433
2434
2435			   We could do this even if oif is unknown,
2436			   likely IPv6, but we do not.
2437			 */
2438
2439			if (fl4->saddr == 0)
2440				fl4->saddr = inet_select_addr(dev_out, 0,
2441							      RT_SCOPE_LINK);
2442			res->type = RTN_UNICAST;
2443			goto make_route;
2444		}
2445		rth = ERR_PTR(err);
2446		goto out;
2447	}
2448
2449	if (res->type == RTN_LOCAL) {
2450		if (!fl4->saddr) {
2451			if (res->fi->fib_prefsrc)
2452				fl4->saddr = res->fi->fib_prefsrc;
2453			else
2454				fl4->saddr = fl4->daddr;
2455		}
2456
2457		/* L3 master device is the loopback for that domain */
2458		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2459			net->loopback_dev;
2460
2461		/* make sure orig_oif points to fib result device even
2462		 * though packet rx/tx happens over loopback or l3mdev
2463		 */
2464		orig_oif = FIB_RES_OIF(*res);
2465
2466		fl4->flowi4_oif = dev_out->ifindex;
2467		flags |= RTCF_LOCAL;
2468		goto make_route;
2469	}
2470
2471	fib_select_path(net, res, fl4, skb);
2472
2473	dev_out = FIB_RES_DEV(*res);
2474	fl4->flowi4_oif = dev_out->ifindex;
2475
2476
2477make_route:
2478	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2479
2480out:
2481	return rth;
2482}
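/*
 * Descriptive note (added for clarity): the resolver above works roughly in
 * this order: validate any caller-supplied saddr, honour an explicit
 * flowi4_oif, fall back to a loopback/local route when daddr is unset, then
 * fib_lookup(); RTN_LOCAL results are redirected to the loopback (or l3mdev)
 * device, everything else goes through fib_select_path() before
 * __mkroute_output() builds the dst.
 */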
2483
2484static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2485{
2486	return NULL;
2487}
2488
2489static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2490{
2491	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2492
2493	return mtu ? : dst->dev->mtu;
2494}
2495
2496static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2497					  struct sk_buff *skb, u32 mtu)
2498{
2499}
2500
2501static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2502				       struct sk_buff *skb)
2503{
2504}
2505
2506static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2507					  unsigned long old)
2508{
2509	return NULL;
2510}
2511
2512static struct dst_ops ipv4_dst_blackhole_ops = {
2513	.family			=	AF_INET,
2514	.check			=	ipv4_blackhole_dst_check,
2515	.mtu			=	ipv4_blackhole_mtu,
2516	.default_advmss		=	ipv4_default_advmss,
2517	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2518	.redirect		=	ipv4_rt_blackhole_redirect,
2519	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2520	.neigh_lookup		=	ipv4_neigh_lookup,
2521};
2522
2523struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2524{
2525	struct rtable *ort = (struct rtable *) dst_orig;
2526	struct rtable *rt;
2527
2528	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2529	if (rt) {
2530		struct dst_entry *new = &rt->dst;
2531
2532		new->__use = 1;
2533		new->input = dst_discard;
2534		new->output = dst_discard_out;
2535
2536		new->dev = net->loopback_dev;
2537		if (new->dev)
2538			dev_hold(new->dev);
2539
2540		rt->rt_is_input = ort->rt_is_input;
2541		rt->rt_iif = ort->rt_iif;
2542		rt->rt_pmtu = ort->rt_pmtu;
2543		rt->rt_mtu_locked = ort->rt_mtu_locked;
2544
2545		rt->rt_genid = rt_genid_ipv4(net);
2546		rt->rt_flags = ort->rt_flags;
2547		rt->rt_type = ort->rt_type;
2548		rt->rt_gateway = ort->rt_gateway;
2549		rt->rt_uses_gateway = ort->rt_uses_gateway;
2550
2551		INIT_LIST_HEAD(&rt->rt_uncached);
2552	}
2553
2554	dst_release(dst_orig);
2555
2556	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2557}
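/*
 * Descriptive note (added for clarity, partly a hedged reading): a blackhole
 * route copies the identifying fields of an existing rtable but replaces both
 * dst.input and dst.output with dst_discard*, so packets using it are
 * silently dropped; the xfrm code can hand such a dst back to callers (for
 * example while IPsec state resolution is still pending) in place of the
 * original route.
 */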
2558
2559struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2560				    const struct sock *sk)
2561{
2562	struct rtable *rt = __ip_route_output_key(net, flp4);
2563
2564	if (IS_ERR(rt))
2565		return rt;
2566
2567	if (flp4->flowi4_proto)
 
2568		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2569							flowi4_to_flowi(flp4),
2570							sk, 0);
 
2571
2572	return rt;
2573}
2574EXPORT_SYMBOL_GPL(ip_route_output_flow);
2575
2576/* called with rcu_read_lock held */
2577static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2578			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2579			u32 seq)
 
2580{
2581	struct rtable *rt = skb_rtable(skb);
2582	struct rtmsg *r;
2583	struct nlmsghdr *nlh;
2584	unsigned long expires = 0;
2585	u32 error;
2586	u32 metrics[RTAX_MAX];
2587
2588	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2589	if (!nlh)
2590		return -EMSGSIZE;
2591
2592	r = nlmsg_data(nlh);
2593	r->rtm_family	 = AF_INET;
2594	r->rtm_dst_len	= 32;
2595	r->rtm_src_len	= 0;
2596	r->rtm_tos	= fl4->flowi4_tos;
2597	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2598	if (nla_put_u32(skb, RTA_TABLE, table_id))
2599		goto nla_put_failure;
2600	r->rtm_type	= rt->rt_type;
2601	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2602	r->rtm_protocol = RTPROT_UNSPEC;
2603	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604	if (rt->rt_flags & RTCF_NOTIFY)
2605		r->rtm_flags |= RTM_F_NOTIFY;
2606	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2607		r->rtm_flags |= RTCF_DOREDIRECT;
2608
2609	if (nla_put_in_addr(skb, RTA_DST, dst))
2610		goto nla_put_failure;
2611	if (src) {
2612		r->rtm_src_len = 32;
2613		if (nla_put_in_addr(skb, RTA_SRC, src))
2614			goto nla_put_failure;
2615	}
2616	if (rt->dst.dev &&
2617	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2618		goto nla_put_failure;
2619#ifdef CONFIG_IP_ROUTE_CLASSID
2620	if (rt->dst.tclassid &&
2621	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2622		goto nla_put_failure;
2623#endif
2624	if (!rt_is_input_route(rt) &&
2625	    fl4->saddr != src) {
2626		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2627			goto nla_put_failure;
2628	}
2629	if (rt->rt_uses_gateway &&
2630	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2631		goto nla_put_failure;
2632
2633	expires = rt->dst.expires;
2634	if (expires) {
2635		unsigned long now = jiffies;
2636
2637		if (time_before(now, expires))
2638			expires -= now;
2639		else
2640			expires = 0;
2641	}
2642
2643	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2644	if (rt->rt_pmtu && expires)
2645		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2646	if (rt->rt_mtu_locked && expires)
2647		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2648	if (rtnetlink_put_metrics(skb, metrics) < 0)
2649		goto nla_put_failure;
2650
2651	if (fl4->flowi4_mark &&
2652	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2653		goto nla_put_failure;
2654
2655	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2656	    nla_put_u32(skb, RTA_UID,
2657			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2658		goto nla_put_failure;
2659
2660	error = rt->dst.error;
2661
2662	if (rt_is_input_route(rt)) {
2663#ifdef CONFIG_IP_MROUTE
2664		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2666			int err = ipmr_get_route(net, skb,
2667						 fl4->saddr, fl4->daddr,
2668						 r, portid);
2669
2670			if (err <= 0) {
2671				if (err == 0)
2672					return 0;
2673				goto nla_put_failure;
2674			}
2675		} else
 
2676#endif
2677			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2678				goto nla_put_failure;
 
2679	}
2680
2681	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2682		goto nla_put_failure;
2683
2684	nlmsg_end(skb, nlh);
2685	return 0;
2686
2687nla_put_failure:
2688	nlmsg_cancel(skb, nlh);
2689	return -EMSGSIZE;
2690}
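/*
 * Descriptive note (added for clarity): rt_fill_info() emits one
 * RTM_NEWROUTE message describing a cached route: the rtmsg header carries
 * type/scope/flags, followed by RTA_TABLE, RTA_DST, an optional RTA_SRC,
 * RTA_OIF, RTA_FLOW (classid), RTA_PREFSRC, RTA_GATEWAY, the metrics nest,
 * RTA_MARK, RTA_UID, RTA_IIF for input routes, and a cacheinfo block with
 * the expiry and error.
 */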
2691
2692static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2693			     struct netlink_ext_ack *extack)
2694{
2695	struct net *net = sock_net(in_skb->sk);
2696	struct rtmsg *rtm;
2697	struct nlattr *tb[RTA_MAX+1];
2698	struct fib_result res = {};
 
2699	struct rtable *rt = NULL;
2700	struct flowi4 fl4;
2701	__be32 dst = 0;
2702	__be32 src = 0;
 
2703	u32 iif;
2704	int err;
2705	int mark;
2706	struct sk_buff *skb;
2707	u32 table_id = RT_TABLE_MAIN;
2708	kuid_t uid;
2709
2710	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2711			  extack);
2712	if (err < 0)
2713		goto errout;
2714
2715	rtm = nlmsg_data(nlh);
2716
2717	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2718	if (!skb) {
2719		err = -ENOBUFS;
2720		goto errout;
2721	}
2722
2723	/* Reserve room for dummy headers; this skb can pass
2724	   through a good chunk of the routing engine.
2725	 */
2726	skb_reset_mac_header(skb);
2727	skb_reset_network_header(skb);
2728
2729	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2730	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2731	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2732	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2733	if (tb[RTA_UID])
2734		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2735	else
2736		uid = (iif ? INVALID_UID : current_uid());
2737
2738	/* Bugfix: need to give ip_route_input enough of an IP header to
2739	 * not gag.
2740	 */
2741	ip_hdr(skb)->protocol = IPPROTO_UDP;
2742	ip_hdr(skb)->saddr = src;
2743	ip_hdr(skb)->daddr = dst;
2744
2745	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2746
2747	memset(&fl4, 0, sizeof(fl4));
2748	fl4.daddr = dst;
2749	fl4.saddr = src;
2750	fl4.flowi4_tos = rtm->rtm_tos;
2751	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2752	fl4.flowi4_mark = mark;
2753	fl4.flowi4_uid = uid;
2754
2755	rcu_read_lock();
2756
2757	if (iif) {
2758		struct net_device *dev;
2759
2760		dev = dev_get_by_index_rcu(net, iif);
2761		if (!dev) {
2762			err = -ENODEV;
2763			goto errout_free;
2764		}
2765
2766		skb->protocol	= htons(ETH_P_IP);
2767		skb->dev	= dev;
2768		skb->mark	= mark;
2769		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2770					 dev, &res);
 
2771
2772		rt = skb_rtable(skb);
2773		if (err == 0 && rt->dst.error)
2774			err = -rt->dst.error;
2775	} else {
2776		fl4.flowi4_iif = LOOPBACK_IFINDEX;
 
2777		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2778		err = 0;
2779		if (IS_ERR(rt))
2780			err = PTR_ERR(rt);
2781		else
2782			skb_dst_set(skb, &rt->dst);
2783	}
2784
2785	if (err)
2786		goto errout_free;
2787
2788	if (rtm->rtm_flags & RTM_F_NOTIFY)
2789		rt->rt_flags |= RTCF_NOTIFY;
2790
2791	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2792		table_id = res.table ? res.table->tb_id : 0;
2793
2794	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2795		if (!res.fi) {
2796			err = fib_props[res.type].error;
2797			if (!err)
2798				err = -EHOSTUNREACH;
2799			goto errout_free;
2800		}
2801		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2802				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2803				    rt->rt_type, res.prefix, res.prefixlen,
2804				    fl4.flowi4_tos, res.fi, 0);
2805	} else {
2806		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2807				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
 
2808	}
2809	if (err < 0)
2810		goto errout_free;
2811
2812	rcu_read_unlock();
2813
2814	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2815errout:
2816	return err;
2817
2818errout_free:
2819	rcu_read_unlock();
2820	kfree_skb(skb);
2821	goto errout;
2822}
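/*
 * For illustration only (hedged, not from the original file):
 * inet_rtm_getroute() is the RTM_GETROUTE handler registered in ip_rt_init()
 * below, i.e. the kernel side of "ip route get".  A request such as
 *
 *	$ ip route get 192.0.2.1 from 198.51.100.7 iif eth0
 *
 * maps onto RTA_DST, RTA_SRC and RTA_IIF here and is answered either with
 * rt_fill_info() output or, when RTM_F_FIB_MATCH is set, with the matching
 * FIB entry via fib_dump_info().
 */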
2823
2824void ip_rt_multicast_event(struct in_device *in_dev)
2825{
2826	rt_cache_flush(dev_net(in_dev->dev));
2827}
2828
2829#ifdef CONFIG_SYSCTL
2830static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2831static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2832static int ip_rt_gc_elasticity __read_mostly	= 8;
2833static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
2834
2835static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2836					void __user *buffer,
2837					size_t *lenp, loff_t *ppos)
2838{
2839	struct net *net = (struct net *)__ctl->extra1;
2840
2841	if (write) {
2842		rt_cache_flush(net);
2843		fnhe_genid_bump(net);
2844		return 0;
2845	}
2846
2847	return -EINVAL;
2848}
2849
2850static struct ctl_table ipv4_route_table[] = {
2851	{
2852		.procname	= "gc_thresh",
2853		.data		= &ipv4_dst_ops.gc_thresh,
2854		.maxlen		= sizeof(int),
2855		.mode		= 0644,
2856		.proc_handler	= proc_dointvec,
2857	},
2858	{
2859		.procname	= "max_size",
2860		.data		= &ip_rt_max_size,
2861		.maxlen		= sizeof(int),
2862		.mode		= 0644,
2863		.proc_handler	= proc_dointvec,
2864	},
2865	{
2866		/*  Deprecated. Use gc_min_interval_ms */
2867
2868		.procname	= "gc_min_interval",
2869		.data		= &ip_rt_gc_min_interval,
2870		.maxlen		= sizeof(int),
2871		.mode		= 0644,
2872		.proc_handler	= proc_dointvec_jiffies,
2873	},
2874	{
2875		.procname	= "gc_min_interval_ms",
2876		.data		= &ip_rt_gc_min_interval,
2877		.maxlen		= sizeof(int),
2878		.mode		= 0644,
2879		.proc_handler	= proc_dointvec_ms_jiffies,
2880	},
2881	{
2882		.procname	= "gc_timeout",
2883		.data		= &ip_rt_gc_timeout,
2884		.maxlen		= sizeof(int),
2885		.mode		= 0644,
2886		.proc_handler	= proc_dointvec_jiffies,
2887	},
2888	{
2889		.procname	= "gc_interval",
2890		.data		= &ip_rt_gc_interval,
2891		.maxlen		= sizeof(int),
2892		.mode		= 0644,
2893		.proc_handler	= proc_dointvec_jiffies,
2894	},
2895	{
2896		.procname	= "redirect_load",
2897		.data		= &ip_rt_redirect_load,
2898		.maxlen		= sizeof(int),
2899		.mode		= 0644,
2900		.proc_handler	= proc_dointvec,
2901	},
2902	{
2903		.procname	= "redirect_number",
2904		.data		= &ip_rt_redirect_number,
2905		.maxlen		= sizeof(int),
2906		.mode		= 0644,
2907		.proc_handler	= proc_dointvec,
2908	},
2909	{
2910		.procname	= "redirect_silence",
2911		.data		= &ip_rt_redirect_silence,
2912		.maxlen		= sizeof(int),
2913		.mode		= 0644,
2914		.proc_handler	= proc_dointvec,
2915	},
2916	{
2917		.procname	= "error_cost",
2918		.data		= &ip_rt_error_cost,
2919		.maxlen		= sizeof(int),
2920		.mode		= 0644,
2921		.proc_handler	= proc_dointvec,
2922	},
2923	{
2924		.procname	= "error_burst",
2925		.data		= &ip_rt_error_burst,
2926		.maxlen		= sizeof(int),
2927		.mode		= 0644,
2928		.proc_handler	= proc_dointvec,
2929	},
2930	{
2931		.procname	= "gc_elasticity",
2932		.data		= &ip_rt_gc_elasticity,
2933		.maxlen		= sizeof(int),
2934		.mode		= 0644,
2935		.proc_handler	= proc_dointvec,
2936	},
2937	{
2938		.procname	= "mtu_expires",
2939		.data		= &ip_rt_mtu_expires,
2940		.maxlen		= sizeof(int),
2941		.mode		= 0644,
2942		.proc_handler	= proc_dointvec_jiffies,
2943	},
2944	{
2945		.procname	= "min_pmtu",
2946		.data		= &ip_rt_min_pmtu,
2947		.maxlen		= sizeof(int),
2948		.mode		= 0644,
2949		.proc_handler	= proc_dointvec_minmax,
2950		.extra1		= &ip_min_valid_pmtu,
2951	},
2952	{
2953		.procname	= "min_adv_mss",
2954		.data		= &ip_rt_min_advmss,
2955		.maxlen		= sizeof(int),
2956		.mode		= 0644,
2957		.proc_handler	= proc_dointvec,
2958	},
2959	{ }
2960};
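/*
 * For illustration only (hedged, not from the original file): the table above
 * is registered under "net/ipv4/route" (see ip_static_sysctl_init() at the
 * end of this file), so the knobs appear as e.g.
 * /proc/sys/net/ipv4/route/min_pmtu and can be inspected or changed with:
 *
 *	sysctl net.ipv4.route.min_pmtu
 *	sysctl -w net.ipv4.route.mtu_expires=300
 */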
2961
2962static struct ctl_table ipv4_route_flush_table[] = {
2963	{
2964		.procname	= "flush",
2965		.maxlen		= sizeof(int),
2966		.mode		= 0200,
2967		.proc_handler	= ipv4_sysctl_rtcache_flush,
2968	},
2969	{ },
2970};
2971
2972static __net_init int sysctl_route_net_init(struct net *net)
2973{
2974	struct ctl_table *tbl;
2975
2976	tbl = ipv4_route_flush_table;
2977	if (!net_eq(net, &init_net)) {
2978		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2979		if (!tbl)
2980			goto err_dup;
2981
2982		/* Don't export sysctls to unprivileged users */
2983		if (net->user_ns != &init_user_ns)
2984			tbl[0].procname = NULL;
2985	}
2986	tbl[0].extra1 = net;
2987
2988	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2989	if (!net->ipv4.route_hdr)
2990		goto err_reg;
2991	return 0;
2992
2993err_reg:
2994	if (tbl != ipv4_route_flush_table)
2995		kfree(tbl);
2996err_dup:
2997	return -ENOMEM;
2998}
2999
3000static __net_exit void sysctl_route_net_exit(struct net *net)
3001{
3002	struct ctl_table *tbl;
3003
3004	tbl = net->ipv4.route_hdr->ctl_table_arg;
3005	unregister_net_sysctl_table(net->ipv4.route_hdr);
3006	BUG_ON(tbl == ipv4_route_flush_table);
3007	kfree(tbl);
3008}
3009
3010static __net_initdata struct pernet_operations sysctl_route_ops = {
3011	.init = sysctl_route_net_init,
3012	.exit = sysctl_route_net_exit,
3013};
3014#endif
3015
3016static __net_init int rt_genid_init(struct net *net)
3017{
3018	atomic_set(&net->ipv4.rt_genid, 0);
3019	atomic_set(&net->fnhe_genid, 0);
3020	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3021	return 0;
3022}
3023
3024static __net_initdata struct pernet_operations rt_genid_ops = {
3025	.init = rt_genid_init,
3026};
3027
3028static int __net_init ipv4_inetpeer_init(struct net *net)
3029{
3030	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3031
3032	if (!bp)
3033		return -ENOMEM;
3034	inet_peer_base_init(bp);
3035	net->ipv4.peers = bp;
3036	return 0;
3037}
3038
3039static void __net_exit ipv4_inetpeer_exit(struct net *net)
3040{
3041	struct inet_peer_base *bp = net->ipv4.peers;
3042
3043	net->ipv4.peers = NULL;
3044	inetpeer_invalidate_tree(bp);
3045	kfree(bp);
3046}
3047
3048static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3049	.init	=	ipv4_inetpeer_init,
3050	.exit	=	ipv4_inetpeer_exit,
3051};
3052
3053#ifdef CONFIG_IP_ROUTE_CLASSID
3054struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3055#endif /* CONFIG_IP_ROUTE_CLASSID */
3056
3057int __init ip_rt_init(void)
3058{
 
3059	int cpu;
3060
3061	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3062	if (!ip_idents)
3063		panic("IP: failed to allocate ip_idents\n");
3064
3065	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3066
3067	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3068	if (!ip_tstamps)
3069		panic("IP: failed to allocate ip_tstamps\n");
3070
3071	for_each_possible_cpu(cpu) {
3072		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3073
3074		INIT_LIST_HEAD(&ul->head);
3075		spin_lock_init(&ul->lock);
3076	}
3077#ifdef CONFIG_IP_ROUTE_CLASSID
3078	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3079	if (!ip_rt_acct)
3080		panic("IP: failed to allocate ip_rt_acct\n");
3081#endif
3082
3083	ipv4_dst_ops.kmem_cachep =
3084		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3085				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3086
3087	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3088
3089	if (dst_entries_init(&ipv4_dst_ops) < 0)
3090		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3091
3092	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3093		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3094
3095	ipv4_dst_ops.gc_thresh = ~0;
3096	ip_rt_max_size = INT_MAX;
3097
3098	devinet_init();
3099	ip_fib_init();
3100
3101	if (ip_rt_proc_init())
3102		pr_err("Unable to create route proc files\n");
3103#ifdef CONFIG_XFRM
3104	xfrm_init();
3105	xfrm4_init();
3106#endif
3107	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3108		      RTNL_FLAG_DOIT_UNLOCKED);
3109
3110#ifdef CONFIG_SYSCTL
3111	register_pernet_subsys(&sysctl_route_ops);
3112#endif
3113	register_pernet_subsys(&rt_genid_ops);
3114	register_pernet_subsys(&ipv4_inetpeer_ops);
3115	return 0;
3116}
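/*
 * Descriptive note (added for clarity): ip_rt_init() wires the pieces above
 * together at boot: the ip_idents/ip_tstamps arrays, the per-CPU uncached
 * route lists, the dst kmem cache and its entry counters, devinet and FIB
 * initialisation, the /proc files, optional xfrm setup, the RTM_GETROUTE
 * handler, and the per-net sysctl/genid/inetpeer operations.
 */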
3117
3118#ifdef CONFIG_SYSCTL
3119/*
3120 * We really need to sanitize the damn ipv4 init order, then all
3121 * this nonsense will go away.
3122 */
3123void __init ip_static_sysctl_init(void)
3124{
3125	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3126}
3127#endif
v5.14.15
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
  64#include <linux/uaccess.h>
  65#include <linux/bitops.h>
  66#include <linux/types.h>
  67#include <linux/kernel.h>
  68#include <linux/mm.h>
  69#include <linux/memblock.h>
  70#include <linux/string.h>
  71#include <linux/socket.h>
  72#include <linux/sockios.h>
  73#include <linux/errno.h>
  74#include <linux/in.h>
  75#include <linux/inet.h>
  76#include <linux/netdevice.h>
  77#include <linux/proc_fs.h>
  78#include <linux/init.h>
  79#include <linux/skbuff.h>
  80#include <linux/inetdevice.h>
  81#include <linux/igmp.h>
  82#include <linux/pkt_sched.h>
  83#include <linux/mroute.h>
  84#include <linux/netfilter_ipv4.h>
  85#include <linux/random.h>
  86#include <linux/rcupdate.h>
  87#include <linux/times.h>
  88#include <linux/slab.h>
  89#include <linux/jhash.h>
  90#include <net/dst.h>
  91#include <net/dst_metadata.h>
  92#include <net/net_namespace.h>
  93#include <net/protocol.h>
  94#include <net/ip.h>
  95#include <net/route.h>
  96#include <net/inetpeer.h>
  97#include <net/sock.h>
  98#include <net/ip_fib.h>
  99#include <net/nexthop.h>
 100#include <net/arp.h>
 101#include <net/tcp.h>
 102#include <net/icmp.h>
 103#include <net/xfrm.h>
 104#include <net/lwtunnel.h>
 105#include <net/netevent.h>
 106#include <net/rtnetlink.h>
 107#ifdef CONFIG_SYSCTL
 108#include <linux/sysctl.h>
 109#endif
 110#include <net/secure_seq.h>
 111#include <net/ip_tunnels.h>
 112#include <net/l3mdev.h>
 113
 114#include "fib_lookup.h"
 115
 116#define RT_FL_TOS(oldflp4) \
 117	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 118
 119#define RT_GC_TIMEOUT (300*HZ)
 120
 121static int ip_rt_max_size;
 122static int ip_rt_redirect_number __read_mostly	= 9;
 123static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 124static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 125static int ip_rt_error_cost __read_mostly	= HZ;
 126static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 127static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 128static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 129static int ip_rt_min_advmss __read_mostly	= 256;
 130
 131static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 132
 133/*
 134 *	Interface to generic destination cache.
 135 */
 136
 137INDIRECT_CALLABLE_SCOPE
 138struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 139static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 140INDIRECT_CALLABLE_SCOPE
 141unsigned int		ipv4_mtu(const struct dst_entry *dst);
 142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143static void		 ipv4_link_failure(struct sk_buff *skb);
 144static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145					   struct sk_buff *skb, u32 mtu,
 146					   bool confirm_neigh);
 147static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 148					struct sk_buff *skb);
 149static void		ipv4_dst_destroy(struct dst_entry *dst);
 150
 151static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 152{
 153	WARN_ON(1);
 154	return NULL;
 155}
 156
 157static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 158					   struct sk_buff *skb,
 159					   const void *daddr);
 160static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 161
 162static struct dst_ops ipv4_dst_ops = {
 163	.family =		AF_INET,
 164	.check =		ipv4_dst_check,
 165	.default_advmss =	ipv4_default_advmss,
 166	.mtu =			ipv4_mtu,
 167	.cow_metrics =		ipv4_cow_metrics,
 168	.destroy =		ipv4_dst_destroy,
 169	.negative_advice =	ipv4_negative_advice,
 170	.link_failure =		ipv4_link_failure,
 171	.update_pmtu =		ip_rt_update_pmtu,
 172	.redirect =		ip_do_redirect,
 173	.local_out =		__ip_local_out,
 174	.neigh_lookup =		ipv4_neigh_lookup,
 175	.confirm_neigh =	ipv4_confirm_neigh,
 176};
 177
 178#define ECN_OR_COST(class)	TC_PRIO_##class
 179
 180const __u8 ip_tos2prio[16] = {
 181	TC_PRIO_BESTEFFORT,
 182	ECN_OR_COST(BESTEFFORT),
 183	TC_PRIO_BESTEFFORT,
 184	ECN_OR_COST(BESTEFFORT),
 185	TC_PRIO_BULK,
 186	ECN_OR_COST(BULK),
 187	TC_PRIO_BULK,
 188	ECN_OR_COST(BULK),
 189	TC_PRIO_INTERACTIVE,
 190	ECN_OR_COST(INTERACTIVE),
 191	TC_PRIO_INTERACTIVE,
 192	ECN_OR_COST(INTERACTIVE),
 193	TC_PRIO_INTERACTIVE_BULK,
 194	ECN_OR_COST(INTERACTIVE_BULK),
 195	TC_PRIO_INTERACTIVE_BULK,
 196	ECN_OR_COST(INTERACTIVE_BULK)
 197};
 198EXPORT_SYMBOL(ip_tos2prio);
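/*
 * For illustration only (a hedged sketch, not from the original file):
 * ip_tos2prio[] maps the IPTOS_TOS() bits of an IPv4 header to a default
 * skb->priority band; the usual lookup goes through the rt_tos2priority()
 * helper in include/net/route.h:
 *
 *	skb->priority = rt_tos2priority(ip_hdr(skb)->tos);
 *
 * which simply indexes this table with IPTOS_TOS(tos) >> 1.
 */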
 199
 200static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 201#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 202
 203#ifdef CONFIG_PROC_FS
 204static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 205{
 206	if (*pos)
 207		return NULL;
 208	return SEQ_START_TOKEN;
 209}
 210
 211static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 212{
 213	++*pos;
 214	return NULL;
 215}
 216
 217static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 218{
 219}
 220
 221static int rt_cache_seq_show(struct seq_file *seq, void *v)
 222{
 223	if (v == SEQ_START_TOKEN)
 224		seq_printf(seq, "%-127s\n",
 225			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 226			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 227			   "HHUptod\tSpecDst");
 228	return 0;
 229}
 230
 231static const struct seq_operations rt_cache_seq_ops = {
 232	.start  = rt_cache_seq_start,
 233	.next   = rt_cache_seq_next,
 234	.stop   = rt_cache_seq_stop,
 235	.show   = rt_cache_seq_show,
 236};
 237
 238static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 239{
 240	int cpu;
 241
 242	if (*pos == 0)
 243		return SEQ_START_TOKEN;
 244
 245	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 246		if (!cpu_possible(cpu))
 247			continue;
 248		*pos = cpu+1;
 249		return &per_cpu(rt_cache_stat, cpu);
 250	}
 251	return NULL;
 252}
 253
 254static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 255{
 256	int cpu;
 257
 258	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 259		if (!cpu_possible(cpu))
 260			continue;
 261		*pos = cpu+1;
 262		return &per_cpu(rt_cache_stat, cpu);
 263	}
 264	(*pos)++;
 265	return NULL;
 266
 267}
 268
 269static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 270{
 271
 272}
 273
 274static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 275{
 276	struct rt_cache_stat *st = v;
 277
 278	if (v == SEQ_START_TOKEN) {
 279		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 280		return 0;
 281	}
 282
 283	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 284		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 285		   dst_entries_get_slow(&ipv4_dst_ops),
 286		   0, /* st->in_hit */
 287		   st->in_slow_tot,
 288		   st->in_slow_mc,
 289		   st->in_no_route,
 290		   st->in_brd,
 291		   st->in_martian_dst,
 292		   st->in_martian_src,
 293
 294		   0, /* st->out_hit */
 295		   st->out_slow_tot,
 296		   st->out_slow_mc,
 297
 298		   0, /* st->gc_total */
 299		   0, /* st->gc_ignored */
 300		   0, /* st->gc_goal_miss */
 301		   0, /* st->gc_dst_overflow */
 302		   0, /* st->in_hlist_search */
 303		   0  /* st->out_hlist_search */
 304		);
 305	return 0;
 306}
 307
 308static const struct seq_operations rt_cpu_seq_ops = {
 309	.start  = rt_cpu_seq_start,
 310	.next   = rt_cpu_seq_next,
 311	.stop   = rt_cpu_seq_stop,
 312	.show   = rt_cpu_seq_show,
 313};
 314
 315#ifdef CONFIG_IP_ROUTE_CLASSID
 316static int rt_acct_proc_show(struct seq_file *m, void *v)
 317{
 318	struct ip_rt_acct *dst, *src;
 319	unsigned int i, j;
 320
 321	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 322	if (!dst)
 323		return -ENOMEM;
 324
 325	for_each_possible_cpu(i) {
 326		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 327		for (j = 0; j < 256; j++) {
 328			dst[j].o_bytes   += src[j].o_bytes;
 329			dst[j].o_packets += src[j].o_packets;
 330			dst[j].i_bytes   += src[j].i_bytes;
 331			dst[j].i_packets += src[j].i_packets;
 332		}
 333	}
 334
 335	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 336	kfree(dst);
 337	return 0;
 338}
 339#endif
 340
 341static int __net_init ip_rt_do_proc_init(struct net *net)
 342{
 343	struct proc_dir_entry *pde;
 344
 345	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
 346			      &rt_cache_seq_ops);
 347	if (!pde)
 348		goto err1;
 349
 350	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
 351			      &rt_cpu_seq_ops);
 352	if (!pde)
 353		goto err2;
 354
 355#ifdef CONFIG_IP_ROUTE_CLASSID
 356	pde = proc_create_single("rt_acct", 0, net->proc_net,
 357			rt_acct_proc_show);
 358	if (!pde)
 359		goto err3;
 360#endif
 361	return 0;
 362
 363#ifdef CONFIG_IP_ROUTE_CLASSID
 364err3:
 365	remove_proc_entry("rt_cache", net->proc_net_stat);
 366#endif
 367err2:
 368	remove_proc_entry("rt_cache", net->proc_net);
 369err1:
 370	return -ENOMEM;
 371}
 372
 373static void __net_exit ip_rt_do_proc_exit(struct net *net)
 374{
 375	remove_proc_entry("rt_cache", net->proc_net_stat);
 376	remove_proc_entry("rt_cache", net->proc_net);
 377#ifdef CONFIG_IP_ROUTE_CLASSID
 378	remove_proc_entry("rt_acct", net->proc_net);
 379#endif
 380}
 381
 382static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 383	.init = ip_rt_do_proc_init,
 384	.exit = ip_rt_do_proc_exit,
 385};
 386
 387static int __init ip_rt_proc_init(void)
 388{
 389	return register_pernet_subsys(&ip_rt_proc_ops);
 390}
 391
 392#else
 393static inline int ip_rt_proc_init(void)
 394{
 395	return 0;
 396}
 397#endif /* CONFIG_PROC_FS */
 398
 399static inline bool rt_is_expired(const struct rtable *rth)
 400{
 401	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 402}
 403
 404void rt_cache_flush(struct net *net)
 405{
 406	rt_genid_bump_ipv4(net);
 407}
 408
 409static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 410					   struct sk_buff *skb,
 411					   const void *daddr)
 412{
 413	const struct rtable *rt = container_of(dst, struct rtable, dst);
 414	struct net_device *dev = dst->dev;
 415	struct neighbour *n;
 416
 417	rcu_read_lock_bh();
 418
 419	if (likely(rt->rt_gw_family == AF_INET)) {
 420		n = ip_neigh_gw4(dev, rt->rt_gw4);
 421	} else if (rt->rt_gw_family == AF_INET6) {
 422		n = ip_neigh_gw6(dev, &rt->rt_gw6);
 423        } else {
 424		__be32 pkey;
 425
 426		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 427		n = ip_neigh_gw4(dev, pkey);
 428	}
 429
 430	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 431		n = NULL;
 432
 433	rcu_read_unlock_bh();
 434
 435	return n;
 436}
 437
 438static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 439{
 440	const struct rtable *rt = container_of(dst, struct rtable, dst);
 441	struct net_device *dev = dst->dev;
 442	const __be32 *pkey = daddr;
 
 443
 444	if (rt->rt_gw_family == AF_INET) {
 445		pkey = (const __be32 *)&rt->rt_gw4;
 446	} else if (rt->rt_gw_family == AF_INET6) {
 447		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 448	} else if (!daddr ||
 449		 (rt->rt_flags &
 450		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 451		return;
 452	}
 453	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 454}
 455
 456/* Hash tables of size 2048..262144 depending on RAM size.
 457 * Each bucket uses 8 bytes.
 458 */
 459static u32 ip_idents_mask __read_mostly;
 460static atomic_t *ip_idents __read_mostly;
 461static u32 *ip_tstamps __read_mostly;
 462
  463/* In order to protect privacy, we add a perturbation to identifiers
  464 * if one generator is seldom used. This makes it hard for an attacker
  465 * to infer how many packets were sent between two points in time.
  466 */
 467u32 ip_idents_reserve(u32 hash, int segs)
 468{
 469	u32 bucket, old, now = (u32)jiffies;
 470	atomic_t *p_id;
 471	u32 *p_tstamp;
 472	u32 delta = 0;
 473
 474	bucket = hash & ip_idents_mask;
 475	p_tstamp = ip_tstamps + bucket;
 476	p_id = ip_idents + bucket;
 477	old = READ_ONCE(*p_tstamp);
 478
 479	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 480		delta = prandom_u32_max(now - old);
 481
  482	/* If UBSAN reports an error here, please make sure your compiler
  483	 * supports -fno-strict-overflow before reporting it; that was a bug
  484	 * in UBSAN, and it has been fixed in GCC-8.
  485	 */
 486	return atomic_add_return(segs + delta, p_id) - segs;
 487}
 488EXPORT_SYMBOL(ip_idents_reserve);
 489
 490void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 491{
 
 492	u32 hash, id;
 493
 494	/* Note the following code is not safe, but this is okay. */
 495	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 496		get_random_bytes(&net->ipv4.ip_id_key,
 497				 sizeof(net->ipv4.ip_id_key));
 498
 499	hash = siphash_3u32((__force u32)iph->daddr,
 500			    (__force u32)iph->saddr,
 501			    iph->protocol,
 502			    &net->ipv4.ip_id_key);
 503	id = ip_idents_reserve(hash, segs);
 504	iph->id = htons(id);
 505}
 506EXPORT_SYMBOL(__ip_select_ident);
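/*
 * Descriptive note (added for clarity): IP ID selection hashes
 * (daddr, saddr, protocol) with a per-netns siphash key into one of the
 * ip_idents buckets; ip_idents_reserve() then advances that bucket's counter
 * by the number of segments, plus a random perturbation when the bucket has
 * been idle, so consecutive IDs reveal as little as possible about how many
 * packets were sent in between.
 */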
 507
 508static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 509			     const struct sock *sk,
 510			     const struct iphdr *iph,
 511			     int oif, u8 tos,
 512			     u8 prot, u32 mark, int flow_flags)
 513{
 514	if (sk) {
 515		const struct inet_sock *inet = inet_sk(sk);
 516
 517		oif = sk->sk_bound_dev_if;
 518		mark = sk->sk_mark;
 519		tos = RT_CONN_FLAGS(sk);
 520		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 521	}
 522	flowi4_init_output(fl4, oif, mark, tos,
 523			   RT_SCOPE_UNIVERSE, prot,
 524			   flow_flags,
 525			   iph->daddr, iph->saddr, 0, 0,
 526			   sock_net_uid(net, sk));
 527}
 528
 529static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 530			       const struct sock *sk)
 531{
 532	const struct net *net = dev_net(skb->dev);
 533	const struct iphdr *iph = ip_hdr(skb);
 534	int oif = skb->dev->ifindex;
 535	u8 tos = RT_TOS(iph->tos);
 536	u8 prot = iph->protocol;
 537	u32 mark = skb->mark;
 538
 539	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 540}
 541
 542static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 543{
 544	const struct inet_sock *inet = inet_sk(sk);
 545	const struct ip_options_rcu *inet_opt;
 546	__be32 daddr = inet->inet_daddr;
 547
 548	rcu_read_lock();
 549	inet_opt = rcu_dereference(inet->inet_opt);
 550	if (inet_opt && inet_opt->opt.srr)
 551		daddr = inet_opt->opt.faddr;
 552	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 553			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 554			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 555			   inet_sk_flowi_flags(sk),
 556			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 557	rcu_read_unlock();
 558}
 559
 560static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 561				 const struct sk_buff *skb)
 562{
 563	if (skb)
 564		build_skb_flow_key(fl4, skb, sk);
 565	else
 566		build_sk_flow_key(fl4, sk);
 567}
 568
 569static DEFINE_SPINLOCK(fnhe_lock);
 570
 571static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 572{
 573	struct rtable *rt;
 574
 575	rt = rcu_dereference(fnhe->fnhe_rth_input);
 576	if (rt) {
 577		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 578		dst_dev_put(&rt->dst);
 579		dst_release(&rt->dst);
 580	}
 581	rt = rcu_dereference(fnhe->fnhe_rth_output);
 582	if (rt) {
 583		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 584		dst_dev_put(&rt->dst);
 585		dst_release(&rt->dst);
 586	}
 587}
 588
 589static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 590{
 591	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 592	struct fib_nh_exception *fnhe, *oldest = NULL;
 593
 594	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 595		fnhe = rcu_dereference_protected(*fnhe_p,
 596						 lockdep_is_held(&fnhe_lock));
 597		if (!fnhe)
 598			break;
 599		if (!oldest ||
 600		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 601			oldest = fnhe;
 602			oldest_p = fnhe_p;
 603		}
 604	}
 605	fnhe_flush_routes(oldest);
 606	*oldest_p = oldest->fnhe_next;
 607	kfree_rcu(oldest, rcu);
 608}
 609
 610static u32 fnhe_hashfun(__be32 daddr)
 611{
 612	static siphash_key_t fnhe_hash_key __read_mostly;
 613	u64 hval;
 614
 615	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
 616	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
 617	return hash_64(hval, FNHE_HASH_SHIFT);
 618}
 619
 620static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 621{
 622	rt->rt_pmtu = fnhe->fnhe_pmtu;
 623	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 624	rt->dst.expires = fnhe->fnhe_expires;
 625
 626	if (fnhe->fnhe_gw) {
 627		rt->rt_flags |= RTCF_REDIRECTED;
 
 628		rt->rt_uses_gateway = 1;
 629		rt->rt_gw_family = AF_INET;
 630		rt->rt_gw4 = fnhe->fnhe_gw;
 631	}
 632}
 633
 634static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 635				  __be32 gw, u32 pmtu, bool lock,
 636				  unsigned long expires)
 637{
 638	struct fnhe_hash_bucket *hash;
 639	struct fib_nh_exception *fnhe;
 640	struct rtable *rt;
 641	u32 genid, hval;
 642	unsigned int i;
 643	int depth;
 644
 645	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 646	hval = fnhe_hashfun(daddr);
 647
 648	spin_lock_bh(&fnhe_lock);
 649
 650	hash = rcu_dereference(nhc->nhc_exceptions);
 651	if (!hash) {
 652		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 653		if (!hash)
 654			goto out_unlock;
 655		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 656	}
 657
 658	hash += hval;
 659
 660	depth = 0;
 661	for (fnhe = rcu_dereference(hash->chain); fnhe;
 662	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 663		if (fnhe->fnhe_daddr == daddr)
 664			break;
 665		depth++;
 666	}
 667
 668	if (fnhe) {
 669		if (fnhe->fnhe_genid != genid)
 670			fnhe->fnhe_genid = genid;
 671		if (gw)
 672			fnhe->fnhe_gw = gw;
 673		if (pmtu) {
 674			fnhe->fnhe_pmtu = pmtu;
 675			fnhe->fnhe_mtu_locked = lock;
 676		}
 677		fnhe->fnhe_expires = max(1UL, expires);
 678		/* Update all cached dsts too */
 679		rt = rcu_dereference(fnhe->fnhe_rth_input);
 680		if (rt)
 681			fill_route_from_fnhe(rt, fnhe);
 682		rt = rcu_dereference(fnhe->fnhe_rth_output);
 683		if (rt)
 684			fill_route_from_fnhe(rt, fnhe);
 685	} else {
  686		/* Randomize max depth to avoid some side-channel attacks. */
 687		int max_depth = FNHE_RECLAIM_DEPTH +
 688				prandom_u32_max(FNHE_RECLAIM_DEPTH);
 689
 690		while (depth > max_depth) {
 691			fnhe_remove_oldest(hash);
 692			depth--;
 693		}
 694
 695		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 696		if (!fnhe)
 697			goto out_unlock;
 698
 699		fnhe->fnhe_next = hash->chain;
 700
 701		fnhe->fnhe_genid = genid;
 702		fnhe->fnhe_daddr = daddr;
 703		fnhe->fnhe_gw = gw;
 704		fnhe->fnhe_pmtu = pmtu;
 705		fnhe->fnhe_mtu_locked = lock;
 706		fnhe->fnhe_expires = max(1UL, expires);
 707
 708		rcu_assign_pointer(hash->chain, fnhe);
 709
 710		/* Exception created; mark the cached routes for the nexthop
 711		 * stale, so anyone caching it rechecks if this exception
 712		 * applies to them.
 713		 */
 714		rt = rcu_dereference(nhc->nhc_rth_input);
 715		if (rt)
 716			rt->dst.obsolete = DST_OBSOLETE_KILL;
 717
 718		for_each_possible_cpu(i) {
 719			struct rtable __rcu **prt;
 720
 721			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 722			rt = rcu_dereference(*prt);
 723			if (rt)
 724				rt->dst.obsolete = DST_OBSOLETE_KILL;
 725		}
 726	}
 727
 728	fnhe->fnhe_stamp = jiffies;
 729
 730out_unlock:
 731	spin_unlock_bh(&fnhe_lock);
 732}
 733
 734static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 735			     bool kill_route)
 736{
 737	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 738	__be32 old_gw = ip_hdr(skb)->saddr;
 739	struct net_device *dev = skb->dev;
 740	struct in_device *in_dev;
 741	struct fib_result res;
 742	struct neighbour *n;
 743	struct net *net;
 744
 745	switch (icmp_hdr(skb)->code & 7) {
 746	case ICMP_REDIR_NET:
 747	case ICMP_REDIR_NETTOS:
 748	case ICMP_REDIR_HOST:
 749	case ICMP_REDIR_HOSTTOS:
 750		break;
 751
 752	default:
 753		return;
 754	}
 755
 756	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 757		return;
 758
 759	in_dev = __in_dev_get_rcu(dev);
 760	if (!in_dev)
 761		return;
 762
 763	net = dev_net(dev);
 764	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 765	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 766	    ipv4_is_zeronet(new_gw))
 767		goto reject_redirect;
 768
 769	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 770		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 771			goto reject_redirect;
 772		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 773			goto reject_redirect;
 774	} else {
 775		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 776			goto reject_redirect;
 777	}
 778
 779	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 780	if (!n)
 781		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 782	if (!IS_ERR(n)) {
 783		if (!(n->nud_state & NUD_VALID)) {
 784			neigh_event_send(n, NULL);
 785		} else {
 786			if (fib_lookup(net, fl4, &res, 0) == 0) {
 787				struct fib_nh_common *nhc;
 788
 789				fib_select_path(net, &res, fl4, skb);
 790				nhc = FIB_RES_NHC(res);
 791				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 792						0, false,
 793						jiffies + ip_rt_gc_timeout);
 794			}
 795			if (kill_route)
 796				rt->dst.obsolete = DST_OBSOLETE_KILL;
 797			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 798		}
 799		neigh_release(n);
 800	}
 801	return;
 802
 803reject_redirect:
 804#ifdef CONFIG_IP_ROUTE_VERBOSE
 805	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 806		const struct iphdr *iph = (const struct iphdr *) skb->data;
 807		__be32 daddr = iph->daddr;
 808		__be32 saddr = iph->saddr;
 809
 810		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 811				     "  Advised path = %pI4 -> %pI4\n",
 812				     &old_gw, dev->name, &new_gw,
 813				     &saddr, &daddr);
 814	}
 815#endif
 816	;
 817}
 818
 819static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 820{
 821	struct rtable *rt;
 822	struct flowi4 fl4;
 823	const struct iphdr *iph = (const struct iphdr *) skb->data;
 824	struct net *net = dev_net(skb->dev);
 825	int oif = skb->dev->ifindex;
 826	u8 tos = RT_TOS(iph->tos);
 827	u8 prot = iph->protocol;
 828	u32 mark = skb->mark;
 829
 830	rt = (struct rtable *) dst;
 831
 832	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 833	__ip_do_redirect(rt, skb, &fl4, true);
 834}
 835
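     /* Negative advice: drop the route if it is already obsolete, was
      * learned from a redirect, or carries an expiry, so that the caller
      * falls back to a fresh lookup.
      */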
 836static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 837{
 838	struct rtable *rt = (struct rtable *)dst;
 839	struct dst_entry *ret = dst;
 840
 841	if (rt) {
 842		if (dst->obsolete > 0) {
 843			ip_rt_put(rt);
 844			ret = NULL;
 845		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 846			   rt->dst.expires) {
 847			ip_rt_put(rt);
 848			ret = NULL;
 849		}
 850	}
 851	return ret;
 852}
 853
 854/*
 855 * Algorithm:
 856 *	1. The first ip_rt_redirect_number redirects are sent
 857 *	   with exponential backoff, then we stop sending them altogether,
 858 *	   assuming that the host ignores our redirects.
 859 *	2. If we did not see packets requiring redirects
 860 *	   during ip_rt_redirect_silence, we assume that the host
 861 *	   forgot the redirected route and we start sending redirects again.
 862 *
 863 * This algorithm is much cheaper and more intelligent than dumb load limiting
 864 * in icmp.c.
 865 *
 866 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 867 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 868 */
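     /* Concretely, after i redirects have been sent to a peer, the next
      * one is delayed by at least ip_rt_redirect_load << i jiffies; after
      * ip_rt_redirect_number of them the peer is left alone until it has
      * been quiet for ip_rt_redirect_silence.
      */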
 869
 870void ip_rt_send_redirect(struct sk_buff *skb)
 871{
 872	struct rtable *rt = skb_rtable(skb);
 873	struct in_device *in_dev;
 874	struct inet_peer *peer;
 875	struct net *net;
 876	int log_martians;
 877	int vif;
 878
 879	rcu_read_lock();
 880	in_dev = __in_dev_get_rcu(rt->dst.dev);
 881	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 882		rcu_read_unlock();
 883		return;
 884	}
 885	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 886	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 887	rcu_read_unlock();
 888
 889	net = dev_net(rt->dst.dev);
 890	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 891	if (!peer) {
 892		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 893			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 894		return;
 895	}
 896
 897	/* No redirected packets during ip_rt_redirect_silence;
 898	 * reset the algorithm.
 899	 */
 900	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 901		peer->rate_tokens = 0;
 902		peer->n_redirects = 0;
 903	}
 904
 905	/* Too many ignored redirects; do not send anything.
 906	 * Set peer->rate_last to the time of the last seen redirected packet.
 907	 */
 908	if (peer->n_redirects >= ip_rt_redirect_number) {
 909		peer->rate_last = jiffies;
 910		goto out_put_peer;
 911	}
 912
 913	/* Check for load limit; set rate_last to the latest sent
 914	 * redirect.
 915	 */
 916	if (peer->n_redirects == 0 ||
 917	    time_after(jiffies,
 918		       (peer->rate_last +
 919			(ip_rt_redirect_load << peer->n_redirects)))) {
 920		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 921
 922		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 923		peer->rate_last = jiffies;
 924		++peer->n_redirects;
 925#ifdef CONFIG_IP_ROUTE_VERBOSE
 926		if (log_martians &&
 927		    peer->n_redirects == ip_rt_redirect_number)
 928			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 929					     &ip_hdr(skb)->saddr, inet_iif(skb),
 930					     &ip_hdr(skb)->daddr, &gw);
 931#endif
 932	}
 933out_put_peer:
 934	inet_putpeer(peer);
 935}
 936
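     /* Input handler for dsts created with ->error set (no route, host or
      * net unreachable, administratively filtered).  Translates dst.error
      * into an ICMP destination-unreachable code and rate-limits the reply
      * with the token bucket kept in the inet_peer entry.
      */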
 937static int ip_error(struct sk_buff *skb)
 938{
 939	struct rtable *rt = skb_rtable(skb);
 940	struct net_device *dev = skb->dev;
 941	struct in_device *in_dev;
 942	struct inet_peer *peer;
 943	unsigned long now;
 944	struct net *net;
 945	bool send;
 946	int code;
 947
 948	if (netif_is_l3_master(skb->dev)) {
 949		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 950		if (!dev)
 951			goto out;
 952	}
 953
 954	in_dev = __in_dev_get_rcu(dev);
 955
 956	/* IP on this device is disabled. */
 957	if (!in_dev)
 958		goto out;
 959
 960	net = dev_net(rt->dst.dev);
 961	if (!IN_DEV_FORWARD(in_dev)) {
 962		switch (rt->dst.error) {
 963		case EHOSTUNREACH:
 964			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 965			break;
 966
 967		case ENETUNREACH:
 968			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 969			break;
 970		}
 971		goto out;
 972	}
 973
 974	switch (rt->dst.error) {
 975	case EINVAL:
 976	default:
 977		goto out;
 978	case EHOSTUNREACH:
 979		code = ICMP_HOST_UNREACH;
 980		break;
 981	case ENETUNREACH:
 982		code = ICMP_NET_UNREACH;
 983		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 984		break;
 985	case EACCES:
 986		code = ICMP_PKT_FILTERED;
 987		break;
 988	}
 989
 990	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 991			       l3mdev_master_ifindex(skb->dev), 1);
 992
 993	send = true;
 994	if (peer) {
 995		now = jiffies;
 996		peer->rate_tokens += now - peer->rate_last;
 997		if (peer->rate_tokens > ip_rt_error_burst)
 998			peer->rate_tokens = ip_rt_error_burst;
 999		peer->rate_last = now;
1000		if (peer->rate_tokens >= ip_rt_error_cost)
1001			peer->rate_tokens -= ip_rt_error_cost;
1002		else
1003			send = false;
1004		inet_putpeer(peer);
1005	}
1006	if (send)
1007		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1008
1009out:	kfree_skb(skb);
1010	return 0;
1011}
1012
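     /* Learn a smaller path MTU for the flow described by fl4.  Values
      * below ip_rt_min_pmtu are clamped and the MTU is locked instead, and
      * the result is stored as a nexthop exception that expires after
      * ip_rt_mtu_expires.
      */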
1013static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014{
1015	struct dst_entry *dst = &rt->dst;
1016	struct net *net = dev_net(dst->dev);
1017	struct fib_result res;
1018	bool lock = false;
1019	u32 old_mtu;
1020
1021	if (ip_mtu_locked(dst))
1022		return;
1023
1024	old_mtu = ipv4_mtu(dst);
1025	if (old_mtu < mtu)
1026		return;
1027
1028	if (mtu < ip_rt_min_pmtu) {
1029		lock = true;
1030		mtu = min(old_mtu, ip_rt_min_pmtu);
1031	}
1032
1033	if (rt->rt_pmtu == mtu && !lock &&
1034	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1035		return;
1036
1037	rcu_read_lock();
1038	if (fib_lookup(net, fl4, &res, 0) == 0) {
1039		struct fib_nh_common *nhc;
1040
1041		fib_select_path(net, &res, fl4, NULL);
1042		nhc = FIB_RES_NHC(res);
1043		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1044				      jiffies + ip_rt_mtu_expires);
1045	}
1046	rcu_read_unlock();
1047}
1048
1049static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1050			      struct sk_buff *skb, u32 mtu,
1051			      bool confirm_neigh)
1052{
1053	struct rtable *rt = (struct rtable *) dst;
1054	struct flowi4 fl4;
1055
1056	ip_rt_build_flow_key(&fl4, sk, skb);
1057
1058	/* Don't make lookup fail for bridged encapsulations */
1059	if (skb && netif_is_any_bridge_port(skb->dev))
1060		fl4.flowi4_oif = 0;
1061
1062	__ip_rt_update_pmtu(rt, &fl4, mtu);
1063}
1064
1065void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1066		      int oif, u8 protocol)
1067{
1068	const struct iphdr *iph = (const struct iphdr *)skb->data;
1069	struct flowi4 fl4;
1070	struct rtable *rt;
1071	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1072
1073	__build_flow_key(net, &fl4, NULL, iph, oif,
1074			 RT_TOS(iph->tos), protocol, mark, 0);
1075	rt = __ip_route_output_key(net, &fl4);
1076	if (!IS_ERR(rt)) {
1077		__ip_rt_update_pmtu(rt, &fl4, mtu);
1078		ip_rt_put(rt);
1079	}
1080}
1081EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1082
1083static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1084{
1085	const struct iphdr *iph = (const struct iphdr *)skb->data;
1086	struct flowi4 fl4;
1087	struct rtable *rt;
1088
1089	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1090
1091	if (!fl4.flowi4_mark)
1092		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1093
1094	rt = __ip_route_output_key(sock_net(sk), &fl4);
1095	if (!IS_ERR(rt)) {
1096		__ip_rt_update_pmtu(rt, &fl4, mtu);
1097		ip_rt_put(rt);
1098	}
1099}
1100
1101void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1102{
1103	const struct iphdr *iph = (const struct iphdr *)skb->data;
1104	struct flowi4 fl4;
1105	struct rtable *rt;
1106	struct dst_entry *odst = NULL;
1107	bool new = false;
1108	struct net *net = sock_net(sk);
1109
1110	bh_lock_sock(sk);
1111
1112	if (!ip_sk_accept_pmtu(sk))
1113		goto out;
1114
1115	odst = sk_dst_get(sk);
1116
1117	if (sock_owned_by_user(sk) || !odst) {
1118		__ipv4_sk_update_pmtu(skb, sk, mtu);
1119		goto out;
1120	}
1121
1122	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1123
1124	rt = (struct rtable *)odst;
1125	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1126		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1127		if (IS_ERR(rt))
1128			goto out;
1129
1130		new = true;
1131	}
1132
1133	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1134
1135	if (!dst_check(&rt->dst, 0)) {
1136		if (new)
1137			dst_release(&rt->dst);
1138
1139		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1140		if (IS_ERR(rt))
1141			goto out;
1142
1143		new = true;
1144	}
1145
1146	if (new)
1147		sk_dst_set(sk, &rt->dst);
1148
1149out:
1150	bh_unlock_sock(sk);
1151	dst_release(odst);
1152}
1153EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1154
1155void ipv4_redirect(struct sk_buff *skb, struct net *net,
1156		   int oif, u8 protocol)
1157{
1158	const struct iphdr *iph = (const struct iphdr *)skb->data;
1159	struct flowi4 fl4;
1160	struct rtable *rt;
1161
1162	__build_flow_key(net, &fl4, NULL, iph, oif,
1163			 RT_TOS(iph->tos), protocol, 0, 0);
1164	rt = __ip_route_output_key(net, &fl4);
1165	if (!IS_ERR(rt)) {
1166		__ip_do_redirect(rt, skb, &fl4, false);
1167		ip_rt_put(rt);
1168	}
1169}
1170EXPORT_SYMBOL_GPL(ipv4_redirect);
1171
1172void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1173{
1174	const struct iphdr *iph = (const struct iphdr *)skb->data;
1175	struct flowi4 fl4;
1176	struct rtable *rt;
1177	struct net *net = sock_net(sk);
1178
1179	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1180	rt = __ip_route_output_key(net, &fl4);
1181	if (!IS_ERR(rt)) {
1182		__ip_do_redirect(rt, skb, &fl4, false);
1183		ip_rt_put(rt);
1184	}
1185}
1186EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1187
1188INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1189							 u32 cookie)
1190{
1191	struct rtable *rt = (struct rtable *) dst;
1192
1193	/* All IPV4 dsts are created with ->obsolete set to the value
1194	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1195	 * into this function always.
1196	 *
1197	 * When a PMTU/redirect information update invalidates a route,
1198	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1199	 * DST_OBSOLETE_DEAD.
1200	 */
1201	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1202		return NULL;
1203	return dst;
1204}
1205EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
1206
1207static void ipv4_send_dest_unreach(struct sk_buff *skb)
1208{
1209	struct ip_options opt;
1210	int res;
1211
1212	/* Recompile ip options since IPCB may not be valid anymore.
1213	 * Also check we have a reasonable ipv4 header.
1214	 */
1215	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1216	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1217		return;
1218
1219	memset(&opt, 0, sizeof(opt));
1220	if (ip_hdr(skb)->ihl > 5) {
1221		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1222			return;
1223		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1224
1225		rcu_read_lock();
1226		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1227		rcu_read_unlock();
1228
1229		if (res)
1230			return;
1231	}
1232	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1233}
1234
1235static void ipv4_link_failure(struct sk_buff *skb)
1236{
1237	struct rtable *rt;
1238
1239	ipv4_send_dest_unreach(skb);
1240
1241	rt = skb_rtable(skb);
1242	if (rt)
1243		dst_set_expires(&rt->dst, 0);
1244}
1245
1246static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1247{
1248	pr_debug("%s: %pI4 -> %pI4, %s\n",
1249		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1250		 skb->dev ? skb->dev->name : "?");
1251	kfree_skb(skb);
1252	WARN_ON(1);
1253	return 0;
1254}
1255
1256/*
1257 * We do not cache the source address of the outgoing interface,
1258 * because it is used only by the IP RR, TS and SRR options,
1259 * so it is out of the fast path.
1260 *
1261 * BTW remember: "addr" is allowed to be unaligned
1262 * within IP options!
1263 */
1264
1265void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1266{
1267	__be32 src;
1268
1269	if (rt_is_output_route(rt))
1270		src = ip_hdr(skb)->saddr;
1271	else {
1272		struct fib_result res;
1273		struct iphdr *iph = ip_hdr(skb);
1274		struct flowi4 fl4 = {
1275			.daddr = iph->daddr,
1276			.saddr = iph->saddr,
1277			.flowi4_tos = RT_TOS(iph->tos),
1278			.flowi4_oif = rt->dst.dev->ifindex,
1279			.flowi4_iif = skb->dev->ifindex,
1280			.flowi4_mark = skb->mark,
1281		};
1282
1283		rcu_read_lock();
1284		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1285			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1286		else
1287			src = inet_select_addr(rt->dst.dev,
1288					       rt_nexthop(rt, iph->daddr),
1289					       RT_SCOPE_UNIVERSE);
1290		rcu_read_unlock();
1291	}
1292	memcpy(addr, &src, 4);
1293}
1294
1295#ifdef CONFIG_IP_ROUTE_CLASSID
1296static void set_class_tag(struct rtable *rt, u32 tag)
1297{
1298	if (!(rt->dst.tclassid & 0xFFFF))
1299		rt->dst.tclassid |= tag & 0xFFFF;
1300	if (!(rt->dst.tclassid & 0xFFFF0000))
1301		rt->dst.tclassid |= tag & 0xFFFF0000;
1302}
1303#endif
1304
1305static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1306{
1307	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1308	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1309				    ip_rt_min_advmss);
1310
1311	return min(advmss, IPV4_MAX_PMTU - header_size);
1312}
1313
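     /* Effective MTU of a route: a non-expired learned rt_pmtu wins, then
      * an explicit RTAX_MTU metric, then the device MTU (capped at 576
      * when the MTU is locked and a gateway is used); the result is
      * clamped to IP_MAX_MTU and reduced by any lwtunnel headroom.
      */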
1314INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1315{
1316	const struct rtable *rt = (const struct rtable *)dst;
1317	unsigned int mtu = rt->rt_pmtu;
1318
1319	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1320		mtu = dst_metric_raw(dst, RTAX_MTU);
1321
1322	if (mtu)
1323		goto out;
1324
1325	mtu = READ_ONCE(dst->dev->mtu);
1326
1327	if (unlikely(ip_mtu_locked(dst))) {
1328		if (rt->rt_uses_gateway && mtu > 576)
1329			mtu = 576;
1330	}
1331
1332out:
1333	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1334
1335	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1336}
1337EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
1338
1339static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1340{
1341	struct fnhe_hash_bucket *hash;
1342	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1343	u32 hval = fnhe_hashfun(daddr);
1344
1345	spin_lock_bh(&fnhe_lock);
1346
1347	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1348					 lockdep_is_held(&fnhe_lock));
1349	hash += hval;
1350
1351	fnhe_p = &hash->chain;
1352	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1353	while (fnhe) {
1354		if (fnhe->fnhe_daddr == daddr) {
1355			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1356				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1357			/* set fnhe_daddr to 0 to ensure it won't bind with
1358			 * new dsts in rt_bind_exception().
1359			 */
1360			fnhe->fnhe_daddr = 0;
1361			fnhe_flush_routes(fnhe);
1362			kfree_rcu(fnhe, rcu);
1363			break;
1364		}
1365		fnhe_p = &fnhe->fnhe_next;
1366		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1367						 lockdep_is_held(&fnhe_lock));
1368	}
1369
1370	spin_unlock_bh(&fnhe_lock);
1371}
1372
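     /* Look up the exception entry for daddr on this nexthop.  Expired
      * entries are removed on the fly and treated as a miss.
      */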
1373static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1374					       __be32 daddr)
1375{
1376	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1377	struct fib_nh_exception *fnhe;
1378	u32 hval;
1379
1380	if (!hash)
1381		return NULL;
1382
1383	hval = fnhe_hashfun(daddr);
1384
1385	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1386	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1387		if (fnhe->fnhe_daddr == daddr) {
1388			if (fnhe->fnhe_expires &&
1389			    time_after(jiffies, fnhe->fnhe_expires)) {
1390				ip_del_fnhe(nhc, daddr);
1391				break;
1392			}
1393			return fnhe;
1394		}
1395	}
1396	return NULL;
1397}
1398
1399/* MTU selection:
1400 * 1. mtu on route is locked - use it
1401 * 2. mtu from nexthop exception
1402 * 3. mtu from egress device
1403 */
1404
1405u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1406{
1407	struct fib_nh_common *nhc = res->nhc;
1408	struct net_device *dev = nhc->nhc_dev;
1409	struct fib_info *fi = res->fi;
1410	u32 mtu = 0;
1411
1412	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1413	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1414		mtu = fi->fib_mtu;
1415
1416	if (likely(!mtu)) {
1417		struct fib_nh_exception *fnhe;
1418
1419		fnhe = find_exception(nhc, daddr);
1420		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1421			mtu = fnhe->fnhe_pmtu;
1422	}
1423
1424	if (likely(!mtu))
1425		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1426
1427	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1428}
1429
1430static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1431			      __be32 daddr, const bool do_cache)
1432{
1433	bool ret = false;
1434
1435	spin_lock_bh(&fnhe_lock);
1436
1437	if (daddr == fnhe->fnhe_daddr) {
1438		struct rtable __rcu **porig;
1439		struct rtable *orig;
1440		int genid = fnhe_genid(dev_net(rt->dst.dev));
1441
1442		if (rt_is_input_route(rt))
1443			porig = &fnhe->fnhe_rth_input;
1444		else
1445			porig = &fnhe->fnhe_rth_output;
1446		orig = rcu_dereference(*porig);
1447
1448		if (fnhe->fnhe_genid != genid) {
1449			fnhe->fnhe_genid = genid;
1450			fnhe->fnhe_gw = 0;
1451			fnhe->fnhe_pmtu = 0;
1452			fnhe->fnhe_expires = 0;
1453			fnhe->fnhe_mtu_locked = false;
1454			fnhe_flush_routes(fnhe);
1455			orig = NULL;
1456		}
1457		fill_route_from_fnhe(rt, fnhe);
1458		if (!rt->rt_gw4) {
1459			rt->rt_gw4 = daddr;
1460			rt->rt_gw_family = AF_INET;
1461		}
1462
1463		if (do_cache) {
1464			dst_hold(&rt->dst);
1465			rcu_assign_pointer(*porig, rt);
1466			if (orig) {
1467				dst_dev_put(&orig->dst);
1468				dst_release(&orig->dst);
1469			}
1470			ret = true;
1471		}
1472
1473		fnhe->fnhe_stamp = jiffies;
1474	}
1475	spin_unlock_bh(&fnhe_lock);
1476
1477	return ret;
1478}
1479
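     /* Try to cache rt as the nexthop's input route or per-CPU output
      * route.  cmpxchg() guards against a concurrent update; if we lose
      * the race the extra dst reference is dropped and false is returned.
      */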
1480static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1481{
1482	struct rtable *orig, *prev, **p;
1483	bool ret = true;
1484
1485	if (rt_is_input_route(rt)) {
1486		p = (struct rtable **)&nhc->nhc_rth_input;
1487	} else {
1488		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1489	}
1490	orig = *p;
1491
1492	/* hold dst before doing cmpxchg() to avoid race condition
1493	 * on this dst
1494	 */
1495	dst_hold(&rt->dst);
1496	prev = cmpxchg(p, orig, rt);
1497	if (prev == orig) {
1498		if (orig) {
1499			rt_add_uncached_list(orig);
1500			dst_release(&orig->dst);
1501		}
1502	} else {
1503		dst_release(&rt->dst);
1504		ret = false;
1505	}
1506
1507	return ret;
1508}
1509
1510struct uncached_list {
1511	spinlock_t		lock;
1512	struct list_head	head;
1513};
1514
1515static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1516
1517void rt_add_uncached_list(struct rtable *rt)
1518{
1519	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1520
1521	rt->rt_uncached_list = ul;
1522
1523	spin_lock_bh(&ul->lock);
1524	list_add_tail(&rt->rt_uncached, &ul->head);
1525	spin_unlock_bh(&ul->lock);
1526}
1527
1528void rt_del_uncached_list(struct rtable *rt)
1529{
1530	if (!list_empty(&rt->rt_uncached)) {
1531		struct uncached_list *ul = rt->rt_uncached_list;
1532
1533		spin_lock_bh(&ul->lock);
1534		list_del(&rt->rt_uncached);
1535		spin_unlock_bh(&ul->lock);
1536	}
1537}
1538
1539static void ipv4_dst_destroy(struct dst_entry *dst)
1540{
1541	struct rtable *rt = (struct rtable *)dst;
1542
1543	ip_dst_metrics_put(dst);
1544	rt_del_uncached_list(rt);
1545}
1546
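     /* Detach every uncached route still pointing at dev by re-targeting
      * rt->dst.dev to blackhole_netdev, so the device's reference count
      * can drop to zero.
      */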
1547void rt_flush_dev(struct net_device *dev)
1548{
1549	struct rtable *rt;
1550	int cpu;
1551
1552	for_each_possible_cpu(cpu) {
1553		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1554
1555		spin_lock_bh(&ul->lock);
1556		list_for_each_entry(rt, &ul->head, rt_uncached) {
1557			if (rt->dst.dev != dev)
1558				continue;
1559			rt->dst.dev = blackhole_netdev;
1560			dev_hold(rt->dst.dev);
1561			dev_put(dev);
1562		}
1563		spin_unlock_bh(&ul->lock);
1564	}
1565}
1566
1567static bool rt_cache_valid(const struct rtable *rt)
1568{
1569	return	rt &&
1570		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1571		!rt_is_expired(rt);
1572}
1573
1574static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1575			   const struct fib_result *res,
1576			   struct fib_nh_exception *fnhe,
1577			   struct fib_info *fi, u16 type, u32 itag,
1578			   const bool do_cache)
1579{
1580	bool cached = false;
1581
1582	if (fi) {
1583		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1584
1585		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1586			rt->rt_uses_gateway = 1;
1587			rt->rt_gw_family = nhc->nhc_gw_family;
1588			/* only INET and INET6 are supported */
1589			if (likely(nhc->nhc_gw_family == AF_INET))
1590				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1591			else
1592				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1593		}
1594
1595		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1596
1597#ifdef CONFIG_IP_ROUTE_CLASSID
1598		if (nhc->nhc_family == AF_INET) {
1599			struct fib_nh *nh;
1600
1601			nh = container_of(nhc, struct fib_nh, nh_common);
1602			rt->dst.tclassid = nh->nh_tclassid;
1603		}
1604#endif
1605		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1606		if (unlikely(fnhe))
1607			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1608		else if (do_cache)
1609			cached = rt_cache_route(nhc, rt);
1610		if (unlikely(!cached)) {
1611			/* Routes we intend to cache in nexthop exception or
1612			 * FIB nexthop have the DST_NOCACHE bit clear.
1613			 * However, if we are unsuccessful at storing this
1614			 * route into the cache we really need to set it.
1615			 */
1616			if (!rt->rt_gw4) {
1617				rt->rt_gw_family = AF_INET;
1618				rt->rt_gw4 = daddr;
1619			}
1620			rt_add_uncached_list(rt);
1621		}
1622	} else
1623		rt_add_uncached_list(rt);
1624
1625#ifdef CONFIG_IP_ROUTE_CLASSID
1626#ifdef CONFIG_IP_MULTIPLE_TABLES
1627	set_class_tag(rt, res->tclassid);
1628#endif
1629	set_class_tag(rt, itag);
1630#endif
1631}
1632
1633struct rtable *rt_dst_alloc(struct net_device *dev,
1634			    unsigned int flags, u16 type,
1635			    bool nopolicy, bool noxfrm)
1636{
1637	struct rtable *rt;
1638
1639	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1640		       (nopolicy ? DST_NOPOLICY : 0) |
1641		       (noxfrm ? DST_NOXFRM : 0));
1642
1643	if (rt) {
1644		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1645		rt->rt_flags = flags;
1646		rt->rt_type = type;
1647		rt->rt_is_input = 0;
1648		rt->rt_iif = 0;
1649		rt->rt_pmtu = 0;
1650		rt->rt_mtu_locked = 0;
1651		rt->rt_uses_gateway = 0;
1652		rt->rt_gw_family = 0;
1653		rt->rt_gw4 = 0;
1654		INIT_LIST_HEAD(&rt->rt_uncached);
1655
1656		rt->dst.output = ip_output;
1657		if (flags & RTCF_LOCAL)
1658			rt->dst.input = ip_local_deliver;
1659	}
1660
1661	return rt;
1662}
1663EXPORT_SYMBOL(rt_dst_alloc);
1664
1665struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1666{
1667	struct rtable *new_rt;
1668
1669	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1670			   rt->dst.flags);
1671
1672	if (new_rt) {
1673		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1674		new_rt->rt_flags = rt->rt_flags;
1675		new_rt->rt_type = rt->rt_type;
1676		new_rt->rt_is_input = rt->rt_is_input;
1677		new_rt->rt_iif = rt->rt_iif;
1678		new_rt->rt_pmtu = rt->rt_pmtu;
1679		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1680		new_rt->rt_gw_family = rt->rt_gw_family;
1681		if (rt->rt_gw_family == AF_INET)
1682			new_rt->rt_gw4 = rt->rt_gw4;
1683		else if (rt->rt_gw_family == AF_INET6)
1684			new_rt->rt_gw6 = rt->rt_gw6;
1685		INIT_LIST_HEAD(&new_rt->rt_uncached);
1686
1687		new_rt->dst.input = rt->dst.input;
1688		new_rt->dst.output = rt->dst.output;
1689		new_rt->dst.error = rt->dst.error;
1690		new_rt->dst.lastuse = jiffies;
1691		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1692	}
1693	return new_rt;
1694}
1695EXPORT_SYMBOL(rt_dst_clone);
1696
1697/* called in rcu_read_lock() section */
1698int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1699			  u8 tos, struct net_device *dev,
1700			  struct in_device *in_dev, u32 *itag)
1701{
1702	int err;
1703
1704	/* Primary sanity checks. */
1705	if (!in_dev)
1706		return -EINVAL;
1707
1708	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1709	    skb->protocol != htons(ETH_P_IP))
1710		return -EINVAL;
1711
1712	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1713		return -EINVAL;
1714
1715	if (ipv4_is_zeronet(saddr)) {
1716		if (!ipv4_is_local_multicast(daddr) &&
1717		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1718			return -EINVAL;
1719	} else {
1720		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1721					  in_dev, itag);
1722		if (err < 0)
1723			return err;
1724	}
1725	return 0;
1726}
1727
1728/* called in rcu_read_lock() section */
1729static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1730			     u8 tos, struct net_device *dev, int our)
1731{
1732	struct in_device *in_dev = __in_dev_get_rcu(dev);
1733	unsigned int flags = RTCF_MULTICAST;
1734	struct rtable *rth;
1735	u32 itag = 0;
1736	int err;
1737
1738	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1739	if (err)
1740		return err;
1741
1742	if (our)
1743		flags |= RTCF_LOCAL;
1744
1745	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1746			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
1747	if (!rth)
1748		return -ENOBUFS;
1749
1750#ifdef CONFIG_IP_ROUTE_CLASSID
1751	rth->dst.tclassid = itag;
1752#endif
1753	rth->dst.output = ip_rt_bug;
1754	rth->rt_is_input = 1;
1755
1756#ifdef CONFIG_IP_MROUTE
1757	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1758		rth->dst.input = ip_mr_input;
1759#endif
1760	RT_CACHE_STAT_INC(in_slow_mc);
1761
1762	skb_dst_set(skb, &rth->dst);
1763	return 0;
1764}
1765
1766
1767static void ip_handle_martian_source(struct net_device *dev,
1768				     struct in_device *in_dev,
1769				     struct sk_buff *skb,
1770				     __be32 daddr,
1771				     __be32 saddr)
1772{
1773	RT_CACHE_STAT_INC(in_martian_src);
1774#ifdef CONFIG_IP_ROUTE_VERBOSE
1775	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1776		/*
1777		 *	Per the RFC1812 recommendation, if the source is martian,
1778		 *	the only hint is the MAC header.
1779		 */
1780		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1781			&daddr, &saddr, dev->name);
1782		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1783			print_hex_dump(KERN_WARNING, "ll header: ",
1784				       DUMP_PREFIX_OFFSET, 16, 1,
1785				       skb_mac_header(skb),
1786				       dev->hard_header_len, false);
1787		}
1788	}
1789#endif
1790}
1791
1792/* called in rcu_read_lock() section */
1793static int __mkroute_input(struct sk_buff *skb,
1794			   const struct fib_result *res,
1795			   struct in_device *in_dev,
1796			   __be32 daddr, __be32 saddr, u32 tos)
1797{
1798	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1799	struct net_device *dev = nhc->nhc_dev;
1800	struct fib_nh_exception *fnhe;
1801	struct rtable *rth;
1802	int err;
1803	struct in_device *out_dev;
1804	bool do_cache;
1805	u32 itag = 0;
1806
1807	/* get a working reference to the output device */
1808	out_dev = __in_dev_get_rcu(dev);
1809	if (!out_dev) {
1810		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1811		return -EINVAL;
1812	}
1813
1814	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1815				  in_dev->dev, in_dev, &itag);
1816	if (err < 0) {
1817		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1818					 saddr);
1819
1820		goto cleanup;
1821	}
1822
1823	do_cache = res->fi && !itag;
1824	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1825	    skb->protocol == htons(ETH_P_IP)) {
1826		__be32 gw;
1827
1828		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1829		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1830		    inet_addr_onlink(out_dev, saddr, gw))
1831			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1832	}
1833
1834	if (skb->protocol != htons(ETH_P_IP)) {
1835		/* Not IP (i.e. ARP). Do not create a route if it is
1836		 * invalid for proxy ARP. DNAT routes are always valid.
1837		 *
1838		 * The proxy ARP feature has been extended to allow ARP
1839		 * replies back on the same interface, to support
1840		 * Private VLAN switch technologies. See arp.c.
1841		 */
1842		if (out_dev == in_dev &&
1843		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1844			err = -EINVAL;
1845			goto cleanup;
1846		}
1847	}
1848
1849	fnhe = find_exception(nhc, daddr);
1850	if (do_cache) {
1851		if (fnhe)
1852			rth = rcu_dereference(fnhe->fnhe_rth_input);
1853		else
1854			rth = rcu_dereference(nhc->nhc_rth_input);
1855		if (rt_cache_valid(rth)) {
1856			skb_dst_set_noref(skb, &rth->dst);
1857			goto out;
1858		}
1859	}
1860
1861	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1862			   IN_DEV_ORCONF(in_dev, NOPOLICY),
1863			   IN_DEV_ORCONF(out_dev, NOXFRM));
1864	if (!rth) {
1865		err = -ENOBUFS;
1866		goto cleanup;
1867	}
1868
1869	rth->rt_is_input = 1;
1870	RT_CACHE_STAT_INC(in_slow_tot);
1871
1872	rth->dst.input = ip_forward;
1873
1874	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1875		       do_cache);
1876	lwtunnel_set_redirect(&rth->dst);
1877	skb_dst_set(skb, &rth->dst);
1878out:
1879	err = 0;
1880 cleanup:
1881	return err;
1882}
1883
1884#ifdef CONFIG_IP_ROUTE_MULTIPATH
1885/* To make ICMP packets follow the right flow, the multipath hash is
1886 * calculated from the inner IP addresses.
1887 */
1888static void ip_multipath_l3_keys(const struct sk_buff *skb,
1889				 struct flow_keys *hash_keys)
1890{
1891	const struct iphdr *outer_iph = ip_hdr(skb);
1892	const struct iphdr *key_iph = outer_iph;
1893	const struct iphdr *inner_iph;
1894	const struct icmphdr *icmph;
1895	struct iphdr _inner_iph;
1896	struct icmphdr _icmph;
1897
1898	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1899		goto out;
1900
1901	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1902		goto out;
1903
1904	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1905				   &_icmph);
1906	if (!icmph)
1907		goto out;
1908
1909	if (!icmp_is_err(icmph->type))
1910		goto out;
1911
1912	inner_iph = skb_header_pointer(skb,
1913				       outer_iph->ihl * 4 + sizeof(_icmph),
1914				       sizeof(_inner_iph), &_inner_iph);
1915	if (!inner_iph)
1916		goto out;
1917
1918	key_iph = inner_iph;
1919out:
1920	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1921	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1922}
1923
1924static u32 fib_multipath_custom_hash_outer(const struct net *net,
1925					   const struct sk_buff *skb,
1926					   bool *p_has_inner)
1927{
1928	u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields;
1929	struct flow_keys keys, hash_keys;
1930
1931	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1932		return 0;
1933
1934	memset(&hash_keys, 0, sizeof(hash_keys));
1935	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1936
1937	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1938	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1939		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1940	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1941		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1942	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1943		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1944	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1945		hash_keys.ports.src = keys.ports.src;
1946	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1947		hash_keys.ports.dst = keys.ports.dst;
1948
1949	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1950	return flow_hash_from_keys(&hash_keys);
1951}
1952
1953static u32 fib_multipath_custom_hash_inner(const struct net *net,
1954					   const struct sk_buff *skb,
1955					   bool has_inner)
1956{
1957	u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields;
1958	struct flow_keys keys, hash_keys;
1959
1960	/* We assume the packet carries an encapsulation, but if none was
1961	 * encountered during dissection of the outer flow, then there is no
1962	 * point in calling the flow dissector again.
1963	 */
1964	if (!has_inner)
1965		return 0;
1966
1967	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1968		return 0;
1969
1970	memset(&hash_keys, 0, sizeof(hash_keys));
1971	skb_flow_dissect_flow_keys(skb, &keys, 0);
1972
1973	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1974		return 0;
1975
1976	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1977		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1978		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1979			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1980		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1981			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1982	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1983		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1984		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1985			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1986		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1987			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1988		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1989			hash_keys.tags.flow_label = keys.tags.flow_label;
1990	}
1991
1992	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
1993		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1994	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
1995		hash_keys.ports.src = keys.ports.src;
1996	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
1997		hash_keys.ports.dst = keys.ports.dst;
1998
1999	return flow_hash_from_keys(&hash_keys);
2000}
2001
2002static u32 fib_multipath_custom_hash_skb(const struct net *net,
2003					 const struct sk_buff *skb)
2004{
2005	u32 mhash, mhash_inner;
2006	bool has_inner = true;
2007
2008	mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
2009	mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
2010
2011	return jhash_2words(mhash, mhash_inner, 0);
2012}
2013
2014static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2015					 const struct flowi4 *fl4)
2016{
2017	u32 hash_fields = net->ipv4.sysctl_fib_multipath_hash_fields;
2018	struct flow_keys hash_keys;
2019
2020	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2021		return 0;
2022
2023	memset(&hash_keys, 0, sizeof(hash_keys));
2024	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2025	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2026		hash_keys.addrs.v4addrs.src = fl4->saddr;
2027	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2028		hash_keys.addrs.v4addrs.dst = fl4->daddr;
2029	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2030		hash_keys.basic.ip_proto = fl4->flowi4_proto;
2031	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2032		hash_keys.ports.src = fl4->fl4_sport;
2033	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2034		hash_keys.ports.dst = fl4->fl4_dport;
2035
2036	return flow_hash_from_keys(&hash_keys);
2037}
2038
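     /* Multipath hash used to pick a nexthop.  The hash policy sysctl
      * selects the inputs: 0 hashes the L3 addresses (using the inner IP
      * header of ICMP errors), 1 adds L4 ports and protocol (reusing an
      * existing skb hash when present), 2 hashes the inner L3 header of
      * encapsulated packets, and 3 uses the custom field mask from
      * sysctl_fib_multipath_hash_fields.
      */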
2039/* if skb is set it will be used and fl4 can be NULL */
2040int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2041		       const struct sk_buff *skb, struct flow_keys *flkeys)
2042{
2043	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2044	struct flow_keys hash_keys;
2045	u32 mhash = 0;
2046
2047	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
2048	case 0:
2049		memset(&hash_keys, 0, sizeof(hash_keys));
2050		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2051		if (skb) {
2052			ip_multipath_l3_keys(skb, &hash_keys);
2053		} else {
2054			hash_keys.addrs.v4addrs.src = fl4->saddr;
2055			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2056		}
2057		mhash = flow_hash_from_keys(&hash_keys);
2058		break;
2059	case 1:
2060		/* skb is currently provided only when forwarding */
2061		if (skb) {
2062			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2063			struct flow_keys keys;
2064
2065			/* short-circuit if we already have L4 hash present */
2066			if (skb->l4_hash)
2067				return skb_get_hash_raw(skb) >> 1;
2068
2069			memset(&hash_keys, 0, sizeof(hash_keys));
2070
2071			if (!flkeys) {
2072				skb_flow_dissect_flow_keys(skb, &keys, flag);
2073				flkeys = &keys;
2074			}
2075
2076			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2077			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2078			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2079			hash_keys.ports.src = flkeys->ports.src;
2080			hash_keys.ports.dst = flkeys->ports.dst;
2081			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2082		} else {
2083			memset(&hash_keys, 0, sizeof(hash_keys));
2084			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2085			hash_keys.addrs.v4addrs.src = fl4->saddr;
2086			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2087			hash_keys.ports.src = fl4->fl4_sport;
2088			hash_keys.ports.dst = fl4->fl4_dport;
2089			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2090		}
2091		mhash = flow_hash_from_keys(&hash_keys);
2092		break;
2093	case 2:
2094		memset(&hash_keys, 0, sizeof(hash_keys));
2095		/* skb is currently provided only when forwarding */
2096		if (skb) {
2097			struct flow_keys keys;
2098
2099			skb_flow_dissect_flow_keys(skb, &keys, 0);
2100			/* Inner can be v4 or v6 */
2101			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2102				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2103				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2104				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2105			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2106				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2107				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2108				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2109				hash_keys.tags.flow_label = keys.tags.flow_label;
2110				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2111			} else {
2112				/* Same as case 0 */
2113				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2114				ip_multipath_l3_keys(skb, &hash_keys);
2115			}
2116		} else {
2117			/* Same as case 0 */
2118			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2119			hash_keys.addrs.v4addrs.src = fl4->saddr;
2120			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2121		}
2122		mhash = flow_hash_from_keys(&hash_keys);
2123		break;
2124	case 3:
2125		if (skb)
2126			mhash = fib_multipath_custom_hash_skb(net, skb);
2127		else
2128			mhash = fib_multipath_custom_hash_fl4(net, fl4);
2129		break;
2130	}
2131
2132	if (multipath_hash)
2133		mhash = jhash_2words(mhash, multipath_hash, 0);
2134
2135	return mhash >> 1;
2136}
2137#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2138
2139static int ip_mkroute_input(struct sk_buff *skb,
2140			    struct fib_result *res,
2141			    struct in_device *in_dev,
2142			    __be32 daddr, __be32 saddr, u32 tos,
2143			    struct flow_keys *hkeys)
2144{
2145#ifdef CONFIG_IP_ROUTE_MULTIPATH
2146	if (res->fi && fib_info_num_path(res->fi) > 1) {
2147		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2148
2149		fib_select_multipath(res, h);
2150	}
2151#endif
2152
2153	/* create a routing cache entry */
2154	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2155}
2156
2157/* Implements the same saddr-related checks as ip_route_input_slow(),
2158 * assuming daddr is valid and the destination is not a local broadcast one.
2159 * Uses the provided hint instead of performing a route lookup.
2160 */
2161int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2162		      u8 tos, struct net_device *dev,
2163		      const struct sk_buff *hint)
2164{
2165	struct in_device *in_dev = __in_dev_get_rcu(dev);
2166	struct rtable *rt = skb_rtable(hint);
2167	struct net *net = dev_net(dev);
2168	int err = -EINVAL;
2169	u32 tag = 0;
2170
2171	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2172		goto martian_source;
2173
2174	if (ipv4_is_zeronet(saddr))
2175		goto martian_source;
2176
2177	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2178		goto martian_source;
2179
2180	if (rt->rt_type != RTN_LOCAL)
2181		goto skip_validate_source;
2182
2183	tos &= IPTOS_RT_MASK;
2184	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2185	if (err < 0)
2186		goto martian_source;
2187
2188skip_validate_source:
2189	skb_dst_copy(skb, hint);
2190	return 0;
2191
2192martian_source:
2193	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2194	return err;
2195}
2196
2197/* get device for dst_alloc with local routes */
2198static struct net_device *ip_rt_get_dev(struct net *net,
2199					const struct fib_result *res)
2200{
2201	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2202	struct net_device *dev = NULL;
2203
2204	if (nhc)
2205		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2206
2207	return dev ? : net->loopback_dev;
2208}
2209
2210/*
2211 *	NOTE. We drop all packets that have local source
2212 *	addresses, because every properly looped-back packet
2213 *	must already have the correct destination attached by the output routine.
2214 *	Changes in the enforced policies must also be applied to
2215 *	ip_route_use_hint().
2216 *
2217 *	Such an approach solves two big problems:
2218 *	1. Non-simplex devices are handled properly.
2219 *	2. IP spoofing attempts are filtered with 100% guarantee.
2220 *	Called with rcu_read_lock().
2221 */
2222
2223static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2224			       u8 tos, struct net_device *dev,
2225			       struct fib_result *res)
2226{
2227	struct in_device *in_dev = __in_dev_get_rcu(dev);
2228	struct flow_keys *flkeys = NULL, _flkeys;
2229	struct net    *net = dev_net(dev);
2230	struct ip_tunnel_info *tun_info;
2231	int		err = -EINVAL;
2232	unsigned int	flags = 0;
2233	u32		itag = 0;
2234	struct rtable	*rth;
2235	struct flowi4	fl4;
2236	bool do_cache = true;
2237
2238	/* IP on this device is disabled. */
2239
2240	if (!in_dev)
2241		goto out;
2242
2243	/* Check for the most weird martians, which cannot be detected
2244	 * by fib_lookup.
2245	 */
2246
2247	tun_info = skb_tunnel_info(skb);
2248	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2249		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2250	else
2251		fl4.flowi4_tun_key.tun_id = 0;
2252	skb_dst_drop(skb);
2253
2254	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2255		goto martian_source;
2256
2257	res->fi = NULL;
2258	res->table = NULL;
2259	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2260		goto brd_input;
2261
2262	/* Accept zero addresses only to limited broadcast;
2263	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2264	 */
2265	if (ipv4_is_zeronet(saddr))
2266		goto martian_source;
2267
2268	if (ipv4_is_zeronet(daddr))
2269		goto martian_destination;
2270
2271	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2272	 * calling it at most once when daddr and/or saddr are loopback addresses
2273	 */
2274	if (ipv4_is_loopback(daddr)) {
2275		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2276			goto martian_destination;
2277	} else if (ipv4_is_loopback(saddr)) {
2278		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2279			goto martian_source;
2280	}
2281
2282	/*
2283	 *	Now we are ready to route packet.
2284	 */
2285	fl4.flowi4_oif = 0;
2286	fl4.flowi4_iif = dev->ifindex;
2287	fl4.flowi4_mark = skb->mark;
2288	fl4.flowi4_tos = tos;
2289	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2290	fl4.flowi4_flags = 0;
2291	fl4.daddr = daddr;
2292	fl4.saddr = saddr;
2293	fl4.flowi4_uid = sock_net_uid(net, NULL);
2294	fl4.flowi4_multipath_hash = 0;
2295
2296	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2297		flkeys = &_flkeys;
2298	} else {
2299		fl4.flowi4_proto = 0;
2300		fl4.fl4_sport = 0;
2301		fl4.fl4_dport = 0;
2302	}
2303
2304	err = fib_lookup(net, &fl4, res, 0);
2305	if (err != 0) {
2306		if (!IN_DEV_FORWARD(in_dev))
2307			err = -EHOSTUNREACH;
2308		goto no_route;
2309	}
2310
2311	if (res->type == RTN_BROADCAST) {
2312		if (IN_DEV_BFORWARD(in_dev))
2313			goto make_route;
2314		/* do not cache if bc_forwarding is enabled */
2315		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2316			do_cache = false;
2317		goto brd_input;
2318	}
2319
2320	if (res->type == RTN_LOCAL) {
2321		err = fib_validate_source(skb, saddr, daddr, tos,
2322					  0, dev, in_dev, &itag);
2323		if (err < 0)
2324			goto martian_source;
2325		goto local_input;
2326	}
2327
2328	if (!IN_DEV_FORWARD(in_dev)) {
2329		err = -EHOSTUNREACH;
2330		goto no_route;
2331	}
2332	if (res->type != RTN_UNICAST)
2333		goto martian_destination;
2334
2335make_route:
2336	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2337out:	return err;
2338
2339brd_input:
2340	if (skb->protocol != htons(ETH_P_IP))
2341		goto e_inval;
2342
2343	if (!ipv4_is_zeronet(saddr)) {
2344		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2345					  in_dev, &itag);
2346		if (err < 0)
2347			goto martian_source;
2348	}
2349	flags |= RTCF_BROADCAST;
2350	res->type = RTN_BROADCAST;
2351	RT_CACHE_STAT_INC(in_brd);
2352
2353local_input:
2354	do_cache &= res->fi && !itag;
2355	if (do_cache) {
2356		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2357
2358		rth = rcu_dereference(nhc->nhc_rth_input);
2359		if (rt_cache_valid(rth)) {
2360			skb_dst_set_noref(skb, &rth->dst);
2361			err = 0;
2362			goto out;
2363		}
2364	}
2365
2366	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2367			   flags | RTCF_LOCAL, res->type,
2368			   IN_DEV_ORCONF(in_dev, NOPOLICY), false);
2369	if (!rth)
2370		goto e_nobufs;
2371
2372	rth->dst.output = ip_rt_bug;
2373#ifdef CONFIG_IP_ROUTE_CLASSID
2374	rth->dst.tclassid = itag;
2375#endif
2376	rth->rt_is_input = 1;
2377
2378	RT_CACHE_STAT_INC(in_slow_tot);
2379	if (res->type == RTN_UNREACHABLE) {
2380		rth->dst.input = ip_error;
2381		rth->dst.error = -err;
2382		rth->rt_flags	&= ~RTCF_LOCAL;
2383	}
2384
2385	if (do_cache) {
2386		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2387
2388		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2389		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2390			WARN_ON(rth->dst.input == lwtunnel_input);
2391			rth->dst.lwtstate->orig_input = rth->dst.input;
2392			rth->dst.input = lwtunnel_input;
2393		}
2394
2395		if (unlikely(!rt_cache_route(nhc, rth)))
2396			rt_add_uncached_list(rth);
2397	}
2398	skb_dst_set(skb, &rth->dst);
2399	err = 0;
2400	goto out;
2401
2402no_route:
2403	RT_CACHE_STAT_INC(in_no_route);
2404	res->type = RTN_UNREACHABLE;
2405	res->fi = NULL;
2406	res->table = NULL;
2407	goto local_input;
2408
2409	/*
2410	 *	Do not cache martian addresses: they should be logged (RFC1812)
2411	 */
2412martian_destination:
2413	RT_CACHE_STAT_INC(in_martian_dst);
2414#ifdef CONFIG_IP_ROUTE_VERBOSE
2415	if (IN_DEV_LOG_MARTIANS(in_dev))
2416		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2417				     &daddr, &saddr, dev->name);
2418#endif
2419
2420e_inval:
2421	err = -EINVAL;
2422	goto out;
2423
2424e_nobufs:
2425	err = -ENOBUFS;
2426	goto out;
2427
2428martian_source:
2429	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2430	goto out;
2431}
2432
2433int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2434			 u8 tos, struct net_device *dev)
2435{
2436	struct fib_result res;
2437	int err;
2438
2439	tos &= IPTOS_RT_MASK;
2440	rcu_read_lock();
2441	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2442	rcu_read_unlock();
2443
2444	return err;
2445}
2446EXPORT_SYMBOL(ip_route_input_noref);
2447
2448/* called with rcu_read_lock held */
2449int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2450		       u8 tos, struct net_device *dev, struct fib_result *res)
2451{
2452	/* Multicast recognition logic is moved from route cache to here.
2453	 * The problem was that too many Ethernet cards have broken/missing
2454	 * hardware multicast filters :-( As a result, a host on a multicast
2455	 * network acquires a lot of useless route cache entries, e.g. for
2456	 * SDR messages from all over the world. Now we try to get rid of them.
2457	 * Really, provided the software IP multicast filter is organized
2458	 * reasonably (at least, hashed), it does not result in a slowdown
2459	 * compared with route cache reject entries.
2460	 * Note that multicast routers are not affected, because a
2461	 * route cache entry is created eventually.
2462	 */
2463	if (ipv4_is_multicast(daddr)) {
2464		struct in_device *in_dev = __in_dev_get_rcu(dev);
2465		int our = 0;
2466		int err = -EINVAL;
2467
2468		if (!in_dev)
2469			return err;
2470		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2471				      ip_hdr(skb)->protocol);
2472
2473		/* check l3 master if no match yet */
2474		if (!our && netif_is_l3_slave(dev)) {
2475			struct in_device *l3_in_dev;
2476
2477			l3_in_dev = __in_dev_get_rcu(skb->dev);
2478			if (l3_in_dev)
2479				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2480						      ip_hdr(skb)->protocol);
2481		}
2482
2483		if (our
2484#ifdef CONFIG_IP_MROUTE
2485			||
2486		    (!ipv4_is_local_multicast(daddr) &&
2487		     IN_DEV_MFORWARD(in_dev))
2488#endif
2489		   ) {
2490			err = ip_route_input_mc(skb, daddr, saddr,
2491						tos, dev, our);
2492		}
2493		return err;
2494	}
2495
2496	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2497}
2498
2499/* called with rcu_read_lock() */
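     /* Build an output route for the FIB result.  A valid cached dst from
      * the nexthop (or from a matching exception) is reused when possible;
      * otherwise a new rtable is allocated and, when allowed, cached.
      */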
2500static struct rtable *__mkroute_output(const struct fib_result *res,
2501				       const struct flowi4 *fl4, int orig_oif,
2502				       struct net_device *dev_out,
2503				       unsigned int flags)
2504{
2505	struct fib_info *fi = res->fi;
2506	struct fib_nh_exception *fnhe;
2507	struct in_device *in_dev;
2508	u16 type = res->type;
2509	struct rtable *rth;
2510	bool do_cache;
2511
2512	in_dev = __in_dev_get_rcu(dev_out);
2513	if (!in_dev)
2514		return ERR_PTR(-EINVAL);
2515
2516	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2517		if (ipv4_is_loopback(fl4->saddr) &&
2518		    !(dev_out->flags & IFF_LOOPBACK) &&
2519		    !netif_is_l3_master(dev_out))
2520			return ERR_PTR(-EINVAL);
2521
2522	if (ipv4_is_lbcast(fl4->daddr))
2523		type = RTN_BROADCAST;
2524	else if (ipv4_is_multicast(fl4->daddr))
2525		type = RTN_MULTICAST;
2526	else if (ipv4_is_zeronet(fl4->daddr))
2527		return ERR_PTR(-EINVAL);
2528
2529	if (dev_out->flags & IFF_LOOPBACK)
2530		flags |= RTCF_LOCAL;
2531
2532	do_cache = true;
2533	if (type == RTN_BROADCAST) {
2534		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2535		fi = NULL;
2536	} else if (type == RTN_MULTICAST) {
2537		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2538		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2539				     fl4->flowi4_proto))
2540			flags &= ~RTCF_LOCAL;
2541		else
2542			do_cache = false;
2543		/* If a multicast route does not exist, use the
2544		 * default one, but do not gateway in this case.
2545		 * Yes, it is a hack.
2546		 */
2547		if (fi && res->prefixlen < 4)
2548			fi = NULL;
2549	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2550		   (orig_oif != dev_out->ifindex)) {
2551		/* For local routes that require a particular output interface
2552		 * we do not want to cache the result.  Caching the result
2553		 * causes incorrect behaviour when there are multiple source
2554		 * addresses on the interface, the end result being that if the
2555		 * intended recipient is waiting on that interface for the
2556		 * packet he won't receive it because it will be delivered on
2557		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2558		 * be set to the loopback interface as well.
2559		 */
2560		do_cache = false;
2561	}
2562
2563	fnhe = NULL;
2564	do_cache &= fi != NULL;
2565	if (fi) {
2566		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2567		struct rtable __rcu **prth;
2568
2569		fnhe = find_exception(nhc, fl4->daddr);
2570		if (!do_cache)
2571			goto add;
2572		if (fnhe) {
2573			prth = &fnhe->fnhe_rth_output;
2574		} else {
2575			if (unlikely(fl4->flowi4_flags &
2576				     FLOWI_FLAG_KNOWN_NH &&
2577				     !(nhc->nhc_gw_family &&
2578				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2579				do_cache = false;
2580				goto add;
2581			}
2582			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2583		}
2584		rth = rcu_dereference(*prth);
2585		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2586			return rth;
2587	}
2588
2589add:
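	/* No reusable cached route: allocate and initialise a fresh rtable. */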
2590	rth = rt_dst_alloc(dev_out, flags, type,
2591			   IN_DEV_ORCONF(in_dev, NOPOLICY),
2592			   IN_DEV_ORCONF(in_dev, NOXFRM));
2593	if (!rth)
2594		return ERR_PTR(-ENOBUFS);
2595
2596	rth->rt_iif = orig_oif;
2597
2598	RT_CACHE_STAT_INC(out_slow_tot);
2599
2600	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2601		if (flags & RTCF_LOCAL &&
2602		    !(dev_out->flags & IFF_LOOPBACK)) {
2603			rth->dst.output = ip_mc_output;
2604			RT_CACHE_STAT_INC(out_slow_mc);
2605		}
2606#ifdef CONFIG_IP_MROUTE
2607		if (type == RTN_MULTICAST) {
2608			if (IN_DEV_MFORWARD(in_dev) &&
2609			    !ipv4_is_local_multicast(fl4->daddr)) {
2610				rth->dst.input = ip_mr_input;
2611				rth->dst.output = ip_mc_output;
2612			}
2613		}
2614#endif
2615	}
2616
2617	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2618	lwtunnel_set_redirect(&rth->dst);
2619
2620	return rth;
2621}
2622
2623/*
2624 * Major route resolver routine.
2625 */
2626
2627struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2628					const struct sk_buff *skb)
2629{
2630	__u8 tos = RT_FL_TOS(fl4);
2631	struct fib_result res = {
2632		.type		= RTN_UNSPEC,
2633		.fi		= NULL,
2634		.table		= NULL,
2635		.tclassid	= 0,
2636	};
2637	struct rtable *rth;
2638
2639	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2640	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2641	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2642			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2643
2644	rcu_read_lock();
2645	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2646	rcu_read_unlock();
2647
2648	return rth;
2649}
2650EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2651
2652struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2653					    struct fib_result *res,
2654					    const struct sk_buff *skb)
2655{
2656	struct net_device *dev_out = NULL;
2657	int orig_oif = fl4->flowi4_oif;
2658	unsigned int flags = 0;
2659	struct rtable *rth;
2660	int err;
2661
2662	if (fl4->saddr) {
2663		if (ipv4_is_multicast(fl4->saddr) ||
2664		    ipv4_is_lbcast(fl4->saddr) ||
2665		    ipv4_is_zeronet(fl4->saddr)) {
2666			rth = ERR_PTR(-EINVAL);
2667			goto out;
2668		}
2669
2670		rth = ERR_PTR(-ENETUNREACH);
2671
2672		/* I removed the check for oif == dev_out->oif here.
2673		 * It was wrong for two reasons:
2674		 * 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2675		 *    is assigned to multiple interfaces.
2676		 * 2. Moreover, we are allowed to send packets with a saddr
2677		 *    of another iface. --ANK
2678		 */
2679
2680		if (fl4->flowi4_oif == 0 &&
2681		    (ipv4_is_multicast(fl4->daddr) ||
2682		     ipv4_is_lbcast(fl4->daddr))) {
2683			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2684			dev_out = __ip_dev_find(net, fl4->saddr, false);
2685			if (!dev_out)
2686				goto out;
2687
2688			/* Special hack: the user can direct multicasts
2689			 * and limited broadcast via the necessary interface
2690			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2691			 * This hack is not just for fun, it allows
2692			 * vic, vat and friends to work.
2693			 * They bind the socket to loopback, set the ttl to zero
2694			 * and expect that it will work.
2695			 * From the viewpoint of the routing cache they are broken,
2696			 * because we are not allowed to build a multicast path
2697			 * with a loopback source addr (look, the routing cache
2698			 * cannot know that the ttl is zero, so the packet
2699			 * will not leave this host and the route is valid).
2700			 * Luckily, this hack is a good workaround.
2701			 */
2702
2703			fl4->flowi4_oif = dev_out->ifindex;
2704			goto make_route;
2705		}
2706
2707		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2708			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2709			if (!__ip_dev_find(net, fl4->saddr, false))
2710				goto out;
2711		}
2712	}
2713
2714
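	/* An explicit output interface constrains the device used for the
	 * lookup and, if saddr is still unset, the source address selection.
	 */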
2715	if (fl4->flowi4_oif) {
2716		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2717		rth = ERR_PTR(-ENODEV);
2718		if (!dev_out)
2719			goto out;
2720
2721		/* RACE: Check return value of inet_select_addr instead. */
2722		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2723			rth = ERR_PTR(-ENETUNREACH);
2724			goto out;
2725		}
2726		if (ipv4_is_local_multicast(fl4->daddr) ||
2727		    ipv4_is_lbcast(fl4->daddr) ||
2728		    fl4->flowi4_proto == IPPROTO_IGMP) {
2729			if (!fl4->saddr)
2730				fl4->saddr = inet_select_addr(dev_out, 0,
2731							      RT_SCOPE_LINK);
2732			goto make_route;
2733		}
2734		if (!fl4->saddr) {
2735			if (ipv4_is_multicast(fl4->daddr))
2736				fl4->saddr = inet_select_addr(dev_out, 0,
2737							      fl4->flowi4_scope);
2738			else if (!fl4->daddr)
2739				fl4->saddr = inet_select_addr(dev_out, 0,
2740							      RT_SCOPE_HOST);
2741		}
2742	}
2743
2744	if (!fl4->daddr) {
2745		fl4->daddr = fl4->saddr;
2746		if (!fl4->daddr)
2747			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2748		dev_out = net->loopback_dev;
2749		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2750		res->type = RTN_LOCAL;
2751		flags |= RTCF_LOCAL;
2752		goto make_route;
2753	}
2754
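	/* Normal case: ask the FIB which route covers the destination. */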
2755	err = fib_lookup(net, fl4, res, 0);
2756	if (err) {
2757		res->fi = NULL;
2758		res->table = NULL;
2759		if (fl4->flowi4_oif &&
2760		    (ipv4_is_multicast(fl4->daddr) ||
2761		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2762			/* Apparently, the routing tables are wrong. Assume
2763			 * that the destination is on-link.
2764			 *
2765			 * WHY? DW.
2766			 * Because we are allowed to send to an iface
2767			 * even if it has NO routes and NO assigned
2768			 * addresses. When oif is specified, the routing
2769			 * tables are looked up with only one purpose:
2770			 * to catch whether the destination is gatewayed rather
2771			 * than direct. Moreover, if MSG_DONTROUTE is set,
2772			 * we send the packet, ignoring both routing tables
2773			 * and ifaddr state. --ANK
2774			 *
2775			 *
2776			 * We could do this even if oif is unknown,
2777			 * likely IPv6, but we do not.
2778			 */
2779
2780			if (fl4->saddr == 0)
2781				fl4->saddr = inet_select_addr(dev_out, 0,
2782							      RT_SCOPE_LINK);
2783			res->type = RTN_UNICAST;
2784			goto make_route;
2785		}
2786		rth = ERR_PTR(err);
2787		goto out;
2788	}
2789
2790	if (res->type == RTN_LOCAL) {
2791		if (!fl4->saddr) {
2792			if (res->fi->fib_prefsrc)
2793				fl4->saddr = res->fi->fib_prefsrc;
2794			else
2795				fl4->saddr = fl4->daddr;
2796		}
2797
2798		/* L3 master device is the loopback for that domain */
2799		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2800			net->loopback_dev;
2801
2802		/* make sure orig_oif points to the fib result device even
2803		 * though packet rx/tx happens over loopback or the l3mdev
2804		 */
2805		orig_oif = FIB_RES_OIF(*res);
2806
2807		fl4->flowi4_oif = dev_out->ifindex;
2808		flags |= RTCF_LOCAL;
2809		goto make_route;
2810	}
2811
2812	fib_select_path(net, res, fl4, skb);
2813
2814	dev_out = FIB_RES_DEV(*res);
2815
2816make_route:
2817	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2818
2819out:
2820	return rth;
2821}
2822
2823static struct dst_ops ipv4_dst_blackhole_ops = {
2824	.family			= AF_INET,
2825	.default_advmss		= ipv4_default_advmss,
2826	.neigh_lookup		= ipv4_neigh_lookup,
2827	.check			= dst_blackhole_check,
2828	.cow_metrics		= dst_blackhole_cow_metrics,
2829	.update_pmtu		= dst_blackhole_update_pmtu,
2830	.redirect		= dst_blackhole_redirect,
2831	.mtu			= dst_blackhole_mtu,
2832};
2833
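/* Clone a route into a "blackhole" dst: it keeps the original's routing
 * metadata but silently discards anything sent through it (dst_discard).
 */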
2834struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2835{
2836	struct rtable *ort = (struct rtable *) dst_orig;
2837	struct rtable *rt;
2838
2839	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2840	if (rt) {
2841		struct dst_entry *new = &rt->dst;
2842
2843		new->__use = 1;
2844		new->input = dst_discard;
2845		new->output = dst_discard_out;
2846
2847		new->dev = net->loopback_dev;
2848		if (new->dev)
2849			dev_hold(new->dev);
2850
2851		rt->rt_is_input = ort->rt_is_input;
2852		rt->rt_iif = ort->rt_iif;
2853		rt->rt_pmtu = ort->rt_pmtu;
2854		rt->rt_mtu_locked = ort->rt_mtu_locked;
2855
2856		rt->rt_genid = rt_genid_ipv4(net);
2857		rt->rt_flags = ort->rt_flags;
2858		rt->rt_type = ort->rt_type;
2859		rt->rt_uses_gateway = ort->rt_uses_gateway;
2860		rt->rt_gw_family = ort->rt_gw_family;
2861		if (rt->rt_gw_family == AF_INET)
2862			rt->rt_gw4 = ort->rt_gw4;
2863		else if (rt->rt_gw_family == AF_INET6)
2864			rt->rt_gw6 = ort->rt_gw6;
2865
2866		INIT_LIST_HEAD(&rt->rt_uncached);
2867	}
2868
2869	dst_release(dst_orig);
2870
2871	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2872}
2873
2874struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2875				    const struct sock *sk)
2876{
2877	struct rtable *rt = __ip_route_output_key(net, flp4);
2878
2879	if (IS_ERR(rt))
2880		return rt;
2881
2882	if (flp4->flowi4_proto) {
2883		flp4->flowi4_oif = rt->dst.dev->ifindex;
2884		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2885							flowi4_to_flowi(flp4),
2886							sk, 0);
2887	}
2888
2889	return rt;
2890}
2891EXPORT_SYMBOL_GPL(ip_route_output_flow);
2892
2893struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2894				      struct net_device *dev,
2895				      struct net *net, __be32 *saddr,
2896				      const struct ip_tunnel_info *info,
2897				      u8 protocol, bool use_cache)
2898{
2899#ifdef CONFIG_DST_CACHE
2900	struct dst_cache *dst_cache;
2901#endif
2902	struct rtable *rt = NULL;
2903	struct flowi4 fl4;
2904	__u8 tos;
2905
2906#ifdef CONFIG_DST_CACHE
2907	dst_cache = (struct dst_cache *)&info->dst_cache;
2908	if (use_cache) {
2909		rt = dst_cache_get_ip4(dst_cache, saddr);
2910		if (rt)
2911			return rt;
2912	}
2913#endif
2914	memset(&fl4, 0, sizeof(fl4));
2915	fl4.flowi4_mark = skb->mark;
2916	fl4.flowi4_proto = protocol;
2917	fl4.daddr = info->key.u.ipv4.dst;
2918	fl4.saddr = info->key.u.ipv4.src;
2919	tos = info->key.tos;
2920	fl4.flowi4_tos = RT_TOS(tos);
2921
2922	rt = ip_route_output_key(net, &fl4);
2923	if (IS_ERR(rt)) {
2924		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2925		return ERR_PTR(-ENETUNREACH);
2926	}
2927	if (rt->dst.dev == dev) { /* is this necessary? */
2928		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2929		ip_rt_put(rt);
2930		return ERR_PTR(-ELOOP);
2931	}
2932#ifdef CONFIG_DST_CACHE
2933	if (use_cache)
2934		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2935#endif
2936	*saddr = fl4.saddr;
2937	return rt;
2938}
2939EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2940
2941/* called with rcu_read_lock held */
2942static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2943			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2944			struct sk_buff *skb, u32 portid, u32 seq,
2945			unsigned int flags)
2946{
2947	struct rtmsg *r;
2948	struct nlmsghdr *nlh;
2949	unsigned long expires = 0;
2950	u32 error;
2951	u32 metrics[RTAX_MAX];
2952
2953	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2954	if (!nlh)
2955		return -EMSGSIZE;
2956
2957	r = nlmsg_data(nlh);
2958	r->rtm_family	 = AF_INET;
2959	r->rtm_dst_len	= 32;
2960	r->rtm_src_len	= 0;
2961	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2962	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2963	if (nla_put_u32(skb, RTA_TABLE, table_id))
2964		goto nla_put_failure;
2965	r->rtm_type	= rt->rt_type;
2966	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2967	r->rtm_protocol = RTPROT_UNSPEC;
2968	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2969	if (rt->rt_flags & RTCF_NOTIFY)
2970		r->rtm_flags |= RTM_F_NOTIFY;
2971	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2972		r->rtm_flags |= RTCF_DOREDIRECT;
2973
2974	if (nla_put_in_addr(skb, RTA_DST, dst))
2975		goto nla_put_failure;
2976	if (src) {
2977		r->rtm_src_len = 32;
2978		if (nla_put_in_addr(skb, RTA_SRC, src))
2979			goto nla_put_failure;
2980	}
2981	if (rt->dst.dev &&
2982	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2983		goto nla_put_failure;
2984	if (rt->dst.lwtstate &&
2985	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2986		goto nla_put_failure;
2987#ifdef CONFIG_IP_ROUTE_CLASSID
2988	if (rt->dst.tclassid &&
2989	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2990		goto nla_put_failure;
2991#endif
2992	if (fl4 && !rt_is_input_route(rt) &&
2993	    fl4->saddr != src) {
2994		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2995			goto nla_put_failure;
2996	}
2997	if (rt->rt_uses_gateway) {
2998		if (rt->rt_gw_family == AF_INET &&
2999		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
3000			goto nla_put_failure;
3001		} else if (rt->rt_gw_family == AF_INET6) {
3002			int alen = sizeof(struct in6_addr);
3003			struct nlattr *nla;
3004			struct rtvia *via;
3005
3006			nla = nla_reserve(skb, RTA_VIA, alen + 2);
3007			if (!nla)
3008				goto nla_put_failure;
3009
3010			via = nla_data(nla);
3011			via->rtvia_family = AF_INET6;
3012			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
3013		}
3014	}
3015
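	/* The expiry is reported to userspace as time remaining, so convert
	 * from an absolute jiffies value here.
	 */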
3016	expires = rt->dst.expires;
3017	if (expires) {
3018		unsigned long now = jiffies;
3019
3020		if (time_before(now, expires))
3021			expires -= now;
3022		else
3023			expires = 0;
3024	}
3025
3026	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3027	if (rt->rt_pmtu && expires)
3028		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
3029	if (rt->rt_mtu_locked && expires)
3030		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
3031	if (rtnetlink_put_metrics(skb, metrics) < 0)
3032		goto nla_put_failure;
3033
3034	if (fl4) {
3035		if (fl4->flowi4_mark &&
3036		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
3037			goto nla_put_failure;
3038
3039		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
3040		    nla_put_u32(skb, RTA_UID,
3041				from_kuid_munged(current_user_ns(),
3042						 fl4->flowi4_uid)))
3043			goto nla_put_failure;
3044
3045		if (rt_is_input_route(rt)) {
3046#ifdef CONFIG_IP_MROUTE
3047			if (ipv4_is_multicast(dst) &&
3048			    !ipv4_is_local_multicast(dst) &&
3049			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
3050				int err = ipmr_get_route(net, skb,
3051							 fl4->saddr, fl4->daddr,
3052							 r, portid);
3053
3054				if (err <= 0) {
3055					if (err == 0)
3056						return 0;
3057					goto nla_put_failure;
3058				}
3059			} else
3060#endif
3061				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3062					goto nla_put_failure;
3063		}
3064	}
3065
3066	error = rt->dst.error;
3067
3068	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3069		goto nla_put_failure;
3070
3071	nlmsg_end(skb, nlh);
3072	return 0;
3073
3074nla_put_failure:
3075	nlmsg_cancel(skb, nlh);
3076	return -EMSGSIZE;
3077}
3078
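/* Walk one hash bucket of nexthop exceptions and emit an RTM_NEWROUTE
 * message for each live, non-expired entry past the dump start index.
 */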
3079static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3080			    struct netlink_callback *cb, u32 table_id,
3081			    struct fnhe_hash_bucket *bucket, int genid,
3082			    int *fa_index, int fa_start, unsigned int flags)
3083{
3084	int i;
3085
3086	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3087		struct fib_nh_exception *fnhe;
3088
3089		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3090		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3091			struct rtable *rt;
3092			int err;
3093
3094			if (*fa_index < fa_start)
3095				goto next;
3096
3097			if (fnhe->fnhe_genid != genid)
3098				goto next;
3099
3100			if (fnhe->fnhe_expires &&
3101			    time_after(jiffies, fnhe->fnhe_expires))
3102				goto next;
3103
3104			rt = rcu_dereference(fnhe->fnhe_rth_input);
3105			if (!rt)
3106				rt = rcu_dereference(fnhe->fnhe_rth_output);
3107			if (!rt)
3108				goto next;
3109
3110			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3111					   table_id, NULL, skb,
3112					   NETLINK_CB(cb->skb).portid,
3113					   cb->nlh->nlmsg_seq, flags);
3114			if (err)
3115				return err;
3116next:
3117			(*fa_index)++;
3118		}
3119	}
3120
3121	return 0;
3122}
3123
3124int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3125		       u32 table_id, struct fib_info *fi,
3126		       int *fa_index, int fa_start, unsigned int flags)
3127{
3128	struct net *net = sock_net(cb->skb->sk);
3129	int nhsel, genid = fnhe_genid(net);
3130
3131	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3132		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3133		struct fnhe_hash_bucket *bucket;
3134		int err;
3135
3136		if (nhc->nhc_flags & RTNH_F_DEAD)
3137			continue;
3138
3139		rcu_read_lock();
3140		bucket = rcu_dereference(nhc->nhc_exceptions);
3141		err = 0;
3142		if (bucket)
3143			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3144					       genid, fa_index, fa_start,
3145					       flags);
3146		rcu_read_unlock();
3147		if (err)
3148			return err;
3149	}
3150
3151	return 0;
3152}
3153
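/* Build a dummy skb carrying just enough IP and transport header for the
 * RTM_GETROUTE lookup below.
 */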
3154static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3155						   u8 ip_proto, __be16 sport,
3156						   __be16 dport)
3157{
3158	struct sk_buff *skb;
3159	struct iphdr *iph;
3160
3161	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3162	if (!skb)
3163		return NULL;
3164
3165	/* Reserve room for dummy headers; this skb can pass
3166	 * through a good chunk of the routing engine.
3167	 */
3168	skb_reset_mac_header(skb);
3169	skb_reset_network_header(skb);
3170	skb->protocol = htons(ETH_P_IP);
3171	iph = skb_put(skb, sizeof(struct iphdr));
3172	iph->protocol = ip_proto;
3173	iph->saddr = src;
3174	iph->daddr = dst;
3175	iph->version = 0x4;
3176	iph->frag_off = 0;
3177	iph->ihl = 0x5;
3178	skb_set_transport_header(skb, skb->len);
3179
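	/* Fill in a minimal transport header for the requested protocol so
	 * the route lookup can take the supplied ports into account.
	 */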
3180	switch (iph->protocol) {
3181	case IPPROTO_UDP: {
3182		struct udphdr *udph;
3183
3184		udph = skb_put_zero(skb, sizeof(struct udphdr));
3185		udph->source = sport;
3186		udph->dest = dport;
3187		udph->len = htons(sizeof(struct udphdr));
3188		udph->check = 0;
3189		break;
3190	}
3191	case IPPROTO_TCP: {
3192		struct tcphdr *tcph;
3193
3194		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3195		tcph->source	= sport;
3196		tcph->dest	= dport;
3197		tcph->doff	= sizeof(struct tcphdr) / 4;
3198		tcph->rst = 1;
3199		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3200					    src, dst, 0);
3201		break;
3202	}
3203	case IPPROTO_ICMP: {
3204		struct icmphdr *icmph;
3205
3206		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3207		icmph->type = ICMP_ECHO;
3208		icmph->code = 0;
3209	}
3210	}
3211
3212	return skb;
3213}
3214
3215static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3216				       const struct nlmsghdr *nlh,
3217				       struct nlattr **tb,
3218				       struct netlink_ext_ack *extack)
3219{
3220	struct rtmsg *rtm;
3221	int i, err;
3222
3223	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3224		NL_SET_ERR_MSG(extack,
3225			       "ipv4: Invalid header for route get request");
3226		return -EINVAL;
3227	}
3228
3229	if (!netlink_strict_get_check(skb))
3230		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3231					      rtm_ipv4_policy, extack);
3232
3233	rtm = nlmsg_data(nlh);
3234	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3235	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3236	    rtm->rtm_table || rtm->rtm_protocol ||
3237	    rtm->rtm_scope || rtm->rtm_type) {
3238		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3239		return -EINVAL;
3240	}
3241
3242	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3243			       RTM_F_LOOKUP_TABLE |
3244			       RTM_F_FIB_MATCH)) {
3245		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3246		return -EINVAL;
3247	}
3248
3249	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3250					    rtm_ipv4_policy, extack);
3251	if (err)
3252		return err;
3253
3254	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3255	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3256		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3257		return -EINVAL;
3258	}
3259
3260	for (i = 0; i <= RTA_MAX; i++) {
3261		if (!tb[i])
3262			continue;
3263
3264		switch (i) {
3265		case RTA_IIF:
3266		case RTA_OIF:
3267		case RTA_SRC:
3268		case RTA_DST:
3269		case RTA_IP_PROTO:
3270		case RTA_SPORT:
3271		case RTA_DPORT:
3272		case RTA_MARK:
3273		case RTA_UID:
3274			break;
3275		default:
3276			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3277			return -EINVAL;
3278		}
3279	}
3280
3281	return 0;
3282}
3283
3284static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3285			     struct netlink_ext_ack *extack)
3286{
3287	struct net *net = sock_net(in_skb->sk);
3288	struct nlattr *tb[RTA_MAX+1];
3289	u32 table_id = RT_TABLE_MAIN;
3290	__be16 sport = 0, dport = 0;
3291	struct fib_result res = {};
3292	u8 ip_proto = IPPROTO_UDP;
3293	struct rtable *rt = NULL;
3294	struct sk_buff *skb;
3295	struct rtmsg *rtm;
3296	struct flowi4 fl4 = {};
3297	__be32 dst = 0;
3298	__be32 src = 0;
3299	kuid_t uid;
3300	u32 iif;
3301	int err;
3302	int mark;
3303
3304	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3305	if (err < 0)
3306		return err;
3307
3308	rtm = nlmsg_data(nlh);
3309	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3310	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3311	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3312	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3313	if (tb[RTA_UID])
3314		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3315	else
3316		uid = (iif ? INVALID_UID : current_uid());
3317
3318	if (tb[RTA_IP_PROTO]) {
3319		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3320						  &ip_proto, AF_INET, extack);
3321		if (err)
3322			return err;
3323	}
3324
3325	if (tb[RTA_SPORT])
3326		sport = nla_get_be16(tb[RTA_SPORT]);
3327
3328	if (tb[RTA_DPORT])
3329		dport = nla_get_be16(tb[RTA_DPORT]);
3330
3331	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3332	if (!skb)
3333		return -ENOBUFS;
3334
3335	fl4.daddr = dst;
3336	fl4.saddr = src;
3337	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3338	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3339	fl4.flowi4_mark = mark;
3340	fl4.flowi4_uid = uid;
3341	if (sport)
3342		fl4.fl4_sport = sport;
3343	if (dport)
3344		fl4.fl4_dport = dport;
3345	fl4.flowi4_proto = ip_proto;
3346
3347	rcu_read_lock();
3348
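	/* An input interface means "simulate reception on that device";
	 * otherwise resolve an output route from this host.
	 */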
3349	if (iif) {
3350		struct net_device *dev;
3351
3352		dev = dev_get_by_index_rcu(net, iif);
3353		if (!dev) {
3354			err = -ENODEV;
3355			goto errout_rcu;
3356		}
3357
3358		fl4.flowi4_iif = iif; /* for rt_fill_info */
3359		skb->dev	= dev;
3360		skb->mark	= mark;
3361		err = ip_route_input_rcu(skb, dst, src,
3362					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3363					 &res);
3364
3365		rt = skb_rtable(skb);
3366		if (err == 0 && rt->dst.error)
3367			err = -rt->dst.error;
3368	} else {
3369		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3370		skb->dev = net->loopback_dev;
3371		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3372		err = 0;
3373		if (IS_ERR(rt))
3374			err = PTR_ERR(rt);
3375		else
3376			skb_dst_set(skb, &rt->dst);
3377	}
3378
3379	if (err)
3380		goto errout_rcu;
3381
3382	if (rtm->rtm_flags & RTM_F_NOTIFY)
3383		rt->rt_flags |= RTCF_NOTIFY;
3384
3385	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3386		table_id = res.table ? res.table->tb_id : 0;
3387
3388	/* reset skb for netlink reply msg */
3389	skb_trim(skb, 0);
3390	skb_reset_network_header(skb);
3391	skb_reset_transport_header(skb);
3392	skb_reset_mac_header(skb);
3393
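	/* RTM_F_FIB_MATCH asks for the matching FIB entry itself rather
	 * than the resulting dst.
	 */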
3394	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3395		struct fib_rt_info fri;
3396
3397		if (!res.fi) {
3398			err = fib_props[res.type].error;
3399			if (!err)
3400				err = -EHOSTUNREACH;
3401			goto errout_rcu;
3402		}
3403		fri.fi = res.fi;
3404		fri.tb_id = table_id;
3405		fri.dst = res.prefix;
3406		fri.dst_len = res.prefixlen;
3407		fri.tos = fl4.flowi4_tos;
3408		fri.type = rt->rt_type;
3409		fri.offload = 0;
3410		fri.trap = 0;
3411		fri.offload_failed = 0;
3412		if (res.fa_head) {
3413			struct fib_alias *fa;
3414
3415			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3416				u8 slen = 32 - fri.dst_len;
3417
3418				if (fa->fa_slen == slen &&
3419				    fa->tb_id == fri.tb_id &&
3420				    fa->fa_tos == fri.tos &&
3421				    fa->fa_info == res.fi &&
3422				    fa->fa_type == fri.type) {
3423					fri.offload = fa->offload;
3424					fri.trap = fa->trap;
3425					break;
3426				}
3427			}
3428		}
3429		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3430				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3431	} else {
3432		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3433				   NETLINK_CB(in_skb).portid,
3434				   nlh->nlmsg_seq, 0);
3435	}
3436	if (err < 0)
3437		goto errout_rcu;
3438
3439	rcu_read_unlock();
3440
3441	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3442
3443errout_free:
3444	return err;
3445errout_rcu:
3446	rcu_read_unlock();
3447	kfree_skb(skb);
3448	goto errout_free;
3449}
3450
3451void ip_rt_multicast_event(struct in_device *in_dev)
3452{
3453	rt_cache_flush(dev_net(in_dev->dev));
3454}
3455
3456#ifdef CONFIG_SYSCTL
3457static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3458static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3459static int ip_rt_gc_elasticity __read_mostly	= 8;
3460static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3461
3462static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3463		void *buffer, size_t *lenp, loff_t *ppos)
3464{
3465	struct net *net = (struct net *)__ctl->extra1;
3466
3467	if (write) {
3468		rt_cache_flush(net);
3469		fnhe_genid_bump(net);
3470		return 0;
3471	}
3472
3473	return -EINVAL;
3474}
3475
3476static struct ctl_table ipv4_route_table[] = {
3477	{
3478		.procname	= "gc_thresh",
3479		.data		= &ipv4_dst_ops.gc_thresh,
3480		.maxlen		= sizeof(int),
3481		.mode		= 0644,
3482		.proc_handler	= proc_dointvec,
3483	},
3484	{
3485		.procname	= "max_size",
3486		.data		= &ip_rt_max_size,
3487		.maxlen		= sizeof(int),
3488		.mode		= 0644,
3489		.proc_handler	= proc_dointvec,
3490	},
3491	{
3492		/*  Deprecated. Use gc_min_interval_ms */
3493
3494		.procname	= "gc_min_interval",
3495		.data		= &ip_rt_gc_min_interval,
3496		.maxlen		= sizeof(int),
3497		.mode		= 0644,
3498		.proc_handler	= proc_dointvec_jiffies,
3499	},
3500	{
3501		.procname	= "gc_min_interval_ms",
3502		.data		= &ip_rt_gc_min_interval,
3503		.maxlen		= sizeof(int),
3504		.mode		= 0644,
3505		.proc_handler	= proc_dointvec_ms_jiffies,
3506	},
3507	{
3508		.procname	= "gc_timeout",
3509		.data		= &ip_rt_gc_timeout,
3510		.maxlen		= sizeof(int),
3511		.mode		= 0644,
3512		.proc_handler	= proc_dointvec_jiffies,
3513	},
3514	{
3515		.procname	= "gc_interval",
3516		.data		= &ip_rt_gc_interval,
3517		.maxlen		= sizeof(int),
3518		.mode		= 0644,
3519		.proc_handler	= proc_dointvec_jiffies,
3520	},
3521	{
3522		.procname	= "redirect_load",
3523		.data		= &ip_rt_redirect_load,
3524		.maxlen		= sizeof(int),
3525		.mode		= 0644,
3526		.proc_handler	= proc_dointvec,
3527	},
3528	{
3529		.procname	= "redirect_number",
3530		.data		= &ip_rt_redirect_number,
3531		.maxlen		= sizeof(int),
3532		.mode		= 0644,
3533		.proc_handler	= proc_dointvec,
3534	},
3535	{
3536		.procname	= "redirect_silence",
3537		.data		= &ip_rt_redirect_silence,
3538		.maxlen		= sizeof(int),
3539		.mode		= 0644,
3540		.proc_handler	= proc_dointvec,
3541	},
3542	{
3543		.procname	= "error_cost",
3544		.data		= &ip_rt_error_cost,
3545		.maxlen		= sizeof(int),
3546		.mode		= 0644,
3547		.proc_handler	= proc_dointvec,
3548	},
3549	{
3550		.procname	= "error_burst",
3551		.data		= &ip_rt_error_burst,
3552		.maxlen		= sizeof(int),
3553		.mode		= 0644,
3554		.proc_handler	= proc_dointvec,
3555	},
3556	{
3557		.procname	= "gc_elasticity",
3558		.data		= &ip_rt_gc_elasticity,
3559		.maxlen		= sizeof(int),
3560		.mode		= 0644,
3561		.proc_handler	= proc_dointvec,
3562	},
3563	{
3564		.procname	= "mtu_expires",
3565		.data		= &ip_rt_mtu_expires,
3566		.maxlen		= sizeof(int),
3567		.mode		= 0644,
3568		.proc_handler	= proc_dointvec_jiffies,
3569	},
3570	{
3571		.procname	= "min_pmtu",
3572		.data		= &ip_rt_min_pmtu,
3573		.maxlen		= sizeof(int),
3574		.mode		= 0644,
3575		.proc_handler	= proc_dointvec_minmax,
3576		.extra1		= &ip_min_valid_pmtu,
3577	},
3578	{
3579		.procname	= "min_adv_mss",
3580		.data		= &ip_rt_min_advmss,
3581		.maxlen		= sizeof(int),
3582		.mode		= 0644,
3583		.proc_handler	= proc_dointvec,
3584	},
3585	{ }
3586};
3587
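/* These knobs appear under /proc/sys/net/ipv4/route/; the table above is
 * registered for the initial namespace in ip_static_sysctl_init().
 */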
3588static const char ipv4_route_flush_procname[] = "flush";
3589
3590static struct ctl_table ipv4_route_flush_table[] = {
3591	{
3592		.procname	= ipv4_route_flush_procname,
3593		.maxlen		= sizeof(int),
3594		.mode		= 0200,
3595		.proc_handler	= ipv4_sysctl_rtcache_flush,
3596	},
3597	{ },
3598};
3599
3600static __net_init int sysctl_route_net_init(struct net *net)
3601{
3602	struct ctl_table *tbl;
3603
3604	tbl = ipv4_route_flush_table;
3605	if (!net_eq(net, &init_net)) {
3606		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3607		if (!tbl)
3608			goto err_dup;
3609
3610		/* Don't export non-whitelisted sysctls to unprivileged users */
3611		if (net->user_ns != &init_user_ns) {
3612			if (tbl[0].procname != ipv4_route_flush_procname)
3613				tbl[0].procname = NULL;
3614		}
3615	}
3616	tbl[0].extra1 = net;
3617
3618	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3619	if (!net->ipv4.route_hdr)
3620		goto err_reg;
3621	return 0;
3622
3623err_reg:
3624	if (tbl != ipv4_route_flush_table)
3625		kfree(tbl);
3626err_dup:
3627	return -ENOMEM;
3628}
3629
3630static __net_exit void sysctl_route_net_exit(struct net *net)
3631{
3632	struct ctl_table *tbl;
3633
3634	tbl = net->ipv4.route_hdr->ctl_table_arg;
3635	unregister_net_sysctl_table(net->ipv4.route_hdr);
3636	BUG_ON(tbl == ipv4_route_flush_table);
3637	kfree(tbl);
3638}
3639
3640static __net_initdata struct pernet_operations sysctl_route_ops = {
3641	.init = sysctl_route_net_init,
3642	.exit = sysctl_route_net_exit,
3643};
3644#endif
3645
3646static __net_init int rt_genid_init(struct net *net)
3647{
3648	atomic_set(&net->ipv4.rt_genid, 0);
3649	atomic_set(&net->fnhe_genid, 0);
3650	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3651	return 0;
3652}
3653
3654static __net_initdata struct pernet_operations rt_genid_ops = {
3655	.init = rt_genid_init,
3656};
3657
3658static int __net_init ipv4_inetpeer_init(struct net *net)
3659{
3660	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3661
3662	if (!bp)
3663		return -ENOMEM;
3664	inet_peer_base_init(bp);
3665	net->ipv4.peers = bp;
3666	return 0;
3667}
3668
3669static void __net_exit ipv4_inetpeer_exit(struct net *net)
3670{
3671	struct inet_peer_base *bp = net->ipv4.peers;
3672
3673	net->ipv4.peers = NULL;
3674	inetpeer_invalidate_tree(bp);
3675	kfree(bp);
3676}
3677
3678static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3679	.init	=	ipv4_inetpeer_init,
3680	.exit	=	ipv4_inetpeer_exit,
3681};
3682
3683#ifdef CONFIG_IP_ROUTE_CLASSID
3684struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3685#endif /* CONFIG_IP_ROUTE_CLASSID */
3686
3687int __init ip_rt_init(void)
3688{
3689	void *idents_hash;
3690	int cpu;
3691
3692	/* For modern hosts, this will use 2 MB of memory */
3693	idents_hash = alloc_large_system_hash("IP idents",
3694					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3695					      0,
3696					      16, /* one bucket per 64 KB */
3697					      HASH_ZERO,
3698					      NULL,
3699					      &ip_idents_mask,
3700					      2048,
3701					      256*1024);
3702
3703	ip_idents = idents_hash;
3704
3705	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3706
3707	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3708
3709	for_each_possible_cpu(cpu) {
3710		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3711
3712		INIT_LIST_HEAD(&ul->head);
3713		spin_lock_init(&ul->lock);
3714	}
3715#ifdef CONFIG_IP_ROUTE_CLASSID
3716	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3717	if (!ip_rt_acct)
3718		panic("IP: failed to allocate ip_rt_acct\n");
3719#endif
3720
3721	ipv4_dst_ops.kmem_cachep =
3722		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3723				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3724
3725	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3726
3727	if (dst_entries_init(&ipv4_dst_ops) < 0)
3728		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3729
3730	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3731		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3732
3733	ipv4_dst_ops.gc_thresh = ~0;
3734	ip_rt_max_size = INT_MAX;
3735
3736	devinet_init();
3737	ip_fib_init();
3738
3739	if (ip_rt_proc_init())
3740		pr_err("Unable to create route proc files\n");
3741#ifdef CONFIG_XFRM
3742	xfrm_init();
3743	xfrm4_init();
3744#endif
3745	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3746		      RTNL_FLAG_DOIT_UNLOCKED);
3747
3748#ifdef CONFIG_SYSCTL
3749	register_pernet_subsys(&sysctl_route_ops);
3750#endif
3751	register_pernet_subsys(&rt_genid_ops);
3752	register_pernet_subsys(&ipv4_inetpeer_ops);
3753	return 0;
3754}
3755
3756#ifdef CONFIG_SYSCTL
3757/*
3758 * We really need to sanitize the damn ipv4 init order, then all
3759 * this nonsense will go away.
3760 */
3761void __init ip_static_sysctl_init(void)
3762{
3763	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3764}
3765#endif