v5.4
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		ROUTE - implementation of the IP router.
   8 *
   9 * Authors:	Ross Biro
  10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14 *
  15 * Fixes:
  16 *		Alan Cox	:	Verify area fixes.
  17 *		Alan Cox	:	cli() protects routing changes
  18 *		Rui Oliveira	:	ICMP routing table updates
  19 *		(rco@di.uminho.pt)	Routing table insertion and update
  20 *		Linus Torvalds	:	Rewrote bits to be sensible
  21 *		Alan Cox	:	Added BSD route gw semantics
  22 *		Alan Cox	:	Super /proc >4K
  23 *		Alan Cox	:	MTU in route table
  24 *		Alan Cox	: 	MSS actually. Also added the window
  25 *					clamper.
  26 *		Sam Lantinga	:	Fixed route matching in rt_del()
  27 *		Alan Cox	:	Routing cache support.
  28 *		Alan Cox	:	Removed compatibility cruft.
  29 *		Alan Cox	:	RTF_REJECT support.
  30 *		Alan Cox	:	TCP irtt support.
  31 *		Jonathan Naylor	:	Added Metric support.
  32 *	Miquel van Smoorenburg	:	BSD API fixes.
  33 *	Miquel van Smoorenburg	:	Metrics.
  34 *		Alan Cox	:	Use __u32 properly
  35 *		Alan Cox	:	Aligned routing errors more closely with BSD
  36 *					our system is still very different.
  37 *		Alan Cox	:	Faster /proc handling
  38 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  39 *					routing caches and better behaviour.
  40 *
  41 *		Olaf Erb	:	irtt wasn't being copied right.
  42 *		Bjorn Ekwall	:	Kerneld route support.
  43 *		Alan Cox	:	Multicast fixed (I hope)
  44 * 		Pavel Krauz	:	Limited broadcast fixed
  45 *		Mike McLagan	:	Routing by source
  46 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  47 *					route.c and rewritten from scratch.
  48 *		Andi Kleen	:	Load-limit warning messages.
  49 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  50 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  51 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  52 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  53 *		Marc Boucher	:	routing by fwmark
  54 *	Robert Olsson		:	Added rt_cache statistics
  55 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  56 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  57 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  58 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  59 */
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
  64#include <linux/uaccess.h>
  65#include <linux/bitops.h>
  66#include <linux/types.h>
  67#include <linux/kernel.h>
  68#include <linux/mm.h>
  69#include <linux/string.h>
  70#include <linux/socket.h>
  71#include <linux/sockios.h>
  72#include <linux/errno.h>
  73#include <linux/in.h>
  74#include <linux/inet.h>
  75#include <linux/netdevice.h>
  76#include <linux/proc_fs.h>
  77#include <linux/init.h>
  78#include <linux/skbuff.h>
  79#include <linux/inetdevice.h>
  80#include <linux/igmp.h>
  81#include <linux/pkt_sched.h>
  82#include <linux/mroute.h>
  83#include <linux/netfilter_ipv4.h>
  84#include <linux/random.h>
  85#include <linux/rcupdate.h>
  86#include <linux/times.h>
  87#include <linux/slab.h>
  88#include <linux/jhash.h>
  89#include <net/dst.h>
  90#include <net/dst_metadata.h>
  91#include <net/net_namespace.h>
  92#include <net/protocol.h>
  93#include <net/ip.h>
  94#include <net/route.h>
  95#include <net/inetpeer.h>
  96#include <net/sock.h>
  97#include <net/ip_fib.h>
  98#include <net/nexthop.h>
  99#include <net/arp.h>
 100#include <net/tcp.h>
 101#include <net/icmp.h>
 102#include <net/xfrm.h>
 103#include <net/lwtunnel.h>
 104#include <net/netevent.h>
 105#include <net/rtnetlink.h>
 106#ifdef CONFIG_SYSCTL
 107#include <linux/sysctl.h>
 108#endif
 109#include <net/secure_seq.h>
 110#include <net/ip_tunnels.h>
 111#include <net/l3mdev.h>
 112
 113#include "fib_lookup.h"
 114
 115#define RT_FL_TOS(oldflp4) \
 116	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 117
 118#define RT_GC_TIMEOUT (300*HZ)
 119
 120static int ip_rt_max_size;
 121static int ip_rt_redirect_number __read_mostly	= 9;
 122static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 123static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 124static int ip_rt_error_cost __read_mostly	= HZ;
 125static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 126static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 127static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 128static int ip_rt_min_advmss __read_mostly	= 256;
 129
 130static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
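/* Editor's note (illustrative, not part of the original file): the defaults
 * above seed the per-host tunables exposed under /proc/sys/net/ipv4/route/,
 * registered later in this file.  For example, ip_rt_min_pmtu (512+20+20 =
 * 552 bytes) and ip_rt_mtu_expires (600 s) can be adjusted with:
 *
 *	# sysctl -w net.ipv4.route.min_pmtu=552
 *	# sysctl -w net.ipv4.route.mtu_expires=600
 */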
 131
 132/*
 133 *	Interface to generic destination cache.
 134 */
 135
 136static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 137static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 138static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 139static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 140static void		 ipv4_link_failure(struct sk_buff *skb);
 141static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 142					   struct sk_buff *skb, u32 mtu);
 143static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 144					struct sk_buff *skb);
 145static void		ipv4_dst_destroy(struct dst_entry *dst);
 146
 147static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 148{
 149	WARN_ON(1);
 150	return NULL;
 151}
 152
 153static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 154					   struct sk_buff *skb,
 155					   const void *daddr);
 156static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 157
 158static struct dst_ops ipv4_dst_ops = {
 159	.family =		AF_INET,
 160	.check =		ipv4_dst_check,
 161	.default_advmss =	ipv4_default_advmss,
 162	.mtu =			ipv4_mtu,
 163	.cow_metrics =		ipv4_cow_metrics,
 164	.destroy =		ipv4_dst_destroy,
 165	.negative_advice =	ipv4_negative_advice,
 166	.link_failure =		ipv4_link_failure,
 167	.update_pmtu =		ip_rt_update_pmtu,
 168	.redirect =		ip_do_redirect,
 169	.local_out =		__ip_local_out,
 170	.neigh_lookup =		ipv4_neigh_lookup,
 171	.confirm_neigh =	ipv4_confirm_neigh,
 172};
 173
 174#define ECN_OR_COST(class)	TC_PRIO_##class
 175
 176const __u8 ip_tos2prio[16] = {
 177	TC_PRIO_BESTEFFORT,
 178	ECN_OR_COST(BESTEFFORT),
 179	TC_PRIO_BESTEFFORT,
 180	ECN_OR_COST(BESTEFFORT),
 181	TC_PRIO_BULK,
 182	ECN_OR_COST(BULK),
 183	TC_PRIO_BULK,
 184	ECN_OR_COST(BULK),
 185	TC_PRIO_INTERACTIVE,
 186	ECN_OR_COST(INTERACTIVE),
 187	TC_PRIO_INTERACTIVE,
 188	ECN_OR_COST(INTERACTIVE),
 189	TC_PRIO_INTERACTIVE_BULK,
 190	ECN_OR_COST(INTERACTIVE_BULK),
 191	TC_PRIO_INTERACTIVE_BULK,
 192	ECN_OR_COST(INTERACTIVE_BULK)
 193};
 194EXPORT_SYMBOL(ip_tos2prio);
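/* Editor's note (illustrative, not part of the original file): a sketch of
 * how this table is typically indexed when an output path picks a
 * pfifo_fast band; the rt_tos2priority() helper below mirrors the inline in
 * include/net/route.h and is reproduced here only as an assumption.
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. tos 0x10 (IPTOS_LOWDELAY) selects entry 8, TC_PRIO_INTERACTIVE.
 */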
 195
 196static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 197#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 198
 199#ifdef CONFIG_PROC_FS
 200static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 201{
 202	if (*pos)
 203		return NULL;
 204	return SEQ_START_TOKEN;
 205}
 206
 207static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 208{
 209	++*pos;
 210	return NULL;
 211}
 212
 213static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 214{
 215}
 216
 217static int rt_cache_seq_show(struct seq_file *seq, void *v)
 218{
 219	if (v == SEQ_START_TOKEN)
 220		seq_printf(seq, "%-127s\n",
 221			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 222			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 223			   "HHUptod\tSpecDst");
 224	return 0;
 225}
 226
 227static const struct seq_operations rt_cache_seq_ops = {
 228	.start  = rt_cache_seq_start,
 229	.next   = rt_cache_seq_next,
 230	.stop   = rt_cache_seq_stop,
 231	.show   = rt_cache_seq_show,
 232};
 233
 234static int rt_cache_seq_open(struct inode *inode, struct file *file)
 235{
 236	return seq_open(file, &rt_cache_seq_ops);
 237}
 238
 239static const struct file_operations rt_cache_seq_fops = {
 240	.open	 = rt_cache_seq_open,
 241	.read	 = seq_read,
 242	.llseek	 = seq_lseek,
 243	.release = seq_release,
 244};
 245
 246
 247static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 248{
 249	int cpu;
 250
 251	if (*pos == 0)
 252		return SEQ_START_TOKEN;
 253
 254	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 255		if (!cpu_possible(cpu))
 256			continue;
 257		*pos = cpu+1;
 258		return &per_cpu(rt_cache_stat, cpu);
 259	}
 260	return NULL;
 261}
 262
 263static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 264{
 265	int cpu;
 266
 267	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 268		if (!cpu_possible(cpu))
 269			continue;
 270		*pos = cpu+1;
 271		return &per_cpu(rt_cache_stat, cpu);
 272	}
 273	return NULL;
 274
 275}
 276
 277static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 278{
 279
 280}
 281
 282static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 283{
 284	struct rt_cache_stat *st = v;
 285
 286	if (v == SEQ_START_TOKEN) {
 287		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 288		return 0;
 289	}
 290
 291	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 292		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 293		   dst_entries_get_slow(&ipv4_dst_ops),
 294		   0, /* st->in_hit */
 295		   st->in_slow_tot,
 296		   st->in_slow_mc,
 297		   st->in_no_route,
 298		   st->in_brd,
 299		   st->in_martian_dst,
 300		   st->in_martian_src,
 301
 302		   0, /* st->out_hit */
 303		   st->out_slow_tot,
 304		   st->out_slow_mc,
 305
 306		   0, /* st->gc_total */
 307		   0, /* st->gc_ignored */
 308		   0, /* st->gc_goal_miss */
 309		   0, /* st->gc_dst_overflow */
 310		   0, /* st->in_hlist_search */
 311		   0  /* st->out_hlist_search */
 312		);
 313	return 0;
 314}
 315
 316static const struct seq_operations rt_cpu_seq_ops = {
 317	.start  = rt_cpu_seq_start,
 318	.next   = rt_cpu_seq_next,
 319	.stop   = rt_cpu_seq_stop,
 320	.show   = rt_cpu_seq_show,
 321};
 322
 323
 324static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 325{
 326	return seq_open(file, &rt_cpu_seq_ops);
 327}
 328
 329static const struct file_operations rt_cpu_seq_fops = {
 330	.open	 = rt_cpu_seq_open,
 331	.read	 = seq_read,
 332	.llseek	 = seq_lseek,
 333	.release = seq_release,
 334};
 335
 336#ifdef CONFIG_IP_ROUTE_CLASSID
 337static int rt_acct_proc_show(struct seq_file *m, void *v)
 338{
 339	struct ip_rt_acct *dst, *src;
 340	unsigned int i, j;
 341
 342	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 343	if (!dst)
 344		return -ENOMEM;
 345
 346	for_each_possible_cpu(i) {
 347		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 348		for (j = 0; j < 256; j++) {
 349			dst[j].o_bytes   += src[j].o_bytes;
 350			dst[j].o_packets += src[j].o_packets;
 351			dst[j].i_bytes   += src[j].i_bytes;
 352			dst[j].i_packets += src[j].i_packets;
 353		}
 354	}
 355
 356	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 357	kfree(dst);
 358	return 0;
 359}
 360#endif
 361
 362static int __net_init ip_rt_do_proc_init(struct net *net)
 363{
 364	struct proc_dir_entry *pde;
 365
 366	pde = proc_create("rt_cache", 0444, net->proc_net,
 367			  &rt_cache_seq_fops);
 368	if (!pde)
 369		goto err1;
 370
 371	pde = proc_create("rt_cache", 0444,
 372			  net->proc_net_stat, &rt_cpu_seq_fops);
 373	if (!pde)
 374		goto err2;
 375
 376#ifdef CONFIG_IP_ROUTE_CLASSID
 377	pde = proc_create_single("rt_acct", 0, net->proc_net,
 378			rt_acct_proc_show);
 379	if (!pde)
 380		goto err3;
 381#endif
 382	return 0;
 383
 384#ifdef CONFIG_IP_ROUTE_CLASSID
 385err3:
 386	remove_proc_entry("rt_cache", net->proc_net_stat);
 387#endif
 388err2:
 389	remove_proc_entry("rt_cache", net->proc_net);
 390err1:
 391	return -ENOMEM;
 392}
 393
 394static void __net_exit ip_rt_do_proc_exit(struct net *net)
 395{
 396	remove_proc_entry("rt_cache", net->proc_net_stat);
 397	remove_proc_entry("rt_cache", net->proc_net);
 398#ifdef CONFIG_IP_ROUTE_CLASSID
 399	remove_proc_entry("rt_acct", net->proc_net);
 400#endif
 401}
 402
 403static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 404	.init = ip_rt_do_proc_init,
 405	.exit = ip_rt_do_proc_exit,
 406};
 407
 408static int __init ip_rt_proc_init(void)
 409{
 410	return register_pernet_subsys(&ip_rt_proc_ops);
 411}
 412
 413#else
 414static inline int ip_rt_proc_init(void)
 415{
 416	return 0;
 417}
 418#endif /* CONFIG_PROC_FS */
 419
 420static inline bool rt_is_expired(const struct rtable *rth)
 421{
 422	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 423}
 424
 425void rt_cache_flush(struct net *net)
 426{
 427	rt_genid_bump_ipv4(net);
 428}
 429
 430static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 431					   struct sk_buff *skb,
 432					   const void *daddr)
 433{
 434	const struct rtable *rt = container_of(dst, struct rtable, dst);
 435	struct net_device *dev = dst->dev;
 436	struct neighbour *n;
 437
 438	rcu_read_lock_bh();
 439
 440	if (likely(rt->rt_gw_family == AF_INET)) {
 441		n = ip_neigh_gw4(dev, rt->rt_gw4);
 442	} else if (rt->rt_gw_family == AF_INET6) {
 443		n = ip_neigh_gw6(dev, &rt->rt_gw6);
  444	} else {
 445		__be32 pkey;
 446
 447		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 448		n = ip_neigh_gw4(dev, pkey);
 449	}
 450
 451	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 452		n = NULL;
 453
 454	rcu_read_unlock_bh();
 455
 456	return n;
 457}
 458
 459static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 460{
 461	const struct rtable *rt = container_of(dst, struct rtable, dst);
 462	struct net_device *dev = dst->dev;
 463	const __be32 *pkey = daddr;
 464
 465	if (rt->rt_gw_family == AF_INET) {
 466		pkey = (const __be32 *)&rt->rt_gw4;
 467	} else if (rt->rt_gw_family == AF_INET6) {
 468		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 469	} else if (!daddr ||
 470		 (rt->rt_flags &
 471		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 472		return;
 473	}
 474	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 475}
 476
 477#define IP_IDENTS_SZ 2048u
 478
 479static atomic_t *ip_idents __read_mostly;
 480static u32 *ip_tstamps __read_mostly;
 481
 482/* In order to protect privacy, we add a perturbation to identifiers
  483 * if one generator is seldom used. This makes it hard for an attacker
 484 * to infer how many packets were sent between two points in time.
 485 */
 486u32 ip_idents_reserve(u32 hash, int segs)
 487{
 488	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 489	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 490	u32 old = READ_ONCE(*p_tstamp);
 491	u32 now = (u32)jiffies;
 492	u32 new, delta = 0;
 493
 494	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 495		delta = prandom_u32_max(now - old);
 496
 497	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
 498	do {
 499		old = (u32)atomic_read(p_id);
 500		new = old + delta + segs;
 501	} while (atomic_cmpxchg(p_id, old, new) != old);
 502
 503	return new - segs;
 504}
 505EXPORT_SYMBOL(ip_idents_reserve);
 506
 507void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 508{
 509	u32 hash, id;
 510
 511	/* Note the following code is not safe, but this is okay. */
 512	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 513		get_random_bytes(&net->ipv4.ip_id_key,
 514				 sizeof(net->ipv4.ip_id_key));
 515
 516	hash = siphash_3u32((__force u32)iph->daddr,
 517			    (__force u32)iph->saddr,
 518			    iph->protocol,
 519			    &net->ipv4.ip_id_key);
 520	id = ip_idents_reserve(hash, segs);
 521	iph->id = htons(id);
 522}
 523EXPORT_SYMBOL(__ip_select_ident);
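/* Editor's note (illustrative, not part of the original file): a minimal,
 * hypothetical caller that stamps the IPv4 ID on a freshly built header.
 * Real users normally go through the ip_select_ident()/ip_select_ident_segs()
 * inlines from include/net/ip.h, which skip this work when DF is set; the
 * helper name below is an assumption for illustration only.
 */
static inline void example_stamp_ip_id(struct net *net, struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	if (!(iph->frag_off & htons(IP_DF)))	/* fragmentable packets only */
		__ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
}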
 524
 525static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 526			     const struct sock *sk,
 527			     const struct iphdr *iph,
 528			     int oif, u8 tos,
 529			     u8 prot, u32 mark, int flow_flags)
 530{
 531	if (sk) {
 532		const struct inet_sock *inet = inet_sk(sk);
 533
 534		oif = sk->sk_bound_dev_if;
 535		mark = sk->sk_mark;
 536		tos = RT_CONN_FLAGS(sk);
 537		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 538	}
 539	flowi4_init_output(fl4, oif, mark, tos,
 540			   RT_SCOPE_UNIVERSE, prot,
 541			   flow_flags,
 542			   iph->daddr, iph->saddr, 0, 0,
 543			   sock_net_uid(net, sk));
 544}
 545
 546static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 547			       const struct sock *sk)
 548{
 549	const struct net *net = dev_net(skb->dev);
 550	const struct iphdr *iph = ip_hdr(skb);
 551	int oif = skb->dev->ifindex;
 552	u8 tos = RT_TOS(iph->tos);
 553	u8 prot = iph->protocol;
 554	u32 mark = skb->mark;
 555
 556	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 557}
 558
 559static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 560{
 561	const struct inet_sock *inet = inet_sk(sk);
 562	const struct ip_options_rcu *inet_opt;
 563	__be32 daddr = inet->inet_daddr;
 564
 565	rcu_read_lock();
 566	inet_opt = rcu_dereference(inet->inet_opt);
 567	if (inet_opt && inet_opt->opt.srr)
 568		daddr = inet_opt->opt.faddr;
 569	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 570			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 571			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 572			   inet_sk_flowi_flags(sk),
 573			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 574	rcu_read_unlock();
 575}
 576
 577static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 578				 const struct sk_buff *skb)
 579{
 580	if (skb)
 581		build_skb_flow_key(fl4, skb, sk);
 582	else
 583		build_sk_flow_key(fl4, sk);
 584}
 585
 586static DEFINE_SPINLOCK(fnhe_lock);
 587
 588static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 589{
 590	struct rtable *rt;
 591
 592	rt = rcu_dereference(fnhe->fnhe_rth_input);
 593	if (rt) {
 594		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 595		dst_dev_put(&rt->dst);
 596		dst_release(&rt->dst);
 597	}
 598	rt = rcu_dereference(fnhe->fnhe_rth_output);
 599	if (rt) {
 600		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 601		dst_dev_put(&rt->dst);
 602		dst_release(&rt->dst);
 603	}
 604}
 605
 606static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 607{
 608	struct fib_nh_exception *fnhe, *oldest;
 609
 610	oldest = rcu_dereference(hash->chain);
 611	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 612	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 613		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 614			oldest = fnhe;
 615	}
 616	fnhe_flush_routes(oldest);
 617	return oldest;
 618}
 619
 620static inline u32 fnhe_hashfun(__be32 daddr)
 621{
 622	static u32 fnhe_hashrnd __read_mostly;
 623	u32 hval;
 624
 625	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 626	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 627	return hash_32(hval, FNHE_HASH_SHIFT);
 628}
 629
 630static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 631{
 632	rt->rt_pmtu = fnhe->fnhe_pmtu;
 633	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 634	rt->dst.expires = fnhe->fnhe_expires;
 635
 636	if (fnhe->fnhe_gw) {
 637		rt->rt_flags |= RTCF_REDIRECTED;
 638		rt->rt_uses_gateway = 1;
 639		rt->rt_gw_family = AF_INET;
 640		rt->rt_gw4 = fnhe->fnhe_gw;
 641	}
 642}
 643
 644static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 645				  __be32 gw, u32 pmtu, bool lock,
 646				  unsigned long expires)
 647{
 648	struct fnhe_hash_bucket *hash;
 649	struct fib_nh_exception *fnhe;
 650	struct rtable *rt;
 651	u32 genid, hval;
 652	unsigned int i;
 653	int depth;
 654
 655	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 656	hval = fnhe_hashfun(daddr);
 657
 658	spin_lock_bh(&fnhe_lock);
 659
 660	hash = rcu_dereference(nhc->nhc_exceptions);
 661	if (!hash) {
 662		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 663		if (!hash)
 664			goto out_unlock;
 665		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 666	}
 667
 668	hash += hval;
 669
 670	depth = 0;
 671	for (fnhe = rcu_dereference(hash->chain); fnhe;
 672	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 673		if (fnhe->fnhe_daddr == daddr)
 674			break;
 675		depth++;
 676	}
 677
 678	if (fnhe) {
 679		if (fnhe->fnhe_genid != genid)
 680			fnhe->fnhe_genid = genid;
 681		if (gw)
 682			fnhe->fnhe_gw = gw;
 683		if (pmtu) {
 684			fnhe->fnhe_pmtu = pmtu;
 685			fnhe->fnhe_mtu_locked = lock;
 686		}
 687		fnhe->fnhe_expires = max(1UL, expires);
 688		/* Update all cached dsts too */
 689		rt = rcu_dereference(fnhe->fnhe_rth_input);
 690		if (rt)
 691			fill_route_from_fnhe(rt, fnhe);
 692		rt = rcu_dereference(fnhe->fnhe_rth_output);
 693		if (rt)
 694			fill_route_from_fnhe(rt, fnhe);
 695	} else {
 696		if (depth > FNHE_RECLAIM_DEPTH)
 697			fnhe = fnhe_oldest(hash);
 698		else {
 699			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 700			if (!fnhe)
 701				goto out_unlock;
 702
 703			fnhe->fnhe_next = hash->chain;
 704			rcu_assign_pointer(hash->chain, fnhe);
 705		}
 706		fnhe->fnhe_genid = genid;
 707		fnhe->fnhe_daddr = daddr;
 708		fnhe->fnhe_gw = gw;
 709		fnhe->fnhe_pmtu = pmtu;
 710		fnhe->fnhe_mtu_locked = lock;
 711		fnhe->fnhe_expires = max(1UL, expires);
 712
  713		/* Exception created; mark the nexthop's cached routes
  714		 * stale, so anyone caching them rechecks whether this
  715		 * exception applies.
 716		 */
 717		rt = rcu_dereference(nhc->nhc_rth_input);
 718		if (rt)
 719			rt->dst.obsolete = DST_OBSOLETE_KILL;
 720
 721		for_each_possible_cpu(i) {
 722			struct rtable __rcu **prt;
 723			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 724			rt = rcu_dereference(*prt);
 725			if (rt)
 726				rt->dst.obsolete = DST_OBSOLETE_KILL;
 727		}
 728	}
 729
 730	fnhe->fnhe_stamp = jiffies;
 731
 732out_unlock:
 733	spin_unlock_bh(&fnhe_lock);
 734}
 735
 736static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 737			     bool kill_route)
 738{
 739	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 740	__be32 old_gw = ip_hdr(skb)->saddr;
 741	struct net_device *dev = skb->dev;
 742	struct in_device *in_dev;
 743	struct fib_result res;
 744	struct neighbour *n;
 745	struct net *net;
 746
 747	switch (icmp_hdr(skb)->code & 7) {
 748	case ICMP_REDIR_NET:
 749	case ICMP_REDIR_NETTOS:
 750	case ICMP_REDIR_HOST:
 751	case ICMP_REDIR_HOSTTOS:
 752		break;
 753
 754	default:
 755		return;
 756	}
 757
 758	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 759		return;
 760
 761	in_dev = __in_dev_get_rcu(dev);
 762	if (!in_dev)
 763		return;
 764
 765	net = dev_net(dev);
 766	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 767	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 768	    ipv4_is_zeronet(new_gw))
 769		goto reject_redirect;
 770
 771	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 772		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 773			goto reject_redirect;
 774		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 775			goto reject_redirect;
 776	} else {
 777		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 778			goto reject_redirect;
 779	}
 780
 781	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 782	if (!n)
 783		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 784	if (!IS_ERR(n)) {
 785		if (!(n->nud_state & NUD_VALID)) {
 786			neigh_event_send(n, NULL);
 787		} else {
 788			if (fib_lookup(net, fl4, &res, 0) == 0) {
 789				struct fib_nh_common *nhc = FIB_RES_NHC(res);
 790
 791				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 792						0, false,
 793						jiffies + ip_rt_gc_timeout);
 794			}
 795			if (kill_route)
 796				rt->dst.obsolete = DST_OBSOLETE_KILL;
 797			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 798		}
 799		neigh_release(n);
 800	}
 801	return;
 802
 803reject_redirect:
 804#ifdef CONFIG_IP_ROUTE_VERBOSE
 805	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 806		const struct iphdr *iph = (const struct iphdr *) skb->data;
 807		__be32 daddr = iph->daddr;
 808		__be32 saddr = iph->saddr;
 809
 810		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 811				     "  Advised path = %pI4 -> %pI4\n",
 812				     &old_gw, dev->name, &new_gw,
 813				     &saddr, &daddr);
 814	}
 815#endif
 816	;
 817}
 818
 819static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 820{
 821	struct rtable *rt;
 822	struct flowi4 fl4;
 823	const struct iphdr *iph = (const struct iphdr *) skb->data;
 824	struct net *net = dev_net(skb->dev);
 825	int oif = skb->dev->ifindex;
 826	u8 tos = RT_TOS(iph->tos);
 827	u8 prot = iph->protocol;
 828	u32 mark = skb->mark;
 829
 830	rt = (struct rtable *) dst;
 831
 832	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 833	__ip_do_redirect(rt, skb, &fl4, true);
 834}
 835
 836static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 837{
 838	struct rtable *rt = (struct rtable *)dst;
 839	struct dst_entry *ret = dst;
 840
 841	if (rt) {
 842		if (dst->obsolete > 0) {
 843			ip_rt_put(rt);
 844			ret = NULL;
 845		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 846			   rt->dst.expires) {
 847			ip_rt_put(rt);
 848			ret = NULL;
 849		}
 850	}
 851	return ret;
 852}
 853
 854/*
 855 * Algorithm:
 856 *	1. The first ip_rt_redirect_number redirects are sent
 857 *	   with exponential backoff, then we stop sending them at all,
 858 *	   assuming that the host ignores our redirects.
 859 *	2. If we did not see packets requiring redirects
 860 *	   during ip_rt_redirect_silence, we assume that the host
  861 *	   forgot the redirected route and we start sending redirects again.
 862 *
 863 * This algorithm is much cheaper and more intelligent than dumb load limiting
 864 * in icmp.c.
 865 *
 866 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 867 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 868 */
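/* Editor's note (worked example, assuming HZ == 250 and the defaults above):
 * ip_rt_redirect_load = HZ/50 = 5 jiffies (20 ms), so redirect n (0-based) is
 * only resent once rate_last + (5 << n) jiffies have passed: 20 ms, 40 ms,
 * 80 ms, ... 5.12 s.  After ip_rt_redirect_number (9) ignored redirects we go
 * silent, and only resume once no offending packets are seen for
 * ip_rt_redirect_silence = (HZ/50) << 10 = 5120 jiffies, about 20.5 s.
 */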
 869
 870void ip_rt_send_redirect(struct sk_buff *skb)
 871{
 872	struct rtable *rt = skb_rtable(skb);
 873	struct in_device *in_dev;
 874	struct inet_peer *peer;
 875	struct net *net;
 876	int log_martians;
 877	int vif;
 878
 879	rcu_read_lock();
 880	in_dev = __in_dev_get_rcu(rt->dst.dev);
 881	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 882		rcu_read_unlock();
 883		return;
 884	}
 885	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 886	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 887	rcu_read_unlock();
 888
 889	net = dev_net(rt->dst.dev);
 890	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 891	if (!peer) {
 892		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 893			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 894		return;
 895	}
 896
 897	/* No redirected packets during ip_rt_redirect_silence;
 898	 * reset the algorithm.
 899	 */
 900	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 901		peer->rate_tokens = 0;
 902		peer->n_redirects = 0;
 903	}
 904
  905	/* Too many ignored redirects; do not send anything.
  906	 * Set peer->rate_last to the last seen redirected packet.
 907	 */
 908	if (peer->n_redirects >= ip_rt_redirect_number) {
 909		peer->rate_last = jiffies;
 910		goto out_put_peer;
 911	}
 912
 913	/* Check for load limit; set rate_last to the latest sent
 914	 * redirect.
 915	 */
 916	if (peer->rate_tokens == 0 ||
 917	    time_after(jiffies,
 918		       (peer->rate_last +
 919			(ip_rt_redirect_load << peer->n_redirects)))) {
 920		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 921
 922		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 923		peer->rate_last = jiffies;
 924		++peer->n_redirects;
 925#ifdef CONFIG_IP_ROUTE_VERBOSE
 926		if (log_martians &&
 927		    peer->n_redirects == ip_rt_redirect_number)
 928			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 929					     &ip_hdr(skb)->saddr, inet_iif(skb),
 930					     &ip_hdr(skb)->daddr, &gw);
 931#endif
 932	}
 933out_put_peer:
 934	inet_putpeer(peer);
 935}
 936
 937static int ip_error(struct sk_buff *skb)
 938{
 939	struct rtable *rt = skb_rtable(skb);
 940	struct net_device *dev = skb->dev;
 941	struct in_device *in_dev;
 942	struct inet_peer *peer;
 943	unsigned long now;
 944	struct net *net;
 945	bool send;
 946	int code;
 947
 948	if (netif_is_l3_master(skb->dev)) {
 949		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 950		if (!dev)
 951			goto out;
 952	}
 953
 954	in_dev = __in_dev_get_rcu(dev);
 955
 956	/* IP on this device is disabled. */
 957	if (!in_dev)
 958		goto out;
 959
 960	net = dev_net(rt->dst.dev);
 961	if (!IN_DEV_FORWARD(in_dev)) {
 962		switch (rt->dst.error) {
 963		case EHOSTUNREACH:
 964			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 965			break;
 966
 967		case ENETUNREACH:
 968			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 969			break;
 970		}
 971		goto out;
 972	}
 973
 974	switch (rt->dst.error) {
 975	case EINVAL:
 976	default:
 977		goto out;
 978	case EHOSTUNREACH:
 979		code = ICMP_HOST_UNREACH;
 980		break;
 981	case ENETUNREACH:
 982		code = ICMP_NET_UNREACH;
 983		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 984		break;
 985	case EACCES:
 986		code = ICMP_PKT_FILTERED;
 987		break;
 988	}
 989
 990	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 991			       l3mdev_master_ifindex(skb->dev), 1);
 992
 993	send = true;
 994	if (peer) {
 995		now = jiffies;
 996		peer->rate_tokens += now - peer->rate_last;
 997		if (peer->rate_tokens > ip_rt_error_burst)
 998			peer->rate_tokens = ip_rt_error_burst;
 999		peer->rate_last = now;
1000		if (peer->rate_tokens >= ip_rt_error_cost)
1001			peer->rate_tokens -= ip_rt_error_cost;
1002		else
1003			send = false;
1004		inet_putpeer(peer);
1005	}
1006	if (send)
1007		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1008
1009out:	kfree_skb(skb);
1010	return 0;
1011}
1012
1013static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1014{
1015	struct dst_entry *dst = &rt->dst;
1016	u32 old_mtu = ipv4_mtu(dst);
1017	struct fib_result res;
1018	bool lock = false;
1019
1020	if (ip_mtu_locked(dst))
1021		return;
1022
1023	if (old_mtu < mtu)
1024		return;
1025
1026	if (mtu < ip_rt_min_pmtu) {
1027		lock = true;
1028		mtu = min(old_mtu, ip_rt_min_pmtu);
1029	}
1030
1031	if (rt->rt_pmtu == mtu && !lock &&
1032	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1033		return;
1034
1035	rcu_read_lock();
1036	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1037		struct fib_nh_common *nhc = FIB_RES_NHC(res);
1038
1039		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1040				      jiffies + ip_rt_mtu_expires);
1041	}
1042	rcu_read_unlock();
1043}
1044
1045static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1046			      struct sk_buff *skb, u32 mtu)
1047{
1048	struct rtable *rt = (struct rtable *) dst;
1049	struct flowi4 fl4;
1050
1051	ip_rt_build_flow_key(&fl4, sk, skb);
1052	__ip_rt_update_pmtu(rt, &fl4, mtu);
1053}
1054
1055void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1056		      int oif, u8 protocol)
1057{
1058	const struct iphdr *iph = (const struct iphdr *) skb->data;
1059	struct flowi4 fl4;
1060	struct rtable *rt;
1061	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063	__build_flow_key(net, &fl4, NULL, iph, oif,
1064			 RT_TOS(iph->tos), protocol, mark, 0);
1065	rt = __ip_route_output_key(net, &fl4);
1066	if (!IS_ERR(rt)) {
1067		__ip_rt_update_pmtu(rt, &fl4, mtu);
1068		ip_rt_put(rt);
1069	}
1070}
1071EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
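/* Editor's note (illustrative, not part of the original file): tunnel-style
 * drivers are typical users of this export.  A hypothetical ICMP error
 * handler reacting to "fragmentation needed" might call it like this; the
 * function name and its surroundings are assumptions for illustration only.
 */
static void example_tunnel_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;

	if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH &&
	    icmp_hdr(skb)->code == ICMP_FRAG_NEEDED) {
		/* info carries the next-hop MTU reported by the router */
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 0, iph->protocol);
	}
}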
1072
1073static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074{
1075	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076	struct flowi4 fl4;
1077	struct rtable *rt;
1078
1079	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081	if (!fl4.flowi4_mark)
1082		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084	rt = __ip_route_output_key(sock_net(sk), &fl4);
1085	if (!IS_ERR(rt)) {
1086		__ip_rt_update_pmtu(rt, &fl4, mtu);
1087		ip_rt_put(rt);
1088	}
1089}
1090
1091void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093	const struct iphdr *iph = (const struct iphdr *) skb->data;
1094	struct flowi4 fl4;
1095	struct rtable *rt;
1096	struct dst_entry *odst = NULL;
1097	bool new = false;
1098	struct net *net = sock_net(sk);
1099
1100	bh_lock_sock(sk);
1101
1102	if (!ip_sk_accept_pmtu(sk))
1103		goto out;
1104
1105	odst = sk_dst_get(sk);
1106
1107	if (sock_owned_by_user(sk) || !odst) {
1108		__ipv4_sk_update_pmtu(skb, sk, mtu);
1109		goto out;
1110	}
1111
1112	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114	rt = (struct rtable *)odst;
1115	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117		if (IS_ERR(rt))
1118			goto out;
1119
1120		new = true;
1121	}
1122
1123	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125	if (!dst_check(&rt->dst, 0)) {
1126		if (new)
1127			dst_release(&rt->dst);
1128
1129		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130		if (IS_ERR(rt))
1131			goto out;
1132
1133		new = true;
1134	}
1135
1136	if (new)
1137		sk_dst_set(sk, &rt->dst);
1138
1139out:
1140	bh_unlock_sock(sk);
1141	dst_release(odst);
1142}
1143EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146		   int oif, u8 protocol)
1147{
1148	const struct iphdr *iph = (const struct iphdr *) skb->data;
1149	struct flowi4 fl4;
1150	struct rtable *rt;
1151
1152	__build_flow_key(net, &fl4, NULL, iph, oif,
1153			 RT_TOS(iph->tos), protocol, 0, 0);
1154	rt = __ip_route_output_key(net, &fl4);
1155	if (!IS_ERR(rt)) {
1156		__ip_do_redirect(rt, skb, &fl4, false);
1157		ip_rt_put(rt);
1158	}
1159}
1160EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163{
1164	const struct iphdr *iph = (const struct iphdr *) skb->data;
1165	struct flowi4 fl4;
1166	struct rtable *rt;
1167	struct net *net = sock_net(sk);
1168
1169	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170	rt = __ip_route_output_key(net, &fl4);
1171	if (!IS_ERR(rt)) {
1172		__ip_do_redirect(rt, skb, &fl4, false);
1173		ip_rt_put(rt);
1174	}
1175}
1176EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179{
1180	struct rtable *rt = (struct rtable *) dst;
1181
 1182	/* All IPv4 dsts are created with ->obsolete set to the value
 1183	 * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
 1184	 * down into this function.
1185	 *
1186	 * When a PMTU/redirect information update invalidates a route,
1187	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188	 * DST_OBSOLETE_DEAD.
1189	 */
1190	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191		return NULL;
1192	return dst;
1193}
1194
1195static void ipv4_send_dest_unreach(struct sk_buff *skb)
1196{
1197	struct ip_options opt;
1198	int res;
1199
1200	/* Recompile ip options since IPCB may not be valid anymore.
1201	 * Also check we have a reasonable ipv4 header.
1202	 */
1203	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1204	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1205		return;
1206
1207	memset(&opt, 0, sizeof(opt));
1208	if (ip_hdr(skb)->ihl > 5) {
1209		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1210			return;
1211		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1212
1213		rcu_read_lock();
1214		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1215		rcu_read_unlock();
1216
1217		if (res)
1218			return;
1219	}
1220	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1221}
1222
1223static void ipv4_link_failure(struct sk_buff *skb)
1224{
1225	struct rtable *rt;
1226
1227	ipv4_send_dest_unreach(skb);
1228
1229	rt = skb_rtable(skb);
1230	if (rt)
1231		dst_set_expires(&rt->dst, 0);
1232}
1233
1234static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1235{
1236	pr_debug("%s: %pI4 -> %pI4, %s\n",
1237		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1238		 skb->dev ? skb->dev->name : "?");
1239	kfree_skb(skb);
1240	WARN_ON(1);
1241	return 0;
1242}
1243
1244/*
 1245   We do not cache the source address of the outgoing interface,
 1246   because it is used only by the IP RR, TS and SRR options,
 1247   so it is out of the fast path.
 1248
 1249   BTW remember: "addr" may be unaligned when it points into
 1250   IP options!
1251 */
1252
1253void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1254{
1255	__be32 src;
1256
1257	if (rt_is_output_route(rt))
1258		src = ip_hdr(skb)->saddr;
1259	else {
1260		struct fib_result res;
1261		struct iphdr *iph = ip_hdr(skb);
1262		struct flowi4 fl4 = {
1263			.daddr = iph->daddr,
1264			.saddr = iph->saddr,
1265			.flowi4_tos = RT_TOS(iph->tos),
1266			.flowi4_oif = rt->dst.dev->ifindex,
1267			.flowi4_iif = skb->dev->ifindex,
1268			.flowi4_mark = skb->mark,
1269		};
1270
1271		rcu_read_lock();
1272		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1273			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1274		else
1275			src = inet_select_addr(rt->dst.dev,
1276					       rt_nexthop(rt, iph->daddr),
1277					       RT_SCOPE_UNIVERSE);
1278		rcu_read_unlock();
1279	}
1280	memcpy(addr, &src, 4);
1281}
1282
1283#ifdef CONFIG_IP_ROUTE_CLASSID
1284static void set_class_tag(struct rtable *rt, u32 tag)
1285{
1286	if (!(rt->dst.tclassid & 0xFFFF))
1287		rt->dst.tclassid |= tag & 0xFFFF;
1288	if (!(rt->dst.tclassid & 0xFFFF0000))
1289		rt->dst.tclassid |= tag & 0xFFFF0000;
1290}
1291#endif
1292
1293static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1294{
1295	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1296	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1297				    ip_rt_min_advmss);
1298
1299	return min(advmss, IPV4_MAX_PMTU - header_size);
1300}
1301
1302static unsigned int ipv4_mtu(const struct dst_entry *dst)
1303{
1304	const struct rtable *rt = (const struct rtable *) dst;
1305	unsigned int mtu = rt->rt_pmtu;
1306
1307	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1308		mtu = dst_metric_raw(dst, RTAX_MTU);
1309
1310	if (mtu)
1311		return mtu;
1312
1313	mtu = READ_ONCE(dst->dev->mtu);
1314
1315	if (unlikely(ip_mtu_locked(dst))) {
1316		if (rt->rt_uses_gateway && mtu > 576)
1317			mtu = 576;
1318	}
1319
1320	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1321
1322	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1323}
1324
1325static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1326{
1327	struct fnhe_hash_bucket *hash;
1328	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1329	u32 hval = fnhe_hashfun(daddr);
1330
1331	spin_lock_bh(&fnhe_lock);
1332
1333	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1334					 lockdep_is_held(&fnhe_lock));
1335	hash += hval;
1336
1337	fnhe_p = &hash->chain;
1338	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1339	while (fnhe) {
1340		if (fnhe->fnhe_daddr == daddr) {
1341			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1342				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1343			/* set fnhe_daddr to 0 to ensure it won't bind with
1344			 * new dsts in rt_bind_exception().
1345			 */
1346			fnhe->fnhe_daddr = 0;
1347			fnhe_flush_routes(fnhe);
1348			kfree_rcu(fnhe, rcu);
1349			break;
1350		}
1351		fnhe_p = &fnhe->fnhe_next;
1352		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1353						 lockdep_is_held(&fnhe_lock));
1354	}
1355
1356	spin_unlock_bh(&fnhe_lock);
1357}
1358
1359static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1360					       __be32 daddr)
1361{
1362	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1363	struct fib_nh_exception *fnhe;
1364	u32 hval;
1365
1366	if (!hash)
1367		return NULL;
1368
1369	hval = fnhe_hashfun(daddr);
1370
1371	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1372	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1373		if (fnhe->fnhe_daddr == daddr) {
1374			if (fnhe->fnhe_expires &&
1375			    time_after(jiffies, fnhe->fnhe_expires)) {
1376				ip_del_fnhe(nhc, daddr);
1377				break;
1378			}
1379			return fnhe;
1380		}
1381	}
1382	return NULL;
1383}
1384
1385/* MTU selection:
1386 * 1. mtu on route is locked - use it
1387 * 2. mtu from nexthop exception
1388 * 3. mtu from egress device
1389 */
1390
1391u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1392{
1393	struct fib_nh_common *nhc = res->nhc;
1394	struct net_device *dev = nhc->nhc_dev;
1395	struct fib_info *fi = res->fi;
1396	u32 mtu = 0;
1397
1398	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1399	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1400		mtu = fi->fib_mtu;
1401
1402	if (likely(!mtu)) {
1403		struct fib_nh_exception *fnhe;
1404
1405		fnhe = find_exception(nhc, daddr);
1406		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1407			mtu = fnhe->fnhe_pmtu;
1408	}
1409
1410	if (likely(!mtu))
1411		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1412
1413	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1414}
1415
1416static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1417			      __be32 daddr, const bool do_cache)
1418{
1419	bool ret = false;
1420
1421	spin_lock_bh(&fnhe_lock);
1422
1423	if (daddr == fnhe->fnhe_daddr) {
1424		struct rtable __rcu **porig;
1425		struct rtable *orig;
1426		int genid = fnhe_genid(dev_net(rt->dst.dev));
1427
1428		if (rt_is_input_route(rt))
1429			porig = &fnhe->fnhe_rth_input;
1430		else
1431			porig = &fnhe->fnhe_rth_output;
1432		orig = rcu_dereference(*porig);
1433
1434		if (fnhe->fnhe_genid != genid) {
1435			fnhe->fnhe_genid = genid;
1436			fnhe->fnhe_gw = 0;
1437			fnhe->fnhe_pmtu = 0;
1438			fnhe->fnhe_expires = 0;
1439			fnhe->fnhe_mtu_locked = false;
1440			fnhe_flush_routes(fnhe);
1441			orig = NULL;
1442		}
1443		fill_route_from_fnhe(rt, fnhe);
1444		if (!rt->rt_gw4) {
1445			rt->rt_gw4 = daddr;
1446			rt->rt_gw_family = AF_INET;
1447		}
1448
1449		if (do_cache) {
1450			dst_hold(&rt->dst);
1451			rcu_assign_pointer(*porig, rt);
1452			if (orig) {
1453				dst_dev_put(&orig->dst);
1454				dst_release(&orig->dst);
1455			}
1456			ret = true;
1457		}
1458
1459		fnhe->fnhe_stamp = jiffies;
1460	}
1461	spin_unlock_bh(&fnhe_lock);
1462
1463	return ret;
1464}
1465
1466static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1467{
1468	struct rtable *orig, *prev, **p;
1469	bool ret = true;
1470
1471	if (rt_is_input_route(rt)) {
1472		p = (struct rtable **)&nhc->nhc_rth_input;
1473	} else {
1474		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1475	}
1476	orig = *p;
1477
1478	/* hold dst before doing cmpxchg() to avoid race condition
1479	 * on this dst
1480	 */
1481	dst_hold(&rt->dst);
1482	prev = cmpxchg(p, orig, rt);
1483	if (prev == orig) {
1484		if (orig) {
1485			rt_add_uncached_list(orig);
1486			dst_release(&orig->dst);
1487		}
1488	} else {
1489		dst_release(&rt->dst);
1490		ret = false;
1491	}
1492
1493	return ret;
1494}
1495
1496struct uncached_list {
1497	spinlock_t		lock;
1498	struct list_head	head;
1499};
1500
1501static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1502
1503void rt_add_uncached_list(struct rtable *rt)
1504{
1505	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1506
1507	rt->rt_uncached_list = ul;
1508
1509	spin_lock_bh(&ul->lock);
1510	list_add_tail(&rt->rt_uncached, &ul->head);
1511	spin_unlock_bh(&ul->lock);
1512}
1513
1514void rt_del_uncached_list(struct rtable *rt)
1515{
1516	if (!list_empty(&rt->rt_uncached)) {
1517		struct uncached_list *ul = rt->rt_uncached_list;
1518
1519		spin_lock_bh(&ul->lock);
1520		list_del(&rt->rt_uncached);
1521		spin_unlock_bh(&ul->lock);
1522	}
1523}
1524
1525static void ipv4_dst_destroy(struct dst_entry *dst)
1526{
1527	struct rtable *rt = (struct rtable *)dst;
1528
1529	ip_dst_metrics_put(dst);
1530	rt_del_uncached_list(rt);
1531}
1532
1533void rt_flush_dev(struct net_device *dev)
1534{
1535	struct rtable *rt;
1536	int cpu;
1537
1538	for_each_possible_cpu(cpu) {
1539		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1540
1541		spin_lock_bh(&ul->lock);
1542		list_for_each_entry(rt, &ul->head, rt_uncached) {
1543			if (rt->dst.dev != dev)
1544				continue;
1545			rt->dst.dev = blackhole_netdev;
1546			dev_hold(rt->dst.dev);
1547			dev_put(dev);
1548		}
1549		spin_unlock_bh(&ul->lock);
1550	}
1551}
1552
1553static bool rt_cache_valid(const struct rtable *rt)
1554{
1555	return	rt &&
1556		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1557		!rt_is_expired(rt);
1558}
1559
1560static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1561			   const struct fib_result *res,
1562			   struct fib_nh_exception *fnhe,
1563			   struct fib_info *fi, u16 type, u32 itag,
1564			   const bool do_cache)
1565{
1566	bool cached = false;
1567
1568	if (fi) {
1569		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1570
1571		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1572			rt->rt_uses_gateway = 1;
1573			rt->rt_gw_family = nhc->nhc_gw_family;
1574			/* only INET and INET6 are supported */
1575			if (likely(nhc->nhc_gw_family == AF_INET))
1576				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1577			else
1578				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1579		}
1580
1581		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1582
1583#ifdef CONFIG_IP_ROUTE_CLASSID
1584		if (nhc->nhc_family == AF_INET) {
1585			struct fib_nh *nh;
1586
1587			nh = container_of(nhc, struct fib_nh, nh_common);
1588			rt->dst.tclassid = nh->nh_tclassid;
1589		}
1590#endif
1591		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1592		if (unlikely(fnhe))
1593			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1594		else if (do_cache)
1595			cached = rt_cache_route(nhc, rt);
1596		if (unlikely(!cached)) {
1597			/* Routes we intend to cache in nexthop exception or
1598			 * FIB nexthop have the DST_NOCACHE bit clear.
1599			 * However, if we are unsuccessful at storing this
1600			 * route into the cache we really need to set it.
1601			 */
1602			if (!rt->rt_gw4) {
1603				rt->rt_gw_family = AF_INET;
1604				rt->rt_gw4 = daddr;
1605			}
1606			rt_add_uncached_list(rt);
1607		}
1608	} else
1609		rt_add_uncached_list(rt);
1610
1611#ifdef CONFIG_IP_ROUTE_CLASSID
1612#ifdef CONFIG_IP_MULTIPLE_TABLES
1613	set_class_tag(rt, res->tclassid);
1614#endif
1615	set_class_tag(rt, itag);
1616#endif
1617}
1618
1619struct rtable *rt_dst_alloc(struct net_device *dev,
1620			    unsigned int flags, u16 type,
1621			    bool nopolicy, bool noxfrm, bool will_cache)
1622{
1623	struct rtable *rt;
1624
1625	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1626		       (will_cache ? 0 : DST_HOST) |
1627		       (nopolicy ? DST_NOPOLICY : 0) |
1628		       (noxfrm ? DST_NOXFRM : 0));
1629
1630	if (rt) {
1631		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1632		rt->rt_flags = flags;
1633		rt->rt_type = type;
1634		rt->rt_is_input = 0;
1635		rt->rt_iif = 0;
1636		rt->rt_pmtu = 0;
1637		rt->rt_mtu_locked = 0;
1638		rt->rt_uses_gateway = 0;
1639		rt->rt_gw_family = 0;
1640		rt->rt_gw4 = 0;
1641		INIT_LIST_HEAD(&rt->rt_uncached);
1642
1643		rt->dst.output = ip_output;
1644		if (flags & RTCF_LOCAL)
1645			rt->dst.input = ip_local_deliver;
1646	}
1647
1648	return rt;
1649}
1650EXPORT_SYMBOL(rt_dst_alloc);
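/* Editor's note (illustrative, not part of the original file): a minimal,
 * hypothetical use of rt_dst_alloc() for a locally delivered route; the
 * real callers in this file (e.g. ip_route_input_mc(), __mkroute_input())
 * derive flags and type from the FIB lookup instead of hard-coding them.
 */
static struct rtable *example_alloc_local_rt(struct net_device *dev)
{
	struct rtable *rt;

	rt = rt_dst_alloc(dev, RTCF_LOCAL, RTN_LOCAL,
			  false, false, false);
	if (rt)
		rt->dst.output = ip_rt_bug;	/* local input routes never transmit */
	return rt;
}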
1651
1652struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1653{
1654	struct rtable *new_rt;
1655
1656	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1657			   rt->dst.flags);
1658
1659	if (new_rt) {
1660		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1661		new_rt->rt_flags = rt->rt_flags;
1662		new_rt->rt_type = rt->rt_type;
1663		new_rt->rt_is_input = rt->rt_is_input;
1664		new_rt->rt_iif = rt->rt_iif;
1665		new_rt->rt_pmtu = rt->rt_pmtu;
1666		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1667		new_rt->rt_gw_family = rt->rt_gw_family;
1668		if (rt->rt_gw_family == AF_INET)
1669			new_rt->rt_gw4 = rt->rt_gw4;
1670		else if (rt->rt_gw_family == AF_INET6)
1671			new_rt->rt_gw6 = rt->rt_gw6;
1672		INIT_LIST_HEAD(&new_rt->rt_uncached);
1673
1674		new_rt->dst.flags |= DST_HOST;
1675		new_rt->dst.input = rt->dst.input;
1676		new_rt->dst.output = rt->dst.output;
1677		new_rt->dst.error = rt->dst.error;
1678		new_rt->dst.lastuse = jiffies;
1679		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1680	}
1681	return new_rt;
1682}
1683EXPORT_SYMBOL(rt_dst_clone);
1684
1685/* called in rcu_read_lock() section */
1686int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1687			  u8 tos, struct net_device *dev,
1688			  struct in_device *in_dev, u32 *itag)
1689{
1690	int err;
1691
1692	/* Primary sanity checks. */
1693	if (!in_dev)
1694		return -EINVAL;
1695
1696	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1697	    skb->protocol != htons(ETH_P_IP))
1698		return -EINVAL;
1699
1700	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1701		return -EINVAL;
1702
1703	if (ipv4_is_zeronet(saddr)) {
1704		if (!ipv4_is_local_multicast(daddr) &&
1705		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1706			return -EINVAL;
1707	} else {
1708		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1709					  in_dev, itag);
1710		if (err < 0)
1711			return err;
1712	}
1713	return 0;
1714}
1715
1716/* called in rcu_read_lock() section */
1717static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1718			     u8 tos, struct net_device *dev, int our)
1719{
1720	struct in_device *in_dev = __in_dev_get_rcu(dev);
1721	unsigned int flags = RTCF_MULTICAST;
1722	struct rtable *rth;
1723	u32 itag = 0;
1724	int err;
1725
1726	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1727	if (err)
1728		return err;
1729
1730	if (our)
1731		flags |= RTCF_LOCAL;
1732
1733	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1734			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1735	if (!rth)
1736		return -ENOBUFS;
1737
1738#ifdef CONFIG_IP_ROUTE_CLASSID
1739	rth->dst.tclassid = itag;
1740#endif
1741	rth->dst.output = ip_rt_bug;
 1742	rth->rt_is_input = 1;
1743
1744#ifdef CONFIG_IP_MROUTE
1745	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1746		rth->dst.input = ip_mr_input;
1747#endif
1748	RT_CACHE_STAT_INC(in_slow_mc);
1749
1750	skb_dst_set(skb, &rth->dst);
1751	return 0;
1752}
1753
1754
1755static void ip_handle_martian_source(struct net_device *dev,
1756				     struct in_device *in_dev,
1757				     struct sk_buff *skb,
1758				     __be32 daddr,
1759				     __be32 saddr)
1760{
1761	RT_CACHE_STAT_INC(in_martian_src);
1762#ifdef CONFIG_IP_ROUTE_VERBOSE
1763	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1764		/*
 1765		 *	RFC 1812 recommendation: if the source is martian,
 1766		 *	the only hint is the MAC header.
1767		 */
1768		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1769			&daddr, &saddr, dev->name);
1770		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1771			print_hex_dump(KERN_WARNING, "ll header: ",
1772				       DUMP_PREFIX_OFFSET, 16, 1,
1773				       skb_mac_header(skb),
1774				       dev->hard_header_len, false);
1775		}
1776	}
1777#endif
1778}
1779
1780/* called in rcu_read_lock() section */
1781static int __mkroute_input(struct sk_buff *skb,
1782			   const struct fib_result *res,
1783			   struct in_device *in_dev,
1784			   __be32 daddr, __be32 saddr, u32 tos)
1785{
1786	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1787	struct net_device *dev = nhc->nhc_dev;
1788	struct fib_nh_exception *fnhe;
1789	struct rtable *rth;
1790	int err;
1791	struct in_device *out_dev;
1792	bool do_cache;
1793	u32 itag = 0;
1794
1795	/* get a working reference to the output device */
1796	out_dev = __in_dev_get_rcu(dev);
1797	if (!out_dev) {
1798		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1799		return -EINVAL;
1800	}
1801
1802	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1803				  in_dev->dev, in_dev, &itag);
1804	if (err < 0) {
1805		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1806					 saddr);
1807
1808		goto cleanup;
1809	}
1810
1811	do_cache = res->fi && !itag;
1812	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1813	    skb->protocol == htons(ETH_P_IP)) {
1814		__be32 gw;
1815
1816		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1817		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1818		    inet_addr_onlink(out_dev, saddr, gw))
1819			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1820	}
1821
1822	if (skb->protocol != htons(ETH_P_IP)) {
 1823		/* Not IP (i.e. ARP). Do not create a route if it is
 1824		 * invalid for proxy ARP. DNAT routes are always valid.
 1825		 *
 1826		 * The proxy ARP feature has been extended to allow ARP
 1827		 * replies back out the same interface, to support
 1828		 * Private VLAN switch technologies. See arp.c.
1829		 */
1830		if (out_dev == in_dev &&
1831		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1832			err = -EINVAL;
1833			goto cleanup;
1834		}
1835	}
1836
1837	fnhe = find_exception(nhc, daddr);
1838	if (do_cache) {
1839		if (fnhe)
1840			rth = rcu_dereference(fnhe->fnhe_rth_input);
1841		else
1842			rth = rcu_dereference(nhc->nhc_rth_input);
1843		if (rt_cache_valid(rth)) {
1844			skb_dst_set_noref(skb, &rth->dst);
1845			goto out;
1846		}
1847	}
1848
1849	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1850			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1851			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1852	if (!rth) {
1853		err = -ENOBUFS;
1854		goto cleanup;
1855	}
1856
1857	rth->rt_is_input = 1;
1858	RT_CACHE_STAT_INC(in_slow_tot);
1859
1860	rth->dst.input = ip_forward;
1861
1862	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1863		       do_cache);
1864	lwtunnel_set_redirect(&rth->dst);
1865	skb_dst_set(skb, &rth->dst);
1866out:
1867	err = 0;
1868 cleanup:
1869	return err;
1870}
1871
1872#ifdef CONFIG_IP_ROUTE_MULTIPATH
 1873/* To make ICMP error packets follow the same path as the flow they refer
 1874 * to, the multipath hash is calculated from the inner IP addresses.
1875 */
1876static void ip_multipath_l3_keys(const struct sk_buff *skb,
1877				 struct flow_keys *hash_keys)
1878{
1879	const struct iphdr *outer_iph = ip_hdr(skb);
1880	const struct iphdr *key_iph = outer_iph;
1881	const struct iphdr *inner_iph;
1882	const struct icmphdr *icmph;
1883	struct iphdr _inner_iph;
1884	struct icmphdr _icmph;
1885
1886	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1887		goto out;
1888
1889	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1890		goto out;
1891
1892	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1893				   &_icmph);
1894	if (!icmph)
1895		goto out;
1896
1897	if (icmph->type != ICMP_DEST_UNREACH &&
1898	    icmph->type != ICMP_REDIRECT &&
1899	    icmph->type != ICMP_TIME_EXCEEDED &&
1900	    icmph->type != ICMP_PARAMETERPROB)
1901		goto out;
1902
1903	inner_iph = skb_header_pointer(skb,
1904				       outer_iph->ihl * 4 + sizeof(_icmph),
1905				       sizeof(_inner_iph), &_inner_iph);
1906	if (!inner_iph)
1907		goto out;
1908
1909	key_iph = inner_iph;
1910out:
1911	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1912	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1913}
1914
1915/* if skb is set it will be used and fl4 can be NULL */
1916int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1917		       const struct sk_buff *skb, struct flow_keys *flkeys)
1918{
1919	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1920	struct flow_keys hash_keys;
1921	u32 mhash;
1922
1923	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1924	case 0:
1925		memset(&hash_keys, 0, sizeof(hash_keys));
1926		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1927		if (skb) {
1928			ip_multipath_l3_keys(skb, &hash_keys);
1929		} else {
1930			hash_keys.addrs.v4addrs.src = fl4->saddr;
1931			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1932		}
1933		break;
1934	case 1:
1935		/* skb is currently provided only when forwarding */
1936		if (skb) {
1937			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1938			struct flow_keys keys;
1939
1940			/* short-circuit if we already have L4 hash present */
1941			if (skb->l4_hash)
1942				return skb_get_hash_raw(skb) >> 1;
1943
1944			memset(&hash_keys, 0, sizeof(hash_keys));
1945
1946			if (!flkeys) {
1947				skb_flow_dissect_flow_keys(skb, &keys, flag);
1948				flkeys = &keys;
1949			}
1950
1951			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1952			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1953			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1954			hash_keys.ports.src = flkeys->ports.src;
1955			hash_keys.ports.dst = flkeys->ports.dst;
1956			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1957		} else {
1958			memset(&hash_keys, 0, sizeof(hash_keys));
1959			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1960			hash_keys.addrs.v4addrs.src = fl4->saddr;
1961			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1962			hash_keys.ports.src = fl4->fl4_sport;
1963			hash_keys.ports.dst = fl4->fl4_dport;
1964			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1965		}
1966		break;
1967	case 2:
1968		memset(&hash_keys, 0, sizeof(hash_keys));
1969		/* skb is currently provided only when forwarding */
1970		if (skb) {
1971			struct flow_keys keys;
1972
1973			skb_flow_dissect_flow_keys(skb, &keys, 0);
1974			/* Inner can be v4 or v6 */
1975			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1978				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1979			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1980				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1981				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1982				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1983				hash_keys.tags.flow_label = keys.tags.flow_label;
1984				hash_keys.basic.ip_proto = keys.basic.ip_proto;
1985			} else {
1986				/* Same as case 0 */
1987				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1988				ip_multipath_l3_keys(skb, &hash_keys);
1989			}
1990		} else {
1991			/* Same as case 0 */
1992			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1993			hash_keys.addrs.v4addrs.src = fl4->saddr;
1994			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1995		}
1996		break;
1997	}
1998	mhash = flow_hash_from_keys(&hash_keys);
1999
2000	if (multipath_hash)
2001		mhash = jhash_2words(mhash, multipath_hash, 0);
2002
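	/* drop the top bit so the value fits the 31-bit space used by the
	 * next-hop upper bounds
	 */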
2003	return mhash >> 1;
2004}
2005#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2006
2007static int ip_mkroute_input(struct sk_buff *skb,
2008			    struct fib_result *res,
2009			    struct in_device *in_dev,
2010			    __be32 daddr, __be32 saddr, u32 tos,
2011			    struct flow_keys *hkeys)
2012{
2013#ifdef CONFIG_IP_ROUTE_MULTIPATH
2014	if (res->fi && fib_info_num_path(res->fi) > 1) {
2015		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2016
2017		fib_select_multipath(res, h);
2018	}
2019#endif
2020
2021	/* create a routing cache entry */
2022	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2023}
2024
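/* fib_select_multipath() consumes the hash computed above by comparing it
 * against each next hop's precomputed upper bound in the 31-bit hash space.
 * A minimal sketch of that first-fit selection, assuming the bounds were
 * already spread in proportion to the path weights (illustrative only, not
 * part of this file):
 */
#if 0
static int example_select_path(u32 mhash, const u32 *upper_bound, int npaths)
{
	int i;

	for (i = 0; i < npaths; i++) {
		/* first bound that covers the hash wins */
		if (mhash <= upper_bound[i])
			return i;
	}
	return npaths - 1;	/* bounds are expected to cover the whole space */
}
#endif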
2025	/*
2026	 *	NOTE. We drop all packets that have a local source
2027	 *	address, because every properly looped-back packet
2028	 *	must already have the correct destination attached by the output routine.
2029	 *
2030	 *	This approach solves two big problems:
2031	 *	1. Non-simplex devices are handled properly.
2032	 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2033	 *	called with rcu_read_lock()
2034	 */
2035
2036static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2037			       u8 tos, struct net_device *dev,
2038			       struct fib_result *res)
2039{
2040	struct in_device *in_dev = __in_dev_get_rcu(dev);
2041	struct flow_keys *flkeys = NULL, _flkeys;
2042	struct net    *net = dev_net(dev);
2043	struct ip_tunnel_info *tun_info;
2044	int		err = -EINVAL;
2045	unsigned int	flags = 0;
2046	u32		itag = 0;
2047	struct rtable	*rth;
2048	struct flowi4	fl4;
2049	bool do_cache = true;
2050
2051	/* IP on this device is disabled. */
2052
2053	if (!in_dev)
2054		goto out;
2055
2056	/* Check for the weirdest martians, which cannot be detected
2057	   by fib_lookup.
2058	 */
2059
2060	tun_info = skb_tunnel_info(skb);
2061	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2062		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2063	else
2064		fl4.flowi4_tun_key.tun_id = 0;
2065	skb_dst_drop(skb);
2066
2067	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2068		goto martian_source;
2069
2070	res->fi = NULL;
2071	res->table = NULL;
2072	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2073		goto brd_input;
2074
2075	/* Accept zero addresses only for limited broadcast;
2076	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2077	 */
2078	if (ipv4_is_zeronet(saddr))
2079		goto martian_source;
2080
2081	if (ipv4_is_zeronet(daddr))
2082		goto martian_destination;
2083
2084	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2085	 * and calls it at most once when daddr and/or saddr is a loopback address
2086	 */
2087	if (ipv4_is_loopback(daddr)) {
2088		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2089			goto martian_destination;
2090	} else if (ipv4_is_loopback(saddr)) {
2091		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2092			goto martian_source;
2093	}
2094
2095	/*
2096	 *	Now we are ready to route packet.
2097	 */
2098	fl4.flowi4_oif = 0;
2099	fl4.flowi4_iif = dev->ifindex;
2100	fl4.flowi4_mark = skb->mark;
2101	fl4.flowi4_tos = tos;
2102	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2103	fl4.flowi4_flags = 0;
2104	fl4.daddr = daddr;
2105	fl4.saddr = saddr;
2106	fl4.flowi4_uid = sock_net_uid(net, NULL);
2107
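	/* The flow key built above feeds fib_lookup(); the L4 ports and
	 * protocol are filled in only when the early flow dissector ran,
	 * so that FIB rules matching on them keep working.
	 */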
2108	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2109		flkeys = &_flkeys;
2110	} else {
2111		fl4.flowi4_proto = 0;
2112		fl4.fl4_sport = 0;
2113		fl4.fl4_dport = 0;
2114	}
2115
2116	err = fib_lookup(net, &fl4, res, 0);
2117	if (err != 0) {
2118		if (!IN_DEV_FORWARD(in_dev))
2119			err = -EHOSTUNREACH;
2120		goto no_route;
2121	}
2122
2123	if (res->type == RTN_BROADCAST) {
2124		if (IN_DEV_BFORWARD(in_dev))
2125			goto make_route;
2126		/* do not cache if bc_forwarding is enabled */
2127		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2128			do_cache = false;
2129		goto brd_input;
2130	}
2131
2132	if (res->type == RTN_LOCAL) {
2133		err = fib_validate_source(skb, saddr, daddr, tos,
2134					  0, dev, in_dev, &itag);
2135		if (err < 0)
2136			goto martian_source;
2137		goto local_input;
2138	}
2139
2140	if (!IN_DEV_FORWARD(in_dev)) {
2141		err = -EHOSTUNREACH;
2142		goto no_route;
2143	}
2144	if (res->type != RTN_UNICAST)
2145		goto martian_destination;
2146
2147make_route:
2148	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2149out:	return err;
2150
2151brd_input:
2152	if (skb->protocol != htons(ETH_P_IP))
2153		goto e_inval;
2154
2155	if (!ipv4_is_zeronet(saddr)) {
2156		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2157					  in_dev, &itag);
2158		if (err < 0)
2159			goto martian_source;
2160	}
2161	flags |= RTCF_BROADCAST;
2162	res->type = RTN_BROADCAST;
2163	RT_CACHE_STAT_INC(in_brd);
2164
2165local_input:
2166	do_cache &= res->fi && !itag;
2167	if (do_cache) {
2168		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2169
2170		rth = rcu_dereference(nhc->nhc_rth_input);
2171		if (rt_cache_valid(rth)) {
2172			skb_dst_set_noref(skb, &rth->dst);
2173			err = 0;
2174			goto out;
2175		}
2176	}
2177
2178	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2179			   flags | RTCF_LOCAL, res->type,
2180			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2181	if (!rth)
2182		goto e_nobufs;
2183
2184	rth->dst.output= ip_rt_bug;
2185#ifdef CONFIG_IP_ROUTE_CLASSID
2186	rth->dst.tclassid = itag;
2187#endif
2188	rth->rt_is_input = 1;
2189
2190	RT_CACHE_STAT_INC(in_slow_tot);
2191	if (res->type == RTN_UNREACHABLE) {
2192		rth->dst.input= ip_error;
2193		rth->dst.error= -err;
2194		rth->rt_flags 	&= ~RTCF_LOCAL;
2195	}
2196
2197	if (do_cache) {
2198		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2199
2200		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2201		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2202			WARN_ON(rth->dst.input == lwtunnel_input);
2203			rth->dst.lwtstate->orig_input = rth->dst.input;
2204			rth->dst.input = lwtunnel_input;
2205		}
2206
2207		if (unlikely(!rt_cache_route(nhc, rth)))
2208			rt_add_uncached_list(rth);
2209	}
2210	skb_dst_set(skb, &rth->dst);
2211	err = 0;
2212	goto out;
2213
2214no_route:
2215	RT_CACHE_STAT_INC(in_no_route);
2216	res->type = RTN_UNREACHABLE;
2217	res->fi = NULL;
2218	res->table = NULL;
2219	goto local_input;
2220
2221	/*
2222	 *	Do not cache martian addresses: they should be logged (RFC1812)
2223	 */
2224martian_destination:
2225	RT_CACHE_STAT_INC(in_martian_dst);
2226#ifdef CONFIG_IP_ROUTE_VERBOSE
2227	if (IN_DEV_LOG_MARTIANS(in_dev))
2228		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2229				     &daddr, &saddr, dev->name);
2230#endif
2231
2232e_inval:
2233	err = -EINVAL;
2234	goto out;
2235
2236e_nobufs:
2237	err = -ENOBUFS;
2238	goto out;
2239
2240martian_source:
2241	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2242	goto out;
2243}
2244
2245int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2246			 u8 tos, struct net_device *dev)
2247{
2248	struct fib_result res;
2249	int err;
2250
2251	tos &= IPTOS_RT_MASK;
2252	rcu_read_lock();
2253	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2254	rcu_read_unlock();
2255
2256	return err;
2257}
2258EXPORT_SYMBOL(ip_route_input_noref);
2259
2260/* called with rcu_read_lock held */
2261int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2262		       u8 tos, struct net_device *dev, struct fib_result *res)
2263{
2264	/* Multicast recognition logic is moved from the route cache to here.
2265	   The problem was that too many Ethernet cards have broken/missing
2266	   hardware multicast filters :-( As a result the host on a multicast
2267	   network acquires a lot of useless route cache entries, e.g. for
2268	   SDR messages from all over the world. Now we try to get rid of them.
2269	   Really, provided the software IP multicast filter is organized
2270	   reasonably (at least, hashed), it does not result in a slowdown
2271	   compared with route cache reject entries.
2272	   Note that multicast routers are not affected, because a
2273	   route cache entry is created eventually.
2274	 */
2275	if (ipv4_is_multicast(daddr)) {
2276		struct in_device *in_dev = __in_dev_get_rcu(dev);
2277		int our = 0;
2278		int err = -EINVAL;
2279
2280		if (!in_dev)
2281			return err;
2282		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2283				      ip_hdr(skb)->protocol);
2284
2285		/* check l3 master if no match yet */
2286		if (!our && netif_is_l3_slave(dev)) {
2287			struct in_device *l3_in_dev;
2288
2289			l3_in_dev = __in_dev_get_rcu(skb->dev);
2290			if (l3_in_dev)
2291				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2292						      ip_hdr(skb)->protocol);
2293		}
2294
2295		if (our
2296#ifdef CONFIG_IP_MROUTE
2297			||
2298		    (!ipv4_is_local_multicast(daddr) &&
2299		     IN_DEV_MFORWARD(in_dev))
2300#endif
2301		   ) {
2302			err = ip_route_input_mc(skb, daddr, saddr,
2303						tos, dev, our);
2304		}
2305		return err;
2306	}
2307
2308	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2309}
2310
2311/* called with rcu_read_lock() */
2312static struct rtable *__mkroute_output(const struct fib_result *res,
2313				       const struct flowi4 *fl4, int orig_oif,
2314				       struct net_device *dev_out,
2315				       unsigned int flags)
2316{
2317	struct fib_info *fi = res->fi;
2318	struct fib_nh_exception *fnhe;
2319	struct in_device *in_dev;
2320	u16 type = res->type;
2321	struct rtable *rth;
2322	bool do_cache;
2323
2324	in_dev = __in_dev_get_rcu(dev_out);
2325	if (!in_dev)
2326		return ERR_PTR(-EINVAL);
2327
2328	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2329		if (ipv4_is_loopback(fl4->saddr) &&
2330		    !(dev_out->flags & IFF_LOOPBACK) &&
2331		    !netif_is_l3_master(dev_out))
2332			return ERR_PTR(-EINVAL);
2333
2334	if (ipv4_is_lbcast(fl4->daddr))
2335		type = RTN_BROADCAST;
2336	else if (ipv4_is_multicast(fl4->daddr))
2337		type = RTN_MULTICAST;
2338	else if (ipv4_is_zeronet(fl4->daddr))
2339		return ERR_PTR(-EINVAL);
2340
2341	if (dev_out->flags & IFF_LOOPBACK)
2342		flags |= RTCF_LOCAL;
2343
2344	do_cache = true;
2345	if (type == RTN_BROADCAST) {
2346		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2347		fi = NULL;
2348	} else if (type == RTN_MULTICAST) {
2349		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2350		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2351				     fl4->flowi4_proto))
2352			flags &= ~RTCF_LOCAL;
2353		else
2354			do_cache = false;
2355		/* If a multicast route does not exist, use the
2356		 * default one, but do not gateway in this case.
2357		 * Yes, it is a hack.
2358		 */
2359		if (fi && res->prefixlen < 4)
2360			fi = NULL;
2361	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2362		   (orig_oif != dev_out->ifindex)) {
2363		/* For local routes that require a particular output interface
2364		 * we do not want to cache the result.  Caching the result
2365		 * causes incorrect behaviour when there are multiple source
2366		 * addresses on the interface, the end result being that if the
2367		 * intended recipient is waiting on that interface for the
2368		 * packet he won't receive it because it will be delivered on
2369		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2370		 * be set to the loopback interface as well.
2371		 */
2372		do_cache = false;
2373	}
2374
2375	fnhe = NULL;
2376	do_cache &= fi != NULL;
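	/* Cached output routes live either in a next-hop exception (when a
	 * per-destination override such as a learned PMTU exists) or in the
	 * per-cpu cache hanging off the next hop itself.
	 */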
2377	if (fi) {
2378		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2379		struct rtable __rcu **prth;
2380
2381		fnhe = find_exception(nhc, fl4->daddr);
2382		if (!do_cache)
2383			goto add;
2384		if (fnhe) {
2385			prth = &fnhe->fnhe_rth_output;
2386		} else {
2387			if (unlikely(fl4->flowi4_flags &
2388				     FLOWI_FLAG_KNOWN_NH &&
2389				     !(nhc->nhc_gw_family &&
2390				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2391				do_cache = false;
2392				goto add;
2393			}
2394			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2395		}
2396		rth = rcu_dereference(*prth);
2397		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2398			return rth;
2399	}
2400
2401add:
2402	rth = rt_dst_alloc(dev_out, flags, type,
2403			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2404			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2405			   do_cache);
2406	if (!rth)
2407		return ERR_PTR(-ENOBUFS);
2408
2409	rth->rt_iif = orig_oif;
2410
2411	RT_CACHE_STAT_INC(out_slow_tot);
2412
2413	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2414		if (flags & RTCF_LOCAL &&
2415		    !(dev_out->flags & IFF_LOOPBACK)) {
2416			rth->dst.output = ip_mc_output;
2417			RT_CACHE_STAT_INC(out_slow_mc);
2418		}
2419#ifdef CONFIG_IP_MROUTE
2420		if (type == RTN_MULTICAST) {
2421			if (IN_DEV_MFORWARD(in_dev) &&
2422			    !ipv4_is_local_multicast(fl4->daddr)) {
2423				rth->dst.input = ip_mr_input;
2424				rth->dst.output = ip_mc_output;
2425			}
2426		}
2427#endif
2428	}
2429
2430	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2431	lwtunnel_set_redirect(&rth->dst);
2432
2433	return rth;
2434}
2435
2436/*
2437 * Major route resolver routine.
2438 */
2439
2440struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2441					const struct sk_buff *skb)
2442{
2443	__u8 tos = RT_FL_TOS(fl4);
2444	struct fib_result res = {
2445		.type		= RTN_UNSPEC,
2446		.fi		= NULL,
2447		.table		= NULL,
2448		.tclassid	= 0,
2449	};
2450	struct rtable *rth;
2451
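	/* callers may OR RTO_ONLINK into flowi4_tos to force a link-scope
	 * lookup, i.e. to resolve only directly connected destinations
	 */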
2452	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2453	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2454	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2455			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2456
2457	rcu_read_lock();
2458	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2459	rcu_read_unlock();
2460
2461	return rth;
2462}
2463EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2464
2465struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2466					    struct fib_result *res,
2467					    const struct sk_buff *skb)
2468{
2469	struct net_device *dev_out = NULL;
2470	int orig_oif = fl4->flowi4_oif;
2471	unsigned int flags = 0;
2472	struct rtable *rth;
2473	int err;
2474
2475	if (fl4->saddr) {
2476		if (ipv4_is_multicast(fl4->saddr) ||
2477		    ipv4_is_lbcast(fl4->saddr) ||
2478		    ipv4_is_zeronet(fl4->saddr)) {
2479			rth = ERR_PTR(-EINVAL);
2480			goto out;
2481		}
2482
2483		rth = ERR_PTR(-ENETUNREACH);
2484
2485		/* I removed the check for oif == dev_out->oif here.
2486		   It was wrong for two reasons:
2487		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2488		      is assigned to multiple interfaces.
2489		   2. Moreover, we are allowed to send packets with the saddr
2490		      of another iface. --ANK
2491		 */
2492
2493		if (fl4->flowi4_oif == 0 &&
2494		    (ipv4_is_multicast(fl4->daddr) ||
2495		     ipv4_is_lbcast(fl4->daddr))) {
2496			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2497			dev_out = __ip_dev_find(net, fl4->saddr, false);
2498			if (!dev_out)
2499				goto out;
2500
2501			/* Special hack: the user can direct multicasts
2502			   and limited broadcast via the necessary interface
2503			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2504			   This hack is not just for fun, it allows
2505			   vic, vat and friends to work.
2506			   They bind the socket to loopback, set ttl to zero
2507			   and expect that it will work.
2508			   From the viewpoint of the routing cache they are broken,
2509			   because we are not allowed to build a multicast path
2510			   with a loopback source addr (look, the routing cache
2511			   cannot know that ttl is zero, so that the packet
2512			   will not leave this host and the route is valid).
2513			   Luckily, this hack is a good workaround.
2514			 */
2515
2516			fl4->flowi4_oif = dev_out->ifindex;
2517			goto make_route;
2518		}
2519
2520		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2521			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2522			if (!__ip_dev_find(net, fl4->saddr, false))
2523				goto out;
2524		}
2525	}
2526
2527
2528	if (fl4->flowi4_oif) {
2529		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2530		rth = ERR_PTR(-ENODEV);
2531		if (!dev_out)
2532			goto out;
2533
2534		/* RACE: Check return value of inet_select_addr instead. */
2535		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2536			rth = ERR_PTR(-ENETUNREACH);
2537			goto out;
2538		}
2539		if (ipv4_is_local_multicast(fl4->daddr) ||
2540		    ipv4_is_lbcast(fl4->daddr) ||
2541		    fl4->flowi4_proto == IPPROTO_IGMP) {
2542			if (!fl4->saddr)
2543				fl4->saddr = inet_select_addr(dev_out, 0,
2544							      RT_SCOPE_LINK);
2545			goto make_route;
2546		}
2547		if (!fl4->saddr) {
2548			if (ipv4_is_multicast(fl4->daddr))
2549				fl4->saddr = inet_select_addr(dev_out, 0,
2550							      fl4->flowi4_scope);
2551			else if (!fl4->daddr)
2552				fl4->saddr = inet_select_addr(dev_out, 0,
2553							      RT_SCOPE_HOST);
2554		}
2555	}
2556
2557	if (!fl4->daddr) {
2558		fl4->daddr = fl4->saddr;
2559		if (!fl4->daddr)
2560			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2561		dev_out = net->loopback_dev;
2562		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2563		res->type = RTN_LOCAL;
2564		flags |= RTCF_LOCAL;
2565		goto make_route;
2566	}
2567
2568	err = fib_lookup(net, fl4, res, 0);
2569	if (err) {
2570		res->fi = NULL;
2571		res->table = NULL;
2572		if (fl4->flowi4_oif &&
2573		    (ipv4_is_multicast(fl4->daddr) ||
2574		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2575			/* Apparently, the routing tables are wrong. Assume
2576			   that the destination is on link.
2577
2578			   WHY? DW.
2579			   Because we are allowed to send to an iface
2580			   even if it has NO routes and NO assigned
2581			   addresses. When oif is specified, the routing
2582			   tables are looked up with only one purpose:
2583			   to catch whether the destination is gatewayed rather than
2584			   direct. Moreover, if MSG_DONTROUTE is set,
2585			   we send the packet, ignoring both routing tables
2586			   and ifaddr state. --ANK
2587
2588
2589			   We could do it even if oif is unknown
2590			   (IPv6 likely does), but we do not.
2591			 */
2592
2593			if (fl4->saddr == 0)
2594				fl4->saddr = inet_select_addr(dev_out, 0,
2595							      RT_SCOPE_LINK);
2596			res->type = RTN_UNICAST;
2597			goto make_route;
2598		}
2599		rth = ERR_PTR(err);
2600		goto out;
2601	}
2602
2603	if (res->type == RTN_LOCAL) {
2604		if (!fl4->saddr) {
2605			if (res->fi->fib_prefsrc)
2606				fl4->saddr = res->fi->fib_prefsrc;
2607			else
2608				fl4->saddr = fl4->daddr;
2609		}
2610
2611		/* L3 master device is the loopback for that domain */
2612		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2613			net->loopback_dev;
2614
2615		/* make sure orig_oif points to fib result device even
2616		 * though packet rx/tx happens over loopback or l3mdev
2617		 */
2618		orig_oif = FIB_RES_OIF(*res);
2619
2620		fl4->flowi4_oif = dev_out->ifindex;
2621		flags |= RTCF_LOCAL;
2622		goto make_route;
2623	}
2624
2625	fib_select_path(net, res, fl4, skb);
2626
2627	dev_out = FIB_RES_DEV(*res);
2628	fl4->flowi4_oif = dev_out->ifindex;
2629
2630
2631make_route:
2632	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2633
2634out:
2635	return rth;
2636}
2637
2638static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2639{
2640	return NULL;
2641}
2642
2643static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2644{
2645	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2646
2647	return mtu ? : dst->dev->mtu;
2648}
2649
2650static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2651					  struct sk_buff *skb, u32 mtu)
2652{
2653}
2654
2655static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2656				       struct sk_buff *skb)
2657{
2658}
2659
2660static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2661					  unsigned long old)
2662{
2663	return NULL;
2664}
2665
2666static struct dst_ops ipv4_dst_blackhole_ops = {
2667	.family			=	AF_INET,
2668	.check			=	ipv4_blackhole_dst_check,
2669	.mtu			=	ipv4_blackhole_mtu,
2670	.default_advmss		=	ipv4_default_advmss,
2671	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2672	.redirect		=	ipv4_rt_blackhole_redirect,
2673	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2674	.neigh_lookup		=	ipv4_neigh_lookup,
2675};
2676
2677struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2678{
2679	struct rtable *ort = (struct rtable *) dst_orig;
2680	struct rtable *rt;
2681
2682	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2683	if (rt) {
2684		struct dst_entry *new = &rt->dst;
2685
2686		new->__use = 1;
2687		new->input = dst_discard;
2688		new->output = dst_discard_out;
2689
2690		new->dev = net->loopback_dev;
2691		if (new->dev)
2692			dev_hold(new->dev);
2693
2694		rt->rt_is_input = ort->rt_is_input;
2695		rt->rt_iif = ort->rt_iif;
2696		rt->rt_pmtu = ort->rt_pmtu;
2697		rt->rt_mtu_locked = ort->rt_mtu_locked;
2698
2699		rt->rt_genid = rt_genid_ipv4(net);
2700		rt->rt_flags = ort->rt_flags;
2701		rt->rt_type = ort->rt_type;
2702		rt->rt_uses_gateway = ort->rt_uses_gateway;
2703		rt->rt_gw_family = ort->rt_gw_family;
2704		if (rt->rt_gw_family == AF_INET)
2705			rt->rt_gw4 = ort->rt_gw4;
2706		else if (rt->rt_gw_family == AF_INET6)
2707			rt->rt_gw6 = ort->rt_gw6;
2708
2709		INIT_LIST_HEAD(&rt->rt_uncached);
2710	}
2711
2712	dst_release(dst_orig);
2713
2714	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2715}
2716
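/* ipv4_blackhole_route() above is used by xfrm's make_blackhole(): it copies
 * an existing route into a dst that silently discards packets, so callers get
 * a usable dst (rather than an error) while, e.g., IPsec state resolution is
 * still pending.
 */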
2717struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2718				    const struct sock *sk)
2719{
2720	struct rtable *rt = __ip_route_output_key(net, flp4);
2721
2722	if (IS_ERR(rt))
2723		return rt;
2724
2725	if (flp4->flowi4_proto)
2726		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2727							flowi4_to_flowi(flp4),
2728							sk, 0);
2729
2730	return rt;
2731}
2732EXPORT_SYMBOL_GPL(ip_route_output_flow);
2733
2734/* called with rcu_read_lock held */
2735static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2736			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2737			struct sk_buff *skb, u32 portid, u32 seq,
2738			unsigned int flags)
2739{
2740	struct rtmsg *r;
2741	struct nlmsghdr *nlh;
2742	unsigned long expires = 0;
2743	u32 error;
2744	u32 metrics[RTAX_MAX];
2745
2746	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2747	if (!nlh)
2748		return -EMSGSIZE;
2749
2750	r = nlmsg_data(nlh);
2751	r->rtm_family	 = AF_INET;
2752	r->rtm_dst_len	= 32;
2753	r->rtm_src_len	= 0;
2754	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2755	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2756	if (nla_put_u32(skb, RTA_TABLE, table_id))
2757		goto nla_put_failure;
2758	r->rtm_type	= rt->rt_type;
2759	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2760	r->rtm_protocol = RTPROT_UNSPEC;
2761	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2762	if (rt->rt_flags & RTCF_NOTIFY)
2763		r->rtm_flags |= RTM_F_NOTIFY;
2764	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2765		r->rtm_flags |= RTCF_DOREDIRECT;
2766
2767	if (nla_put_in_addr(skb, RTA_DST, dst))
2768		goto nla_put_failure;
2769	if (src) {
2770		r->rtm_src_len = 32;
2771		if (nla_put_in_addr(skb, RTA_SRC, src))
2772			goto nla_put_failure;
2773	}
2774	if (rt->dst.dev &&
2775	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2776		goto nla_put_failure;
2777#ifdef CONFIG_IP_ROUTE_CLASSID
2778	if (rt->dst.tclassid &&
2779	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2780		goto nla_put_failure;
2781#endif
2782	if (fl4 && !rt_is_input_route(rt) &&
2783	    fl4->saddr != src) {
2784		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2785			goto nla_put_failure;
2786	}
2787	if (rt->rt_uses_gateway) {
2788		if (rt->rt_gw_family == AF_INET &&
2789		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2790			goto nla_put_failure;
2791		} else if (rt->rt_gw_family == AF_INET6) {
2792			int alen = sizeof(struct in6_addr);
2793			struct nlattr *nla;
2794			struct rtvia *via;
2795
2796			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2797			if (!nla)
2798				goto nla_put_failure;
2799
2800			via = nla_data(nla);
2801			via->rtvia_family = AF_INET6;
2802			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2803		}
2804	}
2805
2806	expires = rt->dst.expires;
2807	if (expires) {
2808		unsigned long now = jiffies;
2809
2810		if (time_before(now, expires))
2811			expires -= now;
2812		else
2813			expires = 0;
2814	}
2815
2816	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2817	if (rt->rt_pmtu && expires)
2818		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2819	if (rt->rt_mtu_locked && expires)
2820		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2821	if (rtnetlink_put_metrics(skb, metrics) < 0)
2822		goto nla_put_failure;
2823
2824	if (fl4) {
2825		if (fl4->flowi4_mark &&
2826		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2827			goto nla_put_failure;
2828
2829		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2830		    nla_put_u32(skb, RTA_UID,
2831				from_kuid_munged(current_user_ns(),
2832						 fl4->flowi4_uid)))
2833			goto nla_put_failure;
2834
2835		if (rt_is_input_route(rt)) {
2836#ifdef CONFIG_IP_MROUTE
2837			if (ipv4_is_multicast(dst) &&
2838			    !ipv4_is_local_multicast(dst) &&
2839			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2840				int err = ipmr_get_route(net, skb,
2841							 fl4->saddr, fl4->daddr,
2842							 r, portid);
2843
2844				if (err <= 0) {
2845					if (err == 0)
2846						return 0;
2847					goto nla_put_failure;
2848				}
2849			} else
2850#endif
2851				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2852					goto nla_put_failure;
2853		}
2854	}
2855
2856	error = rt->dst.error;
2857
2858	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2859		goto nla_put_failure;
2860
2861	nlmsg_end(skb, nlh);
2862	return 0;
2863
2864nla_put_failure:
2865	nlmsg_cancel(skb, nlh);
2866	return -EMSGSIZE;
2867}
2868
2869static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2870			    struct netlink_callback *cb, u32 table_id,
2871			    struct fnhe_hash_bucket *bucket, int genid,
2872			    int *fa_index, int fa_start, unsigned int flags)
2873{
2874	int i;
2875
2876	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2877		struct fib_nh_exception *fnhe;
2878
2879		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2880		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2881			struct rtable *rt;
2882			int err;
2883
2884			if (*fa_index < fa_start)
2885				goto next;
2886
2887			if (fnhe->fnhe_genid != genid)
2888				goto next;
2889
2890			if (fnhe->fnhe_expires &&
2891			    time_after(jiffies, fnhe->fnhe_expires))
2892				goto next;
2893
2894			rt = rcu_dereference(fnhe->fnhe_rth_input);
2895			if (!rt)
2896				rt = rcu_dereference(fnhe->fnhe_rth_output);
2897			if (!rt)
2898				goto next;
2899
2900			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2901					   table_id, NULL, skb,
2902					   NETLINK_CB(cb->skb).portid,
2903					   cb->nlh->nlmsg_seq, flags);
2904			if (err)
2905				return err;
2906next:
2907			(*fa_index)++;
2908		}
2909	}
2910
2911	return 0;
2912}
2913
2914int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2915		       u32 table_id, struct fib_info *fi,
2916		       int *fa_index, int fa_start, unsigned int flags)
2917{
2918	struct net *net = sock_net(cb->skb->sk);
2919	int nhsel, genid = fnhe_genid(net);
2920
2921	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2922		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2923		struct fnhe_hash_bucket *bucket;
2924		int err;
2925
2926		if (nhc->nhc_flags & RTNH_F_DEAD)
2927			continue;
2928
2929		rcu_read_lock();
2930		bucket = rcu_dereference(nhc->nhc_exceptions);
2931		err = 0;
2932		if (bucket)
2933			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2934					       genid, fa_index, fa_start,
2935					       flags);
2936		rcu_read_unlock();
2937		if (err)
2938			return err;
2939	}
2940
2941	return 0;
2942}
2943
2944static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2945						   u8 ip_proto, __be16 sport,
2946						   __be16 dport)
2947{
2948	struct sk_buff *skb;
2949	struct iphdr *iph;
2950
2951	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2952	if (!skb)
2953		return NULL;
2954
2955	/* Reserve room for dummy headers; this skb can pass
2956	 * through a good chunk of the routing engine.
2957	 */
2958	skb_reset_mac_header(skb);
2959	skb_reset_network_header(skb);
2960	skb->protocol = htons(ETH_P_IP);
2961	iph = skb_put(skb, sizeof(struct iphdr));
2962	iph->protocol = ip_proto;
2963	iph->saddr = src;
2964	iph->daddr = dst;
2965	iph->version = 0x4;
2966	iph->frag_off = 0;
2967	iph->ihl = 0x5;
2968	skb_set_transport_header(skb, skb->len);
2969
2970	switch (iph->protocol) {
2971	case IPPROTO_UDP: {
2972		struct udphdr *udph;
2973
2974		udph = skb_put_zero(skb, sizeof(struct udphdr));
2975		udph->source = sport;
2976		udph->dest = dport;
2977		udph->len = sizeof(struct udphdr);
2978		udph->check = 0;
2979		break;
2980	}
2981	case IPPROTO_TCP: {
2982		struct tcphdr *tcph;
2983
2984		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2985		tcph->source	= sport;
2986		tcph->dest	= dport;
2987		tcph->doff	= sizeof(struct tcphdr) / 4;
2988		tcph->rst = 1;
2989		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2990					    src, dst, 0);
2991		break;
2992	}
2993	case IPPROTO_ICMP: {
2994		struct icmphdr *icmph;
2995
2996		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2997		icmph->type = ICMP_ECHO;
2998		icmph->code = 0;
2999	}
3000	}
3001
3002	return skb;
3003}
3004
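/* The synthetic skb built above lets RTM_GETROUTE requests (e.g.
 * "ip route get 192.0.2.1 ipproto udp sport 1234 dport 53") exercise the
 * same lookup paths as a real packet, including multipath hashing over the
 * supplied L4 ports.
 */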
3005static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3006				       const struct nlmsghdr *nlh,
3007				       struct nlattr **tb,
3008				       struct netlink_ext_ack *extack)
3009{
3010	struct rtmsg *rtm;
3011	int i, err;
3012
3013	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3014		NL_SET_ERR_MSG(extack,
3015			       "ipv4: Invalid header for route get request");
3016		return -EINVAL;
3017	}
3018
3019	if (!netlink_strict_get_check(skb))
3020		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3021					      rtm_ipv4_policy, extack);
3022
3023	rtm = nlmsg_data(nlh);
3024	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3025	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3026	    rtm->rtm_table || rtm->rtm_protocol ||
3027	    rtm->rtm_scope || rtm->rtm_type) {
3028		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3029		return -EINVAL;
3030	}
3031
3032	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3033			       RTM_F_LOOKUP_TABLE |
3034			       RTM_F_FIB_MATCH)) {
3035		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3036		return -EINVAL;
3037	}
3038
3039	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3040					    rtm_ipv4_policy, extack);
3041	if (err)
3042		return err;
3043
3044	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3045	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3046		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3047		return -EINVAL;
3048	}
3049
3050	for (i = 0; i <= RTA_MAX; i++) {
3051		if (!tb[i])
3052			continue;
3053
3054		switch (i) {
3055		case RTA_IIF:
3056		case RTA_OIF:
3057		case RTA_SRC:
3058		case RTA_DST:
3059		case RTA_IP_PROTO:
3060		case RTA_SPORT:
3061		case RTA_DPORT:
3062		case RTA_MARK:
3063		case RTA_UID:
3064			break;
3065		default:
3066			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3067			return -EINVAL;
3068		}
3069	}
3070
3071	return 0;
3072}
3073
3074static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3075			     struct netlink_ext_ack *extack)
3076{
3077	struct net *net = sock_net(in_skb->sk);
3078	struct nlattr *tb[RTA_MAX+1];
3079	u32 table_id = RT_TABLE_MAIN;
3080	__be16 sport = 0, dport = 0;
3081	struct fib_result res = {};
3082	u8 ip_proto = IPPROTO_UDP;
3083	struct rtable *rt = NULL;
3084	struct sk_buff *skb;
3085	struct rtmsg *rtm;
3086	struct flowi4 fl4 = {};
3087	__be32 dst = 0;
3088	__be32 src = 0;
3089	kuid_t uid;
3090	u32 iif;
3091	int err;
3092	int mark;
3093
3094	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3095	if (err < 0)
3096		return err;
3097
3098	rtm = nlmsg_data(nlh);
3099	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3100	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3101	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3102	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3103	if (tb[RTA_UID])
3104		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3105	else
3106		uid = (iif ? INVALID_UID : current_uid());
3107
3108	if (tb[RTA_IP_PROTO]) {
3109		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3110						  &ip_proto, AF_INET, extack);
3111		if (err)
3112			return err;
3113	}
3114
3115	if (tb[RTA_SPORT])
3116		sport = nla_get_be16(tb[RTA_SPORT]);
3117
3118	if (tb[RTA_DPORT])
3119		dport = nla_get_be16(tb[RTA_DPORT]);
3120
3121	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3122	if (!skb)
3123		return -ENOBUFS;
3124
3125	fl4.daddr = dst;
3126	fl4.saddr = src;
3127	fl4.flowi4_tos = rtm->rtm_tos;
3128	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3129	fl4.flowi4_mark = mark;
3130	fl4.flowi4_uid = uid;
3131	if (sport)
3132		fl4.fl4_sport = sport;
3133	if (dport)
3134		fl4.fl4_dport = dport;
3135	fl4.flowi4_proto = ip_proto;
3136
3137	rcu_read_lock();
3138
3139	if (iif) {
3140		struct net_device *dev;
3141
3142		dev = dev_get_by_index_rcu(net, iif);
3143		if (!dev) {
3144			err = -ENODEV;
3145			goto errout_rcu;
3146		}
3147
3148		fl4.flowi4_iif = iif; /* for rt_fill_info */
3149		skb->dev	= dev;
3150		skb->mark	= mark;
3151		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3152					 dev, &res);
3153
3154		rt = skb_rtable(skb);
3155		if (err == 0 && rt->dst.error)
3156			err = -rt->dst.error;
3157	} else {
3158		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3159		skb->dev = net->loopback_dev;
3160		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3161		err = 0;
3162		if (IS_ERR(rt))
3163			err = PTR_ERR(rt);
3164		else
3165			skb_dst_set(skb, &rt->dst);
3166	}
3167
3168	if (err)
3169		goto errout_rcu;
3170
3171	if (rtm->rtm_flags & RTM_F_NOTIFY)
3172		rt->rt_flags |= RTCF_NOTIFY;
3173
3174	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3175		table_id = res.table ? res.table->tb_id : 0;
3176
3177	/* reset skb for netlink reply msg */
3178	skb_trim(skb, 0);
3179	skb_reset_network_header(skb);
3180	skb_reset_transport_header(skb);
3181	skb_reset_mac_header(skb);
3182
3183	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3184		if (!res.fi) {
3185			err = fib_props[res.type].error;
3186			if (!err)
3187				err = -EHOSTUNREACH;
3188			goto errout_rcu;
3189		}
3190		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3191				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3192				    rt->rt_type, res.prefix, res.prefixlen,
3193				    fl4.flowi4_tos, res.fi, 0);
3194	} else {
3195		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3196				   NETLINK_CB(in_skb).portid,
3197				   nlh->nlmsg_seq, 0);
3198	}
3199	if (err < 0)
3200		goto errout_rcu;
3201
3202	rcu_read_unlock();
3203
3204	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3205
3206errout_free:
3207	return err;
3208errout_rcu:
3209	rcu_read_unlock();
3210	kfree_skb(skb);
3211	goto errout_free;
3212}
3213
3214void ip_rt_multicast_event(struct in_device *in_dev)
3215{
3216	rt_cache_flush(dev_net(in_dev->dev));
3217}
3218
3219#ifdef CONFIG_SYSCTL
3220static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3221static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3222static int ip_rt_gc_elasticity __read_mostly	= 8;
3223static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3224
3225static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3226					void __user *buffer,
3227					size_t *lenp, loff_t *ppos)
3228{
3229	struct net *net = (struct net *)__ctl->extra1;
3230
3231	if (write) {
3232		rt_cache_flush(net);
3233		fnhe_genid_bump(net);
3234		return 0;
3235	}
3236
3237	return -EINVAL;
3238}
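/* Writing any value to /proc/sys/net/ipv4/route/flush (e.g.
 * "sysctl -w net.ipv4.route.flush=1") invalidates all cached routes and
 * next-hop exceptions in the netns by bumping the generation counters.
 */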
3239
3240static struct ctl_table ipv4_route_table[] = {
3241	{
3242		.procname	= "gc_thresh",
3243		.data		= &ipv4_dst_ops.gc_thresh,
3244		.maxlen		= sizeof(int),
3245		.mode		= 0644,
3246		.proc_handler	= proc_dointvec,
3247	},
3248	{
3249		.procname	= "max_size",
3250		.data		= &ip_rt_max_size,
3251		.maxlen		= sizeof(int),
3252		.mode		= 0644,
3253		.proc_handler	= proc_dointvec,
3254	},
3255	{
3256		/*  Deprecated. Use gc_min_interval_ms */
3257
3258		.procname	= "gc_min_interval",
3259		.data		= &ip_rt_gc_min_interval,
3260		.maxlen		= sizeof(int),
3261		.mode		= 0644,
3262		.proc_handler	= proc_dointvec_jiffies,
3263	},
3264	{
3265		.procname	= "gc_min_interval_ms",
3266		.data		= &ip_rt_gc_min_interval,
3267		.maxlen		= sizeof(int),
3268		.mode		= 0644,
3269		.proc_handler	= proc_dointvec_ms_jiffies,
3270	},
3271	{
3272		.procname	= "gc_timeout",
3273		.data		= &ip_rt_gc_timeout,
3274		.maxlen		= sizeof(int),
3275		.mode		= 0644,
3276		.proc_handler	= proc_dointvec_jiffies,
3277	},
3278	{
3279		.procname	= "gc_interval",
3280		.data		= &ip_rt_gc_interval,
3281		.maxlen		= sizeof(int),
3282		.mode		= 0644,
3283		.proc_handler	= proc_dointvec_jiffies,
3284	},
3285	{
3286		.procname	= "redirect_load",
3287		.data		= &ip_rt_redirect_load,
3288		.maxlen		= sizeof(int),
3289		.mode		= 0644,
3290		.proc_handler	= proc_dointvec,
3291	},
3292	{
3293		.procname	= "redirect_number",
3294		.data		= &ip_rt_redirect_number,
3295		.maxlen		= sizeof(int),
3296		.mode		= 0644,
3297		.proc_handler	= proc_dointvec,
3298	},
3299	{
3300		.procname	= "redirect_silence",
3301		.data		= &ip_rt_redirect_silence,
3302		.maxlen		= sizeof(int),
3303		.mode		= 0644,
3304		.proc_handler	= proc_dointvec,
3305	},
3306	{
3307		.procname	= "error_cost",
3308		.data		= &ip_rt_error_cost,
3309		.maxlen		= sizeof(int),
3310		.mode		= 0644,
3311		.proc_handler	= proc_dointvec,
3312	},
3313	{
3314		.procname	= "error_burst",
3315		.data		= &ip_rt_error_burst,
3316		.maxlen		= sizeof(int),
3317		.mode		= 0644,
3318		.proc_handler	= proc_dointvec,
3319	},
3320	{
3321		.procname	= "gc_elasticity",
3322		.data		= &ip_rt_gc_elasticity,
3323		.maxlen		= sizeof(int),
3324		.mode		= 0644,
3325		.proc_handler	= proc_dointvec,
3326	},
3327	{
3328		.procname	= "mtu_expires",
3329		.data		= &ip_rt_mtu_expires,
3330		.maxlen		= sizeof(int),
3331		.mode		= 0644,
3332		.proc_handler	= proc_dointvec_jiffies,
3333	},
3334	{
3335		.procname	= "min_pmtu",
3336		.data		= &ip_rt_min_pmtu,
3337		.maxlen		= sizeof(int),
3338		.mode		= 0644,
3339		.proc_handler	= proc_dointvec_minmax,
3340		.extra1		= &ip_min_valid_pmtu,
3341	},
3342	{
3343		.procname	= "min_adv_mss",
3344		.data		= &ip_rt_min_advmss,
3345		.maxlen		= sizeof(int),
3346		.mode		= 0644,
3347		.proc_handler	= proc_dointvec,
3348	},
3349	{ }
3350};
3351
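/* The table above is registered once for init_net (see
 * ip_static_sysctl_init() at the bottom of this file), so these tunables
 * are global; only the "flush" entry below gets a per-netns copy.
 */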
3352static const char ipv4_route_flush_procname[] = "flush";
3353
3354static struct ctl_table ipv4_route_flush_table[] = {
3355	{
3356		.procname	= ipv4_route_flush_procname,
3357		.maxlen		= sizeof(int),
3358		.mode		= 0200,
3359		.proc_handler	= ipv4_sysctl_rtcache_flush,
3360	},
3361	{ },
3362};
3363
3364static __net_init int sysctl_route_net_init(struct net *net)
3365{
3366	struct ctl_table *tbl;
3367
3368	tbl = ipv4_route_flush_table;
3369	if (!net_eq(net, &init_net)) {
3370		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3371		if (!tbl)
3372			goto err_dup;
3373
3374		/* Don't export non-whitelisted sysctls to unprivileged users */
3375		if (net->user_ns != &init_user_ns) {
3376			if (tbl[0].procname != ipv4_route_flush_procname)
3377				tbl[0].procname = NULL;
3378		}
3379	}
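	/* extra1 carries the owning netns to ipv4_sysctl_rtcache_flush() */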
3380	tbl[0].extra1 = net;
3381
3382	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3383	if (!net->ipv4.route_hdr)
3384		goto err_reg;
3385	return 0;
3386
3387err_reg:
3388	if (tbl != ipv4_route_flush_table)
3389		kfree(tbl);
3390err_dup:
3391	return -ENOMEM;
3392}
3393
3394static __net_exit void sysctl_route_net_exit(struct net *net)
3395{
3396	struct ctl_table *tbl;
3397
3398	tbl = net->ipv4.route_hdr->ctl_table_arg;
3399	unregister_net_sysctl_table(net->ipv4.route_hdr);
3400	BUG_ON(tbl == ipv4_route_flush_table);
3401	kfree(tbl);
3402}
3403
3404static __net_initdata struct pernet_operations sysctl_route_ops = {
3405	.init = sysctl_route_net_init,
3406	.exit = sysctl_route_net_exit,
3407};
3408#endif
3409
3410static __net_init int rt_genid_init(struct net *net)
3411{
3412	atomic_set(&net->ipv4.rt_genid, 0);
3413	atomic_set(&net->fnhe_genid, 0);
3414	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3415	return 0;
3416}
3417
3418static __net_initdata struct pernet_operations rt_genid_ops = {
3419	.init = rt_genid_init,
3420};
3421
3422static int __net_init ipv4_inetpeer_init(struct net *net)
3423{
3424	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3425
3426	if (!bp)
3427		return -ENOMEM;
3428	inet_peer_base_init(bp);
3429	net->ipv4.peers = bp;
3430	return 0;
3431}
3432
3433static void __net_exit ipv4_inetpeer_exit(struct net *net)
3434{
3435	struct inet_peer_base *bp = net->ipv4.peers;
3436
3437	net->ipv4.peers = NULL;
3438	inetpeer_invalidate_tree(bp);
3439	kfree(bp);
3440}
3441
3442static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3443	.init	=	ipv4_inetpeer_init,
3444	.exit	=	ipv4_inetpeer_exit,
3445};
3446
3447#ifdef CONFIG_IP_ROUTE_CLASSID
3448struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3449#endif /* CONFIG_IP_ROUTE_CLASSID */
3450
3451int __init ip_rt_init(void)
3452{
3453	int cpu;
3454
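	/* ip_idents and ip_tstamps back the IP ID generator used by
	 * __ip_select_ident(); ip_idents is seeded with random data so the
	 * ID sequences do not start out predictable.
	 */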
3455	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3456				  GFP_KERNEL);
3457	if (!ip_idents)
3458		panic("IP: failed to allocate ip_idents\n");
3459
3460	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3461
3462	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3463	if (!ip_tstamps)
3464		panic("IP: failed to allocate ip_tstamps\n");
3465
3466	for_each_possible_cpu(cpu) {
3467		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3468
3469		INIT_LIST_HEAD(&ul->head);
3470		spin_lock_init(&ul->lock);
3471	}
3472#ifdef CONFIG_IP_ROUTE_CLASSID
3473	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3474	if (!ip_rt_acct)
3475		panic("IP: failed to allocate ip_rt_acct\n");
3476#endif
3477
3478	ipv4_dst_ops.kmem_cachep =
3479		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3480				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3481
3482	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3483
3484	if (dst_entries_init(&ipv4_dst_ops) < 0)
3485		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3486
3487	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3488		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3489
3490	ipv4_dst_ops.gc_thresh = ~0;
3491	ip_rt_max_size = INT_MAX;
3492
3493	devinet_init();
3494	ip_fib_init();
3495
3496	if (ip_rt_proc_init())
3497		pr_err("Unable to create route proc files\n");
3498#ifdef CONFIG_XFRM
3499	xfrm_init();
3500	xfrm4_init();
3501#endif
3502	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3503		      RTNL_FLAG_DOIT_UNLOCKED);
3504
3505#ifdef CONFIG_SYSCTL
3506	register_pernet_subsys(&sysctl_route_ops);
3507#endif
3508	register_pernet_subsys(&rt_genid_ops);
3509	register_pernet_subsys(&ipv4_inetpeer_ops);
3510	return 0;
3511}
3512
3513#ifdef CONFIG_SYSCTL
3514/*
3515 * We really need to sanitize the damn ipv4 init order, then all
3516 * this nonsense will go away.
3517 */
3518void __init ip_static_sysctl_init(void)
3519{
3520	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3521}
3522#endif
v6.8
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		ROUTE - implementation of the IP router.
   8 *
   9 * Authors:	Ross Biro
  10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14 *
  15 * Fixes:
  16 *		Alan Cox	:	Verify area fixes.
  17 *		Alan Cox	:	cli() protects routing changes
  18 *		Rui Oliveira	:	ICMP routing table updates
  19 *		(rco@di.uminho.pt)	Routing table insertion and update
  20 *		Linus Torvalds	:	Rewrote bits to be sensible
  21 *		Alan Cox	:	Added BSD route gw semantics
  22 *		Alan Cox	:	Super /proc >4K
  23 *		Alan Cox	:	MTU in route table
  24 *		Alan Cox	:	MSS actually. Also added the window
  25 *					clamper.
  26 *		Sam Lantinga	:	Fixed route matching in rt_del()
  27 *		Alan Cox	:	Routing cache support.
  28 *		Alan Cox	:	Removed compatibility cruft.
  29 *		Alan Cox	:	RTF_REJECT support.
  30 *		Alan Cox	:	TCP irtt support.
  31 *		Jonathan Naylor	:	Added Metric support.
  32 *	Miquel van Smoorenburg	:	BSD API fixes.
  33 *	Miquel van Smoorenburg	:	Metrics.
  34 *		Alan Cox	:	Use __u32 properly
  35 *		Alan Cox	:	Aligned routing errors more closely with BSD
  36 *					our system is still very different.
  37 *		Alan Cox	:	Faster /proc handling
  38 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  39 *					routing caches and better behaviour.
  40 *
  41 *		Olaf Erb	:	irtt wasn't being copied right.
  42 *		Bjorn Ekwall	:	Kerneld route support.
  43 *		Alan Cox	:	Multicast fixed (I hope)
  44 *		Pavel Krauz	:	Limited broadcast fixed
  45 *		Mike McLagan	:	Routing by source
  46 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  47 *					route.c and rewritten from scratch.
  48 *		Andi Kleen	:	Load-limit warning messages.
  49 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  50 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  51 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  52 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  53 *		Marc Boucher	:	routing by fwmark
  54 *	Robert Olsson		:	Added rt_cache statistics
  55 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  56 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  57 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  58 *	Ilia Sotnikov		:	Removed TOS from hash calculations
  59 */
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
 
  64#include <linux/bitops.h>
 
  65#include <linux/kernel.h>
  66#include <linux/mm.h>
  67#include <linux/memblock.h>
  68#include <linux/socket.h>
 
  69#include <linux/errno.h>
  70#include <linux/in.h>
  71#include <linux/inet.h>
  72#include <linux/netdevice.h>
  73#include <linux/proc_fs.h>
  74#include <linux/init.h>
  75#include <linux/skbuff.h>
  76#include <linux/inetdevice.h>
  77#include <linux/igmp.h>
  78#include <linux/pkt_sched.h>
  79#include <linux/mroute.h>
  80#include <linux/netfilter_ipv4.h>
  81#include <linux/random.h>
  82#include <linux/rcupdate.h>
 
  83#include <linux/slab.h>
  84#include <linux/jhash.h>
  85#include <net/dst.h>
  86#include <net/dst_metadata.h>
  87#include <net/inet_dscp.h>
  88#include <net/net_namespace.h>
 
  89#include <net/ip.h>
  90#include <net/route.h>
  91#include <net/inetpeer.h>
  92#include <net/sock.h>
  93#include <net/ip_fib.h>
  94#include <net/nexthop.h>
 
  95#include <net/tcp.h>
  96#include <net/icmp.h>
  97#include <net/xfrm.h>
  98#include <net/lwtunnel.h>
  99#include <net/netevent.h>
 100#include <net/rtnetlink.h>
 101#ifdef CONFIG_SYSCTL
 102#include <linux/sysctl.h>
 103#endif
 104#include <net/secure_seq.h>
 105#include <net/ip_tunnels.h>
 
 106
 107#include "fib_lookup.h"
 108
 109#define RT_FL_TOS(oldflp4) \
 110	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 111
 112#define RT_GC_TIMEOUT (300*HZ)
 113
 114#define DEFAULT_MIN_PMTU (512 + 20 + 20)
 115#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
 116#define DEFAULT_MIN_ADVMSS 256
 117static int ip_rt_max_size;
 118static int ip_rt_redirect_number __read_mostly	= 9;
 119static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 120static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 121static int ip_rt_error_cost __read_mostly	= HZ;
 122static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 
 
 
 123
 124static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 125
 126/*
 127 *	Interface to generic destination cache.
 128 */
 129
 130INDIRECT_CALLABLE_SCOPE
 131struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 132static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 133INDIRECT_CALLABLE_SCOPE
 134unsigned int		ipv4_mtu(const struct dst_entry *dst);
 135static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 136static void		 ipv4_link_failure(struct sk_buff *skb);
 137static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 138					   struct sk_buff *skb, u32 mtu,
 139					   bool confirm_neigh);
 140static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 141					struct sk_buff *skb);
 142static void		ipv4_dst_destroy(struct dst_entry *dst);
 143
 144static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 145{
 146	WARN_ON(1);
 147	return NULL;
 148}
 149
 150static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 151					   struct sk_buff *skb,
 152					   const void *daddr);
 153static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 154
 155static struct dst_ops ipv4_dst_ops = {
 156	.family =		AF_INET,
 157	.check =		ipv4_dst_check,
 158	.default_advmss =	ipv4_default_advmss,
 159	.mtu =			ipv4_mtu,
 160	.cow_metrics =		ipv4_cow_metrics,
 161	.destroy =		ipv4_dst_destroy,
 162	.negative_advice =	ipv4_negative_advice,
 163	.link_failure =		ipv4_link_failure,
 164	.update_pmtu =		ip_rt_update_pmtu,
 165	.redirect =		ip_do_redirect,
 166	.local_out =		__ip_local_out,
 167	.neigh_lookup =		ipv4_neigh_lookup,
 168	.confirm_neigh =	ipv4_confirm_neigh,
 169};
 170
 171#define ECN_OR_COST(class)	TC_PRIO_##class
 172
 173const __u8 ip_tos2prio[16] = {
 174	TC_PRIO_BESTEFFORT,
 175	ECN_OR_COST(BESTEFFORT),
 176	TC_PRIO_BESTEFFORT,
 177	ECN_OR_COST(BESTEFFORT),
 178	TC_PRIO_BULK,
 179	ECN_OR_COST(BULK),
 180	TC_PRIO_BULK,
 181	ECN_OR_COST(BULK),
 182	TC_PRIO_INTERACTIVE,
 183	ECN_OR_COST(INTERACTIVE),
 184	TC_PRIO_INTERACTIVE,
 185	ECN_OR_COST(INTERACTIVE),
 186	TC_PRIO_INTERACTIVE_BULK,
 187	ECN_OR_COST(INTERACTIVE_BULK),
 188	TC_PRIO_INTERACTIVE_BULK,
 189	ECN_OR_COST(INTERACTIVE_BULK)
 190};
 191EXPORT_SYMBOL(ip_tos2prio);
 192
 193static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 194#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 195
 196#ifdef CONFIG_PROC_FS
 197static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 198{
 199	if (*pos)
 200		return NULL;
 201	return SEQ_START_TOKEN;
 202}
 203
 204static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 205{
 206	++*pos;
 207	return NULL;
 208}
 209
 210static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 211{
 212}
 213
 214static int rt_cache_seq_show(struct seq_file *seq, void *v)
 215{
 216	if (v == SEQ_START_TOKEN)
 217		seq_printf(seq, "%-127s\n",
 218			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 219			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 220			   "HHUptod\tSpecDst");
 221	return 0;
 222}
 223
 224static const struct seq_operations rt_cache_seq_ops = {
 225	.start  = rt_cache_seq_start,
 226	.next   = rt_cache_seq_next,
 227	.stop   = rt_cache_seq_stop,
 228	.show   = rt_cache_seq_show,
 229};
 230
 
 
 
 
 
 
 
 
 
 
 
 
 
 231static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 232{
 233	int cpu;
 234
 235	if (*pos == 0)
 236		return SEQ_START_TOKEN;
 237
 238	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 239		if (!cpu_possible(cpu))
 240			continue;
 241		*pos = cpu+1;
 242		return &per_cpu(rt_cache_stat, cpu);
 243	}
 244	return NULL;
 245}
 246
 247static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 248{
 249	int cpu;
 250
 251	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 252		if (!cpu_possible(cpu))
 253			continue;
 254		*pos = cpu+1;
 255		return &per_cpu(rt_cache_stat, cpu);
 256	}
 257	(*pos)++;
 258	return NULL;
 259
 260}
 261
 262static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 263{
 264
 265}
 266
 267static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 268{
 269	struct rt_cache_stat *st = v;
 270
 271	if (v == SEQ_START_TOKEN) {
 272		seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 273		return 0;
 274	}
 275
 276	seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
 277			"%08x       %08x %08x     %08x    %08x %08x   "
 278			"%08x     %08x        %08x        %08x\n",
 279		   dst_entries_get_slow(&ipv4_dst_ops),
 280		   0, /* st->in_hit */
 281		   st->in_slow_tot,
 282		   st->in_slow_mc,
 283		   st->in_no_route,
 284		   st->in_brd,
 285		   st->in_martian_dst,
 286		   st->in_martian_src,
 287
 288		   0, /* st->out_hit */
 289		   st->out_slow_tot,
 290		   st->out_slow_mc,
 291
 292		   0, /* st->gc_total */
 293		   0, /* st->gc_ignored */
 294		   0, /* st->gc_goal_miss */
 295		   0, /* st->gc_dst_overflow */
 296		   0, /* st->in_hlist_search */
 297		   0  /* st->out_hlist_search */
 298		);
 299	return 0;
 300}
 301
 302static const struct seq_operations rt_cpu_seq_ops = {
 303	.start  = rt_cpu_seq_start,
 304	.next   = rt_cpu_seq_next,
 305	.stop   = rt_cpu_seq_stop,
 306	.show   = rt_cpu_seq_show,
 307};
 308
 
 
 
 
 
 
 
 
 
 
 
 
 
 309#ifdef CONFIG_IP_ROUTE_CLASSID
 310static int rt_acct_proc_show(struct seq_file *m, void *v)
 311{
 312	struct ip_rt_acct *dst, *src;
 313	unsigned int i, j;
 314
 315	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 316	if (!dst)
 317		return -ENOMEM;
 318
 319	for_each_possible_cpu(i) {
 320		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 321		for (j = 0; j < 256; j++) {
 322			dst[j].o_bytes   += src[j].o_bytes;
 323			dst[j].o_packets += src[j].o_packets;
 324			dst[j].i_bytes   += src[j].i_bytes;
 325			dst[j].i_packets += src[j].i_packets;
 326		}
 327	}
 328
 329	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 330	kfree(dst);
 331	return 0;
 332}
 333#endif
 334
 335static int __net_init ip_rt_do_proc_init(struct net *net)
 336{
 337	struct proc_dir_entry *pde;
 338
 339	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
 340			      &rt_cache_seq_ops);
 341	if (!pde)
 342		goto err1;
 343
 344	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
 345			      &rt_cpu_seq_ops);
 346	if (!pde)
 347		goto err2;
 348
 349#ifdef CONFIG_IP_ROUTE_CLASSID
 350	pde = proc_create_single("rt_acct", 0, net->proc_net,
 351			rt_acct_proc_show);
 352	if (!pde)
 353		goto err3;
 354#endif
 355	return 0;
 356
 357#ifdef CONFIG_IP_ROUTE_CLASSID
 358err3:
 359	remove_proc_entry("rt_cache", net->proc_net_stat);
 360#endif
 361err2:
 362	remove_proc_entry("rt_cache", net->proc_net);
 363err1:
 364	return -ENOMEM;
 365}
 366
 367static void __net_exit ip_rt_do_proc_exit(struct net *net)
 368{
 369	remove_proc_entry("rt_cache", net->proc_net_stat);
 370	remove_proc_entry("rt_cache", net->proc_net);
 371#ifdef CONFIG_IP_ROUTE_CLASSID
 372	remove_proc_entry("rt_acct", net->proc_net);
 373#endif
 374}
 375
 376static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 377	.init = ip_rt_do_proc_init,
 378	.exit = ip_rt_do_proc_exit,
 379};
 380
 381static int __init ip_rt_proc_init(void)
 382{
 383	return register_pernet_subsys(&ip_rt_proc_ops);
 384}
 385
 386#else
 387static inline int ip_rt_proc_init(void)
 388{
 389	return 0;
 390}
 391#endif /* CONFIG_PROC_FS */
 392
 393static inline bool rt_is_expired(const struct rtable *rth)
 394{
 395	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 396}
 397
 398void rt_cache_flush(struct net *net)
 399{
 400	rt_genid_bump_ipv4(net);
 401}
 402
 403static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 404					   struct sk_buff *skb,
 405					   const void *daddr)
 406{
 407	const struct rtable *rt = container_of(dst, struct rtable, dst);
 408	struct net_device *dev = dst->dev;
 409	struct neighbour *n;
 410
 411	rcu_read_lock();
 412
 413	if (likely(rt->rt_gw_family == AF_INET)) {
 414		n = ip_neigh_gw4(dev, rt->rt_gw4);
 415	} else if (rt->rt_gw_family == AF_INET6) {
 416		n = ip_neigh_gw6(dev, &rt->rt_gw6);
 417	} else {
 418		__be32 pkey;
 419
 420		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 421		n = ip_neigh_gw4(dev, pkey);
 422	}
 423
 424	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 425		n = NULL;
 426
 427	rcu_read_unlock();
 428
 429	return n;
 430}
 431
 432static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 433{
 434	const struct rtable *rt = container_of(dst, struct rtable, dst);
 435	struct net_device *dev = dst->dev;
 436	const __be32 *pkey = daddr;
 437
 438	if (rt->rt_gw_family == AF_INET) {
 439		pkey = (const __be32 *)&rt->rt_gw4;
 440	} else if (rt->rt_gw_family == AF_INET6) {
 441		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 442	} else if (!daddr ||
 443		 (rt->rt_flags &
 444		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 445		return;
 446	}
 447	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 448}
 449
 450/* Hash tables of size 2048..262144 depending on RAM size.
 451 * Each bucket uses 8 bytes.
 452 */
 453static u32 ip_idents_mask __read_mostly;
 454static atomic_t *ip_idents __read_mostly;
 455static u32 *ip_tstamps __read_mostly;
 456
 457/* In order to protect privacy, we add a perturbation to identifiers
 458 * if one generator is seldom used. This makes it hard for an attacker
 459 * to infer how many packets were sent between two points in time.
 460 */
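/* Mechanically: each bucket pairs an atomic ID counter with the jiffies
 * value of its last use.  If the bucket was not already used in the current
 * jiffy, a random delta below the elapsed idle time is folded into the
 * counter before the next block of "segs" IDs is reserved.
 */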
 461static u32 ip_idents_reserve(u32 hash, int segs)
 462{
 463	u32 bucket, old, now = (u32)jiffies;
 464	atomic_t *p_id;
 465	u32 *p_tstamp;
 466	u32 delta = 0;
 467
 468	bucket = hash & ip_idents_mask;
 469	p_tstamp = ip_tstamps + bucket;
 470	p_id = ip_idents + bucket;
 471	old = READ_ONCE(*p_tstamp);
 472
 473	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 474		delta = get_random_u32_below(now - old);
 475
 476	/* If UBSAN reports an error here, please make sure your compiler
 477	 * supports -fno-strict-overflow before reporting it; that was a bug
 478	 * in UBSAN, and it has been fixed in GCC-8.
 479	 */
 480	return atomic_add_return(segs + delta, p_id) - segs;
 481}
 482
 483void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 484{
 485	u32 hash, id;
 486
 487	/* Note the following code is not safe, but this is okay. */
 488	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 489		get_random_bytes(&net->ipv4.ip_id_key,
 490				 sizeof(net->ipv4.ip_id_key));
 491
 492	hash = siphash_3u32((__force u32)iph->daddr,
 493			    (__force u32)iph->saddr,
 494			    iph->protocol,
 495			    &net->ipv4.ip_id_key);
 496	id = ip_idents_reserve(hash, segs);
 497	iph->id = htons(id);
 498}
 499EXPORT_SYMBOL(__ip_select_ident);
 500
 501static void ip_rt_fix_tos(struct flowi4 *fl4)
 502{
 503	__u8 tos = RT_FL_TOS(fl4);
 504
 505	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
 506	if (tos & RTO_ONLINK)
 507		fl4->flowi4_scope = RT_SCOPE_LINK;
 508}
 509
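/* Build a flowi4 key from an IP header.  When a socket is supplied, its
 * bound device, mark, TOS, scope and protocol override the values passed
 * by the caller.
 */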
 510static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 511			     const struct sock *sk, const struct iphdr *iph,
 512			     int oif, __u8 tos, u8 prot, u32 mark,
 513			     int flow_flags)
 514{
 515	__u8 scope = RT_SCOPE_UNIVERSE;
 516
 517	if (sk) {
 518		oif = sk->sk_bound_dev_if;
 519		mark = READ_ONCE(sk->sk_mark);
 520		tos = ip_sock_rt_tos(sk);
 521		scope = ip_sock_rt_scope(sk);
 522		prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
 523						    sk->sk_protocol;
 524	}
 525
 526	flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
 527			   prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
 528			   sock_net_uid(net, sk));
 529}
 530
 531static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 532			       const struct sock *sk)
 533{
 534	const struct net *net = dev_net(skb->dev);
 535	const struct iphdr *iph = ip_hdr(skb);
 536	int oif = skb->dev->ifindex;
 537	u8 prot = iph->protocol;
 538	u32 mark = skb->mark;
 539	__u8 tos = iph->tos;
 540
 541	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 542}
 543
 544static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 545{
 546	const struct inet_sock *inet = inet_sk(sk);
 547	const struct ip_options_rcu *inet_opt;
 548	__be32 daddr = inet->inet_daddr;
 549
 550	rcu_read_lock();
 551	inet_opt = rcu_dereference(inet->inet_opt);
 552	if (inet_opt && inet_opt->opt.srr)
 553		daddr = inet_opt->opt.faddr;
 554	flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
 555			   ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
 556			   ip_sock_rt_scope(sk),
 557			   inet_test_bit(HDRINCL, sk) ?
 558				IPPROTO_RAW : sk->sk_protocol,
 559			   inet_sk_flowi_flags(sk),
 560			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 561	rcu_read_unlock();
 562}
 563
 564static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 565				 const struct sk_buff *skb)
 566{
 567	if (skb)
 568		build_skb_flow_key(fl4, skb, sk);
 569	else
 570		build_sk_flow_key(fl4, sk);
 571}
 572
 573static DEFINE_SPINLOCK(fnhe_lock);
 574
 575static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 576{
 577	struct rtable *rt;
 578
 579	rt = rcu_dereference(fnhe->fnhe_rth_input);
 580	if (rt) {
 581		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 582		dst_dev_put(&rt->dst);
 583		dst_release(&rt->dst);
 584	}
 585	rt = rcu_dereference(fnhe->fnhe_rth_output);
 586	if (rt) {
 587		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 588		dst_dev_put(&rt->dst);
 589		dst_release(&rt->dst);
 590	}
 591}
 592
 593static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 594{
 595	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 596	struct fib_nh_exception *fnhe, *oldest = NULL;
 597
 598	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 599		fnhe = rcu_dereference_protected(*fnhe_p,
 600						 lockdep_is_held(&fnhe_lock));
 601		if (!fnhe)
 602			break;
 603		if (!oldest ||
 604		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 605			oldest = fnhe;
 606			oldest_p = fnhe_p;
 607		}
 608	}
 609	fnhe_flush_routes(oldest);
 610	*oldest_p = oldest->fnhe_next;
 611	kfree_rcu(oldest, rcu);
 612}
 613
 614static u32 fnhe_hashfun(__be32 daddr)
 615{
 616	static siphash_aligned_key_t fnhe_hash_key;
 617	u64 hval;
 618
 619	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
 620	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
 621	return hash_64(hval, FNHE_HASH_SHIFT);
 622}
 623
 624static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 625{
 626	rt->rt_pmtu = fnhe->fnhe_pmtu;
 627	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 628	rt->dst.expires = fnhe->fnhe_expires;
 629
 630	if (fnhe->fnhe_gw) {
 631		rt->rt_flags |= RTCF_REDIRECTED;
 632		rt->rt_uses_gateway = 1;
 633		rt->rt_gw_family = AF_INET;
 634		rt->rt_gw4 = fnhe->fnhe_gw;
 635	}
 636}
 637
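/* Create or refresh a next-hop exception for daddr: a learned redirect
 * gateway and/or PMTU together with its expiry.  New entries are prepended
 * to the hash chain, the chain depth is capped at a randomized limit, and
 * routes already cached on the nexthop are marked obsolete so that they are
 * revalidated against the new exception.
 */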
 638static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 639				  __be32 gw, u32 pmtu, bool lock,
 640				  unsigned long expires)
 641{
 642	struct fnhe_hash_bucket *hash;
 643	struct fib_nh_exception *fnhe;
 644	struct rtable *rt;
 645	u32 genid, hval;
 646	unsigned int i;
 647	int depth;
 648
 649	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 650	hval = fnhe_hashfun(daddr);
 651
 652	spin_lock_bh(&fnhe_lock);
 653
 654	hash = rcu_dereference(nhc->nhc_exceptions);
 655	if (!hash) {
 656		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 657		if (!hash)
 658			goto out_unlock;
 659		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 660	}
 661
 662	hash += hval;
 663
 664	depth = 0;
 665	for (fnhe = rcu_dereference(hash->chain); fnhe;
 666	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 667		if (fnhe->fnhe_daddr == daddr)
 668			break;
 669		depth++;
 670	}
 671
 672	if (fnhe) {
 673		if (fnhe->fnhe_genid != genid)
 674			fnhe->fnhe_genid = genid;
 675		if (gw)
 676			fnhe->fnhe_gw = gw;
 677		if (pmtu) {
 678			fnhe->fnhe_pmtu = pmtu;
 679			fnhe->fnhe_mtu_locked = lock;
 680		}
 681		fnhe->fnhe_expires = max(1UL, expires);
 682		/* Update all cached dsts too */
 683		rt = rcu_dereference(fnhe->fnhe_rth_input);
 684		if (rt)
 685			fill_route_from_fnhe(rt, fnhe);
 686		rt = rcu_dereference(fnhe->fnhe_rth_output);
 687		if (rt)
 688			fill_route_from_fnhe(rt, fnhe);
 689	} else {
 690		/* Randomize max depth to avoid some side-channel attacks. */
 691		int max_depth = FNHE_RECLAIM_DEPTH +
 692				get_random_u32_below(FNHE_RECLAIM_DEPTH);
 693
 694		while (depth > max_depth) {
 695			fnhe_remove_oldest(hash);
 696			depth--;
 697		}
 698
 699		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 700		if (!fnhe)
 701			goto out_unlock;
 702
 703		fnhe->fnhe_next = hash->chain;
 704
 705		fnhe->fnhe_genid = genid;
 706		fnhe->fnhe_daddr = daddr;
 707		fnhe->fnhe_gw = gw;
 708		fnhe->fnhe_pmtu = pmtu;
 709		fnhe->fnhe_mtu_locked = lock;
 710		fnhe->fnhe_expires = max(1UL, expires);
 711
 712		rcu_assign_pointer(hash->chain, fnhe);
 713
 714		/* Exception created; mark the cached routes for the nexthop
 715		 * stale, so anyone caching them rechecks whether this
 716		 * exception applies.
 717		 */
 718		rt = rcu_dereference(nhc->nhc_rth_input);
 719		if (rt)
 720			rt->dst.obsolete = DST_OBSOLETE_KILL;
 721
 722		for_each_possible_cpu(i) {
 723			struct rtable __rcu **prt;
 724
 725			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 726			rt = rcu_dereference(*prt);
 727			if (rt)
 728				rt->dst.obsolete = DST_OBSOLETE_KILL;
 729		}
 730	}
 731
 732	fnhe->fnhe_stamp = jiffies;
 733
 734out_unlock:
 735	spin_unlock_bh(&fnhe_lock);
 736}
 737
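/* Handle an ICMP redirect.  It is honoured only if it comes from the route's
 * current gateway and advertises a sane on-link unicast gateway; the new
 * gateway is then recorded as a next-hop exception, and the old route may be
 * marked obsolete so it is looked up again.
 */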
 738static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 739			     bool kill_route)
 740{
 741	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 742	__be32 old_gw = ip_hdr(skb)->saddr;
 743	struct net_device *dev = skb->dev;
 744	struct in_device *in_dev;
 745	struct fib_result res;
 746	struct neighbour *n;
 747	struct net *net;
 748
 749	switch (icmp_hdr(skb)->code & 7) {
 750	case ICMP_REDIR_NET:
 751	case ICMP_REDIR_NETTOS:
 752	case ICMP_REDIR_HOST:
 753	case ICMP_REDIR_HOSTTOS:
 754		break;
 755
 756	default:
 757		return;
 758	}
 759
 760	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 761		return;
 762
 763	in_dev = __in_dev_get_rcu(dev);
 764	if (!in_dev)
 765		return;
 766
 767	net = dev_net(dev);
 768	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 769	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 770	    ipv4_is_zeronet(new_gw))
 771		goto reject_redirect;
 772
 773	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 774		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 775			goto reject_redirect;
 776		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 777			goto reject_redirect;
 778	} else {
 779		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 780			goto reject_redirect;
 781	}
 782
 783	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
 784	if (!n)
 785		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 786	if (!IS_ERR(n)) {
 787		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 788			neigh_event_send(n, NULL);
 789		} else {
 790			if (fib_lookup(net, fl4, &res, 0) == 0) {
 791				struct fib_nh_common *nhc;
 792
 793				fib_select_path(net, &res, fl4, skb);
 794				nhc = FIB_RES_NHC(res);
 795				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 796						0, false,
 797						jiffies + ip_rt_gc_timeout);
 798			}
 799			if (kill_route)
 800				rt->dst.obsolete = DST_OBSOLETE_KILL;
 801			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 802		}
 803		neigh_release(n);
 804	}
 805	return;
 806
 807reject_redirect:
 808#ifdef CONFIG_IP_ROUTE_VERBOSE
 809	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 810		const struct iphdr *iph = (const struct iphdr *) skb->data;
 811		__be32 daddr = iph->daddr;
 812		__be32 saddr = iph->saddr;
 813
 814		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 815				     "  Advised path = %pI4 -> %pI4\n",
 816				     &old_gw, dev->name, &new_gw,
 817				     &saddr, &daddr);
 818	}
 819#endif
 820	;
 821}
 822
 823static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 824{
 825	struct rtable *rt;
 826	struct flowi4 fl4;
 827	const struct iphdr *iph = (const struct iphdr *) skb->data;
 828	struct net *net = dev_net(skb->dev);
 829	int oif = skb->dev->ifindex;
 830	u8 prot = iph->protocol;
 831	u32 mark = skb->mark;
 832	__u8 tos = iph->tos;
 833
 834	rt = (struct rtable *) dst;
 835
 836	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 837	__ip_do_redirect(rt, skb, &fl4, true);
 838}
 839
 840static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 841{
 842	struct rtable *rt = (struct rtable *)dst;
 843	struct dst_entry *ret = dst;
 844
 845	if (rt) {
 846		if (dst->obsolete > 0) {
 847			ip_rt_put(rt);
 848			ret = NULL;
 849		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 850			   rt->dst.expires) {
 851			ip_rt_put(rt);
 852			ret = NULL;
 853		}
 854	}
 855	return ret;
 856}
 857
 858/*
 859 * Algorithm:
 860 *	1. The first ip_rt_redirect_number redirects are sent
 861  *	   with exponential backoff, after which we stop sending them altogether,
 862 *	   assuming that the host ignores our redirects.
 863 *	2. If we did not see packets requiring redirects
 864 *	   during ip_rt_redirect_silence, we assume that the host
 865  *	   forgot the redirected route and start sending redirects again.
 866 *
 867 * This algorithm is much cheaper and more intelligent than dumb load limiting
 868 * in icmp.c.
 869 *
 870 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 871 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 872 */
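/* Concretely, after n redirects have already been sent to a peer, the next
 * one goes out only once rate_last + (ip_rt_redirect_load << n) has passed,
 * so the allowed interval doubles for every redirect the host keeps ignoring.
 */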
 873
 874void ip_rt_send_redirect(struct sk_buff *skb)
 875{
 876	struct rtable *rt = skb_rtable(skb);
 877	struct in_device *in_dev;
 878	struct inet_peer *peer;
 879	struct net *net;
 880	int log_martians;
 881	int vif;
 882
 883	rcu_read_lock();
 884	in_dev = __in_dev_get_rcu(rt->dst.dev);
 885	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 886		rcu_read_unlock();
 887		return;
 888	}
 889	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 890	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 891	rcu_read_unlock();
 892
 893	net = dev_net(rt->dst.dev);
 894	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 895	if (!peer) {
 896		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 897			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 898		return;
 899	}
 900
 901	/* No redirected packets during ip_rt_redirect_silence;
 902	 * reset the algorithm.
 903	 */
 904	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 905		peer->rate_tokens = 0;
 906		peer->n_redirects = 0;
 907	}
 908
 909	/* Too many ignored redirects; do not send anything,
 910	 * just set dst.rate_last to the last seen redirected packet.
 911	 */
 912	if (peer->n_redirects >= ip_rt_redirect_number) {
 913		peer->rate_last = jiffies;
 914		goto out_put_peer;
 915	}
 916
 917	/* Check for load limit; set rate_last to the latest sent
 918	 * redirect.
 919	 */
 920	if (peer->n_redirects == 0 ||
 921	    time_after(jiffies,
 922		       (peer->rate_last +
 923			(ip_rt_redirect_load << peer->n_redirects)))) {
 924		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 925
 926		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 927		peer->rate_last = jiffies;
 928		++peer->n_redirects;
 929#ifdef CONFIG_IP_ROUTE_VERBOSE
 930		if (log_martians &&
 931		    peer->n_redirects == ip_rt_redirect_number)
 932			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 933					     &ip_hdr(skb)->saddr, inet_iif(skb),
 934					     &ip_hdr(skb)->daddr, &gw);
 935#endif
 936	}
 937out_put_peer:
 938	inet_putpeer(peer);
 939}
 940
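/* Send an ICMP destination-unreachable error for a packet that could not be
 * routed.  Errors towards each source are rate limited by a token bucket
 * (ip_rt_error_burst / ip_rt_error_cost) kept in the inet_peer entry.
 */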
 941static int ip_error(struct sk_buff *skb)
 942{
 943	struct rtable *rt = skb_rtable(skb);
 944	struct net_device *dev = skb->dev;
 945	struct in_device *in_dev;
 946	struct inet_peer *peer;
 947	unsigned long now;
 948	struct net *net;
 949	SKB_DR(reason);
 950	bool send;
 951	int code;
 952
 953	if (netif_is_l3_master(skb->dev)) {
 954		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 955		if (!dev)
 956			goto out;
 957	}
 958
 959	in_dev = __in_dev_get_rcu(dev);
 960
 961	/* IP on this device is disabled. */
 962	if (!in_dev)
 963		goto out;
 964
 965	net = dev_net(rt->dst.dev);
 966	if (!IN_DEV_FORWARD(in_dev)) {
 967		switch (rt->dst.error) {
 968		case EHOSTUNREACH:
 969			SKB_DR_SET(reason, IP_INADDRERRORS);
 970			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 971			break;
 972
 973		case ENETUNREACH:
 974			SKB_DR_SET(reason, IP_INNOROUTES);
 975			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 976			break;
 977		}
 978		goto out;
 979	}
 980
 981	switch (rt->dst.error) {
 982	case EINVAL:
 983	default:
 984		goto out;
 985	case EHOSTUNREACH:
 986		code = ICMP_HOST_UNREACH;
 987		break;
 988	case ENETUNREACH:
 989		code = ICMP_NET_UNREACH;
 990		SKB_DR_SET(reason, IP_INNOROUTES);
 991		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 992		break;
 993	case EACCES:
 994		code = ICMP_PKT_FILTERED;
 995		break;
 996	}
 997
 998	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 999			       l3mdev_master_ifindex(skb->dev), 1);
1000
1001	send = true;
1002	if (peer) {
1003		now = jiffies;
1004		peer->rate_tokens += now - peer->rate_last;
1005		if (peer->rate_tokens > ip_rt_error_burst)
1006			peer->rate_tokens = ip_rt_error_burst;
1007		peer->rate_last = now;
1008		if (peer->rate_tokens >= ip_rt_error_cost)
1009			peer->rate_tokens -= ip_rt_error_cost;
1010		else
1011			send = false;
1012		inet_putpeer(peer);
1013	}
1014	if (send)
1015		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1016
1017out:	kfree_skb_reason(skb, reason);
1018	return 0;
1019}
1020
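/* Apply a learned path MTU.  Nothing is done if the route's MTU is locked or
 * the new value exceeds the current one; values below ip_rt_min_pmtu are
 * raised to it (but never above the old MTU) and locked.  The result is
 * stored as a next-hop exception that expires after ip_rt_mtu_expires.
 */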
1021static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1022{
1023	struct dst_entry *dst = &rt->dst;
1024	struct net *net = dev_net(dst->dev);
1025	struct fib_result res;
1026	bool lock = false;
1027	u32 old_mtu;
1028
1029	if (ip_mtu_locked(dst))
1030		return;
1031
1032	old_mtu = ipv4_mtu(dst);
1033	if (old_mtu < mtu)
1034		return;
1035
1036	if (mtu < net->ipv4.ip_rt_min_pmtu) {
1037		lock = true;
1038		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
1039	}
1040
1041	if (rt->rt_pmtu == mtu && !lock &&
1042	    time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
1043		return;
1044
1045	rcu_read_lock();
1046	if (fib_lookup(net, fl4, &res, 0) == 0) {
1047		struct fib_nh_common *nhc;
1048
1049		fib_select_path(net, &res, fl4, NULL);
1050		nhc = FIB_RES_NHC(res);
1051		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1052				      jiffies + net->ipv4.ip_rt_mtu_expires);
1053	}
1054	rcu_read_unlock();
1055}
1056
1057static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1058			      struct sk_buff *skb, u32 mtu,
1059			      bool confirm_neigh)
1060{
1061	struct rtable *rt = (struct rtable *) dst;
1062	struct flowi4 fl4;
1063
1064	ip_rt_build_flow_key(&fl4, sk, skb);
1065
1066	/* Don't make lookup fail for bridged encapsulations */
1067	if (skb && netif_is_any_bridge_port(skb->dev))
1068		fl4.flowi4_oif = 0;
1069
1070	__ip_rt_update_pmtu(rt, &fl4, mtu);
1071}
1072
1073void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1074		      int oif, u8 protocol)
1075{
1076	const struct iphdr *iph = (const struct iphdr *)skb->data;
1077	struct flowi4 fl4;
1078	struct rtable *rt;
1079	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1080
1081	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
1082			 0);
1083	rt = __ip_route_output_key(net, &fl4);
1084	if (!IS_ERR(rt)) {
1085		__ip_rt_update_pmtu(rt, &fl4, mtu);
1086		ip_rt_put(rt);
1087	}
1088}
1089EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1090
1091static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093	const struct iphdr *iph = (const struct iphdr *)skb->data;
1094	struct flowi4 fl4;
1095	struct rtable *rt;
1096
1097	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1098
1099	if (!fl4.flowi4_mark)
1100		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1101
1102	rt = __ip_route_output_key(sock_net(sk), &fl4);
1103	if (!IS_ERR(rt)) {
1104		__ip_rt_update_pmtu(rt, &fl4, mtu);
1105		ip_rt_put(rt);
1106	}
1107}
1108
1109void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1110{
1111	const struct iphdr *iph = (const struct iphdr *)skb->data;
1112	struct flowi4 fl4;
1113	struct rtable *rt;
1114	struct dst_entry *odst = NULL;
1115	bool new = false;
1116	struct net *net = sock_net(sk);
1117
1118	bh_lock_sock(sk);
1119
1120	if (!ip_sk_accept_pmtu(sk))
1121		goto out;
1122
1123	odst = sk_dst_get(sk);
1124
1125	if (sock_owned_by_user(sk) || !odst) {
1126		__ipv4_sk_update_pmtu(skb, sk, mtu);
1127		goto out;
1128	}
1129
1130	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1131
1132	rt = (struct rtable *)odst;
1133	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1134		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1135		if (IS_ERR(rt))
1136			goto out;
1137
1138		new = true;
1139	}
1140
1141	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1142
1143	if (!dst_check(&rt->dst, 0)) {
1144		if (new)
1145			dst_release(&rt->dst);
1146
1147		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1148		if (IS_ERR(rt))
1149			goto out;
1150
1151		new = true;
1152	}
1153
1154	if (new)
1155		sk_dst_set(sk, &rt->dst);
1156
1157out:
1158	bh_unlock_sock(sk);
1159	dst_release(odst);
1160}
1161EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1162
1163void ipv4_redirect(struct sk_buff *skb, struct net *net,
1164		   int oif, u8 protocol)
1165{
1166	const struct iphdr *iph = (const struct iphdr *)skb->data;
1167	struct flowi4 fl4;
1168	struct rtable *rt;
1169
1170	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
1171	rt = __ip_route_output_key(net, &fl4);
1172	if (!IS_ERR(rt)) {
1173		__ip_do_redirect(rt, skb, &fl4, false);
1174		ip_rt_put(rt);
1175	}
1176}
1177EXPORT_SYMBOL_GPL(ipv4_redirect);
1178
1179void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1180{
1181	const struct iphdr *iph = (const struct iphdr *)skb->data;
1182	struct flowi4 fl4;
1183	struct rtable *rt;
1184	struct net *net = sock_net(sk);
1185
1186	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1187	rt = __ip_route_output_key(net, &fl4);
1188	if (!IS_ERR(rt)) {
1189		__ip_do_redirect(rt, skb, &fl4, false);
1190		ip_rt_put(rt);
1191	}
1192}
1193EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1194
1195INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1196							 u32 cookie)
1197{
1198	struct rtable *rt = (struct rtable *) dst;
1199
1200	/* All IPV4 dsts are created with ->obsolete set to the value
1201	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1202	 * into this function always.
1203	 *
1204	 * When a PMTU/redirect information update invalidates a route,
1205	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1206	 * DST_OBSOLETE_DEAD.
1207	 */
1208	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1209		return NULL;
1210	return dst;
1211}
1212EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
1213
1214static void ipv4_send_dest_unreach(struct sk_buff *skb)
1215{
1216	struct net_device *dev;
1217	struct ip_options opt;
1218	int res;
1219
1220	/* Recompile ip options since IPCB may not be valid anymore.
1221	 * Also check we have a reasonable ipv4 header.
1222	 */
1223	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1224	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1225		return;
1226
1227	memset(&opt, 0, sizeof(opt));
1228	if (ip_hdr(skb)->ihl > 5) {
1229		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1230			return;
1231		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1232
1233		rcu_read_lock();
1234		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1235		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1236		rcu_read_unlock();
1237
1238		if (res)
1239			return;
1240	}
1241	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1242}
1243
1244static void ipv4_link_failure(struct sk_buff *skb)
1245{
1246	struct rtable *rt;
1247
1248	ipv4_send_dest_unreach(skb);
1249
1250	rt = skb_rtable(skb);
1251	if (rt)
1252		dst_set_expires(&rt->dst, 0);
1253}
1254
1255static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1256{
1257	pr_debug("%s: %pI4 -> %pI4, %s\n",
1258		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1259		 skb->dev ? skb->dev->name : "?");
1260	kfree_skb(skb);
1261	WARN_ON(1);
1262	return 0;
1263}
1264
1265/*
1266  * We do not cache the source address of the outgoing interface,
1267  * because it is used only by the IP RR, TS and SRR options,
1268  * so it stays out of the fast path.
1269  *
1270  * BTW remember: "addr" is allowed to be unaligned
1271  * in IP options!
1272 */
1273
1274void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1275{
1276	__be32 src;
1277
1278	if (rt_is_output_route(rt))
1279		src = ip_hdr(skb)->saddr;
1280	else {
1281		struct fib_result res;
1282		struct iphdr *iph = ip_hdr(skb);
1283		struct flowi4 fl4 = {
1284			.daddr = iph->daddr,
1285			.saddr = iph->saddr,
1286			.flowi4_tos = RT_TOS(iph->tos),
1287			.flowi4_oif = rt->dst.dev->ifindex,
1288			.flowi4_iif = skb->dev->ifindex,
1289			.flowi4_mark = skb->mark,
1290		};
1291
1292		rcu_read_lock();
1293		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1294			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1295		else
1296			src = inet_select_addr(rt->dst.dev,
1297					       rt_nexthop(rt, iph->daddr),
1298					       RT_SCOPE_UNIVERSE);
1299		rcu_read_unlock();
1300	}
1301	memcpy(addr, &src, 4);
1302}
1303
1304#ifdef CONFIG_IP_ROUTE_CLASSID
1305static void set_class_tag(struct rtable *rt, u32 tag)
1306{
1307	if (!(rt->dst.tclassid & 0xFFFF))
1308		rt->dst.tclassid |= tag & 0xFFFF;
1309	if (!(rt->dst.tclassid & 0xFFFF0000))
1310		rt->dst.tclassid |= tag & 0xFFFF0000;
1311}
1312#endif
1313
1314static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1315{
1316	struct net *net = dev_net(dst->dev);
1317	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1318	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1319				    net->ipv4.ip_rt_min_advmss);
1320
1321	return min(advmss, IPV4_MAX_PMTU - header_size);
1322}
1323
1324INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1325{
1326	return ip_dst_mtu_maybe_forward(dst, false);
1327}
1328EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
1329
1330static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1331{
1332	struct fnhe_hash_bucket *hash;
1333	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1334	u32 hval = fnhe_hashfun(daddr);
1335
1336	spin_lock_bh(&fnhe_lock);
1337
1338	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1339					 lockdep_is_held(&fnhe_lock));
1340	hash += hval;
1341
1342	fnhe_p = &hash->chain;
1343	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1344	while (fnhe) {
1345		if (fnhe->fnhe_daddr == daddr) {
1346			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1347				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1348			/* set fnhe_daddr to 0 to ensure it won't bind with
1349			 * new dsts in rt_bind_exception().
1350			 */
1351			fnhe->fnhe_daddr = 0;
1352			fnhe_flush_routes(fnhe);
1353			kfree_rcu(fnhe, rcu);
1354			break;
1355		}
1356		fnhe_p = &fnhe->fnhe_next;
1357		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1358						 lockdep_is_held(&fnhe_lock));
1359	}
1360
1361	spin_unlock_bh(&fnhe_lock);
1362}
1363
1364static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1365					       __be32 daddr)
1366{
1367	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1368	struct fib_nh_exception *fnhe;
1369	u32 hval;
1370
1371	if (!hash)
1372		return NULL;
1373
1374	hval = fnhe_hashfun(daddr);
1375
1376	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1377	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1378		if (fnhe->fnhe_daddr == daddr) {
1379			if (fnhe->fnhe_expires &&
1380			    time_after(jiffies, fnhe->fnhe_expires)) {
1381				ip_del_fnhe(nhc, daddr);
1382				break;
1383			}
1384			return fnhe;
1385		}
1386	}
1387	return NULL;
1388}
1389
1390/* MTU selection:
1391 * 1. mtu on route is locked - use it
1392 * 2. mtu from nexthop exception
1393 * 3. mtu from egress device
1394 */
1395
1396u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1397{
1398	struct fib_nh_common *nhc = res->nhc;
1399	struct net_device *dev = nhc->nhc_dev;
1400	struct fib_info *fi = res->fi;
1401	u32 mtu = 0;
1402
1403	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1404	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1405		mtu = fi->fib_mtu;
1406
1407	if (likely(!mtu)) {
1408		struct fib_nh_exception *fnhe;
1409
1410		fnhe = find_exception(nhc, daddr);
1411		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1412			mtu = fnhe->fnhe_pmtu;
1413	}
1414
1415	if (likely(!mtu))
1416		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1417
1418	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1419}
1420
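/* Bind a route to its next-hop exception (input or output slot).  If the
 * exception's generation id is stale, its learned data is cleared first;
 * when do_cache is set, the route is also cached inside the exception.
 */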
1421static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1422			      __be32 daddr, const bool do_cache)
1423{
1424	bool ret = false;
1425
1426	spin_lock_bh(&fnhe_lock);
1427
1428	if (daddr == fnhe->fnhe_daddr) {
1429		struct rtable __rcu **porig;
1430		struct rtable *orig;
1431		int genid = fnhe_genid(dev_net(rt->dst.dev));
1432
1433		if (rt_is_input_route(rt))
1434			porig = &fnhe->fnhe_rth_input;
1435		else
1436			porig = &fnhe->fnhe_rth_output;
1437		orig = rcu_dereference(*porig);
1438
1439		if (fnhe->fnhe_genid != genid) {
1440			fnhe->fnhe_genid = genid;
1441			fnhe->fnhe_gw = 0;
1442			fnhe->fnhe_pmtu = 0;
1443			fnhe->fnhe_expires = 0;
1444			fnhe->fnhe_mtu_locked = false;
1445			fnhe_flush_routes(fnhe);
1446			orig = NULL;
1447		}
1448		fill_route_from_fnhe(rt, fnhe);
1449		if (!rt->rt_gw4) {
1450			rt->rt_gw4 = daddr;
1451			rt->rt_gw_family = AF_INET;
1452		}
1453
1454		if (do_cache) {
1455			dst_hold(&rt->dst);
1456			rcu_assign_pointer(*porig, rt);
1457			if (orig) {
1458				dst_dev_put(&orig->dst);
1459				dst_release(&orig->dst);
1460			}
1461			ret = true;
1462		}
1463
1464		fnhe->fnhe_stamp = jiffies;
1465	}
1466	spin_unlock_bh(&fnhe_lock);
1467
1468	return ret;
1469}
1470
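/* Cache a route on the nexthop: input routes go to the shared nhc_rth_input
 * slot, output routes to this CPU's per-CPU slot.  The swap is done with
 * cmpxchg(); a displaced route is moved to the uncached list and released.
 */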
1471static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1472{
1473	struct rtable *orig, *prev, **p;
1474	bool ret = true;
1475
1476	if (rt_is_input_route(rt)) {
1477		p = (struct rtable **)&nhc->nhc_rth_input;
1478	} else {
1479		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1480	}
1481	orig = *p;
1482
1483	/* hold dst before doing cmpxchg() to avoid race condition
1484	 * on this dst
1485	 */
1486	dst_hold(&rt->dst);
1487	prev = cmpxchg(p, orig, rt);
1488	if (prev == orig) {
1489		if (orig) {
1490			rt_add_uncached_list(orig);
1491			dst_release(&orig->dst);
1492		}
1493	} else {
1494		dst_release(&rt->dst);
1495		ret = false;
1496	}
1497
1498	return ret;
1499}
1500
1501struct uncached_list {
1502	spinlock_t		lock;
1503	struct list_head	head;
1504	struct list_head	quarantine;
1505};
1506
1507static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1508
1509void rt_add_uncached_list(struct rtable *rt)
1510{
1511	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1512
1513	rt->dst.rt_uncached_list = ul;
1514
1515	spin_lock_bh(&ul->lock);
1516	list_add_tail(&rt->dst.rt_uncached, &ul->head);
1517	spin_unlock_bh(&ul->lock);
1518}
1519
1520void rt_del_uncached_list(struct rtable *rt)
1521{
1522	if (!list_empty(&rt->dst.rt_uncached)) {
1523		struct uncached_list *ul = rt->dst.rt_uncached_list;
1524
1525		spin_lock_bh(&ul->lock);
1526		list_del_init(&rt->dst.rt_uncached);
1527		spin_unlock_bh(&ul->lock);
1528	}
1529}
1530
1531static void ipv4_dst_destroy(struct dst_entry *dst)
1532{
1533	struct rtable *rt = (struct rtable *)dst;
1534
1535	ip_dst_metrics_put(dst);
1536	rt_del_uncached_list(rt);
1537}
1538
1539void rt_flush_dev(struct net_device *dev)
1540{
1541	struct rtable *rt, *safe;
1542	int cpu;
1543
1544	for_each_possible_cpu(cpu) {
1545		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1546
1547		if (list_empty(&ul->head))
1548			continue;
1549
1550		spin_lock_bh(&ul->lock);
1551		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
1552			if (rt->dst.dev != dev)
1553				continue;
1554			rt->dst.dev = blackhole_netdev;
1555			netdev_ref_replace(dev, blackhole_netdev,
1556					   &rt->dst.dev_tracker, GFP_ATOMIC);
1557			list_move(&rt->dst.rt_uncached, &ul->quarantine);
1558		}
1559		spin_unlock_bh(&ul->lock);
1560	}
1561}
1562
1563static bool rt_cache_valid(const struct rtable *rt)
1564{
1565	return	rt &&
1566		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1567		!rt_is_expired(rt);
1568}
1569
1570static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1571			   const struct fib_result *res,
1572			   struct fib_nh_exception *fnhe,
1573			   struct fib_info *fi, u16 type, u32 itag,
1574			   const bool do_cache)
1575{
1576	bool cached = false;
1577
1578	if (fi) {
1579		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1580
1581		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1582			rt->rt_uses_gateway = 1;
1583			rt->rt_gw_family = nhc->nhc_gw_family;
1584			/* only INET and INET6 are supported */
1585			if (likely(nhc->nhc_gw_family == AF_INET))
1586				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1587			else
1588				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1589		}
1590
1591		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1592
1593#ifdef CONFIG_IP_ROUTE_CLASSID
1594		if (nhc->nhc_family == AF_INET) {
1595			struct fib_nh *nh;
1596
1597			nh = container_of(nhc, struct fib_nh, nh_common);
1598			rt->dst.tclassid = nh->nh_tclassid;
1599		}
1600#endif
1601		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1602		if (unlikely(fnhe))
1603			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1604		else if (do_cache)
1605			cached = rt_cache_route(nhc, rt);
1606		if (unlikely(!cached)) {
1607			/* Routes we intend to cache in nexthop exception or
1608			 * FIB nexthop have the DST_NOCACHE bit clear.
1609			 * However, if we are unsuccessful at storing this
1610			 * route into the cache we really need to set it.
1611			 */
1612			if (!rt->rt_gw4) {
1613				rt->rt_gw_family = AF_INET;
1614				rt->rt_gw4 = daddr;
1615			}
1616			rt_add_uncached_list(rt);
1617		}
1618	} else
1619		rt_add_uncached_list(rt);
1620
1621#ifdef CONFIG_IP_ROUTE_CLASSID
1622#ifdef CONFIG_IP_MULTIPLE_TABLES
1623	set_class_tag(rt, res->tclassid);
1624#endif
1625	set_class_tag(rt, itag);
1626#endif
1627}
1628
1629struct rtable *rt_dst_alloc(struct net_device *dev,
1630			    unsigned int flags, u16 type,
1631			    bool noxfrm)
1632{
1633	struct rtable *rt;
1634
1635	rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
1636		       (noxfrm ? DST_NOXFRM : 0));
1637
1638	if (rt) {
1639		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1640		rt->rt_flags = flags;
1641		rt->rt_type = type;
1642		rt->rt_is_input = 0;
1643		rt->rt_iif = 0;
1644		rt->rt_pmtu = 0;
1645		rt->rt_mtu_locked = 0;
1646		rt->rt_uses_gateway = 0;
1647		rt->rt_gw_family = 0;
1648		rt->rt_gw4 = 0;
1649
1650		rt->dst.output = ip_output;
1651		if (flags & RTCF_LOCAL)
1652			rt->dst.input = ip_local_deliver;
1653	}
1654
1655	return rt;
1656}
1657EXPORT_SYMBOL(rt_dst_alloc);
1658
1659struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1660{
1661	struct rtable *new_rt;
1662
1663	new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
1664			   rt->dst.flags);
1665
1666	if (new_rt) {
1667		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1668		new_rt->rt_flags = rt->rt_flags;
1669		new_rt->rt_type = rt->rt_type;
1670		new_rt->rt_is_input = rt->rt_is_input;
1671		new_rt->rt_iif = rt->rt_iif;
1672		new_rt->rt_pmtu = rt->rt_pmtu;
1673		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1674		new_rt->rt_gw_family = rt->rt_gw_family;
1675		if (rt->rt_gw_family == AF_INET)
1676			new_rt->rt_gw4 = rt->rt_gw4;
1677		else if (rt->rt_gw_family == AF_INET6)
1678			new_rt->rt_gw6 = rt->rt_gw6;
1679
1680		new_rt->dst.input = rt->dst.input;
1681		new_rt->dst.output = rt->dst.output;
1682		new_rt->dst.error = rt->dst.error;
1683		new_rt->dst.lastuse = jiffies;
1684		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1685	}
1686	return new_rt;
1687}
1688EXPORT_SYMBOL(rt_dst_clone);
1689
1690/* called in rcu_read_lock() section */
1691int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692			  u8 tos, struct net_device *dev,
1693			  struct in_device *in_dev, u32 *itag)
1694{
1695	int err;
1696
1697	/* Primary sanity checks. */
1698	if (!in_dev)
1699		return -EINVAL;
1700
1701	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702	    skb->protocol != htons(ETH_P_IP))
1703		return -EINVAL;
1704
1705	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1706		return -EINVAL;
1707
1708	if (ipv4_is_zeronet(saddr)) {
1709		if (!ipv4_is_local_multicast(daddr) &&
1710		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1711			return -EINVAL;
1712	} else {
1713		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1714					  in_dev, itag);
1715		if (err < 0)
1716			return err;
1717	}
1718	return 0;
1719}
1720
1721/* called in rcu_read_lock() section */
1722static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1723			     u8 tos, struct net_device *dev, int our)
1724{
1725	struct in_device *in_dev = __in_dev_get_rcu(dev);
1726	unsigned int flags = RTCF_MULTICAST;
1727	struct rtable *rth;
1728	u32 itag = 0;
1729	int err;
1730
1731	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1732	if (err)
1733		return err;
1734
1735	if (our)
1736		flags |= RTCF_LOCAL;
1737
1738	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1739		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1740
1741	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1742			   false);
1743	if (!rth)
1744		return -ENOBUFS;
1745
1746#ifdef CONFIG_IP_ROUTE_CLASSID
1747	rth->dst.tclassid = itag;
1748#endif
1749	rth->dst.output = ip_rt_bug;
1750	rth->rt_is_input = 1;
1751
1752#ifdef CONFIG_IP_MROUTE
1753	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1754		rth->dst.input = ip_mr_input;
1755#endif
1756	RT_CACHE_STAT_INC(in_slow_mc);
1757
1758	skb_dst_drop(skb);
1759	skb_dst_set(skb, &rth->dst);
1760	return 0;
1761}
1762
1763
1764static void ip_handle_martian_source(struct net_device *dev,
1765				     struct in_device *in_dev,
1766				     struct sk_buff *skb,
1767				     __be32 daddr,
1768				     __be32 saddr)
1769{
1770	RT_CACHE_STAT_INC(in_martian_src);
1771#ifdef CONFIG_IP_ROUTE_VERBOSE
1772	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1773		/*
1774		 *	RFC 1812 recommendation: if the source is martian,
1775		 *	the only hint is the MAC header.
1776		 */
1777		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1778			&daddr, &saddr, dev->name);
1779		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1780			print_hex_dump(KERN_WARNING, "ll header: ",
1781				       DUMP_PREFIX_OFFSET, 16, 1,
1782				       skb_mac_header(skb),
1783				       dev->hard_header_len, false);
1784		}
1785	}
1786#endif
1787}
1788
1789/* called in rcu_read_lock() section */
1790static int __mkroute_input(struct sk_buff *skb,
1791			   const struct fib_result *res,
1792			   struct in_device *in_dev,
1793			   __be32 daddr, __be32 saddr, u32 tos)
1794{
1795	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1796	struct net_device *dev = nhc->nhc_dev;
1797	struct fib_nh_exception *fnhe;
1798	struct rtable *rth;
1799	int err;
1800	struct in_device *out_dev;
1801	bool do_cache;
1802	u32 itag = 0;
1803
1804	/* get a working reference to the output device */
1805	out_dev = __in_dev_get_rcu(dev);
1806	if (!out_dev) {
1807		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1808		return -EINVAL;
1809	}
1810
1811	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1812				  in_dev->dev, in_dev, &itag);
1813	if (err < 0) {
1814		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1815					 saddr);
1816
1817		goto cleanup;
1818	}
1819
1820	do_cache = res->fi && !itag;
1821	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1822	    skb->protocol == htons(ETH_P_IP)) {
1823		__be32 gw;
1824
1825		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1826		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1827		    inet_addr_onlink(out_dev, saddr, gw))
1828			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1829	}
1830
1831	if (skb->protocol != htons(ETH_P_IP)) {
1832		/* Not IP (i.e. ARP). Do not create a route if it is
1833		 * invalid for proxy arp. DNAT routes are always valid.
1834		 *
1835		 * The proxy arp feature has been extended to allow ARP
1836		 * replies back out the same interface, to support
1837		 * Private VLAN switch technologies. See arp.c.
1838		 */
1839		if (out_dev == in_dev &&
1840		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1841			err = -EINVAL;
1842			goto cleanup;
1843		}
1844	}
1845
1846	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1847		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1848
1849	fnhe = find_exception(nhc, daddr);
1850	if (do_cache) {
1851		if (fnhe)
1852			rth = rcu_dereference(fnhe->fnhe_rth_input);
1853		else
1854			rth = rcu_dereference(nhc->nhc_rth_input);
1855		if (rt_cache_valid(rth)) {
1856			skb_dst_set_noref(skb, &rth->dst);
1857			goto out;
1858		}
1859	}
1860
1861	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1862			   IN_DEV_ORCONF(out_dev, NOXFRM));
1863	if (!rth) {
1864		err = -ENOBUFS;
1865		goto cleanup;
1866	}
1867
1868	rth->rt_is_input = 1;
1869	RT_CACHE_STAT_INC(in_slow_tot);
1870
1871	rth->dst.input = ip_forward;
1872
1873	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1874		       do_cache);
1875	lwtunnel_set_redirect(&rth->dst);
1876	skb_dst_set(skb, &rth->dst);
1877out:
1878	err = 0;
1879 cleanup:
1880	return err;
1881}
1882
1883#ifdef CONFIG_IP_ROUTE_MULTIPATH
1884/* To make ICMP packets follow the right flow, the multipath hash is
1885 * calculated from the inner IP addresses.
1886 */
1887static void ip_multipath_l3_keys(const struct sk_buff *skb,
1888				 struct flow_keys *hash_keys)
1889{
1890	const struct iphdr *outer_iph = ip_hdr(skb);
1891	const struct iphdr *key_iph = outer_iph;
1892	const struct iphdr *inner_iph;
1893	const struct icmphdr *icmph;
1894	struct iphdr _inner_iph;
1895	struct icmphdr _icmph;
1896
1897	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1898		goto out;
1899
1900	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1901		goto out;
1902
1903	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1904				   &_icmph);
1905	if (!icmph)
1906		goto out;
1907
1908	if (!icmp_is_err(icmph->type))
1909		goto out;
1910
1911	inner_iph = skb_header_pointer(skb,
1912				       outer_iph->ihl * 4 + sizeof(_icmph),
1913				       sizeof(_inner_iph), &_inner_iph);
1914	if (!inner_iph)
1915		goto out;
1916
1917	key_iph = inner_iph;
1918out:
1919	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1920	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1921}
1922
1923static u32 fib_multipath_custom_hash_outer(const struct net *net,
1924					   const struct sk_buff *skb,
1925					   bool *p_has_inner)
1926{
1927	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1928	struct flow_keys keys, hash_keys;
1929
1930	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1931		return 0;
1932
1933	memset(&hash_keys, 0, sizeof(hash_keys));
1934	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1935
1936	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1937	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1938		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1939	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1940		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1941	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1942		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1943	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1944		hash_keys.ports.src = keys.ports.src;
1945	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1946		hash_keys.ports.dst = keys.ports.dst;
1947
1948	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1949	return flow_hash_from_keys(&hash_keys);
1950}
1951
1952static u32 fib_multipath_custom_hash_inner(const struct net *net,
1953					   const struct sk_buff *skb,
1954					   bool has_inner)
1955{
1956	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1957	struct flow_keys keys, hash_keys;
1958
1959	/* We assume the packet carries an encapsulation, but if none was
1960	 * encountered during dissection of the outer flow, then there is no
1961	 * point in calling the flow dissector again.
1962	 */
1963	if (!has_inner)
1964		return 0;
1965
1966	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1967		return 0;
1968
1969	memset(&hash_keys, 0, sizeof(hash_keys));
1970	skb_flow_dissect_flow_keys(skb, &keys, 0);
1971
1972	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1973		return 0;
1974
1975	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1978			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1979		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1980			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1984			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1985		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1986			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1987		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1988			hash_keys.tags.flow_label = keys.tags.flow_label;
1989	}
1990
1991	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
1992		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1993	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
1994		hash_keys.ports.src = keys.ports.src;
1995	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
1996		hash_keys.ports.dst = keys.ports.dst;
1997
1998	return flow_hash_from_keys(&hash_keys);
1999}
2000
2001static u32 fib_multipath_custom_hash_skb(const struct net *net,
2002					 const struct sk_buff *skb)
2003{
2004	u32 mhash, mhash_inner;
2005	bool has_inner = true;
2006
2007	mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
2008	mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
2009
2010	return jhash_2words(mhash, mhash_inner, 0);
2011}
2012
2013static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2014					 const struct flowi4 *fl4)
2015{
2016	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
2017	struct flow_keys hash_keys;
2018
2019	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2020		return 0;
2021
2022	memset(&hash_keys, 0, sizeof(hash_keys));
2023	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2025		hash_keys.addrs.v4addrs.src = fl4->saddr;
2026	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2027		hash_keys.addrs.v4addrs.dst = fl4->daddr;
2028	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2029		hash_keys.basic.ip_proto = fl4->flowi4_proto;
2030	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2031		hash_keys.ports.src = fl4->fl4_sport;
2032	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2033		hash_keys.ports.dst = fl4->fl4_dport;
2034
2035	return flow_hash_from_keys(&hash_keys);
2036}
2037
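/* Multipath hash policies (fib_multipath_hash_policy sysctl):
 * 0 - L3 addresses only, 1 - L4 five-tuple, 2 - inner L3 addresses for
 * encapsulated packets, 3 - user-selected fields
 * (fib_multipath_hash_fields sysctl).
 */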
2038/* if skb is set it will be used and fl4 can be NULL */
2039int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2040		       const struct sk_buff *skb, struct flow_keys *flkeys)
2041{
2042	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2043	struct flow_keys hash_keys;
2044	u32 mhash = 0;
2045
2046	switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
2047	case 0:
2048		memset(&hash_keys, 0, sizeof(hash_keys));
2049		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2050		if (skb) {
2051			ip_multipath_l3_keys(skb, &hash_keys);
2052		} else {
2053			hash_keys.addrs.v4addrs.src = fl4->saddr;
2054			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2055		}
2056		mhash = flow_hash_from_keys(&hash_keys);
2057		break;
2058	case 1:
2059		/* skb is currently provided only when forwarding */
2060		if (skb) {
2061			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2062			struct flow_keys keys;
2063
2064			/* short-circuit if we already have L4 hash present */
2065			if (skb->l4_hash)
2066				return skb_get_hash_raw(skb) >> 1;
2067
2068			memset(&hash_keys, 0, sizeof(hash_keys));
2069
2070			if (!flkeys) {
2071				skb_flow_dissect_flow_keys(skb, &keys, flag);
2072				flkeys = &keys;
2073			}
2074
2075			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2076			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2077			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2078			hash_keys.ports.src = flkeys->ports.src;
2079			hash_keys.ports.dst = flkeys->ports.dst;
2080			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2081		} else {
2082			memset(&hash_keys, 0, sizeof(hash_keys));
2083			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2084			hash_keys.addrs.v4addrs.src = fl4->saddr;
2085			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2086			hash_keys.ports.src = fl4->fl4_sport;
2087			hash_keys.ports.dst = fl4->fl4_dport;
2088			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2089		}
2090		mhash = flow_hash_from_keys(&hash_keys);
2091		break;
2092	case 2:
2093		memset(&hash_keys, 0, sizeof(hash_keys));
2094		/* skb is currently provided only when forwarding */
2095		if (skb) {
2096			struct flow_keys keys;
2097
2098			skb_flow_dissect_flow_keys(skb, &keys, 0);
2099			/* Inner can be v4 or v6 */
2100			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2101				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2102				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2103				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2104			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2105				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2106				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2107				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2108				hash_keys.tags.flow_label = keys.tags.flow_label;
2109				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2110			} else {
2111				/* Same as case 0 */
2112				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2113				ip_multipath_l3_keys(skb, &hash_keys);
2114			}
2115		} else {
2116			/* Same as case 0 */
2117			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2118			hash_keys.addrs.v4addrs.src = fl4->saddr;
2119			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2120		}
2121		mhash = flow_hash_from_keys(&hash_keys);
2122		break;
2123	case 3:
2124		if (skb)
2125			mhash = fib_multipath_custom_hash_skb(net, skb);
2126		else
2127			mhash = fib_multipath_custom_hash_fl4(net, fl4);
2128		break;
2129	}
2130
2131	if (multipath_hash)
2132		mhash = jhash_2words(mhash, multipath_hash, 0);
2133
2134	return mhash >> 1;
2135}
2136#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2137
2138static int ip_mkroute_input(struct sk_buff *skb,
2139			    struct fib_result *res,
2140			    struct in_device *in_dev,
2141			    __be32 daddr, __be32 saddr, u32 tos,
2142			    struct flow_keys *hkeys)
2143{
2144#ifdef CONFIG_IP_ROUTE_MULTIPATH
2145	if (res->fi && fib_info_num_path(res->fi) > 1) {
2146		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2147
2148		fib_select_multipath(res, h);
2149		IPCB(skb)->flags |= IPSKB_MULTIPATH;
2150	}
2151#endif
2152
2153	/* create a routing cache entry */
2154	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2155}
2156
2157/* Implements all the saddr-related checks as ip_route_input_slow(),
2158 * assuming daddr is valid and the destination is not a local broadcast one.
2159 * Uses the provided hint instead of performing a route lookup.
2160 */
2161int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2162		      u8 tos, struct net_device *dev,
2163		      const struct sk_buff *hint)
2164{
2165	struct in_device *in_dev = __in_dev_get_rcu(dev);
2166	struct rtable *rt = skb_rtable(hint);
2167	struct net *net = dev_net(dev);
2168	int err = -EINVAL;
2169	u32 tag = 0;
2170
2171	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2172		goto martian_source;
2173
2174	if (ipv4_is_zeronet(saddr))
2175		goto martian_source;
2176
2177	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2178		goto martian_source;
2179
2180	if (rt->rt_type != RTN_LOCAL)
2181		goto skip_validate_source;
2182
2183	tos &= IPTOS_RT_MASK;
2184	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2185	if (err < 0)
2186		goto martian_source;
2187
2188skip_validate_source:
2189	skb_dst_copy(skb, hint);
2190	return 0;
2191
2192martian_source:
2193	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2194	return err;
2195}
2196
2197/* get device for dst_alloc with local routes */
2198static struct net_device *ip_rt_get_dev(struct net *net,
2199					const struct fib_result *res)
2200{
2201	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2202	struct net_device *dev = NULL;
2203
2204	if (nhc)
2205		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2206
2207	return dev ? : net->loopback_dev;
2208}
2209
2210/*
2211 *	NOTE. We drop all packets that have local source
2212 *	addresses, because every properly looped-back packet
2213 *	must already have the correct destination attached by the output routine.
2214 *	Changes in the enforced policies must also be applied to
2215 *	ip_route_use_hint().
2216 *
2217 *	Such an approach solves two big problems:
2218 *	1. Non-simplex devices are handled properly.
2219 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2220 *	Called with rcu_read_lock().
2221 */
2222
2223static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2224			       u8 tos, struct net_device *dev,
2225			       struct fib_result *res)
2226{
2227	struct in_device *in_dev = __in_dev_get_rcu(dev);
2228	struct flow_keys *flkeys = NULL, _flkeys;
2229	struct net    *net = dev_net(dev);
2230	struct ip_tunnel_info *tun_info;
2231	int		err = -EINVAL;
2232	unsigned int	flags = 0;
2233	u32		itag = 0;
2234	struct rtable	*rth;
2235	struct flowi4	fl4;
2236	bool do_cache = true;
2237
2238	/* IP on this device is disabled. */
2239
2240	if (!in_dev)
2241		goto out;
2242
2243	/* Check for the weirdest martians, which cannot be detected
2244	 * by fib_lookup.
2245	 */
2246
2247	tun_info = skb_tunnel_info(skb);
2248	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2249		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2250	else
2251		fl4.flowi4_tun_key.tun_id = 0;
2252	skb_dst_drop(skb);
2253
2254	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2255		goto martian_source;
2256
2257	res->fi = NULL;
2258	res->table = NULL;
2259	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2260		goto brd_input;
2261
2262	/* Accept zero addresses only for limited broadcast;
2263	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2264	 */
2265	if (ipv4_is_zeronet(saddr))
2266		goto martian_source;
2267
2268	if (ipv4_is_zeronet(daddr))
2269		goto martian_destination;
2270
2271	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2272	 * and calls it at most once if daddr and/or saddr are loopback addresses.
2273	 */
2274	if (ipv4_is_loopback(daddr)) {
2275		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2276			goto martian_destination;
2277	} else if (ipv4_is_loopback(saddr)) {
2278		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2279			goto martian_source;
2280	}
2281
2282	/*
2283	 *	Now we are ready to route packet.
2284	 */
2285	fl4.flowi4_l3mdev = 0;
2286	fl4.flowi4_oif = 0;
2287	fl4.flowi4_iif = dev->ifindex;
2288	fl4.flowi4_mark = skb->mark;
2289	fl4.flowi4_tos = tos;
2290	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2291	fl4.flowi4_flags = 0;
2292	fl4.daddr = daddr;
2293	fl4.saddr = saddr;
2294	fl4.flowi4_uid = sock_net_uid(net, NULL);
2295	fl4.flowi4_multipath_hash = 0;
2296
2297	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2298		flkeys = &_flkeys;
2299	} else {
2300		fl4.flowi4_proto = 0;
2301		fl4.fl4_sport = 0;
2302		fl4.fl4_dport = 0;
2303	}
2304
2305	err = fib_lookup(net, &fl4, res, 0);
2306	if (err != 0) {
2307		if (!IN_DEV_FORWARD(in_dev))
2308			err = -EHOSTUNREACH;
2309		goto no_route;
2310	}
2311
2312	if (res->type == RTN_BROADCAST) {
2313		if (IN_DEV_BFORWARD(in_dev))
2314			goto make_route;
2315		/* do not cache if bc_forwarding is enabled */
2316		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2317			do_cache = false;
2318		goto brd_input;
2319	}
2320
2321	if (res->type == RTN_LOCAL) {
2322		err = fib_validate_source(skb, saddr, daddr, tos,
2323					  0, dev, in_dev, &itag);
2324		if (err < 0)
2325			goto martian_source;
2326		goto local_input;
2327	}
2328
2329	if (!IN_DEV_FORWARD(in_dev)) {
2330		err = -EHOSTUNREACH;
2331		goto no_route;
2332	}
2333	if (res->type != RTN_UNICAST)
2334		goto martian_destination;
2335
2336make_route:
2337	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2338out:	return err;
2339
2340brd_input:
2341	if (skb->protocol != htons(ETH_P_IP))
2342		goto e_inval;
2343
2344	if (!ipv4_is_zeronet(saddr)) {
2345		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2346					  in_dev, &itag);
2347		if (err < 0)
2348			goto martian_source;
2349	}
2350	flags |= RTCF_BROADCAST;
2351	res->type = RTN_BROADCAST;
2352	RT_CACHE_STAT_INC(in_brd);
2353
2354local_input:
2355	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
2356		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2357
2358	do_cache &= res->fi && !itag;
2359	if (do_cache) {
2360		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2361
2362		rth = rcu_dereference(nhc->nhc_rth_input);
2363		if (rt_cache_valid(rth)) {
2364			skb_dst_set_noref(skb, &rth->dst);
2365			err = 0;
2366			goto out;
2367		}
2368	}
2369
2370	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2371			   flags | RTCF_LOCAL, res->type, false);
2372	if (!rth)
2373		goto e_nobufs;
2374
2375	rth->dst.output = ip_rt_bug;
2376#ifdef CONFIG_IP_ROUTE_CLASSID
2377	rth->dst.tclassid = itag;
2378#endif
2379	rth->rt_is_input = 1;
2380
2381	RT_CACHE_STAT_INC(in_slow_tot);
2382	if (res->type == RTN_UNREACHABLE) {
2383		rth->dst.input = ip_error;
2384		rth->dst.error = -err;
2385		rth->rt_flags	&= ~RTCF_LOCAL;
2386	}
2387
2388	if (do_cache) {
2389		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2390
2391		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2392		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2393			WARN_ON(rth->dst.input == lwtunnel_input);
2394			rth->dst.lwtstate->orig_input = rth->dst.input;
2395			rth->dst.input = lwtunnel_input;
2396		}
2397
2398		if (unlikely(!rt_cache_route(nhc, rth)))
2399			rt_add_uncached_list(rth);
2400	}
2401	skb_dst_set(skb, &rth->dst);
2402	err = 0;
2403	goto out;
2404
2405no_route:
2406	RT_CACHE_STAT_INC(in_no_route);
2407	res->type = RTN_UNREACHABLE;
2408	res->fi = NULL;
2409	res->table = NULL;
2410	goto local_input;
2411
2412	/*
2413	 *	Do not cache martian addresses: they should be logged (RFC1812)
2414	 */
2415martian_destination:
2416	RT_CACHE_STAT_INC(in_martian_dst);
2417#ifdef CONFIG_IP_ROUTE_VERBOSE
2418	if (IN_DEV_LOG_MARTIANS(in_dev))
2419		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2420				     &daddr, &saddr, dev->name);
2421#endif
2422
2423e_inval:
2424	err = -EINVAL;
2425	goto out;
2426
2427e_nobufs:
2428	err = -ENOBUFS;
2429	goto out;
2430
2431martian_source:
2432	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2433	goto out;
2434}
2435
2436/* called with rcu_read_lock held */
2437static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2438			      u8 tos, struct net_device *dev, struct fib_result *res)
2439{
2440	/* Multicast recognition logic is moved from the route cache to here.
2441	 * The problem was that too many Ethernet cards have broken/missing
2442	 * hardware multicast filters :-( As a result, a host on a multicast
2443	 * network acquires a lot of useless route cache entries, e.g. for
2444	 * SDR messages from all over the world. Now we try to get rid of them.
2445	 * Really, provided the software IP multicast filter is organized
2446	 * reasonably (at least, hashed), this does not result in a slowdown
2447	 * compared with route cache reject entries.
2448	 * Note that multicast routers are not affected, because a
2449	 * route cache entry is created eventually.
2450	 */
2451	if (ipv4_is_multicast(daddr)) {
2452		struct in_device *in_dev = __in_dev_get_rcu(dev);
2453		int our = 0;
2454		int err = -EINVAL;
2455
2456		if (!in_dev)
2457			return err;
2458		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2459				      ip_hdr(skb)->protocol);
2460
2461		/* check l3 master if no match yet */
2462		if (!our && netif_is_l3_slave(dev)) {
2463			struct in_device *l3_in_dev;
2464
2465			l3_in_dev = __in_dev_get_rcu(skb->dev);
2466			if (l3_in_dev)
2467				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2468						      ip_hdr(skb)->protocol);
2469		}
2470
2471		if (our
2472#ifdef CONFIG_IP_MROUTE
2473			||
2474		    (!ipv4_is_local_multicast(daddr) &&
2475		     IN_DEV_MFORWARD(in_dev))
2476#endif
2477		   ) {
2478			err = ip_route_input_mc(skb, daddr, saddr,
2479						tos, dev, our);
2480		}
2481		return err;
2482	}
2483
2484	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2485}
2486
2487int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2488			 u8 tos, struct net_device *dev)
2489{
2490	struct fib_result res;
2491	int err;
2492
2493	tos &= IPTOS_RT_MASK;
2494	rcu_read_lock();
2495	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2496	rcu_read_unlock();
2497
2498	return err;
2499}
2500EXPORT_SYMBOL(ip_route_input_noref);
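/* Illustrative usage sketch (not part of the original source): a receive-path
 * caller typically resolves the input route for an skb and then hands the
 * packet to the attached dst, roughly as ip_rcv_finish() does:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (!err)
 *		err = dst_input(skb);	// invokes skb_dst(skb)->input()
 */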
2501
2502/* called with rcu_read_lock() */
2503static struct rtable *__mkroute_output(const struct fib_result *res,
2504				       const struct flowi4 *fl4, int orig_oif,
2505				       struct net_device *dev_out,
2506				       unsigned int flags)
2507{
2508	struct fib_info *fi = res->fi;
2509	struct fib_nh_exception *fnhe;
2510	struct in_device *in_dev;
2511	u16 type = res->type;
2512	struct rtable *rth;
2513	bool do_cache;
2514
2515	in_dev = __in_dev_get_rcu(dev_out);
2516	if (!in_dev)
2517		return ERR_PTR(-EINVAL);
2518
2519	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2520		if (ipv4_is_loopback(fl4->saddr) &&
2521		    !(dev_out->flags & IFF_LOOPBACK) &&
2522		    !netif_is_l3_master(dev_out))
2523			return ERR_PTR(-EINVAL);
2524
2525	if (ipv4_is_lbcast(fl4->daddr))
2526		type = RTN_BROADCAST;
2527	else if (ipv4_is_multicast(fl4->daddr))
2528		type = RTN_MULTICAST;
2529	else if (ipv4_is_zeronet(fl4->daddr))
2530		return ERR_PTR(-EINVAL);
2531
2532	if (dev_out->flags & IFF_LOOPBACK)
2533		flags |= RTCF_LOCAL;
2534
2535	do_cache = true;
2536	if (type == RTN_BROADCAST) {
2537		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2538		fi = NULL;
2539	} else if (type == RTN_MULTICAST) {
2540		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2541		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2542				     fl4->flowi4_proto))
2543			flags &= ~RTCF_LOCAL;
2544		else
2545			do_cache = false;
2546		/* If a multicast route does not exist, use the
2547		 * default one, but do not gateway in this case.
2548		 * Yes, it is a hack.
2549		 */
2550		if (fi && res->prefixlen < 4)
2551			fi = NULL;
2552	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2553		   (orig_oif != dev_out->ifindex)) {
2554		/* For local routes that require a particular output interface
2555		 * we do not want to cache the result.  Caching the result
2556		 * causes incorrect behaviour when there are multiple source
2557		 * addresses on the interface, the end result being that if the
2558		 * intended recipient is waiting on that interface for the
2559		 * packet he won't receive it because it will be delivered on
2560		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2561		 * be set to the loopback interface as well.
2562		 */
2563		do_cache = false;
2564	}
2565
2566	fnhe = NULL;
2567	do_cache &= fi != NULL;
2568	if (fi) {
2569		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2570		struct rtable __rcu **prth;
2571
2572		fnhe = find_exception(nhc, fl4->daddr);
2573		if (!do_cache)
2574			goto add;
2575		if (fnhe) {
2576			prth = &fnhe->fnhe_rth_output;
2577		} else {
2578			if (unlikely(fl4->flowi4_flags &
2579				     FLOWI_FLAG_KNOWN_NH &&
2580				     !(nhc->nhc_gw_family &&
2581				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2582				do_cache = false;
2583				goto add;
2584			}
2585			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2586		}
2587		rth = rcu_dereference(*prth);
2588		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2589			return rth;
2590	}
2591
2592add:
2593	rth = rt_dst_alloc(dev_out, flags, type,
2594			   IN_DEV_ORCONF(in_dev, NOXFRM));
2595	if (!rth)
2596		return ERR_PTR(-ENOBUFS);
2597
2598	rth->rt_iif = orig_oif;
2599
2600	RT_CACHE_STAT_INC(out_slow_tot);
2601
2602	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2603		if (flags & RTCF_LOCAL &&
2604		    !(dev_out->flags & IFF_LOOPBACK)) {
2605			rth->dst.output = ip_mc_output;
2606			RT_CACHE_STAT_INC(out_slow_mc);
2607		}
2608#ifdef CONFIG_IP_MROUTE
2609		if (type == RTN_MULTICAST) {
2610			if (IN_DEV_MFORWARD(in_dev) &&
2611			    !ipv4_is_local_multicast(fl4->daddr)) {
2612				rth->dst.input = ip_mr_input;
2613				rth->dst.output = ip_mc_output;
2614			}
2615		}
2616#endif
2617	}
2618
2619	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2620	lwtunnel_set_redirect(&rth->dst);
2621
2622	return rth;
2623}
2624
2625/*
2626 * Major route resolver routine.
2627 */
2628
2629struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2630					const struct sk_buff *skb)
2631{
2632	struct fib_result res = {
2633		.type		= RTN_UNSPEC,
2634		.fi		= NULL,
2635		.table		= NULL,
2636		.tclassid	= 0,
2637	};
2638	struct rtable *rth;
2639
2640	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2641	ip_rt_fix_tos(fl4);
2642
2643	rcu_read_lock();
2644	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2645	rcu_read_unlock();
2646
2647	return rth;
2648}
2649EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
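/* Illustrative note (not from the original file): most callers reach this
 * through thin wrappers such as __ip_route_output_key() in
 * include/net/route.h, which simply pass a NULL skb, roughly:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_oif	= oif,
 *		.flowi4_tos	= tos,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */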
2650
2651struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2652					    struct fib_result *res,
2653					    const struct sk_buff *skb)
2654{
2655	struct net_device *dev_out = NULL;
2656	int orig_oif = fl4->flowi4_oif;
2657	unsigned int flags = 0;
2658	struct rtable *rth;
2659	int err;
2660
2661	if (fl4->saddr) {
2662		if (ipv4_is_multicast(fl4->saddr) ||
2663		    ipv4_is_lbcast(fl4->saddr) ||
2664		    ipv4_is_zeronet(fl4->saddr)) {
2665			rth = ERR_PTR(-EINVAL);
2666			goto out;
2667		}
2668
2669		rth = ERR_PTR(-ENETUNREACH);
2670
2671		/* I removed the check for oif == dev_out->oif here.
2672		 * It was wrong for two reasons:
2673		 * 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2674		 *    is assigned to multiple interfaces.
2675		 * 2. Moreover, we are allowed to send packets with the saddr
2676		 *    of another iface. --ANK
2677		 */
2678
2679		if (fl4->flowi4_oif == 0 &&
2680		    (ipv4_is_multicast(fl4->daddr) ||
2681		     ipv4_is_lbcast(fl4->daddr))) {
2682			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2683			dev_out = __ip_dev_find(net, fl4->saddr, false);
2684			if (!dev_out)
2685				goto out;
2686
2687			/* Special hack: the user can direct multicasts
2688			 * and limited broadcast via the necessary interface
2689			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2690			 * This hack is not just for fun, it allows
2691			 * vic, vat and friends to work.
2692			 * They bind a socket to loopback, set ttl to zero
2693			 * and expect that it will work.
2694			 * From the viewpoint of the routing cache they are broken,
2695			 * because we are not allowed to build a multicast path
2696			 * with a loopback source addr (the routing cache
2697			 * cannot know that ttl is zero, so the packet
2698			 * will not leave this host and the route is valid).
2699			 * Luckily, this hack is a good workaround.
2700			 */
2701
2702			fl4->flowi4_oif = dev_out->ifindex;
2703			goto make_route;
2704		}
2705
2706		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2707			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2708			if (!__ip_dev_find(net, fl4->saddr, false))
2709				goto out;
2710		}
2711	}
2712
2713
2714	if (fl4->flowi4_oif) {
2715		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2716		rth = ERR_PTR(-ENODEV);
2717		if (!dev_out)
2718			goto out;
2719
2720		/* RACE: Check return value of inet_select_addr instead. */
2721		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2722			rth = ERR_PTR(-ENETUNREACH);
2723			goto out;
2724		}
2725		if (ipv4_is_local_multicast(fl4->daddr) ||
2726		    ipv4_is_lbcast(fl4->daddr) ||
2727		    fl4->flowi4_proto == IPPROTO_IGMP) {
2728			if (!fl4->saddr)
2729				fl4->saddr = inet_select_addr(dev_out, 0,
2730							      RT_SCOPE_LINK);
2731			goto make_route;
2732		}
2733		if (!fl4->saddr) {
2734			if (ipv4_is_multicast(fl4->daddr))
2735				fl4->saddr = inet_select_addr(dev_out, 0,
2736							      fl4->flowi4_scope);
2737			else if (!fl4->daddr)
2738				fl4->saddr = inet_select_addr(dev_out, 0,
2739							      RT_SCOPE_HOST);
2740		}
2741	}
2742
2743	if (!fl4->daddr) {
2744		fl4->daddr = fl4->saddr;
2745		if (!fl4->daddr)
2746			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2747		dev_out = net->loopback_dev;
2748		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2749		res->type = RTN_LOCAL;
2750		flags |= RTCF_LOCAL;
2751		goto make_route;
2752	}
2753
2754	err = fib_lookup(net, fl4, res, 0);
2755	if (err) {
2756		res->fi = NULL;
2757		res->table = NULL;
2758		if (fl4->flowi4_oif &&
2759		    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
2760			/* Apparently, the routing tables are wrong. Assume
2761			 * that the destination is on-link.
2762			 *
2763			 * WHY? DW.
2764			 * Because we are allowed to send to an iface
2765			 * even if it has NO routes and NO assigned
2766			 * addresses. When oif is specified, the routing
2767			 * tables are looked up with only one purpose:
2768			 * to check whether the destination is gatewayed, rather than
2769			 * direct. Moreover, if MSG_DONTROUTE is set,
2770			 * we send the packet, ignoring both routing tables
2771			 * and ifaddr state. --ANK
2772			 *
2773			 *
2774			 * We could do this even if oif is unknown,
2775			 * likely IPv6, but we do not.
2776			 */
2777
2778			if (fl4->saddr == 0)
2779				fl4->saddr = inet_select_addr(dev_out, 0,
2780							      RT_SCOPE_LINK);
2781			res->type = RTN_UNICAST;
2782			goto make_route;
2783		}
2784		rth = ERR_PTR(err);
2785		goto out;
2786	}
2787
2788	if (res->type == RTN_LOCAL) {
2789		if (!fl4->saddr) {
2790			if (res->fi->fib_prefsrc)
2791				fl4->saddr = res->fi->fib_prefsrc;
2792			else
2793				fl4->saddr = fl4->daddr;
2794		}
2795
2796		/* L3 master device is the loopback for that domain */
2797		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2798			net->loopback_dev;
2799
2800		/* make sure orig_oif points to fib result device even
2801		 * though packet rx/tx happens over loopback or l3mdev
2802		 */
2803		orig_oif = FIB_RES_OIF(*res);
2804
2805		fl4->flowi4_oif = dev_out->ifindex;
2806		flags |= RTCF_LOCAL;
2807		goto make_route;
2808	}
2809
2810	fib_select_path(net, res, fl4, skb);
2811
2812	dev_out = FIB_RES_DEV(*res);
2813
2814make_route:
2815	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2816
2817out:
2818	return rth;
2819}
2820
2821static struct dst_ops ipv4_dst_blackhole_ops = {
2822	.family			= AF_INET,
2823	.default_advmss		= ipv4_default_advmss,
2824	.neigh_lookup		= ipv4_neigh_lookup,
2825	.check			= dst_blackhole_check,
2826	.cow_metrics		= dst_blackhole_cow_metrics,
2827	.update_pmtu		= dst_blackhole_update_pmtu,
2828	.redirect		= dst_blackhole_redirect,
2829	.mtu			= dst_blackhole_mtu,
2830};
2831
2832struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2833{
2834	struct rtable *ort = (struct rtable *) dst_orig;
2835	struct rtable *rt;
2836
2837	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
2838	if (rt) {
2839		struct dst_entry *new = &rt->dst;
2840
2841		new->__use = 1;
2842		new->input = dst_discard;
2843		new->output = dst_discard_out;
2844
2845		new->dev = net->loopback_dev;
2846		netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
2847
2848		rt->rt_is_input = ort->rt_is_input;
2849		rt->rt_iif = ort->rt_iif;
2850		rt->rt_pmtu = ort->rt_pmtu;
2851		rt->rt_mtu_locked = ort->rt_mtu_locked;
2852
2853		rt->rt_genid = rt_genid_ipv4(net);
2854		rt->rt_flags = ort->rt_flags;
2855		rt->rt_type = ort->rt_type;
2856		rt->rt_uses_gateway = ort->rt_uses_gateway;
2857		rt->rt_gw_family = ort->rt_gw_family;
2858		if (rt->rt_gw_family == AF_INET)
2859			rt->rt_gw4 = ort->rt_gw4;
2860		else if (rt->rt_gw_family == AF_INET6)
2861			rt->rt_gw6 = ort->rt_gw6;
2862	}
2863
2864	dst_release(dst_orig);
2865
2866	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2867}
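/* Illustrative note (assumption, not from the original source): because both
 * new->input and new->output above are the dst_discard()/dst_discard_out()
 * stubs, any packet routed through such a "blackhole" dst is silently dropped;
 * the xfrm code hands these out while an IPsec state is still being resolved,
 * so traffic stalls instead of escaping in the clear.
 */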
2868
2869struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2870				    const struct sock *sk)
2871{
2872	struct rtable *rt = __ip_route_output_key(net, flp4);
2873
2874	if (IS_ERR(rt))
2875		return rt;
2876
2877	if (flp4->flowi4_proto) {
2878		flp4->flowi4_oif = rt->dst.dev->ifindex;
2879		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2880							flowi4_to_flowi(flp4),
2881							sk, 0);
2882	}
2883
2884	return rt;
2885}
2886EXPORT_SYMBOL_GPL(ip_route_output_flow);
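/* Illustrative usage sketch (not part of the original source): connected
 * sockets typically build the flow from their addressing state and let the
 * xfrm hook above substitute an IPsec bundle when a matching policy exists,
 * roughly:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 *			   IPPROTO_TCP, 0, daddr, saddr, dport, sport,
 *			   sk->sk_uid);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */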
2887
2888/* called with rcu_read_lock held */
2889static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2890			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2891			struct sk_buff *skb, u32 portid, u32 seq,
2892			unsigned int flags)
2893{
2894	struct rtmsg *r;
2895	struct nlmsghdr *nlh;
2896	unsigned long expires = 0;
2897	u32 error;
2898	u32 metrics[RTAX_MAX];
2899
2900	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2901	if (!nlh)
2902		return -EMSGSIZE;
2903
2904	r = nlmsg_data(nlh);
2905	r->rtm_family	 = AF_INET;
2906	r->rtm_dst_len	= 32;
2907	r->rtm_src_len	= 0;
2908	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2909	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2910	if (nla_put_u32(skb, RTA_TABLE, table_id))
2911		goto nla_put_failure;
2912	r->rtm_type	= rt->rt_type;
2913	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2914	r->rtm_protocol = RTPROT_UNSPEC;
2915	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2916	if (rt->rt_flags & RTCF_NOTIFY)
2917		r->rtm_flags |= RTM_F_NOTIFY;
2918	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2919		r->rtm_flags |= RTCF_DOREDIRECT;
2920
2921	if (nla_put_in_addr(skb, RTA_DST, dst))
2922		goto nla_put_failure;
2923	if (src) {
2924		r->rtm_src_len = 32;
2925		if (nla_put_in_addr(skb, RTA_SRC, src))
2926			goto nla_put_failure;
2927	}
2928	if (rt->dst.dev &&
2929	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2930		goto nla_put_failure;
2931	if (rt->dst.lwtstate &&
2932	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2933		goto nla_put_failure;
2934#ifdef CONFIG_IP_ROUTE_CLASSID
2935	if (rt->dst.tclassid &&
2936	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2937		goto nla_put_failure;
2938#endif
2939	if (fl4 && !rt_is_input_route(rt) &&
2940	    fl4->saddr != src) {
2941		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2942			goto nla_put_failure;
2943	}
2944	if (rt->rt_uses_gateway) {
2945		if (rt->rt_gw_family == AF_INET &&
2946		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2947			goto nla_put_failure;
2948		} else if (rt->rt_gw_family == AF_INET6) {
2949			int alen = sizeof(struct in6_addr);
2950			struct nlattr *nla;
2951			struct rtvia *via;
2952
2953			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2954			if (!nla)
2955				goto nla_put_failure;
2956
2957			via = nla_data(nla);
2958			via->rtvia_family = AF_INET6;
2959			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2960		}
2961	}
2962
2963	expires = rt->dst.expires;
2964	if (expires) {
2965		unsigned long now = jiffies;
2966
2967		if (time_before(now, expires))
2968			expires -= now;
2969		else
2970			expires = 0;
2971	}
2972
2973	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2974	if (rt->rt_pmtu && expires)
2975		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2976	if (rt->rt_mtu_locked && expires)
2977		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2978	if (rtnetlink_put_metrics(skb, metrics) < 0)
2979		goto nla_put_failure;
2980
2981	if (fl4) {
2982		if (fl4->flowi4_mark &&
2983		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2984			goto nla_put_failure;
2985
2986		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2987		    nla_put_u32(skb, RTA_UID,
2988				from_kuid_munged(current_user_ns(),
2989						 fl4->flowi4_uid)))
2990			goto nla_put_failure;
2991
2992		if (rt_is_input_route(rt)) {
2993#ifdef CONFIG_IP_MROUTE
2994			if (ipv4_is_multicast(dst) &&
2995			    !ipv4_is_local_multicast(dst) &&
2996			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2997				int err = ipmr_get_route(net, skb,
2998							 fl4->saddr, fl4->daddr,
2999							 r, portid);
3000
3001				if (err <= 0) {
3002					if (err == 0)
3003						return 0;
3004					goto nla_put_failure;
3005				}
3006			} else
3007#endif
3008				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3009					goto nla_put_failure;
3010		}
3011	}
3012
3013	error = rt->dst.error;
3014
3015	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3016		goto nla_put_failure;
3017
3018	nlmsg_end(skb, nlh);
3019	return 0;
3020
3021nla_put_failure:
3022	nlmsg_cancel(skb, nlh);
3023	return -EMSGSIZE;
3024}
3025
3026static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3027			    struct netlink_callback *cb, u32 table_id,
3028			    struct fnhe_hash_bucket *bucket, int genid,
3029			    int *fa_index, int fa_start, unsigned int flags)
3030{
3031	int i;
3032
3033	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3034		struct fib_nh_exception *fnhe;
3035
3036		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3037		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3038			struct rtable *rt;
3039			int err;
3040
3041			if (*fa_index < fa_start)
3042				goto next;
3043
3044			if (fnhe->fnhe_genid != genid)
3045				goto next;
3046
3047			if (fnhe->fnhe_expires &&
3048			    time_after(jiffies, fnhe->fnhe_expires))
3049				goto next;
3050
3051			rt = rcu_dereference(fnhe->fnhe_rth_input);
3052			if (!rt)
3053				rt = rcu_dereference(fnhe->fnhe_rth_output);
3054			if (!rt)
3055				goto next;
3056
3057			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3058					   table_id, NULL, skb,
3059					   NETLINK_CB(cb->skb).portid,
3060					   cb->nlh->nlmsg_seq, flags);
3061			if (err)
3062				return err;
3063next:
3064			(*fa_index)++;
3065		}
3066	}
3067
3068	return 0;
3069}
3070
3071int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3072		       u32 table_id, struct fib_info *fi,
3073		       int *fa_index, int fa_start, unsigned int flags)
3074{
3075	struct net *net = sock_net(cb->skb->sk);
3076	int nhsel, genid = fnhe_genid(net);
3077
3078	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3079		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3080		struct fnhe_hash_bucket *bucket;
3081		int err;
3082
3083		if (nhc->nhc_flags & RTNH_F_DEAD)
3084			continue;
3085
3086		rcu_read_lock();
3087		bucket = rcu_dereference(nhc->nhc_exceptions);
3088		err = 0;
3089		if (bucket)
3090			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3091					       genid, fa_index, fa_start,
3092					       flags);
3093		rcu_read_unlock();
3094		if (err)
3095			return err;
3096	}
3097
3098	return 0;
3099}
3100
3101static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3102						   u8 ip_proto, __be16 sport,
3103						   __be16 dport)
3104{
3105	struct sk_buff *skb;
3106	struct iphdr *iph;
3107
3108	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3109	if (!skb)
3110		return NULL;
3111
3112	/* Reserve room for dummy headers; this skb can pass
3113	 * through a good chunk of the routing engine.
3114	 */
3115	skb_reset_mac_header(skb);
3116	skb_reset_network_header(skb);
3117	skb->protocol = htons(ETH_P_IP);
3118	iph = skb_put(skb, sizeof(struct iphdr));
3119	iph->protocol = ip_proto;
3120	iph->saddr = src;
3121	iph->daddr = dst;
3122	iph->version = 0x4;
3123	iph->frag_off = 0;
3124	iph->ihl = 0x5;
3125	skb_set_transport_header(skb, skb->len);
3126
3127	switch (iph->protocol) {
3128	case IPPROTO_UDP: {
3129		struct udphdr *udph;
3130
3131		udph = skb_put_zero(skb, sizeof(struct udphdr));
3132		udph->source = sport;
3133		udph->dest = dport;
3134		udph->len = htons(sizeof(struct udphdr));
3135		udph->check = 0;
3136		break;
3137	}
3138	case IPPROTO_TCP: {
3139		struct tcphdr *tcph;
3140
3141		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3142		tcph->source	= sport;
3143		tcph->dest	= dport;
3144		tcph->doff	= sizeof(struct tcphdr) / 4;
3145		tcph->rst = 1;
3146		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3147					    src, dst, 0);
3148		break;
3149	}
3150	case IPPROTO_ICMP: {
3151		struct icmphdr *icmph;
3152
3153		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3154		icmph->type = ICMP_ECHO;
3155		icmph->code = 0;
3156	}
3157	}
3158
3159	return skb;
3160}
3161
3162static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3163				       const struct nlmsghdr *nlh,
3164				       struct nlattr **tb,
3165				       struct netlink_ext_ack *extack)
3166{
3167	struct rtmsg *rtm;
3168	int i, err;
3169
3170	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3171		NL_SET_ERR_MSG(extack,
3172			       "ipv4: Invalid header for route get request");
3173		return -EINVAL;
3174	}
3175
3176	if (!netlink_strict_get_check(skb))
3177		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3178					      rtm_ipv4_policy, extack);
3179
3180	rtm = nlmsg_data(nlh);
3181	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3182	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3183	    rtm->rtm_table || rtm->rtm_protocol ||
3184	    rtm->rtm_scope || rtm->rtm_type) {
3185		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3186		return -EINVAL;
3187	}
3188
3189	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3190			       RTM_F_LOOKUP_TABLE |
3191			       RTM_F_FIB_MATCH)) {
3192		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3193		return -EINVAL;
3194	}
3195
3196	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3197					    rtm_ipv4_policy, extack);
3198	if (err)
3199		return err;
3200
3201	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3202	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3203		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3204		return -EINVAL;
3205	}
3206
3207	for (i = 0; i <= RTA_MAX; i++) {
3208		if (!tb[i])
3209			continue;
3210
3211		switch (i) {
3212		case RTA_IIF:
3213		case RTA_OIF:
3214		case RTA_SRC:
3215		case RTA_DST:
3216		case RTA_IP_PROTO:
3217		case RTA_SPORT:
3218		case RTA_DPORT:
3219		case RTA_MARK:
3220		case RTA_UID:
3221			break;
3222		default:
3223			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3224			return -EINVAL;
3225		}
3226	}
3227
3228	return 0;
3229}
3230
3231static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3232			     struct netlink_ext_ack *extack)
3233{
3234	struct net *net = sock_net(in_skb->sk);
3235	struct nlattr *tb[RTA_MAX+1];
3236	u32 table_id = RT_TABLE_MAIN;
3237	__be16 sport = 0, dport = 0;
3238	struct fib_result res = {};
3239	u8 ip_proto = IPPROTO_UDP;
3240	struct rtable *rt = NULL;
3241	struct sk_buff *skb;
3242	struct rtmsg *rtm;
3243	struct flowi4 fl4 = {};
3244	__be32 dst = 0;
3245	__be32 src = 0;
3246	kuid_t uid;
3247	u32 iif;
3248	int err;
3249	int mark;
3250
3251	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3252	if (err < 0)
3253		return err;
3254
3255	rtm = nlmsg_data(nlh);
3256	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3257	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3258	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3259	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3260	if (tb[RTA_UID])
3261		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3262	else
3263		uid = (iif ? INVALID_UID : current_uid());
3264
3265	if (tb[RTA_IP_PROTO]) {
3266		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3267						  &ip_proto, AF_INET, extack);
3268		if (err)
3269			return err;
3270	}
3271
3272	if (tb[RTA_SPORT])
3273		sport = nla_get_be16(tb[RTA_SPORT]);
3274
3275	if (tb[RTA_DPORT])
3276		dport = nla_get_be16(tb[RTA_DPORT]);
3277
3278	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3279	if (!skb)
3280		return -ENOBUFS;
3281
3282	fl4.daddr = dst;
3283	fl4.saddr = src;
3284	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3285	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3286	fl4.flowi4_mark = mark;
3287	fl4.flowi4_uid = uid;
3288	if (sport)
3289		fl4.fl4_sport = sport;
3290	if (dport)
3291		fl4.fl4_dport = dport;
3292	fl4.flowi4_proto = ip_proto;
3293
3294	rcu_read_lock();
3295
3296	if (iif) {
3297		struct net_device *dev;
3298
3299		dev = dev_get_by_index_rcu(net, iif);
3300		if (!dev) {
3301			err = -ENODEV;
3302			goto errout_rcu;
3303		}
3304
3305		fl4.flowi4_iif = iif; /* for rt_fill_info */
3306		skb->dev	= dev;
3307		skb->mark	= mark;
3308		err = ip_route_input_rcu(skb, dst, src,
3309					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3310					 &res);
3311
3312		rt = skb_rtable(skb);
3313		if (err == 0 && rt->dst.error)
3314			err = -rt->dst.error;
3315	} else {
3316		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3317		skb->dev = net->loopback_dev;
3318		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3319		err = 0;
3320		if (IS_ERR(rt))
3321			err = PTR_ERR(rt);
3322		else
3323			skb_dst_set(skb, &rt->dst);
3324	}
3325
3326	if (err)
3327		goto errout_rcu;
3328
3329	if (rtm->rtm_flags & RTM_F_NOTIFY)
3330		rt->rt_flags |= RTCF_NOTIFY;
3331
3332	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3333		table_id = res.table ? res.table->tb_id : 0;
3334
3335	/* reset skb for netlink reply msg */
3336	skb_trim(skb, 0);
3337	skb_reset_network_header(skb);
3338	skb_reset_transport_header(skb);
3339	skb_reset_mac_header(skb);
3340
3341	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3342		struct fib_rt_info fri;
3343
3344		if (!res.fi) {
3345			err = fib_props[res.type].error;
3346			if (!err)
3347				err = -EHOSTUNREACH;
3348			goto errout_rcu;
3349		}
3350		fri.fi = res.fi;
3351		fri.tb_id = table_id;
3352		fri.dst = res.prefix;
3353		fri.dst_len = res.prefixlen;
3354		fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
3355		fri.type = rt->rt_type;
3356		fri.offload = 0;
3357		fri.trap = 0;
3358		fri.offload_failed = 0;
3359		if (res.fa_head) {
3360			struct fib_alias *fa;
3361
3362			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3363				u8 slen = 32 - fri.dst_len;
3364
3365				if (fa->fa_slen == slen &&
3366				    fa->tb_id == fri.tb_id &&
3367				    fa->fa_dscp == fri.dscp &&
3368				    fa->fa_info == res.fi &&
3369				    fa->fa_type == fri.type) {
3370					fri.offload = READ_ONCE(fa->offload);
3371					fri.trap = READ_ONCE(fa->trap);
3372					fri.offload_failed =
3373						READ_ONCE(fa->offload_failed);
3374					break;
3375				}
3376			}
3377		}
3378		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3379				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3380	} else {
3381		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3382				   NETLINK_CB(in_skb).portid,
3383				   nlh->nlmsg_seq, 0);
3384	}
3385	if (err < 0)
3386		goto errout_rcu;
3387
3388	rcu_read_unlock();
3389
3390	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3391
3392errout_free:
3393	return err;
3394errout_rcu:
3395	rcu_read_unlock();
3396	kfree_skb(skb);
3397	goto errout_free;
3398}
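/* Illustrative note (not from the original file): this handler is what
 * services "ip route get"; e.g. a request such as
 *
 *	ip route get 198.51.100.7 from 192.0.2.1 iif eth0 mark 0x10
 *
 * (interface name hypothetical) arrives as an RTM_GETROUTE message carrying
 * RTA_DST, RTA_SRC, RTA_IIF and RTA_MARK attributes, and the reply is built by
 * rt_fill_info() above, or by fib_dump_info() when RTM_F_FIB_MATCH is set.
 */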
3399
3400void ip_rt_multicast_event(struct in_device *in_dev)
3401{
3402	rt_cache_flush(dev_net(in_dev->dev));
3403}
3404
3405#ifdef CONFIG_SYSCTL
3406static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3407static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3408static int ip_rt_gc_elasticity __read_mostly	= 8;
3409static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3410
3411static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3412		void *buffer, size_t *lenp, loff_t *ppos)
3413{
3414	struct net *net = (struct net *)__ctl->extra1;
3415
3416	if (write) {
3417		rt_cache_flush(net);
3418		fnhe_genid_bump(net);
3419		return 0;
3420	}
3421
3422	return -EINVAL;
3423}
3424
3425static struct ctl_table ipv4_route_table[] = {
3426	{
3427		.procname	= "gc_thresh",
3428		.data		= &ipv4_dst_ops.gc_thresh,
3429		.maxlen		= sizeof(int),
3430		.mode		= 0644,
3431		.proc_handler	= proc_dointvec,
3432	},
3433	{
3434		.procname	= "max_size",
3435		.data		= &ip_rt_max_size,
3436		.maxlen		= sizeof(int),
3437		.mode		= 0644,
3438		.proc_handler	= proc_dointvec,
3439	},
3440	{
3441		/*  Deprecated. Use gc_min_interval_ms */
3442
3443		.procname	= "gc_min_interval",
3444		.data		= &ip_rt_gc_min_interval,
3445		.maxlen		= sizeof(int),
3446		.mode		= 0644,
3447		.proc_handler	= proc_dointvec_jiffies,
3448	},
3449	{
3450		.procname	= "gc_min_interval_ms",
3451		.data		= &ip_rt_gc_min_interval,
3452		.maxlen		= sizeof(int),
3453		.mode		= 0644,
3454		.proc_handler	= proc_dointvec_ms_jiffies,
3455	},
3456	{
3457		.procname	= "gc_timeout",
3458		.data		= &ip_rt_gc_timeout,
3459		.maxlen		= sizeof(int),
3460		.mode		= 0644,
3461		.proc_handler	= proc_dointvec_jiffies,
3462	},
3463	{
3464		.procname	= "gc_interval",
3465		.data		= &ip_rt_gc_interval,
3466		.maxlen		= sizeof(int),
3467		.mode		= 0644,
3468		.proc_handler	= proc_dointvec_jiffies,
3469	},
3470	{
3471		.procname	= "redirect_load",
3472		.data		= &ip_rt_redirect_load,
3473		.maxlen		= sizeof(int),
3474		.mode		= 0644,
3475		.proc_handler	= proc_dointvec,
3476	},
3477	{
3478		.procname	= "redirect_number",
3479		.data		= &ip_rt_redirect_number,
3480		.maxlen		= sizeof(int),
3481		.mode		= 0644,
3482		.proc_handler	= proc_dointvec,
3483	},
3484	{
3485		.procname	= "redirect_silence",
3486		.data		= &ip_rt_redirect_silence,
3487		.maxlen		= sizeof(int),
3488		.mode		= 0644,
3489		.proc_handler	= proc_dointvec,
3490	},
3491	{
3492		.procname	= "error_cost",
3493		.data		= &ip_rt_error_cost,
3494		.maxlen		= sizeof(int),
3495		.mode		= 0644,
3496		.proc_handler	= proc_dointvec,
3497	},
3498	{
3499		.procname	= "error_burst",
3500		.data		= &ip_rt_error_burst,
3501		.maxlen		= sizeof(int),
3502		.mode		= 0644,
3503		.proc_handler	= proc_dointvec,
3504	},
3505	{
3506		.procname	= "gc_elasticity",
3507		.data		= &ip_rt_gc_elasticity,
3508		.maxlen		= sizeof(int),
3509		.mode		= 0644,
3510		.proc_handler	= proc_dointvec,
3511	},
3512	{ }
3513};
3514
3515static const char ipv4_route_flush_procname[] = "flush";
3516
3517static struct ctl_table ipv4_route_netns_table[] = {
3518	{
3519		.procname	= ipv4_route_flush_procname,
3520		.maxlen		= sizeof(int),
3521		.mode		= 0200,
3522		.proc_handler	= ipv4_sysctl_rtcache_flush,
3523	},
3524	{
3525		.procname       = "min_pmtu",
3526		.data           = &init_net.ipv4.ip_rt_min_pmtu,
3527		.maxlen         = sizeof(int),
3528		.mode           = 0644,
3529		.proc_handler   = proc_dointvec_minmax,
3530		.extra1         = &ip_min_valid_pmtu,
3531	},
3532	{
3533		.procname       = "mtu_expires",
3534		.data           = &init_net.ipv4.ip_rt_mtu_expires,
3535		.maxlen         = sizeof(int),
3536		.mode           = 0644,
3537		.proc_handler   = proc_dointvec_jiffies,
3538	},
3539	{
3540		.procname   = "min_adv_mss",
3541		.data       = &init_net.ipv4.ip_rt_min_advmss,
3542		.maxlen     = sizeof(int),
3543		.mode       = 0644,
3544		.proc_handler   = proc_dointvec,
3545	},
3546	{ },
3547};
3548
3549static __net_init int sysctl_route_net_init(struct net *net)
3550{
3551	struct ctl_table *tbl;
3552	size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
3553
3554	tbl = ipv4_route_netns_table;
3555	if (!net_eq(net, &init_net)) {
3556		int i;
3557
3558		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
3559		if (!tbl)
3560			goto err_dup;
3561
3562		/* Don't export non-whitelisted sysctls to unprivileged users */
3563		if (net->user_ns != &init_user_ns) {
3564			if (tbl[0].procname != ipv4_route_flush_procname) {
3565				tbl[0].procname = NULL;
3566				table_size = 0;
3567			}
3568		}
3569
3570		/* Update the variables to point into the current struct net,
3571		 * except for the first element, flush.
3572		 */
3573		for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
3574			tbl[i].data += (void *)net - (void *)&init_net;
3575	}
3576	tbl[0].extra1 = net;
3577
3578	net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
3579						     tbl, table_size);
3580	if (!net->ipv4.route_hdr)
3581		goto err_reg;
3582	return 0;
3583
3584err_reg:
3585	if (tbl != ipv4_route_netns_table)
3586		kfree(tbl);
3587err_dup:
3588	return -ENOMEM;
3589}
3590
3591static __net_exit void sysctl_route_net_exit(struct net *net)
3592{
3593	struct ctl_table *tbl;
3594
3595	tbl = net->ipv4.route_hdr->ctl_table_arg;
3596	unregister_net_sysctl_table(net->ipv4.route_hdr);
3597	BUG_ON(tbl == ipv4_route_netns_table);
3598	kfree(tbl);
3599}
3600
3601static __net_initdata struct pernet_operations sysctl_route_ops = {
3602	.init = sysctl_route_net_init,
3603	.exit = sysctl_route_net_exit,
3604};
3605#endif
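/* Illustrative note (not part of the original source): the tables above are
 * exposed under /proc/sys/net/ipv4/route/, so for example
 *
 *	sysctl -w net.ipv4.route.flush=1	# invokes ipv4_sysctl_rtcache_flush()
 *	sysctl net.ipv4.route.min_pmtu		# per-netns, defaults to DEFAULT_MIN_PMTU
 *
 * subject to the usual sysctl permissions shown in each entry's .mode.
 */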
3606
3607static __net_init int netns_ip_rt_init(struct net *net)
3608{
3609	/* Set default value for namespaceified sysctls */
3610	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
3611	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
3612	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
3613	return 0;
3614}
3615
3616static struct pernet_operations __net_initdata ip_rt_ops = {
3617	.init = netns_ip_rt_init,
3618};
3619
3620static __net_init int rt_genid_init(struct net *net)
3621{
3622	atomic_set(&net->ipv4.rt_genid, 0);
3623	atomic_set(&net->fnhe_genid, 0);
3624	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
3625	return 0;
3626}
3627
3628static __net_initdata struct pernet_operations rt_genid_ops = {
3629	.init = rt_genid_init,
3630};
3631
3632static int __net_init ipv4_inetpeer_init(struct net *net)
3633{
3634	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3635
3636	if (!bp)
3637		return -ENOMEM;
3638	inet_peer_base_init(bp);
3639	net->ipv4.peers = bp;
3640	return 0;
3641}
3642
3643static void __net_exit ipv4_inetpeer_exit(struct net *net)
3644{
3645	struct inet_peer_base *bp = net->ipv4.peers;
3646
3647	net->ipv4.peers = NULL;
3648	inetpeer_invalidate_tree(bp);
3649	kfree(bp);
3650}
3651
3652static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3653	.init	=	ipv4_inetpeer_init,
3654	.exit	=	ipv4_inetpeer_exit,
3655};
3656
3657#ifdef CONFIG_IP_ROUTE_CLASSID
3658struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3659#endif /* CONFIG_IP_ROUTE_CLASSID */
3660
3661int __init ip_rt_init(void)
3662{
3663	void *idents_hash;
3664	int cpu;
3665
3666	/* For modern hosts, this will use 2 MB of memory */
3667	idents_hash = alloc_large_system_hash("IP idents",
3668					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3669					      0,
3670					      16, /* one bucket per 64 KB */
3671					      HASH_ZERO,
3672					      NULL,
3673					      &ip_idents_mask,
3674					      2048,
3675					      256*1024);
3676
3677	ip_idents = idents_hash;
3678
3679	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3680
3681	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3682
3683	for_each_possible_cpu(cpu) {
3684		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3685
3686		INIT_LIST_HEAD(&ul->head);
3687		INIT_LIST_HEAD(&ul->quarantine);
3688		spin_lock_init(&ul->lock);
3689	}
3690#ifdef CONFIG_IP_ROUTE_CLASSID
3691	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3692	if (!ip_rt_acct)
3693		panic("IP: failed to allocate ip_rt_acct\n");
3694#endif
3695
3696	ipv4_dst_ops.kmem_cachep =
3697		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3698				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3699
3700	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3701
3702	if (dst_entries_init(&ipv4_dst_ops) < 0)
3703		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3704
3705	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3706		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3707
3708	ipv4_dst_ops.gc_thresh = ~0;
3709	ip_rt_max_size = INT_MAX;
3710
3711	devinet_init();
3712	ip_fib_init();
3713
3714	if (ip_rt_proc_init())
3715		pr_err("Unable to create route proc files\n");
3716#ifdef CONFIG_XFRM
3717	xfrm_init();
3718	xfrm4_init();
3719#endif
3720	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3721		      RTNL_FLAG_DOIT_UNLOCKED);
3722
3723#ifdef CONFIG_SYSCTL
3724	register_pernet_subsys(&sysctl_route_ops);
3725#endif
3726	register_pernet_subsys(&ip_rt_ops);
3727	register_pernet_subsys(&rt_genid_ops);
3728	register_pernet_subsys(&ipv4_inetpeer_ops);
3729	return 0;
3730}
3731
3732#ifdef CONFIG_SYSCTL
3733/*
3734 * We really need to sanitize the damn ipv4 init order, then all
3735 * this nonsense will go away.
3736 */
3737void __init ip_static_sysctl_init(void)
3738{
3739	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3740}
3741#endif