   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		ROUTE - implementation of the IP router.
   8 *
   9 * Authors:	Ross Biro
  10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14 *
  15 * Fixes:
  16 *		Alan Cox	:	Verify area fixes.
  17 *		Alan Cox	:	cli() protects routing changes
  18 *		Rui Oliveira	:	ICMP routing table updates
  19 *		(rco@di.uminho.pt)	Routing table insertion and update
  20 *		Linus Torvalds	:	Rewrote bits to be sensible
  21 *		Alan Cox	:	Added BSD route gw semantics
  22 *		Alan Cox	:	Super /proc >4K
  23 *		Alan Cox	:	MTU in route table
  24 *		Alan Cox	: 	MSS actually. Also added the window
  25 *					clamper.
  26 *		Sam Lantinga	:	Fixed route matching in rt_del()
  27 *		Alan Cox	:	Routing cache support.
  28 *		Alan Cox	:	Removed compatibility cruft.
  29 *		Alan Cox	:	RTF_REJECT support.
  30 *		Alan Cox	:	TCP irtt support.
  31 *		Jonathan Naylor	:	Added Metric support.
  32 *	Miquel van Smoorenburg	:	BSD API fixes.
  33 *	Miquel van Smoorenburg	:	Metrics.
  34 *		Alan Cox	:	Use __u32 properly
  35 *		Alan Cox	:	Aligned routing errors more closely with BSD,
  36 *					though our system is still very different.
  37 *		Alan Cox	:	Faster /proc handling
  38 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  39 *					routing caches and better behaviour.
  40 *
  41 *		Olaf Erb	:	irtt wasn't being copied right.
  42 *		Bjorn Ekwall	:	Kerneld route support.
  43 *		Alan Cox	:	Multicast fixed (I hope)
  44 * 		Pavel Krauz	:	Limited broadcast fixed
  45 *		Mike McLagan	:	Routing by source
  46 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  47 *					route.c and rewritten from scratch.
  48 *		Andi Kleen	:	Load-limit warning messages.
  49 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  50 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  51 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  52 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  53 *		Marc Boucher	:	routing by fwmark
  54 *	Robert Olsson		:	Added rt_cache statistics
  55 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  56 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  57 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  58 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  59 */
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
  64#include <linux/uaccess.h>
  65#include <linux/bitops.h>
  66#include <linux/types.h>
  67#include <linux/kernel.h>
  68#include <linux/mm.h>
  69#include <linux/string.h>
  70#include <linux/socket.h>
  71#include <linux/sockios.h>
  72#include <linux/errno.h>
  73#include <linux/in.h>
  74#include <linux/inet.h>
  75#include <linux/netdevice.h>
  76#include <linux/proc_fs.h>
  77#include <linux/init.h>
  78#include <linux/skbuff.h>
  79#include <linux/inetdevice.h>
  80#include <linux/igmp.h>
  81#include <linux/pkt_sched.h>
  82#include <linux/mroute.h>
  83#include <linux/netfilter_ipv4.h>
  84#include <linux/random.h>
  85#include <linux/rcupdate.h>
  86#include <linux/times.h>
  87#include <linux/slab.h>
  88#include <linux/jhash.h>
  89#include <net/dst.h>
  90#include <net/dst_metadata.h>
  91#include <net/net_namespace.h>
  92#include <net/protocol.h>
  93#include <net/ip.h>
  94#include <net/route.h>
  95#include <net/inetpeer.h>
  96#include <net/sock.h>
  97#include <net/ip_fib.h>
  98#include <net/nexthop.h>
  99#include <net/arp.h>
 100#include <net/tcp.h>
 101#include <net/icmp.h>
 102#include <net/xfrm.h>
 103#include <net/lwtunnel.h>
 104#include <net/netevent.h>
 105#include <net/rtnetlink.h>
 106#ifdef CONFIG_SYSCTL
 107#include <linux/sysctl.h>
 108#endif
 109#include <net/secure_seq.h>
 110#include <net/ip_tunnels.h>
 111#include <net/l3mdev.h>
 112
 113#include "fib_lookup.h"
 114
 115#define RT_FL_TOS(oldflp4) \
 116	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 117
 118#define RT_GC_TIMEOUT (300*HZ)
 119
 120static int ip_rt_max_size;
 121static int ip_rt_redirect_number __read_mostly	= 9;
 122static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 123static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 124static int ip_rt_error_cost __read_mostly	= HZ;
 125static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 126static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 127static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 128static int ip_rt_min_advmss __read_mostly	= 256;
 129
 130static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
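/* The tunables above are exported through the ipv4_route_table sysctl table
 * near the end of this file; with the usual procfs layout they can be tuned
 * at run time under /proc/sys/net/ipv4/route/ (redirect_number,
 * redirect_load, redirect_silence, error_cost, error_burst, mtu_expires,
 * min_pmtu, min_adv_mss, gc_timeout, ...).
 */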
 131
 132/*
 133 *	Interface to generic destination cache.
 134 */
 135
 136static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 137static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 138static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 139static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 140static void		 ipv4_link_failure(struct sk_buff *skb);
 141static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 142					   struct sk_buff *skb, u32 mtu,
 143					   bool confirm_neigh);
 144static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 145					struct sk_buff *skb);
 146static void		ipv4_dst_destroy(struct dst_entry *dst);
 147
 148static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 149{
 150	WARN_ON(1);
 151	return NULL;
 152}
 153
 154static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 155					   struct sk_buff *skb,
 156					   const void *daddr);
 157static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 158
 159static struct dst_ops ipv4_dst_ops = {
 160	.family =		AF_INET,
 161	.check =		ipv4_dst_check,
 162	.default_advmss =	ipv4_default_advmss,
 163	.mtu =			ipv4_mtu,
 164	.cow_metrics =		ipv4_cow_metrics,
 165	.destroy =		ipv4_dst_destroy,
 166	.negative_advice =	ipv4_negative_advice,
 167	.link_failure =		ipv4_link_failure,
 168	.update_pmtu =		ip_rt_update_pmtu,
 169	.redirect =		ip_do_redirect,
 170	.local_out =		__ip_local_out,
 171	.neigh_lookup =		ipv4_neigh_lookup,
 172	.confirm_neigh =	ipv4_confirm_neigh,
 173};
 174
 175#define ECN_OR_COST(class)	TC_PRIO_##class
 176
 177const __u8 ip_tos2prio[16] = {
 178	TC_PRIO_BESTEFFORT,
 179	ECN_OR_COST(BESTEFFORT),
 180	TC_PRIO_BESTEFFORT,
 181	ECN_OR_COST(BESTEFFORT),
 182	TC_PRIO_BULK,
 183	ECN_OR_COST(BULK),
 184	TC_PRIO_BULK,
 185	ECN_OR_COST(BULK),
 186	TC_PRIO_INTERACTIVE,
 187	ECN_OR_COST(INTERACTIVE),
 188	TC_PRIO_INTERACTIVE,
 189	ECN_OR_COST(INTERACTIVE),
 190	TC_PRIO_INTERACTIVE_BULK,
 191	ECN_OR_COST(INTERACTIVE_BULK),
 192	TC_PRIO_INTERACTIVE_BULK,
 193	ECN_OR_COST(INTERACTIVE_BULK)
 194};
 195EXPORT_SYMBOL(ip_tos2prio);
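/* For reference: rt_tos2priority() in include/net/route.h indexes this table
 * with IPTOS_TOS(tos) >> 1, i.e. bits 1-4 of the TOS byte.  For example,
 * IPTOS_LOWDELAY (0x10) gives index 8 and maps to TC_PRIO_INTERACTIVE,
 * while a TOS of 0 maps to TC_PRIO_BESTEFFORT.
 */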
 196
 197static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 198#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 199
 200#ifdef CONFIG_PROC_FS
 201static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 202{
 203	if (*pos)
 204		return NULL;
 205	return SEQ_START_TOKEN;
 206}
 207
 208static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 209{
 210	++*pos;
 211	return NULL;
 212}
 213
 214static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 215{
 216}
 217
 218static int rt_cache_seq_show(struct seq_file *seq, void *v)
 219{
 220	if (v == SEQ_START_TOKEN)
 221		seq_printf(seq, "%-127s\n",
 222			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 223			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 224			   "HHUptod\tSpecDst");
 225	return 0;
 226}
 227
 228static const struct seq_operations rt_cache_seq_ops = {
 229	.start  = rt_cache_seq_start,
 230	.next   = rt_cache_seq_next,
 231	.stop   = rt_cache_seq_stop,
 232	.show   = rt_cache_seq_show,
 233};
 234
 235static int rt_cache_seq_open(struct inode *inode, struct file *file)
 236{
 237	return seq_open(file, &rt_cache_seq_ops);
 238}
 239
 240static const struct proc_ops rt_cache_proc_ops = {
 241	.proc_open	= rt_cache_seq_open,
 242	.proc_read	= seq_read,
 243	.proc_lseek	= seq_lseek,
 244	.proc_release	= seq_release,
 245};
 246
 247
 248static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 249{
 250	int cpu;
 251
 252	if (*pos == 0)
 253		return SEQ_START_TOKEN;
 254
 255	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 256		if (!cpu_possible(cpu))
 257			continue;
 258		*pos = cpu+1;
 259		return &per_cpu(rt_cache_stat, cpu);
 260	}
 261	return NULL;
 262}
 263
 264static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 265{
 266	int cpu;
 267
 268	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 269		if (!cpu_possible(cpu))
 270			continue;
 271		*pos = cpu+1;
 272		return &per_cpu(rt_cache_stat, cpu);
 273	}
 274	(*pos)++;
 275	return NULL;
 276
 277}
 278
 279static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 280{
 281
 282}
 283
 284static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 285{
 286	struct rt_cache_stat *st = v;
 287
 288	if (v == SEQ_START_TOKEN) {
 289		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 290		return 0;
 291	}
 292
 293	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 294		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 295		   dst_entries_get_slow(&ipv4_dst_ops),
 296		   0, /* st->in_hit */
 297		   st->in_slow_tot,
 298		   st->in_slow_mc,
 299		   st->in_no_route,
 300		   st->in_brd,
 301		   st->in_martian_dst,
 302		   st->in_martian_src,
 303
 304		   0, /* st->out_hit */
 305		   st->out_slow_tot,
 306		   st->out_slow_mc,
 307
 308		   0, /* st->gc_total */
 309		   0, /* st->gc_ignored */
 310		   0, /* st->gc_goal_miss */
 311		   0, /* st->gc_dst_overflow */
 312		   0, /* st->in_hlist_search */
 313		   0  /* st->out_hlist_search */
 314		);
 315	return 0;
 316}
 317
 318static const struct seq_operations rt_cpu_seq_ops = {
 319	.start  = rt_cpu_seq_start,
 320	.next   = rt_cpu_seq_next,
 321	.stop   = rt_cpu_seq_stop,
 322	.show   = rt_cpu_seq_show,
 323};
 324
 325
 326static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 327{
 328	return seq_open(file, &rt_cpu_seq_ops);
 329}
 330
 331static const struct proc_ops rt_cpu_proc_ops = {
 332	.proc_open	= rt_cpu_seq_open,
 333	.proc_read	= seq_read,
 334	.proc_lseek	= seq_lseek,
 335	.proc_release	= seq_release,
 336};
 337
 338#ifdef CONFIG_IP_ROUTE_CLASSID
 339static int rt_acct_proc_show(struct seq_file *m, void *v)
 340{
 341	struct ip_rt_acct *dst, *src;
 342	unsigned int i, j;
 343
 344	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 345	if (!dst)
 346		return -ENOMEM;
 347
 348	for_each_possible_cpu(i) {
 349		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 350		for (j = 0; j < 256; j++) {
 351			dst[j].o_bytes   += src[j].o_bytes;
 352			dst[j].o_packets += src[j].o_packets;
 353			dst[j].i_bytes   += src[j].i_bytes;
 354			dst[j].i_packets += src[j].i_packets;
 355		}
 356	}
 357
 358	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 359	kfree(dst);
 360	return 0;
 361}
 362#endif
 363
 364static int __net_init ip_rt_do_proc_init(struct net *net)
 365{
 366	struct proc_dir_entry *pde;
 367
 368	pde = proc_create("rt_cache", 0444, net->proc_net,
 369			  &rt_cache_proc_ops);
 370	if (!pde)
 371		goto err1;
 372
 373	pde = proc_create("rt_cache", 0444,
 374			  net->proc_net_stat, &rt_cpu_proc_ops);
 375	if (!pde)
 376		goto err2;
 377
 378#ifdef CONFIG_IP_ROUTE_CLASSID
 379	pde = proc_create_single("rt_acct", 0, net->proc_net,
 380			rt_acct_proc_show);
 381	if (!pde)
 382		goto err3;
 383#endif
 384	return 0;
 385
 386#ifdef CONFIG_IP_ROUTE_CLASSID
 387err3:
 388	remove_proc_entry("rt_cache", net->proc_net_stat);
 389#endif
 390err2:
 391	remove_proc_entry("rt_cache", net->proc_net);
 392err1:
 393	return -ENOMEM;
 394}
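/* With the registrations above, the per-namespace files should appear as
 * /proc/net/rt_cache (header line only, since the IPv4 routing cache itself
 * is gone), /proc/net/stat/rt_cache (one line of per-CPU counters printed by
 * rt_cpu_seq_show()) and, with CONFIG_IP_ROUTE_CLASSID, /proc/net/rt_acct.
 * A quick way to look at the counters:
 *
 *	cat /proc/net/stat/rt_cache
 */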
 395
 396static void __net_exit ip_rt_do_proc_exit(struct net *net)
 397{
 398	remove_proc_entry("rt_cache", net->proc_net_stat);
 399	remove_proc_entry("rt_cache", net->proc_net);
 400#ifdef CONFIG_IP_ROUTE_CLASSID
 401	remove_proc_entry("rt_acct", net->proc_net);
 402#endif
 403}
 404
 405static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 406	.init = ip_rt_do_proc_init,
 407	.exit = ip_rt_do_proc_exit,
 408};
 409
 410static int __init ip_rt_proc_init(void)
 411{
 412	return register_pernet_subsys(&ip_rt_proc_ops);
 413}
 414
 415#else
 416static inline int ip_rt_proc_init(void)
 417{
 418	return 0;
 419}
 420#endif /* CONFIG_PROC_FS */
 421
 422static inline bool rt_is_expired(const struct rtable *rth)
 423{
 424	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 425}
 426
 427void rt_cache_flush(struct net *net)
 428{
 429	rt_genid_bump_ipv4(net);
 430}
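/* Note that "flushing" only bumps the namespace's IPv4 route generation id.
 * Cached rtables are not freed synchronously; rt_is_expired() simply starts
 * returning true for them, so ipv4_dst_check() fails and callers fall back
 * to a fresh fib_lookup() the next time the dst is validated.
 */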
 431
 432static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 433					   struct sk_buff *skb,
 434					   const void *daddr)
 435{
 436	const struct rtable *rt = container_of(dst, struct rtable, dst);
 437	struct net_device *dev = dst->dev;
 438	struct neighbour *n;
 439
 440	rcu_read_lock_bh();
 441
 442	if (likely(rt->rt_gw_family == AF_INET)) {
 443		n = ip_neigh_gw4(dev, rt->rt_gw4);
 444	} else if (rt->rt_gw_family == AF_INET6) {
 445		n = ip_neigh_gw6(dev, &rt->rt_gw6);
  446	} else {
 447		__be32 pkey;
 448
 449		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 450		n = ip_neigh_gw4(dev, pkey);
 451	}
 452
 453	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 454		n = NULL;
 455
 456	rcu_read_unlock_bh();
 457
 458	return n;
 459}
 460
 461static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 462{
 463	const struct rtable *rt = container_of(dst, struct rtable, dst);
 464	struct net_device *dev = dst->dev;
 465	const __be32 *pkey = daddr;
 466
 467	if (rt->rt_gw_family == AF_INET) {
 468		pkey = (const __be32 *)&rt->rt_gw4;
 469	} else if (rt->rt_gw_family == AF_INET6) {
 470		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 471	} else if (!daddr ||
 472		 (rt->rt_flags &
 473		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 474		return;
 475	}
 476	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 477}
 478
 479#define IP_IDENTS_SZ 2048u
 480
 481static atomic_t *ip_idents __read_mostly;
 482static u32 *ip_tstamps __read_mostly;
 483
 484/* In order to protect privacy, we add a perturbation to identifiers
  485 * if one generator is seldom used. This makes it hard for an attacker
 486 * to infer how many packets were sent between two points in time.
 487 */
 488u32 ip_idents_reserve(u32 hash, int segs)
 489{
 490	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 491	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 492	u32 old = READ_ONCE(*p_tstamp);
 493	u32 now = (u32)jiffies;
 494	u32 delta = 0;
 495
 496	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 497		delta = prandom_u32_max(now - old);
 498
  499	/* If UBSAN reports an error here, please make sure your compiler
  500	 * supports -fno-strict-overflow before reporting it; that was a bug
  501	 * in UBSAN, and it has been fixed in GCC-8.
  502	 */
 503	return atomic_add_return(segs + delta, p_id) - segs;
 504}
 505EXPORT_SYMBOL(ip_idents_reserve);
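/* Rough illustration of the perturbation: if a bucket was last used 100
 * jiffies ago, the next ID range drawn from it is shifted forward by a
 * random amount in [0, 100) on top of the "segs" increment, so an observer
 * cannot tell whether a gap in IP IDs comes from real traffic or from the
 * idle period.  Buckets reused within the same jiffy get no extra offset.
 */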
 506
 507void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 508{
 509	u32 hash, id;
 510
 511	/* Note the following code is not safe, but this is okay. */
 512	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 513		get_random_bytes(&net->ipv4.ip_id_key,
 514				 sizeof(net->ipv4.ip_id_key));
 515
 516	hash = siphash_3u32((__force u32)iph->daddr,
 517			    (__force u32)iph->saddr,
 518			    iph->protocol,
 519			    &net->ipv4.ip_id_key);
 520	id = ip_idents_reserve(hash, segs);
 521	iph->id = htons(id);
 522}
 523EXPORT_SYMBOL(__ip_select_ident);
 524
 525static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 526			     const struct sock *sk,
 527			     const struct iphdr *iph,
 528			     int oif, u8 tos,
 529			     u8 prot, u32 mark, int flow_flags)
 530{
 531	if (sk) {
 532		const struct inet_sock *inet = inet_sk(sk);
 533
 534		oif = sk->sk_bound_dev_if;
 535		mark = sk->sk_mark;
 536		tos = RT_CONN_FLAGS(sk);
 537		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 538	}
 539	flowi4_init_output(fl4, oif, mark, tos,
 540			   RT_SCOPE_UNIVERSE, prot,
 541			   flow_flags,
 542			   iph->daddr, iph->saddr, 0, 0,
 543			   sock_net_uid(net, sk));
 544}
 545
 546static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 547			       const struct sock *sk)
 548{
 549	const struct net *net = dev_net(skb->dev);
 550	const struct iphdr *iph = ip_hdr(skb);
 551	int oif = skb->dev->ifindex;
 552	u8 tos = RT_TOS(iph->tos);
 553	u8 prot = iph->protocol;
 554	u32 mark = skb->mark;
 555
 556	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 557}
 558
 559static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 560{
 561	const struct inet_sock *inet = inet_sk(sk);
 562	const struct ip_options_rcu *inet_opt;
 563	__be32 daddr = inet->inet_daddr;
 564
 565	rcu_read_lock();
 566	inet_opt = rcu_dereference(inet->inet_opt);
 567	if (inet_opt && inet_opt->opt.srr)
 568		daddr = inet_opt->opt.faddr;
 569	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 570			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 571			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 572			   inet_sk_flowi_flags(sk),
 573			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 574	rcu_read_unlock();
 575}
 576
 577static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 578				 const struct sk_buff *skb)
 579{
 580	if (skb)
 581		build_skb_flow_key(fl4, skb, sk);
 582	else
 583		build_sk_flow_key(fl4, sk);
 584}
 585
 586static DEFINE_SPINLOCK(fnhe_lock);
 587
 588static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 589{
 590	struct rtable *rt;
 591
 592	rt = rcu_dereference(fnhe->fnhe_rth_input);
 593	if (rt) {
 594		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 595		dst_dev_put(&rt->dst);
 596		dst_release(&rt->dst);
 597	}
 598	rt = rcu_dereference(fnhe->fnhe_rth_output);
 599	if (rt) {
 600		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 601		dst_dev_put(&rt->dst);
 602		dst_release(&rt->dst);
 603	}
 604}
 605
 606static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 607{
 608	struct fib_nh_exception *fnhe, *oldest;
 609
 610	oldest = rcu_dereference(hash->chain);
 611	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 612	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 613		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 614			oldest = fnhe;
 615	}
 616	fnhe_flush_routes(oldest);
 617	return oldest;
 618}
 619
 620static inline u32 fnhe_hashfun(__be32 daddr)
 621{
 622	static u32 fnhe_hashrnd __read_mostly;
 623	u32 hval;
 624
 625	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 626	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 627	return hash_32(hval, FNHE_HASH_SHIFT);
 628}
 629
 630static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 631{
 632	rt->rt_pmtu = fnhe->fnhe_pmtu;
 633	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 634	rt->dst.expires = fnhe->fnhe_expires;
 635
 636	if (fnhe->fnhe_gw) {
 637		rt->rt_flags |= RTCF_REDIRECTED;
 638		rt->rt_uses_gateway = 1;
 639		rt->rt_gw_family = AF_INET;
 640		rt->rt_gw4 = fnhe->fnhe_gw;
 641	}
 642}
 643
 644static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 645				  __be32 gw, u32 pmtu, bool lock,
 646				  unsigned long expires)
 647{
 648	struct fnhe_hash_bucket *hash;
 649	struct fib_nh_exception *fnhe;
 650	struct rtable *rt;
 651	u32 genid, hval;
 652	unsigned int i;
 653	int depth;
 654
 655	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 656	hval = fnhe_hashfun(daddr);
 657
 658	spin_lock_bh(&fnhe_lock);
 659
 660	hash = rcu_dereference(nhc->nhc_exceptions);
 661	if (!hash) {
 662		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 663		if (!hash)
 664			goto out_unlock;
 665		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 666	}
 667
 668	hash += hval;
 669
 670	depth = 0;
 671	for (fnhe = rcu_dereference(hash->chain); fnhe;
 672	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 673		if (fnhe->fnhe_daddr == daddr)
 674			break;
 675		depth++;
 676	}
 677
 678	if (fnhe) {
 679		if (fnhe->fnhe_genid != genid)
 680			fnhe->fnhe_genid = genid;
 681		if (gw)
 682			fnhe->fnhe_gw = gw;
 683		if (pmtu) {
 684			fnhe->fnhe_pmtu = pmtu;
 685			fnhe->fnhe_mtu_locked = lock;
 686		}
 687		fnhe->fnhe_expires = max(1UL, expires);
 688		/* Update all cached dsts too */
 689		rt = rcu_dereference(fnhe->fnhe_rth_input);
 690		if (rt)
 691			fill_route_from_fnhe(rt, fnhe);
 692		rt = rcu_dereference(fnhe->fnhe_rth_output);
 693		if (rt)
 694			fill_route_from_fnhe(rt, fnhe);
 695	} else {
 696		if (depth > FNHE_RECLAIM_DEPTH)
 697			fnhe = fnhe_oldest(hash);
 698		else {
 699			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 700			if (!fnhe)
 701				goto out_unlock;
 702
 703			fnhe->fnhe_next = hash->chain;
 704			rcu_assign_pointer(hash->chain, fnhe);
 705		}
 706		fnhe->fnhe_genid = genid;
 707		fnhe->fnhe_daddr = daddr;
 708		fnhe->fnhe_gw = gw;
 709		fnhe->fnhe_pmtu = pmtu;
 710		fnhe->fnhe_mtu_locked = lock;
 711		fnhe->fnhe_expires = max(1UL, expires);
 712
 713		/* Exception created; mark the cached routes for the nexthop
 714		 * stale, so anyone caching it rechecks if this exception
 715		 * applies to them.
 716		 */
 717		rt = rcu_dereference(nhc->nhc_rth_input);
 718		if (rt)
 719			rt->dst.obsolete = DST_OBSOLETE_KILL;
 720
 721		for_each_possible_cpu(i) {
 722			struct rtable __rcu **prt;
 723			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 724			rt = rcu_dereference(*prt);
 725			if (rt)
 726				rt->dst.obsolete = DST_OBSOLETE_KILL;
 727		}
 728	}
 729
 730	fnhe->fnhe_stamp = jiffies;
 731
 732out_unlock:
 733	spin_unlock_bh(&fnhe_lock);
 734}
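/* In short: learned PMTU and redirect data is stored as fib_nh_exception
 * entries hashed by destination under the nexthop (nhc_exceptions).  A chain
 * deeper than FNHE_RECLAIM_DEPTH recycles its oldest entry instead of
 * growing, and routes already cached against the nexthop are marked
 * DST_OBSOLETE_KILL so that ipv4_dst_check() forces their users to re-lookup
 * and pick up the new exception.
 */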
 735
 736static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 737			     bool kill_route)
 738{
 739	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 740	__be32 old_gw = ip_hdr(skb)->saddr;
 741	struct net_device *dev = skb->dev;
 742	struct in_device *in_dev;
 743	struct fib_result res;
 744	struct neighbour *n;
 745	struct net *net;
 746
 747	switch (icmp_hdr(skb)->code & 7) {
 748	case ICMP_REDIR_NET:
 749	case ICMP_REDIR_NETTOS:
 750	case ICMP_REDIR_HOST:
 751	case ICMP_REDIR_HOSTTOS:
 752		break;
 753
 754	default:
 755		return;
 756	}
 757
 758	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 759		return;
 760
 761	in_dev = __in_dev_get_rcu(dev);
 762	if (!in_dev)
 763		return;
 764
 765	net = dev_net(dev);
 766	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 767	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 768	    ipv4_is_zeronet(new_gw))
 769		goto reject_redirect;
 770
 771	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 772		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 773			goto reject_redirect;
 774		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 775			goto reject_redirect;
 776	} else {
 777		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 778			goto reject_redirect;
 779	}
 780
 781	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 782	if (!n)
 783		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 784	if (!IS_ERR(n)) {
 785		if (!(n->nud_state & NUD_VALID)) {
 786			neigh_event_send(n, NULL);
 787		} else {
 788			if (fib_lookup(net, fl4, &res, 0) == 0) {
 789				struct fib_nh_common *nhc;
 790
 791				fib_select_path(net, &res, fl4, skb);
 792				nhc = FIB_RES_NHC(res);
 793				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 794						0, false,
 795						jiffies + ip_rt_gc_timeout);
 796			}
 797			if (kill_route)
 798				rt->dst.obsolete = DST_OBSOLETE_KILL;
 799			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 800		}
 801		neigh_release(n);
 802	}
 803	return;
 804
 805reject_redirect:
 806#ifdef CONFIG_IP_ROUTE_VERBOSE
 807	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 808		const struct iphdr *iph = (const struct iphdr *) skb->data;
 809		__be32 daddr = iph->daddr;
 810		__be32 saddr = iph->saddr;
 811
 812		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 813				     "  Advised path = %pI4 -> %pI4\n",
 814				     &old_gw, dev->name, &new_gw,
 815				     &saddr, &daddr);
 816	}
 817#endif
 818	;
 819}
 820
 821static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 822{
 823	struct rtable *rt;
 824	struct flowi4 fl4;
 825	const struct iphdr *iph = (const struct iphdr *) skb->data;
 826	struct net *net = dev_net(skb->dev);
 827	int oif = skb->dev->ifindex;
 828	u8 tos = RT_TOS(iph->tos);
 829	u8 prot = iph->protocol;
 830	u32 mark = skb->mark;
 831
 832	rt = (struct rtable *) dst;
 833
 834	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 835	__ip_do_redirect(rt, skb, &fl4, true);
 836}
 837
 838static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 839{
 840	struct rtable *rt = (struct rtable *)dst;
 841	struct dst_entry *ret = dst;
 842
 843	if (rt) {
 844		if (dst->obsolete > 0) {
 845			ip_rt_put(rt);
 846			ret = NULL;
 847		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 848			   rt->dst.expires) {
 849			ip_rt_put(rt);
 850			ret = NULL;
 851		}
 852	}
 853	return ret;
 854}
 855
 856/*
 857 * Algorithm:
 858 *	1. The first ip_rt_redirect_number redirects are sent
 859 *	   with exponential backoff, then we stop sending them at all,
 860 *	   assuming that the host ignores our redirects.
 861 *	2. If we did not see packets requiring redirects
 862 *	   during ip_rt_redirect_silence, we assume that the host
 863 *	   forgot redirected route and start to send redirects again.
 864 *
 865 * This algorithm is much cheaper and more intelligent than dumb load limiting
 866 * in icmp.c.
 867 *
 868 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 869 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 870 */
 871
 872void ip_rt_send_redirect(struct sk_buff *skb)
 873{
 874	struct rtable *rt = skb_rtable(skb);
 875	struct in_device *in_dev;
 876	struct inet_peer *peer;
 877	struct net *net;
 878	int log_martians;
 879	int vif;
 880
 881	rcu_read_lock();
 882	in_dev = __in_dev_get_rcu(rt->dst.dev);
 883	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 884		rcu_read_unlock();
 885		return;
 886	}
 887	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 888	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 889	rcu_read_unlock();
 890
 891	net = dev_net(rt->dst.dev);
 892	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 893	if (!peer) {
 894		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 895			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 896		return;
 897	}
 898
 899	/* No redirected packets during ip_rt_redirect_silence;
 900	 * reset the algorithm.
 901	 */
 902	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 903		peer->rate_tokens = 0;
 904		peer->n_redirects = 0;
 905	}
 906
  907	/* Too many ignored redirects; do not send anything.
  908	 * Set peer->rate_last to the last seen redirected packet.
  909	 */
 910	if (peer->n_redirects >= ip_rt_redirect_number) {
 911		peer->rate_last = jiffies;
 912		goto out_put_peer;
 913	}
 914
 915	/* Check for load limit; set rate_last to the latest sent
 916	 * redirect.
 917	 */
 918	if (peer->n_redirects == 0 ||
 919	    time_after(jiffies,
 920		       (peer->rate_last +
 921			(ip_rt_redirect_load << peer->n_redirects)))) {
 922		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 923
 924		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 925		peer->rate_last = jiffies;
 926		++peer->n_redirects;
 927#ifdef CONFIG_IP_ROUTE_VERBOSE
 928		if (log_martians &&
 929		    peer->n_redirects == ip_rt_redirect_number)
 930			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 931					     &ip_hdr(skb)->saddr, inet_iif(skb),
 932					     &ip_hdr(skb)->daddr, &gw);
 933#endif
 934	}
 935out_put_peer:
 936	inet_putpeer(peer);
 937}
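/* Worked example with the defaults declared at the top of this file: after
 * the Nth redirect sent to a peer, the next one is only emitted once
 * rate_last + (ip_rt_redirect_load << N) has passed, i.e. gaps of roughly
 * 40 ms, 80 ms, ... up to about 5 s.  Once ip_rt_redirect_number (9)
 * redirects have been ignored we stop entirely, until
 * ip_rt_redirect_silence (about 20.5 s) passes without triggering packets
 * and the counters are reset.
 */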
 938
 939static int ip_error(struct sk_buff *skb)
 940{
 941	struct rtable *rt = skb_rtable(skb);
 942	struct net_device *dev = skb->dev;
 943	struct in_device *in_dev;
 944	struct inet_peer *peer;
 945	unsigned long now;
 946	struct net *net;
 947	bool send;
 948	int code;
 949
 950	if (netif_is_l3_master(skb->dev)) {
 951		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 952		if (!dev)
 953			goto out;
 954	}
 955
 956	in_dev = __in_dev_get_rcu(dev);
 957
 958	/* IP on this device is disabled. */
 959	if (!in_dev)
 960		goto out;
 961
 962	net = dev_net(rt->dst.dev);
 963	if (!IN_DEV_FORWARD(in_dev)) {
 964		switch (rt->dst.error) {
 965		case EHOSTUNREACH:
 966			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 967			break;
 968
 969		case ENETUNREACH:
 970			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 971			break;
 972		}
 973		goto out;
 974	}
 975
 976	switch (rt->dst.error) {
 977	case EINVAL:
 978	default:
 979		goto out;
 980	case EHOSTUNREACH:
 981		code = ICMP_HOST_UNREACH;
 982		break;
 983	case ENETUNREACH:
 984		code = ICMP_NET_UNREACH;
 985		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 986		break;
 987	case EACCES:
 988		code = ICMP_PKT_FILTERED;
 989		break;
 990	}
 991
 992	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 993			       l3mdev_master_ifindex(skb->dev), 1);
 994
 995	send = true;
 996	if (peer) {
 997		now = jiffies;
 998		peer->rate_tokens += now - peer->rate_last;
 999		if (peer->rate_tokens > ip_rt_error_burst)
1000			peer->rate_tokens = ip_rt_error_burst;
1001		peer->rate_last = now;
1002		if (peer->rate_tokens >= ip_rt_error_cost)
1003			peer->rate_tokens -= ip_rt_error_cost;
1004		else
1005			send = false;
1006		inet_putpeer(peer);
1007	}
1008	if (send)
1009		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1010
1011out:	kfree_skb(skb);
1012	return 0;
1013}
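/* The peer accounting above is a small token bucket: rate_tokens grows by
 * one per jiffy of elapsed time, is capped at ip_rt_error_burst (5 * HZ),
 * and each ICMP_DEST_UNREACH sent costs ip_rt_error_cost (HZ).  With the
 * defaults that allows a burst of about five errors and then roughly one
 * per second per source address.
 */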
1014
1015static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1016{
1017	struct dst_entry *dst = &rt->dst;
1018	struct net *net = dev_net(dst->dev);
1019	u32 old_mtu = ipv4_mtu(dst);
1020	struct fib_result res;
1021	bool lock = false;
1022
1023	if (ip_mtu_locked(dst))
1024		return;
1025
1026	if (old_mtu < mtu)
1027		return;
1028
1029	if (mtu < ip_rt_min_pmtu) {
1030		lock = true;
1031		mtu = min(old_mtu, ip_rt_min_pmtu);
1032	}
1033
1034	if (rt->rt_pmtu == mtu && !lock &&
1035	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1036		return;
1037
1038	rcu_read_lock();
1039	if (fib_lookup(net, fl4, &res, 0) == 0) {
1040		struct fib_nh_common *nhc;
1041
1042		fib_select_path(net, &res, fl4, NULL);
1043		nhc = FIB_RES_NHC(res);
1044		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1045				      jiffies + ip_rt_mtu_expires);
1046	}
1047	rcu_read_unlock();
1048}
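/* Summary of the clamping above: a learned PMTU below ip_rt_min_pmtu
 * (512 + 20 + 20 = 552 by default) is not believed outright; the path MTU
 * is instead pinned ("locked") at min(old_mtu, ip_rt_min_pmtu).  Either way
 * the result is recorded as a nexthop exception that expires after
 * ip_rt_mtu_expires (10 minutes by default).
 */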
1049
1050static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1051			      struct sk_buff *skb, u32 mtu,
1052			      bool confirm_neigh)
1053{
1054	struct rtable *rt = (struct rtable *) dst;
1055	struct flowi4 fl4;
1056
1057	ip_rt_build_flow_key(&fl4, sk, skb);
1058
1059	/* Don't make lookup fail for bridged encapsulations */
1060	if (skb && netif_is_any_bridge_port(skb->dev))
1061		fl4.flowi4_oif = 0;
1062
1063	__ip_rt_update_pmtu(rt, &fl4, mtu);
1064}
1065
1066void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1067		      int oif, u8 protocol)
1068{
1069	const struct iphdr *iph = (const struct iphdr *) skb->data;
1070	struct flowi4 fl4;
1071	struct rtable *rt;
1072	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1073
1074	__build_flow_key(net, &fl4, NULL, iph, oif,
1075			 RT_TOS(iph->tos), protocol, mark, 0);
1076	rt = __ip_route_output_key(net, &fl4);
1077	if (!IS_ERR(rt)) {
1078		__ip_rt_update_pmtu(rt, &fl4, mtu);
1079		ip_rt_put(rt);
1080	}
1081}
1082EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1083
1084static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1085{
1086	const struct iphdr *iph = (const struct iphdr *) skb->data;
1087	struct flowi4 fl4;
1088	struct rtable *rt;
1089
1090	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1091
1092	if (!fl4.flowi4_mark)
1093		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1094
1095	rt = __ip_route_output_key(sock_net(sk), &fl4);
1096	if (!IS_ERR(rt)) {
1097		__ip_rt_update_pmtu(rt, &fl4, mtu);
1098		ip_rt_put(rt);
1099	}
1100}
1101
1102void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1103{
1104	const struct iphdr *iph = (const struct iphdr *) skb->data;
1105	struct flowi4 fl4;
1106	struct rtable *rt;
1107	struct dst_entry *odst = NULL;
1108	bool new = false;
1109	struct net *net = sock_net(sk);
1110
1111	bh_lock_sock(sk);
1112
1113	if (!ip_sk_accept_pmtu(sk))
1114		goto out;
1115
1116	odst = sk_dst_get(sk);
1117
1118	if (sock_owned_by_user(sk) || !odst) {
1119		__ipv4_sk_update_pmtu(skb, sk, mtu);
1120		goto out;
1121	}
1122
1123	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1124
1125	rt = (struct rtable *)odst;
1126	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1127		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1128		if (IS_ERR(rt))
1129			goto out;
1130
1131		new = true;
1132	}
1133
1134	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1135
1136	if (!dst_check(&rt->dst, 0)) {
1137		if (new)
1138			dst_release(&rt->dst);
1139
1140		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1141		if (IS_ERR(rt))
1142			goto out;
1143
1144		new = true;
1145	}
1146
1147	if (new)
1148		sk_dst_set(sk, &rt->dst);
1149
1150out:
1151	bh_unlock_sock(sk);
1152	dst_release(odst);
1153}
1154EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1155
1156void ipv4_redirect(struct sk_buff *skb, struct net *net,
1157		   int oif, u8 protocol)
1158{
1159	const struct iphdr *iph = (const struct iphdr *) skb->data;
1160	struct flowi4 fl4;
1161	struct rtable *rt;
1162
1163	__build_flow_key(net, &fl4, NULL, iph, oif,
1164			 RT_TOS(iph->tos), protocol, 0, 0);
1165	rt = __ip_route_output_key(net, &fl4);
1166	if (!IS_ERR(rt)) {
1167		__ip_do_redirect(rt, skb, &fl4, false);
1168		ip_rt_put(rt);
1169	}
1170}
1171EXPORT_SYMBOL_GPL(ipv4_redirect);
1172
1173void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1174{
1175	const struct iphdr *iph = (const struct iphdr *) skb->data;
1176	struct flowi4 fl4;
1177	struct rtable *rt;
1178	struct net *net = sock_net(sk);
1179
1180	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1181	rt = __ip_route_output_key(net, &fl4);
1182	if (!IS_ERR(rt)) {
1183		__ip_do_redirect(rt, skb, &fl4, false);
1184		ip_rt_put(rt);
1185	}
1186}
1187EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1188
1189static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1190{
1191	struct rtable *rt = (struct rtable *) dst;
1192
1193	/* All IPV4 dsts are created with ->obsolete set to the value
1194	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1195	 * into this function always.
1196	 *
1197	 * When a PMTU/redirect information update invalidates a route,
1198	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1199	 * DST_OBSOLETE_DEAD.
1200	 */
1201	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1202		return NULL;
1203	return dst;
1204}
1205
1206static void ipv4_send_dest_unreach(struct sk_buff *skb)
1207{
1208	struct ip_options opt;
1209	int res;
1210
1211	/* Recompile ip options since IPCB may not be valid anymore.
1212	 * Also check we have a reasonable ipv4 header.
1213	 */
1214	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1215	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1216		return;
1217
1218	memset(&opt, 0, sizeof(opt));
1219	if (ip_hdr(skb)->ihl > 5) {
1220		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1221			return;
1222		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1223
1224		rcu_read_lock();
1225		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1226		rcu_read_unlock();
1227
1228		if (res)
1229			return;
1230	}
1231	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1232}
1233
1234static void ipv4_link_failure(struct sk_buff *skb)
1235{
1236	struct rtable *rt;
1237
1238	ipv4_send_dest_unreach(skb);
1239
1240	rt = skb_rtable(skb);
1241	if (rt)
1242		dst_set_expires(&rt->dst, 0);
1243}
1244
1245static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1246{
1247	pr_debug("%s: %pI4 -> %pI4, %s\n",
1248		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1249		 skb->dev ? skb->dev->name : "?");
1250	kfree_skb(skb);
1251	WARN_ON(1);
1252	return 0;
1253}
1254
1255/*
1256   We do not cache source address of outgoing interface,
1257   because it is used only by IP RR, TS and SRR options,
 1258   so that it stays out of the fast path.
1259
1260   BTW remember: "addr" is allowed to be not aligned
1261   in IP options!
1262 */
1263
1264void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1265{
1266	__be32 src;
1267
1268	if (rt_is_output_route(rt))
1269		src = ip_hdr(skb)->saddr;
1270	else {
1271		struct fib_result res;
1272		struct iphdr *iph = ip_hdr(skb);
1273		struct flowi4 fl4 = {
1274			.daddr = iph->daddr,
1275			.saddr = iph->saddr,
1276			.flowi4_tos = RT_TOS(iph->tos),
1277			.flowi4_oif = rt->dst.dev->ifindex,
1278			.flowi4_iif = skb->dev->ifindex,
1279			.flowi4_mark = skb->mark,
1280		};
1281
1282		rcu_read_lock();
1283		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1284			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1285		else
1286			src = inet_select_addr(rt->dst.dev,
1287					       rt_nexthop(rt, iph->daddr),
1288					       RT_SCOPE_UNIVERSE);
1289		rcu_read_unlock();
1290	}
1291	memcpy(addr, &src, 4);
1292}
1293
1294#ifdef CONFIG_IP_ROUTE_CLASSID
1295static void set_class_tag(struct rtable *rt, u32 tag)
1296{
1297	if (!(rt->dst.tclassid & 0xFFFF))
1298		rt->dst.tclassid |= tag & 0xFFFF;
1299	if (!(rt->dst.tclassid & 0xFFFF0000))
1300		rt->dst.tclassid |= tag & 0xFFFF0000;
1301}
1302#endif
1303
1304static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1305{
1306	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1307	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1308				    ip_rt_min_advmss);
1309
1310	return min(advmss, IPV4_MAX_PMTU - header_size);
1311}
1312
1313static unsigned int ipv4_mtu(const struct dst_entry *dst)
1314{
1315	const struct rtable *rt = (const struct rtable *) dst;
1316	unsigned int mtu = rt->rt_pmtu;
1317
1318	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1319		mtu = dst_metric_raw(dst, RTAX_MTU);
1320
1321	if (mtu)
1322		return mtu;
1323
1324	mtu = READ_ONCE(dst->dev->mtu);
1325
1326	if (unlikely(ip_mtu_locked(dst))) {
1327		if (rt->rt_uses_gateway && mtu > 576)
1328			mtu = 576;
1329	}
1330
1331	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1332
1333	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1334}
1335
1336static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1337{
1338	struct fnhe_hash_bucket *hash;
1339	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1340	u32 hval = fnhe_hashfun(daddr);
1341
1342	spin_lock_bh(&fnhe_lock);
1343
1344	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1345					 lockdep_is_held(&fnhe_lock));
1346	hash += hval;
1347
1348	fnhe_p = &hash->chain;
1349	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1350	while (fnhe) {
1351		if (fnhe->fnhe_daddr == daddr) {
1352			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1353				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1354			/* set fnhe_daddr to 0 to ensure it won't bind with
1355			 * new dsts in rt_bind_exception().
1356			 */
1357			fnhe->fnhe_daddr = 0;
1358			fnhe_flush_routes(fnhe);
1359			kfree_rcu(fnhe, rcu);
1360			break;
1361		}
1362		fnhe_p = &fnhe->fnhe_next;
1363		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1364						 lockdep_is_held(&fnhe_lock));
1365	}
1366
1367	spin_unlock_bh(&fnhe_lock);
1368}
1369
1370static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1371					       __be32 daddr)
1372{
1373	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1374	struct fib_nh_exception *fnhe;
1375	u32 hval;
1376
1377	if (!hash)
1378		return NULL;
1379
1380	hval = fnhe_hashfun(daddr);
1381
1382	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1383	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1384		if (fnhe->fnhe_daddr == daddr) {
1385			if (fnhe->fnhe_expires &&
1386			    time_after(jiffies, fnhe->fnhe_expires)) {
1387				ip_del_fnhe(nhc, daddr);
1388				break;
1389			}
1390			return fnhe;
1391		}
1392	}
1393	return NULL;
1394}
1395
1396/* MTU selection:
1397 * 1. mtu on route is locked - use it
1398 * 2. mtu from nexthop exception
1399 * 3. mtu from egress device
1400 */
1401
1402u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1403{
1404	struct fib_nh_common *nhc = res->nhc;
1405	struct net_device *dev = nhc->nhc_dev;
1406	struct fib_info *fi = res->fi;
1407	u32 mtu = 0;
1408
1409	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1410	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1411		mtu = fi->fib_mtu;
1412
1413	if (likely(!mtu)) {
1414		struct fib_nh_exception *fnhe;
1415
1416		fnhe = find_exception(nhc, daddr);
1417		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1418			mtu = fnhe->fnhe_pmtu;
1419	}
1420
1421	if (likely(!mtu))
1422		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1423
1424	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1425}
1426
1427static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1428			      __be32 daddr, const bool do_cache)
1429{
1430	bool ret = false;
1431
1432	spin_lock_bh(&fnhe_lock);
1433
1434	if (daddr == fnhe->fnhe_daddr) {
1435		struct rtable __rcu **porig;
1436		struct rtable *orig;
1437		int genid = fnhe_genid(dev_net(rt->dst.dev));
1438
1439		if (rt_is_input_route(rt))
1440			porig = &fnhe->fnhe_rth_input;
1441		else
1442			porig = &fnhe->fnhe_rth_output;
1443		orig = rcu_dereference(*porig);
1444
1445		if (fnhe->fnhe_genid != genid) {
1446			fnhe->fnhe_genid = genid;
1447			fnhe->fnhe_gw = 0;
1448			fnhe->fnhe_pmtu = 0;
1449			fnhe->fnhe_expires = 0;
1450			fnhe->fnhe_mtu_locked = false;
1451			fnhe_flush_routes(fnhe);
1452			orig = NULL;
1453		}
1454		fill_route_from_fnhe(rt, fnhe);
1455		if (!rt->rt_gw4) {
1456			rt->rt_gw4 = daddr;
1457			rt->rt_gw_family = AF_INET;
1458		}
1459
1460		if (do_cache) {
1461			dst_hold(&rt->dst);
1462			rcu_assign_pointer(*porig, rt);
1463			if (orig) {
1464				dst_dev_put(&orig->dst);
1465				dst_release(&orig->dst);
1466			}
1467			ret = true;
1468		}
1469
1470		fnhe->fnhe_stamp = jiffies;
1471	}
1472	spin_unlock_bh(&fnhe_lock);
1473
1474	return ret;
1475}
1476
1477static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1478{
1479	struct rtable *orig, *prev, **p;
1480	bool ret = true;
1481
1482	if (rt_is_input_route(rt)) {
1483		p = (struct rtable **)&nhc->nhc_rth_input;
1484	} else {
1485		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1486	}
1487	orig = *p;
1488
1489	/* hold dst before doing cmpxchg() to avoid race condition
1490	 * on this dst
1491	 */
1492	dst_hold(&rt->dst);
1493	prev = cmpxchg(p, orig, rt);
1494	if (prev == orig) {
1495		if (orig) {
1496			rt_add_uncached_list(orig);
1497			dst_release(&orig->dst);
1498		}
1499	} else {
1500		dst_release(&rt->dst);
1501		ret = false;
1502	}
1503
1504	return ret;
1505}
1506
1507struct uncached_list {
1508	spinlock_t		lock;
1509	struct list_head	head;
1510};
1511
1512static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1513
1514void rt_add_uncached_list(struct rtable *rt)
1515{
1516	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1517
1518	rt->rt_uncached_list = ul;
1519
1520	spin_lock_bh(&ul->lock);
1521	list_add_tail(&rt->rt_uncached, &ul->head);
1522	spin_unlock_bh(&ul->lock);
1523}
1524
1525void rt_del_uncached_list(struct rtable *rt)
1526{
1527	if (!list_empty(&rt->rt_uncached)) {
1528		struct uncached_list *ul = rt->rt_uncached_list;
1529
1530		spin_lock_bh(&ul->lock);
1531		list_del(&rt->rt_uncached);
1532		spin_unlock_bh(&ul->lock);
1533	}
1534}
1535
1536static void ipv4_dst_destroy(struct dst_entry *dst)
1537{
1538	struct rtable *rt = (struct rtable *)dst;
1539
1540	ip_dst_metrics_put(dst);
1541	rt_del_uncached_list(rt);
1542}
1543
1544void rt_flush_dev(struct net_device *dev)
1545{
1546	struct rtable *rt;
1547	int cpu;
1548
1549	for_each_possible_cpu(cpu) {
1550		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1551
1552		spin_lock_bh(&ul->lock);
1553		list_for_each_entry(rt, &ul->head, rt_uncached) {
1554			if (rt->dst.dev != dev)
1555				continue;
1556			rt->dst.dev = blackhole_netdev;
1557			dev_hold(rt->dst.dev);
1558			dev_put(dev);
1559		}
1560		spin_unlock_bh(&ul->lock);
1561	}
1562}
1563
1564static bool rt_cache_valid(const struct rtable *rt)
1565{
1566	return	rt &&
1567		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1568		!rt_is_expired(rt);
1569}
1570
1571static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1572			   const struct fib_result *res,
1573			   struct fib_nh_exception *fnhe,
1574			   struct fib_info *fi, u16 type, u32 itag,
1575			   const bool do_cache)
1576{
1577	bool cached = false;
1578
1579	if (fi) {
1580		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1581
1582		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1583			rt->rt_uses_gateway = 1;
1584			rt->rt_gw_family = nhc->nhc_gw_family;
1585			/* only INET and INET6 are supported */
1586			if (likely(nhc->nhc_gw_family == AF_INET))
1587				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1588			else
1589				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1590		}
1591
1592		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1593
1594#ifdef CONFIG_IP_ROUTE_CLASSID
1595		if (nhc->nhc_family == AF_INET) {
1596			struct fib_nh *nh;
1597
1598			nh = container_of(nhc, struct fib_nh, nh_common);
1599			rt->dst.tclassid = nh->nh_tclassid;
1600		}
1601#endif
1602		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1603		if (unlikely(fnhe))
1604			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1605		else if (do_cache)
1606			cached = rt_cache_route(nhc, rt);
1607		if (unlikely(!cached)) {
1608			/* Routes we intend to cache in nexthop exception or
1609			 * FIB nexthop have the DST_NOCACHE bit clear.
1610			 * However, if we are unsuccessful at storing this
1611			 * route into the cache we really need to set it.
1612			 */
1613			if (!rt->rt_gw4) {
1614				rt->rt_gw_family = AF_INET;
1615				rt->rt_gw4 = daddr;
1616			}
1617			rt_add_uncached_list(rt);
1618		}
1619	} else
1620		rt_add_uncached_list(rt);
1621
1622#ifdef CONFIG_IP_ROUTE_CLASSID
1623#ifdef CONFIG_IP_MULTIPLE_TABLES
1624	set_class_tag(rt, res->tclassid);
1625#endif
1626	set_class_tag(rt, itag);
1627#endif
1628}
1629
1630struct rtable *rt_dst_alloc(struct net_device *dev,
1631			    unsigned int flags, u16 type,
1632			    bool nopolicy, bool noxfrm)
1633{
1634	struct rtable *rt;
1635
1636	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1637		       (nopolicy ? DST_NOPOLICY : 0) |
1638		       (noxfrm ? DST_NOXFRM : 0));
1639
1640	if (rt) {
1641		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1642		rt->rt_flags = flags;
1643		rt->rt_type = type;
1644		rt->rt_is_input = 0;
1645		rt->rt_iif = 0;
1646		rt->rt_pmtu = 0;
1647		rt->rt_mtu_locked = 0;
1648		rt->rt_uses_gateway = 0;
1649		rt->rt_gw_family = 0;
1650		rt->rt_gw4 = 0;
1651		INIT_LIST_HEAD(&rt->rt_uncached);
1652
1653		rt->dst.output = ip_output;
1654		if (flags & RTCF_LOCAL)
1655			rt->dst.input = ip_local_deliver;
1656	}
1657
1658	return rt;
1659}
1660EXPORT_SYMBOL(rt_dst_alloc);
1661
1662struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1663{
1664	struct rtable *new_rt;
1665
1666	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1667			   rt->dst.flags);
1668
1669	if (new_rt) {
1670		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1671		new_rt->rt_flags = rt->rt_flags;
1672		new_rt->rt_type = rt->rt_type;
1673		new_rt->rt_is_input = rt->rt_is_input;
1674		new_rt->rt_iif = rt->rt_iif;
1675		new_rt->rt_pmtu = rt->rt_pmtu;
1676		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1677		new_rt->rt_gw_family = rt->rt_gw_family;
1678		if (rt->rt_gw_family == AF_INET)
1679			new_rt->rt_gw4 = rt->rt_gw4;
1680		else if (rt->rt_gw_family == AF_INET6)
1681			new_rt->rt_gw6 = rt->rt_gw6;
1682		INIT_LIST_HEAD(&new_rt->rt_uncached);
1683
1684		new_rt->dst.input = rt->dst.input;
1685		new_rt->dst.output = rt->dst.output;
1686		new_rt->dst.error = rt->dst.error;
1687		new_rt->dst.lastuse = jiffies;
1688		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1689	}
1690	return new_rt;
1691}
1692EXPORT_SYMBOL(rt_dst_clone);
1693
1694/* called in rcu_read_lock() section */
1695int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1696			  u8 tos, struct net_device *dev,
1697			  struct in_device *in_dev, u32 *itag)
1698{
1699	int err;
1700
1701	/* Primary sanity checks. */
1702	if (!in_dev)
1703		return -EINVAL;
1704
1705	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1706	    skb->protocol != htons(ETH_P_IP))
1707		return -EINVAL;
1708
1709	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1710		return -EINVAL;
1711
1712	if (ipv4_is_zeronet(saddr)) {
1713		if (!ipv4_is_local_multicast(daddr) &&
1714		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1715			return -EINVAL;
1716	} else {
1717		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1718					  in_dev, itag);
1719		if (err < 0)
1720			return err;
1721	}
1722	return 0;
1723}
1724
1725/* called in rcu_read_lock() section */
1726static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1727			     u8 tos, struct net_device *dev, int our)
1728{
1729	struct in_device *in_dev = __in_dev_get_rcu(dev);
1730	unsigned int flags = RTCF_MULTICAST;
1731	struct rtable *rth;
1732	u32 itag = 0;
1733	int err;
1734
1735	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1736	if (err)
1737		return err;
1738
1739	if (our)
1740		flags |= RTCF_LOCAL;
1741
1742	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1743			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1744	if (!rth)
1745		return -ENOBUFS;
1746
1747#ifdef CONFIG_IP_ROUTE_CLASSID
1748	rth->dst.tclassid = itag;
1749#endif
1750	rth->dst.output = ip_rt_bug;
1751	rth->rt_is_input= 1;
1752
1753#ifdef CONFIG_IP_MROUTE
1754	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1755		rth->dst.input = ip_mr_input;
1756#endif
1757	RT_CACHE_STAT_INC(in_slow_mc);
1758
1759	skb_dst_set(skb, &rth->dst);
1760	return 0;
1761}
1762
1763
1764static void ip_handle_martian_source(struct net_device *dev,
1765				     struct in_device *in_dev,
1766				     struct sk_buff *skb,
1767				     __be32 daddr,
1768				     __be32 saddr)
1769{
1770	RT_CACHE_STAT_INC(in_martian_src);
1771#ifdef CONFIG_IP_ROUTE_VERBOSE
1772	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1773		/*
1774		 *	RFC1812 recommendation, if source is martian,
1775		 *	the only hint is MAC header.
1776		 */
1777		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1778			&daddr, &saddr, dev->name);
1779		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1780			print_hex_dump(KERN_WARNING, "ll header: ",
1781				       DUMP_PREFIX_OFFSET, 16, 1,
1782				       skb_mac_header(skb),
1783				       dev->hard_header_len, false);
1784		}
1785	}
1786#endif
1787}
1788
1789/* called in rcu_read_lock() section */
1790static int __mkroute_input(struct sk_buff *skb,
1791			   const struct fib_result *res,
1792			   struct in_device *in_dev,
1793			   __be32 daddr, __be32 saddr, u32 tos)
1794{
1795	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1796	struct net_device *dev = nhc->nhc_dev;
1797	struct fib_nh_exception *fnhe;
1798	struct rtable *rth;
1799	int err;
1800	struct in_device *out_dev;
1801	bool do_cache;
1802	u32 itag = 0;
1803
1804	/* get a working reference to the output device */
1805	out_dev = __in_dev_get_rcu(dev);
1806	if (!out_dev) {
1807		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1808		return -EINVAL;
1809	}
1810
1811	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1812				  in_dev->dev, in_dev, &itag);
1813	if (err < 0) {
1814		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1815					 saddr);
1816
1817		goto cleanup;
1818	}
1819
1820	do_cache = res->fi && !itag;
1821	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1822	    skb->protocol == htons(ETH_P_IP)) {
1823		__be32 gw;
1824
1825		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1826		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1827		    inet_addr_onlink(out_dev, saddr, gw))
1828			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1829	}
1830
1831	if (skb->protocol != htons(ETH_P_IP)) {
1832		/* Not IP (i.e. ARP). Do not create route, if it is
1833		 * invalid for proxy arp. DNAT routes are always valid.
1834		 *
 1835		 * The proxy ARP feature has been extended to allow ARP
 1836		 * replies back to the same interface, to support
1837		 * Private VLAN switch technologies. See arp.c.
1838		 */
1839		if (out_dev == in_dev &&
1840		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1841			err = -EINVAL;
1842			goto cleanup;
1843		}
1844	}
1845
1846	fnhe = find_exception(nhc, daddr);
1847	if (do_cache) {
1848		if (fnhe)
1849			rth = rcu_dereference(fnhe->fnhe_rth_input);
1850		else
1851			rth = rcu_dereference(nhc->nhc_rth_input);
1852		if (rt_cache_valid(rth)) {
1853			skb_dst_set_noref(skb, &rth->dst);
1854			goto out;
1855		}
1856	}
1857
1858	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1859			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1860			   IN_DEV_CONF_GET(out_dev, NOXFRM));
1861	if (!rth) {
1862		err = -ENOBUFS;
1863		goto cleanup;
1864	}
1865
1866	rth->rt_is_input = 1;
1867	RT_CACHE_STAT_INC(in_slow_tot);
1868
1869	rth->dst.input = ip_forward;
1870
1871	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1872		       do_cache);
1873	lwtunnel_set_redirect(&rth->dst);
1874	skb_dst_set(skb, &rth->dst);
1875out:
1876	err = 0;
1877 cleanup:
1878	return err;
1879}
1880
1881#ifdef CONFIG_IP_ROUTE_MULTIPATH
1882/* To make ICMP packets follow the right flow, the multipath hash is
1883 * calculated from the inner IP addresses.
1884 */
1885static void ip_multipath_l3_keys(const struct sk_buff *skb,
1886				 struct flow_keys *hash_keys)
1887{
1888	const struct iphdr *outer_iph = ip_hdr(skb);
1889	const struct iphdr *key_iph = outer_iph;
1890	const struct iphdr *inner_iph;
1891	const struct icmphdr *icmph;
1892	struct iphdr _inner_iph;
1893	struct icmphdr _icmph;
1894
1895	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1896		goto out;
1897
1898	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1899		goto out;
1900
1901	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1902				   &_icmph);
1903	if (!icmph)
1904		goto out;
1905
1906	if (!icmp_is_err(icmph->type))
1907		goto out;
1908
1909	inner_iph = skb_header_pointer(skb,
1910				       outer_iph->ihl * 4 + sizeof(_icmph),
1911				       sizeof(_inner_iph), &_inner_iph);
1912	if (!inner_iph)
1913		goto out;
1914
1915	key_iph = inner_iph;
1916out:
1917	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1918	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1919}
1920
1921/* if skb is set it will be used and fl4 can be NULL */
1922int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1923		       const struct sk_buff *skb, struct flow_keys *flkeys)
1924{
1925	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1926	struct flow_keys hash_keys;
1927	u32 mhash;
1928
1929	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1930	case 0:
1931		memset(&hash_keys, 0, sizeof(hash_keys));
1932		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1933		if (skb) {
1934			ip_multipath_l3_keys(skb, &hash_keys);
1935		} else {
1936			hash_keys.addrs.v4addrs.src = fl4->saddr;
1937			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1938		}
1939		break;
1940	case 1:
1941		/* skb is currently provided only when forwarding */
1942		if (skb) {
1943			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1944			struct flow_keys keys;
1945
1946			/* short-circuit if we already have L4 hash present */
1947			if (skb->l4_hash)
1948				return skb_get_hash_raw(skb) >> 1;
1949
1950			memset(&hash_keys, 0, sizeof(hash_keys));
1951
1952			if (!flkeys) {
1953				skb_flow_dissect_flow_keys(skb, &keys, flag);
1954				flkeys = &keys;
1955			}
1956
1957			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1958			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1959			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1960			hash_keys.ports.src = flkeys->ports.src;
1961			hash_keys.ports.dst = flkeys->ports.dst;
1962			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1963		} else {
1964			memset(&hash_keys, 0, sizeof(hash_keys));
1965			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1966			hash_keys.addrs.v4addrs.src = fl4->saddr;
1967			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1968			hash_keys.ports.src = fl4->fl4_sport;
1969			hash_keys.ports.dst = fl4->fl4_dport;
1970			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1971		}
1972		break;
1973	case 2:
1974		memset(&hash_keys, 0, sizeof(hash_keys));
1975		/* skb is currently provided only when forwarding */
1976		if (skb) {
1977			struct flow_keys keys;
1978
1979			skb_flow_dissect_flow_keys(skb, &keys, 0);
1980			/* Inner can be v4 or v6 */
1981			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1982				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1983				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1984				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1985			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1986				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1987				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1988				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1989				hash_keys.tags.flow_label = keys.tags.flow_label;
1990				hash_keys.basic.ip_proto = keys.basic.ip_proto;
1991			} else {
1992				/* Same as case 0 */
1993				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1994				ip_multipath_l3_keys(skb, &hash_keys);
1995			}
1996		} else {
1997			/* Same as case 0 */
1998			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1999			hash_keys.addrs.v4addrs.src = fl4->saddr;
2000			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2001		}
2002		break;
2003	}
2004	mhash = flow_hash_from_keys(&hash_keys);
2005
2006	if (multipath_hash)
2007		mhash = jhash_2words(mhash, multipath_hash, 0);
2008
2009	return mhash >> 1;
2010}
2011#endif /* CONFIG_IP_ROUTE_MULTIPATH */
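/* Illustration (not part of this file): the policy switch above is selected
 * per network namespace through a sysctl, for example:
 *
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=0   # L3 addresses only
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=1   # L4 five-tuple
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=2   # inner L3 headers
 *
 * The value read in fib_multipath_hash() is
 * net->ipv4.sysctl_fib_multipath_hash_policy.
 */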
2012
2013static int ip_mkroute_input(struct sk_buff *skb,
2014			    struct fib_result *res,
2015			    struct in_device *in_dev,
2016			    __be32 daddr, __be32 saddr, u32 tos,
2017			    struct flow_keys *hkeys)
2018{
2019#ifdef CONFIG_IP_ROUTE_MULTIPATH
2020	if (res->fi && fib_info_num_path(res->fi) > 1) {
2021		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2022
2023		fib_select_multipath(res, h);
2024	}
2025#endif
2026
2027	/* create a routing cache entry */
2028	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2029}
2030
2031/* Implements the same saddr-related checks as ip_route_input_slow(),
2032 * assuming daddr is valid and the destination is not a local broadcast one.
2033 * Uses the provided hint instead of performing a route lookup.
2034 */
2035int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2036		      u8 tos, struct net_device *dev,
2037		      const struct sk_buff *hint)
2038{
2039	struct in_device *in_dev = __in_dev_get_rcu(dev);
2040	struct rtable *rt = skb_rtable(hint);
2041	struct net *net = dev_net(dev);
2042	int err = -EINVAL;
2043	u32 tag = 0;
2044
2045	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2046		goto martian_source;
2047
2048	if (ipv4_is_zeronet(saddr))
2049		goto martian_source;
2050
2051	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2052		goto martian_source;
2053
2054	if (rt->rt_type != RTN_LOCAL)
2055		goto skip_validate_source;
2056
2057	tos &= IPTOS_RT_MASK;
2058	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2059	if (err < 0)
2060		goto martian_source;
2061
2062skip_validate_source:
2063	skb_dst_copy(skb, hint);
2064	return 0;
2065
2066martian_source:
2067	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2068	return err;
2069}
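/* Usage sketch (illustrative; the real caller-side checks in the IPv4 input
 * path are more involved): a batched receive loop may reuse a previously
 * routed skb as @hint when the next packet goes to the same destination,
 * avoiding a second fib_lookup():
 *
 *	if (hint && ip_hdr(skb)->daddr == ip_hdr(hint)->daddr)
 *		err = ip_route_use_hint(skb, daddr, saddr, tos, dev, hint);
 *	else
 *		err = ip_route_input_noref(skb, daddr, saddr, tos, dev);
 */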
2070
2071/*
2072 *	NOTE. We drop all packets that have local source
2073 *	addresses, because every properly looped-back packet
2074 *	must already have the correct destination attached by the output routine.
2075 *	Changes in the enforced policies must also be applied to
2076 *	ip_route_use_hint().
2077 *
2078 *	This approach solves two big problems:
2079 *	1. Non-simplex devices are handled properly.
2080 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2081 *	Called with rcu_read_lock().
2082 */
2083
2084static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2085			       u8 tos, struct net_device *dev,
2086			       struct fib_result *res)
2087{
2088	struct in_device *in_dev = __in_dev_get_rcu(dev);
2089	struct flow_keys *flkeys = NULL, _flkeys;
2090	struct net    *net = dev_net(dev);
2091	struct ip_tunnel_info *tun_info;
2092	int		err = -EINVAL;
2093	unsigned int	flags = 0;
2094	u32		itag = 0;
2095	struct rtable	*rth;
2096	struct flowi4	fl4;
2097	bool do_cache = true;
2098
2099	/* IP on this device is disabled. */
2100
2101	if (!in_dev)
2102		goto out;
2103
2104	/* Check for the most weird martians, which cannot be detected
2105	   by fib_lookup.
2106	 */
2107
2108	tun_info = skb_tunnel_info(skb);
2109	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2110		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2111	else
2112		fl4.flowi4_tun_key.tun_id = 0;
2113	skb_dst_drop(skb);
2114
2115	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2116		goto martian_source;
2117
2118	res->fi = NULL;
2119	res->table = NULL;
2120	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2121		goto brd_input;
2122
2123	/* Accept zero addresses only for limited broadcast;
2124	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2125	 */
2126	if (ipv4_is_zeronet(saddr))
2127		goto martian_source;
2128
2129	if (ipv4_is_zeronet(daddr))
2130		goto martian_destination;
2131
2132	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2133	 * calling it at most once when daddr and/or saddr is a loopback address.
2134	 */
2135	if (ipv4_is_loopback(daddr)) {
2136		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2137			goto martian_destination;
2138	} else if (ipv4_is_loopback(saddr)) {
2139		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2140			goto martian_source;
2141	}
2142
2143	/*
2144	 *	Now we are ready to route the packet.
2145	 */
2146	fl4.flowi4_oif = 0;
2147	fl4.flowi4_iif = dev->ifindex;
2148	fl4.flowi4_mark = skb->mark;
2149	fl4.flowi4_tos = tos;
2150	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2151	fl4.flowi4_flags = 0;
2152	fl4.daddr = daddr;
2153	fl4.saddr = saddr;
2154	fl4.flowi4_uid = sock_net_uid(net, NULL);
2155	fl4.flowi4_multipath_hash = 0;
2156
2157	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2158		flkeys = &_flkeys;
2159	} else {
2160		fl4.flowi4_proto = 0;
2161		fl4.fl4_sport = 0;
2162		fl4.fl4_dport = 0;
2163	}
2164
2165	err = fib_lookup(net, &fl4, res, 0);
2166	if (err != 0) {
2167		if (!IN_DEV_FORWARD(in_dev))
2168			err = -EHOSTUNREACH;
2169		goto no_route;
2170	}
2171
2172	if (res->type == RTN_BROADCAST) {
2173		if (IN_DEV_BFORWARD(in_dev))
2174			goto make_route;
2175		/* do not cache if bc_forwarding is enabled */
2176		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2177			do_cache = false;
2178		goto brd_input;
2179	}
2180
2181	if (res->type == RTN_LOCAL) {
2182		err = fib_validate_source(skb, saddr, daddr, tos,
2183					  0, dev, in_dev, &itag);
2184		if (err < 0)
2185			goto martian_source;
2186		goto local_input;
2187	}
2188
2189	if (!IN_DEV_FORWARD(in_dev)) {
2190		err = -EHOSTUNREACH;
2191		goto no_route;
2192	}
2193	if (res->type != RTN_UNICAST)
2194		goto martian_destination;
2195
2196make_route:
2197	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2198out:	return err;
2199
2200brd_input:
2201	if (skb->protocol != htons(ETH_P_IP))
2202		goto e_inval;
2203
2204	if (!ipv4_is_zeronet(saddr)) {
2205		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2206					  in_dev, &itag);
2207		if (err < 0)
2208			goto martian_source;
2209	}
2210	flags |= RTCF_BROADCAST;
2211	res->type = RTN_BROADCAST;
2212	RT_CACHE_STAT_INC(in_brd);
2213
2214local_input:
2215	do_cache &= res->fi && !itag;
2216	if (do_cache) {
2217		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2218
2219		rth = rcu_dereference(nhc->nhc_rth_input);
2220		if (rt_cache_valid(rth)) {
2221			skb_dst_set_noref(skb, &rth->dst);
2222			err = 0;
2223			goto out;
2224		}
2225	}
2226
2227	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2228			   flags | RTCF_LOCAL, res->type,
2229			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2230	if (!rth)
2231		goto e_nobufs;
2232
2233	rth->dst.output = ip_rt_bug;
2234#ifdef CONFIG_IP_ROUTE_CLASSID
2235	rth->dst.tclassid = itag;
2236#endif
2237	rth->rt_is_input = 1;
2238
2239	RT_CACHE_STAT_INC(in_slow_tot);
2240	if (res->type == RTN_UNREACHABLE) {
2241		rth->dst.input = ip_error;
2242		rth->dst.error = -err;
2243		rth->rt_flags &= ~RTCF_LOCAL;
2244	}
2245
2246	if (do_cache) {
2247		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2248
2249		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2250		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2251			WARN_ON(rth->dst.input == lwtunnel_input);
2252			rth->dst.lwtstate->orig_input = rth->dst.input;
2253			rth->dst.input = lwtunnel_input;
2254		}
2255
2256		if (unlikely(!rt_cache_route(nhc, rth)))
2257			rt_add_uncached_list(rth);
2258	}
2259	skb_dst_set(skb, &rth->dst);
2260	err = 0;
2261	goto out;
2262
2263no_route:
2264	RT_CACHE_STAT_INC(in_no_route);
2265	res->type = RTN_UNREACHABLE;
2266	res->fi = NULL;
2267	res->table = NULL;
2268	goto local_input;
2269
2270	/*
2271	 *	Do not cache martian addresses: they should be logged (RFC1812)
2272	 */
2273martian_destination:
2274	RT_CACHE_STAT_INC(in_martian_dst);
2275#ifdef CONFIG_IP_ROUTE_VERBOSE
2276	if (IN_DEV_LOG_MARTIANS(in_dev))
2277		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2278				     &daddr, &saddr, dev->name);
2279#endif
2280
2281e_inval:
2282	err = -EINVAL;
2283	goto out;
2284
2285e_nobufs:
2286	err = -ENOBUFS;
2287	goto out;
2288
2289martian_source:
2290	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2291	goto out;
2292}
2293
2294int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2295			 u8 tos, struct net_device *dev)
2296{
2297	struct fib_result res;
2298	int err;
2299
2300	tos &= IPTOS_RT_MASK;
2301	rcu_read_lock();
2302	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2303	rcu_read_unlock();
2304
2305	return err;
2306}
2307EXPORT_SYMBOL(ip_route_input_noref);
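/* Usage sketch (illustrative): the IPv4 receive path resolves the input
 * route for an skb roughly like this before delivering or forwarding it:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * On success the skb carries a dst, and dst_input(skb) then dispatches to
 * ip_local_deliver or ip_forward through rth->dst.input.
 */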
2308
2309/* called with rcu_read_lock held */
2310int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2311		       u8 tos, struct net_device *dev, struct fib_result *res)
2312{
2313	/* Multicast recognition logic was moved from the route cache to here.
2314	   The problem was that too many Ethernet cards have broken/missing
2315	   hardware multicast filters :-( As a result, a host on a multicast
2316	   network acquires a lot of useless route cache entries, e.g. from
2317	   SDR messages from all over the world. Now we try to get rid of them.
2318	   Really, provided the software IP multicast filter is organized
2319	   reasonably (at least, hashed), it does not result in a slowdown
2320	   compared with route cache reject entries.
2321	   Note that multicast routers are not affected, because a
2322	   route cache entry is created eventually.
2323	 */
2324	if (ipv4_is_multicast(daddr)) {
2325		struct in_device *in_dev = __in_dev_get_rcu(dev);
2326		int our = 0;
2327		int err = -EINVAL;
2328
2329		if (!in_dev)
2330			return err;
2331		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2332				      ip_hdr(skb)->protocol);
2333
2334		/* check l3 master if no match yet */
2335		if (!our && netif_is_l3_slave(dev)) {
2336			struct in_device *l3_in_dev;
2337
2338			l3_in_dev = __in_dev_get_rcu(skb->dev);
2339			if (l3_in_dev)
2340				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2341						      ip_hdr(skb)->protocol);
2342		}
2343
2344		if (our
2345#ifdef CONFIG_IP_MROUTE
2346			||
2347		    (!ipv4_is_local_multicast(daddr) &&
2348		     IN_DEV_MFORWARD(in_dev))
2349#endif
2350		   ) {
2351			err = ip_route_input_mc(skb, daddr, saddr,
2352						tos, dev, our);
2353		}
2354		return err;
2355	}
2356
2357	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2358}
2359
2360/* called with rcu_read_lock() */
2361static struct rtable *__mkroute_output(const struct fib_result *res,
2362				       const struct flowi4 *fl4, int orig_oif,
2363				       struct net_device *dev_out,
2364				       unsigned int flags)
2365{
2366	struct fib_info *fi = res->fi;
2367	struct fib_nh_exception *fnhe;
2368	struct in_device *in_dev;
2369	u16 type = res->type;
2370	struct rtable *rth;
2371	bool do_cache;
2372
2373	in_dev = __in_dev_get_rcu(dev_out);
2374	if (!in_dev)
2375		return ERR_PTR(-EINVAL);
2376
2377	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2378		if (ipv4_is_loopback(fl4->saddr) &&
2379		    !(dev_out->flags & IFF_LOOPBACK) &&
2380		    !netif_is_l3_master(dev_out))
2381			return ERR_PTR(-EINVAL);
2382
2383	if (ipv4_is_lbcast(fl4->daddr))
2384		type = RTN_BROADCAST;
2385	else if (ipv4_is_multicast(fl4->daddr))
2386		type = RTN_MULTICAST;
2387	else if (ipv4_is_zeronet(fl4->daddr))
2388		return ERR_PTR(-EINVAL);
2389
2390	if (dev_out->flags & IFF_LOOPBACK)
2391		flags |= RTCF_LOCAL;
2392
2393	do_cache = true;
2394	if (type == RTN_BROADCAST) {
2395		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2396		fi = NULL;
2397	} else if (type == RTN_MULTICAST) {
2398		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2399		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2400				     fl4->flowi4_proto))
2401			flags &= ~RTCF_LOCAL;
2402		else
2403			do_cache = false;
2404		/* If a multicast route does not exist, use the
2405		 * default one, but do not use a gateway in this case.
2406		 * Yes, it is a hack.
2407		 */
2408		if (fi && res->prefixlen < 4)
2409			fi = NULL;
2410	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2411		   (orig_oif != dev_out->ifindex)) {
2412		/* For local routes that require a particular output interface
2413		 * we do not want to cache the result.  Caching the result
2414		 * causes incorrect behaviour when there are multiple source
2415		 * addresses on the interface, the end result being that if the
2416		 * intended recipient is waiting on that interface for the
2417		 * packet he won't receive it because it will be delivered on
2418		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2419		 * be set to the loopback interface as well.
2420		 */
2421		do_cache = false;
2422	}
2423
2424	fnhe = NULL;
2425	do_cache &= fi != NULL;
2426	if (fi) {
2427		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2428		struct rtable __rcu **prth;
2429
2430		fnhe = find_exception(nhc, fl4->daddr);
2431		if (!do_cache)
2432			goto add;
2433		if (fnhe) {
2434			prth = &fnhe->fnhe_rth_output;
2435		} else {
2436			if (unlikely(fl4->flowi4_flags &
2437				     FLOWI_FLAG_KNOWN_NH &&
2438				     !(nhc->nhc_gw_family &&
2439				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2440				do_cache = false;
2441				goto add;
2442			}
2443			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2444		}
2445		rth = rcu_dereference(*prth);
2446		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2447			return rth;
2448	}
2449
2450add:
2451	rth = rt_dst_alloc(dev_out, flags, type,
2452			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2453			   IN_DEV_CONF_GET(in_dev, NOXFRM));
2454	if (!rth)
2455		return ERR_PTR(-ENOBUFS);
2456
2457	rth->rt_iif = orig_oif;
2458
2459	RT_CACHE_STAT_INC(out_slow_tot);
2460
2461	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2462		if (flags & RTCF_LOCAL &&
2463		    !(dev_out->flags & IFF_LOOPBACK)) {
2464			rth->dst.output = ip_mc_output;
2465			RT_CACHE_STAT_INC(out_slow_mc);
2466		}
2467#ifdef CONFIG_IP_MROUTE
2468		if (type == RTN_MULTICAST) {
2469			if (IN_DEV_MFORWARD(in_dev) &&
2470			    !ipv4_is_local_multicast(fl4->daddr)) {
2471				rth->dst.input = ip_mr_input;
2472				rth->dst.output = ip_mc_output;
2473			}
2474		}
2475#endif
2476	}
2477
2478	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2479	lwtunnel_set_redirect(&rth->dst);
2480
2481	return rth;
2482}
2483
2484/*
2485 * Major route resolver routine.
2486 */
2487
2488struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2489					const struct sk_buff *skb)
2490{
2491	__u8 tos = RT_FL_TOS(fl4);
2492	struct fib_result res = {
2493		.type		= RTN_UNSPEC,
2494		.fi		= NULL,
2495		.table		= NULL,
2496		.tclassid	= 0,
2497	};
2498	struct rtable *rth;
2499
2500	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2501	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2502	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2503			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2504
2505	rcu_read_lock();
2506	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2507	rcu_read_unlock();
2508
2509	return rth;
2510}
2511EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
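/* Usage sketch (illustrative, not part of this file): output-path callers
 * normally reach this resolver through wrappers such as ip_route_output_key()
 * or ip_route_output_flow() below, roughly:
 *
 *	struct flowi4 fl4 = {
 *		.daddr      = daddr,
 *		.saddr      = saddr,
 *		.flowi4_oif = oif,
 *		.flowi4_tos = RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * On return fl4 also carries the resolved saddr and oif when the caller left
 * them unset.
 */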
2512
2513struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2514					    struct fib_result *res,
2515					    const struct sk_buff *skb)
2516{
2517	struct net_device *dev_out = NULL;
2518	int orig_oif = fl4->flowi4_oif;
2519	unsigned int flags = 0;
2520	struct rtable *rth;
2521	int err;
2522
2523	if (fl4->saddr) {
2524		if (ipv4_is_multicast(fl4->saddr) ||
2525		    ipv4_is_lbcast(fl4->saddr) ||
2526		    ipv4_is_zeronet(fl4->saddr)) {
2527			rth = ERR_PTR(-EINVAL);
2528			goto out;
2529		}
2530
2531		rth = ERR_PTR(-ENETUNREACH);
2532
2533		/* I removed the check for oif == dev_out->oif here.
2534		   It was wrong for two reasons:
2535		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2536		      is assigned to multiple interfaces.
2537		   2. Moreover, we are allowed to send packets with the saddr
2538		      of another iface. --ANK
2539		 */
2540
2541		if (fl4->flowi4_oif == 0 &&
2542		    (ipv4_is_multicast(fl4->daddr) ||
2543		     ipv4_is_lbcast(fl4->daddr))) {
2544			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2545			dev_out = __ip_dev_find(net, fl4->saddr, false);
2546			if (!dev_out)
2547				goto out;
2548
2549			/* Special hack: the user can direct multicasts
2550			   and limited broadcast via the necessary interface
2551			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2552			   This hack is not just for fun, it allows
2553			   vic, vat and friends to work.
2554			   They bind the socket to loopback, set ttl to zero
2555			   and expect that it will work.
2556			   From the viewpoint of the routing cache they are broken,
2557			   because we are not allowed to build a multicast path
2558			   with a loopback source addr (look, the routing cache
2559			   cannot know that ttl is zero, so the packet
2560			   will not leave this host and the route is valid).
2561			   Luckily, this hack is a good workaround.
2562			 */
2563
2564			fl4->flowi4_oif = dev_out->ifindex;
2565			goto make_route;
2566		}
2567
2568		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2569			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2570			if (!__ip_dev_find(net, fl4->saddr, false))
2571				goto out;
2572		}
2573	}
2574
2575
2576	if (fl4->flowi4_oif) {
2577		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2578		rth = ERR_PTR(-ENODEV);
2579		if (!dev_out)
2580			goto out;
2581
2582		/* RACE: Check return value of inet_select_addr instead. */
2583		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2584			rth = ERR_PTR(-ENETUNREACH);
2585			goto out;
2586		}
2587		if (ipv4_is_local_multicast(fl4->daddr) ||
2588		    ipv4_is_lbcast(fl4->daddr) ||
2589		    fl4->flowi4_proto == IPPROTO_IGMP) {
2590			if (!fl4->saddr)
2591				fl4->saddr = inet_select_addr(dev_out, 0,
2592							      RT_SCOPE_LINK);
2593			goto make_route;
2594		}
2595		if (!fl4->saddr) {
2596			if (ipv4_is_multicast(fl4->daddr))
2597				fl4->saddr = inet_select_addr(dev_out, 0,
2598							      fl4->flowi4_scope);
2599			else if (!fl4->daddr)
2600				fl4->saddr = inet_select_addr(dev_out, 0,
2601							      RT_SCOPE_HOST);
2602		}
2603	}
2604
2605	if (!fl4->daddr) {
2606		fl4->daddr = fl4->saddr;
2607		if (!fl4->daddr)
2608			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2609		dev_out = net->loopback_dev;
2610		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2611		res->type = RTN_LOCAL;
2612		flags |= RTCF_LOCAL;
2613		goto make_route;
2614	}
2615
2616	err = fib_lookup(net, fl4, res, 0);
2617	if (err) {
2618		res->fi = NULL;
2619		res->table = NULL;
2620		if (fl4->flowi4_oif &&
2621		    (ipv4_is_multicast(fl4->daddr) ||
2622		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2623			/* Apparently, the routing tables are wrong. Assume
2624			   that the destination is on-link.
2625
2626			   WHY? DW.
2627			   Because we are allowed to send to an iface
2628			   even if it has NO routes and NO assigned
2629			   addresses. When oif is specified, routing
2630			   tables are looked up with only one purpose:
2631			   to check whether the destination is gatewayed, rather than
2632			   direct. Moreover, if MSG_DONTROUTE is set,
2633			   we send the packet, ignoring both routing tables
2634			   and ifaddr state. --ANK
2635
2636
2637			   We could do this even if oif is unknown
2638			   (IPv6 likely does), but we do not.
2639			 */
2640
2641			if (fl4->saddr == 0)
2642				fl4->saddr = inet_select_addr(dev_out, 0,
2643							      RT_SCOPE_LINK);
2644			res->type = RTN_UNICAST;
2645			goto make_route;
2646		}
2647		rth = ERR_PTR(err);
2648		goto out;
2649	}
2650
2651	if (res->type == RTN_LOCAL) {
2652		if (!fl4->saddr) {
2653			if (res->fi->fib_prefsrc)
2654				fl4->saddr = res->fi->fib_prefsrc;
2655			else
2656				fl4->saddr = fl4->daddr;
2657		}
2658
2659		/* L3 master device is the loopback for that domain */
2660		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2661			net->loopback_dev;
2662
2663		/* make sure orig_oif points to fib result device even
2664		 * though packet rx/tx happens over loopback or l3mdev
2665		 */
2666		orig_oif = FIB_RES_OIF(*res);
2667
2668		fl4->flowi4_oif = dev_out->ifindex;
2669		flags |= RTCF_LOCAL;
2670		goto make_route;
2671	}
2672
2673	fib_select_path(net, res, fl4, skb);
2674
2675	dev_out = FIB_RES_DEV(*res);
2676
2677make_route:
2678	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2679
2680out:
2681	return rth;
2682}
2683
2684static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2685{
2686	return NULL;
2687}
2688
2689static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2690{
2691	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2692
2693	return mtu ? : dst->dev->mtu;
2694}
2695
2696static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2697					  struct sk_buff *skb, u32 mtu,
2698					  bool confirm_neigh)
2699{
2700}
2701
2702static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2703				       struct sk_buff *skb)
2704{
2705}
2706
2707static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2708					  unsigned long old)
2709{
2710	return NULL;
2711}
2712
2713static struct dst_ops ipv4_dst_blackhole_ops = {
2714	.family			=	AF_INET,
2715	.check			=	ipv4_blackhole_dst_check,
2716	.mtu			=	ipv4_blackhole_mtu,
2717	.default_advmss		=	ipv4_default_advmss,
2718	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2719	.redirect		=	ipv4_rt_blackhole_redirect,
2720	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2721	.neigh_lookup		=	ipv4_neigh_lookup,
2722};
2723
2724struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2725{
2726	struct rtable *ort = (struct rtable *) dst_orig;
2727	struct rtable *rt;
2728
2729	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2730	if (rt) {
2731		struct dst_entry *new = &rt->dst;
2732
2733		new->__use = 1;
2734		new->input = dst_discard;
2735		new->output = dst_discard_out;
2736
2737		new->dev = net->loopback_dev;
2738		if (new->dev)
2739			dev_hold(new->dev);
2740
2741		rt->rt_is_input = ort->rt_is_input;
2742		rt->rt_iif = ort->rt_iif;
2743		rt->rt_pmtu = ort->rt_pmtu;
2744		rt->rt_mtu_locked = ort->rt_mtu_locked;
2745
2746		rt->rt_genid = rt_genid_ipv4(net);
2747		rt->rt_flags = ort->rt_flags;
2748		rt->rt_type = ort->rt_type;
2749		rt->rt_uses_gateway = ort->rt_uses_gateway;
2750		rt->rt_gw_family = ort->rt_gw_family;
2751		if (rt->rt_gw_family == AF_INET)
2752			rt->rt_gw4 = ort->rt_gw4;
2753		else if (rt->rt_gw_family == AF_INET6)
2754			rt->rt_gw6 = ort->rt_gw6;
2755
2756		INIT_LIST_HEAD(&rt->rt_uncached);
2757	}
2758
2759	dst_release(dst_orig);
2760
2761	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2762}
2763
2764struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2765				    const struct sock *sk)
2766{
2767	struct rtable *rt = __ip_route_output_key(net, flp4);
2768
2769	if (IS_ERR(rt))
2770		return rt;
2771
2772	if (flp4->flowi4_proto)
2773		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2774							flowi4_to_flowi(flp4),
2775							sk, 0);
2776
2777	return rt;
2778}
2779EXPORT_SYMBOL_GPL(ip_route_output_flow);
2780
2781struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2782				      struct net_device *dev,
2783				      struct net *net, __be32 *saddr,
2784				      const struct ip_tunnel_info *info,
2785				      u8 protocol, bool use_cache)
2786{
2787#ifdef CONFIG_DST_CACHE
2788	struct dst_cache *dst_cache;
2789#endif
2790	struct rtable *rt = NULL;
2791	struct flowi4 fl4;
2792	__u8 tos;
2793
2794#ifdef CONFIG_DST_CACHE
2795	dst_cache = (struct dst_cache *)&info->dst_cache;
2796	if (use_cache) {
2797		rt = dst_cache_get_ip4(dst_cache, saddr);
2798		if (rt)
2799			return rt;
2800	}
2801#endif
2802	memset(&fl4, 0, sizeof(fl4));
2803	fl4.flowi4_mark = skb->mark;
2804	fl4.flowi4_proto = protocol;
2805	fl4.daddr = info->key.u.ipv4.dst;
2806	fl4.saddr = info->key.u.ipv4.src;
2807	tos = info->key.tos;
2808	fl4.flowi4_tos = RT_TOS(tos);
2809
2810	rt = ip_route_output_key(net, &fl4);
2811	if (IS_ERR(rt)) {
2812		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2813		return ERR_PTR(-ENETUNREACH);
2814	}
2815	if (rt->dst.dev == dev) { /* is this necessary? */
2816		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2817		ip_rt_put(rt);
2818		return ERR_PTR(-ELOOP);
2819	}
2820#ifdef CONFIG_DST_CACHE
2821	if (use_cache)
2822		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2823#endif
2824	*saddr = fl4.saddr;
2825	return rt;
2826}
2827EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
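/* Usage sketch (illustrative): a tunnel driver resolving the outer route for
 * an encapsulated skb would call
 *
 *	rt = ip_route_output_tunnel(skb, dev, net, &saddr, info,
 *				    IPPROTO_UDP, use_cache);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * where @info is the skb's ip_tunnel_info and @saddr receives the chosen
 * outer source address.
 */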
2828
2829/* called with rcu_read_lock held */
2830static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2831			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2832			struct sk_buff *skb, u32 portid, u32 seq,
2833			unsigned int flags)
2834{
2835	struct rtmsg *r;
2836	struct nlmsghdr *nlh;
2837	unsigned long expires = 0;
2838	u32 error;
2839	u32 metrics[RTAX_MAX];
2840
2841	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2842	if (!nlh)
2843		return -EMSGSIZE;
2844
2845	r = nlmsg_data(nlh);
2846	r->rtm_family	 = AF_INET;
2847	r->rtm_dst_len	= 32;
2848	r->rtm_src_len	= 0;
2849	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2850	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2851	if (nla_put_u32(skb, RTA_TABLE, table_id))
2852		goto nla_put_failure;
2853	r->rtm_type	= rt->rt_type;
2854	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2855	r->rtm_protocol = RTPROT_UNSPEC;
2856	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2857	if (rt->rt_flags & RTCF_NOTIFY)
2858		r->rtm_flags |= RTM_F_NOTIFY;
2859	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2860		r->rtm_flags |= RTCF_DOREDIRECT;
2861
2862	if (nla_put_in_addr(skb, RTA_DST, dst))
2863		goto nla_put_failure;
2864	if (src) {
2865		r->rtm_src_len = 32;
2866		if (nla_put_in_addr(skb, RTA_SRC, src))
2867			goto nla_put_failure;
2868	}
2869	if (rt->dst.dev &&
2870	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2871		goto nla_put_failure;
2872#ifdef CONFIG_IP_ROUTE_CLASSID
2873	if (rt->dst.tclassid &&
2874	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2875		goto nla_put_failure;
2876#endif
2877	if (fl4 && !rt_is_input_route(rt) &&
2878	    fl4->saddr != src) {
2879		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2880			goto nla_put_failure;
2881	}
2882	if (rt->rt_uses_gateway) {
2883		if (rt->rt_gw_family == AF_INET &&
2884		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2885			goto nla_put_failure;
2886		} else if (rt->rt_gw_family == AF_INET6) {
2887			int alen = sizeof(struct in6_addr);
2888			struct nlattr *nla;
2889			struct rtvia *via;
2890
2891			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2892			if (!nla)
2893				goto nla_put_failure;
2894
2895			via = nla_data(nla);
2896			via->rtvia_family = AF_INET6;
2897			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2898		}
2899	}
2900
2901	expires = rt->dst.expires;
2902	if (expires) {
2903		unsigned long now = jiffies;
2904
2905		if (time_before(now, expires))
2906			expires -= now;
2907		else
2908			expires = 0;
2909	}
2910
2911	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2912	if (rt->rt_pmtu && expires)
2913		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2914	if (rt->rt_mtu_locked && expires)
2915		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2916	if (rtnetlink_put_metrics(skb, metrics) < 0)
2917		goto nla_put_failure;
2918
2919	if (fl4) {
2920		if (fl4->flowi4_mark &&
2921		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2922			goto nla_put_failure;
2923
2924		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2925		    nla_put_u32(skb, RTA_UID,
2926				from_kuid_munged(current_user_ns(),
2927						 fl4->flowi4_uid)))
2928			goto nla_put_failure;
2929
2930		if (rt_is_input_route(rt)) {
2931#ifdef CONFIG_IP_MROUTE
2932			if (ipv4_is_multicast(dst) &&
2933			    !ipv4_is_local_multicast(dst) &&
2934			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2935				int err = ipmr_get_route(net, skb,
2936							 fl4->saddr, fl4->daddr,
2937							 r, portid);
2938
2939				if (err <= 0) {
2940					if (err == 0)
2941						return 0;
2942					goto nla_put_failure;
2943				}
2944			} else
2945#endif
2946				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2947					goto nla_put_failure;
2948		}
2949	}
2950
2951	error = rt->dst.error;
2952
2953	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2954		goto nla_put_failure;
2955
2956	nlmsg_end(skb, nlh);
2957	return 0;
2958
2959nla_put_failure:
2960	nlmsg_cancel(skb, nlh);
2961	return -EMSGSIZE;
2962}
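/* Output example (illustrative): the RTM_NEWROUTE message assembled here is
 * what "ip route get" renders, e.g. roughly
 *
 *	203.0.113.7 via 192.0.2.1 dev eth0 src 192.0.2.10 uid 1000
 *	    cache
 *
 * where RTA_GATEWAY, RTA_OIF, RTA_PREFSRC and RTA_UID map to the "via",
 * "dev", "src" and "uid" fields respectively.
 */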
2963
2964static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2965			    struct netlink_callback *cb, u32 table_id,
2966			    struct fnhe_hash_bucket *bucket, int genid,
2967			    int *fa_index, int fa_start, unsigned int flags)
2968{
2969	int i;
2970
2971	for (i = 0; i < FNHE_HASH_SIZE; i++) {
2972		struct fib_nh_exception *fnhe;
2973
2974		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2975		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
2976			struct rtable *rt;
2977			int err;
2978
2979			if (*fa_index < fa_start)
2980				goto next;
2981
2982			if (fnhe->fnhe_genid != genid)
2983				goto next;
2984
2985			if (fnhe->fnhe_expires &&
2986			    time_after(jiffies, fnhe->fnhe_expires))
2987				goto next;
2988
2989			rt = rcu_dereference(fnhe->fnhe_rth_input);
2990			if (!rt)
2991				rt = rcu_dereference(fnhe->fnhe_rth_output);
2992			if (!rt)
2993				goto next;
2994
2995			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2996					   table_id, NULL, skb,
2997					   NETLINK_CB(cb->skb).portid,
2998					   cb->nlh->nlmsg_seq, flags);
2999			if (err)
3000				return err;
3001next:
3002			(*fa_index)++;
3003		}
3004	}
3005
3006	return 0;
3007}
3008
3009int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3010		       u32 table_id, struct fib_info *fi,
3011		       int *fa_index, int fa_start, unsigned int flags)
3012{
3013	struct net *net = sock_net(cb->skb->sk);
3014	int nhsel, genid = fnhe_genid(net);
3015
3016	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3017		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3018		struct fnhe_hash_bucket *bucket;
3019		int err;
3020
3021		if (nhc->nhc_flags & RTNH_F_DEAD)
3022			continue;
3023
3024		rcu_read_lock();
3025		bucket = rcu_dereference(nhc->nhc_exceptions);
3026		err = 0;
3027		if (bucket)
3028			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3029					       genid, fa_index, fa_start,
3030					       flags);
3031		rcu_read_unlock();
3032		if (err)
3033			return err;
3034	}
3035
3036	return 0;
3037}
3038
3039static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3040						   u8 ip_proto, __be16 sport,
3041						   __be16 dport)
3042{
3043	struct sk_buff *skb;
3044	struct iphdr *iph;
3045
3046	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3047	if (!skb)
3048		return NULL;
3049
3050	/* Reserve room for dummy headers; this skb can pass
3051	 * through a good chunk of the routing engine.
3052	 */
3053	skb_reset_mac_header(skb);
3054	skb_reset_network_header(skb);
3055	skb->protocol = htons(ETH_P_IP);
3056	iph = skb_put(skb, sizeof(struct iphdr));
3057	iph->protocol = ip_proto;
3058	iph->saddr = src;
3059	iph->daddr = dst;
3060	iph->version = 0x4;
3061	iph->frag_off = 0;
3062	iph->ihl = 0x5;
3063	skb_set_transport_header(skb, skb->len);
3064
3065	switch (iph->protocol) {
3066	case IPPROTO_UDP: {
3067		struct udphdr *udph;
3068
3069		udph = skb_put_zero(skb, sizeof(struct udphdr));
3070		udph->source = sport;
3071		udph->dest = dport;
3072		udph->len = sizeof(struct udphdr);
3073		udph->check = 0;
3074		break;
3075	}
3076	case IPPROTO_TCP: {
3077		struct tcphdr *tcph;
3078
3079		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3080		tcph->source	= sport;
3081		tcph->dest	= dport;
3082		tcph->doff	= sizeof(struct tcphdr) / 4;
3083		tcph->rst = 1;
3084		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3085					    src, dst, 0);
3086		break;
3087	}
3088	case IPPROTO_ICMP: {
3089		struct icmphdr *icmph;
3090
3091		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3092		icmph->type = ICMP_ECHO;
3093		icmph->code = 0;
3094	}
3095	}
3096
3097	return skb;
3098}
3099
3100static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3101				       const struct nlmsghdr *nlh,
3102				       struct nlattr **tb,
3103				       struct netlink_ext_ack *extack)
3104{
3105	struct rtmsg *rtm;
3106	int i, err;
3107
3108	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3109		NL_SET_ERR_MSG(extack,
3110			       "ipv4: Invalid header for route get request");
3111		return -EINVAL;
3112	}
3113
3114	if (!netlink_strict_get_check(skb))
3115		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3116					      rtm_ipv4_policy, extack);
3117
3118	rtm = nlmsg_data(nlh);
3119	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3120	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3121	    rtm->rtm_table || rtm->rtm_protocol ||
3122	    rtm->rtm_scope || rtm->rtm_type) {
3123		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3124		return -EINVAL;
3125	}
3126
3127	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3128			       RTM_F_LOOKUP_TABLE |
3129			       RTM_F_FIB_MATCH)) {
3130		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3131		return -EINVAL;
3132	}
3133
3134	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3135					    rtm_ipv4_policy, extack);
3136	if (err)
3137		return err;
3138
3139	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3140	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3141		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3142		return -EINVAL;
3143	}
3144
3145	for (i = 0; i <= RTA_MAX; i++) {
3146		if (!tb[i])
3147			continue;
3148
3149		switch (i) {
3150		case RTA_IIF:
3151		case RTA_OIF:
3152		case RTA_SRC:
3153		case RTA_DST:
3154		case RTA_IP_PROTO:
3155		case RTA_SPORT:
3156		case RTA_DPORT:
3157		case RTA_MARK:
3158		case RTA_UID:
3159			break;
3160		default:
3161			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3162			return -EINVAL;
3163		}
3164	}
3165
3166	return 0;
3167}
3168
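/* Request example (illustrative): a userspace query such as
 *
 *	ip route get 203.0.113.7 from 198.51.100.1 iif eth0 mark 7
 *
 * arrives as an RTM_GETROUTE message that is validated above, carrying
 * RTA_DST = 203.0.113.7, RTA_SRC = 198.51.100.1, RTA_IIF = <ifindex of eth0>
 * and RTA_MARK = 7, with rtm_dst_len and rtm_src_len set to 32.
 */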
3169static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3170			     struct netlink_ext_ack *extack)
3171{
3172	struct net *net = sock_net(in_skb->sk);
3173	struct nlattr *tb[RTA_MAX+1];
3174	u32 table_id = RT_TABLE_MAIN;
3175	__be16 sport = 0, dport = 0;
3176	struct fib_result res = {};
3177	u8 ip_proto = IPPROTO_UDP;
3178	struct rtable *rt = NULL;
3179	struct sk_buff *skb;
3180	struct rtmsg *rtm;
3181	struct flowi4 fl4 = {};
3182	__be32 dst = 0;
3183	__be32 src = 0;
3184	kuid_t uid;
3185	u32 iif;
3186	int err;
3187	int mark;
3188
3189	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3190	if (err < 0)
3191		return err;
3192
3193	rtm = nlmsg_data(nlh);
3194	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3195	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3196	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3197	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3198	if (tb[RTA_UID])
3199		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3200	else
3201		uid = (iif ? INVALID_UID : current_uid());
3202
3203	if (tb[RTA_IP_PROTO]) {
3204		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3205						  &ip_proto, AF_INET, extack);
3206		if (err)
3207			return err;
3208	}
3209
3210	if (tb[RTA_SPORT])
3211		sport = nla_get_be16(tb[RTA_SPORT]);
3212
3213	if (tb[RTA_DPORT])
3214		dport = nla_get_be16(tb[RTA_DPORT]);
3215
3216	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3217	if (!skb)
3218		return -ENOBUFS;
3219
3220	fl4.daddr = dst;
3221	fl4.saddr = src;
3222	fl4.flowi4_tos = rtm->rtm_tos;
3223	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3224	fl4.flowi4_mark = mark;
3225	fl4.flowi4_uid = uid;
3226	if (sport)
3227		fl4.fl4_sport = sport;
3228	if (dport)
3229		fl4.fl4_dport = dport;
3230	fl4.flowi4_proto = ip_proto;
3231
3232	rcu_read_lock();
3233
3234	if (iif) {
3235		struct net_device *dev;
3236
3237		dev = dev_get_by_index_rcu(net, iif);
3238		if (!dev) {
3239			err = -ENODEV;
3240			goto errout_rcu;
3241		}
3242
3243		fl4.flowi4_iif = iif; /* for rt_fill_info */
3244		skb->dev	= dev;
3245		skb->mark	= mark;
3246		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3247					 dev, &res);
3248
3249		rt = skb_rtable(skb);
3250		if (err == 0 && rt->dst.error)
3251			err = -rt->dst.error;
3252	} else {
3253		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3254		skb->dev = net->loopback_dev;
3255		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3256		err = 0;
3257		if (IS_ERR(rt))
3258			err = PTR_ERR(rt);
3259		else
3260			skb_dst_set(skb, &rt->dst);
3261	}
3262
3263	if (err)
3264		goto errout_rcu;
3265
3266	if (rtm->rtm_flags & RTM_F_NOTIFY)
3267		rt->rt_flags |= RTCF_NOTIFY;
3268
3269	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3270		table_id = res.table ? res.table->tb_id : 0;
3271
3272	/* reset skb for netlink reply msg */
3273	skb_trim(skb, 0);
3274	skb_reset_network_header(skb);
3275	skb_reset_transport_header(skb);
3276	skb_reset_mac_header(skb);
3277
3278	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3279		struct fib_rt_info fri;
3280
3281		if (!res.fi) {
3282			err = fib_props[res.type].error;
3283			if (!err)
3284				err = -EHOSTUNREACH;
3285			goto errout_rcu;
3286		}
3287		fri.fi = res.fi;
3288		fri.tb_id = table_id;
3289		fri.dst = res.prefix;
3290		fri.dst_len = res.prefixlen;
3291		fri.tos = fl4.flowi4_tos;
3292		fri.type = rt->rt_type;
3293		fri.offload = 0;
3294		fri.trap = 0;
3295		if (res.fa_head) {
3296			struct fib_alias *fa;
3297
3298			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3299				u8 slen = 32 - fri.dst_len;
3300
3301				if (fa->fa_slen == slen &&
3302				    fa->tb_id == fri.tb_id &&
3303				    fa->fa_tos == fri.tos &&
3304				    fa->fa_info == res.fi &&
3305				    fa->fa_type == fri.type) {
3306					fri.offload = fa->offload;
3307					fri.trap = fa->trap;
3308					break;
3309				}
3310			}
3311		}
3312		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3313				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3314	} else {
3315		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3316				   NETLINK_CB(in_skb).portid,
3317				   nlh->nlmsg_seq, 0);
3318	}
3319	if (err < 0)
3320		goto errout_rcu;
3321
3322	rcu_read_unlock();
3323
3324	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3325
3326errout_free:
3327	return err;
3328errout_rcu:
3329	rcu_read_unlock();
3330	kfree_skb(skb);
3331	goto errout_free;
3332}
3333
3334void ip_rt_multicast_event(struct in_device *in_dev)
3335{
3336	rt_cache_flush(dev_net(in_dev->dev));
3337}
3338
3339#ifdef CONFIG_SYSCTL
3340static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3341static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3342static int ip_rt_gc_elasticity __read_mostly	= 8;
3343static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3344
3345static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3346		void *buffer, size_t *lenp, loff_t *ppos)
3347{
3348	struct net *net = (struct net *)__ctl->extra1;
3349
3350	if (write) {
3351		rt_cache_flush(net);
3352		fnhe_genid_bump(net);
3353		return 0;
3354	}
3355
3356	return -EINVAL;
3357}
3358
3359static struct ctl_table ipv4_route_table[] = {
3360	{
3361		.procname	= "gc_thresh",
3362		.data		= &ipv4_dst_ops.gc_thresh,
3363		.maxlen		= sizeof(int),
3364		.mode		= 0644,
3365		.proc_handler	= proc_dointvec,
3366	},
3367	{
3368		.procname	= "max_size",
3369		.data		= &ip_rt_max_size,
3370		.maxlen		= sizeof(int),
3371		.mode		= 0644,
3372		.proc_handler	= proc_dointvec,
3373	},
3374	{
3375		/*  Deprecated. Use gc_min_interval_ms */
3376
3377		.procname	= "gc_min_interval",
3378		.data		= &ip_rt_gc_min_interval,
3379		.maxlen		= sizeof(int),
3380		.mode		= 0644,
3381		.proc_handler	= proc_dointvec_jiffies,
3382	},
3383	{
3384		.procname	= "gc_min_interval_ms",
3385		.data		= &ip_rt_gc_min_interval,
3386		.maxlen		= sizeof(int),
3387		.mode		= 0644,
3388		.proc_handler	= proc_dointvec_ms_jiffies,
3389	},
3390	{
3391		.procname	= "gc_timeout",
3392		.data		= &ip_rt_gc_timeout,
3393		.maxlen		= sizeof(int),
3394		.mode		= 0644,
3395		.proc_handler	= proc_dointvec_jiffies,
3396	},
3397	{
3398		.procname	= "gc_interval",
3399		.data		= &ip_rt_gc_interval,
3400		.maxlen		= sizeof(int),
3401		.mode		= 0644,
3402		.proc_handler	= proc_dointvec_jiffies,
3403	},
3404	{
3405		.procname	= "redirect_load",
3406		.data		= &ip_rt_redirect_load,
3407		.maxlen		= sizeof(int),
3408		.mode		= 0644,
3409		.proc_handler	= proc_dointvec,
3410	},
3411	{
3412		.procname	= "redirect_number",
3413		.data		= &ip_rt_redirect_number,
3414		.maxlen		= sizeof(int),
3415		.mode		= 0644,
3416		.proc_handler	= proc_dointvec,
3417	},
3418	{
3419		.procname	= "redirect_silence",
3420		.data		= &ip_rt_redirect_silence,
3421		.maxlen		= sizeof(int),
3422		.mode		= 0644,
3423		.proc_handler	= proc_dointvec,
3424	},
3425	{
3426		.procname	= "error_cost",
3427		.data		= &ip_rt_error_cost,
3428		.maxlen		= sizeof(int),
3429		.mode		= 0644,
3430		.proc_handler	= proc_dointvec,
3431	},
3432	{
3433		.procname	= "error_burst",
3434		.data		= &ip_rt_error_burst,
3435		.maxlen		= sizeof(int),
3436		.mode		= 0644,
3437		.proc_handler	= proc_dointvec,
3438	},
3439	{
3440		.procname	= "gc_elasticity",
3441		.data		= &ip_rt_gc_elasticity,
3442		.maxlen		= sizeof(int),
3443		.mode		= 0644,
3444		.proc_handler	= proc_dointvec,
3445	},
3446	{
3447		.procname	= "mtu_expires",
3448		.data		= &ip_rt_mtu_expires,
3449		.maxlen		= sizeof(int),
3450		.mode		= 0644,
3451		.proc_handler	= proc_dointvec_jiffies,
3452	},
3453	{
3454		.procname	= "min_pmtu",
3455		.data		= &ip_rt_min_pmtu,
3456		.maxlen		= sizeof(int),
3457		.mode		= 0644,
3458		.proc_handler	= proc_dointvec_minmax,
3459		.extra1		= &ip_min_valid_pmtu,
3460	},
3461	{
3462		.procname	= "min_adv_mss",
3463		.data		= &ip_rt_min_advmss,
3464		.maxlen		= sizeof(int),
3465		.mode		= 0644,
3466		.proc_handler	= proc_dointvec,
3467	},
3468	{ }
3469};
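/* Tuning example (illustrative): the table above is registered under
 * /proc/sys/net/ipv4/route/, so for instance
 *
 *	sysctl -w net.ipv4.route.min_pmtu=552
 *	sysctl -w net.ipv4.route.mtu_expires=300
 *
 * adjust ip_rt_min_pmtu and ip_rt_mtu_expires at run time.
 */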
3470
3471static const char ipv4_route_flush_procname[] = "flush";
3472
3473static struct ctl_table ipv4_route_flush_table[] = {
3474	{
3475		.procname	= ipv4_route_flush_procname,
3476		.maxlen		= sizeof(int),
3477		.mode		= 0200,
3478		.proc_handler	= ipv4_sysctl_rtcache_flush,
3479	},
3480	{ },
3481};
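/* Flush example (illustrative): the write-only "flush" entry above means that
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * invokes ipv4_sysctl_rtcache_flush() for the writer's netns, flushing the
 * routing cache and bumping the fnhe genid.
 */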
3482
3483static __net_init int sysctl_route_net_init(struct net *net)
3484{
3485	struct ctl_table *tbl;
3486
3487	tbl = ipv4_route_flush_table;
3488	if (!net_eq(net, &init_net)) {
3489		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3490		if (!tbl)
3491			goto err_dup;
3492
3493		/* Don't export non-whitelisted sysctls to unprivileged users */
3494		if (net->user_ns != &init_user_ns) {
3495			if (tbl[0].procname != ipv4_route_flush_procname)
3496				tbl[0].procname = NULL;
3497		}
3498	}
3499	tbl[0].extra1 = net;
3500
3501	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3502	if (!net->ipv4.route_hdr)
3503		goto err_reg;
3504	return 0;
3505
3506err_reg:
3507	if (tbl != ipv4_route_flush_table)
3508		kfree(tbl);
3509err_dup:
3510	return -ENOMEM;
3511}
3512
3513static __net_exit void sysctl_route_net_exit(struct net *net)
3514{
3515	struct ctl_table *tbl;
3516
3517	tbl = net->ipv4.route_hdr->ctl_table_arg;
3518	unregister_net_sysctl_table(net->ipv4.route_hdr);
3519	BUG_ON(tbl == ipv4_route_flush_table);
3520	kfree(tbl);
3521}
3522
3523static __net_initdata struct pernet_operations sysctl_route_ops = {
3524	.init = sysctl_route_net_init,
3525	.exit = sysctl_route_net_exit,
3526};
3527#endif
3528
3529static __net_init int rt_genid_init(struct net *net)
3530{
3531	atomic_set(&net->ipv4.rt_genid, 0);
3532	atomic_set(&net->fnhe_genid, 0);
3533	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3534	return 0;
3535}
3536
3537static __net_initdata struct pernet_operations rt_genid_ops = {
3538	.init = rt_genid_init,
3539};
3540
3541static int __net_init ipv4_inetpeer_init(struct net *net)
3542{
3543	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3544
3545	if (!bp)
3546		return -ENOMEM;
3547	inet_peer_base_init(bp);
3548	net->ipv4.peers = bp;
3549	return 0;
3550}
3551
3552static void __net_exit ipv4_inetpeer_exit(struct net *net)
3553{
3554	struct inet_peer_base *bp = net->ipv4.peers;
3555
3556	net->ipv4.peers = NULL;
3557	inetpeer_invalidate_tree(bp);
3558	kfree(bp);
3559}
3560
3561static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3562	.init	=	ipv4_inetpeer_init,
3563	.exit	=	ipv4_inetpeer_exit,
3564};
3565
3566#ifdef CONFIG_IP_ROUTE_CLASSID
3567struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3568#endif /* CONFIG_IP_ROUTE_CLASSID */
3569
3570int __init ip_rt_init(void)
3571{
3572	int cpu;
3573
3574	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3575				  GFP_KERNEL);
3576	if (!ip_idents)
3577		panic("IP: failed to allocate ip_idents\n");
3578
3579	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3580
3581	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3582	if (!ip_tstamps)
3583		panic("IP: failed to allocate ip_tstamps\n");
3584
3585	for_each_possible_cpu(cpu) {
3586		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3587
3588		INIT_LIST_HEAD(&ul->head);
3589		spin_lock_init(&ul->lock);
3590	}
3591#ifdef CONFIG_IP_ROUTE_CLASSID
3592	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3593	if (!ip_rt_acct)
3594		panic("IP: failed to allocate ip_rt_acct\n");
3595#endif
3596
3597	ipv4_dst_ops.kmem_cachep =
3598		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3599				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3600
3601	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3602
3603	if (dst_entries_init(&ipv4_dst_ops) < 0)
3604		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3605
3606	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3607		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3608
3609	ipv4_dst_ops.gc_thresh = ~0;
3610	ip_rt_max_size = INT_MAX;
3611
3612	devinet_init();
3613	ip_fib_init();
3614
3615	if (ip_rt_proc_init())
3616		pr_err("Unable to create route proc files\n");
3617#ifdef CONFIG_XFRM
3618	xfrm_init();
3619	xfrm4_init();
3620#endif
3621	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3622		      RTNL_FLAG_DOIT_UNLOCKED);
3623
3624#ifdef CONFIG_SYSCTL
3625	register_pernet_subsys(&sysctl_route_ops);
3626#endif
3627	register_pernet_subsys(&rt_genid_ops);
3628	register_pernet_subsys(&ipv4_inetpeer_ops);
3629	return 0;
3630}
3631
3632#ifdef CONFIG_SYSCTL
3633/*
3634 * We really need to sanitize the damn ipv4 init order, then all
3635 * this nonsense will go away.
3636 */
3637void __init ip_static_sysctl_init(void)
3638{
3639	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3640}
3641#endif
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
 
  64#include <linux/bitops.h>
 
  65#include <linux/kernel.h>
  66#include <linux/mm.h>
  67#include <linux/memblock.h>
  68#include <linux/socket.h>
 
  69#include <linux/errno.h>
  70#include <linux/in.h>
  71#include <linux/inet.h>
  72#include <linux/netdevice.h>
  73#include <linux/proc_fs.h>
  74#include <linux/init.h>
  75#include <linux/skbuff.h>
  76#include <linux/inetdevice.h>
  77#include <linux/igmp.h>
  78#include <linux/pkt_sched.h>
  79#include <linux/mroute.h>
  80#include <linux/netfilter_ipv4.h>
  81#include <linux/random.h>
  82#include <linux/rcupdate.h>
 
  83#include <linux/slab.h>
  84#include <linux/jhash.h>
  85#include <net/dst.h>
  86#include <net/dst_metadata.h>
  87#include <net/inet_dscp.h>
  88#include <net/net_namespace.h>
 
  89#include <net/ip.h>
  90#include <net/route.h>
  91#include <net/inetpeer.h>
  92#include <net/sock.h>
  93#include <net/ip_fib.h>
  94#include <net/nexthop.h>
 
  95#include <net/tcp.h>
  96#include <net/icmp.h>
  97#include <net/xfrm.h>
  98#include <net/lwtunnel.h>
  99#include <net/netevent.h>
 100#include <net/rtnetlink.h>
 101#ifdef CONFIG_SYSCTL
 102#include <linux/sysctl.h>
 103#endif
 104#include <net/secure_seq.h>
 105#include <net/ip_tunnels.h>
 
 106
 107#include "fib_lookup.h"
 108
 
 
 
 109#define RT_GC_TIMEOUT (300*HZ)
 110
 111#define DEFAULT_MIN_PMTU (512 + 20 + 20)
 112#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
 113#define DEFAULT_MIN_ADVMSS 256
 114static int ip_rt_max_size;
 115static int ip_rt_redirect_number __read_mostly	= 9;
 116static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 117static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 118static int ip_rt_error_cost __read_mostly	= HZ;
 119static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 
 
 
 120
 121static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 122
 123/*
 124 *	Interface to generic destination cache.
 125 */
 126
 127INDIRECT_CALLABLE_SCOPE
 128struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 129static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 130INDIRECT_CALLABLE_SCOPE
 131unsigned int		ipv4_mtu(const struct dst_entry *dst);
 132static void		ipv4_negative_advice(struct sock *sk,
 133					     struct dst_entry *dst);
 134static void		 ipv4_link_failure(struct sk_buff *skb);
 135static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 136					   struct sk_buff *skb, u32 mtu,
 137					   bool confirm_neigh);
 138static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 139					struct sk_buff *skb);
 140static void		ipv4_dst_destroy(struct dst_entry *dst);
 141
 142static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 143{
 144	WARN_ON(1);
 145	return NULL;
 146}
 147
 148static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 149					   struct sk_buff *skb,
 150					   const void *daddr);
 151static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 152
 153static struct dst_ops ipv4_dst_ops = {
 154	.family =		AF_INET,
 155	.check =		ipv4_dst_check,
 156	.default_advmss =	ipv4_default_advmss,
 157	.mtu =			ipv4_mtu,
 158	.cow_metrics =		ipv4_cow_metrics,
 159	.destroy =		ipv4_dst_destroy,
 160	.negative_advice =	ipv4_negative_advice,
 161	.link_failure =		ipv4_link_failure,
 162	.update_pmtu =		ip_rt_update_pmtu,
 163	.redirect =		ip_do_redirect,
 164	.local_out =		__ip_local_out,
 165	.neigh_lookup =		ipv4_neigh_lookup,
 166	.confirm_neigh =	ipv4_confirm_neigh,
 167};
 168
 169#define ECN_OR_COST(class)	TC_PRIO_##class
 170
 171const __u8 ip_tos2prio[16] = {
 172	TC_PRIO_BESTEFFORT,
 173	ECN_OR_COST(BESTEFFORT),
 174	TC_PRIO_BESTEFFORT,
 175	ECN_OR_COST(BESTEFFORT),
 176	TC_PRIO_BULK,
 177	ECN_OR_COST(BULK),
 178	TC_PRIO_BULK,
 179	ECN_OR_COST(BULK),
 180	TC_PRIO_INTERACTIVE,
 181	ECN_OR_COST(INTERACTIVE),
 182	TC_PRIO_INTERACTIVE,
 183	ECN_OR_COST(INTERACTIVE),
 184	TC_PRIO_INTERACTIVE_BULK,
 185	ECN_OR_COST(INTERACTIVE_BULK),
 186	TC_PRIO_INTERACTIVE_BULK,
 187	ECN_OR_COST(INTERACTIVE_BULK)
 188};
 189EXPORT_SYMBOL(ip_tos2prio);
 190
 191static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 192#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 193
 194#ifdef CONFIG_PROC_FS
 195static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 196{
 197	if (*pos)
 198		return NULL;
 199	return SEQ_START_TOKEN;
 200}
 201
 202static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 203{
 204	++*pos;
 205	return NULL;
 206}
 207
 208static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 209{
 210}
 211
 212static int rt_cache_seq_show(struct seq_file *seq, void *v)
 213{
 214	if (v == SEQ_START_TOKEN)
 215		seq_printf(seq, "%-127s\n",
 216			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 217			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 218			   "HHUptod\tSpecDst");
 219	return 0;
 220}
 221
 222static const struct seq_operations rt_cache_seq_ops = {
 223	.start  = rt_cache_seq_start,
 224	.next   = rt_cache_seq_next,
 225	.stop   = rt_cache_seq_stop,
 226	.show   = rt_cache_seq_show,
 227};
 228
 229static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 230{
 231	int cpu;
 232
 233	if (*pos == 0)
 234		return SEQ_START_TOKEN;
 235
 236	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 237		if (!cpu_possible(cpu))
 238			continue;
 239		*pos = cpu+1;
 240		return &per_cpu(rt_cache_stat, cpu);
 241	}
 242	return NULL;
 243}
 244
 245static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 246{
 247	int cpu;
 248
 249	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 250		if (!cpu_possible(cpu))
 251			continue;
 252		*pos = cpu+1;
 253		return &per_cpu(rt_cache_stat, cpu);
 254	}
 255	(*pos)++;
 256	return NULL;
 257
 258}
 259
 260static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 261{
 262
 263}
 264
 265static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 266{
 267	struct rt_cache_stat *st = v;
 268
 269	if (v == SEQ_START_TOKEN) {
 270		seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 271		return 0;
 272	}
 273
 274	seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
 275			"%08x       %08x %08x     %08x    %08x %08x   "
 276			"%08x     %08x        %08x        %08x\n",
 277		   dst_entries_get_slow(&ipv4_dst_ops),
 278		   0, /* st->in_hit */
 279		   st->in_slow_tot,
 280		   st->in_slow_mc,
 281		   st->in_no_route,
 282		   st->in_brd,
 283		   st->in_martian_dst,
 284		   st->in_martian_src,
 285
 286		   0, /* st->out_hit */
 287		   st->out_slow_tot,
 288		   st->out_slow_mc,
 289
 290		   0, /* st->gc_total */
 291		   0, /* st->gc_ignored */
 292		   0, /* st->gc_goal_miss */
 293		   0, /* st->gc_dst_overflow */
 294		   0, /* st->in_hlist_search */
 295		   0  /* st->out_hlist_search */
 296		);
 297	return 0;
 298}
 299
 300static const struct seq_operations rt_cpu_seq_ops = {
 301	.start  = rt_cpu_seq_start,
 302	.next   = rt_cpu_seq_next,
 303	.stop   = rt_cpu_seq_stop,
 304	.show   = rt_cpu_seq_show,
 305};
 306
 307#ifdef CONFIG_IP_ROUTE_CLASSID
 308static int rt_acct_proc_show(struct seq_file *m, void *v)
 309{
 310	struct ip_rt_acct *dst, *src;
 311	unsigned int i, j;
 312
 313	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 314	if (!dst)
 315		return -ENOMEM;
 316
 317	for_each_possible_cpu(i) {
 318		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 319		for (j = 0; j < 256; j++) {
 320			dst[j].o_bytes   += src[j].o_bytes;
 321			dst[j].o_packets += src[j].o_packets;
 322			dst[j].i_bytes   += src[j].i_bytes;
 323			dst[j].i_packets += src[j].i_packets;
 324		}
 325	}
 326
 327	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 328	kfree(dst);
 329	return 0;
 330}
 331#endif
 332
 333static int __net_init ip_rt_do_proc_init(struct net *net)
 334{
 335	struct proc_dir_entry *pde;
 336
 337	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
 338			      &rt_cache_seq_ops);
 339	if (!pde)
 340		goto err1;
 341
 342	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
 343			      &rt_cpu_seq_ops);
 344	if (!pde)
 345		goto err2;
 346
 347#ifdef CONFIG_IP_ROUTE_CLASSID
 348	pde = proc_create_single("rt_acct", 0, net->proc_net,
 349			rt_acct_proc_show);
 350	if (!pde)
 351		goto err3;
 352#endif
 353	return 0;
 354
 355#ifdef CONFIG_IP_ROUTE_CLASSID
 356err3:
 357	remove_proc_entry("rt_cache", net->proc_net_stat);
 358#endif
 359err2:
 360	remove_proc_entry("rt_cache", net->proc_net);
 361err1:
 362	return -ENOMEM;
 363}
 364
 365static void __net_exit ip_rt_do_proc_exit(struct net *net)
 366{
 367	remove_proc_entry("rt_cache", net->proc_net_stat);
 368	remove_proc_entry("rt_cache", net->proc_net);
 369#ifdef CONFIG_IP_ROUTE_CLASSID
 370	remove_proc_entry("rt_acct", net->proc_net);
 371#endif
 372}
 373
 374static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 375	.init = ip_rt_do_proc_init,
 376	.exit = ip_rt_do_proc_exit,
 377};
 378
 379static int __init ip_rt_proc_init(void)
 380{
 381	return register_pernet_subsys(&ip_rt_proc_ops);
 382}
 383
 384#else
 385static inline int ip_rt_proc_init(void)
 386{
 387	return 0;
 388}
 389#endif /* CONFIG_PROC_FS */
 390
 391static inline bool rt_is_expired(const struct rtable *rth)
 392{
 393	bool res;
 394
 395	rcu_read_lock();
 396	res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
 397	rcu_read_unlock();
 398
 399	return res;
 400}
 401
 402void rt_cache_flush(struct net *net)
 403{
 404	rt_genid_bump_ipv4(net);
 405}
 406
 407static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 408					   struct sk_buff *skb,
 409					   const void *daddr)
 410{
 411	const struct rtable *rt = container_of(dst, struct rtable, dst);
 412	struct net_device *dev = dst->dev;
 413	struct neighbour *n;
 414
 415	rcu_read_lock();
 416
 417	if (likely(rt->rt_gw_family == AF_INET)) {
 418		n = ip_neigh_gw4(dev, rt->rt_gw4);
 419	} else if (rt->rt_gw_family == AF_INET6) {
 420		n = ip_neigh_gw6(dev, &rt->rt_gw6);
  421	} else {
 422		__be32 pkey;
 423
 424		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 425		n = ip_neigh_gw4(dev, pkey);
 426	}
 427
 428	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 429		n = NULL;
 430
 431	rcu_read_unlock();
 432
 433	return n;
 434}
 435
 436static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 437{
 438	const struct rtable *rt = container_of(dst, struct rtable, dst);
 439	struct net_device *dev = dst->dev;
 440	const __be32 *pkey = daddr;
 441
 442	if (rt->rt_gw_family == AF_INET) {
 443		pkey = (const __be32 *)&rt->rt_gw4;
 444	} else if (rt->rt_gw_family == AF_INET6) {
 445		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 446	} else if (!daddr ||
 447		 (rt->rt_flags &
 448		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 449		return;
 450	}
 451	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 452}
 453
 454/* Hash tables of size 2048..262144 depending on RAM size.
 455 * Each bucket uses 8 bytes.
 456 */
 457static u32 ip_idents_mask __read_mostly;
 458static atomic_t *ip_idents __read_mostly;
 459static u32 *ip_tstamps __read_mostly;
 460
 461/* In order to protect privacy, we add a perturbation to identifiers
  462 * if one generator is seldom used. This makes it hard for an attacker
 463 * to infer how many packets were sent between two points in time.
 464 */
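     /* Concretely, in ip_idents_reserve() below: ip_tstamps[] remembers
      * the jiffies value of each bucket's last use.  If a bucket was
      * idle for "now - old" jiffies, a random delta below that gap is
      * folded into its counter before new IDs are handed out, so the ID
      * jump an observer sees across the idle period is only an upper
      * bound on the number of packets actually sent.
      */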
 465static u32 ip_idents_reserve(u32 hash, int segs)
 466{
 467	u32 bucket, old, now = (u32)jiffies;
 468	atomic_t *p_id;
 469	u32 *p_tstamp;
 470	u32 delta = 0;
 471
 472	bucket = hash & ip_idents_mask;
 473	p_tstamp = ip_tstamps + bucket;
 474	p_id = ip_idents + bucket;
 475	old = READ_ONCE(*p_tstamp);
 476
 477	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 478		delta = get_random_u32_below(now - old);
 479
 480	/* If UBSAN reports an error there, please make sure your compiler
  481	 * supports -fno-strict-overflow before reporting it; that was a bug
 482	 * in UBSAN, and it has been fixed in GCC-8.
 483	 */
 484	return atomic_add_return(segs + delta, p_id) - segs;
 485}
 486
 487void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 488{
 489	u32 hash, id;
 490
 491	/* Note the following code is not safe, but this is okay. */
 492	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 493		get_random_bytes(&net->ipv4.ip_id_key,
 494				 sizeof(net->ipv4.ip_id_key));
 495
 496	hash = siphash_3u32((__force u32)iph->daddr,
 497			    (__force u32)iph->saddr,
 498			    iph->protocol,
 499			    &net->ipv4.ip_id_key);
 500	id = ip_idents_reserve(hash, segs);
 501	iph->id = htons(id);
 502}
 503EXPORT_SYMBOL(__ip_select_ident);
 504
 505static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 506			     const struct sock *sk, const struct iphdr *iph,
 507			     int oif, __u8 tos, u8 prot, u32 mark,
 508			     int flow_flags)
 509{
 510	__u8 scope = RT_SCOPE_UNIVERSE;
 511
 512	if (sk) {
 513		oif = sk->sk_bound_dev_if;
 514		mark = READ_ONCE(sk->sk_mark);
 515		tos = ip_sock_rt_tos(sk);
 516		scope = ip_sock_rt_scope(sk);
 517		prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
 518						    sk->sk_protocol;
 519	}
 520
 521	flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope,
 522			   prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
 523			   sock_net_uid(net, sk));
 524}
 525
 526static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 527			       const struct sock *sk)
 528{
 529	const struct net *net = dev_net(skb->dev);
 530	const struct iphdr *iph = ip_hdr(skb);
 531	int oif = skb->dev->ifindex;
 532	u8 prot = iph->protocol;
 533	u32 mark = skb->mark;
 534	__u8 tos = iph->tos;
 535
 536	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 537}
 538
 539static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 540{
 541	const struct inet_sock *inet = inet_sk(sk);
 542	const struct ip_options_rcu *inet_opt;
 543	__be32 daddr = inet->inet_daddr;
 544
 545	rcu_read_lock();
 546	inet_opt = rcu_dereference(inet->inet_opt);
 547	if (inet_opt && inet_opt->opt.srr)
 548		daddr = inet_opt->opt.faddr;
 549	flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
 550			   ip_sock_rt_tos(sk),
 551			   ip_sock_rt_scope(sk),
 552			   inet_test_bit(HDRINCL, sk) ?
 553				IPPROTO_RAW : sk->sk_protocol,
 554			   inet_sk_flowi_flags(sk),
 555			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 556	rcu_read_unlock();
 557}
 558
 559static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 560				 const struct sk_buff *skb)
 561{
 562	if (skb)
 563		build_skb_flow_key(fl4, skb, sk);
 564	else
 565		build_sk_flow_key(fl4, sk);
 566}
 567
 568static DEFINE_SPINLOCK(fnhe_lock);
 569
 570static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 571{
 572	struct rtable *rt;
 573
 574	rt = rcu_dereference(fnhe->fnhe_rth_input);
 575	if (rt) {
 576		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 577		dst_dev_put(&rt->dst);
 578		dst_release(&rt->dst);
 579	}
 580	rt = rcu_dereference(fnhe->fnhe_rth_output);
 581	if (rt) {
 582		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 583		dst_dev_put(&rt->dst);
 584		dst_release(&rt->dst);
 585	}
 586}
 587
 588static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 589{
 590	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 591	struct fib_nh_exception *fnhe, *oldest = NULL;
 592
 593	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 594		fnhe = rcu_dereference_protected(*fnhe_p,
 595						 lockdep_is_held(&fnhe_lock));
 596		if (!fnhe)
 597			break;
 598		if (!oldest ||
 599		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 600			oldest = fnhe;
 601			oldest_p = fnhe_p;
 602		}
 603	}
 604	fnhe_flush_routes(oldest);
 605	*oldest_p = oldest->fnhe_next;
 606	kfree_rcu(oldest, rcu);
 607}
 608
 609static u32 fnhe_hashfun(__be32 daddr)
 610{
 611	static siphash_aligned_key_t fnhe_hash_key;
 612	u64 hval;
 613
 614	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
 615	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
 616	return hash_64(hval, FNHE_HASH_SHIFT);
 617}
 618
 619static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 620{
 621	rt->rt_pmtu = fnhe->fnhe_pmtu;
 622	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 623	rt->dst.expires = fnhe->fnhe_expires;
 624
 625	if (fnhe->fnhe_gw) {
 626		rt->rt_flags |= RTCF_REDIRECTED;
 627		rt->rt_uses_gateway = 1;
 628		rt->rt_gw_family = AF_INET;
 629		rt->rt_gw4 = fnhe->fnhe_gw;
 630	}
 631}
 632
 633static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 634				  __be32 gw, u32 pmtu, bool lock,
 635				  unsigned long expires)
 636{
 637	struct fnhe_hash_bucket *hash;
 638	struct fib_nh_exception *fnhe;
 639	struct rtable *rt;
 640	u32 genid, hval;
 641	unsigned int i;
 642	int depth;
 643
 644	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 645	hval = fnhe_hashfun(daddr);
 646
 647	spin_lock_bh(&fnhe_lock);
 648
 649	hash = rcu_dereference(nhc->nhc_exceptions);
 650	if (!hash) {
 651		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 652		if (!hash)
 653			goto out_unlock;
 654		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 655	}
 656
 657	hash += hval;
 658
 659	depth = 0;
 660	for (fnhe = rcu_dereference(hash->chain); fnhe;
 661	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 662		if (fnhe->fnhe_daddr == daddr)
 663			break;
 664		depth++;
 665	}
 666
 667	if (fnhe) {
 668		if (fnhe->fnhe_genid != genid)
 669			fnhe->fnhe_genid = genid;
 670		if (gw)
 671			fnhe->fnhe_gw = gw;
 672		if (pmtu) {
 673			fnhe->fnhe_pmtu = pmtu;
 674			fnhe->fnhe_mtu_locked = lock;
 675		}
 676		fnhe->fnhe_expires = max(1UL, expires);
 677		/* Update all cached dsts too */
 678		rt = rcu_dereference(fnhe->fnhe_rth_input);
 679		if (rt)
 680			fill_route_from_fnhe(rt, fnhe);
 681		rt = rcu_dereference(fnhe->fnhe_rth_output);
 682		if (rt)
 683			fill_route_from_fnhe(rt, fnhe);
 684	} else {
  685		/* Randomize max depth to avoid some side-channel attacks. */
 686		int max_depth = FNHE_RECLAIM_DEPTH +
 687				get_random_u32_below(FNHE_RECLAIM_DEPTH);
 688
 689		while (depth > max_depth) {
 690			fnhe_remove_oldest(hash);
 691			depth--;
 692		}
 693
 694		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 695		if (!fnhe)
 696			goto out_unlock;
 697
 698		fnhe->fnhe_next = hash->chain;
 699
 700		fnhe->fnhe_genid = genid;
 701		fnhe->fnhe_daddr = daddr;
 702		fnhe->fnhe_gw = gw;
 703		fnhe->fnhe_pmtu = pmtu;
 704		fnhe->fnhe_mtu_locked = lock;
 705		fnhe->fnhe_expires = max(1UL, expires);
 706
 707		rcu_assign_pointer(hash->chain, fnhe);
 708
 709		/* Exception created; mark the cached routes for the nexthop
 710		 * stale, so anyone caching it rechecks if this exception
 711		 * applies to them.
 712		 */
 713		rt = rcu_dereference(nhc->nhc_rth_input);
 714		if (rt)
 715			rt->dst.obsolete = DST_OBSOLETE_KILL;
 716
 717		for_each_possible_cpu(i) {
 718			struct rtable __rcu **prt;
 719
 720			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 721			rt = rcu_dereference(*prt);
 722			if (rt)
 723				rt->dst.obsolete = DST_OBSOLETE_KILL;
 724		}
 725	}
 726
 727	fnhe->fnhe_stamp = jiffies;
 728
 729out_unlock:
 730	spin_unlock_bh(&fnhe_lock);
 731}
 732
 733static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 734			     bool kill_route)
 735{
 736	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 737	__be32 old_gw = ip_hdr(skb)->saddr;
 738	struct net_device *dev = skb->dev;
 739	struct in_device *in_dev;
 740	struct fib_result res;
 741	struct neighbour *n;
 742	struct net *net;
 743
 744	switch (icmp_hdr(skb)->code & 7) {
 745	case ICMP_REDIR_NET:
 746	case ICMP_REDIR_NETTOS:
 747	case ICMP_REDIR_HOST:
 748	case ICMP_REDIR_HOSTTOS:
 749		break;
 750
 751	default:
 752		return;
 753	}
 754
 755	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 756		return;
 757
 758	in_dev = __in_dev_get_rcu(dev);
 759	if (!in_dev)
 760		return;
 761
 762	net = dev_net(dev);
 763	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 764	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 765	    ipv4_is_zeronet(new_gw))
 766		goto reject_redirect;
 767
 768	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 769		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 770			goto reject_redirect;
 771		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 772			goto reject_redirect;
 773	} else {
 774		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 775			goto reject_redirect;
 776	}
 777
 778	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
 779	if (!n)
 780		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 781	if (!IS_ERR(n)) {
 782		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 783			neigh_event_send(n, NULL);
 784		} else {
 785			if (fib_lookup(net, fl4, &res, 0) == 0) {
 786				struct fib_nh_common *nhc;
 787
 788				fib_select_path(net, &res, fl4, skb);
 789				nhc = FIB_RES_NHC(res);
 790				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 791						0, false,
 792						jiffies + ip_rt_gc_timeout);
 793			}
 794			if (kill_route)
 795				rt->dst.obsolete = DST_OBSOLETE_KILL;
 796			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 797		}
 798		neigh_release(n);
 799	}
 800	return;
 801
 802reject_redirect:
 803#ifdef CONFIG_IP_ROUTE_VERBOSE
 804	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 805		const struct iphdr *iph = (const struct iphdr *) skb->data;
 806		__be32 daddr = iph->daddr;
 807		__be32 saddr = iph->saddr;
 808
 809		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 810				     "  Advised path = %pI4 -> %pI4\n",
 811				     &old_gw, dev->name, &new_gw,
 812				     &saddr, &daddr);
 813	}
 814#endif
 815	;
 816}
 817
 818static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 819{
 820	struct rtable *rt;
 821	struct flowi4 fl4;
 822	const struct iphdr *iph = (const struct iphdr *) skb->data;
 823	struct net *net = dev_net(skb->dev);
 824	int oif = skb->dev->ifindex;
 825	u8 prot = iph->protocol;
 826	u32 mark = skb->mark;
 827	__u8 tos = iph->tos;
 828
 829	rt = dst_rtable(dst);
 830
 831	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 832	__ip_do_redirect(rt, skb, &fl4, true);
 833}
 834
 835static void ipv4_negative_advice(struct sock *sk,
 836				 struct dst_entry *dst)
 837{
 838	struct rtable *rt = dst_rtable(dst);
 839
 840	if ((dst->obsolete > 0) ||
 841	    (rt->rt_flags & RTCF_REDIRECTED) ||
 842	    rt->dst.expires)
 843		sk_dst_reset(sk);
 844}
 845
 846/*
 847 * Algorithm:
 848 *	1. The first ip_rt_redirect_number redirects are sent
 849 *	   with exponential backoff, then we stop sending them at all,
 850 *	   assuming that the host ignores our redirects.
 851 *	2. If we did not see packets requiring redirects
 852 *	   during ip_rt_redirect_silence, we assume that the host
  853 *	   forgot the redirected route and start to send redirects again.
 854 *
 855 * This algorithm is much cheaper and more intelligent than dumb load limiting
 856 * in icmp.c.
 857 *
 858 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 859 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 860 */
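     /* For example, with the defaults above (ip_rt_redirect_number = 9,
      * ip_rt_redirect_load = HZ/50, ip_rt_redirect_silence = (HZ/50) << 10)
      * and assuming HZ=1000: the first redirect to a peer is sent
      * immediately, the following ones only after at least 40ms, 80ms,
      * 160ms, ... of quiet time (20ms << n_redirects).  After nine
      * redirects nothing more is sent until the peer stops triggering
      * redirects for about 20.5 seconds (20480 jiffies), at which point
      * the counters are reset.
      */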
 861
 862void ip_rt_send_redirect(struct sk_buff *skb)
 863{
 864	struct rtable *rt = skb_rtable(skb);
 865	struct in_device *in_dev;
 866	struct inet_peer *peer;
 867	struct net *net;
 868	int log_martians;
 869	int vif;
 870
 871	rcu_read_lock();
 872	in_dev = __in_dev_get_rcu(rt->dst.dev);
 873	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 874		rcu_read_unlock();
 875		return;
 876	}
 877	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 878	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 879
 880	net = dev_net(rt->dst.dev);
 881	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
 882	if (!peer) {
 883		rcu_read_unlock();
 884		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 885			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 886		return;
 887	}
 888
 889	/* No redirected packets during ip_rt_redirect_silence;
 890	 * reset the algorithm.
 891	 */
 892	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 893		peer->rate_tokens = 0;
 894		peer->n_redirects = 0;
 895	}
 896
  897	/* Too many ignored redirects; do not send anything.
  898	 * Set peer->rate_last to the last seen redirected packet.
 899	 */
 900	if (peer->n_redirects >= ip_rt_redirect_number) {
 901		peer->rate_last = jiffies;
 902		goto out_unlock;
 903	}
 904
 905	/* Check for load limit; set rate_last to the latest sent
 906	 * redirect.
 907	 */
 908	if (peer->n_redirects == 0 ||
 909	    time_after(jiffies,
 910		       (peer->rate_last +
 911			(ip_rt_redirect_load << peer->n_redirects)))) {
 912		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 913
 914		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 915		peer->rate_last = jiffies;
 916		++peer->n_redirects;
 917		if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
 918		    peer->n_redirects == ip_rt_redirect_number)
 919			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 920					     &ip_hdr(skb)->saddr, inet_iif(skb),
 921					     &ip_hdr(skb)->daddr, &gw);
 922	}
 923out_unlock:
 924	rcu_read_unlock();
 925}
 926
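     /* ICMP errors sent from ip_error() below are paced by a token
      * bucket kept in the inet_peer entry of the source address:
      * rate_tokens grows with the jiffies elapsed since the last error,
      * is capped at ip_rt_error_burst (5 * HZ by default), and each
      * error sent costs ip_rt_error_cost (HZ by default) - i.e. a burst
      * of up to about five errors, then roughly one per second per peer.
      */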
 927static int ip_error(struct sk_buff *skb)
 928{
 929	struct rtable *rt = skb_rtable(skb);
 930	struct net_device *dev = skb->dev;
 931	struct in_device *in_dev;
 932	struct inet_peer *peer;
 933	unsigned long now;
 934	struct net *net;
 935	SKB_DR(reason);
 936	bool send;
 937	int code;
 938
 939	if (netif_is_l3_master(skb->dev)) {
 940		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 941		if (!dev)
 942			goto out;
 943	}
 944
 945	in_dev = __in_dev_get_rcu(dev);
 946
 947	/* IP on this device is disabled. */
 948	if (!in_dev)
 949		goto out;
 950
 951	net = dev_net(rt->dst.dev);
 952	if (!IN_DEV_FORWARD(in_dev)) {
 953		switch (rt->dst.error) {
 954		case EHOSTUNREACH:
 955			SKB_DR_SET(reason, IP_INADDRERRORS);
 956			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 957			break;
 958
 959		case ENETUNREACH:
 960			SKB_DR_SET(reason, IP_INNOROUTES);
 961			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 962			break;
 963		}
 964		goto out;
 965	}
 966
 967	switch (rt->dst.error) {
 968	case EINVAL:
 969	default:
 970		goto out;
 971	case EHOSTUNREACH:
 972		code = ICMP_HOST_UNREACH;
 973		break;
 974	case ENETUNREACH:
 975		code = ICMP_NET_UNREACH;
 976		SKB_DR_SET(reason, IP_INNOROUTES);
 977		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 978		break;
 979	case EACCES:
 980		code = ICMP_PKT_FILTERED;
 981		break;
 982	}
 983
 984	rcu_read_lock();
 985	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 986			       l3mdev_master_ifindex_rcu(skb->dev));
 987	send = true;
 988	if (peer) {
 989		now = jiffies;
 990		peer->rate_tokens += now - peer->rate_last;
 991		if (peer->rate_tokens > ip_rt_error_burst)
 992			peer->rate_tokens = ip_rt_error_burst;
 993		peer->rate_last = now;
 994		if (peer->rate_tokens >= ip_rt_error_cost)
 995			peer->rate_tokens -= ip_rt_error_cost;
 996		else
 997			send = false;
 998	}
 999	rcu_read_unlock();
1000
1001	if (send)
1002		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1003
1004out:	kfree_skb_reason(skb, reason);
1005	return 0;
1006}
1007
1008static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1009{
1010	struct dst_entry *dst = &rt->dst;
1011	struct fib_result res;
1012	bool lock = false;
1013	struct net *net;
1014	u32 old_mtu;
1015
1016	if (ip_mtu_locked(dst))
1017		return;
1018
1019	old_mtu = ipv4_mtu(dst);
1020	if (old_mtu < mtu)
1021		return;
1022
1023	rcu_read_lock();
1024	net = dev_net_rcu(dst->dev);
1025	if (mtu < net->ipv4.ip_rt_min_pmtu) {
1026		lock = true;
1027		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
1028	}
1029
1030	if (rt->rt_pmtu == mtu && !lock &&
1031	    time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
1032		goto out;
1033
1034	if (fib_lookup(net, fl4, &res, 0) == 0) {
1035		struct fib_nh_common *nhc;
1036
1037		fib_select_path(net, &res, fl4, NULL);
1038#ifdef CONFIG_IP_ROUTE_MULTIPATH
1039		if (fib_info_num_path(res.fi) > 1) {
1040			int nhsel;
1041
1042			for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
1043				nhc = fib_info_nhc(res.fi, nhsel);
1044				update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1045						      jiffies + net->ipv4.ip_rt_mtu_expires);
1046			}
1047			goto out;
1048		}
1049#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1050		nhc = FIB_RES_NHC(res);
1051		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1052				      jiffies + net->ipv4.ip_rt_mtu_expires);
1053	}
1054out:
1055	rcu_read_unlock();
1056}
1057
1058static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1059			      struct sk_buff *skb, u32 mtu,
1060			      bool confirm_neigh)
1061{
1062	struct rtable *rt = dst_rtable(dst);
1063	struct flowi4 fl4;
1064
1065	ip_rt_build_flow_key(&fl4, sk, skb);
1066
1067	/* Don't make lookup fail for bridged encapsulations */
1068	if (skb && netif_is_any_bridge_port(skb->dev))
1069		fl4.flowi4_oif = 0;
1070
1071	__ip_rt_update_pmtu(rt, &fl4, mtu);
1072}
1073
1074void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1075		      int oif, u8 protocol)
1076{
1077	const struct iphdr *iph = (const struct iphdr *)skb->data;
1078	struct flowi4 fl4;
1079	struct rtable *rt;
1080	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1081
1082	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
1083			 0);
1084	rt = __ip_route_output_key(net, &fl4);
1085	if (!IS_ERR(rt)) {
1086		__ip_rt_update_pmtu(rt, &fl4, mtu);
1087		ip_rt_put(rt);
1088	}
1089}
1090EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1091
1092static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1093{
1094	const struct iphdr *iph = (const struct iphdr *)skb->data;
1095	struct flowi4 fl4;
1096	struct rtable *rt;
1097
1098	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1099
1100	if (!fl4.flowi4_mark)
1101		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1102
1103	rt = __ip_route_output_key(sock_net(sk), &fl4);
1104	if (!IS_ERR(rt)) {
1105		__ip_rt_update_pmtu(rt, &fl4, mtu);
1106		ip_rt_put(rt);
1107	}
1108}
1109
1110void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1111{
1112	const struct iphdr *iph = (const struct iphdr *)skb->data;
1113	struct flowi4 fl4;
1114	struct rtable *rt;
1115	struct dst_entry *odst = NULL;
1116	bool new = false;
1117	struct net *net = sock_net(sk);
1118
1119	bh_lock_sock(sk);
1120
1121	if (!ip_sk_accept_pmtu(sk))
1122		goto out;
1123
1124	odst = sk_dst_get(sk);
1125
1126	if (sock_owned_by_user(sk) || !odst) {
1127		__ipv4_sk_update_pmtu(skb, sk, mtu);
1128		goto out;
1129	}
1130
1131	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1132
1133	rt = dst_rtable(odst);
1134	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1135		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1136		if (IS_ERR(rt))
1137			goto out;
1138
1139		new = true;
1140	}
1141
1142	__ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);
1143
1144	if (!dst_check(&rt->dst, 0)) {
1145		if (new)
1146			dst_release(&rt->dst);
1147
1148		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1149		if (IS_ERR(rt))
1150			goto out;
1151
1152		new = true;
1153	}
1154
1155	if (new)
1156		sk_dst_set(sk, &rt->dst);
1157
1158out:
1159	bh_unlock_sock(sk);
1160	dst_release(odst);
1161}
1162EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1163
1164void ipv4_redirect(struct sk_buff *skb, struct net *net,
1165		   int oif, u8 protocol)
1166{
1167	const struct iphdr *iph = (const struct iphdr *)skb->data;
1168	struct flowi4 fl4;
1169	struct rtable *rt;
1170
1171	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
1172	rt = __ip_route_output_key(net, &fl4);
1173	if (!IS_ERR(rt)) {
1174		__ip_do_redirect(rt, skb, &fl4, false);
1175		ip_rt_put(rt);
1176	}
1177}
1178EXPORT_SYMBOL_GPL(ipv4_redirect);
1179
1180void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1181{
1182	const struct iphdr *iph = (const struct iphdr *)skb->data;
1183	struct flowi4 fl4;
1184	struct rtable *rt;
1185	struct net *net = sock_net(sk);
1186
1187	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1188	rt = __ip_route_output_key(net, &fl4);
1189	if (!IS_ERR(rt)) {
1190		__ip_do_redirect(rt, skb, &fl4, false);
1191		ip_rt_put(rt);
1192	}
1193}
1194EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1195
1196INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1197							 u32 cookie)
1198{
1199	struct rtable *rt = dst_rtable(dst);
1200
1201	/* All IPV4 dsts are created with ->obsolete set to the value
1202	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1203	 * into this function always.
1204	 *
1205	 * When a PMTU/redirect information update invalidates a route,
1206	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1207	 * DST_OBSOLETE_DEAD.
1208	 */
1209	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1210		return NULL;
1211	return dst;
1212}
1213EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
1214
1215static void ipv4_send_dest_unreach(struct sk_buff *skb)
1216{
1217	struct net_device *dev;
1218	struct ip_options opt;
1219	int res;
1220
1221	/* Recompile ip options since IPCB may not be valid anymore.
1222	 * Also check we have a reasonable ipv4 header.
1223	 */
1224	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1225	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1226		return;
1227
1228	memset(&opt, 0, sizeof(opt));
1229	if (ip_hdr(skb)->ihl > 5) {
1230		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1231			return;
1232		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1233
1234		rcu_read_lock();
1235		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1236		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1237		rcu_read_unlock();
1238
1239		if (res)
1240			return;
1241	}
1242	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1243}
1244
1245static void ipv4_link_failure(struct sk_buff *skb)
1246{
1247	struct rtable *rt;
1248
1249	ipv4_send_dest_unreach(skb);
1250
1251	rt = skb_rtable(skb);
1252	if (rt)
1253		dst_set_expires(&rt->dst, 0);
1254}
1255
1256static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1257{
1258	pr_debug("%s: %pI4 -> %pI4, %s\n",
1259		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1260		 skb->dev ? skb->dev->name : "?");
1261	kfree_skb(skb);
1262	WARN_ON(1);
1263	return 0;
1264}
1265
1266/*
1267 * We do not cache source address of outgoing interface,
1268 * because it is used only by IP RR, TS and SRR options,
 1269 * so that it is out of the fast path.
1270 *
1271 * BTW remember: "addr" is allowed to be not aligned
1272 * in IP options!
1273 */
1274
1275void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1276{
1277	__be32 src;
1278
1279	if (rt_is_output_route(rt))
1280		src = ip_hdr(skb)->saddr;
1281	else {
1282		struct fib_result res;
1283		struct iphdr *iph = ip_hdr(skb);
1284		struct flowi4 fl4 = {
1285			.daddr = iph->daddr,
1286			.saddr = iph->saddr,
1287			.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
1288			.flowi4_oif = rt->dst.dev->ifindex,
1289			.flowi4_iif = skb->dev->ifindex,
1290			.flowi4_mark = skb->mark,
1291		};
1292
1293		rcu_read_lock();
1294		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1295			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1296		else
1297			src = inet_select_addr(rt->dst.dev,
1298					       rt_nexthop(rt, iph->daddr),
1299					       RT_SCOPE_UNIVERSE);
1300		rcu_read_unlock();
1301	}
1302	memcpy(addr, &src, 4);
1303}
1304
1305#ifdef CONFIG_IP_ROUTE_CLASSID
1306static void set_class_tag(struct rtable *rt, u32 tag)
1307{
1308	if (!(rt->dst.tclassid & 0xFFFF))
1309		rt->dst.tclassid |= tag & 0xFFFF;
1310	if (!(rt->dst.tclassid & 0xFFFF0000))
1311		rt->dst.tclassid |= tag & 0xFFFF0000;
1312}
1313#endif
1314
1315static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1316{
1317	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1318	unsigned int advmss;
1319	struct net *net;
1320
1321	rcu_read_lock();
1322	net = dev_net_rcu(dst->dev);
1323	advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1324				   net->ipv4.ip_rt_min_advmss);
1325	rcu_read_unlock();
1326
1327	return min(advmss, IPV4_MAX_PMTU - header_size);
1328}
1329
1330INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1331{
1332	return ip_dst_mtu_maybe_forward(dst, false);
1333}
1334EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
1335
1336static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1337{
1338	struct fnhe_hash_bucket *hash;
1339	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1340	u32 hval = fnhe_hashfun(daddr);
1341
1342	spin_lock_bh(&fnhe_lock);
1343
1344	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1345					 lockdep_is_held(&fnhe_lock));
1346	hash += hval;
1347
1348	fnhe_p = &hash->chain;
1349	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1350	while (fnhe) {
1351		if (fnhe->fnhe_daddr == daddr) {
1352			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1353				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1354			/* set fnhe_daddr to 0 to ensure it won't bind with
1355			 * new dsts in rt_bind_exception().
1356			 */
1357			fnhe->fnhe_daddr = 0;
1358			fnhe_flush_routes(fnhe);
1359			kfree_rcu(fnhe, rcu);
1360			break;
1361		}
1362		fnhe_p = &fnhe->fnhe_next;
1363		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1364						 lockdep_is_held(&fnhe_lock));
1365	}
1366
1367	spin_unlock_bh(&fnhe_lock);
1368}
1369
1370static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1371					       __be32 daddr)
1372{
1373	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1374	struct fib_nh_exception *fnhe;
1375	u32 hval;
1376
1377	if (!hash)
1378		return NULL;
1379
1380	hval = fnhe_hashfun(daddr);
1381
1382	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1383	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1384		if (fnhe->fnhe_daddr == daddr) {
1385			if (fnhe->fnhe_expires &&
1386			    time_after(jiffies, fnhe->fnhe_expires)) {
1387				ip_del_fnhe(nhc, daddr);
1388				break;
1389			}
1390			return fnhe;
1391		}
1392	}
1393	return NULL;
1394}
1395
1396/* MTU selection:
1397 * 1. mtu on route is locked - use it
1398 * 2. mtu from nexthop exception
1399 * 3. mtu from egress device
1400 */
1401
1402u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1403{
1404	struct fib_nh_common *nhc = res->nhc;
1405	struct net_device *dev = nhc->nhc_dev;
1406	struct fib_info *fi = res->fi;
1407	u32 mtu = 0;
1408
1409	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1410	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1411		mtu = fi->fib_mtu;
1412
1413	if (likely(!mtu)) {
1414		struct fib_nh_exception *fnhe;
1415
1416		fnhe = find_exception(nhc, daddr);
1417		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1418			mtu = fnhe->fnhe_pmtu;
1419	}
1420
1421	if (likely(!mtu))
1422		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1423
1424	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1425}
1426
1427static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1428			      __be32 daddr, const bool do_cache)
1429{
1430	bool ret = false;
1431
1432	spin_lock_bh(&fnhe_lock);
1433
1434	if (daddr == fnhe->fnhe_daddr) {
1435		struct rtable __rcu **porig;
1436		struct rtable *orig;
1437		int genid = fnhe_genid(dev_net(rt->dst.dev));
1438
1439		if (rt_is_input_route(rt))
1440			porig = &fnhe->fnhe_rth_input;
1441		else
1442			porig = &fnhe->fnhe_rth_output;
1443		orig = rcu_dereference(*porig);
1444
1445		if (fnhe->fnhe_genid != genid) {
1446			fnhe->fnhe_genid = genid;
1447			fnhe->fnhe_gw = 0;
1448			fnhe->fnhe_pmtu = 0;
1449			fnhe->fnhe_expires = 0;
1450			fnhe->fnhe_mtu_locked = false;
1451			fnhe_flush_routes(fnhe);
1452			orig = NULL;
1453		}
1454		fill_route_from_fnhe(rt, fnhe);
1455		if (!rt->rt_gw4) {
1456			rt->rt_gw4 = daddr;
1457			rt->rt_gw_family = AF_INET;
1458		}
1459
1460		if (do_cache) {
1461			dst_hold(&rt->dst);
1462			rcu_assign_pointer(*porig, rt);
1463			if (orig) {
1464				dst_dev_put(&orig->dst);
1465				dst_release(&orig->dst);
1466			}
1467			ret = true;
1468		}
1469
1470		fnhe->fnhe_stamp = jiffies;
1471	}
1472	spin_unlock_bh(&fnhe_lock);
1473
1474	return ret;
1475}
1476
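     /* Next-hop route caching: input routes are stored in the single
      * nhc_rth_input slot, output routes in a per-CPU
      * nhc_pcpu_rth_output slot.  If the cmpxchg() below loses against
      * a concurrent writer, the extra reference taken on the new route
      * is dropped and the caller (rt_set_nexthop()) falls back to
      * putting the route on the uncached list.
      */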
1477static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1478{
1479	struct rtable *orig, *prev, **p;
1480	bool ret = true;
1481
1482	if (rt_is_input_route(rt)) {
1483		p = (struct rtable **)&nhc->nhc_rth_input;
1484	} else {
1485		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1486	}
1487	orig = *p;
1488
1489	/* hold dst before doing cmpxchg() to avoid race condition
1490	 * on this dst
1491	 */
1492	dst_hold(&rt->dst);
1493	prev = cmpxchg(p, orig, rt);
1494	if (prev == orig) {
1495		if (orig) {
1496			rt_add_uncached_list(orig);
1497			dst_release(&orig->dst);
1498		}
1499	} else {
1500		dst_release(&rt->dst);
1501		ret = false;
1502	}
1503
1504	return ret;
1505}
1506
1507struct uncached_list {
1508	spinlock_t		lock;
1509	struct list_head	head;
1510};
1511
1512static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1513
1514void rt_add_uncached_list(struct rtable *rt)
1515{
1516	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1517
1518	rt->dst.rt_uncached_list = ul;
1519
1520	spin_lock_bh(&ul->lock);
1521	list_add_tail(&rt->dst.rt_uncached, &ul->head);
1522	spin_unlock_bh(&ul->lock);
1523}
1524
1525void rt_del_uncached_list(struct rtable *rt)
1526{
1527	if (!list_empty(&rt->dst.rt_uncached)) {
1528		struct uncached_list *ul = rt->dst.rt_uncached_list;
1529
1530		spin_lock_bh(&ul->lock);
1531		list_del_init(&rt->dst.rt_uncached);
1532		spin_unlock_bh(&ul->lock);
1533	}
1534}
1535
1536static void ipv4_dst_destroy(struct dst_entry *dst)
1537{
1538	ip_dst_metrics_put(dst);
1539	rt_del_uncached_list(dst_rtable(dst));
1540}
1541
1542void rt_flush_dev(struct net_device *dev)
1543{
1544	struct rtable *rt, *safe;
1545	int cpu;
1546
1547	for_each_possible_cpu(cpu) {
1548		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1549
1550		if (list_empty(&ul->head))
1551			continue;
1552
1553		spin_lock_bh(&ul->lock);
1554		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
1555			if (rt->dst.dev != dev)
1556				continue;
1557			rt->dst.dev = blackhole_netdev;
1558			netdev_ref_replace(dev, blackhole_netdev,
1559					   &rt->dst.dev_tracker, GFP_ATOMIC);
1560			list_del_init(&rt->dst.rt_uncached);
1561		}
1562		spin_unlock_bh(&ul->lock);
1563	}
1564}
1565
1566static bool rt_cache_valid(const struct rtable *rt)
1567{
1568	return	rt &&
1569		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1570		!rt_is_expired(rt);
1571}
1572
1573static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1574			   const struct fib_result *res,
1575			   struct fib_nh_exception *fnhe,
1576			   struct fib_info *fi, u16 type, u32 itag,
1577			   const bool do_cache)
1578{
1579	bool cached = false;
1580
1581	if (fi) {
1582		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1583
1584		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1585			rt->rt_uses_gateway = 1;
1586			rt->rt_gw_family = nhc->nhc_gw_family;
1587			/* only INET and INET6 are supported */
1588			if (likely(nhc->nhc_gw_family == AF_INET))
1589				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1590			else
1591				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1592		}
1593
1594		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1595
1596#ifdef CONFIG_IP_ROUTE_CLASSID
1597		if (nhc->nhc_family == AF_INET) {
1598			struct fib_nh *nh;
1599
1600			nh = container_of(nhc, struct fib_nh, nh_common);
1601			rt->dst.tclassid = nh->nh_tclassid;
1602		}
1603#endif
1604		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1605		if (unlikely(fnhe))
1606			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1607		else if (do_cache)
1608			cached = rt_cache_route(nhc, rt);
1609		if (unlikely(!cached)) {
1610			/* Routes we intend to cache in nexthop exception or
1611			 * FIB nexthop have the DST_NOCACHE bit clear.
1612			 * However, if we are unsuccessful at storing this
1613			 * route into the cache we really need to set it.
1614			 */
1615			if (!rt->rt_gw4) {
1616				rt->rt_gw_family = AF_INET;
1617				rt->rt_gw4 = daddr;
1618			}
1619			rt_add_uncached_list(rt);
1620		}
1621	} else
1622		rt_add_uncached_list(rt);
1623
1624#ifdef CONFIG_IP_ROUTE_CLASSID
1625#ifdef CONFIG_IP_MULTIPLE_TABLES
1626	set_class_tag(rt, res->tclassid);
1627#endif
1628	set_class_tag(rt, itag);
1629#endif
1630}
1631
1632struct rtable *rt_dst_alloc(struct net_device *dev,
1633			    unsigned int flags, u16 type,
1634			    bool noxfrm)
1635{
1636	struct rtable *rt;
1637
1638	rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
1639		       (noxfrm ? DST_NOXFRM : 0));
1640
1641	if (rt) {
1642		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1643		rt->rt_flags = flags;
1644		rt->rt_type = type;
1645		rt->rt_is_input = 0;
1646		rt->rt_iif = 0;
1647		rt->rt_pmtu = 0;
1648		rt->rt_mtu_locked = 0;
1649		rt->rt_uses_gateway = 0;
1650		rt->rt_gw_family = 0;
1651		rt->rt_gw4 = 0;
1652
1653		rt->dst.output = ip_output;
1654		if (flags & RTCF_LOCAL)
1655			rt->dst.input = ip_local_deliver;
1656	}
1657
1658	return rt;
1659}
1660EXPORT_SYMBOL(rt_dst_alloc);
1661
1662struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1663{
1664	struct rtable *new_rt;
1665
1666	new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
1667			   rt->dst.flags);
1668
1669	if (new_rt) {
1670		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1671		new_rt->rt_flags = rt->rt_flags;
1672		new_rt->rt_type = rt->rt_type;
1673		new_rt->rt_is_input = rt->rt_is_input;
1674		new_rt->rt_iif = rt->rt_iif;
1675		new_rt->rt_pmtu = rt->rt_pmtu;
1676		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1677		new_rt->rt_gw_family = rt->rt_gw_family;
1678		if (rt->rt_gw_family == AF_INET)
1679			new_rt->rt_gw4 = rt->rt_gw4;
1680		else if (rt->rt_gw_family == AF_INET6)
1681			new_rt->rt_gw6 = rt->rt_gw6;
1682
1683		new_rt->dst.input = rt->dst.input;
1684		new_rt->dst.output = rt->dst.output;
1685		new_rt->dst.error = rt->dst.error;
1686		new_rt->dst.lastuse = jiffies;
1687		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1688	}
1689	return new_rt;
1690}
1691EXPORT_SYMBOL(rt_dst_clone);
1692
1693/* called in rcu_read_lock() section */
1694enum skb_drop_reason
1695ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1696		      dscp_t dscp, struct net_device *dev,
1697		      struct in_device *in_dev, u32 *itag)
1698{
1699	enum skb_drop_reason reason;
1700
1701	/* Primary sanity checks. */
1702	if (!in_dev)
1703		return SKB_DROP_REASON_NOT_SPECIFIED;
1704
1705	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1706		return SKB_DROP_REASON_IP_INVALID_SOURCE;
1707
1708	if (skb->protocol != htons(ETH_P_IP))
1709		return SKB_DROP_REASON_INVALID_PROTO;
1710
1711	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1712		return SKB_DROP_REASON_IP_LOCALNET;
1713
1714	if (ipv4_is_zeronet(saddr)) {
1715		if (!ipv4_is_local_multicast(daddr) &&
1716		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1717			return SKB_DROP_REASON_IP_INVALID_SOURCE;
1718	} else {
1719		reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
1720						    dev, in_dev, itag);
1721		if (reason)
1722			return reason;
1723	}
1724	return SKB_NOT_DROPPED_YET;
1725}
1726
1727/* called in rcu_read_lock() section */
1728static enum skb_drop_reason
1729ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1730		  dscp_t dscp, struct net_device *dev, int our)
1731{
1732	struct in_device *in_dev = __in_dev_get_rcu(dev);
1733	unsigned int flags = RTCF_MULTICAST;
1734	enum skb_drop_reason reason;
1735	struct rtable *rth;
1736	u32 itag = 0;
1737
1738	reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev,
1739				       &itag);
1740	if (reason)
1741		return reason;
1742
1743	if (our)
1744		flags |= RTCF_LOCAL;
1745
1746	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1747		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1748
1749	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1750			   false);
1751	if (!rth)
1752		return SKB_DROP_REASON_NOMEM;
1753
1754#ifdef CONFIG_IP_ROUTE_CLASSID
1755	rth->dst.tclassid = itag;
1756#endif
1757	rth->dst.output = ip_rt_bug;
 1758	rth->rt_is_input = 1;
1759
1760#ifdef CONFIG_IP_MROUTE
1761	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1762		rth->dst.input = ip_mr_input;
1763#endif
1764	RT_CACHE_STAT_INC(in_slow_mc);
1765
1766	skb_dst_drop(skb);
1767	skb_dst_set(skb, &rth->dst);
1768	return SKB_NOT_DROPPED_YET;
1769}
1770
1771
1772static void ip_handle_martian_source(struct net_device *dev,
1773				     struct in_device *in_dev,
1774				     struct sk_buff *skb,
1775				     __be32 daddr,
1776				     __be32 saddr)
1777{
1778	RT_CACHE_STAT_INC(in_martian_src);
1779#ifdef CONFIG_IP_ROUTE_VERBOSE
1780	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1781		/*
 1782		 *	RFC1812 recommendation: if the source is martian,
 1783		 *	the only hint is the MAC header.
1784		 */
1785		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1786			&daddr, &saddr, dev->name);
1787		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1788			print_hex_dump(KERN_WARNING, "ll header: ",
1789				       DUMP_PREFIX_OFFSET, 16, 1,
1790				       skb_mac_header(skb),
1791				       dev->hard_header_len, false);
1792		}
1793	}
1794#endif
1795}
1796
1797/* called in rcu_read_lock() section */
1798static enum skb_drop_reason
1799__mkroute_input(struct sk_buff *skb, const struct fib_result *res,
1800		struct in_device *in_dev, __be32 daddr,
1801		__be32 saddr, dscp_t dscp)
1802{
1803	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
1804	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1805	struct net_device *dev = nhc->nhc_dev;
1806	struct fib_nh_exception *fnhe;
1807	struct rtable *rth;
1808	int err;
1809	struct in_device *out_dev;
1810	bool do_cache;
1811	u32 itag = 0;
1812
1813	/* get a working reference to the output device */
1814	out_dev = __in_dev_get_rcu(dev);
1815	if (!out_dev) {
1816		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1817		return reason;
1818	}
1819
1820	err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res),
1821				  in_dev->dev, in_dev, &itag);
1822	if (err < 0) {
1823		reason = -err;
1824		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1825					 saddr);
1826
1827		goto cleanup;
1828	}
1829
1830	do_cache = res->fi && !itag;
1831	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1832	    skb->protocol == htons(ETH_P_IP)) {
1833		__be32 gw;
1834
1835		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1836		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1837		    inet_addr_onlink(out_dev, saddr, gw))
1838			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1839	}
1840
1841	if (skb->protocol != htons(ETH_P_IP)) {
 1842		/* Not IP (i.e. ARP). Do not create a route if it is
 1843		 * invalid for proxy arp. DNAT routes are always valid.
 1844		 *
 1845		 * The proxy arp feature has been extended to allow ARP
 1846		 * replies back to the same interface, to support
 1847		 * Private VLAN switch technologies. See arp.c.
1848		 */
1849		if (out_dev == in_dev &&
1850		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1851			reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE;
1852			goto cleanup;
1853		}
1854	}
1855
1856	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1857		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1858
1859	fnhe = find_exception(nhc, daddr);
1860	if (do_cache) {
1861		if (fnhe)
1862			rth = rcu_dereference(fnhe->fnhe_rth_input);
1863		else
1864			rth = rcu_dereference(nhc->nhc_rth_input);
1865		if (rt_cache_valid(rth)) {
1866			skb_dst_set_noref(skb, &rth->dst);
1867			goto out;
1868		}
1869	}
1870
1871	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1872			   IN_DEV_ORCONF(out_dev, NOXFRM));
1873	if (!rth) {
1874		reason = SKB_DROP_REASON_NOMEM;
1875		goto cleanup;
1876	}
1877
1878	rth->rt_is_input = 1;
1879	RT_CACHE_STAT_INC(in_slow_tot);
1880
1881	rth->dst.input = ip_forward;
1882
1883	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1884		       do_cache);
1885	lwtunnel_set_redirect(&rth->dst);
1886	skb_dst_set(skb, &rth->dst);
1887out:
1888	reason = SKB_NOT_DROPPED_YET;
1889cleanup:
1890	return reason;
1891}
1892
1893#ifdef CONFIG_IP_ROUTE_MULTIPATH
1894/* To make ICMP packets follow the right flow, the multipath hash is
1895 * calculated from the inner IP addresses.
1896 */
1897static void ip_multipath_l3_keys(const struct sk_buff *skb,
1898				 struct flow_keys *hash_keys)
1899{
1900	const struct iphdr *outer_iph = ip_hdr(skb);
1901	const struct iphdr *key_iph = outer_iph;
1902	const struct iphdr *inner_iph;
1903	const struct icmphdr *icmph;
1904	struct iphdr _inner_iph;
1905	struct icmphdr _icmph;
1906
1907	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1908		goto out;
1909
1910	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1911		goto out;
1912
1913	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1914				   &_icmph);
1915	if (!icmph)
1916		goto out;
1917
1918	if (!icmp_is_err(icmph->type))
1919		goto out;
1920
1921	inner_iph = skb_header_pointer(skb,
1922				       outer_iph->ihl * 4 + sizeof(_icmph),
1923				       sizeof(_inner_iph), &_inner_iph);
1924	if (!inner_iph)
1925		goto out;
1926
1927	key_iph = inner_iph;
1928out:
1929	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1930	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1931}
1932
1933static u32 fib_multipath_custom_hash_outer(const struct net *net,
1934					   const struct sk_buff *skb,
1935					   bool *p_has_inner)
1936{
1937	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1938	struct flow_keys keys, hash_keys;
1939
1940	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1941		return 0;
1942
1943	memset(&hash_keys, 0, sizeof(hash_keys));
1944	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1945
1946	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1947	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1948		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1949	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1950		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1951	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1952		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1953	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1954		hash_keys.ports.src = keys.ports.src;
1955	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1956		hash_keys.ports.dst = keys.ports.dst;
1957
1958	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1959	return fib_multipath_hash_from_keys(net, &hash_keys);
1960}
1961
1962static u32 fib_multipath_custom_hash_inner(const struct net *net,
1963					   const struct sk_buff *skb,
1964					   bool has_inner)
1965{
1966	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1967	struct flow_keys keys, hash_keys;
1968
1969	/* We assume the packet carries an encapsulation, but if none was
1970	 * encountered during dissection of the outer flow, then there is no
1971	 * point in calling the flow dissector again.
1972	 */
1973	if (!has_inner)
1974		return 0;
1975
1976	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1977		return 0;
1978
1979	memset(&hash_keys, 0, sizeof(hash_keys));
1980	skb_flow_dissect_flow_keys(skb, &keys, 0);
1981
1982	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1983		return 0;
1984
1985	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1986		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1987		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1988			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1989		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1990			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1991	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1992		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1993		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1994			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1995		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1996			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1997		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1998			hash_keys.tags.flow_label = keys.tags.flow_label;
1999	}
2000
2001	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
2002		hash_keys.basic.ip_proto = keys.basic.ip_proto;
2003	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
2004		hash_keys.ports.src = keys.ports.src;
2005	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
2006		hash_keys.ports.dst = keys.ports.dst;
2007
2008	return fib_multipath_hash_from_keys(net, &hash_keys);
2009}
2010
2011static u32 fib_multipath_custom_hash_skb(const struct net *net,
2012					 const struct sk_buff *skb)
2013{
2014	u32 mhash, mhash_inner;
2015	bool has_inner = true;
2016
2017	mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
2018	mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
2019
2020	return jhash_2words(mhash, mhash_inner, 0);
2021}
2022
2023static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2024					 const struct flowi4 *fl4)
2025{
2026	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
2027	struct flow_keys hash_keys;
2028
2029	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2030		return 0;
2031
2032	memset(&hash_keys, 0, sizeof(hash_keys));
2033	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2034	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2035		hash_keys.addrs.v4addrs.src = fl4->saddr;
2036	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2037		hash_keys.addrs.v4addrs.dst = fl4->daddr;
2038	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2039		hash_keys.basic.ip_proto = fl4->flowi4_proto;
2040	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2041		hash_keys.ports.src = fl4->fl4_sport;
2042	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2043		hash_keys.ports.dst = fl4->fl4_dport;
2044
2045	return fib_multipath_hash_from_keys(net, &hash_keys);
2046}
2047
2048/* if skb is set it will be used and fl4 can be NULL */
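     /* Quick reference for the sysctl_fib_multipath_hash_policy values
      * handled by the switch below:
      *   0 - L3 source/destination addresses (for ICMP errors, the
      *       addresses of the offending inner packet, see
      *       ip_multipath_l3_keys())
      *   1 - L4 five-tuple, or the skb's precomputed L4 hash if present
      *   2 - inner L3 addresses of encapsulated forwarded packets,
      *       falling back to L3 keys otherwise
      *   3 - custom field set chosen by sysctl_fib_multipath_hash_fields
      *       (see the fib_multipath_custom_hash_*() helpers above)
      */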
2049int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2050		       const struct sk_buff *skb, struct flow_keys *flkeys)
2051{
2052	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2053	struct flow_keys hash_keys;
2054	u32 mhash = 0;
2055
2056	switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
2057	case 0:
2058		memset(&hash_keys, 0, sizeof(hash_keys));
2059		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2060		if (skb) {
2061			ip_multipath_l3_keys(skb, &hash_keys);
2062		} else {
2063			hash_keys.addrs.v4addrs.src = fl4->saddr;
2064			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2065		}
2066		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
2067		break;
2068	case 1:
2069		/* skb is currently provided only when forwarding */
2070		if (skb) {
2071			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2072			struct flow_keys keys;
2073
2074			/* short-circuit if we already have L4 hash present */
2075			if (skb->l4_hash)
2076				return skb_get_hash_raw(skb) >> 1;
2077
2078			memset(&hash_keys, 0, sizeof(hash_keys));
2079
2080			if (!flkeys) {
2081				skb_flow_dissect_flow_keys(skb, &keys, flag);
2082				flkeys = &keys;
2083			}
2084
2085			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2086			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2087			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2088			hash_keys.ports.src = flkeys->ports.src;
2089			hash_keys.ports.dst = flkeys->ports.dst;
2090			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2091		} else {
2092			memset(&hash_keys, 0, sizeof(hash_keys));
2093			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2094			hash_keys.addrs.v4addrs.src = fl4->saddr;
2095			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2096			hash_keys.ports.src = fl4->fl4_sport;
2097			hash_keys.ports.dst = fl4->fl4_dport;
2098			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2099		}
2100		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
2101		break;
2102	case 2:
2103		memset(&hash_keys, 0, sizeof(hash_keys));
2104		/* skb is currently provided only when forwarding */
2105		if (skb) {
2106			struct flow_keys keys;
2107
2108			skb_flow_dissect_flow_keys(skb, &keys, 0);
2109			/* Inner can be v4 or v6 */
2110			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2111				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2112				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2113				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2114			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2115				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2116				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2117				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2118				hash_keys.tags.flow_label = keys.tags.flow_label;
2119				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2120			} else {
2121				/* Same as case 0 */
2122				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2123				ip_multipath_l3_keys(skb, &hash_keys);
2124			}
2125		} else {
2126			/* Same as case 0 */
2127			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2128			hash_keys.addrs.v4addrs.src = fl4->saddr;
2129			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2130		}
2131		mhash = fib_multipath_hash_from_keys(net, &hash_keys);
2132		break;
2133	case 3:
2134		if (skb)
2135			mhash = fib_multipath_custom_hash_skb(net, skb);
2136		else
2137			mhash = fib_multipath_custom_hash_fl4(net, fl4);
2138		break;
2139	}
2140
2141	if (multipath_hash)
2142		mhash = jhash_2words(mhash, multipath_hash, 0);
2143
2144	return mhash >> 1;
2145}
2146#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2147
2148static enum skb_drop_reason
2149ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
2150		 struct in_device *in_dev, __be32 daddr,
2151		 __be32 saddr, dscp_t dscp, struct flow_keys *hkeys)
2152{
2153#ifdef CONFIG_IP_ROUTE_MULTIPATH
2154	if (res->fi && fib_info_num_path(res->fi) > 1) {
2155		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2156
2157		fib_select_multipath(res, h);
2158		IPCB(skb)->flags |= IPSKB_MULTIPATH;
2159	}
2160#endif
2161
2162	/* create a routing cache entry */
2163	return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp);
2164}
2165
2166/* Implements the same saddr-related checks as ip_route_input_slow(),
2167 * assuming daddr is valid and the destination is not a local broadcast one.
2168 * Uses the provided hint instead of performing a route lookup.
2169 */
2170enum skb_drop_reason
2171ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2172		  dscp_t dscp, struct net_device *dev,
2173		  const struct sk_buff *hint)
2174{
2175	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
2176	struct in_device *in_dev = __in_dev_get_rcu(dev);
2177	struct rtable *rt = skb_rtable(hint);
2178	struct net *net = dev_net(dev);
2179	u32 tag = 0;
2180
2181	if (!in_dev)
2182		return reason;
2183
2184	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
2185		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
2186		goto martian_source;
2187	}
2188
2189	if (ipv4_is_zeronet(saddr)) {
2190		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
2191		goto martian_source;
2192	}
2193
2194	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
2195		reason = SKB_DROP_REASON_IP_LOCALNET;
2196		goto martian_source;
2197	}
2198
2199	if (rt->rt_type != RTN_LOCAL)
2200		goto skip_validate_source;
2201
2202	reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev,
2203					    in_dev, &tag);
2204	if (reason)
2205		goto martian_source;
2206
2207skip_validate_source:
2208	skb_dst_copy(skb, hint);
2209	return SKB_NOT_DROPPED_YET;
2210
2211martian_source:
2212	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2213	return reason;
2214}
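/* Usage note (a sketch, not taken from this file): the hint is typically the
 * already-routed skb of a previous packet in the same receive batch with the
 * same destination, so its dst can be reused via skb_dst_copy() above instead
 * of a full fib_lookup().  A caller would do roughly:
 *
 *	reason = ip_route_use_hint(skb, daddr, saddr, dscp, dev, hint);
 *	if (reason != SKB_NOT_DROPPED_YET)
 *		kfree_skb_reason(skb, reason);
 */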
2215
2216/* get device for dst_alloc with local routes */
2217static struct net_device *ip_rt_get_dev(struct net *net,
2218					const struct fib_result *res)
2219{
2220	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2221	struct net_device *dev = NULL;
2222
2223	if (nhc)
2224		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2225
2226	return dev ? : net->loopback_dev;
2227}
2228
2229/*
2230 *	NOTE. We drop all packets that have local source
2231 *	addresses, because every properly looped back packet
2232 *	must already have the correct destination attached by the output
2233 *	routine. Changes in the enforced policies must also be applied to
2234 *	ip_route_use_hint().
2235 *
2236 *	Such an approach solves two big problems:
2237 *	1. Non-simplex devices are handled properly.
2238 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2239 *	Called with rcu_read_lock().
2240 */
2241
2242static enum skb_drop_reason
2243ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2244		    dscp_t dscp, struct net_device *dev,
2245		    struct fib_result *res)
2246{
2247	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
2248	struct in_device *in_dev = __in_dev_get_rcu(dev);
2249	struct flow_keys *flkeys = NULL, _flkeys;
2250	struct net    *net = dev_net(dev);
2251	struct ip_tunnel_info *tun_info;
2252	int		err = -EINVAL;
2253	unsigned int	flags = 0;
2254	u32		itag = 0;
2255	struct rtable	*rth;
2256	struct flowi4	fl4;
2257	bool do_cache = true;
2258
2259	/* IP on this device is disabled. */
2260
2261	if (!in_dev)
2262		goto out;
2263
2264	/* Check for the weirdest martians, which may not be detected
2265	 * by fib_lookup.
2266	 */
2267
2268	tun_info = skb_tunnel_info(skb);
2269	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2270		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2271	else
2272		fl4.flowi4_tun_key.tun_id = 0;
2273	skb_dst_drop(skb);
2274
2275	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
2276		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
2277		goto martian_source;
2278	}
2279
2280	res->fi = NULL;
2281	res->table = NULL;
2282	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2283		goto brd_input;
2284
2285	/* Accept zero addresses only for limited broadcast;
2286	 * not sure whether this should be fixed or not. Waiting for complaints :-)
2287	 */
2288	if (ipv4_is_zeronet(saddr)) {
2289		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
2290		goto martian_source;
2291	}
2292
2293	if (ipv4_is_zeronet(daddr)) {
2294		reason = SKB_DROP_REASON_IP_INVALID_DEST;
2295		goto martian_destination;
2296	}
2297
2298	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2299	 * and calls it only once if daddr and/or saddr is a loopback address.
2300	 */
2301	if (ipv4_is_loopback(daddr)) {
2302		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
2303			reason = SKB_DROP_REASON_IP_LOCALNET;
2304			goto martian_destination;
2305		}
2306	} else if (ipv4_is_loopback(saddr)) {
2307		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
2308			reason = SKB_DROP_REASON_IP_LOCALNET;
2309			goto martian_source;
2310		}
2311	}
2312
2313	/*
2314	 *	Now we are ready to route packet.
2315	 */
2316	fl4.flowi4_l3mdev = 0;
2317	fl4.flowi4_oif = 0;
2318	fl4.flowi4_iif = dev->ifindex;
2319	fl4.flowi4_mark = skb->mark;
2320	fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
2321	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2322	fl4.flowi4_flags = 0;
2323	fl4.daddr = daddr;
2324	fl4.saddr = saddr;
2325	fl4.flowi4_uid = sock_net_uid(net, NULL);
2326	fl4.flowi4_multipath_hash = 0;
2327
2328	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2329		flkeys = &_flkeys;
2330	} else {
2331		fl4.flowi4_proto = 0;
2332		fl4.fl4_sport = 0;
2333		fl4.fl4_dport = 0;
2334	}
2335
2336	err = fib_lookup(net, &fl4, res, 0);
2337	if (err != 0) {
2338		if (!IN_DEV_FORWARD(in_dev))
2339			err = -EHOSTUNREACH;
2340		goto no_route;
2341	}
2342
2343	if (res->type == RTN_BROADCAST) {
2344		if (IN_DEV_BFORWARD(in_dev))
2345			goto make_route;
2346		/* do not cache if bc_forwarding is enabled */
2347		if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
2348			do_cache = false;
2349		goto brd_input;
2350	}
2351
2352	err = -EINVAL;
2353	if (res->type == RTN_LOCAL) {
2354		reason = fib_validate_source_reason(skb, saddr, daddr, dscp,
2355						    0, dev, in_dev, &itag);
2356		if (reason)
2357			goto martian_source;
2358		goto local_input;
2359	}
2360
2361	if (!IN_DEV_FORWARD(in_dev)) {
2362		err = -EHOSTUNREACH;
2363		goto no_route;
2364	}
2365	if (res->type != RTN_UNICAST) {
2366		reason = SKB_DROP_REASON_IP_INVALID_DEST;
2367		goto martian_destination;
2368	}
2369
2370make_route:
2371	reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp,
2372				  flkeys);
2373
2374out:
2375	return reason;
2376
2377brd_input:
2378	if (skb->protocol != htons(ETH_P_IP)) {
2379		reason = SKB_DROP_REASON_INVALID_PROTO;
2380		goto out;
2381	}
2382
2383	if (!ipv4_is_zeronet(saddr)) {
2384		reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
2385						    dev, in_dev, &itag);
2386		if (reason)
2387			goto martian_source;
2388	}
2389	flags |= RTCF_BROADCAST;
2390	res->type = RTN_BROADCAST;
2391	RT_CACHE_STAT_INC(in_brd);
2392
2393local_input:
2394	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
2395		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2396
2397	do_cache &= res->fi && !itag;
2398	if (do_cache) {
2399		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2400
2401		rth = rcu_dereference(nhc->nhc_rth_input);
2402		if (rt_cache_valid(rth)) {
2403			skb_dst_set_noref(skb, &rth->dst);
2404			reason = SKB_NOT_DROPPED_YET;
2405			goto out;
2406		}
2407	}
2408
2409	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2410			   flags | RTCF_LOCAL, res->type, false);
2411	if (!rth)
2412		goto e_nobufs;
2413
2414	rth->dst.output = ip_rt_bug;
2415#ifdef CONFIG_IP_ROUTE_CLASSID
2416	rth->dst.tclassid = itag;
2417#endif
2418	rth->rt_is_input = 1;
2419
2420	RT_CACHE_STAT_INC(in_slow_tot);
2421	if (res->type == RTN_UNREACHABLE) {
2422		rth->dst.input = ip_error;
2423		rth->dst.error = -err;
2424		rth->rt_flags	&= ~RTCF_LOCAL;
2425	}
2426
2427	if (do_cache) {
2428		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2429
2430		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2431		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2432			WARN_ON(rth->dst.input == lwtunnel_input);
2433			rth->dst.lwtstate->orig_input = rth->dst.input;
2434			rth->dst.input = lwtunnel_input;
2435		}
2436
2437		if (unlikely(!rt_cache_route(nhc, rth)))
2438			rt_add_uncached_list(rth);
2439	}
2440	skb_dst_set(skb, &rth->dst);
2441	reason = SKB_NOT_DROPPED_YET;
2442	goto out;
2443
2444no_route:
2445	RT_CACHE_STAT_INC(in_no_route);
2446	res->type = RTN_UNREACHABLE;
2447	res->fi = NULL;
2448	res->table = NULL;
2449	goto local_input;
2450
2451	/*
2452	 *	Do not cache martian addresses: they should be logged (RFC1812)
2453	 */
2454martian_destination:
2455	RT_CACHE_STAT_INC(in_martian_dst);
2456#ifdef CONFIG_IP_ROUTE_VERBOSE
2457	if (IN_DEV_LOG_MARTIANS(in_dev))
2458		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2459				     &daddr, &saddr, dev->name);
2460#endif
2461	goto out;
2462
2463e_nobufs:
2464	reason = SKB_DROP_REASON_NOMEM;
2465	goto out;
2466
2467martian_source:
2468	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2469	goto out;
2470}
2471
2472/* called with rcu_read_lock held */
2473static enum skb_drop_reason
2474ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2475		   dscp_t dscp, struct net_device *dev,
2476		   struct fib_result *res)
2477{
2478	/* Multicast recognition logic is moved from the route cache to here.
2479	 * The problem was that too many Ethernet cards have broken/missing
2480	 * hardware multicast filters :-( As a result, a host on a multicast
2481	 * network acquires a lot of useless route cache entries, e.g. for
2482	 * SDR messages from all over the world. Now we try to get rid of them.
2483	 * Really, provided the software IP multicast filter is organized
2484	 * reasonably (at least, hashed), it does not result in a slowdown
2485	 * compared with route cache reject entries.
2486	 * Note that multicast routers are not affected, because a
2487	 * route cache entry is created eventually.
2488	 */
2489	if (ipv4_is_multicast(daddr)) {
2490		enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
2491		struct in_device *in_dev = __in_dev_get_rcu(dev);
2492		int our = 0;
2493
2494		if (!in_dev)
2495			return reason;
2496
2497		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2498				      ip_hdr(skb)->protocol);
2499
2500		/* check l3 master if no match yet */
2501		if (!our && netif_is_l3_slave(dev)) {
2502			struct in_device *l3_in_dev;
2503
2504			l3_in_dev = __in_dev_get_rcu(skb->dev);
2505			if (l3_in_dev)
2506				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2507						      ip_hdr(skb)->protocol);
2508		}
2509
2510		if (our
2511#ifdef CONFIG_IP_MROUTE
2512			||
2513		    (!ipv4_is_local_multicast(daddr) &&
2514		     IN_DEV_MFORWARD(in_dev))
2515#endif
2516		   ) {
2517			reason = ip_route_input_mc(skb, daddr, saddr, dscp,
2518						   dev, our);
2519		}
2520		return reason;
2521	}
2522
2523	return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res);
2524}
2525
2526enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr,
2527					  __be32 saddr, dscp_t dscp,
2528					  struct net_device *dev)
2529{
2530	enum skb_drop_reason reason;
2531	struct fib_result res;
2532
2533	rcu_read_lock();
2534	reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res);
2535	rcu_read_unlock();
2536
2537	return reason;
2538}
2539EXPORT_SYMBOL(ip_route_input_noref);
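/* Illustrative caller sketch (not from this file): an input-path user takes
 * the addresses and DSCP from the IP header and lets the drop reason drive
 * the free:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	enum skb_drop_reason reason;
 *
 *	reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				      inet_dsfield_to_dscp(iph->tos), skb->dev);
 *	if (reason != SKB_NOT_DROPPED_YET)
 *		kfree_skb_reason(skb, reason);
 */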
2540
2541/* called with rcu_read_lock() */
2542static struct rtable *__mkroute_output(const struct fib_result *res,
2543				       const struct flowi4 *fl4, int orig_oif,
2544				       struct net_device *dev_out,
2545				       unsigned int flags)
2546{
2547	struct fib_info *fi = res->fi;
2548	struct fib_nh_exception *fnhe;
2549	struct in_device *in_dev;
2550	u16 type = res->type;
2551	struct rtable *rth;
2552	bool do_cache;
2553
2554	in_dev = __in_dev_get_rcu(dev_out);
2555	if (!in_dev)
2556		return ERR_PTR(-EINVAL);
2557
2558	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2559		if (ipv4_is_loopback(fl4->saddr) &&
2560		    !(dev_out->flags & IFF_LOOPBACK) &&
2561		    !netif_is_l3_master(dev_out))
2562			return ERR_PTR(-EINVAL);
2563
2564	if (ipv4_is_lbcast(fl4->daddr))
2565		type = RTN_BROADCAST;
2566	else if (ipv4_is_multicast(fl4->daddr))
2567		type = RTN_MULTICAST;
2568	else if (ipv4_is_zeronet(fl4->daddr))
2569		return ERR_PTR(-EINVAL);
2570
2571	if (dev_out->flags & IFF_LOOPBACK)
2572		flags |= RTCF_LOCAL;
2573
2574	do_cache = true;
2575	if (type == RTN_BROADCAST) {
2576		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2577		fi = NULL;
2578	} else if (type == RTN_MULTICAST) {
2579		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2580		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2581				     fl4->flowi4_proto))
2582			flags &= ~RTCF_LOCAL;
2583		else
2584			do_cache = false;
2585		/* If a multicast route does not exist, use
2586		 * the default one, but do not use a gateway in this case.
2587		 * Yes, it is a hack.
2588		 */
2589		if (fi && res->prefixlen < 4)
2590			fi = NULL;
2591	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2592		   (orig_oif != dev_out->ifindex)) {
2593		/* For local routes that require a particular output interface
2594		 * we do not want to cache the result.  Caching the result
2595		 * causes incorrect behaviour when there are multiple source
2596		 * addresses on the interface, the end result being that if the
2597		 * intended recipient is waiting on that interface for the
2598		 * packet he won't receive it because it will be delivered on
2599		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2600		 * be set to the loopback interface as well.
2601		 */
2602		do_cache = false;
2603	}
2604
2605	fnhe = NULL;
2606	do_cache &= fi != NULL;
2607	if (fi) {
2608		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2609		struct rtable __rcu **prth;
2610
2611		fnhe = find_exception(nhc, fl4->daddr);
2612		if (!do_cache)
2613			goto add;
2614		if (fnhe) {
2615			prth = &fnhe->fnhe_rth_output;
2616		} else {
2617			if (unlikely(fl4->flowi4_flags &
2618				     FLOWI_FLAG_KNOWN_NH &&
2619				     !(nhc->nhc_gw_family &&
2620				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2621				do_cache = false;
2622				goto add;
2623			}
2624			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2625		}
2626		rth = rcu_dereference(*prth);
2627		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2628			return rth;
2629	}
2630
2631add:
2632	rth = rt_dst_alloc(dev_out, flags, type,
2633			   IN_DEV_ORCONF(in_dev, NOXFRM));
2634	if (!rth)
2635		return ERR_PTR(-ENOBUFS);
2636
2637	rth->rt_iif = orig_oif;
2638
2639	RT_CACHE_STAT_INC(out_slow_tot);
2640
2641	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2642		if (flags & RTCF_LOCAL &&
2643		    !(dev_out->flags & IFF_LOOPBACK)) {
2644			rth->dst.output = ip_mc_output;
2645			RT_CACHE_STAT_INC(out_slow_mc);
2646		}
2647#ifdef CONFIG_IP_MROUTE
2648		if (type == RTN_MULTICAST) {
2649			if (IN_DEV_MFORWARD(in_dev) &&
2650			    !ipv4_is_local_multicast(fl4->daddr)) {
2651				rth->dst.input = ip_mr_input;
2652				rth->dst.output = ip_mc_output;
2653			}
2654		}
2655#endif
2656	}
2657
2658	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2659	lwtunnel_set_redirect(&rth->dst);
2660
2661	return rth;
2662}
2663
2664/*
2665 * Major route resolver routine.
2666 */
2667
2668struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2669					const struct sk_buff *skb)
2670{
2671	struct fib_result res = {
2672		.type		= RTN_UNSPEC,
2673		.fi		= NULL,
2674		.table		= NULL,
2675		.tclassid	= 0,
2676	};
2677	struct rtable *rth;
2678
2679	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2680	fl4->flowi4_tos &= INET_DSCP_MASK;
2681
2682	rcu_read_lock();
2683	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2684	rcu_read_unlock();
2685
2686	return rth;
2687}
2688EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2689
2690struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2691					    struct fib_result *res,
2692					    const struct sk_buff *skb)
2693{
2694	struct net_device *dev_out = NULL;
2695	int orig_oif = fl4->flowi4_oif;
2696	unsigned int flags = 0;
2697	struct rtable *rth;
2698	int err;
2699
2700	if (fl4->saddr) {
2701		if (ipv4_is_multicast(fl4->saddr) ||
2702		    ipv4_is_lbcast(fl4->saddr) ||
2703		    ipv4_is_zeronet(fl4->saddr)) {
2704			rth = ERR_PTR(-EINVAL);
2705			goto out;
2706		}
2707
2708		rth = ERR_PTR(-ENETUNREACH);
2709
2710		/* I removed the check for oif == dev_out->oif here.
2711		 * It was wrong for two reasons:
2712		 * 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2713		 *    is assigned to multiple interfaces.
2714		 * 2. Moreover, we are allowed to send packets with saddr
2715		 *    of another iface. --ANK
2716		 */
2717
2718		if (fl4->flowi4_oif == 0 &&
2719		    (ipv4_is_multicast(fl4->daddr) ||
2720		     ipv4_is_lbcast(fl4->daddr))) {
2721			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2722			dev_out = __ip_dev_find(net, fl4->saddr, false);
2723			if (!dev_out)
2724				goto out;
2725
2726			/* Special hack: the user can direct multicasts
2727			 * and limited broadcast via the necessary interface
2728			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2729			 * This hack is not just for fun, it allows
2730			 * vic, vat and friends to work.
2731			 * They bind a socket to loopback, set ttl to zero
2732			 * and expect that it will work.
2733			 * From the viewpoint of the routing cache they are broken,
2734			 * because we are not allowed to build a multicast path
2735			 * with a loopback source addr (look, the routing cache
2736			 * cannot know that ttl is zero, so the packet
2737			 * will not leave this host and the route is valid).
2738			 * Luckily, this hack is a good workaround.
2739			 */
2740
2741			fl4->flowi4_oif = dev_out->ifindex;
2742			goto make_route;
2743		}
2744
2745		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2746			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2747			if (!__ip_dev_find(net, fl4->saddr, false))
2748				goto out;
2749		}
2750	}
2751
2752
2753	if (fl4->flowi4_oif) {
2754		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2755		rth = ERR_PTR(-ENODEV);
2756		if (!dev_out)
2757			goto out;
2758
2759		/* RACE: Check return value of inet_select_addr instead. */
2760		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2761			rth = ERR_PTR(-ENETUNREACH);
2762			goto out;
2763		}
2764		if (ipv4_is_local_multicast(fl4->daddr) ||
2765		    ipv4_is_lbcast(fl4->daddr) ||
2766		    fl4->flowi4_proto == IPPROTO_IGMP) {
2767			if (!fl4->saddr)
2768				fl4->saddr = inet_select_addr(dev_out, 0,
2769							      RT_SCOPE_LINK);
2770			goto make_route;
2771		}
2772		if (!fl4->saddr) {
2773			if (ipv4_is_multicast(fl4->daddr))
2774				fl4->saddr = inet_select_addr(dev_out, 0,
2775							      fl4->flowi4_scope);
2776			else if (!fl4->daddr)
2777				fl4->saddr = inet_select_addr(dev_out, 0,
2778							      RT_SCOPE_HOST);
2779		}
2780	}
2781
2782	if (!fl4->daddr) {
2783		fl4->daddr = fl4->saddr;
2784		if (!fl4->daddr)
2785			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2786		dev_out = net->loopback_dev;
2787		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2788		res->type = RTN_LOCAL;
2789		flags |= RTCF_LOCAL;
2790		goto make_route;
2791	}
2792
2793	err = fib_lookup(net, fl4, res, 0);
2794	if (err) {
2795		res->fi = NULL;
2796		res->table = NULL;
2797		if (fl4->flowi4_oif &&
2798		    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
2799			/* Apparently, the routing tables are wrong. Assume
2800			 * that the destination is on-link.
2801			 *
2802			 * WHY? DW.
2803			 * Because we are allowed to send to iface
2804			 * even if it has NO routes and NO assigned
2805			 * addresses. When oif is specified, routing
2806			 * tables are looked up with only one purpose:
2807			 * to catch if destination is gatewayed, rather than
2808			 * direct. Moreover, if MSG_DONTROUTE is set,
2809			 * we send packet, ignoring both routing tables
2810			 * and ifaddr state. --ANK
2811			 *
2812			 *
2813			 * We could make it even if oif is unknown,
2814			 * likely IPv6, but we do not.
2815			 */
2816
2817			if (fl4->saddr == 0)
2818				fl4->saddr = inet_select_addr(dev_out, 0,
2819							      RT_SCOPE_LINK);
2820			res->type = RTN_UNICAST;
2821			goto make_route;
2822		}
2823		rth = ERR_PTR(err);
2824		goto out;
2825	}
2826
2827	if (res->type == RTN_LOCAL) {
2828		if (!fl4->saddr) {
2829			if (res->fi->fib_prefsrc)
2830				fl4->saddr = res->fi->fib_prefsrc;
2831			else
2832				fl4->saddr = fl4->daddr;
2833		}
2834
2835		/* L3 master device is the loopback for that domain */
2836		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2837			net->loopback_dev;
2838
2839		/* make sure orig_oif points to fib result device even
2840		 * though packet rx/tx happens over loopback or l3mdev
2841		 */
2842		orig_oif = FIB_RES_OIF(*res);
2843
2844		fl4->flowi4_oif = dev_out->ifindex;
2845		flags |= RTCF_LOCAL;
2846		goto make_route;
2847	}
2848
2849	fib_select_path(net, res, fl4, skb);
2850
2851	dev_out = FIB_RES_DEV(*res);
2852
2853make_route:
2854	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2855
2856out:
2857	return rth;
2858}
2859
2860static struct dst_ops ipv4_dst_blackhole_ops = {
2861	.family			= AF_INET,
2862	.default_advmss		= ipv4_default_advmss,
2863	.neigh_lookup		= ipv4_neigh_lookup,
2864	.check			= dst_blackhole_check,
2865	.cow_metrics		= dst_blackhole_cow_metrics,
2866	.update_pmtu		= dst_blackhole_update_pmtu,
2867	.redirect		= dst_blackhole_redirect,
2868	.mtu			= dst_blackhole_mtu,
2869};
2870
2871struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2872{
2873	struct rtable *ort = dst_rtable(dst_orig);
2874	struct rtable *rt;
2875
2876	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
2877	if (rt) {
2878		struct dst_entry *new = &rt->dst;
2879
2880		new->__use = 1;
2881		new->input = dst_discard;
2882		new->output = dst_discard_out;
2883
2884		new->dev = net->loopback_dev;
2885		netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
2886
2887		rt->rt_is_input = ort->rt_is_input;
2888		rt->rt_iif = ort->rt_iif;
2889		rt->rt_pmtu = ort->rt_pmtu;
2890		rt->rt_mtu_locked = ort->rt_mtu_locked;
2891
2892		rt->rt_genid = rt_genid_ipv4(net);
2893		rt->rt_flags = ort->rt_flags;
2894		rt->rt_type = ort->rt_type;
2895		rt->rt_uses_gateway = ort->rt_uses_gateway;
2896		rt->rt_gw_family = ort->rt_gw_family;
2897		if (rt->rt_gw_family == AF_INET)
2898			rt->rt_gw4 = ort->rt_gw4;
2899		else if (rt->rt_gw_family == AF_INET6)
2900			rt->rt_gw6 = ort->rt_gw6;
2901	}
2902
2903	dst_release(dst_orig);
2904
2905	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2906}
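/* Note: this blackhole conversion is used, e.g., by the xfrm code to replace
 * a route with a dst that silently discards traffic (dst_discard /
 * dst_discard_out above) while preserving enough routing state for the
 * caller; the exact call sites live outside this file.
 */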
2907
2908struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2909				    const struct sock *sk)
2910{
2911	struct rtable *rt = __ip_route_output_key(net, flp4);
2912
2913	if (IS_ERR(rt))
2914		return rt;
2915
2916	if (flp4->flowi4_proto) {
2917		flp4->flowi4_oif = rt->dst.dev->ifindex;
2918		rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
2919						  flowi4_to_flowi(flp4),
2920						  sk, 0));
2921	}
2922
2923	return rt;
2924}
2925EXPORT_SYMBOL_GPL(ip_route_output_flow);
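/* Illustrative output-lookup sketch (variable names are examples, not taken
 * from this file): fill a flow key, resolve it, and drop the reference when
 * done.
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_oif	= oif,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */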
2926
2927/* called with rcu_read_lock held */
2928static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2929			struct rtable *rt, u32 table_id, dscp_t dscp,
2930			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2931			u32 seq, unsigned int flags)
2932{
2933	struct rtmsg *r;
2934	struct nlmsghdr *nlh;
2935	unsigned long expires = 0;
2936	u32 error;
2937	u32 metrics[RTAX_MAX];
2938
2939	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2940	if (!nlh)
2941		return -EMSGSIZE;
2942
2943	r = nlmsg_data(nlh);
2944	r->rtm_family	 = AF_INET;
2945	r->rtm_dst_len	= 32;
2946	r->rtm_src_len	= 0;
2947	r->rtm_tos	= inet_dscp_to_dsfield(dscp);
2948	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2949	if (nla_put_u32(skb, RTA_TABLE, table_id))
2950		goto nla_put_failure;
2951	r->rtm_type	= rt->rt_type;
2952	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2953	r->rtm_protocol = RTPROT_UNSPEC;
2954	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2955	if (rt->rt_flags & RTCF_NOTIFY)
2956		r->rtm_flags |= RTM_F_NOTIFY;
2957	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2958		r->rtm_flags |= RTCF_DOREDIRECT;
2959
2960	if (nla_put_in_addr(skb, RTA_DST, dst))
2961		goto nla_put_failure;
2962	if (src) {
2963		r->rtm_src_len = 32;
2964		if (nla_put_in_addr(skb, RTA_SRC, src))
2965			goto nla_put_failure;
2966	}
2967	if (rt->dst.dev &&
2968	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2969		goto nla_put_failure;
2970	if (rt->dst.lwtstate &&
2971	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2972		goto nla_put_failure;
2973#ifdef CONFIG_IP_ROUTE_CLASSID
2974	if (rt->dst.tclassid &&
2975	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2976		goto nla_put_failure;
2977#endif
2978	if (fl4 && !rt_is_input_route(rt) &&
2979	    fl4->saddr != src) {
2980		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2981			goto nla_put_failure;
2982	}
2983	if (rt->rt_uses_gateway) {
2984		if (rt->rt_gw_family == AF_INET &&
2985		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2986			goto nla_put_failure;
2987		} else if (rt->rt_gw_family == AF_INET6) {
2988			int alen = sizeof(struct in6_addr);
2989			struct nlattr *nla;
2990			struct rtvia *via;
2991
2992			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2993			if (!nla)
2994				goto nla_put_failure;
2995
2996			via = nla_data(nla);
2997			via->rtvia_family = AF_INET6;
2998			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2999		}
3000	}
3001
3002	expires = rt->dst.expires;
3003	if (expires) {
3004		unsigned long now = jiffies;
3005
3006		if (time_before(now, expires))
3007			expires -= now;
3008		else
3009			expires = 0;
3010	}
3011
3012	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3013	if (rt->rt_pmtu && expires)
3014		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
3015	if (rt->rt_mtu_locked && expires)
3016		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
3017	if (rtnetlink_put_metrics(skb, metrics) < 0)
3018		goto nla_put_failure;
3019
3020	if (fl4) {
3021		if (fl4->flowi4_mark &&
3022		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
3023			goto nla_put_failure;
3024
3025		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
3026		    nla_put_u32(skb, RTA_UID,
3027				from_kuid_munged(current_user_ns(),
3028						 fl4->flowi4_uid)))
3029			goto nla_put_failure;
3030
3031		if (rt_is_input_route(rt)) {
3032#ifdef CONFIG_IP_MROUTE
3033			if (ipv4_is_multicast(dst) &&
3034			    !ipv4_is_local_multicast(dst) &&
3035			    IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
3036				int err = ipmr_get_route(net, skb,
3037							 fl4->saddr, fl4->daddr,
3038							 r, portid);
3039
3040				if (err <= 0) {
3041					if (err == 0)
3042						return 0;
3043					goto nla_put_failure;
3044				}
3045			} else
3046#endif
3047				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3048					goto nla_put_failure;
3049		}
3050	}
3051
3052	error = rt->dst.error;
3053
3054	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3055		goto nla_put_failure;
3056
3057	nlmsg_end(skb, nlh);
3058	return 0;
3059
3060nla_put_failure:
3061	nlmsg_cancel(skb, nlh);
3062	return -EMSGSIZE;
3063}
3064
3065static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3066			    struct netlink_callback *cb, u32 table_id,
3067			    struct fnhe_hash_bucket *bucket, int genid,
3068			    int *fa_index, int fa_start, unsigned int flags)
3069{
3070	int i;
3071
3072	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3073		struct fib_nh_exception *fnhe;
3074
3075		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3076		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3077			struct rtable *rt;
3078			int err;
3079
3080			if (*fa_index < fa_start)
3081				goto next;
3082
3083			if (fnhe->fnhe_genid != genid)
3084				goto next;
3085
3086			if (fnhe->fnhe_expires &&
3087			    time_after(jiffies, fnhe->fnhe_expires))
3088				goto next;
3089
3090			rt = rcu_dereference(fnhe->fnhe_rth_input);
3091			if (!rt)
3092				rt = rcu_dereference(fnhe->fnhe_rth_output);
3093			if (!rt)
3094				goto next;
3095
3096			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3097					   table_id, 0, NULL, skb,
3098					   NETLINK_CB(cb->skb).portid,
3099					   cb->nlh->nlmsg_seq, flags);
3100			if (err)
3101				return err;
3102next:
3103			(*fa_index)++;
3104		}
3105	}
3106
3107	return 0;
3108}
3109
3110int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3111		       u32 table_id, struct fib_info *fi,
3112		       int *fa_index, int fa_start, unsigned int flags)
3113{
3114	struct net *net = sock_net(cb->skb->sk);
3115	int nhsel, genid = fnhe_genid(net);
3116
3117	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3118		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3119		struct fnhe_hash_bucket *bucket;
3120		int err;
3121
3122		if (nhc->nhc_flags & RTNH_F_DEAD)
3123			continue;
3124
3125		rcu_read_lock();
3126		bucket = rcu_dereference(nhc->nhc_exceptions);
3127		err = 0;
3128		if (bucket)
3129			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3130					       genid, fa_index, fa_start,
3131					       flags);
3132		rcu_read_unlock();
3133		if (err)
3134			return err;
3135	}
3136
3137	return 0;
3138}
3139
3140static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3141						   u8 ip_proto, __be16 sport,
3142						   __be16 dport)
3143{
3144	struct sk_buff *skb;
3145	struct iphdr *iph;
3146
3147	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3148	if (!skb)
3149		return NULL;
3150
3151	/* Reserve room for dummy headers; this skb can pass
3152	 * through a good chunk of the routing engine.
3153	 */
3154	skb_reset_mac_header(skb);
3155	skb_reset_network_header(skb);
3156	skb->protocol = htons(ETH_P_IP);
3157	iph = skb_put(skb, sizeof(struct iphdr));
3158	iph->protocol = ip_proto;
3159	iph->saddr = src;
3160	iph->daddr = dst;
3161	iph->version = 0x4;
3162	iph->frag_off = 0;
3163	iph->ihl = 0x5;
3164	skb_set_transport_header(skb, skb->len);
3165
3166	switch (iph->protocol) {
3167	case IPPROTO_UDP: {
3168		struct udphdr *udph;
3169
3170		udph = skb_put_zero(skb, sizeof(struct udphdr));
3171		udph->source = sport;
3172		udph->dest = dport;
3173		udph->len = htons(sizeof(struct udphdr));
3174		udph->check = 0;
3175		break;
3176	}
3177	case IPPROTO_TCP: {
3178		struct tcphdr *tcph;
3179
3180		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3181		tcph->source	= sport;
3182		tcph->dest	= dport;
3183		tcph->doff	= sizeof(struct tcphdr) / 4;
3184		tcph->rst = 1;
3185		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3186					    src, dst, 0);
3187		break;
3188	}
3189	case IPPROTO_ICMP: {
3190		struct icmphdr *icmph;
3191
3192		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3193		icmph->type = ICMP_ECHO;
3194		icmph->code = 0;
3195	}
3196	}
3197
3198	return skb;
3199}
3200
3201static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3202				       const struct nlmsghdr *nlh,
3203				       struct nlattr **tb,
3204				       struct netlink_ext_ack *extack)
3205{
3206	struct rtmsg *rtm;
3207	int i, err;
3208
3209	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3210		NL_SET_ERR_MSG(extack,
3211			       "ipv4: Invalid header for route get request");
3212		return -EINVAL;
3213	}
3214
3215	if (!netlink_strict_get_check(skb))
3216		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3217					      rtm_ipv4_policy, extack);
3218
3219	rtm = nlmsg_data(nlh);
3220	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3221	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3222	    rtm->rtm_table || rtm->rtm_protocol ||
3223	    rtm->rtm_scope || rtm->rtm_type) {
3224		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3225		return -EINVAL;
3226	}
3227
3228	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3229			       RTM_F_LOOKUP_TABLE |
3230			       RTM_F_FIB_MATCH)) {
3231		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3232		return -EINVAL;
3233	}
3234
3235	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3236					    rtm_ipv4_policy, extack);
3237	if (err)
3238		return err;
3239
3240	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3241	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3242		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3243		return -EINVAL;
3244	}
3245
3246	for (i = 0; i <= RTA_MAX; i++) {
3247		if (!tb[i])
3248			continue;
3249
3250		switch (i) {
3251		case RTA_IIF:
3252		case RTA_OIF:
3253		case RTA_SRC:
3254		case RTA_DST:
3255		case RTA_IP_PROTO:
3256		case RTA_SPORT:
3257		case RTA_DPORT:
3258		case RTA_MARK:
3259		case RTA_UID:
3260			break;
3261		default:
3262			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3263			return -EINVAL;
3264		}
3265	}
3266
3267	return 0;
3268}
3269
3270static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3271			     struct netlink_ext_ack *extack)
3272{
3273	struct net *net = sock_net(in_skb->sk);
3274	struct nlattr *tb[RTA_MAX+1];
3275	u32 table_id = RT_TABLE_MAIN;
3276	__be16 sport = 0, dport = 0;
3277	struct fib_result res = {};
3278	u8 ip_proto = IPPROTO_UDP;
3279	struct rtable *rt = NULL;
3280	struct sk_buff *skb;
3281	struct rtmsg *rtm;
3282	struct flowi4 fl4 = {};
3283	__be32 dst = 0;
3284	__be32 src = 0;
3285	kuid_t uid;
3286	u32 iif;
3287	int err;
3288	int mark;
3289
3290	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3291	if (err < 0)
3292		return err;
3293
3294	rtm = nlmsg_data(nlh);
3295	src = nla_get_in_addr_default(tb[RTA_SRC], 0);
3296	dst = nla_get_in_addr_default(tb[RTA_DST], 0);
3297	iif = nla_get_u32_default(tb[RTA_IIF], 0);
3298	mark = nla_get_u32_default(tb[RTA_MARK], 0);
3299	if (tb[RTA_UID])
3300		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3301	else
3302		uid = (iif ? INVALID_UID : current_uid());
3303
3304	if (tb[RTA_IP_PROTO]) {
3305		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3306						  &ip_proto, AF_INET, extack);
3307		if (err)
3308			return err;
3309	}
3310
3311	if (tb[RTA_SPORT])
3312		sport = nla_get_be16(tb[RTA_SPORT]);
3313
3314	if (tb[RTA_DPORT])
3315		dport = nla_get_be16(tb[RTA_DPORT]);
3316
3317	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3318	if (!skb)
3319		return -ENOBUFS;
3320
3321	fl4.daddr = dst;
3322	fl4.saddr = src;
3323	fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK;
3324	fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
3325	fl4.flowi4_mark = mark;
3326	fl4.flowi4_uid = uid;
3327	if (sport)
3328		fl4.fl4_sport = sport;
3329	if (dport)
3330		fl4.fl4_dport = dport;
3331	fl4.flowi4_proto = ip_proto;
3332
3333	rcu_read_lock();
3334
3335	if (iif) {
3336		struct net_device *dev;
3337
3338		dev = dev_get_by_index_rcu(net, iif);
3339		if (!dev) {
3340			err = -ENODEV;
3341			goto errout_rcu;
3342		}
3343
3344		fl4.flowi4_iif = iif; /* for rt_fill_info */
3345		skb->dev	= dev;
3346		skb->mark	= mark;
3347		err = ip_route_input_rcu(skb, dst, src,
3348					 inet_dsfield_to_dscp(rtm->rtm_tos),
3349					 dev, &res) ? -EINVAL : 0;
3350
3351		rt = skb_rtable(skb);
3352		if (err == 0 && rt->dst.error)
3353			err = -rt->dst.error;
3354	} else {
3355		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3356		skb->dev = net->loopback_dev;
3357		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3358		err = 0;
3359		if (IS_ERR(rt))
3360			err = PTR_ERR(rt);
3361		else
3362			skb_dst_set(skb, &rt->dst);
3363	}
3364
3365	if (err)
3366		goto errout_rcu;
3367
3368	if (rtm->rtm_flags & RTM_F_NOTIFY)
3369		rt->rt_flags |= RTCF_NOTIFY;
3370
3371	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3372		table_id = res.table ? res.table->tb_id : 0;
3373
3374	/* reset skb for netlink reply msg */
3375	skb_trim(skb, 0);
3376	skb_reset_network_header(skb);
3377	skb_reset_transport_header(skb);
3378	skb_reset_mac_header(skb);
3379
3380	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3381		struct fib_rt_info fri;
3382
3383		if (!res.fi) {
3384			err = fib_props[res.type].error;
3385			if (!err)
3386				err = -EHOSTUNREACH;
3387			goto errout_rcu;
3388		}
3389		fri.fi = res.fi;
3390		fri.tb_id = table_id;
3391		fri.dst = res.prefix;
3392		fri.dst_len = res.prefixlen;
3393		fri.dscp = res.dscp;
3394		fri.type = rt->rt_type;
3395		fri.offload = 0;
3396		fri.trap = 0;
3397		fri.offload_failed = 0;
3398		if (res.fa_head) {
3399			struct fib_alias *fa;
3400
3401			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3402				u8 slen = 32 - fri.dst_len;
3403
3404				if (fa->fa_slen == slen &&
3405				    fa->tb_id == fri.tb_id &&
3406				    fa->fa_dscp == fri.dscp &&
3407				    fa->fa_info == res.fi &&
3408				    fa->fa_type == fri.type) {
3409					fri.offload = READ_ONCE(fa->offload);
3410					fri.trap = READ_ONCE(fa->trap);
3411					fri.offload_failed =
3412						READ_ONCE(fa->offload_failed);
3413					break;
3414				}
3415			}
3416		}
3417		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3418				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3419	} else {
3420		err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4,
3421				   skb, NETLINK_CB(in_skb).portid,
3422				   nlh->nlmsg_seq, 0);
3423	}
3424	if (err < 0)
3425		goto errout_rcu;
3426
3427	rcu_read_unlock();
3428
3429	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3430
3431errout_free:
3432	return err;
3433errout_rcu:
3434	rcu_read_unlock();
3435	kfree_skb(skb);
3436	goto errout_free;
3437}
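/* Note: this doit handler serves RTM_GETROUTE requests, which is what
 * userspace sends for "ip route get", e.g. (example command only):
 *
 *	ip route get 192.0.2.1 from 198.51.100.2 iif eth0 fibmatch
 *
 * The "fibmatch" keyword sets RTM_F_FIB_MATCH, so the matched FIB entry is
 * dumped via fib_dump_info() instead of the resolved route.
 */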
3438
3439void ip_rt_multicast_event(struct in_device *in_dev)
3440{
3441	rt_cache_flush(dev_net(in_dev->dev));
3442}
3443
3444#ifdef CONFIG_SYSCTL
3445static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3446static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3447static int ip_rt_gc_elasticity __read_mostly	= 8;
3448static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3449
3450static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write,
3451		void *buffer, size_t *lenp, loff_t *ppos)
3452{
3453	struct net *net = (struct net *)__ctl->extra1;
3454
3455	if (write) {
3456		rt_cache_flush(net);
3457		fnhe_genid_bump(net);
3458		return 0;
3459	}
3460
3461	return -EINVAL;
3462}
3463
3464static struct ctl_table ipv4_route_table[] = {
3465	{
3466		.procname	= "gc_thresh",
3467		.data		= &ipv4_dst_ops.gc_thresh,
3468		.maxlen		= sizeof(int),
3469		.mode		= 0644,
3470		.proc_handler	= proc_dointvec,
3471	},
3472	{
3473		.procname	= "max_size",
3474		.data		= &ip_rt_max_size,
3475		.maxlen		= sizeof(int),
3476		.mode		= 0644,
3477		.proc_handler	= proc_dointvec,
3478	},
3479	{
3480		/*  Deprecated. Use gc_min_interval_ms */
3481
3482		.procname	= "gc_min_interval",
3483		.data		= &ip_rt_gc_min_interval,
3484		.maxlen		= sizeof(int),
3485		.mode		= 0644,
3486		.proc_handler	= proc_dointvec_jiffies,
3487	},
3488	{
3489		.procname	= "gc_min_interval_ms",
3490		.data		= &ip_rt_gc_min_interval,
3491		.maxlen		= sizeof(int),
3492		.mode		= 0644,
3493		.proc_handler	= proc_dointvec_ms_jiffies,
3494	},
3495	{
3496		.procname	= "gc_timeout",
3497		.data		= &ip_rt_gc_timeout,
3498		.maxlen		= sizeof(int),
3499		.mode		= 0644,
3500		.proc_handler	= proc_dointvec_jiffies,
3501	},
3502	{
3503		.procname	= "gc_interval",
3504		.data		= &ip_rt_gc_interval,
3505		.maxlen		= sizeof(int),
3506		.mode		= 0644,
3507		.proc_handler	= proc_dointvec_jiffies,
3508	},
3509	{
3510		.procname	= "redirect_load",
3511		.data		= &ip_rt_redirect_load,
3512		.maxlen		= sizeof(int),
3513		.mode		= 0644,
3514		.proc_handler	= proc_dointvec,
3515	},
3516	{
3517		.procname	= "redirect_number",
3518		.data		= &ip_rt_redirect_number,
3519		.maxlen		= sizeof(int),
3520		.mode		= 0644,
3521		.proc_handler	= proc_dointvec,
3522	},
3523	{
3524		.procname	= "redirect_silence",
3525		.data		= &ip_rt_redirect_silence,
3526		.maxlen		= sizeof(int),
3527		.mode		= 0644,
3528		.proc_handler	= proc_dointvec,
3529	},
3530	{
3531		.procname	= "error_cost",
3532		.data		= &ip_rt_error_cost,
3533		.maxlen		= sizeof(int),
3534		.mode		= 0644,
3535		.proc_handler	= proc_dointvec,
3536	},
3537	{
3538		.procname	= "error_burst",
3539		.data		= &ip_rt_error_burst,
3540		.maxlen		= sizeof(int),
3541		.mode		= 0644,
3542		.proc_handler	= proc_dointvec,
3543	},
3544	{
3545		.procname	= "gc_elasticity",
3546		.data		= &ip_rt_gc_elasticity,
3547		.maxlen		= sizeof(int),
3548		.mode		= 0644,
3549		.proc_handler	= proc_dointvec,
3550	},
3551};
3552
3553static const char ipv4_route_flush_procname[] = "flush";
3554
3555static struct ctl_table ipv4_route_netns_table[] = {
3556	{
3557		.procname	= ipv4_route_flush_procname,
3558		.maxlen		= sizeof(int),
3559		.mode		= 0200,
3560		.proc_handler	= ipv4_sysctl_rtcache_flush,
3561	},
3562	{
3563		.procname       = "min_pmtu",
3564		.data           = &init_net.ipv4.ip_rt_min_pmtu,
3565		.maxlen         = sizeof(int),
3566		.mode           = 0644,
3567		.proc_handler   = proc_dointvec_minmax,
3568		.extra1         = &ip_min_valid_pmtu,
3569	},
3570	{
3571		.procname       = "mtu_expires",
3572		.data           = &init_net.ipv4.ip_rt_mtu_expires,
3573		.maxlen         = sizeof(int),
3574		.mode           = 0644,
3575		.proc_handler   = proc_dointvec_jiffies,
3576	},
3577	{
3578		.procname   = "min_adv_mss",
3579		.data       = &init_net.ipv4.ip_rt_min_advmss,
3580		.maxlen     = sizeof(int),
3581		.mode       = 0644,
3582		.proc_handler   = proc_dointvec,
3583	},
3584};
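/* These entries are registered under "net/ipv4/route" below, so they show up
 * as /proc/sys/net/ipv4/route/flush, .../min_pmtu, .../mtu_expires and
 * .../min_adv_mss; writing to "flush" invalidates cached routes via
 * ipv4_sysctl_rtcache_flush() (rt_cache_flush() plus fnhe_genid_bump()).
 */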
3585
3586static __net_init int sysctl_route_net_init(struct net *net)
3587{
3588	struct ctl_table *tbl;
3589	size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
3590
3591	tbl = ipv4_route_netns_table;
3592	if (!net_eq(net, &init_net)) {
3593		int i;
3594
3595		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
3596		if (!tbl)
3597			goto err_dup;
3598
3599		/* Don't export non-whitelisted sysctls to unprivileged users */
3600		if (net->user_ns != &init_user_ns) {
3601			if (tbl[0].procname != ipv4_route_flush_procname)
3602				table_size = 0;
3603		}
3604
3605		/* Update the variables to point into the current struct net,
3606		 * except for the first element, "flush".
3607		 */
3608		for (i = 1; i < table_size; i++)
3609			tbl[i].data += (void *)net - (void *)&init_net;
3610	}
3611	tbl[0].extra1 = net;
3612
3613	net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
3614						     tbl, table_size);
3615	if (!net->ipv4.route_hdr)
3616		goto err_reg;
3617	return 0;
3618
3619err_reg:
3620	if (tbl != ipv4_route_netns_table)
3621		kfree(tbl);
3622err_dup:
3623	return -ENOMEM;
3624}
3625
3626static __net_exit void sysctl_route_net_exit(struct net *net)
3627{
3628	const struct ctl_table *tbl;
3629
3630	tbl = net->ipv4.route_hdr->ctl_table_arg;
3631	unregister_net_sysctl_table(net->ipv4.route_hdr);
3632	BUG_ON(tbl == ipv4_route_netns_table);
3633	kfree(tbl);
3634}
3635
3636static __net_initdata struct pernet_operations sysctl_route_ops = {
3637	.init = sysctl_route_net_init,
3638	.exit = sysctl_route_net_exit,
3639};
3640#endif
3641
3642static __net_init int netns_ip_rt_init(struct net *net)
3643{
3644	/* Set default value for namespaceified sysctls */
3645	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
3646	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
3647	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
3648	return 0;
3649}
3650
3651static struct pernet_operations __net_initdata ip_rt_ops = {
3652	.init = netns_ip_rt_init,
3653};
3654
3655static __net_init int rt_genid_init(struct net *net)
3656{
3657	atomic_set(&net->ipv4.rt_genid, 0);
3658	atomic_set(&net->fnhe_genid, 0);
3659	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
3660	return 0;
3661}
3662
3663static __net_initdata struct pernet_operations rt_genid_ops = {
3664	.init = rt_genid_init,
3665};
3666
3667static int __net_init ipv4_inetpeer_init(struct net *net)
3668{
3669	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3670
3671	if (!bp)
3672		return -ENOMEM;
3673	inet_peer_base_init(bp);
3674	net->ipv4.peers = bp;
3675	return 0;
3676}
3677
3678static void __net_exit ipv4_inetpeer_exit(struct net *net)
3679{
3680	struct inet_peer_base *bp = net->ipv4.peers;
3681
3682	net->ipv4.peers = NULL;
3683	inetpeer_invalidate_tree(bp);
3684	kfree(bp);
3685}
3686
3687static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3688	.init	=	ipv4_inetpeer_init,
3689	.exit	=	ipv4_inetpeer_exit,
3690};
3691
3692#ifdef CONFIG_IP_ROUTE_CLASSID
3693struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3694#endif /* CONFIG_IP_ROUTE_CLASSID */
3695
3696static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = {
3697	{.protocol = PF_INET, .msgtype = RTM_GETROUTE,
3698	 .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
3699};
3700
3701int __init ip_rt_init(void)
3702{
3703	void *idents_hash;
3704	int cpu;
3705
3706	/* For modern hosts, this will use 2 MB of memory */
3707	idents_hash = alloc_large_system_hash("IP idents",
3708					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3709					      0,
3710					      16, /* one bucket per 64 KB */
3711					      HASH_ZERO,
3712					      NULL,
3713					      &ip_idents_mask,
3714					      2048,
3715					      256*1024);
3716
3717	ip_idents = idents_hash;
3718
3719	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3720
3721	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3722
3723	for_each_possible_cpu(cpu) {
3724		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3725
3726		INIT_LIST_HEAD(&ul->head);
3727		spin_lock_init(&ul->lock);
3728	}
3729#ifdef CONFIG_IP_ROUTE_CLASSID
3730	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3731	if (!ip_rt_acct)
3732		panic("IP: failed to allocate ip_rt_acct\n");
3733#endif
3734
3735	ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
3736					      SLAB_HWCACHE_ALIGN | SLAB_PANIC);
3737
3738	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3739
3740	if (dst_entries_init(&ipv4_dst_ops) < 0)
3741		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3742
3743	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3744		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3745
3746	ipv4_dst_ops.gc_thresh = ~0;
3747	ip_rt_max_size = INT_MAX;
3748
3749	devinet_init();
3750	ip_fib_init();
3751
3752	if (ip_rt_proc_init())
3753		pr_err("Unable to create route proc files\n");
3754#ifdef CONFIG_XFRM
3755	xfrm_init();
3756	xfrm4_init();
3757#endif
3758	rtnl_register_many(ip_rt_rtnl_msg_handlers);
3759
3760#ifdef CONFIG_SYSCTL
3761	register_pernet_subsys(&sysctl_route_ops);
3762#endif
3763	register_pernet_subsys(&ip_rt_ops);
3764	register_pernet_subsys(&rt_genid_ops);
3765	register_pernet_subsys(&ipv4_inetpeer_ops);
3766	return 0;
3767}
3768
3769#ifdef CONFIG_SYSCTL
3770/*
3771 * We really need to sanitize the damn ipv4 init order, then all
3772 * this nonsense will go away.
3773 */
3774void __init ip_static_sysctl_init(void)
3775{
3776	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3777}
3778#endif