v4.17
   1/*
   2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   3 *		operating system.  INET is implemented using the  BSD Socket
   4 *		interface as the means of communication with the user level.
   5 *
   6 *		ROUTE - implementation of the IP router.
   7 *
   8 * Authors:	Ross Biro
   9 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  11 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13 *
  14 * Fixes:
  15 *		Alan Cox	:	Verify area fixes.
  16 *		Alan Cox	:	cli() protects routing changes
  17 *		Rui Oliveira	:	ICMP routing table updates
  18 *		(rco@di.uminho.pt)	Routing table insertion and update
  19 *		Linus Torvalds	:	Rewrote bits to be sensible
  20 *		Alan Cox	:	Added BSD route gw semantics
  21 *		Alan Cox	:	Super /proc >4K
  22 *		Alan Cox	:	MTU in route table
  23 *		Alan Cox	: 	MSS actually. Also added the window
  24 *					clamper.
  25 *		Sam Lantinga	:	Fixed route matching in rt_del()
  26 *		Alan Cox	:	Routing cache support.
  27 *		Alan Cox	:	Removed compatibility cruft.
  28 *		Alan Cox	:	RTF_REJECT support.
  29 *		Alan Cox	:	TCP irtt support.
  30 *		Jonathan Naylor	:	Added Metric support.
  31 *	Miquel van Smoorenburg	:	BSD API fixes.
  32 *	Miquel van Smoorenburg	:	Metrics.
  33 *		Alan Cox	:	Use __u32 properly
   34 *		Alan Cox	:	Aligned routing errors more closely with BSD;
   35 *					our system is still very different.
  36 *		Alan Cox	:	Faster /proc handling
  37 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  38 *					routing caches and better behaviour.
  39 *
  40 *		Olaf Erb	:	irtt wasn't being copied right.
  41 *		Bjorn Ekwall	:	Kerneld route support.
  42 *		Alan Cox	:	Multicast fixed (I hope)
  43 * 		Pavel Krauz	:	Limited broadcast fixed
  44 *		Mike McLagan	:	Routing by source
  45 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  46 *					route.c and rewritten from scratch.
  47 *		Andi Kleen	:	Load-limit warning messages.
  48 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  49 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  50 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  51 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  52 *		Marc Boucher	:	routing by fwmark
  53 *	Robert Olsson		:	Added rt_cache statistics
  54 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  55 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  56 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  57 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  58 *
  59 *		This program is free software; you can redistribute it and/or
  60 *		modify it under the terms of the GNU General Public License
  61 *		as published by the Free Software Foundation; either version
  62 *		2 of the License, or (at your option) any later version.
  63 */
  64
  65#define pr_fmt(fmt) "IPv4: " fmt
  66
  67#include <linux/module.h>
  68#include <linux/uaccess.h>
  69#include <linux/bitops.h>
  70#include <linux/types.h>
  71#include <linux/kernel.h>
  72#include <linux/mm.h>
  73#include <linux/string.h>
  74#include <linux/socket.h>
  75#include <linux/sockios.h>
  76#include <linux/errno.h>
  77#include <linux/in.h>
  78#include <linux/inet.h>
  79#include <linux/netdevice.h>
  80#include <linux/proc_fs.h>
  81#include <linux/init.h>
  82#include <linux/skbuff.h>
  83#include <linux/inetdevice.h>
  84#include <linux/igmp.h>
  85#include <linux/pkt_sched.h>
  86#include <linux/mroute.h>
  87#include <linux/netfilter_ipv4.h>
  88#include <linux/random.h>
  89#include <linux/rcupdate.h>
  90#include <linux/times.h>
  91#include <linux/slab.h>
  92#include <linux/jhash.h>
  93#include <net/dst.h>
  94#include <net/dst_metadata.h>
 
  95#include <net/net_namespace.h>
  96#include <net/protocol.h>
  97#include <net/ip.h>
  98#include <net/route.h>
  99#include <net/inetpeer.h>
 100#include <net/sock.h>
 101#include <net/ip_fib.h>
 102#include <net/arp.h>
 103#include <net/tcp.h>
 104#include <net/icmp.h>
 105#include <net/xfrm.h>
 106#include <net/lwtunnel.h>
 107#include <net/netevent.h>
 108#include <net/rtnetlink.h>
 109#ifdef CONFIG_SYSCTL
 110#include <linux/sysctl.h>
 111#endif
 112#include <net/secure_seq.h>
 113#include <net/ip_tunnels.h>
 114#include <net/l3mdev.h>
 115
 116#include "fib_lookup.h"
 117
 118#define RT_FL_TOS(oldflp4) \
 119	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121#define RT_GC_TIMEOUT (300*HZ)
 122
 123static int ip_rt_max_size;
 124static int ip_rt_redirect_number __read_mostly	= 9;
 125static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 126static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 127static int ip_rt_error_cost __read_mostly	= HZ;
 128static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 129static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
 130static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
 131static int ip_rt_min_advmss __read_mostly	= 256;
 132
 133static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 134
 135/*
 136 *	Interface to generic destination cache.
 137 */
 138
 139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 
 140static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 141static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
 
 142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143static void		 ipv4_link_failure(struct sk_buff *skb);
 144static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145					   struct sk_buff *skb, u32 mtu);
 
 146static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147					struct sk_buff *skb);
 148static void		ipv4_dst_destroy(struct dst_entry *dst);
 149
 150static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151{
 152	WARN_ON(1);
 153	return NULL;
 154}
 155
 156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157					   struct sk_buff *skb,
 158					   const void *daddr);
 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161static struct dst_ops ipv4_dst_ops = {
 162	.family =		AF_INET,
 163	.check =		ipv4_dst_check,
 164	.default_advmss =	ipv4_default_advmss,
 165	.mtu =			ipv4_mtu,
 166	.cow_metrics =		ipv4_cow_metrics,
 167	.destroy =		ipv4_dst_destroy,
 168	.negative_advice =	ipv4_negative_advice,
 169	.link_failure =		ipv4_link_failure,
 170	.update_pmtu =		ip_rt_update_pmtu,
 171	.redirect =		ip_do_redirect,
 172	.local_out =		__ip_local_out,
 173	.neigh_lookup =		ipv4_neigh_lookup,
 174	.confirm_neigh =	ipv4_confirm_neigh,
 175};
 176
 177#define ECN_OR_COST(class)	TC_PRIO_##class
 178
 179const __u8 ip_tos2prio[16] = {
 180	TC_PRIO_BESTEFFORT,
 181	ECN_OR_COST(BESTEFFORT),
 182	TC_PRIO_BESTEFFORT,
 183	ECN_OR_COST(BESTEFFORT),
 184	TC_PRIO_BULK,
 185	ECN_OR_COST(BULK),
 186	TC_PRIO_BULK,
 187	ECN_OR_COST(BULK),
 188	TC_PRIO_INTERACTIVE,
 189	ECN_OR_COST(INTERACTIVE),
 190	TC_PRIO_INTERACTIVE,
 191	ECN_OR_COST(INTERACTIVE),
 192	TC_PRIO_INTERACTIVE_BULK,
 193	ECN_OR_COST(INTERACTIVE_BULK),
 194	TC_PRIO_INTERACTIVE_BULK,
 195	ECN_OR_COST(INTERACTIVE_BULK)
 196};
 197EXPORT_SYMBOL(ip_tos2prio);
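/*
 * Editor's note -- illustrative sketch, not part of route.c.  ip_tos2prio
 * maps the legacy 4-bit IP TOS field (ECN bit masked off) to a pkt_sched
 * priority band.  Assuming the usual helper form, rt_tos2priority() in
 * include/net/route.h indexes the table roughly as:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. tos = 0x10 (IPTOS_LOWDELAY) gives index 8, i.e. TC_PRIO_INTERACTIVE.
 */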
 198
 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202#ifdef CONFIG_PROC_FS
 203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204{
 205	if (*pos)
 206		return NULL;
 207	return SEQ_START_TOKEN;
 208}
 209
 210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211{
 212	++*pos;
 213	return NULL;
 214}
 215
 216static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217{
 218}
 219
 220static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221{
 222	if (v == SEQ_START_TOKEN)
 223		seq_printf(seq, "%-127s\n",
 224			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226			   "HHUptod\tSpecDst");
 227	return 0;
 228}
 229
 230static const struct seq_operations rt_cache_seq_ops = {
 231	.start  = rt_cache_seq_start,
 232	.next   = rt_cache_seq_next,
 233	.stop   = rt_cache_seq_stop,
 234	.show   = rt_cache_seq_show,
 235};
 236
 237static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238{
 239	return seq_open(file, &rt_cache_seq_ops);
 240}
 241
 242static const struct file_operations rt_cache_seq_fops = {
 243	.open	 = rt_cache_seq_open,
 244	.read	 = seq_read,
 245	.llseek	 = seq_lseek,
 246	.release = seq_release,
 247};
 248
 249
 250static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251{
 252	int cpu;
 253
 254	if (*pos == 0)
 255		return SEQ_START_TOKEN;
 256
 257	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258		if (!cpu_possible(cpu))
 259			continue;
 260		*pos = cpu+1;
 261		return &per_cpu(rt_cache_stat, cpu);
 262	}
 263	return NULL;
 264}
 265
 266static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267{
 268	int cpu;
 269
 270	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271		if (!cpu_possible(cpu))
 272			continue;
 273		*pos = cpu+1;
 274		return &per_cpu(rt_cache_stat, cpu);
 275	}
 
 276	return NULL;
 277
 278}
 279
 280static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281{
 282
 283}
 284
 285static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286{
 287	struct rt_cache_stat *st = v;
 288
 289	if (v == SEQ_START_TOKEN) {
 290		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291		return 0;
 292	}
 293
 294	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 
 296		   dst_entries_get_slow(&ipv4_dst_ops),
 297		   0, /* st->in_hit */
 298		   st->in_slow_tot,
 299		   st->in_slow_mc,
 300		   st->in_no_route,
 301		   st->in_brd,
 302		   st->in_martian_dst,
 303		   st->in_martian_src,
 304
 305		   0, /* st->out_hit */
 306		   st->out_slow_tot,
 307		   st->out_slow_mc,
 308
 309		   0, /* st->gc_total */
 310		   0, /* st->gc_ignored */
 311		   0, /* st->gc_goal_miss */
 312		   0, /* st->gc_dst_overflow */
 313		   0, /* st->in_hlist_search */
 314		   0  /* st->out_hlist_search */
 315		);
 316	return 0;
 317}
 318
 319static const struct seq_operations rt_cpu_seq_ops = {
 320	.start  = rt_cpu_seq_start,
 321	.next   = rt_cpu_seq_next,
 322	.stop   = rt_cpu_seq_stop,
 323	.show   = rt_cpu_seq_show,
 324};
 325
 326
 327static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328{
 329	return seq_open(file, &rt_cpu_seq_ops);
 330}
 331
 332static const struct file_operations rt_cpu_seq_fops = {
 333	.open	 = rt_cpu_seq_open,
 334	.read	 = seq_read,
 335	.llseek	 = seq_lseek,
 336	.release = seq_release,
 337};
 338
 339#ifdef CONFIG_IP_ROUTE_CLASSID
 340static int rt_acct_proc_show(struct seq_file *m, void *v)
 341{
 342	struct ip_rt_acct *dst, *src;
 343	unsigned int i, j;
 344
 345	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346	if (!dst)
 347		return -ENOMEM;
 348
 349	for_each_possible_cpu(i) {
 350		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351		for (j = 0; j < 256; j++) {
 352			dst[j].o_bytes   += src[j].o_bytes;
 353			dst[j].o_packets += src[j].o_packets;
 354			dst[j].i_bytes   += src[j].i_bytes;
 355			dst[j].i_packets += src[j].i_packets;
 356		}
 357	}
 358
 359	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360	kfree(dst);
 361	return 0;
 362}
 363
 364static int rt_acct_proc_open(struct inode *inode, struct file *file)
 365{
 366	return single_open(file, rt_acct_proc_show, NULL);
 367}
 368
 369static const struct file_operations rt_acct_proc_fops = {
 370	.open		= rt_acct_proc_open,
 371	.read		= seq_read,
 372	.llseek		= seq_lseek,
 373	.release	= single_release,
 374};
 375#endif
 376
 377static int __net_init ip_rt_do_proc_init(struct net *net)
 378{
 379	struct proc_dir_entry *pde;
 380
 381	pde = proc_create("rt_cache", 0444, net->proc_net,
 382			  &rt_cache_seq_fops);
 383	if (!pde)
 384		goto err1;
 385
 386	pde = proc_create("rt_cache", 0444,
 387			  net->proc_net_stat, &rt_cpu_seq_fops);
 388	if (!pde)
 389		goto err2;
 390
 391#ifdef CONFIG_IP_ROUTE_CLASSID
 392	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 
 393	if (!pde)
 394		goto err3;
 395#endif
 396	return 0;
 397
 398#ifdef CONFIG_IP_ROUTE_CLASSID
 399err3:
 400	remove_proc_entry("rt_cache", net->proc_net_stat);
 401#endif
 402err2:
 403	remove_proc_entry("rt_cache", net->proc_net);
 404err1:
 405	return -ENOMEM;
 406}
 407
 408static void __net_exit ip_rt_do_proc_exit(struct net *net)
 409{
 410	remove_proc_entry("rt_cache", net->proc_net_stat);
 411	remove_proc_entry("rt_cache", net->proc_net);
 412#ifdef CONFIG_IP_ROUTE_CLASSID
 413	remove_proc_entry("rt_acct", net->proc_net);
 414#endif
 415}
 416
 417static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 418	.init = ip_rt_do_proc_init,
 419	.exit = ip_rt_do_proc_exit,
 420};
 421
 422static int __init ip_rt_proc_init(void)
 423{
 424	return register_pernet_subsys(&ip_rt_proc_ops);
 425}
 426
 427#else
 428static inline int ip_rt_proc_init(void)
 429{
 430	return 0;
 431}
 432#endif /* CONFIG_PROC_FS */
 433
 434static inline bool rt_is_expired(const struct rtable *rth)
 435{
 436	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 437}
 438
 439void rt_cache_flush(struct net *net)
 440{
 441	rt_genid_bump_ipv4(net);
 442}
 443
 444static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 445					   struct sk_buff *skb,
 446					   const void *daddr)
 447{
 
 448	struct net_device *dev = dst->dev;
 449	const __be32 *pkey = daddr;
 450	const struct rtable *rt;
 451	struct neighbour *n;
 452
 453	rt = (const struct rtable *) dst;
 454	if (rt->rt_gateway)
 455		pkey = (const __be32 *) &rt->rt_gateway;
 456	else if (skb)
 457		pkey = &ip_hdr(skb)->daddr;
 458
 459	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 460	if (n)
 461		return n;
 462	return neigh_create(&arp_tbl, pkey, dev);
 463}
 464
 465static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 466{
 
 467	struct net_device *dev = dst->dev;
 468	const __be32 *pkey = daddr;
 469	const struct rtable *rt;
 470
 471	rt = (const struct rtable *)dst;
 472	if (rt->rt_gateway)
 473		pkey = (const __be32 *)&rt->rt_gateway;
 474	else if (!daddr ||
 
 475		 (rt->rt_flags &
 476		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 477		return;
 478
 479	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 480}
 481
 482#define IP_IDENTS_SZ 2048u
 483
 484static atomic_t *ip_idents __read_mostly;
 485static u32 *ip_tstamps __read_mostly;
 486
 487/* In order to protect privacy, we add a perturbation to identifiers
  488 * if one generator is seldom used. This makes it hard for an attacker
 489 * to infer how many packets were sent between two points in time.
 490 */
 491u32 ip_idents_reserve(u32 hash, int segs)
 492{
 493	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 494	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 495	u32 old = READ_ONCE(*p_tstamp);
 496	u32 now = (u32)jiffies;
 497	u32 new, delta = 0;
 498
 499	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 500		delta = prandom_u32_max(now - old);
 501
 502	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
 503	do {
 504		old = (u32)atomic_read(p_id);
 505		new = old + delta + segs;
 506	} while (atomic_cmpxchg(p_id, old, new) != old);
 507
 508	return new - segs;
 509}
 510EXPORT_SYMBOL(ip_idents_reserve);
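/*
 * Editor's note -- illustrative sketch, not part of route.c.  A caller
 * reserves a block of consecutive IP IDs for one (possibly GSO-segmented)
 * packet; the return value is the first ID of the block.  The random
 * delta added after an idle period keeps consecutive IDs from revealing
 * how many packets were sent in between.  A minimal sketch, mirroring
 * __ip_select_ident() below:
 *
 *	u32 hash = jhash_3words((__force u32)iph->daddr,
 *				(__force u32)iph->saddr,
 *				iph->protocol ^ net_hash_mix(net),
 *				ip_idents_hashrnd);
 *	u32 first = ip_idents_reserve(hash, segs);
 *
 * The segments then use IDs first, first + 1, ..., first + segs - 1.
 */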
 511
 512void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 513{
 514	static u32 ip_idents_hashrnd __read_mostly;
 515	u32 hash, id;
 516
 517	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 518
 519	hash = jhash_3words((__force u32)iph->daddr,
 520			    (__force u32)iph->saddr,
 521			    iph->protocol ^ net_hash_mix(net),
 522			    ip_idents_hashrnd);
 523	id = ip_idents_reserve(hash, segs);
 524	iph->id = htons(id);
 525}
 526EXPORT_SYMBOL(__ip_select_ident);
 527
 528static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 529			     const struct sock *sk,
 530			     const struct iphdr *iph,
 531			     int oif, u8 tos,
 532			     u8 prot, u32 mark, int flow_flags)
 533{
 534	if (sk) {
 535		const struct inet_sock *inet = inet_sk(sk);
 536
 
 537		oif = sk->sk_bound_dev_if;
 538		mark = sk->sk_mark;
 539		tos = RT_CONN_FLAGS(sk);
 540		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 541	}
 542	flowi4_init_output(fl4, oif, mark, tos,
 543			   RT_SCOPE_UNIVERSE, prot,
 544			   flow_flags,
 545			   iph->daddr, iph->saddr, 0, 0,
 
 546			   sock_net_uid(net, sk));
 547}
 548
 549static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 550			       const struct sock *sk)
 551{
 552	const struct net *net = dev_net(skb->dev);
 553	const struct iphdr *iph = ip_hdr(skb);
 554	int oif = skb->dev->ifindex;
 555	u8 tos = RT_TOS(iph->tos);
 556	u8 prot = iph->protocol;
 557	u32 mark = skb->mark;
 
 558
 559	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 560}
 561
 562static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 563{
 564	const struct inet_sock *inet = inet_sk(sk);
 565	const struct ip_options_rcu *inet_opt;
 566	__be32 daddr = inet->inet_daddr;
 567
 568	rcu_read_lock();
 569	inet_opt = rcu_dereference(inet->inet_opt);
 570	if (inet_opt && inet_opt->opt.srr)
 571		daddr = inet_opt->opt.faddr;
 572	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 573			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 574			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 575			   inet_sk_flowi_flags(sk),
 576			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 577	rcu_read_unlock();
 578}
 579
 580static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 581				 const struct sk_buff *skb)
 582{
 583	if (skb)
 584		build_skb_flow_key(fl4, skb, sk);
 585	else
 586		build_sk_flow_key(fl4, sk);
 587}
 588
 589static DEFINE_SPINLOCK(fnhe_lock);
 590
 591static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 592{
 593	struct rtable *rt;
 594
 595	rt = rcu_dereference(fnhe->fnhe_rth_input);
 596	if (rt) {
 597		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 598		dst_dev_put(&rt->dst);
 599		dst_release(&rt->dst);
 600	}
 601	rt = rcu_dereference(fnhe->fnhe_rth_output);
 602	if (rt) {
 603		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 604		dst_dev_put(&rt->dst);
 605		dst_release(&rt->dst);
 606	}
 607}
 608
 609static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 610{
 611	struct fib_nh_exception *fnhe, *oldest;
 
 612
 613	oldest = rcu_dereference(hash->chain);
 614	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 615	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 616		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 617			oldest = fnhe;
 618	}
 619	fnhe_flush_routes(oldest);
 620	return oldest;
 
 621}
 622
 623static inline u32 fnhe_hashfun(__be32 daddr)
 624{
 625	static u32 fnhe_hashrnd __read_mostly;
 626	u32 hval;
 627
 628	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 629	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 630	return hash_32(hval, FNHE_HASH_SHIFT);
 631}
 632
 633static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 634{
 635	rt->rt_pmtu = fnhe->fnhe_pmtu;
 636	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 637	rt->dst.expires = fnhe->fnhe_expires;
 638
 639	if (fnhe->fnhe_gw) {
 640		rt->rt_flags |= RTCF_REDIRECTED;
 641		rt->rt_gateway = fnhe->fnhe_gw;
 642		rt->rt_uses_gateway = 1;
 643	}
 644}
 645
 646static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 647				  u32 pmtu, bool lock, unsigned long expires)
 
 648{
 649	struct fnhe_hash_bucket *hash;
 650	struct fib_nh_exception *fnhe;
 651	struct rtable *rt;
 652	u32 genid, hval;
 653	unsigned int i;
 654	int depth;
 655
 656	genid = fnhe_genid(dev_net(nh->nh_dev));
 657	hval = fnhe_hashfun(daddr);
 658
 659	spin_lock_bh(&fnhe_lock);
 660
 661	hash = rcu_dereference(nh->nh_exceptions);
 662	if (!hash) {
 663		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 664		if (!hash)
 665			goto out_unlock;
 666		rcu_assign_pointer(nh->nh_exceptions, hash);
 667	}
 668
 669	hash += hval;
 670
 671	depth = 0;
 672	for (fnhe = rcu_dereference(hash->chain); fnhe;
 673	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 674		if (fnhe->fnhe_daddr == daddr)
 675			break;
 676		depth++;
 677	}
 678
 679	if (fnhe) {
 680		if (fnhe->fnhe_genid != genid)
 681			fnhe->fnhe_genid = genid;
 682		if (gw)
 683			fnhe->fnhe_gw = gw;
 684		if (pmtu) {
 685			fnhe->fnhe_pmtu = pmtu;
 686			fnhe->fnhe_mtu_locked = lock;
 687		}
 688		fnhe->fnhe_expires = max(1UL, expires);
 689		/* Update all cached dsts too */
 690		rt = rcu_dereference(fnhe->fnhe_rth_input);
 691		if (rt)
 692			fill_route_from_fnhe(rt, fnhe);
 693		rt = rcu_dereference(fnhe->fnhe_rth_output);
 694		if (rt)
 695			fill_route_from_fnhe(rt, fnhe);
 696	} else {
 697		if (depth > FNHE_RECLAIM_DEPTH)
 698			fnhe = fnhe_oldest(hash);
 699		else {
 700			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 701			if (!fnhe)
 702				goto out_unlock;
 703
 704			fnhe->fnhe_next = hash->chain;
 705			rcu_assign_pointer(hash->chain, fnhe);
 
 706		}
 707		fnhe->fnhe_genid = genid;
 708		fnhe->fnhe_daddr = daddr;
 709		fnhe->fnhe_gw = gw;
 710		fnhe->fnhe_pmtu = pmtu;
 711		fnhe->fnhe_mtu_locked = lock;
 712		fnhe->fnhe_expires = max(1UL, expires);
 713
 714		/* Exception created; mark the cached routes for the nexthop
 715		 * stale, so anyone caching it rechecks if this exception
 716		 * applies to them.
 717		 */
 718		rt = rcu_dereference(nh->nh_rth_input);
 719		if (rt)
 720			rt->dst.obsolete = DST_OBSOLETE_KILL;
 721
 722		for_each_possible_cpu(i) {
 723			struct rtable __rcu **prt;
 724			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 
 725			rt = rcu_dereference(*prt);
 726			if (rt)
 727				rt->dst.obsolete = DST_OBSOLETE_KILL;
 728		}
 729	}
 730
 731	fnhe->fnhe_stamp = jiffies;
 732
 733out_unlock:
 734	spin_unlock_bh(&fnhe_lock);
 735}
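/*
 * Editor's note -- illustrative sketch, not part of route.c.  Next-hop
 * exceptions hold per-destination state learned at run time (a redirect
 * gateway, a path MTU) in a small per-nexthop hash keyed by daddr.  For
 * example, __ip_do_redirect() below records a learned gateway with
 *
 *	update_or_create_fnhe(nh, fl4->daddr, new_gw, 0, false,
 *			      jiffies + ip_rt_gc_timeout);
 *
 * and later lookups find the entry via find_exception(nh, daddr) and
 * attach it to a route with rt_bind_exception().
 */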
 736
 737static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 738			     bool kill_route)
 739{
 740	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 741	__be32 old_gw = ip_hdr(skb)->saddr;
 742	struct net_device *dev = skb->dev;
 743	struct in_device *in_dev;
 744	struct fib_result res;
 745	struct neighbour *n;
 746	struct net *net;
 747
 748	switch (icmp_hdr(skb)->code & 7) {
 749	case ICMP_REDIR_NET:
 750	case ICMP_REDIR_NETTOS:
 751	case ICMP_REDIR_HOST:
 752	case ICMP_REDIR_HOSTTOS:
 753		break;
 754
 755	default:
 756		return;
 757	}
 758
 759	if (rt->rt_gateway != old_gw)
 760		return;
 761
 762	in_dev = __in_dev_get_rcu(dev);
 763	if (!in_dev)
 764		return;
 765
 766	net = dev_net(dev);
 767	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 768	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 769	    ipv4_is_zeronet(new_gw))
 770		goto reject_redirect;
 771
 772	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 773		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 774			goto reject_redirect;
 775		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 776			goto reject_redirect;
 777	} else {
 778		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 779			goto reject_redirect;
 780	}
 781
 782	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 783	if (!n)
 784		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 785	if (!IS_ERR(n)) {
 786		if (!(n->nud_state & NUD_VALID)) {
 787			neigh_event_send(n, NULL);
 788		} else {
 789			if (fib_lookup(net, fl4, &res, 0) == 0) {
 790				struct fib_nh *nh = &FIB_RES_NH(res);
 791
 792				update_or_create_fnhe(nh, fl4->daddr, new_gw,
 793						0, false,
 794						jiffies + ip_rt_gc_timeout);
 795			}
 796			if (kill_route)
 797				rt->dst.obsolete = DST_OBSOLETE_KILL;
 798			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 799		}
 800		neigh_release(n);
 801	}
 802	return;
 803
 804reject_redirect:
 805#ifdef CONFIG_IP_ROUTE_VERBOSE
 806	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 807		const struct iphdr *iph = (const struct iphdr *) skb->data;
 808		__be32 daddr = iph->daddr;
 809		__be32 saddr = iph->saddr;
 810
 811		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 812				     "  Advised path = %pI4 -> %pI4\n",
 813				     &old_gw, dev->name, &new_gw,
 814				     &saddr, &daddr);
 815	}
 816#endif
 817	;
 818}
 819
 820static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 821{
 822	struct rtable *rt;
 823	struct flowi4 fl4;
 824	const struct iphdr *iph = (const struct iphdr *) skb->data;
 825	struct net *net = dev_net(skb->dev);
 826	int oif = skb->dev->ifindex;
 827	u8 tos = RT_TOS(iph->tos);
 828	u8 prot = iph->protocol;
 829	u32 mark = skb->mark;
 
 830
 831	rt = (struct rtable *) dst;
 832
 833	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 834	__ip_do_redirect(rt, skb, &fl4, true);
 835}
 836
 837static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 838{
 839	struct rtable *rt = (struct rtable *)dst;
 840	struct dst_entry *ret = dst;
 841
 842	if (rt) {
 843		if (dst->obsolete > 0) {
 844			ip_rt_put(rt);
 845			ret = NULL;
 846		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 847			   rt->dst.expires) {
 848			ip_rt_put(rt);
 849			ret = NULL;
 850		}
 851	}
 852	return ret;
 853}
 854
 855/*
 856 * Algorithm:
 857 *	1. The first ip_rt_redirect_number redirects are sent
 858 *	   with exponential backoff, then we stop sending them at all,
 859 *	   assuming that the host ignores our redirects.
 860 *	2. If we did not see packets requiring redirects
 861 *	   during ip_rt_redirect_silence, we assume that the host
  862 *	   forgot the redirected route and we start sending redirects again.
 863 *
 864 * This algorithm is much cheaper and more intelligent than dumb load limiting
 865 * in icmp.c.
 866 *
 867 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 868 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 869 */
 870
 871void ip_rt_send_redirect(struct sk_buff *skb)
 872{
 873	struct rtable *rt = skb_rtable(skb);
 874	struct in_device *in_dev;
 875	struct inet_peer *peer;
 876	struct net *net;
 877	int log_martians;
 878	int vif;
 879
 880	rcu_read_lock();
 881	in_dev = __in_dev_get_rcu(rt->dst.dev);
 882	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 883		rcu_read_unlock();
 884		return;
 885	}
 886	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 887	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 888	rcu_read_unlock();
 889
 890	net = dev_net(rt->dst.dev);
 891	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 892	if (!peer) {
 893		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 894			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 895		return;
 896	}
 897
 898	/* No redirected packets during ip_rt_redirect_silence;
 899	 * reset the algorithm.
 900	 */
 901	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 902		peer->rate_tokens = 0;
 903
  904	/* Too many ignored redirects; do not send anything.
  905	 * Set dst.rate_last to the last seen redirected packet.
 906	 */
 907	if (peer->rate_tokens >= ip_rt_redirect_number) {
 908		peer->rate_last = jiffies;
 909		goto out_put_peer;
 910	}
 911
 912	/* Check for load limit; set rate_last to the latest sent
 913	 * redirect.
 914	 */
 915	if (peer->rate_tokens == 0 ||
 916	    time_after(jiffies,
 917		       (peer->rate_last +
 918			(ip_rt_redirect_load << peer->rate_tokens)))) {
 919		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 920
 921		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 922		peer->rate_last = jiffies;
 923		++peer->rate_tokens;
 924#ifdef CONFIG_IP_ROUTE_VERBOSE
 925		if (log_martians &&
 926		    peer->rate_tokens == ip_rt_redirect_number)
 927			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 928					     &ip_hdr(skb)->saddr, inet_iif(skb),
 929					     &ip_hdr(skb)->daddr, &gw);
 930#endif
 931	}
 932out_put_peer:
 933	inet_putpeer(peer);
 934}
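/*
 * Editor's note -- illustrative numbers, not part of route.c.  With the
 * defaults above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9,
 * ip_rt_redirect_silence = (HZ/50) << 10), a further redirect to a peer is
 * sent only once
 *
 *	time_after(jiffies, peer->rate_last +
 *			    (ip_rt_redirect_load << peer->rate_tokens))
 *
 * holds, i.e. after roughly 20ms, 40ms, 80ms, ... and ~5.1s before the
 * 9th; after that nothing is sent until about 20s of silence resets
 * rate_tokens to 0.
 */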
 935
 936static int ip_error(struct sk_buff *skb)
 937{
 938	struct rtable *rt = skb_rtable(skb);
 939	struct net_device *dev = skb->dev;
 940	struct in_device *in_dev;
 941	struct inet_peer *peer;
 942	unsigned long now;
 943	struct net *net;
 
 944	bool send;
 945	int code;
 946
 947	if (netif_is_l3_master(skb->dev)) {
 948		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 949		if (!dev)
 950			goto out;
 951	}
 952
 953	in_dev = __in_dev_get_rcu(dev);
 954
 955	/* IP on this device is disabled. */
 956	if (!in_dev)
 957		goto out;
 958
 959	net = dev_net(rt->dst.dev);
 960	if (!IN_DEV_FORWARD(in_dev)) {
 961		switch (rt->dst.error) {
 962		case EHOSTUNREACH:
 
 963			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 964			break;
 965
 966		case ENETUNREACH:
 
 967			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 968			break;
 969		}
 970		goto out;
 971	}
 972
 973	switch (rt->dst.error) {
 974	case EINVAL:
 975	default:
 976		goto out;
 977	case EHOSTUNREACH:
 978		code = ICMP_HOST_UNREACH;
 979		break;
 980	case ENETUNREACH:
 981		code = ICMP_NET_UNREACH;
 
 982		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 983		break;
 984	case EACCES:
 985		code = ICMP_PKT_FILTERED;
 986		break;
 987	}
 988
 989	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 990			       l3mdev_master_ifindex(skb->dev), 1);
 991
 992	send = true;
 993	if (peer) {
 994		now = jiffies;
 995		peer->rate_tokens += now - peer->rate_last;
 996		if (peer->rate_tokens > ip_rt_error_burst)
 997			peer->rate_tokens = ip_rt_error_burst;
 998		peer->rate_last = now;
 999		if (peer->rate_tokens >= ip_rt_error_cost)
1000			peer->rate_tokens -= ip_rt_error_cost;
1001		else
1002			send = false;
1003		inet_putpeer(peer);
1004	}
1005	if (send)
1006		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1007
1008out:	kfree_skb(skb);
1009	return 0;
1010}
1011
1012static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1013{
1014	struct dst_entry *dst = &rt->dst;
 
1015	struct fib_result res;
1016	bool lock = false;
 
1017
1018	if (ip_mtu_locked(dst))
1019		return;
1020
1021	if (ipv4_mtu(dst) < mtu)
 
1022		return;
1023
1024	if (mtu < ip_rt_min_pmtu) {
1025		lock = true;
1026		mtu = ip_rt_min_pmtu;
1027	}
1028
1029	if (rt->rt_pmtu == mtu &&
1030	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1031		return;
1032
1033	rcu_read_lock();
1034	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1035		struct fib_nh *nh = &FIB_RES_NH(res);
1036
1037		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1038				      jiffies + ip_rt_mtu_expires);
1039	}
1040	rcu_read_unlock();
1041}
1042
1043static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044			      struct sk_buff *skb, u32 mtu)
 
1045{
1046	struct rtable *rt = (struct rtable *) dst;
1047	struct flowi4 fl4;
1048
1049	ip_rt_build_flow_key(&fl4, sk, skb);
1050	__ip_rt_update_pmtu(rt, &fl4, mtu);
1051}
1052
1053void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054		      int oif, u32 mark, u8 protocol, int flow_flags)
1055{
1056	const struct iphdr *iph = (const struct iphdr *) skb->data;
1057	struct flowi4 fl4;
1058	struct rtable *rt;
 
1059
1060	if (!mark)
1061		mark = IP4_REPLY_MARK(net, skb->mark);
1062
1063	__build_flow_key(net, &fl4, NULL, iph, oif,
1064			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1065	rt = __ip_route_output_key(net, &fl4);
1066	if (!IS_ERR(rt)) {
1067		__ip_rt_update_pmtu(rt, &fl4, mtu);
1068		ip_rt_put(rt);
1069	}
1070}
1071EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
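/*
 * Editor's note -- illustrative sketch, not part of route.c.  A learned
 * path MTU below ip_rt_min_pmtu (552 = 512 + 20 + 20 above) is clamped to
 * that minimum and the exception is marked mtu-locked so it cannot shrink
 * further; the exception then expires after ip_rt_mtu_expires (10 minutes).
 * A hypothetical caller reacting to an ICMP "fragmentation needed" might
 * do something like:
 *
 *	ipv4_update_pmtu(skb, dev_net(skb->dev), new_mtu,
 *			 0, 0, iph->protocol, 0);
 *
 * which builds a flow key from the packet's IP header and funnels into
 * __ip_rt_update_pmtu() above.
 */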
1072
1073static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074{
1075	const struct iphdr *iph = (const struct iphdr *) skb->data;
1076	struct flowi4 fl4;
1077	struct rtable *rt;
1078
1079	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1080
1081	if (!fl4.flowi4_mark)
1082		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
1084	rt = __ip_route_output_key(sock_net(sk), &fl4);
1085	if (!IS_ERR(rt)) {
1086		__ip_rt_update_pmtu(rt, &fl4, mtu);
1087		ip_rt_put(rt);
1088	}
1089}
1090
1091void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093	const struct iphdr *iph = (const struct iphdr *) skb->data;
1094	struct flowi4 fl4;
1095	struct rtable *rt;
1096	struct dst_entry *odst = NULL;
1097	bool new = false;
1098	struct net *net = sock_net(sk);
1099
1100	bh_lock_sock(sk);
1101
1102	if (!ip_sk_accept_pmtu(sk))
1103		goto out;
1104
1105	odst = sk_dst_get(sk);
1106
1107	if (sock_owned_by_user(sk) || !odst) {
1108		__ipv4_sk_update_pmtu(skb, sk, mtu);
1109		goto out;
1110	}
1111
1112	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1113
1114	rt = (struct rtable *)odst;
1115	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1116		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117		if (IS_ERR(rt))
1118			goto out;
1119
1120		new = true;
1121	}
1122
1123	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1124
1125	if (!dst_check(&rt->dst, 0)) {
1126		if (new)
1127			dst_release(&rt->dst);
1128
1129		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130		if (IS_ERR(rt))
1131			goto out;
1132
1133		new = true;
1134	}
1135
1136	if (new)
1137		sk_dst_set(sk, &rt->dst);
1138
1139out:
1140	bh_unlock_sock(sk);
1141	dst_release(odst);
1142}
1143EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1144
1145void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146		   int oif, u32 mark, u8 protocol, int flow_flags)
1147{
1148	const struct iphdr *iph = (const struct iphdr *) skb->data;
1149	struct flowi4 fl4;
1150	struct rtable *rt;
1151
1152	__build_flow_key(net, &fl4, NULL, iph, oif,
1153			 RT_TOS(iph->tos), protocol, mark, flow_flags);
1154	rt = __ip_route_output_key(net, &fl4);
1155	if (!IS_ERR(rt)) {
1156		__ip_do_redirect(rt, skb, &fl4, false);
1157		ip_rt_put(rt);
1158	}
1159}
1160EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163{
1164	const struct iphdr *iph = (const struct iphdr *) skb->data;
1165	struct flowi4 fl4;
1166	struct rtable *rt;
1167	struct net *net = sock_net(sk);
1168
1169	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170	rt = __ip_route_output_key(net, &fl4);
1171	if (!IS_ERR(rt)) {
1172		__ip_do_redirect(rt, skb, &fl4, false);
1173		ip_rt_put(rt);
1174	}
1175}
1176EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
1178static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 
1179{
1180	struct rtable *rt = (struct rtable *) dst;
1181
1182	/* All IPV4 dsts are created with ->obsolete set to the value
1183	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184	 * into this function always.
1185	 *
1186	 * When a PMTU/redirect information update invalidates a route,
1187	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188	 * DST_OBSOLETE_DEAD by dst_free().
1189	 */
1190	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1191		return NULL;
1192	return dst;
1193}
1194
1195static void ipv4_link_failure(struct sk_buff *skb)
1196{
1197	struct rtable *rt;
1198
1199	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1200
1201	rt = skb_rtable(skb);
1202	if (rt)
1203		dst_set_expires(&rt->dst, 0);
1204}
1205
1206static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1207{
1208	pr_debug("%s: %pI4 -> %pI4, %s\n",
1209		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1210		 skb->dev ? skb->dev->name : "?");
1211	kfree_skb(skb);
1212	WARN_ON(1);
1213	return 0;
1214}
1215
1216/*
 1217   We do not cache the source address of the outgoing interface,
 1218   because it is used only by IP RR, TS and SRR options,
 1219   so it is out of the fast path.
1220
 1221   BTW remember: "addr" is allowed to be unaligned
1222   in IP options!
1223 */
1224
1225void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1226{
1227	__be32 src;
1228
1229	if (rt_is_output_route(rt))
1230		src = ip_hdr(skb)->saddr;
1231	else {
1232		struct fib_result res;
1233		struct flowi4 fl4;
1234		struct iphdr *iph;
1235
1236		iph = ip_hdr(skb);
1237
1238		memset(&fl4, 0, sizeof(fl4));
1239		fl4.daddr = iph->daddr;
1240		fl4.saddr = iph->saddr;
1241		fl4.flowi4_tos = RT_TOS(iph->tos);
1242		fl4.flowi4_oif = rt->dst.dev->ifindex;
1243		fl4.flowi4_iif = skb->dev->ifindex;
1244		fl4.flowi4_mark = skb->mark;
1245
1246		rcu_read_lock();
1247		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1248			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1249		else
1250			src = inet_select_addr(rt->dst.dev,
1251					       rt_nexthop(rt, iph->daddr),
1252					       RT_SCOPE_UNIVERSE);
1253		rcu_read_unlock();
1254	}
1255	memcpy(addr, &src, 4);
1256}
1257
1258#ifdef CONFIG_IP_ROUTE_CLASSID
1259static void set_class_tag(struct rtable *rt, u32 tag)
1260{
1261	if (!(rt->dst.tclassid & 0xFFFF))
1262		rt->dst.tclassid |= tag & 0xFFFF;
1263	if (!(rt->dst.tclassid & 0xFFFF0000))
1264		rt->dst.tclassid |= tag & 0xFFFF0000;
1265}
1266#endif
1267
1268static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1269{
 
1270	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1271	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1272				    ip_rt_min_advmss);
1273
1274	return min(advmss, IPV4_MAX_PMTU - header_size);
1275}
1276
1277static unsigned int ipv4_mtu(const struct dst_entry *dst)
1278{
1279	const struct rtable *rt = (const struct rtable *) dst;
1280	unsigned int mtu = rt->rt_pmtu;
1281
1282	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1283		mtu = dst_metric_raw(dst, RTAX_MTU);
1284
1285	if (mtu)
1286		return mtu;
1287
1288	mtu = READ_ONCE(dst->dev->mtu);
1289
1290	if (unlikely(ip_mtu_locked(dst))) {
1291		if (rt->rt_uses_gateway && mtu > 576)
1292			mtu = 576;
1293	}
1294
1295	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1296
1297	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1298}
 
1299
1300static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1301{
1302	struct fnhe_hash_bucket *hash;
1303	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1304	u32 hval = fnhe_hashfun(daddr);
1305
1306	spin_lock_bh(&fnhe_lock);
1307
1308	hash = rcu_dereference_protected(nh->nh_exceptions,
1309					 lockdep_is_held(&fnhe_lock));
1310	hash += hval;
1311
1312	fnhe_p = &hash->chain;
1313	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1314	while (fnhe) {
1315		if (fnhe->fnhe_daddr == daddr) {
1316			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1317				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1318			fnhe_flush_routes(fnhe);
1319			kfree_rcu(fnhe, rcu);
1320			break;
1321		}
1322		fnhe_p = &fnhe->fnhe_next;
1323		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1324						 lockdep_is_held(&fnhe_lock));
1325	}
1326
1327	spin_unlock_bh(&fnhe_lock);
1328}
1329
1330static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 
1331{
1332	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1333	struct fib_nh_exception *fnhe;
1334	u32 hval;
1335
1336	if (!hash)
1337		return NULL;
1338
1339	hval = fnhe_hashfun(daddr);
1340
1341	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1342	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1343		if (fnhe->fnhe_daddr == daddr) {
1344			if (fnhe->fnhe_expires &&
1345			    time_after(jiffies, fnhe->fnhe_expires)) {
1346				ip_del_fnhe(nh, daddr);
1347				break;
1348			}
1349			return fnhe;
1350		}
1351	}
1352	return NULL;
1353}
1354
1355static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1356			      __be32 daddr, const bool do_cache)
1357{
1358	bool ret = false;
1359
1360	spin_lock_bh(&fnhe_lock);
1361
1362	if (daddr == fnhe->fnhe_daddr) {
1363		struct rtable __rcu **porig;
1364		struct rtable *orig;
1365		int genid = fnhe_genid(dev_net(rt->dst.dev));
1366
1367		if (rt_is_input_route(rt))
1368			porig = &fnhe->fnhe_rth_input;
1369		else
1370			porig = &fnhe->fnhe_rth_output;
1371		orig = rcu_dereference(*porig);
1372
1373		if (fnhe->fnhe_genid != genid) {
1374			fnhe->fnhe_genid = genid;
1375			fnhe->fnhe_gw = 0;
1376			fnhe->fnhe_pmtu = 0;
1377			fnhe->fnhe_expires = 0;
1378			fnhe->fnhe_mtu_locked = false;
1379			fnhe_flush_routes(fnhe);
1380			orig = NULL;
1381		}
1382		fill_route_from_fnhe(rt, fnhe);
1383		if (!rt->rt_gateway)
1384			rt->rt_gateway = daddr;
1385
1386		if (do_cache) {
1387			dst_hold(&rt->dst);
1388			rcu_assign_pointer(*porig, rt);
1389			if (orig) {
1390				dst_dev_put(&orig->dst);
1391				dst_release(&orig->dst);
1392			}
1393			ret = true;
1394		}
1395
1396		fnhe->fnhe_stamp = jiffies;
1397	}
1398	spin_unlock_bh(&fnhe_lock);
1399
1400	return ret;
1401}
1402
1403static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1404{
1405	struct rtable *orig, *prev, **p;
1406	bool ret = true;
1407
1408	if (rt_is_input_route(rt)) {
1409		p = (struct rtable **)&nh->nh_rth_input;
1410	} else {
1411		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1412	}
1413	orig = *p;
1414
1415	/* hold dst before doing cmpxchg() to avoid race condition
1416	 * on this dst
1417	 */
1418	dst_hold(&rt->dst);
1419	prev = cmpxchg(p, orig, rt);
1420	if (prev == orig) {
1421		if (orig) {
1422			dst_dev_put(&orig->dst);
1423			dst_release(&orig->dst);
1424		}
1425	} else {
1426		dst_release(&rt->dst);
1427		ret = false;
1428	}
1429
1430	return ret;
1431}
1432
1433struct uncached_list {
1434	spinlock_t		lock;
1435	struct list_head	head;
 
1436};
1437
1438static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1439
1440void rt_add_uncached_list(struct rtable *rt)
1441{
1442	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1443
1444	rt->rt_uncached_list = ul;
1445
1446	spin_lock_bh(&ul->lock);
1447	list_add_tail(&rt->rt_uncached, &ul->head);
1448	spin_unlock_bh(&ul->lock);
1449}
1450
1451void rt_del_uncached_list(struct rtable *rt)
1452{
1453	if (!list_empty(&rt->rt_uncached)) {
1454		struct uncached_list *ul = rt->rt_uncached_list;
1455
1456		spin_lock_bh(&ul->lock);
1457		list_del(&rt->rt_uncached);
1458		spin_unlock_bh(&ul->lock);
1459	}
1460}
1461
1462static void ipv4_dst_destroy(struct dst_entry *dst)
1463{
1464	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1465	struct rtable *rt = (struct rtable *)dst;
1466
1467	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1468		kfree(p);
1469
1470	rt_del_uncached_list(rt);
1471}
1472
1473void rt_flush_dev(struct net_device *dev)
1474{
1475	struct net *net = dev_net(dev);
1476	struct rtable *rt;
1477	int cpu;
1478
1479	for_each_possible_cpu(cpu) {
1480		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1481
1482		spin_lock_bh(&ul->lock);
1483		list_for_each_entry(rt, &ul->head, rt_uncached) {
1484			if (rt->dst.dev != dev)
1485				continue;
1486			rt->dst.dev = net->loopback_dev;
1487			dev_hold(rt->dst.dev);
1488			dev_put(dev);
 
1489		}
1490		spin_unlock_bh(&ul->lock);
1491	}
1492}
1493
1494static bool rt_cache_valid(const struct rtable *rt)
1495{
1496	return	rt &&
1497		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498		!rt_is_expired(rt);
1499}
1500
1501static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1502			   const struct fib_result *res,
1503			   struct fib_nh_exception *fnhe,
1504			   struct fib_info *fi, u16 type, u32 itag,
1505			   const bool do_cache)
1506{
1507	bool cached = false;
1508
1509	if (fi) {
1510		struct fib_nh *nh = &FIB_RES_NH(*res);
1511
1512		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1513			rt->rt_gateway = nh->nh_gw;
1514			rt->rt_uses_gateway = 1;
1515		}
1516		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1517		if (fi->fib_metrics != &dst_default_metrics) {
1518			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1519			refcount_inc(&fi->fib_metrics->refcnt);
1520		}
1521#ifdef CONFIG_IP_ROUTE_CLASSID
1522		rt->dst.tclassid = nh->nh_tclassid;
1523#endif
1524		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1525		if (unlikely(fnhe))
1526			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1527		else if (do_cache)
1528			cached = rt_cache_route(nh, rt);
1529		if (unlikely(!cached)) {
1530			/* Routes we intend to cache in nexthop exception or
1531			 * FIB nexthop have the DST_NOCACHE bit clear.
1532			 * However, if we are unsuccessful at storing this
1533			 * route into the cache we really need to set it.
1534			 */
1535			if (!rt->rt_gateway)
1536				rt->rt_gateway = daddr;
1537			rt_add_uncached_list(rt);
1538		}
1539	} else
1540		rt_add_uncached_list(rt);
1541
1542#ifdef CONFIG_IP_ROUTE_CLASSID
1543#ifdef CONFIG_IP_MULTIPLE_TABLES
1544	set_class_tag(rt, res->tclassid);
1545#endif
1546	set_class_tag(rt, itag);
1547#endif
1548}
1549
1550struct rtable *rt_dst_alloc(struct net_device *dev,
1551			    unsigned int flags, u16 type,
1552			    bool nopolicy, bool noxfrm, bool will_cache)
1553{
1554	struct rtable *rt;
1555
1556	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1557		       (will_cache ? 0 : DST_HOST) |
1558		       (nopolicy ? DST_NOPOLICY : 0) |
1559		       (noxfrm ? DST_NOXFRM : 0));
1560
1561	if (rt) {
1562		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1563		rt->rt_flags = flags;
1564		rt->rt_type = type;
1565		rt->rt_is_input = 0;
1566		rt->rt_iif = 0;
1567		rt->rt_pmtu = 0;
1568		rt->rt_mtu_locked = 0;
1569		rt->rt_gateway = 0;
1570		rt->rt_uses_gateway = 0;
1571		INIT_LIST_HEAD(&rt->rt_uncached);
 
1572
1573		rt->dst.output = ip_output;
1574		if (flags & RTCF_LOCAL)
1575			rt->dst.input = ip_local_deliver;
1576	}
1577
1578	return rt;
1579}
1580EXPORT_SYMBOL(rt_dst_alloc);
1581
1582/* called in rcu_read_lock() section */
1583int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1584			  u8 tos, struct net_device *dev,
1585			  struct in_device *in_dev, u32 *itag)
1586{
1587	int err;
1588
1589	/* Primary sanity checks. */
1590	if (!in_dev)
1591		return -EINVAL;
1592
1593	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1594	    skb->protocol != htons(ETH_P_IP))
1595		return -EINVAL;
1596
1597	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1598		return -EINVAL;
1599
1600	if (ipv4_is_zeronet(saddr)) {
1601		if (!ipv4_is_local_multicast(daddr))
 
1602			return -EINVAL;
1603	} else {
1604		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1605					  in_dev, itag);
1606		if (err < 0)
1607			return err;
1608	}
1609	return 0;
1610}
1611
1612/* called in rcu_read_lock() section */
1613static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1614			     u8 tos, struct net_device *dev, int our)
1615{
1616	struct in_device *in_dev = __in_dev_get_rcu(dev);
1617	unsigned int flags = RTCF_MULTICAST;
1618	struct rtable *rth;
1619	u32 itag = 0;
1620	int err;
1621
1622	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1623	if (err)
1624		return err;
1625
1626	if (our)
1627		flags |= RTCF_LOCAL;
1628
1629	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1630			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1631	if (!rth)
1632		return -ENOBUFS;
1633
1634#ifdef CONFIG_IP_ROUTE_CLASSID
1635	rth->dst.tclassid = itag;
1636#endif
1637	rth->dst.output = ip_rt_bug;
1638	rth->rt_is_input= 1;
1639
1640#ifdef CONFIG_IP_MROUTE
1641	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1642		rth->dst.input = ip_mr_input;
1643#endif
1644	RT_CACHE_STAT_INC(in_slow_mc);
1645
 
1646	skb_dst_set(skb, &rth->dst);
1647	return 0;
1648}
1649
1650
1651static void ip_handle_martian_source(struct net_device *dev,
1652				     struct in_device *in_dev,
1653				     struct sk_buff *skb,
1654				     __be32 daddr,
1655				     __be32 saddr)
1656{
1657	RT_CACHE_STAT_INC(in_martian_src);
1658#ifdef CONFIG_IP_ROUTE_VERBOSE
1659	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1660		/*
 1661		 *	RFC1812 recommendation: if the source is martian,
 1662		 *	the only hint is the MAC header.
1663		 */
1664		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1665			&daddr, &saddr, dev->name);
1666		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1667			print_hex_dump(KERN_WARNING, "ll header: ",
1668				       DUMP_PREFIX_OFFSET, 16, 1,
1669				       skb_mac_header(skb),
1670				       dev->hard_header_len, true);
1671		}
1672	}
1673#endif
1674}
1675
1676/* called in rcu_read_lock() section */
1677static int __mkroute_input(struct sk_buff *skb,
1678			   const struct fib_result *res,
1679			   struct in_device *in_dev,
1680			   __be32 daddr, __be32 saddr, u32 tos)
1681{
1682	struct fib_nh_exception *fnhe;
1683	struct rtable *rth;
1684	int err;
1685	struct in_device *out_dev;
1686	bool do_cache;
1687	u32 itag = 0;
1688
1689	/* get a working reference to the output device */
1690	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1691	if (!out_dev) {
1692		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1693		return -EINVAL;
1694	}
1695
1696	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1697				  in_dev->dev, in_dev, &itag);
1698	if (err < 0) {
1699		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1700					 saddr);
1701
1702		goto cleanup;
1703	}
1704
1705	do_cache = res->fi && !itag;
1706	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1707	    skb->protocol == htons(ETH_P_IP) &&
1708	    (IN_DEV_SHARED_MEDIA(out_dev) ||
1709	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1710		IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1711
1712	if (skb->protocol != htons(ETH_P_IP)) {
 1713		/* Not IP (i.e. ARP). Do not create a route if it is
 1714		 * invalid for proxy arp. DNAT routes are always valid.
 1715		 *
 1716		 * The proxy arp feature has been extended to allow ARP
 1717		 * replies back to the same interface, to support
 1718		 * Private VLAN switch technologies. See arp.c.
1719		 */
1720		if (out_dev == in_dev &&
1721		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1722			err = -EINVAL;
1723			goto cleanup;
1724		}
1725	}
1726
1727	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1728	if (do_cache) {
1729		if (fnhe)
1730			rth = rcu_dereference(fnhe->fnhe_rth_input);
1731		else
1732			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1733		if (rt_cache_valid(rth)) {
1734			skb_dst_set_noref(skb, &rth->dst);
1735			goto out;
1736		}
1737	}
1738
1739	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1740			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
1741			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1742	if (!rth) {
1743		err = -ENOBUFS;
1744		goto cleanup;
1745	}
1746
1747	rth->rt_is_input = 1;
1748	RT_CACHE_STAT_INC(in_slow_tot);
1749
1750	rth->dst.input = ip_forward;
1751
1752	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1753		       do_cache);
1754	lwtunnel_set_redirect(&rth->dst);
1755	skb_dst_set(skb, &rth->dst);
1756out:
1757	err = 0;
1758 cleanup:
1759	return err;
1760}
1761
1762#ifdef CONFIG_IP_ROUTE_MULTIPATH
1763/* To make ICMP packets follow the right flow, the multipath hash is
1764 * calculated from the inner IP addresses.
1765 */
1766static void ip_multipath_l3_keys(const struct sk_buff *skb,
1767				 struct flow_keys *hash_keys)
1768{
1769	const struct iphdr *outer_iph = ip_hdr(skb);
1770	const struct iphdr *key_iph = outer_iph;
1771	const struct iphdr *inner_iph;
1772	const struct icmphdr *icmph;
1773	struct iphdr _inner_iph;
1774	struct icmphdr _icmph;
1775
1776	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1777		goto out;
1778
1779	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1780		goto out;
1781
1782	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1783				   &_icmph);
1784	if (!icmph)
1785		goto out;
1786
1787	if (icmph->type != ICMP_DEST_UNREACH &&
1788	    icmph->type != ICMP_REDIRECT &&
1789	    icmph->type != ICMP_TIME_EXCEEDED &&
1790	    icmph->type != ICMP_PARAMETERPROB)
1791		goto out;
1792
1793	inner_iph = skb_header_pointer(skb,
1794				       outer_iph->ihl * 4 + sizeof(_icmph),
1795				       sizeof(_inner_iph), &_inner_iph);
1796	if (!inner_iph)
1797		goto out;
1798
1799	key_iph = inner_iph;
1800out:
1801	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1802	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1803}
1804
1805/* if skb is set it will be used and fl4 can be NULL */
1806int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1807		       const struct sk_buff *skb, struct flow_keys *flkeys)
1808{
 
1809	struct flow_keys hash_keys;
1810	u32 mhash;
1811
1812	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1813	case 0:
1814		memset(&hash_keys, 0, sizeof(hash_keys));
1815		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1816		if (skb) {
1817			ip_multipath_l3_keys(skb, &hash_keys);
1818		} else {
1819			hash_keys.addrs.v4addrs.src = fl4->saddr;
1820			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1821		}
 
1822		break;
1823	case 1:
1824		/* skb is currently provided only when forwarding */
1825		if (skb) {
1826			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1827			struct flow_keys keys;
1828
1829			/* short-circuit if we already have L4 hash present */
1830			if (skb->l4_hash)
1831				return skb_get_hash_raw(skb) >> 1;
1832
1833			memset(&hash_keys, 0, sizeof(hash_keys));
1834
1835			if (!flkeys) {
1836				skb_flow_dissect_flow_keys(skb, &keys, flag);
1837				flkeys = &keys;
1838			}
1839
1840			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1841			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1842			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1843			hash_keys.ports.src = flkeys->ports.src;
1844			hash_keys.ports.dst = flkeys->ports.dst;
1845			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1846		} else {
1847			memset(&hash_keys, 0, sizeof(hash_keys));
1848			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1849			hash_keys.addrs.v4addrs.src = fl4->saddr;
1850			hash_keys.addrs.v4addrs.dst = fl4->daddr;
1851			hash_keys.ports.src = fl4->fl4_sport;
1852			hash_keys.ports.dst = fl4->fl4_dport;
1853			hash_keys.basic.ip_proto = fl4->flowi4_proto;
1854		}
1855		break;
1856	}
1857	mhash = flow_hash_from_keys(&hash_keys);
1858
1859	return mhash >> 1;
1860}
1861#endif /* CONFIG_IP_ROUTE_MULTIPATH */
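/*
 * Editor's note -- illustrative sketch, not part of route.c.  The 31-bit
 * value returned above drives nexthop selection: roughly, each nexthop of
 * a multipath route owns a slice of the hash space proportional to its
 * weight, and fib_select_multipath() picks the nexthop whose slice
 * contains the hash, as in ip_mkroute_input() below:
 *
 *	int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
 *
 *	fib_select_multipath(res, h);
 */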
1862
1863static int ip_mkroute_input(struct sk_buff *skb,
1864			    struct fib_result *res,
1865			    struct in_device *in_dev,
1866			    __be32 daddr, __be32 saddr, u32 tos,
1867			    struct flow_keys *hkeys)
1868{
1869#ifdef CONFIG_IP_ROUTE_MULTIPATH
1870	if (res->fi && res->fi->fib_nhs > 1) {
1871		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1872
1873		fib_select_multipath(res, h);
 
1874	}
1875#endif
1876
1877	/* create a routing cache entry */
1878	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1879}
1880
1881/*
 1882 *	NOTE. We drop all packets that have local source
 1883 *	addresses, because every properly looped-back packet
 1884 *	must already have the correct destination attached by the output routine.
 1885 *
 1886 *	Such an approach solves two big problems:
 1887 *	1. Non-simplex devices are handled properly.
 1888 *	2. IP spoofing attempts are filtered with a 100% guarantee.
1889 *	called with rcu_read_lock()
1890 */
1891
1892static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1893			       u8 tos, struct net_device *dev,
1894			       struct fib_result *res)
1895{
1896	struct in_device *in_dev = __in_dev_get_rcu(dev);
1897	struct flow_keys *flkeys = NULL, _flkeys;
1898	struct net    *net = dev_net(dev);
1899	struct ip_tunnel_info *tun_info;
1900	int		err = -EINVAL;
1901	unsigned int	flags = 0;
1902	u32		itag = 0;
1903	struct rtable	*rth;
1904	struct flowi4	fl4;
1905	bool do_cache;
1906
1907	/* IP on this device is disabled. */
1908
1909	if (!in_dev)
1910		goto out;
1911
1912	/* Check for the most weird martians, which may not be detected
1913	   by fib_lookup.
1914	 */
1915
1916	tun_info = skb_tunnel_info(skb);
1917	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1918		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1919	else
1920		fl4.flowi4_tun_key.tun_id = 0;
1921	skb_dst_drop(skb);
1922
1923	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1924		goto martian_source;
1925
1926	res->fi = NULL;
1927	res->table = NULL;
1928	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1929		goto brd_input;
1930
1931	/* Accept zero addresses only for limited broadcast;
1932	 * I do not even know whether to fix it or not. Waiting for complaints :-)
1933	 */
1934	if (ipv4_is_zeronet(saddr))
1935		goto martian_source;
1936
1937	if (ipv4_is_zeronet(daddr))
1938		goto martian_destination;
1939
1940	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
1941	 * unnecessarily, and calls it at most once when daddr and/or saddr is a loopback address.
1942	 */
1943	if (ipv4_is_loopback(daddr)) {
1944		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1945			goto martian_destination;
1946	} else if (ipv4_is_loopback(saddr)) {
1947		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1948			goto martian_source;
1949	}
1950
1951	/*
1952	 *	Now we are ready to route packet.
1953	 */
 
1954	fl4.flowi4_oif = 0;
1955	fl4.flowi4_iif = dev->ifindex;
1956	fl4.flowi4_mark = skb->mark;
1957	fl4.flowi4_tos = tos;
1958	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1959	fl4.flowi4_flags = 0;
1960	fl4.daddr = daddr;
1961	fl4.saddr = saddr;
1962	fl4.flowi4_uid = sock_net_uid(net, NULL);
 
1963
1964	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1965		flkeys = &_flkeys;
1966	} else {
1967		fl4.flowi4_proto = 0;
1968		fl4.fl4_sport = 0;
1969		fl4.fl4_dport = 0;
1970	}
1971
1972	err = fib_lookup(net, &fl4, res, 0);
1973	if (err != 0) {
1974		if (!IN_DEV_FORWARD(in_dev))
1975			err = -EHOSTUNREACH;
1976		goto no_route;
1977	}
1978
1979	if (res->type == RTN_BROADCAST)
1980		goto brd_input;
 
1981
1982	if (res->type == RTN_LOCAL) {
1983		err = fib_validate_source(skb, saddr, daddr, tos,
1984					  0, dev, in_dev, &itag);
1985		if (err < 0)
1986			goto martian_source;
1987		goto local_input;
1988	}
1989
1990	if (!IN_DEV_FORWARD(in_dev)) {
1991		err = -EHOSTUNREACH;
1992		goto no_route;
1993	}
1994	if (res->type != RTN_UNICAST)
1995		goto martian_destination;
1996
 
1997	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1998out:	return err;
1999
2000brd_input:
2001	if (skb->protocol != htons(ETH_P_IP))
2002		goto e_inval;
2003
2004	if (!ipv4_is_zeronet(saddr)) {
2005		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2006					  in_dev, &itag);
2007		if (err < 0)
2008			goto martian_source;
2009	}
2010	flags |= RTCF_BROADCAST;
2011	res->type = RTN_BROADCAST;
2012	RT_CACHE_STAT_INC(in_brd);
2013
2014local_input:
2015	do_cache = false;
2016	if (res->fi) {
2017		if (!itag) {
2018			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2019			if (rt_cache_valid(rth)) {
2020				skb_dst_set_noref(skb, &rth->dst);
2021				err = 0;
2022				goto out;
2023			}
2024			do_cache = true;
 
 
2025		}
2026	}
2027
2028	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2029			   flags | RTCF_LOCAL, res->type,
2030			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2031	if (!rth)
2032		goto e_nobufs;
2033
2034	rth->dst.output= ip_rt_bug;
2035#ifdef CONFIG_IP_ROUTE_CLASSID
2036	rth->dst.tclassid = itag;
2037#endif
2038	rth->rt_is_input = 1;
2039
2040	RT_CACHE_STAT_INC(in_slow_tot);
2041	if (res->type == RTN_UNREACHABLE) {
2042		rth->dst.input= ip_error;
2043		rth->dst.error= -err;
2044		rth->rt_flags 	&= ~RTCF_LOCAL;
2045	}
2046
2047	if (do_cache) {
2048		struct fib_nh *nh = &FIB_RES_NH(*res);
2049
2050		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2051		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2052			WARN_ON(rth->dst.input == lwtunnel_input);
2053			rth->dst.lwtstate->orig_input = rth->dst.input;
2054			rth->dst.input = lwtunnel_input;
2055		}
2056
2057		if (unlikely(!rt_cache_route(nh, rth)))
2058			rt_add_uncached_list(rth);
2059	}
2060	skb_dst_set(skb, &rth->dst);
2061	err = 0;
2062	goto out;
2063
2064no_route:
2065	RT_CACHE_STAT_INC(in_no_route);
2066	res->type = RTN_UNREACHABLE;
2067	res->fi = NULL;
2068	res->table = NULL;
2069	goto local_input;
2070
2071	/*
2072	 *	Do not cache martian addresses: they should be logged (RFC1812)
2073	 */
2074martian_destination:
2075	RT_CACHE_STAT_INC(in_martian_dst);
2076#ifdef CONFIG_IP_ROUTE_VERBOSE
2077	if (IN_DEV_LOG_MARTIANS(in_dev))
2078		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2079				     &daddr, &saddr, dev->name);
2080#endif
2081
2082e_inval:
2083	err = -EINVAL;
2084	goto out;
2085
2086e_nobufs:
2087	err = -ENOBUFS;
2088	goto out;
2089
2090martian_source:
2091	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2092	goto out;
2093}
2094
2095int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2096			 u8 tos, struct net_device *dev)
2097{
2098	struct fib_result res;
2099	int err;
2100
2101	tos &= IPTOS_RT_MASK;
2102	rcu_read_lock();
2103	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2104	rcu_read_unlock();
2105
2106	return err;
2107}
2108EXPORT_SYMBOL(ip_route_input_noref);
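/* Typical usage on the receive path (a sketch; the real caller is the IPv4
 * input path, e.g. ip_rcv_finish(), not this file):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 *
 * On success the skb carries a noref dst, and dst_input(skb) dispatches to
 * ip_local_deliver(), ip_forward() or ip_error() through rth->dst.input.
 */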
2109
2110/* called with rcu_read_lock held */
2111int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2112		       u8 tos, struct net_device *dev, struct fib_result *res)
2113{
2114	/* Multicast recognition logic was moved from the route cache to here.
2115	   The problem was that too many Ethernet cards have broken/missing
2116	   hardware multicast filters :-( As a result, a host on a multicast
2117	   network acquires a lot of useless route cache entries, e.g. for
2118	   SDR messages from all over the world. Now we try to get rid of them.
2119	   Really, provided the software IP multicast filter is organized
2120	   reasonably (at least, hashed), this does not result in a slowdown
2121	   compared with route cache reject entries.
2122	   Note that multicast routers are not affected, because a
2123	   route cache entry is created eventually.
2124	 */
2125	if (ipv4_is_multicast(daddr)) {
2126		struct in_device *in_dev = __in_dev_get_rcu(dev);
2127		int our = 0;
2128		int err = -EINVAL;
2129
2130		if (in_dev)
2131			our = ip_check_mc_rcu(in_dev, daddr, saddr,
2132					      ip_hdr(skb)->protocol);
 
2133
2134		/* check l3 master if no match yet */
2135		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2136			struct in_device *l3_in_dev;
2137
2138			l3_in_dev = __in_dev_get_rcu(skb->dev);
2139			if (l3_in_dev)
2140				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2141						      ip_hdr(skb)->protocol);
2142		}
2143
2144		if (our
2145#ifdef CONFIG_IP_MROUTE
2146			||
2147		    (!ipv4_is_local_multicast(daddr) &&
2148		     IN_DEV_MFORWARD(in_dev))
2149#endif
2150		   ) {
2151			err = ip_route_input_mc(skb, daddr, saddr,
2152						tos, dev, our);
2153		}
2154		return err;
2155	}
2156
2157	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2158}
2159
2160/* called with rcu_read_lock() */
2161static struct rtable *__mkroute_output(const struct fib_result *res,
2162				       const struct flowi4 *fl4, int orig_oif,
2163				       struct net_device *dev_out,
2164				       unsigned int flags)
2165{
2166	struct fib_info *fi = res->fi;
2167	struct fib_nh_exception *fnhe;
2168	struct in_device *in_dev;
2169	u16 type = res->type;
2170	struct rtable *rth;
2171	bool do_cache;
2172
2173	in_dev = __in_dev_get_rcu(dev_out);
2174	if (!in_dev)
2175		return ERR_PTR(-EINVAL);
2176
2177	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2178		if (ipv4_is_loopback(fl4->saddr) &&
2179		    !(dev_out->flags & IFF_LOOPBACK) &&
2180		    !netif_is_l3_master(dev_out))
2181			return ERR_PTR(-EINVAL);
2182
2183	if (ipv4_is_lbcast(fl4->daddr))
2184		type = RTN_BROADCAST;
2185	else if (ipv4_is_multicast(fl4->daddr))
2186		type = RTN_MULTICAST;
2187	else if (ipv4_is_zeronet(fl4->daddr))
2188		return ERR_PTR(-EINVAL);
2189
2190	if (dev_out->flags & IFF_LOOPBACK)
2191		flags |= RTCF_LOCAL;
2192
2193	do_cache = true;
2194	if (type == RTN_BROADCAST) {
2195		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2196		fi = NULL;
2197	} else if (type == RTN_MULTICAST) {
2198		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2199		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2200				     fl4->flowi4_proto))
2201			flags &= ~RTCF_LOCAL;
2202		else
2203			do_cache = false;
2204		/* If a multicast route does not exist, use
2205		 * the default one, but do not use a gateway in this case.
2206		 * Yes, it is a hack.
2207		 */
2208		if (fi && res->prefixlen < 4)
2209			fi = NULL;
2210	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2211		   (orig_oif != dev_out->ifindex)) {
2212		/* For local routes that require a particular output interface
2213		 * we do not want to cache the result.  Caching the result
2214		 * causes incorrect behaviour when there are multiple source
2215		 * addresses on the interface, the end result being that if the
2216		 * intended recipient is waiting on that interface for the
2217		 * packet he won't receive it because it will be delivered on
2218		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2219		 * be set to the loopback interface as well.
2220		 */
2221		do_cache = false;
2222	}
2223
2224	fnhe = NULL;
2225	do_cache &= fi != NULL;
2226	if (fi) {
 
2227		struct rtable __rcu **prth;
2228		struct fib_nh *nh = &FIB_RES_NH(*res);
2229
2230		fnhe = find_exception(nh, fl4->daddr);
2231		if (!do_cache)
2232			goto add;
2233		if (fnhe) {
2234			prth = &fnhe->fnhe_rth_output;
2235		} else {
2236			if (unlikely(fl4->flowi4_flags &
2237				     FLOWI_FLAG_KNOWN_NH &&
2238				     !(nh->nh_gw &&
2239				       nh->nh_scope == RT_SCOPE_LINK))) {
2240				do_cache = false;
2241				goto add;
2242			}
2243			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2244		}
2245		rth = rcu_dereference(*prth);
2246		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2247			return rth;
2248	}
2249
2250add:
2251	rth = rt_dst_alloc(dev_out, flags, type,
2252			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
2253			   IN_DEV_CONF_GET(in_dev, NOXFRM),
2254			   do_cache);
2255	if (!rth)
2256		return ERR_PTR(-ENOBUFS);
2257
2258	rth->rt_iif = orig_oif;
2259
2260	RT_CACHE_STAT_INC(out_slow_tot);
2261
2262	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2263		if (flags & RTCF_LOCAL &&
2264		    !(dev_out->flags & IFF_LOOPBACK)) {
2265			rth->dst.output = ip_mc_output;
2266			RT_CACHE_STAT_INC(out_slow_mc);
2267		}
2268#ifdef CONFIG_IP_MROUTE
2269		if (type == RTN_MULTICAST) {
2270			if (IN_DEV_MFORWARD(in_dev) &&
2271			    !ipv4_is_local_multicast(fl4->daddr)) {
2272				rth->dst.input = ip_mr_input;
2273				rth->dst.output = ip_mc_output;
2274			}
2275		}
2276#endif
2277	}
2278
2279	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2280	lwtunnel_set_redirect(&rth->dst);
2281
2282	return rth;
2283}
2284
2285/*
2286 * Major route resolver routine.
2287 */
2288
2289struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2290					const struct sk_buff *skb)
2291{
2292	__u8 tos = RT_FL_TOS(fl4);
2293	struct fib_result res = {
2294		.type		= RTN_UNSPEC,
2295		.fi		= NULL,
2296		.table		= NULL,
2297		.tclassid	= 0,
2298	};
2299	struct rtable *rth;
2300
2301	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2302	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2303	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2304			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2305
2306	rcu_read_lock();
2307	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2308	rcu_read_unlock();
2309
2310	return rth;
2311}
2312EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
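/* A minimal output-lookup sketch (assumed caller code, not part of this file);
 * most in-kernel users go through the ip_route_output_key() or
 * ip_route_output_flow() wrappers rather than calling the _hash variant
 * directly:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= dst_ip,
 *		.flowi4_oif	= oif,
 *		.flowi4_tos	= RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */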
2313
2314struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2315					    struct fib_result *res,
2316					    const struct sk_buff *skb)
2317{
2318	struct net_device *dev_out = NULL;
2319	int orig_oif = fl4->flowi4_oif;
2320	unsigned int flags = 0;
2321	struct rtable *rth;
2322	int err = -ENETUNREACH;
2323
2324	if (fl4->saddr) {
2325		rth = ERR_PTR(-EINVAL);
2326		if (ipv4_is_multicast(fl4->saddr) ||
2327		    ipv4_is_lbcast(fl4->saddr) ||
2328		    ipv4_is_zeronet(fl4->saddr))
 
2329			goto out;
 
 
 
2330
2331		/* I removed check for oif == dev_out->oif here.
2332		   It was wrong for two reasons:
2333		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2334		      is assigned to multiple interfaces.
2335		   2. Moreover, we are allowed to send packets with saddr
2336		      of another iface. --ANK
2337		 */
2338
2339		if (fl4->flowi4_oif == 0 &&
2340		    (ipv4_is_multicast(fl4->daddr) ||
2341		     ipv4_is_lbcast(fl4->daddr))) {
2342			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2343			dev_out = __ip_dev_find(net, fl4->saddr, false);
2344			if (!dev_out)
2345				goto out;
2346
2347			/* Special hack: the user can direct multicasts
2348			   and limited broadcasts via the necessary interface
2349			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2350			   This hack is not just for fun, it allows
2351			   vic, vat and friends to work.
2352			   They bind the socket to loopback, set ttl to zero
2353			   and expect that it will work.
2354			   From the viewpoint of the routing cache they are broken,
2355			   because we are not allowed to build a multicast path
2356			   with a loopback source addr (look, the routing cache
2357			   cannot know that ttl is zero, so that the packet
2358			   will not leave this host and the route is valid).
2359			   Luckily, this hack is a good workaround.
2360			 */
2361
2362			fl4->flowi4_oif = dev_out->ifindex;
2363			goto make_route;
2364		}
2365
2366		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2367			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2368			if (!__ip_dev_find(net, fl4->saddr, false))
2369				goto out;
2370		}
2371	}
2372
2373
2374	if (fl4->flowi4_oif) {
2375		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2376		rth = ERR_PTR(-ENODEV);
2377		if (!dev_out)
2378			goto out;
2379
2380		/* RACE: Check return value of inet_select_addr instead. */
2381		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2382			rth = ERR_PTR(-ENETUNREACH);
2383			goto out;
2384		}
2385		if (ipv4_is_local_multicast(fl4->daddr) ||
2386		    ipv4_is_lbcast(fl4->daddr) ||
2387		    fl4->flowi4_proto == IPPROTO_IGMP) {
2388			if (!fl4->saddr)
2389				fl4->saddr = inet_select_addr(dev_out, 0,
2390							      RT_SCOPE_LINK);
2391			goto make_route;
2392		}
2393		if (!fl4->saddr) {
2394			if (ipv4_is_multicast(fl4->daddr))
2395				fl4->saddr = inet_select_addr(dev_out, 0,
2396							      fl4->flowi4_scope);
2397			else if (!fl4->daddr)
2398				fl4->saddr = inet_select_addr(dev_out, 0,
2399							      RT_SCOPE_HOST);
2400		}
2401	}
2402
2403	if (!fl4->daddr) {
2404		fl4->daddr = fl4->saddr;
2405		if (!fl4->daddr)
2406			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2407		dev_out = net->loopback_dev;
2408		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2409		res->type = RTN_LOCAL;
2410		flags |= RTCF_LOCAL;
2411		goto make_route;
2412	}
2413
2414	err = fib_lookup(net, fl4, res, 0);
2415	if (err) {
2416		res->fi = NULL;
2417		res->table = NULL;
2418		if (fl4->flowi4_oif &&
2419		    (ipv4_is_multicast(fl4->daddr) ||
2420		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2421			/* Apparently, the routing tables are wrong. Assume
2422			   that the destination is on-link.
2423
2424			   WHY? DW.
2425			   Because we are allowed to send to an iface
2426			   even if it has NO routes and NO assigned
2427			   addresses. When oif is specified, the routing
2428			   tables are looked up with only one purpose:
2429			   to catch whether the destination is gatewayed, rather than
2430			   direct. Moreover, if MSG_DONTROUTE is set,
2431			   we send the packet, ignoring both routing tables
2432			   and ifaddr state. --ANK
2433
2434
2435			   We could do this even if oif is unknown,
2436			   as IPv6 likely does, but we do not.
2437			 */
2438
2439			if (fl4->saddr == 0)
2440				fl4->saddr = inet_select_addr(dev_out, 0,
2441							      RT_SCOPE_LINK);
2442			res->type = RTN_UNICAST;
2443			goto make_route;
2444		}
2445		rth = ERR_PTR(err);
2446		goto out;
2447	}
2448
2449	if (res->type == RTN_LOCAL) {
2450		if (!fl4->saddr) {
2451			if (res->fi->fib_prefsrc)
2452				fl4->saddr = res->fi->fib_prefsrc;
2453			else
2454				fl4->saddr = fl4->daddr;
2455		}
2456
2457		/* L3 master device is the loopback for that domain */
2458		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2459			net->loopback_dev;
2460
2461		/* make sure orig_oif points to fib result device even
2462		 * though packet rx/tx happens over loopback or l3mdev
2463		 */
2464		orig_oif = FIB_RES_OIF(*res);
2465
2466		fl4->flowi4_oif = dev_out->ifindex;
2467		flags |= RTCF_LOCAL;
2468		goto make_route;
2469	}
2470
2471	fib_select_path(net, res, fl4, skb);
2472
2473	dev_out = FIB_RES_DEV(*res);
2474	fl4->flowi4_oif = dev_out->ifindex;
2475
2476
2477make_route:
2478	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2479
2480out:
2481	return rth;
2482}
2483
2484static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2485{
2486	return NULL;
2487}
2488
2489static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2490{
2491	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2492
2493	return mtu ? : dst->dev->mtu;
2494}
2495
2496static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2497					  struct sk_buff *skb, u32 mtu)
2498{
2499}
2500
2501static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2502				       struct sk_buff *skb)
2503{
2504}
2505
2506static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2507					  unsigned long old)
2508{
2509	return NULL;
2510}
2511
2512static struct dst_ops ipv4_dst_blackhole_ops = {
2513	.family			=	AF_INET,
2514	.check			=	ipv4_blackhole_dst_check,
2515	.mtu			=	ipv4_blackhole_mtu,
2516	.default_advmss		=	ipv4_default_advmss,
2517	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
2518	.redirect		=	ipv4_rt_blackhole_redirect,
2519	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
2520	.neigh_lookup		=	ipv4_neigh_lookup,
2521};
2522
2523struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2524{
2525	struct rtable *ort = (struct rtable *) dst_orig;
2526	struct rtable *rt;
2527
2528	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2529	if (rt) {
2530		struct dst_entry *new = &rt->dst;
2531
2532		new->__use = 1;
2533		new->input = dst_discard;
2534		new->output = dst_discard_out;
2535
2536		new->dev = net->loopback_dev;
2537		if (new->dev)
2538			dev_hold(new->dev);
2539
2540		rt->rt_is_input = ort->rt_is_input;
2541		rt->rt_iif = ort->rt_iif;
2542		rt->rt_pmtu = ort->rt_pmtu;
2543		rt->rt_mtu_locked = ort->rt_mtu_locked;
2544
2545		rt->rt_genid = rt_genid_ipv4(net);
2546		rt->rt_flags = ort->rt_flags;
2547		rt->rt_type = ort->rt_type;
2548		rt->rt_gateway = ort->rt_gateway;
2549		rt->rt_uses_gateway = ort->rt_uses_gateway;
2550
2551		INIT_LIST_HEAD(&rt->rt_uncached);
 
 
 
2552	}
2553
2554	dst_release(dst_orig);
2555
2556	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2557}
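/* ipv4_blackhole_route() is used by the xfrm code (via the afinfo
 * ->blackhole_route hook): when a packet needs a route but its IPsec state is
 * not yet resolved, the original dst is replaced with this copy, whose
 * input/output hooks silently discard traffic (dst_discard / dst_discard_out)
 * while keeping enough metadata for the caller to hold a reference.
 */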
2558
2559struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2560				    const struct sock *sk)
2561{
2562	struct rtable *rt = __ip_route_output_key(net, flp4);
2563
2564	if (IS_ERR(rt))
2565		return rt;
2566
2567	if (flp4->flowi4_proto)
 
2568		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2569							flowi4_to_flowi(flp4),
2570							sk, 0);
 
2571
2572	return rt;
2573}
2574EXPORT_SYMBOL_GPL(ip_route_output_flow);
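/* ip_route_output_flow() is the variant most socket-level callers want: on top
 * of the plain key lookup it runs the result through xfrm_lookup_route(), so an
 * IPsec policy may replace the dst with a transformed (or blackhole) one.  When
 * flowi4_proto is zero, the xfrm step is skipped, matching the check above.
 */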
2575
2576/* called with rcu_read_lock held */
2577static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2578			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2579			u32 seq)
 
2580{
2581	struct rtable *rt = skb_rtable(skb);
2582	struct rtmsg *r;
2583	struct nlmsghdr *nlh;
2584	unsigned long expires = 0;
2585	u32 error;
2586	u32 metrics[RTAX_MAX];
2587
2588	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2589	if (!nlh)
2590		return -EMSGSIZE;
2591
2592	r = nlmsg_data(nlh);
2593	r->rtm_family	 = AF_INET;
2594	r->rtm_dst_len	= 32;
2595	r->rtm_src_len	= 0;
2596	r->rtm_tos	= fl4->flowi4_tos;
2597	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2598	if (nla_put_u32(skb, RTA_TABLE, table_id))
2599		goto nla_put_failure;
2600	r->rtm_type	= rt->rt_type;
2601	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2602	r->rtm_protocol = RTPROT_UNSPEC;
2603	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604	if (rt->rt_flags & RTCF_NOTIFY)
2605		r->rtm_flags |= RTM_F_NOTIFY;
2606	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2607		r->rtm_flags |= RTCF_DOREDIRECT;
2608
2609	if (nla_put_in_addr(skb, RTA_DST, dst))
2610		goto nla_put_failure;
2611	if (src) {
2612		r->rtm_src_len = 32;
2613		if (nla_put_in_addr(skb, RTA_SRC, src))
2614			goto nla_put_failure;
2615	}
2616	if (rt->dst.dev &&
2617	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2618		goto nla_put_failure;
 
 
 
2619#ifdef CONFIG_IP_ROUTE_CLASSID
2620	if (rt->dst.tclassid &&
2621	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2622		goto nla_put_failure;
2623#endif
2624	if (!rt_is_input_route(rt) &&
2625	    fl4->saddr != src) {
2626		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2627			goto nla_put_failure;
2628	}
2629	if (rt->rt_uses_gateway &&
2630	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2631		goto nla_put_failure;
2632
2633	expires = rt->dst.expires;
2634	if (expires) {
2635		unsigned long now = jiffies;
2636
2637		if (time_before(now, expires))
2638			expires -= now;
2639		else
2640			expires = 0;
2641	}
2642
2643	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2644	if (rt->rt_pmtu && expires)
2645		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2646	if (rt->rt_mtu_locked && expires)
2647		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2648	if (rtnetlink_put_metrics(skb, metrics) < 0)
2649		goto nla_put_failure;
2650
2651	if (fl4->flowi4_mark &&
2652	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2653		goto nla_put_failure;
2654
2655	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2656	    nla_put_u32(skb, RTA_UID,
2657			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2658		goto nla_put_failure;
2659
2660	error = rt->dst.error;
 
 
 
 
2661
2662	if (rt_is_input_route(rt)) {
2663#ifdef CONFIG_IP_MROUTE
2664		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2665		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2666			int err = ipmr_get_route(net, skb,
2667						 fl4->saddr, fl4->daddr,
2668						 r, portid);
2669
2670			if (err <= 0) {
2671				if (err == 0)
2672					return 0;
2673				goto nla_put_failure;
2674			}
2675		} else
 
2676#endif
2677			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2678				goto nla_put_failure;
 
2679	}
2680
 
 
2681	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2682		goto nla_put_failure;
2683
2684	nlmsg_end(skb, nlh);
2685	return 0;
2686
2687nla_put_failure:
2688	nlmsg_cancel(skb, nlh);
2689	return -EMSGSIZE;
2690}
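/* rt_fill_info() renders one cached route as an RTM_NEWROUTE message:
 * destination/source, output interface, gateway, metrics (with the PMTU and
 * its lock folded in while the exception is valid), mark/uid, the multicast
 * input path via ipmr_get_route() where applicable, and finally the cache info
 * (expiry and error).  This is the payload userspace sees for route queries.
 */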
2691
2692static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2693			     struct netlink_ext_ack *extack)
2694{
2695	struct net *net = sock_net(in_skb->sk);
2696	struct rtmsg *rtm;
2697	struct nlattr *tb[RTA_MAX+1];
 
 
2698	struct fib_result res = {};
 
2699	struct rtable *rt = NULL;
2700	struct flowi4 fl4;
 
 
2701	__be32 dst = 0;
2702	__be32 src = 0;
 
2703	u32 iif;
2704	int err;
2705	int mark;
2706	struct sk_buff *skb;
2707	u32 table_id = RT_TABLE_MAIN;
2708	kuid_t uid;
2709
2710	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2711			  extack);
2712	if (err < 0)
2713		goto errout;
2714
2715	rtm = nlmsg_data(nlh);
2716
2717	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2718	if (!skb) {
2719		err = -ENOBUFS;
2720		goto errout;
2721	}
2722
2723	/* Reserve room for dummy headers; this skb can pass
2724	   through a good chunk of the routing engine.
2725	 */
2726	skb_reset_mac_header(skb);
2727	skb_reset_network_header(skb);
2728
2729	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2730	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2731	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2732	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2733	if (tb[RTA_UID])
2734		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2735	else
2736		uid = (iif ? INVALID_UID : current_uid());
2737
2738	/* Bugfix: need to give ip_route_input enough of an IP header to
2739	 * not gag.
2740	 */
2741	ip_hdr(skb)->protocol = IPPROTO_UDP;
2742	ip_hdr(skb)->saddr = src;
2743	ip_hdr(skb)->daddr = dst;
 
 
 
2744
2745	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
 
 
 
 
 
2746
2747	memset(&fl4, 0, sizeof(fl4));
2748	fl4.daddr = dst;
2749	fl4.saddr = src;
2750	fl4.flowi4_tos = rtm->rtm_tos;
2751	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2752	fl4.flowi4_mark = mark;
2753	fl4.flowi4_uid = uid;
 
 
 
 
 
2754
2755	rcu_read_lock();
2756
2757	if (iif) {
2758		struct net_device *dev;
2759
2760		dev = dev_get_by_index_rcu(net, iif);
2761		if (!dev) {
2762			err = -ENODEV;
2763			goto errout_free;
2764		}
2765
2766		skb->protocol	= htons(ETH_P_IP);
2767		skb->dev	= dev;
2768		skb->mark	= mark;
2769		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2770					 dev, &res);
 
2771
2772		rt = skb_rtable(skb);
2773		if (err == 0 && rt->dst.error)
2774			err = -rt->dst.error;
2775	} else {
2776		fl4.flowi4_iif = LOOPBACK_IFINDEX;
 
2777		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2778		err = 0;
2779		if (IS_ERR(rt))
2780			err = PTR_ERR(rt);
2781		else
2782			skb_dst_set(skb, &rt->dst);
2783	}
2784
2785	if (err)
2786		goto errout_free;
2787
2788	if (rtm->rtm_flags & RTM_F_NOTIFY)
2789		rt->rt_flags |= RTCF_NOTIFY;
2790
2791	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2792		table_id = res.table ? res.table->tb_id : 0;
2793
 
 
 
 
 
 
2794	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
 
 
2795		if (!res.fi) {
2796			err = fib_props[res.type].error;
2797			if (!err)
2798				err = -EHOSTUNREACH;
2799			goto errout_free;
2800		}
2801		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2802				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2803				    rt->rt_type, res.prefix, res.prefixlen,
2804				    fl4.flowi4_tos, res.fi, 0);
2805	} else {
2806		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2807				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
 
2808	}
2809	if (err < 0)
2810		goto errout_free;
2811
2812	rcu_read_unlock();
2813
2814	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2815errout:
2816	return err;
2817
2818errout_free:
 
 
2819	rcu_read_unlock();
2820	kfree_skb(skb);
2821	goto errout;
2822}
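/* This doit handler backs RTM_GETROUTE requests, e.g. "ip route get <dst>
 * from <src> iif <dev>" (placeholders, not literal values): RTA_DST,
 * RTA_SRC and RTA_IIF select between an input-path lookup on a dummy skb and
 * a plain output lookup, while RTM_F_FIB_MATCH asks for the matching FIB
 * entry instead of the resolved route.
 */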
2823
2824void ip_rt_multicast_event(struct in_device *in_dev)
2825{
2826	rt_cache_flush(dev_net(in_dev->dev));
2827}
2828
2829#ifdef CONFIG_SYSCTL
2830static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2831static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
2832static int ip_rt_gc_elasticity __read_mostly	= 8;
2833static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
2834
2835static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2836					void __user *buffer,
2837					size_t *lenp, loff_t *ppos)
2838{
2839	struct net *net = (struct net *)__ctl->extra1;
2840
2841	if (write) {
2842		rt_cache_flush(net);
2843		fnhe_genid_bump(net);
2844		return 0;
2845	}
2846
2847	return -EINVAL;
2848}
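/* Writing any value to /proc/sys/net/ipv4/route/flush (registered below with
 * mode 0200, i.e. write-only) invalidates every cached route and nexthop
 * exception by bumping the per-netns generation counters; reads fail with
 * -EINVAL.
 */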
2849
2850static struct ctl_table ipv4_route_table[] = {
2851	{
2852		.procname	= "gc_thresh",
2853		.data		= &ipv4_dst_ops.gc_thresh,
2854		.maxlen		= sizeof(int),
2855		.mode		= 0644,
2856		.proc_handler	= proc_dointvec,
2857	},
2858	{
2859		.procname	= "max_size",
2860		.data		= &ip_rt_max_size,
2861		.maxlen		= sizeof(int),
2862		.mode		= 0644,
2863		.proc_handler	= proc_dointvec,
2864	},
2865	{
2866		/*  Deprecated. Use gc_min_interval_ms */
2867
2868		.procname	= "gc_min_interval",
2869		.data		= &ip_rt_gc_min_interval,
2870		.maxlen		= sizeof(int),
2871		.mode		= 0644,
2872		.proc_handler	= proc_dointvec_jiffies,
2873	},
2874	{
2875		.procname	= "gc_min_interval_ms",
2876		.data		= &ip_rt_gc_min_interval,
2877		.maxlen		= sizeof(int),
2878		.mode		= 0644,
2879		.proc_handler	= proc_dointvec_ms_jiffies,
2880	},
2881	{
2882		.procname	= "gc_timeout",
2883		.data		= &ip_rt_gc_timeout,
2884		.maxlen		= sizeof(int),
2885		.mode		= 0644,
2886		.proc_handler	= proc_dointvec_jiffies,
2887	},
2888	{
2889		.procname	= "gc_interval",
2890		.data		= &ip_rt_gc_interval,
2891		.maxlen		= sizeof(int),
2892		.mode		= 0644,
2893		.proc_handler	= proc_dointvec_jiffies,
2894	},
2895	{
2896		.procname	= "redirect_load",
2897		.data		= &ip_rt_redirect_load,
2898		.maxlen		= sizeof(int),
2899		.mode		= 0644,
2900		.proc_handler	= proc_dointvec,
2901	},
2902	{
2903		.procname	= "redirect_number",
2904		.data		= &ip_rt_redirect_number,
2905		.maxlen		= sizeof(int),
2906		.mode		= 0644,
2907		.proc_handler	= proc_dointvec,
2908	},
2909	{
2910		.procname	= "redirect_silence",
2911		.data		= &ip_rt_redirect_silence,
2912		.maxlen		= sizeof(int),
2913		.mode		= 0644,
2914		.proc_handler	= proc_dointvec,
2915	},
2916	{
2917		.procname	= "error_cost",
2918		.data		= &ip_rt_error_cost,
2919		.maxlen		= sizeof(int),
2920		.mode		= 0644,
2921		.proc_handler	= proc_dointvec,
2922	},
2923	{
2924		.procname	= "error_burst",
2925		.data		= &ip_rt_error_burst,
2926		.maxlen		= sizeof(int),
2927		.mode		= 0644,
2928		.proc_handler	= proc_dointvec,
2929	},
2930	{
2931		.procname	= "gc_elasticity",
2932		.data		= &ip_rt_gc_elasticity,
2933		.maxlen		= sizeof(int),
2934		.mode		= 0644,
2935		.proc_handler	= proc_dointvec,
2936	},
 
 
 
 
 
 
2937	{
2938		.procname	= "mtu_expires",
2939		.data		= &ip_rt_mtu_expires,
2940		.maxlen		= sizeof(int),
2941		.mode		= 0644,
2942		.proc_handler	= proc_dointvec_jiffies,
2943	},
2944	{
2945		.procname	= "min_pmtu",
2946		.data		= &ip_rt_min_pmtu,
2947		.maxlen		= sizeof(int),
2948		.mode		= 0644,
2949		.proc_handler	= proc_dointvec_minmax,
2950		.extra1		= &ip_min_valid_pmtu,
2951	},
2952	{
2953		.procname	= "min_adv_mss",
2954		.data		= &ip_rt_min_advmss,
2955		.maxlen		= sizeof(int),
2956		.mode		= 0644,
2957		.proc_handler	= proc_dointvec,
2958	},
2959	{ }
2960};
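/* The table above appears under /proc/sys/net/ipv4/route/.  Entries using
 * proc_dointvec_jiffies or proc_dointvec_ms_jiffies store jiffies internally
 * but are read and written in seconds (milliseconds for gc_min_interval_ms);
 * the remaining entries are plain integers.
 */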
2961
2962static struct ctl_table ipv4_route_flush_table[] = {
2963	{
2964		.procname	= "flush",
2965		.maxlen		= sizeof(int),
2966		.mode		= 0200,
2967		.proc_handler	= ipv4_sysctl_rtcache_flush,
 
2968	},
2969	{ },
2970};
2971
2972static __net_init int sysctl_route_net_init(struct net *net)
2973{
2974	struct ctl_table *tbl;
 
2975
2976	tbl = ipv4_route_flush_table;
2977	if (!net_eq(net, &init_net)) {
2978		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
 
 
2979		if (!tbl)
2980			goto err_dup;
2981
2982		/* Don't export sysctls to unprivileged users */
2983		if (net->user_ns != &init_user_ns)
2984			tbl[0].procname = NULL;
 
 
 
 
 
 
 
 
 
 
2985	}
2986	tbl[0].extra1 = net;
2987
2988	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
 
2989	if (!net->ipv4.route_hdr)
2990		goto err_reg;
2991	return 0;
2992
2993err_reg:
2994	if (tbl != ipv4_route_flush_table)
2995		kfree(tbl);
2996err_dup:
2997	return -ENOMEM;
2998}
2999
3000static __net_exit void sysctl_route_net_exit(struct net *net)
3001{
3002	struct ctl_table *tbl;
3003
3004	tbl = net->ipv4.route_hdr->ctl_table_arg;
3005	unregister_net_sysctl_table(net->ipv4.route_hdr);
3006	BUG_ON(tbl == ipv4_route_flush_table);
3007	kfree(tbl);
3008}
3009
3010static __net_initdata struct pernet_operations sysctl_route_ops = {
3011	.init = sysctl_route_net_init,
3012	.exit = sysctl_route_net_exit,
3013};
3014#endif
3015
3016static __net_init int rt_genid_init(struct net *net)
3017{
3018	atomic_set(&net->ipv4.rt_genid, 0);
3019	atomic_set(&net->fnhe_genid, 0);
3020	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3021	return 0;
3022}
3023
3024static __net_initdata struct pernet_operations rt_genid_ops = {
3025	.init = rt_genid_init,
3026};
3027
3028static int __net_init ipv4_inetpeer_init(struct net *net)
3029{
3030	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3031
3032	if (!bp)
3033		return -ENOMEM;
3034	inet_peer_base_init(bp);
3035	net->ipv4.peers = bp;
3036	return 0;
3037}
3038
3039static void __net_exit ipv4_inetpeer_exit(struct net *net)
3040{
3041	struct inet_peer_base *bp = net->ipv4.peers;
3042
3043	net->ipv4.peers = NULL;
3044	inetpeer_invalidate_tree(bp);
3045	kfree(bp);
3046}
3047
3048static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3049	.init	=	ipv4_inetpeer_init,
3050	.exit	=	ipv4_inetpeer_exit,
3051};
3052
3053#ifdef CONFIG_IP_ROUTE_CLASSID
3054struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3055#endif /* CONFIG_IP_ROUTE_CLASSID */
3056
3057int __init ip_rt_init(void)
3058{
 
3059	int cpu;
3060
3061	ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3062	if (!ip_idents)
3063		panic("IP: failed to allocate ip_idents\n");
3064
3065	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3066
3067	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3068	if (!ip_tstamps)
3069		panic("IP: failed to allocate ip_tstamps\n");
3070
3071	for_each_possible_cpu(cpu) {
3072		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3073
3074		INIT_LIST_HEAD(&ul->head);
 
3075		spin_lock_init(&ul->lock);
3076	}
3077#ifdef CONFIG_IP_ROUTE_CLASSID
3078	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3079	if (!ip_rt_acct)
3080		panic("IP: failed to allocate ip_rt_acct\n");
3081#endif
3082
3083	ipv4_dst_ops.kmem_cachep =
3084		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3085				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3086
3087	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3088
3089	if (dst_entries_init(&ipv4_dst_ops) < 0)
3090		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3091
3092	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3093		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3094
3095	ipv4_dst_ops.gc_thresh = ~0;
3096	ip_rt_max_size = INT_MAX;
3097
3098	devinet_init();
3099	ip_fib_init();
3100
3101	if (ip_rt_proc_init())
3102		pr_err("Unable to create route proc files\n");
3103#ifdef CONFIG_XFRM
3104	xfrm_init();
3105	xfrm4_init();
3106#endif
3107	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3108		      RTNL_FLAG_DOIT_UNLOCKED);
3109
3110#ifdef CONFIG_SYSCTL
3111	register_pernet_subsys(&sysctl_route_ops);
3112#endif
 
3113	register_pernet_subsys(&rt_genid_ops);
3114	register_pernet_subsys(&ipv4_inetpeer_ops);
3115	return 0;
3116}
3117
3118#ifdef CONFIG_SYSCTL
3119/*
3120 * We really need to sanitize the damn ipv4 init order, then all
3121 * this nonsense will go away.
3122 */
3123void __init ip_static_sysctl_init(void)
3124{
3125	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3126}
3127#endif
v6.8
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET		An implementation of the TCP/IP protocol suite for the LINUX
   4 *		operating system.  INET is implemented using the  BSD Socket
   5 *		interface as the means of communication with the user level.
   6 *
   7 *		ROUTE - implementation of the IP router.
   8 *
   9 * Authors:	Ross Biro
  10 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
  12 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  13 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  14 *
  15 * Fixes:
  16 *		Alan Cox	:	Verify area fixes.
  17 *		Alan Cox	:	cli() protects routing changes
  18 *		Rui Oliveira	:	ICMP routing table updates
  19 *		(rco@di.uminho.pt)	Routing table insertion and update
  20 *		Linus Torvalds	:	Rewrote bits to be sensible
  21 *		Alan Cox	:	Added BSD route gw semantics
  22 *		Alan Cox	:	Super /proc >4K
  23 *		Alan Cox	:	MTU in route table
  24 *		Alan Cox	:	MSS actually. Also added the window
  25 *					clamper.
  26 *		Sam Lantinga	:	Fixed route matching in rt_del()
  27 *		Alan Cox	:	Routing cache support.
  28 *		Alan Cox	:	Removed compatibility cruft.
  29 *		Alan Cox	:	RTF_REJECT support.
  30 *		Alan Cox	:	TCP irtt support.
  31 *		Jonathan Naylor	:	Added Metric support.
  32 *	Miquel van Smoorenburg	:	BSD API fixes.
  33 *	Miquel van Smoorenburg	:	Metrics.
  34 *		Alan Cox	:	Use __u32 properly
  35 *		Alan Cox	:	Aligned routing errors more closely with BSD
  36 *					our system is still very different.
  37 *		Alan Cox	:	Faster /proc handling
  38 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
  39 *					routing caches and better behaviour.
  40 *
  41 *		Olaf Erb	:	irtt wasn't being copied right.
  42 *		Bjorn Ekwall	:	Kerneld route support.
  43 *		Alan Cox	:	Multicast fixed (I hope)
  44 *		Pavel Krauz	:	Limited broadcast fixed
  45 *		Mike McLagan	:	Routing by source
  46 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
  47 *					route.c and rewritten from scratch.
  48 *		Andi Kleen	:	Load-limit warning messages.
  49 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
  50 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
  51 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
  52 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
  53 *		Marc Boucher	:	routing by fwmark
  54 *	Robert Olsson		:	Added rt_cache statistics
  55 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  56 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
  57 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
  58 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 
 
 
 
 
  59 */
  60
  61#define pr_fmt(fmt) "IPv4: " fmt
  62
  63#include <linux/module.h>
 
  64#include <linux/bitops.h>
 
  65#include <linux/kernel.h>
  66#include <linux/mm.h>
  67#include <linux/memblock.h>
  68#include <linux/socket.h>
 
  69#include <linux/errno.h>
  70#include <linux/in.h>
  71#include <linux/inet.h>
  72#include <linux/netdevice.h>
  73#include <linux/proc_fs.h>
  74#include <linux/init.h>
  75#include <linux/skbuff.h>
  76#include <linux/inetdevice.h>
  77#include <linux/igmp.h>
  78#include <linux/pkt_sched.h>
  79#include <linux/mroute.h>
  80#include <linux/netfilter_ipv4.h>
  81#include <linux/random.h>
  82#include <linux/rcupdate.h>
 
  83#include <linux/slab.h>
  84#include <linux/jhash.h>
  85#include <net/dst.h>
  86#include <net/dst_metadata.h>
  87#include <net/inet_dscp.h>
  88#include <net/net_namespace.h>
 
  89#include <net/ip.h>
  90#include <net/route.h>
  91#include <net/inetpeer.h>
  92#include <net/sock.h>
  93#include <net/ip_fib.h>
  94#include <net/nexthop.h>
  95#include <net/tcp.h>
  96#include <net/icmp.h>
  97#include <net/xfrm.h>
  98#include <net/lwtunnel.h>
  99#include <net/netevent.h>
 100#include <net/rtnetlink.h>
 101#ifdef CONFIG_SYSCTL
 102#include <linux/sysctl.h>
 103#endif
 104#include <net/secure_seq.h>
 105#include <net/ip_tunnels.h>
 
 106
 107#include "fib_lookup.h"
 108
 109#define RT_FL_TOS(oldflp4) \
 110	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 111
 112#define RT_GC_TIMEOUT (300*HZ)
 113
 114#define DEFAULT_MIN_PMTU (512 + 20 + 20)
 115#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
 116#define DEFAULT_MIN_ADVMSS 256
 117static int ip_rt_max_size;
 118static int ip_rt_redirect_number __read_mostly	= 9;
 119static int ip_rt_redirect_load __read_mostly	= HZ / 50;
 120static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
 121static int ip_rt_error_cost __read_mostly	= HZ;
 122static int ip_rt_error_burst __read_mostly	= 5 * HZ;
 
 
 
 123
 124static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
 125
 126/*
 127 *	Interface to generic destination cache.
 128 */
 129
 130INDIRECT_CALLABLE_SCOPE
 131struct dst_entry	*ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 132static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
 133INDIRECT_CALLABLE_SCOPE
 134unsigned int		ipv4_mtu(const struct dst_entry *dst);
 135static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 136static void		 ipv4_link_failure(struct sk_buff *skb);
 137static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 138					   struct sk_buff *skb, u32 mtu,
 139					   bool confirm_neigh);
 140static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 141					struct sk_buff *skb);
 142static void		ipv4_dst_destroy(struct dst_entry *dst);
 143
 144static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 145{
 146	WARN_ON(1);
 147	return NULL;
 148}
 149
 150static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 151					   struct sk_buff *skb,
 152					   const void *daddr);
 153static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 154
 155static struct dst_ops ipv4_dst_ops = {
 156	.family =		AF_INET,
 157	.check =		ipv4_dst_check,
 158	.default_advmss =	ipv4_default_advmss,
 159	.mtu =			ipv4_mtu,
 160	.cow_metrics =		ipv4_cow_metrics,
 161	.destroy =		ipv4_dst_destroy,
 162	.negative_advice =	ipv4_negative_advice,
 163	.link_failure =		ipv4_link_failure,
 164	.update_pmtu =		ip_rt_update_pmtu,
 165	.redirect =		ip_do_redirect,
 166	.local_out =		__ip_local_out,
 167	.neigh_lookup =		ipv4_neigh_lookup,
 168	.confirm_neigh =	ipv4_confirm_neigh,
 169};
 170
 171#define ECN_OR_COST(class)	TC_PRIO_##class
 172
 173const __u8 ip_tos2prio[16] = {
 174	TC_PRIO_BESTEFFORT,
 175	ECN_OR_COST(BESTEFFORT),
 176	TC_PRIO_BESTEFFORT,
 177	ECN_OR_COST(BESTEFFORT),
 178	TC_PRIO_BULK,
 179	ECN_OR_COST(BULK),
 180	TC_PRIO_BULK,
 181	ECN_OR_COST(BULK),
 182	TC_PRIO_INTERACTIVE,
 183	ECN_OR_COST(INTERACTIVE),
 184	TC_PRIO_INTERACTIVE,
 185	ECN_OR_COST(INTERACTIVE),
 186	TC_PRIO_INTERACTIVE_BULK,
 187	ECN_OR_COST(INTERACTIVE_BULK),
 188	TC_PRIO_INTERACTIVE_BULK,
 189	ECN_OR_COST(INTERACTIVE_BULK)
 190};
 191EXPORT_SYMBOL(ip_tos2prio);
 192
 193static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 194#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 195
 196#ifdef CONFIG_PROC_FS
 197static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 198{
 199	if (*pos)
 200		return NULL;
 201	return SEQ_START_TOKEN;
 202}
 203
 204static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 205{
 206	++*pos;
 207	return NULL;
 208}
 209
 210static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 211{
 212}
 213
 214static int rt_cache_seq_show(struct seq_file *seq, void *v)
 215{
 216	if (v == SEQ_START_TOKEN)
 217		seq_printf(seq, "%-127s\n",
 218			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 219			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 220			   "HHUptod\tSpecDst");
 221	return 0;
 222}
 223
 224static const struct seq_operations rt_cache_seq_ops = {
 225	.start  = rt_cache_seq_start,
 226	.next   = rt_cache_seq_next,
 227	.stop   = rt_cache_seq_stop,
 228	.show   = rt_cache_seq_show,
 229};
 230
 231static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 232{
 233	int cpu;
 234
 235	if (*pos == 0)
 236		return SEQ_START_TOKEN;
 237
 238	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 239		if (!cpu_possible(cpu))
 240			continue;
 241		*pos = cpu+1;
 242		return &per_cpu(rt_cache_stat, cpu);
 243	}
 244	return NULL;
 245}
 246
 247static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 248{
 249	int cpu;
 250
 251	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 252		if (!cpu_possible(cpu))
 253			continue;
 254		*pos = cpu+1;
 255		return &per_cpu(rt_cache_stat, cpu);
 256	}
 257	(*pos)++;
 258	return NULL;
 259
 260}
 261
 262static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 263{
 264
 265}
 266
 267static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 268{
 269	struct rt_cache_stat *st = v;
 270
 271	if (v == SEQ_START_TOKEN) {
 272		seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 273		return 0;
 274	}
 275
 276	seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
 277			"%08x       %08x %08x     %08x    %08x %08x   "
 278			"%08x     %08x        %08x        %08x\n",
 279		   dst_entries_get_slow(&ipv4_dst_ops),
 280		   0, /* st->in_hit */
 281		   st->in_slow_tot,
 282		   st->in_slow_mc,
 283		   st->in_no_route,
 284		   st->in_brd,
 285		   st->in_martian_dst,
 286		   st->in_martian_src,
 287
 288		   0, /* st->out_hit */
 289		   st->out_slow_tot,
 290		   st->out_slow_mc,
 291
 292		   0, /* st->gc_total */
 293		   0, /* st->gc_ignored */
 294		   0, /* st->gc_goal_miss */
 295		   0, /* st->gc_dst_overflow */
 296		   0, /* st->in_hlist_search */
 297		   0  /* st->out_hlist_search */
 298		);
 299	return 0;
 300}
 301
 302static const struct seq_operations rt_cpu_seq_ops = {
 303	.start  = rt_cpu_seq_start,
 304	.next   = rt_cpu_seq_next,
 305	.stop   = rt_cpu_seq_stop,
 306	.show   = rt_cpu_seq_show,
 307};
 308
 309#ifdef CONFIG_IP_ROUTE_CLASSID
 310static int rt_acct_proc_show(struct seq_file *m, void *v)
 311{
 312	struct ip_rt_acct *dst, *src;
 313	unsigned int i, j;
 314
 315	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 316	if (!dst)
 317		return -ENOMEM;
 318
 319	for_each_possible_cpu(i) {
 320		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 321		for (j = 0; j < 256; j++) {
 322			dst[j].o_bytes   += src[j].o_bytes;
 323			dst[j].o_packets += src[j].o_packets;
 324			dst[j].i_bytes   += src[j].i_bytes;
 325			dst[j].i_packets += src[j].i_packets;
 326		}
 327	}
 328
 329	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 330	kfree(dst);
 331	return 0;
 332}
 333#endif
 334
 335static int __net_init ip_rt_do_proc_init(struct net *net)
 336{
 337	struct proc_dir_entry *pde;
 338
 339	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
 340			      &rt_cache_seq_ops);
 341	if (!pde)
 342		goto err1;
 343
 344	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
 345			      &rt_cpu_seq_ops);
 346	if (!pde)
 347		goto err2;
 348
 349#ifdef CONFIG_IP_ROUTE_CLASSID
 350	pde = proc_create_single("rt_acct", 0, net->proc_net,
 351			rt_acct_proc_show);
 352	if (!pde)
 353		goto err3;
 354#endif
 355	return 0;
 356
 357#ifdef CONFIG_IP_ROUTE_CLASSID
 358err3:
 359	remove_proc_entry("rt_cache", net->proc_net_stat);
 360#endif
 361err2:
 362	remove_proc_entry("rt_cache", net->proc_net);
 363err1:
 364	return -ENOMEM;
 365}
 366
 367static void __net_exit ip_rt_do_proc_exit(struct net *net)
 368{
 369	remove_proc_entry("rt_cache", net->proc_net_stat);
 370	remove_proc_entry("rt_cache", net->proc_net);
 371#ifdef CONFIG_IP_ROUTE_CLASSID
 372	remove_proc_entry("rt_acct", net->proc_net);
 373#endif
 374}
 375
 376static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 377	.init = ip_rt_do_proc_init,
 378	.exit = ip_rt_do_proc_exit,
 379};
 380
 381static int __init ip_rt_proc_init(void)
 382{
 383	return register_pernet_subsys(&ip_rt_proc_ops);
 384}
 385
 386#else
 387static inline int ip_rt_proc_init(void)
 388{
 389	return 0;
 390}
 391#endif /* CONFIG_PROC_FS */
 392
 393static inline bool rt_is_expired(const struct rtable *rth)
 394{
 395	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 396}
 397
 398void rt_cache_flush(struct net *net)
 399{
 400	rt_genid_bump_ipv4(net);
 401}
 402
 403static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 404					   struct sk_buff *skb,
 405					   const void *daddr)
 406{
 407	const struct rtable *rt = container_of(dst, struct rtable, dst);
 408	struct net_device *dev = dst->dev;
 
 
 409	struct neighbour *n;
 410
 411	rcu_read_lock();
 412
 413	if (likely(rt->rt_gw_family == AF_INET)) {
 414		n = ip_neigh_gw4(dev, rt->rt_gw4);
 415	} else if (rt->rt_gw_family == AF_INET6) {
 416		n = ip_neigh_gw6(dev, &rt->rt_gw6);
 417        } else {
 418		__be32 pkey;
 419
 420		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
 421		n = ip_neigh_gw4(dev, pkey);
 422	}
 423
 424	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
 425		n = NULL;
 426
 427	rcu_read_unlock();
 428
 429	return n;
 430}
 431
 432static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 433{
 434	const struct rtable *rt = container_of(dst, struct rtable, dst);
 435	struct net_device *dev = dst->dev;
 436	const __be32 *pkey = daddr;
 
 437
 438	if (rt->rt_gw_family == AF_INET) {
 439		pkey = (const __be32 *)&rt->rt_gw4;
 440	} else if (rt->rt_gw_family == AF_INET6) {
 441		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
 442	} else if (!daddr ||
 443		 (rt->rt_flags &
 444		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
 445		return;
 446	}
 447	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 448}
 449
 450/* Hash tables of size 2048..262144 depending on RAM size.
 451 * Each bucket uses 8 bytes.
 452 */
 453static u32 ip_idents_mask __read_mostly;
 454static atomic_t *ip_idents __read_mostly;
 455static u32 *ip_tstamps __read_mostly;
 456
 457/* In order to protect privacy, we add a perturbation to identifiers
 458 * if one generator is seldom used. This makes it hard for an attacker
 459 * to infer how many packets were sent between two points in time.
 460 */
 461static u32 ip_idents_reserve(u32 hash, int segs)
 462{
 463	u32 bucket, old, now = (u32)jiffies;
 464	atomic_t *p_id;
 465	u32 *p_tstamp;
 466	u32 delta = 0;
 467
 468	bucket = hash & ip_idents_mask;
 469	p_tstamp = ip_tstamps + bucket;
 470	p_id = ip_idents + bucket;
 471	old = READ_ONCE(*p_tstamp);
 472
 473	if (old != now && cmpxchg(p_tstamp, old, now) == old)
 474		delta = get_random_u32_below(now - old);
 
 
 
 
 
 
 475
 476	/* If UBSAN reports an error here, please make sure your compiler
 477	 * supports -fno-strict-overflow before reporting it; that was a bug
 478	 * in UBSAN, and it has been fixed in GCC-8.
 479	 */
 480	return atomic_add_return(segs + delta, p_id) - segs;
 481}
 
 482
 483void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 484{
 
 485	u32 hash, id;
 486
 487	/* Note the following code is not safe, but this is okay. */
 488	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 489		get_random_bytes(&net->ipv4.ip_id_key,
 490				 sizeof(net->ipv4.ip_id_key));
 491
 492	hash = siphash_3u32((__force u32)iph->daddr,
 493			    (__force u32)iph->saddr,
 494			    iph->protocol,
 495			    &net->ipv4.ip_id_key);
 496	id = ip_idents_reserve(hash, segs);
 497	iph->id = htons(id);
 498}
 499EXPORT_SYMBOL(__ip_select_ident);
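/* The generator bucket is picked by siphash over (daddr, saddr, protocol)
 * with a lazily initialised per-netns key, so a given flow keeps hitting the
 * same counter while IDs stay unpredictable across flows and namespaces;
 * ip_idents_reserve() then advances that counter by the number of segments.
 */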
 500
 501static void ip_rt_fix_tos(struct flowi4 *fl4)
 502{
 503	__u8 tos = RT_FL_TOS(fl4);
 504
 505	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
 506	if (tos & RTO_ONLINK)
 507		fl4->flowi4_scope = RT_SCOPE_LINK;
 508}
 509
 510static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 511			     const struct sock *sk, const struct iphdr *iph,
 512			     int oif, __u8 tos, u8 prot, u32 mark,
 513			     int flow_flags)
 
 514{
 515	__u8 scope = RT_SCOPE_UNIVERSE;
 
 516
 517	if (sk) {
 518		oif = sk->sk_bound_dev_if;
 519		mark = READ_ONCE(sk->sk_mark);
 520		tos = ip_sock_rt_tos(sk);
 521		scope = ip_sock_rt_scope(sk);
 522		prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
 523						    sk->sk_protocol;
 524	}
 525
 526	flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope,
 527			   prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
 528			   sock_net_uid(net, sk));
 529}
 530
 531static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 532			       const struct sock *sk)
 533{
 534	const struct net *net = dev_net(skb->dev);
 535	const struct iphdr *iph = ip_hdr(skb);
 536	int oif = skb->dev->ifindex;
 
 537	u8 prot = iph->protocol;
 538	u32 mark = skb->mark;
 539	__u8 tos = iph->tos;
 540
 541	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 542}
 543
 544static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 545{
 546	const struct inet_sock *inet = inet_sk(sk);
 547	const struct ip_options_rcu *inet_opt;
 548	__be32 daddr = inet->inet_daddr;
 549
 550	rcu_read_lock();
 551	inet_opt = rcu_dereference(inet->inet_opt);
 552	if (inet_opt && inet_opt->opt.srr)
 553		daddr = inet_opt->opt.faddr;
 554	flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
 555			   ip_sock_rt_tos(sk) & IPTOS_RT_MASK,
 556			   ip_sock_rt_scope(sk),
 557			   inet_test_bit(HDRINCL, sk) ?
 558				IPPROTO_RAW : sk->sk_protocol,
 559			   inet_sk_flowi_flags(sk),
 560			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 561	rcu_read_unlock();
 562}
 563
 564static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 565				 const struct sk_buff *skb)
 566{
 567	if (skb)
 568		build_skb_flow_key(fl4, skb, sk);
 569	else
 570		build_sk_flow_key(fl4, sk);
 571}
 572
 573static DEFINE_SPINLOCK(fnhe_lock);
 574
 575static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 576{
 577	struct rtable *rt;
 578
 579	rt = rcu_dereference(fnhe->fnhe_rth_input);
 580	if (rt) {
 581		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 582		dst_dev_put(&rt->dst);
 583		dst_release(&rt->dst);
 584	}
 585	rt = rcu_dereference(fnhe->fnhe_rth_output);
 586	if (rt) {
 587		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 588		dst_dev_put(&rt->dst);
 589		dst_release(&rt->dst);
 590	}
 591}
 592
 593static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 594{
 595	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 596	struct fib_nh_exception *fnhe, *oldest = NULL;
 597
 598	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 599		fnhe = rcu_dereference_protected(*fnhe_p,
 600						 lockdep_is_held(&fnhe_lock));
 601		if (!fnhe)
 602			break;
 603		if (!oldest ||
 604		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 605			oldest = fnhe;
 606			oldest_p = fnhe_p;
 607		}
 608	}
 609	fnhe_flush_routes(oldest);
 610	*oldest_p = oldest->fnhe_next;
 611	kfree_rcu(oldest, rcu);
 612}
 613
 614static u32 fnhe_hashfun(__be32 daddr)
 615{
 616	static siphash_aligned_key_t fnhe_hash_key;
 617	u64 hval;
 618
 619	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
 620	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
 621	return hash_64(hval, FNHE_HASH_SHIFT);
 622}
 623
 624static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 625{
 626	rt->rt_pmtu = fnhe->fnhe_pmtu;
 627	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 628	rt->dst.expires = fnhe->fnhe_expires;
 629
 630	if (fnhe->fnhe_gw) {
 631		rt->rt_flags |= RTCF_REDIRECTED;
 
 632		rt->rt_uses_gateway = 1;
 633		rt->rt_gw_family = AF_INET;
 634		rt->rt_gw4 = fnhe->fnhe_gw;
 635	}
 636}
 637
 638static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
 639				  __be32 gw, u32 pmtu, bool lock,
 640				  unsigned long expires)
 641{
 642	struct fnhe_hash_bucket *hash;
 643	struct fib_nh_exception *fnhe;
 644	struct rtable *rt;
 645	u32 genid, hval;
 646	unsigned int i;
 647	int depth;
 648
 649	genid = fnhe_genid(dev_net(nhc->nhc_dev));
 650	hval = fnhe_hashfun(daddr);
 651
 652	spin_lock_bh(&fnhe_lock);
 653
 654	hash = rcu_dereference(nhc->nhc_exceptions);
 655	if (!hash) {
 656		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 657		if (!hash)
 658			goto out_unlock;
 659		rcu_assign_pointer(nhc->nhc_exceptions, hash);
 660	}
 661
 662	hash += hval;
 663
 664	depth = 0;
 665	for (fnhe = rcu_dereference(hash->chain); fnhe;
 666	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
 667		if (fnhe->fnhe_daddr == daddr)
 668			break;
 669		depth++;
 670	}
 671
 672	if (fnhe) {
 673		if (fnhe->fnhe_genid != genid)
 674			fnhe->fnhe_genid = genid;
 675		if (gw)
 676			fnhe->fnhe_gw = gw;
 677		if (pmtu) {
 678			fnhe->fnhe_pmtu = pmtu;
 679			fnhe->fnhe_mtu_locked = lock;
 680		}
 681		fnhe->fnhe_expires = max(1UL, expires);
 682		/* Update all cached dsts too */
 683		rt = rcu_dereference(fnhe->fnhe_rth_input);
 684		if (rt)
 685			fill_route_from_fnhe(rt, fnhe);
 686		rt = rcu_dereference(fnhe->fnhe_rth_output);
 687		if (rt)
 688			fill_route_from_fnhe(rt, fnhe);
 689	} else {
 690		/* Randomize max depth to avoid some side channel attacks. */
 691		int max_depth = FNHE_RECLAIM_DEPTH +
 692				get_random_u32_below(FNHE_RECLAIM_DEPTH);
 693
 694		while (depth > max_depth) {
 695			fnhe_remove_oldest(hash);
 696			depth--;
 697		}
 698
 699		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 700		if (!fnhe)
 701			goto out_unlock;
 702
 703		fnhe->fnhe_next = hash->chain;
 704
 705		fnhe->fnhe_genid = genid;
 706		fnhe->fnhe_daddr = daddr;
 707		fnhe->fnhe_gw = gw;
 708		fnhe->fnhe_pmtu = pmtu;
 709		fnhe->fnhe_mtu_locked = lock;
 710		fnhe->fnhe_expires = max(1UL, expires);
 711
 712		rcu_assign_pointer(hash->chain, fnhe);
 713
 714		/* Exception created; mark the cached routes for the nexthop
 715		 * stale, so that anyone caching them rechecks whether this
 716		 * exception applies.
 717		 */
 718		rt = rcu_dereference(nhc->nhc_rth_input);
 719		if (rt)
 720			rt->dst.obsolete = DST_OBSOLETE_KILL;
 721
 722		for_each_possible_cpu(i) {
 723			struct rtable __rcu **prt;
 724
 725			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
 726			rt = rcu_dereference(*prt);
 727			if (rt)
 728				rt->dst.obsolete = DST_OBSOLETE_KILL;
 729		}
 730	}
 731
 732	fnhe->fnhe_stamp = jiffies;
 733
 734out_unlock:
 735	spin_unlock_bh(&fnhe_lock);
 736}
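/*
 * Usage sketch (both call sites appear later in this file): an ICMP
 * redirect records only the new gateway, while a PMTU update records
 * only the reduced MTU:
 *
 *	update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false,
 *			      jiffies + ip_rt_gc_timeout);
 *	update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
 *			      jiffies + net->ipv4.ip_rt_mtu_expires);
 */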
 737
 738static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 739			     bool kill_route)
 740{
 741	__be32 new_gw = icmp_hdr(skb)->un.gateway;
 742	__be32 old_gw = ip_hdr(skb)->saddr;
 743	struct net_device *dev = skb->dev;
 744	struct in_device *in_dev;
 745	struct fib_result res;
 746	struct neighbour *n;
 747	struct net *net;
 748
 749	switch (icmp_hdr(skb)->code & 7) {
 750	case ICMP_REDIR_NET:
 751	case ICMP_REDIR_NETTOS:
 752	case ICMP_REDIR_HOST:
 753	case ICMP_REDIR_HOSTTOS:
 754		break;
 755
 756	default:
 757		return;
 758	}
 759
 760	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
 761		return;
 762
 763	in_dev = __in_dev_get_rcu(dev);
 764	if (!in_dev)
 765		return;
 766
 767	net = dev_net(dev);
 768	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 769	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 770	    ipv4_is_zeronet(new_gw))
 771		goto reject_redirect;
 772
 773	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 774		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 775			goto reject_redirect;
 776		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 777			goto reject_redirect;
 778	} else {
 779		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 780			goto reject_redirect;
 781	}
 782
 783	n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
 784	if (!n)
 785		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 786	if (!IS_ERR(n)) {
 787		if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
 788			neigh_event_send(n, NULL);
 789		} else {
 790			if (fib_lookup(net, fl4, &res, 0) == 0) {
 791				struct fib_nh_common *nhc;
 792
 793				fib_select_path(net, &res, fl4, skb);
 794				nhc = FIB_RES_NHC(res);
 795				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
 796						0, false,
 797						jiffies + ip_rt_gc_timeout);
 798			}
 799			if (kill_route)
 800				rt->dst.obsolete = DST_OBSOLETE_KILL;
 801			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 802		}
 803		neigh_release(n);
 804	}
 805	return;
 806
 807reject_redirect:
 808#ifdef CONFIG_IP_ROUTE_VERBOSE
 809	if (IN_DEV_LOG_MARTIANS(in_dev)) {
 810		const struct iphdr *iph = (const struct iphdr *) skb->data;
 811		__be32 daddr = iph->daddr;
 812		__be32 saddr = iph->saddr;
 813
 814		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 815				     "  Advised path = %pI4 -> %pI4\n",
 816				     &old_gw, dev->name, &new_gw,
 817				     &saddr, &daddr);
 818	}
 819#endif
 820	;
 821}
 822
 823static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 824{
 825	struct rtable *rt;
 826	struct flowi4 fl4;
 827	const struct iphdr *iph = (const struct iphdr *) skb->data;
 828	struct net *net = dev_net(skb->dev);
 829	int oif = skb->dev->ifindex;
 830	u8 prot = iph->protocol;
 831	u32 mark = skb->mark;
 832	__u8 tos = iph->tos;
 833
 834	rt = (struct rtable *) dst;
 835
 836	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 837	__ip_do_redirect(rt, skb, &fl4, true);
 838}
 839
 840static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 841{
 842	struct rtable *rt = (struct rtable *)dst;
 843	struct dst_entry *ret = dst;
 844
 845	if (rt) {
 846		if (dst->obsolete > 0) {
 847			ip_rt_put(rt);
 848			ret = NULL;
 849		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 850			   rt->dst.expires) {
 851			ip_rt_put(rt);
 852			ret = NULL;
 853		}
 854	}
 855	return ret;
 856}
 857
 858/*
 859 * Algorithm:
 860 *	1. The first ip_rt_redirect_number redirects are sent
 861 *	   with exponential backoff, then we stop sending them at all,
 862 *	   assuming that the host ignores our redirects.
 863 *	2. If we did not see packets requiring redirects
 864 *	   during ip_rt_redirect_silence, we assume that the host
 865 *	   forgot the redirected route and start sending redirects again.
 866 *
 867 * This algorithm is much cheaper and more intelligent than dumb load limiting
 868 * in icmp.c.
 869 *
 870 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 871 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 872 */
 873
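/*
 * Minimal sketch of the load-limit test used below; the helper is
 * hypothetical and not called anywhere.  A redirect may be sent once the
 * gap since the previous one exceeds ip_rt_redirect_load shifted left by
 * the number of redirects already sent, i.e. the required gap doubles
 * with every redirect.
 */
static inline bool ip_rt_redirect_allowed_sketch(const struct inet_peer *peer)
{
	return peer->n_redirects == 0 ||
	       time_after(jiffies,
			  peer->rate_last +
			  (ip_rt_redirect_load << peer->n_redirects));
}
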
 874void ip_rt_send_redirect(struct sk_buff *skb)
 875{
 876	struct rtable *rt = skb_rtable(skb);
 877	struct in_device *in_dev;
 878	struct inet_peer *peer;
 879	struct net *net;
 880	int log_martians;
 881	int vif;
 882
 883	rcu_read_lock();
 884	in_dev = __in_dev_get_rcu(rt->dst.dev);
 885	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 886		rcu_read_unlock();
 887		return;
 888	}
 889	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 890	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 891	rcu_read_unlock();
 892
 893	net = dev_net(rt->dst.dev);
 894	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 895	if (!peer) {
 896		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 897			  rt_nexthop(rt, ip_hdr(skb)->daddr));
 898		return;
 899	}
 900
 901	/* No redirected packets during ip_rt_redirect_silence;
 902	 * reset the algorithm.
 903	 */
 904	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 905		peer->rate_tokens = 0;
 906		peer->n_redirects = 0;
 907	}
 908
 909	/* Too many ignored redirects; do not send anything.
 910	 * Set peer->rate_last to the last seen redirected packet.
 911	 */
 912	if (peer->n_redirects >= ip_rt_redirect_number) {
 913		peer->rate_last = jiffies;
 914		goto out_put_peer;
 915	}
 916
 917	/* Check for load limit; set rate_last to the latest sent
 918	 * redirect.
 919	 */
 920	if (peer->n_redirects == 0 ||
 921	    time_after(jiffies,
 922		       (peer->rate_last +
 923			(ip_rt_redirect_load << peer->n_redirects)))) {
 924		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 925
 926		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 927		peer->rate_last = jiffies;
 928		++peer->n_redirects;
 929#ifdef CONFIG_IP_ROUTE_VERBOSE
 930		if (log_martians &&
 931		    peer->n_redirects == ip_rt_redirect_number)
 932			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 933					     &ip_hdr(skb)->saddr, inet_iif(skb),
 934					     &ip_hdr(skb)->daddr, &gw);
 935#endif
 936	}
 937out_put_peer:
 938	inet_putpeer(peer);
 939}
 940
 941static int ip_error(struct sk_buff *skb)
 942{
 943	struct rtable *rt = skb_rtable(skb);
 944	struct net_device *dev = skb->dev;
 945	struct in_device *in_dev;
 946	struct inet_peer *peer;
 947	unsigned long now;
 948	struct net *net;
 949	SKB_DR(reason);
 950	bool send;
 951	int code;
 952
 953	if (netif_is_l3_master(skb->dev)) {
 954		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 955		if (!dev)
 956			goto out;
 957	}
 958
 959	in_dev = __in_dev_get_rcu(dev);
 960
 961	/* IP on this device is disabled. */
 962	if (!in_dev)
 963		goto out;
 964
 965	net = dev_net(rt->dst.dev);
 966	if (!IN_DEV_FORWARD(in_dev)) {
 967		switch (rt->dst.error) {
 968		case EHOSTUNREACH:
 969			SKB_DR_SET(reason, IP_INADDRERRORS);
 970			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 971			break;
 972
 973		case ENETUNREACH:
 974			SKB_DR_SET(reason, IP_INNOROUTES);
 975			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 976			break;
 977		}
 978		goto out;
 979	}
 980
 981	switch (rt->dst.error) {
 982	case EINVAL:
 983	default:
 984		goto out;
 985	case EHOSTUNREACH:
 986		code = ICMP_HOST_UNREACH;
 987		break;
 988	case ENETUNREACH:
 989		code = ICMP_NET_UNREACH;
 990		SKB_DR_SET(reason, IP_INNOROUTES);
 991		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 992		break;
 993	case EACCES:
 994		code = ICMP_PKT_FILTERED;
 995		break;
 996	}
 997
 998	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 999			       l3mdev_master_ifindex(skb->dev), 1);
1000
1001	send = true;
1002	if (peer) {
1003		now = jiffies;
1004		peer->rate_tokens += now - peer->rate_last;
1005		if (peer->rate_tokens > ip_rt_error_burst)
1006			peer->rate_tokens = ip_rt_error_burst;
1007		peer->rate_last = now;
1008		if (peer->rate_tokens >= ip_rt_error_cost)
1009			peer->rate_tokens -= ip_rt_error_cost;
1010		else
1011			send = false;
1012		inet_putpeer(peer);
1013	}
1014	if (send)
1015		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1016
1017out:	kfree_skb_reason(skb, reason);
1018	return 0;
1019}
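
/*
 * Minimal sketch of the rate limiter above; the helper is hypothetical
 * and not called anywhere.  Tokens accrue one per jiffy since the last
 * error, are capped at ip_rt_error_burst, and each ICMP error sent costs
 * ip_rt_error_cost tokens.
 */
static inline bool ip_error_allowed_sketch(struct inet_peer *peer)
{
	unsigned long now = jiffies;

	peer->rate_tokens = min_t(u32,
				  peer->rate_tokens + (now - peer->rate_last),
				  ip_rt_error_burst);
	peer->rate_last = now;
	if (peer->rate_tokens < ip_rt_error_cost)
		return false;
	peer->rate_tokens -= ip_rt_error_cost;
	return true;
}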
1020
1021static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1022{
1023	struct dst_entry *dst = &rt->dst;
1024	struct net *net = dev_net(dst->dev);
1025	struct fib_result res;
1026	bool lock = false;
1027	u32 old_mtu;
1028
1029	if (ip_mtu_locked(dst))
1030		return;
1031
1032	old_mtu = ipv4_mtu(dst);
1033	if (old_mtu < mtu)
1034		return;
1035
1036	if (mtu < net->ipv4.ip_rt_min_pmtu) {
1037		lock = true;
1038		mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
1039	}
1040
1041	if (rt->rt_pmtu == mtu && !lock &&
1042	    time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
1043		return;
1044
1045	rcu_read_lock();
1046	if (fib_lookup(net, fl4, &res, 0) == 0) {
1047		struct fib_nh_common *nhc;
1048
1049		fib_select_path(net, &res, fl4, NULL);
1050		nhc = FIB_RES_NHC(res);
1051		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1052				      jiffies + net->ipv4.ip_rt_mtu_expires);
1053	}
1054	rcu_read_unlock();
1055}
1056
1057static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1058			      struct sk_buff *skb, u32 mtu,
1059			      bool confirm_neigh)
1060{
1061	struct rtable *rt = (struct rtable *) dst;
1062	struct flowi4 fl4;
1063
1064	ip_rt_build_flow_key(&fl4, sk, skb);
1065
1066	/* Don't make lookup fail for bridged encapsulations */
1067	if (skb && netif_is_any_bridge_port(skb->dev))
1068		fl4.flowi4_oif = 0;
1069
1070	__ip_rt_update_pmtu(rt, &fl4, mtu);
1071}
1072
1073void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1074		      int oif, u8 protocol)
1075{
1076	const struct iphdr *iph = (const struct iphdr *)skb->data;
1077	struct flowi4 fl4;
1078	struct rtable *rt;
1079	u32 mark = IP4_REPLY_MARK(net, skb->mark);
1080
1081	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
1082			 0);
1083	rt = __ip_route_output_key(net, &fl4);
1084	if (!IS_ERR(rt)) {
1085		__ip_rt_update_pmtu(rt, &fl4, mtu);
1086		ip_rt_put(rt);
1087	}
1088}
1089EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
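
/*
 * Hedged usage sketch: a protocol error handler reacting to an
 * ICMP_FRAG_NEEDED message typically calls this with the next-hop MTU
 * taken from the ICMP header, along the lines of
 *
 *	ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
 *
 * where "info" is the MTU reported by the router.
 */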
1090
1091static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093	const struct iphdr *iph = (const struct iphdr *)skb->data;
1094	struct flowi4 fl4;
1095	struct rtable *rt;
1096
1097	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1098
1099	if (!fl4.flowi4_mark)
1100		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1101
1102	rt = __ip_route_output_key(sock_net(sk), &fl4);
1103	if (!IS_ERR(rt)) {
1104		__ip_rt_update_pmtu(rt, &fl4, mtu);
1105		ip_rt_put(rt);
1106	}
1107}
1108
1109void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1110{
1111	const struct iphdr *iph = (const struct iphdr *)skb->data;
1112	struct flowi4 fl4;
1113	struct rtable *rt;
1114	struct dst_entry *odst = NULL;
1115	bool new = false;
1116	struct net *net = sock_net(sk);
1117
1118	bh_lock_sock(sk);
1119
1120	if (!ip_sk_accept_pmtu(sk))
1121		goto out;
1122
1123	odst = sk_dst_get(sk);
1124
1125	if (sock_owned_by_user(sk) || !odst) {
1126		__ipv4_sk_update_pmtu(skb, sk, mtu);
1127		goto out;
1128	}
1129
1130	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1131
1132	rt = (struct rtable *)odst;
1133	if (odst->obsolete && !odst->ops->check(odst, 0)) {
1134		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1135		if (IS_ERR(rt))
1136			goto out;
1137
1138		new = true;
1139	}
1140
1141	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1142
1143	if (!dst_check(&rt->dst, 0)) {
1144		if (new)
1145			dst_release(&rt->dst);
1146
1147		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1148		if (IS_ERR(rt))
1149			goto out;
1150
1151		new = true;
1152	}
1153
1154	if (new)
1155		sk_dst_set(sk, &rt->dst);
1156
1157out:
1158	bh_unlock_sock(sk);
1159	dst_release(odst);
1160}
1161EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1162
1163void ipv4_redirect(struct sk_buff *skb, struct net *net,
1164		   int oif, u8 protocol)
1165{
1166	const struct iphdr *iph = (const struct iphdr *)skb->data;
1167	struct flowi4 fl4;
1168	struct rtable *rt;
1169
1170	__build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
1171	rt = __ip_route_output_key(net, &fl4);
1172	if (!IS_ERR(rt)) {
1173		__ip_do_redirect(rt, skb, &fl4, false);
1174		ip_rt_put(rt);
1175	}
1176}
1177EXPORT_SYMBOL_GPL(ipv4_redirect);
1178
1179void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1180{
1181	const struct iphdr *iph = (const struct iphdr *)skb->data;
1182	struct flowi4 fl4;
1183	struct rtable *rt;
1184	struct net *net = sock_net(sk);
1185
1186	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1187	rt = __ip_route_output_key(net, &fl4);
1188	if (!IS_ERR(rt)) {
1189		__ip_do_redirect(rt, skb, &fl4, false);
1190		ip_rt_put(rt);
1191	}
1192}
1193EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1194
1195INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
1196							 u32 cookie)
1197{
1198	struct rtable *rt = (struct rtable *) dst;
1199
1200	/* All IPV4 dsts are created with ->obsolete set to the value
1201	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1202	 * into this function always.
1203	 *
1204	 * When a PMTU/redirect information update invalidates a route,
1205	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1206	 * DST_OBSOLETE_DEAD.
1207	 */
1208	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1209		return NULL;
1210	return dst;
1211}
1212EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
1213
1214static void ipv4_send_dest_unreach(struct sk_buff *skb)
1215{
1216	struct net_device *dev;
1217	struct ip_options opt;
1218	int res;
1219
1220	/* Recompile ip options since IPCB may not be valid anymore.
1221	 * Also check we have a reasonable ipv4 header.
1222	 */
1223	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1224	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1225		return;
1226
1227	memset(&opt, 0, sizeof(opt));
1228	if (ip_hdr(skb)->ihl > 5) {
1229		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1230			return;
1231		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1232
1233		rcu_read_lock();
1234		dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1235		res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1236		rcu_read_unlock();
1237
1238		if (res)
1239			return;
1240	}
1241	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1242}
1243
1244static void ipv4_link_failure(struct sk_buff *skb)
1245{
1246	struct rtable *rt;
1247
1248	ipv4_send_dest_unreach(skb);
1249
1250	rt = skb_rtable(skb);
1251	if (rt)
1252		dst_set_expires(&rt->dst, 0);
1253}
1254
1255static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1256{
1257	pr_debug("%s: %pI4 -> %pI4, %s\n",
1258		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1259		 skb->dev ? skb->dev->name : "?");
1260	kfree_skb(skb);
1261	WARN_ON(1);
1262	return 0;
1263}
1264
1265/*
1266 * We do not cache the source address of the outgoing interface,
1267 * because it is used only by the IP RR, TS and SRR options,
1268 * so it is out of the fast path.
1269 *
1270 * BTW remember: "addr" is allowed to be unaligned
1271 * in IP options!
1272 */
1273
1274void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1275{
1276	__be32 src;
1277
1278	if (rt_is_output_route(rt))
1279		src = ip_hdr(skb)->saddr;
1280	else {
1281		struct fib_result res;
1282		struct iphdr *iph = ip_hdr(skb);
1283		struct flowi4 fl4 = {
1284			.daddr = iph->daddr,
1285			.saddr = iph->saddr,
1286			.flowi4_tos = RT_TOS(iph->tos),
1287			.flowi4_oif = rt->dst.dev->ifindex,
1288			.flowi4_iif = skb->dev->ifindex,
1289			.flowi4_mark = skb->mark,
1290		};
1291
1292		rcu_read_lock();
1293		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1294			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1295		else
1296			src = inet_select_addr(rt->dst.dev,
1297					       rt_nexthop(rt, iph->daddr),
1298					       RT_SCOPE_UNIVERSE);
1299		rcu_read_unlock();
1300	}
1301	memcpy(addr, &src, 4);
1302}
1303
1304#ifdef CONFIG_IP_ROUTE_CLASSID
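/* Tag the route's tclassid, treating the 32-bit value as two independent
 * 16-bit realm halves and never overwriting a half that is already set.
 * rt_set_nexthop() applies the FIB rule's tclassid first and then the
 * itag derived from source validation.
 */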
1305static void set_class_tag(struct rtable *rt, u32 tag)
1306{
1307	if (!(rt->dst.tclassid & 0xFFFF))
1308		rt->dst.tclassid |= tag & 0xFFFF;
1309	if (!(rt->dst.tclassid & 0xFFFF0000))
1310		rt->dst.tclassid |= tag & 0xFFFF0000;
1311}
1312#endif
1313
1314static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1315{
1316	struct net *net = dev_net(dst->dev);
1317	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1318	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1319				    net->ipv4.ip_rt_min_advmss);
1320
1321	return min(advmss, IPV4_MAX_PMTU - header_size);
1322}
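
/*
 * Worked example (illustrative): with a 1500-byte path MTU the value
 * advertised above is 1500 - 40 = 1460 bytes, bounded below by the
 * net.ipv4.route.min_adv_mss sysctl and above by IPV4_MAX_PMTU - 40.
 */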
1323
1324INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
1325{
1326	return ip_dst_mtu_maybe_forward(dst, false);
1327}
1328EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
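
/*
 * Usage note (illustrative): callers normally reach this through
 * dst_mtu(), which dispatches via ->mtu() and therefore lands here for
 * IPv4 dsts, e.g.
 *
 *	mtu = dst_mtu(skb_dst(skb));
 */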
1329
1330static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1331{
1332	struct fnhe_hash_bucket *hash;
1333	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1334	u32 hval = fnhe_hashfun(daddr);
1335
1336	spin_lock_bh(&fnhe_lock);
1337
1338	hash = rcu_dereference_protected(nhc->nhc_exceptions,
1339					 lockdep_is_held(&fnhe_lock));
1340	hash += hval;
1341
1342	fnhe_p = &hash->chain;
1343	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1344	while (fnhe) {
1345		if (fnhe->fnhe_daddr == daddr) {
1346			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1347				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1348			/* set fnhe_daddr to 0 to ensure it won't bind with
1349			 * new dsts in rt_bind_exception().
1350			 */
1351			fnhe->fnhe_daddr = 0;
1352			fnhe_flush_routes(fnhe);
1353			kfree_rcu(fnhe, rcu);
1354			break;
1355		}
1356		fnhe_p = &fnhe->fnhe_next;
1357		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1358						 lockdep_is_held(&fnhe_lock));
1359	}
1360
1361	spin_unlock_bh(&fnhe_lock);
1362}
1363
1364static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1365					       __be32 daddr)
1366{
1367	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1368	struct fib_nh_exception *fnhe;
1369	u32 hval;
1370
1371	if (!hash)
1372		return NULL;
1373
1374	hval = fnhe_hashfun(daddr);
1375
1376	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1377	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
1378		if (fnhe->fnhe_daddr == daddr) {
1379			if (fnhe->fnhe_expires &&
1380			    time_after(jiffies, fnhe->fnhe_expires)) {
1381				ip_del_fnhe(nhc, daddr);
1382				break;
1383			}
1384			return fnhe;
1385		}
1386	}
1387	return NULL;
1388}
1389
1390/* MTU selection:
1391 * 1. mtu on route is locked - use it
1392 * 2. mtu from nexthop exception
1393 * 3. mtu from egress device
1394 */
1395
1396u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1397{
1398	struct fib_nh_common *nhc = res->nhc;
1399	struct net_device *dev = nhc->nhc_dev;
1400	struct fib_info *fi = res->fi;
1401	u32 mtu = 0;
1402
1403	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1404	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1405		mtu = fi->fib_mtu;
1406
1407	if (likely(!mtu)) {
1408		struct fib_nh_exception *fnhe;
1409
1410		fnhe = find_exception(nhc, daddr);
1411		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1412			mtu = fnhe->fnhe_pmtu;
1413	}
1414
1415	if (likely(!mtu))
1416		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1417
1418	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1419}
1420
1421static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1422			      __be32 daddr, const bool do_cache)
1423{
1424	bool ret = false;
1425
1426	spin_lock_bh(&fnhe_lock);
1427
1428	if (daddr == fnhe->fnhe_daddr) {
1429		struct rtable __rcu **porig;
1430		struct rtable *orig;
1431		int genid = fnhe_genid(dev_net(rt->dst.dev));
1432
1433		if (rt_is_input_route(rt))
1434			porig = &fnhe->fnhe_rth_input;
1435		else
1436			porig = &fnhe->fnhe_rth_output;
1437		orig = rcu_dereference(*porig);
1438
1439		if (fnhe->fnhe_genid != genid) {
1440			fnhe->fnhe_genid = genid;
1441			fnhe->fnhe_gw = 0;
1442			fnhe->fnhe_pmtu = 0;
1443			fnhe->fnhe_expires = 0;
1444			fnhe->fnhe_mtu_locked = false;
1445			fnhe_flush_routes(fnhe);
1446			orig = NULL;
1447		}
1448		fill_route_from_fnhe(rt, fnhe);
1449		if (!rt->rt_gw4) {
1450			rt->rt_gw4 = daddr;
1451			rt->rt_gw_family = AF_INET;
1452		}
1453
1454		if (do_cache) {
1455			dst_hold(&rt->dst);
1456			rcu_assign_pointer(*porig, rt);
1457			if (orig) {
1458				dst_dev_put(&orig->dst);
1459				dst_release(&orig->dst);
1460			}
1461			ret = true;
1462		}
1463
1464		fnhe->fnhe_stamp = jiffies;
1465	}
1466	spin_unlock_bh(&fnhe_lock);
1467
1468	return ret;
1469}
1470
1471static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1472{
1473	struct rtable *orig, *prev, **p;
1474	bool ret = true;
1475
1476	if (rt_is_input_route(rt)) {
1477		p = (struct rtable **)&nhc->nhc_rth_input;
1478	} else {
1479		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1480	}
1481	orig = *p;
1482
1483	/* hold dst before doing cmpxchg() to avoid race condition
1484	 * on this dst
1485	 */
1486	dst_hold(&rt->dst);
1487	prev = cmpxchg(p, orig, rt);
1488	if (prev == orig) {
1489		if (orig) {
1490			rt_add_uncached_list(orig);
1491			dst_release(&orig->dst);
1492		}
1493	} else {
1494		dst_release(&rt->dst);
1495		ret = false;
1496	}
1497
1498	return ret;
1499}
1500
1501struct uncached_list {
1502	spinlock_t		lock;
1503	struct list_head	head;
1504	struct list_head	quarantine;
1505};
1506
1507static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1508
1509void rt_add_uncached_list(struct rtable *rt)
1510{
1511	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1512
1513	rt->dst.rt_uncached_list = ul;
1514
1515	spin_lock_bh(&ul->lock);
1516	list_add_tail(&rt->dst.rt_uncached, &ul->head);
1517	spin_unlock_bh(&ul->lock);
1518}
1519
1520void rt_del_uncached_list(struct rtable *rt)
1521{
1522	if (!list_empty(&rt->dst.rt_uncached)) {
1523		struct uncached_list *ul = rt->dst.rt_uncached_list;
1524
1525		spin_lock_bh(&ul->lock);
1526		list_del_init(&rt->dst.rt_uncached);
1527		spin_unlock_bh(&ul->lock);
1528	}
1529}
1530
1531static void ipv4_dst_destroy(struct dst_entry *dst)
1532{
1533	struct rtable *rt = (struct rtable *)dst;
1534
1535	ip_dst_metrics_put(dst);
1536	rt_del_uncached_list(rt);
1537}
1538
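/* Called when a device is going away (e.g. on unregistration): any
 * uncached route still pointing at the vanishing device is re-pointed at
 * blackhole_netdev and parked on a per-cpu quarantine list, so the device
 * reference can be released while dsts that sockets may still hold remain
 * safe to dereference.
 */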
1539void rt_flush_dev(struct net_device *dev)
1540{
1541	struct rtable *rt, *safe;
1542	int cpu;
1543
1544	for_each_possible_cpu(cpu) {
1545		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1546
1547		if (list_empty(&ul->head))
1548			continue;
1549
1550		spin_lock_bh(&ul->lock);
1551		list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
1552			if (rt->dst.dev != dev)
1553				continue;
1554			rt->dst.dev = blackhole_netdev;
1555			netdev_ref_replace(dev, blackhole_netdev,
1556					   &rt->dst.dev_tracker, GFP_ATOMIC);
1557			list_move(&rt->dst.rt_uncached, &ul->quarantine);
1558		}
1559		spin_unlock_bh(&ul->lock);
1560	}
1561}
1562
1563static bool rt_cache_valid(const struct rtable *rt)
1564{
1565	return	rt &&
1566		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1567		!rt_is_expired(rt);
1568}
1569
1570static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1571			   const struct fib_result *res,
1572			   struct fib_nh_exception *fnhe,
1573			   struct fib_info *fi, u16 type, u32 itag,
1574			   const bool do_cache)
1575{
1576	bool cached = false;
1577
1578	if (fi) {
1579		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1580
1581		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1582			rt->rt_uses_gateway = 1;
1583			rt->rt_gw_family = nhc->nhc_gw_family;
1584			/* only INET and INET6 are supported */
1585			if (likely(nhc->nhc_gw_family == AF_INET))
1586				rt->rt_gw4 = nhc->nhc_gw.ipv4;
1587			else
1588				rt->rt_gw6 = nhc->nhc_gw.ipv6;
1589		}
1590
1591		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1592
1593#ifdef CONFIG_IP_ROUTE_CLASSID
1594		if (nhc->nhc_family == AF_INET) {
1595			struct fib_nh *nh;
1596
1597			nh = container_of(nhc, struct fib_nh, nh_common);
1598			rt->dst.tclassid = nh->nh_tclassid;
1599		}
1600#endif
1601		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1602		if (unlikely(fnhe))
1603			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1604		else if (do_cache)
1605			cached = rt_cache_route(nhc, rt);
1606		if (unlikely(!cached)) {
1607			/* Routes we intend to cache in nexthop exception or
1608			 * FIB nexthop have the DST_NOCACHE bit clear.
1609			 * However, if we are unsuccessful at storing this
1610			 * route into the cache we really need to set it.
1611			 */
1612			if (!rt->rt_gw4) {
1613				rt->rt_gw_family = AF_INET;
1614				rt->rt_gw4 = daddr;
1615			}
1616			rt_add_uncached_list(rt);
1617		}
1618	} else
1619		rt_add_uncached_list(rt);
1620
1621#ifdef CONFIG_IP_ROUTE_CLASSID
1622#ifdef CONFIG_IP_MULTIPLE_TABLES
1623	set_class_tag(rt, res->tclassid);
1624#endif
1625	set_class_tag(rt, itag);
1626#endif
1627}
1628
1629struct rtable *rt_dst_alloc(struct net_device *dev,
1630			    unsigned int flags, u16 type,
1631			    bool noxfrm)
1632{
1633	struct rtable *rt;
1634
1635	rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
1636		       (noxfrm ? DST_NOXFRM : 0));
1637
1638	if (rt) {
1639		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1640		rt->rt_flags = flags;
1641		rt->rt_type = type;
1642		rt->rt_is_input = 0;
1643		rt->rt_iif = 0;
1644		rt->rt_pmtu = 0;
1645		rt->rt_mtu_locked = 0;
1646		rt->rt_uses_gateway = 0;
1647		rt->rt_gw_family = 0;
1648		rt->rt_gw4 = 0;
1649
1650		rt->dst.output = ip_output;
1651		if (flags & RTCF_LOCAL)
1652			rt->dst.input = ip_local_deliver;
1653	}
1654
1655	return rt;
1656}
1657EXPORT_SYMBOL(rt_dst_alloc);
1658
1659struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1660{
1661	struct rtable *new_rt;
1662
1663	new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
1664			   rt->dst.flags);
1665
1666	if (new_rt) {
1667		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1668		new_rt->rt_flags = rt->rt_flags;
1669		new_rt->rt_type = rt->rt_type;
1670		new_rt->rt_is_input = rt->rt_is_input;
1671		new_rt->rt_iif = rt->rt_iif;
1672		new_rt->rt_pmtu = rt->rt_pmtu;
1673		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1674		new_rt->rt_gw_family = rt->rt_gw_family;
1675		if (rt->rt_gw_family == AF_INET)
1676			new_rt->rt_gw4 = rt->rt_gw4;
1677		else if (rt->rt_gw_family == AF_INET6)
1678			new_rt->rt_gw6 = rt->rt_gw6;
1679
1680		new_rt->dst.input = rt->dst.input;
1681		new_rt->dst.output = rt->dst.output;
1682		new_rt->dst.error = rt->dst.error;
1683		new_rt->dst.lastuse = jiffies;
1684		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1685	}
1686	return new_rt;
1687}
1688EXPORT_SYMBOL(rt_dst_clone);
1689
1690/* called in rcu_read_lock() section */
1691int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1692			  u8 tos, struct net_device *dev,
1693			  struct in_device *in_dev, u32 *itag)
1694{
1695	int err;
1696
1697	/* Primary sanity checks. */
1698	if (!in_dev)
1699		return -EINVAL;
1700
1701	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1702	    skb->protocol != htons(ETH_P_IP))
1703		return -EINVAL;
1704
1705	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1706		return -EINVAL;
1707
1708	if (ipv4_is_zeronet(saddr)) {
1709		if (!ipv4_is_local_multicast(daddr) &&
1710		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
1711			return -EINVAL;
1712	} else {
1713		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1714					  in_dev, itag);
1715		if (err < 0)
1716			return err;
1717	}
1718	return 0;
1719}
1720
1721/* called in rcu_read_lock() section */
1722static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1723			     u8 tos, struct net_device *dev, int our)
1724{
1725	struct in_device *in_dev = __in_dev_get_rcu(dev);
1726	unsigned int flags = RTCF_MULTICAST;
1727	struct rtable *rth;
1728	u32 itag = 0;
1729	int err;
1730
1731	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1732	if (err)
1733		return err;
1734
1735	if (our)
1736		flags |= RTCF_LOCAL;
1737
1738	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1739		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1740
1741	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1742			   false);
1743	if (!rth)
1744		return -ENOBUFS;
1745
1746#ifdef CONFIG_IP_ROUTE_CLASSID
1747	rth->dst.tclassid = itag;
1748#endif
1749	rth->dst.output = ip_rt_bug;
1750	rth->rt_is_input = 1;
1751
1752#ifdef CONFIG_IP_MROUTE
1753	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1754		rth->dst.input = ip_mr_input;
1755#endif
1756	RT_CACHE_STAT_INC(in_slow_mc);
1757
1758	skb_dst_drop(skb);
1759	skb_dst_set(skb, &rth->dst);
1760	return 0;
1761}
1762
1763
1764static void ip_handle_martian_source(struct net_device *dev,
1765				     struct in_device *in_dev,
1766				     struct sk_buff *skb,
1767				     __be32 daddr,
1768				     __be32 saddr)
1769{
1770	RT_CACHE_STAT_INC(in_martian_src);
1771#ifdef CONFIG_IP_ROUTE_VERBOSE
1772	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1773		/*
1774		 *	RFC1812 recommendation: if the source is martian,
1775		 *	the only hint is the MAC header.
1776		 */
1777		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1778			&daddr, &saddr, dev->name);
1779		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1780			print_hex_dump(KERN_WARNING, "ll header: ",
1781				       DUMP_PREFIX_OFFSET, 16, 1,
1782				       skb_mac_header(skb),
1783				       dev->hard_header_len, false);
1784		}
1785	}
1786#endif
1787}
1788
1789/* called in rcu_read_lock() section */
1790static int __mkroute_input(struct sk_buff *skb,
1791			   const struct fib_result *res,
1792			   struct in_device *in_dev,
1793			   __be32 daddr, __be32 saddr, u32 tos)
1794{
1795	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1796	struct net_device *dev = nhc->nhc_dev;
1797	struct fib_nh_exception *fnhe;
1798	struct rtable *rth;
1799	int err;
1800	struct in_device *out_dev;
1801	bool do_cache;
1802	u32 itag = 0;
1803
1804	/* get a working reference to the output device */
1805	out_dev = __in_dev_get_rcu(dev);
1806	if (!out_dev) {
1807		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1808		return -EINVAL;
1809	}
1810
1811	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1812				  in_dev->dev, in_dev, &itag);
1813	if (err < 0) {
1814		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1815					 saddr);
1816
1817		goto cleanup;
1818	}
1819
1820	do_cache = res->fi && !itag;
1821	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1822	    skb->protocol == htons(ETH_P_IP)) {
1823		__be32 gw;
1824
1825		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1826		if (IN_DEV_SHARED_MEDIA(out_dev) ||
1827		    inet_addr_onlink(out_dev, saddr, gw))
1828			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1829	}
1830
1831	if (skb->protocol != htons(ETH_P_IP)) {
1832		/* Not IP (i.e. ARP). Do not create a route if it is
1833		 * invalid for proxy ARP. DNAT routes are always valid.
1834		 *
1835		 * The proxy ARP feature has been extended to allow ARP
1836		 * replies back out the same interface, to support
1837		 * Private VLAN switch technologies. See arp.c.
1838		 */
1839		if (out_dev == in_dev &&
1840		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1841			err = -EINVAL;
1842			goto cleanup;
1843		}
1844	}
1845
1846	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
1847		IPCB(skb)->flags |= IPSKB_NOPOLICY;
1848
1849	fnhe = find_exception(nhc, daddr);
1850	if (do_cache) {
1851		if (fnhe)
1852			rth = rcu_dereference(fnhe->fnhe_rth_input);
1853		else
1854			rth = rcu_dereference(nhc->nhc_rth_input);
1855		if (rt_cache_valid(rth)) {
1856			skb_dst_set_noref(skb, &rth->dst);
1857			goto out;
1858		}
1859	}
1860
1861	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1862			   IN_DEV_ORCONF(out_dev, NOXFRM));
1863	if (!rth) {
1864		err = -ENOBUFS;
1865		goto cleanup;
1866	}
1867
1868	rth->rt_is_input = 1;
1869	RT_CACHE_STAT_INC(in_slow_tot);
1870
1871	rth->dst.input = ip_forward;
1872
1873	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1874		       do_cache);
1875	lwtunnel_set_redirect(&rth->dst);
1876	skb_dst_set(skb, &rth->dst);
1877out:
1878	err = 0;
1879 cleanup:
1880	return err;
1881}
1882
1883#ifdef CONFIG_IP_ROUTE_MULTIPATH
1884/* To make ICMP packets follow the right flow, the multipath hash is
1885 * calculated from the inner IP addresses.
1886 */
1887static void ip_multipath_l3_keys(const struct sk_buff *skb,
1888				 struct flow_keys *hash_keys)
1889{
1890	const struct iphdr *outer_iph = ip_hdr(skb);
1891	const struct iphdr *key_iph = outer_iph;
1892	const struct iphdr *inner_iph;
1893	const struct icmphdr *icmph;
1894	struct iphdr _inner_iph;
1895	struct icmphdr _icmph;
1896
1897	if (likely(outer_iph->protocol != IPPROTO_ICMP))
1898		goto out;
1899
1900	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1901		goto out;
1902
1903	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1904				   &_icmph);
1905	if (!icmph)
1906		goto out;
1907
1908	if (!icmp_is_err(icmph->type))
1909		goto out;
1910
1911	inner_iph = skb_header_pointer(skb,
1912				       outer_iph->ihl * 4 + sizeof(_icmph),
1913				       sizeof(_inner_iph), &_inner_iph);
1914	if (!inner_iph)
1915		goto out;
1916
1917	key_iph = inner_iph;
1918out:
1919	hash_keys->addrs.v4addrs.src = key_iph->saddr;
1920	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1921}
1922
1923static u32 fib_multipath_custom_hash_outer(const struct net *net,
1924					   const struct sk_buff *skb,
1925					   bool *p_has_inner)
1926{
1927	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1928	struct flow_keys keys, hash_keys;
1929
1930	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
1931		return 0;
1932
1933	memset(&hash_keys, 0, sizeof(hash_keys));
1934	skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
1935
1936	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1937	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
1938		hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1939	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
1940		hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1941	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
1942		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1943	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
1944		hash_keys.ports.src = keys.ports.src;
1945	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
1946		hash_keys.ports.dst = keys.ports.dst;
1947
1948	*p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
1949	return flow_hash_from_keys(&hash_keys);
1950}
1951
1952static u32 fib_multipath_custom_hash_inner(const struct net *net,
1953					   const struct sk_buff *skb,
1954					   bool has_inner)
1955{
1956	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
1957	struct flow_keys keys, hash_keys;
1958
1959	/* We assume the packet carries an encapsulation, but if none was
1960	 * encountered during dissection of the outer flow, then there is no
1961	 * point in calling the flow dissector again.
1962	 */
1963	if (!has_inner)
1964		return 0;
1965
1966	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
1967		return 0;
1968
1969	memset(&hash_keys, 0, sizeof(hash_keys));
1970	skb_flow_dissect_flow_keys(skb, &keys, 0);
1971
1972	if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
1973		return 0;
1974
1975	if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1976		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1977		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1978			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1979		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1980			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1981	} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1982		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1983		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
1984			hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1985		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
1986			hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1987		if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
1988			hash_keys.tags.flow_label = keys.tags.flow_label;
1989	}
1990
1991	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
1992		hash_keys.basic.ip_proto = keys.basic.ip_proto;
1993	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
1994		hash_keys.ports.src = keys.ports.src;
1995	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
1996		hash_keys.ports.dst = keys.ports.dst;
1997
1998	return flow_hash_from_keys(&hash_keys);
1999}
2000
2001static u32 fib_multipath_custom_hash_skb(const struct net *net,
2002					 const struct sk_buff *skb)
2003{
2004	u32 mhash, mhash_inner;
2005	bool has_inner = true;
2006
2007	mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
2008	mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
2009
2010	return jhash_2words(mhash, mhash_inner, 0);
2011}
2012
2013static u32 fib_multipath_custom_hash_fl4(const struct net *net,
2014					 const struct flowi4 *fl4)
2015{
2016	u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
2017	struct flow_keys hash_keys;
2018
2019	if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
2020		return 0;
2021
2022	memset(&hash_keys, 0, sizeof(hash_keys));
2023	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2024	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
2025		hash_keys.addrs.v4addrs.src = fl4->saddr;
2026	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
2027		hash_keys.addrs.v4addrs.dst = fl4->daddr;
2028	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
2029		hash_keys.basic.ip_proto = fl4->flowi4_proto;
2030	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
2031		hash_keys.ports.src = fl4->fl4_sport;
2032	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
2033		hash_keys.ports.dst = fl4->fl4_dport;
2034
2035	return flow_hash_from_keys(&hash_keys);
2036}
2037
2038/* if skb is set it will be used and fl4 can be NULL */
2039int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
2040		       const struct sk_buff *skb, struct flow_keys *flkeys)
2041{
2042	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
2043	struct flow_keys hash_keys;
2044	u32 mhash = 0;
2045
2046	switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
2047	case 0:
2048		memset(&hash_keys, 0, sizeof(hash_keys));
2049		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2050		if (skb) {
2051			ip_multipath_l3_keys(skb, &hash_keys);
2052		} else {
2053			hash_keys.addrs.v4addrs.src = fl4->saddr;
2054			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2055		}
2056		mhash = flow_hash_from_keys(&hash_keys);
2057		break;
2058	case 1:
2059		/* skb is currently provided only when forwarding */
2060		if (skb) {
2061			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2062			struct flow_keys keys;
2063
2064			/* short-circuit if we already have L4 hash present */
2065			if (skb->l4_hash)
2066				return skb_get_hash_raw(skb) >> 1;
2067
2068			memset(&hash_keys, 0, sizeof(hash_keys));
2069
2070			if (!flkeys) {
2071				skb_flow_dissect_flow_keys(skb, &keys, flag);
2072				flkeys = &keys;
2073			}
2074
2075			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2076			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2077			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2078			hash_keys.ports.src = flkeys->ports.src;
2079			hash_keys.ports.dst = flkeys->ports.dst;
2080			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2081		} else {
2082			memset(&hash_keys, 0, sizeof(hash_keys));
2083			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2084			hash_keys.addrs.v4addrs.src = fl4->saddr;
2085			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2086			hash_keys.ports.src = fl4->fl4_sport;
2087			hash_keys.ports.dst = fl4->fl4_dport;
2088			hash_keys.basic.ip_proto = fl4->flowi4_proto;
2089		}
2090		mhash = flow_hash_from_keys(&hash_keys);
2091		break;
2092	case 2:
2093		memset(&hash_keys, 0, sizeof(hash_keys));
2094		/* skb is currently provided only when forwarding */
2095		if (skb) {
2096			struct flow_keys keys;
2097
2098			skb_flow_dissect_flow_keys(skb, &keys, 0);
2099			/* Inner can be v4 or v6 */
2100			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2101				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2102				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2103				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2104			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2105				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2106				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2107				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2108				hash_keys.tags.flow_label = keys.tags.flow_label;
2109				hash_keys.basic.ip_proto = keys.basic.ip_proto;
2110			} else {
2111				/* Same as case 0 */
2112				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2113				ip_multipath_l3_keys(skb, &hash_keys);
2114			}
2115		} else {
2116			/* Same as case 0 */
2117			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2118			hash_keys.addrs.v4addrs.src = fl4->saddr;
2119			hash_keys.addrs.v4addrs.dst = fl4->daddr;
2120		}
2121		mhash = flow_hash_from_keys(&hash_keys);
2122		break;
2123	case 3:
2124		if (skb)
2125			mhash = fib_multipath_custom_hash_skb(net, skb);
2126		else
2127			mhash = fib_multipath_custom_hash_fl4(net, fl4);
2128		break;
2129	}
2130
2131	if (multipath_hash)
2132		mhash = jhash_2words(mhash, multipath_hash, 0);
2133
2134	return mhash >> 1;
2135}
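
/*
 * Usage note (illustrative): the policy selected above corresponds to the
 * net.ipv4.fib_multipath_hash_policy sysctl: 0 hashes on L3 addresses,
 * 1 on the L4 five-tuple, 2 on the inner L3 header when the packet is
 * encapsulated (outer L3 otherwise), and 3 on the field set chosen via
 * net.ipv4.fib_multipath_hash_fields.
 */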
2136#endif /* CONFIG_IP_ROUTE_MULTIPATH */
2137
2138static int ip_mkroute_input(struct sk_buff *skb,
2139			    struct fib_result *res,
2140			    struct in_device *in_dev,
2141			    __be32 daddr, __be32 saddr, u32 tos,
2142			    struct flow_keys *hkeys)
2143{
2144#ifdef CONFIG_IP_ROUTE_MULTIPATH
2145	if (res->fi && fib_info_num_path(res->fi) > 1) {
2146		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2147
2148		fib_select_multipath(res, h);
2149		IPCB(skb)->flags |= IPSKB_MULTIPATH;
2150	}
2151#endif
2152
2153	/* create a routing cache entry */
2154	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2155}
2156
2157/* Implements the same saddr-related checks as ip_route_input_slow(),
2158 * assuming daddr is valid and the destination is not a local broadcast one.
2159 * Uses the provided hint instead of performing a route lookup.
2160 */
2161int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2162		      u8 tos, struct net_device *dev,
2163		      const struct sk_buff *hint)
2164{
2165	struct in_device *in_dev = __in_dev_get_rcu(dev);
2166	struct rtable *rt = skb_rtable(hint);
2167	struct net *net = dev_net(dev);
2168	int err = -EINVAL;
2169	u32 tag = 0;
2170
2171	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2172		goto martian_source;
2173
2174	if (ipv4_is_zeronet(saddr))
2175		goto martian_source;
2176
2177	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2178		goto martian_source;
2179
2180	if (rt->rt_type != RTN_LOCAL)
2181		goto skip_validate_source;
2182
2183	tos &= IPTOS_RT_MASK;
2184	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2185	if (err < 0)
2186		goto martian_source;
2187
2188skip_validate_source:
2189	skb_dst_copy(skb, hint);
2190	return 0;
2191
2192martian_source:
2193	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2194	return err;
2195}
2196
2197/* get device for dst_alloc with local routes */
2198static struct net_device *ip_rt_get_dev(struct net *net,
2199					const struct fib_result *res)
2200{
2201	struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2202	struct net_device *dev = NULL;
2203
2204	if (nhc)
2205		dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2206
2207	return dev ? : net->loopback_dev;
2208}
2209
2210/*
2211 *	NOTE. We drop all packets that have local source
2212 *	addresses, because every properly looped back packet
2213 *	must already have the correct destination attached by the output routine.
2214 *	Changes in the enforced policies must also be applied to
2215 *	ip_route_use_hint().
2216 *
2217 *	This approach solves two big problems:
2218 *	1. Non-simplex devices are handled properly.
2219 *	2. IP spoofing attempts are filtered with a 100% guarantee.
2220 *	Called with rcu_read_lock().
2221 */
2222
2223static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2224			       u8 tos, struct net_device *dev,
2225			       struct fib_result *res)
2226{
2227	struct in_device *in_dev = __in_dev_get_rcu(dev);
2228	struct flow_keys *flkeys = NULL, _flkeys;
2229	struct net    *net = dev_net(dev);
2230	struct ip_tunnel_info *tun_info;
2231	int		err = -EINVAL;
2232	unsigned int	flags = 0;
2233	u32		itag = 0;
2234	struct rtable	*rth;
2235	struct flowi4	fl4;
2236	bool do_cache = true;
2237
2238	/* IP on this device is disabled. */
2239
2240	if (!in_dev)
2241		goto out;
2242
2243	/* Check for the most weird martians, which cannot be detected
2244	 * by fib_lookup().
2245	 */
2246
2247	tun_info = skb_tunnel_info(skb);
2248	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2249		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2250	else
2251		fl4.flowi4_tun_key.tun_id = 0;
2252	skb_dst_drop(skb);
2253
2254	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2255		goto martian_source;
2256
2257	res->fi = NULL;
2258	res->table = NULL;
2259	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2260		goto brd_input;
2261
2262	/* Accept zero addresses only for limited broadcast;
2263	 * I do not even know whether to fix this or not. Waiting for complaints :-)
2264	 */
2265	if (ipv4_is_zeronet(saddr))
2266		goto martian_source;
2267
2268	if (ipv4_is_zeronet(daddr))
2269		goto martian_destination;
2270
2271	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2272	 * and calls it at most once if daddr and/or saddr is a loopback address.
2273	 */
2274	if (ipv4_is_loopback(daddr)) {
2275		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2276			goto martian_destination;
2277	} else if (ipv4_is_loopback(saddr)) {
2278		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2279			goto martian_source;
2280	}
2281
2282	/*
2283	 *	Now we are ready to route packet.
2284	 */
2285	fl4.flowi4_l3mdev = 0;
2286	fl4.flowi4_oif = 0;
2287	fl4.flowi4_iif = dev->ifindex;
2288	fl4.flowi4_mark = skb->mark;
2289	fl4.flowi4_tos = tos;
2290	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2291	fl4.flowi4_flags = 0;
2292	fl4.daddr = daddr;
2293	fl4.saddr = saddr;
2294	fl4.flowi4_uid = sock_net_uid(net, NULL);
2295	fl4.flowi4_multipath_hash = 0;
2296
2297	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2298		flkeys = &_flkeys;
2299	} else {
2300		fl4.flowi4_proto = 0;
2301		fl4.fl4_sport = 0;
2302		fl4.fl4_dport = 0;
2303	}
2304
2305	err = fib_lookup(net, &fl4, res, 0);
2306	if (err != 0) {
2307		if (!IN_DEV_FORWARD(in_dev))
2308			err = -EHOSTUNREACH;
2309		goto no_route;
2310	}
2311
2312	if (res->type == RTN_BROADCAST) {
2313		if (IN_DEV_BFORWARD(in_dev))
2314			goto make_route;
2315		/* do not cache if bc_forwarding is enabled */
2316		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2317			do_cache = false;
2318		goto brd_input;
2319	}
2320
2321	if (res->type == RTN_LOCAL) {
2322		err = fib_validate_source(skb, saddr, daddr, tos,
2323					  0, dev, in_dev, &itag);
2324		if (err < 0)
2325			goto martian_source;
2326		goto local_input;
2327	}
2328
2329	if (!IN_DEV_FORWARD(in_dev)) {
2330		err = -EHOSTUNREACH;
2331		goto no_route;
2332	}
2333	if (res->type != RTN_UNICAST)
2334		goto martian_destination;
2335
2336make_route:
2337	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2338out:	return err;
2339
2340brd_input:
2341	if (skb->protocol != htons(ETH_P_IP))
2342		goto e_inval;
2343
2344	if (!ipv4_is_zeronet(saddr)) {
2345		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2346					  in_dev, &itag);
2347		if (err < 0)
2348			goto martian_source;
2349	}
2350	flags |= RTCF_BROADCAST;
2351	res->type = RTN_BROADCAST;
2352	RT_CACHE_STAT_INC(in_brd);
2353
2354local_input:
2355	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
2356		IPCB(skb)->flags |= IPSKB_NOPOLICY;
2357
2358	do_cache &= res->fi && !itag;
2359	if (do_cache) {
2360		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2361
2362		rth = rcu_dereference(nhc->nhc_rth_input);
2363		if (rt_cache_valid(rth)) {
2364			skb_dst_set_noref(skb, &rth->dst);
2365			err = 0;
2366			goto out;
2367		}
2368	}
2369
2370	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2371			   flags | RTCF_LOCAL, res->type, false);
2372	if (!rth)
2373		goto e_nobufs;
2374
2375	rth->dst.output = ip_rt_bug;
2376#ifdef CONFIG_IP_ROUTE_CLASSID
2377	rth->dst.tclassid = itag;
2378#endif
2379	rth->rt_is_input = 1;
2380
2381	RT_CACHE_STAT_INC(in_slow_tot);
2382	if (res->type == RTN_UNREACHABLE) {
2383		rth->dst.input = ip_error;
2384		rth->dst.error = -err;
2385		rth->rt_flags	&= ~RTCF_LOCAL;
2386	}
2387
2388	if (do_cache) {
2389		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2390
2391		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2392		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2393			WARN_ON(rth->dst.input == lwtunnel_input);
2394			rth->dst.lwtstate->orig_input = rth->dst.input;
2395			rth->dst.input = lwtunnel_input;
2396		}
2397
2398		if (unlikely(!rt_cache_route(nhc, rth)))
2399			rt_add_uncached_list(rth);
2400	}
2401	skb_dst_set(skb, &rth->dst);
2402	err = 0;
2403	goto out;
2404
2405no_route:
2406	RT_CACHE_STAT_INC(in_no_route);
2407	res->type = RTN_UNREACHABLE;
2408	res->fi = NULL;
2409	res->table = NULL;
2410	goto local_input;
2411
2412	/*
2413	 *	Do not cache martian addresses: they should be logged (RFC1812)
2414	 */
2415martian_destination:
2416	RT_CACHE_STAT_INC(in_martian_dst);
2417#ifdef CONFIG_IP_ROUTE_VERBOSE
2418	if (IN_DEV_LOG_MARTIANS(in_dev))
2419		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2420				     &daddr, &saddr, dev->name);
2421#endif
2422
2423e_inval:
2424	err = -EINVAL;
2425	goto out;
2426
2427e_nobufs:
2428	err = -ENOBUFS;
2429	goto out;
2430
2431martian_source:
2432	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2433	goto out;
2434}
2435
2436/* called with rcu_read_lock held */
2437static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2438			      u8 tos, struct net_device *dev, struct fib_result *res)
2439{
2440	/* Multicast recognition logic was moved from the route cache to here.
2441	 * The problem was that too many Ethernet cards have broken/missing
2442	 * hardware multicast filters :-( As a result a host on a multicast
2443	 * network acquires a lot of useless route cache entries, e.g. from
2444	 * SDR messages from all over the world. Now we try to get rid of them.
2445	 * Really, provided the software IP multicast filter is organized
2446	 * reasonably (at least, hashed), it does not result in a slowdown
2447	 * compared with route cache reject entries.
2448	 * Note that multicast routers are not affected, because
2449	 * a route cache entry is created eventually.
2450	 */
2451	if (ipv4_is_multicast(daddr)) {
2452		struct in_device *in_dev = __in_dev_get_rcu(dev);
2453		int our = 0;
2454		int err = -EINVAL;
2455
2456		if (!in_dev)
2457			return err;
2458		our = ip_check_mc_rcu(in_dev, daddr, saddr,
2459				      ip_hdr(skb)->protocol);
2460
2461		/* check l3 master if no match yet */
2462		if (!our && netif_is_l3_slave(dev)) {
2463			struct in_device *l3_in_dev;
2464
2465			l3_in_dev = __in_dev_get_rcu(skb->dev);
2466			if (l3_in_dev)
2467				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2468						      ip_hdr(skb)->protocol);
2469		}
2470
2471		if (our
2472#ifdef CONFIG_IP_MROUTE
2473			||
2474		    (!ipv4_is_local_multicast(daddr) &&
2475		     IN_DEV_MFORWARD(in_dev))
2476#endif
2477		   ) {
2478			err = ip_route_input_mc(skb, daddr, saddr,
2479						tos, dev, our);
2480		}
2481		return err;
2482	}
2483
2484	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2485}
2486
2487int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2488			 u8 tos, struct net_device *dev)
2489{
2490	struct fib_result res;
2491	int err;
2492
2493	tos &= IPTOS_RT_MASK;
2494	rcu_read_lock();
2495	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2496	rcu_read_unlock();
2497
2498	return err;
2499}
2500EXPORT_SYMBOL(ip_route_input_noref);
2501
2502/* called with rcu_read_lock() */
2503static struct rtable *__mkroute_output(const struct fib_result *res,
2504				       const struct flowi4 *fl4, int orig_oif,
2505				       struct net_device *dev_out,
2506				       unsigned int flags)
2507{
2508	struct fib_info *fi = res->fi;
2509	struct fib_nh_exception *fnhe;
2510	struct in_device *in_dev;
2511	u16 type = res->type;
2512	struct rtable *rth;
2513	bool do_cache;
2514
2515	in_dev = __in_dev_get_rcu(dev_out);
2516	if (!in_dev)
2517		return ERR_PTR(-EINVAL);
2518
2519	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2520		if (ipv4_is_loopback(fl4->saddr) &&
2521		    !(dev_out->flags & IFF_LOOPBACK) &&
2522		    !netif_is_l3_master(dev_out))
2523			return ERR_PTR(-EINVAL);
2524
2525	if (ipv4_is_lbcast(fl4->daddr))
2526		type = RTN_BROADCAST;
2527	else if (ipv4_is_multicast(fl4->daddr))
2528		type = RTN_MULTICAST;
2529	else if (ipv4_is_zeronet(fl4->daddr))
2530		return ERR_PTR(-EINVAL);
2531
2532	if (dev_out->flags & IFF_LOOPBACK)
2533		flags |= RTCF_LOCAL;
2534
2535	do_cache = true;
2536	if (type == RTN_BROADCAST) {
2537		flags |= RTCF_BROADCAST | RTCF_LOCAL;
2538		fi = NULL;
2539	} else if (type == RTN_MULTICAST) {
2540		flags |= RTCF_MULTICAST | RTCF_LOCAL;
2541		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2542				     fl4->flowi4_proto))
2543			flags &= ~RTCF_LOCAL;
2544		else
2545			do_cache = false;
2546		/* If a multicast route does not exist, use the
2547		 * default one, but do not use the gateway in this case.
2548		 * Yes, it is a hack.
2549		 */
2550		if (fi && res->prefixlen < 4)
2551			fi = NULL;
2552	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2553		   (orig_oif != dev_out->ifindex)) {
2554		/* For local routes that require a particular output interface
2555		 * we do not want to cache the result.  Caching the result
2556		 * causes incorrect behaviour when there are multiple source
2557		 * addresses on the interface, the end result being that if the
2558		 * intended recipient is waiting on that interface for the
2559		 * packet, they won't receive it because it will be delivered on
2560		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2561		 * be set to the loopback interface as well.
2562		 */
2563		do_cache = false;
2564	}
2565
2566	fnhe = NULL;
2567	do_cache &= fi != NULL;
2568	if (fi) {
2569		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2570		struct rtable __rcu **prth;
2571
2572		fnhe = find_exception(nhc, fl4->daddr);
2573		if (!do_cache)
2574			goto add;
2575		if (fnhe) {
2576			prth = &fnhe->fnhe_rth_output;
2577		} else {
2578			if (unlikely(fl4->flowi4_flags &
2579				     FLOWI_FLAG_KNOWN_NH &&
2580				     !(nhc->nhc_gw_family &&
2581				       nhc->nhc_scope == RT_SCOPE_LINK))) {
2582				do_cache = false;
2583				goto add;
2584			}
2585			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2586		}
2587		rth = rcu_dereference(*prth);
2588		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2589			return rth;
2590	}
2591
2592add:
2593	rth = rt_dst_alloc(dev_out, flags, type,
2594			   IN_DEV_ORCONF(in_dev, NOXFRM));
2595	if (!rth)
2596		return ERR_PTR(-ENOBUFS);
2597
2598	rth->rt_iif = orig_oif;
2599
2600	RT_CACHE_STAT_INC(out_slow_tot);
2601
2602	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2603		if (flags & RTCF_LOCAL &&
2604		    !(dev_out->flags & IFF_LOOPBACK)) {
2605			rth->dst.output = ip_mc_output;
2606			RT_CACHE_STAT_INC(out_slow_mc);
2607		}
2608#ifdef CONFIG_IP_MROUTE
2609		if (type == RTN_MULTICAST) {
2610			if (IN_DEV_MFORWARD(in_dev) &&
2611			    !ipv4_is_local_multicast(fl4->daddr)) {
2612				rth->dst.input = ip_mr_input;
2613				rth->dst.output = ip_mc_output;
2614			}
2615		}
2616#endif
2617	}
2618
2619	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2620	lwtunnel_set_redirect(&rth->dst);
2621
2622	return rth;
2623}
2624
2625/*
2626 * Major route resolver routine.
2627 */
2628
2629struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2630					const struct sk_buff *skb)
2631{
2632	struct fib_result res = {
2633		.type		= RTN_UNSPEC,
2634		.fi		= NULL,
2635		.table		= NULL,
2636		.tclassid	= 0,
2637	};
2638	struct rtable *rth;
2639
2640	fl4->flowi4_iif = LOOPBACK_IFINDEX;
2641	ip_rt_fix_tos(fl4);
2642
2643	rcu_read_lock();
2644	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2645	rcu_read_unlock();
2646
2647	return rth;
2648}
2649EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
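/* Note: ip_route_output_key_hash() above is the common resolver for locally
 * generated traffic; callers typically reach it indirectly through wrappers
 * such as ip_route_output_key() or ip_route_output_flow() below, the latter
 * additionally performing the xfrm policy lookup.
 */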
2650
2651struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2652					    struct fib_result *res,
2653					    const struct sk_buff *skb)
2654{
2655	struct net_device *dev_out = NULL;
2656	int orig_oif = fl4->flowi4_oif;
2657	unsigned int flags = 0;
2658	struct rtable *rth;
2659	int err;
2660
2661	if (fl4->saddr) {
2662		if (ipv4_is_multicast(fl4->saddr) ||
2663		    ipv4_is_lbcast(fl4->saddr) ||
2664		    ipv4_is_zeronet(fl4->saddr)) {
2665			rth = ERR_PTR(-EINVAL);
2666			goto out;
2667		}
2668
2669		rth = ERR_PTR(-ENETUNREACH);
2670
2671		/* I removed the check for oif == dev_out->oif here.
2672		 * It was wrong for two reasons:
2673		 * 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2674		 *    is assigned to multiple interfaces.
2675		 * 2. Moreover, we are allowed to send packets with the saddr
2676		 *    of another iface. --ANK
2677		 */
2678
2679		if (fl4->flowi4_oif == 0 &&
2680		    (ipv4_is_multicast(fl4->daddr) ||
2681		     ipv4_is_lbcast(fl4->daddr))) {
2682			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2683			dev_out = __ip_dev_find(net, fl4->saddr, false);
2684			if (!dev_out)
2685				goto out;
2686
2687			/* Special hack: the user can direct multicasts
2688			 * and limited broadcasts via the desired interface
2689			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2690			 * This hack is not just for fun, it allows
2691			 * vic, vat and friends to work.
2692			 * They bind the socket to loopback, set the ttl to zero
2693			 * and expect that it will work.
2694			 * From the viewpoint of the routing cache they are broken,
2695			 * because we are not allowed to build a multicast path
2696			 * with a loopback source address (the routing cache
2697			 * cannot know that the ttl is zero, so that the packet
2698			 * will not leave this host and the route is valid).
2699			 * Luckily, this hack is a good workaround.
2700			 */
2701
2702			fl4->flowi4_oif = dev_out->ifindex;
2703			goto make_route;
2704		}
2705
2706		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2707			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2708			if (!__ip_dev_find(net, fl4->saddr, false))
2709				goto out;
2710		}
2711	}
2712
2713
2714	if (fl4->flowi4_oif) {
2715		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2716		rth = ERR_PTR(-ENODEV);
2717		if (!dev_out)
2718			goto out;
2719
2720		/* RACE: Check return value of inet_select_addr instead. */
2721		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2722			rth = ERR_PTR(-ENETUNREACH);
2723			goto out;
2724		}
2725		if (ipv4_is_local_multicast(fl4->daddr) ||
2726		    ipv4_is_lbcast(fl4->daddr) ||
2727		    fl4->flowi4_proto == IPPROTO_IGMP) {
2728			if (!fl4->saddr)
2729				fl4->saddr = inet_select_addr(dev_out, 0,
2730							      RT_SCOPE_LINK);
2731			goto make_route;
2732		}
2733		if (!fl4->saddr) {
2734			if (ipv4_is_multicast(fl4->daddr))
2735				fl4->saddr = inet_select_addr(dev_out, 0,
2736							      fl4->flowi4_scope);
2737			else if (!fl4->daddr)
2738				fl4->saddr = inet_select_addr(dev_out, 0,
2739							      RT_SCOPE_HOST);
2740		}
2741	}
2742
2743	if (!fl4->daddr) {
2744		fl4->daddr = fl4->saddr;
2745		if (!fl4->daddr)
2746			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2747		dev_out = net->loopback_dev;
2748		fl4->flowi4_oif = LOOPBACK_IFINDEX;
2749		res->type = RTN_LOCAL;
2750		flags |= RTCF_LOCAL;
2751		goto make_route;
2752	}
2753
2754	err = fib_lookup(net, fl4, res, 0);
2755	if (err) {
2756		res->fi = NULL;
2757		res->table = NULL;
2758		if (fl4->flowi4_oif &&
2759		    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
2760			/* Apparently, the routing tables are wrong. Assume
2761			 * that the destination is on-link.
2762			 *
2763			 * WHY? DW.
2764			 * Because we are allowed to send to an iface
2765			 * even if it has NO routes and NO assigned
2766			 * addresses. When oif is specified, the routing
2767			 * tables are looked up with only one purpose:
2768			 * to catch whether the destination is gatewayed rather
2769			 * than direct. Moreover, if MSG_DONTROUTE is set,
2770			 * we send the packet, ignoring both the routing tables
2771			 * and the ifaddr state. --ANK
2772			 *
2773			 *
2774			 * We could do this even when oif is unknown
2775			 * (as IPv6 likely does), but we do not.
2776			 */
2777
2778			if (fl4->saddr == 0)
2779				fl4->saddr = inet_select_addr(dev_out, 0,
2780							      RT_SCOPE_LINK);
2781			res->type = RTN_UNICAST;
2782			goto make_route;
2783		}
2784		rth = ERR_PTR(err);
2785		goto out;
2786	}
2787
2788	if (res->type == RTN_LOCAL) {
2789		if (!fl4->saddr) {
2790			if (res->fi->fib_prefsrc)
2791				fl4->saddr = res->fi->fib_prefsrc;
2792			else
2793				fl4->saddr = fl4->daddr;
2794		}
2795
2796		/* L3 master device is the loopback for that domain */
2797		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2798			net->loopback_dev;
2799
2800		/* make sure orig_oif points to fib result device even
2801		 * though packet rx/tx happens over loopback or l3mdev
2802		 */
2803		orig_oif = FIB_RES_OIF(*res);
2804
2805		fl4->flowi4_oif = dev_out->ifindex;
2806		flags |= RTCF_LOCAL;
2807		goto make_route;
2808	}
2809
2810	fib_select_path(net, res, fl4, skb);
2811
2812	dev_out = FIB_RES_DEV(*res);
2813
2814make_route:
2815	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2816
2817out:
2818	return rth;
2819}
2820
2821static struct dst_ops ipv4_dst_blackhole_ops = {
2822	.family			= AF_INET,
2823	.default_advmss		= ipv4_default_advmss,
2824	.neigh_lookup		= ipv4_neigh_lookup,
2825	.check			= dst_blackhole_check,
2826	.cow_metrics		= dst_blackhole_cow_metrics,
2827	.update_pmtu		= dst_blackhole_update_pmtu,
2828	.redirect		= dst_blackhole_redirect,
2829	.mtu			= dst_blackhole_mtu,
2830};
2831
2832struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2833{
2834	struct rtable *ort = (struct rtable *) dst_orig;
2835	struct rtable *rt;
2836
2837	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
2838	if (rt) {
2839		struct dst_entry *new = &rt->dst;
2840
2841		new->__use = 1;
2842		new->input = dst_discard;
2843		new->output = dst_discard_out;
2844
2845		new->dev = net->loopback_dev;
2846		netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
2847
2848		rt->rt_is_input = ort->rt_is_input;
2849		rt->rt_iif = ort->rt_iif;
2850		rt->rt_pmtu = ort->rt_pmtu;
2851		rt->rt_mtu_locked = ort->rt_mtu_locked;
2852
2853		rt->rt_genid = rt_genid_ipv4(net);
2854		rt->rt_flags = ort->rt_flags;
2855		rt->rt_type = ort->rt_type;
2856		rt->rt_uses_gateway = ort->rt_uses_gateway;
2857		rt->rt_gw_family = ort->rt_gw_family;
2858		if (rt->rt_gw_family == AF_INET)
2859			rt->rt_gw4 = ort->rt_gw4;
2860		else if (rt->rt_gw_family == AF_INET6)
2861			rt->rt_gw6 = ort->rt_gw6;
2862	}
2863
2864	dst_release(dst_orig);
2865
2866	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2867}
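/* Note: the blackhole copy made above keeps the routing attributes of
 * dst_orig (whose reference it releases) but discards every packet, since
 * its input/output handlers are dst_discard/dst_discard_out. It is used,
 * e.g., by the xfrm code to hand out a harmless dst while policy/state
 * resolution is still pending.
 */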
2868
2869struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2870				    const struct sock *sk)
2871{
2872	struct rtable *rt = __ip_route_output_key(net, flp4);
2873
2874	if (IS_ERR(rt))
2875		return rt;
2876
2877	if (flp4->flowi4_proto) {
2878		flp4->flowi4_oif = rt->dst.dev->ifindex;
2879		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2880							flowi4_to_flowi(flp4),
2881							sk, 0);
2882	}
2883
2884	return rt;
2885}
2886EXPORT_SYMBOL_GPL(ip_route_output_flow);
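/* Illustrative sketch (not part of the original file): a minimal,
 * hypothetical caller of ip_route_output_flow(). Only the flowi4/rtable
 * helpers are real kernel interfaces; example_output_route() itself is
 * made up, and the block is compiled out.
 */
#if 0
static int example_output_route(struct net *net, struct sock *sk, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.flowi4_proto	= IPPROTO_UDP,
	};
	struct rtable *rt;

	rt = ip_route_output_flow(net, &fl4, sk);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* fl4.saddr now holds the source address picked by the lookup. */
	pr_debug("route via %s, src %pI4\n", rt->dst.dev->name, &fl4.saddr);

	ip_rt_put(rt);		/* drop the reference taken by the lookup */
	return 0;
}
#endif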
2887
2888/* called with rcu_read_lock held */
2889static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2890			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2891			struct sk_buff *skb, u32 portid, u32 seq,
2892			unsigned int flags)
2893{
2894	struct rtmsg *r;
2895	struct nlmsghdr *nlh;
2896	unsigned long expires = 0;
2897	u32 error;
2898	u32 metrics[RTAX_MAX];
2899
2900	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2901	if (!nlh)
2902		return -EMSGSIZE;
2903
2904	r = nlmsg_data(nlh);
2905	r->rtm_family	 = AF_INET;
2906	r->rtm_dst_len	= 32;
2907	r->rtm_src_len	= 0;
2908	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
2909	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
2910	if (nla_put_u32(skb, RTA_TABLE, table_id))
2911		goto nla_put_failure;
2912	r->rtm_type	= rt->rt_type;
2913	r->rtm_scope	= RT_SCOPE_UNIVERSE;
2914	r->rtm_protocol = RTPROT_UNSPEC;
2915	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2916	if (rt->rt_flags & RTCF_NOTIFY)
2917		r->rtm_flags |= RTM_F_NOTIFY;
2918	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2919		r->rtm_flags |= RTCF_DOREDIRECT;
2920
2921	if (nla_put_in_addr(skb, RTA_DST, dst))
2922		goto nla_put_failure;
2923	if (src) {
2924		r->rtm_src_len = 32;
2925		if (nla_put_in_addr(skb, RTA_SRC, src))
2926			goto nla_put_failure;
2927	}
2928	if (rt->dst.dev &&
2929	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2930		goto nla_put_failure;
2931	if (rt->dst.lwtstate &&
2932	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
2933		goto nla_put_failure;
2934#ifdef CONFIG_IP_ROUTE_CLASSID
2935	if (rt->dst.tclassid &&
2936	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2937		goto nla_put_failure;
2938#endif
2939	if (fl4 && !rt_is_input_route(rt) &&
2940	    fl4->saddr != src) {
2941		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2942			goto nla_put_failure;
2943	}
2944	if (rt->rt_uses_gateway) {
2945		if (rt->rt_gw_family == AF_INET &&
2946		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2947			goto nla_put_failure;
2948		} else if (rt->rt_gw_family == AF_INET6) {
2949			int alen = sizeof(struct in6_addr);
2950			struct nlattr *nla;
2951			struct rtvia *via;
2952
2953			nla = nla_reserve(skb, RTA_VIA, alen + 2);
2954			if (!nla)
2955				goto nla_put_failure;
2956
2957			via = nla_data(nla);
2958			via->rtvia_family = AF_INET6;
2959			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2960		}
2961	}
2962
2963	expires = rt->dst.expires;
2964	if (expires) {
2965		unsigned long now = jiffies;
2966
2967		if (time_before(now, expires))
2968			expires -= now;
2969		else
2970			expires = 0;
2971	}
2972
2973	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2974	if (rt->rt_pmtu && expires)
2975		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2976	if (rt->rt_mtu_locked && expires)
2977		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2978	if (rtnetlink_put_metrics(skb, metrics) < 0)
2979		goto nla_put_failure;
2980
2981	if (fl4) {
2982		if (fl4->flowi4_mark &&
2983		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2984			goto nla_put_failure;
2985
2986		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2987		    nla_put_u32(skb, RTA_UID,
2988				from_kuid_munged(current_user_ns(),
2989						 fl4->flowi4_uid)))
2990			goto nla_put_failure;
2991
2992		if (rt_is_input_route(rt)) {
2993#ifdef CONFIG_IP_MROUTE
2994			if (ipv4_is_multicast(dst) &&
2995			    !ipv4_is_local_multicast(dst) &&
2996			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2997				int err = ipmr_get_route(net, skb,
2998							 fl4->saddr, fl4->daddr,
2999							 r, portid);
3000
3001				if (err <= 0) {
3002					if (err == 0)
3003						return 0;
3004					goto nla_put_failure;
3005				}
3006			} else
3007#endif
3008				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
3009					goto nla_put_failure;
3010		}
3011	}
3012
3013	error = rt->dst.error;
3014
3015	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
3016		goto nla_put_failure;
3017
3018	nlmsg_end(skb, nlh);
3019	return 0;
3020
3021nla_put_failure:
3022	nlmsg_cancel(skb, nlh);
3023	return -EMSGSIZE;
3024}
3025
3026static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
3027			    struct netlink_callback *cb, u32 table_id,
3028			    struct fnhe_hash_bucket *bucket, int genid,
3029			    int *fa_index, int fa_start, unsigned int flags)
3030{
3031	int i;
3032
3033	for (i = 0; i < FNHE_HASH_SIZE; i++) {
3034		struct fib_nh_exception *fnhe;
3035
3036		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3037		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
3038			struct rtable *rt;
3039			int err;
3040
3041			if (*fa_index < fa_start)
3042				goto next;
3043
3044			if (fnhe->fnhe_genid != genid)
3045				goto next;
3046
3047			if (fnhe->fnhe_expires &&
3048			    time_after(jiffies, fnhe->fnhe_expires))
3049				goto next;
3050
3051			rt = rcu_dereference(fnhe->fnhe_rth_input);
3052			if (!rt)
3053				rt = rcu_dereference(fnhe->fnhe_rth_output);
3054			if (!rt)
3055				goto next;
3056
3057			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3058					   table_id, NULL, skb,
3059					   NETLINK_CB(cb->skb).portid,
3060					   cb->nlh->nlmsg_seq, flags);
3061			if (err)
3062				return err;
3063next:
3064			(*fa_index)++;
3065		}
3066	}
3067
3068	return 0;
3069}
3070
3071int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3072		       u32 table_id, struct fib_info *fi,
3073		       int *fa_index, int fa_start, unsigned int flags)
3074{
3075	struct net *net = sock_net(cb->skb->sk);
3076	int nhsel, genid = fnhe_genid(net);
3077
3078	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3079		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3080		struct fnhe_hash_bucket *bucket;
3081		int err;
3082
3083		if (nhc->nhc_flags & RTNH_F_DEAD)
3084			continue;
3085
3086		rcu_read_lock();
3087		bucket = rcu_dereference(nhc->nhc_exceptions);
3088		err = 0;
3089		if (bucket)
3090			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3091					       genid, fa_index, fa_start,
3092					       flags);
3093		rcu_read_unlock();
3094		if (err)
3095			return err;
3096	}
3097
3098	return 0;
3099}
3100
3101static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3102						   u8 ip_proto, __be16 sport,
3103						   __be16 dport)
3104{
3105	struct sk_buff *skb;
3106	struct iphdr *iph;
3107
3108	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3109	if (!skb)
3110		return NULL;
3111
3112	/* Reserve room for dummy headers; this skb can pass
3113	 * through a good chunk of the routing engine.
3114	 */
3115	skb_reset_mac_header(skb);
3116	skb_reset_network_header(skb);
3117	skb->protocol = htons(ETH_P_IP);
3118	iph = skb_put(skb, sizeof(struct iphdr));
3119	iph->protocol = ip_proto;
3120	iph->saddr = src;
3121	iph->daddr = dst;
3122	iph->version = 0x4;
3123	iph->frag_off = 0;
3124	iph->ihl = 0x5;
3125	skb_set_transport_header(skb, skb->len);
3126
3127	switch (iph->protocol) {
3128	case IPPROTO_UDP: {
3129		struct udphdr *udph;
3130
3131		udph = skb_put_zero(skb, sizeof(struct udphdr));
3132		udph->source = sport;
3133		udph->dest = dport;
3134		udph->len = htons(sizeof(struct udphdr));
3135		udph->check = 0;
3136		break;
3137	}
3138	case IPPROTO_TCP: {
3139		struct tcphdr *tcph;
3140
3141		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3142		tcph->source	= sport;
3143		tcph->dest	= dport;
3144		tcph->doff	= sizeof(struct tcphdr) / 4;
3145		tcph->rst = 1;
3146		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3147					    src, dst, 0);
3148		break;
3149	}
3150	case IPPROTO_ICMP: {
3151		struct icmphdr *icmph;
3152
3153		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3154		icmph->type = ICMP_ECHO;
3155		icmph->code = 0;
3156	}
3157	}
3158
3159	return skb;
3160}
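/* Note: the dummy skb built above carries just enough IPv4 and L4 header for
 * the route lookup to behave as it would for real traffic, e.g. so that fib
 * rules or multipath hashing keyed on the protocol and ports can match
 * during an RTM_GETROUTE query.
 */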
3161
3162static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3163				       const struct nlmsghdr *nlh,
3164				       struct nlattr **tb,
3165				       struct netlink_ext_ack *extack)
3166{
3167	struct rtmsg *rtm;
3168	int i, err;
3169
3170	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3171		NL_SET_ERR_MSG(extack,
3172			       "ipv4: Invalid header for route get request");
3173		return -EINVAL;
3174	}
3175
3176	if (!netlink_strict_get_check(skb))
3177		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3178					      rtm_ipv4_policy, extack);
3179
3180	rtm = nlmsg_data(nlh);
3181	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3182	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3183	    rtm->rtm_table || rtm->rtm_protocol ||
3184	    rtm->rtm_scope || rtm->rtm_type) {
3185		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3186		return -EINVAL;
3187	}
3188
3189	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3190			       RTM_F_LOOKUP_TABLE |
3191			       RTM_F_FIB_MATCH)) {
3192		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3193		return -EINVAL;
3194	}
3195
3196	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3197					    rtm_ipv4_policy, extack);
3198	if (err)
3199		return err;
3200
3201	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3202	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3203		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3204		return -EINVAL;
3205	}
3206
3207	for (i = 0; i <= RTA_MAX; i++) {
3208		if (!tb[i])
3209			continue;
3210
3211		switch (i) {
3212		case RTA_IIF:
3213		case RTA_OIF:
3214		case RTA_SRC:
3215		case RTA_DST:
3216		case RTA_IP_PROTO:
3217		case RTA_SPORT:
3218		case RTA_DPORT:
3219		case RTA_MARK:
3220		case RTA_UID:
3221			break;
3222		default:
3223			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3224			return -EINVAL;
3225		}
3226	}
3227
3228	return 0;
3229}
3230
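/* Handle an RTM_GETROUTE request (what, for example, "ip route get ..."
 * sends): build a dummy skb, resolve the route the same way the data path
 * would, then report the result back over netlink.
 */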
3231static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3232			     struct netlink_ext_ack *extack)
3233{
3234	struct net *net = sock_net(in_skb->sk);
3235	struct nlattr *tb[RTA_MAX+1];
3236	u32 table_id = RT_TABLE_MAIN;
3237	__be16 sport = 0, dport = 0;
3238	struct fib_result res = {};
3239	u8 ip_proto = IPPROTO_UDP;
3240	struct rtable *rt = NULL;
3241	struct sk_buff *skb;
3242	struct rtmsg *rtm;
3243	struct flowi4 fl4 = {};
3244	__be32 dst = 0;
3245	__be32 src = 0;
3246	kuid_t uid;
3247	u32 iif;
3248	int err;
3249	int mark;
3250
3251	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3252	if (err < 0)
3253		return err;
3254
3255	rtm = nlmsg_data(nlh);
3256	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3257	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3258	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3259	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3260	if (tb[RTA_UID])
3261		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3262	else
3263		uid = (iif ? INVALID_UID : current_uid());
3264
3265	if (tb[RTA_IP_PROTO]) {
3266		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3267						  &ip_proto, AF_INET, extack);
3268		if (err)
3269			return err;
3270	}
3271
3272	if (tb[RTA_SPORT])
3273		sport = nla_get_be16(tb[RTA_SPORT]);
3274
3275	if (tb[RTA_DPORT])
3276		dport = nla_get_be16(tb[RTA_DPORT]);
3277
3278	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3279	if (!skb)
3280		return -ENOBUFS;
3281
3282	fl4.daddr = dst;
3283	fl4.saddr = src;
3284	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3285	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3286	fl4.flowi4_mark = mark;
3287	fl4.flowi4_uid = uid;
3288	if (sport)
3289		fl4.fl4_sport = sport;
3290	if (dport)
3291		fl4.fl4_dport = dport;
3292	fl4.flowi4_proto = ip_proto;
3293
3294	rcu_read_lock();
3295
3296	if (iif) {
3297		struct net_device *dev;
3298
3299		dev = dev_get_by_index_rcu(net, iif);
3300		if (!dev) {
3301			err = -ENODEV;
3302			goto errout_rcu;
3303		}
3304
3305		fl4.flowi4_iif = iif; /* for rt_fill_info */
3306		skb->dev	= dev;
3307		skb->mark	= mark;
3308		err = ip_route_input_rcu(skb, dst, src,
3309					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
3310					 &res);
3311
3312		rt = skb_rtable(skb);
3313		if (err == 0 && rt->dst.error)
3314			err = -rt->dst.error;
3315	} else {
3316		fl4.flowi4_iif = LOOPBACK_IFINDEX;
3317		skb->dev = net->loopback_dev;
3318		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3319		err = 0;
3320		if (IS_ERR(rt))
3321			err = PTR_ERR(rt);
3322		else
3323			skb_dst_set(skb, &rt->dst);
3324	}
3325
3326	if (err)
3327		goto errout_rcu;
3328
3329	if (rtm->rtm_flags & RTM_F_NOTIFY)
3330		rt->rt_flags |= RTCF_NOTIFY;
3331
3332	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3333		table_id = res.table ? res.table->tb_id : 0;
3334
3335	/* reset skb for netlink reply msg */
3336	skb_trim(skb, 0);
3337	skb_reset_network_header(skb);
3338	skb_reset_transport_header(skb);
3339	skb_reset_mac_header(skb);
3340
3341	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3342		struct fib_rt_info fri;
3343
3344		if (!res.fi) {
3345			err = fib_props[res.type].error;
3346			if (!err)
3347				err = -EHOSTUNREACH;
3348			goto errout_rcu;
3349		}
3350		fri.fi = res.fi;
3351		fri.tb_id = table_id;
3352		fri.dst = res.prefix;
3353		fri.dst_len = res.prefixlen;
3354		fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos);
3355		fri.type = rt->rt_type;
3356		fri.offload = 0;
3357		fri.trap = 0;
3358		fri.offload_failed = 0;
3359		if (res.fa_head) {
3360			struct fib_alias *fa;
3361
3362			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3363				u8 slen = 32 - fri.dst_len;
3364
3365				if (fa->fa_slen == slen &&
3366				    fa->tb_id == fri.tb_id &&
3367				    fa->fa_dscp == fri.dscp &&
3368				    fa->fa_info == res.fi &&
3369				    fa->fa_type == fri.type) {
3370					fri.offload = READ_ONCE(fa->offload);
3371					fri.trap = READ_ONCE(fa->trap);
3372					fri.offload_failed =
3373						READ_ONCE(fa->offload_failed);
3374					break;
3375				}
3376			}
3377		}
3378		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3379				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3380	} else {
3381		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3382				   NETLINK_CB(in_skb).portid,
3383				   nlh->nlmsg_seq, 0);
3384	}
3385	if (err < 0)
3386		goto errout_rcu;
3387
3388	rcu_read_unlock();
3389
3390	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3391
3392errout_free:
3393	return err;
3394errout_rcu:
3395	rcu_read_unlock();
3396	kfree_skb(skb);
3397	goto errout_free;
3398}
3399
3400void ip_rt_multicast_event(struct in_device *in_dev)
3401{
3402	rt_cache_flush(dev_net(in_dev->dev));
3403}
3404
3405#ifdef CONFIG_SYSCTL
3406static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3407static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
3408static int ip_rt_gc_elasticity __read_mostly	= 8;
3409static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;
3410
3411static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3412		void *buffer, size_t *lenp, loff_t *ppos)
3413{
3414	struct net *net = (struct net *)__ctl->extra1;
3415
3416	if (write) {
3417		rt_cache_flush(net);
3418		fnhe_genid_bump(net);
3419		return 0;
3420	}
3421
3422	return -EINVAL;
3423}
3424
3425static struct ctl_table ipv4_route_table[] = {
3426	{
3427		.procname	= "gc_thresh",
3428		.data		= &ipv4_dst_ops.gc_thresh,
3429		.maxlen		= sizeof(int),
3430		.mode		= 0644,
3431		.proc_handler	= proc_dointvec,
3432	},
3433	{
3434		.procname	= "max_size",
3435		.data		= &ip_rt_max_size,
3436		.maxlen		= sizeof(int),
3437		.mode		= 0644,
3438		.proc_handler	= proc_dointvec,
3439	},
3440	{
3441		/*  Deprecated. Use gc_min_interval_ms */
3442
3443		.procname	= "gc_min_interval",
3444		.data		= &ip_rt_gc_min_interval,
3445		.maxlen		= sizeof(int),
3446		.mode		= 0644,
3447		.proc_handler	= proc_dointvec_jiffies,
3448	},
3449	{
3450		.procname	= "gc_min_interval_ms",
3451		.data		= &ip_rt_gc_min_interval,
3452		.maxlen		= sizeof(int),
3453		.mode		= 0644,
3454		.proc_handler	= proc_dointvec_ms_jiffies,
3455	},
3456	{
3457		.procname	= "gc_timeout",
3458		.data		= &ip_rt_gc_timeout,
3459		.maxlen		= sizeof(int),
3460		.mode		= 0644,
3461		.proc_handler	= proc_dointvec_jiffies,
3462	},
3463	{
3464		.procname	= "gc_interval",
3465		.data		= &ip_rt_gc_interval,
3466		.maxlen		= sizeof(int),
3467		.mode		= 0644,
3468		.proc_handler	= proc_dointvec_jiffies,
3469	},
3470	{
3471		.procname	= "redirect_load",
3472		.data		= &ip_rt_redirect_load,
3473		.maxlen		= sizeof(int),
3474		.mode		= 0644,
3475		.proc_handler	= proc_dointvec,
3476	},
3477	{
3478		.procname	= "redirect_number",
3479		.data		= &ip_rt_redirect_number,
3480		.maxlen		= sizeof(int),
3481		.mode		= 0644,
3482		.proc_handler	= proc_dointvec,
3483	},
3484	{
3485		.procname	= "redirect_silence",
3486		.data		= &ip_rt_redirect_silence,
3487		.maxlen		= sizeof(int),
3488		.mode		= 0644,
3489		.proc_handler	= proc_dointvec,
3490	},
3491	{
3492		.procname	= "error_cost",
3493		.data		= &ip_rt_error_cost,
3494		.maxlen		= sizeof(int),
3495		.mode		= 0644,
3496		.proc_handler	= proc_dointvec,
3497	},
3498	{
3499		.procname	= "error_burst",
3500		.data		= &ip_rt_error_burst,
3501		.maxlen		= sizeof(int),
3502		.mode		= 0644,
3503		.proc_handler	= proc_dointvec,
3504	},
3505	{
3506		.procname	= "gc_elasticity",
3507		.data		= &ip_rt_gc_elasticity,
3508		.maxlen		= sizeof(int),
3509		.mode		= 0644,
3510		.proc_handler	= proc_dointvec,
3511	},
3512	{ }
3513};
3514
3515static const char ipv4_route_flush_procname[] = "flush";
3516
3517static struct ctl_table ipv4_route_netns_table[] = {
3518	{
3519		.procname	= ipv4_route_flush_procname,
3520		.maxlen		= sizeof(int),
3521		.mode		= 0200,
3522		.proc_handler	= ipv4_sysctl_rtcache_flush,
3523	},
3524	{
3525		.procname       = "min_pmtu",
3526		.data           = &init_net.ipv4.ip_rt_min_pmtu,
3527		.maxlen         = sizeof(int),
3528		.mode           = 0644,
3529		.proc_handler   = proc_dointvec_minmax,
3530		.extra1         = &ip_min_valid_pmtu,
3531	},
3532	{
3533		.procname       = "mtu_expires",
3534		.data           = &init_net.ipv4.ip_rt_mtu_expires,
3535		.maxlen         = sizeof(int),
3536		.mode           = 0644,
3537		.proc_handler   = proc_dointvec_jiffies,
3538	},
3539	{
3540		.procname   = "min_adv_mss",
3541		.data       = &init_net.ipv4.ip_rt_min_advmss,
3542		.maxlen     = sizeof(int),
3543		.mode       = 0644,
3544		.proc_handler   = proc_dointvec,
3545	},
3546	{ },
3547};
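/* Note: both tables above end up under /proc/sys/net/ipv4/route/ (see the
 * register_net_sysctl*() calls below). The per-netns "flush" entry is
 * write-only; writing anything to it, for instance
 * "echo 1 > /proc/sys/net/ipv4/route/flush", invalidates cached routes via
 * ipv4_sysctl_rtcache_flush().
 */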
3548
3549static __net_init int sysctl_route_net_init(struct net *net)
3550{
3551	struct ctl_table *tbl;
3552	size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
3553
3554	tbl = ipv4_route_netns_table;
3555	if (!net_eq(net, &init_net)) {
3556		int i;
3557
3558		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
3559		if (!tbl)
3560			goto err_dup;
3561
3562		/* Don't export non-whitelisted sysctls to unprivileged users */
3563		if (net->user_ns != &init_user_ns) {
3564			if (tbl[0].procname != ipv4_route_flush_procname) {
3565				tbl[0].procname = NULL;
3566				table_size = 0;
3567			}
3568		}
3569
3570		/* Update the variables to point into the current struct net,
3571		 * except for the first element, flush.
3572		 */
3573		for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++)
3574			tbl[i].data += (void *)net - (void *)&init_net;
3575	}
3576	tbl[0].extra1 = net;
3577
3578	net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
3579						     tbl, table_size);
3580	if (!net->ipv4.route_hdr)
3581		goto err_reg;
3582	return 0;
3583
3584err_reg:
3585	if (tbl != ipv4_route_netns_table)
3586		kfree(tbl);
3587err_dup:
3588	return -ENOMEM;
3589}
3590
3591static __net_exit void sysctl_route_net_exit(struct net *net)
3592{
3593	struct ctl_table *tbl;
3594
3595	tbl = net->ipv4.route_hdr->ctl_table_arg;
3596	unregister_net_sysctl_table(net->ipv4.route_hdr);
3597	BUG_ON(tbl == ipv4_route_netns_table);
3598	kfree(tbl);
3599}
3600
3601static __net_initdata struct pernet_operations sysctl_route_ops = {
3602	.init = sysctl_route_net_init,
3603	.exit = sysctl_route_net_exit,
3604};
3605#endif
3606
3607static __net_init int netns_ip_rt_init(struct net *net)
3608{
3609	/* Set default values for the per-netns sysctls */
3610	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
3611	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
3612	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
3613	return 0;
3614}
3615
3616static struct pernet_operations __net_initdata ip_rt_ops = {
3617	.init = netns_ip_rt_init,
3618};
3619
3620static __net_init int rt_genid_init(struct net *net)
3621{
3622	atomic_set(&net->ipv4.rt_genid, 0);
3623	atomic_set(&net->fnhe_genid, 0);
3624	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
3625	return 0;
3626}
3627
3628static __net_initdata struct pernet_operations rt_genid_ops = {
3629	.init = rt_genid_init,
3630};
3631
3632static int __net_init ipv4_inetpeer_init(struct net *net)
3633{
3634	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3635
3636	if (!bp)
3637		return -ENOMEM;
3638	inet_peer_base_init(bp);
3639	net->ipv4.peers = bp;
3640	return 0;
3641}
3642
3643static void __net_exit ipv4_inetpeer_exit(struct net *net)
3644{
3645	struct inet_peer_base *bp = net->ipv4.peers;
3646
3647	net->ipv4.peers = NULL;
3648	inetpeer_invalidate_tree(bp);
3649	kfree(bp);
3650}
3651
3652static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3653	.init	=	ipv4_inetpeer_init,
3654	.exit	=	ipv4_inetpeer_exit,
3655};
3656
3657#ifdef CONFIG_IP_ROUTE_CLASSID
3658struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3659#endif /* CONFIG_IP_ROUTE_CLASSID */
3660
3661int __init ip_rt_init(void)
3662{
3663	void *idents_hash;
3664	int cpu;
3665
3666	/* For modern hosts, this will use 2 MB of memory */
3667	idents_hash = alloc_large_system_hash("IP idents",
3668					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
3669					      0,
3670					      16, /* one bucket per 64 KB */
3671					      HASH_ZERO,
3672					      NULL,
3673					      &ip_idents_mask,
3674					      2048,
3675					      256*1024);
3676
3677	ip_idents = idents_hash;
3678
3679	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3680
3681	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3682
3683	for_each_possible_cpu(cpu) {
3684		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3685
3686		INIT_LIST_HEAD(&ul->head);
3687		INIT_LIST_HEAD(&ul->quarantine);
3688		spin_lock_init(&ul->lock);
3689	}
3690#ifdef CONFIG_IP_ROUTE_CLASSID
3691	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3692	if (!ip_rt_acct)
3693		panic("IP: failed to allocate ip_rt_acct\n");
3694#endif
3695
3696	ipv4_dst_ops.kmem_cachep =
3697		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3698				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3699
3700	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3701
3702	if (dst_entries_init(&ipv4_dst_ops) < 0)
3703		panic("IP: failed to allocate ipv4_dst_ops counter\n");
3704
3705	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3706		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3707
3708	ipv4_dst_ops.gc_thresh = ~0;
3709	ip_rt_max_size = INT_MAX;
3710
3711	devinet_init();
3712	ip_fib_init();
3713
3714	if (ip_rt_proc_init())
3715		pr_err("Unable to create route proc files\n");
3716#ifdef CONFIG_XFRM
3717	xfrm_init();
3718	xfrm4_init();
3719#endif
3720	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3721		      RTNL_FLAG_DOIT_UNLOCKED);
3722
3723#ifdef CONFIG_SYSCTL
3724	register_pernet_subsys(&sysctl_route_ops);
3725#endif
3726	register_pernet_subsys(&ip_rt_ops);
3727	register_pernet_subsys(&rt_genid_ops);
3728	register_pernet_subsys(&ipv4_inetpeer_ops);
3729	return 0;
3730}
3731
3732#ifdef CONFIG_SYSCTL
3733/*
3734 * We really need to sanitize the damn ipv4 init order, then all
3735 * this nonsense will go away.
3736 */
3737void __init ip_static_sysctl_init(void)
3738{
3739	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3740}
3741#endif