net/ipv4/ip_tunnel.c (Linux v6.13.7)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

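/* Key matching is symmetric in "has a key": a packet carrying no
 * TUNNEL_KEY can only match a keyless tunnel, and a keyed packet only
 * matches a tunnel whose i_key is identical. The truth table below is
 * what ip_tunnel_key_match() implements:
 *
 *   packet has key | tunnel has key | result
 *   ---------------+----------------+---------------------
 *        no        |       no       | match
 *        no        |       yes      | no match
 *        yes       |       no       | no match
 *        yes       |       yes      | match iff keys equal
 */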
static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
				const unsigned long *flags, __be32 key)
{
	if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
		return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);

	return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
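/* Lookup precedence, best match first (each pass also requires the key
 * to match and the device to be IFF_UP):
 *
 *   1. exact (saddr, daddr) match
 *   2. daddr matches a tunnel with a wildcard source
 *   3. saddr matches a wildcard-dst tunnel, or daddr is a local
 *      multicast group the tunnel is bound to
 *   4. fully wildcarded tunnel with a matching key
 *   5. the collect_md device, if any
 *   6. the per-netns fallback device
 *
 * Within each pass a tunnel bound to the exact ifindex (parms.link)
 * wins immediately; otherwise a hit is kept as a candidate in case no
 * later pass produces an exact-ifindex match.
 */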
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, const unsigned long *flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
		     t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (READ_ONCE(t->parms.link) == link)
			return t;
		if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

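/* Bucket selection must agree with ip_tunnel_hash() as used on the
 * receive path: a multicast or absent daddr hashes as 0, and a VTI
 * tunnel without TUNNEL_KEY hashes with i_key 0, since VTI (roughly
 * speaking) reuses i_key as a policy mark rather than as an
 * on-the-wire key.
 */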
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm_kern *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
	    test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm_kern *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	IP_TUNNEL_DECLARE_FLAGS(flags);
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	ip_tunnel_flags_copy(flags, parms->i_flags);

	hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == READ_ONCE(t->parms.link) &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

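/* When no name is supplied, the device is named after the link type
 * with a kernel "%d" template: e.g. an ops->kind of "gre" becomes
 * "gre%d", which register_netdevice() expands to the first free
 * "gre0", "gre1", ... The explicit length check leaves room for the
 * two template characters plus the terminating NUL.
 */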
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm_kern *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

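/* The usable tunnel MTU is the underlay MTU minus the full
 * encapsulation overhead t_hlen (tunnel header plus outer IPv4
 * header). As a worked example: plain GRE adds a 4-byte GRE header,
 * so over a 1500-byte Ethernet underlay the device MTU becomes
 * 1500 - (4 + 20) = 1476, the familiar default for gre interfaces.
 */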
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    iph->tos & INET_DSCP_MASK, tunnel->net,
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm_kern *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);

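/* Common receive path for all IPv4 tunnels. In order it: validates the
 * TUNNEL_CSUM and TUNNEL_SEQ expectations against what the packet
 * actually carried, re-points the network header at the inner packet,
 * decapsulates ECN (dropping only on an invalid non-ECT combination),
 * updates per-cpu stats, scrubs the skb when crossing netns, and hands
 * the result to GRO. A sequence number in the past is treated as a
 * replay and counted as an rx_fifo_error.
 */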
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int nh, err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
	    test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
		if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	if (!pskb_inet_may_pull(skb)) {
		DEV_STATS_INC(tunnel->dev, rx_length_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}
	iph = (struct iphdr *)(skb->head + nh);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

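/* PMTU bookkeeping for the tunnel path. The room left for the payload
 * is the route MTU minus the outer IPv4 header and the tunnel header.
 * When a DF-marked IPv4 payload or an IPv6 payload no longer fits,
 * icmp_ndo_send()/icmpv6_ndo_send() emit the corresponding
 * FRAG_NEEDED / PKT_TOOBIG error towards the sender and -E2BIG tells
 * the caller to drop the packet.
 */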
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
{
	/* we must cap headroom to some upper limit, else pskb_expand_head
	 * will overflow header offsets in skb_headers_offset_update().
	 */
	static const unsigned int max_allowed = 512;

	if (headroom > max_allowed)
		headroom = max_allowed;

	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);
}

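/* Transmit for metadata-based (collect_md) tunnels: all addressing
 * comes from the per-skb ip_tunnel_info rather than from the device
 * configuration. A tos of 1 acts as the "inherit" sentinel: the DSCP
 * of the inner IPv4/IPv6 header is copied into the outer header
 * instead.
 */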
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id),
			    tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark,
			    skb_get_hash(skb), key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (skb_cow_head(skb, headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}

	ip_tunnel_adj_headroom(dev, headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

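/* Classical tunnel transmit. A zero daddr marks an NBMA tunnel, where
 * the outer destination is recovered per packet: from tunnel metadata
 * if present, from the route's next hop for IPv4 payloads, or from an
 * IPv4-compatible IPv6 neighbour address for IPv6 payloads.
 */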
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, tos & INET_DSCP_MASK,
			    tunnel->net, READ_ONCE(tunnel->parms.link),
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);

	if (skb_cow_head(skb, max_headroom)) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	ip_tunnel_adj_headroom(dev, max_headroom);

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm_kern *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		WRITE_ONCE(t->parms.link, p->link);
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			WRITE_ONCE(dev->mtu, mtu);
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
		  int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
				p->i_key = 0;
			if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

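/* The two helpers below translate between the legacy ioctl ABI
 * (struct ip_tunnel_parm, with __be16 flag words) and the kernel-side
 * struct ip_tunnel_parm_kern, whose flag fields are wider bitmaps;
 * the to_user direction deliberately fails when the kernel flags
 * cannot be represented in 16 bits. A minimal userspace sketch of the
 * ABI they serve (illustrative only; "gre1" is a made-up name and fd
 * is assumed to be an AF_INET datagram socket):
 *
 *	struct ip_tunnel_parm p = { 0 };
 *	struct ifreq ifr;
 *
 *	strncpy(ifr.ifr_name, "gre1", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	if (ioctl(fd, SIOCGETTUNNEL, &ifr) == 0)
 *		;	// p now holds the tunnel parameters
 */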
bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
			      const void __user *data)
{
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, data, sizeof(p)))
		return false;

	strscpy(kp->name, p.name);
	kp->link = p.link;
	ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
	ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
	kp->i_key = p.i_key;
	kp->o_key = p.o_key;
	memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));

	return true;
}
EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);

bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
{
	struct ip_tunnel_parm p;

	if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
	    !ip_tunnel_flags_is_be16_compat(kp->o_flags))
		return false;

	memset(&p, 0, sizeof(p));

	strscpy(p.name, kp->name);
	p.link = kp->link;
	p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
	p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
	p.i_key = kp->i_key;
	p.o_key = kp->o_key;
	memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));

	return !copy_to_user(data, &p, sizeof(p));
}
EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);

int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm_kern p;
	int err;

	if (!ip_tunnel_parm_from_user(&p, data))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && !ip_tunnel_parm_to_user(data, &p))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	WRITE_ONCE(dev->mtu, new_mtu);
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->net);
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	const struct ip_tunnel *tunnel = netdev_priv(dev);

	return READ_ONCE(tunnel->parms.link);
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm_kern parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->netns_local = true;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

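/* Per-netns teardown: queue every device created by this rtnl_link_ops
 * in the dying netns, plus any tunnel whose device lives in another
 * netns but is still hashed in this netns' tables, onto the caller's
 * kill list for batched unregistration.
 */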
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops,
			   struct list_head *dev_to_kill)
{
	struct ip_tunnel_net *itn;
	struct net *net;

	ASSERT_RTNL();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err)
		return err;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strscpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	netdev_lockdep_set_classes(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_DESCRIPTION("IPv4 tunnel implementation library");
MODULE_LICENSE("GPL");
v6.9.4
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2013 Nicira, Inc.
   4 */
   5
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/capability.h>
   9#include <linux/module.h>
  10#include <linux/types.h>
  11#include <linux/kernel.h>
  12#include <linux/slab.h>
  13#include <linux/uaccess.h>
  14#include <linux/skbuff.h>
  15#include <linux/netdevice.h>
  16#include <linux/in.h>
  17#include <linux/tcp.h>
  18#include <linux/udp.h>
  19#include <linux/if_arp.h>
  20#include <linux/init.h>
  21#include <linux/in6.h>
  22#include <linux/inetdevice.h>
  23#include <linux/igmp.h>
  24#include <linux/netfilter_ipv4.h>
  25#include <linux/etherdevice.h>
  26#include <linux/if_ether.h>
  27#include <linux/if_vlan.h>
  28#include <linux/rculist.h>
  29#include <linux/err.h>
  30
  31#include <net/sock.h>
  32#include <net/ip.h>
  33#include <net/icmp.h>
  34#include <net/protocol.h>
  35#include <net/ip_tunnels.h>
  36#include <net/arp.h>
  37#include <net/checksum.h>
  38#include <net/dsfield.h>
  39#include <net/inet_ecn.h>
  40#include <net/xfrm.h>
  41#include <net/net_namespace.h>
  42#include <net/netns/generic.h>
  43#include <net/rtnetlink.h>
  44#include <net/udp.h>
  45#include <net/dst_metadata.h>
 
  46
  47#if IS_ENABLED(CONFIG_IPV6)
  48#include <net/ipv6.h>
  49#include <net/ip6_fib.h>
  50#include <net/ip6_route.h>
  51#endif
  52
  53static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
  54{
  55	return hash_32((__force u32)key ^ (__force u32)remote,
  56			 IP_TNL_HASH_BITS);
  57}
  58
  59static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
  60				__be16 flags, __be32 key)
  61{
  62	if (p->i_flags & TUNNEL_KEY) {
  63		if (flags & TUNNEL_KEY)
  64			return key == p->i_key;
  65		else
  66			/* key expected, none present */
  67			return false;
  68	} else
  69		return !(flags & TUNNEL_KEY);
  70}
  71
  72/* Fallback tunnel: no source, no destination, no key, no options
  73
  74   Tunnel hash table:
  75   We require exact key match i.e. if a key is present in packet
  76   it will match only tunnel with the same key; if it is not present,
  77   it will match only keyless tunnel.
  78
  79   All keysless packets, if not matched configured keyless tunnels
  80   will match fallback tunnel.
  81   Given src, dst and key, find appropriate for input tunnel.
  82*/
  83struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
  84				   int link, __be16 flags,
  85				   __be32 remote, __be32 local,
  86				   __be32 key)
  87{
  88	struct ip_tunnel *t, *cand = NULL;
  89	struct hlist_head *head;
  90	struct net_device *ndev;
  91	unsigned int hash;
  92
  93	hash = ip_tunnel_hash(key, remote);
  94	head = &itn->tunnels[hash];
  95
  96	hlist_for_each_entry_rcu(t, head, hash_node) {
  97		if (local != t->parms.iph.saddr ||
  98		    remote != t->parms.iph.daddr ||
  99		    !(t->dev->flags & IFF_UP))
 100			continue;
 101
 102		if (!ip_tunnel_key_match(&t->parms, flags, key))
 103			continue;
 104
 105		if (READ_ONCE(t->parms.link) == link)
 106			return t;
 107		cand = t;
 108	}
 109
 110	hlist_for_each_entry_rcu(t, head, hash_node) {
 111		if (remote != t->parms.iph.daddr ||
 112		    t->parms.iph.saddr != 0 ||
 113		    !(t->dev->flags & IFF_UP))
 114			continue;
 115
 116		if (!ip_tunnel_key_match(&t->parms, flags, key))
 117			continue;
 118
 119		if (READ_ONCE(t->parms.link) == link)
 120			return t;
 121		if (!cand)
 122			cand = t;
 123	}
 124
 125	hash = ip_tunnel_hash(key, 0);
 126	head = &itn->tunnels[hash];
 127
 128	hlist_for_each_entry_rcu(t, head, hash_node) {
 129		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
 130		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
 131			continue;
 132
 133		if (!(t->dev->flags & IFF_UP))
 134			continue;
 135
 136		if (!ip_tunnel_key_match(&t->parms, flags, key))
 137			continue;
 138
 139		if (READ_ONCE(t->parms.link) == link)
 140			return t;
 141		if (!cand)
 142			cand = t;
 143	}
 144
 145	hlist_for_each_entry_rcu(t, head, hash_node) {
 146		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
 
 147		    t->parms.iph.saddr != 0 ||
 148		    t->parms.iph.daddr != 0 ||
 149		    !(t->dev->flags & IFF_UP))
 150			continue;
 151
 152		if (READ_ONCE(t->parms.link) == link)
 153			return t;
 154		if (!cand)
 155			cand = t;
 156	}
 157
 158	if (cand)
 159		return cand;
 160
 161	t = rcu_dereference(itn->collect_md_tun);
 162	if (t && t->dev->flags & IFF_UP)
 163		return t;
 164
 165	ndev = READ_ONCE(itn->fb_tunnel_dev);
 166	if (ndev && ndev->flags & IFF_UP)
 167		return netdev_priv(ndev);
 168
 169	return NULL;
 170}
 171EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 172
 173static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 174				    struct ip_tunnel_parm *parms)
 175{
 176	unsigned int h;
 177	__be32 remote;
 178	__be32 i_key = parms->i_key;
 179
 180	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
 181		remote = parms->iph.daddr;
 182	else
 183		remote = 0;
 184
 185	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
 
 186		i_key = 0;
 187
 188	h = ip_tunnel_hash(i_key, remote);
 189	return &itn->tunnels[h];
 190}
 191
 192static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 193{
 194	struct hlist_head *head = ip_bucket(itn, &t->parms);
 195
 196	if (t->collect_md)
 197		rcu_assign_pointer(itn->collect_md_tun, t);
 198	hlist_add_head_rcu(&t->hash_node, head);
 199}
 200
 201static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 202{
 203	if (t->collect_md)
 204		rcu_assign_pointer(itn->collect_md_tun, NULL);
 205	hlist_del_init_rcu(&t->hash_node);
 206}
 207
 208static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 209					struct ip_tunnel_parm *parms,
 210					int type)
 211{
 212	__be32 remote = parms->iph.daddr;
 213	__be32 local = parms->iph.saddr;
 
 214	__be32 key = parms->i_key;
 215	__be16 flags = parms->i_flags;
 216	int link = parms->link;
 217	struct ip_tunnel *t = NULL;
 218	struct hlist_head *head = ip_bucket(itn, parms);
 219
 220	hlist_for_each_entry_rcu(t, head, hash_node) {
 
 
 221		if (local == t->parms.iph.saddr &&
 222		    remote == t->parms.iph.daddr &&
 223		    link == READ_ONCE(t->parms.link) &&
 224		    type == t->dev->type &&
 225		    ip_tunnel_key_match(&t->parms, flags, key))
 226			break;
 227	}
 228	return t;
 229}
 230
 231static struct net_device *__ip_tunnel_create(struct net *net,
 232					     const struct rtnl_link_ops *ops,
 233					     struct ip_tunnel_parm *parms)
 234{
 235	int err;
 236	struct ip_tunnel *tunnel;
 237	struct net_device *dev;
 238	char name[IFNAMSIZ];
 239
 240	err = -E2BIG;
 241	if (parms->name[0]) {
 242		if (!dev_valid_name(parms->name))
 243			goto failed;
 244		strscpy(name, parms->name, IFNAMSIZ);
 245	} else {
 246		if (strlen(ops->kind) > (IFNAMSIZ - 3))
 247			goto failed;
 248		strcpy(name, ops->kind);
 249		strcat(name, "%d");
 250	}
 251
 252	ASSERT_RTNL();
 253	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
 254	if (!dev) {
 255		err = -ENOMEM;
 256		goto failed;
 257	}
 258	dev_net_set(dev, net);
 259
 260	dev->rtnl_link_ops = ops;
 261
 262	tunnel = netdev_priv(dev);
 263	tunnel->parms = *parms;
 264	tunnel->net = net;
 265
 266	err = register_netdevice(dev);
 267	if (err)
 268		goto failed_free;
 269
 270	return dev;
 271
 272failed_free:
 273	free_netdev(dev);
 274failed:
 275	return ERR_PTR(err);
 276}
 277
 278static int ip_tunnel_bind_dev(struct net_device *dev)
 279{
 280	struct net_device *tdev = NULL;
 281	struct ip_tunnel *tunnel = netdev_priv(dev);
 282	const struct iphdr *iph;
 283	int hlen = LL_MAX_HEADER;
 284	int mtu = ETH_DATA_LEN;
 285	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
 286
 287	iph = &tunnel->parms.iph;
 288
 289	/* Guess output device to choose reasonable mtu and needed_headroom */
 290	if (iph->daddr) {
 291		struct flowi4 fl4;
 292		struct rtable *rt;
 293
 294		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
 295				    iph->saddr, tunnel->parms.o_key,
 296				    RT_TOS(iph->tos), dev_net(dev),
 297				    tunnel->parms.link, tunnel->fwmark, 0, 0);
 298		rt = ip_route_output_key(tunnel->net, &fl4);
 299
 300		if (!IS_ERR(rt)) {
 301			tdev = rt->dst.dev;
 302			ip_rt_put(rt);
 303		}
 304		if (dev->type != ARPHRD_ETHER)
 305			dev->flags |= IFF_POINTOPOINT;
 306
 307		dst_cache_reset(&tunnel->dst_cache);
 308	}
 309
 310	if (!tdev && tunnel->parms.link)
 311		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 312
 313	if (tdev) {
 314		hlen = tdev->hard_header_len + tdev->needed_headroom;
 315		mtu = min(tdev->mtu, IP_MAX_MTU);
 316	}
 317
 318	dev->needed_headroom = t_hlen + hlen;
 319	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
 320
 321	if (mtu < IPV4_MIN_MTU)
 322		mtu = IPV4_MIN_MTU;
 323
 324	return mtu;
 325}
 326
 327static struct ip_tunnel *ip_tunnel_create(struct net *net,
 328					  struct ip_tunnel_net *itn,
 329					  struct ip_tunnel_parm *parms)
 330{
 331	struct ip_tunnel *nt;
 332	struct net_device *dev;
 333	int t_hlen;
 334	int mtu;
 335	int err;
 336
 337	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
 338	if (IS_ERR(dev))
 339		return ERR_CAST(dev);
 340
 341	mtu = ip_tunnel_bind_dev(dev);
 342	err = dev_set_mtu(dev, mtu);
 343	if (err)
 344		goto err_dev_set_mtu;
 345
 346	nt = netdev_priv(dev);
 347	t_hlen = nt->hlen + sizeof(struct iphdr);
 348	dev->min_mtu = ETH_MIN_MTU;
 349	dev->max_mtu = IP_MAX_MTU - t_hlen;
 350	if (dev->type == ARPHRD_ETHER)
 351		dev->max_mtu -= dev->hard_header_len;
 352
 353	ip_tunnel_add(itn, nt);
 354	return nt;
 355
 356err_dev_set_mtu:
 357	unregister_netdevice(dev);
 358	return ERR_PTR(err);
 359}
 360
 361void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
 362{
 363	const struct iphdr *iph = ip_hdr(skb);
 364	const struct udphdr *udph;
 365
 366	if (iph->protocol != IPPROTO_UDP)
 367		return;
 368
 369	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
 370	info->encap.sport = udph->source;
 371	info->encap.dport = udph->dest;
 372}
 373EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
 374
 375int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 376		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 377		  bool log_ecn_error)
 378{
 379	const struct iphdr *iph = ip_hdr(skb);
 380	int nh, err;
 381
 382#ifdef CONFIG_NET_IPGRE_BROADCAST
 383	if (ipv4_is_multicast(iph->daddr)) {
 384		DEV_STATS_INC(tunnel->dev, multicast);
 385		skb->pkt_type = PACKET_BROADCAST;
 386	}
 387#endif
 388
 389	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
 390	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
 391		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
 392		DEV_STATS_INC(tunnel->dev, rx_errors);
 393		goto drop;
 394	}
 395
 396	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
 397		if (!(tpi->flags&TUNNEL_SEQ) ||
 398		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
 399			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
 400			DEV_STATS_INC(tunnel->dev, rx_errors);
 401			goto drop;
 402		}
 403		tunnel->i_seqno = ntohl(tpi->seq) + 1;
 404	}
 405
 406	/* Save offset of outer header relative to skb->head,
 407	 * because we are going to reset the network header to the inner header
 408	 * and might change skb->head.
 409	 */
 410	nh = skb_network_header(skb) - skb->head;
 411
 412	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
 413
 414	if (!pskb_inet_may_pull(skb)) {
 415		DEV_STATS_INC(tunnel->dev, rx_length_errors);
 416		DEV_STATS_INC(tunnel->dev, rx_errors);
 417		goto drop;
 418	}
 419	iph = (struct iphdr *)(skb->head + nh);
 420
 421	err = IP_ECN_decapsulate(iph, skb);
 422	if (unlikely(err)) {
 423		if (log_ecn_error)
 424			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
 425					&iph->saddr, iph->tos);
 426		if (err > 1) {
 427			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
 428			DEV_STATS_INC(tunnel->dev, rx_errors);
 429			goto drop;
 430		}
 431	}
 432
 433	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
 434	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
 435
 436	if (tunnel->dev->type == ARPHRD_ETHER) {
 437		skb->protocol = eth_type_trans(skb, tunnel->dev);
 438		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 439	} else {
 440		skb->dev = tunnel->dev;
 441	}
 442
 443	if (tun_dst)
 444		skb_dst_set(skb, (struct dst_entry *)tun_dst);
 445
 446	gro_cells_receive(&tunnel->gro_cells, skb);
 447	return 0;
 448
 449drop:
 450	if (tun_dst)
 451		dst_release((struct dst_entry *)tun_dst);
 452	kfree_skb(skb);
 453	return 0;
 454}
 455EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 456
 457int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
 458			    unsigned int num)
 459{
 460	if (num >= MAX_IPTUN_ENCAP_OPS)
 461		return -ERANGE;
 462
 463	return !cmpxchg((const struct ip_tunnel_encap_ops **)
 464			&iptun_encaps[num],
 465			NULL, ops) ? 0 : -1;
 466}
 467EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
 468
 469int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
 470			    unsigned int num)
 471{
 472	int ret;
 473
 474	if (num >= MAX_IPTUN_ENCAP_OPS)
 475		return -ERANGE;
 476
 477	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
 478		       &iptun_encaps[num],
 479		       ops, NULL) == ops) ? 0 : -1;
 480
 481	synchronize_net();
 482
 483	return ret;
 484}
 485EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
 486
 487int ip_tunnel_encap_setup(struct ip_tunnel *t,
 488			  struct ip_tunnel_encap *ipencap)
 489{
 490	int hlen;
 491
 492	memset(&t->encap, 0, sizeof(t->encap));
 493
 494	hlen = ip_encap_hlen(ipencap);
 495	if (hlen < 0)
 496		return hlen;
 497
 498	t->encap.type = ipencap->type;
 499	t->encap.sport = ipencap->sport;
 500	t->encap.dport = ipencap->dport;
 501	t->encap.flags = ipencap->flags;
 502
 503	t->encap_hlen = hlen;
 504	t->hlen = t->encap_hlen + t->tun_hlen;
 505
 506	return 0;
 507}
 508EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
 509
 510static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 511			    struct rtable *rt, __be16 df,
 512			    const struct iphdr *inner_iph,
 513			    int tunnel_hlen, __be32 dst, bool md)
 514{
 515	struct ip_tunnel *tunnel = netdev_priv(dev);
 516	int pkt_size;
 517	int mtu;
 518
 519	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
 520	pkt_size = skb->len - tunnel_hlen;
 521	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
 522
 523	if (df) {
 524		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
 525		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
 526	} else {
 527		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
 528	}
 529
 530	if (skb_valid_dst(skb))
 531		skb_dst_update_pmtu_no_confirm(skb, mtu);
 532
 533	if (skb->protocol == htons(ETH_P_IP)) {
 534		if (!skb_is_gso(skb) &&
 535		    (inner_iph->frag_off & htons(IP_DF)) &&
 536		    mtu < pkt_size) {
 537			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
 538			return -E2BIG;
 539		}
 540	}
 541#if IS_ENABLED(CONFIG_IPV6)
 542	else if (skb->protocol == htons(ETH_P_IPV6)) {
 543		struct rt6_info *rt6;
 544		__be32 daddr;
 545
 546		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
 547					   NULL;
 548		daddr = md ? dst : tunnel->parms.iph.daddr;
 549
 550		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
 551			   mtu >= IPV6_MIN_MTU) {
 552			if ((daddr && !ipv4_is_multicast(daddr)) ||
 553			    rt6->rt6i_dst.plen == 128) {
 554				rt6->rt6i_flags |= RTF_MODIFIED;
 555				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
 556			}
 557		}
 558
 559		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
 560					mtu < pkt_size) {
 561			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 562			return -E2BIG;
 563		}
 564	}
 565#endif
 566	return 0;
 567}
 568
 569static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
 570{
 571	/* we must cap headroom to some upperlimit, else pskb_expand_head
 572	 * will overflow header offsets in skb_headers_offset_update().
 573	 */
 574	static const unsigned int max_allowed = 512;
 575
 576	if (headroom > max_allowed)
 577		headroom = max_allowed;
 578
 579	if (headroom > READ_ONCE(dev->needed_headroom))
 580		WRITE_ONCE(dev->needed_headroom, headroom);
 581}
 582
 583void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 584		       u8 proto, int tunnel_hlen)
 585{
 586	struct ip_tunnel *tunnel = netdev_priv(dev);
 587	u32 headroom = sizeof(struct iphdr);
 588	struct ip_tunnel_info *tun_info;
 589	const struct ip_tunnel_key *key;
 590	const struct iphdr *inner_iph;
 591	struct rtable *rt = NULL;
 592	struct flowi4 fl4;
 593	__be16 df = 0;
 594	u8 tos, ttl;
 595	bool use_cache;
 596
 597	tun_info = skb_tunnel_info(skb);
 598	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
 599		     ip_tunnel_info_af(tun_info) != AF_INET))
 600		goto tx_error;
 601	key = &tun_info->key;
 602	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 603	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 604	tos = key->tos;
 605	if (tos == 1) {
 606		if (skb->protocol == htons(ETH_P_IP))
 607			tos = inner_iph->tos;
 608		else if (skb->protocol == htons(ETH_P_IPV6))
 609			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
 610	}
 611	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
 612			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
 613			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
 614			    key->flow_flags);
 615
 616	if (!tunnel_hlen)
 617		tunnel_hlen = ip_encap_hlen(&tun_info->encap);
 618
 619	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
 620		goto tx_error;
 621
 622	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
 623	if (use_cache)
 624		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
 625	if (!rt) {
 626		rt = ip_route_output_key(tunnel->net, &fl4);
 627		if (IS_ERR(rt)) {
 628			DEV_STATS_INC(dev, tx_carrier_errors);
 629			goto tx_error;
 630		}
 631		if (use_cache)
 632			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
 633					  fl4.saddr);
 634	}
 635	if (rt->dst.dev == dev) {
 636		ip_rt_put(rt);
 637		DEV_STATS_INC(dev, collisions);
 638		goto tx_error;
 639	}
 640
  641	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
  642		df = htons(IP_DF);
 643	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
 644			    key->u.ipv4.dst, true)) {
 645		ip_rt_put(rt);
 646		goto tx_error;
 647	}
 648
 649	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
 650	ttl = key->ttl;
 651	if (ttl == 0) {
 652		if (skb->protocol == htons(ETH_P_IP))
 653			ttl = inner_iph->ttl;
 654		else if (skb->protocol == htons(ETH_P_IPV6))
 655			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
 656		else
 657			ttl = ip4_dst_hoplimit(&rt->dst);
 658	}
 659
 660	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
 661	if (skb_cow_head(skb, headroom)) {
 662		ip_rt_put(rt);
 663		goto tx_dropped;
 664	}
 665
 666	ip_tunnel_adj_headroom(dev, headroom);
 667
 668	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
 669		      df, !net_eq(tunnel->net, dev_net(dev)));
 670	return;
 671tx_error:
 672	DEV_STATS_INC(dev, tx_errors);
 673	goto kfree;
 674tx_dropped:
 675	DEV_STATS_INC(dev, tx_dropped);
 676kfree:
 677	kfree_skb(skb);
 678}
 679EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
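
/*
 * A minimal transmit-path sketch, modeled on the ipip driver (the name
 * ipip_md_xmit is hypothetical and error handling is elided): collect_md
 * tunnels hand packets to ip_md_tunnel_xmit() and let the attached
 * metadata dst supply the endpoints, while classic tunnels go through
 * ip_tunnel_xmit(), defined below, with their own configured parameters.
 */
static netdev_tx_t ipip_md_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->collect_md)
		ip_md_tunnel_xmit(skb, dev, IPPROTO_IPIP, 0 /* hlen from encap */);
	else
		ip_tunnel_xmit(skb, dev, &tunnel->parms.iph, IPPROTO_IPIP);

	return NETDEV_TX_OK;
}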
 680
 681void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 682		    const struct iphdr *tnl_params, u8 protocol)
 683{
 684	struct ip_tunnel *tunnel = netdev_priv(dev);
 685	struct ip_tunnel_info *tun_info = NULL;
 686	const struct iphdr *inner_iph;
 687	unsigned int max_headroom;	/* The extra header space needed */
 688	struct rtable *rt = NULL;		/* Route to the other host */
 689	__be16 payload_protocol;
 690	bool use_cache = false;
 691	struct flowi4 fl4;
 692	bool md = false;
 693	bool connected;
 694	u8 tos, ttl;
 695	__be32 dst;
 696	__be16 df;
 697
 698	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 699	connected = (tunnel->parms.iph.daddr != 0);
 700	payload_protocol = skb_protocol(skb, true);
 701
 702	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 703
 704	dst = tnl_params->daddr;
 705	if (dst == 0) {
 706		/* NBMA tunnel */
 707
 708		if (!skb_dst(skb)) {
 709			DEV_STATS_INC(dev, tx_fifo_errors);
 710			goto tx_error;
 711		}
 712
 713		tun_info = skb_tunnel_info(skb);
 714		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
 715		    ip_tunnel_info_af(tun_info) == AF_INET &&
 716		    tun_info->key.u.ipv4.dst) {
 717			dst = tun_info->key.u.ipv4.dst;
 718			md = true;
 719			connected = true;
 720		} else if (payload_protocol == htons(ETH_P_IP)) {
 721			rt = skb_rtable(skb);
 722			dst = rt_nexthop(rt, inner_iph->daddr);
 723		}
 724#if IS_ENABLED(CONFIG_IPV6)
 725		else if (payload_protocol == htons(ETH_P_IPV6)) {
 726			const struct in6_addr *addr6;
 727			struct neighbour *neigh;
 728			bool do_tx_error_icmp;
 729			int addr_type;
 730
 731			neigh = dst_neigh_lookup(skb_dst(skb),
 732						 &ipv6_hdr(skb)->daddr);
 733			if (!neigh)
 734				goto tx_error;
 735
 736			addr6 = (const struct in6_addr *)&neigh->primary_key;
 737			addr_type = ipv6_addr_type(addr6);
 738
 739			if (addr_type == IPV6_ADDR_ANY) {
 740				addr6 = &ipv6_hdr(skb)->daddr;
 741				addr_type = ipv6_addr_type(addr6);
 742			}
 743
 744			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
 745				do_tx_error_icmp = true;
 746			else {
 747				do_tx_error_icmp = false;
 748				dst = addr6->s6_addr32[3];
 749			}
 750			neigh_release(neigh);
 751			if (do_tx_error_icmp)
 752				goto tx_error_icmp;
 753		}
 754#endif
 755		else
 756			goto tx_error;
 757
 758		if (!md)
 759			connected = false;
 760	}
 761
 762	tos = tnl_params->tos;
 763	if (tos & 0x1) {
 764		tos &= ~0x1;
 765		if (payload_protocol == htons(ETH_P_IP)) {
 766			tos = inner_iph->tos;
 767			connected = false;
 768		} else if (payload_protocol == htons(ETH_P_IPV6)) {
 769			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
 770			connected = false;
 771		}
 772	}
 773
 774	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
 775			    tunnel->parms.o_key, RT_TOS(tos),
 776			    dev_net(dev), READ_ONCE(tunnel->parms.link),
 777			    tunnel->fwmark, skb_get_hash(skb), 0);
 778
 779	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
 780		goto tx_error;
 781
 782	if (connected && md) {
 783		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
 784		if (use_cache)
 785			rt = dst_cache_get_ip4(&tun_info->dst_cache,
 786					       &fl4.saddr);
 787	} else {
 788		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
 789						&fl4.saddr) : NULL;
 790	}
 791
 792	if (!rt) {
 793		rt = ip_route_output_key(tunnel->net, &fl4);
 794
 795		if (IS_ERR(rt)) {
 796			DEV_STATS_INC(dev, tx_carrier_errors);
 797			goto tx_error;
 798		}
 799		if (use_cache)
 800			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
 801					  fl4.saddr);
 802		else if (!md && connected)
 803			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
 804					  fl4.saddr);
 805	}
 806
 807	if (rt->dst.dev == dev) {
 808		ip_rt_put(rt);
 809		DEV_STATS_INC(dev, collisions);
 810		goto tx_error;
 811	}
 812
 813	df = tnl_params->frag_off;
 814	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
 815		df |= (inner_iph->frag_off & htons(IP_DF));
 816
 817	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
 818		ip_rt_put(rt);
 819		goto tx_error;
 820	}
 821
 822	if (tunnel->err_count > 0) {
 823		if (time_before(jiffies,
 824				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
 825			tunnel->err_count--;
 826
 827			dst_link_failure(skb);
 828		} else
 829			tunnel->err_count = 0;
 830	}
 831
 832	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
 833	ttl = tnl_params->ttl;
 834	if (ttl == 0) {
 835		if (payload_protocol == htons(ETH_P_IP))
 836			ttl = inner_iph->ttl;
 837#if IS_ENABLED(CONFIG_IPV6)
 838		else if (payload_protocol == htons(ETH_P_IPV6))
 839			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
 840#endif
 841		else
 842			ttl = ip4_dst_hoplimit(&rt->dst);
 843	}
 844
 845	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
 846			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
 847
 848	if (skb_cow_head(skb, max_headroom)) {
 849		ip_rt_put(rt);
 850		DEV_STATS_INC(dev, tx_dropped);
 851		kfree_skb(skb);
 852		return;
 853	}
 854
 855	ip_tunnel_adj_headroom(dev, max_headroom);
 856
 857	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
 858		      df, !net_eq(tunnel->net, dev_net(dev)));
 859	return;
 860
 861#if IS_ENABLED(CONFIG_IPV6)
 862tx_error_icmp:
 863	dst_link_failure(skb);
 864#endif
 865tx_error:
 866	DEV_STATS_INC(dev, tx_errors);
 867	kfree_skb(skb);
 868}
 869EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
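
/*
 * Note on the route caching above: a "connected" tunnel (fixed daddr and
 * a TOS that is not inherited from the inner packet) keeps its route in
 * tunnel->dst_cache, while metadata-based (md) flows use the per-flow
 * tun_info->dst_cache; NBMA and inherit-TOS paths skip both caches,
 * since the chosen route may legitimately differ per packet.
 */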
 870
 871static void ip_tunnel_update(struct ip_tunnel_net *itn,
 872			     struct ip_tunnel *t,
 873			     struct net_device *dev,
  874			     struct ip_tunnel_parm_kern *p,
 875			     bool set_mtu,
 876			     __u32 fwmark)
 877{
 878	ip_tunnel_del(itn, t);
 879	t->parms.iph.saddr = p->iph.saddr;
 880	t->parms.iph.daddr = p->iph.daddr;
 881	t->parms.i_key = p->i_key;
 882	t->parms.o_key = p->o_key;
 883	if (dev->type != ARPHRD_ETHER) {
 884		__dev_addr_set(dev, &p->iph.saddr, 4);
 885		memcpy(dev->broadcast, &p->iph.daddr, 4);
 886	}
 887	ip_tunnel_add(itn, t);
 888
 889	t->parms.iph.ttl = p->iph.ttl;
 890	t->parms.iph.tos = p->iph.tos;
 891	t->parms.iph.frag_off = p->iph.frag_off;
 892
 893	if (t->parms.link != p->link || t->fwmark != fwmark) {
 894		int mtu;
 895
 896		WRITE_ONCE(t->parms.link, p->link);
 897		t->fwmark = fwmark;
 898		mtu = ip_tunnel_bind_dev(dev);
 899		if (set_mtu)
 900			dev->mtu = mtu;
 901	}
 902	dst_cache_reset(&t->dst_cache);
 903	netdev_state_change(dev);
 904}
 905
  906	int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
  907	{
 908	int err = 0;
 909	struct ip_tunnel *t = netdev_priv(dev);
 910	struct net *net = t->net;
 911	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
 912
 913	switch (cmd) {
 914	case SIOCGETTUNNEL:
 915		if (dev == itn->fb_tunnel_dev) {
 916			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 917			if (!t)
 918				t = netdev_priv(dev);
 919		}
 920		memcpy(p, &t->parms, sizeof(*p));
 921		break;
 922
 923	case SIOCADDTUNNEL:
 924	case SIOCCHGTUNNEL:
 925		err = -EPERM;
 926		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 927			goto done;
 928		if (p->iph.ttl)
 929			p->iph.frag_off |= htons(IP_DF);
  930		if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
  931			if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
  932				p->i_key = 0;
  933			if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
  934				p->o_key = 0;
 935		}
 936
 937		t = ip_tunnel_find(itn, p, itn->type);
 938
 939		if (cmd == SIOCADDTUNNEL) {
 940			if (!t) {
 941				t = ip_tunnel_create(net, itn, p);
 942				err = PTR_ERR_OR_ZERO(t);
 943				break;
 944			}
 945
 946			err = -EEXIST;
 947			break;
 948		}
 949		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
 950			if (t) {
 951				if (t->dev != dev) {
 952					err = -EEXIST;
 953					break;
 954				}
 955			} else {
 956				unsigned int nflags = 0;
 957
 958				if (ipv4_is_multicast(p->iph.daddr))
 959					nflags = IFF_BROADCAST;
 960				else if (p->iph.daddr)
 961					nflags = IFF_POINTOPOINT;
 962
  963				if ((dev->flags ^ nflags) & (IFF_POINTOPOINT | IFF_BROADCAST)) {
 964					err = -EINVAL;
 965					break;
 966				}
 967
 968				t = netdev_priv(dev);
 969			}
 970		}
 971
 972		if (t) {
 973			err = 0;
 974			ip_tunnel_update(itn, t, dev, p, true, 0);
 975		} else {
 976			err = -ENOENT;
 977		}
 978		break;
 979
 980	case SIOCDELTUNNEL:
 981		err = -EPERM;
 982		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 983			goto done;
 984
 985		if (dev == itn->fb_tunnel_dev) {
 986			err = -ENOENT;
 987			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
 988			if (!t)
 989				goto done;
 990			err = -EPERM;
 991			if (t == netdev_priv(itn->fb_tunnel_dev))
 992				goto done;
 993			dev = t->dev;
 994		}
  995		unregister_netdevice(dev);
  996		err = 0;
  997		break;
 998
 999	default:
1000		err = -EINVAL;
1001	}
1002
1003done:
1004	return err;
1005}
1006EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
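
/*
 * A trimmed ndo_tunnel_ctl sketch, modeled on the ipip driver: the
 * protocol driver sanity-checks the user-supplied parameters and then
 * defers to ip_tunnel_ctl() above (the real driver also constrains
 * p->iph.protocol to the protocols it actually implements).
 */
static int ipip_tunnel_ctl(struct net_device *dev,
			   struct ip_tunnel_parm_kern *p, int cmd)
{
	if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP ||
	    p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)))
		return -EINVAL;

	return ip_tunnel_ctl(dev, p, cmd);
}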
1007
1008int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
1009			     void __user *data, int cmd)
1010{
 1011	struct ip_tunnel_parm_kern p;
 1012	int err;
 1013
 1014	if (!ip_tunnel_parm_from_user(&p, data))
 1015		return -EFAULT;
 1016	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
 1017	if (!err && !ip_tunnel_parm_to_user(data, &p))
1018		return -EFAULT;
1019	return err;
1020}
1021EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
1022
1023int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
1024{
1025	struct ip_tunnel *tunnel = netdev_priv(dev);
1026	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
1027	int max_mtu = IP_MAX_MTU - t_hlen;
1028
1029	if (dev->type == ARPHRD_ETHER)
1030		max_mtu -= dev->hard_header_len;
1031
1032	if (new_mtu < ETH_MIN_MTU)
1033		return -EINVAL;
1034
1035	if (new_mtu > max_mtu) {
1036		if (strict)
1037			return -EINVAL;
1038
1039		new_mtu = max_mtu;
1040	}
1041
1042	dev->mtu = new_mtu;
1043	return 0;
1044}
1045EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
1046
1047int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1048{
1049	return __ip_tunnel_change_mtu(dev, new_mtu, true);
1050}
1051EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
1052
1053static void ip_tunnel_dev_free(struct net_device *dev)
1054{
1055	struct ip_tunnel *tunnel = netdev_priv(dev);
1056
1057	gro_cells_destroy(&tunnel->gro_cells);
1058	dst_cache_destroy(&tunnel->dst_cache);
1059	free_percpu(dev->tstats);
1060}
1061
1062void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1063{
1064	struct ip_tunnel *tunnel = netdev_priv(dev);
1065	struct ip_tunnel_net *itn;
1066
1067	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1068
1069	if (itn->fb_tunnel_dev != dev) {
1070		ip_tunnel_del(itn, netdev_priv(dev));
1071		unregister_netdevice_queue(dev, head);
1072	}
1073}
1074EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1075
1076struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1077{
1078	struct ip_tunnel *tunnel = netdev_priv(dev);
1079
1080	return tunnel->net;
1081}
1082EXPORT_SYMBOL(ip_tunnel_get_link_net);
1083
1084int ip_tunnel_get_iflink(const struct net_device *dev)
1085{
1086	const struct ip_tunnel *tunnel = netdev_priv(dev);
1087
1088	return READ_ONCE(tunnel->parms.link);
1089}
1090EXPORT_SYMBOL(ip_tunnel_get_iflink);
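
/*
 * How the exported helpers are typically wired together, modeled on the
 * ipip driver's net_device_ops (ipip_md_xmit and ipip_tunnel_ctl are the
 * hypothetical sketches shown earlier; ip_tunnel_init and
 * ip_tunnel_uninit appear later in this file):
 */
static const struct net_device_ops ipip_netdev_ops = {
	.ndo_init		= ip_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= ipip_md_xmit,
	.ndo_siocdevprivate	= ip_tunnel_siocdevprivate,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= dev_get_tstats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_tunnel_ctl		= ipip_tunnel_ctl,
};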
1091
1092int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1093				  struct rtnl_link_ops *ops, char *devname)
1094{
1095	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
 1096	struct ip_tunnel_parm_kern parms;
1097	unsigned int i;
1098
1099	itn->rtnl_link_ops = ops;
1100	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1101		INIT_HLIST_HEAD(&itn->tunnels[i]);
1102
1103	if (!ops || !net_has_fallback_tunnels(net)) {
1104		struct ip_tunnel_net *it_init_net;
1105
1106		it_init_net = net_generic(&init_net, ip_tnl_net_id);
1107		itn->type = it_init_net->type;
1108		itn->fb_tunnel_dev = NULL;
1109		return 0;
1110	}
1111
1112	memset(&parms, 0, sizeof(parms));
1113	if (devname)
1114		strscpy(parms.name, devname, IFNAMSIZ);
1115
1116	rtnl_lock();
1117	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1118	/* FB netdevice is special: we have one, and only one per netns.
 1119	 * Allowing it to be moved to another netns is clearly unsafe.
1120	 */
1121	if (!IS_ERR(itn->fb_tunnel_dev)) {
1122		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1123		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1124		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1125		itn->type = itn->fb_tunnel_dev->type;
1126	}
1127	rtnl_unlock();
1128
1129	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1130}
1131EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1132
1133static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1134			      struct list_head *head,
1135			      struct rtnl_link_ops *ops)
1136{
1137	struct net_device *dev, *aux;
1138	int h;
1139
1140	for_each_netdev_safe(net, dev, aux)
1141		if (dev->rtnl_link_ops == ops)
1142			unregister_netdevice_queue(dev, head);
1143
1144	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1145		struct ip_tunnel *t;
1146		struct hlist_node *n;
1147		struct hlist_head *thead = &itn->tunnels[h];
1148
1149		hlist_for_each_entry_safe(t, n, thead, hash_node)
1150			/* If dev is in the same netns, it has already
1151			 * been added to the list by the previous loop.
1152			 */
1153			if (!net_eq(dev_net(t->dev), net))
1154				unregister_netdevice_queue(t->dev, head);
1155	}
1156}
1157
1158void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1159			   struct rtnl_link_ops *ops,
1160			   struct list_head *dev_to_kill)
1161{
1162	struct ip_tunnel_net *itn;
1163	struct net *net;
1164
1165	ASSERT_RTNL();
1166	list_for_each_entry(net, net_list, exit_list) {
1167		itn = net_generic(net, id);
1168		ip_tunnel_destroy(net, itn, dev_to_kill, ops);
1169	}
1170}
1171EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
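
/*
 * A per-netns registration sketch under the same ipip-flavored
 * assumptions (ipip_net_id and ipip_link_ops are presumed to exist in
 * the driver): each namespace gets its own ip_tunnel_net through
 * ip_tunnel_init_net(), and teardown funnels every device into one
 * batched unregister via ip_tunnel_delete_nets().
 */
static unsigned int ipip_net_id __read_mostly;

static int __net_init ipip_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}

static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net,
					    struct list_head *dev_to_kill)
{
	ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops,
			      dev_to_kill);
}

static struct pernet_operations ipip_net_ops = {
	.init		 = ipip_init_net,
	.exit_batch_rtnl = ipip_exit_batch_rtnl,
	.id		 = &ipip_net_id,
	.size		 = sizeof(struct ip_tunnel_net),
};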
1172
1173int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
 1174		      struct ip_tunnel_parm_kern *p, __u32 fwmark)
1175{
1176	struct ip_tunnel *nt;
1177	struct net *net = dev_net(dev);
1178	struct ip_tunnel_net *itn;
1179	int mtu;
1180	int err;
1181
1182	nt = netdev_priv(dev);
1183	itn = net_generic(net, nt->ip_tnl_net_id);
1184
1185	if (nt->collect_md) {
1186		if (rtnl_dereference(itn->collect_md_tun))
1187			return -EEXIST;
1188	} else {
1189		if (ip_tunnel_find(itn, p, dev->type))
1190			return -EEXIST;
1191	}
1192
1193	nt->net = net;
1194	nt->parms = *p;
1195	nt->fwmark = fwmark;
1196	err = register_netdevice(dev);
1197	if (err)
1198		goto err_register_netdevice;
1199
1200	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1201		eth_hw_addr_random(dev);
1202
1203	mtu = ip_tunnel_bind_dev(dev);
1204	if (tb[IFLA_MTU]) {
1205		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1206
1207		if (dev->type == ARPHRD_ETHER)
1208			max -= dev->hard_header_len;
1209
1210		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1211	}
1212
1213	err = dev_set_mtu(dev, mtu);
1214	if (err)
1215		goto err_dev_set_mtu;
1216
1217	ip_tunnel_add(itn, nt);
1218	return 0;
1219
1220err_dev_set_mtu:
1221	unregister_netdevice(dev);
1222err_register_netdevice:
1223	return err;
1224}
1225EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
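
/*
 * A trimmed newlink sketch (hypothetical ipip_newlink, modeled on the
 * real driver): netlink attributes are folded into an
 * ip_tunnel_parm_kern before ip_tunnel_newlink() registers and hashes
 * the device; ip_tunnel_changelink() below is driven the same way.
 */
static int ipip_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm_kern p;

	memset(&p, 0, sizeof(p));
	p.iph.version = 4;
	p.iph.protocol = IPPROTO_IPIP;
	p.iph.ihl = 5;
	if (data && data[IFLA_IPTUN_LOCAL])
		p.iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]);
	if (data && data[IFLA_IPTUN_REMOTE])
		p.iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]);

	return ip_tunnel_newlink(dev, tb, &p, 0 /* fwmark */);
}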
1226
1227int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
 1228			 struct ip_tunnel_parm_kern *p, __u32 fwmark)
1229{
1230	struct ip_tunnel *t;
1231	struct ip_tunnel *tunnel = netdev_priv(dev);
1232	struct net *net = tunnel->net;
1233	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1234
1235	if (dev == itn->fb_tunnel_dev)
1236		return -EINVAL;
1237
1238	t = ip_tunnel_find(itn, p, dev->type);
1239
1240	if (t) {
1241		if (t->dev != dev)
1242			return -EEXIST;
1243	} else {
1244		t = tunnel;
1245
1246		if (dev->type != ARPHRD_ETHER) {
1247			unsigned int nflags = 0;
1248
1249			if (ipv4_is_multicast(p->iph.daddr))
1250				nflags = IFF_BROADCAST;
1251			else if (p->iph.daddr)
1252				nflags = IFF_POINTOPOINT;
1253
1254			if ((dev->flags ^ nflags) &
1255			    (IFF_POINTOPOINT | IFF_BROADCAST))
1256				return -EINVAL;
1257		}
1258	}
1259
1260	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1261	return 0;
1262}
1263EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1264
1265int ip_tunnel_init(struct net_device *dev)
1266{
1267	struct ip_tunnel *tunnel = netdev_priv(dev);
1268	struct iphdr *iph = &tunnel->parms.iph;
1269	int err;
1270
1271	dev->needs_free_netdev = true;
1272	dev->priv_destructor = ip_tunnel_dev_free;
1273	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1274	if (!dev->tstats)
1275		return -ENOMEM;
1276
1277	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1278	if (err) {
1279		free_percpu(dev->tstats);
1280		return err;
1281	}
1282
1283	err = gro_cells_init(&tunnel->gro_cells, dev);
1284	if (err) {
1285		dst_cache_destroy(&tunnel->dst_cache);
1286		free_percpu(dev->tstats);
1287		return err;
1288	}
1289
1290	tunnel->dev = dev;
1291	tunnel->net = dev_net(dev);
1292	strcpy(tunnel->parms.name, dev->name);
1293	iph->version		= 4;
1294	iph->ihl		= 5;
1295
1296	if (tunnel->collect_md)
1297		netif_keep_dst(dev);
1298	netdev_lockdep_set_classes(dev);
1299	return 0;
1300}
1301EXPORT_SYMBOL_GPL(ip_tunnel_init);
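
/*
 * A driver-side ndo_init sketch built on ip_tunnel_init(), modeled on
 * the ipip driver (this is what .ndo_init in the ops table sketched
 * earlier would really point at): the protocol driver seeds the
 * link-layer addresses from the tunnel endpoints and fixes its header
 * length before the generic initialization above runs.
 */
static int ipip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	__dev_addr_set(dev, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	tunnel->tun_hlen = 0;
	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
	return ip_tunnel_init(dev);
}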
1302
1303void ip_tunnel_uninit(struct net_device *dev)
1304{
1305	struct ip_tunnel *tunnel = netdev_priv(dev);
1306	struct net *net = tunnel->net;
1307	struct ip_tunnel_net *itn;
1308
1309	itn = net_generic(net, tunnel->ip_tnl_net_id);
1310	ip_tunnel_del(itn, netdev_priv(dev));
1311	if (itn->fb_tunnel_dev == dev)
1312		WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1313
1314	dst_cache_reset(&tunnel->dst_cache);
1315}
1316EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1317
 1318/* Do the least required initialization here; the rest is done in the tunnel_init call */
1319void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1320{
1321	struct ip_tunnel *tunnel = netdev_priv(dev);
1322	tunnel->ip_tnl_net_id = net_id;
1323}
1324EXPORT_SYMBOL_GPL(ip_tunnel_setup);
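
/*
 * And the matching .setup hook plus link ops, again trimmed from the
 * ipip driver's shape: ip_tunnel_setup() only records the per-netns id,
 * and everything else waits for ndo_init.
 */
static void ipip_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops = &ipip_netdev_ops;
	dev->type = ARPHRD_TUNNEL;
	dev->flags = IFF_NOARP;
	netif_keep_dst(dev);
	ip_tunnel_setup(dev, ipip_net_id);
}

static struct rtnl_link_ops ipip_link_ops __read_mostly = {
	.kind		= "ipip",
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipip_tunnel_setup,
	.newlink	= ipip_newlink,
	.dellink	= ip_tunnel_dellink,
	.get_link_net	= ip_tunnel_get_link_net,
};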
1325
1326MODULE_DESCRIPTION("IPv4 tunnel implementation library");
1327MODULE_LICENSE("GPL");