net/ipv4/ip_tunnel_core.c
v5.9
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2013 Nicira, Inc.
   4 */
   5
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/types.h>
   9#include <linux/kernel.h>
  10#include <linux/skbuff.h>
  11#include <linux/netdevice.h>
  12#include <linux/in.h>
  13#include <linux/if_arp.h>
  14#include <linux/init.h>
  15#include <linux/in6.h>
  16#include <linux/inetdevice.h>
  17#include <linux/netfilter_ipv4.h>
  18#include <linux/etherdevice.h>
  19#include <linux/if_ether.h>
  20#include <linux/if_vlan.h>
  21#include <linux/static_key.h>
  22
  23#include <net/ip.h>
  24#include <net/icmp.h>
  25#include <net/protocol.h>
  26#include <net/ip_tunnels.h>
  27#include <net/ip6_tunnel.h>
  28#include <net/ip6_checksum.h>
  29#include <net/arp.h>
  30#include <net/checksum.h>
  31#include <net/dsfield.h>
  32#include <net/inet_ecn.h>
  33#include <net/xfrm.h>
  34#include <net/net_namespace.h>
  35#include <net/netns/generic.h>
  36#include <net/rtnetlink.h>
  37#include <net/dst_metadata.h>
  38#include <net/geneve.h>
  39#include <net/vxlan.h>
  40#include <net/erspan.h>
  41
  42const struct ip_tunnel_encap_ops __rcu *
  43		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
  44EXPORT_SYMBOL(iptun_encaps);
  45
  46const struct ip6_tnl_encap_ops __rcu *
  47		ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
  48EXPORT_SYMBOL(ip6tun_encaps);
  49
  50void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
  51		   __be32 src, __be32 dst, __u8 proto,
  52		   __u8 tos, __u8 ttl, __be16 df, bool xnet)
  53{
  54	int pkt_len = skb->len - skb_inner_network_offset(skb);
  55	struct net *net = dev_net(rt->dst.dev);
  56	struct net_device *dev = skb->dev;
  57	struct iphdr *iph;
  58	int err;
  59
  60	skb_scrub_packet(skb, xnet);
  61
  62	skb_clear_hash_if_not_l4(skb);
  63	skb_dst_set(skb, &rt->dst);
  64	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
  65
  66	/* Push down and install the IP header. */
  67	skb_push(skb, sizeof(struct iphdr));
  68	skb_reset_network_header(skb);
  69
  70	iph = ip_hdr(skb);
  71
  72	iph->version	=	4;
  73	iph->ihl	=	sizeof(struct iphdr) >> 2;
  74	iph->frag_off	=	ip_mtu_locked(&rt->dst) ? 0 : df;
  75	iph->protocol	=	proto;
  76	iph->tos	=	tos;
  77	iph->daddr	=	dst;
  78	iph->saddr	=	src;
  79	iph->ttl	=	ttl;
  80	__ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
  81
  82	err = ip_local_out(net, sk, skb);
  83
  84	if (dev) {
  85		if (unlikely(net_xmit_eval(err)))
  86			pkt_len = 0;
  87		iptunnel_xmit_stats(dev, pkt_len);
  88	}
  89}
  90EXPORT_SYMBOL_GPL(iptunnel_xmit);
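/* Caller sketch (illustrative, not part of this file): an encapsulation
 * typically prepares offload state, pushes its own outer header, and only
 * then hands the skb to iptunnel_xmit() with the routed outer addresses.
 * For a UDP-based tunnel this looks roughly like (see udp_tunnel_xmit_skb()
 * for the in-tree version of this pattern):
 *
 *	err = iptunnel_handle_offloads(skb, SKB_GSO_UDP_TUNNEL);
 *	if (err)
 *		goto tx_error;
 *	__skb_push(skb, sizeof(struct udphdr));
 *	skb_reset_transport_header(skb);
 *	...fill in the UDP header...
 *	iptunnel_xmit(sk, rt, skb, fl4.saddr, fl4.daddr, IPPROTO_UDP,
 *		      tos, ttl, df, xnet);
 */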
  91
  92int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
  93			   __be16 inner_proto, bool raw_proto, bool xnet)
  94{
  95	if (unlikely(!pskb_may_pull(skb, hdr_len)))
  96		return -ENOMEM;
  97
  98	skb_pull_rcsum(skb, hdr_len);
  99
 100	if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
 101		struct ethhdr *eh;
 102
 103		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
 104			return -ENOMEM;
 105
 106		eh = (struct ethhdr *)skb->data;
 107		if (likely(eth_proto_is_802_3(eh->h_proto)))
 108			skb->protocol = eh->h_proto;
 109		else
 110			skb->protocol = htons(ETH_P_802_2);
 111
 112	} else {
 113		skb->protocol = inner_proto;
 114	}
 115
 116	skb_clear_hash_if_not_l4(skb);
 117	__vlan_hwaccel_clear_tag(skb);
 118	skb_set_queue_mapping(skb, 0);
 119	skb_scrub_packet(skb, xnet);
 120
 121	return iptunnel_pull_offloads(skb);
 122}
 123EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
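/* Most receive paths use the iptunnel_pull_header() inline from
 * <net/ip_tunnels.h>, a thin wrapper that passes raw_proto = false:
 *
 *	static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
 *					       __be16 inner_proto, bool xnet)
 *	{
 *		return __iptunnel_pull_header(skb, hdr_len, inner_proto,
 *					      false, xnet);
 *	}
 */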
 124
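/* Build a TX tunnel metadata dst mirroring a received (RX) one: the tunnel
 * ID, flags and options are copied, the original source endpoint becomes
 * the reply destination, and the mode is flipped to TX, so that a locally
 * generated answer can be encapsulated straight back to the sender.
 */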
 125struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 126					     gfp_t flags)
 127{
 128	struct metadata_dst *res;
 129	struct ip_tunnel_info *dst, *src;
 130
 131	if (!md || md->type != METADATA_IP_TUNNEL ||
 132	    md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
 133		return NULL;
 134
 135	src = &md->u.tun_info;
 136	res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
 137	if (!res)
 138		return NULL;
 139
 140	dst = &res->u.tun_info;
 141	dst->key.tun_id = src->key.tun_id;
 142	if (src->mode & IP_TUNNEL_INFO_IPV6)
 143		memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
 144		       sizeof(struct in6_addr));
 145	else
 146		dst->key.u.ipv4.dst = src->key.u.ipv4.src;
 147	dst->key.tun_flags = src->key.tun_flags;
 148	dst->mode = src->mode | IP_TUNNEL_INFO_TX;
 149	ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
 150				src->options_len, 0);
 151
 152	return res;
 153}
 154EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
 155
 156int iptunnel_handle_offloads(struct sk_buff *skb,
 157			     int gso_type_mask)
 158{
 159	int err;
 160
 161	if (likely(!skb->encapsulation)) {
 162		skb_reset_inner_headers(skb);
 163		skb->encapsulation = 1;
 164	}
 165
 166	if (skb_is_gso(skb)) {
 167		err = skb_header_unclone(skb, GFP_ATOMIC);
 168		if (unlikely(err))
 169			return err;
 170		skb_shinfo(skb)->gso_type |= gso_type_mask;
 171		return 0;
 172	}
 173
 174	if (skb->ip_summed != CHECKSUM_PARTIAL) {
 175		skb->ip_summed = CHECKSUM_NONE;
 176		/* We clear encapsulation here to prevent badly-written
 177		 * drivers potentially deciding to offload an inner checksum
 178		 * if we set CHECKSUM_PARTIAL on the outer header.
 179		 * This should go away when the drivers are all fixed.
 180		 */
 181		skb->encapsulation = 0;
 182	}
 183
 184	return 0;
 185}
 186EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
 187
 188/**
 189 * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
 190 * @skb:	Original packet with L2 header
 191 * @mtu:	MTU value for ICMP error
 192 *
 193 * Return: length on success, negative error code if message couldn't be built.
 194 */
 195static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
 196{
 197	const struct iphdr *iph = ip_hdr(skb);
 198	struct icmphdr *icmph;
 199	struct iphdr *niph;
 200	struct ethhdr eh;
 201	int len, err;
 202
 203	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
 204		return -EINVAL;
 205
 206	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
 207	pskb_pull(skb, ETH_HLEN);
 208	skb_reset_network_header(skb);
 209
 210	err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
 211	if (err)
 212		return err;
 213
 214	len = skb->len + sizeof(*icmph);
 215	err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
 216	if (err)
 217		return err;
 218
 219	icmph = skb_push(skb, sizeof(*icmph));
 220	*icmph = (struct icmphdr) {
 221		.type			= ICMP_DEST_UNREACH,
 222		.code			= ICMP_FRAG_NEEDED,
 223		.checksum		= 0,
 224		.un.frag.__unused	= 0,
  225		.un.frag.mtu		= htons(mtu),
 226	};
 227	icmph->checksum = ip_compute_csum(icmph, len);
 228	skb_reset_transport_header(skb);
 229
 230	niph = skb_push(skb, sizeof(*niph));
 231	*niph = (struct iphdr) {
 232		.ihl			= sizeof(*niph) / 4u,
  233		.version		= 4,
  234		.tos			= 0,
 235		.tot_len		= htons(len + sizeof(*niph)),
 236		.id			= 0,
 237		.frag_off		= htons(IP_DF),
 238		.ttl			= iph->ttl,
 239		.protocol		= IPPROTO_ICMP,
 240		.saddr			= iph->daddr,
 241		.daddr			= iph->saddr,
 242	};
 243	ip_send_check(niph);
 244	skb_reset_network_header(skb);
 245
 246	skb->ip_summed = CHECKSUM_NONE;
 247
 248	eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
 249	skb_reset_mac_header(skb);
 250
 251	return skb->len;
 252}
 253
 254/**
 255 * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
 256 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 257 * @mtu:	Network MTU for path
 258 *
 259 * Return: 0 for no ICMP reply, length if built, negative value on error.
 260 */
 261static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
 262{
 263	const struct icmphdr *icmph = icmp_hdr(skb);
 264	const struct iphdr *iph = ip_hdr(skb);
 265
 266	if (mtu <= 576 || iph->frag_off != htons(IP_DF))
 267		return 0;
 268
 269	if (ipv4_is_lbcast(iph->daddr)  || ipv4_is_multicast(iph->daddr) ||
 270	    ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr)  ||
 271	    ipv4_is_lbcast(iph->saddr)  || ipv4_is_multicast(iph->saddr))
 272		return 0;
 273
 274	if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
 275		return 0;
 276
 277	return iptunnel_pmtud_build_icmp(skb, mtu);
 278}
 279
 280#if IS_ENABLED(CONFIG_IPV6)
 281/**
 282 * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
 283 * @skb:	Original packet with L2 header
 284 * @mtu:	MTU value for ICMPv6 error
 285 *
 286 * Return: length on success, negative error code if message couldn't be built.
 287 */
 288static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
 289{
 290	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 291	struct icmp6hdr *icmp6h;
 292	struct ipv6hdr *nip6h;
 293	struct ethhdr eh;
 294	int len, err;
 295	__wsum csum;
 296
 297	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
 298		return -EINVAL;
 299
 300	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
 301	pskb_pull(skb, ETH_HLEN);
 302	skb_reset_network_header(skb);
 303
 304	err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
 305	if (err)
 306		return err;
 307
 308	len = skb->len + sizeof(*icmp6h);
 309	err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
 310	if (err)
 311		return err;
 312
 313	icmp6h = skb_push(skb, sizeof(*icmp6h));
 314	*icmp6h = (struct icmp6hdr) {
 315		.icmp6_type		= ICMPV6_PKT_TOOBIG,
 316		.icmp6_code		= 0,
 317		.icmp6_cksum		= 0,
 318		.icmp6_mtu		= htonl(mtu),
 319	};
 320	skb_reset_transport_header(skb);
 321
 322	nip6h = skb_push(skb, sizeof(*nip6h));
 323	*nip6h = (struct ipv6hdr) {
 324		.priority		= 0,
 325		.version		= 6,
 326		.flow_lbl		= { 0 },
 327		.payload_len		= htons(len),
 328		.nexthdr		= IPPROTO_ICMPV6,
 329		.hop_limit		= ip6h->hop_limit,
 330		.saddr			= ip6h->daddr,
 331		.daddr			= ip6h->saddr,
 332	};
 333	skb_reset_network_header(skb);
 334
 335	csum = csum_partial(icmp6h, len, 0);
 336	icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
 337					      IPPROTO_ICMPV6, csum);
 338
 339	skb->ip_summed = CHECKSUM_NONE;
 340
 341	eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
 342	skb_reset_mac_header(skb);
 343
 344	return skb->len;
 345}
 346
 347/**
 348 * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
 349 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 350 * @mtu:	Network MTU for path
 351 *
 352 * Return: 0 for no ICMPv6 reply, length if built, negative value on error.
 353 */
 354static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
 355{
 356	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 357	int stype = ipv6_addr_type(&ip6h->saddr);
 358	u8 proto = ip6h->nexthdr;
 359	__be16 frag_off;
 360	int offset;
 361
 362	if (mtu <= IPV6_MIN_MTU)
 363		return 0;
 364
 365	if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
 366	    stype == IPV6_ADDR_LOOPBACK)
 367		return 0;
 368
 369	offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
 370				  &frag_off);
 371	if (offset < 0 || (frag_off & htons(~0x7)))
 372		return 0;
 373
 374	if (proto == IPPROTO_ICMPV6) {
 375		struct icmp6hdr *icmp6h;
 376
 377		if (!pskb_may_pull(skb, skb_network_header(skb) +
 378					offset + 1 - skb->data))
 379			return 0;
 380
 381		icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 382		if (icmpv6_is_err(icmp6h->icmp6_type) ||
 383		    icmp6h->icmp6_type == NDISC_REDIRECT)
 384			return 0;
 385	}
 386
 387	return iptunnel_pmtud_build_icmpv6(skb, mtu);
 388}
 389#endif /* IS_ENABLED(CONFIG_IPV6) */
 390
 391/**
 392 * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
 393 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 394 * @encap_dst:	Destination for tunnel encapsulation (outer IP)
 395 * @headroom:	Encapsulation header size, bytes
 396 * @reply:	Build matching ICMP or ICMPv6 message as a result
 397 *
 398 * L2 tunnel implementations that can carry IP and can be directly bridged
 399 * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
 400 * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
 401 * based on payload and sent back by the encapsulation itself.
 402 *
 403 * For routable interfaces, we just need to update the PMTU for the destination.
 404 *
 405 * Return: 0 if ICMP error not needed, length if built, negative value on error
 406 */
 407int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
 408			  int headroom, bool reply)
 409{
 410	u32 mtu = dst_mtu(encap_dst) - headroom;
 411
 412	if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
 413	    (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu))
 414		return 0;
 415
 416	skb_dst_update_pmtu_no_confirm(skb, mtu);
 417
 418	if (!reply || skb->pkt_type == PACKET_HOST)
 419		return 0;
 420
 421	if (skb->protocol == htons(ETH_P_IP))
 422		return iptunnel_pmtud_check_icmp(skb, mtu);
 423
 424#if IS_ENABLED(CONFIG_IPV6)
 425	if (skb->protocol == htons(ETH_P_IPV6))
 426		return iptunnel_pmtud_check_icmpv6(skb, mtu);
 427#endif
 428	return 0;
 429}
 430EXPORT_SYMBOL(skb_tunnel_check_pmtu);
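/* Caller sketch (illustrative): a bridged UDP tunnel checks the path MTU
 * right before encapsulating, treating a positive return as "an ICMP or
 * ICMPv6 reply was built in place of the payload" and handing that reply
 * back instead of transmitting:
 *
 *	err = skb_tunnel_check_pmtu(skb, &rt->dst, headroom,
 *				    netif_is_any_bridge_port(dev));
 *	if (err < 0)
 *		goto tx_error;
 *	else if (err)
 *		goto reply_built;
 *
 * where "headroom" is the caller's full encapsulation overhead
 * (e.g. VXLAN_HEADROOM).
 */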
 431
 432/* Often modified stats are per cpu, other are shared (netdev->stats) */
 433void ip_tunnel_get_stats64(struct net_device *dev,
 434			   struct rtnl_link_stats64 *tot)
 435{
 436	int i;
 437
 438	netdev_stats_to_stats64(tot, &dev->stats);
 439
 440	for_each_possible_cpu(i) {
 441		const struct pcpu_sw_netstats *tstats =
 442						   per_cpu_ptr(dev->tstats, i);
 443		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
 444		unsigned int start;
 445
 446		do {
 447			start = u64_stats_fetch_begin_irq(&tstats->syncp);
 448			rx_packets = tstats->rx_packets;
 449			tx_packets = tstats->tx_packets;
 450			rx_bytes = tstats->rx_bytes;
 451			tx_bytes = tstats->tx_bytes;
 452		} while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
 453
 454		tot->rx_packets += rx_packets;
 455		tot->tx_packets += tx_packets;
 456		tot->rx_bytes   += rx_bytes;
 457		tot->tx_bytes   += tx_bytes;
 458	}
 459}
 460EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
 461
 462static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 463	[LWTUNNEL_IP_UNSPEC]	= { .strict_start_type = LWTUNNEL_IP_OPTS },
 464	[LWTUNNEL_IP_ID]	= { .type = NLA_U64 },
 465	[LWTUNNEL_IP_DST]	= { .type = NLA_U32 },
 466	[LWTUNNEL_IP_SRC]	= { .type = NLA_U32 },
 467	[LWTUNNEL_IP_TTL]	= { .type = NLA_U8 },
 468	[LWTUNNEL_IP_TOS]	= { .type = NLA_U8 },
 469	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },
 470	[LWTUNNEL_IP_OPTS]	= { .type = NLA_NESTED },
 471};
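/* These attributes back iproute2's "encap ip" lightweight-tunnel route
 * option; an illustrative command that exercises this policy (assuming a
 * metadata-mode device named geneve0 exists):
 *
 *	ip route add 10.1.1.0/24 encap ip id 1000 dst 192.0.2.1 dev geneve0
 */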
 472
 473static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
 474	[LWTUNNEL_IP_OPTS_GENEVE]	= { .type = NLA_NESTED },
 475	[LWTUNNEL_IP_OPTS_VXLAN]	= { .type = NLA_NESTED },
 476	[LWTUNNEL_IP_OPTS_ERSPAN]	= { .type = NLA_NESTED },
 477};
 478
 479static const struct nla_policy
 480geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
 481	[LWTUNNEL_IP_OPT_GENEVE_CLASS]	= { .type = NLA_U16 },
 482	[LWTUNNEL_IP_OPT_GENEVE_TYPE]	= { .type = NLA_U8 },
 483	[LWTUNNEL_IP_OPT_GENEVE_DATA]	= { .type = NLA_BINARY, .len = 128 },
 484};
 485
 486static const struct nla_policy
 487vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
 488	[LWTUNNEL_IP_OPT_VXLAN_GBP]	= { .type = NLA_U32 },
 489};
 490
 491static const struct nla_policy
 492erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
 493	[LWTUNNEL_IP_OPT_ERSPAN_VER]	= { .type = NLA_U8 },
 494	[LWTUNNEL_IP_OPT_ERSPAN_INDEX]	= { .type = NLA_U32 },
 495	[LWTUNNEL_IP_OPT_ERSPAN_DIR]	= { .type = NLA_U8 },
 496	[LWTUNNEL_IP_OPT_ERSPAN_HWID]	= { .type = NLA_U8 },
 497};
 498
 499static int ip_tun_parse_opts_geneve(struct nlattr *attr,
 500				    struct ip_tunnel_info *info, int opts_len,
 501				    struct netlink_ext_ack *extack)
 502{
 503	struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
 504	int data_len, err;
 505
 506	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
 507			       geneve_opt_policy, extack);
 508	if (err)
 509		return err;
 510
 511	if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
 512	    !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] ||
 513	    !tb[LWTUNNEL_IP_OPT_GENEVE_DATA])
 514		return -EINVAL;
 515
 516	attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
 517	data_len = nla_len(attr);
 518	if (data_len % 4)
 519		return -EINVAL;
 520
 521	if (info) {
 522		struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len;
 523
 524		memcpy(opt->opt_data, nla_data(attr), data_len);
 525		opt->length = data_len / 4;
 526		attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
 527		opt->opt_class = nla_get_be16(attr);
 528		attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
 529		opt->type = nla_get_u8(attr);
 530		info->key.tun_flags |= TUNNEL_GENEVE_OPT;
 531	}
 532
 533	return sizeof(struct geneve_opt) + data_len;
 534}
 535
 536static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
 537				   struct ip_tunnel_info *info, int opts_len,
 538				   struct netlink_ext_ack *extack)
 539{
 540	struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
 541	int err;
 542
 543	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
 544			       vxlan_opt_policy, extack);
 545	if (err)
 546		return err;
 547
 548	if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
 549		return -EINVAL;
 550
 551	if (info) {
 552		struct vxlan_metadata *md =
 553			ip_tunnel_info_opts(info) + opts_len;
 554
 555		attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
 556		md->gbp = nla_get_u32(attr);
 557		md->gbp &= VXLAN_GBP_MASK;
 558		info->key.tun_flags |= TUNNEL_VXLAN_OPT;
 559	}
 560
 561	return sizeof(struct vxlan_metadata);
 562}
 563
 564static int ip_tun_parse_opts_erspan(struct nlattr *attr,
 565				    struct ip_tunnel_info *info, int opts_len,
 566				    struct netlink_ext_ack *extack)
 567{
 568	struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
 569	int err;
 570	u8 ver;
 571
 572	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
 573			       erspan_opt_policy, extack);
 574	if (err)
 575		return err;
 576
 577	if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
 578		return -EINVAL;
 579
 580	ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
 581	if (ver == 1) {
 582		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
 583			return -EINVAL;
 584	} else if (ver == 2) {
 585		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
 586		    !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
 587			return -EINVAL;
 588	} else {
 589		return -EINVAL;
 590	}
 591
 592	if (info) {
 593		struct erspan_metadata *md =
 594			ip_tunnel_info_opts(info) + opts_len;
 595
 596		md->version = ver;
 597		if (ver == 1) {
 598			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
 599			md->u.index = nla_get_be32(attr);
 600		} else {
 601			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
 602			md->u.md2.dir = nla_get_u8(attr);
 603			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
 604			set_hwid(&md->u.md2, nla_get_u8(attr));
 605		}
 606
 607		info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
 608	}
 609
 610	return sizeof(struct erspan_metadata);
 611}
 612
 613static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 614			     struct netlink_ext_ack *extack)
 615{
 616	int err, rem, opt_len, opts_len = 0, type = 0;
 617	struct nlattr *nla;
 618
 619	if (!attr)
 620		return 0;
 621
 622	err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
 623			   ip_opts_policy, extack);
 624	if (err)
 625		return err;
 626
 627	nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
 628		switch (nla_type(nla)) {
 629		case LWTUNNEL_IP_OPTS_GENEVE:
 630			if (type && type != TUNNEL_GENEVE_OPT)
 631				return -EINVAL;
 632			opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
 633							   extack);
 634			if (opt_len < 0)
 635				return opt_len;
 636			opts_len += opt_len;
 637			if (opts_len > IP_TUNNEL_OPTS_MAX)
 638				return -EINVAL;
 639			type = TUNNEL_GENEVE_OPT;
 640			break;
 641		case LWTUNNEL_IP_OPTS_VXLAN:
 642			if (type)
 643				return -EINVAL;
 644			opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len,
 645							  extack);
 646			if (opt_len < 0)
 647				return opt_len;
 648			opts_len += opt_len;
 649			type = TUNNEL_VXLAN_OPT;
 650			break;
 651		case LWTUNNEL_IP_OPTS_ERSPAN:
 652			if (type)
 653				return -EINVAL;
 654			opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len,
 655							   extack);
 656			if (opt_len < 0)
 657				return opt_len;
 658			opts_len += opt_len;
 659			type = TUNNEL_ERSPAN_OPT;
 660			break;
 661		default:
 662			return -EINVAL;
 663		}
 664	}
 665
 666	return opts_len;
 667}
 668
 669static int ip_tun_get_optlen(struct nlattr *attr,
 670			     struct netlink_ext_ack *extack)
 671{
 672	return ip_tun_parse_opts(attr, NULL, extack);
 673}
 674
 675static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 676			   struct netlink_ext_ack *extack)
 677{
 678	return ip_tun_parse_opts(attr, info, extack);
 679}
 680
 681static int ip_tun_build_state(struct net *net, struct nlattr *attr,
 682			      unsigned int family, const void *cfg,
 683			      struct lwtunnel_state **ts,
 684			      struct netlink_ext_ack *extack)
 685{
 686	struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
 687	struct lwtunnel_state *new_state;
 688	struct ip_tunnel_info *tun_info;
 689	int err, opt_len;
 690
 691	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
 692					  ip_tun_policy, extack);
 693	if (err < 0)
 694		return err;
 695
 696	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
 697	if (opt_len < 0)
 698		return opt_len;
 699
 700	new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
 701	if (!new_state)
 702		return -ENOMEM;
 703
 704	new_state->type = LWTUNNEL_ENCAP_IP;
 705
 706	tun_info = lwt_tun_info(new_state);
 707
 708	err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
 709	if (err < 0) {
 710		lwtstate_free(new_state);
 711		return err;
 712	}
 713
 714#ifdef CONFIG_DST_CACHE
 715	err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
 716	if (err) {
 717		lwtstate_free(new_state);
 718		return err;
 719	}
 720#endif
 721
 722	if (tb[LWTUNNEL_IP_ID])
 723		tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
 724
 725	if (tb[LWTUNNEL_IP_DST])
 726		tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
 727
 728	if (tb[LWTUNNEL_IP_SRC])
 729		tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
 730
 731	if (tb[LWTUNNEL_IP_TTL])
 732		tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
 733
 734	if (tb[LWTUNNEL_IP_TOS])
 735		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
 736
 737	if (tb[LWTUNNEL_IP_FLAGS])
 738		tun_info->key.tun_flags |=
 739				(nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) &
 740				 ~TUNNEL_OPTIONS_PRESENT);
 741
 742	tun_info->mode = IP_TUNNEL_INFO_TX;
 743	tun_info->options_len = opt_len;
 744
 745	*ts = new_state;
 746
 747	return 0;
 748}
 749
 750static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
 751{
 752#ifdef CONFIG_DST_CACHE
 753	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
 754
 755	dst_cache_destroy(&tun_info->dst_cache);
 756#endif
 757}
 758
 759static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
 760					 struct ip_tunnel_info *tun_info)
 761{
 762	struct geneve_opt *opt;
 763	struct nlattr *nest;
 764	int offset = 0;
 765
 766	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
 767	if (!nest)
 768		return -ENOMEM;
 769
 770	while (tun_info->options_len > offset) {
 771		opt = ip_tunnel_info_opts(tun_info) + offset;
 772		if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
 773				 opt->opt_class) ||
 774		    nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
 775		    nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
 776			    opt->opt_data)) {
 777			nla_nest_cancel(skb, nest);
 778			return -ENOMEM;
 779		}
 780		offset += sizeof(*opt) + opt->length * 4;
 781	}
 782
 783	nla_nest_end(skb, nest);
 784	return 0;
 785}
 786
 787static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
 788					struct ip_tunnel_info *tun_info)
 789{
 790	struct vxlan_metadata *md;
 791	struct nlattr *nest;
 792
 793	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
 794	if (!nest)
 795		return -ENOMEM;
 796
 797	md = ip_tunnel_info_opts(tun_info);
 798	if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
 799		nla_nest_cancel(skb, nest);
 800		return -ENOMEM;
 801	}
 802
 803	nla_nest_end(skb, nest);
 804	return 0;
 805}
 806
 807static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
 808					 struct ip_tunnel_info *tun_info)
 809{
 810	struct erspan_metadata *md;
 811	struct nlattr *nest;
 812
 813	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
 814	if (!nest)
 815		return -ENOMEM;
 816
 817	md = ip_tunnel_info_opts(tun_info);
 818	if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
 819		goto err;
 820
 821	if (md->version == 1 &&
 822	    nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
 823		goto err;
 824
 825	if (md->version == 2 &&
 826	    (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
 827	     nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
 828			get_hwid(&md->u.md2))))
 829		goto err;
 830
 831	nla_nest_end(skb, nest);
 832	return 0;
 833err:
 834	nla_nest_cancel(skb, nest);
 835	return -ENOMEM;
 836}
 837
 838static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
 839				  struct ip_tunnel_info *tun_info)
 840{
 841	struct nlattr *nest;
 842	int err = 0;
 843
 844	if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
 845		return 0;
 846
 847	nest = nla_nest_start_noflag(skb, type);
 848	if (!nest)
 849		return -ENOMEM;
 850
 851	if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT)
 852		err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
 853	else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT)
 854		err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
 855	else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)
 856		err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
 857
 858	if (err) {
 859		nla_nest_cancel(skb, nest);
 860		return err;
 861	}
 862
 863	nla_nest_end(skb, nest);
 864	return 0;
 865}
 866
 867static int ip_tun_fill_encap_info(struct sk_buff *skb,
 868				  struct lwtunnel_state *lwtstate)
 869{
 870	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
 871
 872	if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id,
 873			 LWTUNNEL_IP_PAD) ||
 874	    nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
 875	    nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
 876	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
 877	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
 878	    nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) ||
 879	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
 880		return -ENOMEM;
 881
 882	return 0;
 883}
 884
 885static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
 886{
 887	int opt_len;
 888
 889	if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
 890		return 0;
 891
 892	opt_len = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
 893	if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
 894		struct geneve_opt *opt;
 895		int offset = 0;
 896
 897		opt_len += nla_total_size(0);	/* LWTUNNEL_IP_OPTS_GENEVE */
 898		while (info->options_len > offset) {
 899			opt = ip_tunnel_info_opts(info) + offset;
 900			opt_len += nla_total_size(2)	/* OPT_GENEVE_CLASS */
 901				   + nla_total_size(1)	/* OPT_GENEVE_TYPE */
 902				   + nla_total_size(opt->length * 4);
 903							/* OPT_GENEVE_DATA */
 904			offset += sizeof(*opt) + opt->length * 4;
 905		}
 906	} else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
 907		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_VXLAN */
 908			   + nla_total_size(4);	/* OPT_VXLAN_GBP */
 909	} else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
 910		struct erspan_metadata *md = ip_tunnel_info_opts(info);
 911
 912		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_ERSPAN */
 913			   + nla_total_size(1)	/* OPT_ERSPAN_VER */
 914			   + (md->version == 1 ? nla_total_size(4)
 915						/* OPT_ERSPAN_INDEX (v1) */
 916					       : nla_total_size(1) +
 917						 nla_total_size(1));
 918						/* OPT_ERSPAN_DIR + HWID (v2) */
 919	}
 920
 921	return opt_len;
 922}
 923
 924static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
 925{
 926	return nla_total_size_64bit(8)	/* LWTUNNEL_IP_ID */
 927		+ nla_total_size(4)	/* LWTUNNEL_IP_DST */
 928		+ nla_total_size(4)	/* LWTUNNEL_IP_SRC */
 929		+ nla_total_size(1)	/* LWTUNNEL_IP_TOS */
 930		+ nla_total_size(1)	/* LWTUNNEL_IP_TTL */
 931		+ nla_total_size(2)	/* LWTUNNEL_IP_FLAGS */
 932		+ ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
 933					/* LWTUNNEL_IP_OPTS */
 934}
 935
 936static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
 937{
 938	struct ip_tunnel_info *info_a = lwt_tun_info(a);
 939	struct ip_tunnel_info *info_b = lwt_tun_info(b);
 940
 941	return memcmp(info_a, info_b, sizeof(info_a->key)) ||
 942	       info_a->mode != info_b->mode ||
 943	       info_a->options_len != info_b->options_len ||
 944	       memcmp(ip_tunnel_info_opts(info_a),
 945		      ip_tunnel_info_opts(info_b), info_a->options_len);
 946}
 947
 948static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
 949	.build_state = ip_tun_build_state,
 950	.destroy_state = ip_tun_destroy_state,
 951	.fill_encap = ip_tun_fill_encap_info,
 952	.get_encap_size = ip_tun_encap_nlsize,
 953	.cmp_encap = ip_tun_cmp_encap,
 954	.owner = THIS_MODULE,
 955};
 956
 957static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
 958	[LWTUNNEL_IP6_UNSPEC]	= { .strict_start_type = LWTUNNEL_IP6_OPTS },
 959	[LWTUNNEL_IP6_ID]		= { .type = NLA_U64 },
 960	[LWTUNNEL_IP6_DST]		= { .len = sizeof(struct in6_addr) },
 961	[LWTUNNEL_IP6_SRC]		= { .len = sizeof(struct in6_addr) },
 962	[LWTUNNEL_IP6_HOPLIMIT]		= { .type = NLA_U8 },
 963	[LWTUNNEL_IP6_TC]		= { .type = NLA_U8 },
 964	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },
 965	[LWTUNNEL_IP6_OPTS]		= { .type = NLA_NESTED },
 966};
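/* IPv6 counterpart of ip_tun_policy above, reached through iproute2's
 * "encap ip6" route option (illustrative):
 *
 *	ip route add 10.1.1.0/24 encap ip6 id 1000 dst 2001:db8::1 dev geneve0
 */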
 967
 968static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
 969			       unsigned int family, const void *cfg,
 970			       struct lwtunnel_state **ts,
 971			       struct netlink_ext_ack *extack)
 972{
 973	struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
 974	struct lwtunnel_state *new_state;
 975	struct ip_tunnel_info *tun_info;
 976	int err, opt_len;
 977
 978	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
 979					  ip6_tun_policy, extack);
 980	if (err < 0)
 981		return err;
 982
 983	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
 984	if (opt_len < 0)
 985		return opt_len;
 986
 987	new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
 988	if (!new_state)
 989		return -ENOMEM;
 990
 991	new_state->type = LWTUNNEL_ENCAP_IP6;
 992
 993	tun_info = lwt_tun_info(new_state);
 994
 995	err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
 996	if (err < 0) {
 997		lwtstate_free(new_state);
 998		return err;
 999	}
1000
1001	if (tb[LWTUNNEL_IP6_ID])
1002		tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
1003
1004	if (tb[LWTUNNEL_IP6_DST])
1005		tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
1006
1007	if (tb[LWTUNNEL_IP6_SRC])
1008		tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);
1009
1010	if (tb[LWTUNNEL_IP6_HOPLIMIT])
1011		tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);
1012
1013	if (tb[LWTUNNEL_IP6_TC])
1014		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
1015
1016	if (tb[LWTUNNEL_IP6_FLAGS])
1017		tun_info->key.tun_flags |=
1018				(nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) &
1019				 ~TUNNEL_OPTIONS_PRESENT);
1020
1021	tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
1022	tun_info->options_len = opt_len;
1023
1024	*ts = new_state;
1025
1026	return 0;
1027}
1028
1029static int ip6_tun_fill_encap_info(struct sk_buff *skb,
1030				   struct lwtunnel_state *lwtstate)
1031{
1032	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
1033
1034	if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id,
1035			 LWTUNNEL_IP6_PAD) ||
1036	    nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
1037	    nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
1038	    nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
1039	    nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
1040	    nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) ||
1041	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
1042		return -ENOMEM;
1043
1044	return 0;
1045}
1046
1047static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
1048{
1049	return nla_total_size_64bit(8)	/* LWTUNNEL_IP6_ID */
1050		+ nla_total_size(16)	/* LWTUNNEL_IP6_DST */
1051		+ nla_total_size(16)	/* LWTUNNEL_IP6_SRC */
1052		+ nla_total_size(1)	/* LWTUNNEL_IP6_HOPLIMIT */
1053		+ nla_total_size(1)	/* LWTUNNEL_IP6_TC */
1054		+ nla_total_size(2)	/* LWTUNNEL_IP6_FLAGS */
1055		+ ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
1056					/* LWTUNNEL_IP6_OPTS */
1057}
1058
1059static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
1060	.build_state = ip6_tun_build_state,
1061	.fill_encap = ip6_tun_fill_encap_info,
1062	.get_encap_size = ip6_tun_encap_nlsize,
1063	.cmp_encap = ip_tun_cmp_encap,
1064	.owner = THIS_MODULE,
1065};
1066
1067void __init ip_tunnel_core_init(void)
1068{
1069	/* If you land here, make sure whether increasing ip_tunnel_info's
1070	 * options_len is a reasonable choice with its usage in front ends
1071	 * (f.e., it's part of flow keys, etc).
1072	 */
1073	BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
1074
1075	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
1076	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
1077}
1078
1079DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
1080EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
1081
1082void ip_tunnel_need_metadata(void)
1083{
1084	static_branch_inc(&ip_tunnel_metadata_cnt);
1085}
1086EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
1087
1088void ip_tunnel_unneed_metadata(void)
1089{
1090	static_branch_dec(&ip_tunnel_metadata_cnt);
1091}
1092EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
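/* This static key pairs with the ip_tunnel_collect_metadata() test in
 * <net/ip_tunnels.h>; metadata-based (collect_md) tunnel users bump it on
 * creation so that hot paths pay nothing when no such tunnel exists:
 *
 *	static inline bool ip_tunnel_collect_metadata(void)
 *	{
 *		return static_branch_unlikely(&ip_tunnel_metadata_cnt);
 *	}
 */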
1093
1094/* Returns either the correct skb->protocol value, or 0 if invalid. */
1095__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb)
1096{
1097	if (skb_network_header(skb) >= skb->head &&
1098	    (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) &&
1099	    ip_hdr(skb)->version == 4)
1100		return htons(ETH_P_IP);
1101	if (skb_network_header(skb) >= skb->head &&
1102	    (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) &&
1103	    ipv6_hdr(skb)->version == 6)
1104		return htons(ETH_P_IPV6);
1105	return 0;
1106}
1107EXPORT_SYMBOL(ip_tunnel_parse_protocol);
1108
1109const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol };
1110EXPORT_SYMBOL(ip_tunnel_header_ops);
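/* Usage sketch: an L3 tunnel driver opts in from its setup routine so that
 * AF_PACKET taps on the device get a meaningful skb->protocol; the setup
 * function below is hypothetical, but in-tree drivers such as ipip and vti
 * assign these header_ops the same way:
 *
 *	static void my_tunnel_setup(struct net_device *dev)
 *	{
 *		dev->header_ops = &ip_tunnel_header_ops;
 *		...
 *	}
 */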
v6.13.7
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2013 Nicira, Inc.
   4 */
   5
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/types.h>
   9#include <linux/kernel.h>
  10#include <linux/skbuff.h>
  11#include <linux/netdevice.h>
  12#include <linux/in.h>
  13#include <linux/if_arp.h>
  14#include <linux/init.h>
  15#include <linux/in6.h>
  16#include <linux/inetdevice.h>
  17#include <linux/netfilter_ipv4.h>
  18#include <linux/etherdevice.h>
  19#include <linux/if_ether.h>
  20#include <linux/if_vlan.h>
  21#include <linux/static_key.h>
  22
  23#include <net/ip.h>
  24#include <net/icmp.h>
  25#include <net/protocol.h>
  26#include <net/ip_tunnels.h>
  27#include <net/ip6_tunnel.h>
  28#include <net/ip6_checksum.h>
  29#include <net/arp.h>
  30#include <net/checksum.h>
  31#include <net/dsfield.h>
  32#include <net/inet_ecn.h>
  33#include <net/xfrm.h>
  34#include <net/net_namespace.h>
  35#include <net/netns/generic.h>
  36#include <net/rtnetlink.h>
  37#include <net/dst_metadata.h>
  38#include <net/geneve.h>
  39#include <net/vxlan.h>
  40#include <net/erspan.h>
  41
  42const struct ip_tunnel_encap_ops __rcu *
  43		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
  44EXPORT_SYMBOL(iptun_encaps);
  45
  46const struct ip6_tnl_encap_ops __rcu *
  47		ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
  48EXPORT_SYMBOL(ip6tun_encaps);
  49
  50void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
  51		   __be32 src, __be32 dst, __u8 proto,
  52		   __u8 tos, __u8 ttl, __be16 df, bool xnet)
  53{
  54	int pkt_len = skb->len - skb_inner_network_offset(skb);
  55	struct net *net = dev_net(rt->dst.dev);
  56	struct net_device *dev = skb->dev;
  57	struct iphdr *iph;
  58	int err;
  59
  60	skb_scrub_packet(skb, xnet);
  61
  62	skb_clear_hash_if_not_l4(skb);
  63	skb_dst_set(skb, &rt->dst);
  64	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
  65
  66	/* Push down and install the IP header. */
  67	skb_push(skb, sizeof(struct iphdr));
  68	skb_reset_network_header(skb);
  69
  70	iph = ip_hdr(skb);
  71
  72	iph->version	=	4;
  73	iph->ihl	=	sizeof(struct iphdr) >> 2;
  74	iph->frag_off	=	ip_mtu_locked(&rt->dst) ? 0 : df;
  75	iph->protocol	=	proto;
  76	iph->tos	=	tos;
  77	iph->daddr	=	dst;
  78	iph->saddr	=	src;
  79	iph->ttl	=	ttl;
  80	__ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
  81
  82	err = ip_local_out(net, sk, skb);
  83
  84	if (dev) {
  85		if (unlikely(net_xmit_eval(err)))
  86			pkt_len = 0;
  87		iptunnel_xmit_stats(dev, pkt_len);
  88	}
  89}
  90EXPORT_SYMBOL_GPL(iptunnel_xmit);
  91
  92int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
  93			   __be16 inner_proto, bool raw_proto, bool xnet)
  94{
  95	if (unlikely(!pskb_may_pull(skb, hdr_len)))
  96		return -ENOMEM;
  97
  98	skb_pull_rcsum(skb, hdr_len);
  99
 100	if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
 101		struct ethhdr *eh;
 102
 103		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
 104			return -ENOMEM;
 105
 106		eh = (struct ethhdr *)skb->data;
 107		if (likely(eth_proto_is_802_3(eh->h_proto)))
 108			skb->protocol = eh->h_proto;
 109		else
 110			skb->protocol = htons(ETH_P_802_2);
 111
 112	} else {
 113		skb->protocol = inner_proto;
 114	}
 115
 116	skb_clear_hash_if_not_l4(skb);
 117	__vlan_hwaccel_clear_tag(skb);
 118	skb_set_queue_mapping(skb, 0);
 119	skb_scrub_packet(skb, xnet);
 120
 121	return iptunnel_pull_offloads(skb);
 122}
 123EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
 124
 125struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 126					     gfp_t flags)
 127{
 128	IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
 129	struct metadata_dst *res;
 130	struct ip_tunnel_info *dst, *src;
 131
 132	if (!md || md->type != METADATA_IP_TUNNEL ||
 133	    md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
 134		return NULL;
 135
 136	src = &md->u.tun_info;
 137	res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
 138	if (!res)
 139		return NULL;
 140
 141	dst = &res->u.tun_info;
 142	dst->key.tun_id = src->key.tun_id;
 143	if (src->mode & IP_TUNNEL_INFO_IPV6)
 144		memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
 145		       sizeof(struct in6_addr));
 146	else
 147		dst->key.u.ipv4.dst = src->key.u.ipv4.src;
 148	ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags);
 149	dst->mode = src->mode | IP_TUNNEL_INFO_TX;
 150	ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
 151				src->options_len, tun_flags);
 152
 153	return res;
 154}
 155EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
 156
 157int iptunnel_handle_offloads(struct sk_buff *skb,
 158			     int gso_type_mask)
 159{
 160	int err;
 161
 162	if (likely(!skb->encapsulation)) {
 163		skb_reset_inner_headers(skb);
 164		skb->encapsulation = 1;
 165	}
 166
 167	if (skb_is_gso(skb)) {
 168		err = skb_header_unclone(skb, GFP_ATOMIC);
 169		if (unlikely(err))
 170			return err;
 171		skb_shinfo(skb)->gso_type |= gso_type_mask;
 172		return 0;
 173	}
 174
 175	if (skb->ip_summed != CHECKSUM_PARTIAL) {
 176		skb->ip_summed = CHECKSUM_NONE;
 177		/* We clear encapsulation here to prevent badly-written
 178		 * drivers potentially deciding to offload an inner checksum
 179		 * if we set CHECKSUM_PARTIAL on the outer header.
 180		 * This should go away when the drivers are all fixed.
 181		 */
 182		skb->encapsulation = 0;
 183	}
 184
 185	return 0;
 186}
 187EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
 188
 189/**
 190 * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
 191 * @skb:	Original packet with L2 header
 192 * @mtu:	MTU value for ICMP error
 193 *
 194 * Return: length on success, negative error code if message couldn't be built.
 195 */
 196static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
 197{
 198	const struct iphdr *iph = ip_hdr(skb);
 199	struct icmphdr *icmph;
 200	struct iphdr *niph;
 201	struct ethhdr eh;
 202	int len, err;
 203
 204	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
 205		return -EINVAL;
 206
 207	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
 208	pskb_pull(skb, ETH_HLEN);
 209	skb_reset_network_header(skb);
 210
 211	err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
 212	if (err)
 213		return err;
 214
 215	len = skb->len + sizeof(*icmph);
 216	err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
 217	if (err)
 218		return err;
 219
 220	icmph = skb_push(skb, sizeof(*icmph));
 221	*icmph = (struct icmphdr) {
 222		.type			= ICMP_DEST_UNREACH,
 223		.code			= ICMP_FRAG_NEEDED,
 224		.checksum		= 0,
 225		.un.frag.__unused	= 0,
 226		.un.frag.mtu		= htons(mtu),
 227	};
 228	icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
 229	skb_reset_transport_header(skb);
 230
 231	niph = skb_push(skb, sizeof(*niph));
 232	*niph = (struct iphdr) {
 233		.ihl			= sizeof(*niph) / 4u,
 234		.version 		= 4,
 235		.tos 			= 0,
 236		.tot_len		= htons(len + sizeof(*niph)),
 237		.id			= 0,
 238		.frag_off		= htons(IP_DF),
 239		.ttl			= iph->ttl,
 240		.protocol		= IPPROTO_ICMP,
 241		.saddr			= iph->daddr,
 242		.daddr			= iph->saddr,
 243	};
 244	ip_send_check(niph);
 245	skb_reset_network_header(skb);
 246
 247	skb->ip_summed = CHECKSUM_NONE;
 248
 249	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
 250	skb_reset_mac_header(skb);
 251
 252	return skb->len;
 253}
 254
 255/**
 256 * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
 257 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 258 * @mtu:	Network MTU for path
 259 *
 260 * Return: 0 for no ICMP reply, length if built, negative value on error.
 261 */
 262static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
 263{
 264	const struct icmphdr *icmph = icmp_hdr(skb);
 265	const struct iphdr *iph = ip_hdr(skb);
 266
 267	if (mtu < 576 || iph->frag_off != htons(IP_DF))
 268		return 0;
 269
 270	if (ipv4_is_lbcast(iph->daddr)  || ipv4_is_multicast(iph->daddr) ||
 271	    ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr)  ||
 272	    ipv4_is_lbcast(iph->saddr)  || ipv4_is_multicast(iph->saddr))
 273		return 0;
 274
 275	if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
 276		return 0;
 277
 278	return iptunnel_pmtud_build_icmp(skb, mtu);
 279}
 280
 281#if IS_ENABLED(CONFIG_IPV6)
 282/**
 283 * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
 284 * @skb:	Original packet with L2 header
 285 * @mtu:	MTU value for ICMPv6 error
 286 *
 287 * Return: length on success, negative error code if message couldn't be built.
 288 */
 289static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
 290{
 291	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 292	struct icmp6hdr *icmp6h;
 293	struct ipv6hdr *nip6h;
 294	struct ethhdr eh;
 295	int len, err;
 296	__wsum csum;
 297
 298	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
 299		return -EINVAL;
 300
 301	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
 302	pskb_pull(skb, ETH_HLEN);
 303	skb_reset_network_header(skb);
 304
 305	err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
 306	if (err)
 307		return err;
 308
 309	len = skb->len + sizeof(*icmp6h);
 310	err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
 311	if (err)
 312		return err;
 313
 314	icmp6h = skb_push(skb, sizeof(*icmp6h));
 315	*icmp6h = (struct icmp6hdr) {
 316		.icmp6_type		= ICMPV6_PKT_TOOBIG,
 317		.icmp6_code		= 0,
 318		.icmp6_cksum		= 0,
 319		.icmp6_mtu		= htonl(mtu),
 320	};
 321	skb_reset_transport_header(skb);
 322
 323	nip6h = skb_push(skb, sizeof(*nip6h));
 324	*nip6h = (struct ipv6hdr) {
 325		.priority		= 0,
 326		.version		= 6,
 327		.flow_lbl		= { 0 },
 328		.payload_len		= htons(len),
 329		.nexthdr		= IPPROTO_ICMPV6,
 330		.hop_limit		= ip6h->hop_limit,
 331		.saddr			= ip6h->daddr,
 332		.daddr			= ip6h->saddr,
 333	};
 334	skb_reset_network_header(skb);
 335
 336	csum = skb_checksum(skb, skb_transport_offset(skb), len, 0);
 337	icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
 338					      IPPROTO_ICMPV6, csum);
 339
 340	skb->ip_summed = CHECKSUM_NONE;
 341
 342	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
 343	skb_reset_mac_header(skb);
 344
 345	return skb->len;
 346}
 347
 348/**
 349 * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
 350 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 351 * @mtu:	Network MTU for path
 352 *
 353 * Return: 0 for no ICMPv6 reply, length if built, negative value on error.
 354 */
 355static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
 356{
 357	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 358	int stype = ipv6_addr_type(&ip6h->saddr);
 359	u8 proto = ip6h->nexthdr;
 360	__be16 frag_off;
 361	int offset;
 362
 363	if (mtu < IPV6_MIN_MTU)
 364		return 0;
 365
 366	if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
 367	    stype == IPV6_ADDR_LOOPBACK)
 368		return 0;
 369
 370	offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
 371				  &frag_off);
 372	if (offset < 0 || (frag_off & htons(~0x7)))
 373		return 0;
 374
 375	if (proto == IPPROTO_ICMPV6) {
 376		struct icmp6hdr *icmp6h;
 377
 378		if (!pskb_may_pull(skb, skb_network_header(skb) +
 379					offset + 1 - skb->data))
 380			return 0;
 381
 382		icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 383		if (icmpv6_is_err(icmp6h->icmp6_type) ||
 384		    icmp6h->icmp6_type == NDISC_REDIRECT)
 385			return 0;
 386	}
 387
 388	return iptunnel_pmtud_build_icmpv6(skb, mtu);
 389}
 390#endif /* IS_ENABLED(CONFIG_IPV6) */
 391
 392/**
 393 * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
 394 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 395 * @encap_dst:	Destination for tunnel encapsulation (outer IP)
 396 * @headroom:	Encapsulation header size, bytes
 397 * @reply:	Build matching ICMP or ICMPv6 message as a result
 398 *
 399 * L2 tunnel implementations that can carry IP and can be directly bridged
 400 * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
 401 * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
 402 * based on payload and sent back by the encapsulation itself.
 403 *
 404 * For routable interfaces, we just need to update the PMTU for the destination.
 405 *
 406 * Return: 0 if ICMP error not needed, length if built, negative value on error
 407 */
 408int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
 409			  int headroom, bool reply)
 410{
 411	u32 mtu = dst_mtu(encap_dst) - headroom;
 412
 413	if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
 414	    (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu))
 415		return 0;
 416
 417	skb_dst_update_pmtu_no_confirm(skb, mtu);
 418
 419	if (!reply || skb->pkt_type == PACKET_HOST)
 420		return 0;
 421
 422	if (skb->protocol == htons(ETH_P_IP))
 423		return iptunnel_pmtud_check_icmp(skb, mtu);
 424
 425#if IS_ENABLED(CONFIG_IPV6)
 426	if (skb->protocol == htons(ETH_P_IPV6))
 427		return iptunnel_pmtud_check_icmpv6(skb, mtu);
 428#endif
 429	return 0;
 430}
 431EXPORT_SYMBOL(skb_tunnel_check_pmtu);
 432
 433static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 434	[LWTUNNEL_IP_UNSPEC]	= { .strict_start_type = LWTUNNEL_IP_OPTS },
 435	[LWTUNNEL_IP_ID]	= { .type = NLA_U64 },
 436	[LWTUNNEL_IP_DST]	= { .type = NLA_U32 },
 437	[LWTUNNEL_IP_SRC]	= { .type = NLA_U32 },
 438	[LWTUNNEL_IP_TTL]	= { .type = NLA_U8 },
 439	[LWTUNNEL_IP_TOS]	= { .type = NLA_U8 },
 440	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },
 441	[LWTUNNEL_IP_OPTS]	= { .type = NLA_NESTED },
 442};
 443
 444static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
 445	[LWTUNNEL_IP_OPTS_GENEVE]	= { .type = NLA_NESTED },
 446	[LWTUNNEL_IP_OPTS_VXLAN]	= { .type = NLA_NESTED },
 447	[LWTUNNEL_IP_OPTS_ERSPAN]	= { .type = NLA_NESTED },
 448};
 449
 450static const struct nla_policy
 451geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
 452	[LWTUNNEL_IP_OPT_GENEVE_CLASS]	= { .type = NLA_U16 },
 453	[LWTUNNEL_IP_OPT_GENEVE_TYPE]	= { .type = NLA_U8 },
 454	[LWTUNNEL_IP_OPT_GENEVE_DATA]	= { .type = NLA_BINARY, .len = 128 },
 455};
 456
 457static const struct nla_policy
 458vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
 459	[LWTUNNEL_IP_OPT_VXLAN_GBP]	= { .type = NLA_U32 },
 460};
 461
 462static const struct nla_policy
 463erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
 464	[LWTUNNEL_IP_OPT_ERSPAN_VER]	= { .type = NLA_U8 },
 465	[LWTUNNEL_IP_OPT_ERSPAN_INDEX]	= { .type = NLA_U32 },
 466	[LWTUNNEL_IP_OPT_ERSPAN_DIR]	= { .type = NLA_U8 },
 467	[LWTUNNEL_IP_OPT_ERSPAN_HWID]	= { .type = NLA_U8 },
 468};
 469
 470static int ip_tun_parse_opts_geneve(struct nlattr *attr,
 471				    struct ip_tunnel_info *info, int opts_len,
 472				    struct netlink_ext_ack *extack)
 473{
 474	struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
 475	int data_len, err;
 476
 477	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
 478			       geneve_opt_policy, extack);
 479	if (err)
 480		return err;
 481
 482	if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
 483	    !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] ||
 484	    !tb[LWTUNNEL_IP_OPT_GENEVE_DATA])
 485		return -EINVAL;
 486
 487	attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
 488	data_len = nla_len(attr);
 489	if (data_len % 4)
 490		return -EINVAL;
 491
 492	if (info) {
 493		struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len;
 494
 495		memcpy(opt->opt_data, nla_data(attr), data_len);
 496		opt->length = data_len / 4;
 497		attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
 498		opt->opt_class = nla_get_be16(attr);
 499		attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
 500		opt->type = nla_get_u8(attr);
 501		__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
 502	}
 503
 504	return sizeof(struct geneve_opt) + data_len;
 505}
 506
 507static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
 508				   struct ip_tunnel_info *info, int opts_len,
 509				   struct netlink_ext_ack *extack)
 510{
 511	struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
 512	int err;
 513
 514	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
 515			       vxlan_opt_policy, extack);
 516	if (err)
 517		return err;
 518
 519	if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
 520		return -EINVAL;
 521
 522	if (info) {
 523		struct vxlan_metadata *md =
 524			ip_tunnel_info_opts(info) + opts_len;
 525
 526		attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
 527		md->gbp = nla_get_u32(attr);
 528		md->gbp &= VXLAN_GBP_MASK;
 529		__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
 530	}
 531
 532	return sizeof(struct vxlan_metadata);
 533}
 534
 535static int ip_tun_parse_opts_erspan(struct nlattr *attr,
 536				    struct ip_tunnel_info *info, int opts_len,
 537				    struct netlink_ext_ack *extack)
 538{
 539	struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
 540	int err;
 541	u8 ver;
 542
 543	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
 544			       erspan_opt_policy, extack);
 545	if (err)
 546		return err;
 547
 548	if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
 549		return -EINVAL;
 550
 551	ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
 552	if (ver == 1) {
 553		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
 554			return -EINVAL;
 555	} else if (ver == 2) {
 556		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
 557		    !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
 558			return -EINVAL;
 559	} else {
 560		return -EINVAL;
 561	}
 562
 563	if (info) {
 564		struct erspan_metadata *md =
 565			ip_tunnel_info_opts(info) + opts_len;
 566
 567		md->version = ver;
 568		if (ver == 1) {
 569			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
 570			md->u.index = nla_get_be32(attr);
 571		} else {
 572			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
 573			md->u.md2.dir = nla_get_u8(attr);
 574			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
 575			set_hwid(&md->u.md2, nla_get_u8(attr));
 576		}
 577
 578		__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
 579	}
 580
 581	return sizeof(struct erspan_metadata);
 582}
 583
 584static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 585			     struct netlink_ext_ack *extack)
 586{
 587	int err, rem, opt_len, opts_len = 0;
 588	struct nlattr *nla;
 589	u32 type = 0;
 590
 591	if (!attr)
 592		return 0;
 593
 594	err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
 595			   ip_opts_policy, extack);
 596	if (err)
 597		return err;
 598
 599	nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
 600		switch (nla_type(nla)) {
 601		case LWTUNNEL_IP_OPTS_GENEVE:
 602			if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
 603				return -EINVAL;
 604			opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
 605							   extack);
 606			if (opt_len < 0)
 607				return opt_len;
 608			opts_len += opt_len;
 609			if (opts_len > IP_TUNNEL_OPTS_MAX)
 610				return -EINVAL;
 611			type = IP_TUNNEL_GENEVE_OPT_BIT;
 612			break;
 613		case LWTUNNEL_IP_OPTS_VXLAN:
 614			if (type)
 615				return -EINVAL;
 616			opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len,
 617							  extack);
 618			if (opt_len < 0)
 619				return opt_len;
 620			opts_len += opt_len;
 621			type = IP_TUNNEL_VXLAN_OPT_BIT;
 622			break;
 623		case LWTUNNEL_IP_OPTS_ERSPAN:
 624			if (type)
 625				return -EINVAL;
 626			opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len,
 627							   extack);
 628			if (opt_len < 0)
 629				return opt_len;
 630			opts_len += opt_len;
 631			type = IP_TUNNEL_ERSPAN_OPT_BIT;
 632			break;
 633		default:
 634			return -EINVAL;
 635		}
 636	}
 637
 638	return opts_len;
 639}
 640
 641static int ip_tun_get_optlen(struct nlattr *attr,
 642			     struct netlink_ext_ack *extack)
 643{
 644	return ip_tun_parse_opts(attr, NULL, extack);
 645}
 646
 647static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 648			   struct netlink_ext_ack *extack)
 649{
 650	return ip_tun_parse_opts(attr, info, extack);
 651}
 652
static int ip_tun_build_state(struct net *net, struct nlattr *attr,
			      unsigned int family, const void *cfg,
			      struct lwtunnel_state **ts,
			      struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
	struct lwtunnel_state *new_state;
	struct ip_tunnel_info *tun_info;
	int err, opt_len;

	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
					  ip_tun_policy, extack);
	if (err < 0)
		return err;

	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
	if (opt_len < 0)
		return opt_len;

	new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
	if (!new_state)
		return -ENOMEM;

	new_state->type = LWTUNNEL_ENCAP_IP;

	tun_info = lwt_tun_info(new_state);

	err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
	if (err < 0) {
		lwtstate_free(new_state);
		return err;
	}

#ifdef CONFIG_DST_CACHE
	err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
	if (err) {
		lwtstate_free(new_state);
		return err;
	}
#endif

	if (tb[LWTUNNEL_IP_ID])
		tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);

	if (tb[LWTUNNEL_IP_DST])
		tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);

	if (tb[LWTUNNEL_IP_SRC])
		tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);

	if (tb[LWTUNNEL_IP_TTL])
		tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);

	if (tb[LWTUNNEL_IP_TOS])
		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);

	if (tb[LWTUNNEL_IP_FLAGS]) {
		IP_TUNNEL_DECLARE_FLAGS(flags);

		ip_tunnel_flags_from_be16(flags,
					  nla_get_be16(tb[LWTUNNEL_IP_FLAGS]));
		ip_tunnel_clear_options_present(flags);

		ip_tunnel_flags_or(tun_info->key.tun_flags,
				   tun_info->key.tun_flags, flags);
	}

	tun_info->mode = IP_TUNNEL_INFO_TX;
	tun_info->options_len = opt_len;

	*ts = new_state;

	return 0;
}

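/* Undo ip_tun_build_state(); only the dst cache needs explicit cleanup,
 * the state itself is freed by the lwtunnel core.
 */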
static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
{
#ifdef CONFIG_DST_CACHE
	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);

	dst_cache_destroy(&tun_info->dst_cache);
#endif
}

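/* Walk the geneve TLVs stored behind the ip_tunnel_info and emit a
 * CLASS/TYPE/DATA attribute triple for each of them inside a single
 * LWTUNNEL_IP_OPTS_GENEVE nest.
 */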
static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
					 struct ip_tunnel_info *tun_info)
{
	struct geneve_opt *opt;
	struct nlattr *nest;
	int offset = 0;

	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
	if (!nest)
		return -ENOMEM;

	while (tun_info->options_len > offset) {
		opt = ip_tunnel_info_opts(tun_info) + offset;
		if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
				 opt->opt_class) ||
		    nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
		    nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
			    opt->opt_data)) {
			nla_nest_cancel(skb, nest);
			return -ENOMEM;
		}
		offset += sizeof(*opt) + opt->length * 4;
	}

	nla_nest_end(skb, nest);
	return 0;
}

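/* VXLAN metadata is a single 32-bit Group Based Policy value, so the
 * dump is a one-attribute nest.
 */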
static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
					struct ip_tunnel_info *tun_info)
{
	struct vxlan_metadata *md;
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
	if (!nest)
		return -ENOMEM;

	md = ip_tunnel_info_opts(tun_info);
	if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
		nla_nest_cancel(skb, nest);
		return -ENOMEM;
	}

	nla_nest_end(skb, nest);
	return 0;
}

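/* ERSPAN metadata is dumped according to its version: v1 carries an
 * index, v2 a direction and a hardware ID.
 */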
static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
					 struct ip_tunnel_info *tun_info)
{
	struct erspan_metadata *md;
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
	if (!nest)
		return -ENOMEM;

	md = ip_tunnel_info_opts(tun_info);
	if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
		goto err;

	if (md->version == 1 &&
	    nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
		goto err;

	if (md->version == 2 &&
	    (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
	     nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
			get_hwid(&md->u.md2))))
		goto err;

	nla_nest_end(skb, nest);
	return 0;
err:
	nla_nest_cancel(skb, nest);
	return -ENOMEM;
}

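/* Dispatch to the per-type dump helper above. At most one of the
 * GENEVE/VXLAN/ERSPAN option bits can be set, which ip_tun_parse_opts()
 * enforced when the state was built.
 */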
static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
				  struct ip_tunnel_info *tun_info)
{
	struct nlattr *nest;
	int err = 0;

	if (!ip_tunnel_is_options_present(tun_info->key.tun_flags))
		return 0;

	nest = nla_nest_start_noflag(skb, type);
	if (!nest)
		return -ENOMEM;

	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags))
		err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
	else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags))
		err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
	else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
		err = ip_tun_fill_encap_opts_erspan(skb, tun_info);

	if (err) {
		nla_nest_cancel(skb, nest);
		return err;
	}

	nla_nest_end(skb, nest);
	return 0;
}

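/* Dump the complete IPv4 lwtunnel configuration back to user space; the
 * counterpart of ip_tun_build_state().
 */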
static int ip_tun_fill_encap_info(struct sk_buff *skb,
				  struct lwtunnel_state *lwtstate)
{
	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);

	if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id,
			 LWTUNNEL_IP_PAD) ||
	    nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
	    nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
	    nla_put_be16(skb, LWTUNNEL_IP_FLAGS,
			 ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
		return -ENOMEM;

	return 0;
}

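/* Worst-case netlink size of the option attributes emitted by
 * ip_tun_fill_encap_opts(); must be kept in sync with it.
 */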
static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
{
	int opt_len;

	if (!ip_tunnel_is_options_present(info->key.tun_flags))
		return 0;

	opt_len = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
		struct geneve_opt *opt;
		int offset = 0;

		opt_len += nla_total_size(0);	/* LWTUNNEL_IP_OPTS_GENEVE */
		while (info->options_len > offset) {
			opt = ip_tunnel_info_opts(info) + offset;
			opt_len += nla_total_size(2)	/* OPT_GENEVE_CLASS */
				   + nla_total_size(1)	/* OPT_GENEVE_TYPE */
				   + nla_total_size(opt->length * 4);
							/* OPT_GENEVE_DATA */
			offset += sizeof(*opt) + opt->length * 4;
		}
	} else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_VXLAN */
			   + nla_total_size(4);	/* OPT_VXLAN_GBP */
	} else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
		struct erspan_metadata *md = ip_tunnel_info_opts(info);

		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_ERSPAN */
			   + nla_total_size(1)	/* OPT_ERSPAN_VER */
			   + (md->version == 1 ? nla_total_size(4)
						/* OPT_ERSPAN_INDEX (v1) */
					       : nla_total_size(1) +
						 nla_total_size(1));
						/* OPT_ERSPAN_DIR + HWID (v2) */
	}

	return opt_len;
}

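/* Netlink size of a full LWTUNNEL_IP_* dump, matching
 * ip_tun_fill_encap_info().
 */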
static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	return nla_total_size_64bit(8)	/* LWTUNNEL_IP_ID */
		+ nla_total_size(4)	/* LWTUNNEL_IP_DST */
		+ nla_total_size(4)	/* LWTUNNEL_IP_SRC */
		+ nla_total_size(1)	/* LWTUNNEL_IP_TOS */
		+ nla_total_size(1)	/* LWTUNNEL_IP_TTL */
		+ nla_total_size(2)	/* LWTUNNEL_IP_FLAGS */
		+ ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
					/* LWTUNNEL_IP_OPTS */
}

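/* Returns 0 if the two states are equal, non-zero otherwise. The first
 * memcmp() covers the tunnel key and relies on it being the first
 * member of struct ip_tunnel_info.
 */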
static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct ip_tunnel_info *info_a = lwt_tun_info(a);
	struct ip_tunnel_info *info_b = lwt_tun_info(b);

	return memcmp(info_a, info_b, sizeof(info_a->key)) ||
	       info_a->mode != info_b->mode ||
	       info_a->options_len != info_b->options_len ||
	       memcmp(ip_tunnel_info_opts(info_a),
		      ip_tunnel_info_opts(info_b), info_a->options_len);
}

static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
	.build_state = ip_tun_build_state,
	.destroy_state = ip_tun_destroy_state,
	.fill_encap = ip_tun_fill_encap_info,
	.get_encap_size = ip_tun_encap_nlsize,
	.cmp_encap = ip_tun_cmp_encap,
	.owner = THIS_MODULE,
};

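/* From here on the IPv6 flavour of the same lwtunnel encap: identical
 * state layout, but keyed on in6_addr endpoints, hop limit and traffic
 * class instead of TTL and TOS.
 */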
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
	[LWTUNNEL_IP6_UNSPEC]	= { .strict_start_type = LWTUNNEL_IP6_OPTS },
	[LWTUNNEL_IP6_ID]		= { .type = NLA_U64 },
	[LWTUNNEL_IP6_DST]		= { .len = sizeof(struct in6_addr) },
	[LWTUNNEL_IP6_SRC]		= { .len = sizeof(struct in6_addr) },
	[LWTUNNEL_IP6_HOPLIMIT]		= { .type = NLA_U8 },
	[LWTUNNEL_IP6_TC]		= { .type = NLA_U8 },
	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },
	[LWTUNNEL_IP6_OPTS]		= { .type = NLA_NESTED },
};

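/* Like ip_tun_build_state() above, but for LWTUNNEL_ENCAP_IP6. Note
 * that no dst cache is initialized on this path, so the ops below also
 * have no destroy_state hook.
 */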
static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
			       unsigned int family, const void *cfg,
			       struct lwtunnel_state **ts,
			       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
	struct lwtunnel_state *new_state;
	struct ip_tunnel_info *tun_info;
	int err, opt_len;

	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
					  ip6_tun_policy, extack);
	if (err < 0)
		return err;

	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
	if (opt_len < 0)
		return opt_len;

	new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
	if (!new_state)
		return -ENOMEM;

	new_state->type = LWTUNNEL_ENCAP_IP6;

	tun_info = lwt_tun_info(new_state);

	err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
	if (err < 0) {
		lwtstate_free(new_state);
		return err;
	}

	if (tb[LWTUNNEL_IP6_ID])
		tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);

	if (tb[LWTUNNEL_IP6_DST])
		tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);

	if (tb[LWTUNNEL_IP6_SRC])
		tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);

	if (tb[LWTUNNEL_IP6_HOPLIMIT])
		tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);

	if (tb[LWTUNNEL_IP6_TC])
		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);

	if (tb[LWTUNNEL_IP6_FLAGS]) {
		IP_TUNNEL_DECLARE_FLAGS(flags);
		__be16 data;

		data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
		ip_tunnel_flags_from_be16(flags, data);
		ip_tunnel_clear_options_present(flags);

		ip_tunnel_flags_or(tun_info->key.tun_flags,
				   tun_info->key.tun_flags, flags);
	}

	tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
	tun_info->options_len = opt_len;

	*ts = new_state;

	return 0;
}

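/* Dump the IPv6 lwtunnel configuration back to user space; the
 * counterpart of ip6_tun_build_state().
 */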
static int ip6_tun_fill_encap_info(struct sk_buff *skb,
				   struct lwtunnel_state *lwtstate)
{
	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);

	if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id,
			 LWTUNNEL_IP6_PAD) ||
	    nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
	    nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
	    nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
	    nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
	    nla_put_be16(skb, LWTUNNEL_IP6_FLAGS,
			 ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
		return -ENOMEM;

	return 0;
}

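/* Netlink size of a full LWTUNNEL_IP6_* dump, matching
 * ip6_tun_fill_encap_info().
 */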
static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	return nla_total_size_64bit(8)	/* LWTUNNEL_IP6_ID */
		+ nla_total_size(16)	/* LWTUNNEL_IP6_DST */
		+ nla_total_size(16)	/* LWTUNNEL_IP6_SRC */
		+ nla_total_size(1)	/* LWTUNNEL_IP6_HOPLIMIT */
		+ nla_total_size(1)	/* LWTUNNEL_IP6_TC */
		+ nla_total_size(2)	/* LWTUNNEL_IP6_FLAGS */
		+ ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
					/* LWTUNNEL_IP6_OPTS */
}

static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
	.build_state = ip6_tun_build_state,
	.fill_encap = ip6_tun_fill_encap_info,
	.get_encap_size = ip6_tun_encap_nlsize,
	.cmp_encap = ip_tun_cmp_encap,
	.owner = THIS_MODULE,
};

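/* Register both encap families with the lwtunnel core at boot time. */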
void __init ip_tunnel_core_init(void)
{
	/* If you land here, make sure that increasing ip_tunnel_info's
	 * options_len is a reasonable choice given how it is used by the
	 * front ends (e.g. it is part of the flow keys).
	 */
	BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);

	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
}

DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
EXPORT_SYMBOL(ip_tunnel_metadata_cnt);

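/* ip_tunnel_metadata_cnt is a reference-counted static branch: while at
 * least one ip_tunnel_need_metadata() call is outstanding, tunnel
 * drivers that check it collect per-packet metadata even when not
 * explicitly configured for it.
 */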
void ip_tunnel_need_metadata(void)
{
	static_branch_inc(&ip_tunnel_metadata_cnt);
}
EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);

void ip_tunnel_unneed_metadata(void)
{
	static_branch_dec(&ip_tunnel_metadata_cnt);
}
EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);

/* Returns either the correct skb->protocol value, or 0 if invalid. */
__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb)
{
	if (skb_network_header(skb) >= skb->head &&
	    (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) &&
	    ip_hdr(skb)->version == 4)
		return htons(ETH_P_IP);
	if (skb_network_header(skb) >= skb->head &&
	    (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) &&
	    ipv6_hdr(skb)->version == 6)
		return htons(ETH_P_IPV6);
	return 0;
}
EXPORT_SYMBOL(ip_tunnel_parse_protocol);

const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol };
EXPORT_SYMBOL(ip_tunnel_header_ops);

/* Returns true when any ENCAP attribute is present in the netlink message. */
bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
				   struct ip_tunnel_encap *encap)
{
	bool ret = false;

	memset(encap, 0, sizeof(*encap));

	if (!data)
		return ret;

	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
		ret = true;
		encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
	}

	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
		ret = true;
		encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
	}

	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
		ret = true;
		encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
	}

	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
		ret = true;
		encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);

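/* Fill classic (non-metadata) tunnel parameters from the IFLA_IPTUN_*
 * attributes. Note how DF handling is derived: a non-zero TTL forces
 * IP_DF, and IP_DF is also the default unless IFLA_IPTUN_PMTUDISC is
 * present and explicitly zero.
 */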
void ip_tunnel_netlink_parms(struct nlattr *data[],
			     struct ip_tunnel_parm_kern *parms)
{
	if (data[IFLA_IPTUN_LINK])
		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);

	if (data[IFLA_IPTUN_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);

	if (data[IFLA_IPTUN_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);

	if (data[IFLA_IPTUN_TTL]) {
		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
		if (parms->iph.ttl)
			parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_IPTUN_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);

	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);

	if (data[IFLA_IPTUN_FLAGS]) {
		__be16 flags;

		flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
		ip_tunnel_flags_from_be16(parms->i_flags, flags);
	}

	if (data[IFLA_IPTUN_PROTO])
		parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
}
EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);