Linux Audio

Check our new training course

Loading...
v3.15
 
  1/*
  2 * Copyright (c) 2013 Nicira, Inc.
  3 *
  4 * This program is free software; you can redistribute it and/or
  5 * modify it under the terms of version 2 of the GNU General Public
  6 * License as published by the Free Software Foundation.
  7 *
  8 * This program is distributed in the hope that it will be useful, but
  9 * WITHOUT ANY WARRANTY; without even the implied warranty of
 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 11 * General Public License for more details.
 12 *
 13 * You should have received a copy of the GNU General Public License
 14 * along with this program; if not, write to the Free Software
 15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 16 * 02110-1301, USA
 17 */
 18
 19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 20
 21#include <linux/types.h>
 22#include <linux/kernel.h>
 23#include <linux/skbuff.h>
 24#include <linux/netdevice.h>
 25#include <linux/in.h>
 26#include <linux/if_arp.h>
 27#include <linux/mroute.h>
 28#include <linux/init.h>
 29#include <linux/in6.h>
 30#include <linux/inetdevice.h>
 31#include <linux/netfilter_ipv4.h>
 32#include <linux/etherdevice.h>
 33#include <linux/if_ether.h>
 34#include <linux/if_vlan.h>
 
 35
 36#include <net/ip.h>
 37#include <net/icmp.h>
 38#include <net/protocol.h>
 39#include <net/ip_tunnels.h>
 
 
 40#include <net/arp.h>
 41#include <net/checksum.h>
 42#include <net/dsfield.h>
 43#include <net/inet_ecn.h>
 44#include <net/xfrm.h>
 45#include <net/net_namespace.h>
 46#include <net/netns/generic.h>
 47#include <net/rtnetlink.h>
 48
 49int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
 50		  __be32 src, __be32 dst, __u8 proto,
 51		  __u8 tos, __u8 ttl, __be16 df, bool xnet)
 
 
 
 
 
 
 
 
 
 
 
 
 52{
 53	int pkt_len = skb->len;
 
 
 54	struct iphdr *iph;
 55	int err;
 56
 57	skb_scrub_packet(skb, xnet);
 58
 59	skb_clear_hash(skb);
 60	skb_dst_set(skb, &rt->dst);
 61	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
 62
 63	/* Push down and install the IP header. */
 64	skb_push(skb, sizeof(struct iphdr));
 65	skb_reset_network_header(skb);
 66
 67	iph = ip_hdr(skb);
 68
 69	iph->version	=	4;
 70	iph->ihl	=	sizeof(struct iphdr) >> 2;
 71	iph->frag_off	=	df;
 72	iph->protocol	=	proto;
 73	iph->tos	=	tos;
 74	iph->daddr	=	dst;
 75	iph->saddr	=	src;
 76	iph->ttl	=	ttl;
 77	__ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1);
 78
 79	err = ip_local_out_sk(sk, skb);
 80	if (unlikely(net_xmit_eval(err)))
 81		pkt_len = 0;
 82	return pkt_len;
 
 
 
 83}
 84EXPORT_SYMBOL_GPL(iptunnel_xmit);
 85
 86int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
 
 87{
 88	if (unlikely(!pskb_may_pull(skb, hdr_len)))
 89		return -ENOMEM;
 90
 91	skb_pull_rcsum(skb, hdr_len);
 92
 93	if (inner_proto == htons(ETH_P_TEB)) {
 94		struct ethhdr *eh = (struct ethhdr *)skb->data;
 95
 96		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
 97			return -ENOMEM;
 98
 99		if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
 
100			skb->protocol = eh->h_proto;
101		else
102			skb->protocol = htons(ETH_P_802_2);
103
104	} else {
105		skb->protocol = inner_proto;
106	}
107
108	nf_reset(skb);
109	secpath_reset(skb);
110	skb_clear_hash_if_not_l4(skb);
111	skb_dst_drop(skb);
112	skb->vlan_tci = 0;
113	skb_set_queue_mapping(skb, 0);
114	skb->pkt_type = PACKET_HOST;
115	return 0;
 
116}
117EXPORT_SYMBOL_GPL(iptunnel_pull_header);
118
119struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
120					 bool csum_help,
121					 int gso_type_mask)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122{
123	int err;
124
125	if (likely(!skb->encapsulation)) {
126		skb_reset_inner_headers(skb);
127		skb->encapsulation = 1;
128	}
129
130	if (skb_is_gso(skb)) {
131		err = skb_unclone(skb, GFP_ATOMIC);
132		if (unlikely(err))
133			goto error;
134		skb_shinfo(skb)->gso_type |= gso_type_mask;
135		return skb;
136	}
137
138	if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
139		err = skb_checksum_help(skb);
140		if (unlikely(err))
141			goto error;
142	} else if (skb->ip_summed != CHECKSUM_PARTIAL)
143		skb->ip_summed = CHECKSUM_NONE;
 
 
 
 
 
 
 
144
145	return skb;
146error:
147	kfree_skb(skb);
148	return ERR_PTR(err);
149}
150EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
151
152/* Often modified stats are per cpu, other are shared (netdev->stats) */
153struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
154						struct rtnl_link_stats64 *tot)
155{
156	int i;
157
158	for_each_possible_cpu(i) {
159		const struct pcpu_sw_netstats *tstats =
160						   per_cpu_ptr(dev->tstats, i);
161		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
162		unsigned int start;
163
164		do {
165			start = u64_stats_fetch_begin_irq(&tstats->syncp);
166			rx_packets = tstats->rx_packets;
167			tx_packets = tstats->tx_packets;
168			rx_bytes = tstats->rx_bytes;
169			tx_bytes = tstats->tx_bytes;
170		} while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
171
172		tot->rx_packets += rx_packets;
173		tot->tx_packets += tx_packets;
174		tot->rx_bytes   += rx_bytes;
175		tot->tx_bytes   += tx_bytes;
176	}
177
178	tot->multicast = dev->stats.multicast;
179
180	tot->rx_crc_errors = dev->stats.rx_crc_errors;
181	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
182	tot->rx_length_errors = dev->stats.rx_length_errors;
183	tot->rx_frame_errors = dev->stats.rx_frame_errors;
184	tot->rx_errors = dev->stats.rx_errors;
185
186	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
187	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
188	tot->tx_dropped = dev->stats.tx_dropped;
189	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
190	tot->tx_errors = dev->stats.tx_errors;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
192	tot->collisions  = dev->stats.collisions;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
194	return tot;
 
195}
196EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
v6.13.7
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (c) 2013 Nicira, Inc.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
   4 */
   5
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/types.h>
   9#include <linux/kernel.h>
  10#include <linux/skbuff.h>
  11#include <linux/netdevice.h>
  12#include <linux/in.h>
  13#include <linux/if_arp.h>
 
  14#include <linux/init.h>
  15#include <linux/in6.h>
  16#include <linux/inetdevice.h>
  17#include <linux/netfilter_ipv4.h>
  18#include <linux/etherdevice.h>
  19#include <linux/if_ether.h>
  20#include <linux/if_vlan.h>
  21#include <linux/static_key.h>
  22
  23#include <net/ip.h>
  24#include <net/icmp.h>
  25#include <net/protocol.h>
  26#include <net/ip_tunnels.h>
  27#include <net/ip6_tunnel.h>
  28#include <net/ip6_checksum.h>
  29#include <net/arp.h>
  30#include <net/checksum.h>
  31#include <net/dsfield.h>
  32#include <net/inet_ecn.h>
  33#include <net/xfrm.h>
  34#include <net/net_namespace.h>
  35#include <net/netns/generic.h>
  36#include <net/rtnetlink.h>
  37#include <net/dst_metadata.h>
  38#include <net/geneve.h>
  39#include <net/vxlan.h>
  40#include <net/erspan.h>
  41
/*
 * Table of MAX_IPTUN_ENCAP_OPS RCU-annotated pointers to IPv4 tunnel
 * encapsulation operations; exported for use outside this file.
 * NOTE(review): registration and lookup are not visible in this file chunk.
 */
  42const struct ip_tunnel_encap_ops __rcu *
  43		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
  44EXPORT_SYMBOL(iptun_encaps);
  45
/*
 * Same table for IPv6 tunnels (struct ip6_tnl_encap_ops); it is sized by
 * the same MAX_IPTUN_ENCAP_OPS constant as the IPv4 table.
 */
  46const struct ip6_tnl_encap_ops __rcu *
  47		ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
  48EXPORT_SYMBOL(ip6tun_encaps);
  49
  50void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
  51		   __be32 src, __be32 dst, __u8 proto,
  52		   __u8 tos, __u8 ttl, __be16 df, bool xnet)
  53{
  54	int pkt_len = skb->len - skb_inner_network_offset(skb);
  55	struct net *net = dev_net(rt->dst.dev);
  56	struct net_device *dev = skb->dev;
  57	struct iphdr *iph;
  58	int err;
  59
  60	skb_scrub_packet(skb, xnet);
  61
  62	skb_clear_hash_if_not_l4(skb);
  63	skb_dst_set(skb, &rt->dst);
  64	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
  65
  66	/* Push down and install the IP header. */
  67	skb_push(skb, sizeof(struct iphdr));
  68	skb_reset_network_header(skb);
  69
  70	iph = ip_hdr(skb);
  71
  72	iph->version	=	4;
  73	iph->ihl	=	sizeof(struct iphdr) >> 2;
  74	iph->frag_off	=	ip_mtu_locked(&rt->dst) ? 0 : df;
  75	iph->protocol	=	proto;
  76	iph->tos	=	tos;
  77	iph->daddr	=	dst;
  78	iph->saddr	=	src;
  79	iph->ttl	=	ttl;
  80	__ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
  81
  82	err = ip_local_out(net, sk, skb);
  83
  84	if (dev) {
  85		if (unlikely(net_xmit_eval(err)))
  86			pkt_len = 0;
  87		iptunnel_xmit_stats(dev, pkt_len);
  88	}
  89}
  90EXPORT_SYMBOL_GPL(iptunnel_xmit);
  91
  92int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
  93			   __be16 inner_proto, bool raw_proto, bool xnet)
  94{
  95	if (unlikely(!pskb_may_pull(skb, hdr_len)))
  96		return -ENOMEM;
  97
  98	skb_pull_rcsum(skb, hdr_len);
  99
 100	if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
 101		struct ethhdr *eh;
 102
 103		if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
 104			return -ENOMEM;
 105
 106		eh = (struct ethhdr *)skb->data;
 107		if (likely(eth_proto_is_802_3(eh->h_proto)))
 108			skb->protocol = eh->h_proto;
 109		else
 110			skb->protocol = htons(ETH_P_802_2);
 111
 112	} else {
 113		skb->protocol = inner_proto;
 114	}
 115
 
 
 116	skb_clear_hash_if_not_l4(skb);
 117	__vlan_hwaccel_clear_tag(skb);
 
 118	skb_set_queue_mapping(skb, 0);
 119	skb_scrub_packet(skb, xnet);
 120
 121	return iptunnel_pull_offloads(skb);
 122}
 123EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
 124
 125struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 126					     gfp_t flags)
 127{
 128	IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
 129	struct metadata_dst *res;
 130	struct ip_tunnel_info *dst, *src;
 131
 132	if (!md || md->type != METADATA_IP_TUNNEL ||
 133	    md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
 134		return NULL;
 135
 136	src = &md->u.tun_info;
 137	res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
 138	if (!res)
 139		return NULL;
 140
 141	dst = &res->u.tun_info;
 142	dst->key.tun_id = src->key.tun_id;
 143	if (src->mode & IP_TUNNEL_INFO_IPV6)
 144		memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
 145		       sizeof(struct in6_addr));
 146	else
 147		dst->key.u.ipv4.dst = src->key.u.ipv4.src;
 148	ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags);
 149	dst->mode = src->mode | IP_TUNNEL_INFO_TX;
 150	ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
 151				src->options_len, tun_flags);
 152
 153	return res;
 154}
 155EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
 156
 157int iptunnel_handle_offloads(struct sk_buff *skb,
 158			     int gso_type_mask)
 159{
 160	int err;
 161
 162	if (likely(!skb->encapsulation)) {
 163		skb_reset_inner_headers(skb);
 164		skb->encapsulation = 1;
 165	}
 166
 167	if (skb_is_gso(skb)) {
 168		err = skb_header_unclone(skb, GFP_ATOMIC);
 169		if (unlikely(err))
 170			return err;
 171		skb_shinfo(skb)->gso_type |= gso_type_mask;
 172		return 0;
 173	}
 174
 175	if (skb->ip_summed != CHECKSUM_PARTIAL) {
 
 
 
 
 176		skb->ip_summed = CHECKSUM_NONE;
 177		/* We clear encapsulation here to prevent badly-written
 178		 * drivers potentially deciding to offload an inner checksum
 179		 * if we set CHECKSUM_PARTIAL on the outer header.
 180		 * This should go away when the drivers are all fixed.
 181		 */
 182		skb->encapsulation = 0;
 183	}
 184
 185	return 0;
 
 
 
 186}
 187EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
 188
 189/**
 190 * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
 191 * @skb:	Original packet with L2 header
 192 * @mtu:	MTU value for ICMP error
 193 *
 194 * Return: length on success, negative error code if message couldn't be built.
 195 */
 196static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
 197{
 198	const struct iphdr *iph = ip_hdr(skb);
 199	struct icmphdr *icmph;
 200	struct iphdr *niph;
 201	struct ethhdr eh;
 202	int len, err;
 203
 204	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
 205		return -EINVAL;
 206
 207	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
 208	pskb_pull(skb, ETH_HLEN);
 209	skb_reset_network_header(skb);
 210
 211	err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
 212	if (err)
 213		return err;
 214
 215	len = skb->len + sizeof(*icmph);
 216	err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
 217	if (err)
 218		return err;
 219
 220	icmph = skb_push(skb, sizeof(*icmph));
 221	*icmph = (struct icmphdr) {
 222		.type			= ICMP_DEST_UNREACH,
 223		.code			= ICMP_FRAG_NEEDED,
 224		.checksum		= 0,
 225		.un.frag.__unused	= 0,
 226		.un.frag.mtu		= htons(mtu),
 227	};
 228	icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
 229	skb_reset_transport_header(skb);
 230
 231	niph = skb_push(skb, sizeof(*niph));
 232	*niph = (struct iphdr) {
 233		.ihl			= sizeof(*niph) / 4u,
 234		.version 		= 4,
 235		.tos 			= 0,
 236		.tot_len		= htons(len + sizeof(*niph)),
 237		.id			= 0,
 238		.frag_off		= htons(IP_DF),
 239		.ttl			= iph->ttl,
 240		.protocol		= IPPROTO_ICMP,
 241		.saddr			= iph->daddr,
 242		.daddr			= iph->saddr,
 243	};
 244	ip_send_check(niph);
 245	skb_reset_network_header(skb);
 246
 247	skb->ip_summed = CHECKSUM_NONE;
 248
 249	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
 250	skb_reset_mac_header(skb);
 251
 252	return skb->len;
 253}
 254
 255/**
 256 * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
 257 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 258 * @mtu:	Network MTU for path
 259 *
 260 * Return: 0 for no ICMP reply, length if built, negative value on error.
 261 */
 262static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
 263{
 264	const struct icmphdr *icmph = icmp_hdr(skb);
 265	const struct iphdr *iph = ip_hdr(skb);
 266
 267	if (mtu < 576 || iph->frag_off != htons(IP_DF))
 268		return 0;
 269
 270	if (ipv4_is_lbcast(iph->daddr)  || ipv4_is_multicast(iph->daddr) ||
 271	    ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr)  ||
 272	    ipv4_is_lbcast(iph->saddr)  || ipv4_is_multicast(iph->saddr))
 273		return 0;
 274
 275	if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
 276		return 0;
 277
 278	return iptunnel_pmtud_build_icmp(skb, mtu);
 279}
 280
 281#if IS_ENABLED(CONFIG_IPV6)
 282/**
 283 * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
 284 * @skb:	Original packet with L2 header
 285 * @mtu:	MTU value for ICMPv6 error
 286 *
 287 * Return: length on success, negative error code if message couldn't be built.
 288 */
 289static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
 290{
 291	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 292	struct icmp6hdr *icmp6h;
 293	struct ipv6hdr *nip6h;
 294	struct ethhdr eh;
 295	int len, err;
 296	__wsum csum;
 297
 298	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
 299		return -EINVAL;
 300
 301	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
 302	pskb_pull(skb, ETH_HLEN);
 303	skb_reset_network_header(skb);
 304
 305	err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
 306	if (err)
 307		return err;
 308
 309	len = skb->len + sizeof(*icmp6h);
 310	err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
 311	if (err)
 312		return err;
 313
 314	icmp6h = skb_push(skb, sizeof(*icmp6h));
 315	*icmp6h = (struct icmp6hdr) {
 316		.icmp6_type		= ICMPV6_PKT_TOOBIG,
 317		.icmp6_code		= 0,
 318		.icmp6_cksum		= 0,
 319		.icmp6_mtu		= htonl(mtu),
 320	};
 321	skb_reset_transport_header(skb);
 322
 323	nip6h = skb_push(skb, sizeof(*nip6h));
 324	*nip6h = (struct ipv6hdr) {
 325		.priority		= 0,
 326		.version		= 6,
 327		.flow_lbl		= { 0 },
 328		.payload_len		= htons(len),
 329		.nexthdr		= IPPROTO_ICMPV6,
 330		.hop_limit		= ip6h->hop_limit,
 331		.saddr			= ip6h->daddr,
 332		.daddr			= ip6h->saddr,
 333	};
 334	skb_reset_network_header(skb);
 335
 336	csum = skb_checksum(skb, skb_transport_offset(skb), len, 0);
 337	icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
 338					      IPPROTO_ICMPV6, csum);
 339
 340	skb->ip_summed = CHECKSUM_NONE;
 341
 342	eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
 343	skb_reset_mac_header(skb);
 344
 345	return skb->len;
 346}
 347
 348/**
 349 * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
 350 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 351 * @mtu:	Network MTU for path
 352 *
 353 * Return: 0 for no ICMPv6 reply, length if built, negative value on error.
 354 */
 355static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
 356{
 357	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
 358	int stype = ipv6_addr_type(&ip6h->saddr);
 359	u8 proto = ip6h->nexthdr;
 360	__be16 frag_off;
 361	int offset;
 362
 363	if (mtu < IPV6_MIN_MTU)
 364		return 0;
 365
 366	if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
 367	    stype == IPV6_ADDR_LOOPBACK)
 368		return 0;
 369
 370	offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
 371				  &frag_off);
 372	if (offset < 0 || (frag_off & htons(~0x7)))
 373		return 0;
 374
 375	if (proto == IPPROTO_ICMPV6) {
 376		struct icmp6hdr *icmp6h;
 377
 378		if (!pskb_may_pull(skb, skb_network_header(skb) +
 379					offset + 1 - skb->data))
 380			return 0;
 381
 382		icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 383		if (icmpv6_is_err(icmp6h->icmp6_type) ||
 384		    icmp6h->icmp6_type == NDISC_REDIRECT)
 385			return 0;
 386	}
 387
 388	return iptunnel_pmtud_build_icmpv6(skb, mtu);
 389}
 390#endif /* IS_ENABLED(CONFIG_IPV6) */
 391
 392/**
 393 * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
 394 * @skb:	Buffer being sent by encapsulation, L2 headers expected
 395 * @encap_dst:	Destination for tunnel encapsulation (outer IP)
 396 * @headroom:	Encapsulation header size, bytes
 397 * @reply:	Build matching ICMP or ICMPv6 message as a result
 398 *
 399 * L2 tunnel implementations that can carry IP and can be directly bridged
 400 * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
 401 * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
 402 * based on payload and sent back by the encapsulation itself.
 403 *
 404 * For routable interfaces, we just need to update the PMTU for the destination.
 405 *
 406 * Return: 0 if ICMP error not needed, length if built, negative value on error
 407 */
 408int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
 409			  int headroom, bool reply)
 410{
 411	u32 mtu = dst_mtu(encap_dst) - headroom;
 412
 413	if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
 414	    (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu))
 415		return 0;
 416
 417	skb_dst_update_pmtu_no_confirm(skb, mtu);
 418
 419	if (!reply || skb->pkt_type == PACKET_HOST)
 420		return 0;
 421
 422	if (skb->protocol == htons(ETH_P_IP))
 423		return iptunnel_pmtud_check_icmp(skb, mtu);
 424
 425#if IS_ENABLED(CONFIG_IPV6)
 426	if (skb->protocol == htons(ETH_P_IPV6))
 427		return iptunnel_pmtud_check_icmpv6(skb, mtu);
 428#endif
 429	return 0;
 430}
 431EXPORT_SYMBOL(skb_tunnel_check_pmtu);
 432
/*
 * Netlink policy for the top-level LWTUNNEL_IP_* attributes; used by
 * ip_tun_build_state() when parsing the encap attribute.
 */
 433static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 434	[LWTUNNEL_IP_UNSPEC]	= { .strict_start_type = LWTUNNEL_IP_OPTS },
 435	[LWTUNNEL_IP_ID]	= { .type = NLA_U64 },
 436	[LWTUNNEL_IP_DST]	= { .type = NLA_U32 },
 437	[LWTUNNEL_IP_SRC]	= { .type = NLA_U32 },
 438	[LWTUNNEL_IP_TTL]	= { .type = NLA_U8 },
 439	[LWTUNNEL_IP_TOS]	= { .type = NLA_U8 },
 440	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },
 441	[LWTUNNEL_IP_OPTS]	= { .type = NLA_NESTED },
 442};
 443
/*
 * Policy for the children of LWTUNNEL_IP_OPTS: one nested attribute per
 * supported option family. Used by ip_tun_parse_opts().
 */
 444static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
 445	[LWTUNNEL_IP_OPTS_GENEVE]	= { .type = NLA_NESTED },
 446	[LWTUNNEL_IP_OPTS_VXLAN]	= { .type = NLA_NESTED },
 447	[LWTUNNEL_IP_OPTS_ERSPAN]	= { .type = NLA_NESTED },
 448};
 449
/*
 * Policy for one GENEVE option; the data attribute is limited to 128 bytes.
 * Used by ip_tun_parse_opts_geneve().
 */
 450static const struct nla_policy
 451geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
 452	[LWTUNNEL_IP_OPT_GENEVE_CLASS]	= { .type = NLA_U16 },
 453	[LWTUNNEL_IP_OPT_GENEVE_TYPE]	= { .type = NLA_U8 },
 454	[LWTUNNEL_IP_OPT_GENEVE_DATA]	= { .type = NLA_BINARY, .len = 128 },
 455};
 456
/* Policy for the VXLAN option; used by ip_tun_parse_opts_vxlan(). */
 457static const struct nla_policy
 458vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
 459	[LWTUNNEL_IP_OPT_VXLAN_GBP]	= { .type = NLA_U32 },
 460};
 461
/* Policy for the ERSPAN option; used by ip_tun_parse_opts_erspan(). */
 462static const struct nla_policy
 463erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
 464	[LWTUNNEL_IP_OPT_ERSPAN_VER]	= { .type = NLA_U8 },
 465	[LWTUNNEL_IP_OPT_ERSPAN_INDEX]	= { .type = NLA_U32 },
 466	[LWTUNNEL_IP_OPT_ERSPAN_DIR]	= { .type = NLA_U8 },
 467	[LWTUNNEL_IP_OPT_ERSPAN_HWID]	= { .type = NLA_U8 },
 468};
 469
 470static int ip_tun_parse_opts_geneve(struct nlattr *attr,
 471				    struct ip_tunnel_info *info, int opts_len,
 472				    struct netlink_ext_ack *extack)
 473{
 474	struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
 475	int data_len, err;
 476
 477	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
 478			       geneve_opt_policy, extack);
 479	if (err)
 480		return err;
 481
 482	if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
 483	    !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] ||
 484	    !tb[LWTUNNEL_IP_OPT_GENEVE_DATA])
 485		return -EINVAL;
 486
 487	attr = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
 488	data_len = nla_len(attr);
 489	if (data_len % 4)
 490		return -EINVAL;
 491
 492	if (info) {
 493		struct geneve_opt *opt = ip_tunnel_info_opts(info) + opts_len;
 494
 495		memcpy(opt->opt_data, nla_data(attr), data_len);
 496		opt->length = data_len / 4;
 497		attr = tb[LWTUNNEL_IP_OPT_GENEVE_CLASS];
 498		opt->opt_class = nla_get_be16(attr);
 499		attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
 500		opt->type = nla_get_u8(attr);
 501		__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
 502	}
 503
 504	return sizeof(struct geneve_opt) + data_len;
 505}
 506
 507static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
 508				   struct ip_tunnel_info *info, int opts_len,
 509				   struct netlink_ext_ack *extack)
 510{
 511	struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
 512	int err;
 513
 514	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
 515			       vxlan_opt_policy, extack);
 516	if (err)
 517		return err;
 518
 519	if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
 520		return -EINVAL;
 521
 522	if (info) {
 523		struct vxlan_metadata *md =
 524			ip_tunnel_info_opts(info) + opts_len;
 525
 526		attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
 527		md->gbp = nla_get_u32(attr);
 528		md->gbp &= VXLAN_GBP_MASK;
 529		__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
 530	}
 531
 532	return sizeof(struct vxlan_metadata);
 533}
 534
 535static int ip_tun_parse_opts_erspan(struct nlattr *attr,
 536				    struct ip_tunnel_info *info, int opts_len,
 537				    struct netlink_ext_ack *extack)
 538{
 539	struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
 540	int err;
 541	u8 ver;
 542
 543	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
 544			       erspan_opt_policy, extack);
 545	if (err)
 546		return err;
 547
 548	if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
 549		return -EINVAL;
 550
 551	ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
 552	if (ver == 1) {
 553		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
 554			return -EINVAL;
 555	} else if (ver == 2) {
 556		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
 557		    !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
 558			return -EINVAL;
 559	} else {
 560		return -EINVAL;
 561	}
 562
 563	if (info) {
 564		struct erspan_metadata *md =
 565			ip_tunnel_info_opts(info) + opts_len;
 566
 567		md->version = ver;
 568		if (ver == 1) {
 569			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX];
 570			md->u.index = nla_get_be32(attr);
 571		} else {
 572			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_DIR];
 573			md->u.md2.dir = nla_get_u8(attr);
 574			attr = tb[LWTUNNEL_IP_OPT_ERSPAN_HWID];
 575			set_hwid(&md->u.md2, nla_get_u8(attr));
 576		}
 577
 578		__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
 579	}
 580
 581	return sizeof(struct erspan_metadata);
 582}
 583
/*
 * Walk the children of a LWTUNNEL_IP_OPTS attribute and parse each option.
 *
 * @attr:   the LWTUNNEL_IP_OPTS attribute, may be NULL (no options)
 * @info:   tunnel info to fill, or NULL to only validate and size; the
 *          per-family helpers skip all writes when it is NULL
 * @extack: extended ack for validation errors
 *
 * Several GENEVE options may follow one another, but option families cannot
 * be mixed, and VXLAN or ERSPAN must be the only option present.
 *
 * Returns the total number of option bytes, or a negative error code.
 */
 584static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 585			     struct netlink_ext_ack *extack)
 586{
 587	int err, rem, opt_len, opts_len = 0;
 588	struct nlattr *nla;
	/* Option family seen so far; 0 means none yet. */
 589	u32 type = 0;
 590
 591	if (!attr)
 592		return 0;
 593
 594	err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
 595			   ip_opts_policy, extack);
 596	if (err)
 597		return err;
 598
 599	nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
 600		switch (nla_type(nla)) {
 601		case LWTUNNEL_IP_OPTS_GENEVE:
			/* GENEVE may repeat, but only after other GENEVE options. */
 602			if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
 603				return -EINVAL;
 604			opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
 605							   extack);
 606			if (opt_len < 0)
 607				return opt_len;
 608			opts_len += opt_len;
			/* Only this case can accumulate, so only it checks the cap. */
 609			if (opts_len > IP_TUNNEL_OPTS_MAX)
 610				return -EINVAL;
 611			type = IP_TUNNEL_GENEVE_OPT_BIT;
 612			break;
 613		case LWTUNNEL_IP_OPTS_VXLAN:
			/* Must be the first and only option. */
 614			if (type)
 615				return -EINVAL;
 616			opt_len = ip_tun_parse_opts_vxlan(nla, info, opts_len,
 617							  extack);
 618			if (opt_len < 0)
 619				return opt_len;
 620			opts_len += opt_len;
 621			type = IP_TUNNEL_VXLAN_OPT_BIT;
 622			break;
 623		case LWTUNNEL_IP_OPTS_ERSPAN:
			/* Must be the first and only option. */
 624			if (type)
 625				return -EINVAL;
 626			opt_len = ip_tun_parse_opts_erspan(nla, info, opts_len,
 627							   extack);
 628			if (opt_len < 0)
 629				return opt_len;
 630			opts_len += opt_len;
 631			type = IP_TUNNEL_ERSPAN_OPT_BIT;
 632			break;
 633		default:
 634			return -EINVAL;
 635		}
 636	}
 637
 638	return opts_len;
 639}
 640
 641static int ip_tun_get_optlen(struct nlattr *attr,
 642			     struct netlink_ext_ack *extack)
 643{
 644	return ip_tun_parse_opts(attr, NULL, extack);
 645}
 646
 647static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 648			   struct netlink_ext_ack *extack)
 649{
 650	return ip_tun_parse_opts(attr, info, extack);
 651}
 652
/*
 * Build the lightweight-tunnel state for an IPv4 tunnel encap from its
 * netlink attributes.
 *
 * @net, @family, @cfg: not used by this function
 * @attr:   nested attribute holding the LWTUNNEL_IP_* attributes
 * @ts:     on success, receives the newly allocated state
 * @extack: extended ack for parse errors
 *
 * Options are parsed twice: once without a destination to learn their size
 * (so the state can be allocated with room for them), then again to store
 * them in the allocated tunnel info.
 *
 * Returns 0 on success or a negative error code; on failure after the
 * allocation the state is released with lwtstate_free().
 */
 653static int ip_tun_build_state(struct net *net, struct nlattr *attr,
 654			      unsigned int family, const void *cfg,
 655			      struct lwtunnel_state **ts,
 656			      struct netlink_ext_ack *extack)
 657{
 658	struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
 659	struct lwtunnel_state *new_state;
 660	struct ip_tunnel_info *tun_info;
 661	int err, opt_len;
 662
 663	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
 664					  ip_tun_policy, extack);
 665	if (err < 0)
 666		return err;
 667
	/* Pass 1: validate the options and learn how many bytes they need. */
 668	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
 669	if (opt_len < 0)
 670		return opt_len;
 671
 672	new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
 673	if (!new_state)
 674		return -ENOMEM;
 675
 676	new_state->type = LWTUNNEL_ENCAP_IP;
 677
 678	tun_info = lwt_tun_info(new_state);
 679
	/* Pass 2: store the options right behind the tunnel info. */
 680	err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
 681	if (err < 0) {
 682		lwtstate_free(new_state);
 683		return err;
 684	}
 685
 686#ifdef CONFIG_DST_CACHE
 687	err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
 688	if (err) {
 689		lwtstate_free(new_state);
 690		return err;
 691	}
 692#endif
 693
	/* Every key attribute is optional; absent ones are left untouched. */
 694	if (tb[LWTUNNEL_IP_ID])
 695		tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
 696
 697	if (tb[LWTUNNEL_IP_DST])
 698		tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
 699
 700	if (tb[LWTUNNEL_IP_SRC])
 701		tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
 702
 703	if (tb[LWTUNNEL_IP_TTL])
 704		tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
 705
 706	if (tb[LWTUNNEL_IP_TOS])
 707		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
 708
 709	if (tb[LWTUNNEL_IP_FLAGS]) {
 710		IP_TUNNEL_DECLARE_FLAGS(flags);
 711
 712		ip_tunnel_flags_from_be16(flags,
 713					  nla_get_be16(tb[LWTUNNEL_IP_FLAGS]));
		/*
		 * Option-present bits are cleared from the user-supplied
		 * flags before merging; the option parsers above set the
		 * bit for whatever options were actually stored.
		 */
 714		ip_tunnel_clear_options_present(flags);
 715
 716		ip_tunnel_flags_or(tun_info->key.tun_flags,
 717				   tun_info->key.tun_flags, flags);
 718	}
 719
 720	tun_info->mode = IP_TUNNEL_INFO_TX;
 721	tun_info->options_len = opt_len;
 722
 723	*ts = new_state;
 724
 725	return 0;
 726}
 727
 728static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
 729{
 730#ifdef CONFIG_DST_CACHE
 731	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
 732
 733	dst_cache_destroy(&tun_info->dst_cache);
 734#endif
 735}
 736
 737static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
 738					 struct ip_tunnel_info *tun_info)
 739{
 740	struct geneve_opt *opt;
 741	struct nlattr *nest;
 742	int offset = 0;
 743
 744	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
 745	if (!nest)
 746		return -ENOMEM;
 747
 748	while (tun_info->options_len > offset) {
 749		opt = ip_tunnel_info_opts(tun_info) + offset;
 750		if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
 751				 opt->opt_class) ||
 752		    nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type) ||
 753		    nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
 754			    opt->opt_data)) {
 755			nla_nest_cancel(skb, nest);
 756			return -ENOMEM;
 757		}
 758		offset += sizeof(*opt) + opt->length * 4;
 759	}
 760
 761	nla_nest_end(skb, nest);
 762	return 0;
 763}
 764
 765static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
 766					struct ip_tunnel_info *tun_info)
 767{
 768	struct vxlan_metadata *md;
 769	struct nlattr *nest;
 770
 771	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
 772	if (!nest)
 773		return -ENOMEM;
 774
 775	md = ip_tunnel_info_opts(tun_info);
 776	if (nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
 777		nla_nest_cancel(skb, nest);
 778		return -ENOMEM;
 779	}
 780
 781	nla_nest_end(skb, nest);
 782	return 0;
 783}
 784
 785static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
 786					 struct ip_tunnel_info *tun_info)
 787{
 788	struct erspan_metadata *md;
 789	struct nlattr *nest;
 790
 791	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
 792	if (!nest)
 793		return -ENOMEM;
 794
 795	md = ip_tunnel_info_opts(tun_info);
 796	if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
 797		goto err;
 798
 799	if (md->version == 1 &&
 800	    nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX, md->u.index))
 801		goto err;
 802
 803	if (md->version == 2 &&
 804	    (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR, md->u.md2.dir) ||
 805	     nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
 806			get_hwid(&md->u.md2))))
 807		goto err;
 808
 809	nla_nest_end(skb, nest);
 810	return 0;
 811err:
 812	nla_nest_cancel(skb, nest);
 813	return -ENOMEM;
 814}
 815
 816static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
 817				  struct ip_tunnel_info *tun_info)
 818{
 819	struct nlattr *nest;
 820	int err = 0;
 821
 822	if (!ip_tunnel_is_options_present(tun_info->key.tun_flags))
 823		return 0;
 824
 825	nest = nla_nest_start_noflag(skb, type);
 826	if (!nest)
 827		return -ENOMEM;
 828
 829	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags))
 830		err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
 831	else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags))
 832		err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
 833	else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
 834		err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
 835
 836	if (err) {
 837		nla_nest_cancel(skb, nest);
 838		return err;
 839	}
 840
 841	nla_nest_end(skb, nest);
 842	return 0;
 843}
 844
 845static int ip_tun_fill_encap_info(struct sk_buff *skb,
 846				  struct lwtunnel_state *lwtstate)
 847{
 848	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
 849
 850	if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id,
 851			 LWTUNNEL_IP_PAD) ||
 852	    nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
 853	    nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
 854	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
 855	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
 856	    nla_put_be16(skb, LWTUNNEL_IP_FLAGS,
 857			 ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
 858	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
 859		return -ENOMEM;
 860
 861	return 0;
 862}
 863
 864static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
 865{
 866	int opt_len;
 867
 868	if (!ip_tunnel_is_options_present(info->key.tun_flags))
 869		return 0;
 870
 871	opt_len = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
 872	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
 873		struct geneve_opt *opt;
 874		int offset = 0;
 875
 876		opt_len += nla_total_size(0);	/* LWTUNNEL_IP_OPTS_GENEVE */
 877		while (info->options_len > offset) {
 878			opt = ip_tunnel_info_opts(info) + offset;
 879			opt_len += nla_total_size(2)	/* OPT_GENEVE_CLASS */
 880				   + nla_total_size(1)	/* OPT_GENEVE_TYPE */
 881				   + nla_total_size(opt->length * 4);
 882							/* OPT_GENEVE_DATA */
 883			offset += sizeof(*opt) + opt->length * 4;
 884		}
 885	} else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
 886		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_VXLAN */
 887			   + nla_total_size(4);	/* OPT_VXLAN_GBP */
 888	} else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
 889		struct erspan_metadata *md = ip_tunnel_info_opts(info);
 890
 891		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_ERSPAN */
 892			   + nla_total_size(1)	/* OPT_ERSPAN_VER */
 893			   + (md->version == 1 ? nla_total_size(4)
 894						/* OPT_ERSPAN_INDEX (v1) */
 895					       : nla_total_size(1) +
 896						 nla_total_size(1));
 897						/* OPT_ERSPAN_DIR + HWID (v2) */
 898	}
 899
 900	return opt_len;
 901}
 902
 903static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
 904{
 905	return nla_total_size_64bit(8)	/* LWTUNNEL_IP_ID */
 906		+ nla_total_size(4)	/* LWTUNNEL_IP_DST */
 907		+ nla_total_size(4)	/* LWTUNNEL_IP_SRC */
 908		+ nla_total_size(1)	/* LWTUNNEL_IP_TOS */
 909		+ nla_total_size(1)	/* LWTUNNEL_IP_TTL */
 910		+ nla_total_size(2)	/* LWTUNNEL_IP_FLAGS */
 911		+ ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
 912					/* LWTUNNEL_IP_OPTS */
 913}
 914
 915static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
 916{
 917	struct ip_tunnel_info *info_a = lwt_tun_info(a);
 918	struct ip_tunnel_info *info_b = lwt_tun_info(b);
 919
 920	return memcmp(info_a, info_b, sizeof(info_a->key)) ||
 921	       info_a->mode != info_b->mode ||
 922	       info_a->options_len != info_b->options_len ||
 923	       memcmp(ip_tunnel_info_opts(info_a),
 924		      ip_tunnel_info_opts(info_b), info_a->options_len);
 925}
 926
 927static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
 928	.build_state = ip_tun_build_state,
 929	.destroy_state = ip_tun_destroy_state,
 930	.fill_encap = ip_tun_fill_encap_info,
 931	.get_encap_size = ip_tun_encap_nlsize,
 932	.cmp_encap = ip_tun_cmp_encap,
 933	.owner = THIS_MODULE,
 934};
 935
 936static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
 937	[LWTUNNEL_IP6_UNSPEC]	= { .strict_start_type = LWTUNNEL_IP6_OPTS },
 938	[LWTUNNEL_IP6_ID]		= { .type = NLA_U64 },
 939	[LWTUNNEL_IP6_DST]		= { .len = sizeof(struct in6_addr) },
 940	[LWTUNNEL_IP6_SRC]		= { .len = sizeof(struct in6_addr) },
 941	[LWTUNNEL_IP6_HOPLIMIT]		= { .type = NLA_U8 },
 942	[LWTUNNEL_IP6_TC]		= { .type = NLA_U8 },
 943	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },
 944	[LWTUNNEL_IP6_OPTS]		= { .type = NLA_NESTED },
 945};
 946
 947static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
 948			       unsigned int family, const void *cfg,
 949			       struct lwtunnel_state **ts,
 950			       struct netlink_ext_ack *extack)
 951{
 952	struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
 953	struct lwtunnel_state *new_state;
 954	struct ip_tunnel_info *tun_info;
 955	int err, opt_len;
 956
 957	err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
 958					  ip6_tun_policy, extack);
 959	if (err < 0)
 960		return err;
 961
 962	opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
 963	if (opt_len < 0)
 964		return opt_len;
 965
 966	new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
 967	if (!new_state)
 968		return -ENOMEM;
 969
 970	new_state->type = LWTUNNEL_ENCAP_IP6;
 971
 972	tun_info = lwt_tun_info(new_state);
 973
 974	err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
 975	if (err < 0) {
 976		lwtstate_free(new_state);
 977		return err;
 978	}
 979
 980	if (tb[LWTUNNEL_IP6_ID])
 981		tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
 982
 983	if (tb[LWTUNNEL_IP6_DST])
 984		tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
 985
 986	if (tb[LWTUNNEL_IP6_SRC])
 987		tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]);
 988
 989	if (tb[LWTUNNEL_IP6_HOPLIMIT])
 990		tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]);
 991
 992	if (tb[LWTUNNEL_IP6_TC])
 993		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
 994
 995	if (tb[LWTUNNEL_IP6_FLAGS]) {
 996		IP_TUNNEL_DECLARE_FLAGS(flags);
 997		__be16 data;
 998
 999		data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
1000		ip_tunnel_flags_from_be16(flags, data);
1001		ip_tunnel_clear_options_present(flags);
1002
1003		ip_tunnel_flags_or(tun_info->key.tun_flags,
1004				   tun_info->key.tun_flags, flags);
1005	}
1006
1007	tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
1008	tun_info->options_len = opt_len;
1009
1010	*ts = new_state;
1011
1012	return 0;
1013}
1014
1015static int ip6_tun_fill_encap_info(struct sk_buff *skb,
1016				   struct lwtunnel_state *lwtstate)
1017{
1018	struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
1019
1020	if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id,
1021			 LWTUNNEL_IP6_PAD) ||
1022	    nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
1023	    nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
1024	    nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
1025	    nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
1026	    nla_put_be16(skb, LWTUNNEL_IP6_FLAGS,
1027			 ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
1028	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
1029		return -ENOMEM;
1030
1031	return 0;
1032}
1033
/*
 * lwtunnel get_encap_size callback for IPv6 tunnels: netlink space needed
 * by ip6_tun_fill_encap_info() for @lwtstate.
 */
static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int size;

	size  = nla_total_size_64bit(8);	/* LWTUNNEL_IP6_ID */
	size += nla_total_size(16);		/* LWTUNNEL_IP6_DST */
	size += nla_total_size(16);		/* LWTUNNEL_IP6_SRC */
	size += nla_total_size(1);		/* LWTUNNEL_IP6_HOPLIMIT */
	size += nla_total_size(1);		/* LWTUNNEL_IP6_TC */
	size += nla_total_size(2);		/* LWTUNNEL_IP6_FLAGS */
	/* LWTUNNEL_IP6_OPTS */
	size += ip_tun_opts_nlsize(lwt_tun_info(lwtstate));

	return size;
}
1045
1046static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
1047	.build_state = ip6_tun_build_state,
1048	.fill_encap = ip6_tun_fill_encap_info,
1049	.get_encap_size = ip6_tun_encap_nlsize,
1050	.cmp_encap = ip_tun_cmp_encap,
1051	.owner = THIS_MODULE,
1052};
1053
1054void __init ip_tunnel_core_init(void)
1055{
1056	/* If you land here, make sure whether increasing ip_tunnel_info's
1057	 * options_len is a reasonable choice with its usage in front ends
1058	 * (f.e., it's part of flow keys, etc).
1059	 */
1060	BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
1061
1062	lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
1063	lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
1064}
1065
/*
 * Static key, initially false, that is incremented by
 * ip_tunnel_need_metadata() and decremented by ip_tunnel_unneed_metadata().
 */
DEFINE_STATIC_KEY_FALSE(ip_tunnel_metadata_cnt);
EXPORT_SYMBOL(ip_tunnel_metadata_cnt);
1068
1069void ip_tunnel_need_metadata(void)
1070{
1071	static_branch_inc(&ip_tunnel_metadata_cnt);
1072}
1073EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata);
1074
1075void ip_tunnel_unneed_metadata(void)
1076{
1077	static_branch_dec(&ip_tunnel_metadata_cnt);
1078}
1079EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
1080
1081/* Returns either the correct skb->protocol value, or 0 if invalid. */
1082__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb)
1083{
1084	if (skb_network_header(skb) >= skb->head &&
1085	    (skb_network_header(skb) + sizeof(struct iphdr)) <= skb_tail_pointer(skb) &&
1086	    ip_hdr(skb)->version == 4)
1087		return htons(ETH_P_IP);
1088	if (skb_network_header(skb) >= skb->head &&
1089	    (skb_network_header(skb) + sizeof(struct ipv6hdr)) <= skb_tail_pointer(skb) &&
1090	    ipv6_hdr(skb)->version == 6)
1091		return htons(ETH_P_IPV6);
1092	return 0;
1093}
1094EXPORT_SYMBOL(ip_tunnel_parse_protocol);
1095
1096const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol };
1097EXPORT_SYMBOL(ip_tunnel_header_ops);
1098
1099/* This function returns true when ENCAP attributes are present in the nl msg */
1100bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
1101				   struct ip_tunnel_encap *encap)
1102{
1103	bool ret = false;
1104
1105	memset(encap, 0, sizeof(*encap));
1106
1107	if (!data)
1108		return ret;
1109
1110	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
1111		ret = true;
1112		encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
1113	}
1114
1115	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
1116		ret = true;
1117		encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
1118	}
1119
1120	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
1121		ret = true;
1122		encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
1123	}
1124
1125	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
1126		ret = true;
1127		encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
1128	}
1129
1130	return ret;
1131}
1132EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
1133
1134void ip_tunnel_netlink_parms(struct nlattr *data[],
1135			     struct ip_tunnel_parm_kern *parms)
1136{
1137	if (data[IFLA_IPTUN_LINK])
1138		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
1139
1140	if (data[IFLA_IPTUN_LOCAL])
1141		parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
1142
1143	if (data[IFLA_IPTUN_REMOTE])
1144		parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
1145
1146	if (data[IFLA_IPTUN_TTL]) {
1147		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
1148		if (parms->iph.ttl)
1149			parms->iph.frag_off = htons(IP_DF);
1150	}
1151
1152	if (data[IFLA_IPTUN_TOS])
1153		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
1154
1155	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
1156		parms->iph.frag_off = htons(IP_DF);
1157
1158	if (data[IFLA_IPTUN_FLAGS]) {
1159		__be16 flags;
1160
1161		flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
1162		ip_tunnel_flags_from_be16(parms->i_flags, flags);
1163	}
1164
1165	if (data[IFLA_IPTUN_PROTO])
1166		parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
1167}
1168EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);