   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *	IPv6 output functions
   4 *	Linux INET6 implementation
   5 *
   6 *	Authors:
   7 *	Pedro Roque		<roque@di.fc.ul.pt>
   8 *
   9 *	Based on linux/net/ipv4/ip_output.c
  10 *
  11 *	Changes:
  12 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
  13 *				extension headers are implemented.
  14 *				route changes now work.
  15 *				ip6_forward does not confuse sniffers.
  16 *				etc.
  17 *
  18 *      H. von Brand    :       Added missing #include <linux/string.h>
  19 *	Imran Patel	:	frag id should be in NBO
  20 *      Kazunori MIYAZAWA @USAGI
  21 *			:       add ip6_append_data and related functions
  22 *				for datagram xmit
  23 */
  24
  25#include <linux/errno.h>
  26#include <linux/kernel.h>
  27#include <linux/string.h>
  28#include <linux/socket.h>
  29#include <linux/net.h>
  30#include <linux/netdevice.h>
  31#include <linux/if_arp.h>
  32#include <linux/in6.h>
  33#include <linux/tcp.h>
  34#include <linux/route.h>
  35#include <linux/module.h>
  36#include <linux/slab.h>
  37
  38#include <linux/bpf-cgroup.h>
  39#include <linux/netfilter.h>
  40#include <linux/netfilter_ipv6.h>
  41
  42#include <net/sock.h>
  43#include <net/snmp.h>
  44
  45#include <net/ipv6.h>
  46#include <net/ndisc.h>
  47#include <net/protocol.h>
  48#include <net/ip6_route.h>
  49#include <net/addrconf.h>
  50#include <net/rawv6.h>
  51#include <net/icmp.h>
  52#include <net/xfrm.h>
  53#include <net/checksum.h>
  54#include <linux/mroute6.h>
  55#include <net/l3mdev.h>
  56#include <net/lwtunnel.h>
  57#include <net/ip_tunnels.h>
  58
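/*
 * Transmit-side overview: both locally generated packets (ip6_xmit(),
 * the append_data path) and forwarded ones reach ip6_output() through
 * dst_output(); from there the NF_INET_POST_ROUTING hook leads to
 * ip6_finish_output(), which fragments if necessary and finally hands
 * the skb to ip6_finish_output2() below, where the L2 neighbour is
 * resolved and the frame is passed to the device via neigh_output().
 */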
  59static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  60{
  61	struct dst_entry *dst = skb_dst(skb);
  62	struct net_device *dev = dst->dev;
  63	unsigned int hh_len = LL_RESERVED_SPACE(dev);
  64	int delta = hh_len - skb_headroom(skb);
  65	const struct in6_addr *nexthop;
  66	struct neighbour *neigh;
  67	int ret;
  68
  69	/* Be paranoid, rather than too clever. */
  70	if (unlikely(delta > 0) && dev->header_ops) {
  71		/* pskb_expand_head() might crash, if skb is shared */
  72		if (skb_shared(skb)) {
  73			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
  74
  75			if (likely(nskb)) {
  76				if (skb->sk)
  77					skb_set_owner_w(nskb, skb->sk);
  78				consume_skb(skb);
  79			} else {
  80				kfree_skb(skb);
  81			}
  82			skb = nskb;
  83		}
  84		if (skb &&
  85		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
  86			kfree_skb(skb);
  87			skb = NULL;
  88		}
  89		if (!skb) {
  90			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
  91			return -ENOMEM;
  92		}
  93	}
  94
  95	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  96		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  97
  98		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  99		    ((mroute6_is_socket(net, skb) &&
 100		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 101		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 102					 &ipv6_hdr(skb)->saddr))) {
 103			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 104
 105			/* Do not check for IFF_ALLMULTI; multicast routing
 106			   is not supported in any case.
 107			 */
 108			if (newskb)
 109				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 110					net, sk, newskb, NULL, newskb->dev,
 111					dev_loopback_xmit);
 112
 113			if (ipv6_hdr(skb)->hop_limit == 0) {
 114				IP6_INC_STATS(net, idev,
 115					      IPSTATS_MIB_OUTDISCARDS);
 116				kfree_skb(skb);
 117				return 0;
 118			}
 119		}
 120
 121		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
 122
 123		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
 124		    IPV6_ADDR_SCOPE_NODELOCAL &&
 125		    !(dev->flags & IFF_LOOPBACK)) {
 126			kfree_skb(skb);
 127			return 0;
 128		}
 129	}
 130
 131	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 132		int res = lwtunnel_xmit(skb);
 133
 134		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 135			return res;
 136	}
 137
 138	rcu_read_lock_bh();
 139	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 140	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 141	if (unlikely(!neigh))
 142		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 143	if (!IS_ERR(neigh)) {
 144		sock_confirm_neigh(skb, neigh);
 145		ret = neigh_output(neigh, skb, false);
 146		rcu_read_unlock_bh();
 147		return ret;
 148	}
 149	rcu_read_unlock_bh();
 150
 151	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 152	kfree_skb(skb);
 153	return -EINVAL;
 154}
 155
 156static int
 157ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
 158				    struct sk_buff *skb, unsigned int mtu)
 159{
 160	struct sk_buff *segs, *nskb;
 161	netdev_features_t features;
 162	int ret = 0;
 163
 164	/* Please see corresponding comment in ip_finish_output_gso
 165	 * describing the cases where GSO segment length exceeds the
 166	 * egress MTU.
 167	 */
 168	features = netif_skb_features(skb);
 169	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 170	if (IS_ERR_OR_NULL(segs)) {
 171		kfree_skb(skb);
 172		return -ENOMEM;
 173	}
 174
 175	consume_skb(skb);
 176
 177	skb_list_walk_safe(segs, segs, nskb) {
 178		int err;
 179
 180		skb_mark_not_on_list(segs);
 181		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
 182		if (err && ret == 0)
 183			ret = err;
 184	}
 185
 186	return ret;
 187}
 188
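/*
 * __ip6_finish_output() picks a path: resubmit to dst_output() when an
 * xfrm policy lookup after SNAT attached a new transform; segment and
 * fragment oversized GSO packets via the slowpath helper above;
 * fragment plain packets that exceed the MTU, a dst_allfrag()
 * destination, or a conntrack-recorded frag_max_size; otherwise
 * transmit directly through ip6_finish_output2().
 */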
 189static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 190{
 191	unsigned int mtu;
 192
 193#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 194	/* Policy lookup after SNAT yielded a new policy */
 195	if (skb_dst(skb)->xfrm) {
 196		IPCB(skb)->flags |= IPSKB_REROUTED;
 197		return dst_output(net, sk, skb);
 198	}
 199#endif
 200
 201	mtu = ip6_skb_dst_mtu(skb);
 202	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
 203		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 204
 205	if ((skb->len > mtu && !skb_is_gso(skb)) ||
 206	    dst_allfrag(skb_dst(skb)) ||
 207	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 208		return ip6_fragment(net, sk, skb, ip6_finish_output2);
 209	else
 210		return ip6_finish_output2(net, sk, skb);
 211}
 212
 213static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 214{
 215	int ret;
 216
 217	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 218	switch (ret) {
 219	case NET_XMIT_SUCCESS:
 220		return __ip6_finish_output(net, sk, skb);
 221	case NET_XMIT_CN:
 222		return __ip6_finish_output(net, sk, skb) ? : ret;
 223	default:
 224		kfree_skb(skb);
 225		return ret;
 226	}
 227}
 228
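/*
 * ip6_output() is installed as the dst output callback for IPv6 routes,
 * so dst_output() lands here for both local and forwarded traffic. It
 * runs the NF_INET_POST_ROUTING hook unless the skb already passed
 * through it and was rerouted (IP6SKB_REROUTED).
 */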
 229int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 230{
 231	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
 232	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 233
 234	skb->protocol = htons(ETH_P_IPV6);
 235	skb->dev = dev;
 236
 237	if (unlikely(idev->cnf.disable_ipv6)) {
 238		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 239		kfree_skb(skb);
 240		return 0;
 241	}
 242
 243	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 244			    net, sk, skb, indev, dev,
 245			    ip6_finish_output,
 246			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 247}
 248EXPORT_SYMBOL(ip6_output);
 249
 250bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 251{
 252	if (!np->autoflowlabel_set)
 253		return ip6_default_np_autolabel(net);
 254	else
 255		return np->autoflowlabel;
 256}
 257
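/*
 * Typical ip6_xmit() call shape (illustrative sketch, not part of this
 * file; compare inet6_csk_xmit()):
 *
 *	res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
 *		       np->tclass, sk->sk_priority);
 */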
 258/*
 259 * xmit an sk_buff (used by TCP, SCTP and DCCP)
  260 * Note: the socket lock is not held for SYNACK packets, but the skb might
  261 * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
  262 * which use proper atomic operations or spinlocks.
 263 */
 264int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 265	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 266{
 267	struct net *net = sock_net(sk);
 268	const struct ipv6_pinfo *np = inet6_sk(sk);
 269	struct in6_addr *first_hop = &fl6->daddr;
 270	struct dst_entry *dst = skb_dst(skb);
 271	unsigned int head_room;
 272	struct ipv6hdr *hdr;
 273	u8  proto = fl6->flowi6_proto;
 274	int seg_len = skb->len;
 275	int hlimit = -1;
 276	u32 mtu;
 277
 278	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 279	if (opt)
 280		head_room += opt->opt_nflen + opt->opt_flen;
 281
 282	if (unlikely(skb_headroom(skb) < head_room)) {
 283		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 284		if (!skb2) {
 285			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 286				      IPSTATS_MIB_OUTDISCARDS);
 287			kfree_skb(skb);
 288			return -ENOBUFS;
 289		}
 290		if (skb->sk)
 291			skb_set_owner_w(skb2, skb->sk);
 292		consume_skb(skb);
 293		skb = skb2;
 294	}
 295
 296	if (opt) {
 297		seg_len += opt->opt_nflen + opt->opt_flen;
 298
 299		if (opt->opt_flen)
 300			ipv6_push_frag_opts(skb, opt, &proto);
 301
 302		if (opt->opt_nflen)
 303			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 304					     &fl6->saddr);
 305	}
 306
 307	skb_push(skb, sizeof(struct ipv6hdr));
 308	skb_reset_network_header(skb);
 309	hdr = ipv6_hdr(skb);
 310
 311	/*
 312	 *	Fill in the IPv6 header
 313	 */
 314	if (np)
 315		hlimit = np->hop_limit;
 316	if (hlimit < 0)
 317		hlimit = ip6_dst_hoplimit(dst);
 318
 319	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 320				ip6_autoflowlabel(net, np), fl6));
 321
 322	hdr->payload_len = htons(seg_len);
 323	hdr->nexthdr = proto;
 324	hdr->hop_limit = hlimit;
 325
 326	hdr->saddr = fl6->saddr;
 327	hdr->daddr = *first_hop;
 328
 329	skb->protocol = htons(ETH_P_IPV6);
 330	skb->priority = priority;
 331	skb->mark = mark;
 332
 333	mtu = dst_mtu(dst);
 334	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 335		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 336			      IPSTATS_MIB_OUT, skb->len);
 337
 338		/* if egress device is enslaved to an L3 master device pass the
 339		 * skb to its handler for processing
 340		 */
 341		skb = l3mdev_ip6_out((struct sock *)sk, skb);
 342		if (unlikely(!skb))
 343			return 0;
 344
 345		/* hooks should never assume socket lock is held.
 346		 * we promote our socket to non const
 347		 */
 348		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 349			       net, (struct sock *)sk, skb, NULL, dst->dev,
 350			       dst_output);
 351	}
 352
 353	skb->dev = dst->dev;
 354	/* ipv6_local_error() does not require socket lock,
 355	 * we promote our socket to non const
 356	 */
 357	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 358
 359	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 360	kfree_skb(skb);
 361	return -EMSGSIZE;
 362}
 363EXPORT_SYMBOL(ip6_xmit);
 364
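/*
 * Deliver a Router Alert packet to every raw socket that registered for
 * this alert value with the IPV6_ROUTER_ALERT socket option. Earlier
 * matches receive clones; the final match consumes the original skb, in
 * which case 1 is returned and forwarding stops.
 */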
 365static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 366{
 367	struct ip6_ra_chain *ra;
 368	struct sock *last = NULL;
 369
 370	read_lock(&ip6_ra_lock);
 371	for (ra = ip6_ra_chain; ra; ra = ra->next) {
 372		struct sock *sk = ra->sk;
 373		if (sk && ra->sel == sel &&
 374		    (!sk->sk_bound_dev_if ||
 375		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
 376			struct ipv6_pinfo *np = inet6_sk(sk);
 377
 378			if (np && np->rtalert_isolate &&
 379			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
 380				continue;
 381			}
 382			if (last) {
 383				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 384				if (skb2)
 385					rawv6_rcv(last, skb2);
 386			}
 387			last = sk;
 388		}
 389	}
 390
 391	if (last) {
 392		rawv6_rcv(last, skb);
 393		read_unlock(&ip6_ra_lock);
 394		return 1;
 395	}
 396	read_unlock(&ip6_ra_lock);
 397	return 0;
 398}
 399
 400static int ip6_forward_proxy_check(struct sk_buff *skb)
 401{
 402	struct ipv6hdr *hdr = ipv6_hdr(skb);
 403	u8 nexthdr = hdr->nexthdr;
 404	__be16 frag_off;
 405	int offset;
 406
 407	if (ipv6_ext_hdr(nexthdr)) {
 408		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 409		if (offset < 0)
 410			return 0;
 411	} else
 412		offset = sizeof(struct ipv6hdr);
 413
 414	if (nexthdr == IPPROTO_ICMPV6) {
 415		struct icmp6hdr *icmp6;
 416
 417		if (!pskb_may_pull(skb, (skb_network_header(skb) +
 418					 offset + 1 - skb->data)))
 419			return 0;
 420
 421		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 422
 423		switch (icmp6->icmp6_type) {
 424		case NDISC_ROUTER_SOLICITATION:
 425		case NDISC_ROUTER_ADVERTISEMENT:
 426		case NDISC_NEIGHBOUR_SOLICITATION:
 427		case NDISC_NEIGHBOUR_ADVERTISEMENT:
 428		case NDISC_REDIRECT:
  429			/* For a reaction involving a unicast neighbor
  430			 * discovery message destined to the proxied address,
  431			 * pass it to the input function.
 432			 */
 433			return 1;
 434		default:
 435			break;
 436		}
 437	}
 438
 439	/*
 440	 * The proxying router can't forward traffic sent to a link-local
 441	 * address, so signal the sender and discard the packet. This
 442	 * behavior is clarified by the MIPv6 specification.
 443	 */
 444	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 445		dst_link_failure(skb);
 446		return -1;
 447	}
 448
 449	return 0;
 450}
 451
 452static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 453				     struct sk_buff *skb)
 454{
 455	struct dst_entry *dst = skb_dst(skb);
 456
 457	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 458	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 459
 460#ifdef CONFIG_NET_SWITCHDEV
 461	if (skb->offload_l3_fwd_mark) {
 462		consume_skb(skb);
 463		return 0;
 464	}
 465#endif
 466
 467	skb->tstamp = 0;
 468	return dst_output(net, sk, skb);
 469}
 470
 471static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 472{
 473	if (skb->len <= mtu)
 474		return false;
 475
 476	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 477	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 478		return true;
 479
 480	if (skb->ignore_df)
 481		return false;
 482
 483	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 484		return false;
 485
 486	return true;
 487}
 488
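/*
 * Forwarding entry point. The checks below run in order: forwarding
 * enabled, packet addressed to us at L2 (PACKET_HOST), no local socket
 * attached, no LRO aggregation, xfrm forward policy, Router Alert
 * delivery, hop limit, NDISC proxy handling, xfrm rerouting, redirect
 * generation, source address sanity, and finally the MTU check before
 * the NF_INET_FORWARD hook.
 */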
 489int ip6_forward(struct sk_buff *skb)
 490{
 491	struct dst_entry *dst = skb_dst(skb);
 492	struct ipv6hdr *hdr = ipv6_hdr(skb);
 493	struct inet6_skb_parm *opt = IP6CB(skb);
 494	struct net *net = dev_net(dst->dev);
 495	struct inet6_dev *idev;
 496	u32 mtu;
 497
 498	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
 499	if (net->ipv6.devconf_all->forwarding == 0)
 500		goto error;
 501
 502	if (skb->pkt_type != PACKET_HOST)
 503		goto drop;
 504
 505	if (unlikely(skb->sk))
 506		goto drop;
 507
 508	if (skb_warn_if_lro(skb))
 509		goto drop;
 510
 511	if (!net->ipv6.devconf_all->disable_policy &&
 512	    !idev->cnf.disable_policy &&
 513	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 514		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 515		goto drop;
 516	}
 517
 518	skb_forward_csum(skb);
 519
 520	/*
  521	 *	We DO NOT do any processing on
  522	 *	RA packets, pushing them to user level AS IS
  523	 *	without any WARRANTY that the application will be able
  524	 *	to interpret them. The reason is that we
  525	 *	cannot make anything clever here.
  526	 *
  527	 *	We are not an end-node, so if the packet contains
  528	 *	AH/ESP we cannot do anything.
  529	 *	Defragmentation would also be a mistake; RA packets
  530	 *	cannot be fragmented, because there is no guarantee
  531	 *	that different fragments will go along one path. --ANK
 532	 */
 533	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 534		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 535			return 0;
 536	}
 537
 538	/*
  539	 *	check and decrement hop limit
 540	 */
 541	if (hdr->hop_limit <= 1) {
 542		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 543		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 544
 545		kfree_skb(skb);
 546		return -ETIMEDOUT;
 547	}
 548
 549	/* XXX: idev->cnf.proxy_ndp? */
 550	if (net->ipv6.devconf_all->proxy_ndp &&
 551	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 552		int proxied = ip6_forward_proxy_check(skb);
 553		if (proxied > 0) {
 554			hdr->hop_limit--;
 555			return ip6_input(skb);
 556		} else if (proxied < 0) {
 557			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 558			goto drop;
 559		}
 560	}
 561
 562	if (!xfrm6_route_forward(skb)) {
 563		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 564		goto drop;
 565	}
 566	dst = skb_dst(skb);
 567
 568	/* IPv6 specs say nothing about it, but it is clear that we cannot
 569	   send redirects to source routed frames.
 570	   We don't send redirects to frames decapsulated from IPsec.
 571	 */
 572	if (IP6CB(skb)->iif == dst->dev->ifindex &&
 573	    opt->srcrt == 0 && !skb_sec_path(skb)) {
 574		struct in6_addr *target = NULL;
 575		struct inet_peer *peer;
 576		struct rt6_info *rt;
 577
 578		/*
 579		 *	incoming and outgoing devices are the same
 580		 *	send a redirect.
 581		 */
 582
 583		rt = (struct rt6_info *) dst;
 584		if (rt->rt6i_flags & RTF_GATEWAY)
 585			target = &rt->rt6i_gateway;
 586		else
 587			target = &hdr->daddr;
 588
 589		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 590
 591		/* Limit redirects both by destination (here)
 592		   and by source (inside ndisc_send_redirect)
 593		 */
 594		if (inet_peer_xrlim_allow(peer, 1*HZ))
 595			ndisc_send_redirect(skb, target);
 596		if (peer)
 597			inet_putpeer(peer);
 598	} else {
 599		int addrtype = ipv6_addr_type(&hdr->saddr);
 600
 601		/* This check is security critical. */
 602		if (addrtype == IPV6_ADDR_ANY ||
 603		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 604			goto error;
 605		if (addrtype & IPV6_ADDR_LINKLOCAL) {
 606			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 607				    ICMPV6_NOT_NEIGHBOUR, 0);
 608			goto error;
 609		}
 610	}
 611
 612	mtu = ip6_dst_mtu_forward(dst);
 613	if (mtu < IPV6_MIN_MTU)
 614		mtu = IPV6_MIN_MTU;
 615
 616	if (ip6_pkt_too_big(skb, mtu)) {
 617		/* Again, force OUTPUT device used as source address */
 618		skb->dev = dst->dev;
 619		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 620		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 621		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 622				IPSTATS_MIB_FRAGFAILS);
 623		kfree_skb(skb);
 624		return -EMSGSIZE;
 625	}
 626
 627	if (skb_cow(skb, dst->dev->hard_header_len)) {
 628		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 629				IPSTATS_MIB_OUTDISCARDS);
 630		goto drop;
 631	}
 632
 633	hdr = ipv6_hdr(skb);
 634
 635	/* Mangling hops number delayed to point after skb COW */
 636
 637	hdr->hop_limit--;
 638
 639	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 640		       net, NULL, skb, skb->dev, dst->dev,
 641		       ip6_forward_finish);
 642
 643error:
 644	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 645drop:
 646	kfree_skb(skb);
 647	return -EINVAL;
 648}
 649
 650static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 651{
 652	to->pkt_type = from->pkt_type;
 653	to->priority = from->priority;
 654	to->protocol = from->protocol;
 655	skb_dst_drop(to);
 656	skb_dst_set(to, dst_clone(skb_dst(from)));
 657	to->dev = from->dev;
 658	to->mark = from->mark;
 659
 660	skb_copy_hash(to, from);
 661
 662#ifdef CONFIG_NET_SCHED
 663	to->tc_index = from->tc_index;
 664#endif
 665	nf_copy(to, from);
 666	skb_ext_copy(to, from);
 667	skb_copy_secmark(to, from);
 668}
 669
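/*
 * Fast-path fragmentation helpers for skbs that already carry a
 * frag_list of suitably sized fragments. A minimal usage sketch,
 * mirroring the fast path of ip6_fragment() below (illustrative only):
 *
 *	struct ip6_fraglist_iter iter;
 *	int err;
 *
 *	err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter);
 *	while (!err) {
 *		if (iter.frag)
 *			ip6_fraglist_prepare(skb, &iter);
 *		err = output(net, sk, skb);
 *		if (err || !iter.frag)
 *			break;
 *		skb = ip6_fraglist_next(&iter);
 *	}
 *	kfree(iter.tmp_hdr);
 */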
 670int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 671		      u8 nexthdr, __be32 frag_id,
 672		      struct ip6_fraglist_iter *iter)
 673{
 674	unsigned int first_len;
 675	struct frag_hdr *fh;
 676
 677	/* BUILD HEADER */
 678	*prevhdr = NEXTHDR_FRAGMENT;
 679	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 680	if (!iter->tmp_hdr)
 681		return -ENOMEM;
 682
 683	iter->frag = skb_shinfo(skb)->frag_list;
 684	skb_frag_list_init(skb);
 685
 686	iter->offset = 0;
 687	iter->hlen = hlen;
 688	iter->frag_id = frag_id;
 689	iter->nexthdr = nexthdr;
 690
 691	__skb_pull(skb, hlen);
 692	fh = __skb_push(skb, sizeof(struct frag_hdr));
 693	__skb_push(skb, hlen);
 694	skb_reset_network_header(skb);
 695	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 696
 697	fh->nexthdr = nexthdr;
 698	fh->reserved = 0;
 699	fh->frag_off = htons(IP6_MF);
 700	fh->identification = frag_id;
 701
 702	first_len = skb_pagelen(skb);
 703	skb->data_len = first_len - skb_headlen(skb);
 704	skb->len = first_len;
 705	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 706
 707	return 0;
 708}
 709EXPORT_SYMBOL(ip6_fraglist_init);
 710
 711void ip6_fraglist_prepare(struct sk_buff *skb,
 712			  struct ip6_fraglist_iter *iter)
 713{
 714	struct sk_buff *frag = iter->frag;
 715	unsigned int hlen = iter->hlen;
 716	struct frag_hdr *fh;
 717
 718	frag->ip_summed = CHECKSUM_NONE;
 719	skb_reset_transport_header(frag);
 720	fh = __skb_push(frag, sizeof(struct frag_hdr));
 721	__skb_push(frag, hlen);
 722	skb_reset_network_header(frag);
 723	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 724	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 725	fh->nexthdr = iter->nexthdr;
 726	fh->reserved = 0;
 727	fh->frag_off = htons(iter->offset);
 728	if (frag->next)
 729		fh->frag_off |= htons(IP6_MF);
 730	fh->identification = iter->frag_id;
 731	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 732	ip6_copy_metadata(frag, skb);
 733}
 734EXPORT_SYMBOL(ip6_fraglist_prepare);
 735
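/*
 * Slow-path fragmentation helpers: ip6_frag_init() seeds the state and
 * each ip6_frag_next() call allocates, fills and returns one fragment.
 * A usage sketch mirroring the slow path of ip6_fragment() below
 * (illustrative only):
 *
 *	struct ip6_frag_state state;
 *
 *	ip6_frag_init(skb, hlen, mtu, tailroom, hroom, prevhdr, nexthdr,
 *		      frag_id, &state);
 *	while (state.left > 0) {
 *		struct sk_buff *frag = ip6_frag_next(skb, &state);
 *
 *		if (IS_ERR(frag))
 *			return PTR_ERR(frag);
 *		err = output(net, sk, frag);
 *	}
 */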
 736void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 737		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 738		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 739{
 740	state->prevhdr = prevhdr;
 741	state->nexthdr = nexthdr;
 742	state->frag_id = frag_id;
 743
 744	state->hlen = hlen;
 745	state->mtu = mtu;
 746
 747	state->left = skb->len - hlen;	/* Space per frame */
 748	state->ptr = hlen;		/* Where to start from */
 749
 750	state->hroom = hdr_room;
 751	state->troom = needed_tailroom;
 752
 753	state->offset = 0;
 754}
 755EXPORT_SYMBOL(ip6_frag_init);
 756
 757struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 758{
 759	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 760	struct sk_buff *frag;
 761	struct frag_hdr *fh;
 762	unsigned int len;
 763
 764	len = state->left;
 765	/* IF: it doesn't fit, use 'mtu' - the data space left */
 766	if (len > state->mtu)
 767		len = state->mtu;
 768	/* IF: we are not sending up to and including the packet end
 769	   then align the next start on an eight byte boundary */
 770	if (len < state->left)
 771		len &= ~7;
 772
 773	/* Allocate buffer */
 774	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 775			 state->hroom + state->troom, GFP_ATOMIC);
 776	if (!frag)
 777		return ERR_PTR(-ENOMEM);
 778
 779	/*
 780	 *	Set up data on packet
 781	 */
 782
 783	ip6_copy_metadata(frag, skb);
 784	skb_reserve(frag, state->hroom);
 785	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 786	skb_reset_network_header(frag);
 787	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 788	frag->transport_header = (frag->network_header + state->hlen +
 789				  sizeof(struct frag_hdr));
 790
 791	/*
 792	 *	Charge the memory for the fragment to any owner
 793	 *	it might possess
 794	 */
 795	if (skb->sk)
 796		skb_set_owner_w(frag, skb->sk);
 797
 798	/*
 799	 *	Copy the packet header into the new buffer.
 800	 */
 801	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 802
 803	fragnexthdr_offset = skb_network_header(frag);
 804	fragnexthdr_offset += prevhdr - skb_network_header(skb);
 805	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
 806
 807	/*
 808	 *	Build fragment header.
 809	 */
 810	fh->nexthdr = state->nexthdr;
 811	fh->reserved = 0;
 812	fh->identification = state->frag_id;
 813
 814	/*
 815	 *	Copy a block of the IP datagram.
 816	 */
 817	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 818			     len));
 819	state->left -= len;
 820
 821	fh->frag_off = htons(state->offset);
 822	if (state->left > 0)
 823		fh->frag_off |= htons(IP6_MF);
 824	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 825
 826	state->ptr += len;
 827	state->offset += len;
 828
 829	return frag;
 830}
 831EXPORT_SYMBOL(ip6_frag_next);
 832
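/*
 * ip6_fragment() takes the fast path when the skb already carries a
 * frag_list whose fragments fit the MTU, are 8-byte aligned and
 * unshared; otherwise it falls back to the copying slow path built on
 * ip6_frag_init()/ip6_frag_next() above.
 */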
 833int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 834		 int (*output)(struct net *, struct sock *, struct sk_buff *))
 835{
 836	struct sk_buff *frag;
 837	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 838	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 839				inet6_sk(skb->sk) : NULL;
 840	struct ip6_frag_state state;
 841	unsigned int mtu, hlen, nexthdr_offset;
 842	ktime_t tstamp = skb->tstamp;
 843	int hroom, err = 0;
 844	__be32 frag_id;
 845	u8 *prevhdr, nexthdr = 0;
 846
 847	err = ip6_find_1stfragopt(skb, &prevhdr);
 848	if (err < 0)
 849		goto fail;
 850	hlen = err;
 851	nexthdr = *prevhdr;
 852	nexthdr_offset = prevhdr - skb_network_header(skb);
 853
 854	mtu = ip6_skb_dst_mtu(skb);
 855
 856	/* We must not fragment if the socket is set to force MTU discovery
  857	 * or if the skb is not generated by a local socket.
 858	 */
 859	if (unlikely(!skb->ignore_df && skb->len > mtu))
 860		goto fail_toobig;
 861
 862	if (IP6CB(skb)->frag_max_size) {
 863		if (IP6CB(skb)->frag_max_size > mtu)
 864			goto fail_toobig;
 865
 866		/* don't send fragments larger than what we received */
 867		mtu = IP6CB(skb)->frag_max_size;
 868		if (mtu < IPV6_MIN_MTU)
 869			mtu = IPV6_MIN_MTU;
 870	}
 871
 872	if (np && np->frag_size < mtu) {
 873		if (np->frag_size)
 874			mtu = np->frag_size;
 875	}
 876	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 877		goto fail_toobig;
 878	mtu -= hlen + sizeof(struct frag_hdr);
 879
 880	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 881				    &ipv6_hdr(skb)->saddr);
 882
 883	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 884	    (err = skb_checksum_help(skb)))
 885		goto fail;
 886
 887	prevhdr = skb_network_header(skb) + nexthdr_offset;
 888	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 889	if (skb_has_frag_list(skb)) {
 890		unsigned int first_len = skb_pagelen(skb);
 891		struct ip6_fraglist_iter iter;
 892		struct sk_buff *frag2;
 893
 894		if (first_len - hlen > mtu ||
 895		    ((first_len - hlen) & 7) ||
 896		    skb_cloned(skb) ||
 897		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 898			goto slow_path;
 899
 900		skb_walk_frags(skb, frag) {
 901			/* Correct geometry. */
 902			if (frag->len > mtu ||
 903			    ((frag->len & 7) && frag->next) ||
 904			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 905				goto slow_path_clean;
 906
 907			/* Partially cloned skb? */
 908			if (skb_shared(frag))
 909				goto slow_path_clean;
 910
 911			BUG_ON(frag->sk);
 912			if (skb->sk) {
 913				frag->sk = skb->sk;
 914				frag->destructor = sock_wfree;
 915			}
 916			skb->truesize -= frag->truesize;
 917		}
 918
 919		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 920					&iter);
 921		if (err < 0)
 922			goto fail;
 923
 924		for (;;) {
 925			/* Prepare header of the next frame,
 926			 * before previous one went down. */
 927			if (iter.frag)
 928				ip6_fraglist_prepare(skb, &iter);
 929
 930			skb->tstamp = tstamp;
 931			err = output(net, sk, skb);
 932			if (!err)
 933				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 934					      IPSTATS_MIB_FRAGCREATES);
 935
 936			if (err || !iter.frag)
 937				break;
 938
 939			skb = ip6_fraglist_next(&iter);
 940		}
 941
 942		kfree(iter.tmp_hdr);
 943
 944		if (err == 0) {
 945			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 946				      IPSTATS_MIB_FRAGOKS);
 947			return 0;
 948		}
 949
 950		kfree_skb_list(iter.frag);
 951
 952		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 953			      IPSTATS_MIB_FRAGFAILS);
 954		return err;
 955
 956slow_path_clean:
 957		skb_walk_frags(skb, frag2) {
 958			if (frag2 == frag)
 959				break;
 960			frag2->sk = NULL;
 961			frag2->destructor = NULL;
 962			skb->truesize += frag2->truesize;
 963		}
 964	}
 965
 966slow_path:
 967	/*
 968	 *	Fragment the datagram.
 969	 */
 970
 971	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 972		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 973		      &state);
 974
 975	/*
 976	 *	Keep copying data until we run out.
 977	 */
 978
 979	while (state.left > 0) {
 980		frag = ip6_frag_next(skb, &state);
 981		if (IS_ERR(frag)) {
 982			err = PTR_ERR(frag);
 983			goto fail;
 984		}
 985
 986		/*
 987		 *	Put this fragment into the sending queue.
 988		 */
 989		frag->tstamp = tstamp;
 990		err = output(net, sk, frag);
 991		if (err)
 992			goto fail;
 993
 994		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 995			      IPSTATS_MIB_FRAGCREATES);
 996	}
 997	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 998		      IPSTATS_MIB_FRAGOKS);
 999	consume_skb(skb);
1000	return err;
1001
1002fail_toobig:
1003	if (skb->sk && dst_allfrag(skb_dst(skb)))
1004		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
1005
1006	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1007	err = -EMSGSIZE;
1008
1009fail:
1010	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1011		      IPSTATS_MIB_FRAGFAILS);
1012	kfree_skb(skb);
1013	return err;
1014}
1015
1016static inline int ip6_rt_check(const struct rt6key *rt_key,
1017			       const struct in6_addr *fl_addr,
1018			       const struct in6_addr *addr_cache)
1019{
1020	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1021		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1022}
1023
1024static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1025					  struct dst_entry *dst,
1026					  const struct flowi6 *fl6)
1027{
1028	struct ipv6_pinfo *np = inet6_sk(sk);
1029	struct rt6_info *rt;
1030
1031	if (!dst)
1032		goto out;
1033
1034	if (dst->ops->family != AF_INET6) {
1035		dst_release(dst);
1036		return NULL;
1037	}
1038
1039	rt = (struct rt6_info *)dst;
 1040	/* Yes, checking route validity in the not-connected
 1041	 * case is not very simple. Take into account
1042	 * that we do not support routing by source, TOS,
1043	 * and MSG_DONTROUTE		--ANK (980726)
1044	 *
1045	 * 1. ip6_rt_check(): If route was host route,
1046	 *    check that cached destination is current.
1047	 *    If it is network route, we still may
1048	 *    check its validity using saved pointer
1049	 *    to the last used address: daddr_cache.
1050	 *    We do not want to save whole address now,
1051	 *    (because main consumer of this service
 1052	 *    is tcp, which does not have this problem),
1053	 *    so that the last trick works only on connected
1054	 *    sockets.
1055	 * 2. oif also should be the same.
1056	 */
1057	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1058#ifdef CONFIG_IPV6_SUBTREES
1059	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1060#endif
1061	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
1062	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
1063		dst_release(dst);
1064		dst = NULL;
1065	}
1066
1067out:
1068	return dst;
1069}
1070
1071static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1072			       struct dst_entry **dst, struct flowi6 *fl6)
1073{
1074#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1075	struct neighbour *n;
1076	struct rt6_info *rt;
1077#endif
1078	int err;
1079	int flags = 0;
1080
1081	/* The correct way to handle this would be to do
1082	 * ip6_route_get_saddr, and then ip6_route_output; however,
1083	 * the route-specific preferred source forces the
1084	 * ip6_route_output call _before_ ip6_route_get_saddr.
1085	 *
1086	 * In source specific routing (no src=any default route),
1087	 * ip6_route_output will fail given src=any saddr, though, so
1088	 * that's why we try it again later.
1089	 */
1090	if (ipv6_addr_any(&fl6->saddr)) {
1091		struct fib6_info *from;
1092		struct rt6_info *rt;
1093
1094		*dst = ip6_route_output(net, sk, fl6);
1095		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1096
1097		rcu_read_lock();
1098		from = rt ? rcu_dereference(rt->from) : NULL;
1099		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1100					  sk ? inet6_sk(sk)->srcprefs : 0,
1101					  &fl6->saddr);
1102		rcu_read_unlock();
1103
1104		if (err)
1105			goto out_err_release;
1106
1107		/* If we had an erroneous initial result, pretend it
1108		 * never existed and let the SA-enabled version take
1109		 * over.
1110		 */
1111		if ((*dst)->error) {
1112			dst_release(*dst);
1113			*dst = NULL;
1114		}
1115
1116		if (fl6->flowi6_oif)
1117			flags |= RT6_LOOKUP_F_IFACE;
1118	}
1119
1120	if (!*dst)
1121		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1122
1123	err = (*dst)->error;
1124	if (err)
1125		goto out_err_release;
1126
1127#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1128	/*
1129	 * Here if the dst entry we've looked up
1130	 * has a neighbour entry that is in the INCOMPLETE
1131	 * state and the src address from the flow is
1132	 * marked as OPTIMISTIC, we release the found
1133	 * dst entry and replace it instead with the
1134	 * dst entry of the nexthop router
1135	 */
1136	rt = (struct rt6_info *) *dst;
1137	rcu_read_lock_bh();
1138	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1139				      rt6_nexthop(rt, &fl6->daddr));
1140	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1141	rcu_read_unlock_bh();
1142
1143	if (err) {
1144		struct inet6_ifaddr *ifp;
1145		struct flowi6 fl_gw6;
1146		int redirect;
1147
1148		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1149				      (*dst)->dev, 1);
1150
1151		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1152		if (ifp)
1153			in6_ifa_put(ifp);
1154
1155		if (redirect) {
1156			/*
1157			 * We need to get the dst entry for the
1158			 * default router instead
1159			 */
1160			dst_release(*dst);
1161			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1162			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1163			*dst = ip6_route_output(net, sk, &fl_gw6);
1164			err = (*dst)->error;
1165			if (err)
1166				goto out_err_release;
1167		}
1168	}
1169#endif
1170	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1171	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1172		err = -EAFNOSUPPORT;
1173		goto out_err_release;
1174	}
1175
1176	return 0;
1177
1178out_err_release:
1179	dst_release(*dst);
1180	*dst = NULL;
1181
1182	if (err == -ENETUNREACH)
1183		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1184	return err;
1185}
1186
1187/**
1188 *	ip6_dst_lookup - perform route lookup on flow
1189 *	@net: Network namespace to perform lookup in
1190 *	@sk: socket which provides route info
1191 *	@dst: pointer to dst_entry * for result
1192 *	@fl6: flow to lookup
1193 *
1194 *	This function performs a route lookup on the given flow.
1195 *
1196 *	It returns zero on success, or a standard errno code on error.
1197 */
1198int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1199		   struct flowi6 *fl6)
1200{
1201	*dst = NULL;
1202	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1203}
1204EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1205
1206/**
1207 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1208 *	@net: Network namespace to perform lookup in
1209 *	@sk: socket which provides route info
1210 *	@fl6: flow to lookup
1211 *	@final_dst: final destination address for ipsec lookup
1212 *
1213 *	This function performs a route lookup on the given flow.
1214 *
1215 *	It returns a valid dst pointer on success, or a pointer encoded
1216 *	error code.
1217 */
1218struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1219				      const struct in6_addr *final_dst)
1220{
1221	struct dst_entry *dst = NULL;
1222	int err;
1223
1224	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1225	if (err)
1226		return ERR_PTR(err);
1227	if (final_dst)
1228		fl6->daddr = *final_dst;
1229
1230	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1231}
1232EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
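
/*
 * Typical caller pattern for ip6_dst_lookup_flow() (illustrative sketch,
 * not part of this file):
 *
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto failure;
 *	}
 */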
1233
1234/**
1235 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1236 *	@sk: socket which provides the dst cache and route info
1237 *	@fl6: flow to lookup
1238 *	@final_dst: final destination address for ipsec lookup
1239 *	@connected: whether @sk is connected or not
1240 *
1241 *	This function performs a route lookup on the given flow with the
1242 *	possibility of using the cached route in the socket if it is valid.
1243 *	It will take the socket dst lock when operating on the dst cache.
1244 *	As a result, this function can only be used in process context.
1245 *
1246 *	In addition, for a connected socket, cache the dst in the socket
1247 *	if the current cache is not valid.
1248 *
1249 *	It returns a valid dst pointer on success, or a pointer encoded
1250 *	error code.
1251 */
1252struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1253					 const struct in6_addr *final_dst,
1254					 bool connected)
1255{
1256	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1257
1258	dst = ip6_sk_dst_check(sk, dst, fl6);
1259	if (dst)
1260		return dst;
1261
1262	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1263	if (connected && !IS_ERR(dst))
1264		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1265
1266	return dst;
1267}
1268EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1269
1270/**
1271 *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1272 *      @skb: Packet for which lookup is done
1273 *      @dev: Tunnel device
1274 *      @net: Network namespace of tunnel device
1275 *      @sock: Socket which provides route info
1276 *      @saddr: Memory to store the src ip address
1277 *      @info: Tunnel information
1278 *      @protocol: IP protocol
 1279 *      @use_cache: Flag to enable cache usage
 1280 *      This function performs a route lookup on a tunnel.
1281 *
1282 *      It returns a valid dst pointer and stores src address to be used in
1283 *      tunnel in param saddr on success, else a pointer encoded error code.
1284 */
1285
1286struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1287					struct net_device *dev,
1288					struct net *net,
1289					struct socket *sock,
1290					struct in6_addr *saddr,
1291					const struct ip_tunnel_info *info,
1292					u8 protocol,
1293					bool use_cache)
1294{
1295	struct dst_entry *dst = NULL;
1296#ifdef CONFIG_DST_CACHE
1297	struct dst_cache *dst_cache;
1298#endif
1299	struct flowi6 fl6;
1300	__u8 prio;
1301
1302#ifdef CONFIG_DST_CACHE
1303	dst_cache = (struct dst_cache *)&info->dst_cache;
1304	if (use_cache) {
1305		dst = dst_cache_get_ip6(dst_cache, saddr);
1306		if (dst)
1307			return dst;
1308	}
1309#endif
1310	memset(&fl6, 0, sizeof(fl6));
1311	fl6.flowi6_mark = skb->mark;
1312	fl6.flowi6_proto = protocol;
1313	fl6.daddr = info->key.u.ipv6.dst;
1314	fl6.saddr = info->key.u.ipv6.src;
1315	prio = info->key.tos;
1316	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
1317					  info->key.label);
1318
1319	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1320					      NULL);
1321	if (IS_ERR(dst)) {
1322		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1323		return ERR_PTR(-ENETUNREACH);
1324	}
1325	if (dst->dev == dev) { /* is this necessary? */
1326		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1327		dst_release(dst);
1328		return ERR_PTR(-ELOOP);
1329	}
1330#ifdef CONFIG_DST_CACHE
1331	if (use_cache)
1332		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1333#endif
1334	*saddr = fl6.saddr;
1335	return dst;
1336}
1337EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1338
1339static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1340					       gfp_t gfp)
1341{
1342	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1343}
1344
1345static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1346						gfp_t gfp)
1347{
1348	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1349}
1350
1351static void ip6_append_data_mtu(unsigned int *mtu,
1352				int *maxfraglen,
1353				unsigned int fragheaderlen,
1354				struct sk_buff *skb,
1355				struct rt6_info *rt,
1356				unsigned int orig_mtu)
1357{
1358	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1359		if (!skb) {
1360			/* first fragment, reserve header_len */
1361			*mtu = orig_mtu - rt->dst.header_len;
1362
1363		} else {
1364			/*
1365			 * this fragment is not first, the headers
1366			 * space is regarded as data space.
1367			 */
1368			*mtu = orig_mtu;
1369		}
1370		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1371			      + fragheaderlen - sizeof(struct frag_hdr);
1372	}
1373}
1374
1375static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1376			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1377			  struct rt6_info *rt, struct flowi6 *fl6)
1378{
1379	struct ipv6_pinfo *np = inet6_sk(sk);
1380	unsigned int mtu;
1381	struct ipv6_txoptions *opt = ipc6->opt;
1382
1383	/*
1384	 * setup for corking
1385	 */
1386	if (opt) {
1387		if (WARN_ON(v6_cork->opt))
1388			return -EINVAL;
1389
1390		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1391		if (unlikely(!v6_cork->opt))
1392			return -ENOBUFS;
1393
1394		v6_cork->opt->tot_len = sizeof(*opt);
1395		v6_cork->opt->opt_flen = opt->opt_flen;
1396		v6_cork->opt->opt_nflen = opt->opt_nflen;
1397
1398		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1399						    sk->sk_allocation);
1400		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1401			return -ENOBUFS;
1402
1403		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1404						    sk->sk_allocation);
1405		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1406			return -ENOBUFS;
1407
1408		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1409						   sk->sk_allocation);
1410		if (opt->hopopt && !v6_cork->opt->hopopt)
1411			return -ENOBUFS;
1412
1413		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1414						    sk->sk_allocation);
1415		if (opt->srcrt && !v6_cork->opt->srcrt)
1416			return -ENOBUFS;
1417
 1418		/* need source address above --miyazawa */
1419	}
1420	dst_hold(&rt->dst);
1421	cork->base.dst = &rt->dst;
1422	cork->fl.u.ip6 = *fl6;
1423	v6_cork->hop_limit = ipc6->hlimit;
1424	v6_cork->tclass = ipc6->tclass;
1425	if (rt->dst.flags & DST_XFRM_TUNNEL)
1426		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1427		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1428	else
1429		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1430			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1431	if (np->frag_size < mtu) {
1432		if (np->frag_size)
1433			mtu = np->frag_size;
1434	}
1435	if (mtu < IPV6_MIN_MTU)
1436		return -EINVAL;
1437	cork->base.fragsize = mtu;
1438	cork->base.gso_size = ipc6->gso_size;
1439	cork->base.tx_flags = 0;
1440	cork->base.mark = ipc6->sockc.mark;
1441	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1442
1443	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1444		cork->base.flags |= IPCORK_ALLFRAG;
1445	cork->base.length = 0;
1446
1447	cork->base.transmit_time = ipc6->sockc.transmit_time;
1448
1449	return 0;
1450}
1451
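/*
 * Core of the corked-send machinery: append user data to @queue as a
 * chain of MTU-sized skbs, growing the tail skb or allocating new ones
 * as needed, with optional zerocopy and paged (GSO) modes.
 * __ip6_make_skb() later collapses the queue into a single skb with a
 * frag_list.
 */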
1452static int __ip6_append_data(struct sock *sk,
1453			     struct flowi6 *fl6,
1454			     struct sk_buff_head *queue,
1455			     struct inet_cork *cork,
1456			     struct inet6_cork *v6_cork,
1457			     struct page_frag *pfrag,
1458			     int getfrag(void *from, char *to, int offset,
1459					 int len, int odd, struct sk_buff *skb),
1460			     void *from, int length, int transhdrlen,
1461			     unsigned int flags, struct ipcm6_cookie *ipc6)
1462{
1463	struct sk_buff *skb, *skb_prev = NULL;
1464	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1465	struct ubuf_info *uarg = NULL;
1466	int exthdrlen = 0;
1467	int dst_exthdrlen = 0;
1468	int hh_len;
1469	int copy;
1470	int err;
1471	int offset = 0;
1472	u32 tskey = 0;
1473	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1474	struct ipv6_txoptions *opt = v6_cork->opt;
1475	int csummode = CHECKSUM_NONE;
1476	unsigned int maxnonfragsize, headersize;
1477	unsigned int wmem_alloc_delta = 0;
1478	bool paged, extra_uref = false;
1479
1480	skb = skb_peek_tail(queue);
1481	if (!skb) {
1482		exthdrlen = opt ? opt->opt_flen : 0;
1483		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1484	}
1485
1486	paged = !!cork->gso_size;
1487	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1488	orig_mtu = mtu;
1489
1490	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1491	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1492		tskey = sk->sk_tskey++;
1493
1494	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1495
1496	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1497			(opt ? opt->opt_nflen : 0);
1498	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1499		     sizeof(struct frag_hdr);
1500
1501	headersize = sizeof(struct ipv6hdr) +
1502		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1503		     (dst_allfrag(&rt->dst) ?
1504		      sizeof(struct frag_hdr) : 0) +
1505		     rt->rt6i_nfheader_len;
1506
1507	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1508	 * the first fragment
1509	 */
1510	if (headersize + transhdrlen > mtu)
1511		goto emsgsize;
1512
1513	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1514	    (sk->sk_protocol == IPPROTO_UDP ||
1515	     sk->sk_protocol == IPPROTO_RAW)) {
1516		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1517				sizeof(struct ipv6hdr));
1518		goto emsgsize;
1519	}
1520
1521	if (ip6_sk_ignore_df(sk))
1522		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1523	else
1524		maxnonfragsize = mtu;
1525
1526	if (cork->length + length > maxnonfragsize - headersize) {
1527emsgsize:
1528		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1529		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1530		return -EMSGSIZE;
1531	}
1532
1533	/* CHECKSUM_PARTIAL only with no extension headers and when
1534	 * we are not going to fragment
1535	 */
1536	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1537	    headersize == sizeof(struct ipv6hdr) &&
1538	    length <= mtu - headersize &&
1539	    (!(flags & MSG_MORE) || cork->gso_size) &&
1540	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1541		csummode = CHECKSUM_PARTIAL;
1542
1543	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1544		uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1545		if (!uarg)
1546			return -ENOBUFS;
1547		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1548		if (rt->dst.dev->features & NETIF_F_SG &&
1549		    csummode == CHECKSUM_PARTIAL) {
1550			paged = true;
1551		} else {
1552			uarg->zerocopy = 0;
1553			skb_zcopy_set(skb, uarg, &extra_uref);
1554		}
1555	}
1556
1557	/*
1558	 * Let's try using as much space as possible.
1559	 * Use MTU if total length of the message fits into the MTU.
1560	 * Otherwise, we need to reserve fragment header and
 1561	 * fragment alignment (= 8-15 octets, in total).
1562	 *
1563	 * Note that we may need to "move" the data from the tail
1564	 * of the buffer to the new fragment when we split
1565	 * the message.
1566	 *
1567	 * FIXME: It may be fragmented into multiple chunks
1568	 *        at once if non-fragmentable extension headers
1569	 *        are too large.
1570	 * --yoshfuji
1571	 */
1572
1573	cork->length += length;
1574	if (!skb)
1575		goto alloc_new_skb;
1576
1577	while (length > 0) {
1578		/* Check if the remaining data fits into current packet. */
1579		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1580		if (copy < length)
1581			copy = maxfraglen - skb->len;
1582
1583		if (copy <= 0) {
1584			char *data;
1585			unsigned int datalen;
1586			unsigned int fraglen;
1587			unsigned int fraggap;
1588			unsigned int alloclen, alloc_extra;
1589			unsigned int pagedlen;
1590alloc_new_skb:
1591			/* There's no room in the current skb */
1592			if (skb)
1593				fraggap = skb->len - maxfraglen;
1594			else
1595				fraggap = 0;
1596			/* update mtu and maxfraglen if necessary */
1597			if (!skb || !skb_prev)
1598				ip6_append_data_mtu(&mtu, &maxfraglen,
1599						    fragheaderlen, skb, rt,
1600						    orig_mtu);
1601
1602			skb_prev = skb;
1603
1604			/*
1605			 * If remaining data exceeds the mtu,
1606			 * we know we need more fragment(s).
1607			 */
1608			datalen = length + fraggap;
1609
1610			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1611				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1612			fraglen = datalen + fragheaderlen;
1613			pagedlen = 0;
1614
1615			alloc_extra = hh_len;
1616			alloc_extra += dst_exthdrlen;
1617			alloc_extra += rt->dst.trailer_len;
1618
1619			/* We just reserve space for fragment header.
1620			 * Note: this may be overallocation if the message
1621			 * (without MSG_MORE) fits into the MTU.
1622			 */
1623			alloc_extra += sizeof(struct frag_hdr);
1624
1625			if ((flags & MSG_MORE) &&
1626			    !(rt->dst.dev->features&NETIF_F_SG))
1627				alloclen = mtu;
1628			else if (!paged &&
1629				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1630				  !(rt->dst.dev->features & NETIF_F_SG)))
1631				alloclen = fraglen;
1632			else {
1633				alloclen = min_t(int, fraglen, MAX_HEADER);
1634				pagedlen = fraglen - alloclen;
1635			}
1636			alloclen += alloc_extra;
1637
1638			if (datalen != length + fraggap) {
1639				/*
1640				 * this is not the last fragment, the trailer
1641				 * space is regarded as data space.
1642				 */
1643				datalen += rt->dst.trailer_len;
1644			}
1645
1646			fraglen = datalen + fragheaderlen;
1647
1648			copy = datalen - transhdrlen - fraggap - pagedlen;
1649			if (copy < 0) {
1650				err = -EINVAL;
1651				goto error;
1652			}
1653			if (transhdrlen) {
1654				skb = sock_alloc_send_skb(sk, alloclen,
1655						(flags & MSG_DONTWAIT), &err);
1656			} else {
1657				skb = NULL;
1658				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1659				    2 * sk->sk_sndbuf)
1660					skb = alloc_skb(alloclen,
1661							sk->sk_allocation);
1662				if (unlikely(!skb))
1663					err = -ENOBUFS;
1664			}
1665			if (!skb)
1666				goto error;
1667			/*
1668			 *	Fill in the control structures
1669			 */
1670			skb->protocol = htons(ETH_P_IPV6);
1671			skb->ip_summed = csummode;
1672			skb->csum = 0;
1673			/* reserve for fragmentation and ipsec header */
1674			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1675				    dst_exthdrlen);
1676
1677			/*
1678			 *	Find where to start putting bytes
1679			 */
1680			data = skb_put(skb, fraglen - pagedlen);
1681			skb_set_network_header(skb, exthdrlen);
1682			data += fragheaderlen;
1683			skb->transport_header = (skb->network_header +
1684						 fragheaderlen);
1685			if (fraggap) {
1686				skb->csum = skb_copy_and_csum_bits(
1687					skb_prev, maxfraglen,
1688					data + transhdrlen, fraggap);
1689				skb_prev->csum = csum_sub(skb_prev->csum,
1690							  skb->csum);
1691				data += fraggap;
1692				pskb_trim_unique(skb_prev, maxfraglen);
1693			}
1694			if (copy > 0 &&
1695			    getfrag(from, data + transhdrlen, offset,
1696				    copy, fraggap, skb) < 0) {
1697				err = -EFAULT;
1698				kfree_skb(skb);
1699				goto error;
1700			}
1701
1702			offset += copy;
1703			length -= copy + transhdrlen;
1704			transhdrlen = 0;
1705			exthdrlen = 0;
1706			dst_exthdrlen = 0;
1707
1708			/* Only the initial fragment is time stamped */
1709			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1710			cork->tx_flags = 0;
1711			skb_shinfo(skb)->tskey = tskey;
1712			tskey = 0;
1713			skb_zcopy_set(skb, uarg, &extra_uref);
1714
1715			if ((flags & MSG_CONFIRM) && !skb_prev)
1716				skb_set_dst_pending_confirm(skb, 1);
1717
1718			/*
1719			 * Put the packet on the pending queue
1720			 */
1721			if (!skb->destructor) {
1722				skb->destructor = sock_wfree;
1723				skb->sk = sk;
1724				wmem_alloc_delta += skb->truesize;
1725			}
1726			__skb_queue_tail(queue, skb);
1727			continue;
1728		}
1729
1730		if (copy > length)
1731			copy = length;
1732
1733		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1734		    skb_tailroom(skb) >= copy) {
1735			unsigned int off;
1736
1737			off = skb->len;
1738			if (getfrag(from, skb_put(skb, copy),
1739						offset, copy, off, skb) < 0) {
1740				__skb_trim(skb, off);
1741				err = -EFAULT;
1742				goto error;
1743			}
1744		} else if (!uarg || !uarg->zerocopy) {
1745			int i = skb_shinfo(skb)->nr_frags;
1746
1747			err = -ENOMEM;
1748			if (!sk_page_frag_refill(sk, pfrag))
1749				goto error;
1750
1751			if (!skb_can_coalesce(skb, i, pfrag->page,
1752					      pfrag->offset)) {
1753				err = -EMSGSIZE;
1754				if (i == MAX_SKB_FRAGS)
1755					goto error;
1756
1757				__skb_fill_page_desc(skb, i, pfrag->page,
1758						     pfrag->offset, 0);
1759				skb_shinfo(skb)->nr_frags = ++i;
1760				get_page(pfrag->page);
1761			}
1762			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1763			if (getfrag(from,
1764				    page_address(pfrag->page) + pfrag->offset,
1765				    offset, copy, skb->len, skb) < 0)
1766				goto error_efault;
1767
1768			pfrag->offset += copy;
1769			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1770			skb->len += copy;
1771			skb->data_len += copy;
1772			skb->truesize += copy;
1773			wmem_alloc_delta += copy;
1774		} else {
1775			err = skb_zerocopy_iter_dgram(skb, from, copy);
1776			if (err < 0)
1777				goto error;
1778		}
1779		offset += copy;
1780		length -= copy;
1781	}
1782
1783	if (wmem_alloc_delta)
1784		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1785	return 0;
1786
1787error_efault:
1788	err = -EFAULT;
1789error:
1790	net_zcopy_put_abort(uarg, extra_uref);
1791	cork->length -= length;
1792	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1793	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1794	return err;
1795}
1796
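/*
 * Corked-send usage sketch (illustrative only; the raw and ICMPv6
 * sendmsg paths follow roughly this shape):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 */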
1797int ip6_append_data(struct sock *sk,
1798		    int getfrag(void *from, char *to, int offset, int len,
1799				int odd, struct sk_buff *skb),
1800		    void *from, int length, int transhdrlen,
1801		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1802		    struct rt6_info *rt, unsigned int flags)
1803{
1804	struct inet_sock *inet = inet_sk(sk);
1805	struct ipv6_pinfo *np = inet6_sk(sk);
1806	int exthdrlen;
1807	int err;
1808
1809	if (flags&MSG_PROBE)
1810		return 0;
1811	if (skb_queue_empty(&sk->sk_write_queue)) {
1812		/*
1813		 * setup for corking
1814		 */
1815		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1816				     ipc6, rt, fl6);
1817		if (err)
1818			return err;
1819
1820		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1821		length += exthdrlen;
1822		transhdrlen += exthdrlen;
1823	} else {
1824		fl6 = &inet->cork.fl.u.ip6;
1825		transhdrlen = 0;
1826	}
1827
1828	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1829				 &np->cork, sk_page_frag(sk), getfrag,
1830				 from, length, transhdrlen, flags, ipc6);
1831}
1832EXPORT_SYMBOL_GPL(ip6_append_data);
1833
1834static void ip6_cork_release(struct inet_cork_full *cork,
1835			     struct inet6_cork *v6_cork)
1836{
1837	if (v6_cork->opt) {
1838		kfree(v6_cork->opt->dst0opt);
1839		kfree(v6_cork->opt->dst1opt);
1840		kfree(v6_cork->opt->hopopt);
1841		kfree(v6_cork->opt->srcrt);
1842		kfree(v6_cork->opt);
1843		v6_cork->opt = NULL;
1844	}
1845
1846	if (cork->base.dst) {
1847		dst_release(cork->base.dst);
1848		cork->base.dst = NULL;
1849		cork->base.flags &= ~IPCORK_ALLFRAG;
1850	}
1851	memset(&cork->fl, 0, sizeof(cork->fl));
1852}
1853
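/*
 * Collapse the queue built by __ip6_append_data() into one skb (the
 * remaining buffers chained on frag_list), push extension headers and
 * fill in the IPv6 header from the cork; the result is ready for
 * ip6_send_skb().
 */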
1854struct sk_buff *__ip6_make_skb(struct sock *sk,
1855			       struct sk_buff_head *queue,
1856			       struct inet_cork_full *cork,
1857			       struct inet6_cork *v6_cork)
1858{
1859	struct sk_buff *skb, *tmp_skb;
1860	struct sk_buff **tail_skb;
1861	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1862	struct ipv6_pinfo *np = inet6_sk(sk);
1863	struct net *net = sock_net(sk);
1864	struct ipv6hdr *hdr;
1865	struct ipv6_txoptions *opt = v6_cork->opt;
1866	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1867	struct flowi6 *fl6 = &cork->fl.u.ip6;
1868	unsigned char proto = fl6->flowi6_proto;
1869
1870	skb = __skb_dequeue(queue);
1871	if (!skb)
1872		goto out;
1873	tail_skb = &(skb_shinfo(skb)->frag_list);
1874
1875	/* move skb->data to ip header from ext header */
1876	if (skb->data < skb_network_header(skb))
1877		__skb_pull(skb, skb_network_offset(skb));
1878	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1879		__skb_pull(tmp_skb, skb_network_header_len(skb));
1880		*tail_skb = tmp_skb;
1881		tail_skb = &(tmp_skb->next);
1882		skb->len += tmp_skb->len;
1883		skb->data_len += tmp_skb->len;
1884		skb->truesize += tmp_skb->truesize;
1885		tmp_skb->destructor = NULL;
1886		tmp_skb->sk = NULL;
1887	}
1888
1889	/* Allow local fragmentation. */
1890	skb->ignore_df = ip6_sk_ignore_df(sk);
1891
1892	*final_dst = fl6->daddr;
1893	__skb_pull(skb, skb_network_header_len(skb));
1894	if (opt && opt->opt_flen)
1895		ipv6_push_frag_opts(skb, opt, &proto);
1896	if (opt && opt->opt_nflen)
1897		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1898
1899	skb_push(skb, sizeof(struct ipv6hdr));
1900	skb_reset_network_header(skb);
1901	hdr = ipv6_hdr(skb);
1902
1903	ip6_flow_hdr(hdr, v6_cork->tclass,
1904		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1905					ip6_autoflowlabel(net, np), fl6));
1906	hdr->hop_limit = v6_cork->hop_limit;
1907	hdr->nexthdr = proto;
1908	hdr->saddr = fl6->saddr;
1909	hdr->daddr = *final_dst;
1910
1911	skb->priority = sk->sk_priority;
1912	skb->mark = cork->base.mark;
1913
1914	skb->tstamp = cork->base.transmit_time;
1915
1916	skb_dst_set(skb, dst_clone(&rt->dst));
1917	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1918	if (proto == IPPROTO_ICMPV6) {
1919		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1920
1921		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1922		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1923	}
1924
1925	ip6_cork_release(cork, v6_cork);
1926out:
1927	return skb;
1928}
1929
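/*
 *	ip6_send_skb() passes a finished datagram to ip6_local_out().
 *	Positive NET_XMIT_* codes are mapped through net_xmit_errno(), so
 *	congestion notification (NET_XMIT_CN) counts as success while real
 *	failures bump OUTDISCARDS.
 */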
1930int ip6_send_skb(struct sk_buff *skb)
1931{
1932	struct net *net = sock_net(skb->sk);
1933	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1934	int err;
1935
1936	err = ip6_local_out(net, skb->sk, skb);
1937	if (err) {
1938		if (err > 0)
1939			err = net_xmit_errno(err);
1940		if (err)
1941			IP6_INC_STATS(net, rt->rt6i_idev,
1942				      IPSTATS_MIB_OUTDISCARDS);
1943	}
1944
1945	return err;
1946}
1947
1948int ip6_push_pending_frames(struct sock *sk)
1949{
1950	struct sk_buff *skb;
1951
1952	skb = ip6_finish_skb(sk);
1953	if (!skb)
1954		return 0;
1955
1956	return ip6_send_skb(skb);
1957}
1958EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
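/*
 * ip6_finish_skb() used above is, at least in this kernel series, a thin
 * inline in include/net/ipv6.h that runs __ip6_make_skb() on
 * sk->sk_write_queue with the socket's own cork state.
 */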
1959
1960static void __ip6_flush_pending_frames(struct sock *sk,
1961				       struct sk_buff_head *queue,
1962				       struct inet_cork_full *cork,
1963				       struct inet6_cork *v6_cork)
1964{
1965	struct sk_buff *skb;
1966
1967	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1968		if (skb_dst(skb))
1969			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1970				      IPSTATS_MIB_OUTDISCARDS);
1971		kfree_skb(skb);
1972	}
1973
1974	ip6_cork_release(cork, v6_cork);
1975}
1976
1977void ip6_flush_pending_frames(struct sock *sk)
1978{
1979	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1980				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1981}
1982EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
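/*
 *	ip6_make_skb() is the single-shot variant of the corking API: it
 *	builds the whole datagram on a private queue with a caller-supplied
 *	cork, so nothing touches sk->sk_write_queue.  UDPv6 uses it for the
 *	uncorked fast path.
 */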
1983
1984struct sk_buff *ip6_make_skb(struct sock *sk,
1985			     int getfrag(void *from, char *to, int offset,
1986					 int len, int odd, struct sk_buff *skb),
1987			     void *from, int length, int transhdrlen,
1988			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1989			     struct rt6_info *rt, unsigned int flags,
1990			     struct inet_cork_full *cork)
1991{
1992	struct inet6_cork v6_cork;
1993	struct sk_buff_head queue;
1994	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1995	int err;
1996
1997	if (flags & MSG_PROBE)
1998		return NULL;
1999
2000	__skb_queue_head_init(&queue);
2001
2002	cork->base.flags = 0;
2003	cork->base.addr = 0;
2004	cork->base.opt = NULL;
2005	cork->base.dst = NULL;
2006	v6_cork.opt = NULL;
2007	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
2008	if (err) {
2009		ip6_cork_release(cork, &v6_cork);
2010		return ERR_PTR(err);
2011	}
2012	if (ipc6->dontfrag < 0)
2013		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2014
2015	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
2016				&current->task_frag, getfrag, from,
2017				length + exthdrlen, transhdrlen + exthdrlen,
2018				flags, ipc6);
2019	if (err) {
2020		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2021		return ERR_PTR(err);
2022	}
2023
2024	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2025}
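/*
 * Illustrative sketch (not part of this file): the single-shot pattern
 * pairs ip6_make_skb() with ip6_send_skb().  "my_getfrag" and the setup
 * of ipc6/fl6/rt/cork are assumptions for illustration.
 *
 *	skb = ip6_make_skb(sk, my_getfrag, msg, len, transhdrlen,
 *			   &ipc6, &fl6, rt, msg->msg_flags, &cork);
 *	err = PTR_ERR_OR_ZERO(skb);
 *	if (!err && skb)
 *		err = ip6_send_skb(skb);
 */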
v5.4
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 *	IPv6 output functions
   4 *	Linux INET6 implementation
   5 *
   6 *	Authors:
   7 *	Pedro Roque		<roque@di.fc.ul.pt>
   8 *
   9 *	Based on linux/net/ipv4/ip_output.c
  10 *
  11 *	Changes:
  12 *	A.N.Kuznetsov	:	airthmetics in fragmentation.
  13 *				extension headers are implemented.
  14 *				route changes now work.
  15 *				ip6_forward does not confuse sniffers.
  16 *				etc.
  17 *
  18 *      H. von Brand    :       Added missing #include <linux/string.h>
  19 *	Imran Patel	:	frag id should be in NBO
  20 *      Kazunori MIYAZAWA @USAGI
  21 *			:       add ip6_append_data and related functions
  22 *				for datagram xmit
  23 */
  24
  25#include <linux/errno.h>
  26#include <linux/kernel.h>
  27#include <linux/string.h>
  28#include <linux/socket.h>
  29#include <linux/net.h>
  30#include <linux/netdevice.h>
  31#include <linux/if_arp.h>
  32#include <linux/in6.h>
  33#include <linux/tcp.h>
  34#include <linux/route.h>
  35#include <linux/module.h>
  36#include <linux/slab.h>
  37
  38#include <linux/bpf-cgroup.h>
  39#include <linux/netfilter.h>
  40#include <linux/netfilter_ipv6.h>
  41
  42#include <net/sock.h>
  43#include <net/snmp.h>
  44
  45#include <net/ipv6.h>
  46#include <net/ndisc.h>
  47#include <net/protocol.h>
  48#include <net/ip6_route.h>
  49#include <net/addrconf.h>
  50#include <net/rawv6.h>
  51#include <net/icmp.h>
  52#include <net/xfrm.h>
  53#include <net/checksum.h>
  54#include <linux/mroute6.h>
  55#include <net/l3mdev.h>
  56#include <net/lwtunnel.h>
 
  57
  58static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  59{
  60	struct dst_entry *dst = skb_dst(skb);
  61	struct net_device *dev = dst->dev;
 
 
  62	const struct in6_addr *nexthop;
  63	struct neighbour *neigh;
  64	int ret;
  65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  66	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  67		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  68
  69		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  70		    ((mroute6_is_socket(net, skb) &&
  71		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  72		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  73					 &ipv6_hdr(skb)->saddr))) {
  74			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  75
  76			/* Do not check for IFF_ALLMULTI; multicast routing
  77			   is not supported in any case.
  78			 */
  79			if (newskb)
  80				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  81					net, sk, newskb, NULL, newskb->dev,
  82					dev_loopback_xmit);
  83
  84			if (ipv6_hdr(skb)->hop_limit == 0) {
  85				IP6_INC_STATS(net, idev,
  86					      IPSTATS_MIB_OUTDISCARDS);
  87				kfree_skb(skb);
  88				return 0;
  89			}
  90		}
  91
  92		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  93
  94		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  95		    IPV6_ADDR_SCOPE_NODELOCAL &&
  96		    !(dev->flags & IFF_LOOPBACK)) {
  97			kfree_skb(skb);
  98			return 0;
  99		}
 100	}
 101
 102	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 103		int res = lwtunnel_xmit(skb);
 104
 105		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 106			return res;
 107	}
 108
 109	rcu_read_lock_bh();
 110	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 111	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 112	if (unlikely(!neigh))
 113		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 114	if (!IS_ERR(neigh)) {
 115		sock_confirm_neigh(skb, neigh);
 116		ret = neigh_output(neigh, skb, false);
 117		rcu_read_unlock_bh();
 118		return ret;
 119	}
 120	rcu_read_unlock_bh();
 121
 122	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 123	kfree_skb(skb);
 124	return -EINVAL;
 125}
 126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 127static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 128{
 
 
 129#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 130	/* Policy lookup after SNAT yielded a new policy */
 131	if (skb_dst(skb)->xfrm) {
 132		IPCB(skb)->flags |= IPSKB_REROUTED;
 133		return dst_output(net, sk, skb);
 134	}
 135#endif
 136
 137	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 
 
 
 
 138	    dst_allfrag(skb_dst(skb)) ||
 139	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 140		return ip6_fragment(net, sk, skb, ip6_finish_output2);
 141	else
 142		return ip6_finish_output2(net, sk, skb);
 143}
 144
 145static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 146{
 147	int ret;
 148
 149	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 150	switch (ret) {
 151	case NET_XMIT_SUCCESS:
 152		return __ip6_finish_output(net, sk, skb);
 153	case NET_XMIT_CN:
 154		return __ip6_finish_output(net, sk, skb) ? : ret;
 155	default:
 156		kfree_skb(skb);
 157		return ret;
 158	}
 159}
 160
 161int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 162{
 163	struct net_device *dev = skb_dst(skb)->dev;
 164	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 165
 166	skb->protocol = htons(ETH_P_IPV6);
 167	skb->dev = dev;
 168
 169	if (unlikely(idev->cnf.disable_ipv6)) {
 170		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 171		kfree_skb(skb);
 172		return 0;
 173	}
 174
 175	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 176			    net, sk, skb, NULL, dev,
 177			    ip6_finish_output,
 178			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 179}
 
 180
 181bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 182{
 183	if (!np->autoflowlabel_set)
 184		return ip6_default_np_autolabel(net);
 185	else
 186		return np->autoflowlabel;
 187}
 188
 189/*
 190 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 191 * Note : socket lock is not held for SYNACK packets, but might be modified
 192 * by calls to skb_set_owner_w() and ipv6_local_error(),
 193 * which are using proper atomic operations or spinlocks.
 194 */
 195int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 196	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
 197{
 198	struct net *net = sock_net(sk);
 199	const struct ipv6_pinfo *np = inet6_sk(sk);
 200	struct in6_addr *first_hop = &fl6->daddr;
 201	struct dst_entry *dst = skb_dst(skb);
 202	unsigned int head_room;
 203	struct ipv6hdr *hdr;
 204	u8  proto = fl6->flowi6_proto;
 205	int seg_len = skb->len;
 206	int hlimit = -1;
 207	u32 mtu;
 208
 209	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 210	if (opt)
 211		head_room += opt->opt_nflen + opt->opt_flen;
 212
 213	if (unlikely(skb_headroom(skb) < head_room)) {
 214		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 215		if (!skb2) {
 216			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 217				      IPSTATS_MIB_OUTDISCARDS);
 218			kfree_skb(skb);
 219			return -ENOBUFS;
 220		}
 221		if (skb->sk)
 222			skb_set_owner_w(skb2, skb->sk);
 223		consume_skb(skb);
 224		skb = skb2;
 225	}
 226
 227	if (opt) {
 228		seg_len += opt->opt_nflen + opt->opt_flen;
 229
 230		if (opt->opt_flen)
 231			ipv6_push_frag_opts(skb, opt, &proto);
 232
 233		if (opt->opt_nflen)
 234			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 235					     &fl6->saddr);
 236	}
 237
 238	skb_push(skb, sizeof(struct ipv6hdr));
 239	skb_reset_network_header(skb);
 240	hdr = ipv6_hdr(skb);
 241
 242	/*
 243	 *	Fill in the IPv6 header
 244	 */
 245	if (np)
 246		hlimit = np->hop_limit;
 247	if (hlimit < 0)
 248		hlimit = ip6_dst_hoplimit(dst);
 249
 250	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 251				ip6_autoflowlabel(net, np), fl6));
 252
 253	hdr->payload_len = htons(seg_len);
 254	hdr->nexthdr = proto;
 255	hdr->hop_limit = hlimit;
 256
 257	hdr->saddr = fl6->saddr;
 258	hdr->daddr = *first_hop;
 259
 260	skb->protocol = htons(ETH_P_IPV6);
 261	skb->priority = priority;
 262	skb->mark = mark;
 263
 264	mtu = dst_mtu(dst);
 265	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 266		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 267			      IPSTATS_MIB_OUT, skb->len);
 268
 269		/* if egress device is enslaved to an L3 master device pass the
 270		 * skb to its handler for processing
 271		 */
 272		skb = l3mdev_ip6_out((struct sock *)sk, skb);
 273		if (unlikely(!skb))
 274			return 0;
 275
 276		/* hooks should never assume socket lock is held.
 277		 * we promote our socket to non const
 278		 */
 279		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 280			       net, (struct sock *)sk, skb, NULL, dst->dev,
 281			       dst_output);
 282	}
 283
 284	skb->dev = dst->dev;
 285	/* ipv6_local_error() does not require socket lock,
 286	 * we promote our socket to non const
 287	 */
 288	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 289
 290	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 291	kfree_skb(skb);
 292	return -EMSGSIZE;
 293}
 294EXPORT_SYMBOL(ip6_xmit);
 295
 296static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 297{
 298	struct ip6_ra_chain *ra;
 299	struct sock *last = NULL;
 300
 301	read_lock(&ip6_ra_lock);
 302	for (ra = ip6_ra_chain; ra; ra = ra->next) {
 303		struct sock *sk = ra->sk;
 304		if (sk && ra->sel == sel &&
 305		    (!sk->sk_bound_dev_if ||
 306		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
 307			struct ipv6_pinfo *np = inet6_sk(sk);
 308
 309			if (np && np->rtalert_isolate &&
 310			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
 311				continue;
 312			}
 313			if (last) {
 314				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 315				if (skb2)
 316					rawv6_rcv(last, skb2);
 317			}
 318			last = sk;
 319		}
 320	}
 321
 322	if (last) {
 323		rawv6_rcv(last, skb);
 324		read_unlock(&ip6_ra_lock);
 325		return 1;
 326	}
 327	read_unlock(&ip6_ra_lock);
 328	return 0;
 329}
 330
 331static int ip6_forward_proxy_check(struct sk_buff *skb)
 332{
 333	struct ipv6hdr *hdr = ipv6_hdr(skb);
 334	u8 nexthdr = hdr->nexthdr;
 335	__be16 frag_off;
 336	int offset;
 337
 338	if (ipv6_ext_hdr(nexthdr)) {
 339		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 340		if (offset < 0)
 341			return 0;
 342	} else
 343		offset = sizeof(struct ipv6hdr);
 344
 345	if (nexthdr == IPPROTO_ICMPV6) {
 346		struct icmp6hdr *icmp6;
 347
 348		if (!pskb_may_pull(skb, (skb_network_header(skb) +
 349					 offset + 1 - skb->data)))
 350			return 0;
 351
 352		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 353
 354		switch (icmp6->icmp6_type) {
 355		case NDISC_ROUTER_SOLICITATION:
 356		case NDISC_ROUTER_ADVERTISEMENT:
 357		case NDISC_NEIGHBOUR_SOLICITATION:
 358		case NDISC_NEIGHBOUR_ADVERTISEMENT:
 359		case NDISC_REDIRECT:
 360			/* For reaction involving unicast neighbor discovery
 361			 * message destined to the proxied address, pass it to
 362			 * input function.
 363			 */
 364			return 1;
 365		default:
 366			break;
 367		}
 368	}
 369
 370	/*
 371	 * The proxying router can't forward traffic sent to a link-local
 372	 * address, so signal the sender and discard the packet. This
 373	 * behavior is clarified by the MIPv6 specification.
 374	 */
 375	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 376		dst_link_failure(skb);
 377		return -1;
 378	}
 379
 380	return 0;
 381}
 382
 383static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 384				     struct sk_buff *skb)
 385{
 386	struct dst_entry *dst = skb_dst(skb);
 387
 388	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 389	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 390
 391#ifdef CONFIG_NET_SWITCHDEV
 392	if (skb->offload_l3_fwd_mark) {
 393		consume_skb(skb);
 394		return 0;
 395	}
 396#endif
 397
 398	skb->tstamp = 0;
 399	return dst_output(net, sk, skb);
 400}
 401
 402static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 403{
 404	if (skb->len <= mtu)
 405		return false;
 406
 407	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 408	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 409		return true;
 410
 411	if (skb->ignore_df)
 412		return false;
 413
 414	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 415		return false;
 416
 417	return true;
 418}
 419
 420int ip6_forward(struct sk_buff *skb)
 421{
 422	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
 423	struct dst_entry *dst = skb_dst(skb);
 424	struct ipv6hdr *hdr = ipv6_hdr(skb);
 425	struct inet6_skb_parm *opt = IP6CB(skb);
 426	struct net *net = dev_net(dst->dev);
 
 427	u32 mtu;
 428
 
 429	if (net->ipv6.devconf_all->forwarding == 0)
 430		goto error;
 431
 432	if (skb->pkt_type != PACKET_HOST)
 433		goto drop;
 434
 435	if (unlikely(skb->sk))
 436		goto drop;
 437
 438	if (skb_warn_if_lro(skb))
 439		goto drop;
 440
 441	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 
 
 442		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 443		goto drop;
 444	}
 445
 446	skb_forward_csum(skb);
 447
 448	/*
 449	 *	We DO NOT make any processing on
 450	 *	RA packets, pushing them to user level AS IS
 451	 *	without ane WARRANTY that application will be able
 452	 *	to interpret them. The reason is that we
 453	 *	cannot make anything clever here.
 454	 *
 455	 *	We are not end-node, so that if packet contains
 456	 *	AH/ESP, we cannot make anything.
 457	 *	Defragmentation also would be mistake, RA packets
 458	 *	cannot be fragmented, because there is no warranty
 459	 *	that different fragments will go along one path. --ANK
 460	 */
 461	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 462		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 463			return 0;
 464	}
 465
 466	/*
 467	 *	check and decrement ttl
 468	 */
 469	if (hdr->hop_limit <= 1) {
 470		/* Force OUTPUT device used as source address */
 471		skb->dev = dst->dev;
 472		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 473		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
 474
 475		kfree_skb(skb);
 476		return -ETIMEDOUT;
 477	}
 478
 479	/* XXX: idev->cnf.proxy_ndp? */
 480	if (net->ipv6.devconf_all->proxy_ndp &&
 481	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 482		int proxied = ip6_forward_proxy_check(skb);
 483		if (proxied > 0)
 
 484			return ip6_input(skb);
 485		else if (proxied < 0) {
 486			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 487			goto drop;
 488		}
 489	}
 490
 491	if (!xfrm6_route_forward(skb)) {
 492		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
 493		goto drop;
 494	}
 495	dst = skb_dst(skb);
 496
 497	/* IPv6 specs say nothing about it, but it is clear that we cannot
 498	   send redirects to source routed frames.
 499	   We don't send redirects to frames decapsulated from IPsec.
 500	 */
 501	if (IP6CB(skb)->iif == dst->dev->ifindex &&
 502	    opt->srcrt == 0 && !skb_sec_path(skb)) {
 503		struct in6_addr *target = NULL;
 504		struct inet_peer *peer;
 505		struct rt6_info *rt;
 506
 507		/*
 508		 *	incoming and outgoing devices are the same
 509		 *	send a redirect.
 510		 */
 511
 512		rt = (struct rt6_info *) dst;
 513		if (rt->rt6i_flags & RTF_GATEWAY)
 514			target = &rt->rt6i_gateway;
 515		else
 516			target = &hdr->daddr;
 517
 518		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 519
 520		/* Limit redirects both by destination (here)
 521		   and by source (inside ndisc_send_redirect)
 522		 */
 523		if (inet_peer_xrlim_allow(peer, 1*HZ))
 524			ndisc_send_redirect(skb, target);
 525		if (peer)
 526			inet_putpeer(peer);
 527	} else {
 528		int addrtype = ipv6_addr_type(&hdr->saddr);
 529
 530		/* This check is security critical. */
 531		if (addrtype == IPV6_ADDR_ANY ||
 532		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 533			goto error;
 534		if (addrtype & IPV6_ADDR_LINKLOCAL) {
 535			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 536				    ICMPV6_NOT_NEIGHBOUR, 0);
 537			goto error;
 538		}
 539	}
 540
 541	mtu = ip6_dst_mtu_forward(dst);
 542	if (mtu < IPV6_MIN_MTU)
 543		mtu = IPV6_MIN_MTU;
 544
 545	if (ip6_pkt_too_big(skb, mtu)) {
 546		/* Again, force OUTPUT device used as source address */
 547		skb->dev = dst->dev;
 548		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 549		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
 550		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 551				IPSTATS_MIB_FRAGFAILS);
 552		kfree_skb(skb);
 553		return -EMSGSIZE;
 554	}
 555
 556	if (skb_cow(skb, dst->dev->hard_header_len)) {
 557		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 558				IPSTATS_MIB_OUTDISCARDS);
 559		goto drop;
 560	}
 561
 562	hdr = ipv6_hdr(skb);
 563
 564	/* Mangling hops number delayed to point after skb COW */
 565
 566	hdr->hop_limit--;
 567
 568	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 569		       net, NULL, skb, skb->dev, dst->dev,
 570		       ip6_forward_finish);
 571
 572error:
 573	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
 574drop:
 575	kfree_skb(skb);
 576	return -EINVAL;
 577}
 578
 579static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 580{
 581	to->pkt_type = from->pkt_type;
 582	to->priority = from->priority;
 583	to->protocol = from->protocol;
 584	skb_dst_drop(to);
 585	skb_dst_set(to, dst_clone(skb_dst(from)));
 586	to->dev = from->dev;
 587	to->mark = from->mark;
 588
 589	skb_copy_hash(to, from);
 590
 591#ifdef CONFIG_NET_SCHED
 592	to->tc_index = from->tc_index;
 593#endif
 594	nf_copy(to, from);
 595	skb_ext_copy(to, from);
 596	skb_copy_secmark(to, from);
 597}
 598
 599int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
 600		      u8 nexthdr, __be32 frag_id,
 601		      struct ip6_fraglist_iter *iter)
 602{
 603	unsigned int first_len;
 604	struct frag_hdr *fh;
 605
 606	/* BUILD HEADER */
 607	*prevhdr = NEXTHDR_FRAGMENT;
 608	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 609	if (!iter->tmp_hdr)
 610		return -ENOMEM;
 611
 612	iter->frag = skb_shinfo(skb)->frag_list;
 613	skb_frag_list_init(skb);
 614
 615	iter->offset = 0;
 616	iter->hlen = hlen;
 617	iter->frag_id = frag_id;
 618	iter->nexthdr = nexthdr;
 619
 620	__skb_pull(skb, hlen);
 621	fh = __skb_push(skb, sizeof(struct frag_hdr));
 622	__skb_push(skb, hlen);
 623	skb_reset_network_header(skb);
 624	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
 625
 626	fh->nexthdr = nexthdr;
 627	fh->reserved = 0;
 628	fh->frag_off = htons(IP6_MF);
 629	fh->identification = frag_id;
 630
 631	first_len = skb_pagelen(skb);
 632	skb->data_len = first_len - skb_headlen(skb);
 633	skb->len = first_len;
 634	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
 635
 636	return 0;
 637}
 638EXPORT_SYMBOL(ip6_fraglist_init);
 639
 640void ip6_fraglist_prepare(struct sk_buff *skb,
 641			  struct ip6_fraglist_iter *iter)
 642{
 643	struct sk_buff *frag = iter->frag;
 644	unsigned int hlen = iter->hlen;
 645	struct frag_hdr *fh;
 646
 647	frag->ip_summed = CHECKSUM_NONE;
 648	skb_reset_transport_header(frag);
 649	fh = __skb_push(frag, sizeof(struct frag_hdr));
 650	__skb_push(frag, hlen);
 651	skb_reset_network_header(frag);
 652	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
 653	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
 654	fh->nexthdr = iter->nexthdr;
 655	fh->reserved = 0;
 656	fh->frag_off = htons(iter->offset);
 657	if (frag->next)
 658		fh->frag_off |= htons(IP6_MF);
 659	fh->identification = iter->frag_id;
 660	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 661	ip6_copy_metadata(frag, skb);
 662}
 663EXPORT_SYMBOL(ip6_fraglist_prepare);
 664
 665void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
 666		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
 667		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
 668{
 669	state->prevhdr = prevhdr;
 670	state->nexthdr = nexthdr;
 671	state->frag_id = frag_id;
 672
 673	state->hlen = hlen;
 674	state->mtu = mtu;
 675
 676	state->left = skb->len - hlen;	/* Space per frame */
 677	state->ptr = hlen;		/* Where to start from */
 678
 679	state->hroom = hdr_room;
 680	state->troom = needed_tailroom;
 681
 682	state->offset = 0;
 683}
 684EXPORT_SYMBOL(ip6_frag_init);
 685
 686struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
 687{
 688	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
 689	struct sk_buff *frag;
 690	struct frag_hdr *fh;
 691	unsigned int len;
 692
 693	len = state->left;
 694	/* IF: it doesn't fit, use 'mtu' - the data space left */
 695	if (len > state->mtu)
 696		len = state->mtu;
 697	/* IF: we are not sending up to and including the packet end
 698	   then align the next start on an eight byte boundary */
 699	if (len < state->left)
 700		len &= ~7;
 701
 702	/* Allocate buffer */
 703	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
 704			 state->hroom + state->troom, GFP_ATOMIC);
 705	if (!frag)
 706		return ERR_PTR(-ENOMEM);
 707
 708	/*
 709	 *	Set up data on packet
 710	 */
 711
 712	ip6_copy_metadata(frag, skb);
 713	skb_reserve(frag, state->hroom);
 714	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
 715	skb_reset_network_header(frag);
 716	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
 717	frag->transport_header = (frag->network_header + state->hlen +
 718				  sizeof(struct frag_hdr));
 719
 720	/*
 721	 *	Charge the memory for the fragment to any owner
 722	 *	it might possess
 723	 */
 724	if (skb->sk)
 725		skb_set_owner_w(frag, skb->sk);
 726
 727	/*
 728	 *	Copy the packet header into the new buffer.
 729	 */
 730	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
 731
 732	fragnexthdr_offset = skb_network_header(frag);
 733	fragnexthdr_offset += prevhdr - skb_network_header(skb);
 734	*fragnexthdr_offset = NEXTHDR_FRAGMENT;
 735
 736	/*
 737	 *	Build fragment header.
 738	 */
 739	fh->nexthdr = state->nexthdr;
 740	fh->reserved = 0;
 741	fh->identification = state->frag_id;
 742
 743	/*
 744	 *	Copy a block of the IP datagram.
 745	 */
 746	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
 747			     len));
 748	state->left -= len;
 749
 750	fh->frag_off = htons(state->offset);
 751	if (state->left > 0)
 752		fh->frag_off |= htons(IP6_MF);
 753	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
 754
 755	state->ptr += len;
 756	state->offset += len;
 757
 758	return frag;
 759}
 760EXPORT_SYMBOL(ip6_frag_next);
 761
 762int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 763		 int (*output)(struct net *, struct sock *, struct sk_buff *))
 764{
 765	struct sk_buff *frag;
 766	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 767	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 768				inet6_sk(skb->sk) : NULL;
 769	struct ip6_frag_state state;
 770	unsigned int mtu, hlen, nexthdr_offset;
 771	ktime_t tstamp = skb->tstamp;
 772	int hroom, err = 0;
 773	__be32 frag_id;
 774	u8 *prevhdr, nexthdr = 0;
 775
 776	err = ip6_find_1stfragopt(skb, &prevhdr);
 777	if (err < 0)
 778		goto fail;
 779	hlen = err;
 780	nexthdr = *prevhdr;
 781	nexthdr_offset = prevhdr - skb_network_header(skb);
 782
 783	mtu = ip6_skb_dst_mtu(skb);
 784
 785	/* We must not fragment if the socket is set to force MTU discovery
 786	 * or if the skb it not generated by a local socket.
 787	 */
 788	if (unlikely(!skb->ignore_df && skb->len > mtu))
 789		goto fail_toobig;
 790
 791	if (IP6CB(skb)->frag_max_size) {
 792		if (IP6CB(skb)->frag_max_size > mtu)
 793			goto fail_toobig;
 794
 795		/* don't send fragments larger than what we received */
 796		mtu = IP6CB(skb)->frag_max_size;
 797		if (mtu < IPV6_MIN_MTU)
 798			mtu = IPV6_MIN_MTU;
 799	}
 800
 801	if (np && np->frag_size < mtu) {
 802		if (np->frag_size)
 803			mtu = np->frag_size;
 804	}
 805	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 806		goto fail_toobig;
 807	mtu -= hlen + sizeof(struct frag_hdr);
 808
 809	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 810				    &ipv6_hdr(skb)->saddr);
 811
 812	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 813	    (err = skb_checksum_help(skb)))
 814		goto fail;
 815
 816	prevhdr = skb_network_header(skb) + nexthdr_offset;
 817	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 818	if (skb_has_frag_list(skb)) {
 819		unsigned int first_len = skb_pagelen(skb);
 820		struct ip6_fraglist_iter iter;
 821		struct sk_buff *frag2;
 822
 823		if (first_len - hlen > mtu ||
 824		    ((first_len - hlen) & 7) ||
 825		    skb_cloned(skb) ||
 826		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 827			goto slow_path;
 828
 829		skb_walk_frags(skb, frag) {
 830			/* Correct geometry. */
 831			if (frag->len > mtu ||
 832			    ((frag->len & 7) && frag->next) ||
 833			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 834				goto slow_path_clean;
 835
 836			/* Partially cloned skb? */
 837			if (skb_shared(frag))
 838				goto slow_path_clean;
 839
 840			BUG_ON(frag->sk);
 841			if (skb->sk) {
 842				frag->sk = skb->sk;
 843				frag->destructor = sock_wfree;
 844			}
 845			skb->truesize -= frag->truesize;
 846		}
 847
 848		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
 849					&iter);
 850		if (err < 0)
 851			goto fail;
 852
 853		for (;;) {
 854			/* Prepare header of the next frame,
 855			 * before previous one went down. */
 856			if (iter.frag)
 857				ip6_fraglist_prepare(skb, &iter);
 858
 859			skb->tstamp = tstamp;
 860			err = output(net, sk, skb);
 861			if (!err)
 862				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 863					      IPSTATS_MIB_FRAGCREATES);
 864
 865			if (err || !iter.frag)
 866				break;
 867
 868			skb = ip6_fraglist_next(&iter);
 869		}
 870
 871		kfree(iter.tmp_hdr);
 872
 873		if (err == 0) {
 874			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 875				      IPSTATS_MIB_FRAGOKS);
 876			return 0;
 877		}
 878
 879		kfree_skb_list(iter.frag);
 880
 881		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 882			      IPSTATS_MIB_FRAGFAILS);
 883		return err;
 884
 885slow_path_clean:
 886		skb_walk_frags(skb, frag2) {
 887			if (frag2 == frag)
 888				break;
 889			frag2->sk = NULL;
 890			frag2->destructor = NULL;
 891			skb->truesize += frag2->truesize;
 892		}
 893	}
 894
 895slow_path:
 896	/*
 897	 *	Fragment the datagram.
 898	 */
 899
 900	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
 901		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
 902		      &state);
 903
 904	/*
 905	 *	Keep copying data until we run out.
 906	 */
 907
 908	while (state.left > 0) {
 909		frag = ip6_frag_next(skb, &state);
 910		if (IS_ERR(frag)) {
 911			err = PTR_ERR(frag);
 912			goto fail;
 913		}
 914
 915		/*
 916		 *	Put this fragment into the sending queue.
 917		 */
 918		frag->tstamp = tstamp;
 919		err = output(net, sk, frag);
 920		if (err)
 921			goto fail;
 922
 923		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 924			      IPSTATS_MIB_FRAGCREATES);
 925	}
 926	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 927		      IPSTATS_MIB_FRAGOKS);
 928	consume_skb(skb);
 929	return err;
 930
 931fail_toobig:
 932	if (skb->sk && dst_allfrag(skb_dst(skb)))
 933		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 934
 935	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 936	err = -EMSGSIZE;
 937
 938fail:
 939	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 940		      IPSTATS_MIB_FRAGFAILS);
 941	kfree_skb(skb);
 942	return err;
 943}
 944
 945static inline int ip6_rt_check(const struct rt6key *rt_key,
 946			       const struct in6_addr *fl_addr,
 947			       const struct in6_addr *addr_cache)
 948{
 949	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 950		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 951}
 952
 953static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 954					  struct dst_entry *dst,
 955					  const struct flowi6 *fl6)
 956{
 957	struct ipv6_pinfo *np = inet6_sk(sk);
 958	struct rt6_info *rt;
 959
 960	if (!dst)
 961		goto out;
 962
 963	if (dst->ops->family != AF_INET6) {
 964		dst_release(dst);
 965		return NULL;
 966	}
 967
 968	rt = (struct rt6_info *)dst;
 969	/* Yes, checking route validity in not connected
 970	 * case is not very simple. Take into account,
 971	 * that we do not support routing by source, TOS,
 972	 * and MSG_DONTROUTE		--ANK (980726)
 973	 *
 974	 * 1. ip6_rt_check(): If route was host route,
 975	 *    check that cached destination is current.
 976	 *    If it is network route, we still may
 977	 *    check its validity using saved pointer
 978	 *    to the last used address: daddr_cache.
 979	 *    We do not want to save whole address now,
 980	 *    (because main consumer of this service
 981	 *    is tcp, which has not this problem),
 982	 *    so that the last trick works only on connected
 983	 *    sockets.
 984	 * 2. oif also should be the same.
 985	 */
 986	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 987#ifdef CONFIG_IPV6_SUBTREES
 988	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 989#endif
 990	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 991	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 992		dst_release(dst);
 993		dst = NULL;
 994	}
 995
 996out:
 997	return dst;
 998}
 999
1000static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1001			       struct dst_entry **dst, struct flowi6 *fl6)
1002{
1003#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1004	struct neighbour *n;
1005	struct rt6_info *rt;
1006#endif
1007	int err;
1008	int flags = 0;
1009
1010	/* The correct way to handle this would be to do
1011	 * ip6_route_get_saddr, and then ip6_route_output; however,
1012	 * the route-specific preferred source forces the
1013	 * ip6_route_output call _before_ ip6_route_get_saddr.
1014	 *
1015	 * In source specific routing (no src=any default route),
1016	 * ip6_route_output will fail given src=any saddr, though, so
1017	 * that's why we try it again later.
1018	 */
1019	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
1020		struct fib6_info *from;
1021		struct rt6_info *rt;
1022		bool had_dst = *dst != NULL;
1023
1024		if (!had_dst)
1025			*dst = ip6_route_output(net, sk, fl6);
1026		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1027
1028		rcu_read_lock();
1029		from = rt ? rcu_dereference(rt->from) : NULL;
1030		err = ip6_route_get_saddr(net, from, &fl6->daddr,
1031					  sk ? inet6_sk(sk)->srcprefs : 0,
1032					  &fl6->saddr);
1033		rcu_read_unlock();
1034
1035		if (err)
1036			goto out_err_release;
1037
1038		/* If we had an erroneous initial result, pretend it
1039		 * never existed and let the SA-enabled version take
1040		 * over.
1041		 */
1042		if (!had_dst && (*dst)->error) {
1043			dst_release(*dst);
1044			*dst = NULL;
1045		}
1046
1047		if (fl6->flowi6_oif)
1048			flags |= RT6_LOOKUP_F_IFACE;
1049	}
1050
1051	if (!*dst)
1052		*dst = ip6_route_output_flags(net, sk, fl6, flags);
1053
1054	err = (*dst)->error;
1055	if (err)
1056		goto out_err_release;
1057
1058#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1059	/*
1060	 * Here if the dst entry we've looked up
1061	 * has a neighbour entry that is in the INCOMPLETE
1062	 * state and the src address from the flow is
1063	 * marked as OPTIMISTIC, we release the found
1064	 * dst entry and replace it instead with the
1065	 * dst entry of the nexthop router
1066	 */
1067	rt = (struct rt6_info *) *dst;
1068	rcu_read_lock_bh();
1069	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1070				      rt6_nexthop(rt, &fl6->daddr));
1071	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1072	rcu_read_unlock_bh();
1073
1074	if (err) {
1075		struct inet6_ifaddr *ifp;
1076		struct flowi6 fl_gw6;
1077		int redirect;
1078
1079		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1080				      (*dst)->dev, 1);
1081
1082		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1083		if (ifp)
1084			in6_ifa_put(ifp);
1085
1086		if (redirect) {
1087			/*
1088			 * We need to get the dst entry for the
1089			 * default router instead
1090			 */
1091			dst_release(*dst);
1092			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1093			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1094			*dst = ip6_route_output(net, sk, &fl_gw6);
1095			err = (*dst)->error;
1096			if (err)
1097				goto out_err_release;
1098		}
1099	}
1100#endif
1101	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1102	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1103		err = -EAFNOSUPPORT;
1104		goto out_err_release;
1105	}
1106
1107	return 0;
1108
1109out_err_release:
1110	dst_release(*dst);
1111	*dst = NULL;
1112
1113	if (err == -ENETUNREACH)
1114		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1115	return err;
1116}
1117
1118/**
1119 *	ip6_dst_lookup - perform route lookup on flow
 
1120 *	@sk: socket which provides route info
1121 *	@dst: pointer to dst_entry * for result
1122 *	@fl6: flow to lookup
1123 *
1124 *	This function performs a route lookup on the given flow.
1125 *
1126 *	It returns zero on success, or a standard errno code on error.
1127 */
1128int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1129		   struct flowi6 *fl6)
1130{
1131	*dst = NULL;
1132	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1133}
1134EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1135
1136/**
1137 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 
1138 *	@sk: socket which provides route info
1139 *	@fl6: flow to lookup
1140 *	@final_dst: final destination address for ipsec lookup
1141 *
1142 *	This function performs a route lookup on the given flow.
1143 *
1144 *	It returns a valid dst pointer on success, or a pointer encoded
1145 *	error code.
1146 */
1147struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1148				      const struct in6_addr *final_dst)
1149{
1150	struct dst_entry *dst = NULL;
1151	int err;
1152
1153	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1154	if (err)
1155		return ERR_PTR(err);
1156	if (final_dst)
1157		fl6->daddr = *final_dst;
1158
1159	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1160}
1161EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1162
1163/**
1164 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1165 *	@sk: socket which provides the dst cache and route info
1166 *	@fl6: flow to lookup
1167 *	@final_dst: final destination address for ipsec lookup
1168 *	@connected: whether @sk is connected or not
1169 *
1170 *	This function performs a route lookup on the given flow with the
1171 *	possibility of using the cached route in the socket if it is valid.
1172 *	It will take the socket dst lock when operating on the dst cache.
1173 *	As a result, this function can only be used in process context.
1174 *
1175 *	In addition, for a connected socket, cache the dst in the socket
1176 *	if the current cache is not valid.
1177 *
1178 *	It returns a valid dst pointer on success, or a pointer encoded
1179 *	error code.
1180 */
1181struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1182					 const struct in6_addr *final_dst,
1183					 bool connected)
1184{
1185	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1186
1187	dst = ip6_sk_dst_check(sk, dst, fl6);
1188	if (dst)
1189		return dst;
1190
1191	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1192	if (connected && !IS_ERR(dst))
1193		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1194
1195	return dst;
1196}
1197EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1199static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1200					       gfp_t gfp)
1201{
1202	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1203}
1204
1205static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1206						gfp_t gfp)
1207{
1208	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1209}
1210
1211static void ip6_append_data_mtu(unsigned int *mtu,
1212				int *maxfraglen,
1213				unsigned int fragheaderlen,
1214				struct sk_buff *skb,
1215				struct rt6_info *rt,
1216				unsigned int orig_mtu)
1217{
1218	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1219		if (!skb) {
1220			/* first fragment, reserve header_len */
1221			*mtu = orig_mtu - rt->dst.header_len;
1222
1223		} else {
1224			/*
1225			 * this fragment is not first, the headers
1226			 * space is regarded as data space.
1227			 */
1228			*mtu = orig_mtu;
1229		}
1230		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1231			      + fragheaderlen - sizeof(struct frag_hdr);
1232	}
1233}
1234
1235static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1236			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1237			  struct rt6_info *rt, struct flowi6 *fl6)
1238{
1239	struct ipv6_pinfo *np = inet6_sk(sk);
1240	unsigned int mtu;
1241	struct ipv6_txoptions *opt = ipc6->opt;
1242
1243	/*
1244	 * setup for corking
1245	 */
1246	if (opt) {
1247		if (WARN_ON(v6_cork->opt))
1248			return -EINVAL;
1249
1250		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1251		if (unlikely(!v6_cork->opt))
1252			return -ENOBUFS;
1253
1254		v6_cork->opt->tot_len = sizeof(*opt);
1255		v6_cork->opt->opt_flen = opt->opt_flen;
1256		v6_cork->opt->opt_nflen = opt->opt_nflen;
1257
1258		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1259						    sk->sk_allocation);
1260		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1261			return -ENOBUFS;
1262
1263		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1264						    sk->sk_allocation);
1265		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1266			return -ENOBUFS;
1267
1268		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1269						   sk->sk_allocation);
1270		if (opt->hopopt && !v6_cork->opt->hopopt)
1271			return -ENOBUFS;
1272
1273		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1274						    sk->sk_allocation);
1275		if (opt->srcrt && !v6_cork->opt->srcrt)
1276			return -ENOBUFS;
1277
1278		/* need source address above miyazawa*/
1279	}
1280	dst_hold(&rt->dst);
1281	cork->base.dst = &rt->dst;
1282	cork->fl.u.ip6 = *fl6;
1283	v6_cork->hop_limit = ipc6->hlimit;
1284	v6_cork->tclass = ipc6->tclass;
1285	if (rt->dst.flags & DST_XFRM_TUNNEL)
1286		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1287		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1288	else
1289		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1290			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1291	if (np->frag_size < mtu) {
1292		if (np->frag_size)
1293			mtu = np->frag_size;
1294	}
1295	if (mtu < IPV6_MIN_MTU)
1296		return -EINVAL;
1297	cork->base.fragsize = mtu;
1298	cork->base.gso_size = ipc6->gso_size;
1299	cork->base.tx_flags = 0;
1300	cork->base.mark = ipc6->sockc.mark;
1301	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1302
1303	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1304		cork->base.flags |= IPCORK_ALLFRAG;
1305	cork->base.length = 0;
1306
1307	cork->base.transmit_time = ipc6->sockc.transmit_time;
1308
1309	return 0;
1310}
1311
1312static int __ip6_append_data(struct sock *sk,
1313			     struct flowi6 *fl6,
1314			     struct sk_buff_head *queue,
1315			     struct inet_cork *cork,
1316			     struct inet6_cork *v6_cork,
1317			     struct page_frag *pfrag,
1318			     int getfrag(void *from, char *to, int offset,
1319					 int len, int odd, struct sk_buff *skb),
1320			     void *from, int length, int transhdrlen,
1321			     unsigned int flags, struct ipcm6_cookie *ipc6)
1322{
1323	struct sk_buff *skb, *skb_prev = NULL;
1324	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1325	struct ubuf_info *uarg = NULL;
1326	int exthdrlen = 0;
1327	int dst_exthdrlen = 0;
1328	int hh_len;
1329	int copy;
1330	int err;
1331	int offset = 0;
1332	u32 tskey = 0;
1333	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1334	struct ipv6_txoptions *opt = v6_cork->opt;
1335	int csummode = CHECKSUM_NONE;
1336	unsigned int maxnonfragsize, headersize;
1337	unsigned int wmem_alloc_delta = 0;
1338	bool paged, extra_uref = false;
1339
1340	skb = skb_peek_tail(queue);
1341	if (!skb) {
1342		exthdrlen = opt ? opt->opt_flen : 0;
1343		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1344	}
1345
1346	paged = !!cork->gso_size;
1347	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1348	orig_mtu = mtu;
1349
1350	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1351	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1352		tskey = sk->sk_tskey++;
1353
1354	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1355
1356	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1357			(opt ? opt->opt_nflen : 0);
1358	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1359		     sizeof(struct frag_hdr);
1360
1361	headersize = sizeof(struct ipv6hdr) +
1362		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1363		     (dst_allfrag(&rt->dst) ?
1364		      sizeof(struct frag_hdr) : 0) +
1365		     rt->rt6i_nfheader_len;
1366
1367	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1368	 * the first fragment
1369	 */
1370	if (headersize + transhdrlen > mtu)
1371		goto emsgsize;
1372
1373	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1374	    (sk->sk_protocol == IPPROTO_UDP ||
1375	     sk->sk_protocol == IPPROTO_RAW)) {
1376		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1377				sizeof(struct ipv6hdr));
1378		goto emsgsize;
1379	}
1380
1381	if (ip6_sk_ignore_df(sk))
1382		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1383	else
1384		maxnonfragsize = mtu;
1385
1386	if (cork->length + length > maxnonfragsize - headersize) {
1387emsgsize:
1388		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1389		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1390		return -EMSGSIZE;
1391	}
1392
1393	/* CHECKSUM_PARTIAL only with no extension headers and when
1394	 * we are not going to fragment
1395	 */
1396	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1397	    headersize == sizeof(struct ipv6hdr) &&
1398	    length <= mtu - headersize &&
1399	    (!(flags & MSG_MORE) || cork->gso_size) &&
1400	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1401		csummode = CHECKSUM_PARTIAL;
1402
1403	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1404		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1405		if (!uarg)
1406			return -ENOBUFS;
1407		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1408		if (rt->dst.dev->features & NETIF_F_SG &&
1409		    csummode == CHECKSUM_PARTIAL) {
1410			paged = true;
1411		} else {
1412			uarg->zerocopy = 0;
1413			skb_zcopy_set(skb, uarg, &extra_uref);
1414		}
1415	}
1416
1417	/*
1418	 * Let's try using as much space as possible.
1419	 * Use MTU if total length of the message fits into the MTU.
1420	 * Otherwise, we need to reserve fragment header and
1421	 * fragment alignment (= 8-15 octects, in total).
1422	 *
1423	 * Note that we may need to "move" the data from the tail of
1424	 * of the buffer to the new fragment when we split
1425	 * the message.
1426	 *
1427	 * FIXME: It may be fragmented into multiple chunks
1428	 *        at once if non-fragmentable extension headers
1429	 *        are too large.
1430	 * --yoshfuji
1431	 */
1432
1433	cork->length += length;
1434	if (!skb)
1435		goto alloc_new_skb;
1436
1437	while (length > 0) {
1438		/* Check if the remaining data fits into current packet. */
1439		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1440		if (copy < length)
1441			copy = maxfraglen - skb->len;
1442
1443		if (copy <= 0) {
1444			char *data;
1445			unsigned int datalen;
1446			unsigned int fraglen;
1447			unsigned int fraggap;
1448			unsigned int alloclen;
1449			unsigned int pagedlen;
1450alloc_new_skb:
1451			/* There's no room in the current skb */
1452			if (skb)
1453				fraggap = skb->len - maxfraglen;
1454			else
1455				fraggap = 0;
1456			/* update mtu and maxfraglen if necessary */
1457			if (!skb || !skb_prev)
1458				ip6_append_data_mtu(&mtu, &maxfraglen,
1459						    fragheaderlen, skb, rt,
1460						    orig_mtu);
1461
1462			skb_prev = skb;
1463
1464			/*
1465			 * If remaining data exceeds the mtu,
1466			 * we know we need more fragment(s).
1467			 */
1468			datalen = length + fraggap;
1469
1470			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1471				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1472			fraglen = datalen + fragheaderlen;
1473			pagedlen = 0;
1474
 
 
 
 
 
 
 
 
 
 
1475			if ((flags & MSG_MORE) &&
1476			    !(rt->dst.dev->features&NETIF_F_SG))
1477				alloclen = mtu;
1478			else if (!paged)
 
 
1479				alloclen = fraglen;
1480			else {
1481				alloclen = min_t(int, fraglen, MAX_HEADER);
1482				pagedlen = fraglen - alloclen;
1483			}
1484
1485			alloclen += dst_exthdrlen;
1486
1487			if (datalen != length + fraggap) {
1488				/*
1489				 * this is not the last fragment, the trailer
1490				 * space is regarded as data space.
1491				 */
1492				datalen += rt->dst.trailer_len;
1493			}
1494
1495			alloclen += rt->dst.trailer_len;
1496			fraglen = datalen + fragheaderlen;
1497
1498			/*
1499			 * We just reserve space for fragment header.
1500			 * Note: this may be overallocation if the message
1501			 * (without MSG_MORE) fits into the MTU.
1502			 */
1503			alloclen += sizeof(struct frag_hdr);
1504
1505			copy = datalen - transhdrlen - fraggap - pagedlen;
1506			if (copy < 0) {
1507				err = -EINVAL;
1508				goto error;
1509			}
1510			if (transhdrlen) {
1511				skb = sock_alloc_send_skb(sk,
1512						alloclen + hh_len,
1513						(flags & MSG_DONTWAIT), &err);
1514			} else {
1515				skb = NULL;
1516				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1517				    2 * sk->sk_sndbuf)
1518					skb = alloc_skb(alloclen + hh_len,
1519							sk->sk_allocation);
1520				if (unlikely(!skb))
1521					err = -ENOBUFS;
1522			}
1523			if (!skb)
1524				goto error;
1525			/*
1526			 *	Fill in the control structures
1527			 */
1528			skb->protocol = htons(ETH_P_IPV6);
1529			skb->ip_summed = csummode;
1530			skb->csum = 0;
1531			/* reserve for fragmentation and ipsec header */
1532			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1533				    dst_exthdrlen);
1534
1535			/*
1536			 *	Find where to start putting bytes
1537			 */
1538			data = skb_put(skb, fraglen - pagedlen);
1539			skb_set_network_header(skb, exthdrlen);
1540			data += fragheaderlen;
1541			skb->transport_header = (skb->network_header +
1542						 fragheaderlen);
1543			if (fraggap) {
1544				skb->csum = skb_copy_and_csum_bits(
1545					skb_prev, maxfraglen,
1546					data + transhdrlen, fraggap, 0);
1547				skb_prev->csum = csum_sub(skb_prev->csum,
1548							  skb->csum);
1549				data += fraggap;
1550				pskb_trim_unique(skb_prev, maxfraglen);
1551			}
1552			if (copy > 0 &&
1553			    getfrag(from, data + transhdrlen, offset,
1554				    copy, fraggap, skb) < 0) {
1555				err = -EFAULT;
1556				kfree_skb(skb);
1557				goto error;
1558			}
1559
1560			offset += copy;
1561			length -= copy + transhdrlen;
1562			transhdrlen = 0;
1563			exthdrlen = 0;
1564			dst_exthdrlen = 0;
1565
1566			/* Only the initial fragment is time stamped */
1567			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1568			cork->tx_flags = 0;
1569			skb_shinfo(skb)->tskey = tskey;
1570			tskey = 0;
1571			skb_zcopy_set(skb, uarg, &extra_uref);
1572
1573			if ((flags & MSG_CONFIRM) && !skb_prev)
1574				skb_set_dst_pending_confirm(skb, 1);
1575
1576			/*
1577			 * Put the packet on the pending queue
1578			 */
1579			if (!skb->destructor) {
1580				skb->destructor = sock_wfree;
1581				skb->sk = sk;
1582				wmem_alloc_delta += skb->truesize;
1583			}
1584			__skb_queue_tail(queue, skb);
1585			continue;
1586		}
1587
1588		if (copy > length)
1589			copy = length;
1590
1591		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1592		    skb_tailroom(skb) >= copy) {
1593			unsigned int off;
1594
1595			off = skb->len;
1596			if (getfrag(from, skb_put(skb, copy),
1597						offset, copy, off, skb) < 0) {
1598				__skb_trim(skb, off);
1599				err = -EFAULT;
1600				goto error;
1601			}
1602		} else if (!uarg || !uarg->zerocopy) {
1603			int i = skb_shinfo(skb)->nr_frags;
1604
1605			err = -ENOMEM;
1606			if (!sk_page_frag_refill(sk, pfrag))
1607				goto error;
1608
1609			if (!skb_can_coalesce(skb, i, pfrag->page,
1610					      pfrag->offset)) {
1611				err = -EMSGSIZE;
1612				if (i == MAX_SKB_FRAGS)
1613					goto error;
1614
1615				__skb_fill_page_desc(skb, i, pfrag->page,
1616						     pfrag->offset, 0);
1617				skb_shinfo(skb)->nr_frags = ++i;
1618				get_page(pfrag->page);
1619			}
1620			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1621			if (getfrag(from,
1622				    page_address(pfrag->page) + pfrag->offset,
1623				    offset, copy, skb->len, skb) < 0)
1624				goto error_efault;
1625
1626			pfrag->offset += copy;
1627			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1628			skb->len += copy;
1629			skb->data_len += copy;
1630			skb->truesize += copy;
1631			wmem_alloc_delta += copy;
1632		} else {
1633			err = skb_zerocopy_iter_dgram(skb, from, copy);
1634			if (err < 0)
1635				goto error;
1636		}
1637		offset += copy;
1638		length -= copy;
1639	}
1640
1641	if (wmem_alloc_delta)
1642		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1643	return 0;
1644
1645error_efault:
1646	err = -EFAULT;
1647error:
1648	if (uarg)
1649		sock_zerocopy_put_abort(uarg, extra_uref);
1650	cork->length -= length;
1651	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1652	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1653	return err;
1654}
1655
1656int ip6_append_data(struct sock *sk,
1657		    int getfrag(void *from, char *to, int offset, int len,
1658				int odd, struct sk_buff *skb),
1659		    void *from, int length, int transhdrlen,
1660		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1661		    struct rt6_info *rt, unsigned int flags)
1662{
1663	struct inet_sock *inet = inet_sk(sk);
1664	struct ipv6_pinfo *np = inet6_sk(sk);
1665	int exthdrlen;
1666	int err;
1667
1668	if (flags&MSG_PROBE)
1669		return 0;
1670	if (skb_queue_empty(&sk->sk_write_queue)) {
1671		/*
1672		 * setup for corking
1673		 */
1674		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1675				     ipc6, rt, fl6);
1676		if (err)
1677			return err;
1678
1679		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1680		length += exthdrlen;
1681		transhdrlen += exthdrlen;
1682	} else {
1683		fl6 = &inet->cork.fl.u.ip6;
1684		transhdrlen = 0;
1685	}
1686
1687	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1688				 &np->cork, sk_page_frag(sk), getfrag,
1689				 from, length, transhdrlen, flags, ipc6);
1690}
1691EXPORT_SYMBOL_GPL(ip6_append_data);
1692
1693static void ip6_cork_release(struct inet_cork_full *cork,
1694			     struct inet6_cork *v6_cork)
1695{
1696	if (v6_cork->opt) {
1697		kfree(v6_cork->opt->dst0opt);
1698		kfree(v6_cork->opt->dst1opt);
1699		kfree(v6_cork->opt->hopopt);
1700		kfree(v6_cork->opt->srcrt);
1701		kfree(v6_cork->opt);
1702		v6_cork->opt = NULL;
1703	}
1704
1705	if (cork->base.dst) {
1706		dst_release(cork->base.dst);
1707		cork->base.dst = NULL;
1708		cork->base.flags &= ~IPCORK_ALLFRAG;
1709	}
1710	memset(&cork->fl, 0, sizeof(cork->fl));
1711}
1712
1713struct sk_buff *__ip6_make_skb(struct sock *sk,
1714			       struct sk_buff_head *queue,
1715			       struct inet_cork_full *cork,
1716			       struct inet6_cork *v6_cork)
1717{
1718	struct sk_buff *skb, *tmp_skb;
1719	struct sk_buff **tail_skb;
1720	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1721	struct ipv6_pinfo *np = inet6_sk(sk);
1722	struct net *net = sock_net(sk);
1723	struct ipv6hdr *hdr;
1724	struct ipv6_txoptions *opt = v6_cork->opt;
1725	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1726	struct flowi6 *fl6 = &cork->fl.u.ip6;
1727	unsigned char proto = fl6->flowi6_proto;
1728
1729	skb = __skb_dequeue(queue);
1730	if (!skb)
1731		goto out;
1732	tail_skb = &(skb_shinfo(skb)->frag_list);
1733
1734	/* move skb->data to ip header from ext header */
1735	if (skb->data < skb_network_header(skb))
1736		__skb_pull(skb, skb_network_offset(skb));
1737	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1738		__skb_pull(tmp_skb, skb_network_header_len(skb));
1739		*tail_skb = tmp_skb;
1740		tail_skb = &(tmp_skb->next);
1741		skb->len += tmp_skb->len;
1742		skb->data_len += tmp_skb->len;
1743		skb->truesize += tmp_skb->truesize;
1744		tmp_skb->destructor = NULL;
1745		tmp_skb->sk = NULL;
1746	}
1747
1748	/* Allow local fragmentation. */
1749	skb->ignore_df = ip6_sk_ignore_df(sk);
1750
1751	*final_dst = fl6->daddr;
1752	__skb_pull(skb, skb_network_header_len(skb));
1753	if (opt && opt->opt_flen)
1754		ipv6_push_frag_opts(skb, opt, &proto);
1755	if (opt && opt->opt_nflen)
1756		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1757
1758	skb_push(skb, sizeof(struct ipv6hdr));
1759	skb_reset_network_header(skb);
1760	hdr = ipv6_hdr(skb);
1761
1762	ip6_flow_hdr(hdr, v6_cork->tclass,
1763		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1764					ip6_autoflowlabel(net, np), fl6));
1765	hdr->hop_limit = v6_cork->hop_limit;
1766	hdr->nexthdr = proto;
1767	hdr->saddr = fl6->saddr;
1768	hdr->daddr = *final_dst;
1769
1770	skb->priority = sk->sk_priority;
1771	skb->mark = cork->base.mark;
1772
1773	skb->tstamp = cork->base.transmit_time;
1774
1775	skb_dst_set(skb, dst_clone(&rt->dst));
1776	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1777	if (proto == IPPROTO_ICMPV6) {
1778		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1779
1780		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1781		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1782	}
1783
1784	ip6_cork_release(cork, v6_cork);
1785out:
1786	return skb;
1787}
1788
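/* Pass a fully built datagram to ip6_local_out() and normalise the
 * return value: positive congestion-notification codes from the
 * queueing layer are mapped by net_xmit_errno(), and real errors are
 * counted as output discards.
 */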
1789int ip6_send_skb(struct sk_buff *skb)
1790{
1791	struct net *net = sock_net(skb->sk);
1792	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1793	int err;
1794
1795	err = ip6_local_out(net, skb->sk, skb);
1796	if (err) {
1797		if (err > 0)
1798			err = net_xmit_errno(err);
1799		if (err)
1800			IP6_INC_STATS(net, rt->rt6i_idev,
1801				      IPSTATS_MIB_OUTDISCARDS);
1802	}
1803
1804	return err;
1805}
1806
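/* Build one datagram from everything queued on sk_write_queue and send
 * it.  A NULL from ip6_finish_skb() means there was nothing to send.
 */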
1807int ip6_push_pending_frames(struct sock *sk)
1808{
1809	struct sk_buff *skb;
1810
1811	skb = ip6_finish_skb(sk);
1812	if (!skb)
1813		return 0;
1814
1815	return ip6_send_skb(skb);
1816}
1817EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1818
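/* Free every skb queued for this cork, counting those that already
 * have a route attached as output discards, then release the cork.
 */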
1819static void __ip6_flush_pending_frames(struct sock *sk,
1820				       struct sk_buff_head *queue,
1821				       struct inet_cork_full *cork,
1822				       struct inet6_cork *v6_cork)
1823{
1824	struct sk_buff *skb;
1825
1826	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1827		if (skb_dst(skb))
1828			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1829				      IPSTATS_MIB_OUTDISCARDS);
1830		kfree_skb(skb);
1831	}
1832
1833	ip6_cork_release(cork, v6_cork);
1834}
1835
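/* Discard any corked data pending on the socket's write queue. */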
1836void ip6_flush_pending_frames(struct sock *sk)
1837{
1838	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1839				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1840}
1841EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1842
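/*
 * Single-shot variant of the append/push sequence: the whole datagram
 * is built on a private queue with a caller-provided cork, so no
 * corking state is left on the socket.  Returns NULL for MSG_PROBE,
 * an ERR_PTR() on failure, or an skb ready for ip6_send_skb().
 */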
1843struct sk_buff *ip6_make_skb(struct sock *sk,
1844			     int getfrag(void *from, char *to, int offset,
1845					 int len, int odd, struct sk_buff *skb),
1846			     void *from, int length, int transhdrlen,
1847			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1848			     struct rt6_info *rt, unsigned int flags,
1849			     struct inet_cork_full *cork)
1850{
1851	struct inet6_cork v6_cork;
1852	struct sk_buff_head queue;
1853	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1854	int err;
1855
1856	if (flags & MSG_PROBE)
1857		return NULL;
1858
1859	__skb_queue_head_init(&queue);
1860
1861	cork->base.flags = 0;
1862	cork->base.addr = 0;
1863	cork->base.opt = NULL;
1864	cork->base.dst = NULL;
1865	v6_cork.opt = NULL;
1866	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1867	if (err) {
1868		ip6_cork_release(cork, &v6_cork);
1869		return ERR_PTR(err);
1870	}
1871	if (ipc6->dontfrag < 0)
1872		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1873
1874	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1875				&current->task_frag, getfrag, from,
1876				length + exthdrlen, transhdrlen + exthdrlen,
1877				flags, ipc6);
1878	if (err) {
1879		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1880		return ERR_PTR(err);
1881	}
1882
1883	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1884}
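
/*
 * A minimal sketch of the single-shot pattern (roughly the UDPv6
 * lockless fast path; error handling elided):
 *
 *	skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr),
 *			   &ipc6, &fl6, rt, msg->msg_flags, &cork);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = udp_v6_send_skb(skb, &fl6, &cork.base);
 */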