Loading...
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <asm/system.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54#include <linux/slab.h>
55
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
70#include <net/xfrm.h>
71#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
75#include <net/checksum.h>
76#include <net/inetpeer.h>
77#include <linux/igmp.h>
78#include <linux/netfilter_ipv4.h>
79#include <linux/netfilter_bridge.h>
80#include <linux/mroute.h>
81#include <linux/netlink.h>
82#include <linux/tcp.h>
83
/*
 * Global initialised to IPDEFTTL, marked __read_mostly and exported for use
 * by other modules. NOTE(review): presumably the default TTL adjustable via
 * sysctl - the sysctl registration is not visible in this file section.
 */
84int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85EXPORT_SYMBOL(sysctl_ip_default_ttl);
86
87/* Generate a checksum for an outgoing IP datagram. */
88__inline__ void ip_send_check(struct iphdr *iph)
89{
90 iph->check = 0;
91 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92}
93EXPORT_SYMBOL(ip_send_check);
94
95int __ip_local_out(struct sk_buff *skb)
96{
97 struct iphdr *iph = ip_hdr(skb);
98
99 iph->tot_len = htons(skb->len);
100 ip_send_check(iph);
101 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102 skb_dst(skb)->dev, dst_output);
103}
104
/*
 * Send a locally generated packet: run __ip_local_out() and, when it
 * returns 1, hand the skb to dst_output(). Any other value from
 * __ip_local_out() is returned unchanged.
 */
int ip_local_out(struct sk_buff *skb)
{
	int rc = __ip_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
116
117/* dev_loopback_xmit for use with netfilter. */
118static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119{
120 skb_reset_mac_header(newskb);
121 __skb_pull(newskb, skb_network_offset(newskb));
122 newskb->pkt_type = PACKET_LOOPBACK;
123 newskb->ip_summed = CHECKSUM_UNNECESSARY;
124 WARN_ON(!skb_dst(newskb));
125 skb_dst_force(newskb);
126 netif_rx_ni(newskb);
127 return 0;
128}
129
130static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
131{
132 int ttl = inet->uc_ttl;
133
134 if (ttl < 0)
135 ttl = ip4_dst_hoplimit(dst);
136 return ttl;
137}
138
139/*
140 * Add an ip header to a skbuff and send it out.
141 *
142 */
143int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
144 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
145{
146 struct inet_sock *inet = inet_sk(sk);
147 struct rtable *rt = skb_rtable(skb);
148 struct iphdr *iph;
149
150 /* Build the IP header. */
151 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
152 skb_reset_network_header(skb);
153 iph = ip_hdr(skb);
154 iph->version = 4;
155 iph->ihl = 5;
156 iph->tos = inet->tos;
157 if (ip_dont_fragment(sk, &rt->dst))
158 iph->frag_off = htons(IP_DF);
159 else
160 iph->frag_off = 0;
161 iph->ttl = ip_select_ttl(inet, &rt->dst);
162 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
163 iph->saddr = saddr;
164 iph->protocol = sk->sk_protocol;
165 ip_select_ident(iph, &rt->dst, sk);
166
167 if (opt && opt->opt.optlen) {
168 iph->ihl += opt->opt.optlen>>2;
169 ip_options_build(skb, &opt->opt, daddr, rt, 0);
170 }
171
172 skb->priority = sk->sk_priority;
173 skb->mark = sk->sk_mark;
174
175 /* Send it out. */
176 return ip_local_out(skb);
177}
178EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
179
180static inline int ip_finish_output2(struct sk_buff *skb)
181{
182 struct dst_entry *dst = skb_dst(skb);
183 struct rtable *rt = (struct rtable *)dst;
184 struct net_device *dev = dst->dev;
185 unsigned int hh_len = LL_RESERVED_SPACE(dev);
186 struct neighbour *neigh;
187
188 if (rt->rt_type == RTN_MULTICAST) {
189 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
190 } else if (rt->rt_type == RTN_BROADCAST)
191 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
192
193 /* Be paranoid, rather than too clever. */
194 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
195 struct sk_buff *skb2;
196
197 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
198 if (skb2 == NULL) {
199 kfree_skb(skb);
200 return -ENOMEM;
201 }
202 if (skb->sk)
203 skb_set_owner_w(skb2, skb->sk);
204 kfree_skb(skb);
205 skb = skb2;
206 }
207
208 rcu_read_lock();
209 neigh = dst_get_neighbour(dst);
210 if (neigh) {
211 int res = neigh_output(neigh, skb);
212
213 rcu_read_unlock();
214 return res;
215 }
216 rcu_read_unlock();
217
218 if (net_ratelimit())
219 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
220 kfree_skb(skb);
221 return -EINVAL;
222}
223
224static inline int ip_skb_dst_mtu(struct sk_buff *skb)
225{
226 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
227
228 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
229 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
230}
231
232static int ip_finish_output(struct sk_buff *skb)
233{
234#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
235 /* Policy lookup after SNAT yielded a new policy */
236 if (skb_dst(skb)->xfrm != NULL) {
237 IPCB(skb)->flags |= IPSKB_REROUTED;
238 return dst_output(skb);
239 }
240#endif
241 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
242 return ip_fragment(skb, ip_finish_output2);
243 else
244 return ip_finish_output2(skb);
245}
246
247int ip_mc_output(struct sk_buff *skb)
248{
249 struct sock *sk = skb->sk;
250 struct rtable *rt = skb_rtable(skb);
251 struct net_device *dev = rt->dst.dev;
252
253 /*
254 * If the indicated interface is up and running, send the packet.
255 */
256 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
257
258 skb->dev = dev;
259 skb->protocol = htons(ETH_P_IP);
260
261 /*
262 * Multicasts are looped back for other local users
263 */
264
265 if (rt->rt_flags&RTCF_MULTICAST) {
266 if (sk_mc_loop(sk)
267#ifdef CONFIG_IP_MROUTE
268 /* Small optimization: do not loopback not local frames,
269 which returned after forwarding; they will be dropped
270 by ip_mr_input in any case.
271 Note, that local frames are looped back to be delivered
272 to local recipients.
273
274 This check is duplicated in ip_mr_input at the moment.
275 */
276 &&
277 ((rt->rt_flags & RTCF_LOCAL) ||
278 !(IPCB(skb)->flags & IPSKB_FORWARDED))
279#endif
280 ) {
281 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
282 if (newskb)
283 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
284 newskb, NULL, newskb->dev,
285 ip_dev_loopback_xmit);
286 }
287
288 /* Multicasts with ttl 0 must not go beyond the host */
289
290 if (ip_hdr(skb)->ttl == 0) {
291 kfree_skb(skb);
292 return 0;
293 }
294 }
295
296 if (rt->rt_flags&RTCF_BROADCAST) {
297 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
298 if (newskb)
299 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
300 NULL, newskb->dev, ip_dev_loopback_xmit);
301 }
302
303 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
304 skb->dev, ip_finish_output,
305 !(IPCB(skb)->flags & IPSKB_REROUTED));
306}
307
308int ip_output(struct sk_buff *skb)
309{
310 struct net_device *dev = skb_dst(skb)->dev;
311
312 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
313
314 skb->dev = dev;
315 skb->protocol = htons(ETH_P_IP);
316
317 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
318 ip_finish_output,
319 !(IPCB(skb)->flags & IPSKB_REROUTED));
320}
321
322int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
323{
324 struct sock *sk = skb->sk;
325 struct inet_sock *inet = inet_sk(sk);
326 struct ip_options_rcu *inet_opt;
327 struct flowi4 *fl4;
328 struct rtable *rt;
329 struct iphdr *iph;
330 int res;
331
332 /* Skip all of this if the packet is already routed,
333 * f.e. by something like SCTP.
334 */
335 rcu_read_lock();
336 inet_opt = rcu_dereference(inet->inet_opt);
337 fl4 = &fl->u.ip4;
338 rt = skb_rtable(skb);
339 if (rt != NULL)
340 goto packet_routed;
341
342 /* Make sure we can route this packet. */
343 rt = (struct rtable *)__sk_dst_check(sk, 0);
344 if (rt == NULL) {
345 __be32 daddr;
346
347 /* Use correct destination address if we have options. */
348 daddr = inet->inet_daddr;
349 if (inet_opt && inet_opt->opt.srr)
350 daddr = inet_opt->opt.faddr;
351
352 /* If this fails, retransmit mechanism of transport layer will
353 * keep trying until route appears or the connection times
354 * itself out.
355 */
356 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
357 daddr, inet->inet_saddr,
358 inet->inet_dport,
359 inet->inet_sport,
360 sk->sk_protocol,
361 RT_CONN_FLAGS(sk),
362 sk->sk_bound_dev_if);
363 if (IS_ERR(rt))
364 goto no_route;
365 sk_setup_caps(sk, &rt->dst);
366 }
367 skb_dst_set_noref(skb, &rt->dst);
368
369packet_routed:
370 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
371 goto no_route;
372
373 /* OK, we know where to send it, allocate and build IP header. */
374 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
375 skb_reset_network_header(skb);
376 iph = ip_hdr(skb);
377 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
378 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
379 iph->frag_off = htons(IP_DF);
380 else
381 iph->frag_off = 0;
382 iph->ttl = ip_select_ttl(inet, &rt->dst);
383 iph->protocol = sk->sk_protocol;
384 iph->saddr = fl4->saddr;
385 iph->daddr = fl4->daddr;
386 /* Transport layer set skb->h.foo itself. */
387
388 if (inet_opt && inet_opt->opt.optlen) {
389 iph->ihl += inet_opt->opt.optlen >> 2;
390 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
391 }
392
393 ip_select_ident_more(iph, &rt->dst, sk,
394 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
395
396 skb->priority = sk->sk_priority;
397 skb->mark = sk->sk_mark;
398
399 res = ip_local_out(skb);
400 rcu_read_unlock();
401 return res;
402
403no_route:
404 rcu_read_unlock();
405 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
406 kfree_skb(skb);
407 return -EHOSTUNREACH;
408}
409EXPORT_SYMBOL(ip_queue_xmit);
410
411
412static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
413{
414 to->pkt_type = from->pkt_type;
415 to->priority = from->priority;
416 to->protocol = from->protocol;
417 skb_dst_drop(to);
418 skb_dst_copy(to, from);
419 to->dev = from->dev;
420 to->mark = from->mark;
421
422 /* Copy the flags to each fragment. */
423 IPCB(to)->flags = IPCB(from)->flags;
424
425#ifdef CONFIG_NET_SCHED
426 to->tc_index = from->tc_index;
427#endif
428 nf_copy(to, from);
429#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
430 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
431 to->nf_trace = from->nf_trace;
432#endif
433#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
434 to->ipvs_property = from->ipvs_property;
435#endif
436 skb_copy_secmark(to, from);
437}
438
439/*
440 * This IP datagram is too large to be sent in one piece. Break it up into
441 * smaller pieces (each of size equal to IP header plus
442 * a block of the data of the original IP data part) that will yet fit in a
443 * single device frame, and queue such a frame for sending.
444 */
445
/*
 * ip_fragment - split @skb into fragments and pass each one to @output.
 *
 * @skb:    datagram to fragment; its route is read via skb_rtable()
 * @output: function called once per generated fragment
 *
 * If the header has IP_DF set and skb->local_df is clear, nothing is sent:
 * FRAGFAILS is counted, an ICMP_FRAG_NEEDED message carrying
 * ip_skb_dst_mtu() is generated, the skb is freed and -EMSGSIZE returned.
 *
 * Two strategies follow. The fast path reuses the skbs already chained on
 * the frag_list when their sizes, headroom and sharing state allow it. The
 * slow path allocates a new skb per fragment and copies header and data
 * into it; the original skb is freed at the end of the slow path, on
 * success and on failure alike.
 *
 * Returns 0 when every fragment was accepted by @output, otherwise the
 * first error from @output, or -ENOMEM when a slow-path allocation fails.
 */
446int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
447{
448 struct iphdr *iph;
449 int ptr;
450 struct net_device *dev;
451 struct sk_buff *skb2;
452 unsigned int mtu, hlen, left, len, ll_rs;
453 int offset;
454 __be16 not_last_frag;
455 struct rtable *rt = skb_rtable(skb);
456 int err = 0;
457
458 dev = rt->dst.dev;
459
460 /*
461 * Point into the IP datagram header.
462 */
463
464 iph = ip_hdr(skb);
465
 /* DF is set and local fragmentation is not permitted: report and drop. */
466 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
467 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
468 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
469 htonl(ip_skb_dst_mtu(skb)));
470 kfree_skb(skb);
471 return -EMSGSIZE;
472 }
473
474 /*
475 * Setup starting values.
476 */
477
478 hlen = iph->ihl * 4;
479 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
480#ifdef CONFIG_BRIDGE_NETFILTER
481 if (skb->nf_bridge)
482 mtu -= nf_bridge_mtu_reduction(skb);
483#endif
484 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
485
486 /* When frag_list is given, use it. First, check its validity:
487 * some transformers could create wrong frag_list or break existing
488 * one, it is not prohibited. In this case fall back to copying.
489 *
490 * LATER: this step can be merged to real generation of fragments,
491 * we can switch to copy when see the first bad fragment.
492 */
493 if (skb_has_frag_list(skb)) {
494 struct sk_buff *frag, *frag2;
495 int first_len = skb_pagelen(skb);
496
 /*
  * The head must fit in the MTU, carry a payload that is a multiple
  * of 8 bytes, not already be a fragment, and not be cloned.
  */
497 if (first_len - hlen > mtu ||
498 ((first_len - hlen) & 7) ||
499 ip_is_fragment(iph) ||
500 skb_cloned(skb))
501 goto slow_path;
502
503 skb_walk_frags(skb, frag) {
504 /* Correct geometry. */
505 if (frag->len > mtu ||
506 ((frag->len & 7) && frag->next) ||
507 skb_headroom(frag) < hlen)
508 goto slow_path_clean;
509
510 /* Partially cloned skb? */
511 if (skb_shared(frag))
512 goto slow_path_clean;
513
 /*
  * Give the frag the head's socket and take its truesize off the
  * head; slow_path_clean reverts this for frags already visited.
  */
514 BUG_ON(frag->sk);
515 if (skb->sk) {
516 frag->sk = skb->sk;
517 frag->destructor = sock_wfree;
518 }
519 skb->truesize -= frag->truesize;
520 }
521
522 /* Everything is OK. Generate! */
523
524 err = 0;
525 offset = 0;
526 frag = skb_shinfo(skb)->frag_list;
527 skb_frag_list_init(skb);
528 skb->data_len = first_len - skb_headlen(skb);
529 skb->len = first_len;
530 iph->tot_len = htons(first_len);
531 iph->frag_off = htons(IP_MF);
532 ip_send_check(iph);
533
534 for (;;) {
535 /* Prepare header of the next frame,
536 * before previous one went down. */
537 if (frag) {
538 frag->ip_summed = CHECKSUM_NONE;
539 skb_reset_transport_header(frag);
540 __skb_push(frag, hlen);
541 skb_reset_network_header(frag);
542 memcpy(skb_network_header(frag), iph, hlen);
543 iph = ip_hdr(frag);
544 iph->tot_len = htons(frag->len);
545 ip_copy_metadata(frag, skb);
546 if (offset == 0)
547 ip_options_fragment(frag);
548 offset += skb->len - hlen;
549 iph->frag_off = htons(offset>>3);
550 if (frag->next != NULL)
551 iph->frag_off |= htons(IP_MF);
552 /* Ready, complete checksum */
553 ip_send_check(iph);
554 }
555
556 err = output(skb);
557
558 if (!err)
559 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
560 if (err || !frag)
561 break;
562
 /* Advance to the prepared frag and unlink it from the chain. */
563 skb = frag;
564 frag = skb->next;
565 skb->next = NULL;
566 }
567
568 if (err == 0) {
569 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
570 return 0;
571 }
572
 /* output() failed: free the fragments that were never passed to it. */
573 while (frag) {
574 skb = frag->next;
575 kfree_skb(frag);
576 frag = skb;
577 }
578 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
579 return err;
580
 /*
  * Validation stopped at 'frag': clear sk/destructor and restore the
  * head's truesize for every frag visited before it, then fall through
  * to the copying path.
  */
581slow_path_clean:
582 skb_walk_frags(skb, frag2) {
583 if (frag2 == frag)
584 break;
585 frag2->sk = NULL;
586 frag2->destructor = NULL;
587 skb->truesize += frag2->truesize;
588 }
589 }
590
591slow_path:
592 left = skb->len - hlen; /* Space per frame */
593 ptr = hlen; /* Where to start from */
594
595 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
596 * we need to make room for the encapsulating header
597 */
598 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
599
600 /*
601 * Fragment the datagram.
602 */
603
 /* Start from the offset and MF state the datagram already carries. */
604 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
605 not_last_frag = iph->frag_off & htons(IP_MF);
606
607 /*
608 * Keep copying data until we run out.
609 */
610
611 while (left > 0) {
612 len = left;
613 /* IF: it doesn't fit, use 'mtu' - the data space left */
614 if (len > mtu)
615 len = mtu;
616 /* IF: we are not sending up to and including the packet end
617 then align the next start on an eight byte boundary */
618 if (len < left) {
619 len &= ~7;
620 }
621 /*
622 * Allocate buffer.
623 */
624
625 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
626 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
627 err = -ENOMEM;
628 goto fail;
629 }
630
631 /*
632 * Set up data on packet
633 */
634
635 ip_copy_metadata(skb2, skb);
636 skb_reserve(skb2, ll_rs);
637 skb_put(skb2, len + hlen);
638 skb_reset_network_header(skb2);
639 skb2->transport_header = skb2->network_header + hlen;
640
641 /*
642 * Charge the memory for the fragment to any owner
643 * it might possess
644 */
645
646 if (skb->sk)
647 skb_set_owner_w(skb2, skb->sk);
648
649 /*
650 * Copy the packet header into the new buffer.
651 */
652
653 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
654
655 /*
656 * Copy a block of the IP datagram.
657 */
658 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
659 BUG();
660 left -= len;
661
662 /*
663 * Fill in the new header fields.
664 */
665 iph = ip_hdr(skb2);
666 iph->frag_off = htons((offset >> 3));
667
668 /* ANK: dirty, but effective trick. Upgrade options only if
669 * the segment to be fragmented was THE FIRST (otherwise,
670 * options are already fixed) and make it ONCE
671 * on the initial skb, so that all the following fragments
672 * will inherit fixed options.
673 */
674 if (offset == 0)
675 ip_options_fragment(skb);
676
677 /*
678 * Added AC : If we are fragmenting a fragment that's not the
679 * last fragment then keep MF on each bit
680 */
681 if (left > 0 || not_last_frag)
682 iph->frag_off |= htons(IP_MF);
683 ptr += len;
684 offset += len;
685
686 /*
687 * Put this fragment into the sending queue.
688 */
689 iph->tot_len = htons(len + hlen);
690
691 ip_send_check(iph);
692
693 err = output(skb2);
694 if (err)
695 goto fail;
696
697 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
698 }
 /* All payload now lives in the new skbs; release the original. */
699 kfree_skb(skb);
700 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
701 return err;
702
703fail:
704 kfree_skb(skb);
705 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
706 return err;
707}
708EXPORT_SYMBOL(ip_fragment);
709
710int
711ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
712{
713 struct iovec *iov = from;
714
715 if (skb->ip_summed == CHECKSUM_PARTIAL) {
716 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
717 return -EFAULT;
718 } else {
719 __wsum csum = 0;
720 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
721 return -EFAULT;
722 skb->csum = csum_block_add(skb->csum, csum, odd);
723 }
724 return 0;
725}
726EXPORT_SYMBOL(ip_generic_getfrag);
727
728static inline __wsum
729csum_page(struct page *page, int offset, int copy)
730{
731 char *kaddr;
732 __wsum csum;
733 kaddr = kmap(page);
734 csum = csum_partial(kaddr + offset, copy, 0);
735 kunmap(page);
736 return csum;
737}
738
739static inline int ip_ufo_append_data(struct sock *sk,
740 struct sk_buff_head *queue,
741 int getfrag(void *from, char *to, int offset, int len,
742 int odd, struct sk_buff *skb),
743 void *from, int length, int hh_len, int fragheaderlen,
744 int transhdrlen, int maxfraglen, unsigned int flags)
745{
746 struct sk_buff *skb;
747 int err;
748
749 /* There is support for UDP fragmentation offload by network
750 * device, so create one single skb packet containing complete
751 * udp datagram
752 */
753 if ((skb = skb_peek_tail(queue)) == NULL) {
754 skb = sock_alloc_send_skb(sk,
755 hh_len + fragheaderlen + transhdrlen + 20,
756 (flags & MSG_DONTWAIT), &err);
757
758 if (skb == NULL)
759 return err;
760
761 /* reserve space for Hardware header */
762 skb_reserve(skb, hh_len);
763
764 /* create space for UDP/IP header */
765 skb_put(skb, fragheaderlen + transhdrlen);
766
767 /* initialize network header pointer */
768 skb_reset_network_header(skb);
769
770 /* initialize protocol header pointer */
771 skb->transport_header = skb->network_header + fragheaderlen;
772
773 skb->ip_summed = CHECKSUM_PARTIAL;
774 skb->csum = 0;
775
776 /* specify the length of each IP datagram fragment */
777 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
778 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
779 __skb_queue_tail(queue, skb);
780 }
781
782 return skb_append_datato_frags(sk, skb, getfrag, from,
783 (length - transhdrlen));
784}
785
/*
 * __ip_append_data - append @length bytes, fetched through @getfrag, to the
 * pending datagram held on @queue.
 *
 * @sk:          socket charged for the memory
 * @fl4:         flow; only its daddr is read here, for ip_local_error()
 * @queue:       list of skbs making up the pending datagram
 * @cork:        cork state: route, fragment size, options, running length,
 *               tx_flags and the partially filled page
 * @getfrag:     callback that copies (and possibly checksums) source data
 * @from:        opaque cookie handed to @getfrag
 * @length:      number of bytes to append
 * @transhdrlen: size of the transport header; cleared after the first skb
 *               has been built
 * @flags:       MSG_* flags; MSG_MORE and MSG_DONTWAIT are examined
 *
 * Data is laid out as a chain of skbs, each sized so that it can be sent as
 * one IP fragment once a header is added. Returns 0 on success or a
 * negative errno; on error the bytes that were not queued are removed from
 * cork->length and OUTDISCARDS is incremented.
 */
786static int __ip_append_data(struct sock *sk,
787 struct flowi4 *fl4,
788 struct sk_buff_head *queue,
789 struct inet_cork *cork,
790 int getfrag(void *from, char *to, int offset,
791 int len, int odd, struct sk_buff *skb),
792 void *from, int length, int transhdrlen,
793 unsigned int flags)
794{
795 struct inet_sock *inet = inet_sk(sk);
796 struct sk_buff *skb;
797
798 struct ip_options *opt = cork->opt;
799 int hh_len;
800 int exthdrlen;
801 int mtu;
802 int copy;
803 int err;
804 int offset = 0;
805 unsigned int maxfraglen, fragheaderlen;
806 int csummode = CHECKSUM_NONE;
807 struct rtable *rt = (struct rtable *)cork->dst;
808
809 skb = skb_peek_tail(queue);
810
 /* The dst's header_len is only accounted for when the queue is empty. */
811 exthdrlen = !skb ? rt->dst.header_len : 0;
812 mtu = cork->fragsize;
813
814 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
815
 /* maxfraglen: header plus the largest payload that is a multiple of 8. */
816 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
817 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
818
 /* Reject data that would push the datagram past 0xFFFF bytes with header. */
819 if (cork->length + length > 0xFFFF - fragheaderlen) {
820 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
821 mtu-exthdrlen);
822 return -EMSGSIZE;
823 }
824
825 /*
826 * transhdrlen > 0 means that this is the first fragment and we wish
827 * it won't be fragmented in the future.
828 */
829 if (transhdrlen &&
830 length + fragheaderlen <= mtu &&
831 rt->dst.dev->features & NETIF_F_V4_CSUM &&
832 !exthdrlen)
833 csummode = CHECKSUM_PARTIAL;
834
835 cork->length += length;
 /*
  * UDP on a NETIF_F_UFO device with no dst header_len: build one large
  * skb through ip_ufo_append_data() instead of per-fragment skbs.
  */
836 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
837 (sk->sk_protocol == IPPROTO_UDP) &&
838 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
839 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
840 hh_len, fragheaderlen, transhdrlen,
841 maxfraglen, flags);
842 if (err)
843 goto error;
844 return 0;
845 }
846
847 /* So, what's going on in the loop below?
848 *
849 * We use calculated fragment length to generate chained skb,
850 * each of segments is IP fragment ready for sending to network after
851 * adding appropriate IP header.
852 */
853
 /* Empty queue: jump straight into the allocation branch of the loop. */
854 if (!skb)
855 goto alloc_new_skb;
856
857 while (length > 0) {
858 /* Check if the remaining data fits into current packet. */
859 copy = mtu - skb->len;
860 if (copy < length)
861 copy = maxfraglen - skb->len;
862 if (copy <= 0) {
863 char *data;
864 unsigned int datalen;
865 unsigned int fraglen;
866 unsigned int fraggap;
867 unsigned int alloclen;
868 struct sk_buff *skb_prev;
869alloc_new_skb:
870 skb_prev = skb;
 /*
  * fraggap: bytes the previous skb holds beyond maxfraglen; they are
  * moved into the new skb further down.
  */
871 if (skb_prev)
872 fraggap = skb_prev->len - maxfraglen;
873 else
874 fraggap = 0;
875
876 /*
877 * If remaining data exceeds the mtu,
878 * we know we need more fragment(s).
879 */
880 datalen = length + fraggap;
881 if (datalen > mtu - fragheaderlen)
882 datalen = maxfraglen - fragheaderlen;
883 fraglen = datalen + fragheaderlen;
884
 /*
  * With MSG_MORE on a device without scatter-gather, allocate a full
  * mtu so that later appends can be copied into the linear area.
  */
885 if ((flags & MSG_MORE) &&
886 !(rt->dst.dev->features&NETIF_F_SG))
887 alloclen = mtu;
888 else
889 alloclen = fraglen;
890
891 alloclen += exthdrlen;
892
893 /* The last fragment gets additional space at tail.
894 * Note, with MSG_MORE we overallocate on fragments,
895 * because we have no idea what fragment will be
896 * the last.
897 */
898 if (datalen == length + fraggap)
899 alloclen += rt->dst.trailer_len;
900
 /*
  * The skb that carries the transport header is allocated with
  * sock_alloc_send_skb(); later ones use sock_wmalloc(), and only
  * while sk_wmem_alloc is at most twice sk_sndbuf.
  */
901 if (transhdrlen) {
902 skb = sock_alloc_send_skb(sk,
903 alloclen + hh_len + 15,
904 (flags & MSG_DONTWAIT), &err);
905 } else {
906 skb = NULL;
907 if (atomic_read(&sk->sk_wmem_alloc) <=
908 2 * sk->sk_sndbuf)
909 skb = sock_wmalloc(sk,
910 alloclen + hh_len + 15, 1,
911 sk->sk_allocation);
912 if (unlikely(skb == NULL))
913 err = -ENOBUFS;
914 else
915 /* only the initial fragment is
916 time stamped */
917 cork->tx_flags = 0;
918 }
919 if (skb == NULL)
920 goto error;
921
922 /*
923 * Fill in the control structures
924 */
925 skb->ip_summed = csummode;
926 skb->csum = 0;
927 skb_reserve(skb, hh_len);
928 skb_shinfo(skb)->tx_flags = cork->tx_flags;
929
930 /*
931 * Find where to start putting bytes.
932 */
933 data = skb_put(skb, fraglen + exthdrlen);
934 skb_set_network_header(skb, exthdrlen);
935 skb->transport_header = (skb->network_header +
936 fragheaderlen);
937 data += fragheaderlen + exthdrlen;
938
 /*
  * Move the overhang of the previous skb into this one, fix up both
  * checksums and trim the previous skb back to maxfraglen.
  */
939 if (fraggap) {
940 skb->csum = skb_copy_and_csum_bits(
941 skb_prev, maxfraglen,
942 data + transhdrlen, fraggap, 0);
943 skb_prev->csum = csum_sub(skb_prev->csum,
944 skb->csum);
945 data += fraggap;
946 pskb_trim_unique(skb_prev, maxfraglen);
947 }
948
949 copy = datalen - transhdrlen - fraggap;
950 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
951 err = -EFAULT;
952 kfree_skb(skb);
953 goto error;
954 }
955
 /* Header lengths and checksum offload only apply to the first skb. */
956 offset += copy;
957 length -= datalen - fraggap;
958 transhdrlen = 0;
959 exthdrlen = 0;
960 csummode = CHECKSUM_NONE;
961
962 /*
963 * Put the packet on the pending queue.
964 */
965 __skb_queue_tail(queue, skb);
966 continue;
967 }
968
969 if (copy > length)
970 copy = length;
971
 /* No scatter-gather: copy into the linear area of the current skb. */
972 if (!(rt->dst.dev->features&NETIF_F_SG)) {
973 unsigned int off;
974
975 off = skb->len;
976 if (getfrag(from, skb_put(skb, copy),
977 offset, copy, off, skb) < 0) {
978 __skb_trim(skb, off);
979 err = -EFAULT;
980 goto error;
981 }
982 } else {
 /*
  * Scatter-gather: append into page fragments, continuing in the
  * cork's partially filled page while it has room, else in a new page.
  */
983 int i = skb_shinfo(skb)->nr_frags;
984 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
985 struct page *page = cork->page;
986 int off = cork->off;
987 unsigned int left;
988
989 if (page && (left = PAGE_SIZE - off) > 0) {
990 if (copy >= left)
991 copy = left;
992 if (page != frag->page) {
993 if (i == MAX_SKB_FRAGS) {
994 err = -EMSGSIZE;
995 goto error;
996 }
997 get_page(page);
998 skb_fill_page_desc(skb, i, page, off, 0);
999 frag = &skb_shinfo(skb)->frags[i];
1000 }
1001 } else if (i < MAX_SKB_FRAGS) {
1002 if (copy > PAGE_SIZE)
1003 copy = PAGE_SIZE;
1004 page = alloc_pages(sk->sk_allocation, 0);
1005 if (page == NULL) {
1006 err = -ENOMEM;
1007 goto error;
1008 }
1009 cork->page = page;
1010 cork->off = 0;
1011
1012 skb_fill_page_desc(skb, i, page, 0, 0);
1013 frag = &skb_shinfo(skb)->frags[i];
1014 } else {
1015 err = -EMSGSIZE;
1016 goto error;
1017 }
1018 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1019 err = -EFAULT;
1020 goto error;
1021 }
 /* Account the new bytes in the fragment, the skb and the socket. */
1022 cork->off += copy;
1023 frag->size += copy;
1024 skb->len += copy;
1025 skb->data_len += copy;
1026 skb->truesize += copy;
1027 atomic_add(copy, &sk->sk_wmem_alloc);
1028 }
1029 offset += copy;
1030 length -= copy;
1031 }
1032
1033 return 0;
1034
 /* 'length' is what is still unqueued; take it back off the cork total. */
1035error:
1036 cork->length -= length;
1037 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1038 return err;
1039}
1040
1041static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1042 struct ipcm_cookie *ipc, struct rtable **rtp)
1043{
1044 struct inet_sock *inet = inet_sk(sk);
1045 struct ip_options_rcu *opt;
1046 struct rtable *rt;
1047
1048 /*
1049 * setup for corking.
1050 */
1051 opt = ipc->opt;
1052 if (opt) {
1053 if (cork->opt == NULL) {
1054 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1055 sk->sk_allocation);
1056 if (unlikely(cork->opt == NULL))
1057 return -ENOBUFS;
1058 }
1059 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1060 cork->flags |= IPCORK_OPT;
1061 cork->addr = ipc->addr;
1062 }
1063 rt = *rtp;
1064 if (unlikely(!rt))
1065 return -EFAULT;
1066 /*
1067 * We steal reference to this route, caller should not release it
1068 */
1069 *rtp = NULL;
1070 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1071 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1072 cork->dst = &rt->dst;
1073 cork->length = 0;
1074 cork->tx_flags = ipc->tx_flags;
1075 cork->page = NULL;
1076 cork->off = 0;
1077
1078 return 0;
1079}
1080
1081/*
1082 * ip_append_data() and ip_append_page() can make one large IP datagram
1083 * from many pieces of data. Each pieces will be holded on the socket
1084 * until ip_push_pending_frames() is called. Each piece can be a page
1085 * or non-page data.
1086 *
1087 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1088 * this interface potentially.
1089 *
1090 * LATER: length must be adjusted by pad at tail, when it is required.
1091 */
1092int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1093 int getfrag(void *from, char *to, int offset, int len,
1094 int odd, struct sk_buff *skb),
1095 void *from, int length, int transhdrlen,
1096 struct ipcm_cookie *ipc, struct rtable **rtp,
1097 unsigned int flags)
1098{
1099 struct inet_sock *inet = inet_sk(sk);
1100 int err;
1101
1102 if (flags&MSG_PROBE)
1103 return 0;
1104
1105 if (skb_queue_empty(&sk->sk_write_queue)) {
1106 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1107 if (err)
1108 return err;
1109 } else {
1110 transhdrlen = 0;
1111 }
1112
1113 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1114 from, length, transhdrlen, flags);
1115}
1116
/*
 *	ip_append_page - append page data to the datagram pending on the socket.
 *	@sk:     socket whose sk_write_queue holds the pending datagram
 *	@fl4:    flow; only fl4->daddr is read here, for the EMSGSIZE report
 *	@page:   page that supplies the payload bytes
 *	@offset: offset of the first byte to use inside @page
 *	@size:   number of bytes to append
 *	@flags:  MSG_* flags; only MSG_PROBE is examined here
 *
 *	The bytes are attached to the tail skb of sk->sk_write_queue as page
 *	fragments (get_page() + skb_fill_page_desc(), or by growing the last
 *	fragment when skb_can_coalesce() allows it).  When the tail skb is
 *	full a new skb is allocated and queued.
 *
 *	Returns 0 on success (and immediately for MSG_PROBE), or a negative
 *	errno: -EPERM for hdrincl sockets, -EINVAL when nothing is queued,
 *	-EOPNOTSUPP when the output device lacks NETIF_F_SG, -EMSGSIZE when
 *	the datagram would exceed the 0xFFFF limit or the skb has no free
 *	fragment slot, -ENOBUFS when a new skb cannot be allocated.
 */
ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	struct inet_cork *cork;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	/* Pages can only be appended to a datagram that was already started. */
	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	/* Route and IP options come from the cork state of the socket. */
	cork = &inet->cork.base;
	rt = (struct rtable *)cork->dst;
	if (cork->flags & IPCORK_OPT)
		opt = cork->opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = cork->fragsize;

	/*
	 * fragheaderlen: IP header plus options.
	 * maxfraglen: largest packet length whose payload part is a
	 * multiple of 8 bytes and that still fits into mtu.
	 */
	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (cork->length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	/* Account for the new bytes up front; the error path undoes the rest. */
	cork->length += size;
	/*
	 * The datagram will not fit into one mtu, the socket is UDP and the
	 * device advertises NETIF_F_UFO: fill in the GSO fields of the tail
	 * skb (presumably what makes skb_is_gso() true in the loop below).
	 */
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}


	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		/* No room left in the tail skb: start a new one. */
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			/* Bytes of the previous skb that lie beyond maxfraglen. */
			fraggap = skb_prev->len - maxfraglen;

			/* NOTE(review): purpose of the extra 15 bytes is not visible here. */
			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			/*
			 * Move the overhanging bytes from the previous skb into
			 * the new one, fix up both checksums and trim the
			 * previous skb back to maxfraglen.
			 */
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		/* Extend the last fragment if possible, else use a new slot. */
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		/* Software checksum: fold the new bytes in at offset skb->len. */
		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		/* Update skb and socket accounting for the bytes just attached. */
		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	/* size now holds the bytes that were not appended; undo their accounting. */
	cork->length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1261
1262static void ip_cork_release(struct inet_cork *cork)
1263{
1264 cork->flags &= ~IPCORK_OPT;
1265 kfree(cork->opt);
1266 cork->opt = NULL;
1267 dst_release(cork->dst);
1268 cork->dst = NULL;
1269}
1270
1271/*
1272 * Combined all pending IP fragments on the socket as one IP datagram
1273 * and push them out.
1274 */
1275struct sk_buff *__ip_make_skb(struct sock *sk,
1276 struct flowi4 *fl4,
1277 struct sk_buff_head *queue,
1278 struct inet_cork *cork)
1279{
1280 struct sk_buff *skb, *tmp_skb;
1281 struct sk_buff **tail_skb;
1282 struct inet_sock *inet = inet_sk(sk);
1283 struct net *net = sock_net(sk);
1284 struct ip_options *opt = NULL;
1285 struct rtable *rt = (struct rtable *)cork->dst;
1286 struct iphdr *iph;
1287 __be16 df = 0;
1288 __u8 ttl;
1289
1290 if ((skb = __skb_dequeue(queue)) == NULL)
1291 goto out;
1292 tail_skb = &(skb_shinfo(skb)->frag_list);
1293
1294 /* move skb->data to ip header from ext header */
1295 if (skb->data < skb_network_header(skb))
1296 __skb_pull(skb, skb_network_offset(skb));
1297 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1298 __skb_pull(tmp_skb, skb_network_header_len(skb));
1299 *tail_skb = tmp_skb;
1300 tail_skb = &(tmp_skb->next);
1301 skb->len += tmp_skb->len;
1302 skb->data_len += tmp_skb->len;
1303 skb->truesize += tmp_skb->truesize;
1304 tmp_skb->destructor = NULL;
1305 tmp_skb->sk = NULL;
1306 }
1307
1308 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1309 * to fragment the frame generated here. No matter, what transforms
1310 * how transforms change size of the packet, it will come out.
1311 */
1312 if (inet->pmtudisc < IP_PMTUDISC_DO)
1313 skb->local_df = 1;
1314
1315 /* DF bit is set when we want to see DF on outgoing frames.
1316 * If local_df is set too, we still allow to fragment this frame
1317 * locally. */
1318 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1319 (skb->len <= dst_mtu(&rt->dst) &&
1320 ip_dont_fragment(sk, &rt->dst)))
1321 df = htons(IP_DF);
1322
1323 if (cork->flags & IPCORK_OPT)
1324 opt = cork->opt;
1325
1326 if (rt->rt_type == RTN_MULTICAST)
1327 ttl = inet->mc_ttl;
1328 else
1329 ttl = ip_select_ttl(inet, &rt->dst);
1330
1331 iph = (struct iphdr *)skb->data;
1332 iph->version = 4;
1333 iph->ihl = 5;
1334 iph->tos = inet->tos;
1335 iph->frag_off = df;
1336 ip_select_ident(iph, &rt->dst, sk);
1337 iph->ttl = ttl;
1338 iph->protocol = sk->sk_protocol;
1339 iph->saddr = fl4->saddr;
1340 iph->daddr = fl4->daddr;
1341
1342 if (opt) {
1343 iph->ihl += opt->optlen>>2;
1344 ip_options_build(skb, opt, cork->addr, rt, 0);
1345 }
1346
1347 skb->priority = sk->sk_priority;
1348 skb->mark = sk->sk_mark;
1349 /*
1350 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1351 * on dst refcount
1352 */
1353 cork->dst = NULL;
1354 skb_dst_set(skb, &rt->dst);
1355
1356 if (iph->protocol == IPPROTO_ICMP)
1357 icmp_out_count(net, ((struct icmphdr *)
1358 skb_transport_header(skb))->type);
1359
1360 ip_cork_release(cork);
1361out:
1362 return skb;
1363}
1364
1365int ip_send_skb(struct sk_buff *skb)
1366{
1367 struct net *net = sock_net(skb->sk);
1368 int err;
1369
1370 err = ip_local_out(skb);
1371 if (err) {
1372 if (err > 0)
1373 err = net_xmit_errno(err);
1374 if (err)
1375 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1376 }
1377
1378 return err;
1379}
1380
/*
 *	Assemble whatever is pending on the socket and transmit it.
 *	Returns 0 when there was nothing to send, otherwise the result of
 *	ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *datagram = ip_finish_skb(sk, fl4);

	/* Netfilter gets the whole, not yet fragmented, skb. */
	return datagram ? ip_send_skb(datagram) : 0;
}
1392
1393/*
1394 * Throw away all pending data on the socket.
1395 */
1396static void __ip_flush_pending_frames(struct sock *sk,
1397 struct sk_buff_head *queue,
1398 struct inet_cork *cork)
1399{
1400 struct sk_buff *skb;
1401
1402 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1403 kfree_skb(skb);
1404
1405 ip_cork_release(cork);
1406}
1407
1408void ip_flush_pending_frames(struct sock *sk)
1409{
1410 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1411}
1412
1413struct sk_buff *ip_make_skb(struct sock *sk,
1414 struct flowi4 *fl4,
1415 int getfrag(void *from, char *to, int offset,
1416 int len, int odd, struct sk_buff *skb),
1417 void *from, int length, int transhdrlen,
1418 struct ipcm_cookie *ipc, struct rtable **rtp,
1419 unsigned int flags)
1420{
1421 struct inet_cork cork;
1422 struct sk_buff_head queue;
1423 int err;
1424
1425 if (flags & MSG_PROBE)
1426 return NULL;
1427
1428 __skb_queue_head_init(&queue);
1429
1430 cork.flags = 0;
1431 cork.addr = 0;
1432 cork.opt = NULL;
1433 err = ip_setup_cork(sk, &cork, ipc, rtp);
1434 if (err)
1435 return ERR_PTR(err);
1436
1437 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1438 from, length, transhdrlen, flags);
1439 if (err) {
1440 __ip_flush_pending_frames(sk, &queue, &cork);
1441 return ERR_PTR(err);
1442 }
1443
1444 return __ip_make_skb(sk, fl4, &queue, &cork);
1445}
1446
1447/*
1448 * Fetch data from kernel space and fill in checksum if needed.
1449 */
1450static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1451 int len, int odd, struct sk_buff *skb)
1452{
1453 __wsum csum;
1454
1455 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1456 skb->csum = csum_block_add(skb->csum, csum, odd);
1457 return 0;
1458}
1459
1460/*
1461 * Generic function to send a packet as reply to another packet.
1462 * Used to send TCP resets so far. ICMP should use this function too.
1463 *
1464 * Should run single threaded per socket because it uses the sock
1465 * structure to pass arguments.
1466 */
1467void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1468 struct ip_reply_arg *arg, unsigned int len)
1469{
1470 struct inet_sock *inet = inet_sk(sk);
1471 struct ip_options_data replyopts;
1472 struct ipcm_cookie ipc;
1473 struct flowi4 fl4;
1474 struct rtable *rt = skb_rtable(skb);
1475
1476 if (ip_options_echo(&replyopts.opt.opt, skb))
1477 return;
1478
1479 ipc.addr = daddr;
1480 ipc.opt = NULL;
1481 ipc.tx_flags = 0;
1482
1483 if (replyopts.opt.opt.optlen) {
1484 ipc.opt = &replyopts.opt;
1485
1486 if (replyopts.opt.opt.srr)
1487 daddr = replyopts.opt.opt.faddr;
1488 }
1489
1490 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1491 RT_TOS(ip_hdr(skb)->tos),
1492 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1493 ip_reply_arg_flowi_flags(arg),
1494 daddr, rt->rt_spec_dst,
1495 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1496 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1497 rt = ip_route_output_key(sock_net(sk), &fl4);
1498 if (IS_ERR(rt))
1499 return;
1500
1501 /* And let IP do all the hard work.
1502
1503 This chunk is not reenterable, hence spinlock.
1504 Note that it uses the fact, that this function is called
1505 with locally disabled BH and that sk cannot be already spinlocked.
1506 */
1507 bh_lock_sock(sk);
1508 inet->tos = ip_hdr(skb)->tos;
1509 sk->sk_priority = skb->priority;
1510 sk->sk_protocol = ip_hdr(skb)->protocol;
1511 sk->sk_bound_dev_if = arg->bound_dev_if;
1512 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1513 &ipc, &rt, MSG_DONTWAIT);
1514 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1515 if (arg->csumoffset >= 0)
1516 *((__sum16 *)skb_transport_header(skb) +
1517 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1518 arg->csum));
1519 skb->ip_summed = CHECKSUM_NONE;
1520 ip_push_pending_frames(sk, &fl4);
1521 }
1522
1523 bh_unlock_sock(sk);
1524
1525 ip_rt_put(rt);
1526}
1527
1528void __init ip_init(void)
1529{
1530 ip_rt_init();
1531 inet_initpeers();
1532
1533#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1534 igmp_mc_proc_init();
1535#endif
1536}
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * The Internet Protocol (IP) output module.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Donald Becker, <becker@super.org>
12 * Alan Cox, <Alan.Cox@linux.org>
13 * Richard Underwood
14 * Stefan Becker, <stefanb@yello.ping.de>
15 * Jorge Cwik, <jorge@laser.satlink.net>
16 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
17 * Hirokazu Takahashi, <taka@valinux.co.jp>
18 *
19 * See ip_input.c for original log
20 *
21 * Fixes:
22 * Alan Cox : Missing nonblock feature in ip_build_xmit.
23 * Mike Kilburn : htons() missing in ip_build_xmit.
24 * Bradford Johnson: Fix faulty handling of some frames when
25 * no route is found.
26 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
27 * (in case if packet not accepted by
28 * output firewall rules)
29 * Mike McLagan : Routing by source
30 * Alexey Kuznetsov: use new route cache
31 * Andi Kleen: Fix broken PMTU recovery and remove
32 * some redundant tests.
33 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
34 * Andi Kleen : Replace ip_reply with ip_send_reply.
35 * Andi Kleen : Split fast and slow ip_build_xmit path
36 * for decreased register pressure on x86
37 * and more readability.
38 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
39 * silently drop skb instead of failing with -EPERM.
40 * Detlev Wengorz : Copy protocol for fragments.
41 * Hirokazu Takahashi: HW checksumming for outgoing UDP
42 * datagrams.
43 * Hirokazu Takahashi: sendfile() on UDP works now.
44 */
45
46#include <linux/uaccess.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54#include <linux/slab.h>
55
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
70#include <net/xfrm.h>
71#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
75#include <net/checksum.h>
76#include <net/inetpeer.h>
77#include <net/inet_ecn.h>
78#include <net/lwtunnel.h>
79#include <linux/bpf-cgroup.h>
80#include <linux/igmp.h>
81#include <linux/netfilter_ipv4.h>
82#include <linux/netfilter_bridge.h>
83#include <linux/netlink.h>
84#include <linux/tcp.h>
85
86static int
87ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
88 unsigned int mtu,
89 int (*output)(struct net *, struct sock *, struct sk_buff *));
90
91/* Generate a checksum for an outgoing IP datagram. */
92void ip_send_check(struct iphdr *iph)
93{
94 iph->check = 0;
95 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
96}
97EXPORT_SYMBOL(ip_send_check);
98
99int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
100{
101 struct iphdr *iph = ip_hdr(skb);
102
103 iph->tot_len = htons(skb->len);
104 ip_send_check(iph);
105
106 /* if egress device is enslaved to an L3 master device pass the
107 * skb to its handler for processing
108 */
109 skb = l3mdev_ip_out(sk, skb);
110 if (unlikely(!skb))
111 return 0;
112
113 skb->protocol = htons(ETH_P_IP);
114
115 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
116 net, sk, skb, NULL, skb_dst(skb)->dev,
117 dst_output);
118}
119
/*
 * Send a locally generated packet: run __ip_local_out() and, when it
 * returns 1, continue with dst_output().  Any other value is returned
 * unchanged.
 */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int verdict = __ip_local_out(net, sk, skb);

	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(net, sk, skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
131
132static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
133{
134 int ttl = inet->uc_ttl;
135
136 if (ttl < 0)
137 ttl = ip4_dst_hoplimit(dst);
138 return ttl;
139}
140
141/*
142 * Add an ip header to a skbuff and send it out.
143 *
144 */
145int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
146 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
147 u8 tos)
148{
149 struct inet_sock *inet = inet_sk(sk);
150 struct rtable *rt = skb_rtable(skb);
151 struct net *net = sock_net(sk);
152 struct iphdr *iph;
153
154 /* Build the IP header. */
155 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
156 skb_reset_network_header(skb);
157 iph = ip_hdr(skb);
158 iph->version = 4;
159 iph->ihl = 5;
160 iph->tos = tos;
161 iph->ttl = ip_select_ttl(inet, &rt->dst);
162 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
163 iph->saddr = saddr;
164 iph->protocol = sk->sk_protocol;
165 /* Do not bother generating IPID for small packets (eg SYNACK) */
166 if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) {
167 iph->frag_off = htons(IP_DF);
168 iph->id = 0;
169 } else {
170 iph->frag_off = 0;
171 /* TCP packets here are SYNACK with fat IPv4/TCP options.
172 * Avoid using the hashed IP ident generator.
173 */
174 if (sk->sk_protocol == IPPROTO_TCP)
175 iph->id = (__force __be16)get_random_u16();
176 else
177 __ip_select_ident(net, iph, 1);
178 }
179
180 if (opt && opt->opt.optlen) {
181 iph->ihl += opt->opt.optlen>>2;
182 ip_options_build(skb, &opt->opt, daddr, rt);
183 }
184
185 skb->priority = sk->sk_priority;
186 if (!skb->mark)
187 skb->mark = sk->sk_mark;
188
189 /* Send it out. */
190 return ip_local_out(net, skb->sk, skb);
191}
192EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
193
194static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
195{
196 struct dst_entry *dst = skb_dst(skb);
197 struct rtable *rt = (struct rtable *)dst;
198 struct net_device *dev = dst->dev;
199 unsigned int hh_len = LL_RESERVED_SPACE(dev);
200 struct neighbour *neigh;
201 bool is_v6gw = false;
202
203 if (rt->rt_type == RTN_MULTICAST) {
204 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
205 } else if (rt->rt_type == RTN_BROADCAST)
206 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
207
208 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
209 skb = skb_expand_head(skb, hh_len);
210 if (!skb)
211 return -ENOMEM;
212 }
213
214 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
215 int res = lwtunnel_xmit(skb);
216
217 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
218 return res;
219 }
220
221 rcu_read_lock_bh();
222 neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
223 if (!IS_ERR(neigh)) {
224 int res;
225
226 sock_confirm_neigh(skb, neigh);
227 /* if crossing protocols, can not use the cached header */
228 res = neigh_output(neigh, skb, is_v6gw);
229 rcu_read_unlock_bh();
230 return res;
231 }
232 rcu_read_unlock_bh();
233
234 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
235 __func__);
236 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
237 return -EINVAL;
238}
239
240static int ip_finish_output_gso(struct net *net, struct sock *sk,
241 struct sk_buff *skb, unsigned int mtu)
242{
243 struct sk_buff *segs, *nskb;
244 netdev_features_t features;
245 int ret = 0;
246
247 /* common case: seglen is <= mtu
248 */
249 if (skb_gso_validate_network_len(skb, mtu))
250 return ip_finish_output2(net, sk, skb);
251
252 /* Slowpath - GSO segment length exceeds the egress MTU.
253 *
254 * This can happen in several cases:
255 * - Forwarding of a TCP GRO skb, when DF flag is not set.
256 * - Forwarding of an skb that arrived on a virtualization interface
257 * (virtio-net/vhost/tap) with TSO/GSO size set by other network
258 * stack.
259 * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
260 * interface with a smaller MTU.
261 * - Arriving GRO skb (or GSO skb in a virtualized environment) that is
262 * bridged to a NETIF_F_TSO tunnel stacked over an interface with an
263 * insufficient MTU.
264 */
265 features = netif_skb_features(skb);
266 BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
267 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
268 if (IS_ERR_OR_NULL(segs)) {
269 kfree_skb(skb);
270 return -ENOMEM;
271 }
272
273 consume_skb(skb);
274
275 skb_list_walk_safe(segs, segs, nskb) {
276 int err;
277
278 skb_mark_not_on_list(segs);
279 err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
280
281 if (err && ret == 0)
282 ret = err;
283 }
284
285 return ret;
286}
287
288static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
289{
290 unsigned int mtu;
291
292#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
293 /* Policy lookup after SNAT yielded a new policy */
294 if (skb_dst(skb)->xfrm) {
295 IPCB(skb)->flags |= IPSKB_REROUTED;
296 return dst_output(net, sk, skb);
297 }
298#endif
299 mtu = ip_skb_dst_mtu(sk, skb);
300 if (skb_is_gso(skb))
301 return ip_finish_output_gso(net, sk, skb, mtu);
302
303 if (skb->len > mtu || IPCB(skb)->frag_max_size)
304 return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
305
306 return ip_finish_output2(net, sk, skb);
307}
308
309static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
310{
311 int ret;
312
313 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
314 switch (ret) {
315 case NET_XMIT_SUCCESS:
316 return __ip_finish_output(net, sk, skb);
317 case NET_XMIT_CN:
318 return __ip_finish_output(net, sk, skb) ? : ret;
319 default:
320 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
321 return ret;
322 }
323}
324
325static int ip_mc_finish_output(struct net *net, struct sock *sk,
326 struct sk_buff *skb)
327{
328 struct rtable *new_rt;
329 bool do_cn = false;
330 int ret, err;
331
332 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
333 switch (ret) {
334 case NET_XMIT_CN:
335 do_cn = true;
336 fallthrough;
337 case NET_XMIT_SUCCESS:
338 break;
339 default:
340 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
341 return ret;
342 }
343
344 /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
345 * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
346 * see ipv4_pktinfo_prepare().
347 */
348 new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
349 if (new_rt) {
350 new_rt->rt_iif = 0;
351 skb_dst_drop(skb);
352 skb_dst_set(skb, &new_rt->dst);
353 }
354
355 err = dev_loopback_xmit(net, sk, skb);
356 return (do_cn && err) ? ret : err;
357}
358
359int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
360{
361 struct rtable *rt = skb_rtable(skb);
362 struct net_device *dev = rt->dst.dev;
363
364 /*
365 * If the indicated interface is up and running, send the packet.
366 */
367 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
368
369 skb->dev = dev;
370 skb->protocol = htons(ETH_P_IP);
371
372 /*
373 * Multicasts are looped back for other local users
374 */
375
376 if (rt->rt_flags&RTCF_MULTICAST) {
377 if (sk_mc_loop(sk)
378#ifdef CONFIG_IP_MROUTE
379 /* Small optimization: do not loopback not local frames,
380 which returned after forwarding; they will be dropped
381 by ip_mr_input in any case.
382 Note, that local frames are looped back to be delivered
383 to local recipients.
384
385 This check is duplicated in ip_mr_input at the moment.
386 */
387 &&
388 ((rt->rt_flags & RTCF_LOCAL) ||
389 !(IPCB(skb)->flags & IPSKB_FORWARDED))
390#endif
391 ) {
392 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
393 if (newskb)
394 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
395 net, sk, newskb, NULL, newskb->dev,
396 ip_mc_finish_output);
397 }
398
399 /* Multicasts with ttl 0 must not go beyond the host */
400
401 if (ip_hdr(skb)->ttl == 0) {
402 kfree_skb(skb);
403 return 0;
404 }
405 }
406
407 if (rt->rt_flags&RTCF_BROADCAST) {
408 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
409 if (newskb)
410 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
411 net, sk, newskb, NULL, newskb->dev,
412 ip_mc_finish_output);
413 }
414
415 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
416 net, sk, skb, NULL, skb->dev,
417 ip_finish_output,
418 !(IPCB(skb)->flags & IPSKB_REROUTED));
419}
420
421int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
422{
423 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
424
425 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
426
427 skb->dev = dev;
428 skb->protocol = htons(ETH_P_IP);
429
430 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
431 net, sk, skb, indev, dev,
432 ip_finish_output,
433 !(IPCB(skb)->flags & IPSKB_REROUTED));
434}
435EXPORT_SYMBOL(ip_output);
436
437/*
438 * copy saddr and daddr, possibly using 64bit load/stores
439 * Equivalent to :
440 * iph->saddr = fl4->saddr;
441 * iph->daddr = fl4->daddr;
442 */
443static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
444{
445 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
446 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
447
448 iph->saddr = fl4->saddr;
449 iph->daddr = fl4->daddr;
450}
451
452/* Note: skb->sk can be different from sk, in case of tunnels */
453int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
454 __u8 tos)
455{
456 struct inet_sock *inet = inet_sk(sk);
457 struct net *net = sock_net(sk);
458 struct ip_options_rcu *inet_opt;
459 struct flowi4 *fl4;
460 struct rtable *rt;
461 struct iphdr *iph;
462 int res;
463
464 /* Skip all of this if the packet is already routed,
465 * f.e. by something like SCTP.
466 */
467 rcu_read_lock();
468 inet_opt = rcu_dereference(inet->inet_opt);
469 fl4 = &fl->u.ip4;
470 rt = skb_rtable(skb);
471 if (rt)
472 goto packet_routed;
473
474 /* Make sure we can route this packet. */
475 rt = (struct rtable *)__sk_dst_check(sk, 0);
476 if (!rt) {
477 __be32 daddr;
478
479 /* Use correct destination address if we have options. */
480 daddr = inet->inet_daddr;
481 if (inet_opt && inet_opt->opt.srr)
482 daddr = inet_opt->opt.faddr;
483
484 /* If this fails, retransmit mechanism of transport layer will
485 * keep trying until route appears or the connection times
486 * itself out.
487 */
488 rt = ip_route_output_ports(net, fl4, sk,
489 daddr, inet->inet_saddr,
490 inet->inet_dport,
491 inet->inet_sport,
492 sk->sk_protocol,
493 RT_CONN_FLAGS_TOS(sk, tos),
494 sk->sk_bound_dev_if);
495 if (IS_ERR(rt))
496 goto no_route;
497 sk_setup_caps(sk, &rt->dst);
498 }
499 skb_dst_set_noref(skb, &rt->dst);
500
501packet_routed:
502 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
503 goto no_route;
504
505 /* OK, we know where to send it, allocate and build IP header. */
506 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
507 skb_reset_network_header(skb);
508 iph = ip_hdr(skb);
509 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
510 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
511 iph->frag_off = htons(IP_DF);
512 else
513 iph->frag_off = 0;
514 iph->ttl = ip_select_ttl(inet, &rt->dst);
515 iph->protocol = sk->sk_protocol;
516 ip_copy_addrs(iph, fl4);
517
518 /* Transport layer set skb->h.foo itself. */
519
520 if (inet_opt && inet_opt->opt.optlen) {
521 iph->ihl += inet_opt->opt.optlen >> 2;
522 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
523 }
524
525 ip_select_ident_segs(net, skb, sk,
526 skb_shinfo(skb)->gso_segs ?: 1);
527
528 /* TODO : should we use skb->sk here instead of sk ? */
529 skb->priority = sk->sk_priority;
530 skb->mark = sk->sk_mark;
531
532 res = ip_local_out(net, sk, skb);
533 rcu_read_unlock();
534 return res;
535
536no_route:
537 rcu_read_unlock();
538 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
539 kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
540 return -EHOSTUNREACH;
541}
542EXPORT_SYMBOL(__ip_queue_xmit);
543
544int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
545{
546 return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
547}
548EXPORT_SYMBOL(ip_queue_xmit);
549
550static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
551{
552 to->pkt_type = from->pkt_type;
553 to->priority = from->priority;
554 to->protocol = from->protocol;
555 to->skb_iif = from->skb_iif;
556 skb_dst_drop(to);
557 skb_dst_copy(to, from);
558 to->dev = from->dev;
559 to->mark = from->mark;
560
561 skb_copy_hash(to, from);
562
563#ifdef CONFIG_NET_SCHED
564 to->tc_index = from->tc_index;
565#endif
566 nf_copy(to, from);
567 skb_ext_copy(to, from);
568#if IS_ENABLED(CONFIG_IP_VS)
569 to->ipvs_property = from->ipvs_property;
570#endif
571 skb_copy_secmark(to, from);
572}
573
574static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
575 unsigned int mtu,
576 int (*output)(struct net *, struct sock *, struct sk_buff *))
577{
578 struct iphdr *iph = ip_hdr(skb);
579
580 if ((iph->frag_off & htons(IP_DF)) == 0)
581 return ip_do_fragment(net, sk, skb, output);
582
583 if (unlikely(!skb->ignore_df ||
584 (IPCB(skb)->frag_max_size &&
585 IPCB(skb)->frag_max_size > mtu))) {
586 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
587 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
588 htonl(mtu));
589 kfree_skb(skb);
590 return -EMSGSIZE;
591 }
592
593 return ip_do_fragment(net, sk, skb, output);
594}
595
596void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
597 unsigned int hlen, struct ip_fraglist_iter *iter)
598{
599 unsigned int first_len = skb_pagelen(skb);
600
601 iter->frag = skb_shinfo(skb)->frag_list;
602 skb_frag_list_init(skb);
603
604 iter->offset = 0;
605 iter->iph = iph;
606 iter->hlen = hlen;
607
608 skb->data_len = first_len - skb_headlen(skb);
609 skb->len = first_len;
610 iph->tot_len = htons(first_len);
611 iph->frag_off = htons(IP_MF);
612 ip_send_check(iph);
613}
614EXPORT_SYMBOL(ip_fraglist_init);
615
616void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
617{
618 unsigned int hlen = iter->hlen;
619 struct iphdr *iph = iter->iph;
620 struct sk_buff *frag;
621
622 frag = iter->frag;
623 frag->ip_summed = CHECKSUM_NONE;
624 skb_reset_transport_header(frag);
625 __skb_push(frag, hlen);
626 skb_reset_network_header(frag);
627 memcpy(skb_network_header(frag), iph, hlen);
628 iter->iph = ip_hdr(frag);
629 iph = iter->iph;
630 iph->tot_len = htons(frag->len);
631 ip_copy_metadata(frag, skb);
632 iter->offset += skb->len - hlen;
633 iph->frag_off = htons(iter->offset >> 3);
634 if (frag->next)
635 iph->frag_off |= htons(IP_MF);
636 /* Ready, complete checksum */
637 ip_send_check(iph);
638}
639EXPORT_SYMBOL(ip_fraglist_prepare);
640
641void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
642 unsigned int ll_rs, unsigned int mtu, bool DF,
643 struct ip_frag_state *state)
644{
645 struct iphdr *iph = ip_hdr(skb);
646
647 state->DF = DF;
648 state->hlen = hlen;
649 state->ll_rs = ll_rs;
650 state->mtu = mtu;
651
652 state->left = skb->len - hlen; /* Space per frame */
653 state->ptr = hlen; /* Where to start from */
654
655 state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
656 state->not_last_frag = iph->frag_off & htons(IP_MF);
657}
658EXPORT_SYMBOL(ip_frag_init);
659
660static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
661 bool first_frag)
662{
663 /* Copy the flags to each fragment. */
664 IPCB(to)->flags = IPCB(from)->flags;
665
666 /* ANK: dirty, but effective trick. Upgrade options only if
667 * the segment to be fragmented was THE FIRST (otherwise,
668 * options are already fixed) and make it ONCE
669 * on the initial skb, so that all the following fragments
670 * will inherit fixed options.
671 */
672 if (first_frag)
673 ip_options_fragment(from);
674}
675
676struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
677{
678 unsigned int len = state->left;
679 struct sk_buff *skb2;
680 struct iphdr *iph;
681
682 /* IF: it doesn't fit, use 'mtu' - the data space left */
683 if (len > state->mtu)
684 len = state->mtu;
685 /* IF: we are not sending up to and including the packet end
686 then align the next start on an eight byte boundary */
687 if (len < state->left) {
688 len &= ~7;
689 }
690
691 /* Allocate buffer */
692 skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
693 if (!skb2)
694 return ERR_PTR(-ENOMEM);
695
696 /*
697 * Set up data on packet
698 */
699
700 ip_copy_metadata(skb2, skb);
701 skb_reserve(skb2, state->ll_rs);
702 skb_put(skb2, len + state->hlen);
703 skb_reset_network_header(skb2);
704 skb2->transport_header = skb2->network_header + state->hlen;
705
706 /*
707 * Charge the memory for the fragment to any owner
708 * it might possess
709 */
710
711 if (skb->sk)
712 skb_set_owner_w(skb2, skb->sk);
713
714 /*
715 * Copy the packet header into the new buffer.
716 */
717
718 skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
719
720 /*
721 * Copy a block of the IP datagram.
722 */
723 if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
724 BUG();
725 state->left -= len;
726
727 /*
728 * Fill in the new header fields.
729 */
730 iph = ip_hdr(skb2);
731 iph->frag_off = htons((state->offset >> 3));
732 if (state->DF)
733 iph->frag_off |= htons(IP_DF);
734
735 /*
736 * Added AC : If we are fragmenting a fragment that's not the
737 * last fragment then keep MF on each bit
738 */
739 if (state->left > 0 || state->not_last_frag)
740 iph->frag_off |= htons(IP_MF);
741 state->ptr += len;
742 state->offset += len;
743
744 iph->tot_len = htons(len + state->hlen);
745
746 ip_send_check(iph);
747
748 return skb2;
749}
750EXPORT_SYMBOL(ip_frag_next);
751
752/*
753 * This IP datagram is too large to be sent in one piece. Break it up into
754 * smaller pieces (each of size equal to IP header plus
755 * a block of the data of the original IP data part) that will yet fit in a
756 * single device frame, and queue such a frame for sending.
757 */
758
759int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
760 int (*output)(struct net *, struct sock *, struct sk_buff *))
761{
762 struct iphdr *iph;
763 struct sk_buff *skb2;
764 bool mono_delivery_time = skb->mono_delivery_time;
765 struct rtable *rt = skb_rtable(skb);
766 unsigned int mtu, hlen, ll_rs;
767 struct ip_fraglist_iter iter;
768 ktime_t tstamp = skb->tstamp;
769 struct ip_frag_state state;
770 int err = 0;
771
772 /* for offloaded checksums cleanup checksum before fragmentation */
773 if (skb->ip_summed == CHECKSUM_PARTIAL &&
774 (err = skb_checksum_help(skb)))
775 goto fail;
776
777 /*
778 * Point into the IP datagram header.
779 */
780
781 iph = ip_hdr(skb);
782
783 mtu = ip_skb_dst_mtu(sk, skb);
784 if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
785 mtu = IPCB(skb)->frag_max_size;
786
787 /*
788 * Setup starting values.
789 */
790
791 hlen = iph->ihl * 4;
792 mtu = mtu - hlen; /* Size of data space */
793 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
794 ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
795
796 /* When frag_list is given, use it. First, check its validity:
797 * some transformers could create wrong frag_list or break existing
798 * one, it is not prohibited. In this case fall back to copying.
799 *
800 * LATER: this step can be merged to real generation of fragments,
801 * we can switch to copy when see the first bad fragment.
802 */
803 if (skb_has_frag_list(skb)) {
804 struct sk_buff *frag, *frag2;
805 unsigned int first_len = skb_pagelen(skb);
806
807 if (first_len - hlen > mtu ||
808 ((first_len - hlen) & 7) ||
809 ip_is_fragment(iph) ||
810 skb_cloned(skb) ||
811 skb_headroom(skb) < ll_rs)
812 goto slow_path;
813
814 skb_walk_frags(skb, frag) {
815 /* Correct geometry. */
816 if (frag->len > mtu ||
817 ((frag->len & 7) && frag->next) ||
818 skb_headroom(frag) < hlen + ll_rs)
819 goto slow_path_clean;
820
821 /* Partially cloned skb? */
822 if (skb_shared(frag))
823 goto slow_path_clean;
824
825 BUG_ON(frag->sk);
826 if (skb->sk) {
827 frag->sk = skb->sk;
828 frag->destructor = sock_wfree;
829 }
830 skb->truesize -= frag->truesize;
831 }
832
833 /* Everything is OK. Generate! */
834 ip_fraglist_init(skb, iph, hlen, &iter);
835
836 for (;;) {
837 /* Prepare header of the next frame,
838 * before previous one went down. */
839 if (iter.frag) {
840 bool first_frag = (iter.offset == 0);
841
842 IPCB(iter.frag)->flags = IPCB(skb)->flags;
843 ip_fraglist_prepare(skb, &iter);
844 if (first_frag && IPCB(skb)->opt.optlen) {
845 /* ipcb->opt is not populated for frags
846 * coming from __ip_make_skb(),
847 * ip_options_fragment() needs optlen
848 */
849 IPCB(iter.frag)->opt.optlen =
850 IPCB(skb)->opt.optlen;
851 ip_options_fragment(iter.frag);
852 ip_send_check(iter.iph);
853 }
854 }
855
856 skb_set_delivery_time(skb, tstamp, mono_delivery_time);
857 err = output(net, sk, skb);
858
859 if (!err)
860 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
861 if (err || !iter.frag)
862 break;
863
864 skb = ip_fraglist_next(&iter);
865 }
866
867 if (err == 0) {
868 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
869 return 0;
870 }
871
872 kfree_skb_list(iter.frag);
873
874 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
875 return err;
876
877slow_path_clean:
878 skb_walk_frags(skb, frag2) {
879 if (frag2 == frag)
880 break;
881 frag2->sk = NULL;
882 frag2->destructor = NULL;
883 skb->truesize += frag2->truesize;
884 }
885 }
886
887slow_path:
888 /*
889 * Fragment the datagram.
890 */
891
892 ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
893 &state);
894
895 /*
896 * Keep copying data until we run out.
897 */
898
899 while (state.left > 0) {
900 bool first_frag = (state.offset == 0);
901
902 skb2 = ip_frag_next(skb, &state);
903 if (IS_ERR(skb2)) {
904 err = PTR_ERR(skb2);
905 goto fail;
906 }
907 ip_frag_ipcb(skb, skb2, first_frag);
908
909 /*
910 * Put this fragment into the sending queue.
911 */
912 skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
913 err = output(net, sk, skb2);
914 if (err)
915 goto fail;
916
917 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
918 }
919 consume_skb(skb);
920 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
921 return err;
922
923fail:
924 kfree_skb(skb);
925 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
926 return err;
927}
928EXPORT_SYMBOL(ip_do_fragment);
929
930int
931ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
932{
933 struct msghdr *msg = from;
934
935 if (skb->ip_summed == CHECKSUM_PARTIAL) {
936 if (!copy_from_iter_full(to, len, &msg->msg_iter))
937 return -EFAULT;
938 } else {
939 __wsum csum = 0;
940 if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
941 return -EFAULT;
942 skb->csum = csum_block_add(skb->csum, csum, odd);
943 }
944 return 0;
945}
946EXPORT_SYMBOL(ip_generic_getfrag);
947
948static inline __wsum
949csum_page(struct page *page, int offset, int copy)
950{
951 char *kaddr;
952 __wsum csum;
953 kaddr = kmap(page);
954 csum = csum_partial(kaddr + offset, copy, 0);
955 kunmap(page);
956 return csum;
957}
958
959static int __ip_append_data(struct sock *sk,
960 struct flowi4 *fl4,
961 struct sk_buff_head *queue,
962 struct inet_cork *cork,
963 struct page_frag *pfrag,
964 int getfrag(void *from, char *to, int offset,
965 int len, int odd, struct sk_buff *skb),
966 void *from, int length, int transhdrlen,
967 unsigned int flags)
968{
969 struct inet_sock *inet = inet_sk(sk);
970 struct ubuf_info *uarg = NULL;
971 struct sk_buff *skb;
972 struct ip_options *opt = cork->opt;
973 int hh_len;
974 int exthdrlen;
975 int mtu;
976 int copy;
977 int err;
978 int offset = 0;
979 bool zc = false;
980 unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
981 int csummode = CHECKSUM_NONE;
982 struct rtable *rt = (struct rtable *)cork->dst;
983 unsigned int wmem_alloc_delta = 0;
984 bool paged, extra_uref = false;
985 u32 tskey = 0;
986
987 skb = skb_peek_tail(queue);
988
989 exthdrlen = !skb ? rt->dst.header_len : 0;
990 mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
991 paged = !!cork->gso_size;
992
993 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
994 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
995 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
996
997 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
998
999 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1000 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1001 maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;
1002
1003 if (cork->length + length > maxnonfragsize - fragheaderlen) {
1004 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
1005 mtu - (opt ? opt->optlen : 0));
1006 return -EMSGSIZE;
1007 }
1008
1009 /*
1010 * transhdrlen > 0 means that this is the first fragment and we wish
1011 * it won't be fragmented in the future.
1012 */
1013 if (transhdrlen &&
1014 length + fragheaderlen <= mtu &&
1015 rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
1016 (!(flags & MSG_MORE) || cork->gso_size) &&
1017 (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
1018 csummode = CHECKSUM_PARTIAL;
1019
1020 if ((flags & MSG_ZEROCOPY) && length) {
1021 struct msghdr *msg = from;
1022
1023 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1024 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1025 return -EINVAL;
1026
1027 /* Leave uarg NULL if can't zerocopy, callers should
1028 * be able to handle it.
1029 */
1030 if ((rt->dst.dev->features & NETIF_F_SG) &&
1031 csummode == CHECKSUM_PARTIAL) {
1032 paged = true;
1033 zc = true;
1034 uarg = msg->msg_ubuf;
1035 }
1036 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1037 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1038 if (!uarg)
1039 return -ENOBUFS;
1040 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1041 if (rt->dst.dev->features & NETIF_F_SG &&
1042 csummode == CHECKSUM_PARTIAL) {
1043 paged = true;
1044 zc = true;
1045 } else {
1046 uarg_to_msgzc(uarg)->zerocopy = 0;
1047 skb_zcopy_set(skb, uarg, &extra_uref);
1048 }
1049 }
1050 }
1051
1052 cork->length += length;
1053
1054 /* So, what's going on in the loop below?
1055 *
1056 * We use calculated fragment length to generate chained skb,
1057 * each of segments is IP fragment ready for sending to network after
1058 * adding appropriate IP header.
1059 */
1060
1061 if (!skb)
1062 goto alloc_new_skb;
1063
1064 while (length > 0) {
1065 /* Check if the remaining data fits into current packet. */
1066 copy = mtu - skb->len;
1067 if (copy < length)
1068 copy = maxfraglen - skb->len;
1069 if (copy <= 0) {
1070 char *data;
1071 unsigned int datalen;
1072 unsigned int fraglen;
1073 unsigned int fraggap;
1074 unsigned int alloclen, alloc_extra;
1075 unsigned int pagedlen;
1076 struct sk_buff *skb_prev;
1077alloc_new_skb:
1078 skb_prev = skb;
1079 if (skb_prev)
1080 fraggap = skb_prev->len - maxfraglen;
1081 else
1082 fraggap = 0;
1083
1084 /*
1085 * If remaining data exceeds the mtu,
1086 * we know we need more fragment(s).
1087 */
1088 datalen = length + fraggap;
1089 if (datalen > mtu - fragheaderlen)
1090 datalen = maxfraglen - fragheaderlen;
1091 fraglen = datalen + fragheaderlen;
1092 pagedlen = 0;
1093
1094 alloc_extra = hh_len + 15;
1095 alloc_extra += exthdrlen;
1096
1097 /* The last fragment gets additional space at tail.
1098 * Note, with MSG_MORE we overallocate on fragments,
1099 * because we have no idea what fragment will be
1100 * the last.
1101 */
1102 if (datalen == length + fraggap)
1103 alloc_extra += rt->dst.trailer_len;
1104
1105 if ((flags & MSG_MORE) &&
1106 !(rt->dst.dev->features&NETIF_F_SG))
1107 alloclen = mtu;
1108 else if (!paged &&
1109 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1110 !(rt->dst.dev->features & NETIF_F_SG)))
1111 alloclen = fraglen;
1112 else {
1113 alloclen = fragheaderlen + transhdrlen;
1114 pagedlen = datalen - transhdrlen;
1115 }
1116
1117 alloclen += alloc_extra;
1118
1119 if (transhdrlen) {
1120 skb = sock_alloc_send_skb(sk, alloclen,
1121 (flags & MSG_DONTWAIT), &err);
1122 } else {
1123 skb = NULL;
1124 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1125 2 * sk->sk_sndbuf)
1126 skb = alloc_skb(alloclen,
1127 sk->sk_allocation);
1128 if (unlikely(!skb))
1129 err = -ENOBUFS;
1130 }
1131 if (!skb)
1132 goto error;
1133
1134 /*
1135 * Fill in the control structures
1136 */
1137 skb->ip_summed = csummode;
1138 skb->csum = 0;
1139 skb_reserve(skb, hh_len);
1140
1141 /*
1142 * Find where to start putting bytes.
1143 */
1144 data = skb_put(skb, fraglen + exthdrlen - pagedlen);
1145 skb_set_network_header(skb, exthdrlen);
1146 skb->transport_header = (skb->network_header +
1147 fragheaderlen);
1148 data += fragheaderlen + exthdrlen;
1149
1150 if (fraggap) {
1151 skb->csum = skb_copy_and_csum_bits(
1152 skb_prev, maxfraglen,
1153 data + transhdrlen, fraggap);
1154 skb_prev->csum = csum_sub(skb_prev->csum,
1155 skb->csum);
1156 data += fraggap;
1157 pskb_trim_unique(skb_prev, maxfraglen);
1158 }
1159
1160 copy = datalen - transhdrlen - fraggap - pagedlen;
1161 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1162 err = -EFAULT;
1163 kfree_skb(skb);
1164 goto error;
1165 }
1166
1167 offset += copy;
1168 length -= copy + transhdrlen;
1169 transhdrlen = 0;
1170 exthdrlen = 0;
1171 csummode = CHECKSUM_NONE;
1172
1173 /* only the initial fragment is time stamped */
1174 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1175 cork->tx_flags = 0;
1176 skb_shinfo(skb)->tskey = tskey;
1177 tskey = 0;
1178 skb_zcopy_set(skb, uarg, &extra_uref);
1179
1180 if ((flags & MSG_CONFIRM) && !skb_prev)
1181 skb_set_dst_pending_confirm(skb, 1);
1182
1183 /*
1184 * Put the packet on the pending queue.
1185 */
1186 if (!skb->destructor) {
1187 skb->destructor = sock_wfree;
1188 skb->sk = sk;
1189 wmem_alloc_delta += skb->truesize;
1190 }
1191 __skb_queue_tail(queue, skb);
1192 continue;
1193 }
1194
1195 if (copy > length)
1196 copy = length;
1197
1198 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1199 skb_tailroom(skb) >= copy) {
1200 unsigned int off;
1201
1202 off = skb->len;
1203 if (getfrag(from, skb_put(skb, copy),
1204 offset, copy, off, skb) < 0) {
1205 __skb_trim(skb, off);
1206 err = -EFAULT;
1207 goto error;
1208 }
1209 } else if (!zc) {
1210 int i = skb_shinfo(skb)->nr_frags;
1211
1212 err = -ENOMEM;
1213 if (!sk_page_frag_refill(sk, pfrag))
1214 goto error;
1215
1216 skb_zcopy_downgrade_managed(skb);
1217 if (!skb_can_coalesce(skb, i, pfrag->page,
1218 pfrag->offset)) {
1219 err = -EMSGSIZE;
1220 if (i == MAX_SKB_FRAGS)
1221 goto error;
1222
1223 __skb_fill_page_desc(skb, i, pfrag->page,
1224 pfrag->offset, 0);
1225 skb_shinfo(skb)->nr_frags = ++i;
1226 get_page(pfrag->page);
1227 }
1228 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1229 if (getfrag(from,
1230 page_address(pfrag->page) + pfrag->offset,
1231 offset, copy, skb->len, skb) < 0)
1232 goto error_efault;
1233
1234 pfrag->offset += copy;
1235 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1236 skb_len_add(skb, copy);
1237 wmem_alloc_delta += copy;
1238 } else {
1239 err = skb_zerocopy_iter_dgram(skb, from, copy);
1240 if (err < 0)
1241 goto error;
1242 }
1243 offset += copy;
1244 length -= copy;
1245 }
1246
1247 if (wmem_alloc_delta)
1248 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1249 return 0;
1250
1251error_efault:
1252 err = -EFAULT;
1253error:
1254 net_zcopy_put_abort(uarg, extra_uref);
1255 cork->length -= length;
1256 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1257 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1258 return err;
1259}
1260
1261static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1262 struct ipcm_cookie *ipc, struct rtable **rtp)
1263{
1264 struct ip_options_rcu *opt;
1265 struct rtable *rt;
1266
1267 rt = *rtp;
1268 if (unlikely(!rt))
1269 return -EFAULT;
1270
1271 /*
1272 * setup for corking.
1273 */
1274 opt = ipc->opt;
1275 if (opt) {
1276 if (!cork->opt) {
1277 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1278 sk->sk_allocation);
1279 if (unlikely(!cork->opt))
1280 return -ENOBUFS;
1281 }
1282 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1283 cork->flags |= IPCORK_OPT;
1284 cork->addr = ipc->addr;
1285 }
1286
1287 cork->fragsize = ip_sk_use_pmtu(sk) ?
1288 dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
1289
1290 if (!inetdev_valid_mtu(cork->fragsize))
1291 return -ENETUNREACH;
1292
1293 cork->gso_size = ipc->gso_size;
1294
1295 cork->dst = &rt->dst;
1296 /* We stole this route, caller should not release it. */
1297 *rtp = NULL;
1298
1299 cork->length = 0;
1300 cork->ttl = ipc->ttl;
1301 cork->tos = ipc->tos;
1302 cork->mark = ipc->sockc.mark;
1303 cork->priority = ipc->priority;
1304 cork->transmit_time = ipc->sockc.transmit_time;
1305 cork->tx_flags = 0;
1306 sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
1307
1308 return 0;
1309}
1310
1311/*
1312 * ip_append_data() and ip_append_page() can make one large IP datagram
1313 * from many pieces of data. Each pieces will be holded on the socket
1314 * until ip_push_pending_frames() is called. Each piece can be a page
1315 * or non-page data.
1316 *
1317 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1318 * this interface potentially.
1319 *
1320 * LATER: length must be adjusted by pad at tail, when it is required.
1321 */
1322int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1323 int getfrag(void *from, char *to, int offset, int len,
1324 int odd, struct sk_buff *skb),
1325 void *from, int length, int transhdrlen,
1326 struct ipcm_cookie *ipc, struct rtable **rtp,
1327 unsigned int flags)
1328{
1329 struct inet_sock *inet = inet_sk(sk);
1330 int err;
1331
1332 if (flags&MSG_PROBE)
1333 return 0;
1334
1335 if (skb_queue_empty(&sk->sk_write_queue)) {
1336 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1337 if (err)
1338 return err;
1339 } else {
1340 transhdrlen = 0;
1341 }
1342
1343 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1344 sk_page_frag(sk), getfrag,
1345 from, length, transhdrlen, flags);
1346}
1347
1348ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1349 int offset, size_t size, int flags)
1350{
1351 struct inet_sock *inet = inet_sk(sk);
1352 struct sk_buff *skb;
1353 struct rtable *rt;
1354 struct ip_options *opt = NULL;
1355 struct inet_cork *cork;
1356 int hh_len;
1357 int mtu;
1358 int len;
1359 int err;
1360 unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
1361
1362 if (inet->hdrincl)
1363 return -EPERM;
1364
1365 if (flags&MSG_PROBE)
1366 return 0;
1367
1368 if (skb_queue_empty(&sk->sk_write_queue))
1369 return -EINVAL;
1370
1371 cork = &inet->cork.base;
1372 rt = (struct rtable *)cork->dst;
1373 if (cork->flags & IPCORK_OPT)
1374 opt = cork->opt;
1375
1376 if (!(rt->dst.dev->features & NETIF_F_SG))
1377 return -EOPNOTSUPP;
1378
1379 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1380 mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
1381
1382 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1383 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1384 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
1385
1386 if (cork->length + size > maxnonfragsize - fragheaderlen) {
1387 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
1388 mtu - (opt ? opt->optlen : 0));
1389 return -EMSGSIZE;
1390 }
1391
1392 skb = skb_peek_tail(&sk->sk_write_queue);
1393 if (!skb)
1394 return -EINVAL;
1395
1396 cork->length += size;
1397
1398 while (size > 0) {
1399 /* Check if the remaining data fits into current packet. */
1400 len = mtu - skb->len;
1401 if (len < size)
1402 len = maxfraglen - skb->len;
1403
1404 if (len <= 0) {
1405 struct sk_buff *skb_prev;
1406 int alloclen;
1407
1408 skb_prev = skb;
1409 fraggap = skb_prev->len - maxfraglen;
1410
1411 alloclen = fragheaderlen + hh_len + fraggap + 15;
1412 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1413 if (unlikely(!skb)) {
1414 err = -ENOBUFS;
1415 goto error;
1416 }
1417
1418 /*
1419 * Fill in the control structures
1420 */
1421 skb->ip_summed = CHECKSUM_NONE;
1422 skb->csum = 0;
1423 skb_reserve(skb, hh_len);
1424
1425 /*
1426 * Find where to start putting bytes.
1427 */
1428 skb_put(skb, fragheaderlen + fraggap);
1429 skb_reset_network_header(skb);
1430 skb->transport_header = (skb->network_header +
1431 fragheaderlen);
1432 if (fraggap) {
1433 skb->csum = skb_copy_and_csum_bits(skb_prev,
1434 maxfraglen,
1435 skb_transport_header(skb),
1436 fraggap);
1437 skb_prev->csum = csum_sub(skb_prev->csum,
1438 skb->csum);
1439 pskb_trim_unique(skb_prev, maxfraglen);
1440 }
1441
1442 /*
1443 * Put the packet on the pending queue.
1444 */
1445 __skb_queue_tail(&sk->sk_write_queue, skb);
1446 continue;
1447 }
1448
1449 if (len > size)
1450 len = size;
1451
1452 if (skb_append_pagefrags(skb, page, offset, len)) {
1453 err = -EMSGSIZE;
1454 goto error;
1455 }
1456
1457 if (skb->ip_summed == CHECKSUM_NONE) {
1458 __wsum csum;
1459 csum = csum_page(page, offset, len);
1460 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1461 }
1462
1463 skb_len_add(skb, len);
1464 refcount_add(len, &sk->sk_wmem_alloc);
1465 offset += len;
1466 size -= len;
1467 }
1468 return 0;
1469
1470error:
1471 cork->length -= size;
1472 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1473 return err;
1474}
1475
1476static void ip_cork_release(struct inet_cork *cork)
1477{
1478 cork->flags &= ~IPCORK_OPT;
1479 kfree(cork->opt);
1480 cork->opt = NULL;
1481 dst_release(cork->dst);
1482 cork->dst = NULL;
1483}
1484
1485/*
1486 * Combined all pending IP fragments on the socket as one IP datagram
1487 * and push them out.
1488 */
1489struct sk_buff *__ip_make_skb(struct sock *sk,
1490 struct flowi4 *fl4,
1491 struct sk_buff_head *queue,
1492 struct inet_cork *cork)
1493{
1494 struct sk_buff *skb, *tmp_skb;
1495 struct sk_buff **tail_skb;
1496 struct inet_sock *inet = inet_sk(sk);
1497 struct net *net = sock_net(sk);
1498 struct ip_options *opt = NULL;
1499 struct rtable *rt = (struct rtable *)cork->dst;
1500 struct iphdr *iph;
1501 __be16 df = 0;
1502 __u8 ttl;
1503
1504 skb = __skb_dequeue(queue);
1505 if (!skb)
1506 goto out;
1507 tail_skb = &(skb_shinfo(skb)->frag_list);
1508
1509 /* move skb->data to ip header from ext header */
1510 if (skb->data < skb_network_header(skb))
1511 __skb_pull(skb, skb_network_offset(skb));
1512 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1513 __skb_pull(tmp_skb, skb_network_header_len(skb));
1514 *tail_skb = tmp_skb;
1515 tail_skb = &(tmp_skb->next);
1516 skb->len += tmp_skb->len;
1517 skb->data_len += tmp_skb->len;
1518 skb->truesize += tmp_skb->truesize;
1519 tmp_skb->destructor = NULL;
1520 tmp_skb->sk = NULL;
1521 }
1522
1523 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1524 * to fragment the frame generated here. No matter, what transforms
1525 * how transforms change size of the packet, it will come out.
1526 */
1527 skb->ignore_df = ip_sk_ignore_df(sk);
1528
1529 /* DF bit is set when we want to see DF on outgoing frames.
1530 * If ignore_df is set too, we still allow to fragment this frame
1531 * locally. */
1532 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1533 inet->pmtudisc == IP_PMTUDISC_PROBE ||
1534 (skb->len <= dst_mtu(&rt->dst) &&
1535 ip_dont_fragment(sk, &rt->dst)))
1536 df = htons(IP_DF);
1537
1538 if (cork->flags & IPCORK_OPT)
1539 opt = cork->opt;
1540
1541 if (cork->ttl != 0)
1542 ttl = cork->ttl;
1543 else if (rt->rt_type == RTN_MULTICAST)
1544 ttl = inet->mc_ttl;
1545 else
1546 ttl = ip_select_ttl(inet, &rt->dst);
1547
1548 iph = ip_hdr(skb);
1549 iph->version = 4;
1550 iph->ihl = 5;
1551 iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
1552 iph->frag_off = df;
1553 iph->ttl = ttl;
1554 iph->protocol = sk->sk_protocol;
1555 ip_copy_addrs(iph, fl4);
1556 ip_select_ident(net, skb, sk);
1557
1558 if (opt) {
1559 iph->ihl += opt->optlen >> 2;
1560 ip_options_build(skb, opt, cork->addr, rt);
1561 }
1562
1563 skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
1564 skb->mark = cork->mark;
1565 skb->tstamp = cork->transmit_time;
1566 /*
1567 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1568 * on dst refcount
1569 */
1570 cork->dst = NULL;
1571 skb_dst_set(skb, &rt->dst);
1572
1573 if (iph->protocol == IPPROTO_ICMP)
1574 icmp_out_count(net, ((struct icmphdr *)
1575 skb_transport_header(skb))->type);
1576
1577 ip_cork_release(cork);
1578out:
1579 return skb;
1580}
1581
1582int ip_send_skb(struct net *net, struct sk_buff *skb)
1583{
1584 int err;
1585
1586 err = ip_local_out(net, skb->sk, skb);
1587 if (err) {
1588 if (err > 0)
1589 err = net_xmit_errno(err);
1590 if (err)
1591 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1592 }
1593
1594 return err;
1595}
1596
/*
 * Build the pending datagram of @sk with ip_finish_skb() and send it.
 * Returns 0 when there is nothing to send, otherwise the result of
 * ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets whole the not fragmented skb. */
	return skb ? ip_send_skb(sock_net(sk), skb) : 0;
}
1608
/*
 *	Throw away all pending data on the socket: free every skb on @queue,
 *	taking them from the tail, then release @cork.
 */
static void __ip_flush_pending_frames(struct sock *sk,
				      struct sk_buff_head *queue,
				      struct inet_cork *cork)
{
	struct sk_buff *skb;

	for (;;) {
		skb = __skb_dequeue_tail(queue);
		if (!skb)
			break;
		kfree_skb(skb);
	}

	ip_cork_release(cork);
}
1623
1624void ip_flush_pending_frames(struct sock *sk)
1625{
1626 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1627}
1628
1629struct sk_buff *ip_make_skb(struct sock *sk,
1630 struct flowi4 *fl4,
1631 int getfrag(void *from, char *to, int offset,
1632 int len, int odd, struct sk_buff *skb),
1633 void *from, int length, int transhdrlen,
1634 struct ipcm_cookie *ipc, struct rtable **rtp,
1635 struct inet_cork *cork, unsigned int flags)
1636{
1637 struct sk_buff_head queue;
1638 int err;
1639
1640 if (flags & MSG_PROBE)
1641 return NULL;
1642
1643 __skb_queue_head_init(&queue);
1644
1645 cork->flags = 0;
1646 cork->addr = 0;
1647 cork->opt = NULL;
1648 err = ip_setup_cork(sk, cork, ipc, rtp);
1649 if (err)
1650 return ERR_PTR(err);
1651
1652 err = __ip_append_data(sk, fl4, &queue, cork,
1653 ¤t->task_frag, getfrag,
1654 from, length, transhdrlen, flags);
1655 if (err) {
1656 __ip_flush_pending_frames(sk, &queue, cork);
1657 return ERR_PTR(err);
1658 }
1659
1660 return __ip_make_skb(sk, fl4, &queue, cork);
1661}
1662
1663/*
1664 * Fetch data from kernel space and fill in checksum if needed.
1665 */
1666static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1667 int len, int odd, struct sk_buff *skb)
1668{
1669 __wsum csum;
1670
1671 csum = csum_partial_copy_nocheck(dptr+offset, to, len);
1672 skb->csum = csum_block_add(skb->csum, csum, odd);
1673 return 0;
1674}
1675
1676/*
1677 * Generic function to send a packet as reply to another packet.
1678 * Used to send some TCP resets/acks so far.
1679 */
1680void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1681 const struct ip_options *sopt,
1682 __be32 daddr, __be32 saddr,
1683 const struct ip_reply_arg *arg,
1684 unsigned int len, u64 transmit_time)
1685{
1686 struct ip_options_data replyopts;
1687 struct ipcm_cookie ipc;
1688 struct flowi4 fl4;
1689 struct rtable *rt = skb_rtable(skb);
1690 struct net *net = sock_net(sk);
1691 struct sk_buff *nskb;
1692 int err;
1693 int oif;
1694
1695 if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
1696 return;
1697
1698 ipcm_init(&ipc);
1699 ipc.addr = daddr;
1700 ipc.sockc.transmit_time = transmit_time;
1701
1702 if (replyopts.opt.opt.optlen) {
1703 ipc.opt = &replyopts.opt;
1704
1705 if (replyopts.opt.opt.srr)
1706 daddr = replyopts.opt.opt.faddr;
1707 }
1708
1709 oif = arg->bound_dev_if;
1710 if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
1711 oif = skb->skb_iif;
1712
1713 flowi4_init_output(&fl4, oif,
1714 IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
1715 RT_TOS(arg->tos),
1716 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1717 ip_reply_arg_flowi_flags(arg),
1718 daddr, saddr,
1719 tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
1720 arg->uid);
1721 security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
1722 rt = ip_route_output_flow(net, &fl4, sk);
1723 if (IS_ERR(rt))
1724 return;
1725
1726 inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
1727
1728 sk->sk_protocol = ip_hdr(skb)->protocol;
1729 sk->sk_bound_dev_if = arg->bound_dev_if;
1730 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
1731 ipc.sockc.mark = fl4.flowi4_mark;
1732 err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
1733 len, 0, &ipc, &rt, MSG_DONTWAIT);
1734 if (unlikely(err)) {
1735 ip_flush_pending_frames(sk);
1736 goto out;
1737 }
1738
1739 nskb = skb_peek(&sk->sk_write_queue);
1740 if (nskb) {
1741 if (arg->csumoffset >= 0)
1742 *((__sum16 *)skb_transport_header(nskb) +
1743 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1744 arg->csum));
1745 nskb->ip_summed = CHECKSUM_NONE;
1746 nskb->mono_delivery_time = !!transmit_time;
1747 ip_push_pending_frames(sk, &fl4);
1748 }
1749out:
1750 ip_rt_put(rt);
1751}
1752
1753void __init ip_init(void)
1754{
1755 ip_rt_init();
1756 inet_initpeers();
1757
1758#if defined(CONFIG_IP_MULTICAST)
1759 igmp_mc_init();
1760#endif
1761}