/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen:		Fix broken PMTU recovery and remove
 *					some redundant tests.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
44
45#include <linux/uaccess.h>
46#include <linux/module.h>
47#include <linux/types.h>
48#include <linux/kernel.h>
49#include <linux/mm.h>
50#include <linux/string.h>
51#include <linux/errno.h>
52#include <linux/highmem.h>
53#include <linux/slab.h>
54
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
69#include <net/xfrm.h>
70#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
74#include <net/checksum.h>
75#include <net/inetpeer.h>
76#include <net/lwtunnel.h>
77#include <linux/bpf-cgroup.h>
78#include <linux/igmp.h>
79#include <linux/netfilter_ipv4.h>
80#include <linux/netfilter_bridge.h>
81#include <linux/netlink.h>
82#include <linux/tcp.h>
83
/*
 * Forward declaration: ip_fragment() is defined further down in this file
 * but is already called by ip_finish_output_gso() and ip_finish_output().
 */
static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		       unsigned int mtu,
		       int (*output)(struct net *, struct sock *, struct sk_buff *));
88
89/* Generate a checksum for an outgoing IP datagram. */
90void ip_send_check(struct iphdr *iph)
91{
92 iph->check = 0;
93 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
94}
95EXPORT_SYMBOL(ip_send_check);
96
97int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
98{
99 struct iphdr *iph = ip_hdr(skb);
100
101 iph->tot_len = htons(skb->len);
102 ip_send_check(iph);
103
104 /* if egress device is enslaved to an L3 master device pass the
105 * skb to its handler for processing
106 */
107 skb = l3mdev_ip_out(sk, skb);
108 if (unlikely(!skb))
109 return 0;
110
111 skb->protocol = htons(ETH_P_IP);
112
113 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
114 net, sk, skb, NULL, skb_dst(skb)->dev,
115 dst_output);
116}
117
/*
 * Send a locally generated packet: run __ip_local_out() and, only when it
 * returns 1, continue with dst_output().  Any other value from
 * __ip_local_out() is returned to the caller unchanged.
 */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int rc = __ip_local_out(net, sk, skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(net, sk, skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
129
130static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
131{
132 int ttl = inet->uc_ttl;
133
134 if (ttl < 0)
135 ttl = ip4_dst_hoplimit(dst);
136 return ttl;
137}
138
139/*
140 * Add an ip header to a skbuff and send it out.
141 *
142 */
143int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
144 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
145{
146 struct inet_sock *inet = inet_sk(sk);
147 struct rtable *rt = skb_rtable(skb);
148 struct net *net = sock_net(sk);
149 struct iphdr *iph;
150
151 /* Build the IP header. */
152 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
153 skb_reset_network_header(skb);
154 iph = ip_hdr(skb);
155 iph->version = 4;
156 iph->ihl = 5;
157 iph->tos = inet->tos;
158 iph->ttl = ip_select_ttl(inet, &rt->dst);
159 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
160 iph->saddr = saddr;
161 iph->protocol = sk->sk_protocol;
162 if (ip_dont_fragment(sk, &rt->dst)) {
163 iph->frag_off = htons(IP_DF);
164 iph->id = 0;
165 } else {
166 iph->frag_off = 0;
167 __ip_select_ident(net, iph, 1);
168 }
169
170 if (opt && opt->opt.optlen) {
171 iph->ihl += opt->opt.optlen>>2;
172 ip_options_build(skb, &opt->opt, daddr, rt, 0);
173 }
174
175 skb->priority = sk->sk_priority;
176 if (!skb->mark)
177 skb->mark = sk->sk_mark;
178
179 /* Send it out. */
180 return ip_local_out(net, skb->sk, skb);
181}
182EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
183
184static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
185{
186 struct dst_entry *dst = skb_dst(skb);
187 struct rtable *rt = (struct rtable *)dst;
188 struct net_device *dev = dst->dev;
189 unsigned int hh_len = LL_RESERVED_SPACE(dev);
190 struct neighbour *neigh;
191 u32 nexthop;
192
193 if (rt->rt_type == RTN_MULTICAST) {
194 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
195 } else if (rt->rt_type == RTN_BROADCAST)
196 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
197
198 /* Be paranoid, rather than too clever. */
199 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
200 struct sk_buff *skb2;
201
202 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
203 if (!skb2) {
204 kfree_skb(skb);
205 return -ENOMEM;
206 }
207 if (skb->sk)
208 skb_set_owner_w(skb2, skb->sk);
209 consume_skb(skb);
210 skb = skb2;
211 }
212
213 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
214 int res = lwtunnel_xmit(skb);
215
216 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
217 return res;
218 }
219
220 rcu_read_lock_bh();
221 nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
222 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
223 if (unlikely(!neigh))
224 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
225 if (!IS_ERR(neigh)) {
226 int res;
227
228 sock_confirm_neigh(skb, neigh);
229 res = neigh_output(neigh, skb);
230
231 rcu_read_unlock_bh();
232 return res;
233 }
234 rcu_read_unlock_bh();
235
236 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
237 __func__);
238 kfree_skb(skb);
239 return -EINVAL;
240}
241
242static int ip_finish_output_gso(struct net *net, struct sock *sk,
243 struct sk_buff *skb, unsigned int mtu)
244{
245 netdev_features_t features;
246 struct sk_buff *segs;
247 int ret = 0;
248
249 /* common case: seglen is <= mtu
250 */
251 if (skb_gso_validate_network_len(skb, mtu))
252 return ip_finish_output2(net, sk, skb);
253
254 /* Slowpath - GSO segment length exceeds the egress MTU.
255 *
256 * This can happen in several cases:
257 * - Forwarding of a TCP GRO skb, when DF flag is not set.
258 * - Forwarding of an skb that arrived on a virtualization interface
259 * (virtio-net/vhost/tap) with TSO/GSO size set by other network
260 * stack.
261 * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
262 * interface with a smaller MTU.
263 * - Arriving GRO skb (or GSO skb in a virtualized environment) that is
264 * bridged to a NETIF_F_TSO tunnel stacked over an interface with an
265 * insufficent MTU.
266 */
267 features = netif_skb_features(skb);
268 BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
269 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
270 if (IS_ERR_OR_NULL(segs)) {
271 kfree_skb(skb);
272 return -ENOMEM;
273 }
274
275 consume_skb(skb);
276
277 do {
278 struct sk_buff *nskb = segs->next;
279 int err;
280
281 segs->next = NULL;
282 err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
283
284 if (err && ret == 0)
285 ret = err;
286 segs = nskb;
287 } while (segs);
288
289 return ret;
290}
291
292static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
293{
294 unsigned int mtu;
295 int ret;
296
297 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
298 if (ret) {
299 kfree_skb(skb);
300 return ret;
301 }
302
303#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
304 /* Policy lookup after SNAT yielded a new policy */
305 if (skb_dst(skb)->xfrm) {
306 IPCB(skb)->flags |= IPSKB_REROUTED;
307 return dst_output(net, sk, skb);
308 }
309#endif
310 mtu = ip_skb_dst_mtu(sk, skb);
311 if (skb_is_gso(skb))
312 return ip_finish_output_gso(net, sk, skb, mtu);
313
314 if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
315 return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
316
317 return ip_finish_output2(net, sk, skb);
318}
319
/*
 * Completion handler for looped-back multicast/broadcast copies: run the
 * cgroup egress BPF program and, if it allows the packet, feed it to
 * dev_loopback_xmit().  A non-zero BPF result frees the skb and is
 * returned.
 */
static int ip_mc_finish_output(struct net *net, struct sock *sk,
			       struct sk_buff *skb)
{
	int verdict = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);

	if (!verdict)
		return dev_loopback_xmit(net, sk, skb);

	kfree_skb(skb);
	return verdict;
}
333
334int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
335{
336 struct rtable *rt = skb_rtable(skb);
337 struct net_device *dev = rt->dst.dev;
338
339 /*
340 * If the indicated interface is up and running, send the packet.
341 */
342 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
343
344 skb->dev = dev;
345 skb->protocol = htons(ETH_P_IP);
346
347 /*
348 * Multicasts are looped back for other local users
349 */
350
351 if (rt->rt_flags&RTCF_MULTICAST) {
352 if (sk_mc_loop(sk)
353#ifdef CONFIG_IP_MROUTE
354 /* Small optimization: do not loopback not local frames,
355 which returned after forwarding; they will be dropped
356 by ip_mr_input in any case.
357 Note, that local frames are looped back to be delivered
358 to local recipients.
359
360 This check is duplicated in ip_mr_input at the moment.
361 */
362 &&
363 ((rt->rt_flags & RTCF_LOCAL) ||
364 !(IPCB(skb)->flags & IPSKB_FORWARDED))
365#endif
366 ) {
367 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
368 if (newskb)
369 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
370 net, sk, newskb, NULL, newskb->dev,
371 ip_mc_finish_output);
372 }
373
374 /* Multicasts with ttl 0 must not go beyond the host */
375
376 if (ip_hdr(skb)->ttl == 0) {
377 kfree_skb(skb);
378 return 0;
379 }
380 }
381
382 if (rt->rt_flags&RTCF_BROADCAST) {
383 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
384 if (newskb)
385 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
386 net, sk, newskb, NULL, newskb->dev,
387 ip_mc_finish_output);
388 }
389
390 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
391 net, sk, skb, NULL, skb->dev,
392 ip_finish_output,
393 !(IPCB(skb)->flags & IPSKB_REROUTED));
394}
395
396int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
397{
398 struct net_device *dev = skb_dst(skb)->dev;
399
400 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
401
402 skb->dev = dev;
403 skb->protocol = htons(ETH_P_IP);
404
405 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
406 net, sk, skb, NULL, dev,
407 ip_finish_output,
408 !(IPCB(skb)->flags & IPSKB_REROUTED));
409}
410
411/*
412 * copy saddr and daddr, possibly using 64bit load/stores
413 * Equivalent to :
414 * iph->saddr = fl4->saddr;
415 * iph->daddr = fl4->daddr;
416 */
417static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
418{
419 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
420 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
421 memcpy(&iph->saddr, &fl4->saddr,
422 sizeof(fl4->saddr) + sizeof(fl4->daddr));
423}
424
425/* Note: skb->sk can be different from sk, in case of tunnels */
426int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
427{
428 struct inet_sock *inet = inet_sk(sk);
429 struct net *net = sock_net(sk);
430 struct ip_options_rcu *inet_opt;
431 struct flowi4 *fl4;
432 struct rtable *rt;
433 struct iphdr *iph;
434 int res;
435
436 /* Skip all of this if the packet is already routed,
437 * f.e. by something like SCTP.
438 */
439 rcu_read_lock();
440 inet_opt = rcu_dereference(inet->inet_opt);
441 fl4 = &fl->u.ip4;
442 rt = skb_rtable(skb);
443 if (rt)
444 goto packet_routed;
445
446 /* Make sure we can route this packet. */
447 rt = (struct rtable *)__sk_dst_check(sk, 0);
448 if (!rt) {
449 __be32 daddr;
450
451 /* Use correct destination address if we have options. */
452 daddr = inet->inet_daddr;
453 if (inet_opt && inet_opt->opt.srr)
454 daddr = inet_opt->opt.faddr;
455
456 /* If this fails, retransmit mechanism of transport layer will
457 * keep trying until route appears or the connection times
458 * itself out.
459 */
460 rt = ip_route_output_ports(net, fl4, sk,
461 daddr, inet->inet_saddr,
462 inet->inet_dport,
463 inet->inet_sport,
464 sk->sk_protocol,
465 RT_CONN_FLAGS(sk),
466 sk->sk_bound_dev_if);
467 if (IS_ERR(rt))
468 goto no_route;
469 sk_setup_caps(sk, &rt->dst);
470 }
471 skb_dst_set_noref(skb, &rt->dst);
472
473packet_routed:
474 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
475 goto no_route;
476
477 /* OK, we know where to send it, allocate and build IP header. */
478 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
479 skb_reset_network_header(skb);
480 iph = ip_hdr(skb);
481 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
482 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
483 iph->frag_off = htons(IP_DF);
484 else
485 iph->frag_off = 0;
486 iph->ttl = ip_select_ttl(inet, &rt->dst);
487 iph->protocol = sk->sk_protocol;
488 ip_copy_addrs(iph, fl4);
489
490 /* Transport layer set skb->h.foo itself. */
491
492 if (inet_opt && inet_opt->opt.optlen) {
493 iph->ihl += inet_opt->opt.optlen >> 2;
494 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
495 }
496
497 ip_select_ident_segs(net, skb, sk,
498 skb_shinfo(skb)->gso_segs ?: 1);
499
500 /* TODO : should we use skb->sk here instead of sk ? */
501 skb->priority = sk->sk_priority;
502 skb->mark = sk->sk_mark;
503
504 res = ip_local_out(net, sk, skb);
505 rcu_read_unlock();
506 return res;
507
508no_route:
509 rcu_read_unlock();
510 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
511 kfree_skb(skb);
512 return -EHOSTUNREACH;
513}
514EXPORT_SYMBOL(ip_queue_xmit);
515
516static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
517{
518 to->pkt_type = from->pkt_type;
519 to->priority = from->priority;
520 to->protocol = from->protocol;
521 skb_dst_drop(to);
522 skb_dst_copy(to, from);
523 to->dev = from->dev;
524 to->mark = from->mark;
525
526 /* Copy the flags to each fragment. */
527 IPCB(to)->flags = IPCB(from)->flags;
528
529#ifdef CONFIG_NET_SCHED
530 to->tc_index = from->tc_index;
531#endif
532 nf_copy(to, from);
533#if IS_ENABLED(CONFIG_IP_VS)
534 to->ipvs_property = from->ipvs_property;
535#endif
536 skb_copy_secmark(to, from);
537}
538
539static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
540 unsigned int mtu,
541 int (*output)(struct net *, struct sock *, struct sk_buff *))
542{
543 struct iphdr *iph = ip_hdr(skb);
544
545 if ((iph->frag_off & htons(IP_DF)) == 0)
546 return ip_do_fragment(net, sk, skb, output);
547
548 if (unlikely(!skb->ignore_df ||
549 (IPCB(skb)->frag_max_size &&
550 IPCB(skb)->frag_max_size > mtu))) {
551 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
552 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
553 htonl(mtu));
554 kfree_skb(skb);
555 return -EMSGSIZE;
556 }
557
558 return ip_do_fragment(net, sk, skb, output);
559}
560
561/*
562 * This IP datagram is too large to be sent in one piece. Break it up into
563 * smaller pieces (each of size equal to IP header plus
564 * a block of the data of the original IP data part) that will yet fit in a
565 * single device frame, and queue such a frame for sending.
566 */
567
568int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
569 int (*output)(struct net *, struct sock *, struct sk_buff *))
570{
571 struct iphdr *iph;
572 int ptr;
573 struct sk_buff *skb2;
574 unsigned int mtu, hlen, left, len, ll_rs;
575 int offset;
576 __be16 not_last_frag;
577 struct rtable *rt = skb_rtable(skb);
578 int err = 0;
579
580 /* for offloaded checksums cleanup checksum before fragmentation */
581 if (skb->ip_summed == CHECKSUM_PARTIAL &&
582 (err = skb_checksum_help(skb)))
583 goto fail;
584
585 /*
586 * Point into the IP datagram header.
587 */
588
589 iph = ip_hdr(skb);
590
591 mtu = ip_skb_dst_mtu(sk, skb);
592 if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
593 mtu = IPCB(skb)->frag_max_size;
594
595 /*
596 * Setup starting values.
597 */
598
599 hlen = iph->ihl * 4;
600 mtu = mtu - hlen; /* Size of data space */
601 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
602 ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
603
604 /* When frag_list is given, use it. First, check its validity:
605 * some transformers could create wrong frag_list or break existing
606 * one, it is not prohibited. In this case fall back to copying.
607 *
608 * LATER: this step can be merged to real generation of fragments,
609 * we can switch to copy when see the first bad fragment.
610 */
611 if (skb_has_frag_list(skb)) {
612 struct sk_buff *frag, *frag2;
613 unsigned int first_len = skb_pagelen(skb);
614
615 if (first_len - hlen > mtu ||
616 ((first_len - hlen) & 7) ||
617 ip_is_fragment(iph) ||
618 skb_cloned(skb) ||
619 skb_headroom(skb) < ll_rs)
620 goto slow_path;
621
622 skb_walk_frags(skb, frag) {
623 /* Correct geometry. */
624 if (frag->len > mtu ||
625 ((frag->len & 7) && frag->next) ||
626 skb_headroom(frag) < hlen + ll_rs)
627 goto slow_path_clean;
628
629 /* Partially cloned skb? */
630 if (skb_shared(frag))
631 goto slow_path_clean;
632
633 BUG_ON(frag->sk);
634 if (skb->sk) {
635 frag->sk = skb->sk;
636 frag->destructor = sock_wfree;
637 }
638 skb->truesize -= frag->truesize;
639 }
640
641 /* Everything is OK. Generate! */
642
643 err = 0;
644 offset = 0;
645 frag = skb_shinfo(skb)->frag_list;
646 skb_frag_list_init(skb);
647 skb->data_len = first_len - skb_headlen(skb);
648 skb->len = first_len;
649 iph->tot_len = htons(first_len);
650 iph->frag_off = htons(IP_MF);
651 ip_send_check(iph);
652
653 for (;;) {
654 /* Prepare header of the next frame,
655 * before previous one went down. */
656 if (frag) {
657 frag->ip_summed = CHECKSUM_NONE;
658 skb_reset_transport_header(frag);
659 __skb_push(frag, hlen);
660 skb_reset_network_header(frag);
661 memcpy(skb_network_header(frag), iph, hlen);
662 iph = ip_hdr(frag);
663 iph->tot_len = htons(frag->len);
664 ip_copy_metadata(frag, skb);
665 if (offset == 0)
666 ip_options_fragment(frag);
667 offset += skb->len - hlen;
668 iph->frag_off = htons(offset>>3);
669 if (frag->next)
670 iph->frag_off |= htons(IP_MF);
671 /* Ready, complete checksum */
672 ip_send_check(iph);
673 }
674
675 err = output(net, sk, skb);
676
677 if (!err)
678 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
679 if (err || !frag)
680 break;
681
682 skb = frag;
683 frag = skb->next;
684 skb->next = NULL;
685 }
686
687 if (err == 0) {
688 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
689 return 0;
690 }
691
692 while (frag) {
693 skb = frag->next;
694 kfree_skb(frag);
695 frag = skb;
696 }
697 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
698 return err;
699
700slow_path_clean:
701 skb_walk_frags(skb, frag2) {
702 if (frag2 == frag)
703 break;
704 frag2->sk = NULL;
705 frag2->destructor = NULL;
706 skb->truesize += frag2->truesize;
707 }
708 }
709
710slow_path:
711 iph = ip_hdr(skb);
712
713 left = skb->len - hlen; /* Space per frame */
714 ptr = hlen; /* Where to start from */
715
716 /*
717 * Fragment the datagram.
718 */
719
720 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
721 not_last_frag = iph->frag_off & htons(IP_MF);
722
723 /*
724 * Keep copying data until we run out.
725 */
726
727 while (left > 0) {
728 len = left;
729 /* IF: it doesn't fit, use 'mtu' - the data space left */
730 if (len > mtu)
731 len = mtu;
732 /* IF: we are not sending up to and including the packet end
733 then align the next start on an eight byte boundary */
734 if (len < left) {
735 len &= ~7;
736 }
737
738 /* Allocate buffer */
739 skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
740 if (!skb2) {
741 err = -ENOMEM;
742 goto fail;
743 }
744
745 /*
746 * Set up data on packet
747 */
748
749 ip_copy_metadata(skb2, skb);
750 skb_reserve(skb2, ll_rs);
751 skb_put(skb2, len + hlen);
752 skb_reset_network_header(skb2);
753 skb2->transport_header = skb2->network_header + hlen;
754
755 /*
756 * Charge the memory for the fragment to any owner
757 * it might possess
758 */
759
760 if (skb->sk)
761 skb_set_owner_w(skb2, skb->sk);
762
763 /*
764 * Copy the packet header into the new buffer.
765 */
766
767 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
768
769 /*
770 * Copy a block of the IP datagram.
771 */
772 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
773 BUG();
774 left -= len;
775
776 /*
777 * Fill in the new header fields.
778 */
779 iph = ip_hdr(skb2);
780 iph->frag_off = htons((offset >> 3));
781
782 if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
783 iph->frag_off |= htons(IP_DF);
784
785 /* ANK: dirty, but effective trick. Upgrade options only if
786 * the segment to be fragmented was THE FIRST (otherwise,
787 * options are already fixed) and make it ONCE
788 * on the initial skb, so that all the following fragments
789 * will inherit fixed options.
790 */
791 if (offset == 0)
792 ip_options_fragment(skb);
793
794 /*
795 * Added AC : If we are fragmenting a fragment that's not the
796 * last fragment then keep MF on each bit
797 */
798 if (left > 0 || not_last_frag)
799 iph->frag_off |= htons(IP_MF);
800 ptr += len;
801 offset += len;
802
803 /*
804 * Put this fragment into the sending queue.
805 */
806 iph->tot_len = htons(len + hlen);
807
808 ip_send_check(iph);
809
810 err = output(net, sk, skb2);
811 if (err)
812 goto fail;
813
814 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
815 }
816 consume_skb(skb);
817 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
818 return err;
819
820fail:
821 kfree_skb(skb);
822 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
823 return err;
824}
825EXPORT_SYMBOL(ip_do_fragment);
826
827int
828ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
829{
830 struct msghdr *msg = from;
831
832 if (skb->ip_summed == CHECKSUM_PARTIAL) {
833 if (!copy_from_iter_full(to, len, &msg->msg_iter))
834 return -EFAULT;
835 } else {
836 __wsum csum = 0;
837 if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
838 return -EFAULT;
839 skb->csum = csum_block_add(skb->csum, csum, odd);
840 }
841 return 0;
842}
843EXPORT_SYMBOL(ip_generic_getfrag);
844
845static inline __wsum
846csum_page(struct page *page, int offset, int copy)
847{
848 char *kaddr;
849 __wsum csum;
850 kaddr = kmap(page);
851 csum = csum_partial(kaddr + offset, copy, 0);
852 kunmap(page);
853 return csum;
854}
855
856static int __ip_append_data(struct sock *sk,
857 struct flowi4 *fl4,
858 struct sk_buff_head *queue,
859 struct inet_cork *cork,
860 struct page_frag *pfrag,
861 int getfrag(void *from, char *to, int offset,
862 int len, int odd, struct sk_buff *skb),
863 void *from, int length, int transhdrlen,
864 unsigned int flags)
865{
866 struct inet_sock *inet = inet_sk(sk);
867 struct sk_buff *skb;
868
869 struct ip_options *opt = cork->opt;
870 int hh_len;
871 int exthdrlen;
872 int mtu;
873 int copy;
874 int err;
875 int offset = 0;
876 unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
877 int csummode = CHECKSUM_NONE;
878 struct rtable *rt = (struct rtable *)cork->dst;
879 unsigned int wmem_alloc_delta = 0;
880 u32 tskey = 0;
881
882 skb = skb_peek_tail(queue);
883
884 exthdrlen = !skb ? rt->dst.header_len : 0;
885 mtu = cork->fragsize;
886 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
887 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
888 tskey = sk->sk_tskey++;
889
890 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
891
892 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
893 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
894 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
895
896 if (cork->length + length > maxnonfragsize - fragheaderlen) {
897 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
898 mtu - (opt ? opt->optlen : 0));
899 return -EMSGSIZE;
900 }
901
902 /*
903 * transhdrlen > 0 means that this is the first fragment and we wish
904 * it won't be fragmented in the future.
905 */
906 if (transhdrlen &&
907 length + fragheaderlen <= mtu &&
908 rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
909 !(flags & MSG_MORE) &&
910 !exthdrlen)
911 csummode = CHECKSUM_PARTIAL;
912
913 cork->length += length;
914
915 /* So, what's going on in the loop below?
916 *
917 * We use calculated fragment length to generate chained skb,
918 * each of segments is IP fragment ready for sending to network after
919 * adding appropriate IP header.
920 */
921
922 if (!skb)
923 goto alloc_new_skb;
924
925 while (length > 0) {
926 /* Check if the remaining data fits into current packet. */
927 copy = mtu - skb->len;
928 if (copy < length)
929 copy = maxfraglen - skb->len;
930 if (copy <= 0) {
931 char *data;
932 unsigned int datalen;
933 unsigned int fraglen;
934 unsigned int fraggap;
935 unsigned int alloclen;
936 struct sk_buff *skb_prev;
937alloc_new_skb:
938 skb_prev = skb;
939 if (skb_prev)
940 fraggap = skb_prev->len - maxfraglen;
941 else
942 fraggap = 0;
943
944 /*
945 * If remaining data exceeds the mtu,
946 * we know we need more fragment(s).
947 */
948 datalen = length + fraggap;
949 if (datalen > mtu - fragheaderlen)
950 datalen = maxfraglen - fragheaderlen;
951 fraglen = datalen + fragheaderlen;
952
953 if ((flags & MSG_MORE) &&
954 !(rt->dst.dev->features&NETIF_F_SG))
955 alloclen = mtu;
956 else
957 alloclen = fraglen;
958
959 alloclen += exthdrlen;
960
961 /* The last fragment gets additional space at tail.
962 * Note, with MSG_MORE we overallocate on fragments,
963 * because we have no idea what fragment will be
964 * the last.
965 */
966 if (datalen == length + fraggap)
967 alloclen += rt->dst.trailer_len;
968
969 if (transhdrlen) {
970 skb = sock_alloc_send_skb(sk,
971 alloclen + hh_len + 15,
972 (flags & MSG_DONTWAIT), &err);
973 } else {
974 skb = NULL;
975 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
976 2 * sk->sk_sndbuf)
977 skb = alloc_skb(alloclen + hh_len + 15,
978 sk->sk_allocation);
979 if (unlikely(!skb))
980 err = -ENOBUFS;
981 }
982 if (!skb)
983 goto error;
984
985 /*
986 * Fill in the control structures
987 */
988 skb->ip_summed = csummode;
989 skb->csum = 0;
990 skb_reserve(skb, hh_len);
991
992 /* only the initial fragment is time stamped */
993 skb_shinfo(skb)->tx_flags = cork->tx_flags;
994 cork->tx_flags = 0;
995 skb_shinfo(skb)->tskey = tskey;
996 tskey = 0;
997
998 /*
999 * Find where to start putting bytes.
1000 */
1001 data = skb_put(skb, fraglen + exthdrlen);
1002 skb_set_network_header(skb, exthdrlen);
1003 skb->transport_header = (skb->network_header +
1004 fragheaderlen);
1005 data += fragheaderlen + exthdrlen;
1006
1007 if (fraggap) {
1008 skb->csum = skb_copy_and_csum_bits(
1009 skb_prev, maxfraglen,
1010 data + transhdrlen, fraggap, 0);
1011 skb_prev->csum = csum_sub(skb_prev->csum,
1012 skb->csum);
1013 data += fraggap;
1014 pskb_trim_unique(skb_prev, maxfraglen);
1015 }
1016
1017 copy = datalen - transhdrlen - fraggap;
1018 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1019 err = -EFAULT;
1020 kfree_skb(skb);
1021 goto error;
1022 }
1023
1024 offset += copy;
1025 length -= datalen - fraggap;
1026 transhdrlen = 0;
1027 exthdrlen = 0;
1028 csummode = CHECKSUM_NONE;
1029
1030 if ((flags & MSG_CONFIRM) && !skb_prev)
1031 skb_set_dst_pending_confirm(skb, 1);
1032
1033 /*
1034 * Put the packet on the pending queue.
1035 */
1036 if (!skb->destructor) {
1037 skb->destructor = sock_wfree;
1038 skb->sk = sk;
1039 wmem_alloc_delta += skb->truesize;
1040 }
1041 __skb_queue_tail(queue, skb);
1042 continue;
1043 }
1044
1045 if (copy > length)
1046 copy = length;
1047
1048 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1049 skb_tailroom(skb) >= copy) {
1050 unsigned int off;
1051
1052 off = skb->len;
1053 if (getfrag(from, skb_put(skb, copy),
1054 offset, copy, off, skb) < 0) {
1055 __skb_trim(skb, off);
1056 err = -EFAULT;
1057 goto error;
1058 }
1059 } else {
1060 int i = skb_shinfo(skb)->nr_frags;
1061
1062 err = -ENOMEM;
1063 if (!sk_page_frag_refill(sk, pfrag))
1064 goto error;
1065
1066 if (!skb_can_coalesce(skb, i, pfrag->page,
1067 pfrag->offset)) {
1068 err = -EMSGSIZE;
1069 if (i == MAX_SKB_FRAGS)
1070 goto error;
1071
1072 __skb_fill_page_desc(skb, i, pfrag->page,
1073 pfrag->offset, 0);
1074 skb_shinfo(skb)->nr_frags = ++i;
1075 get_page(pfrag->page);
1076 }
1077 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1078 if (getfrag(from,
1079 page_address(pfrag->page) + pfrag->offset,
1080 offset, copy, skb->len, skb) < 0)
1081 goto error_efault;
1082
1083 pfrag->offset += copy;
1084 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1085 skb->len += copy;
1086 skb->data_len += copy;
1087 skb->truesize += copy;
1088 wmem_alloc_delta += copy;
1089 }
1090 offset += copy;
1091 length -= copy;
1092 }
1093
1094 if (wmem_alloc_delta)
1095 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1096 return 0;
1097
1098error_efault:
1099 err = -EFAULT;
1100error:
1101 cork->length -= length;
1102 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1103 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1104 return err;
1105}
1106
1107static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1108 struct ipcm_cookie *ipc, struct rtable **rtp)
1109{
1110 struct ip_options_rcu *opt;
1111 struct rtable *rt;
1112
1113 rt = *rtp;
1114 if (unlikely(!rt))
1115 return -EFAULT;
1116
1117 /*
1118 * setup for corking.
1119 */
1120 opt = ipc->opt;
1121 if (opt) {
1122 if (!cork->opt) {
1123 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1124 sk->sk_allocation);
1125 if (unlikely(!cork->opt))
1126 return -ENOBUFS;
1127 }
1128 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1129 cork->flags |= IPCORK_OPT;
1130 cork->addr = ipc->addr;
1131 }
1132
1133 /*
1134 * We steal reference to this route, caller should not release it
1135 */
1136 *rtp = NULL;
1137 cork->fragsize = ip_sk_use_pmtu(sk) ?
1138 dst_mtu(&rt->dst) : rt->dst.dev->mtu;
1139 cork->dst = &rt->dst;
1140 cork->length = 0;
1141 cork->ttl = ipc->ttl;
1142 cork->tos = ipc->tos;
1143 cork->priority = ipc->priority;
1144 cork->tx_flags = ipc->tx_flags;
1145
1146 return 0;
1147}
1148
1149/*
1150 * ip_append_data() and ip_append_page() can make one large IP datagram
1151 * from many pieces of data. Each pieces will be holded on the socket
1152 * until ip_push_pending_frames() is called. Each piece can be a page
1153 * or non-page data.
1154 *
1155 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1156 * this interface potentially.
1157 *
1158 * LATER: length must be adjusted by pad at tail, when it is required.
1159 */
1160int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1161 int getfrag(void *from, char *to, int offset, int len,
1162 int odd, struct sk_buff *skb),
1163 void *from, int length, int transhdrlen,
1164 struct ipcm_cookie *ipc, struct rtable **rtp,
1165 unsigned int flags)
1166{
1167 struct inet_sock *inet = inet_sk(sk);
1168 int err;
1169
1170 if (flags&MSG_PROBE)
1171 return 0;
1172
1173 if (skb_queue_empty(&sk->sk_write_queue)) {
1174 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1175 if (err)
1176 return err;
1177 } else {
1178 transhdrlen = 0;
1179 }
1180
1181 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1182 sk_page_frag(sk), getfrag,
1183 from, length, transhdrlen, flags);
1184}
1185
1186ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1187 int offset, size_t size, int flags)
1188{
1189 struct inet_sock *inet = inet_sk(sk);
1190 struct sk_buff *skb;
1191 struct rtable *rt;
1192 struct ip_options *opt = NULL;
1193 struct inet_cork *cork;
1194 int hh_len;
1195 int mtu;
1196 int len;
1197 int err;
1198 unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
1199
1200 if (inet->hdrincl)
1201 return -EPERM;
1202
1203 if (flags&MSG_PROBE)
1204 return 0;
1205
1206 if (skb_queue_empty(&sk->sk_write_queue))
1207 return -EINVAL;
1208
1209 cork = &inet->cork.base;
1210 rt = (struct rtable *)cork->dst;
1211 if (cork->flags & IPCORK_OPT)
1212 opt = cork->opt;
1213
1214 if (!(rt->dst.dev->features&NETIF_F_SG))
1215 return -EOPNOTSUPP;
1216
1217 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1218 mtu = cork->fragsize;
1219
1220 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1221 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1222 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
1223
1224 if (cork->length + size > maxnonfragsize - fragheaderlen) {
1225 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
1226 mtu - (opt ? opt->optlen : 0));
1227 return -EMSGSIZE;
1228 }
1229
1230 skb = skb_peek_tail(&sk->sk_write_queue);
1231 if (!skb)
1232 return -EINVAL;
1233
1234 cork->length += size;
1235
1236 while (size > 0) {
1237 /* Check if the remaining data fits into current packet. */
1238 len = mtu - skb->len;
1239 if (len < size)
1240 len = maxfraglen - skb->len;
1241
1242 if (len <= 0) {
1243 struct sk_buff *skb_prev;
1244 int alloclen;
1245
1246 skb_prev = skb;
1247 fraggap = skb_prev->len - maxfraglen;
1248
1249 alloclen = fragheaderlen + hh_len + fraggap + 15;
1250 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1251 if (unlikely(!skb)) {
1252 err = -ENOBUFS;
1253 goto error;
1254 }
1255
1256 /*
1257 * Fill in the control structures
1258 */
1259 skb->ip_summed = CHECKSUM_NONE;
1260 skb->csum = 0;
1261 skb_reserve(skb, hh_len);
1262
1263 /*
1264 * Find where to start putting bytes.
1265 */
1266 skb_put(skb, fragheaderlen + fraggap);
1267 skb_reset_network_header(skb);
1268 skb->transport_header = (skb->network_header +
1269 fragheaderlen);
1270 if (fraggap) {
1271 skb->csum = skb_copy_and_csum_bits(skb_prev,
1272 maxfraglen,
1273 skb_transport_header(skb),
1274 fraggap, 0);
1275 skb_prev->csum = csum_sub(skb_prev->csum,
1276 skb->csum);
1277 pskb_trim_unique(skb_prev, maxfraglen);
1278 }
1279
1280 /*
1281 * Put the packet on the pending queue.
1282 */
1283 __skb_queue_tail(&sk->sk_write_queue, skb);
1284 continue;
1285 }
1286
1287 if (len > size)
1288 len = size;
1289
1290 if (skb_append_pagefrags(skb, page, offset, len)) {
1291 err = -EMSGSIZE;
1292 goto error;
1293 }
1294
1295 if (skb->ip_summed == CHECKSUM_NONE) {
1296 __wsum csum;
1297 csum = csum_page(page, offset, len);
1298 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1299 }
1300
1301 skb->len += len;
1302 skb->data_len += len;
1303 skb->truesize += len;
1304 refcount_add(len, &sk->sk_wmem_alloc);
1305 offset += len;
1306 size -= len;
1307 }
1308 return 0;
1309
1310error:
1311 cork->length -= size;
1312 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1313 return err;
1314}
1315
1316static void ip_cork_release(struct inet_cork *cork)
1317{
1318 cork->flags &= ~IPCORK_OPT;
1319 kfree(cork->opt);
1320 cork->opt = NULL;
1321 dst_release(cork->dst);
1322 cork->dst = NULL;
1323}
1324
1325/*
1326 * Combined all pending IP fragments on the socket as one IP datagram
1327 * and push them out.
1328 */
1329struct sk_buff *__ip_make_skb(struct sock *sk,
1330 struct flowi4 *fl4,
1331 struct sk_buff_head *queue,
1332 struct inet_cork *cork)
1333{
1334 struct sk_buff *skb, *tmp_skb;
1335 struct sk_buff **tail_skb;
1336 struct inet_sock *inet = inet_sk(sk);
1337 struct net *net = sock_net(sk);
1338 struct ip_options *opt = NULL;
1339 struct rtable *rt = (struct rtable *)cork->dst;
1340 struct iphdr *iph;
1341 __be16 df = 0;
1342 __u8 ttl;
1343
1344 skb = __skb_dequeue(queue);
1345 if (!skb)
1346 goto out;
1347 tail_skb = &(skb_shinfo(skb)->frag_list);
1348
1349 /* move skb->data to ip header from ext header */
1350 if (skb->data < skb_network_header(skb))
1351 __skb_pull(skb, skb_network_offset(skb));
1352 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1353 __skb_pull(tmp_skb, skb_network_header_len(skb));
1354 *tail_skb = tmp_skb;
1355 tail_skb = &(tmp_skb->next);
1356 skb->len += tmp_skb->len;
1357 skb->data_len += tmp_skb->len;
1358 skb->truesize += tmp_skb->truesize;
1359 tmp_skb->destructor = NULL;
1360 tmp_skb->sk = NULL;
1361 }
1362
1363 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1364 * to fragment the frame generated here. No matter, what transforms
1365 * how transforms change size of the packet, it will come out.
1366 */
1367 skb->ignore_df = ip_sk_ignore_df(sk);
1368
1369 /* DF bit is set when we want to see DF on outgoing frames.
1370 * If ignore_df is set too, we still allow to fragment this frame
1371 * locally. */
1372 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1373 inet->pmtudisc == IP_PMTUDISC_PROBE ||
1374 (skb->len <= dst_mtu(&rt->dst) &&
1375 ip_dont_fragment(sk, &rt->dst)))
1376 df = htons(IP_DF);
1377
1378 if (cork->flags & IPCORK_OPT)
1379 opt = cork->opt;
1380
1381 if (cork->ttl != 0)
1382 ttl = cork->ttl;
1383 else if (rt->rt_type == RTN_MULTICAST)
1384 ttl = inet->mc_ttl;
1385 else
1386 ttl = ip_select_ttl(inet, &rt->dst);
1387
1388 iph = ip_hdr(skb);
1389 iph->version = 4;
1390 iph->ihl = 5;
1391 iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
1392 iph->frag_off = df;
1393 iph->ttl = ttl;
1394 iph->protocol = sk->sk_protocol;
1395 ip_copy_addrs(iph, fl4);
1396 ip_select_ident(net, skb, sk);
1397
1398 if (opt) {
1399 iph->ihl += opt->optlen>>2;
1400 ip_options_build(skb, opt, cork->addr, rt, 0);
1401 }
1402
1403 skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
1404 skb->mark = sk->sk_mark;
1405 /*
1406 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1407 * on dst refcount
1408 */
1409 cork->dst = NULL;
1410 skb_dst_set(skb, &rt->dst);
1411
1412 if (iph->protocol == IPPROTO_ICMP)
1413 icmp_out_count(net, ((struct icmphdr *)
1414 skb_transport_header(skb))->type);
1415
1416 ip_cork_release(cork);
1417out:
1418 return skb;
1419}
1420
1421int ip_send_skb(struct net *net, struct sk_buff *skb)
1422{
1423 int err;
1424
1425 err = ip_local_out(net, skb->sk, skb);
1426 if (err) {
1427 if (err > 0)
1428 err = net_xmit_errno(err);
1429 if (err)
1430 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1431 }
1432
1433 return err;
1434}
1435
/*
 * Assemble the datagram corked on @sk and send it.
 *
 * Returns 0 when nothing was pending, otherwise the result of
 * ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets the whole, not yet fragmented skb. */
	return skb ? ip_send_skb(sock_net(sk), skb) : 0;
}
1447
1448/*
1449 * Throw away all pending data on the socket.
1450 */
1451static void __ip_flush_pending_frames(struct sock *sk,
1452 struct sk_buff_head *queue,
1453 struct inet_cork *cork)
1454{
1455 struct sk_buff *skb;
1456
1457 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1458 kfree_skb(skb);
1459
1460 ip_cork_release(cork);
1461}
1462
1463void ip_flush_pending_frames(struct sock *sk)
1464{
1465 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1466}
1467
1468struct sk_buff *ip_make_skb(struct sock *sk,
1469 struct flowi4 *fl4,
1470 int getfrag(void *from, char *to, int offset,
1471 int len, int odd, struct sk_buff *skb),
1472 void *from, int length, int transhdrlen,
1473 struct ipcm_cookie *ipc, struct rtable **rtp,
1474 unsigned int flags)
1475{
1476 struct inet_cork cork;
1477 struct sk_buff_head queue;
1478 int err;
1479
1480 if (flags & MSG_PROBE)
1481 return NULL;
1482
1483 __skb_queue_head_init(&queue);
1484
1485 cork.flags = 0;
1486 cork.addr = 0;
1487 cork.opt = NULL;
1488 err = ip_setup_cork(sk, &cork, ipc, rtp);
1489 if (err)
1490 return ERR_PTR(err);
1491
1492 err = __ip_append_data(sk, fl4, &queue, &cork,
1493 ¤t->task_frag, getfrag,
1494 from, length, transhdrlen, flags);
1495 if (err) {
1496 __ip_flush_pending_frames(sk, &queue, &cork);
1497 return ERR_PTR(err);
1498 }
1499
1500 return __ip_make_skb(sk, fl4, &queue, &cork);
1501}
1502
1503/*
1504 * Fetch data from kernel space and fill in checksum if needed.
1505 */
1506static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1507 int len, int odd, struct sk_buff *skb)
1508{
1509 __wsum csum;
1510
1511 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1512 skb->csum = csum_block_add(skb->csum, csum, odd);
1513 return 0;
1514}
1515
1516/*
1517 * Generic function to send a packet as reply to another packet.
1518 * Used to send some TCP resets/acks so far.
1519 */
1520void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1521 const struct ip_options *sopt,
1522 __be32 daddr, __be32 saddr,
1523 const struct ip_reply_arg *arg,
1524 unsigned int len)
1525{
1526 struct ip_options_data replyopts;
1527 struct ipcm_cookie ipc;
1528 struct flowi4 fl4;
1529 struct rtable *rt = skb_rtable(skb);
1530 struct net *net = sock_net(sk);
1531 struct sk_buff *nskb;
1532 int err;
1533 int oif;
1534
1535 if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
1536 return;
1537
1538 ipc.addr = daddr;
1539 ipc.opt = NULL;
1540 ipc.tx_flags = 0;
1541 ipc.ttl = 0;
1542 ipc.tos = -1;
1543
1544 if (replyopts.opt.opt.optlen) {
1545 ipc.opt = &replyopts.opt;
1546
1547 if (replyopts.opt.opt.srr)
1548 daddr = replyopts.opt.opt.faddr;
1549 }
1550
1551 oif = arg->bound_dev_if;
1552 if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
1553 oif = skb->skb_iif;
1554
1555 flowi4_init_output(&fl4, oif,
1556 IP4_REPLY_MARK(net, skb->mark),
1557 RT_TOS(arg->tos),
1558 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1559 ip_reply_arg_flowi_flags(arg),
1560 daddr, saddr,
1561 tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
1562 arg->uid);
1563 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1564 rt = ip_route_output_key(net, &fl4);
1565 if (IS_ERR(rt))
1566 return;
1567
1568 inet_sk(sk)->tos = arg->tos;
1569
1570 sk->sk_priority = skb->priority;
1571 sk->sk_protocol = ip_hdr(skb)->protocol;
1572 sk->sk_bound_dev_if = arg->bound_dev_if;
1573 sk->sk_sndbuf = sysctl_wmem_default;
1574 sk->sk_mark = fl4.flowi4_mark;
1575 err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
1576 len, 0, &ipc, &rt, MSG_DONTWAIT);
1577 if (unlikely(err)) {
1578 ip_flush_pending_frames(sk);
1579 goto out;
1580 }
1581
1582 nskb = skb_peek(&sk->sk_write_queue);
1583 if (nskb) {
1584 if (arg->csumoffset >= 0)
1585 *((__sum16 *)skb_transport_header(nskb) +
1586 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1587 arg->csum));
1588 nskb->ip_summed = CHECKSUM_NONE;
1589 ip_push_pending_frames(sk, &fl4);
1590 }
1591out:
1592 ip_rt_put(rt);
1593}
1594
1595void __init ip_init(void)
1596{
1597 ip_rt_init();
1598 inet_initpeers();
1599
1600#if defined(CONFIG_IP_MULTICAST)
1601 igmp_mc_init();
1602#endif
1603}
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
 *					and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <linux/module.h>
47#include <linux/types.h>
48#include <linux/kernel.h>
49#include <linux/mm.h>
50#include <linux/string.h>
51#include <linux/errno.h>
52#include <linux/highmem.h>
53#include <linux/slab.h>
54
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
69#include <net/xfrm.h>
70#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
74#include <net/checksum.h>
75#include <net/inetpeer.h>
76#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
81#include <linux/tcp.h>
82
83int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84EXPORT_SYMBOL(sysctl_ip_default_ttl);
85
86/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph)
88{
89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91}
92EXPORT_SYMBOL(ip_send_check);
93
94int __ip_local_out(struct sk_buff *skb)
95{
96 struct iphdr *iph = ip_hdr(skb);
97
98 iph->tot_len = htons(skb->len);
99 ip_send_check(iph);
100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
101 skb_dst(skb)->dev, dst_output);
102}
103
/*
 * Send a locally generated packet: run __ip_local_out() and, when it
 * returns 1, pass the packet on to dst_output().
 */
int ip_local_out(struct sk_buff *skb)
{
	int err = __ip_local_out(skb);

	if (likely(err == 1))
		return dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
115
116/* dev_loopback_xmit for use with netfilter. */
117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118{
119 skb_reset_mac_header(newskb);
120 __skb_pull(newskb, skb_network_offset(newskb));
121 newskb->pkt_type = PACKET_LOOPBACK;
122 newskb->ip_summed = CHECKSUM_UNNECESSARY;
123 WARN_ON(!skb_dst(newskb));
124 skb_dst_force(newskb);
125 netif_rx_ni(newskb);
126 return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131 int ttl = inet->uc_ttl;
132
133 if (ttl < 0)
134 ttl = ip4_dst_hoplimit(dst);
135 return ttl;
136}
137
138/*
139 * Add an ip header to a skbuff and send it out.
140 *
141 */
142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144{
145 struct inet_sock *inet = inet_sk(sk);
146 struct rtable *rt = skb_rtable(skb);
147 struct iphdr *iph;
148
149 /* Build the IP header. */
150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151 skb_reset_network_header(skb);
152 iph = ip_hdr(skb);
153 iph->version = 4;
154 iph->ihl = 5;
155 iph->tos = inet->tos;
156 if (ip_dont_fragment(sk, &rt->dst))
157 iph->frag_off = htons(IP_DF);
158 else
159 iph->frag_off = 0;
160 iph->ttl = ip_select_ttl(inet, &rt->dst);
161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 iph->saddr = saddr;
163 iph->protocol = sk->sk_protocol;
164 ip_select_ident(iph, &rt->dst, sk);
165
166 if (opt && opt->opt.optlen) {
167 iph->ihl += opt->opt.optlen>>2;
168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
169 }
170
171 skb->priority = sk->sk_priority;
172 skb->mark = sk->sk_mark;
173
174 /* Send it out. */
175 return ip_local_out(skb);
176}
177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
179static inline int ip_finish_output2(struct sk_buff *skb)
180{
181 struct dst_entry *dst = skb_dst(skb);
182 struct rtable *rt = (struct rtable *)dst;
183 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh;
186
187 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 } else if (rt->rt_type == RTN_BROADCAST)
190 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
191
192 /* Be paranoid, rather than too clever. */
193 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194 struct sk_buff *skb2;
195
196 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 if (skb2 == NULL) {
198 kfree_skb(skb);
199 return -ENOMEM;
200 }
201 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk);
203 kfree_skb(skb);
204 skb = skb2;
205 }
206
207 rcu_read_lock();
208 neigh = dst_get_neighbour_noref(dst);
209 if (neigh) {
210 int res = neigh_output(neigh, skb);
211
212 rcu_read_unlock();
213 return res;
214 }
215 rcu_read_unlock();
216
217 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
218 __func__);
219 kfree_skb(skb);
220 return -EINVAL;
221}
222
223static inline int ip_skb_dst_mtu(struct sk_buff *skb)
224{
225 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
226
227 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
228 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
229}
230
231static int ip_finish_output(struct sk_buff *skb)
232{
233#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
234 /* Policy lookup after SNAT yielded a new policy */
235 if (skb_dst(skb)->xfrm != NULL) {
236 IPCB(skb)->flags |= IPSKB_REROUTED;
237 return dst_output(skb);
238 }
239#endif
240 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
241 return ip_fragment(skb, ip_finish_output2);
242 else
243 return ip_finish_output2(skb);
244}
245
246int ip_mc_output(struct sk_buff *skb)
247{
248 struct sock *sk = skb->sk;
249 struct rtable *rt = skb_rtable(skb);
250 struct net_device *dev = rt->dst.dev;
251
252 /*
253 * If the indicated interface is up and running, send the packet.
254 */
255 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
256
257 skb->dev = dev;
258 skb->protocol = htons(ETH_P_IP);
259
260 /*
261 * Multicasts are looped back for other local users
262 */
263
264 if (rt->rt_flags&RTCF_MULTICAST) {
265 if (sk_mc_loop(sk)
266#ifdef CONFIG_IP_MROUTE
267 /* Small optimization: do not loopback not local frames,
268 which returned after forwarding; they will be dropped
269 by ip_mr_input in any case.
270 Note, that local frames are looped back to be delivered
271 to local recipients.
272
273 This check is duplicated in ip_mr_input at the moment.
274 */
275 &&
276 ((rt->rt_flags & RTCF_LOCAL) ||
277 !(IPCB(skb)->flags & IPSKB_FORWARDED))
278#endif
279 ) {
280 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
281 if (newskb)
282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 newskb, NULL, newskb->dev,
284 ip_dev_loopback_xmit);
285 }
286
287 /* Multicasts with ttl 0 must not go beyond the host */
288
289 if (ip_hdr(skb)->ttl == 0) {
290 kfree_skb(skb);
291 return 0;
292 }
293 }
294
295 if (rt->rt_flags&RTCF_BROADCAST) {
296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 if (newskb)
298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 NULL, newskb->dev, ip_dev_loopback_xmit);
300 }
301
302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
303 skb->dev, ip_finish_output,
304 !(IPCB(skb)->flags & IPSKB_REROUTED));
305}
306
307int ip_output(struct sk_buff *skb)
308{
309 struct net_device *dev = skb_dst(skb)->dev;
310
311 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
312
313 skb->dev = dev;
314 skb->protocol = htons(ETH_P_IP);
315
316 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
317 ip_finish_output,
318 !(IPCB(skb)->flags & IPSKB_REROUTED));
319}
320
321/*
322 * copy saddr and daddr, possibly using 64bit load/stores
323 * Equivalent to :
324 * iph->saddr = fl4->saddr;
325 * iph->daddr = fl4->daddr;
326 */
327static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
328{
329 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
330 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
331 memcpy(&iph->saddr, &fl4->saddr,
332 sizeof(fl4->saddr) + sizeof(fl4->daddr));
333}
334
335int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
336{
337 struct sock *sk = skb->sk;
338 struct inet_sock *inet = inet_sk(sk);
339 struct ip_options_rcu *inet_opt;
340 struct flowi4 *fl4;
341 struct rtable *rt;
342 struct iphdr *iph;
343 int res;
344
345 /* Skip all of this if the packet is already routed,
346 * f.e. by something like SCTP.
347 */
348 rcu_read_lock();
349 inet_opt = rcu_dereference(inet->inet_opt);
350 fl4 = &fl->u.ip4;
351 rt = skb_rtable(skb);
352 if (rt != NULL)
353 goto packet_routed;
354
355 /* Make sure we can route this packet. */
356 rt = (struct rtable *)__sk_dst_check(sk, 0);
357 if (rt == NULL) {
358 __be32 daddr;
359
360 /* Use correct destination address if we have options. */
361 daddr = inet->inet_daddr;
362 if (inet_opt && inet_opt->opt.srr)
363 daddr = inet_opt->opt.faddr;
364
365 /* If this fails, retransmit mechanism of transport layer will
366 * keep trying until route appears or the connection times
367 * itself out.
368 */
369 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
370 daddr, inet->inet_saddr,
371 inet->inet_dport,
372 inet->inet_sport,
373 sk->sk_protocol,
374 RT_CONN_FLAGS(sk),
375 sk->sk_bound_dev_if);
376 if (IS_ERR(rt))
377 goto no_route;
378 sk_setup_caps(sk, &rt->dst);
379 }
380 skb_dst_set_noref(skb, &rt->dst);
381
382packet_routed:
383 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
384 goto no_route;
385
386 /* OK, we know where to send it, allocate and build IP header. */
387 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
388 skb_reset_network_header(skb);
389 iph = ip_hdr(skb);
390 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
391 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
392 iph->frag_off = htons(IP_DF);
393 else
394 iph->frag_off = 0;
395 iph->ttl = ip_select_ttl(inet, &rt->dst);
396 iph->protocol = sk->sk_protocol;
397 ip_copy_addrs(iph, fl4);
398
399 /* Transport layer set skb->h.foo itself. */
400
401 if (inet_opt && inet_opt->opt.optlen) {
402 iph->ihl += inet_opt->opt.optlen >> 2;
403 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
404 }
405
406 ip_select_ident_more(iph, &rt->dst, sk,
407 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
408
409 skb->priority = sk->sk_priority;
410 skb->mark = sk->sk_mark;
411
412 res = ip_local_out(skb);
413 rcu_read_unlock();
414 return res;
415
416no_route:
417 rcu_read_unlock();
418 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
419 kfree_skb(skb);
420 return -EHOSTUNREACH;
421}
422EXPORT_SYMBOL(ip_queue_xmit);
423
424
425static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
426{
427 to->pkt_type = from->pkt_type;
428 to->priority = from->priority;
429 to->protocol = from->protocol;
430 skb_dst_drop(to);
431 skb_dst_copy(to, from);
432 to->dev = from->dev;
433 to->mark = from->mark;
434
435 /* Copy the flags to each fragment. */
436 IPCB(to)->flags = IPCB(from)->flags;
437
438#ifdef CONFIG_NET_SCHED
439 to->tc_index = from->tc_index;
440#endif
441 nf_copy(to, from);
442#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
443 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
444 to->nf_trace = from->nf_trace;
445#endif
446#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
447 to->ipvs_property = from->ipvs_property;
448#endif
449 skb_copy_secmark(to, from);
450}
451
452/*
453 * This IP datagram is too large to be sent in one piece. Break it up into
454 * smaller pieces (each of size equal to IP header plus
455 * a block of the data of the original IP data part) that will yet fit in a
456 * single device frame, and queue such a frame for sending.
457 */
458
459int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
460{
461 struct iphdr *iph;
462 int ptr;
463 struct net_device *dev;
464 struct sk_buff *skb2;
465 unsigned int mtu, hlen, left, len, ll_rs;
466 int offset;
467 __be16 not_last_frag;
468 struct rtable *rt = skb_rtable(skb);
469 int err = 0;
470
471 dev = rt->dst.dev;
472
473 /*
474 * Point into the IP datagram header.
475 */
476
477 iph = ip_hdr(skb);
478
479 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
480 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
481 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
482 htonl(ip_skb_dst_mtu(skb)));
483 kfree_skb(skb);
484 return -EMSGSIZE;
485 }
486
487 /*
488 * Setup starting values.
489 */
490
491 hlen = iph->ihl * 4;
492 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
493#ifdef CONFIG_BRIDGE_NETFILTER
494 if (skb->nf_bridge)
495 mtu -= nf_bridge_mtu_reduction(skb);
496#endif
497 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
498
499 /* When frag_list is given, use it. First, check its validity:
500 * some transformers could create wrong frag_list or break existing
501 * one, it is not prohibited. In this case fall back to copying.
502 *
503 * LATER: this step can be merged to real generation of fragments,
504 * we can switch to copy when see the first bad fragment.
505 */
506 if (skb_has_frag_list(skb)) {
507 struct sk_buff *frag, *frag2;
508 int first_len = skb_pagelen(skb);
509
510 if (first_len - hlen > mtu ||
511 ((first_len - hlen) & 7) ||
512 ip_is_fragment(iph) ||
513 skb_cloned(skb))
514 goto slow_path;
515
516 skb_walk_frags(skb, frag) {
517 /* Correct geometry. */
518 if (frag->len > mtu ||
519 ((frag->len & 7) && frag->next) ||
520 skb_headroom(frag) < hlen)
521 goto slow_path_clean;
522
523 /* Partially cloned skb? */
524 if (skb_shared(frag))
525 goto slow_path_clean;
526
527 BUG_ON(frag->sk);
528 if (skb->sk) {
529 frag->sk = skb->sk;
530 frag->destructor = sock_wfree;
531 }
532 skb->truesize -= frag->truesize;
533 }
534
535 /* Everything is OK. Generate! */
536
537 err = 0;
538 offset = 0;
539 frag = skb_shinfo(skb)->frag_list;
540 skb_frag_list_init(skb);
541 skb->data_len = first_len - skb_headlen(skb);
542 skb->len = first_len;
543 iph->tot_len = htons(first_len);
544 iph->frag_off = htons(IP_MF);
545 ip_send_check(iph);
546
547 for (;;) {
548 /* Prepare header of the next frame,
549 * before previous one went down. */
550 if (frag) {
551 frag->ip_summed = CHECKSUM_NONE;
552 skb_reset_transport_header(frag);
553 __skb_push(frag, hlen);
554 skb_reset_network_header(frag);
555 memcpy(skb_network_header(frag), iph, hlen);
556 iph = ip_hdr(frag);
557 iph->tot_len = htons(frag->len);
558 ip_copy_metadata(frag, skb);
559 if (offset == 0)
560 ip_options_fragment(frag);
561 offset += skb->len - hlen;
562 iph->frag_off = htons(offset>>3);
563 if (frag->next != NULL)
564 iph->frag_off |= htons(IP_MF);
565 /* Ready, complete checksum */
566 ip_send_check(iph);
567 }
568
569 err = output(skb);
570
571 if (!err)
572 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
573 if (err || !frag)
574 break;
575
576 skb = frag;
577 frag = skb->next;
578 skb->next = NULL;
579 }
580
581 if (err == 0) {
582 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
583 return 0;
584 }
585
586 while (frag) {
587 skb = frag->next;
588 kfree_skb(frag);
589 frag = skb;
590 }
591 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
592 return err;
593
594slow_path_clean:
595 skb_walk_frags(skb, frag2) {
596 if (frag2 == frag)
597 break;
598 frag2->sk = NULL;
599 frag2->destructor = NULL;
600 skb->truesize += frag2->truesize;
601 }
602 }
603
604slow_path:
605 left = skb->len - hlen; /* Space per frame */
606 ptr = hlen; /* Where to start from */
607
608 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
609 * we need to make room for the encapsulating header
610 */
611 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
612
613 /*
614 * Fragment the datagram.
615 */
616
617 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
618 not_last_frag = iph->frag_off & htons(IP_MF);
619
620 /*
621 * Keep copying data until we run out.
622 */
623
624 while (left > 0) {
625 len = left;
626 /* IF: it doesn't fit, use 'mtu' - the data space left */
627 if (len > mtu)
628 len = mtu;
629 /* IF: we are not sending up to and including the packet end
630 then align the next start on an eight byte boundary */
631 if (len < left) {
632 len &= ~7;
633 }
634 /*
635 * Allocate buffer.
636 */
637
638 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
639 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
640 err = -ENOMEM;
641 goto fail;
642 }
643
644 /*
645 * Set up data on packet
646 */
647
648 ip_copy_metadata(skb2, skb);
649 skb_reserve(skb2, ll_rs);
650 skb_put(skb2, len + hlen);
651 skb_reset_network_header(skb2);
652 skb2->transport_header = skb2->network_header + hlen;
653
654 /*
655 * Charge the memory for the fragment to any owner
656 * it might possess
657 */
658
659 if (skb->sk)
660 skb_set_owner_w(skb2, skb->sk);
661
662 /*
663 * Copy the packet header into the new buffer.
664 */
665
666 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
667
668 /*
669 * Copy a block of the IP datagram.
670 */
671 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
672 BUG();
673 left -= len;
674
675 /*
676 * Fill in the new header fields.
677 */
678 iph = ip_hdr(skb2);
679 iph->frag_off = htons((offset >> 3));
680
681 /* ANK: dirty, but effective trick. Upgrade options only if
682 * the segment to be fragmented was THE FIRST (otherwise,
683 * options are already fixed) and make it ONCE
684 * on the initial skb, so that all the following fragments
685 * will inherit fixed options.
686 */
687 if (offset == 0)
688 ip_options_fragment(skb);
689
690 /*
691 * Added AC : If we are fragmenting a fragment that's not the
692 * last fragment then keep MF on each bit
693 */
694 if (left > 0 || not_last_frag)
695 iph->frag_off |= htons(IP_MF);
696 ptr += len;
697 offset += len;
698
699 /*
700 * Put this fragment into the sending queue.
701 */
702 iph->tot_len = htons(len + hlen);
703
704 ip_send_check(iph);
705
706 err = output(skb2);
707 if (err)
708 goto fail;
709
710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
711 }
712 kfree_skb(skb);
713 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
714 return err;
715
716fail:
717 kfree_skb(skb);
718 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
719 return err;
720}
721EXPORT_SYMBOL(ip_fragment);
722
723int
724ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
725{
726 struct iovec *iov = from;
727
728 if (skb->ip_summed == CHECKSUM_PARTIAL) {
729 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
730 return -EFAULT;
731 } else {
732 __wsum csum = 0;
733 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
734 return -EFAULT;
735 skb->csum = csum_block_add(skb->csum, csum, odd);
736 }
737 return 0;
738}
739EXPORT_SYMBOL(ip_generic_getfrag);
740
741static inline __wsum
742csum_page(struct page *page, int offset, int copy)
743{
744 char *kaddr;
745 __wsum csum;
746 kaddr = kmap(page);
747 csum = csum_partial(kaddr + offset, copy, 0);
748 kunmap(page);
749 return csum;
750}
751
752static inline int ip_ufo_append_data(struct sock *sk,
753 struct sk_buff_head *queue,
754 int getfrag(void *from, char *to, int offset, int len,
755 int odd, struct sk_buff *skb),
756 void *from, int length, int hh_len, int fragheaderlen,
757 int transhdrlen, int maxfraglen, unsigned int flags)
758{
759 struct sk_buff *skb;
760 int err;
761
762 /* There is support for UDP fragmentation offload by network
763 * device, so create one single skb packet containing complete
764 * udp datagram
765 */
766 if ((skb = skb_peek_tail(queue)) == NULL) {
767 skb = sock_alloc_send_skb(sk,
768 hh_len + fragheaderlen + transhdrlen + 20,
769 (flags & MSG_DONTWAIT), &err);
770
771 if (skb == NULL)
772 return err;
773
774 /* reserve space for Hardware header */
775 skb_reserve(skb, hh_len);
776
777 /* create space for UDP/IP header */
778 skb_put(skb, fragheaderlen + transhdrlen);
779
780 /* initialize network header pointer */
781 skb_reset_network_header(skb);
782
783 /* initialize protocol header pointer */
784 skb->transport_header = skb->network_header + fragheaderlen;
785
786 skb->ip_summed = CHECKSUM_PARTIAL;
787 skb->csum = 0;
788
789 /* specify the length of each IP datagram fragment */
790 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
791 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
792 __skb_queue_tail(queue, skb);
793 }
794
795 return skb_append_datato_frags(sk, skb, getfrag, from,
796 (length - transhdrlen));
797}
798
/*
 * __ip_append_data - append @length bytes, fetched through @getfrag from
 * @from, to the chain of pending skbs on @queue, using the state in @cork.
 *
 * Per-fragment limits are derived from cork->fragsize (mtu) and from the IP
 * header length (sizeof(struct iphdr) plus the length of cork->opt, if any).
 * A request that would push cork->length + length past
 * 0xFFFF - fragheaderlen is reported via ip_local_error() and rejected with
 * -EMSGSIZE.  When the UFO conditions tested below hold, the data is handed
 * to ip_ufo_append_data(); otherwise the loop fills the tail skb and
 * allocates further skbs as needed.
 *
 * Returns 0 on success or a negative errno.  On the "error" path
 * cork->length is reduced by the bytes still left in @length and
 * IPSTATS_MIB_OUTDISCARDS is incremented.
 *
 * NOTE(review): the decimal number at the start of each line below is
 * residue from a line-numbered listing, not C source - confirm and strip
 * before building.
 */
799static int __ip_append_data(struct sock *sk,
800 struct flowi4 *fl4,
801 struct sk_buff_head *queue,
802 struct inet_cork *cork,
803 int getfrag(void *from, char *to, int offset,
804 int len, int odd, struct sk_buff *skb),
805 void *from, int length, int transhdrlen,
806 unsigned int flags)
807{
808 struct inet_sock *inet = inet_sk(sk);
809 struct sk_buff *skb;
810
811 struct ip_options *opt = cork->opt;
812 int hh_len;
813 int exthdrlen;
814 int mtu;
815 int copy;
816 int err;
817 int offset = 0;
818 unsigned int maxfraglen, fragheaderlen;
819 int csummode = CHECKSUM_NONE;
820 struct rtable *rt = (struct rtable *)cork->dst;
821
822 skb = skb_peek_tail(queue);
823
/* The extension header length is only accounted for when the queue is empty. */
824 exthdrlen = !skb ? rt->dst.header_len : 0;
825 mtu = cork->fragsize;
826
827 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
828
/* maxfraglen: IP header plus the payload room rounded down to a multiple of 8. */
829 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
830 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
831
832 if (cork->length + length > 0xFFFF - fragheaderlen) {
833 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
834 mtu-exthdrlen);
835 return -EMSGSIZE;
836 }
837
838 /*
839 * transhdrlen > 0 means that this is the first fragment and we wish
840 * it won't be fragmented in the future.
841 */
842 if (transhdrlen &&
843 length + fragheaderlen <= mtu &&
844 rt->dst.dev->features & NETIF_F_V4_CSUM &&
845 !exthdrlen)
846 csummode = CHECKSUM_PARTIAL;
847
/* Account for the whole request now; the error path subtracts what was not consumed. */
848 cork->length += length;
849 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
850 (sk->sk_protocol == IPPROTO_UDP) &&
851 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
852 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
853 hh_len, fragheaderlen, transhdrlen,
854 maxfraglen, flags);
855 if (err)
856 goto error;
857 return 0;
858 }
859
860 /* So, what's going on in the loop below?
861 *
862 * We use calculated fragment length to generate chained skb,
863 * each of segments is IP fragment ready for sending to network after
864 * adding appropriate IP header.
865 */
866
/* Empty queue: enter the loop body directly at the allocation branch. */
867 if (!skb)
868 goto alloc_new_skb;
869
870 while (length > 0) {
871 /* Check if the remaining data fits into current packet. */
872 copy = mtu - skb->len;
873 if (copy < length)
874 copy = maxfraglen - skb->len;
875 if (copy <= 0) {
876 char *data;
877 unsigned int datalen;
878 unsigned int fraglen;
879 unsigned int fraggap;
880 unsigned int alloclen;
881 struct sk_buff *skb_prev;
882alloc_new_skb:
/* fraggap: bytes of the previous skb that lie beyond maxfraglen. */
883 skb_prev = skb;
884 if (skb_prev)
885 fraggap = skb_prev->len - maxfraglen;
886 else
887 fraggap = 0;
888
889 /*
890 * If remaining data exceeds the mtu,
891 * we know we need more fragment(s).
892 */
893 datalen = length + fraggap;
894 if (datalen > mtu - fragheaderlen)
895 datalen = maxfraglen - fragheaderlen;
896 fraglen = datalen + fragheaderlen;
897
898 if ((flags & MSG_MORE) &&
899 !(rt->dst.dev->features&NETIF_F_SG))
900 alloclen = mtu;
901 else
902 alloclen = fraglen;
903
904 alloclen += exthdrlen;
905
906 /* The last fragment gets additional space at tail.
907 * Note, with MSG_MORE we overallocate on fragments,
908 * because we have no idea what fragment will be
909 * the last.
910 */
911 if (datalen == length + fraggap)
912 alloclen += rt->dst.trailer_len;
913
/*
 * With a transport header pending, allocate through
 * sock_alloc_send_skb(); otherwise use sock_wmalloc(), and only while
 * sk_wmem_alloc does not exceed twice sk_sndbuf.
 */
914 if (transhdrlen) {
915 skb = sock_alloc_send_skb(sk,
916 alloclen + hh_len + 15,
917 (flags & MSG_DONTWAIT), &err);
918 } else {
919 skb = NULL;
920 if (atomic_read(&sk->sk_wmem_alloc) <=
921 2 * sk->sk_sndbuf)
922 skb = sock_wmalloc(sk,
923 alloclen + hh_len + 15, 1,
924 sk->sk_allocation);
925 if (unlikely(skb == NULL))
926 err = -ENOBUFS;
927 else
928 /* only the initial fragment is
929 time stamped */
930 cork->tx_flags = 0;
931 }
932 if (skb == NULL)
933 goto error;
934
935 /*
936 * Fill in the control structures
937 */
938 skb->ip_summed = csummode;
939 skb->csum = 0;
940 skb_reserve(skb, hh_len);
941 skb_shinfo(skb)->tx_flags = cork->tx_flags;
942
943 /*
944 * Find where to start putting bytes.
945 */
946 data = skb_put(skb, fraglen + exthdrlen);
947 skb_set_network_header(skb, exthdrlen);
948 skb->transport_header = (skb->network_header +
949 fragheaderlen);
950 data += fragheaderlen + exthdrlen;
951
/*
 * Move the overhanging fraggap bytes from the previous skb into the new
 * one, transfer their checksum contribution, and trim the previous skb
 * back to maxfraglen.
 */
952 if (fraggap) {
953 skb->csum = skb_copy_and_csum_bits(
954 skb_prev, maxfraglen,
955 data + transhdrlen, fraggap, 0);
956 skb_prev->csum = csum_sub(skb_prev->csum,
957 skb->csum);
958 data += fraggap;
959 pskb_trim_unique(skb_prev, maxfraglen);
960 }
961
962 copy = datalen - transhdrlen - fraggap;
963 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
964 err = -EFAULT;
965 kfree_skb(skb);
966 goto error;
967 }
968
/* Header lengths and checksum mode apply to the first allocated skb only. */
969 offset += copy;
970 length -= datalen - fraggap;
971 transhdrlen = 0;
972 exthdrlen = 0;
973 csummode = CHECKSUM_NONE;
974
975 /*
976 * Put the packet on the pending queue.
977 */
978 __skb_queue_tail(queue, skb);
979 continue;
980 }
981
982 if (copy > length)
983 copy = length;
984
/* Without scatter-gather support the data is copied into the linear area. */
985 if (!(rt->dst.dev->features&NETIF_F_SG)) {
986 unsigned int off;
987
988 off = skb->len;
989 if (getfrag(from, skb_put(skb, copy),
990 offset, copy, off, skb) < 0) {
991 __skb_trim(skb, off);
992 err = -EFAULT;
993 goto error;
994 }
995 } else {
/*
 * Scatter-gather: continue filling the page remembered in cork->page
 * at cork->off, or allocate a fresh page when none is available or it
 * is full.
 */
996 int i = skb_shinfo(skb)->nr_frags;
997 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
998 struct page *page = cork->page;
999 int off = cork->off;
1000 unsigned int left;
1001
1002 if (page && (left = PAGE_SIZE - off) > 0) {
1003 if (copy >= left)
1004 copy = left;
1005 if (page != skb_frag_page(frag)) {
1006 if (i == MAX_SKB_FRAGS) {
1007 err = -EMSGSIZE;
1008 goto error;
1009 }
1010 skb_fill_page_desc(skb, i, page, off, 0);
1011 skb_frag_ref(skb, i);
1012 frag = &skb_shinfo(skb)->frags[i];
1013 }
1014 } else if (i < MAX_SKB_FRAGS) {
1015 if (copy > PAGE_SIZE)
1016 copy = PAGE_SIZE;
1017 page = alloc_pages(sk->sk_allocation, 0);
1018 if (page == NULL) {
1019 err = -ENOMEM;
1020 goto error;
1021 }
1022 cork->page = page;
1023 cork->off = 0;
1024
1025 skb_fill_page_desc(skb, i, page, 0, 0);
1026 frag = &skb_shinfo(skb)->frags[i];
1027 } else {
1028 err = -EMSGSIZE;
1029 goto error;
1030 }
1031 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1032 offset, copy, skb->len, skb) < 0) {
1033 err = -EFAULT;
1034 goto error;
1035 }
/* Account for the copied bytes in the cork, the fragment, the skb and the socket. */
1036 cork->off += copy;
1037 skb_frag_size_add(frag, copy);
1038 skb->len += copy;
1039 skb->data_len += copy;
1040 skb->truesize += copy;
1041 atomic_add(copy, &sk->sk_wmem_alloc);
1042 }
1043 offset += copy;
1044 length -= copy;
1045 }
1046
1047 return 0;
1048
1049error:
/* Undo the accounting for the bytes that were never queued. */
1050 cork->length -= length;
1051 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1052 return err;
1053}
1054
1055static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1056 struct ipcm_cookie *ipc, struct rtable **rtp)
1057{
1058 struct inet_sock *inet = inet_sk(sk);
1059 struct ip_options_rcu *opt;
1060 struct rtable *rt;
1061
1062 /*
1063 * setup for corking.
1064 */
1065 opt = ipc->opt;
1066 if (opt) {
1067 if (cork->opt == NULL) {
1068 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1069 sk->sk_allocation);
1070 if (unlikely(cork->opt == NULL))
1071 return -ENOBUFS;
1072 }
1073 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1074 cork->flags |= IPCORK_OPT;
1075 cork->addr = ipc->addr;
1076 }
1077 rt = *rtp;
1078 if (unlikely(!rt))
1079 return -EFAULT;
1080 /*
1081 * We steal reference to this route, caller should not release it
1082 */
1083 *rtp = NULL;
1084 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1085 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1086 cork->dst = &rt->dst;
1087 cork->length = 0;
1088 cork->tx_flags = ipc->tx_flags;
1089 cork->page = NULL;
1090 cork->off = 0;
1091
1092 return 0;
1093}
1094
1095/*
1096 * ip_append_data() and ip_append_page() can make one large IP datagram
1097 * from many pieces of data. Each pieces will be holded on the socket
1098 * until ip_push_pending_frames() is called. Each piece can be a page
1099 * or non-page data.
1100 *
1101 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1102 * this interface potentially.
1103 *
1104 * LATER: length must be adjusted by pad at tail, when it is required.
1105 */
1106int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1107 int getfrag(void *from, char *to, int offset, int len,
1108 int odd, struct sk_buff *skb),
1109 void *from, int length, int transhdrlen,
1110 struct ipcm_cookie *ipc, struct rtable **rtp,
1111 unsigned int flags)
1112{
1113 struct inet_sock *inet = inet_sk(sk);
1114 int err;
1115
1116 if (flags&MSG_PROBE)
1117 return 0;
1118
1119 if (skb_queue_empty(&sk->sk_write_queue)) {
1120 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1121 if (err)
1122 return err;
1123 } else {
1124 transhdrlen = 0;
1125 }
1126
1127 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1128 from, length, transhdrlen, flags);
1129}
1130
/*
 * ip_append_page - append @size bytes of @page, starting at @offset, to the
 * datagram pending on the socket write queue.
 *
 * The page is attached as a page fragment of the tail skb; new skbs are
 * allocated when the current one has no room left.
 *
 * Returns 0 on success (and immediately when MSG_PROBE is set), or:
 *   -EPERM      if inet->hdrincl is set,
 *   -EINVAL     if the write queue is empty,
 *   -EOPNOTSUPP if the output device lacks NETIF_F_SG,
 *   -EMSGSIZE   if cork->length + size exceeds 0xFFFF - fragheaderlen, or
 *               no fragment slot is left in the skb,
 *   -ENOBUFS    if a new skb cannot be allocated.
 * On the "error" path cork->length is reduced by the bytes still left in
 * @size and IPSTATS_MIB_OUTDISCARDS is incremented.
 *
 * NOTE(review): the decimal number at the start of each line below is
 * residue from a line-numbered listing, not C source - confirm and strip
 * before building.
 */
1131ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1132 int offset, size_t size, int flags)
1133{
1134 struct inet_sock *inet = inet_sk(sk);
1135 struct sk_buff *skb;
1136 struct rtable *rt;
1137 struct ip_options *opt = NULL;
1138 struct inet_cork *cork;
1139 int hh_len;
1140 int mtu;
1141 int len;
1142 int err;
1143 unsigned int maxfraglen, fragheaderlen, fraggap;
1144
1145 if (inet->hdrincl)
1146 return -EPERM;
1147
1148 if (flags&MSG_PROBE)
1149 return 0;
1150
/* Pages can only extend a datagram that is already pending on the write queue. */
1151 if (skb_queue_empty(&sk->sk_write_queue))
1152 return -EINVAL;
1153
1154 cork = &inet->cork.base;
1155 rt = (struct rtable *)cork->dst;
1156 if (cork->flags & IPCORK_OPT)
1157 opt = cork->opt;
1158
1159 if (!(rt->dst.dev->features&NETIF_F_SG))
1160 return -EOPNOTSUPP;
1161
1162 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1163 mtu = cork->fragsize;
1164
/* maxfraglen: IP header plus the payload room rounded down to a multiple of 8. */
1165 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1166 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1167
1168 if (cork->length + size > 0xFFFF - fragheaderlen) {
1169 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1170 return -EMSGSIZE;
1171 }
1172
1173 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1174 return -EINVAL;
1175
/* Account for the whole request now; the error path subtracts what was not consumed. */
1176 cork->length += size;
/* Mark the tail skb as UDP GSO when the data will not fit in one mtu and the device has NETIF_F_UFO. */
1177 if ((size + skb->len > mtu) &&
1178 (sk->sk_protocol == IPPROTO_UDP) &&
1179 (rt->dst.dev->features & NETIF_F_UFO)) {
1180 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1181 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1182 }
1183
1184
1185 while (size > 0) {
1186 int i;
1187
/* A GSO skb takes all remaining data; otherwise compute the room left in this skb. */
1188 if (skb_is_gso(skb))
1189 len = size;
1190 else {
1191
1192 /* Check if the remaining data fits into current packet. */
1193 len = mtu - skb->len;
1194 if (len < size)
1195 len = maxfraglen - skb->len;
1196 }
1197 if (len <= 0) {
1198 struct sk_buff *skb_prev;
1199 int alloclen;
1200
/* fraggap: bytes of the previous skb that lie beyond maxfraglen. */
1201 skb_prev = skb;
1202 fraggap = skb_prev->len - maxfraglen;
1203
1204 alloclen = fragheaderlen + hh_len + fraggap + 15;
1205 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1206 if (unlikely(!skb)) {
1207 err = -ENOBUFS;
1208 goto error;
1209 }
1210
1211 /*
1212 * Fill in the control structures
1213 */
1214 skb->ip_summed = CHECKSUM_NONE;
1215 skb->csum = 0;
1216 skb_reserve(skb, hh_len);
1217
1218 /*
1219 * Find where to start putting bytes.
1220 */
1221 skb_put(skb, fragheaderlen + fraggap);
1222 skb_reset_network_header(skb);
1223 skb->transport_header = (skb->network_header +
1224 fragheaderlen);
/*
 * Move the overhanging fraggap bytes from the previous skb into the new
 * one, transfer their checksum contribution, and trim the previous skb
 * back to maxfraglen.
 */
1225 if (fraggap) {
1226 skb->csum = skb_copy_and_csum_bits(skb_prev,
1227 maxfraglen,
1228 skb_transport_header(skb),
1229 fraggap, 0);
1230 skb_prev->csum = csum_sub(skb_prev->csum,
1231 skb->csum);
1232 pskb_trim_unique(skb_prev, maxfraglen);
1233 }
1234
1235 /*
1236 * Put the packet on the pending queue.
1237 */
1238 __skb_queue_tail(&sk->sk_write_queue, skb);
1239 continue;
1240 }
1241
1242 i = skb_shinfo(skb)->nr_frags;
1243 if (len > size)
1244 len = size;
/* Extend the last fragment when it can be coalesced, else take a page reference and use a new slot. */
1245 if (skb_can_coalesce(skb, i, page, offset)) {
1246 skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1247 } else if (i < MAX_SKB_FRAGS) {
1248 get_page(page);
1249 skb_fill_page_desc(skb, i, page, offset, len);
1250 } else {
1251 err = -EMSGSIZE;
1252 goto error;
1253 }
1254
/* A software-checksummed skb accumulates the checksum of the newly attached bytes. */
1255 if (skb->ip_summed == CHECKSUM_NONE) {
1256 __wsum csum;
1257 csum = csum_page(page, offset, len);
1258 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1259 }
1260
1261 skb->len += len;
1262 skb->data_len += len;
1263 skb->truesize += len;
1264 atomic_add(len, &sk->sk_wmem_alloc);
1265 offset += len;
1266 size -= len;
1267 }
1268 return 0;
1269
1270error:
/* Undo the accounting for the bytes that were never queued. */
1271 cork->length -= size;
1272 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1273 return err;
1274}
1275
1276static void ip_cork_release(struct inet_cork *cork)
1277{
1278 cork->flags &= ~IPCORK_OPT;
1279 kfree(cork->opt);
1280 cork->opt = NULL;
1281 dst_release(cork->dst);
1282 cork->dst = NULL;
1283}
1284
1285/*
1286 * Combined all pending IP fragments on the socket as one IP datagram
1287 * and push them out.
1288 */
1289struct sk_buff *__ip_make_skb(struct sock *sk,
1290 struct flowi4 *fl4,
1291 struct sk_buff_head *queue,
1292 struct inet_cork *cork)
1293{
1294 struct sk_buff *skb, *tmp_skb;
1295 struct sk_buff **tail_skb;
1296 struct inet_sock *inet = inet_sk(sk);
1297 struct net *net = sock_net(sk);
1298 struct ip_options *opt = NULL;
1299 struct rtable *rt = (struct rtable *)cork->dst;
1300 struct iphdr *iph;
1301 __be16 df = 0;
1302 __u8 ttl;
1303
1304 if ((skb = __skb_dequeue(queue)) == NULL)
1305 goto out;
1306 tail_skb = &(skb_shinfo(skb)->frag_list);
1307
1308 /* move skb->data to ip header from ext header */
1309 if (skb->data < skb_network_header(skb))
1310 __skb_pull(skb, skb_network_offset(skb));
1311 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1312 __skb_pull(tmp_skb, skb_network_header_len(skb));
1313 *tail_skb = tmp_skb;
1314 tail_skb = &(tmp_skb->next);
1315 skb->len += tmp_skb->len;
1316 skb->data_len += tmp_skb->len;
1317 skb->truesize += tmp_skb->truesize;
1318 tmp_skb->destructor = NULL;
1319 tmp_skb->sk = NULL;
1320 }
1321
1322 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1323 * to fragment the frame generated here. No matter, what transforms
1324 * how transforms change size of the packet, it will come out.
1325 */
1326 if (inet->pmtudisc < IP_PMTUDISC_DO)
1327 skb->local_df = 1;
1328
1329 /* DF bit is set when we want to see DF on outgoing frames.
1330 * If local_df is set too, we still allow to fragment this frame
1331 * locally. */
1332 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1333 (skb->len <= dst_mtu(&rt->dst) &&
1334 ip_dont_fragment(sk, &rt->dst)))
1335 df = htons(IP_DF);
1336
1337 if (cork->flags & IPCORK_OPT)
1338 opt = cork->opt;
1339
1340 if (rt->rt_type == RTN_MULTICAST)
1341 ttl = inet->mc_ttl;
1342 else
1343 ttl = ip_select_ttl(inet, &rt->dst);
1344
1345 iph = (struct iphdr *)skb->data;
1346 iph->version = 4;
1347 iph->ihl = 5;
1348 iph->tos = inet->tos;
1349 iph->frag_off = df;
1350 ip_select_ident(iph, &rt->dst, sk);
1351 iph->ttl = ttl;
1352 iph->protocol = sk->sk_protocol;
1353 ip_copy_addrs(iph, fl4);
1354
1355 if (opt) {
1356 iph->ihl += opt->optlen>>2;
1357 ip_options_build(skb, opt, cork->addr, rt, 0);
1358 }
1359
1360 skb->priority = sk->sk_priority;
1361 skb->mark = sk->sk_mark;
1362 /*
1363 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1364 * on dst refcount
1365 */
1366 cork->dst = NULL;
1367 skb_dst_set(skb, &rt->dst);
1368
1369 if (iph->protocol == IPPROTO_ICMP)
1370 icmp_out_count(net, ((struct icmphdr *)
1371 skb_transport_header(skb))->type);
1372
1373 ip_cork_release(cork);
1374out:
1375 return skb;
1376}
1377
1378int ip_send_skb(struct sk_buff *skb)
1379{
1380 struct net *net = sock_net(skb->sk);
1381 int err;
1382
1383 err = ip_local_out(skb);
1384 if (err) {
1385 if (err > 0)
1386 err = net_xmit_errno(err);
1387 if (err)
1388 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1389 }
1390
1391 return err;
1392}
1393
/*
 * ip_push_pending_frames - build the pending datagram of @sk with
 * ip_finish_skb() and send it.
 *
 * Returns 0 when nothing is pending, otherwise the result of
 * ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *pending = ip_finish_skb(sk, fl4);

	/* Netfilter gets the whole, not yet fragmented skb. */
	return pending ? ip_send_skb(pending) : 0;
}
1405
1406/*
1407 * Throw away all pending data on the socket.
1408 */
1409static void __ip_flush_pending_frames(struct sock *sk,
1410 struct sk_buff_head *queue,
1411 struct inet_cork *cork)
1412{
1413 struct sk_buff *skb;
1414
1415 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1416 kfree_skb(skb);
1417
1418 ip_cork_release(cork);
1419}
1420
1421void ip_flush_pending_frames(struct sock *sk)
1422{
1423 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1424}
1425
1426struct sk_buff *ip_make_skb(struct sock *sk,
1427 struct flowi4 *fl4,
1428 int getfrag(void *from, char *to, int offset,
1429 int len, int odd, struct sk_buff *skb),
1430 void *from, int length, int transhdrlen,
1431 struct ipcm_cookie *ipc, struct rtable **rtp,
1432 unsigned int flags)
1433{
1434 struct inet_cork cork;
1435 struct sk_buff_head queue;
1436 int err;
1437
1438 if (flags & MSG_PROBE)
1439 return NULL;
1440
1441 __skb_queue_head_init(&queue);
1442
1443 cork.flags = 0;
1444 cork.addr = 0;
1445 cork.opt = NULL;
1446 err = ip_setup_cork(sk, &cork, ipc, rtp);
1447 if (err)
1448 return ERR_PTR(err);
1449
1450 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1451 from, length, transhdrlen, flags);
1452 if (err) {
1453 __ip_flush_pending_frames(sk, &queue, &cork);
1454 return ERR_PTR(err);
1455 }
1456
1457 return __ip_make_skb(sk, fl4, &queue, &cork);
1458}
1459
1460/*
1461 * Fetch data from kernel space and fill in checksum if needed.
1462 */
1463static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1464 int len, int odd, struct sk_buff *skb)
1465{
1466 __wsum csum;
1467
1468 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1469 skb->csum = csum_block_add(skb->csum, csum, odd);
1470 return 0;
1471}
1472
1473/*
1474 * Generic function to send a packet as reply to another packet.
1475 * Used to send TCP resets so far. ICMP should use this function too.
1476 *
1477 * Should run single threaded per socket because it uses the sock
1478 * structure to pass arguments.
1479 */
1480void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1481 const struct ip_reply_arg *arg, unsigned int len)
1482{
1483 struct inet_sock *inet = inet_sk(sk);
1484 struct ip_options_data replyopts;
1485 struct ipcm_cookie ipc;
1486 struct flowi4 fl4;
1487 struct rtable *rt = skb_rtable(skb);
1488
1489 if (ip_options_echo(&replyopts.opt.opt, skb))
1490 return;
1491
1492 ipc.addr = daddr;
1493 ipc.opt = NULL;
1494 ipc.tx_flags = 0;
1495
1496 if (replyopts.opt.opt.optlen) {
1497 ipc.opt = &replyopts.opt;
1498
1499 if (replyopts.opt.opt.srr)
1500 daddr = replyopts.opt.opt.faddr;
1501 }
1502
1503 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1504 RT_TOS(arg->tos),
1505 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1506 ip_reply_arg_flowi_flags(arg),
1507 daddr, rt->rt_spec_dst,
1508 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510 rt = ip_route_output_key(sock_net(sk), &fl4);
1511 if (IS_ERR(rt))
1512 return;
1513
1514 /* And let IP do all the hard work.
1515
1516 This chunk is not reenterable, hence spinlock.
1517 Note that it uses the fact, that this function is called
1518 with locally disabled BH and that sk cannot be already spinlocked.
1519 */
1520 bh_lock_sock(sk);
1521 inet->tos = arg->tos;
1522 sk->sk_priority = skb->priority;
1523 sk->sk_protocol = ip_hdr(skb)->protocol;
1524 sk->sk_bound_dev_if = arg->bound_dev_if;
1525 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1526 &ipc, &rt, MSG_DONTWAIT);
1527 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1528 if (arg->csumoffset >= 0)
1529 *((__sum16 *)skb_transport_header(skb) +
1530 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1531 arg->csum));
1532 skb->ip_summed = CHECKSUM_NONE;
1533 ip_push_pending_frames(sk, &fl4);
1534 }
1535
1536 bh_unlock_sock(sk);
1537
1538 ip_rt_put(rt);
1539}
1540
1541void __init ip_init(void)
1542{
1543 ip_rt_init();
1544 inet_initpeers();
1545
1546#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1547 igmp_mc_proc_init();
1548#endif
1549}