Loading...
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * The Internet Protocol (IP) output module.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Donald Becker, <becker@super.org>
12 * Alan Cox, <Alan.Cox@linux.org>
13 * Richard Underwood
14 * Stefan Becker, <stefanb@yello.ping.de>
15 * Jorge Cwik, <jorge@laser.satlink.net>
16 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
17 * Hirokazu Takahashi, <taka@valinux.co.jp>
18 *
19 * See ip_input.c for original log
20 *
21 * Fixes:
22 * Alan Cox : Missing nonblock feature in ip_build_xmit.
23 * Mike Kilburn : htons() missing in ip_build_xmit.
24 * Bradford Johnson: Fix faulty handling of some frames when
25 * no route is found.
26 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
27 * (in case if packet not accepted by
28 * output firewall rules)
29 * Mike McLagan : Routing by source
30 * Alexey Kuznetsov: use new route cache
31 * Andi Kleen: Fix broken PMTU recovery and remove
32 * some redundant tests.
33 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
34 * Andi Kleen : Replace ip_reply with ip_send_reply.
35 * Andi Kleen : Split fast and slow ip_build_xmit path
36 * for decreased register pressure on x86
37 * and more readability.
38 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
39 * silently drop skb instead of failing with -EPERM.
40 * Detlev Wengorz : Copy protocol for fragments.
41 * Hirokazu Takahashi: HW checksumming for outgoing UDP
42 * datagrams.
43 * Hirokazu Takahashi: sendfile() on UDP works now.
44 */
45
46#include <linux/uaccess.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54#include <linux/slab.h>
55
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
70#include <net/xfrm.h>
71#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
75#include <net/checksum.h>
76#include <net/inetpeer.h>
77#include <net/inet_ecn.h>
78#include <net/lwtunnel.h>
79#include <linux/bpf-cgroup.h>
80#include <linux/igmp.h>
81#include <linux/netfilter_ipv4.h>
82#include <linux/netfilter_bridge.h>
83#include <linux/netlink.h>
84#include <linux/tcp.h>
85
/*
 * Forward declaration: ip_fragment() is defined later in this file but is
 * already referenced by the finish-output helpers that precede it.
 */
static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		       unsigned int mtu,
		       int (*output)(struct net *, struct sock *,
				     struct sk_buff *));
90
91/* Generate a checksum for an outgoing IP datagram. */
92void ip_send_check(struct iphdr *iph)
93{
94 iph->check = 0;
95 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
96}
97EXPORT_SYMBOL(ip_send_check);
98
99int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
100{
101 struct iphdr *iph = ip_hdr(skb);
102
103 iph->tot_len = htons(skb->len);
104 ip_send_check(iph);
105
106 /* if egress device is enslaved to an L3 master device pass the
107 * skb to its handler for processing
108 */
109 skb = l3mdev_ip_out(sk, skb);
110 if (unlikely(!skb))
111 return 0;
112
113 skb->protocol = htons(ETH_P_IP);
114
115 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
116 net, sk, skb, NULL, skb_dst(skb)->dev,
117 dst_output);
118}
119
/*
 * ip_local_out - send a locally generated packet
 *
 * Runs __ip_local_out() and, only when that returns 1, continues with
 * dst_output(). Any other value from __ip_local_out() is returned unchanged.
 */
int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int err = __ip_local_out(net, sk, skb);

	if (unlikely(err != 1))
		return err;

	return dst_output(net, sk, skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
131
132static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
133{
134 int ttl = inet->uc_ttl;
135
136 if (ttl < 0)
137 ttl = ip4_dst_hoplimit(dst);
138 return ttl;
139}
140
141/*
142 * Add an ip header to a skbuff and send it out.
143 *
144 */
145int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
146 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
147{
148 struct inet_sock *inet = inet_sk(sk);
149 struct rtable *rt = skb_rtable(skb);
150 struct net *net = sock_net(sk);
151 struct iphdr *iph;
152
153 /* Build the IP header. */
154 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
155 skb_reset_network_header(skb);
156 iph = ip_hdr(skb);
157 iph->version = 4;
158 iph->ihl = 5;
159 iph->tos = inet->tos;
160 iph->ttl = ip_select_ttl(inet, &rt->dst);
161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 iph->saddr = saddr;
163 iph->protocol = sk->sk_protocol;
164 if (ip_dont_fragment(sk, &rt->dst)) {
165 iph->frag_off = htons(IP_DF);
166 iph->id = 0;
167 } else {
168 iph->frag_off = 0;
169 __ip_select_ident(net, iph, 1);
170 }
171
172 if (opt && opt->opt.optlen) {
173 iph->ihl += opt->opt.optlen>>2;
174 ip_options_build(skb, &opt->opt, daddr, rt, 0);
175 }
176
177 skb->priority = sk->sk_priority;
178 if (!skb->mark)
179 skb->mark = sk->sk_mark;
180
181 /* Send it out. */
182 return ip_local_out(net, skb->sk, skb);
183}
184EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
185
186static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
187{
188 struct dst_entry *dst = skb_dst(skb);
189 struct rtable *rt = (struct rtable *)dst;
190 struct net_device *dev = dst->dev;
191 unsigned int hh_len = LL_RESERVED_SPACE(dev);
192 struct neighbour *neigh;
193 bool is_v6gw = false;
194
195 if (rt->rt_type == RTN_MULTICAST) {
196 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
197 } else if (rt->rt_type == RTN_BROADCAST)
198 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
199
200 /* Be paranoid, rather than too clever. */
201 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
202 struct sk_buff *skb2;
203
204 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
205 if (!skb2) {
206 kfree_skb(skb);
207 return -ENOMEM;
208 }
209 if (skb->sk)
210 skb_set_owner_w(skb2, skb->sk);
211 consume_skb(skb);
212 skb = skb2;
213 }
214
215 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
216 int res = lwtunnel_xmit(skb);
217
218 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
219 return res;
220 }
221
222 rcu_read_lock_bh();
223 neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
224 if (!IS_ERR(neigh)) {
225 int res;
226
227 sock_confirm_neigh(skb, neigh);
228 /* if crossing protocols, can not use the cached header */
229 res = neigh_output(neigh, skb, is_v6gw);
230 rcu_read_unlock_bh();
231 return res;
232 }
233 rcu_read_unlock_bh();
234
235 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
236 __func__);
237 kfree_skb(skb);
238 return -EINVAL;
239}
240
241static int ip_finish_output_gso(struct net *net, struct sock *sk,
242 struct sk_buff *skb, unsigned int mtu)
243{
244 struct sk_buff *segs, *nskb;
245 netdev_features_t features;
246 int ret = 0;
247
248 /* common case: seglen is <= mtu
249 */
250 if (skb_gso_validate_network_len(skb, mtu))
251 return ip_finish_output2(net, sk, skb);
252
253 /* Slowpath - GSO segment length exceeds the egress MTU.
254 *
255 * This can happen in several cases:
256 * - Forwarding of a TCP GRO skb, when DF flag is not set.
257 * - Forwarding of an skb that arrived on a virtualization interface
258 * (virtio-net/vhost/tap) with TSO/GSO size set by other network
259 * stack.
260 * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an
261 * interface with a smaller MTU.
262 * - Arriving GRO skb (or GSO skb in a virtualized environment) that is
263 * bridged to a NETIF_F_TSO tunnel stacked over an interface with an
264 * insufficent MTU.
265 */
266 features = netif_skb_features(skb);
267 BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
268 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
269 if (IS_ERR_OR_NULL(segs)) {
270 kfree_skb(skb);
271 return -ENOMEM;
272 }
273
274 consume_skb(skb);
275
276 skb_list_walk_safe(segs, segs, nskb) {
277 int err;
278
279 skb_mark_not_on_list(segs);
280 err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
281
282 if (err && ret == 0)
283 ret = err;
284 }
285
286 return ret;
287}
288
289static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
290{
291 unsigned int mtu;
292
293#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
294 /* Policy lookup after SNAT yielded a new policy */
295 if (skb_dst(skb)->xfrm) {
296 IPCB(skb)->flags |= IPSKB_REROUTED;
297 return dst_output(net, sk, skb);
298 }
299#endif
300 mtu = ip_skb_dst_mtu(sk, skb);
301 if (skb_is_gso(skb))
302 return ip_finish_output_gso(net, sk, skb, mtu);
303
304 if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
305 return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
306
307 return ip_finish_output2(net, sk, skb);
308}
309
310static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
311{
312 int ret;
313
314 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
315 switch (ret) {
316 case NET_XMIT_SUCCESS:
317 return __ip_finish_output(net, sk, skb);
318 case NET_XMIT_CN:
319 return __ip_finish_output(net, sk, skb) ? : ret;
320 default:
321 kfree_skb(skb);
322 return ret;
323 }
324}
325
326static int ip_mc_finish_output(struct net *net, struct sock *sk,
327 struct sk_buff *skb)
328{
329 struct rtable *new_rt;
330 bool do_cn = false;
331 int ret, err;
332
333 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
334 switch (ret) {
335 case NET_XMIT_CN:
336 do_cn = true;
337 fallthrough;
338 case NET_XMIT_SUCCESS:
339 break;
340 default:
341 kfree_skb(skb);
342 return ret;
343 }
344
345 /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
346 * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
347 * see ipv4_pktinfo_prepare().
348 */
349 new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
350 if (new_rt) {
351 new_rt->rt_iif = 0;
352 skb_dst_drop(skb);
353 skb_dst_set(skb, &new_rt->dst);
354 }
355
356 err = dev_loopback_xmit(net, sk, skb);
357 return (do_cn && err) ? ret : err;
358}
359
360int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
361{
362 struct rtable *rt = skb_rtable(skb);
363 struct net_device *dev = rt->dst.dev;
364
365 /*
366 * If the indicated interface is up and running, send the packet.
367 */
368 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
369
370 skb->dev = dev;
371 skb->protocol = htons(ETH_P_IP);
372
373 /*
374 * Multicasts are looped back for other local users
375 */
376
377 if (rt->rt_flags&RTCF_MULTICAST) {
378 if (sk_mc_loop(sk)
379#ifdef CONFIG_IP_MROUTE
380 /* Small optimization: do not loopback not local frames,
381 which returned after forwarding; they will be dropped
382 by ip_mr_input in any case.
383 Note, that local frames are looped back to be delivered
384 to local recipients.
385
386 This check is duplicated in ip_mr_input at the moment.
387 */
388 &&
389 ((rt->rt_flags & RTCF_LOCAL) ||
390 !(IPCB(skb)->flags & IPSKB_FORWARDED))
391#endif
392 ) {
393 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
394 if (newskb)
395 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
396 net, sk, newskb, NULL, newskb->dev,
397 ip_mc_finish_output);
398 }
399
400 /* Multicasts with ttl 0 must not go beyond the host */
401
402 if (ip_hdr(skb)->ttl == 0) {
403 kfree_skb(skb);
404 return 0;
405 }
406 }
407
408 if (rt->rt_flags&RTCF_BROADCAST) {
409 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
410 if (newskb)
411 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
412 net, sk, newskb, NULL, newskb->dev,
413 ip_mc_finish_output);
414 }
415
416 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
417 net, sk, skb, NULL, skb->dev,
418 ip_finish_output,
419 !(IPCB(skb)->flags & IPSKB_REROUTED));
420}
421
422int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
423{
424 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
425
426 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
427
428 skb->dev = dev;
429 skb->protocol = htons(ETH_P_IP);
430
431 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
432 net, sk, skb, indev, dev,
433 ip_finish_output,
434 !(IPCB(skb)->flags & IPSKB_REROUTED));
435}
436
437/*
438 * copy saddr and daddr, possibly using 64bit load/stores
439 * Equivalent to :
440 * iph->saddr = fl4->saddr;
441 * iph->daddr = fl4->daddr;
442 */
443static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
444{
445 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
446 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
447 memcpy(&iph->saddr, &fl4->saddr,
448 sizeof(fl4->saddr) + sizeof(fl4->daddr));
449}
450
451/* Note: skb->sk can be different from sk, in case of tunnels */
452int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
453 __u8 tos)
454{
455 struct inet_sock *inet = inet_sk(sk);
456 struct net *net = sock_net(sk);
457 struct ip_options_rcu *inet_opt;
458 struct flowi4 *fl4;
459 struct rtable *rt;
460 struct iphdr *iph;
461 int res;
462
463 /* Skip all of this if the packet is already routed,
464 * f.e. by something like SCTP.
465 */
466 rcu_read_lock();
467 inet_opt = rcu_dereference(inet->inet_opt);
468 fl4 = &fl->u.ip4;
469 rt = skb_rtable(skb);
470 if (rt)
471 goto packet_routed;
472
473 /* Make sure we can route this packet. */
474 rt = (struct rtable *)__sk_dst_check(sk, 0);
475 if (!rt) {
476 __be32 daddr;
477
478 /* Use correct destination address if we have options. */
479 daddr = inet->inet_daddr;
480 if (inet_opt && inet_opt->opt.srr)
481 daddr = inet_opt->opt.faddr;
482
483 /* If this fails, retransmit mechanism of transport layer will
484 * keep trying until route appears or the connection times
485 * itself out.
486 */
487 rt = ip_route_output_ports(net, fl4, sk,
488 daddr, inet->inet_saddr,
489 inet->inet_dport,
490 inet->inet_sport,
491 sk->sk_protocol,
492 RT_CONN_FLAGS_TOS(sk, tos),
493 sk->sk_bound_dev_if);
494 if (IS_ERR(rt))
495 goto no_route;
496 sk_setup_caps(sk, &rt->dst);
497 }
498 skb_dst_set_noref(skb, &rt->dst);
499
500packet_routed:
501 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
502 goto no_route;
503
504 /* OK, we know where to send it, allocate and build IP header. */
505 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
506 skb_reset_network_header(skb);
507 iph = ip_hdr(skb);
508 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
509 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
510 iph->frag_off = htons(IP_DF);
511 else
512 iph->frag_off = 0;
513 iph->ttl = ip_select_ttl(inet, &rt->dst);
514 iph->protocol = sk->sk_protocol;
515 ip_copy_addrs(iph, fl4);
516
517 /* Transport layer set skb->h.foo itself. */
518
519 if (inet_opt && inet_opt->opt.optlen) {
520 iph->ihl += inet_opt->opt.optlen >> 2;
521 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
522 }
523
524 ip_select_ident_segs(net, skb, sk,
525 skb_shinfo(skb)->gso_segs ?: 1);
526
527 /* TODO : should we use skb->sk here instead of sk ? */
528 skb->priority = sk->sk_priority;
529 skb->mark = sk->sk_mark;
530
531 res = ip_local_out(net, sk, skb);
532 rcu_read_unlock();
533 return res;
534
535no_route:
536 rcu_read_unlock();
537 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
538 kfree_skb(skb);
539 return -EHOSTUNREACH;
540}
541EXPORT_SYMBOL(__ip_queue_xmit);
542
543int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
544{
545 return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
546}
547EXPORT_SYMBOL(ip_queue_xmit);
548
549static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
550{
551 to->pkt_type = from->pkt_type;
552 to->priority = from->priority;
553 to->protocol = from->protocol;
554 to->skb_iif = from->skb_iif;
555 skb_dst_drop(to);
556 skb_dst_copy(to, from);
557 to->dev = from->dev;
558 to->mark = from->mark;
559
560 skb_copy_hash(to, from);
561
562#ifdef CONFIG_NET_SCHED
563 to->tc_index = from->tc_index;
564#endif
565 nf_copy(to, from);
566 skb_ext_copy(to, from);
567#if IS_ENABLED(CONFIG_IP_VS)
568 to->ipvs_property = from->ipvs_property;
569#endif
570 skb_copy_secmark(to, from);
571}
572
573static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
574 unsigned int mtu,
575 int (*output)(struct net *, struct sock *, struct sk_buff *))
576{
577 struct iphdr *iph = ip_hdr(skb);
578
579 if ((iph->frag_off & htons(IP_DF)) == 0)
580 return ip_do_fragment(net, sk, skb, output);
581
582 if (unlikely(!skb->ignore_df ||
583 (IPCB(skb)->frag_max_size &&
584 IPCB(skb)->frag_max_size > mtu))) {
585 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
586 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
587 htonl(mtu));
588 kfree_skb(skb);
589 return -EMSGSIZE;
590 }
591
592 return ip_do_fragment(net, sk, skb, output);
593}
594
595void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
596 unsigned int hlen, struct ip_fraglist_iter *iter)
597{
598 unsigned int first_len = skb_pagelen(skb);
599
600 iter->frag = skb_shinfo(skb)->frag_list;
601 skb_frag_list_init(skb);
602
603 iter->offset = 0;
604 iter->iph = iph;
605 iter->hlen = hlen;
606
607 skb->data_len = first_len - skb_headlen(skb);
608 skb->len = first_len;
609 iph->tot_len = htons(first_len);
610 iph->frag_off = htons(IP_MF);
611 ip_send_check(iph);
612}
613EXPORT_SYMBOL(ip_fraglist_init);
614
615static void ip_fraglist_ipcb_prepare(struct sk_buff *skb,
616 struct ip_fraglist_iter *iter)
617{
618 struct sk_buff *to = iter->frag;
619
620 /* Copy the flags to each fragment. */
621 IPCB(to)->flags = IPCB(skb)->flags;
622
623 if (iter->offset == 0)
624 ip_options_fragment(to);
625}
626
627void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
628{
629 unsigned int hlen = iter->hlen;
630 struct iphdr *iph = iter->iph;
631 struct sk_buff *frag;
632
633 frag = iter->frag;
634 frag->ip_summed = CHECKSUM_NONE;
635 skb_reset_transport_header(frag);
636 __skb_push(frag, hlen);
637 skb_reset_network_header(frag);
638 memcpy(skb_network_header(frag), iph, hlen);
639 iter->iph = ip_hdr(frag);
640 iph = iter->iph;
641 iph->tot_len = htons(frag->len);
642 ip_copy_metadata(frag, skb);
643 iter->offset += skb->len - hlen;
644 iph->frag_off = htons(iter->offset >> 3);
645 if (frag->next)
646 iph->frag_off |= htons(IP_MF);
647 /* Ready, complete checksum */
648 ip_send_check(iph);
649}
650EXPORT_SYMBOL(ip_fraglist_prepare);
651
652void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
653 unsigned int ll_rs, unsigned int mtu, bool DF,
654 struct ip_frag_state *state)
655{
656 struct iphdr *iph = ip_hdr(skb);
657
658 state->DF = DF;
659 state->hlen = hlen;
660 state->ll_rs = ll_rs;
661 state->mtu = mtu;
662
663 state->left = skb->len - hlen; /* Space per frame */
664 state->ptr = hlen; /* Where to start from */
665
666 state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
667 state->not_last_frag = iph->frag_off & htons(IP_MF);
668}
669EXPORT_SYMBOL(ip_frag_init);
670
671static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
672 bool first_frag, struct ip_frag_state *state)
673{
674 /* Copy the flags to each fragment. */
675 IPCB(to)->flags = IPCB(from)->flags;
676
677 /* ANK: dirty, but effective trick. Upgrade options only if
678 * the segment to be fragmented was THE FIRST (otherwise,
679 * options are already fixed) and make it ONCE
680 * on the initial skb, so that all the following fragments
681 * will inherit fixed options.
682 */
683 if (first_frag)
684 ip_options_fragment(from);
685}
686
687struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
688{
689 unsigned int len = state->left;
690 struct sk_buff *skb2;
691 struct iphdr *iph;
692
693 len = state->left;
694 /* IF: it doesn't fit, use 'mtu' - the data space left */
695 if (len > state->mtu)
696 len = state->mtu;
697 /* IF: we are not sending up to and including the packet end
698 then align the next start on an eight byte boundary */
699 if (len < state->left) {
700 len &= ~7;
701 }
702
703 /* Allocate buffer */
704 skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
705 if (!skb2)
706 return ERR_PTR(-ENOMEM);
707
708 /*
709 * Set up data on packet
710 */
711
712 ip_copy_metadata(skb2, skb);
713 skb_reserve(skb2, state->ll_rs);
714 skb_put(skb2, len + state->hlen);
715 skb_reset_network_header(skb2);
716 skb2->transport_header = skb2->network_header + state->hlen;
717
718 /*
719 * Charge the memory for the fragment to any owner
720 * it might possess
721 */
722
723 if (skb->sk)
724 skb_set_owner_w(skb2, skb->sk);
725
726 /*
727 * Copy the packet header into the new buffer.
728 */
729
730 skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
731
732 /*
733 * Copy a block of the IP datagram.
734 */
735 if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
736 BUG();
737 state->left -= len;
738
739 /*
740 * Fill in the new header fields.
741 */
742 iph = ip_hdr(skb2);
743 iph->frag_off = htons((state->offset >> 3));
744 if (state->DF)
745 iph->frag_off |= htons(IP_DF);
746
747 /*
748 * Added AC : If we are fragmenting a fragment that's not the
749 * last fragment then keep MF on each bit
750 */
751 if (state->left > 0 || state->not_last_frag)
752 iph->frag_off |= htons(IP_MF);
753 state->ptr += len;
754 state->offset += len;
755
756 iph->tot_len = htons(len + state->hlen);
757
758 ip_send_check(iph);
759
760 return skb2;
761}
762EXPORT_SYMBOL(ip_frag_next);
763
764/*
765 * This IP datagram is too large to be sent in one piece. Break it up into
766 * smaller pieces (each of size equal to IP header plus
767 * a block of the data of the original IP data part) that will yet fit in a
768 * single device frame, and queue such a frame for sending.
769 */
770
771int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
772 int (*output)(struct net *, struct sock *, struct sk_buff *))
773{
774 struct iphdr *iph;
775 struct sk_buff *skb2;
776 struct rtable *rt = skb_rtable(skb);
777 unsigned int mtu, hlen, ll_rs;
778 struct ip_fraglist_iter iter;
779 ktime_t tstamp = skb->tstamp;
780 struct ip_frag_state state;
781 int err = 0;
782
783 /* for offloaded checksums cleanup checksum before fragmentation */
784 if (skb->ip_summed == CHECKSUM_PARTIAL &&
785 (err = skb_checksum_help(skb)))
786 goto fail;
787
788 /*
789 * Point into the IP datagram header.
790 */
791
792 iph = ip_hdr(skb);
793
794 mtu = ip_skb_dst_mtu(sk, skb);
795 if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
796 mtu = IPCB(skb)->frag_max_size;
797
798 /*
799 * Setup starting values.
800 */
801
802 hlen = iph->ihl * 4;
803 mtu = mtu - hlen; /* Size of data space */
804 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
805 ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
806
807 /* When frag_list is given, use it. First, check its validity:
808 * some transformers could create wrong frag_list or break existing
809 * one, it is not prohibited. In this case fall back to copying.
810 *
811 * LATER: this step can be merged to real generation of fragments,
812 * we can switch to copy when see the first bad fragment.
813 */
814 if (skb_has_frag_list(skb)) {
815 struct sk_buff *frag, *frag2;
816 unsigned int first_len = skb_pagelen(skb);
817
818 if (first_len - hlen > mtu ||
819 ((first_len - hlen) & 7) ||
820 ip_is_fragment(iph) ||
821 skb_cloned(skb) ||
822 skb_headroom(skb) < ll_rs)
823 goto slow_path;
824
825 skb_walk_frags(skb, frag) {
826 /* Correct geometry. */
827 if (frag->len > mtu ||
828 ((frag->len & 7) && frag->next) ||
829 skb_headroom(frag) < hlen + ll_rs)
830 goto slow_path_clean;
831
832 /* Partially cloned skb? */
833 if (skb_shared(frag))
834 goto slow_path_clean;
835
836 BUG_ON(frag->sk);
837 if (skb->sk) {
838 frag->sk = skb->sk;
839 frag->destructor = sock_wfree;
840 }
841 skb->truesize -= frag->truesize;
842 }
843
844 /* Everything is OK. Generate! */
845 ip_fraglist_init(skb, iph, hlen, &iter);
846
847 for (;;) {
848 /* Prepare header of the next frame,
849 * before previous one went down. */
850 if (iter.frag) {
851 ip_fraglist_ipcb_prepare(skb, &iter);
852 ip_fraglist_prepare(skb, &iter);
853 }
854
855 skb->tstamp = tstamp;
856 err = output(net, sk, skb);
857
858 if (!err)
859 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
860 if (err || !iter.frag)
861 break;
862
863 skb = ip_fraglist_next(&iter);
864 }
865
866 if (err == 0) {
867 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
868 return 0;
869 }
870
871 kfree_skb_list(iter.frag);
872
873 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
874 return err;
875
876slow_path_clean:
877 skb_walk_frags(skb, frag2) {
878 if (frag2 == frag)
879 break;
880 frag2->sk = NULL;
881 frag2->destructor = NULL;
882 skb->truesize += frag2->truesize;
883 }
884 }
885
886slow_path:
887 /*
888 * Fragment the datagram.
889 */
890
891 ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
892 &state);
893
894 /*
895 * Keep copying data until we run out.
896 */
897
898 while (state.left > 0) {
899 bool first_frag = (state.offset == 0);
900
901 skb2 = ip_frag_next(skb, &state);
902 if (IS_ERR(skb2)) {
903 err = PTR_ERR(skb2);
904 goto fail;
905 }
906 ip_frag_ipcb(skb, skb2, first_frag, &state);
907
908 /*
909 * Put this fragment into the sending queue.
910 */
911 skb2->tstamp = tstamp;
912 err = output(net, sk, skb2);
913 if (err)
914 goto fail;
915
916 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
917 }
918 consume_skb(skb);
919 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
920 return err;
921
922fail:
923 kfree_skb(skb);
924 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
925 return err;
926}
927EXPORT_SYMBOL(ip_do_fragment);
928
929int
930ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
931{
932 struct msghdr *msg = from;
933
934 if (skb->ip_summed == CHECKSUM_PARTIAL) {
935 if (!copy_from_iter_full(to, len, &msg->msg_iter))
936 return -EFAULT;
937 } else {
938 __wsum csum = 0;
939 if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
940 return -EFAULT;
941 skb->csum = csum_block_add(skb->csum, csum, odd);
942 }
943 return 0;
944}
945EXPORT_SYMBOL(ip_generic_getfrag);
946
947static inline __wsum
948csum_page(struct page *page, int offset, int copy)
949{
950 char *kaddr;
951 __wsum csum;
952 kaddr = kmap(page);
953 csum = csum_partial(kaddr + offset, copy, 0);
954 kunmap(page);
955 return csum;
956}
957
958static int __ip_append_data(struct sock *sk,
959 struct flowi4 *fl4,
960 struct sk_buff_head *queue,
961 struct inet_cork *cork,
962 struct page_frag *pfrag,
963 int getfrag(void *from, char *to, int offset,
964 int len, int odd, struct sk_buff *skb),
965 void *from, int length, int transhdrlen,
966 unsigned int flags)
967{
968 struct inet_sock *inet = inet_sk(sk);
969 struct ubuf_info *uarg = NULL;
970 struct sk_buff *skb;
971
972 struct ip_options *opt = cork->opt;
973 int hh_len;
974 int exthdrlen;
975 int mtu;
976 int copy;
977 int err;
978 int offset = 0;
979 unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
980 int csummode = CHECKSUM_NONE;
981 struct rtable *rt = (struct rtable *)cork->dst;
982 unsigned int wmem_alloc_delta = 0;
983 bool paged, extra_uref = false;
984 u32 tskey = 0;
985
986 skb = skb_peek_tail(queue);
987
988 exthdrlen = !skb ? rt->dst.header_len : 0;
989 mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
990 paged = !!cork->gso_size;
991
992 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
993 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
994 tskey = sk->sk_tskey++;
995
996 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
997
998 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
999 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1000 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
1001
1002 if (cork->length + length > maxnonfragsize - fragheaderlen) {
1003 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
1004 mtu - (opt ? opt->optlen : 0));
1005 return -EMSGSIZE;
1006 }
1007
1008 /*
1009 * transhdrlen > 0 means that this is the first fragment and we wish
1010 * it won't be fragmented in the future.
1011 */
1012 if (transhdrlen &&
1013 length + fragheaderlen <= mtu &&
1014 rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
1015 (!(flags & MSG_MORE) || cork->gso_size) &&
1016 (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
1017 csummode = CHECKSUM_PARTIAL;
1018
1019 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1020 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1021 if (!uarg)
1022 return -ENOBUFS;
1023 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1024 if (rt->dst.dev->features & NETIF_F_SG &&
1025 csummode == CHECKSUM_PARTIAL) {
1026 paged = true;
1027 } else {
1028 uarg->zerocopy = 0;
1029 skb_zcopy_set(skb, uarg, &extra_uref);
1030 }
1031 }
1032
1033 cork->length += length;
1034
1035 /* So, what's going on in the loop below?
1036 *
1037 * We use calculated fragment length to generate chained skb,
1038 * each of segments is IP fragment ready for sending to network after
1039 * adding appropriate IP header.
1040 */
1041
1042 if (!skb)
1043 goto alloc_new_skb;
1044
1045 while (length > 0) {
1046 /* Check if the remaining data fits into current packet. */
1047 copy = mtu - skb->len;
1048 if (copy < length)
1049 copy = maxfraglen - skb->len;
1050 if (copy <= 0) {
1051 char *data;
1052 unsigned int datalen;
1053 unsigned int fraglen;
1054 unsigned int fraggap;
1055 unsigned int alloclen;
1056 unsigned int pagedlen;
1057 struct sk_buff *skb_prev;
1058alloc_new_skb:
1059 skb_prev = skb;
1060 if (skb_prev)
1061 fraggap = skb_prev->len - maxfraglen;
1062 else
1063 fraggap = 0;
1064
1065 /*
1066 * If remaining data exceeds the mtu,
1067 * we know we need more fragment(s).
1068 */
1069 datalen = length + fraggap;
1070 if (datalen > mtu - fragheaderlen)
1071 datalen = maxfraglen - fragheaderlen;
1072 fraglen = datalen + fragheaderlen;
1073 pagedlen = 0;
1074
1075 if ((flags & MSG_MORE) &&
1076 !(rt->dst.dev->features&NETIF_F_SG))
1077 alloclen = mtu;
1078 else if (!paged)
1079 alloclen = fraglen;
1080 else {
1081 alloclen = min_t(int, fraglen, MAX_HEADER);
1082 pagedlen = fraglen - alloclen;
1083 }
1084
1085 alloclen += exthdrlen;
1086
1087 /* The last fragment gets additional space at tail.
1088 * Note, with MSG_MORE we overallocate on fragments,
1089 * because we have no idea what fragment will be
1090 * the last.
1091 */
1092 if (datalen == length + fraggap)
1093 alloclen += rt->dst.trailer_len;
1094
1095 if (transhdrlen) {
1096 skb = sock_alloc_send_skb(sk,
1097 alloclen + hh_len + 15,
1098 (flags & MSG_DONTWAIT), &err);
1099 } else {
1100 skb = NULL;
1101 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1102 2 * sk->sk_sndbuf)
1103 skb = alloc_skb(alloclen + hh_len + 15,
1104 sk->sk_allocation);
1105 if (unlikely(!skb))
1106 err = -ENOBUFS;
1107 }
1108 if (!skb)
1109 goto error;
1110
1111 /*
1112 * Fill in the control structures
1113 */
1114 skb->ip_summed = csummode;
1115 skb->csum = 0;
1116 skb_reserve(skb, hh_len);
1117
1118 /*
1119 * Find where to start putting bytes.
1120 */
1121 data = skb_put(skb, fraglen + exthdrlen - pagedlen);
1122 skb_set_network_header(skb, exthdrlen);
1123 skb->transport_header = (skb->network_header +
1124 fragheaderlen);
1125 data += fragheaderlen + exthdrlen;
1126
1127 if (fraggap) {
1128 skb->csum = skb_copy_and_csum_bits(
1129 skb_prev, maxfraglen,
1130 data + transhdrlen, fraggap, 0);
1131 skb_prev->csum = csum_sub(skb_prev->csum,
1132 skb->csum);
1133 data += fraggap;
1134 pskb_trim_unique(skb_prev, maxfraglen);
1135 }
1136
1137 copy = datalen - transhdrlen - fraggap - pagedlen;
1138 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1139 err = -EFAULT;
1140 kfree_skb(skb);
1141 goto error;
1142 }
1143
1144 offset += copy;
1145 length -= copy + transhdrlen;
1146 transhdrlen = 0;
1147 exthdrlen = 0;
1148 csummode = CHECKSUM_NONE;
1149
1150 /* only the initial fragment is time stamped */
1151 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1152 cork->tx_flags = 0;
1153 skb_shinfo(skb)->tskey = tskey;
1154 tskey = 0;
1155 skb_zcopy_set(skb, uarg, &extra_uref);
1156
1157 if ((flags & MSG_CONFIRM) && !skb_prev)
1158 skb_set_dst_pending_confirm(skb, 1);
1159
1160 /*
1161 * Put the packet on the pending queue.
1162 */
1163 if (!skb->destructor) {
1164 skb->destructor = sock_wfree;
1165 skb->sk = sk;
1166 wmem_alloc_delta += skb->truesize;
1167 }
1168 __skb_queue_tail(queue, skb);
1169 continue;
1170 }
1171
1172 if (copy > length)
1173 copy = length;
1174
1175 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1176 skb_tailroom(skb) >= copy) {
1177 unsigned int off;
1178
1179 off = skb->len;
1180 if (getfrag(from, skb_put(skb, copy),
1181 offset, copy, off, skb) < 0) {
1182 __skb_trim(skb, off);
1183 err = -EFAULT;
1184 goto error;
1185 }
1186 } else if (!uarg || !uarg->zerocopy) {
1187 int i = skb_shinfo(skb)->nr_frags;
1188
1189 err = -ENOMEM;
1190 if (!sk_page_frag_refill(sk, pfrag))
1191 goto error;
1192
1193 if (!skb_can_coalesce(skb, i, pfrag->page,
1194 pfrag->offset)) {
1195 err = -EMSGSIZE;
1196 if (i == MAX_SKB_FRAGS)
1197 goto error;
1198
1199 __skb_fill_page_desc(skb, i, pfrag->page,
1200 pfrag->offset, 0);
1201 skb_shinfo(skb)->nr_frags = ++i;
1202 get_page(pfrag->page);
1203 }
1204 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1205 if (getfrag(from,
1206 page_address(pfrag->page) + pfrag->offset,
1207 offset, copy, skb->len, skb) < 0)
1208 goto error_efault;
1209
1210 pfrag->offset += copy;
1211 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1212 skb->len += copy;
1213 skb->data_len += copy;
1214 skb->truesize += copy;
1215 wmem_alloc_delta += copy;
1216 } else {
1217 err = skb_zerocopy_iter_dgram(skb, from, copy);
1218 if (err < 0)
1219 goto error;
1220 }
1221 offset += copy;
1222 length -= copy;
1223 }
1224
1225 if (wmem_alloc_delta)
1226 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1227 return 0;
1228
1229error_efault:
1230 err = -EFAULT;
1231error:
1232 if (uarg)
1233 sock_zerocopy_put_abort(uarg, extra_uref);
1234 cork->length -= length;
1235 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1236 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1237 return err;
1238}
1239
1240static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1241 struct ipcm_cookie *ipc, struct rtable **rtp)
1242{
1243 struct ip_options_rcu *opt;
1244 struct rtable *rt;
1245
1246 rt = *rtp;
1247 if (unlikely(!rt))
1248 return -EFAULT;
1249
1250 /*
1251 * setup for corking.
1252 */
1253 opt = ipc->opt;
1254 if (opt) {
1255 if (!cork->opt) {
1256 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1257 sk->sk_allocation);
1258 if (unlikely(!cork->opt))
1259 return -ENOBUFS;
1260 }
1261 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1262 cork->flags |= IPCORK_OPT;
1263 cork->addr = ipc->addr;
1264 }
1265
1266 cork->fragsize = ip_sk_use_pmtu(sk) ?
1267 dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
1268
1269 if (!inetdev_valid_mtu(cork->fragsize))
1270 return -ENETUNREACH;
1271
1272 cork->gso_size = ipc->gso_size;
1273
1274 cork->dst = &rt->dst;
1275 /* We stole this route, caller should not release it. */
1276 *rtp = NULL;
1277
1278 cork->length = 0;
1279 cork->ttl = ipc->ttl;
1280 cork->tos = ipc->tos;
1281 cork->mark = ipc->sockc.mark;
1282 cork->priority = ipc->priority;
1283 cork->transmit_time = ipc->sockc.transmit_time;
1284 cork->tx_flags = 0;
1285 sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
1286
1287 return 0;
1288}
1289
1290/*
1291 * ip_append_data() and ip_append_page() can make one large IP datagram
1292 * from many pieces of data. Each pieces will be holded on the socket
1293 * until ip_push_pending_frames() is called. Each piece can be a page
1294 * or non-page data.
1295 *
1296 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1297 * this interface potentially.
1298 *
1299 * LATER: length must be adjusted by pad at tail, when it is required.
1300 */
1301int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1302 int getfrag(void *from, char *to, int offset, int len,
1303 int odd, struct sk_buff *skb),
1304 void *from, int length, int transhdrlen,
1305 struct ipcm_cookie *ipc, struct rtable **rtp,
1306 unsigned int flags)
1307{
1308 struct inet_sock *inet = inet_sk(sk);
1309 int err;
1310
1311 if (flags&MSG_PROBE)
1312 return 0;
1313
1314 if (skb_queue_empty(&sk->sk_write_queue)) {
1315 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1316 if (err)
1317 return err;
1318 } else {
1319 transhdrlen = 0;
1320 }
1321
1322 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1323 sk_page_frag(sk), getfrag,
1324 from, length, transhdrlen, flags);
1325}
1326
1327ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1328 int offset, size_t size, int flags)
1329{
1330 struct inet_sock *inet = inet_sk(sk);
1331 struct sk_buff *skb;
1332 struct rtable *rt;
1333 struct ip_options *opt = NULL;
1334 struct inet_cork *cork;
1335 int hh_len;
1336 int mtu;
1337 int len;
1338 int err;
1339 unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
1340
1341 if (inet->hdrincl)
1342 return -EPERM;
1343
1344 if (flags&MSG_PROBE)
1345 return 0;
1346
1347 if (skb_queue_empty(&sk->sk_write_queue))
1348 return -EINVAL;
1349
1350 cork = &inet->cork.base;
1351 rt = (struct rtable *)cork->dst;
1352 if (cork->flags & IPCORK_OPT)
1353 opt = cork->opt;
1354
1355 if (!(rt->dst.dev->features&NETIF_F_SG))
1356 return -EOPNOTSUPP;
1357
1358 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1359 mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
1360
1361 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1362 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1363 maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
1364
1365 if (cork->length + size > maxnonfragsize - fragheaderlen) {
1366 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
1367 mtu - (opt ? opt->optlen : 0));
1368 return -EMSGSIZE;
1369 }
1370
1371 skb = skb_peek_tail(&sk->sk_write_queue);
1372 if (!skb)
1373 return -EINVAL;
1374
1375 cork->length += size;
1376
1377 while (size > 0) {
1378 /* Check if the remaining data fits into current packet. */
1379 len = mtu - skb->len;
1380 if (len < size)
1381 len = maxfraglen - skb->len;
1382
1383 if (len <= 0) {
1384 struct sk_buff *skb_prev;
1385 int alloclen;
1386
1387 skb_prev = skb;
1388 fraggap = skb_prev->len - maxfraglen;
1389
1390 alloclen = fragheaderlen + hh_len + fraggap + 15;
1391 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1392 if (unlikely(!skb)) {
1393 err = -ENOBUFS;
1394 goto error;
1395 }
1396
1397 /*
1398 * Fill in the control structures
1399 */
1400 skb->ip_summed = CHECKSUM_NONE;
1401 skb->csum = 0;
1402 skb_reserve(skb, hh_len);
1403
1404 /*
1405 * Find where to start putting bytes.
1406 */
1407 skb_put(skb, fragheaderlen + fraggap);
1408 skb_reset_network_header(skb);
1409 skb->transport_header = (skb->network_header +
1410 fragheaderlen);
1411 if (fraggap) {
1412 skb->csum = skb_copy_and_csum_bits(skb_prev,
1413 maxfraglen,
1414 skb_transport_header(skb),
1415 fraggap, 0);
1416 skb_prev->csum = csum_sub(skb_prev->csum,
1417 skb->csum);
1418 pskb_trim_unique(skb_prev, maxfraglen);
1419 }
1420
1421 /*
1422 * Put the packet on the pending queue.
1423 */
1424 __skb_queue_tail(&sk->sk_write_queue, skb);
1425 continue;
1426 }
1427
1428 if (len > size)
1429 len = size;
1430
1431 if (skb_append_pagefrags(skb, page, offset, len)) {
1432 err = -EMSGSIZE;
1433 goto error;
1434 }
1435
1436 if (skb->ip_summed == CHECKSUM_NONE) {
1437 __wsum csum;
1438 csum = csum_page(page, offset, len);
1439 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1440 }
1441
1442 skb->len += len;
1443 skb->data_len += len;
1444 skb->truesize += len;
1445 refcount_add(len, &sk->sk_wmem_alloc);
1446 offset += len;
1447 size -= len;
1448 }
1449 return 0;
1450
1451error:
1452 cork->length -= size;
1453 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1454 return err;
1455}
1456
1457static void ip_cork_release(struct inet_cork *cork)
1458{
1459 cork->flags &= ~IPCORK_OPT;
1460 kfree(cork->opt);
1461 cork->opt = NULL;
1462 dst_release(cork->dst);
1463 cork->dst = NULL;
1464}
1465
1466/*
1467 * Combined all pending IP fragments on the socket as one IP datagram
1468 * and push them out.
1469 */
1470struct sk_buff *__ip_make_skb(struct sock *sk,
1471 struct flowi4 *fl4,
1472 struct sk_buff_head *queue,
1473 struct inet_cork *cork)
1474{
1475 struct sk_buff *skb, *tmp_skb;
1476 struct sk_buff **tail_skb;
1477 struct inet_sock *inet = inet_sk(sk);
1478 struct net *net = sock_net(sk);
1479 struct ip_options *opt = NULL;
1480 struct rtable *rt = (struct rtable *)cork->dst;
1481 struct iphdr *iph;
1482 __be16 df = 0;
1483 __u8 ttl;
1484
1485 skb = __skb_dequeue(queue);
1486 if (!skb)
1487 goto out;
1488 tail_skb = &(skb_shinfo(skb)->frag_list);
1489
1490 /* move skb->data to ip header from ext header */
1491 if (skb->data < skb_network_header(skb))
1492 __skb_pull(skb, skb_network_offset(skb));
1493 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1494 __skb_pull(tmp_skb, skb_network_header_len(skb));
1495 *tail_skb = tmp_skb;
1496 tail_skb = &(tmp_skb->next);
1497 skb->len += tmp_skb->len;
1498 skb->data_len += tmp_skb->len;
1499 skb->truesize += tmp_skb->truesize;
1500 tmp_skb->destructor = NULL;
1501 tmp_skb->sk = NULL;
1502 }
1503
1504 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1505 * to fragment the frame generated here. No matter, what transforms
1506 * how transforms change size of the packet, it will come out.
1507 */
1508 skb->ignore_df = ip_sk_ignore_df(sk);
1509
1510 /* DF bit is set when we want to see DF on outgoing frames.
1511 * If ignore_df is set too, we still allow to fragment this frame
1512 * locally. */
1513 if (inet->pmtudisc == IP_PMTUDISC_DO ||
1514 inet->pmtudisc == IP_PMTUDISC_PROBE ||
1515 (skb->len <= dst_mtu(&rt->dst) &&
1516 ip_dont_fragment(sk, &rt->dst)))
1517 df = htons(IP_DF);
1518
1519 if (cork->flags & IPCORK_OPT)
1520 opt = cork->opt;
1521
1522 if (cork->ttl != 0)
1523 ttl = cork->ttl;
1524 else if (rt->rt_type == RTN_MULTICAST)
1525 ttl = inet->mc_ttl;
1526 else
1527 ttl = ip_select_ttl(inet, &rt->dst);
1528
1529 iph = ip_hdr(skb);
1530 iph->version = 4;
1531 iph->ihl = 5;
1532 iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
1533 iph->frag_off = df;
1534 iph->ttl = ttl;
1535 iph->protocol = sk->sk_protocol;
1536 ip_copy_addrs(iph, fl4);
1537 ip_select_ident(net, skb, sk);
1538
1539 if (opt) {
1540 iph->ihl += opt->optlen>>2;
1541 ip_options_build(skb, opt, cork->addr, rt, 0);
1542 }
1543
1544 skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
1545 skb->mark = cork->mark;
1546 skb->tstamp = cork->transmit_time;
1547 /*
1548 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1549 * on dst refcount
1550 */
1551 cork->dst = NULL;
1552 skb_dst_set(skb, &rt->dst);
1553
1554 if (iph->protocol == IPPROTO_ICMP)
1555 icmp_out_count(net, ((struct icmphdr *)
1556 skb_transport_header(skb))->type);
1557
1558 ip_cork_release(cork);
1559out:
1560 return skb;
1561}
1562
1563int ip_send_skb(struct net *net, struct sk_buff *skb)
1564{
1565 int err;
1566
1567 err = ip_local_out(net, skb->sk, skb);
1568 if (err) {
1569 if (err > 0)
1570 err = net_xmit_errno(err);
1571 if (err)
1572 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1573 }
1574
1575 return err;
1576}
1577
/*
 * Finish the datagram pending on @sk and send it.  Returns 0 when there
 * is nothing to send, otherwise the result of ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets whole the not fragmented skb. */
	return skb ? ip_send_skb(sock_net(sk), skb) : 0;
}
1589
1590/*
1591 * Throw away all pending data on the socket.
1592 */
1593static void __ip_flush_pending_frames(struct sock *sk,
1594 struct sk_buff_head *queue,
1595 struct inet_cork *cork)
1596{
1597 struct sk_buff *skb;
1598
1599 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1600 kfree_skb(skb);
1601
1602 ip_cork_release(cork);
1603}
1604
1605void ip_flush_pending_frames(struct sock *sk)
1606{
1607 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1608}
1609
1610struct sk_buff *ip_make_skb(struct sock *sk,
1611 struct flowi4 *fl4,
1612 int getfrag(void *from, char *to, int offset,
1613 int len, int odd, struct sk_buff *skb),
1614 void *from, int length, int transhdrlen,
1615 struct ipcm_cookie *ipc, struct rtable **rtp,
1616 struct inet_cork *cork, unsigned int flags)
1617{
1618 struct sk_buff_head queue;
1619 int err;
1620
1621 if (flags & MSG_PROBE)
1622 return NULL;
1623
1624 __skb_queue_head_init(&queue);
1625
1626 cork->flags = 0;
1627 cork->addr = 0;
1628 cork->opt = NULL;
1629 err = ip_setup_cork(sk, cork, ipc, rtp);
1630 if (err)
1631 return ERR_PTR(err);
1632
1633 err = __ip_append_data(sk, fl4, &queue, cork,
1634 ¤t->task_frag, getfrag,
1635 from, length, transhdrlen, flags);
1636 if (err) {
1637 __ip_flush_pending_frames(sk, &queue, cork);
1638 return ERR_PTR(err);
1639 }
1640
1641 return __ip_make_skb(sk, fl4, &queue, cork);
1642}
1643
1644/*
1645 * Fetch data from kernel space and fill in checksum if needed.
1646 */
1647static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1648 int len, int odd, struct sk_buff *skb)
1649{
1650 __wsum csum;
1651
1652 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1653 skb->csum = csum_block_add(skb->csum, csum, odd);
1654 return 0;
1655}
1656
1657/*
1658 * Generic function to send a packet as reply to another packet.
1659 * Used to send some TCP resets/acks so far.
1660 */
1661void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
1662 const struct ip_options *sopt,
1663 __be32 daddr, __be32 saddr,
1664 const struct ip_reply_arg *arg,
1665 unsigned int len, u64 transmit_time)
1666{
1667 struct ip_options_data replyopts;
1668 struct ipcm_cookie ipc;
1669 struct flowi4 fl4;
1670 struct rtable *rt = skb_rtable(skb);
1671 struct net *net = sock_net(sk);
1672 struct sk_buff *nskb;
1673 int err;
1674 int oif;
1675
1676 if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
1677 return;
1678
1679 ipcm_init(&ipc);
1680 ipc.addr = daddr;
1681 ipc.sockc.transmit_time = transmit_time;
1682
1683 if (replyopts.opt.opt.optlen) {
1684 ipc.opt = &replyopts.opt;
1685
1686 if (replyopts.opt.opt.srr)
1687 daddr = replyopts.opt.opt.faddr;
1688 }
1689
1690 oif = arg->bound_dev_if;
1691 if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
1692 oif = skb->skb_iif;
1693
1694 flowi4_init_output(&fl4, oif,
1695 IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
1696 RT_TOS(arg->tos),
1697 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1698 ip_reply_arg_flowi_flags(arg),
1699 daddr, saddr,
1700 tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
1701 arg->uid);
1702 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1703 rt = ip_route_output_key(net, &fl4);
1704 if (IS_ERR(rt))
1705 return;
1706
1707 inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
1708
1709 sk->sk_protocol = ip_hdr(skb)->protocol;
1710 sk->sk_bound_dev_if = arg->bound_dev_if;
1711 sk->sk_sndbuf = sysctl_wmem_default;
1712 ipc.sockc.mark = fl4.flowi4_mark;
1713 err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
1714 len, 0, &ipc, &rt, MSG_DONTWAIT);
1715 if (unlikely(err)) {
1716 ip_flush_pending_frames(sk);
1717 goto out;
1718 }
1719
1720 nskb = skb_peek(&sk->sk_write_queue);
1721 if (nskb) {
1722 if (arg->csumoffset >= 0)
1723 *((__sum16 *)skb_transport_header(nskb) +
1724 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1725 arg->csum));
1726 nskb->ip_summed = CHECKSUM_NONE;
1727 ip_push_pending_frames(sk, &fl4);
1728 }
1729out:
1730 ip_rt_put(rt);
1731}
1732
1733void __init ip_init(void)
1734{
1735 ip_rt_init();
1736 inet_initpeers();
1737
1738#if defined(CONFIG_IP_MULTICAST)
1739 igmp_mc_init();
1740#endif
1741}
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readability.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <linux/module.h>
47#include <linux/types.h>
48#include <linux/kernel.h>
49#include <linux/mm.h>
50#include <linux/string.h>
51#include <linux/errno.h>
52#include <linux/highmem.h>
53#include <linux/slab.h>
54
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
69#include <net/xfrm.h>
70#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
74#include <net/checksum.h>
75#include <net/inetpeer.h>
76#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
81#include <linux/tcp.h>
82
/*
 * Default IP time-to-live, initialised to IPDEFTTL and exported to other
 * kernel code.  NOTE(review): the name suggests it is tunable through
 * sysctl; the registration is not visible here - confirm.
 */
83int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84EXPORT_SYMBOL(sysctl_ip_default_ttl);
85
86/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph)
88{
89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91}
92EXPORT_SYMBOL(ip_send_check);
93
94int __ip_local_out(struct sk_buff *skb)
95{
96 struct iphdr *iph = ip_hdr(skb);
97
98 iph->tot_len = htons(skb->len);
99 ip_send_check(iph);
100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
101 skb_dst(skb)->dev, dst_output);
102}
103
/*
 * Send a locally generated packet.  When __ip_local_out() returns 1 the
 * packet is passed on to dst_output(); any other value is returned to
 * the caller unchanged.
 */
int ip_local_out(struct sk_buff *skb)
{
	int verdict = __ip_local_out(skb);

	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
115
116/* dev_loopback_xmit for use with netfilter. */
117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118{
119 skb_reset_mac_header(newskb);
120 __skb_pull(newskb, skb_network_offset(newskb));
121 newskb->pkt_type = PACKET_LOOPBACK;
122 newskb->ip_summed = CHECKSUM_UNNECESSARY;
123 WARN_ON(!skb_dst(newskb));
124 skb_dst_force(newskb);
125 netif_rx_ni(newskb);
126 return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131 int ttl = inet->uc_ttl;
132
133 if (ttl < 0)
134 ttl = ip4_dst_hoplimit(dst);
135 return ttl;
136}
137
138/*
139 * Add an ip header to a skbuff and send it out.
140 *
141 */
142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144{
145 struct inet_sock *inet = inet_sk(sk);
146 struct rtable *rt = skb_rtable(skb);
147 struct iphdr *iph;
148
149 /* Build the IP header. */
150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151 skb_reset_network_header(skb);
152 iph = ip_hdr(skb);
153 iph->version = 4;
154 iph->ihl = 5;
155 iph->tos = inet->tos;
156 if (ip_dont_fragment(sk, &rt->dst))
157 iph->frag_off = htons(IP_DF);
158 else
159 iph->frag_off = 0;
160 iph->ttl = ip_select_ttl(inet, &rt->dst);
161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 iph->saddr = saddr;
163 iph->protocol = sk->sk_protocol;
164 ip_select_ident(iph, &rt->dst, sk);
165
166 if (opt && opt->opt.optlen) {
167 iph->ihl += opt->opt.optlen>>2;
168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
169 }
170
171 skb->priority = sk->sk_priority;
172 skb->mark = sk->sk_mark;
173
174 /* Send it out. */
175 return ip_local_out(skb);
176}
177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
179static inline int ip_finish_output2(struct sk_buff *skb)
180{
181 struct dst_entry *dst = skb_dst(skb);
182 struct rtable *rt = (struct rtable *)dst;
183 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh;
186
187 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 } else if (rt->rt_type == RTN_BROADCAST)
190 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
191
192 /* Be paranoid, rather than too clever. */
193 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194 struct sk_buff *skb2;
195
196 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 if (skb2 == NULL) {
198 kfree_skb(skb);
199 return -ENOMEM;
200 }
201 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk);
203 kfree_skb(skb);
204 skb = skb2;
205 }
206
207 rcu_read_lock();
208 neigh = dst_get_neighbour_noref(dst);
209 if (neigh) {
210 int res = neigh_output(neigh, skb);
211
212 rcu_read_unlock();
213 return res;
214 }
215 rcu_read_unlock();
216
217 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
218 __func__);
219 kfree_skb(skb);
220 return -EINVAL;
221}
222
223static inline int ip_skb_dst_mtu(struct sk_buff *skb)
224{
225 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
226
227 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
228 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
229}
230
231static int ip_finish_output(struct sk_buff *skb)
232{
233#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
234 /* Policy lookup after SNAT yielded a new policy */
235 if (skb_dst(skb)->xfrm != NULL) {
236 IPCB(skb)->flags |= IPSKB_REROUTED;
237 return dst_output(skb);
238 }
239#endif
240 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
241 return ip_fragment(skb, ip_finish_output2);
242 else
243 return ip_finish_output2(skb);
244}
245
246int ip_mc_output(struct sk_buff *skb)
247{
248 struct sock *sk = skb->sk;
249 struct rtable *rt = skb_rtable(skb);
250 struct net_device *dev = rt->dst.dev;
251
252 /*
253 * If the indicated interface is up and running, send the packet.
254 */
255 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
256
257 skb->dev = dev;
258 skb->protocol = htons(ETH_P_IP);
259
260 /*
261 * Multicasts are looped back for other local users
262 */
263
264 if (rt->rt_flags&RTCF_MULTICAST) {
265 if (sk_mc_loop(sk)
266#ifdef CONFIG_IP_MROUTE
267 /* Small optimization: do not loopback not local frames,
268 which returned after forwarding; they will be dropped
269 by ip_mr_input in any case.
270 Note, that local frames are looped back to be delivered
271 to local recipients.
272
273 This check is duplicated in ip_mr_input at the moment.
274 */
275 &&
276 ((rt->rt_flags & RTCF_LOCAL) ||
277 !(IPCB(skb)->flags & IPSKB_FORWARDED))
278#endif
279 ) {
280 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
281 if (newskb)
282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 newskb, NULL, newskb->dev,
284 ip_dev_loopback_xmit);
285 }
286
287 /* Multicasts with ttl 0 must not go beyond the host */
288
289 if (ip_hdr(skb)->ttl == 0) {
290 kfree_skb(skb);
291 return 0;
292 }
293 }
294
295 if (rt->rt_flags&RTCF_BROADCAST) {
296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 if (newskb)
298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 NULL, newskb->dev, ip_dev_loopback_xmit);
300 }
301
302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
303 skb->dev, ip_finish_output,
304 !(IPCB(skb)->flags & IPSKB_REROUTED));
305}
306
307int ip_output(struct sk_buff *skb)
308{
309 struct net_device *dev = skb_dst(skb)->dev;
310
311 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
312
313 skb->dev = dev;
314 skb->protocol = htons(ETH_P_IP);
315
316 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
317 ip_finish_output,
318 !(IPCB(skb)->flags & IPSKB_REROUTED));
319}
320
321/*
322 * copy saddr and daddr, possibly using 64bit load/stores
323 * Equivalent to :
324 * iph->saddr = fl4->saddr;
325 * iph->daddr = fl4->daddr;
326 */
327static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
328{
329 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
330 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
331 memcpy(&iph->saddr, &fl4->saddr,
332 sizeof(fl4->saddr) + sizeof(fl4->daddr));
333}
334
335int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
336{
337 struct sock *sk = skb->sk;
338 struct inet_sock *inet = inet_sk(sk);
339 struct ip_options_rcu *inet_opt;
340 struct flowi4 *fl4;
341 struct rtable *rt;
342 struct iphdr *iph;
343 int res;
344
345 /* Skip all of this if the packet is already routed,
346 * f.e. by something like SCTP.
347 */
348 rcu_read_lock();
349 inet_opt = rcu_dereference(inet->inet_opt);
350 fl4 = &fl->u.ip4;
351 rt = skb_rtable(skb);
352 if (rt != NULL)
353 goto packet_routed;
354
355 /* Make sure we can route this packet. */
356 rt = (struct rtable *)__sk_dst_check(sk, 0);
357 if (rt == NULL) {
358 __be32 daddr;
359
360 /* Use correct destination address if we have options. */
361 daddr = inet->inet_daddr;
362 if (inet_opt && inet_opt->opt.srr)
363 daddr = inet_opt->opt.faddr;
364
365 /* If this fails, retransmit mechanism of transport layer will
366 * keep trying until route appears or the connection times
367 * itself out.
368 */
369 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
370 daddr, inet->inet_saddr,
371 inet->inet_dport,
372 inet->inet_sport,
373 sk->sk_protocol,
374 RT_CONN_FLAGS(sk),
375 sk->sk_bound_dev_if);
376 if (IS_ERR(rt))
377 goto no_route;
378 sk_setup_caps(sk, &rt->dst);
379 }
380 skb_dst_set_noref(skb, &rt->dst);
381
382packet_routed:
383 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
384 goto no_route;
385
386 /* OK, we know where to send it, allocate and build IP header. */
387 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
388 skb_reset_network_header(skb);
389 iph = ip_hdr(skb);
390 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
391 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
392 iph->frag_off = htons(IP_DF);
393 else
394 iph->frag_off = 0;
395 iph->ttl = ip_select_ttl(inet, &rt->dst);
396 iph->protocol = sk->sk_protocol;
397 ip_copy_addrs(iph, fl4);
398
399 /* Transport layer set skb->h.foo itself. */
400
401 if (inet_opt && inet_opt->opt.optlen) {
402 iph->ihl += inet_opt->opt.optlen >> 2;
403 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
404 }
405
406 ip_select_ident_more(iph, &rt->dst, sk,
407 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
408
409 skb->priority = sk->sk_priority;
410 skb->mark = sk->sk_mark;
411
412 res = ip_local_out(skb);
413 rcu_read_unlock();
414 return res;
415
416no_route:
417 rcu_read_unlock();
418 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
419 kfree_skb(skb);
420 return -EHOSTUNREACH;
421}
422EXPORT_SYMBOL(ip_queue_xmit);
423
424
425static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
426{
427 to->pkt_type = from->pkt_type;
428 to->priority = from->priority;
429 to->protocol = from->protocol;
430 skb_dst_drop(to);
431 skb_dst_copy(to, from);
432 to->dev = from->dev;
433 to->mark = from->mark;
434
435 /* Copy the flags to each fragment. */
436 IPCB(to)->flags = IPCB(from)->flags;
437
438#ifdef CONFIG_NET_SCHED
439 to->tc_index = from->tc_index;
440#endif
441 nf_copy(to, from);
442#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
443 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
444 to->nf_trace = from->nf_trace;
445#endif
446#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
447 to->ipvs_property = from->ipvs_property;
448#endif
449 skb_copy_secmark(to, from);
450}
451
452/*
453 * This IP datagram is too large to be sent in one piece. Break it up into
454 * smaller pieces (each of size equal to IP header plus
455 * a block of the data of the original IP data part) that will yet fit in a
456 * single device frame, and queue such a frame for sending.
457 */
458
459int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
460{
461 struct iphdr *iph;
462 int ptr;
463 struct net_device *dev;
464 struct sk_buff *skb2;
465 unsigned int mtu, hlen, left, len, ll_rs;
466 int offset;
467 __be16 not_last_frag;
468 struct rtable *rt = skb_rtable(skb);
469 int err = 0;
470
471 dev = rt->dst.dev;
472
473 /*
474 * Point into the IP datagram header.
475 */
476
477 iph = ip_hdr(skb);
478
479 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
480 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
481 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
482 htonl(ip_skb_dst_mtu(skb)));
483 kfree_skb(skb);
484 return -EMSGSIZE;
485 }
486
487 /*
488 * Setup starting values.
489 */
490
491 hlen = iph->ihl * 4;
492 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
493#ifdef CONFIG_BRIDGE_NETFILTER
494 if (skb->nf_bridge)
495 mtu -= nf_bridge_mtu_reduction(skb);
496#endif
497 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
498
499 /* When frag_list is given, use it. First, check its validity:
500 * some transformers could create wrong frag_list or break existing
501 * one, it is not prohibited. In this case fall back to copying.
502 *
503 * LATER: this step can be merged to real generation of fragments,
504 * we can switch to copy when see the first bad fragment.
505 */
506 if (skb_has_frag_list(skb)) {
507 struct sk_buff *frag, *frag2;
508 int first_len = skb_pagelen(skb);
509
510 if (first_len - hlen > mtu ||
511 ((first_len - hlen) & 7) ||
512 ip_is_fragment(iph) ||
513 skb_cloned(skb))
514 goto slow_path;
515
516 skb_walk_frags(skb, frag) {
517 /* Correct geometry. */
518 if (frag->len > mtu ||
519 ((frag->len & 7) && frag->next) ||
520 skb_headroom(frag) < hlen)
521 goto slow_path_clean;
522
523 /* Partially cloned skb? */
524 if (skb_shared(frag))
525 goto slow_path_clean;
526
527 BUG_ON(frag->sk);
528 if (skb->sk) {
529 frag->sk = skb->sk;
530 frag->destructor = sock_wfree;
531 }
532 skb->truesize -= frag->truesize;
533 }
534
535 /* Everything is OK. Generate! */
536
537 err = 0;
538 offset = 0;
539 frag = skb_shinfo(skb)->frag_list;
540 skb_frag_list_init(skb);
541 skb->data_len = first_len - skb_headlen(skb);
542 skb->len = first_len;
543 iph->tot_len = htons(first_len);
544 iph->frag_off = htons(IP_MF);
545 ip_send_check(iph);
546
547 for (;;) {
548 /* Prepare header of the next frame,
549 * before previous one went down. */
550 if (frag) {
551 frag->ip_summed = CHECKSUM_NONE;
552 skb_reset_transport_header(frag);
553 __skb_push(frag, hlen);
554 skb_reset_network_header(frag);
555 memcpy(skb_network_header(frag), iph, hlen);
556 iph = ip_hdr(frag);
557 iph->tot_len = htons(frag->len);
558 ip_copy_metadata(frag, skb);
559 if (offset == 0)
560 ip_options_fragment(frag);
561 offset += skb->len - hlen;
562 iph->frag_off = htons(offset>>3);
563 if (frag->next != NULL)
564 iph->frag_off |= htons(IP_MF);
565 /* Ready, complete checksum */
566 ip_send_check(iph);
567 }
568
569 err = output(skb);
570
571 if (!err)
572 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
573 if (err || !frag)
574 break;
575
576 skb = frag;
577 frag = skb->next;
578 skb->next = NULL;
579 }
580
581 if (err == 0) {
582 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
583 return 0;
584 }
585
586 while (frag) {
587 skb = frag->next;
588 kfree_skb(frag);
589 frag = skb;
590 }
591 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
592 return err;
593
594slow_path_clean:
595 skb_walk_frags(skb, frag2) {
596 if (frag2 == frag)
597 break;
598 frag2->sk = NULL;
599 frag2->destructor = NULL;
600 skb->truesize += frag2->truesize;
601 }
602 }
603
604slow_path:
605 left = skb->len - hlen; /* Space per frame */
606 ptr = hlen; /* Where to start from */
607
608 /* for bridged IP traffic encapsulated inside f.e. a vlan header,
609 * we need to make room for the encapsulating header
610 */
611 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
612
613 /*
614 * Fragment the datagram.
615 */
616
617 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
618 not_last_frag = iph->frag_off & htons(IP_MF);
619
620 /*
621 * Keep copying data until we run out.
622 */
623
624 while (left > 0) {
625 len = left;
626 /* IF: it doesn't fit, use 'mtu' - the data space left */
627 if (len > mtu)
628 len = mtu;
629 /* IF: we are not sending up to and including the packet end
630 then align the next start on an eight byte boundary */
631 if (len < left) {
632 len &= ~7;
633 }
634 /*
635 * Allocate buffer.
636 */
637
638 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
639 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
640 err = -ENOMEM;
641 goto fail;
642 }
643
644 /*
645 * Set up data on packet
646 */
647
648 ip_copy_metadata(skb2, skb);
649 skb_reserve(skb2, ll_rs);
650 skb_put(skb2, len + hlen);
651 skb_reset_network_header(skb2);
652 skb2->transport_header = skb2->network_header + hlen;
653
654 /*
655 * Charge the memory for the fragment to any owner
656 * it might possess
657 */
658
659 if (skb->sk)
660 skb_set_owner_w(skb2, skb->sk);
661
662 /*
663 * Copy the packet header into the new buffer.
664 */
665
666 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
667
668 /*
669 * Copy a block of the IP datagram.
670 */
671 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
672 BUG();
673 left -= len;
674
675 /*
676 * Fill in the new header fields.
677 */
678 iph = ip_hdr(skb2);
679 iph->frag_off = htons((offset >> 3));
680
681 /* ANK: dirty, but effective trick. Upgrade options only if
682 * the segment to be fragmented was THE FIRST (otherwise,
683 * options are already fixed) and make it ONCE
684 * on the initial skb, so that all the following fragments
685 * will inherit fixed options.
686 */
687 if (offset == 0)
688 ip_options_fragment(skb);
689
690 /*
691 * Added AC : If we are fragmenting a fragment that's not the
692 * last fragment then keep MF on each bit
693 */
694 if (left > 0 || not_last_frag)
695 iph->frag_off |= htons(IP_MF);
696 ptr += len;
697 offset += len;
698
699 /*
700 * Put this fragment into the sending queue.
701 */
702 iph->tot_len = htons(len + hlen);
703
704 ip_send_check(iph);
705
706 err = output(skb2);
707 if (err)
708 goto fail;
709
710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
711 }
712 kfree_skb(skb);
713 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
714 return err;
715
716fail:
717 kfree_skb(skb);
718 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
719 return err;
720}
721EXPORT_SYMBOL(ip_fragment);
722
723int
724ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
725{
726 struct iovec *iov = from;
727
728 if (skb->ip_summed == CHECKSUM_PARTIAL) {
729 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
730 return -EFAULT;
731 } else {
732 __wsum csum = 0;
733 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
734 return -EFAULT;
735 skb->csum = csum_block_add(skb->csum, csum, odd);
736 }
737 return 0;
738}
739EXPORT_SYMBOL(ip_generic_getfrag);
740
741static inline __wsum
742csum_page(struct page *page, int offset, int copy)
743{
744 char *kaddr;
745 __wsum csum;
746 kaddr = kmap(page);
747 csum = csum_partial(kaddr + offset, copy, 0);
748 kunmap(page);
749 return csum;
750}
751
752static inline int ip_ufo_append_data(struct sock *sk,
753 struct sk_buff_head *queue,
754 int getfrag(void *from, char *to, int offset, int len,
755 int odd, struct sk_buff *skb),
756 void *from, int length, int hh_len, int fragheaderlen,
757 int transhdrlen, int maxfraglen, unsigned int flags)
758{
759 struct sk_buff *skb;
760 int err;
761
762 /* There is support for UDP fragmentation offload by network
763 * device, so create one single skb packet containing complete
764 * udp datagram
765 */
766 if ((skb = skb_peek_tail(queue)) == NULL) {
767 skb = sock_alloc_send_skb(sk,
768 hh_len + fragheaderlen + transhdrlen + 20,
769 (flags & MSG_DONTWAIT), &err);
770
771 if (skb == NULL)
772 return err;
773
774 /* reserve space for Hardware header */
775 skb_reserve(skb, hh_len);
776
777 /* create space for UDP/IP header */
778 skb_put(skb, fragheaderlen + transhdrlen);
779
780 /* initialize network header pointer */
781 skb_reset_network_header(skb);
782
783 /* initialize protocol header pointer */
784 skb->transport_header = skb->network_header + fragheaderlen;
785
786 skb->ip_summed = CHECKSUM_PARTIAL;
787 skb->csum = 0;
788
789 /* specify the length of each IP datagram fragment */
790 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
791 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
792 __skb_queue_tail(queue, skb);
793 }
794
795 return skb_append_datato_frags(sk, skb, getfrag, from,
796 (length - transhdrlen));
797}
798
799static int __ip_append_data(struct sock *sk,
800 struct flowi4 *fl4,
801 struct sk_buff_head *queue,
802 struct inet_cork *cork,
803 int getfrag(void *from, char *to, int offset,
804 int len, int odd, struct sk_buff *skb),
805 void *from, int length, int transhdrlen,
806 unsigned int flags)
807{
808 struct inet_sock *inet = inet_sk(sk);
809 struct sk_buff *skb;
810
811 struct ip_options *opt = cork->opt;
812 int hh_len;
813 int exthdrlen;
814 int mtu;
815 int copy;
816 int err;
817 int offset = 0;
818 unsigned int maxfraglen, fragheaderlen;
819 int csummode = CHECKSUM_NONE;
820 struct rtable *rt = (struct rtable *)cork->dst;
821
822 skb = skb_peek_tail(queue);
823
824 exthdrlen = !skb ? rt->dst.header_len : 0;
825 mtu = cork->fragsize;
826
827 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
828
829 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
830 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
831
832 if (cork->length + length > 0xFFFF - fragheaderlen) {
833 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
834 mtu-exthdrlen);
835 return -EMSGSIZE;
836 }
837
838 /*
839 * transhdrlen > 0 means that this is the first fragment and we wish
840 * it won't be fragmented in the future.
841 */
842 if (transhdrlen &&
843 length + fragheaderlen <= mtu &&
844 rt->dst.dev->features & NETIF_F_V4_CSUM &&
845 !exthdrlen)
846 csummode = CHECKSUM_PARTIAL;
847
848 cork->length += length;
849 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
850 (sk->sk_protocol == IPPROTO_UDP) &&
851 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
852 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
853 hh_len, fragheaderlen, transhdrlen,
854 maxfraglen, flags);
855 if (err)
856 goto error;
857 return 0;
858 }
859
860 /* So, what's going on in the loop below?
861 *
862 * We use calculated fragment length to generate chained skb,
863 * each of segments is IP fragment ready for sending to network after
864 * adding appropriate IP header.
865 */
866
867 if (!skb)
868 goto alloc_new_skb;
869
870 while (length > 0) {
871 /* Check if the remaining data fits into current packet. */
872 copy = mtu - skb->len;
873 if (copy < length)
874 copy = maxfraglen - skb->len;
875 if (copy <= 0) {
876 char *data;
877 unsigned int datalen;
878 unsigned int fraglen;
879 unsigned int fraggap;
880 unsigned int alloclen;
881 struct sk_buff *skb_prev;
882alloc_new_skb:
883 skb_prev = skb;
884 if (skb_prev)
885 fraggap = skb_prev->len - maxfraglen;
886 else
887 fraggap = 0;
888
889 /*
890 * If remaining data exceeds the mtu,
891 * we know we need more fragment(s).
892 */
893 datalen = length + fraggap;
894 if (datalen > mtu - fragheaderlen)
895 datalen = maxfraglen - fragheaderlen;
896 fraglen = datalen + fragheaderlen;
897
898 if ((flags & MSG_MORE) &&
899 !(rt->dst.dev->features&NETIF_F_SG))
900 alloclen = mtu;
901 else
902 alloclen = fraglen;
903
904 alloclen += exthdrlen;
905
906 /* The last fragment gets additional space at tail.
907 * Note, with MSG_MORE we overallocate on fragments,
908 * because we have no idea what fragment will be
909 * the last.
910 */
911 if (datalen == length + fraggap)
912 alloclen += rt->dst.trailer_len;
913
914 if (transhdrlen) {
915 skb = sock_alloc_send_skb(sk,
916 alloclen + hh_len + 15,
917 (flags & MSG_DONTWAIT), &err);
918 } else {
919 skb = NULL;
920 if (atomic_read(&sk->sk_wmem_alloc) <=
921 2 * sk->sk_sndbuf)
922 skb = sock_wmalloc(sk,
923 alloclen + hh_len + 15, 1,
924 sk->sk_allocation);
925 if (unlikely(skb == NULL))
926 err = -ENOBUFS;
927 else
928 /* only the initial fragment is
929 time stamped */
930 cork->tx_flags = 0;
931 }
932 if (skb == NULL)
933 goto error;
934
935 /*
936 * Fill in the control structures
937 */
938 skb->ip_summed = csummode;
939 skb->csum = 0;
940 skb_reserve(skb, hh_len);
941 skb_shinfo(skb)->tx_flags = cork->tx_flags;
942
943 /*
944 * Find where to start putting bytes.
945 */
946 data = skb_put(skb, fraglen + exthdrlen);
947 skb_set_network_header(skb, exthdrlen);
948 skb->transport_header = (skb->network_header +
949 fragheaderlen);
950 data += fragheaderlen + exthdrlen;
951
952 if (fraggap) {
953 skb->csum = skb_copy_and_csum_bits(
954 skb_prev, maxfraglen,
955 data + transhdrlen, fraggap, 0);
956 skb_prev->csum = csum_sub(skb_prev->csum,
957 skb->csum);
958 data += fraggap;
959 pskb_trim_unique(skb_prev, maxfraglen);
960 }
961
962 copy = datalen - transhdrlen - fraggap;
963 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
964 err = -EFAULT;
965 kfree_skb(skb);
966 goto error;
967 }
968
969 offset += copy;
970 length -= datalen - fraggap;
971 transhdrlen = 0;
972 exthdrlen = 0;
973 csummode = CHECKSUM_NONE;
974
975 /*
976 * Put the packet on the pending queue.
977 */
978 __skb_queue_tail(queue, skb);
979 continue;
980 }
981
982 if (copy > length)
983 copy = length;
984
985 if (!(rt->dst.dev->features&NETIF_F_SG)) {
986 unsigned int off;
987
988 off = skb->len;
989 if (getfrag(from, skb_put(skb, copy),
990 offset, copy, off, skb) < 0) {
991 __skb_trim(skb, off);
992 err = -EFAULT;
993 goto error;
994 }
995 } else {
996 int i = skb_shinfo(skb)->nr_frags;
997 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
998 struct page *page = cork->page;
999 int off = cork->off;
1000 unsigned int left;
1001
1002 if (page && (left = PAGE_SIZE - off) > 0) {
1003 if (copy >= left)
1004 copy = left;
1005 if (page != skb_frag_page(frag)) {
1006 if (i == MAX_SKB_FRAGS) {
1007 err = -EMSGSIZE;
1008 goto error;
1009 }
1010 skb_fill_page_desc(skb, i, page, off, 0);
1011 skb_frag_ref(skb, i);
1012 frag = &skb_shinfo(skb)->frags[i];
1013 }
1014 } else if (i < MAX_SKB_FRAGS) {
1015 if (copy > PAGE_SIZE)
1016 copy = PAGE_SIZE;
1017 page = alloc_pages(sk->sk_allocation, 0);
1018 if (page == NULL) {
1019 err = -ENOMEM;
1020 goto error;
1021 }
1022 cork->page = page;
1023 cork->off = 0;
1024
1025 skb_fill_page_desc(skb, i, page, 0, 0);
1026 frag = &skb_shinfo(skb)->frags[i];
1027 } else {
1028 err = -EMSGSIZE;
1029 goto error;
1030 }
1031 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1032 offset, copy, skb->len, skb) < 0) {
1033 err = -EFAULT;
1034 goto error;
1035 }
1036 cork->off += copy;
1037 skb_frag_size_add(frag, copy);
1038 skb->len += copy;
1039 skb->data_len += copy;
1040 skb->truesize += copy;
1041 atomic_add(copy, &sk->sk_wmem_alloc);
1042 }
1043 offset += copy;
1044 length -= copy;
1045 }
1046
1047 return 0;
1048
1049error:
1050 cork->length -= length;
1051 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1052 return err;
1053}
1054
1055static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1056 struct ipcm_cookie *ipc, struct rtable **rtp)
1057{
1058 struct inet_sock *inet = inet_sk(sk);
1059 struct ip_options_rcu *opt;
1060 struct rtable *rt;
1061
1062 /*
1063 * setup for corking.
1064 */
1065 opt = ipc->opt;
1066 if (opt) {
1067 if (cork->opt == NULL) {
1068 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1069 sk->sk_allocation);
1070 if (unlikely(cork->opt == NULL))
1071 return -ENOBUFS;
1072 }
1073 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1074 cork->flags |= IPCORK_OPT;
1075 cork->addr = ipc->addr;
1076 }
1077 rt = *rtp;
1078 if (unlikely(!rt))
1079 return -EFAULT;
1080 /*
1081 * We steal reference to this route, caller should not release it
1082 */
1083 *rtp = NULL;
1084 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1085 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1086 cork->dst = &rt->dst;
1087 cork->length = 0;
1088 cork->tx_flags = ipc->tx_flags;
1089 cork->page = NULL;
1090 cork->off = 0;
1091
1092 return 0;
1093}
1094
1095/*
1096 * ip_append_data() and ip_append_page() can make one large IP datagram
1097 * from many pieces of data. Each pieces will be holded on the socket
1098 * until ip_push_pending_frames() is called. Each piece can be a page
1099 * or non-page data.
1100 *
1101 * Not only UDP, other transport protocols - e.g. raw sockets - can use
1102 * this interface potentially.
1103 *
1104 * LATER: length must be adjusted by pad at tail, when it is required.
1105 */
1106int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1107 int getfrag(void *from, char *to, int offset, int len,
1108 int odd, struct sk_buff *skb),
1109 void *from, int length, int transhdrlen,
1110 struct ipcm_cookie *ipc, struct rtable **rtp,
1111 unsigned int flags)
1112{
1113 struct inet_sock *inet = inet_sk(sk);
1114 int err;
1115
1116 if (flags&MSG_PROBE)
1117 return 0;
1118
1119 if (skb_queue_empty(&sk->sk_write_queue)) {
1120 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1121 if (err)
1122 return err;
1123 } else {
1124 transhdrlen = 0;
1125 }
1126
1127 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1128 from, length, transhdrlen, flags);
1129}
1130
1131ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1132 int offset, size_t size, int flags)
1133{
1134 struct inet_sock *inet = inet_sk(sk);
1135 struct sk_buff *skb;
1136 struct rtable *rt;
1137 struct ip_options *opt = NULL;
1138 struct inet_cork *cork;
1139 int hh_len;
1140 int mtu;
1141 int len;
1142 int err;
1143 unsigned int maxfraglen, fragheaderlen, fraggap;
1144
1145 if (inet->hdrincl)
1146 return -EPERM;
1147
1148 if (flags&MSG_PROBE)
1149 return 0;
1150
1151 if (skb_queue_empty(&sk->sk_write_queue))
1152 return -EINVAL;
1153
1154 cork = &inet->cork.base;
1155 rt = (struct rtable *)cork->dst;
1156 if (cork->flags & IPCORK_OPT)
1157 opt = cork->opt;
1158
1159 if (!(rt->dst.dev->features&NETIF_F_SG))
1160 return -EOPNOTSUPP;
1161
1162 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1163 mtu = cork->fragsize;
1164
1165 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1166 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1167
1168 if (cork->length + size > 0xFFFF - fragheaderlen) {
1169 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1170 return -EMSGSIZE;
1171 }
1172
1173 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1174 return -EINVAL;
1175
1176 cork->length += size;
1177 if ((size + skb->len > mtu) &&
1178 (sk->sk_protocol == IPPROTO_UDP) &&
1179 (rt->dst.dev->features & NETIF_F_UFO)) {
1180 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1181 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1182 }
1183
1184
1185 while (size > 0) {
1186 int i;
1187
1188 if (skb_is_gso(skb))
1189 len = size;
1190 else {
1191
1192 /* Check if the remaining data fits into current packet. */
1193 len = mtu - skb->len;
1194 if (len < size)
1195 len = maxfraglen - skb->len;
1196 }
1197 if (len <= 0) {
1198 struct sk_buff *skb_prev;
1199 int alloclen;
1200
1201 skb_prev = skb;
1202 fraggap = skb_prev->len - maxfraglen;
1203
1204 alloclen = fragheaderlen + hh_len + fraggap + 15;
1205 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1206 if (unlikely(!skb)) {
1207 err = -ENOBUFS;
1208 goto error;
1209 }
1210
1211 /*
1212 * Fill in the control structures
1213 */
1214 skb->ip_summed = CHECKSUM_NONE;
1215 skb->csum = 0;
1216 skb_reserve(skb, hh_len);
1217
1218 /*
1219 * Find where to start putting bytes.
1220 */
1221 skb_put(skb, fragheaderlen + fraggap);
1222 skb_reset_network_header(skb);
1223 skb->transport_header = (skb->network_header +
1224 fragheaderlen);
1225 if (fraggap) {
1226 skb->csum = skb_copy_and_csum_bits(skb_prev,
1227 maxfraglen,
1228 skb_transport_header(skb),
1229 fraggap, 0);
1230 skb_prev->csum = csum_sub(skb_prev->csum,
1231 skb->csum);
1232 pskb_trim_unique(skb_prev, maxfraglen);
1233 }
1234
1235 /*
1236 * Put the packet on the pending queue.
1237 */
1238 __skb_queue_tail(&sk->sk_write_queue, skb);
1239 continue;
1240 }
1241
1242 i = skb_shinfo(skb)->nr_frags;
1243 if (len > size)
1244 len = size;
1245 if (skb_can_coalesce(skb, i, page, offset)) {
1246 skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1247 } else if (i < MAX_SKB_FRAGS) {
1248 get_page(page);
1249 skb_fill_page_desc(skb, i, page, offset, len);
1250 } else {
1251 err = -EMSGSIZE;
1252 goto error;
1253 }
1254
1255 if (skb->ip_summed == CHECKSUM_NONE) {
1256 __wsum csum;
1257 csum = csum_page(page, offset, len);
1258 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1259 }
1260
1261 skb->len += len;
1262 skb->data_len += len;
1263 skb->truesize += len;
1264 atomic_add(len, &sk->sk_wmem_alloc);
1265 offset += len;
1266 size -= len;
1267 }
1268 return 0;
1269
1270error:
1271 cork->length -= size;
1272 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1273 return err;
1274}
1275
1276static void ip_cork_release(struct inet_cork *cork)
1277{
1278 cork->flags &= ~IPCORK_OPT;
1279 kfree(cork->opt);
1280 cork->opt = NULL;
1281 dst_release(cork->dst);
1282 cork->dst = NULL;
1283}
1284
1285/*
1286 * Combined all pending IP fragments on the socket as one IP datagram
1287 * and push them out.
1288 */
1289struct sk_buff *__ip_make_skb(struct sock *sk,
1290 struct flowi4 *fl4,
1291 struct sk_buff_head *queue,
1292 struct inet_cork *cork)
1293{
1294 struct sk_buff *skb, *tmp_skb;
1295 struct sk_buff **tail_skb;
1296 struct inet_sock *inet = inet_sk(sk);
1297 struct net *net = sock_net(sk);
1298 struct ip_options *opt = NULL;
1299 struct rtable *rt = (struct rtable *)cork->dst;
1300 struct iphdr *iph;
1301 __be16 df = 0;
1302 __u8 ttl;
1303
1304 if ((skb = __skb_dequeue(queue)) == NULL)
1305 goto out;
1306 tail_skb = &(skb_shinfo(skb)->frag_list);
1307
1308 /* move skb->data to ip header from ext header */
1309 if (skb->data < skb_network_header(skb))
1310 __skb_pull(skb, skb_network_offset(skb));
1311 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1312 __skb_pull(tmp_skb, skb_network_header_len(skb));
1313 *tail_skb = tmp_skb;
1314 tail_skb = &(tmp_skb->next);
1315 skb->len += tmp_skb->len;
1316 skb->data_len += tmp_skb->len;
1317 skb->truesize += tmp_skb->truesize;
1318 tmp_skb->destructor = NULL;
1319 tmp_skb->sk = NULL;
1320 }
1321
1322 /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1323 * to fragment the frame generated here. No matter, what transforms
1324 * how transforms change size of the packet, it will come out.
1325 */
1326 if (inet->pmtudisc < IP_PMTUDISC_DO)
1327 skb->local_df = 1;
1328
1329 /* DF bit is set when we want to see DF on outgoing frames.
1330 * If local_df is set too, we still allow to fragment this frame
1331 * locally. */
1332 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1333 (skb->len <= dst_mtu(&rt->dst) &&
1334 ip_dont_fragment(sk, &rt->dst)))
1335 df = htons(IP_DF);
1336
1337 if (cork->flags & IPCORK_OPT)
1338 opt = cork->opt;
1339
1340 if (rt->rt_type == RTN_MULTICAST)
1341 ttl = inet->mc_ttl;
1342 else
1343 ttl = ip_select_ttl(inet, &rt->dst);
1344
1345 iph = (struct iphdr *)skb->data;
1346 iph->version = 4;
1347 iph->ihl = 5;
1348 iph->tos = inet->tos;
1349 iph->frag_off = df;
1350 ip_select_ident(iph, &rt->dst, sk);
1351 iph->ttl = ttl;
1352 iph->protocol = sk->sk_protocol;
1353 ip_copy_addrs(iph, fl4);
1354
1355 if (opt) {
1356 iph->ihl += opt->optlen>>2;
1357 ip_options_build(skb, opt, cork->addr, rt, 0);
1358 }
1359
1360 skb->priority = sk->sk_priority;
1361 skb->mark = sk->sk_mark;
1362 /*
1363 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1364 * on dst refcount
1365 */
1366 cork->dst = NULL;
1367 skb_dst_set(skb, &rt->dst);
1368
1369 if (iph->protocol == IPPROTO_ICMP)
1370 icmp_out_count(net, ((struct icmphdr *)
1371 skb_transport_header(skb))->type);
1372
1373 ip_cork_release(cork);
1374out:
1375 return skb;
1376}
1377
1378int ip_send_skb(struct sk_buff *skb)
1379{
1380 struct net *net = sock_net(skb->sk);
1381 int err;
1382
1383 err = ip_local_out(skb);
1384 if (err) {
1385 if (err > 0)
1386 err = net_xmit_errno(err);
1387 if (err)
1388 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1389 }
1390
1391 return err;
1392}
1393
/*
 * ip_push_pending_frames - build the pending datagram on @sk and send it.
 *
 * Returns 0 if there was nothing to send, otherwise the result of
 * ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
{
	struct sk_buff *skb = ip_finish_skb(sk, fl4);

	/* Netfilter gets whole the not fragmented skb. */
	return skb ? ip_send_skb(skb) : 0;
}
1405
1406/*
1407 * Throw away all pending data on the socket.
1408 */
1409static void __ip_flush_pending_frames(struct sock *sk,
1410 struct sk_buff_head *queue,
1411 struct inet_cork *cork)
1412{
1413 struct sk_buff *skb;
1414
1415 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1416 kfree_skb(skb);
1417
1418 ip_cork_release(cork);
1419}
1420
1421void ip_flush_pending_frames(struct sock *sk)
1422{
1423 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1424}
1425
1426struct sk_buff *ip_make_skb(struct sock *sk,
1427 struct flowi4 *fl4,
1428 int getfrag(void *from, char *to, int offset,
1429 int len, int odd, struct sk_buff *skb),
1430 void *from, int length, int transhdrlen,
1431 struct ipcm_cookie *ipc, struct rtable **rtp,
1432 unsigned int flags)
1433{
1434 struct inet_cork cork;
1435 struct sk_buff_head queue;
1436 int err;
1437
1438 if (flags & MSG_PROBE)
1439 return NULL;
1440
1441 __skb_queue_head_init(&queue);
1442
1443 cork.flags = 0;
1444 cork.addr = 0;
1445 cork.opt = NULL;
1446 err = ip_setup_cork(sk, &cork, ipc, rtp);
1447 if (err)
1448 return ERR_PTR(err);
1449
1450 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1451 from, length, transhdrlen, flags);
1452 if (err) {
1453 __ip_flush_pending_frames(sk, &queue, &cork);
1454 return ERR_PTR(err);
1455 }
1456
1457 return __ip_make_skb(sk, fl4, &queue, &cork);
1458}
1459
1460/*
1461 * Fetch data from kernel space and fill in checksum if needed.
1462 */
1463static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1464 int len, int odd, struct sk_buff *skb)
1465{
1466 __wsum csum;
1467
1468 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1469 skb->csum = csum_block_add(skb->csum, csum, odd);
1470 return 0;
1471}
1472
1473/*
1474 * Generic function to send a packet as reply to another packet.
1475 * Used to send TCP resets so far. ICMP should use this function too.
1476 *
1477 * Should run single threaded per socket because it uses the sock
1478 * structure to pass arguments.
1479 */
1480void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1481 const struct ip_reply_arg *arg, unsigned int len)
1482{
1483 struct inet_sock *inet = inet_sk(sk);
1484 struct ip_options_data replyopts;
1485 struct ipcm_cookie ipc;
1486 struct flowi4 fl4;
1487 struct rtable *rt = skb_rtable(skb);
1488
1489 if (ip_options_echo(&replyopts.opt.opt, skb))
1490 return;
1491
1492 ipc.addr = daddr;
1493 ipc.opt = NULL;
1494 ipc.tx_flags = 0;
1495
1496 if (replyopts.opt.opt.optlen) {
1497 ipc.opt = &replyopts.opt;
1498
1499 if (replyopts.opt.opt.srr)
1500 daddr = replyopts.opt.opt.faddr;
1501 }
1502
1503 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1504 RT_TOS(arg->tos),
1505 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1506 ip_reply_arg_flowi_flags(arg),
1507 daddr, rt->rt_spec_dst,
1508 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510 rt = ip_route_output_key(sock_net(sk), &fl4);
1511 if (IS_ERR(rt))
1512 return;
1513
1514 /* And let IP do all the hard work.
1515
1516 This chunk is not reenterable, hence spinlock.
1517 Note that it uses the fact, that this function is called
1518 with locally disabled BH and that sk cannot be already spinlocked.
1519 */
1520 bh_lock_sock(sk);
1521 inet->tos = arg->tos;
1522 sk->sk_priority = skb->priority;
1523 sk->sk_protocol = ip_hdr(skb)->protocol;
1524 sk->sk_bound_dev_if = arg->bound_dev_if;
1525 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1526 &ipc, &rt, MSG_DONTWAIT);
1527 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1528 if (arg->csumoffset >= 0)
1529 *((__sum16 *)skb_transport_header(skb) +
1530 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1531 arg->csum));
1532 skb->ip_summed = CHECKSUM_NONE;
1533 ip_push_pending_frames(sk, &fl4);
1534 }
1535
1536 bh_unlock_sock(sk);
1537
1538 ip_rt_put(rt);
1539}
1540
1541void __init ip_init(void)
1542{
1543 ip_rt_init();
1544 inet_initpeers();
1545
1546#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1547 igmp_mc_proc_init();
1548#endif
1549}