1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * The Internet Protocol (IP) output module.
8 *
9 * Authors: Ross Biro
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Donald Becker, <becker@super.org>
12 * Alan Cox, <Alan.Cox@linux.org>
13 * Richard Underwood
14 * Stefan Becker, <stefanb@yello.ping.de>
15 * Jorge Cwik, <jorge@laser.satlink.net>
16 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
17 * Hirokazu Takahashi, <taka@valinux.co.jp>
18 *
19 * See ip_input.c for original log
20 *
21 * Fixes:
22 * Alan Cox : Missing nonblock feature in ip_build_xmit.
23 * Mike Kilburn : htons() missing in ip_build_xmit.
24 * Bradford Johnson: Fix faulty handling of some frames when
25 * no route is found.
26 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
27 * (in case if packet not accepted by
28 * output firewall rules)
29 * Mike McLagan : Routing by source
30 * Alexey Kuznetsov: use new route cache
31 * Andi Kleen: Fix broken PMTU recovery and remove
32 * some redundant tests.
33 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
34 * Andi Kleen : Replace ip_reply with ip_send_reply.
35 * Andi Kleen : Split fast and slow ip_build_xmit path
36 * for decreased register pressure on x86
37 * and more readability.
38 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
39 * silently drop skb instead of failing with -EPERM.
40 * Detlev Wengorz : Copy protocol for fragments.
41 * Hirokazu Takahashi: HW checksumming for outgoing UDP
42 * datagrams.
43 * Hirokazu Takahashi: sendfile() on UDP works now.
44 */
45
46#include <linux/uaccess.h>
47#include <linux/module.h>
48#include <linux/types.h>
49#include <linux/kernel.h>
50#include <linux/mm.h>
51#include <linux/string.h>
52#include <linux/errno.h>
53#include <linux/highmem.h>
54#include <linux/slab.h>
55
56#include <linux/socket.h>
57#include <linux/sockios.h>
58#include <linux/in.h>
59#include <linux/inet.h>
60#include <linux/netdevice.h>
61#include <linux/etherdevice.h>
62#include <linux/proc_fs.h>
63#include <linux/stat.h>
64#include <linux/init.h>
65
66#include <net/snmp.h>
67#include <net/ip.h>
68#include <net/protocol.h>
69#include <net/route.h>
70#include <net/xfrm.h>
71#include <linux/skbuff.h>
72#include <net/sock.h>
73#include <net/arp.h>
74#include <net/icmp.h>
75#include <net/checksum.h>
76#include <net/gso.h>
77#include <net/inetpeer.h>
78#include <net/inet_ecn.h>
79#include <net/lwtunnel.h>
80#include <net/inet_dscp.h>
81#include <linux/bpf-cgroup.h>
82#include <linux/igmp.h>
83#include <linux/netfilter_ipv4.h>
84#include <linux/netfilter_bridge.h>
85#include <linux/netlink.h>
86#include <linux/tcp.h>
87
88static int
89ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
90 unsigned int mtu,
91 int (*output)(struct net *, struct sock *, struct sk_buff *));
92
93/* Generate a checksum for an outgoing IP datagram. */
94void ip_send_check(struct iphdr *iph)
95{
96 iph->check = 0;
97 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
98}
99EXPORT_SYMBOL(ip_send_check);
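/*
 * Note: the checksum only covers the IP header, so it has to be recomputed
 * whenever any header field changes (tot_len, frag_off, ttl, options, ...).
 * That is why the fragmentation helpers further down call ip_send_check()
 * again for every fragment header they modify.
 */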
100
101int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
102{
103 struct iphdr *iph = ip_hdr(skb);
104
105 IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS);
106
107 iph_set_totlen(iph, skb->len);
108 ip_send_check(iph);
109
110 /* If the egress device is enslaved to an L3 master device, pass
111 * the skb to its handler for processing
112 */
113 skb = l3mdev_ip_out(sk, skb);
114 if (unlikely(!skb))
115 return 0;
116
117 skb->protocol = htons(ETH_P_IP);
118
119 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
120 net, sk, skb, NULL, skb_dst(skb)->dev,
121 dst_output);
122}
123
124int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
125{
126 int err;
127
128 err = __ip_local_out(net, sk, skb);
129 if (likely(err == 1))
130 err = dst_output(net, sk, skb);
131
132 return err;
133}
134EXPORT_SYMBOL_GPL(ip_local_out);
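/*
 * Note: __ip_local_out() returns the NF_INET_LOCAL_OUT verdict; a return
 * value of 1 means the packet passed the hook and was neither stolen nor
 * queued by netfilter, so ip_local_out() has to hand it to dst_output()
 * itself.
 */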
135
136static inline int ip_select_ttl(const struct inet_sock *inet,
137 const struct dst_entry *dst)
138{
139 int ttl = READ_ONCE(inet->uc_ttl);
140
141 if (ttl < 0)
142 ttl = ip4_dst_hoplimit(dst);
143 return ttl;
144}
145
146/*
147 * Add an ip header to a skbuff and send it out.
148 *
149 */
150int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
151 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
152 u8 tos)
153{
154 const struct inet_sock *inet = inet_sk(sk);
155 struct rtable *rt = skb_rtable(skb);
156 struct net *net = sock_net(sk);
157 struct iphdr *iph;
158
159 /* Build the IP header. */
160 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
161 skb_reset_network_header(skb);
162 iph = ip_hdr(skb);
163 iph->version = 4;
164 iph->ihl = 5;
165 iph->tos = tos;
166 iph->ttl = ip_select_ttl(inet, &rt->dst);
167 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
168 iph->saddr = saddr;
169 iph->protocol = sk->sk_protocol;
170 /* Do not bother generating IPID for small packets (e.g. SYNACK) */
171 if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) {
172 iph->frag_off = htons(IP_DF);
173 iph->id = 0;
174 } else {
175 iph->frag_off = 0;
176 /* TCP packets here are SYNACK with fat IPv4/TCP options.
177 * Avoid using the hashed IP ident generator.
178 */
179 if (sk->sk_protocol == IPPROTO_TCP)
180 iph->id = (__force __be16)get_random_u16();
181 else
182 __ip_select_ident(net, iph, 1);
183 }
184
185 if (opt && opt->opt.optlen) {
186 iph->ihl += opt->opt.optlen>>2;
187 ip_options_build(skb, &opt->opt, daddr, rt);
188 }
189
190 skb->priority = READ_ONCE(sk->sk_priority);
191 if (!skb->mark)
192 skb->mark = READ_ONCE(sk->sk_mark);
193
194 /* Send it out. */
195 return ip_local_out(net, skb->sk, skb);
196}
197EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
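/*
 * Illustrative sketch (not kernel code): a request-socket path that already
 * holds a route - e.g. a SYNACK transmit - typically builds the transport
 * header into skb, attaches the dst and then lets this helper prepend the IP
 * header and send, roughly:
 *
 *	skb_dst_set(skb, dst_clone(dst));
 *	err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 *				    ireq->ir_rmt_addr,
 *				    rcu_dereference(ireq->ireq_opt), tos);
 *
 * The request-socket field names above are only meant to evoke that path;
 * see the real callers for the authoritative usage.
 */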
198
199static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
200{
201 struct dst_entry *dst = skb_dst(skb);
202 struct rtable *rt = dst_rtable(dst);
203 struct net_device *dev = dst->dev;
204 unsigned int hh_len = LL_RESERVED_SPACE(dev);
205 struct neighbour *neigh;
206 bool is_v6gw = false;
207
208 if (rt->rt_type == RTN_MULTICAST) {
209 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
210 } else if (rt->rt_type == RTN_BROADCAST)
211 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
212
213 /* OUTOCTETS should be counted after fragmentation */
214 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
215
216 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
217 skb = skb_expand_head(skb, hh_len);
218 if (!skb)
219 return -ENOMEM;
220 }
221
222 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
223 int res = lwtunnel_xmit(skb);
224
225 if (res != LWTUNNEL_XMIT_CONTINUE)
226 return res;
227 }
228
229 rcu_read_lock();
230 neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
231 if (!IS_ERR(neigh)) {
232 int res;
233
234 sock_confirm_neigh(skb, neigh);
235 /* if crossing protocols, we cannot use the cached header */
236 res = neigh_output(neigh, skb, is_v6gw);
237 rcu_read_unlock();
238 return res;
239 }
240 rcu_read_unlock();
241
242 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
243 __func__);
244 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
245 return PTR_ERR(neigh);
246}
247
248static int ip_finish_output_gso(struct net *net, struct sock *sk,
249 struct sk_buff *skb, unsigned int mtu)
250{
251 struct sk_buff *segs, *nskb;
252 netdev_features_t features;
253 int ret = 0;
254
255 /* common case: seglen is <= mtu
256 */
257 if (skb_gso_validate_network_len(skb, mtu))
258 return ip_finish_output2(net, sk, skb);
259
260 /* Slowpath - GSO segment length exceeds the egress MTU.
261 *
262 * This can happen in several cases:
263 * - Forwarding of a TCP GRO skb, when DF flag is not set.
264 * - Forwarding of an skb that arrived on a virtualization interface
265 * (virtio-net/vhost/tap) with TSO/GSO size set by other network
266 * stack.
267 * - Local GSO skb transmitted on a NETIF_F_TSO tunnel stacked over an
268 * interface with a smaller MTU.
269 * - Arriving GRO skb (or GSO skb in a virtualized environment) that is
270 * bridged to a NETIF_F_TSO tunnel stacked over an interface with an
271 * insufficient MTU.
272 */
273 features = netif_skb_features(skb);
274 BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
275 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
276 if (IS_ERR_OR_NULL(segs)) {
277 kfree_skb(skb);
278 return -ENOMEM;
279 }
280
281 consume_skb(skb);
282
283 skb_list_walk_safe(segs, segs, nskb) {
284 int err;
285
286 skb_mark_not_on_list(segs);
287 err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
288
289 if (err && ret == 0)
290 ret = err;
291 }
292
293 return ret;
294}
295
296static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
297{
298 unsigned int mtu;
299
300#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
301 /* Policy lookup after SNAT yielded a new policy */
302 if (skb_dst(skb)->xfrm) {
303 IPCB(skb)->flags |= IPSKB_REROUTED;
304 return dst_output(net, sk, skb);
305 }
306#endif
307 mtu = ip_skb_dst_mtu(sk, skb);
308 if (skb_is_gso(skb))
309 return ip_finish_output_gso(net, sk, skb, mtu);
310
311 if (skb->len > mtu || IPCB(skb)->frag_max_size)
312 return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
313
314 return ip_finish_output2(net, sk, skb);
315}
316
317static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
318{
319 int ret;
320
321 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
322 switch (ret) {
323 case NET_XMIT_SUCCESS:
324 return __ip_finish_output(net, sk, skb);
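	/* NET_XMIT_CN: still transmit the packet; the ?: below lets a
	 * nonzero transmit result take precedence, otherwise the CN
	 * verdict is propagated back to the caller.
	 */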
325 case NET_XMIT_CN:
326 return __ip_finish_output(net, sk, skb) ? : ret;
327 default:
328 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
329 return ret;
330 }
331}
332
333static int ip_mc_finish_output(struct net *net, struct sock *sk,
334 struct sk_buff *skb)
335{
336 struct rtable *new_rt;
337 bool do_cn = false;
338 int ret, err;
339
340 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
341 switch (ret) {
342 case NET_XMIT_CN:
343 do_cn = true;
344 fallthrough;
345 case NET_XMIT_SUCCESS:
346 break;
347 default:
348 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
349 return ret;
350 }
351
352 /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
353 * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
354 * see ipv4_pktinfo_prepare().
355 */
356 new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
357 if (new_rt) {
358 new_rt->rt_iif = 0;
359 skb_dst_drop(skb);
360 skb_dst_set(skb, &new_rt->dst);
361 }
362
363 err = dev_loopback_xmit(net, sk, skb);
364 return (do_cn && err) ? ret : err;
365}
366
367int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
368{
369 struct rtable *rt = skb_rtable(skb);
370 struct net_device *dev = rt->dst.dev;
371
372 /*
373 * If the indicated interface is up and running, send the packet.
374 */
375 skb->dev = dev;
376 skb->protocol = htons(ETH_P_IP);
377
378 /*
379 * Multicasts are looped back for other local users
380 */
381
382 if (rt->rt_flags&RTCF_MULTICAST) {
383 if (sk_mc_loop(sk)
384#ifdef CONFIG_IP_MROUTE
385 /* Small optimization: do not loop back non-local frames
386 that came back after forwarding; ip_mr_input will drop
387 them in any case.
388 Note that local frames are looped back so that they are
389 delivered to local recipients.
390
391 This check is duplicated in ip_mr_input at the moment.
392 */
393 &&
394 ((rt->rt_flags & RTCF_LOCAL) ||
395 !(IPCB(skb)->flags & IPSKB_FORWARDED))
396#endif
397 ) {
398 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
399 if (newskb)
400 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
401 net, sk, newskb, NULL, newskb->dev,
402 ip_mc_finish_output);
403 }
404
405 /* Multicasts with ttl 0 must not go beyond the host */
406
407 if (ip_hdr(skb)->ttl == 0) {
408 kfree_skb(skb);
409 return 0;
410 }
411 }
412
413 if (rt->rt_flags&RTCF_BROADCAST) {
414 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
415 if (newskb)
416 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
417 net, sk, newskb, NULL, newskb->dev,
418 ip_mc_finish_output);
419 }
420
421 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
422 net, sk, skb, NULL, skb->dev,
423 ip_finish_output,
424 !(IPCB(skb)->flags & IPSKB_REROUTED));
425}
426
427int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
428{
429 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
430
431 skb->dev = dev;
432 skb->protocol = htons(ETH_P_IP);
433
434 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
435 net, sk, skb, indev, dev,
436 ip_finish_output,
437 !(IPCB(skb)->flags & IPSKB_REROUTED));
438}
439EXPORT_SYMBOL(ip_output);
440
441/*
442 * copy saddr and daddr, possibly using 64bit load/stores
443 * Equivalent to :
444 * iph->saddr = fl4->saddr;
445 * iph->daddr = fl4->daddr;
446 */
447static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
448{
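	/* Keeping saddr and daddr adjacent in struct flowi4 is what allows
	 * the compiler to merge the two 32-bit stores below into a single
	 * 64-bit store; the BUILD_BUG_ON catches any layout change.
	 */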
449 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
450 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
451
452 iph->saddr = fl4->saddr;
453 iph->daddr = fl4->daddr;
454}
455
456/* Note: skb->sk can be different from sk, in case of tunnels */
457int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
458 __u8 tos)
459{
460 struct inet_sock *inet = inet_sk(sk);
461 struct net *net = sock_net(sk);
462 struct ip_options_rcu *inet_opt;
463 struct flowi4 *fl4;
464 struct rtable *rt;
465 struct iphdr *iph;
466 int res;
467
468 /* Skip all of this if the packet is already routed,
469 * e.g. by something like SCTP.
470 */
471 rcu_read_lock();
472 inet_opt = rcu_dereference(inet->inet_opt);
473 fl4 = &fl->u.ip4;
474 rt = skb_rtable(skb);
475 if (rt)
476 goto packet_routed;
477
478 /* Make sure we can route this packet. */
479 rt = dst_rtable(__sk_dst_check(sk, 0));
480 if (!rt) {
481 __be32 daddr;
482
483 /* Use correct destination address if we have options. */
484 daddr = inet->inet_daddr;
485 if (inet_opt && inet_opt->opt.srr)
486 daddr = inet_opt->opt.faddr;
487
488 /* If this fails, the retransmit mechanism of the transport layer
489 * will keep trying until a route appears or the connection times
490 * out.
491 */
492 rt = ip_route_output_ports(net, fl4, sk,
493 daddr, inet->inet_saddr,
494 inet->inet_dport,
495 inet->inet_sport,
496 sk->sk_protocol,
497 tos & INET_DSCP_MASK,
498 sk->sk_bound_dev_if);
499 if (IS_ERR(rt))
500 goto no_route;
501 sk_setup_caps(sk, &rt->dst);
502 }
503 skb_dst_set_noref(skb, &rt->dst);
504
505packet_routed:
506 if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway)
507 goto no_route;
508
509 /* OK, we know where to send it, allocate and build IP header. */
510 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
511 skb_reset_network_header(skb);
512 iph = ip_hdr(skb);
513 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (tos & 0xff));
514 if (ip_dont_fragment(sk, &rt->dst) && !skb->ignore_df)
515 iph->frag_off = htons(IP_DF);
516 else
517 iph->frag_off = 0;
518 iph->ttl = ip_select_ttl(inet, &rt->dst);
519 iph->protocol = sk->sk_protocol;
520 ip_copy_addrs(iph, fl4);
521
522 /* The transport layer has already set the transport header itself. */
523
524 if (inet_opt && inet_opt->opt.optlen) {
525 iph->ihl += inet_opt->opt.optlen >> 2;
526 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
527 }
528
529 ip_select_ident_segs(net, skb, sk,
530 skb_shinfo(skb)->gso_segs ?: 1);
531
532 /* TODO : should we use skb->sk here instead of sk ? */
533 skb->priority = READ_ONCE(sk->sk_priority);
534 skb->mark = READ_ONCE(sk->sk_mark);
535
536 res = ip_local_out(net, sk, skb);
537 rcu_read_unlock();
538 return res;
539
540no_route:
541 rcu_read_unlock();
542 IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
543 kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
544 return -EHOSTUNREACH;
545}
546EXPORT_SYMBOL(__ip_queue_xmit);
547
548int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
549{
550 return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
551}
552EXPORT_SYMBOL(ip_queue_xmit);
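/*
 * Illustrative sketch (not actual caller code): connection-oriented
 * transports typically install ip_queue_xmit() as their af-specific
 * queue_xmit hook, so a transmit ends up doing roughly
 *
 *	err = ip_queue_xmit(sk, skb, &inet->cork.fl);
 *
 * with the flowi4 inside that flowi filled in by the route lookup in
 * __ip_queue_xmit() whenever the skb does not already carry a route.
 */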
553
554static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
555{
556 to->pkt_type = from->pkt_type;
557 to->priority = from->priority;
558 to->protocol = from->protocol;
559 to->skb_iif = from->skb_iif;
560 skb_dst_drop(to);
561 skb_dst_copy(to, from);
562 to->dev = from->dev;
563 to->mark = from->mark;
564
565 skb_copy_hash(to, from);
566
567#ifdef CONFIG_NET_SCHED
568 to->tc_index = from->tc_index;
569#endif
570 nf_copy(to, from);
571 skb_ext_copy(to, from);
572#if IS_ENABLED(CONFIG_IP_VS)
573 to->ipvs_property = from->ipvs_property;
574#endif
575 skb_copy_secmark(to, from);
576}
577
578static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
579 unsigned int mtu,
580 int (*output)(struct net *, struct sock *, struct sk_buff *))
581{
582 struct iphdr *iph = ip_hdr(skb);
583
584 if ((iph->frag_off & htons(IP_DF)) == 0)
585 return ip_do_fragment(net, sk, skb, output);
586
587 if (unlikely(!skb->ignore_df ||
588 (IPCB(skb)->frag_max_size &&
589 IPCB(skb)->frag_max_size > mtu))) {
590 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
591 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
592 htonl(mtu));
593 kfree_skb(skb);
594 return -EMSGSIZE;
595 }
596
597 return ip_do_fragment(net, sk, skb, output);
598}
599
600void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
601 unsigned int hlen, struct ip_fraglist_iter *iter)
602{
603 unsigned int first_len = skb_pagelen(skb);
604
605 iter->frag = skb_shinfo(skb)->frag_list;
606 skb_frag_list_init(skb);
607
608 iter->offset = 0;
609 iter->iph = iph;
610 iter->hlen = hlen;
611
612 skb->data_len = first_len - skb_headlen(skb);
613 skb->len = first_len;
614 iph->tot_len = htons(first_len);
615 iph->frag_off = htons(IP_MF);
616 ip_send_check(iph);
617}
618EXPORT_SYMBOL(ip_fraglist_init);
619
620void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
621{
622 unsigned int hlen = iter->hlen;
623 struct iphdr *iph = iter->iph;
624 struct sk_buff *frag;
625
626 frag = iter->frag;
627 frag->ip_summed = CHECKSUM_NONE;
628 skb_reset_transport_header(frag);
629 __skb_push(frag, hlen);
630 skb_reset_network_header(frag);
631 memcpy(skb_network_header(frag), iph, hlen);
632 iter->iph = ip_hdr(frag);
633 iph = iter->iph;
634 iph->tot_len = htons(frag->len);
635 ip_copy_metadata(frag, skb);
636 iter->offset += skb->len - hlen;
637 iph->frag_off = htons(iter->offset >> 3);
638 if (frag->next)
639 iph->frag_off |= htons(IP_MF);
640 /* Ready, complete checksum */
641 ip_send_check(iph);
642}
643EXPORT_SYMBOL(ip_fraglist_prepare);
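/*
 * Sketch of how the fraglist helpers above fit together (this mirrors the
 * fast path in ip_do_fragment() below):
 *
 *	ip_fraglist_init(skb, iph, hlen, &iter);
 *	for (;;) {
 *		if (iter.frag)
 *			ip_fraglist_prepare(skb, &iter);
 *		err = output(net, sk, skb);
 *		if (err || !iter.frag)
 *			break;
 *		skb = ip_fraglist_next(&iter);
 *	}
 */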
644
645void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
646 unsigned int ll_rs, unsigned int mtu, bool DF,
647 struct ip_frag_state *state)
648{
649 struct iphdr *iph = ip_hdr(skb);
650
651 state->DF = DF;
652 state->hlen = hlen;
653 state->ll_rs = ll_rs;
654 state->mtu = mtu;
655
656 state->left = skb->len - hlen; /* Space per frame */
657 state->ptr = hlen; /* Where to start from */
658
659 state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
660 state->not_last_frag = iph->frag_off & htons(IP_MF);
661}
662EXPORT_SYMBOL(ip_frag_init);
663
664static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
665 bool first_frag)
666{
667 /* Copy the flags to each fragment. */
668 IPCB(to)->flags = IPCB(from)->flags;
669
670 /* ANK: dirty but effective trick. Fix up the options only if
671 * the segment being fragmented was THE FIRST one (otherwise the
672 * options have already been fixed), and do it ONCE on the
673 * initial skb, so that all the following fragments inherit the
674 * fixed options.
675 */
676 if (first_frag)
677 ip_options_fragment(from);
678}
679
680struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
681{
682 unsigned int len = state->left;
683 struct sk_buff *skb2;
684 struct iphdr *iph;
685
686 /* If it does not fit, use 'mtu' - the data space left */
687 if (len > state->mtu)
688 len = state->mtu;
689 /* If we are not sending up to and including the end of the packet,
690 then align the start of the next fragment on an eight byte boundary */
691 if (len < state->left) {
692 len &= ~7;
693 }
694
695 /* Allocate buffer */
696 skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
697 if (!skb2)
698 return ERR_PTR(-ENOMEM);
699
700 /*
701 * Set up data on packet
702 */
703
704 ip_copy_metadata(skb2, skb);
705 skb_reserve(skb2, state->ll_rs);
706 skb_put(skb2, len + state->hlen);
707 skb_reset_network_header(skb2);
708 skb2->transport_header = skb2->network_header + state->hlen;
709
710 /*
711 * Charge the memory for the fragment to any owner
712 * it might possess
713 */
714
715 if (skb->sk)
716 skb_set_owner_w(skb2, skb->sk);
717
718 /*
719 * Copy the packet header into the new buffer.
720 */
721
722 skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
723
724 /*
725 * Copy a block of the IP datagram.
726 */
727 if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
728 BUG();
729 state->left -= len;
730
731 /*
732 * Fill in the new header fields.
733 */
734 iph = ip_hdr(skb2);
735 iph->frag_off = htons((state->offset >> 3));
736 if (state->DF)
737 iph->frag_off |= htons(IP_DF);
738
739 /*
740 * Added AC : If we are fragmenting a fragment that is not the
741 * last fragment, then keep the MF bit set on each fragment.
742 */
743 if (state->left > 0 || state->not_last_frag)
744 iph->frag_off |= htons(IP_MF);
745 state->ptr += len;
746 state->offset += len;
747
748 iph->tot_len = htons(len + state->hlen);
749
750 ip_send_check(iph);
751
752 return skb2;
753}
754EXPORT_SYMBOL(ip_frag_next);
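/*
 * Sketch of the slow-path usage of ip_frag_init()/ip_frag_next() (this
 * mirrors the slow path in ip_do_fragment() below):
 *
 *	ip_frag_init(skb, hlen, ll_rs, mtu, df, &state);
 *	while (state.left > 0) {
 *		skb2 = ip_frag_next(skb, &state);
 *		if (IS_ERR(skb2))
 *			break;
 *		err = output(net, sk, skb2);
 *	}
 */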
755
756/*
757 * This IP datagram is too large to be sent in one piece. Break it up into
758 * smaller pieces (each carrying the IP header plus a block of the data of
759 * the original IP datagram), each small enough to fit in a single device
760 * frame, and queue such frames for sending.
761 */
762
763int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
764 int (*output)(struct net *, struct sock *, struct sk_buff *))
765{
766 struct iphdr *iph;
767 struct sk_buff *skb2;
768 u8 tstamp_type = skb->tstamp_type;
769 struct rtable *rt = skb_rtable(skb);
770 unsigned int mtu, hlen, ll_rs;
771 struct ip_fraglist_iter iter;
772 ktime_t tstamp = skb->tstamp;
773 struct ip_frag_state state;
774 int err = 0;
775
776 /* For offloaded checksums, resolve the checksum before fragmentation */
777 if (skb->ip_summed == CHECKSUM_PARTIAL &&
778 (err = skb_checksum_help(skb)))
779 goto fail;
780
781 /*
782 * Point into the IP datagram header.
783 */
784
785 iph = ip_hdr(skb);
786
787 mtu = ip_skb_dst_mtu(sk, skb);
788 if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
789 mtu = IPCB(skb)->frag_max_size;
790
791 /*
792 * Setup starting values.
793 */
794
795 hlen = iph->ihl * 4;
796 mtu = mtu - hlen; /* Size of data space */
797 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
798 ll_rs = LL_RESERVED_SPACE(rt->dst.dev);
799
800 /* When a frag_list is given, use it. First, check its validity:
801 * some transformers could create a wrong frag_list or break an
802 * existing one; that is not prohibited. In that case fall back to copying.
803 *
804 * LATER: this step could be merged into the real generation of fragments;
805 * we could switch to copying when we see the first bad fragment.
806 */
807 if (skb_has_frag_list(skb)) {
808 struct sk_buff *frag, *frag2;
809 unsigned int first_len = skb_pagelen(skb);
810
811 if (first_len - hlen > mtu ||
812 ((first_len - hlen) & 7) ||
813 ip_is_fragment(iph) ||
814 skb_cloned(skb) ||
815 skb_headroom(skb) < ll_rs)
816 goto slow_path;
817
818 skb_walk_frags(skb, frag) {
819 /* Correct geometry. */
820 if (frag->len > mtu ||
821 ((frag->len & 7) && frag->next) ||
822 skb_headroom(frag) < hlen + ll_rs)
823 goto slow_path_clean;
824
825 /* Partially cloned skb? */
826 if (skb_shared(frag))
827 goto slow_path_clean;
828
829 BUG_ON(frag->sk);
830 if (skb->sk) {
831 frag->sk = skb->sk;
832 frag->destructor = sock_wfree;
833 }
834 skb->truesize -= frag->truesize;
835 }
836
837 /* Everything is OK. Generate! */
838 ip_fraglist_init(skb, iph, hlen, &iter);
839
840 for (;;) {
841 /* Prepare the header of the next fragment
842 * before the previous one has gone down. */
843 if (iter.frag) {
844 bool first_frag = (iter.offset == 0);
845
846 IPCB(iter.frag)->flags = IPCB(skb)->flags;
847 ip_fraglist_prepare(skb, &iter);
848 if (first_frag && IPCB(skb)->opt.optlen) {
849 /* ipcb->opt is not populated for frags
850 * coming from __ip_make_skb(),
851 * ip_options_fragment() needs optlen
852 */
853 IPCB(iter.frag)->opt.optlen =
854 IPCB(skb)->opt.optlen;
855 ip_options_fragment(iter.frag);
856 ip_send_check(iter.iph);
857 }
858 }
859
860 skb_set_delivery_time(skb, tstamp, tstamp_type);
861 err = output(net, sk, skb);
862
863 if (!err)
864 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
865 if (err || !iter.frag)
866 break;
867
868 skb = ip_fraglist_next(&iter);
869 }
870
871 if (err == 0) {
872 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
873 return 0;
874 }
875
876 kfree_skb_list(iter.frag);
877
878 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
879 return err;
880
881slow_path_clean:
882 skb_walk_frags(skb, frag2) {
883 if (frag2 == frag)
884 break;
885 frag2->sk = NULL;
886 frag2->destructor = NULL;
887 skb->truesize += frag2->truesize;
888 }
889 }
890
891slow_path:
892 /*
893 * Fragment the datagram.
894 */
895
896 ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
897 &state);
898
899 /*
900 * Keep copying data until we run out.
901 */
902
903 while (state.left > 0) {
904 bool first_frag = (state.offset == 0);
905
906 skb2 = ip_frag_next(skb, &state);
907 if (IS_ERR(skb2)) {
908 err = PTR_ERR(skb2);
909 goto fail;
910 }
911 ip_frag_ipcb(skb, skb2, first_frag);
912
913 /*
914 * Put this fragment into the sending queue.
915 */
916 skb_set_delivery_time(skb2, tstamp, tstamp_type);
917 err = output(net, sk, skb2);
918 if (err)
919 goto fail;
920
921 IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
922 }
923 consume_skb(skb);
924 IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS);
925 return err;
926
927fail:
928 kfree_skb(skb);
929 IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
930 return err;
931}
932EXPORT_SYMBOL(ip_do_fragment);
933
934int
935ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
936{
937 struct msghdr *msg = from;
938
939 if (skb->ip_summed == CHECKSUM_PARTIAL) {
940 if (!copy_from_iter_full(to, len, &msg->msg_iter))
941 return -EFAULT;
942 } else {
943 __wsum csum = 0;
944 if (!csum_and_copy_from_iter_full(to, len, &csum, &msg->msg_iter))
945 return -EFAULT;
946 skb->csum = csum_block_add(skb->csum, csum, odd);
947 }
948 return 0;
949}
950EXPORT_SYMBOL(ip_generic_getfrag);
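/*
 * Note on the getfrag contract: a callback is asked to copy 'len' bytes of
 * payload into 'to' and, when the skb is not CHECKSUM_PARTIAL, to fold a
 * checksum of the copied block into skb->csum at block offset 'odd'. This
 * generic implementation pulls the bytes from the msghdr iterator, so it
 * does not need the 'offset' argument; other callbacks, such as
 * ip_reply_glue_bits() below, do use it.
 */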
951
952static int __ip_append_data(struct sock *sk,
953 struct flowi4 *fl4,
954 struct sk_buff_head *queue,
955 struct inet_cork *cork,
956 struct page_frag *pfrag,
957 int getfrag(void *from, char *to, int offset,
958 int len, int odd, struct sk_buff *skb),
959 void *from, int length, int transhdrlen,
960 unsigned int flags)
961{
962 struct inet_sock *inet = inet_sk(sk);
963 struct ubuf_info *uarg = NULL;
964 struct sk_buff *skb;
965 struct ip_options *opt = cork->opt;
966 int hh_len;
967 int exthdrlen;
968 int mtu;
969 int copy;
970 int err;
971 int offset = 0;
972 bool zc = false;
973 unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
974 int csummode = CHECKSUM_NONE;
975 struct rtable *rt = dst_rtable(cork->dst);
976 bool paged, hold_tskey = false, extra_uref = false;
977 unsigned int wmem_alloc_delta = 0;
978 u32 tskey = 0;
979
980 skb = skb_peek_tail(queue);
981
982 exthdrlen = !skb ? rt->dst.header_len : 0;
983 mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
984 paged = !!cork->gso_size;
985
986 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
987
988 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
989 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
990 maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;
991
992 if (cork->length + length > maxnonfragsize - fragheaderlen) {
993 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
994 mtu - (opt ? opt->optlen : 0));
995 return -EMSGSIZE;
996 }
997
998 /*
999 * transhdrlen > 0 means that this is the first fragment and we wish
1000 * it not to be fragmented later.
1001 */
1002 if (transhdrlen &&
1003 length + fragheaderlen <= mtu &&
1004 rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
1005 (!(flags & MSG_MORE) || cork->gso_size) &&
1006 (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
1007 csummode = CHECKSUM_PARTIAL;
1008
1009 if ((flags & MSG_ZEROCOPY) && length) {
1010 struct msghdr *msg = from;
1011
1012 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1013 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1014 return -EINVAL;
1015
1016 /* Leave uarg NULL if we can't zerocopy; callers should
1017 * be able to handle it.
1018 */
1019 if ((rt->dst.dev->features & NETIF_F_SG) &&
1020 csummode == CHECKSUM_PARTIAL) {
1021 paged = true;
1022 zc = true;
1023 uarg = msg->msg_ubuf;
1024 }
1025 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1026 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1027 if (!uarg)
1028 return -ENOBUFS;
1029 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1030 if (rt->dst.dev->features & NETIF_F_SG &&
1031 csummode == CHECKSUM_PARTIAL) {
1032 paged = true;
1033 zc = true;
1034 } else {
1035 uarg_to_msgzc(uarg)->zerocopy = 0;
1036 skb_zcopy_set(skb, uarg, &extra_uref);
1037 }
1038 }
1039 } else if ((flags & MSG_SPLICE_PAGES) && length) {
1040 if (inet_test_bit(HDRINCL, sk))
1041 return -EPERM;
1042 if (rt->dst.dev->features & NETIF_F_SG &&
1043 getfrag == ip_generic_getfrag)
1044 /* We need an empty buffer to attach stuff to */
1045 paged = true;
1046 else
1047 flags &= ~MSG_SPLICE_PAGES;
1048 }
1049
1050 cork->length += length;
1051
1052 if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1053 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
1054 if (cork->flags & IPCORK_TS_OPT_ID) {
1055 tskey = cork->ts_opt_id;
1056 } else {
1057 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1058 hold_tskey = true;
1059 }
1060 }
1061
1062 /* So, what's going on in the loop below?
1063 *
1064 * We use the calculated fragment length to generate a chain of skbs;
1065 * each segment is an IP fragment, ready to be sent to the network once
1066 * the appropriate IP header has been added.
1067 */
1068
1069 if (!skb)
1070 goto alloc_new_skb;
1071
1072 while (length > 0) {
1073 /* Check if the remaining data fits into current packet. */
1074 copy = mtu - skb->len;
1075 if (copy < length)
1076 copy = maxfraglen - skb->len;
1077 if (copy <= 0) {
1078 char *data;
1079 unsigned int datalen;
1080 unsigned int fraglen;
1081 unsigned int fraggap;
1082 unsigned int alloclen, alloc_extra;
1083 unsigned int pagedlen;
1084 struct sk_buff *skb_prev;
1085alloc_new_skb:
1086 skb_prev = skb;
1087 if (skb_prev)
1088 fraggap = skb_prev->len - maxfraglen;
1089 else
1090 fraggap = 0;
1091
1092 /*
1093 * If remaining data exceeds the mtu,
1094 * we know we need more fragment(s).
1095 */
1096 datalen = length + fraggap;
1097 if (datalen > mtu - fragheaderlen)
1098 datalen = maxfraglen - fragheaderlen;
1099 fraglen = datalen + fragheaderlen;
1100 pagedlen = 0;
1101
1102 alloc_extra = hh_len + 15;
1103 alloc_extra += exthdrlen;
1104
1105 /* The last fragment gets additional space at tail.
1106 * Note: with MSG_MORE we overallocate on fragments,
1107 * because we have no idea which fragment will be
1108 * the last.
1109 */
1110 if (datalen == length + fraggap)
1111 alloc_extra += rt->dst.trailer_len;
1112
1113 if ((flags & MSG_MORE) &&
1114 !(rt->dst.dev->features&NETIF_F_SG))
1115 alloclen = mtu;
1116 else if (!paged &&
1117 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1118 !(rt->dst.dev->features & NETIF_F_SG)))
1119 alloclen = fraglen;
1120 else {
1121 alloclen = fragheaderlen + transhdrlen;
1122 pagedlen = datalen - transhdrlen;
1123 }
1124
1125 alloclen += alloc_extra;
1126
1127 if (transhdrlen) {
1128 skb = sock_alloc_send_skb(sk, alloclen,
1129 (flags & MSG_DONTWAIT), &err);
1130 } else {
1131 skb = NULL;
1132 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1133 2 * sk->sk_sndbuf)
1134 skb = alloc_skb(alloclen,
1135 sk->sk_allocation);
1136 if (unlikely(!skb))
1137 err = -ENOBUFS;
1138 }
1139 if (!skb)
1140 goto error;
1141
1142 /*
1143 * Fill in the control structures
1144 */
1145 skb->ip_summed = csummode;
1146 skb->csum = 0;
1147 skb_reserve(skb, hh_len);
1148
1149 /*
1150 * Find where to start putting bytes.
1151 */
1152 data = skb_put(skb, fraglen + exthdrlen - pagedlen);
1153 skb_set_network_header(skb, exthdrlen);
1154 skb->transport_header = (skb->network_header +
1155 fragheaderlen);
1156 data += fragheaderlen + exthdrlen;
1157
1158 if (fraggap) {
1159 skb->csum = skb_copy_and_csum_bits(
1160 skb_prev, maxfraglen,
1161 data + transhdrlen, fraggap);
1162 skb_prev->csum = csum_sub(skb_prev->csum,
1163 skb->csum);
1164 data += fraggap;
1165 pskb_trim_unique(skb_prev, maxfraglen);
1166 }
1167
1168 copy = datalen - transhdrlen - fraggap - pagedlen;
1169 /* [!] NOTE: copy will be negative if pagedlen>0
1170 * because then the equation reduces to -fraggap.
1171 */
1172 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1173 err = -EFAULT;
1174 kfree_skb(skb);
1175 goto error;
1176 } else if (flags & MSG_SPLICE_PAGES) {
1177 copy = 0;
1178 }
1179
1180 offset += copy;
1181 length -= copy + transhdrlen;
1182 transhdrlen = 0;
1183 exthdrlen = 0;
1184 csummode = CHECKSUM_NONE;
1185
1186 /* only the initial fragment is time stamped */
1187 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1188 cork->tx_flags = 0;
1189 skb_shinfo(skb)->tskey = tskey;
1190 tskey = 0;
1191 skb_zcopy_set(skb, uarg, &extra_uref);
1192
1193 if ((flags & MSG_CONFIRM) && !skb_prev)
1194 skb_set_dst_pending_confirm(skb, 1);
1195
1196 /*
1197 * Put the packet on the pending queue.
1198 */
1199 if (!skb->destructor) {
1200 skb->destructor = sock_wfree;
1201 skb->sk = sk;
1202 wmem_alloc_delta += skb->truesize;
1203 }
1204 __skb_queue_tail(queue, skb);
1205 continue;
1206 }
1207
1208 if (copy > length)
1209 copy = length;
1210
1211 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1212 skb_tailroom(skb) >= copy) {
1213 unsigned int off;
1214
1215 off = skb->len;
1216 if (getfrag(from, skb_put(skb, copy),
1217 offset, copy, off, skb) < 0) {
1218 __skb_trim(skb, off);
1219 err = -EFAULT;
1220 goto error;
1221 }
1222 } else if (flags & MSG_SPLICE_PAGES) {
1223 struct msghdr *msg = from;
1224
1225 err = -EIO;
1226 if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1227 goto error;
1228
1229 err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1230 sk->sk_allocation);
1231 if (err < 0)
1232 goto error;
1233 copy = err;
1234 wmem_alloc_delta += copy;
1235 } else if (!zc) {
1236 int i = skb_shinfo(skb)->nr_frags;
1237
1238 err = -ENOMEM;
1239 if (!sk_page_frag_refill(sk, pfrag))
1240 goto error;
1241
1242 skb_zcopy_downgrade_managed(skb);
1243 if (!skb_can_coalesce(skb, i, pfrag->page,
1244 pfrag->offset)) {
1245 err = -EMSGSIZE;
1246 if (i == MAX_SKB_FRAGS)
1247 goto error;
1248
1249 __skb_fill_page_desc(skb, i, pfrag->page,
1250 pfrag->offset, 0);
1251 skb_shinfo(skb)->nr_frags = ++i;
1252 get_page(pfrag->page);
1253 }
1254 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1255 if (getfrag(from,
1256 page_address(pfrag->page) + pfrag->offset,
1257 offset, copy, skb->len, skb) < 0)
1258 goto error_efault;
1259
1260 pfrag->offset += copy;
1261 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1262 skb_len_add(skb, copy);
1263 wmem_alloc_delta += copy;
1264 } else {
1265 err = skb_zerocopy_iter_dgram(skb, from, copy);
1266 if (err < 0)
1267 goto error;
1268 }
1269 offset += copy;
1270 length -= copy;
1271 }
1272
1273 if (wmem_alloc_delta)
1274 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1275 return 0;
1276
1277error_efault:
1278 err = -EFAULT;
1279error:
1280 net_zcopy_put_abort(uarg, extra_uref);
1281 cork->length -= length;
1282 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1283 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1284 if (hold_tskey)
1285 atomic_dec(&sk->sk_tskey);
1286 return err;
1287}
1288
1289static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1290 struct ipcm_cookie *ipc, struct rtable **rtp)
1291{
1292 struct ip_options_rcu *opt;
1293 struct rtable *rt;
1294
1295 rt = *rtp;
1296 if (unlikely(!rt))
1297 return -EFAULT;
1298
1299 cork->fragsize = ip_sk_use_pmtu(sk) ?
1300 dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
1301
1302 if (!inetdev_valid_mtu(cork->fragsize))
1303 return -ENETUNREACH;
1304
1305 /*
1306 * setup for corking.
1307 */
1308 opt = ipc->opt;
1309 if (opt) {
1310 if (!cork->opt) {
1311 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1312 sk->sk_allocation);
1313 if (unlikely(!cork->opt))
1314 return -ENOBUFS;
1315 }
1316 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1317 cork->flags |= IPCORK_OPT;
1318 cork->addr = ipc->addr;
1319 }
1320
1321 cork->gso_size = ipc->gso_size;
1322
1323 cork->dst = &rt->dst;
1324 /* We stole this route, caller should not release it. */
1325 *rtp = NULL;
1326
1327 cork->length = 0;
1328 cork->ttl = ipc->ttl;
1329 cork->tos = ipc->tos;
1330 cork->mark = ipc->sockc.mark;
1331 cork->priority = ipc->priority;
1332 cork->transmit_time = ipc->sockc.transmit_time;
1333 cork->tx_flags = 0;
1334 sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags);
1335 if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
1336 cork->flags |= IPCORK_TS_OPT_ID;
1337 cork->ts_opt_id = ipc->sockc.ts_opt_id;
1338 }
1339
1340 return 0;
1341}
1342
1343/*
1344 * ip_append_data() can make one large IP datagram from many pieces of
1345 * data. Each piece will be held on the socket until
1346 * ip_push_pending_frames() is called. Each piece can be a page or
1347 * non-page data.
1348 *
1349 * Not only UDP; other transport protocols - e.g. raw sockets - can
1350 * potentially use this interface as well.
1351 *
1352 * LATER: length must be adjusted by pad at tail, when it is required.
1353 */
1354int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1355 int getfrag(void *from, char *to, int offset, int len,
1356 int odd, struct sk_buff *skb),
1357 void *from, int length, int transhdrlen,
1358 struct ipcm_cookie *ipc, struct rtable **rtp,
1359 unsigned int flags)
1360{
1361 struct inet_sock *inet = inet_sk(sk);
1362 int err;
1363
1364 if (flags&MSG_PROBE)
1365 return 0;
1366
1367 if (skb_queue_empty(&sk->sk_write_queue)) {
1368 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1369 if (err)
1370 return err;
1371 } else {
1372 transhdrlen = 0;
1373 }
1374
1375 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
1376 sk_page_frag(sk), getfrag,
1377 from, length, transhdrlen, flags);
1378}
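/*
 * Illustrative sketch (not actual caller code) of the corked-send pattern
 * built on ip_append_data(): a datagram sender appends one or more chunks
 * under the socket lock and then either pushes or discards the queue:
 *
 *	lock_sock(sk);
 *	err = ip_append_data(sk, fl4, ip_generic_getfrag, msg, len,
 *			     sizeof(struct udphdr), &ipc, &rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip_push_pending_frames(sk, fl4);
 *	release_sock(sk);
 *
 * The udphdr size and the 'corked' flag are stand-ins for whatever the
 * transport actually uses.
 */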
1379
1380static void ip_cork_release(struct inet_cork *cork)
1381{
1382 cork->flags &= ~IPCORK_OPT;
1383 kfree(cork->opt);
1384 cork->opt = NULL;
1385 dst_release(cork->dst);
1386 cork->dst = NULL;
1387}
1388
1389/*
1390 * Combine all pending IP fragments on the socket into one IP datagram
1391 * and push them out.
1392 */
1393struct sk_buff *__ip_make_skb(struct sock *sk,
1394 struct flowi4 *fl4,
1395 struct sk_buff_head *queue,
1396 struct inet_cork *cork)
1397{
1398 struct sk_buff *skb, *tmp_skb;
1399 struct sk_buff **tail_skb;
1400 struct inet_sock *inet = inet_sk(sk);
1401 struct net *net = sock_net(sk);
1402 struct ip_options *opt = NULL;
1403 struct rtable *rt = dst_rtable(cork->dst);
1404 struct iphdr *iph;
1405 u8 pmtudisc, ttl;
1406 __be16 df = 0;
1407
1408 skb = __skb_dequeue(queue);
1409 if (!skb)
1410 goto out;
1411 tail_skb = &(skb_shinfo(skb)->frag_list);
1412
1413 /* move skb->data to ip header from ext header */
1414 if (skb->data < skb_network_header(skb))
1415 __skb_pull(skb, skb_network_offset(skb));
1416 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1417 __skb_pull(tmp_skb, skb_network_header_len(skb));
1418 *tail_skb = tmp_skb;
1419 tail_skb = &(tmp_skb->next);
1420 skb->len += tmp_skb->len;
1421 skb->data_len += tmp_skb->len;
1422 skb->truesize += tmp_skb->truesize;
1423 tmp_skb->destructor = NULL;
1424 tmp_skb->sk = NULL;
1425 }
1426
1427 /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1428 * the frame generated here to be fragmented. No matter how transforms
1429 * change the size of the packet, it will come out.
1430 */
1431 skb->ignore_df = ip_sk_ignore_df(sk);
1432
1433 /* DF bit is set when we want to see DF on outgoing frames.
1434 * If ignore_df is also set, we still allow this frame to be fragmented
1435 * locally. */
1436 pmtudisc = READ_ONCE(inet->pmtudisc);
1437 if (pmtudisc == IP_PMTUDISC_DO ||
1438 pmtudisc == IP_PMTUDISC_PROBE ||
1439 (skb->len <= dst_mtu(&rt->dst) &&
1440 ip_dont_fragment(sk, &rt->dst)))
1441 df = htons(IP_DF);
1442
1443 if (cork->flags & IPCORK_OPT)
1444 opt = cork->opt;
1445
1446 if (cork->ttl != 0)
1447 ttl = cork->ttl;
1448 else if (rt->rt_type == RTN_MULTICAST)
1449 ttl = READ_ONCE(inet->mc_ttl);
1450 else
1451 ttl = ip_select_ttl(inet, &rt->dst);
1452
1453 iph = ip_hdr(skb);
1454 iph->version = 4;
1455 iph->ihl = 5;
1456 iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
1457 iph->frag_off = df;
1458 iph->ttl = ttl;
1459 iph->protocol = sk->sk_protocol;
1460 ip_copy_addrs(iph, fl4);
1461 ip_select_ident(net, skb, sk);
1462
1463 if (opt) {
1464 iph->ihl += opt->optlen >> 2;
1465 ip_options_build(skb, opt, cork->addr, rt);
1466 }
1467
1468 skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
1469 skb->mark = cork->mark;
1470 if (sk_is_tcp(sk))
1471 skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
1472 else
1473 skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid);
1474 /*
1475 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1476 * on dst refcount
1477 */
1478 cork->dst = NULL;
1479 skb_dst_set(skb, &rt->dst);
1480
1481 if (iph->protocol == IPPROTO_ICMP) {
1482 u8 icmp_type;
1483
1484 /* For such sockets, transhdrlen is zero when ip_append_data() is called,
1485 * so the icmphdr is not in the skb linear region and icmp_type cannot be
1486 * read via icmp_hdr(skb)->type.
1487 */
1488 if (sk->sk_type == SOCK_RAW &&
1489 !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH))
1490 icmp_type = fl4->fl4_icmp_type;
1491 else
1492 icmp_type = icmp_hdr(skb)->type;
1493 icmp_out_count(net, icmp_type);
1494 }
1495
1496 ip_cork_release(cork);
1497out:
1498 return skb;
1499}
1500
1501int ip_send_skb(struct net *net, struct sk_buff *skb)
1502{
1503 int err;
1504
1505 err = ip_local_out(net, skb->sk, skb);
1506 if (err) {
1507 if (err > 0)
1508 err = net_xmit_errno(err);
1509 if (err)
1510 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1511 }
1512
1513 return err;
1514}
1515
1516int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1517{
1518 struct sk_buff *skb;
1519
1520 skb = ip_finish_skb(sk, fl4);
1521 if (!skb)
1522 return 0;
1523
1524 /* Netfilter gets the whole, not yet fragmented skb. */
1525 return ip_send_skb(sock_net(sk), skb);
1526}
1527
1528/*
1529 * Throw away all pending data on the socket.
1530 */
1531static void __ip_flush_pending_frames(struct sock *sk,
1532 struct sk_buff_head *queue,
1533 struct inet_cork *cork)
1534{
1535 struct sk_buff *skb;
1536
1537 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1538 kfree_skb(skb);
1539
1540 ip_cork_release(cork);
1541}
1542
1543void ip_flush_pending_frames(struct sock *sk)
1544{
1545 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1546}
1547
1548struct sk_buff *ip_make_skb(struct sock *sk,
1549 struct flowi4 *fl4,
1550 int getfrag(void *from, char *to, int offset,
1551 int len, int odd, struct sk_buff *skb),
1552 void *from, int length, int transhdrlen,
1553 struct ipcm_cookie *ipc, struct rtable **rtp,
1554 struct inet_cork *cork, unsigned int flags)
1555{
1556 struct sk_buff_head queue;
1557 int err;
1558
1559 if (flags & MSG_PROBE)
1560 return NULL;
1561
1562 __skb_queue_head_init(&queue);
1563
1564 cork->flags = 0;
1565 cork->addr = 0;
1566 cork->opt = NULL;
1567 err = ip_setup_cork(sk, cork, ipc, rtp);
1568 if (err)
1569 return ERR_PTR(err);
1570
1571 err = __ip_append_data(sk, fl4, &queue, cork,
1572 ¤t->task_frag, getfrag,
1573 from, length, transhdrlen, flags);
1574 if (err) {
1575 __ip_flush_pending_frames(sk, &queue, cork);
1576 return ERR_PTR(err);
1577 }
1578
1579 return __ip_make_skb(sk, fl4, &queue, cork);
1580}
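/*
 * Note: ip_make_skb() runs the same append path as ip_append_data(), but on
 * an on-stack queue and a caller-supplied cork instead of the socket's
 * sk_write_queue and cork, so a single un-corked datagram can be built and
 * later passed to ip_send_skb() without touching the socket's pending-frame
 * state.
 */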
1581
1582/*
1583 * Fetch data from kernel space and fill in checksum if needed.
1584 */
1585static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1586 int len, int odd, struct sk_buff *skb)
1587{
1588 __wsum csum;
1589
1590 csum = csum_partial_copy_nocheck(dptr+offset, to, len);
1591 skb->csum = csum_block_add(skb->csum, csum, odd);
1592 return 0;
1593}
1594
1595/*
1596 * Generic function to send a packet as a reply to another packet.
1597 * Used to send some TCP resets/acks so far.
1598 */
1599void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
1600 struct sk_buff *skb,
1601 const struct ip_options *sopt,
1602 __be32 daddr, __be32 saddr,
1603 const struct ip_reply_arg *arg,
1604 unsigned int len, u64 transmit_time, u32 txhash)
1605{
1606 struct ip_options_data replyopts;
1607 struct ipcm_cookie ipc;
1608 struct flowi4 fl4;
1609 struct rtable *rt = skb_rtable(skb);
1610 struct net *net = sock_net(sk);
1611 struct sk_buff *nskb;
1612 int err;
1613 int oif;
1614
1615 if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
1616 return;
1617
1618 ipcm_init(&ipc);
1619 ipc.addr = daddr;
1620 ipc.sockc.transmit_time = transmit_time;
1621
1622 if (replyopts.opt.opt.optlen) {
1623 ipc.opt = &replyopts.opt;
1624
1625 if (replyopts.opt.opt.srr)
1626 daddr = replyopts.opt.opt.faddr;
1627 }
1628
1629 oif = arg->bound_dev_if;
1630 if (!oif && netif_index_is_l3_master(net, skb->skb_iif))
1631 oif = skb->skb_iif;
1632
1633 flowi4_init_output(&fl4, oif,
1634 IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
1635 arg->tos & INET_DSCP_MASK,
1636 RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
1637 ip_reply_arg_flowi_flags(arg),
1638 daddr, saddr,
1639 tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
1640 arg->uid);
1641 security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
1642 rt = ip_route_output_flow(net, &fl4, sk);
1643 if (IS_ERR(rt))
1644 return;
1645
1646 inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK;
1647
1648 sk->sk_protocol = ip_hdr(skb)->protocol;
1649 sk->sk_bound_dev_if = arg->bound_dev_if;
1650 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
1651 ipc.sockc.mark = fl4.flowi4_mark;
1652 err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
1653 len, 0, &ipc, &rt, MSG_DONTWAIT);
1654 if (unlikely(err)) {
1655 ip_flush_pending_frames(sk);
1656 goto out;
1657 }
1658
1659 nskb = skb_peek(&sk->sk_write_queue);
1660 if (nskb) {
1661 if (arg->csumoffset >= 0)
1662 *((__sum16 *)skb_transport_header(nskb) +
1663 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1664 arg->csum));
1665 nskb->ip_summed = CHECKSUM_NONE;
1666 if (orig_sk)
1667 skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
1668 if (transmit_time)
1669 nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
1670 if (txhash)
1671 skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4);
1672 ip_push_pending_frames(sk, &fl4);
1673 }
1674out:
1675 ip_rt_put(rt);
1676}
1677
1678void __init ip_init(void)
1679{
1680 ip_rt_init();
1681 inet_initpeers();
1682
1683#if defined(CONFIG_IP_MULTICAST)
1684 igmp_mc_init();
1685#endif
1686}
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * The Internet Protocol (IP) output module.
7 *
8 * Authors: Ross Biro
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Donald Becker, <becker@super.org>
11 * Alan Cox, <Alan.Cox@linux.org>
12 * Richard Underwood
13 * Stefan Becker, <stefanb@yello.ping.de>
14 * Jorge Cwik, <jorge@laser.satlink.net>
15 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16 * Hirokazu Takahashi, <taka@valinux.co.jp>
17 *
18 * See ip_input.c for original log
19 *
20 * Fixes:
21 * Alan Cox : Missing nonblock feature in ip_build_xmit.
22 * Mike Kilburn : htons() missing in ip_build_xmit.
23 * Bradford Johnson: Fix faulty handling of some frames when
24 * no route is found.
25 * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit
26 * (in case if packet not accepted by
27 * output firewall rules)
28 * Mike McLagan : Routing by source
29 * Alexey Kuznetsov: use new route cache
30 * Andi Kleen: Fix broken PMTU recovery and remove
31 * some redundant tests.
32 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
33 * Andi Kleen : Replace ip_reply with ip_send_reply.
34 * Andi Kleen : Split fast and slow ip_build_xmit path
35 * for decreased register pressure on x86
36 * and more readibility.
37 * Marc Boucher : When call_out_firewall returns FW_QUEUE,
38 * silently drop skb instead of failing with -EPERM.
39 * Detlev Wengorz : Copy protocol for fragments.
40 * Hirokazu Takahashi: HW checksumming for outgoing UDP
41 * datagrams.
42 * Hirokazu Takahashi: sendfile() on UDP works now.
43 */
44
45#include <asm/uaccess.h>
46#include <linux/module.h>
47#include <linux/types.h>
48#include <linux/kernel.h>
49#include <linux/mm.h>
50#include <linux/string.h>
51#include <linux/errno.h>
52#include <linux/highmem.h>
53#include <linux/slab.h>
54
55#include <linux/socket.h>
56#include <linux/sockios.h>
57#include <linux/in.h>
58#include <linux/inet.h>
59#include <linux/netdevice.h>
60#include <linux/etherdevice.h>
61#include <linux/proc_fs.h>
62#include <linux/stat.h>
63#include <linux/init.h>
64
65#include <net/snmp.h>
66#include <net/ip.h>
67#include <net/protocol.h>
68#include <net/route.h>
69#include <net/xfrm.h>
70#include <linux/skbuff.h>
71#include <net/sock.h>
72#include <net/arp.h>
73#include <net/icmp.h>
74#include <net/checksum.h>
75#include <net/inetpeer.h>
76#include <linux/igmp.h>
77#include <linux/netfilter_ipv4.h>
78#include <linux/netfilter_bridge.h>
79#include <linux/mroute.h>
80#include <linux/netlink.h>
81#include <linux/tcp.h>
82
83int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84EXPORT_SYMBOL(sysctl_ip_default_ttl);
85
86/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph)
88{
89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
91}
92EXPORT_SYMBOL(ip_send_check);
93
94int __ip_local_out(struct sk_buff *skb)
95{
96 struct iphdr *iph = ip_hdr(skb);
97
98 iph->tot_len = htons(skb->len);
99 ip_send_check(iph);
100 return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
101 skb_dst(skb)->dev, dst_output);
102}
103
104int ip_local_out(struct sk_buff *skb)
105{
106 int err;
107
108 err = __ip_local_out(skb);
109 if (likely(err == 1))
110 err = dst_output(skb);
111
112 return err;
113}
114EXPORT_SYMBOL_GPL(ip_local_out);
115
116/* dev_loopback_xmit for use with netfilter. */
117static int ip_dev_loopback_xmit(struct sk_buff *newskb)
118{
119 skb_reset_mac_header(newskb);
120 __skb_pull(newskb, skb_network_offset(newskb));
121 newskb->pkt_type = PACKET_LOOPBACK;
122 newskb->ip_summed = CHECKSUM_UNNECESSARY;
123 WARN_ON(!skb_dst(newskb));
124 skb_dst_force(newskb);
125 netif_rx_ni(newskb);
126 return 0;
127}
128
129static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130{
131 int ttl = inet->uc_ttl;
132
133 if (ttl < 0)
134 ttl = ip4_dst_hoplimit(dst);
135 return ttl;
136}
137
138/*
139 * Add an ip header to a skbuff and send it out.
140 *
141 */
142int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143 __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
144{
145 struct inet_sock *inet = inet_sk(sk);
146 struct rtable *rt = skb_rtable(skb);
147 struct iphdr *iph;
148
149 /* Build the IP header. */
150 skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
151 skb_reset_network_header(skb);
152 iph = ip_hdr(skb);
153 iph->version = 4;
154 iph->ihl = 5;
155 iph->tos = inet->tos;
156 if (ip_dont_fragment(sk, &rt->dst))
157 iph->frag_off = htons(IP_DF);
158 else
159 iph->frag_off = 0;
160 iph->ttl = ip_select_ttl(inet, &rt->dst);
161 iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
162 iph->saddr = saddr;
163 iph->protocol = sk->sk_protocol;
164 ip_select_ident(iph, &rt->dst, sk);
165
166 if (opt && opt->opt.optlen) {
167 iph->ihl += opt->opt.optlen>>2;
168 ip_options_build(skb, &opt->opt, daddr, rt, 0);
169 }
170
171 skb->priority = sk->sk_priority;
172 skb->mark = sk->sk_mark;
173
174 /* Send it out. */
175 return ip_local_out(skb);
176}
177EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
179static inline int ip_finish_output2(struct sk_buff *skb)
180{
181 struct dst_entry *dst = skb_dst(skb);
182 struct rtable *rt = (struct rtable *)dst;
183 struct net_device *dev = dst->dev;
184 unsigned int hh_len = LL_RESERVED_SPACE(dev);
185 struct neighbour *neigh;
186
187 if (rt->rt_type == RTN_MULTICAST) {
188 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
189 } else if (rt->rt_type == RTN_BROADCAST)
190 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
191
192 /* Be paranoid, rather than too clever. */
193 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
194 struct sk_buff *skb2;
195
196 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
197 if (skb2 == NULL) {
198 kfree_skb(skb);
199 return -ENOMEM;
200 }
201 if (skb->sk)
202 skb_set_owner_w(skb2, skb->sk);
203 kfree_skb(skb);
204 skb = skb2;
205 }
206
207 rcu_read_lock();
208 neigh = dst_get_neighbour_noref(dst);
209 if (neigh) {
210 int res = neigh_output(neigh, skb);
211
212 rcu_read_unlock();
213 return res;
214 }
215 rcu_read_unlock();
216
217 net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
218 __func__);
219 kfree_skb(skb);
220 return -EINVAL;
221}
222
223static inline int ip_skb_dst_mtu(struct sk_buff *skb)
224{
225 struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
226
227 return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
228 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
229}
230
231static int ip_finish_output(struct sk_buff *skb)
232{
233#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
234 /* Policy lookup after SNAT yielded a new policy */
235 if (skb_dst(skb)->xfrm != NULL) {
236 IPCB(skb)->flags |= IPSKB_REROUTED;
237 return dst_output(skb);
238 }
239#endif
240 if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
241 return ip_fragment(skb, ip_finish_output2);
242 else
243 return ip_finish_output2(skb);
244}
245
246int ip_mc_output(struct sk_buff *skb)
247{
248 struct sock *sk = skb->sk;
249 struct rtable *rt = skb_rtable(skb);
250 struct net_device *dev = rt->dst.dev;
251
252 /*
253 * If the indicated interface is up and running, send the packet.
254 */
255 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
256
257 skb->dev = dev;
258 skb->protocol = htons(ETH_P_IP);
259
260 /*
261 * Multicasts are looped back for other local users
262 */
263
264 if (rt->rt_flags&RTCF_MULTICAST) {
265 if (sk_mc_loop(sk)
266#ifdef CONFIG_IP_MROUTE
267 /* Small optimization: do not loopback not local frames,
268 which returned after forwarding; they will be dropped
269 by ip_mr_input in any case.
270 Note, that local frames are looped back to be delivered
271 to local recipients.
272
273 This check is duplicated in ip_mr_input at the moment.
274 */
275 &&
276 ((rt->rt_flags & RTCF_LOCAL) ||
277 !(IPCB(skb)->flags & IPSKB_FORWARDED))
278#endif
279 ) {
280 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
281 if (newskb)
282 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
283 newskb, NULL, newskb->dev,
284 ip_dev_loopback_xmit);
285 }
286
287 /* Multicasts with ttl 0 must not go beyond the host */
288
289 if (ip_hdr(skb)->ttl == 0) {
290 kfree_skb(skb);
291 return 0;
292 }
293 }
294
295 if (rt->rt_flags&RTCF_BROADCAST) {
296 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
297 if (newskb)
298 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
299 NULL, newskb->dev, ip_dev_loopback_xmit);
300 }
301
302 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
303 skb->dev, ip_finish_output,
304 !(IPCB(skb)->flags & IPSKB_REROUTED));
305}
306
307int ip_output(struct sk_buff *skb)
308{
309 struct net_device *dev = skb_dst(skb)->dev;
310
311 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
312
313 skb->dev = dev;
314 skb->protocol = htons(ETH_P_IP);
315
316 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
317 ip_finish_output,
318 !(IPCB(skb)->flags & IPSKB_REROUTED));
319}
320
321/*
322 * copy saddr and daddr, possibly using 64bit load/stores
323 * Equivalent to :
324 * iph->saddr = fl4->saddr;
325 * iph->daddr = fl4->daddr;
326 */
327static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
328{
329 BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
330 offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
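/* The BUILD_BUG_ON above guarantees that saddr and daddr are adjacent
 * in struct flowi4, so the single memcpy below moves both addresses
 * (8 bytes) at once, which the compiler can emit as one 64-bit
 * load/store on 64-bit architectures; iph->saddr and iph->daddr are
 * likewise adjacent at the end of struct iphdr.
 */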
331 memcpy(&iph->saddr, &fl4->saddr,
332 sizeof(fl4->saddr) + sizeof(fl4->daddr));
333}
334
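/* Transmit one packet on a connected socket: reuse the route already
 * attached to the skb (e.g. by SCTP) or the one cached on the socket,
 * build the IP header in front of the transport header and hand the
 * result to ip_local_out(). TCP is the main caller.
 */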
335int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
336{
337 struct sock *sk = skb->sk;
338 struct inet_sock *inet = inet_sk(sk);
339 struct ip_options_rcu *inet_opt;
340 struct flowi4 *fl4;
341 struct rtable *rt;
342 struct iphdr *iph;
343 int res;
344
345 /* Skip all of this if the packet is already routed,
346 * e.g. by something like SCTP.
347 */
348 rcu_read_lock();
349 inet_opt = rcu_dereference(inet->inet_opt);
350 fl4 = &fl->u.ip4;
351 rt = skb_rtable(skb);
352 if (rt != NULL)
353 goto packet_routed;
354
355 /* Make sure we can route this packet. */
356 rt = (struct rtable *)__sk_dst_check(sk, 0);
357 if (rt == NULL) {
358 __be32 daddr;
359
360 /* Use correct destination address if we have options. */
361 daddr = inet->inet_daddr;
362 if (inet_opt && inet_opt->opt.srr)
363 daddr = inet_opt->opt.faddr;
364
365 /* If this fails, the retransmit mechanism of the transport layer
366 * will keep trying until a route appears or the connection
367 * times itself out.
368 */
369 rt = ip_route_output_ports(sock_net(sk), fl4, sk,
370 daddr, inet->inet_saddr,
371 inet->inet_dport,
372 inet->inet_sport,
373 sk->sk_protocol,
374 RT_CONN_FLAGS(sk),
375 sk->sk_bound_dev_if);
376 if (IS_ERR(rt))
377 goto no_route;
378 sk_setup_caps(sk, &rt->dst);
379 }
380 skb_dst_set_noref(skb, &rt->dst);
381
382packet_routed:
383 if (inet_opt && inet_opt->opt.is_strictroute && fl4->daddr != rt->rt_gateway)
384 goto no_route;
385
386 /* OK, we know where to send it, allocate and build IP header. */
387 skb_push(skb, sizeof(struct iphdr) + (inet_opt ? inet_opt->opt.optlen : 0));
388 skb_reset_network_header(skb);
389 iph = ip_hdr(skb);
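/* Build the first 16 bits of the IP header in a single store:
 * version 4 and a 5-word header length in the top byte (0x45) and
 * the TOS value in the low byte. The header length is bumped below
 * if IP options are present.
 */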
390 *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
391 if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
392 iph->frag_off = htons(IP_DF);
393 else
394 iph->frag_off = 0;
395 iph->ttl = ip_select_ttl(inet, &rt->dst);
396 iph->protocol = sk->sk_protocol;
397 ip_copy_addrs(iph, fl4);
398
399 /* The transport layer has set skb->h.foo itself. */
400
401 if (inet_opt && inet_opt->opt.optlen) {
402 iph->ihl += inet_opt->opt.optlen >> 2;
403 ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
404 }
405
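/* Reserve one extra IP identification value for every additional
 * segment a GSO packet will later be split into, so each emitted
 * segment can carry a distinct ID.
 */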
406 ip_select_ident_more(iph, &rt->dst, sk,
407 (skb_shinfo(skb)->gso_segs ?: 1) - 1);
408
409 skb->priority = sk->sk_priority;
410 skb->mark = sk->sk_mark;
411
412 res = ip_local_out(skb);
413 rcu_read_unlock();
414 return res;
415
416no_route:
417 rcu_read_unlock();
418 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
419 kfree_skb(skb);
420 return -EHOSTUNREACH;
421}
422EXPORT_SYMBOL(ip_queue_xmit);
423
424
425static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
426{
427 to->pkt_type = from->pkt_type;
428 to->priority = from->priority;
429 to->protocol = from->protocol;
430 skb_dst_drop(to);
431 skb_dst_copy(to, from);
432 to->dev = from->dev;
433 to->mark = from->mark;
434
435 /* Copy the flags to each fragment. */
436 IPCB(to)->flags = IPCB(from)->flags;
437
438#ifdef CONFIG_NET_SCHED
439 to->tc_index = from->tc_index;
440#endif
441 nf_copy(to, from);
442#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
443 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
444 to->nf_trace = from->nf_trace;
445#endif
446#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
447 to->ipvs_property = from->ipvs_property;
448#endif
449 skb_copy_secmark(to, from);
450}
451
452/*
453 * This IP datagram is too large to be sent in one piece. Break it up into
454 * smaller pieces (each of size equal to the IP header plus
455 * a block of the data of the original IP datagram) that will still fit in a
456 * single device frame, and queue each such frame for sending.
457 */
458
459int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
460{
461 struct iphdr *iph;
462 int ptr;
463 struct net_device *dev;
464 struct sk_buff *skb2;
465 unsigned int mtu, hlen, left, len, ll_rs;
466 int offset;
467 __be16 not_last_frag;
468 struct rtable *rt = skb_rtable(skb);
469 int err = 0;
470
471 dev = rt->dst.dev;
472
473 /*
474 * Point into the IP datagram header.
475 */
476
477 iph = ip_hdr(skb);
478
479 if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
480 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
481 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
482 htonl(ip_skb_dst_mtu(skb)));
483 kfree_skb(skb);
484 return -EMSGSIZE;
485 }
486
487 /*
488 * Setup starting values.
489 */
490
491 hlen = iph->ihl * 4;
492 mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
493#ifdef CONFIG_BRIDGE_NETFILTER
494 if (skb->nf_bridge)
495 mtu -= nf_bridge_mtu_reduction(skb);
496#endif
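/* Worked example (illustrative): a 4000-byte datagram with a 20-byte
 * header leaving on a 1500-byte Ethernet link gives mtu = 1480 bytes
 * of data space here; the slow path below would emit fragments
 * carrying 1480, 1480 and 1020 data bytes at offsets 0, 1480 and
 * 2960, every offset a multiple of 8 as the offset field requires.
 */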
497 IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
498
499 /* When a frag_list is given, use it. First, check its validity:
500 * some transformers could create a wrong frag_list or break an
501 * existing one; that is not prohibited. In this case fall back to copying.
502 *
503 * LATER: this step can be merged into the real generation of fragments;
504 * we can switch to copying when we see the first bad fragment.
505 */
506 if (skb_has_frag_list(skb)) {
507 struct sk_buff *frag, *frag2;
508 int first_len = skb_pagelen(skb);
509
510 if (first_len - hlen > mtu ||
511 ((first_len - hlen) & 7) ||
512 ip_is_fragment(iph) ||
513 skb_cloned(skb))
514 goto slow_path;
515
516 skb_walk_frags(skb, frag) {
517 /* Correct geometry. */
518 if (frag->len > mtu ||
519 ((frag->len & 7) && frag->next) ||
520 skb_headroom(frag) < hlen)
521 goto slow_path_clean;
522
523 /* Partially cloned skb? */
524 if (skb_shared(frag))
525 goto slow_path_clean;
526
527 BUG_ON(frag->sk);
528 if (skb->sk) {
529 frag->sk = skb->sk;
530 frag->destructor = sock_wfree;
531 }
532 skb->truesize -= frag->truesize;
533 }
534
535 /* Everything is OK. Generate! */
536
537 err = 0;
538 offset = 0;
539 frag = skb_shinfo(skb)->frag_list;
540 skb_frag_list_init(skb);
541 skb->data_len = first_len - skb_headlen(skb);
542 skb->len = first_len;
543 iph->tot_len = htons(first_len);
544 iph->frag_off = htons(IP_MF);
545 ip_send_check(iph);
546
547 for (;;) {
548 /* Prepare the header of the next fragment
549 * before the previous one goes out. */
550 if (frag) {
551 frag->ip_summed = CHECKSUM_NONE;
552 skb_reset_transport_header(frag);
553 __skb_push(frag, hlen);
554 skb_reset_network_header(frag);
555 memcpy(skb_network_header(frag), iph, hlen);
556 iph = ip_hdr(frag);
557 iph->tot_len = htons(frag->len);
558 ip_copy_metadata(frag, skb);
559 if (offset == 0)
560 ip_options_fragment(frag);
561 offset += skb->len - hlen;
562 iph->frag_off = htons(offset>>3);
563 if (frag->next != NULL)
564 iph->frag_off |= htons(IP_MF);
565 /* Ready, complete checksum */
566 ip_send_check(iph);
567 }
568
569 err = output(skb);
570
571 if (!err)
572 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
573 if (err || !frag)
574 break;
575
576 skb = frag;
577 frag = skb->next;
578 skb->next = NULL;
579 }
580
581 if (err == 0) {
582 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
583 return 0;
584 }
585
586 while (frag) {
587 skb = frag->next;
588 kfree_skb(frag);
589 frag = skb;
590 }
591 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
592 return err;
593
594slow_path_clean:
595 skb_walk_frags(skb, frag2) {
596 if (frag2 == frag)
597 break;
598 frag2->sk = NULL;
599 frag2->destructor = NULL;
600 skb->truesize += frag2->truesize;
601 }
602 }
603
604slow_path:
605 left = skb->len - hlen; /* Space per frame */
606 ptr = hlen; /* Where to start from */
607
608 /* For bridged IP traffic encapsulated inside e.g. a VLAN header,
609 * we need to make room for the encapsulating header.
610 */
611 ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
612
613 /*
614 * Fragment the datagram.
615 */
616
617 offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
618 not_last_frag = iph->frag_off & htons(IP_MF);
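/* The fragment offset field counts 8-byte units, so 'offset' here is a
 * byte offset that is always a multiple of 8 (e.g. a stored value of
 * 185 means byte 1480). not_last_frag remembers whether the datagram
 * being split was itself a middle fragment, in which case even our
 * final piece must keep the MF bit set.
 */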
619
620 /*
621 * Keep copying data until we run out.
622 */
623
624 while (left > 0) {
625 len = left;
626 /* IF: it doesn't fit, use 'mtu' - the data space left */
627 if (len > mtu)
628 len = mtu;
629 /* IF: we are not sending up to and including the end of the packet,
630 then align the start of the next fragment on an eight byte boundary */
631 if (len < left) {
632 len &= ~7;
633 }
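/* Example: if 1477 bytes would fit but more data remains, only 1472
 * are sent now so that the next fragment starts on an 8-byte boundary.
 */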
634 /*
635 * Allocate buffer.
636 */
637
638 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
639 NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
640 err = -ENOMEM;
641 goto fail;
642 }
643
644 /*
645 * Set up data on packet
646 */
647
648 ip_copy_metadata(skb2, skb);
649 skb_reserve(skb2, ll_rs);
650 skb_put(skb2, len + hlen);
651 skb_reset_network_header(skb2);
652 skb2->transport_header = skb2->network_header + hlen;
653
654 /*
655 * Charge the memory for the fragment to any owner
656 * it might possess
657 */
658
659 if (skb->sk)
660 skb_set_owner_w(skb2, skb->sk);
661
662 /*
663 * Copy the packet header into the new buffer.
664 */
665
666 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
667
668 /*
669 * Copy a block of the IP datagram.
670 */
671 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
672 BUG();
673 left -= len;
674
675 /*
676 * Fill in the new header fields.
677 */
678 iph = ip_hdr(skb2);
679 iph->frag_off = htons((offset >> 3));
680
681 /* ANK: dirty, but effective trick. Upgrade options only if
682 * the segment to be fragmented was THE FIRST (otherwise,
683 * options are already fixed) and make it ONCE
684 * on the initial skb, so that all the following fragments
685 * will inherit fixed options.
686 */
687 if (offset == 0)
688 ip_options_fragment(skb);
689
690 /*
691 * Added AC : If we are fragmenting a fragment that's not the
692 * last fragment, then keep the MF bit set on every piece
693 */
694 if (left > 0 || not_last_frag)
695 iph->frag_off |= htons(IP_MF);
696 ptr += len;
697 offset += len;
698
699 /*
700 * Put this fragment into the sending queue.
701 */
702 iph->tot_len = htons(len + hlen);
703
704 ip_send_check(iph);
705
706 err = output(skb2);
707 if (err)
708 goto fail;
709
710 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
711 }
712 kfree_skb(skb);
713 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
714 return err;
715
716fail:
717 kfree_skb(skb);
718 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
719 return err;
720}
721EXPORT_SYMBOL(ip_fragment);
722
723int
724ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
725{
726 struct iovec *iov = from;
727
728 if (skb->ip_summed == CHECKSUM_PARTIAL) {
729 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
730 return -EFAULT;
731 } else {
732 __wsum csum = 0;
733 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
734 return -EFAULT;
735 skb->csum = csum_block_add(skb->csum, csum, odd);
736 }
737 return 0;
738}
739EXPORT_SYMBOL(ip_generic_getfrag);
740
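/* Illustrative sketch, not part of the original file: the simplest
 * possible getfrag() callback for data that already lives in kernel
 * memory. The function name is made up for illustration; real users
 * that feed CHECKSUM_NONE skbs (see ip_reply_glue_bits() further down)
 * must also fold the copied block into skb->csum.
 */
static int example_kernel_getfrag(void *from, char *to, int offset,
				  int len, int odd, struct sk_buff *skb)
{
	/* 'from' points at a kernel buffer handed to ip_append_data(). */
	memcpy(to, (char *)from + offset, len);
	return 0;
}
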
741static inline __wsum
742csum_page(struct page *page, int offset, int copy)
743{
744 char *kaddr;
745 __wsum csum;
746 kaddr = kmap(page);
747 csum = csum_partial(kaddr + offset, copy, 0);
748 kunmap(page);
749 return csum;
750}
751
752static inline int ip_ufo_append_data(struct sock *sk,
753 struct sk_buff_head *queue,
754 int getfrag(void *from, char *to, int offset, int len,
755 int odd, struct sk_buff *skb),
756 void *from, int length, int hh_len, int fragheaderlen,
757 int transhdrlen, int maxfraglen, unsigned int flags)
758{
759 struct sk_buff *skb;
760 int err;
761
762 /* The network device supports UDP fragmentation offload,
763 * so create one single skb containing the complete
764 * UDP datagram.
765 */
766 if ((skb = skb_peek_tail(queue)) == NULL) {
767 skb = sock_alloc_send_skb(sk,
768 hh_len + fragheaderlen + transhdrlen + 20,
769 (flags & MSG_DONTWAIT), &err);
770
771 if (skb == NULL)
772 return err;
773
774 /* reserve space for Hardware header */
775 skb_reserve(skb, hh_len);
776
777 /* create space for UDP/IP header */
778 skb_put(skb, fragheaderlen + transhdrlen);
779
780 /* initialize network header pointer */
781 skb_reset_network_header(skb);
782
783 /* initialize protocol header pointer */
784 skb->transport_header = skb->network_header + fragheaderlen;
785
786 skb->ip_summed = CHECKSUM_PARTIAL;
787 skb->csum = 0;
788
789 /* specify the length of each IP datagram fragment */
790 skb_shinfo(skb)->gso_size = maxfraglen - fragheaderlen;
791 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
792 __skb_queue_tail(queue, skb);
793 }
794
795 return skb_append_datato_frags(sk, skb, getfrag, from,
796 (length - transhdrlen));
797}
798
799static int __ip_append_data(struct sock *sk,
800 struct flowi4 *fl4,
801 struct sk_buff_head *queue,
802 struct inet_cork *cork,
803 int getfrag(void *from, char *to, int offset,
804 int len, int odd, struct sk_buff *skb),
805 void *from, int length, int transhdrlen,
806 unsigned int flags)
807{
808 struct inet_sock *inet = inet_sk(sk);
809 struct sk_buff *skb;
810
811 struct ip_options *opt = cork->opt;
812 int hh_len;
813 int exthdrlen;
814 int mtu;
815 int copy;
816 int err;
817 int offset = 0;
818 unsigned int maxfraglen, fragheaderlen;
819 int csummode = CHECKSUM_NONE;
820 struct rtable *rt = (struct rtable *)cork->dst;
821
822 skb = skb_peek_tail(queue);
823
824 exthdrlen = !skb ? rt->dst.header_len : 0;
825 mtu = cork->fragsize;
826
827 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
828
829 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
830 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
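/* Worked example (illustrative): with a 1500-byte mtu and no IP
 * options, fragheaderlen = 20 and maxfraglen = ((1500 - 20) & ~7) + 20
 * = 1500, i.e. each non-final fragment carries 1480 data bytes, a
 * multiple of 8 as required by the fragment offset encoding.
 */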
831
832 if (cork->length + length > 0xFFFF - fragheaderlen) {
833 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
834 mtu-exthdrlen);
835 return -EMSGSIZE;
836 }
837
838 /*
839 * transhdrlen > 0 means that this is the first fragment and we wish
840 * it not to be fragmented later on.
841 */
842 if (transhdrlen &&
843 length + fragheaderlen <= mtu &&
844 rt->dst.dev->features & NETIF_F_V4_CSUM &&
845 !exthdrlen)
846 csummode = CHECKSUM_PARTIAL;
847
848 cork->length += length;
849 if (((length > mtu) || (skb && skb_is_gso(skb))) &&
850 (sk->sk_protocol == IPPROTO_UDP) &&
851 (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len) {
852 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
853 hh_len, fragheaderlen, transhdrlen,
854 maxfraglen, flags);
855 if (err)
856 goto error;
857 return 0;
858 }
859
860 /* So, what's going on in the loop below?
861 *
862 * We use the calculated fragment length to generate a chain of skbs;
863 * each segment is an IP fragment, ready to be sent to the network once
864 * the appropriate IP header has been added.
865 */
866
867 if (!skb)
868 goto alloc_new_skb;
869
870 while (length > 0) {
871 /* Check if the remaining data fits into current packet. */
872 copy = mtu - skb->len;
873 if (copy < length)
874 copy = maxfraglen - skb->len;
875 if (copy <= 0) {
876 char *data;
877 unsigned int datalen;
878 unsigned int fraglen;
879 unsigned int fraggap;
880 unsigned int alloclen;
881 struct sk_buff *skb_prev;
882alloc_new_skb:
883 skb_prev = skb;
884 if (skb_prev)
885 fraggap = skb_prev->len - maxfraglen;
886 else
887 fraggap = 0;
888
889 /*
890 * If remaining data exceeds the mtu,
891 * we know we need more fragment(s).
892 */
893 datalen = length + fraggap;
894 if (datalen > mtu - fragheaderlen)
895 datalen = maxfraglen - fragheaderlen;
896 fraglen = datalen + fragheaderlen;
897
898 if ((flags & MSG_MORE) &&
899 !(rt->dst.dev->features&NETIF_F_SG))
900 alloclen = mtu;
901 else
902 alloclen = fraglen;
903
904 alloclen += exthdrlen;
905
906 /* The last fragment gets additional space at tail.
907 * Note that with MSG_MORE we overallocate on fragments,
908 * because we have no idea which fragment will be
909 * the last.
910 */
911 if (datalen == length + fraggap)
912 alloclen += rt->dst.trailer_len;
913
914 if (transhdrlen) {
915 skb = sock_alloc_send_skb(sk,
916 alloclen + hh_len + 15,
917 (flags & MSG_DONTWAIT), &err);
918 } else {
919 skb = NULL;
920 if (atomic_read(&sk->sk_wmem_alloc) <=
921 2 * sk->sk_sndbuf)
922 skb = sock_wmalloc(sk,
923 alloclen + hh_len + 15, 1,
924 sk->sk_allocation);
925 if (unlikely(skb == NULL))
926 err = -ENOBUFS;
927 else
928 /* only the initial fragment is
929 time stamped */
930 cork->tx_flags = 0;
931 }
932 if (skb == NULL)
933 goto error;
934
935 /*
936 * Fill in the control structures
937 */
938 skb->ip_summed = csummode;
939 skb->csum = 0;
940 skb_reserve(skb, hh_len);
941 skb_shinfo(skb)->tx_flags = cork->tx_flags;
942
943 /*
944 * Find where to start putting bytes.
945 */
946 data = skb_put(skb, fraglen + exthdrlen);
947 skb_set_network_header(skb, exthdrlen);
948 skb->transport_header = (skb->network_header +
949 fragheaderlen);
950 data += fragheaderlen + exthdrlen;
951
952 if (fraggap) {
953 skb->csum = skb_copy_and_csum_bits(
954 skb_prev, maxfraglen,
955 data + transhdrlen, fraggap, 0);
956 skb_prev->csum = csum_sub(skb_prev->csum,
957 skb->csum);
958 data += fraggap;
959 pskb_trim_unique(skb_prev, maxfraglen);
960 }
961
962 copy = datalen - transhdrlen - fraggap;
963 if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
964 err = -EFAULT;
965 kfree_skb(skb);
966 goto error;
967 }
968
969 offset += copy;
970 length -= datalen - fraggap;
971 transhdrlen = 0;
972 exthdrlen = 0;
973 csummode = CHECKSUM_NONE;
974
975 /*
976 * Put the packet on the pending queue.
977 */
978 __skb_queue_tail(queue, skb);
979 continue;
980 }
981
982 if (copy > length)
983 copy = length;
984
985 if (!(rt->dst.dev->features&NETIF_F_SG)) {
986 unsigned int off;
987
988 off = skb->len;
989 if (getfrag(from, skb_put(skb, copy),
990 offset, copy, off, skb) < 0) {
991 __skb_trim(skb, off);
992 err = -EFAULT;
993 goto error;
994 }
995 } else {
996 int i = skb_shinfo(skb)->nr_frags;
997 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
998 struct page *page = cork->page;
999 int off = cork->off;
1000 unsigned int left;
1001
1002 if (page && (left = PAGE_SIZE - off) > 0) {
1003 if (copy >= left)
1004 copy = left;
1005 if (page != skb_frag_page(frag)) {
1006 if (i == MAX_SKB_FRAGS) {
1007 err = -EMSGSIZE;
1008 goto error;
1009 }
1010 skb_fill_page_desc(skb, i, page, off, 0);
1011 skb_frag_ref(skb, i);
1012 frag = &skb_shinfo(skb)->frags[i];
1013 }
1014 } else if (i < MAX_SKB_FRAGS) {
1015 if (copy > PAGE_SIZE)
1016 copy = PAGE_SIZE;
1017 page = alloc_pages(sk->sk_allocation, 0);
1018 if (page == NULL) {
1019 err = -ENOMEM;
1020 goto error;
1021 }
1022 cork->page = page;
1023 cork->off = 0;
1024
1025 skb_fill_page_desc(skb, i, page, 0, 0);
1026 frag = &skb_shinfo(skb)->frags[i];
1027 } else {
1028 err = -EMSGSIZE;
1029 goto error;
1030 }
1031 if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
1032 offset, copy, skb->len, skb) < 0) {
1033 err = -EFAULT;
1034 goto error;
1035 }
1036 cork->off += copy;
1037 skb_frag_size_add(frag, copy);
1038 skb->len += copy;
1039 skb->data_len += copy;
1040 skb->truesize += copy;
1041 atomic_add(copy, &sk->sk_wmem_alloc);
1042 }
1043 offset += copy;
1044 length -= copy;
1045 }
1046
1047 return 0;
1048
1049error:
1050 cork->length -= length;
1051 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1052 return err;
1053}
1054
1055static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1056 struct ipcm_cookie *ipc, struct rtable **rtp)
1057{
1058 struct inet_sock *inet = inet_sk(sk);
1059 struct ip_options_rcu *opt;
1060 struct rtable *rt;
1061
1062 /*
1063 * setup for corking.
1064 */
1065 opt = ipc->opt;
1066 if (opt) {
1067 if (cork->opt == NULL) {
1068 cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1069 sk->sk_allocation);
1070 if (unlikely(cork->opt == NULL))
1071 return -ENOBUFS;
1072 }
1073 memcpy(cork->opt, &opt->opt, sizeof(struct ip_options) + opt->opt.optlen);
1074 cork->flags |= IPCORK_OPT;
1075 cork->addr = ipc->addr;
1076 }
1077 rt = *rtp;
1078 if (unlikely(!rt))
1079 return -EFAULT;
1080 /*
1081 * We steal the reference to this route; the caller must not release it
1082 */
1083 *rtp = NULL;
1084 cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1085 rt->dst.dev->mtu : dst_mtu(&rt->dst);
1086 cork->dst = &rt->dst;
1087 cork->length = 0;
1088 cork->tx_flags = ipc->tx_flags;
1089 cork->page = NULL;
1090 cork->off = 0;
1091
1092 return 0;
1093}
1094
1095/*
1096 * ip_append_data() and ip_append_page() can make one large IP datagram
1097 * from many pieces of data. Each piece will be held on the socket
1098 * until ip_push_pending_frames() is called. Each piece can be a page
1099 * or non-page data.
1100 *
1101 * Besides UDP, other transport protocols - e.g. raw sockets - can
1102 * potentially use this interface.
1103 *
1104 * LATER: the length must be adjusted for tail padding when required.
1105 */
1106int ip_append_data(struct sock *sk, struct flowi4 *fl4,
1107 int getfrag(void *from, char *to, int offset, int len,
1108 int odd, struct sk_buff *skb),
1109 void *from, int length, int transhdrlen,
1110 struct ipcm_cookie *ipc, struct rtable **rtp,
1111 unsigned int flags)
1112{
1113 struct inet_sock *inet = inet_sk(sk);
1114 int err;
1115
1116 if (flags&MSG_PROBE)
1117 return 0;
1118
1119 if (skb_queue_empty(&sk->sk_write_queue)) {
1120 err = ip_setup_cork(sk, &inet->cork.base, ipc, rtp);
1121 if (err)
1122 return err;
1123 } else {
1124 transhdrlen = 0;
1125 }
1126
1127 return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
1128 from, length, transhdrlen, flags);
1129}
1130
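/* Illustrative sketch, not part of the original file: the usual
 * corking pattern as seen from a transport protocol. The function
 * name is made up, and 'fl4', 'rt' and 'ipc' are assumed to have been
 * prepared by the caller (flow set up, route looked up, cookie filled
 * in) the way udp_sendmsg() does; route refcounting on the error
 * paths is glossed over.
 */
static int example_send_corked(struct sock *sk, struct flowi4 *fl4,
			       struct rtable *rt, struct ipcm_cookie *ipc,
			       struct iovec *iov, int len)
{
	int err;

	lock_sock(sk);
	err = ip_append_data(sk, fl4, ip_generic_getfrag, iov, len, 0,
			     ipc, &rt, MSG_DONTWAIT);
	if (err)
		ip_flush_pending_frames(sk);
	else
		err = ip_push_pending_frames(sk, fl4);
	release_sock(sk);

	return err;
}
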
1131ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
1132 int offset, size_t size, int flags)
1133{
1134 struct inet_sock *inet = inet_sk(sk);
1135 struct sk_buff *skb;
1136 struct rtable *rt;
1137 struct ip_options *opt = NULL;
1138 struct inet_cork *cork;
1139 int hh_len;
1140 int mtu;
1141 int len;
1142 int err;
1143 unsigned int maxfraglen, fragheaderlen, fraggap;
1144
1145 if (inet->hdrincl)
1146 return -EPERM;
1147
1148 if (flags&MSG_PROBE)
1149 return 0;
1150
1151 if (skb_queue_empty(&sk->sk_write_queue))
1152 return -EINVAL;
1153
1154 cork = &inet->cork.base;
1155 rt = (struct rtable *)cork->dst;
1156 if (cork->flags & IPCORK_OPT)
1157 opt = cork->opt;
1158
1159 if (!(rt->dst.dev->features&NETIF_F_SG))
1160 return -EOPNOTSUPP;
1161
1162 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1163 mtu = cork->fragsize;
1164
1165 fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1166 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1167
1168 if (cork->length + size > 0xFFFF - fragheaderlen) {
1169 ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, mtu);
1170 return -EMSGSIZE;
1171 }
1172
1173 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1174 return -EINVAL;
1175
1176 cork->length += size;
1177 if ((size + skb->len > mtu) &&
1178 (sk->sk_protocol == IPPROTO_UDP) &&
1179 (rt->dst.dev->features & NETIF_F_UFO)) {
1180 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1181 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1182 }
1183
1184
1185 while (size > 0) {
1186 int i;
1187
1188 if (skb_is_gso(skb))
1189 len = size;
1190 else {
1191
1192 /* Check if the remaining data fits into current packet. */
1193 len = mtu - skb->len;
1194 if (len < size)
1195 len = maxfraglen - skb->len;
1196 }
1197 if (len <= 0) {
1198 struct sk_buff *skb_prev;
1199 int alloclen;
1200
1201 skb_prev = skb;
1202 fraggap = skb_prev->len - maxfraglen;
1203
1204 alloclen = fragheaderlen + hh_len + fraggap + 15;
1205 skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1206 if (unlikely(!skb)) {
1207 err = -ENOBUFS;
1208 goto error;
1209 }
1210
1211 /*
1212 * Fill in the control structures
1213 */
1214 skb->ip_summed = CHECKSUM_NONE;
1215 skb->csum = 0;
1216 skb_reserve(skb, hh_len);
1217
1218 /*
1219 * Find where to start putting bytes.
1220 */
1221 skb_put(skb, fragheaderlen + fraggap);
1222 skb_reset_network_header(skb);
1223 skb->transport_header = (skb->network_header +
1224 fragheaderlen);
1225 if (fraggap) {
1226 skb->csum = skb_copy_and_csum_bits(skb_prev,
1227 maxfraglen,
1228 skb_transport_header(skb),
1229 fraggap, 0);
1230 skb_prev->csum = csum_sub(skb_prev->csum,
1231 skb->csum);
1232 pskb_trim_unique(skb_prev, maxfraglen);
1233 }
1234
1235 /*
1236 * Put the packet on the pending queue.
1237 */
1238 __skb_queue_tail(&sk->sk_write_queue, skb);
1239 continue;
1240 }
1241
1242 i = skb_shinfo(skb)->nr_frags;
1243 if (len > size)
1244 len = size;
1245 if (skb_can_coalesce(skb, i, page, offset)) {
1246 skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len);
1247 } else if (i < MAX_SKB_FRAGS) {
1248 get_page(page);
1249 skb_fill_page_desc(skb, i, page, offset, len);
1250 } else {
1251 err = -EMSGSIZE;
1252 goto error;
1253 }
1254
1255 if (skb->ip_summed == CHECKSUM_NONE) {
1256 __wsum csum;
1257 csum = csum_page(page, offset, len);
1258 skb->csum = csum_block_add(skb->csum, csum, skb->len);
1259 }
1260
1261 skb->len += len;
1262 skb->data_len += len;
1263 skb->truesize += len;
1264 atomic_add(len, &sk->sk_wmem_alloc);
1265 offset += len;
1266 size -= len;
1267 }
1268 return 0;
1269
1270error:
1271 cork->length -= size;
1272 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1273 return err;
1274}
1275
1276static void ip_cork_release(struct inet_cork *cork)
1277{
1278 cork->flags &= ~IPCORK_OPT;
1279 kfree(cork->opt);
1280 cork->opt = NULL;
1281 dst_release(cork->dst);
1282 cork->dst = NULL;
1283}
1284
1285/*
1286 * Combine all pending IP fragments on the socket into one IP datagram
1287 * and push it out.
1288 */
1289struct sk_buff *__ip_make_skb(struct sock *sk,
1290 struct flowi4 *fl4,
1291 struct sk_buff_head *queue,
1292 struct inet_cork *cork)
1293{
1294 struct sk_buff *skb, *tmp_skb;
1295 struct sk_buff **tail_skb;
1296 struct inet_sock *inet = inet_sk(sk);
1297 struct net *net = sock_net(sk);
1298 struct ip_options *opt = NULL;
1299 struct rtable *rt = (struct rtable *)cork->dst;
1300 struct iphdr *iph;
1301 __be16 df = 0;
1302 __u8 ttl;
1303
1304 if ((skb = __skb_dequeue(queue)) == NULL)
1305 goto out;
1306 tail_skb = &(skb_shinfo(skb)->frag_list);
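/* Every further skb still on the queue is chained onto the first
 * skb's frag_list below, so the complete datagram reaches netfilter
 * and, if need be, ip_fragment() as a single skb.
 */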
1307
1308 /* move skb->data up to the ip header, past any extension header */
1309 if (skb->data < skb_network_header(skb))
1310 __skb_pull(skb, skb_network_offset(skb));
1311 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1312 __skb_pull(tmp_skb, skb_network_header_len(skb));
1313 *tail_skb = tmp_skb;
1314 tail_skb = &(tmp_skb->next);
1315 skb->len += tmp_skb->len;
1316 skb->data_len += tmp_skb->len;
1317 skb->truesize += tmp_skb->truesize;
1318 tmp_skb->destructor = NULL;
1319 tmp_skb->sk = NULL;
1320 }
1321
1322 /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1323 * the frame generated here to be fragmented. No matter how transforms
1324 * change the size of the packet, it will go out.
1325 */
1326 if (inet->pmtudisc < IP_PMTUDISC_DO)
1327 skb->local_df = 1;
1328
1329 /* The DF bit is set when we want to see DF on outgoing frames.
1330 * If local_df is set too, we still allow this frame to be fragmented
1331 * locally. */
1332 if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1333 (skb->len <= dst_mtu(&rt->dst) &&
1334 ip_dont_fragment(sk, &rt->dst)))
1335 df = htons(IP_DF);
1336
1337 if (cork->flags & IPCORK_OPT)
1338 opt = cork->opt;
1339
1340 if (rt->rt_type == RTN_MULTICAST)
1341 ttl = inet->mc_ttl;
1342 else
1343 ttl = ip_select_ttl(inet, &rt->dst);
1344
1345 iph = (struct iphdr *)skb->data;
1346 iph->version = 4;
1347 iph->ihl = 5;
1348 iph->tos = inet->tos;
1349 iph->frag_off = df;
1350 ip_select_ident(iph, &rt->dst, sk);
1351 iph->ttl = ttl;
1352 iph->protocol = sk->sk_protocol;
1353 ip_copy_addrs(iph, fl4);
1354
1355 if (opt) {
1356 iph->ihl += opt->optlen>>2;
1357 ip_options_build(skb, opt, cork->addr, rt, 0);
1358 }
1359
1360 skb->priority = sk->sk_priority;
1361 skb->mark = sk->sk_mark;
1362 /*
1363 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1364 * on dst refcount
1365 */
1366 cork->dst = NULL;
1367 skb_dst_set(skb, &rt->dst);
1368
1369 if (iph->protocol == IPPROTO_ICMP)
1370 icmp_out_count(net, ((struct icmphdr *)
1371 skb_transport_header(skb))->type);
1372
1373 ip_cork_release(cork);
1374out:
1375 return skb;
1376}
1377
1378int ip_send_skb(struct sk_buff *skb)
1379{
1380 struct net *net = sock_net(skb->sk);
1381 int err;
1382
1383 err = ip_local_out(skb);
1384 if (err) {
1385 if (err > 0)
1386 err = net_xmit_errno(err);
1387 if (err)
1388 IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1389 }
1390
1391 return err;
1392}
1393
1394int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1395{
1396 struct sk_buff *skb;
1397
1398 skb = ip_finish_skb(sk, fl4);
1399 if (!skb)
1400 return 0;
1401
1402 /* Netfilter gets the whole, not yet fragmented skb. */
1403 return ip_send_skb(skb);
1404}
1405
1406/*
1407 * Throw away all pending data on the socket.
1408 */
1409static void __ip_flush_pending_frames(struct sock *sk,
1410 struct sk_buff_head *queue,
1411 struct inet_cork *cork)
1412{
1413 struct sk_buff *skb;
1414
1415 while ((skb = __skb_dequeue_tail(queue)) != NULL)
1416 kfree_skb(skb);
1417
1418 ip_cork_release(cork);
1419}
1420
1421void ip_flush_pending_frames(struct sock *sk)
1422{
1423 __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork.base);
1424}
1425
1426struct sk_buff *ip_make_skb(struct sock *sk,
1427 struct flowi4 *fl4,
1428 int getfrag(void *from, char *to, int offset,
1429 int len, int odd, struct sk_buff *skb),
1430 void *from, int length, int transhdrlen,
1431 struct ipcm_cookie *ipc, struct rtable **rtp,
1432 unsigned int flags)
1433{
1434 struct inet_cork cork;
1435 struct sk_buff_head queue;
1436 int err;
1437
1438 if (flags & MSG_PROBE)
1439 return NULL;
1440
1441 __skb_queue_head_init(&queue);
1442
1443 cork.flags = 0;
1444 cork.addr = 0;
1445 cork.opt = NULL;
1446 err = ip_setup_cork(sk, &cork, ipc, rtp);
1447 if (err)
1448 return ERR_PTR(err);
1449
1450 err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
1451 from, length, transhdrlen, flags);
1452 if (err) {
1453 __ip_flush_pending_frames(sk, &queue, &cork);
1454 return ERR_PTR(err);
1455 }
1456
1457 return __ip_make_skb(sk, fl4, &queue, &cork);
1458}
1459
1460/*
1461 * Fetch data from kernel space and fill in checksum if needed.
1462 */
1463static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1464 int len, int odd, struct sk_buff *skb)
1465{
1466 __wsum csum;
1467
1468 csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1469 skb->csum = csum_block_add(skb->csum, csum, odd);
1470 return 0;
1471}
1472
1473/*
1474 * Generic function to send a packet as a reply to another packet.
1475 * So far it is used to send TCP resets; ICMP should use this function too.
1476 *
1477 * Should run single-threaded per socket because it uses the sock
1478 * structure to pass arguments.
1479 */
1480void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1481 const struct ip_reply_arg *arg, unsigned int len)
1482{
1483 struct inet_sock *inet = inet_sk(sk);
1484 struct ip_options_data replyopts;
1485 struct ipcm_cookie ipc;
1486 struct flowi4 fl4;
1487 struct rtable *rt = skb_rtable(skb);
1488
1489 if (ip_options_echo(&replyopts.opt.opt, skb))
1490 return;
1491
1492 ipc.addr = daddr;
1493 ipc.opt = NULL;
1494 ipc.tx_flags = 0;
1495
1496 if (replyopts.opt.opt.optlen) {
1497 ipc.opt = &replyopts.opt;
1498
1499 if (replyopts.opt.opt.srr)
1500 daddr = replyopts.opt.opt.faddr;
1501 }
1502
1503 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1504 RT_TOS(arg->tos),
1505 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1506 ip_reply_arg_flowi_flags(arg),
1507 daddr, rt->rt_spec_dst,
1508 tcp_hdr(skb)->source, tcp_hdr(skb)->dest);
1509 security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
1510 rt = ip_route_output_key(sock_net(sk), &fl4);
1511 if (IS_ERR(rt))
1512 return;
1513
1514 /* And let IP do all the hard work.
1515
1516 This chunk is not reentrant, hence the spinlock.
1517 Note that it relies on the fact that this function is called
1518 with BH locally disabled and that sk cannot already be spinlocked.
1519 */
1520 bh_lock_sock(sk);
1521 inet->tos = arg->tos;
1522 sk->sk_priority = skb->priority;
1523 sk->sk_protocol = ip_hdr(skb)->protocol;
1524 sk->sk_bound_dev_if = arg->bound_dev_if;
1525 ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1526 &ipc, &rt, MSG_DONTWAIT);
1527 if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1528 if (arg->csumoffset >= 0)
1529 *((__sum16 *)skb_transport_header(skb) +
1530 arg->csumoffset) = csum_fold(csum_add(skb->csum,
1531 arg->csum));
1532 skb->ip_summed = CHECKSUM_NONE;
1533 ip_push_pending_frames(sk, &fl4);
1534 }
1535
1536 bh_unlock_sock(sk);
1537
1538 ip_rt_put(rt);
1539}
1540
1541void __init ip_init(void)
1542{
1543 ip_rt_init();
1544 inet_initpeers();
1545
1546#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1547 igmp_mc_proc_init();
1548#endif
1549}