1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60
61#include <net/net_namespace.h>
62#include <net/icmp.h>
63#include <net/inet_hashtables.h>
64#include <net/tcp.h>
65#include <net/transp_v6.h>
66#include <net/ipv6.h>
67#include <net/inet_common.h>
68#include <net/timewait_sock.h>
69#include <net/xfrm.h>
70#include <net/secure_seq.h>
71#include <net/busy_poll.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/inetdevice.h>
79#include <linux/btf_ids.h>
80
81#include <crypto/hash.h>
82#include <linux/scatterlist.h>
83
84#include <trace/events/tcp.h>
85
86#ifdef CONFIG_TCP_MD5SIG
87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89#endif
90
91struct inet_hashinfo tcp_hashinfo;
92EXPORT_SYMBOL(tcp_hashinfo);
93
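/* Derive the initial sequence number for this connection from the packet's
 * address/port 4-tuple via a keyed hash (secure_tcp_seq()), so ISNs are
 * hard to predict off-path.
 */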
94static u32 tcp_v4_init_seq(const struct sk_buff *skb)
95{
96 return secure_tcp_seq(ip_hdr(skb)->daddr,
97 ip_hdr(skb)->saddr,
98 tcp_hdr(skb)->dest,
99 tcp_hdr(skb)->source);
100}
101
102static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
103{
104 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
105}
106
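/* Called from the connect() path when the selected 4-tuple collides with a
 * TIME-WAIT socket. Returns 1 if it is safe to recycle that TIME-WAIT entry
 * for the new connection (PAWS/timestamp based check below).
 */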
107int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
108{
109 const struct inet_timewait_sock *tw = inet_twsk(sktw);
110 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
111 struct tcp_sock *tp = tcp_sk(sk);
112 int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
113
114 if (reuse == 2) {
115 /* Still does not detect *everything* that goes through
116 * lo, since we require a loopback src or dst address
117 * or direct binding to 'lo' interface.
118 */
119 bool loopback = false;
120 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
121 loopback = true;
122#if IS_ENABLED(CONFIG_IPV6)
123 if (tw->tw_family == AF_INET6) {
124 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
125 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
126 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
128 loopback = true;
129 } else
130#endif
131 {
132 if (ipv4_is_loopback(tw->tw_daddr) ||
133 ipv4_is_loopback(tw->tw_rcv_saddr))
134 loopback = true;
135 }
136 if (!loopback)
137 reuse = 0;
138 }
139
	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.

	   Actually, the idea is close to VJ's, only the timestamp cache is
	   held per port pair rather than per host, and the TW bucket is used
	   as the state holder.

	   If the TW bucket has already been destroyed we fall back to VJ's
	   scheme and use the initial timestamp retrieved from the peer table.
	 */
151 if (tcptw->tw_ts_recent_stamp &&
152 (!twp || (reuse && time_after32(ktime_get_seconds(),
153 tcptw->tw_ts_recent_stamp)))) {
154 /* In case of repair and re-using TIME-WAIT sockets we still
155 * want to be sure that it is safe as above but honor the
156 * sequence numbers and time stamps set as part of the repair
157 * process.
158 *
159 * Without this check re-using a TIME-WAIT socket with TCP
160 * repair would accumulate a -1 on the repair assigned
161 * sequence number. The first time it is reused the sequence
162 * is -1, the second time -2, etc. This fixes that issue
163 * without appearing to create any others.
164 */
165 if (likely(!tp->repair)) {
166 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
167
168 if (!seq)
169 seq = 1;
170 WRITE_ONCE(tp->write_seq, seq);
171 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
172 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
173 }
174 sock_hold(sktw);
175 return 1;
176 }
177
178 return 0;
179}
180EXPORT_SYMBOL_GPL(tcp_twsk_unique);
181
182static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
183 int addr_len)
184{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent the BPF program called below from accessing bytes that are
	 * outside the bound specified by the user in addr_len.
	 */
189 if (addr_len < sizeof(struct sockaddr_in))
190 return -EINVAL;
191
192 sock_owned_by_me(sk);
193
194 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
195}
196
197/* This will initiate an outgoing connection. */
198int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
199{
200 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
201 struct inet_sock *inet = inet_sk(sk);
202 struct tcp_sock *tp = tcp_sk(sk);
203 __be16 orig_sport, orig_dport;
204 __be32 daddr, nexthop;
205 struct flowi4 *fl4;
206 struct rtable *rt;
207 int err;
208 struct ip_options_rcu *inet_opt;
209 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
210
211 if (addr_len < sizeof(struct sockaddr_in))
212 return -EINVAL;
213
214 if (usin->sin_family != AF_INET)
215 return -EAFNOSUPPORT;
216
217 nexthop = daddr = usin->sin_addr.s_addr;
218 inet_opt = rcu_dereference_protected(inet->inet_opt,
219 lockdep_sock_is_held(sk));
220 if (inet_opt && inet_opt->opt.srr) {
221 if (!daddr)
222 return -EINVAL;
223 nexthop = inet_opt->opt.faddr;
224 }
225
226 orig_sport = inet->inet_sport;
227 orig_dport = usin->sin_port;
228 fl4 = &inet->cork.fl.u.ip4;
229 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
230 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
231 IPPROTO_TCP,
232 orig_sport, orig_dport, sk);
233 if (IS_ERR(rt)) {
234 err = PTR_ERR(rt);
235 if (err == -ENETUNREACH)
236 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
237 return err;
238 }
239
240 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
241 ip_rt_put(rt);
242 return -ENETUNREACH;
243 }
244
245 if (!inet_opt || !inet_opt->opt.srr)
246 daddr = fl4->daddr;
247
248 if (!inet->inet_saddr)
249 inet->inet_saddr = fl4->saddr;
250 sk_rcv_saddr_set(sk, inet->inet_saddr);
251
252 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
253 /* Reset inherited state */
254 tp->rx_opt.ts_recent = 0;
255 tp->rx_opt.ts_recent_stamp = 0;
256 if (likely(!tp->repair))
257 WRITE_ONCE(tp->write_seq, 0);
258 }
259
260 inet->inet_dport = usin->sin_port;
261 sk_daddr_set(sk, daddr);
262
263 inet_csk(sk)->icsk_ext_hdr_len = 0;
264 if (inet_opt)
265 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
266
267 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
268
	/* Socket identity is still unknown (sport may be zero).
	 * However we set the state to SYN-SENT and, without releasing the
	 * socket lock, select a source port, enter ourselves into the hash
	 * tables and complete initialization after this.
	 */
274 tcp_set_state(sk, TCP_SYN_SENT);
275 err = inet_hash_connect(tcp_death_row, sk);
276 if (err)
277 goto failure;
278
279 sk_set_txhash(sk);
280
281 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
282 inet->inet_sport, inet->inet_dport, sk);
283 if (IS_ERR(rt)) {
284 err = PTR_ERR(rt);
285 rt = NULL;
286 goto failure;
287 }
288 /* OK, now commit destination to socket. */
289 sk->sk_gso_type = SKB_GSO_TCPV4;
290 sk_setup_caps(sk, &rt->dst);
291 rt = NULL;
292
293 if (likely(!tp->repair)) {
294 if (!tp->write_seq)
295 WRITE_ONCE(tp->write_seq,
296 secure_tcp_seq(inet->inet_saddr,
297 inet->inet_daddr,
298 inet->inet_sport,
299 usin->sin_port));
300 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
301 inet->inet_saddr,
302 inet->inet_daddr);
303 }
304
305 inet->inet_id = prandom_u32();
306
307 if (tcp_fastopen_defer_connect(sk, &err))
308 return err;
309 if (err)
310 goto failure;
311
312 err = tcp_connect(sk);
313
314 if (err)
315 goto failure;
316
317 return 0;
318
319failure:
320 /*
321 * This unhashes the socket and releases the local port,
322 * if necessary.
323 */
324 tcp_set_state(sk, TCP_CLOSE);
325 ip_rt_put(rt);
326 sk->sk_route_caps = 0;
327 inet->inet_dport = 0;
328 return err;
329}
330EXPORT_SYMBOL(tcp_v4_connect);
331
/*
 * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
 * It can be called through tcp_release_cb() if the socket was owned by the user
 * at the time tcp_v4_err() was called to handle the ICMP message.
 */
337void tcp_v4_mtu_reduced(struct sock *sk)
338{
339 struct inet_sock *inet = inet_sk(sk);
340 struct dst_entry *dst;
341 u32 mtu;
342
343 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
344 return;
345 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
346 dst = inet_csk_update_pmtu(sk, mtu);
347 if (!dst)
348 return;
349
	/* Something is about to go wrong... Remember the soft error
	 * in case this connection is not able to recover.
	 */
353 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
354 sk->sk_err_soft = EMSGSIZE;
355
356 mtu = dst_mtu(dst);
357
358 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
359 ip_sk_accept_pmtu(sk) &&
360 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
361 tcp_sync_mss(sk, mtu);
362
363 /* Resend the TCP packet because it's
364 * clear that the old packet has been
365 * dropped. This is the new "fast" path mtu
366 * discovery.
367 */
368 tcp_simple_retransmit(sk);
369 } /* else let the usual retransmit timer handle it */
370}
371EXPORT_SYMBOL(tcp_v4_mtu_reduced);
372
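/* Apply an ICMP redirect to the socket's cached route, if one exists. */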
373static void do_redirect(struct sk_buff *skb, struct sock *sk)
374{
375 struct dst_entry *dst = __sk_dst_check(sk, 0);
376
377 if (dst)
378 dst->ops->redirect(dst, sk, skb);
379}
380
381
382/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
383void tcp_req_err(struct sock *sk, u32 seq, bool abort)
384{
385 struct request_sock *req = inet_reqsk(sk);
386 struct net *net = sock_net(sk);
387
388 /* ICMPs are not backlogged, hence we cannot get
389 * an established socket here.
390 */
391 if (seq != tcp_rsk(req)->snt_isn) {
392 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
393 } else if (abort) {
394 /*
395 * Still in SYN_RECV, just remove it silently.
396 * There is no good way to pass the error to the newly
397 * created socket, and POSIX does not want network
398 * errors returned from accept().
399 */
400 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
401 tcp_listendrop(req->rsk_listener);
402 }
403 reqsk_put(req);
404}
405EXPORT_SYMBOL(tcp_req_err);
406
407/* TCP-LD (RFC 6069) logic */
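/* Undo one step of exponential RTO backoff when a late ICMP unreachable
 * suggests the earlier retransmissions were lost to a connectivity
 * disruption, then rearm (or immediately fire) the retransmit timer.
 */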
408void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
409{
410 struct inet_connection_sock *icsk = inet_csk(sk);
411 struct tcp_sock *tp = tcp_sk(sk);
412 struct sk_buff *skb;
413 s32 remaining;
414 u32 delta_us;
415
416 if (sock_owned_by_user(sk))
417 return;
418
419 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
420 !icsk->icsk_backoff)
421 return;
422
423 skb = tcp_rtx_queue_head(sk);
424 if (WARN_ON_ONCE(!skb))
425 return;
426
427 icsk->icsk_backoff--;
428 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
429 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
430
431 tcp_mstamp_refresh(tp);
432 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
433 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
434
435 if (remaining > 0) {
436 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
437 remaining, TCP_RTO_MAX);
438 } else {
439 /* RTO revert clocked out retransmission.
440 * Will retransmit now.
441 */
442 tcp_retransmit_timer(sk);
443 }
444}
445EXPORT_SYMBOL(tcp_ld_RTO_revert);
446
447/*
448 * This routine is called by the ICMP module when it gets some
449 * sort of error condition. If err < 0 then the socket should
450 * be closed and the error returned to the user. If err > 0
451 * it's just the icmp type << 8 | icmp code. After adjustment
452 * header points to the first 8 bytes of the tcp header. We need
453 * to find the appropriate port.
454 *
455 * The locking strategy used here is very "optimistic". When
456 * someone else accesses the socket the ICMP is just dropped
457 * and for some paths there is no check at all.
458 * A more general error queue to queue errors for later handling
459 * is probably better.
460 *
461 */
462
463int tcp_v4_err(struct sk_buff *skb, u32 info)
464{
465 const struct iphdr *iph = (const struct iphdr *)skb->data;
466 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
467 struct tcp_sock *tp;
468 struct inet_sock *inet;
469 const int type = icmp_hdr(skb)->type;
470 const int code = icmp_hdr(skb)->code;
471 struct sock *sk;
472 struct request_sock *fastopen;
473 u32 seq, snd_una;
474 int err;
475 struct net *net = dev_net(skb->dev);
476
477 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
478 th->dest, iph->saddr, ntohs(th->source),
479 inet_iif(skb), 0);
480 if (!sk) {
481 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
482 return -ENOENT;
483 }
484 if (sk->sk_state == TCP_TIME_WAIT) {
485 inet_twsk_put(inet_twsk(sk));
486 return 0;
487 }
488 seq = ntohl(th->seq);
489 if (sk->sk_state == TCP_NEW_SYN_RECV) {
490 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
491 type == ICMP_TIME_EXCEEDED ||
492 (type == ICMP_DEST_UNREACH &&
493 (code == ICMP_NET_UNREACH ||
494 code == ICMP_HOST_UNREACH)));
495 return 0;
496 }
497
498 bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of the PMTU discovery (RFC 1191) special case:
	 * we can receive locally generated ICMP messages while the socket is held.
	 */
504 if (sock_owned_by_user(sk)) {
505 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
506 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
507 }
508 if (sk->sk_state == TCP_CLOSE)
509 goto out;
510
511 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
512 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
513 goto out;
514 }
515
516 tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
518 fastopen = rcu_dereference(tp->fastopen_rsk);
519 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
520 if (sk->sk_state != TCP_LISTEN &&
521 !between(seq, snd_una, tp->snd_nxt)) {
522 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
523 goto out;
524 }
525
526 switch (type) {
527 case ICMP_REDIRECT:
528 if (!sock_owned_by_user(sk))
529 do_redirect(skb, sk);
530 goto out;
531 case ICMP_SOURCE_QUENCH:
532 /* Just silently ignore these. */
533 goto out;
534 case ICMP_PARAMETERPROB:
535 err = EPROTO;
536 break;
537 case ICMP_DEST_UNREACH:
538 if (code > NR_ICMP_UNREACH)
539 goto out;
540
541 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
			 */
546 if (sk->sk_state == TCP_LISTEN)
547 goto out;
548
549 WRITE_ONCE(tp->mtu_info, info);
550 if (!sock_owned_by_user(sk)) {
551 tcp_v4_mtu_reduced(sk);
552 } else {
553 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
554 sock_hold(sk);
555 }
556 goto out;
557 }
558
559 err = icmp_err_convert[code].errno;
560 /* check if this ICMP message allows revert of backoff.
561 * (see RFC 6069)
562 */
563 if (!fastopen &&
564 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
565 tcp_ld_RTO_revert(sk, seq);
566 break;
567 case ICMP_TIME_EXCEEDED:
568 err = EHOSTUNREACH;
569 break;
570 default:
571 goto out;
572 }
573
574 switch (sk->sk_state) {
575 case TCP_SYN_SENT:
576 case TCP_SYN_RECV:
577 /* Only in fast or simultaneous open. If a fast open socket is
578 * already accepted it is treated as a connected one below.
579 */
580 if (fastopen && !fastopen->sk)
581 break;
582
583 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
584
585 if (!sock_owned_by_user(sk)) {
586 sk->sk_err = err;
587
588 sk_error_report(sk);
589
590 tcp_done(sk);
591 } else {
592 sk->sk_err_soft = err;
593 }
594 goto out;
595 }
596
	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
	 * treated as hard errors (well, FRAG_FAILED too, but it is obsoleted
	 * by PMTU discovery).
	 *
	 * Note that in the modern internet, where routing is unreliable and
	 * broken firewalls sit in every dark corner sending random errors on
	 * behalf of their masters, even these two messages have finally lost
	 * their original sense (even Linux sends invalid PORT_UNREACHs).
	 *
	 * Now we are in compliance with the RFCs.
	 * --ANK (980905)
	 */
612
613 inet = inet_sk(sk);
614 if (!sock_owned_by_user(sk) && inet->recverr) {
615 sk->sk_err = err;
616 sk_error_report(sk);
617 } else { /* Only an error on timeout */
618 sk->sk_err_soft = err;
619 }
620
621out:
622 bh_unlock_sock(sk);
623 sock_put(sk);
624 return 0;
625}
626
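/* Prepare the TCP checksum for transmit: seed th->check with the
 * pseudo-header sum and point csum_start/csum_offset at the TCP header so
 * the device (or the software fallback) can complete it.
 */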
627void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
628{
629 struct tcphdr *th = tcp_hdr(skb);
630
631 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
632 skb->csum_start = skb_transport_header(skb) - skb->head;
633 skb->csum_offset = offsetof(struct tcphdr, check);
634}
635
636/* This routine computes an IPv4 TCP checksum. */
637void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
638{
639 const struct inet_sock *inet = inet_sk(sk);
640
641 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
642}
643EXPORT_SYMBOL(tcp_v4_send_check);
644
/*
 * This routine will send an RST to the other TCP.
 *
 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
 * for the reset?
 * Answer: if a packet caused an RST, it is not destined for a socket
 * existing in our system; if it does match a socket, it is just a
 * duplicate segment or a bug in the other side's TCP.
 * So we build the reply based only on the parameters that arrived
 * with the segment.
 * Exception: precedence violation. We do not implement it in any case.
 */
657
658#ifdef CONFIG_TCP_MD5SIG
659#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
660#else
661#define OPTION_BYTES sizeof(__be32)
662#endif
663
664static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
665{
666 const struct tcphdr *th = tcp_hdr(skb);
667 struct {
668 struct tcphdr th;
669 __be32 opt[OPTION_BYTES / sizeof(__be32)];
670 } rep;
671 struct ip_reply_arg arg;
672#ifdef CONFIG_TCP_MD5SIG
673 struct tcp_md5sig_key *key = NULL;
674 const __u8 *hash_location = NULL;
675 unsigned char newhash[16];
676 int genhash;
677 struct sock *sk1 = NULL;
678#endif
679 u64 transmit_time = 0;
680 struct sock *ctl_sk;
681 struct net *net;
682
683 /* Never send a reset in response to a reset. */
684 if (th->rst)
685 return;
686
	/* If sk is not NULL, it means we did a successful lookup and the
	 * incoming route had to be correct. prequeue might have dropped our dst.
	 */
690 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
691 return;
692
693 /* Swap the send and the receive. */
694 memset(&rep, 0, sizeof(rep));
695 rep.th.dest = th->source;
696 rep.th.source = th->dest;
697 rep.th.doff = sizeof(struct tcphdr) / 4;
698 rep.th.rst = 1;
699
700 if (th->ack) {
701 rep.th.seq = th->ack_seq;
702 } else {
703 rep.th.ack = 1;
704 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
705 skb->len - (th->doff << 2));
706 }
707
708 memset(&arg, 0, sizeof(arg));
709 arg.iov[0].iov_base = (unsigned char *)&rep;
710 arg.iov[0].iov_len = sizeof(rep.th);
711
712 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
713#ifdef CONFIG_TCP_MD5SIG
714 rcu_read_lock();
715 hash_location = tcp_parse_md5sig_option(th);
716 if (sk && sk_fullsock(sk)) {
717 const union tcp_md5_addr *addr;
718 int l3index;
719
720 /* sdif set, means packet ingressed via a device
721 * in an L3 domain and inet_iif is set to it.
722 */
723 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
724 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
725 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
726 } else if (hash_location) {
727 const union tcp_md5_addr *addr;
728 int sdif = tcp_v4_sdif(skb);
729 int dif = inet_iif(skb);
730 int l3index;
731
		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * that listening socket.
		 * We are not loosening security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find, and no RST is generated if the hash doesn't match.
		 */
739 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
740 ip_hdr(skb)->saddr,
741 th->source, ip_hdr(skb)->daddr,
742 ntohs(th->source), dif, sdif);
743 /* don't send rst if it can't find key */
744 if (!sk1)
745 goto out;
746
747 /* sdif set, means packet ingressed via a device
748 * in an L3 domain and dif is set to it.
749 */
750 l3index = sdif ? dif : 0;
751 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
752 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
753 if (!key)
754 goto out;
755
756
757 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
758 if (genhash || memcmp(hash_location, newhash, 16) != 0)
759 goto out;
760
761 }
762
763 if (key) {
764 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
765 (TCPOPT_NOP << 16) |
766 (TCPOPT_MD5SIG << 8) |
767 TCPOLEN_MD5SIG);
768 /* Update length and the length the header thinks exists */
769 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
770 rep.th.doff = arg.iov[0].iov_len / 4;
771
772 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
773 key, ip_hdr(skb)->saddr,
774 ip_hdr(skb)->daddr, &rep.th);
775 }
776#endif
777 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
778 if (rep.opt[0] == 0) {
779 __be32 mrst = mptcp_reset_option(skb);
780
781 if (mrst) {
782 rep.opt[0] = mrst;
783 arg.iov[0].iov_len += sizeof(mrst);
784 rep.th.doff = arg.iov[0].iov_len / 4;
785 }
786 }
787
788 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
789 ip_hdr(skb)->saddr, /* XXX */
790 arg.iov[0].iov_len, IPPROTO_TCP, 0);
791 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
792 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
793
	/* When the socket is gone, all binding information is lost, and
	 * routing might fail in this case. No choice here: if we choose to
	 * force the input interface, we will misroute in case of an
	 * asymmetric route.
	 */
798 if (sk) {
799 arg.bound_dev_if = sk->sk_bound_dev_if;
800 if (sk_fullsock(sk))
801 trace_tcp_send_reset(sk, skb);
802 }
803
804 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
805 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
806
807 arg.tos = ip_hdr(skb)->tos;
808 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
809 local_bh_disable();
810 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
811 if (sk) {
812 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
813 inet_twsk(sk)->tw_mark : sk->sk_mark;
814 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
815 inet_twsk(sk)->tw_priority : sk->sk_priority;
816 transmit_time = tcp_transmit_time(sk);
817 }
818 ip_send_unicast_reply(ctl_sk,
819 skb, &TCP_SKB_CB(skb)->header.h4.opt,
820 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
821 &arg, arg.iov[0].iov_len,
822 transmit_time);
823
824 ctl_sk->sk_mark = 0;
825 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
826 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
827 local_bh_enable();
828
829#ifdef CONFIG_TCP_MD5SIG
830out:
831 rcu_read_unlock();
832#endif
833}
834
/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 * outside of socket context, is certainly ugly. What can I do?
 */
838
839static void tcp_v4_send_ack(const struct sock *sk,
840 struct sk_buff *skb, u32 seq, u32 ack,
841 u32 win, u32 tsval, u32 tsecr, int oif,
842 struct tcp_md5sig_key *key,
843 int reply_flags, u8 tos)
844{
845 const struct tcphdr *th = tcp_hdr(skb);
846 struct {
847 struct tcphdr th;
848 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
849#ifdef CONFIG_TCP_MD5SIG
850 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
851#endif
852 ];
853 } rep;
854 struct net *net = sock_net(sk);
855 struct ip_reply_arg arg;
856 struct sock *ctl_sk;
857 u64 transmit_time;
858
859 memset(&rep.th, 0, sizeof(struct tcphdr));
860 memset(&arg, 0, sizeof(arg));
861
862 arg.iov[0].iov_base = (unsigned char *)&rep;
863 arg.iov[0].iov_len = sizeof(rep.th);
864 if (tsecr) {
865 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
866 (TCPOPT_TIMESTAMP << 8) |
867 TCPOLEN_TIMESTAMP);
868 rep.opt[1] = htonl(tsval);
869 rep.opt[2] = htonl(tsecr);
870 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
871 }
872
873 /* Swap the send and the receive. */
874 rep.th.dest = th->source;
875 rep.th.source = th->dest;
876 rep.th.doff = arg.iov[0].iov_len / 4;
877 rep.th.seq = htonl(seq);
878 rep.th.ack_seq = htonl(ack);
879 rep.th.ack = 1;
880 rep.th.window = htons(win);
881
882#ifdef CONFIG_TCP_MD5SIG
883 if (key) {
884 int offset = (tsecr) ? 3 : 0;
885
886 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
887 (TCPOPT_NOP << 16) |
888 (TCPOPT_MD5SIG << 8) |
889 TCPOLEN_MD5SIG);
890 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
891 rep.th.doff = arg.iov[0].iov_len/4;
892
893 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
894 key, ip_hdr(skb)->saddr,
895 ip_hdr(skb)->daddr, &rep.th);
896 }
897#endif
898 arg.flags = reply_flags;
899 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
900 ip_hdr(skb)->saddr, /* XXX */
901 arg.iov[0].iov_len, IPPROTO_TCP, 0);
902 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
903 if (oif)
904 arg.bound_dev_if = oif;
905 arg.tos = tos;
906 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
907 local_bh_disable();
908 ctl_sk = this_cpu_read(*net->ipv4.tcp_sk);
909 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
910 inet_twsk(sk)->tw_mark : sk->sk_mark;
911 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
912 inet_twsk(sk)->tw_priority : sk->sk_priority;
913 transmit_time = tcp_transmit_time(sk);
914 ip_send_unicast_reply(ctl_sk,
915 skb, &TCP_SKB_CB(skb)->header.h4.opt,
916 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
917 &arg, arg.iov[0].iov_len,
918 transmit_time);
919
920 ctl_sk->sk_mark = 0;
921 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
922 local_bh_enable();
923}
924
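/* ACK a segment received for a TIME-WAIT socket, echoing the sequence,
 * window and timestamp state remembered in the timewait bucket.
 */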
925static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
926{
927 struct inet_timewait_sock *tw = inet_twsk(sk);
928 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
929
930 tcp_v4_send_ack(sk, skb,
931 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
932 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
933 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
934 tcptw->tw_ts_recent,
935 tw->tw_bound_dev_if,
936 tcp_twsk_md5_key(tcptw),
937 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
938 tw->tw_tos
939 );
940
941 inet_twsk_put(tw);
942}
943
944static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
945 struct request_sock *req)
946{
947 const union tcp_md5_addr *addr;
948 int l3index;
949
950 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
951 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
952 */
953 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
954 tcp_sk(sk)->snd_nxt;
955
956 /* RFC 7323 2.3
957 * The window field (SEG.WND) of every outgoing segment, with the
958 * exception of <SYN> segments, MUST be right-shifted by
959 * Rcv.Wind.Shift bits:
960 */
961 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
962 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
963 tcp_v4_send_ack(sk, skb, seq,
964 tcp_rsk(req)->rcv_nxt,
965 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
966 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
967 req->ts_recent,
968 0,
969 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
970 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
971 ip_hdr(skb)->tos);
972}
973
974/*
975 * Send a SYN-ACK after having received a SYN.
976 * This still operates on a request_sock only, not on a big
977 * socket.
978 */
979static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
980 struct flowi *fl,
981 struct request_sock *req,
982 struct tcp_fastopen_cookie *foc,
983 enum tcp_synack_type synack_type,
984 struct sk_buff *syn_skb)
985{
986 const struct inet_request_sock *ireq = inet_rsk(req);
987 struct flowi4 fl4;
988 int err = -1;
989 struct sk_buff *skb;
990 u8 tos;
991
992 /* First, grab a route. */
993 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
994 return -1;
995
996 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
997
998 if (skb) {
999 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1000
1001 tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
1002 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1003 (inet_sk(sk)->tos & INET_ECN_MASK) :
1004 inet_sk(sk)->tos;
1005
1006 if (!INET_ECN_is_capable(tos) &&
1007 tcp_bpf_ca_needs_ecn((struct sock *)req))
1008 tos |= INET_ECN_ECT_0;
1009
1010 rcu_read_lock();
1011 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1012 ireq->ir_rmt_addr,
1013 rcu_dereference(ireq->ireq_opt),
1014 tos);
1015 rcu_read_unlock();
1016 err = net_xmit_eval(err);
1017 }
1018
1019 return err;
1020}
1021
1022/*
1023 * IPv4 request_sock destructor.
1024 */
1025static void tcp_v4_reqsk_destructor(struct request_sock *req)
1026{
1027 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1028}
1029
1030#ifdef CONFIG_TCP_MD5SIG
1031/*
1032 * RFC2385 MD5 checksumming requires a mapping of
1033 * IP address->MD5 Key.
1034 * We need to maintain these in the sk structure.
1035 */
1036
1037DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1038EXPORT_SYMBOL(tcp_md5_needed);
1039
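/* Key preference: a key bound to an L3 domain beats one that is not;
 * otherwise the longer prefix wins.
 */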
1040static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1041{
1042 if (!old)
1043 return true;
1044
1045 /* l3index always overrides non-l3index */
1046 if (old->l3index && new->l3index == 0)
1047 return false;
1048 if (old->l3index == 0 && new->l3index)
1049 return true;
1050
1051 return old->prefixlen < new->prefixlen;
1052}
1053
1054/* Find the Key structure for an address. */
1055struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1056 const union tcp_md5_addr *addr,
1057 int family)
1058{
1059 const struct tcp_sock *tp = tcp_sk(sk);
1060 struct tcp_md5sig_key *key;
1061 const struct tcp_md5sig_info *md5sig;
1062 __be32 mask;
1063 struct tcp_md5sig_key *best_match = NULL;
1064 bool match;
1065
1066 /* caller either holds rcu_read_lock() or socket lock */
1067 md5sig = rcu_dereference_check(tp->md5sig_info,
1068 lockdep_sock_is_held(sk));
1069 if (!md5sig)
1070 return NULL;
1071
1072 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1073 lockdep_sock_is_held(sk)) {
1074 if (key->family != family)
1075 continue;
1076 if (key->l3index && key->l3index != l3index)
1077 continue;
1078 if (family == AF_INET) {
1079 mask = inet_make_mask(key->prefixlen);
1080 match = (key->addr.a4.s_addr & mask) ==
1081 (addr->a4.s_addr & mask);
1082#if IS_ENABLED(CONFIG_IPV6)
1083 } else if (family == AF_INET6) {
1084 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1085 key->prefixlen);
1086#endif
1087 } else {
1088 match = false;
1089 }
1090
1091 if (match && better_md5_match(best_match, key))
1092 best_match = key;
1093 }
1094 return best_match;
1095}
1096EXPORT_SYMBOL(__tcp_md5_do_lookup);
1097
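/* Exact-match lookup (address, family, prefix length and L3 index),
 * used when adding or deleting keys.
 */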
1098static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1099 const union tcp_md5_addr *addr,
1100 int family, u8 prefixlen,
1101 int l3index)
1102{
1103 const struct tcp_sock *tp = tcp_sk(sk);
1104 struct tcp_md5sig_key *key;
1105 unsigned int size = sizeof(struct in_addr);
1106 const struct tcp_md5sig_info *md5sig;
1107
1108 /* caller either holds rcu_read_lock() or socket lock */
1109 md5sig = rcu_dereference_check(tp->md5sig_info,
1110 lockdep_sock_is_held(sk));
1111 if (!md5sig)
1112 return NULL;
1113#if IS_ENABLED(CONFIG_IPV6)
1114 if (family == AF_INET6)
1115 size = sizeof(struct in6_addr);
1116#endif
1117 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1118 lockdep_sock_is_held(sk)) {
1119 if (key->family != family)
1120 continue;
1121 if (key->l3index != l3index)
1122 continue;
1123 if (!memcmp(&key->addr, addr, size) &&
1124 key->prefixlen == prefixlen)
1125 return key;
1126 }
1127 return NULL;
1128}
1129
1130struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1131 const struct sock *addr_sk)
1132{
1133 const union tcp_md5_addr *addr;
1134 int l3index;
1135
1136 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1137 addr_sk->sk_bound_dev_if);
1138 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1139 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1140}
1141EXPORT_SYMBOL(tcp_v4_md5_lookup);
1142
1143/* This can be called on a newly created socket, from other files */
1144int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1145 int family, u8 prefixlen, int l3index,
1146 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1147{
1148 /* Add Key to the list */
1149 struct tcp_md5sig_key *key;
1150 struct tcp_sock *tp = tcp_sk(sk);
1151 struct tcp_md5sig_info *md5sig;
1152
1153 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1154 if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care about
		 * key mismatches, since changing the MD5 key on live flows
		 * can lead to packet drops.
		 */
1161 data_race(memcpy(key->key, newkey, newkeylen));
1162
1163 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1164 * Also note that a reader could catch new key->keylen value
1165 * but old key->key[], this is the reason we use __GFP_ZERO
1166 * at sock_kmalloc() time below these lines.
1167 */
1168 WRITE_ONCE(key->keylen, newkeylen);
1169
1170 return 0;
1171 }
1172
1173 md5sig = rcu_dereference_protected(tp->md5sig_info,
1174 lockdep_sock_is_held(sk));
1175 if (!md5sig) {
1176 md5sig = kmalloc(sizeof(*md5sig), gfp);
1177 if (!md5sig)
1178 return -ENOMEM;
1179
1180 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1181 INIT_HLIST_HEAD(&md5sig->head);
1182 rcu_assign_pointer(tp->md5sig_info, md5sig);
1183 }
1184
1185 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1186 if (!key)
1187 return -ENOMEM;
1188 if (!tcp_alloc_md5sig_pool()) {
1189 sock_kfree_s(sk, key, sizeof(*key));
1190 return -ENOMEM;
1191 }
1192
1193 memcpy(key->key, newkey, newkeylen);
1194 key->keylen = newkeylen;
1195 key->family = family;
1196 key->prefixlen = prefixlen;
1197 key->l3index = l3index;
1198 memcpy(&key->addr, addr,
1199 (family == AF_INET6) ? sizeof(struct in6_addr) :
1200 sizeof(struct in_addr));
1201 hlist_add_head_rcu(&key->node, &md5sig->head);
1202 return 0;
1203}
1204EXPORT_SYMBOL(tcp_md5_do_add);
1205
1206int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1207 u8 prefixlen, int l3index)
1208{
1209 struct tcp_md5sig_key *key;
1210
1211 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index);
1212 if (!key)
1213 return -ENOENT;
1214 hlist_del_rcu(&key->node);
1215 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1216 kfree_rcu(key, rcu);
1217 return 0;
1218}
1219EXPORT_SYMBOL(tcp_md5_do_del);
1220
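/* Drop every MD5 key configured on this socket. */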
1221static void tcp_clear_md5_list(struct sock *sk)
1222{
1223 struct tcp_sock *tp = tcp_sk(sk);
1224 struct tcp_md5sig_key *key;
1225 struct hlist_node *n;
1226 struct tcp_md5sig_info *md5sig;
1227
1228 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1229
1230 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1231 hlist_del_rcu(&key->node);
1232 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1233 kfree_rcu(key, rcu);
1234 }
1235}
1236
1237static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1238 sockptr_t optval, int optlen)
1239{
1240 struct tcp_md5sig cmd;
1241 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1242 const union tcp_md5_addr *addr;
1243 u8 prefixlen = 32;
1244 int l3index = 0;
1245
1246 if (optlen < sizeof(cmd))
1247 return -EINVAL;
1248
1249 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1250 return -EFAULT;
1251
1252 if (sin->sin_family != AF_INET)
1253 return -EINVAL;
1254
1255 if (optname == TCP_MD5SIG_EXT &&
1256 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1257 prefixlen = cmd.tcpm_prefixlen;
1258 if (prefixlen > 32)
1259 return -EINVAL;
1260 }
1261
1262 if (optname == TCP_MD5SIG_EXT &&
1263 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1264 struct net_device *dev;
1265
1266 rcu_read_lock();
1267 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1268 if (dev && netif_is_l3_master(dev))
1269 l3index = dev->ifindex;
1270
1271 rcu_read_unlock();
1272
1273 /* ok to reference set/not set outside of rcu;
1274 * right now device MUST be an L3 master
1275 */
1276 if (!dev || !l3index)
1277 return -EINVAL;
1278 }
1279
1280 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1281
1282 if (!cmd.tcpm_keylen)
1283 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index);
1284
1285 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1286 return -EINVAL;
1287
1288 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index,
1289 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1290}
1291
1292static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1293 __be32 daddr, __be32 saddr,
1294 const struct tcphdr *th, int nbytes)
1295{
1296 struct tcp4_pseudohdr *bp;
1297 struct scatterlist sg;
1298 struct tcphdr *_th;
1299
1300 bp = hp->scratch;
1301 bp->saddr = saddr;
1302 bp->daddr = daddr;
1303 bp->pad = 0;
1304 bp->protocol = IPPROTO_TCP;
1305 bp->len = cpu_to_be16(nbytes);
1306
1307 _th = (struct tcphdr *)(bp + 1);
1308 memcpy(_th, th, sizeof(*th));
1309 _th->check = 0;
1310
1311 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1312 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1313 sizeof(*bp) + sizeof(*th));
1314 return crypto_ahash_update(hp->md5_req);
1315}
1316
1317static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1318 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1319{
1320 struct tcp_md5sig_pool *hp;
1321 struct ahash_request *req;
1322
1323 hp = tcp_get_md5sig_pool();
1324 if (!hp)
1325 goto clear_hash_noput;
1326 req = hp->md5_req;
1327
1328 if (crypto_ahash_init(req))
1329 goto clear_hash;
1330 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1331 goto clear_hash;
1332 if (tcp_md5_hash_key(hp, key))
1333 goto clear_hash;
1334 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1335 if (crypto_ahash_final(req))
1336 goto clear_hash;
1337
1338 tcp_put_md5sig_pool();
1339 return 0;
1340
1341clear_hash:
1342 tcp_put_md5sig_pool();
1343clear_hash_noput:
1344 memset(md5_hash, 0, 16);
1345 return 1;
1346}
1347
1348int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1349 const struct sock *sk,
1350 const struct sk_buff *skb)
1351{
1352 struct tcp_md5sig_pool *hp;
1353 struct ahash_request *req;
1354 const struct tcphdr *th = tcp_hdr(skb);
1355 __be32 saddr, daddr;
1356
1357 if (sk) { /* valid for establish/request sockets */
1358 saddr = sk->sk_rcv_saddr;
1359 daddr = sk->sk_daddr;
1360 } else {
1361 const struct iphdr *iph = ip_hdr(skb);
1362 saddr = iph->saddr;
1363 daddr = iph->daddr;
1364 }
1365
1366 hp = tcp_get_md5sig_pool();
1367 if (!hp)
1368 goto clear_hash_noput;
1369 req = hp->md5_req;
1370
1371 if (crypto_ahash_init(req))
1372 goto clear_hash;
1373
1374 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1375 goto clear_hash;
1376 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1377 goto clear_hash;
1378 if (tcp_md5_hash_key(hp, key))
1379 goto clear_hash;
1380 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1381 if (crypto_ahash_final(req))
1382 goto clear_hash;
1383
1384 tcp_put_md5sig_pool();
1385 return 0;
1386
1387clear_hash:
1388 tcp_put_md5sig_pool();
1389clear_hash_noput:
1390 memset(md5_hash, 0, 16);
1391 return 1;
1392}
1393EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1394
1395#endif
1396
1397/* Called with rcu_read_lock() */
1398static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1399 const struct sk_buff *skb,
1400 int dif, int sdif)
1401{
1402#ifdef CONFIG_TCP_MD5SIG
	/*
	 * This gets called for each TCP segment that arrives
	 * so we want to be efficient.
	 * We have 3 drop cases:
	 * o No MD5 hash and one expected.
	 * o MD5 hash and we're not expecting one.
	 * o MD5 hash and it's wrong.
	 */
1411 const __u8 *hash_location = NULL;
1412 struct tcp_md5sig_key *hash_expected;
1413 const struct iphdr *iph = ip_hdr(skb);
1414 const struct tcphdr *th = tcp_hdr(skb);
1415 const union tcp_md5_addr *addr;
1416 unsigned char newhash[16];
1417 int genhash, l3index;
1418
1419 /* sdif set, means packet ingressed via a device
1420 * in an L3 domain and dif is set to the l3mdev
1421 */
1422 l3index = sdif ? dif : 0;
1423
1424 addr = (union tcp_md5_addr *)&iph->saddr;
1425 hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1426 hash_location = tcp_parse_md5sig_option(th);
1427
1428 /* We've parsed the options - do we have a hash? */
1429 if (!hash_expected && !hash_location)
1430 return false;
1431
1432 if (hash_expected && !hash_location) {
1433 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1434 return true;
1435 }
1436
1437 if (!hash_expected && hash_location) {
1438 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1439 return true;
1440 }
1441
	/* Okay, so we have both hash_expected and hash_location -
	 * we need to calculate the hash and compare.
	 */
1445 genhash = tcp_v4_md5_hash_skb(newhash,
1446 hash_expected,
1447 NULL, skb);
1448
1449 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1450 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1451 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n",
1452 &iph->saddr, ntohs(th->source),
1453 &iph->daddr, ntohs(th->dest),
1454 genhash ? " tcp_v4_calc_md5_hash failed"
1455 : "", l3index);
1456 return true;
1457 }
1458 return false;
1459#endif
1460 return false;
1461}
1462
1463static void tcp_v4_init_req(struct request_sock *req,
1464 const struct sock *sk_listener,
1465 struct sk_buff *skb)
1466{
1467 struct inet_request_sock *ireq = inet_rsk(req);
1468 struct net *net = sock_net(sk_listener);
1469
1470 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1471 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1472 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1473}
1474
1475static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1476 struct sk_buff *skb,
1477 struct flowi *fl,
1478 struct request_sock *req)
1479{
1480 tcp_v4_init_req(req, sk, skb);
1481
1482 if (security_inet_conn_request(sk, skb, req))
1483 return NULL;
1484
1485 return inet_csk_route_req(sk, &fl->u.ip4, req);
1486}
1487
1488struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1489 .family = PF_INET,
1490 .obj_size = sizeof(struct tcp_request_sock),
1491 .rtx_syn_ack = tcp_rtx_synack,
1492 .send_ack = tcp_v4_reqsk_send_ack,
1493 .destructor = tcp_v4_reqsk_destructor,
1494 .send_reset = tcp_v4_send_reset,
1495 .syn_ack_timeout = tcp_syn_ack_timeout,
1496};
1497
1498const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1499 .mss_clamp = TCP_MSS_DEFAULT,
1500#ifdef CONFIG_TCP_MD5SIG
1501 .req_md5_lookup = tcp_v4_md5_lookup,
1502 .calc_md5_hash = tcp_v4_md5_hash_skb,
1503#endif
1504#ifdef CONFIG_SYN_COOKIES
1505 .cookie_init_seq = cookie_v4_init_sequence,
1506#endif
1507 .route_req = tcp_v4_route_req,
1508 .init_seq = tcp_v4_init_seq,
1509 .init_ts_off = tcp_v4_init_ts_off,
1510 .send_synack = tcp_v4_send_synack,
1511};
1512
1513int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1514{
	/* Never answer SYNs sent to broadcast or multicast addresses */
1516 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1517 goto drop;
1518
1519 return tcp_conn_request(&tcp_request_sock_ops,
1520 &tcp_request_sock_ipv4_ops, sk, skb);
1521
1522drop:
1523 tcp_listendrop(sk);
1524 return 0;
1525}
1526EXPORT_SYMBOL(tcp_v4_conn_request);
1527
1528
1529/*
1530 * The three way handshake has completed - we got a valid synack -
1531 * now create the new socket.
1532 */
1533struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1534 struct request_sock *req,
1535 struct dst_entry *dst,
1536 struct request_sock *req_unhash,
1537 bool *own_req)
1538{
1539 struct inet_request_sock *ireq;
1540 bool found_dup_sk = false;
1541 struct inet_sock *newinet;
1542 struct tcp_sock *newtp;
1543 struct sock *newsk;
1544#ifdef CONFIG_TCP_MD5SIG
1545 const union tcp_md5_addr *addr;
1546 struct tcp_md5sig_key *key;
1547 int l3index;
1548#endif
1549 struct ip_options_rcu *inet_opt;
1550
1551 if (sk_acceptq_is_full(sk))
1552 goto exit_overflow;
1553
1554 newsk = tcp_create_openreq_child(sk, req, skb);
1555 if (!newsk)
1556 goto exit_nonewsk;
1557
1558 newsk->sk_gso_type = SKB_GSO_TCPV4;
1559 inet_sk_rx_dst_set(newsk, skb);
1560
1561 newtp = tcp_sk(newsk);
1562 newinet = inet_sk(newsk);
1563 ireq = inet_rsk(req);
1564 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1565 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1566 newsk->sk_bound_dev_if = ireq->ir_iif;
1567 newinet->inet_saddr = ireq->ir_loc_addr;
1568 inet_opt = rcu_dereference(ireq->ireq_opt);
1569 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1570 newinet->mc_index = inet_iif(skb);
1571 newinet->mc_ttl = ip_hdr(skb)->ttl;
1572 newinet->rcv_tos = ip_hdr(skb)->tos;
1573 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1574 if (inet_opt)
1575 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1576 newinet->inet_id = prandom_u32();
1577
1578 /* Set ToS of the new socket based upon the value of incoming SYN.
1579 * ECT bits are set later in tcp_init_transfer().
1580 */
1581 if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
1582 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1583
1584 if (!dst) {
1585 dst = inet_csk_route_child_sock(sk, newsk, req);
1586 if (!dst)
1587 goto put_and_exit;
1588 } else {
1589 /* syncookie case : see end of cookie_v4_check() */
1590 }
1591 sk_setup_caps(newsk, dst);
1592
1593 tcp_ca_openreq_child(newsk, dst);
1594
1595 tcp_sync_mss(newsk, dst_mtu(dst));
1596 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1597
1598 tcp_initialize_rcv_mss(newsk);
1599
1600#ifdef CONFIG_TCP_MD5SIG
1601 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1602 /* Copy over the MD5 key from the original socket */
1603 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1604 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1605 if (key) {
1606 /*
1607 * We're using one, so create a matching key
1608 * on the newsk structure. If we fail to get
1609 * memory, then we end up not copying the key
1610 * across. Shucks.
1611 */
1612 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index,
1613 key->key, key->keylen, GFP_ATOMIC);
1614 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1615 }
1616#endif
1617
1618 if (__inet_inherit_port(sk, newsk) < 0)
1619 goto put_and_exit;
1620 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1621 &found_dup_sk);
1622 if (likely(*own_req)) {
1623 tcp_move_syn(newtp, req);
1624 ireq->ireq_opt = NULL;
1625 } else {
1626 newinet->inet_opt = NULL;
1627
1628 if (!req_unhash && found_dup_sk) {
			/* This code path should be executed only in the
			 * syncookie case
			 */
1632 bh_unlock_sock(newsk);
1633 sock_put(newsk);
1634 newsk = NULL;
1635 }
1636 }
1637 return newsk;
1638
1639exit_overflow:
1640 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1641exit_nonewsk:
1642 dst_release(dst);
1643exit:
1644 tcp_listendrop(sk);
1645 return NULL;
1646put_and_exit:
1647 newinet->inet_opt = NULL;
1648 inet_csk_prepare_forced_close(newsk);
1649 tcp_done(newsk);
1650 goto exit;
1651}
1652EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1653
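/* On a listener, a non-SYN segment may carry a SYN cookie in its ACK;
 * cookie_v4_check() validates it and, on success, returns the newly
 * created child socket instead of the listener.
 */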
1654static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1655{
1656#ifdef CONFIG_SYN_COOKIES
1657 const struct tcphdr *th = tcp_hdr(skb);
1658
1659 if (!th->syn)
1660 sk = cookie_v4_check(sk, skb);
1661#endif
1662 return sk;
1663}
1664
1665u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1666 struct tcphdr *th, u32 *cookie)
1667{
1668 u16 mss = 0;
1669#ifdef CONFIG_SYN_COOKIES
1670 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1671 &tcp_request_sock_ipv4_ops, sk, th);
1672 if (mss) {
1673 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1674 tcp_synq_overflow(sk);
1675 }
1676#endif
1677 return mss;
1678}
1679
1680INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1681 u32));
/* The socket must have its spinlock held when we get
 * here, unless it is a TCP_LISTEN socket.
 *
 * We have a potential double-lock case here, so even when
 * doing backlog processing we use the BH locking scheme.
 * This is because we cannot sleep with the original spinlock
 * held.
 */
1690int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1691{
1692 struct sock *rsk;
1693
1694 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1695 struct dst_entry *dst = sk->sk_rx_dst;
1696
1697 sock_rps_save_rxhash(sk, skb);
1698 sk_mark_napi_id(sk, skb);
1699 if (dst) {
1700 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1701 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1702 dst, 0)) {
1703 dst_release(dst);
1704 sk->sk_rx_dst = NULL;
1705 }
1706 }
1707 tcp_rcv_established(sk, skb);
1708 return 0;
1709 }
1710
1711 if (tcp_checksum_complete(skb))
1712 goto csum_err;
1713
1714 if (sk->sk_state == TCP_LISTEN) {
1715 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1716
1717 if (!nsk)
1718 goto discard;
1719 if (nsk != sk) {
1720 if (tcp_child_process(sk, nsk, skb)) {
1721 rsk = nsk;
1722 goto reset;
1723 }
1724 return 0;
1725 }
1726 } else
1727 sock_rps_save_rxhash(sk, skb);
1728
1729 if (tcp_rcv_state_process(sk, skb)) {
1730 rsk = sk;
1731 goto reset;
1732 }
1733 return 0;
1734
1735reset:
1736 tcp_v4_send_reset(rsk, skb);
1737discard:
1738 kfree_skb(skb);
1739 /* Be careful here. If this function gets more complicated and
1740 * gcc suffers from register pressure on the x86, sk (in %ebx)
1741 * might be destroyed here. This current version compiles correctly,
1742 * but you have been warned.
1743 */
1744 return 0;
1745
1746csum_err:
1747 trace_tcp_bad_csum(skb);
1748 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1749 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1750 goto discard;
1751}
1752EXPORT_SYMBOL(tcp_v4_do_rcv);
1753
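/* Early demux: look up an established socket before the routing decision
 * so its cached rx dst can be reused and the later full lookup avoided.
 */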
1754int tcp_v4_early_demux(struct sk_buff *skb)
1755{
1756 const struct iphdr *iph;
1757 const struct tcphdr *th;
1758 struct sock *sk;
1759
1760 if (skb->pkt_type != PACKET_HOST)
1761 return 0;
1762
1763 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1764 return 0;
1765
1766 iph = ip_hdr(skb);
1767 th = tcp_hdr(skb);
1768
1769 if (th->doff < sizeof(struct tcphdr) / 4)
1770 return 0;
1771
1772 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1773 iph->saddr, th->source,
1774 iph->daddr, ntohs(th->dest),
1775 skb->skb_iif, inet_sdif(skb));
1776 if (sk) {
1777 skb->sk = sk;
1778 skb->destructor = sock_edemux;
1779 if (sk_fullsock(sk)) {
1780 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1781
1782 if (dst)
1783 dst = dst_check(dst, 0);
1784 if (dst &&
1785 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1786 skb_dst_set_noref(skb, dst);
1787 }
1788 }
1789 return 0;
1790}
1791
1792bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1793{
1794 u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
1795 u32 tail_gso_size, tail_gso_segs;
1796 struct skb_shared_info *shinfo;
1797 const struct tcphdr *th;
1798 struct tcphdr *thtail;
1799 struct sk_buff *tail;
1800 unsigned int hdrlen;
1801 bool fragstolen;
1802 u32 gso_segs;
1803 u32 gso_size;
1804 int delta;
1805
	/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
	 * we can fix skb->truesize to its real value to avoid future drops.
	 * This is valid because skb is not yet charged to the socket.
	 * It has been noticed that pure SACK packets were sometimes dropped
	 * (if cooked by drivers without the copybreak feature).
	 */
1812 skb_condense(skb);
1813
1814 skb_dst_drop(skb);
1815
1816 if (unlikely(tcp_checksum_complete(skb))) {
1817 bh_unlock_sock(sk);
1818 trace_tcp_bad_csum(skb);
1819 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1820 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1821 return true;
1822 }
1823
1824 /* Attempt coalescing to last skb in backlog, even if we are
1825 * above the limits.
1826 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1827 */
1828 th = (const struct tcphdr *)skb->data;
1829 hdrlen = th->doff * 4;
1830
1831 tail = sk->sk_backlog.tail;
1832 if (!tail)
1833 goto no_coalesce;
1834 thtail = (struct tcphdr *)tail->data;
1835
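	/* Refuse to coalesce unless the segments are contiguous in sequence
	 * space, carry the same DSFIELD, have no SYN/RST/URG, both have ACK
	 * set, agree on ECE/CWR, and have identical TCP headers/options.
	 */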
1836 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1837 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1838 ((TCP_SKB_CB(tail)->tcp_flags |
1839 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1840 !((TCP_SKB_CB(tail)->tcp_flags &
1841 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1842 ((TCP_SKB_CB(tail)->tcp_flags ^
1843 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1844#ifdef CONFIG_TLS_DEVICE
1845 tail->decrypted != skb->decrypted ||
1846#endif
1847 thtail->doff != th->doff ||
1848 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1849 goto no_coalesce;
1850
1851 __skb_pull(skb, hdrlen);
1852
1853 shinfo = skb_shinfo(skb);
1854 gso_size = shinfo->gso_size ?: skb->len;
1855 gso_segs = shinfo->gso_segs ?: 1;
1856
1857 shinfo = skb_shinfo(tail);
1858 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1859 tail_gso_segs = shinfo->gso_segs ?: 1;
1860
1861 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1862 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1863
1864 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1865 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1866 thtail->window = th->window;
1867 }
1868
1869 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1870 * thtail->fin, so that the fast path in tcp_rcv_established()
1871 * is not entered if we append a packet with a FIN.
1872 * SYN, RST, URG are not present.
1873 * ACK is set on both packets.
1874 * PSH : we do not really care in TCP stack,
1875 * at least for 'GRO' packets.
1876 */
1877 thtail->fin |= th->fin;
1878 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1879
1880 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1881 TCP_SKB_CB(tail)->has_rxtstamp = true;
1882 tail->tstamp = skb->tstamp;
1883 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1884 }
1885
1886 /* Not as strict as GRO. We only need to carry mss max value */
1887 shinfo->gso_size = max(gso_size, tail_gso_size);
1888 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1889
1890 sk->sk_backlog.len += delta;
1891 __NET_INC_STATS(sock_net(sk),
1892 LINUX_MIB_TCPBACKLOGCOALESCE);
1893 kfree_skb_partial(skb, fragstolen);
1894 return false;
1895 }
1896 __skb_push(skb, hdrlen);
1897
1898no_coalesce:
	/* Only the socket owner can try to collapse/prune rx queues
	 * to reduce memory overhead, so add a little headroom here.
	 * Few socket backlogs are likely to be non-empty concurrently.
	 */
1903 limit += 64*1024;
1904
1905 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1906 bh_unlock_sock(sk);
1907 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1908 return true;
1909 }
1910 return false;
1911}
1912EXPORT_SYMBOL(tcp_add_backlog);
1913
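/* Run the socket filter (e.g. an attached BPF program) on the segment;
 * sk_filter_trim_cap() may trim the payload but never below the TCP header.
 */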
1914int tcp_filter(struct sock *sk, struct sk_buff *skb)
1915{
1916 struct tcphdr *th = (struct tcphdr *)skb->data;
1917
1918 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1919}
1920EXPORT_SYMBOL(tcp_filter);
1921
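/* Undo tcp_v4_fill_cb(): move the IP control block back to its original
 * place so the skb can go through another socket lookup.
 */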
1922static void tcp_v4_restore_cb(struct sk_buff *skb)
1923{
1924 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1925 sizeof(struct inet_skb_parm));
1926}
1927
1928static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1929 const struct tcphdr *th)
1930{
1931 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1932 * barrier() makes sure the compiler won't play aliasing games.
1933 */
1934 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1935 sizeof(struct inet_skb_parm));
1936 barrier();
1937
1938 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1939 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1940 skb->len - th->doff * 4);
1941 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1942 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1943 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1944 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1945 TCP_SKB_CB(skb)->sacked = 0;
1946 TCP_SKB_CB(skb)->has_rxtstamp =
1947 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1948}
1949
1950/*
1951 * From tcp_input.c
1952 */
1953
1954int tcp_v4_rcv(struct sk_buff *skb)
1955{
1956 struct net *net = dev_net(skb->dev);
1957 struct sk_buff *skb_to_free;
1958 int sdif = inet_sdif(skb);
1959 int dif = inet_iif(skb);
1960 const struct iphdr *iph;
1961 const struct tcphdr *th;
1962 bool refcounted;
1963 struct sock *sk;
1964 int ret;
1965
1966 if (skb->pkt_type != PACKET_HOST)
1967 goto discard_it;
1968
1969 /* Count it even if it's bad */
1970 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1971
1972 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1973 goto discard_it;
1974
1975 th = (const struct tcphdr *)skb->data;
1976
1977 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1978 goto bad_packet;
1979 if (!pskb_may_pull(skb, th->doff * 4))
1980 goto discard_it;
1981
1982 /* An explanation is required here, I think.
1983 * Packet length and doff are validated by header prediction,
1984 * provided the case of th->doff == 0 is eliminated.
1985 * So, we defer the checks. */
1986
1987 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1988 goto csum_error;
1989
1990 th = (const struct tcphdr *)skb->data;
1991 iph = ip_hdr(skb);
1992lookup:
1993 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1994 th->dest, sdif, &refcounted);
1995 if (!sk)
1996 goto no_tcp_socket;
1997
1998process:
1999 if (sk->sk_state == TCP_TIME_WAIT)
2000 goto do_time_wait;
2001
2002 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2003 struct request_sock *req = inet_reqsk(sk);
2004 bool req_stolen = false;
2005 struct sock *nsk;
2006
2007 sk = req->rsk_listener;
2008 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) {
2009 sk_drops_add(sk, skb);
2010 reqsk_put(req);
2011 goto discard_it;
2012 }
2013 if (tcp_checksum_complete(skb)) {
2014 reqsk_put(req);
2015 goto csum_error;
2016 }
2017 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2018 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2019 if (!nsk) {
2020 inet_csk_reqsk_queue_drop_and_put(sk, req);
2021 goto lookup;
2022 }
2023 sk = nsk;
2024 /* reuseport_migrate_sock() has already taken one sk_refcnt
2025 * reference before returning.
2026 */
2027 } else {
2028 /* We own a reference on the listener, increase it again
2029 * as we might lose it too soon.
2030 */
2031 sock_hold(sk);
2032 }
2033 refcounted = true;
2034 nsk = NULL;
2035 if (!tcp_filter(sk, skb)) {
2036 th = (const struct tcphdr *)skb->data;
2037 iph = ip_hdr(skb);
2038 tcp_v4_fill_cb(skb, iph, th);
2039 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2040 }
2041 if (!nsk) {
2042 reqsk_put(req);
2043 if (req_stolen) {
2044 /* Another cpu got exclusive access to req
2045 * and created a full blown socket.
2046 * Try to feed this packet to this socket
2047 * instead of discarding it.
2048 */
2049 tcp_v4_restore_cb(skb);
2050 sock_put(sk);
2051 goto lookup;
2052 }
2053 goto discard_and_relse;
2054 }
2055 if (nsk == sk) {
2056 reqsk_put(req);
2057 tcp_v4_restore_cb(skb);
2058 } else if (tcp_child_process(sk, nsk, skb)) {
2059 tcp_v4_send_reset(nsk, skb);
2060 goto discard_and_relse;
2061 } else {
2062 sock_put(sk);
2063 return 0;
2064 }
2065 }
2066 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2067 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2068 goto discard_and_relse;
2069 }
2070
2071 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2072 goto discard_and_relse;
2073
2074 if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))
2075 goto discard_and_relse;
2076
2077 nf_reset_ct(skb);
2078
2079 if (tcp_filter(sk, skb))
2080 goto discard_and_relse;
2081 th = (const struct tcphdr *)skb->data;
2082 iph = ip_hdr(skb);
2083 tcp_v4_fill_cb(skb, iph, th);
2084
2085 skb->dev = NULL;
2086
2087 if (sk->sk_state == TCP_LISTEN) {
2088 ret = tcp_v4_do_rcv(sk, skb);
2089 goto put_and_return;
2090 }
2091
2092 sk_incoming_cpu_update(sk);
2093
2094 bh_lock_sock_nested(sk);
2095 tcp_segs_in(tcp_sk(sk), skb);
2096 ret = 0;
2097 if (!sock_owned_by_user(sk)) {
2098 skb_to_free = sk->sk_rx_skb_cache;
2099 sk->sk_rx_skb_cache = NULL;
2100 ret = tcp_v4_do_rcv(sk, skb);
2101 } else {
2102 if (tcp_add_backlog(sk, skb))
2103 goto discard_and_relse;
2104 skb_to_free = NULL;
2105 }
2106 bh_unlock_sock(sk);
2107 if (skb_to_free)
2108 __kfree_skb(skb_to_free);
2109
2110put_and_return:
2111 if (refcounted)
2112 sock_put(sk);
2113
2114 return ret;
2115
2116no_tcp_socket:
2117 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2118 goto discard_it;
2119
2120 tcp_v4_fill_cb(skb, iph, th);
2121
2122 if (tcp_checksum_complete(skb)) {
2123csum_error:
2124 trace_tcp_bad_csum(skb);
2125 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2126bad_packet:
2127 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2128 } else {
2129 tcp_v4_send_reset(NULL, skb);
2130 }
2131
2132discard_it:
2133 /* Discard frame. */
2134 kfree_skb(skb);
2135 return 0;
2136
2137discard_and_relse:
2138 sk_drops_add(sk, skb);
2139 if (refcounted)
2140 sock_put(sk);
2141 goto discard_it;
2142
2143do_time_wait:
2144 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2145 inet_twsk_put(inet_twsk(sk));
2146 goto discard_it;
2147 }
2148
2149 tcp_v4_fill_cb(skb, iph, th);
2150
2151 if (tcp_checksum_complete(skb)) {
2152 inet_twsk_put(inet_twsk(sk));
2153 goto csum_error;
2154 }
2155 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
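 /* A SYN hitting a TIME_WAIT socket may legitimately restart the
  * connection: if a live listener exists, recycle the timewait socket
  * and process the SYN against that listener.
  */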
2156 case TCP_TW_SYN: {
2157 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2158 &tcp_hashinfo, skb,
2159 __tcp_hdrlen(th),
2160 iph->saddr, th->source,
2161 iph->daddr, th->dest,
2162 inet_iif(skb),
2163 sdif);
2164 if (sk2) {
2165 inet_twsk_deschedule_put(inet_twsk(sk));
2166 sk = sk2;
2167 tcp_v4_restore_cb(skb);
2168 refcounted = false;
2169 goto process;
2170 }
2171 }
2172 /* to ACK */
2173 fallthrough;
2174 case TCP_TW_ACK:
2175 tcp_v4_timewait_ack(sk, skb);
2176 break;
2177 case TCP_TW_RST:
2178 tcp_v4_send_reset(sk, skb);
2179 inet_twsk_deschedule_put(inet_twsk(sk));
2180 goto discard_it;
2181 case TCP_TW_SUCCESS:;
2182 }
2183 goto discard_it;
2184}
2185
2186static struct timewait_sock_ops tcp_timewait_sock_ops = {
2187 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2188 .twsk_unique = tcp_twsk_unique,
2189 .twsk_destructor= tcp_twsk_destructor,
2190};
2191
2192void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2193{
2194 struct dst_entry *dst = skb_dst(skb);
2195
2196 if (dst && dst_hold_safe(dst)) {
2197 sk->sk_rx_dst = dst;
2198 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2199 }
2200}
2201EXPORT_SYMBOL(inet_sk_rx_dst_set);
2202
2203const struct inet_connection_sock_af_ops ipv4_specific = {
2204 .queue_xmit = ip_queue_xmit,
2205 .send_check = tcp_v4_send_check,
2206 .rebuild_header = inet_sk_rebuild_header,
2207 .sk_rx_dst_set = inet_sk_rx_dst_set,
2208 .conn_request = tcp_v4_conn_request,
2209 .syn_recv_sock = tcp_v4_syn_recv_sock,
2210 .net_header_len = sizeof(struct iphdr),
2211 .setsockopt = ip_setsockopt,
2212 .getsockopt = ip_getsockopt,
2213 .addr2sockaddr = inet_csk_addr2sockaddr,
2214 .sockaddr_len = sizeof(struct sockaddr_in),
2215 .mtu_reduced = tcp_v4_mtu_reduced,
2216};
2217EXPORT_SYMBOL(ipv4_specific);
2218
2219#ifdef CONFIG_TCP_MD5SIG
2220static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2221 .md5_lookup = tcp_v4_md5_lookup,
2222 .calc_md5_hash = tcp_v4_md5_hash_skb,
2223 .md5_parse = tcp_v4_parse_md5_keys,
2224};
2225#endif
2226
2227/* NOTE: A lot of fields are set to zero explicitly by the call to
2228 * sk_alloc(), so they need not be initialized here.
2229 */
2230static int tcp_v4_init_sock(struct sock *sk)
2231{
2232 struct inet_connection_sock *icsk = inet_csk(sk);
2233
2234 tcp_init_sock(sk);
2235
2236 icsk->icsk_af_ops = &ipv4_specific;
2237
2238#ifdef CONFIG_TCP_MD5SIG
2239 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2240#endif
2241
2242 return 0;
2243}
2244
2245void tcp_v4_destroy_sock(struct sock *sk)
2246{
2247 struct tcp_sock *tp = tcp_sk(sk);
2248
2249 trace_tcp_destroy_sock(sk);
2250
2251 tcp_clear_xmit_timers(sk);
2252
2253 tcp_cleanup_congestion_control(sk);
2254
2255 tcp_cleanup_ulp(sk);
2256
2257 /* Clean up the write buffer. */
2258 tcp_write_queue_purge(sk);
2259
2260 /* Check if we want to disable active TFO */
2261 tcp_fastopen_active_disable_ofo_check(sk);
2262
2263 /* Cleans up our, hopefully empty, out_of_order_queue. */
2264 skb_rbtree_purge(&tp->out_of_order_queue);
2265
2266#ifdef CONFIG_TCP_MD5SIG
2267 /* Clean up the MD5 key list, if any */
2268 if (tp->md5sig_info) {
2269 tcp_clear_md5_list(sk);
2270 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2271 tp->md5sig_info = NULL;
2272 }
2273#endif
2274
2275 /* Clean up a referenced TCP bind bucket. */
2276 if (inet_csk(sk)->icsk_bind_hash)
2277 inet_put_port(sk);
2278
2279 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2280
2281 /* If socket is aborted during connect operation */
2282 tcp_free_fastopen_req(tp);
2283 tcp_fastopen_destroy_cipher(sk);
2284 tcp_saved_syn_free(tp);
2285
2286 sk_sockets_allocated_dec(sk);
2287}
2288EXPORT_SYMBOL(tcp_v4_destroy_sock);
2289
2290#ifdef CONFIG_PROC_FS
2291/* Proc filesystem TCP sock list dumping. */
2292
2293/*
2294 * Get the next listener socket after cur. If cur is NULL, get the first socket
2295 * starting from bucket given in st->bucket; when st->bucket is zero the
2296 * very first socket in the hash table is returned.
2297 */
2298static void *listening_get_next(struct seq_file *seq, void *cur)
2299{
2300 struct tcp_seq_afinfo *afinfo;
2301 struct tcp_iter_state *st = seq->private;
2302 struct net *net = seq_file_net(seq);
2303 struct inet_listen_hashbucket *ilb;
2304 struct hlist_nulls_node *node;
2305 struct sock *sk = cur;
2306
2307 if (st->bpf_seq_afinfo)
2308 afinfo = st->bpf_seq_afinfo;
2309 else
2310 afinfo = PDE_DATA(file_inode(seq->file));
2311
2312 if (!sk) {
2313get_head:
2314 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2315 spin_lock(&ilb->lock);
2316 sk = sk_nulls_head(&ilb->nulls_head);
2317 st->offset = 0;
2318 goto get_sk;
2319 }
2320 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2321 ++st->num;
2322 ++st->offset;
2323
2324 sk = sk_nulls_next(sk);
2325get_sk:
2326 sk_nulls_for_each_from(sk, node) {
2327 if (!net_eq(sock_net(sk), net))
2328 continue;
2329 if (afinfo->family == AF_UNSPEC ||
2330 sk->sk_family == afinfo->family)
2331 return sk;
2332 }
2333 spin_unlock(&ilb->lock);
2334 st->offset = 0;
2335 if (++st->bucket < INET_LHTABLE_SIZE)
2336 goto get_head;
2337 return NULL;
2338}
2339
2340static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2341{
2342 struct tcp_iter_state *st = seq->private;
2343 void *rc;
2344
2345 st->bucket = 0;
2346 st->offset = 0;
2347 rc = listening_get_next(seq, NULL);
2348
2349 while (rc && *pos) {
2350 rc = listening_get_next(seq, rc);
2351 --*pos;
2352 }
2353 return rc;
2354}
2355
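/* Lockless check whether the current established-hash bucket is empty. */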
2356static inline bool empty_bucket(const struct tcp_iter_state *st)
2357{
2358 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2359}
2360
2361/*
2362 * Get first established socket starting from bucket given in st->bucket.
2363 * If st->bucket is zero, the very first socket in the hash is returned.
2364 */
2365static void *established_get_first(struct seq_file *seq)
2366{
2367 struct tcp_seq_afinfo *afinfo;
2368 struct tcp_iter_state *st = seq->private;
2369 struct net *net = seq_file_net(seq);
2370 void *rc = NULL;
2371
2372 if (st->bpf_seq_afinfo)
2373 afinfo = st->bpf_seq_afinfo;
2374 else
2375 afinfo = PDE_DATA(file_inode(seq->file));
2376
2377 st->offset = 0;
2378 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2379 struct sock *sk;
2380 struct hlist_nulls_node *node;
2381 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2382
2383 /* Lockless fast path for the common case of empty buckets */
2384 if (empty_bucket(st))
2385 continue;
2386
2387 spin_lock_bh(lock);
2388 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2389 if ((afinfo->family != AF_UNSPEC &&
2390 sk->sk_family != afinfo->family) ||
2391 !net_eq(sock_net(sk), net)) {
2392 continue;
2393 }
2394 rc = sk;
2395 goto out;
2396 }
2397 spin_unlock_bh(lock);
2398 }
2399out:
2400 return rc;
2401}
2402
2403static void *established_get_next(struct seq_file *seq, void *cur)
2404{
2405 struct tcp_seq_afinfo *afinfo;
2406 struct sock *sk = cur;
2407 struct hlist_nulls_node *node;
2408 struct tcp_iter_state *st = seq->private;
2409 struct net *net = seq_file_net(seq);
2410
2411 if (st->bpf_seq_afinfo)
2412 afinfo = st->bpf_seq_afinfo;
2413 else
2414 afinfo = PDE_DATA(file_inode(seq->file));
2415
2416 ++st->num;
2417 ++st->offset;
2418
2419 sk = sk_nulls_next(sk);
2420
2421 sk_nulls_for_each_from(sk, node) {
2422 if ((afinfo->family == AF_UNSPEC ||
2423 sk->sk_family == afinfo->family) &&
2424 net_eq(sock_net(sk), net))
2425 return sk;
2426 }
2427
2428 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2429 ++st->bucket;
2430 return established_get_first(seq);
2431}
2432
2433static void *established_get_idx(struct seq_file *seq, loff_t pos)
2434{
2435 struct tcp_iter_state *st = seq->private;
2436 void *rc;
2437
2438 st->bucket = 0;
2439 rc = established_get_first(seq);
2440
2441 while (rc && pos) {
2442 rc = established_get_next(seq, rc);
2443 --pos;
2444 }
2445 return rc;
2446}
2447
2448static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2449{
2450 void *rc;
2451 struct tcp_iter_state *st = seq->private;
2452
2453 st->state = TCP_SEQ_STATE_LISTENING;
2454 rc = listening_get_idx(seq, &pos);
2455
2456 if (!rc) {
2457 st->state = TCP_SEQ_STATE_ESTABLISHED;
2458 rc = established_get_idx(seq, pos);
2459 }
2460
2461 return rc;
2462}
2463
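/* Try to resume iteration at the bucket/offset recorded by the previous read
 * so sequential reads of the seq_file need not rescan the hash tables from
 * the start.
 */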
2464static void *tcp_seek_last_pos(struct seq_file *seq)
2465{
2466 struct tcp_iter_state *st = seq->private;
2467 int bucket = st->bucket;
2468 int offset = st->offset;
2469 int orig_num = st->num;
2470 void *rc = NULL;
2471
2472 switch (st->state) {
2473 case TCP_SEQ_STATE_LISTENING:
2474 if (st->bucket >= INET_LHTABLE_SIZE)
2475 break;
2476 st->state = TCP_SEQ_STATE_LISTENING;
2477 rc = listening_get_next(seq, NULL);
2478 while (offset-- && rc && bucket == st->bucket)
2479 rc = listening_get_next(seq, rc);
2480 if (rc)
2481 break;
2482 st->bucket = 0;
2483 st->state = TCP_SEQ_STATE_ESTABLISHED;
2484 fallthrough;
2485 case TCP_SEQ_STATE_ESTABLISHED:
2486 if (st->bucket > tcp_hashinfo.ehash_mask)
2487 break;
2488 rc = established_get_first(seq);
2489 while (offset-- && rc && bucket == st->bucket)
2490 rc = established_get_next(seq, rc);
2491 }
2492
2493 st->num = orig_num;
2494
2495 return rc;
2496}
2497
2498void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2499{
2500 struct tcp_iter_state *st = seq->private;
2501 void *rc;
2502
2503 if (*pos && *pos == st->last_pos) {
2504 rc = tcp_seek_last_pos(seq);
2505 if (rc)
2506 goto out;
2507 }
2508
2509 st->state = TCP_SEQ_STATE_LISTENING;
2510 st->num = 0;
2511 st->bucket = 0;
2512 st->offset = 0;
2513 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2514
2515out:
2516 st->last_pos = *pos;
2517 return rc;
2518}
2519EXPORT_SYMBOL(tcp_seq_start);
2520
2521void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2522{
2523 struct tcp_iter_state *st = seq->private;
2524 void *rc = NULL;
2525
2526 if (v == SEQ_START_TOKEN) {
2527 rc = tcp_get_idx(seq, 0);
2528 goto out;
2529 }
2530
2531 switch (st->state) {
2532 case TCP_SEQ_STATE_LISTENING:
2533 rc = listening_get_next(seq, v);
2534 if (!rc) {
2535 st->state = TCP_SEQ_STATE_ESTABLISHED;
2536 st->bucket = 0;
2537 st->offset = 0;
2538 rc = established_get_first(seq);
2539 }
2540 break;
2541 case TCP_SEQ_STATE_ESTABLISHED:
2542 rc = established_get_next(seq, v);
2543 break;
2544 }
2545out:
2546 ++*pos;
2547 st->last_pos = *pos;
2548 return rc;
2549}
2550EXPORT_SYMBOL(tcp_seq_next);
2551
2552void tcp_seq_stop(struct seq_file *seq, void *v)
2553{
2554 struct tcp_iter_state *st = seq->private;
2555
2556 switch (st->state) {
2557 case TCP_SEQ_STATE_LISTENING:
2558 if (v != SEQ_START_TOKEN)
2559 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2560 break;
2561 case TCP_SEQ_STATE_ESTABLISHED:
2562 if (v)
2563 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2564 break;
2565 }
2566}
2567EXPORT_SYMBOL(tcp_seq_stop);
2568
2569static void get_openreq4(const struct request_sock *req,
2570 struct seq_file *f, int i)
2571{
2572 const struct inet_request_sock *ireq = inet_rsk(req);
2573 long delta = req->rsk_timer.expires - jiffies;
2574
2575 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2576 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2577 i,
2578 ireq->ir_loc_addr,
2579 ireq->ir_num,
2580 ireq->ir_rmt_addr,
2581 ntohs(ireq->ir_rmt_port),
2582 TCP_SYN_RECV,
2583 0, 0, /* could print option size, but that is af dependent. */
2584 1, /* timers active (only the expire timer) */
2585 jiffies_delta_to_clock_t(delta),
2586 req->num_timeout,
2587 from_kuid_munged(seq_user_ns(f),
2588 sock_i_uid(req->rsk_listener)),
2589 0, /* non standard timer */
2590 0, /* open_requests have no inode */
2591 0,
2592 req);
2593}
2594
2595static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2596{
2597 int timer_active;
2598 unsigned long timer_expires;
2599 const struct tcp_sock *tp = tcp_sk(sk);
2600 const struct inet_connection_sock *icsk = inet_csk(sk);
2601 const struct inet_sock *inet = inet_sk(sk);
2602 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2603 __be32 dest = inet->inet_daddr;
2604 __be32 src = inet->inet_rcv_saddr;
2605 __u16 destp = ntohs(inet->inet_dport);
2606 __u16 srcp = ntohs(inet->inet_sport);
2607 int rx_queue;
2608 int state;
2609
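 /* Encode the timer state using the /proc/net/tcp convention:
  * 1 = retransmit/loss-probe timer, 4 = zero window probe timer,
  * 2 = keepalive timer, 0 = no timer pending.
  */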
2610 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2611 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2612 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2613 timer_active = 1;
2614 timer_expires = icsk->icsk_timeout;
2615 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2616 timer_active = 4;
2617 timer_expires = icsk->icsk_timeout;
2618 } else if (timer_pending(&sk->sk_timer)) {
2619 timer_active = 2;
2620 timer_expires = sk->sk_timer.expires;
2621 } else {
2622 timer_active = 0;
2623 timer_expires = jiffies;
2624 }
2625
2626 state = inet_sk_state_load(sk);
2627 if (state == TCP_LISTEN)
2628 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2629 else
2630 /* Because we don't lock the socket,
2631 * we might find a transient negative value.
2632 */
2633 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2634 READ_ONCE(tp->copied_seq), 0);
2635
2636 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2637 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2638 i, src, srcp, dest, destp, state,
2639 READ_ONCE(tp->write_seq) - tp->snd_una,
2640 rx_queue,
2641 timer_active,
2642 jiffies_delta_to_clock_t(timer_expires - jiffies),
2643 icsk->icsk_retransmits,
2644 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2645 icsk->icsk_probes_out,
2646 sock_i_ino(sk),
2647 refcount_read(&sk->sk_refcnt), sk,
2648 jiffies_to_clock_t(icsk->icsk_rto),
2649 jiffies_to_clock_t(icsk->icsk_ack.ato),
2650 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2651 tp->snd_cwnd,
2652 state == TCP_LISTEN ?
2653 fastopenq->max_qlen :
2654 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2655}
2656
2657static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2658 struct seq_file *f, int i)
2659{
2660 long delta = tw->tw_timer.expires - jiffies;
2661 __be32 dest, src;
2662 __u16 destp, srcp;
2663
2664 dest = tw->tw_daddr;
2665 src = tw->tw_rcv_saddr;
2666 destp = ntohs(tw->tw_dport);
2667 srcp = ntohs(tw->tw_sport);
2668
2669 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2670 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2671 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2672 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2673 refcount_read(&tw->tw_refcnt), tw);
2674}
2675
2676#define TMPSZ 150
2677
2678static int tcp4_seq_show(struct seq_file *seq, void *v)
2679{
2680 struct tcp_iter_state *st;
2681 struct sock *sk = v;
2682
2683 seq_setwidth(seq, TMPSZ - 1);
2684 if (v == SEQ_START_TOKEN) {
2685 seq_puts(seq, " sl local_address rem_address st tx_queue "
2686 "rx_queue tr tm->when retrnsmt uid timeout "
2687 "inode");
2688 goto out;
2689 }
2690 st = seq->private;
2691
2692 if (sk->sk_state == TCP_TIME_WAIT)
2693 get_timewait4_sock(v, seq, st->num);
2694 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2695 get_openreq4(v, seq, st->num);
2696 else
2697 get_tcp4_sock(v, seq, st->num);
2698out:
2699 seq_pad(seq, '\n');
2700 return 0;
2701}
2702
2703#ifdef CONFIG_BPF_SYSCALL
2704struct bpf_iter__tcp {
2705 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2706 __bpf_md_ptr(struct sock_common *, sk_common);
2707 uid_t uid __aligned(8);
2708};
2709
2710static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2711 struct sock_common *sk_common, uid_t uid)
2712{
2713 struct bpf_iter__tcp ctx;
2714
2715 meta->seq_num--; /* skip SEQ_START_TOKEN */
2716 ctx.meta = meta;
2717 ctx.sk_common = sk_common;
2718 ctx.uid = uid;
2719 return bpf_iter_run_prog(prog, &ctx);
2720}
2721
2722static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2723{
2724 struct bpf_iter_meta meta;
2725 struct bpf_prog *prog;
2726 struct sock *sk = v;
2727 uid_t uid;
2728
2729 if (v == SEQ_START_TOKEN)
2730 return 0;
2731
2732 if (sk->sk_state == TCP_TIME_WAIT) {
2733 uid = 0;
2734 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2735 const struct request_sock *req = v;
2736
2737 uid = from_kuid_munged(seq_user_ns(seq),
2738 sock_i_uid(req->rsk_listener));
2739 } else {
2740 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2741 }
2742
2743 meta.seq = seq;
2744 prog = bpf_iter_get_info(&meta, false);
2745 return tcp_prog_seq_show(prog, &meta, v, uid);
2746}
2747
2748static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2749{
2750 struct bpf_iter_meta meta;
2751 struct bpf_prog *prog;
2752
2753 if (!v) {
2754 meta.seq = seq;
2755 prog = bpf_iter_get_info(&meta, true);
2756 if (prog)
2757 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2758 }
2759
2760 tcp_seq_stop(seq, v);
2761}
2762
2763static const struct seq_operations bpf_iter_tcp_seq_ops = {
2764 .show = bpf_iter_tcp_seq_show,
2765 .start = tcp_seq_start,
2766 .next = tcp_seq_next,
2767 .stop = bpf_iter_tcp_seq_stop,
2768};
2769#endif
2770
2771static const struct seq_operations tcp4_seq_ops = {
2772 .show = tcp4_seq_show,
2773 .start = tcp_seq_start,
2774 .next = tcp_seq_next,
2775 .stop = tcp_seq_stop,
2776};
2777
2778static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2779 .family = AF_INET,
2780};
2781
2782static int __net_init tcp4_proc_init_net(struct net *net)
2783{
2784 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2785 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2786 return -ENOMEM;
2787 return 0;
2788}
2789
2790static void __net_exit tcp4_proc_exit_net(struct net *net)
2791{
2792 remove_proc_entry("tcp", net->proc_net);
2793}
2794
2795static struct pernet_operations tcp4_net_ops = {
2796 .init = tcp4_proc_init_net,
2797 .exit = tcp4_proc_exit_net,
2798};
2799
2800int __init tcp4_proc_init(void)
2801{
2802 return register_pernet_subsys(&tcp4_net_ops);
2803}
2804
2805void tcp4_proc_exit(void)
2806{
2807 unregister_pernet_subsys(&tcp4_net_ops);
2808}
2809#endif /* CONFIG_PROC_FS */
2810
2811/* @wake is one when sk_stream_write_space() calls us.
2812 * In that case EPOLLOUT is sent only if notsent_bytes is below half the limit.
2813 * This mimics the strategy used in sock_def_write_space().
2814 */
2815bool tcp_stream_memory_free(const struct sock *sk, int wake)
2816{
2817 const struct tcp_sock *tp = tcp_sk(sk);
2818 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
2819 READ_ONCE(tp->snd_nxt);
2820
2821 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
2822}
2823EXPORT_SYMBOL(tcp_stream_memory_free);
2824
2825struct proto tcp_prot = {
2826 .name = "TCP",
2827 .owner = THIS_MODULE,
2828 .close = tcp_close,
2829 .pre_connect = tcp_v4_pre_connect,
2830 .connect = tcp_v4_connect,
2831 .disconnect = tcp_disconnect,
2832 .accept = inet_csk_accept,
2833 .ioctl = tcp_ioctl,
2834 .init = tcp_v4_init_sock,
2835 .destroy = tcp_v4_destroy_sock,
2836 .shutdown = tcp_shutdown,
2837 .setsockopt = tcp_setsockopt,
2838 .getsockopt = tcp_getsockopt,
2839 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
2840 .keepalive = tcp_set_keepalive,
2841 .recvmsg = tcp_recvmsg,
2842 .sendmsg = tcp_sendmsg,
2843 .sendpage = tcp_sendpage,
2844 .backlog_rcv = tcp_v4_do_rcv,
2845 .release_cb = tcp_release_cb,
2846 .hash = inet_hash,
2847 .unhash = inet_unhash,
2848 .get_port = inet_csk_get_port,
2849#ifdef CONFIG_BPF_SYSCALL
2850 .psock_update_sk_prot = tcp_bpf_update_proto,
2851#endif
2852 .enter_memory_pressure = tcp_enter_memory_pressure,
2853 .leave_memory_pressure = tcp_leave_memory_pressure,
2854 .stream_memory_free = tcp_stream_memory_free,
2855 .sockets_allocated = &tcp_sockets_allocated,
2856 .orphan_count = &tcp_orphan_count,
2857 .memory_allocated = &tcp_memory_allocated,
2858 .memory_pressure = &tcp_memory_pressure,
2859 .sysctl_mem = sysctl_tcp_mem,
2860 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2861 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2862 .max_header = MAX_TCP_HEADER,
2863 .obj_size = sizeof(struct tcp_sock),
2864 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2865 .twsk_prot = &tcp_timewait_sock_ops,
2866 .rsk_prot = &tcp_request_sock_ops,
2867 .h.hashinfo = &tcp_hashinfo,
2868 .no_autobind = true,
2869 .diag_destroy = tcp_abort,
2870};
2871EXPORT_SYMBOL(tcp_prot);
2872
2873static void __net_exit tcp_sk_exit(struct net *net)
2874{
2875 int cpu;
2876
2877 if (net->ipv4.tcp_congestion_control)
2878 bpf_module_put(net->ipv4.tcp_congestion_control,
2879 net->ipv4.tcp_congestion_control->owner);
2880
2881 for_each_possible_cpu(cpu)
2882 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2883 free_percpu(net->ipv4.tcp_sk);
2884}
2885
2886static int __net_init tcp_sk_init(struct net *net)
2887{
2888 int res, cpu, cnt;
2889
2890 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2891 if (!net->ipv4.tcp_sk)
2892 return -ENOMEM;
2893
2894 for_each_possible_cpu(cpu) {
2895 struct sock *sk;
2896
2897 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2898 IPPROTO_TCP, net);
2899 if (res)
2900 goto fail;
2901 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2902
2903 /* Please enforce IP_DF and IPID==0 for RST and
2904 * ACK sent in SYN-RECV and TIME-WAIT state.
2905 */
2906 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
2907
2908 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2909 }
2910
2911 net->ipv4.sysctl_tcp_ecn = 2;
2912 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2913
2914 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2915 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
2916 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2917 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2918 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
2919
2920 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2921 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2922 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2923
2924 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2925 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2926 net->ipv4.sysctl_tcp_syncookies = 1;
2927 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2928 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2929 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2930 net->ipv4.sysctl_tcp_orphan_retries = 0;
2931 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2932 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
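 /* tcp_tw_reuse == 2 enables TIME_WAIT reuse for loopback traffic only
  * (see tcp_twsk_unique()).
  */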
2933 net->ipv4.sysctl_tcp_tw_reuse = 2;
2934 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
2935
2936 cnt = tcp_hashinfo.ehash_mask + 1;
2937 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
2938 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2939
2940 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
2941 net->ipv4.sysctl_tcp_sack = 1;
2942 net->ipv4.sysctl_tcp_window_scaling = 1;
2943 net->ipv4.sysctl_tcp_timestamps = 1;
2944 net->ipv4.sysctl_tcp_early_retrans = 3;
2945 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2946 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2947 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2948 net->ipv4.sysctl_tcp_max_reordering = 300;
2949 net->ipv4.sysctl_tcp_dsack = 1;
2950 net->ipv4.sysctl_tcp_app_win = 31;
2951 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2952 net->ipv4.sysctl_tcp_frto = 2;
2953 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2954 /* This limits the percentage of the congestion window which we
2955 * will allow a single TSO frame to consume. Building TSO frames
2956 * which are too large can cause TCP streams to be bursty.
2957 */
2958 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2959 /* Default TSQ limit of 16 TSO segments */
2960 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
2961 /* rfc5961 challenge ack rate limiting */
2962 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2963 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2964 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2965 net->ipv4.sysctl_tcp_autocorking = 1;
2966 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2967 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2968 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2969 if (net != &init_net) {
2970 memcpy(net->ipv4.sysctl_tcp_rmem,
2971 init_net.ipv4.sysctl_tcp_rmem,
2972 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2973 memcpy(net->ipv4.sysctl_tcp_wmem,
2974 init_net.ipv4.sysctl_tcp_wmem,
2975 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2976 }
2977 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
2978 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
2979 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
2980 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2981 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2982 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
2983 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2984
2985 /* Reno is always built in */
2986 if (!net_eq(net, &init_net) &&
2987 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
2988 init_net.ipv4.tcp_congestion_control->owner))
2989 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2990 else
2991 net->ipv4.tcp_congestion_control = &tcp_reno;
2992
2993 return 0;
2994fail:
2995 tcp_sk_exit(net);
2996
2997 return res;
2998}
2999
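/* Called once per batch of exiting namespaces: purge TIME_WAIT sockets that
 * belong to the dying namespaces and destroy their TCP Fast Open contexts.
 */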
3000static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3001{
3002 struct net *net;
3003
3004 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3005
3006 list_for_each_entry(net, net_exit_list, exit_list)
3007 tcp_fastopen_ctx_destroy(net);
3008}
3009
3010static struct pernet_operations __net_initdata tcp_sk_ops = {
3011 .init = tcp_sk_init,
3012 .exit = tcp_sk_exit,
3013 .exit_batch = tcp_sk_exit_batch,
3014};
3015
3016#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3017DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3018 struct sock_common *sk_common, uid_t uid)
3019
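/* Use an AF_UNSPEC afinfo so the BPF iterator walks both IPv4 and IPv6 TCP
 * sockets, then initialize the per-netns seq_file state.
 */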
3020static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3021{
3022 struct tcp_iter_state *st = priv_data;
3023 struct tcp_seq_afinfo *afinfo;
3024 int ret;
3025
3026 afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
3027 if (!afinfo)
3028 return -ENOMEM;
3029
3030 afinfo->family = AF_UNSPEC;
3031 st->bpf_seq_afinfo = afinfo;
3032 ret = bpf_iter_init_seq_net(priv_data, aux);
3033 if (ret)
3034 kfree(afinfo);
3035 return ret;
3036}
3037
3038static void bpf_iter_fini_tcp(void *priv_data)
3039{
3040 struct tcp_iter_state *st = priv_data;
3041
3042 kfree(st->bpf_seq_afinfo);
3043 bpf_iter_fini_seq_net(priv_data);
3044}
3045
3046static const struct bpf_iter_seq_info tcp_seq_info = {
3047 .seq_ops = &bpf_iter_tcp_seq_ops,
3048 .init_seq_private = bpf_iter_init_tcp,
3049 .fini_seq_private = bpf_iter_fini_tcp,
3050 .seq_priv_size = sizeof(struct tcp_iter_state),
3051};
3052
3053static struct bpf_iter_reg tcp_reg_info = {
3054 .target = "tcp",
3055 .ctx_arg_info_size = 1,
3056 .ctx_arg_info = {
3057 { offsetof(struct bpf_iter__tcp, sk_common),
3058 PTR_TO_BTF_ID_OR_NULL },
3059 },
3060 .seq_info = &tcp_seq_info,
3061};
3062
3063static void __init bpf_iter_register(void)
3064{
3065 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3066 if (bpf_iter_reg_target(&tcp_reg_info))
3067 pr_warn("Warning: could not register bpf iterator tcp\n");
3068}
3069
3070#endif
3071
3072void __init tcp_v4_init(void)
3073{
3074 if (register_pernet_subsys(&tcp_sk_ops))
3075 panic("Failed to create the TCP control socket.\n");
3076
3077#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3078 bpf_iter_register();
3079#endif
3080}
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * IPv4 specific functions
9 *
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 *
18 * This program is free software; you can redistribute it and/or
19 * modify it under the terms of the GNU General Public License
20 * as published by the Free Software Foundation; either version
21 * 2 of the License, or (at your option) any later version.
22 */
23
24/*
25 * Changes:
26 * David S. Miller : New socket lookup architecture.
27 * This code is dedicated to John Dyson.
28 * David S. Miller : Change semantics of established hash,
29 * half is devoted to TIME_WAIT sockets
30 * and the rest go in the other half.
31 * Andi Kleen : Add support for syncookies and fixed
32 * some bugs: ip options weren't passed to
33 * the TCP layer, missed a check for an
34 * ACK bit.
35 * Andi Kleen : Implemented fast path mtu discovery.
36 * Fixed many serious bugs in the
37 * request_sock handling and moved
38 * most of it into the af independent code.
39 * Added tail drop and some other bugfixes.
40 * Added new listen semantics.
41 * Mike McLagan : Routing by source
42 * Juan Jose Ciarlante: ip_dynaddr bits
43 * Andi Kleen: various fixes.
44 * Vitaly E. Lavrov : Transparent proxy revived after year
45 * coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53#define pr_fmt(fmt) "TCP: " fmt
54
55#include <linux/bottom_half.h>
56#include <linux/types.h>
57#include <linux/fcntl.h>
58#include <linux/module.h>
59#include <linux/random.h>
60#include <linux/cache.h>
61#include <linux/jhash.h>
62#include <linux/init.h>
63#include <linux/times.h>
64#include <linux/slab.h>
65
66#include <net/net_namespace.h>
67#include <net/icmp.h>
68#include <net/inet_hashtables.h>
69#include <net/tcp.h>
70#include <net/transp_v6.h>
71#include <net/ipv6.h>
72#include <net/inet_common.h>
73#include <net/timewait_sock.h>
74#include <net/xfrm.h>
75#include <net/secure_seq.h>
76#include <net/busy_poll.h>
77
78#include <linux/inet.h>
79#include <linux/ipv6.h>
80#include <linux/stddef.h>
81#include <linux/proc_fs.h>
82#include <linux/seq_file.h>
83#include <linux/inetdevice.h>
84
85#include <crypto/hash.h>
86#include <linux/scatterlist.h>
87
88#include <trace/events/tcp.h>
89
90#ifdef CONFIG_TCP_MD5SIG
91static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
92 __be32 daddr, __be32 saddr, const struct tcphdr *th);
93#endif
94
95struct inet_hashinfo tcp_hashinfo;
96EXPORT_SYMBOL(tcp_hashinfo);
97
98static u32 tcp_v4_init_seq(const struct sk_buff *skb)
99{
100 return secure_tcp_seq(ip_hdr(skb)->daddr,
101 ip_hdr(skb)->saddr,
102 tcp_hdr(skb)->dest,
103 tcp_hdr(skb)->source);
104}
105
106static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
107{
108 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
109}
110
111int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
112{
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 /* With PAWS, it is safe from the viewpoint
117 of data integrity. Even without PAWS it is safe provided sequence
118 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
119
120 Actually, the idea is close to VJ's one, only timestamp cache is
121 held not per host, but per port pair and TW bucket is used as state
122 holder.
123
124 If TW bucket has been already destroyed we fall back to VJ's scheme
125 and use initial timestamp retrieved from peer table.
126 */
127 if (tcptw->tw_ts_recent_stamp &&
128 (!twp || (sock_net(sk)->ipv4.sysctl_tcp_tw_reuse &&
129 get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
130 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
131 if (tp->write_seq == 0)
132 tp->write_seq = 1;
133 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
134 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
135 sock_hold(sktw);
136 return 1;
137 }
138
139 return 0;
140}
141EXPORT_SYMBOL_GPL(tcp_twsk_unique);
142
143static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
144 int addr_len)
145{
146 /* This check is replicated from tcp_v4_connect() and intended to
147 * prevent BPF program called below from accessing bytes that are out
148 * of the bound specified by user in addr_len.
149 */
150 if (addr_len < sizeof(struct sockaddr_in))
151 return -EINVAL;
152
153 sock_owned_by_me(sk);
154
155 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
156}
157
158/* This will initiate an outgoing connection. */
159int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
160{
161 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
162 struct inet_sock *inet = inet_sk(sk);
163 struct tcp_sock *tp = tcp_sk(sk);
164 __be16 orig_sport, orig_dport;
165 __be32 daddr, nexthop;
166 struct flowi4 *fl4;
167 struct rtable *rt;
168 int err;
169 struct ip_options_rcu *inet_opt;
170 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
171
172 if (addr_len < sizeof(struct sockaddr_in))
173 return -EINVAL;
174
175 if (usin->sin_family != AF_INET)
176 return -EAFNOSUPPORT;
177
178 nexthop = daddr = usin->sin_addr.s_addr;
179 inet_opt = rcu_dereference_protected(inet->inet_opt,
180 lockdep_sock_is_held(sk));
181 if (inet_opt && inet_opt->opt.srr) {
182 if (!daddr)
183 return -EINVAL;
184 nexthop = inet_opt->opt.faddr;
185 }
186
187 orig_sport = inet->inet_sport;
188 orig_dport = usin->sin_port;
189 fl4 = &inet->cork.fl.u.ip4;
190 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
191 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
192 IPPROTO_TCP,
193 orig_sport, orig_dport, sk);
194 if (IS_ERR(rt)) {
195 err = PTR_ERR(rt);
196 if (err == -ENETUNREACH)
197 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
198 return err;
199 }
200
201 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
202 ip_rt_put(rt);
203 return -ENETUNREACH;
204 }
205
206 if (!inet_opt || !inet_opt->opt.srr)
207 daddr = fl4->daddr;
208
209 if (!inet->inet_saddr)
210 inet->inet_saddr = fl4->saddr;
211 sk_rcv_saddr_set(sk, inet->inet_saddr);
212
213 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
214 /* Reset inherited state */
215 tp->rx_opt.ts_recent = 0;
216 tp->rx_opt.ts_recent_stamp = 0;
217 if (likely(!tp->repair))
218 tp->write_seq = 0;
219 }
220
221 inet->inet_dport = usin->sin_port;
222 sk_daddr_set(sk, daddr);
223
224 inet_csk(sk)->icsk_ext_hdr_len = 0;
225 if (inet_opt)
226 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
227
228 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
229
230 /* Socket identity is still unknown (sport may be zero).
231 * However we set state to SYN-SENT and not releasing socket
232 * lock select source port, enter ourselves into the hash tables and
233 * complete initialization after this.
234 */
235 tcp_set_state(sk, TCP_SYN_SENT);
236 err = inet_hash_connect(tcp_death_row, sk);
237 if (err)
238 goto failure;
239
240 sk_set_txhash(sk);
241
242 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
243 inet->inet_sport, inet->inet_dport, sk);
244 if (IS_ERR(rt)) {
245 err = PTR_ERR(rt);
246 rt = NULL;
247 goto failure;
248 }
249 /* OK, now commit destination to socket. */
250 sk->sk_gso_type = SKB_GSO_TCPV4;
251 sk_setup_caps(sk, &rt->dst);
252 rt = NULL;
253
254 if (likely(!tp->repair)) {
255 if (!tp->write_seq)
256 tp->write_seq = secure_tcp_seq(inet->inet_saddr,
257 inet->inet_daddr,
258 inet->inet_sport,
259 usin->sin_port);
260 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
261 inet->inet_saddr,
262 inet->inet_daddr);
263 }
264
265 inet->inet_id = tp->write_seq ^ jiffies;
266
267 if (tcp_fastopen_defer_connect(sk, &err))
268 return err;
269 if (err)
270 goto failure;
271
272 err = tcp_connect(sk);
273
274 if (err)
275 goto failure;
276
277 return 0;
278
279failure:
280 /*
281 * This unhashes the socket and releases the local port,
282 * if necessary.
283 */
284 tcp_set_state(sk, TCP_CLOSE);
285 ip_rt_put(rt);
286 sk->sk_route_caps = 0;
287 inet->inet_dport = 0;
288 return err;
289}
290EXPORT_SYMBOL(tcp_v4_connect);
291
292/*
293 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
294 * It can be called through tcp_release_cb() if socket was owned by user
295 * at the time tcp_v4_err() was called to handle ICMP message.
296 */
297void tcp_v4_mtu_reduced(struct sock *sk)
298{
299 struct inet_sock *inet = inet_sk(sk);
300 struct dst_entry *dst;
301 u32 mtu;
302
303 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
304 return;
305 mtu = tcp_sk(sk)->mtu_info;
306 dst = inet_csk_update_pmtu(sk, mtu);
307 if (!dst)
308 return;
309
310 /* Something is about to be wrong... Remember soft error
311 * for the case, if this connection will not able to recover.
312 */
313 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
314 sk->sk_err_soft = EMSGSIZE;
315
316 mtu = dst_mtu(dst);
317
318 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
319 ip_sk_accept_pmtu(sk) &&
320 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
321 tcp_sync_mss(sk, mtu);
322
323 /* Resend the TCP packet because it's
324 * clear that the old packet has been
325 * dropped. This is the new "fast" path mtu
326 * discovery.
327 */
328 tcp_simple_retransmit(sk);
329 } /* else let the usual retransmit timer handle it */
330}
331EXPORT_SYMBOL(tcp_v4_mtu_reduced);
332
333static void do_redirect(struct sk_buff *skb, struct sock *sk)
334{
335 struct dst_entry *dst = __sk_dst_check(sk, 0);
336
337 if (dst)
338 dst->ops->redirect(dst, sk, skb);
339}
340
341
342/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
343void tcp_req_err(struct sock *sk, u32 seq, bool abort)
344{
345 struct request_sock *req = inet_reqsk(sk);
346 struct net *net = sock_net(sk);
347
348 /* ICMPs are not backlogged, hence we cannot get
349 * an established socket here.
350 */
351 if (seq != tcp_rsk(req)->snt_isn) {
352 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
353 } else if (abort) {
354 /*
355 * Still in SYN_RECV, just remove it silently.
356 * There is no good way to pass the error to the newly
357 * created socket, and POSIX does not want network
358 * errors returned from accept().
359 */
360 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
361 tcp_listendrop(req->rsk_listener);
362 }
363 reqsk_put(req);
364}
365EXPORT_SYMBOL(tcp_req_err);
366
367/*
368 * This routine is called by the ICMP module when it gets some
369 * sort of error condition. If err < 0 then the socket should
370 * be closed and the error returned to the user. If err > 0
371 * it's just the icmp type << 8 | icmp code. After adjustment
372 * header points to the first 8 bytes of the tcp header. We need
373 * to find the appropriate port.
374 *
375 * The locking strategy used here is very "optimistic". When
376 * someone else accesses the socket the ICMP is just dropped
377 * and for some paths there is no check at all.
378 * A more general error queue to queue errors for later handling
379 * is probably better.
380 *
381 */
382
383void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
384{
385 const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
386 struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
387 struct inet_connection_sock *icsk;
388 struct tcp_sock *tp;
389 struct inet_sock *inet;
390 const int type = icmp_hdr(icmp_skb)->type;
391 const int code = icmp_hdr(icmp_skb)->code;
392 struct sock *sk;
393 struct sk_buff *skb;
394 struct request_sock *fastopen;
395 u32 seq, snd_una;
396 s32 remaining;
397 u32 delta_us;
398 int err;
399 struct net *net = dev_net(icmp_skb->dev);
400
401 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
402 th->dest, iph->saddr, ntohs(th->source),
403 inet_iif(icmp_skb), 0);
404 if (!sk) {
405 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
406 return;
407 }
408 if (sk->sk_state == TCP_TIME_WAIT) {
409 inet_twsk_put(inet_twsk(sk));
410 return;
411 }
412 seq = ntohl(th->seq);
413 if (sk->sk_state == TCP_NEW_SYN_RECV)
414 return tcp_req_err(sk, seq,
415 type == ICMP_PARAMETERPROB ||
416 type == ICMP_TIME_EXCEEDED ||
417 (type == ICMP_DEST_UNREACH &&
418 (code == ICMP_NET_UNREACH ||
419 code == ICMP_HOST_UNREACH)));
420
421 bh_lock_sock(sk);
422 /* If too many ICMPs get dropped on busy
423 * servers this needs to be solved differently.
424 * We do take care of PMTU discovery (RFC1191) special case :
425 * we can receive locally generated ICMP messages while socket is held.
426 */
427 if (sock_owned_by_user(sk)) {
428 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
429 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
430 }
431 if (sk->sk_state == TCP_CLOSE)
432 goto out;
433
434 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
435 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
436 goto out;
437 }
438
439 icsk = inet_csk(sk);
440 tp = tcp_sk(sk);
441 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
442 fastopen = tp->fastopen_rsk;
443 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
444 if (sk->sk_state != TCP_LISTEN &&
445 !between(seq, snd_una, tp->snd_nxt)) {
446 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
447 goto out;
448 }
449
450 switch (type) {
451 case ICMP_REDIRECT:
452 if (!sock_owned_by_user(sk))
453 do_redirect(icmp_skb, sk);
454 goto out;
455 case ICMP_SOURCE_QUENCH:
456 /* Just silently ignore these. */
457 goto out;
458 case ICMP_PARAMETERPROB:
459 err = EPROTO;
460 break;
461 case ICMP_DEST_UNREACH:
462 if (code > NR_ICMP_UNREACH)
463 goto out;
464
465 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
466 /* We are not interested in TCP_LISTEN and open_requests
467 * (SYN-ACKs send out by Linux are always <576bytes so
468 * they should go through unfragmented).
469 */
470 if (sk->sk_state == TCP_LISTEN)
471 goto out;
472
473 tp->mtu_info = info;
474 if (!sock_owned_by_user(sk)) {
475 tcp_v4_mtu_reduced(sk);
476 } else {
477 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
478 sock_hold(sk);
479 }
480 goto out;
481 }
482
483 err = icmp_err_convert[code].errno;
484 /* check if icmp_skb allows revert of backoff
485 * (see draft-zimmermann-tcp-lcd) */
486 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
487 break;
488 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
489 !icsk->icsk_backoff || fastopen)
490 break;
491
492 if (sock_owned_by_user(sk))
493 break;
494
495 icsk->icsk_backoff--;
496 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
497 TCP_TIMEOUT_INIT;
498 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
499
500 skb = tcp_rtx_queue_head(sk);
501 BUG_ON(!skb);
502
503 tcp_mstamp_refresh(tp);
504 delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
505 remaining = icsk->icsk_rto -
506 usecs_to_jiffies(delta_us);
507
508 if (remaining > 0) {
509 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
510 remaining, TCP_RTO_MAX);
511 } else {
512 /* RTO revert clocked out retransmission.
513 * Will retransmit now */
514 tcp_retransmit_timer(sk);
515 }
516
517 break;
518 case ICMP_TIME_EXCEEDED:
519 err = EHOSTUNREACH;
520 break;
521 default:
522 goto out;
523 }
524
525 switch (sk->sk_state) {
526 case TCP_SYN_SENT:
527 case TCP_SYN_RECV:
528 /* Only in fast or simultaneous open. If a fast open socket is
529 * is already accepted it is treated as a connected one below.
530 */
531 if (fastopen && !fastopen->sk)
532 break;
533
534 if (!sock_owned_by_user(sk)) {
535 sk->sk_err = err;
536
537 sk->sk_error_report(sk);
538
539 tcp_done(sk);
540 } else {
541 sk->sk_err_soft = err;
542 }
543 goto out;
544 }
545
546 /* If we've already connected we will keep trying
547 * until we time out, or the user gives up.
548 *
549 * rfc1122 4.2.3.9 allows to consider as hard errors
550 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
551 * but it is obsoleted by pmtu discovery).
552 *
553 * Note, that in modern internet, where routing is unreliable
554 * and in each dark corner broken firewalls sit, sending random
555 * errors ordered by their masters even this two messages finally lose
556 * their original sense (even Linux sends invalid PORT_UNREACHs)
557 *
558 * Now we are in compliance with RFCs.
559 * --ANK (980905)
560 */
561
562 inet = inet_sk(sk);
563 if (!sock_owned_by_user(sk) && inet->recverr) {
564 sk->sk_err = err;
565 sk->sk_error_report(sk);
566 } else { /* Only an error on timeout */
567 sk->sk_err_soft = err;
568 }
569
570out:
571 bh_unlock_sock(sk);
572 sock_put(sk);
573}
574
575void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
576{
577 struct tcphdr *th = tcp_hdr(skb);
578
579 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
580 skb->csum_start = skb_transport_header(skb) - skb->head;
581 skb->csum_offset = offsetof(struct tcphdr, check);
582}
583
584/* This routine computes an IPv4 TCP checksum. */
585void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
586{
587 const struct inet_sock *inet = inet_sk(sk);
588
589 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
590}
591EXPORT_SYMBOL(tcp_v4_send_check);
592
593/*
594 * This routine will send an RST to the other tcp.
595 *
596 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
597 * for reset.
598 * Answer: if a packet caused RST, it is not for a socket
599 * existing in our system, if it is matched to a socket,
600 * it is just duplicate segment or bug in other side's TCP.
601 * So that we build reply only basing on parameters
602 * arrived with segment.
603 * Exception: precedence violation. We do not implement it in any case.
604 */
605
606static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
607{
608 const struct tcphdr *th = tcp_hdr(skb);
609 struct {
610 struct tcphdr th;
611#ifdef CONFIG_TCP_MD5SIG
612 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
613#endif
614 } rep;
615 struct ip_reply_arg arg;
616#ifdef CONFIG_TCP_MD5SIG
617 struct tcp_md5sig_key *key = NULL;
618 const __u8 *hash_location = NULL;
619 unsigned char newhash[16];
620 int genhash;
621 struct sock *sk1 = NULL;
622#endif
623 struct net *net;
624
625 /* Never send a reset in response to a reset. */
626 if (th->rst)
627 return;
628
629 /* If sk not NULL, it means we did a successful lookup and incoming
630 * route had to be correct. prequeue might have dropped our dst.
631 */
632 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
633 return;
634
635 /* Swap the send and the receive. */
636 memset(&rep, 0, sizeof(rep));
637 rep.th.dest = th->source;
638 rep.th.source = th->dest;
639 rep.th.doff = sizeof(struct tcphdr) / 4;
640 rep.th.rst = 1;
641
642 if (th->ack) {
643 rep.th.seq = th->ack_seq;
644 } else {
645 rep.th.ack = 1;
646 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
647 skb->len - (th->doff << 2));
648 }
649
650 memset(&arg, 0, sizeof(arg));
651 arg.iov[0].iov_base = (unsigned char *)&rep;
652 arg.iov[0].iov_len = sizeof(rep.th);
653
654 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
655#ifdef CONFIG_TCP_MD5SIG
656 rcu_read_lock();
657 hash_location = tcp_parse_md5sig_option(th);
658 if (sk && sk_fullsock(sk)) {
659 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
660 &ip_hdr(skb)->saddr, AF_INET);
661 } else if (hash_location) {
662 /*
663 * active side is lost. Try to find listening socket through
664 * source port, and then find md5 key through listening socket.
665 * we are not loose security here:
666 * Incoming packet is checked with md5 hash with finding key,
667 * no RST generated if md5 hash doesn't match.
668 */
669 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
670 ip_hdr(skb)->saddr,
671 th->source, ip_hdr(skb)->daddr,
672 ntohs(th->source), inet_iif(skb),
673 tcp_v4_sdif(skb));
674 /* don't send rst if it can't find key */
675 if (!sk1)
676 goto out;
677
678 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
679 &ip_hdr(skb)->saddr, AF_INET);
680 if (!key)
681 goto out;
682
683
684 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
685 if (genhash || memcmp(hash_location, newhash, 16) != 0)
686 goto out;
687
688 }
689
690 if (key) {
691 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
692 (TCPOPT_NOP << 16) |
693 (TCPOPT_MD5SIG << 8) |
694 TCPOLEN_MD5SIG);
695 /* Update length and the length the header thinks exists */
696 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
697 rep.th.doff = arg.iov[0].iov_len / 4;
698
699 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
700 key, ip_hdr(skb)->saddr,
701 ip_hdr(skb)->daddr, &rep.th);
702 }
703#endif
704 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
705 ip_hdr(skb)->saddr, /* XXX */
706 arg.iov[0].iov_len, IPPROTO_TCP, 0);
707 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
708 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
709
710 /* When socket is gone, all binding information is lost.
711 * routing might fail in this case. No choice here, if we choose to force
712 * input interface, we will misroute in case of asymmetric route.
713 */
714 if (sk) {
715 arg.bound_dev_if = sk->sk_bound_dev_if;
716 if (sk_fullsock(sk))
717 trace_tcp_send_reset(sk, skb);
718 }
719
720 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
721 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
722
723 arg.tos = ip_hdr(skb)->tos;
724 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
725 local_bh_disable();
726 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
727 skb, &TCP_SKB_CB(skb)->header.h4.opt,
728 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
729 &arg, arg.iov[0].iov_len);
730
731 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
732 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
733 local_bh_enable();
734
735#ifdef CONFIG_TCP_MD5SIG
736out:
737 rcu_read_unlock();
738#endif
739}
740
741/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
742 outside socket context is ugly, certainly. What can I do?
743 */
744
745static void tcp_v4_send_ack(const struct sock *sk,
746 struct sk_buff *skb, u32 seq, u32 ack,
747 u32 win, u32 tsval, u32 tsecr, int oif,
748 struct tcp_md5sig_key *key,
749 int reply_flags, u8 tos)
750{
751 const struct tcphdr *th = tcp_hdr(skb);
752 struct {
753 struct tcphdr th;
754 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
755#ifdef CONFIG_TCP_MD5SIG
756 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
757#endif
758 ];
759 } rep;
760 struct net *net = sock_net(sk);
761 struct ip_reply_arg arg;
762
763 memset(&rep.th, 0, sizeof(struct tcphdr));
764 memset(&arg, 0, sizeof(arg));
765
766 arg.iov[0].iov_base = (unsigned char *)&rep;
767 arg.iov[0].iov_len = sizeof(rep.th);
768 if (tsecr) {
769 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
770 (TCPOPT_TIMESTAMP << 8) |
771 TCPOLEN_TIMESTAMP);
772 rep.opt[1] = htonl(tsval);
773 rep.opt[2] = htonl(tsecr);
774 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
775 }
776
777 /* Swap the send and the receive. */
778 rep.th.dest = th->source;
779 rep.th.source = th->dest;
780 rep.th.doff = arg.iov[0].iov_len / 4;
781 rep.th.seq = htonl(seq);
782 rep.th.ack_seq = htonl(ack);
783 rep.th.ack = 1;
784 rep.th.window = htons(win);
785
786#ifdef CONFIG_TCP_MD5SIG
787 if (key) {
788 int offset = (tsecr) ? 3 : 0;
789
790 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
791 (TCPOPT_NOP << 16) |
792 (TCPOPT_MD5SIG << 8) |
793 TCPOLEN_MD5SIG);
794 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
795 rep.th.doff = arg.iov[0].iov_len/4;
796
797 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
798 key, ip_hdr(skb)->saddr,
799 ip_hdr(skb)->daddr, &rep.th);
800 }
801#endif
802 arg.flags = reply_flags;
803 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804 ip_hdr(skb)->saddr, /* XXX */
805 arg.iov[0].iov_len, IPPROTO_TCP, 0);
806 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807 if (oif)
808 arg.bound_dev_if = oif;
809 arg.tos = tos;
810 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
811 local_bh_disable();
812 ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
813 skb, &TCP_SKB_CB(skb)->header.h4.opt,
814 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
815 &arg, arg.iov[0].iov_len);
816
817 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
818 local_bh_enable();
819}
820
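/* ACK on behalf of a TIME-WAIT socket, using the sequence and timestamp
 * state preserved in tcp_timewait_sock; releases the timewait socket
 * reference with inet_twsk_put().
 */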
821static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
822{
823 struct inet_timewait_sock *tw = inet_twsk(sk);
824 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
825
826 tcp_v4_send_ack(sk, skb,
827 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
828 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
829 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
830 tcptw->tw_ts_recent,
831 tw->tw_bound_dev_if,
832 tcp_twsk_md5_key(tcptw),
833 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
834 tw->tw_tos
835 );
836
837 inet_twsk_put(tw);
838}
839
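/* ACK on behalf of a connection still represented by a request_sock;
 * the sequence number used depends on whether this is a regular SYN-RECV
 * request or a Fast Open child (see the comment below).
 */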
840static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
841 struct request_sock *req)
842{
843 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
844 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
845 */
846 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
847 tcp_sk(sk)->snd_nxt;
848
849 /* RFC 7323 2.3
850 * The window field (SEG.WND) of every outgoing segment, with the
851 * exception of <SYN> segments, MUST be right-shifted by
852 * Rcv.Wind.Shift bits:
853 */
854 tcp_v4_send_ack(sk, skb, seq,
855 tcp_rsk(req)->rcv_nxt,
856 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
857 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
858 req->ts_recent,
859 0,
860 tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
861 AF_INET),
862 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
863 ip_hdr(skb)->tos);
864}
865
866/*
867 * Send a SYN-ACK after having received a SYN.
868 * This still operates on a request_sock only, not on a big
869 * socket.
870 */
871static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
872 struct flowi *fl,
873 struct request_sock *req,
874 struct tcp_fastopen_cookie *foc,
875 enum tcp_synack_type synack_type)
876{
877 const struct inet_request_sock *ireq = inet_rsk(req);
878 struct flowi4 fl4;
879 int err = -1;
880 struct sk_buff *skb;
881
882 /* First, grab a route. */
883 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
884 return -1;
885
886 skb = tcp_make_synack(sk, dst, req, foc, synack_type);
887
888 if (skb) {
889 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
890
891 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
892 ireq->ir_rmt_addr,
893 ireq_opt_deref(ireq));
894 err = net_xmit_eval(err);
895 }
896
897 return err;
898}
899
900/*
901 * IPv4 request_sock destructor.
902 */
903static void tcp_v4_reqsk_destructor(struct request_sock *req)
904{
905 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
906}
907
908#ifdef CONFIG_TCP_MD5SIG
909/*
910 * RFC2385 MD5 checksumming requires a mapping of
911 * IP address->MD5 Key.
912 * We need to maintain these in the sk structure.
913 */
914
915/* Find the Key structure for an address. */
916struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
917 const union tcp_md5_addr *addr,
918 int family)
919{
920 const struct tcp_sock *tp = tcp_sk(sk);
921 struct tcp_md5sig_key *key;
922 const struct tcp_md5sig_info *md5sig;
923 __be32 mask;
924 struct tcp_md5sig_key *best_match = NULL;
925 bool match;
926
927 /* caller either holds rcu_read_lock() or socket lock */
928 md5sig = rcu_dereference_check(tp->md5sig_info,
929 lockdep_sock_is_held(sk));
930 if (!md5sig)
931 return NULL;
932
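/* Walk all configured keys and keep the most specific match,
 * i.e. the matching key with the longest prefix length.
 */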
933 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
934 if (key->family != family)
935 continue;
936
937 if (family == AF_INET) {
938 mask = inet_make_mask(key->prefixlen);
939 match = (key->addr.a4.s_addr & mask) ==
940 (addr->a4.s_addr & mask);
941#if IS_ENABLED(CONFIG_IPV6)
942 } else if (family == AF_INET6) {
943 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
944 key->prefixlen);
945#endif
946 } else {
947 match = false;
948 }
949
950 if (match && (!best_match ||
951 key->prefixlen > best_match->prefixlen))
952 best_match = key;
953 }
954 return best_match;
955}
956EXPORT_SYMBOL(tcp_md5_do_lookup);
957
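/* Like tcp_md5_do_lookup(), but require an exact address and prefix length
 * match; used when adding or deleting a key.
 */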
958static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
959 const union tcp_md5_addr *addr,
960 int family, u8 prefixlen)
961{
962 const struct tcp_sock *tp = tcp_sk(sk);
963 struct tcp_md5sig_key *key;
964 unsigned int size = sizeof(struct in_addr);
965 const struct tcp_md5sig_info *md5sig;
966
967 /* caller either holds rcu_read_lock() or socket lock */
968 md5sig = rcu_dereference_check(tp->md5sig_info,
969 lockdep_sock_is_held(sk));
970 if (!md5sig)
971 return NULL;
972#if IS_ENABLED(CONFIG_IPV6)
973 if (family == AF_INET6)
974 size = sizeof(struct in6_addr);
975#endif
976 hlist_for_each_entry_rcu(key, &md5sig->head, node) {
977 if (key->family != family)
978 continue;
979 if (!memcmp(&key->addr, addr, size) &&
980 key->prefixlen == prefixlen)
981 return key;
982 }
983 return NULL;
984}
985
986struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
987 const struct sock *addr_sk)
988{
989 const union tcp_md5_addr *addr;
990
991 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
992 return tcp_md5_do_lookup(sk, addr, AF_INET);
993}
994EXPORT_SYMBOL(tcp_v4_md5_lookup);
995
996/* This can be called on a newly created socket, from other files */
997int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
998 int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
999 gfp_t gfp)
1000{
1001 /* Add Key to the list */
1002 struct tcp_md5sig_key *key;
1003 struct tcp_sock *tp = tcp_sk(sk);
1004 struct tcp_md5sig_info *md5sig;
1005
1006 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1007 if (key) {
1008 /* Pre-existing entry - just update that one. */
1009 memcpy(key->key, newkey, newkeylen);
1010 key->keylen = newkeylen;
1011 return 0;
1012 }
1013
1014 md5sig = rcu_dereference_protected(tp->md5sig_info,
1015 lockdep_sock_is_held(sk));
1016 if (!md5sig) {
1017 md5sig = kmalloc(sizeof(*md5sig), gfp);
1018 if (!md5sig)
1019 return -ENOMEM;
1020
1021 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1022 INIT_HLIST_HEAD(&md5sig->head);
1023 rcu_assign_pointer(tp->md5sig_info, md5sig);
1024 }
1025
1026 key = sock_kmalloc(sk, sizeof(*key), gfp);
1027 if (!key)
1028 return -ENOMEM;
1029 if (!tcp_alloc_md5sig_pool()) {
1030 sock_kfree_s(sk, key, sizeof(*key));
1031 return -ENOMEM;
1032 }
1033
1034 memcpy(key->key, newkey, newkeylen);
1035 key->keylen = newkeylen;
1036 key->family = family;
1037 key->prefixlen = prefixlen;
1038 memcpy(&key->addr, addr,
1039 (family == AF_INET6) ? sizeof(struct in6_addr) :
1040 sizeof(struct in_addr));
1041 hlist_add_head_rcu(&key->node, &md5sig->head);
1042 return 0;
1043}
1044EXPORT_SYMBOL(tcp_md5_do_add);
1045
1046int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1047 u8 prefixlen)
1048{
1049 struct tcp_md5sig_key *key;
1050
1051 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
1052 if (!key)
1053 return -ENOENT;
1054 hlist_del_rcu(&key->node);
1055 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1056 kfree_rcu(key, rcu);
1057 return 0;
1058}
1059EXPORT_SYMBOL(tcp_md5_do_del);
1060
1061static void tcp_clear_md5_list(struct sock *sk)
1062{
1063 struct tcp_sock *tp = tcp_sk(sk);
1064 struct tcp_md5sig_key *key;
1065 struct hlist_node *n;
1066 struct tcp_md5sig_info *md5sig;
1067
1068 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1069
1070 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1071 hlist_del_rcu(&key->node);
1072 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1073 kfree_rcu(key, rcu);
1074 }
1075}
1076
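/* setsockopt(TCP_MD5SIG/TCP_MD5SIG_EXT) handler: copy struct tcp_md5sig
 * from userspace and add, update or delete the key for the given IPv4
 * address and prefix length.
 */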
1077static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1078 char __user *optval, int optlen)
1079{
1080 struct tcp_md5sig cmd;
1081 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1082 u8 prefixlen = 32;
1083
1084 if (optlen < sizeof(cmd))
1085 return -EINVAL;
1086
1087 if (copy_from_user(&cmd, optval, sizeof(cmd)))
1088 return -EFAULT;
1089
1090 if (sin->sin_family != AF_INET)
1091 return -EINVAL;
1092
1093 if (optname == TCP_MD5SIG_EXT &&
1094 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1095 prefixlen = cmd.tcpm_prefixlen;
1096 if (prefixlen > 32)
1097 return -EINVAL;
1098 }
1099
1100 if (!cmd.tcpm_keylen)
1101 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1102 AF_INET, prefixlen);
1103
1104 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1105 return -EINVAL;
1106
1107 return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1108 AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
1109 GFP_KERNEL);
1110}
1111
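/* Feed the IPv4 pseudo-header and the fixed TCP header (with its checksum
 * field zeroed) into the MD5 hash; TCP options are not included.
 */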
1112static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1113 __be32 daddr, __be32 saddr,
1114 const struct tcphdr *th, int nbytes)
1115{
1116 struct tcp4_pseudohdr *bp;
1117 struct scatterlist sg;
1118 struct tcphdr *_th;
1119
1120 bp = hp->scratch;
1121 bp->saddr = saddr;
1122 bp->daddr = daddr;
1123 bp->pad = 0;
1124 bp->protocol = IPPROTO_TCP;
1125 bp->len = cpu_to_be16(nbytes);
1126
1127 _th = (struct tcphdr *)(bp + 1);
1128 memcpy(_th, th, sizeof(*th));
1129 _th->check = 0;
1130
1131 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1132 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1133 sizeof(*bp) + sizeof(*th));
1134 return crypto_ahash_update(hp->md5_req);
1135}
1136
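/* Compute the RFC 2385 signature over headers and key only; used for
 * replies (RST/ACK) that are built without a payload.
 */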
1137static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1138 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1139{
1140 struct tcp_md5sig_pool *hp;
1141 struct ahash_request *req;
1142
1143 hp = tcp_get_md5sig_pool();
1144 if (!hp)
1145 goto clear_hash_noput;
1146 req = hp->md5_req;
1147
1148 if (crypto_ahash_init(req))
1149 goto clear_hash;
1150 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1151 goto clear_hash;
1152 if (tcp_md5_hash_key(hp, key))
1153 goto clear_hash;
1154 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1155 if (crypto_ahash_final(req))
1156 goto clear_hash;
1157
1158 tcp_put_md5sig_pool();
1159 return 0;
1160
1161clear_hash:
1162 tcp_put_md5sig_pool();
1163clear_hash_noput:
1164 memset(md5_hash, 0, 16);
1165 return 1;
1166}
1167
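/* Compute the RFC 2385 signature over the pseudo-header, the TCP header,
 * the segment payload and the key.
 */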
1168int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1169 const struct sock *sk,
1170 const struct sk_buff *skb)
1171{
1172 struct tcp_md5sig_pool *hp;
1173 struct ahash_request *req;
1174 const struct tcphdr *th = tcp_hdr(skb);
1175 __be32 saddr, daddr;
1176
1177 if (sk) { /* valid for establish/request sockets */
1178 saddr = sk->sk_rcv_saddr;
1179 daddr = sk->sk_daddr;
1180 } else {
1181 const struct iphdr *iph = ip_hdr(skb);
1182 saddr = iph->saddr;
1183 daddr = iph->daddr;
1184 }
1185
1186 hp = tcp_get_md5sig_pool();
1187 if (!hp)
1188 goto clear_hash_noput;
1189 req = hp->md5_req;
1190
1191 if (crypto_ahash_init(req))
1192 goto clear_hash;
1193
1194 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1195 goto clear_hash;
1196 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1197 goto clear_hash;
1198 if (tcp_md5_hash_key(hp, key))
1199 goto clear_hash;
1200 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1201 if (crypto_ahash_final(req))
1202 goto clear_hash;
1203
1204 tcp_put_md5sig_pool();
1205 return 0;
1206
1207clear_hash:
1208 tcp_put_md5sig_pool();
1209clear_hash_noput:
1210 memset(md5_hash, 0, 16);
1211 return 1;
1212}
1213EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1214
1215#endif
1216
1217/* Called with rcu_read_lock() */
1218static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
1219 const struct sk_buff *skb)
1220{
1221#ifdef CONFIG_TCP_MD5SIG
1222 /*
1223 * This gets called for each TCP segment that arrives
1224 * so we want to be efficient.
1225 * We have 3 drop cases:
1226 * o No MD5 hash and one expected.
1227 * o MD5 hash and we're not expecting one.
1228 * o MD5 hash and it's wrong.
1229 */
1230 const __u8 *hash_location = NULL;
1231 struct tcp_md5sig_key *hash_expected;
1232 const struct iphdr *iph = ip_hdr(skb);
1233 const struct tcphdr *th = tcp_hdr(skb);
1234 int genhash;
1235 unsigned char newhash[16];
1236
1237 hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1238 AF_INET);
1239 hash_location = tcp_parse_md5sig_option(th);
1240
1241 /* We've parsed the options - do we have a hash? */
1242 if (!hash_expected && !hash_location)
1243 return false;
1244
1245 if (hash_expected && !hash_location) {
1246 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1247 return true;
1248 }
1249
1250 if (!hash_expected && hash_location) {
1251 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1252 return true;
1253 }
1254
1255 /* Okay, we have both hash_expected and hash_location -
1256 * so we need to calculate the MD5 hash and compare.
1257 */
1258 genhash = tcp_v4_md5_hash_skb(newhash,
1259 hash_expected,
1260 NULL, skb);
1261
1262 if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1263 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
1264 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1265 &iph->saddr, ntohs(th->source),
1266 &iph->daddr, ntohs(th->dest),
1267 genhash ? " tcp_v4_calc_md5_hash failed"
1268 : "");
1269 return true;
1270 }
1271 return false;
1272#endif
1273 return false;
1274}
1275
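/* Fill in the IPv4-specific part of a new request_sock from the incoming
 * SYN: record the local and remote addresses and save any IP options.
 */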
1276static void tcp_v4_init_req(struct request_sock *req,
1277 const struct sock *sk_listener,
1278 struct sk_buff *skb)
1279{
1280 struct inet_request_sock *ireq = inet_rsk(req);
1281 struct net *net = sock_net(sk_listener);
1282
1283 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1284 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1285 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1286}
1287
1288static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1289 struct flowi *fl,
1290 const struct request_sock *req)
1291{
1292 return inet_csk_route_req(sk, &fl->u.ip4, req);
1293}
1294
1295struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1296 .family = PF_INET,
1297 .obj_size = sizeof(struct tcp_request_sock),
1298 .rtx_syn_ack = tcp_rtx_synack,
1299 .send_ack = tcp_v4_reqsk_send_ack,
1300 .destructor = tcp_v4_reqsk_destructor,
1301 .send_reset = tcp_v4_send_reset,
1302 .syn_ack_timeout = tcp_syn_ack_timeout,
1303};
1304
1305static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1306 .mss_clamp = TCP_MSS_DEFAULT,
1307#ifdef CONFIG_TCP_MD5SIG
1308 .req_md5_lookup = tcp_v4_md5_lookup,
1309 .calc_md5_hash = tcp_v4_md5_hash_skb,
1310#endif
1311 .init_req = tcp_v4_init_req,
1312#ifdef CONFIG_SYN_COOKIES
1313 .cookie_init_seq = cookie_v4_init_sequence,
1314#endif
1315 .route_req = tcp_v4_route_req,
1316 .init_seq = tcp_v4_init_seq,
1317 .init_ts_off = tcp_v4_init_ts_off,
1318 .send_synack = tcp_v4_send_synack,
1319};
1320
1321int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1322{
1323 /* Never answer SYNs sent to broadcast or multicast addresses */
1324 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1325 goto drop;
1326
1327 return tcp_conn_request(&tcp_request_sock_ops,
1328 &tcp_request_sock_ipv4_ops, sk, skb);
1329
1330drop:
1331 tcp_listendrop(sk);
1332 return 0;
1333}
1334EXPORT_SYMBOL(tcp_v4_conn_request);
1335
1336
1337/*
1338 * The three-way handshake has completed - we got a valid final ACK -
1339 * now create the new socket.
1340 */
1341struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1342 struct request_sock *req,
1343 struct dst_entry *dst,
1344 struct request_sock *req_unhash,
1345 bool *own_req)
1346{
1347 struct inet_request_sock *ireq;
1348 struct inet_sock *newinet;
1349 struct tcp_sock *newtp;
1350 struct sock *newsk;
1351#ifdef CONFIG_TCP_MD5SIG
1352 struct tcp_md5sig_key *key;
1353#endif
1354 struct ip_options_rcu *inet_opt;
1355
1356 if (sk_acceptq_is_full(sk))
1357 goto exit_overflow;
1358
1359 newsk = tcp_create_openreq_child(sk, req, skb);
1360 if (!newsk)
1361 goto exit_nonewsk;
1362
1363 newsk->sk_gso_type = SKB_GSO_TCPV4;
1364 inet_sk_rx_dst_set(newsk, skb);
1365
1366 newtp = tcp_sk(newsk);
1367 newinet = inet_sk(newsk);
1368 ireq = inet_rsk(req);
1369 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1370 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1371 newsk->sk_bound_dev_if = ireq->ir_iif;
1372 newinet->inet_saddr = ireq->ir_loc_addr;
1373 inet_opt = rcu_dereference(ireq->ireq_opt);
1374 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1375 newinet->mc_index = inet_iif(skb);
1376 newinet->mc_ttl = ip_hdr(skb)->ttl;
1377 newinet->rcv_tos = ip_hdr(skb)->tos;
1378 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1379 if (inet_opt)
1380 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1381 newinet->inet_id = newtp->write_seq ^ jiffies;
1382
1383 if (!dst) {
1384 dst = inet_csk_route_child_sock(sk, newsk, req);
1385 if (!dst)
1386 goto put_and_exit;
1387 } else {
1388 /* syncookie case : see end of cookie_v4_check() */
1389 }
1390 sk_setup_caps(newsk, dst);
1391
1392 tcp_ca_openreq_child(newsk, dst);
1393
1394 tcp_sync_mss(newsk, dst_mtu(dst));
1395 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1396
1397 tcp_initialize_rcv_mss(newsk);
1398
1399#ifdef CONFIG_TCP_MD5SIG
1400 /* Copy over the MD5 key from the original socket */
1401 key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1402 AF_INET);
1403 if (key) {
1404 /*
1405 * We're using one, so create a matching key
1406 * on the newsk structure. If we fail to get
1407 * memory, then we end up not copying the key
1408 * across. Shucks.
1409 */
1410 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1411 AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
1412 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1413 }
1414#endif
1415
1416 if (__inet_inherit_port(sk, newsk) < 0)
1417 goto put_and_exit;
1418 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
1419 if (likely(*own_req)) {
1420 tcp_move_syn(newtp, req);
1421 ireq->ireq_opt = NULL;
1422 } else {
1423 newinet->inet_opt = NULL;
1424 }
1425 return newsk;
1426
1427exit_overflow:
1428 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1429exit_nonewsk:
1430 dst_release(dst);
1431exit:
1432 tcp_listendrop(sk);
1433 return NULL;
1434put_and_exit:
1435 newinet->inet_opt = NULL;
1436 inet_csk_prepare_forced_close(newsk);
1437 tcp_done(newsk);
1438 goto exit;
1439}
1440EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1441
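/* On a listener, a non-SYN segment may be the ACK that completes a
 * syncookie handshake; let cookie_v4_check() validate it and, if valid,
 * create the child socket.
 */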
1442static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1443{
1444#ifdef CONFIG_SYN_COOKIES
1445 const struct tcphdr *th = tcp_hdr(skb);
1446
1447 if (!th->syn)
1448 sk = cookie_v4_check(sk, skb);
1449#endif
1450 return sk;
1451}
1452
1453 /* The socket must have its spinlock held when we get
1454 * here, unless it is a TCP_LISTEN socket.
1455 *
1456 * We have a potential double-lock case here, so even when
1457 * doing backlog processing we use the BH locking scheme.
1458 * This is because we cannot sleep with the original spinlock
1459 * held.
1460 */
1461int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1462{
1463 struct sock *rsk;
1464
1465 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1466 struct dst_entry *dst = sk->sk_rx_dst;
1467
1468 sock_rps_save_rxhash(sk, skb);
1469 sk_mark_napi_id(sk, skb);
1470 if (dst) {
1471 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1472 !dst->ops->check(dst, 0)) {
1473 dst_release(dst);
1474 sk->sk_rx_dst = NULL;
1475 }
1476 }
1477 tcp_rcv_established(sk, skb, tcp_hdr(skb));
1478 return 0;
1479 }
1480
1481 if (tcp_checksum_complete(skb))
1482 goto csum_err;
1483
1484 if (sk->sk_state == TCP_LISTEN) {
1485 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1486
1487 if (!nsk)
1488 goto discard;
1489 if (nsk != sk) {
1490 if (tcp_child_process(sk, nsk, skb)) {
1491 rsk = nsk;
1492 goto reset;
1493 }
1494 return 0;
1495 }
1496 } else
1497 sock_rps_save_rxhash(sk, skb);
1498
1499 if (tcp_rcv_state_process(sk, skb)) {
1500 rsk = sk;
1501 goto reset;
1502 }
1503 return 0;
1504
1505reset:
1506 tcp_v4_send_reset(rsk, skb);
1507discard:
1508 kfree_skb(skb);
1509 /* Be careful here. If this function gets more complicated and
1510 * gcc suffers from register pressure on the x86, sk (in %ebx)
1511 * might be destroyed here. This current version compiles correctly,
1512 * but you have been warned.
1513 */
1514 return 0;
1515
1516csum_err:
1517 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1518 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1519 goto discard;
1520}
1521EXPORT_SYMBOL(tcp_v4_do_rcv);
1522
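/* Early demux: look up an established socket before the routing decision
 * so its cached rx dst can be reused; on a hit, attach the socket (and
 * possibly the dst) to the skb.
 */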
1523int tcp_v4_early_demux(struct sk_buff *skb)
1524{
1525 const struct iphdr *iph;
1526 const struct tcphdr *th;
1527 struct sock *sk;
1528
1529 if (skb->pkt_type != PACKET_HOST)
1530 return 0;
1531
1532 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1533 return 0;
1534
1535 iph = ip_hdr(skb);
1536 th = tcp_hdr(skb);
1537
1538 if (th->doff < sizeof(struct tcphdr) / 4)
1539 return 0;
1540
1541 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1542 iph->saddr, th->source,
1543 iph->daddr, ntohs(th->dest),
1544 skb->skb_iif, inet_sdif(skb));
1545 if (sk) {
1546 skb->sk = sk;
1547 skb->destructor = sock_edemux;
1548 if (sk_fullsock(sk)) {
1549 struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
1550
1551 if (dst)
1552 dst = dst_check(dst, 0);
1553 if (dst &&
1554 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1555 skb_dst_set_noref(skb, dst);
1556 }
1557 }
1558 return 0;
1559}
1560
1561bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
1562{
1563 u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
1564
1565 /* Only the socket owner can try to collapse/prune receive queues
1566 * to reduce memory overhead, so add a little headroom here.
1567 * Only a few socket backlogs are likely to be non-empty at the same time.
1568 */
1569 limit += 64*1024;
1570
1571 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1572 * we can fix skb->truesize to its real value to avoid future drops.
1573 * This is valid because skb is not yet charged to the socket.
1574 * It has been noticed that pure SACK packets were sometimes dropped
1575 * (when built by drivers without a copybreak feature).
1576 */
1577 skb_condense(skb);
1578
1579 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1580 bh_unlock_sock(sk);
1581 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1582 return true;
1583 }
1584 return false;
1585}
1586EXPORT_SYMBOL(tcp_add_backlog);
1587
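/* Run the socket filter; if it trimmed bytes off the segment, adjust
 * end_seq so TCP sequence accounting stays consistent.
 */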
1588int tcp_filter(struct sock *sk, struct sk_buff *skb)
1589{
1590 struct tcphdr *th = (struct tcphdr *)skb->data;
1591 unsigned int eaten = skb->len;
1592 int err;
1593
1594 err = sk_filter_trim_cap(sk, skb, th->doff * 4);
1595 if (!err) {
1596 eaten -= skb->len;
1597 TCP_SKB_CB(skb)->end_seq -= eaten;
1598 }
1599 return err;
1600}
1601EXPORT_SYMBOL(tcp_filter);
1602
1603static void tcp_v4_restore_cb(struct sk_buff *skb)
1604{
1605 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1606 sizeof(struct inet_skb_parm));
1607}
1608
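/* Set up TCP_SKB_CB() before the segment is queued to TCP: stash the IP
 * control block inside it and parse sequence numbers, flags and the DS
 * field from the headers.
 */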
1609static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1610 const struct tcphdr *th)
1611{
1612 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1613 * barrier() makes sure the compiler won't play fool^Waliasing games.
1614 */
1615 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1616 sizeof(struct inet_skb_parm));
1617 barrier();
1618
1619 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1620 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1621 skb->len - th->doff * 4);
1622 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1623 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1624 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1625 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1626 TCP_SKB_CB(skb)->sacked = 0;
1627 TCP_SKB_CB(skb)->has_rxtstamp =
1628 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1629}
1630
1631/*
1632 * From tcp_input.c
1633 */
1634
1635int tcp_v4_rcv(struct sk_buff *skb)
1636{
1637 struct net *net = dev_net(skb->dev);
1638 int sdif = inet_sdif(skb);
1639 const struct iphdr *iph;
1640 const struct tcphdr *th;
1641 bool refcounted;
1642 struct sock *sk;
1643 int ret;
1644
1645 if (skb->pkt_type != PACKET_HOST)
1646 goto discard_it;
1647
1648 /* Count it even if it's bad */
1649 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1650
1651 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1652 goto discard_it;
1653
1654 th = (const struct tcphdr *)skb->data;
1655
1656 if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
1657 goto bad_packet;
1658 if (!pskb_may_pull(skb, th->doff * 4))
1659 goto discard_it;
1660
1661 /* An explanation is required here, I think.
1662 * Packet length and doff are validated by header prediction,
1663 * provided the case of th->doff == 0 is eliminated.
1664 * So, we defer the checks. */
1665
1666 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1667 goto csum_error;
1668
1669 th = (const struct tcphdr *)skb->data;
1670 iph = ip_hdr(skb);
1671lookup:
1672 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1673 th->dest, sdif, &refcounted);
1674 if (!sk)
1675 goto no_tcp_socket;
1676
1677process:
1678 if (sk->sk_state == TCP_TIME_WAIT)
1679 goto do_time_wait;
1680
1681 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1682 struct request_sock *req = inet_reqsk(sk);
1683 bool req_stolen = false;
1684 struct sock *nsk;
1685
1686 sk = req->rsk_listener;
1687 if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
1688 sk_drops_add(sk, skb);
1689 reqsk_put(req);
1690 goto discard_it;
1691 }
1692 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1693 inet_csk_reqsk_queue_drop_and_put(sk, req);
1694 goto lookup;
1695 }
1696 /* We own a reference on the listener, increase it again
1697 * as we might lose it too soon.
1698 */
1699 sock_hold(sk);
1700 refcounted = true;
1701 nsk = NULL;
1702 if (!tcp_filter(sk, skb)) {
1703 th = (const struct tcphdr *)skb->data;
1704 iph = ip_hdr(skb);
1705 tcp_v4_fill_cb(skb, iph, th);
1706 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
1707 }
1708 if (!nsk) {
1709 reqsk_put(req);
1710 if (req_stolen) {
1711 /* Another cpu got exclusive access to req
1712 * and created a full blown socket.
1713 * Try to feed this packet to this socket
1714 * instead of discarding it.
1715 */
1716 tcp_v4_restore_cb(skb);
1717 sock_put(sk);
1718 goto lookup;
1719 }
1720 goto discard_and_relse;
1721 }
1722 if (nsk == sk) {
1723 reqsk_put(req);
1724 tcp_v4_restore_cb(skb);
1725 } else if (tcp_child_process(sk, nsk, skb)) {
1726 tcp_v4_send_reset(nsk, skb);
1727 goto discard_and_relse;
1728 } else {
1729 sock_put(sk);
1730 return 0;
1731 }
1732 }
1733 if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1734 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
1735 goto discard_and_relse;
1736 }
1737
1738 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1739 goto discard_and_relse;
1740
1741 if (tcp_v4_inbound_md5_hash(sk, skb))
1742 goto discard_and_relse;
1743
1744 nf_reset(skb);
1745
1746 if (tcp_filter(sk, skb))
1747 goto discard_and_relse;
1748 th = (const struct tcphdr *)skb->data;
1749 iph = ip_hdr(skb);
1750 tcp_v4_fill_cb(skb, iph, th);
1751
1752 skb->dev = NULL;
1753
1754 if (sk->sk_state == TCP_LISTEN) {
1755 ret = tcp_v4_do_rcv(sk, skb);
1756 goto put_and_return;
1757 }
1758
1759 sk_incoming_cpu_update(sk);
1760
1761 bh_lock_sock_nested(sk);
1762 tcp_segs_in(tcp_sk(sk), skb);
1763 ret = 0;
1764 if (!sock_owned_by_user(sk)) {
1765 ret = tcp_v4_do_rcv(sk, skb);
1766 } else if (tcp_add_backlog(sk, skb)) {
1767 goto discard_and_relse;
1768 }
1769 bh_unlock_sock(sk);
1770
1771put_and_return:
1772 if (refcounted)
1773 sock_put(sk);
1774
1775 return ret;
1776
1777no_tcp_socket:
1778 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1779 goto discard_it;
1780
1781 tcp_v4_fill_cb(skb, iph, th);
1782
1783 if (tcp_checksum_complete(skb)) {
1784csum_error:
1785 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
1786bad_packet:
1787 __TCP_INC_STATS(net, TCP_MIB_INERRS);
1788 } else {
1789 tcp_v4_send_reset(NULL, skb);
1790 }
1791
1792discard_it:
1793 /* Discard frame. */
1794 kfree_skb(skb);
1795 return 0;
1796
1797discard_and_relse:
1798 sk_drops_add(sk, skb);
1799 if (refcounted)
1800 sock_put(sk);
1801 goto discard_it;
1802
1803do_time_wait:
1804 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1805 inet_twsk_put(inet_twsk(sk));
1806 goto discard_it;
1807 }
1808
1809 tcp_v4_fill_cb(skb, iph, th);
1810
1811 if (tcp_checksum_complete(skb)) {
1812 inet_twsk_put(inet_twsk(sk));
1813 goto csum_error;
1814 }
1815 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1816 case TCP_TW_SYN: {
1817 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1818 &tcp_hashinfo, skb,
1819 __tcp_hdrlen(th),
1820 iph->saddr, th->source,
1821 iph->daddr, th->dest,
1822 inet_iif(skb),
1823 sdif);
1824 if (sk2) {
1825 inet_twsk_deschedule_put(inet_twsk(sk));
1826 sk = sk2;
1827 tcp_v4_restore_cb(skb);
1828 refcounted = false;
1829 goto process;
1830 }
1831 }
1832 /* to ACK */
1833 /* fall through */
1834 case TCP_TW_ACK:
1835 tcp_v4_timewait_ack(sk, skb);
1836 break;
1837 case TCP_TW_RST:
1838 tcp_v4_send_reset(sk, skb);
1839 inet_twsk_deschedule_put(inet_twsk(sk));
1840 goto discard_it;
1841 case TCP_TW_SUCCESS:;
1842 }
1843 goto discard_it;
1844}
1845
1846static struct timewait_sock_ops tcp_timewait_sock_ops = {
1847 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
1848 .twsk_unique = tcp_twsk_unique,
1849 .twsk_destructor= tcp_twsk_destructor,
1850};
1851
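/* Cache the input route and incoming interface on the socket so the
 * established fast path can reuse them (see tcp_v4_do_rcv()).
 */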
1852void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1853{
1854 struct dst_entry *dst = skb_dst(skb);
1855
1856 if (dst && dst_hold_safe(dst)) {
1857 sk->sk_rx_dst = dst;
1858 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1859 }
1860}
1861EXPORT_SYMBOL(inet_sk_rx_dst_set);
1862
1863const struct inet_connection_sock_af_ops ipv4_specific = {
1864 .queue_xmit = ip_queue_xmit,
1865 .send_check = tcp_v4_send_check,
1866 .rebuild_header = inet_sk_rebuild_header,
1867 .sk_rx_dst_set = inet_sk_rx_dst_set,
1868 .conn_request = tcp_v4_conn_request,
1869 .syn_recv_sock = tcp_v4_syn_recv_sock,
1870 .net_header_len = sizeof(struct iphdr),
1871 .setsockopt = ip_setsockopt,
1872 .getsockopt = ip_getsockopt,
1873 .addr2sockaddr = inet_csk_addr2sockaddr,
1874 .sockaddr_len = sizeof(struct sockaddr_in),
1875#ifdef CONFIG_COMPAT
1876 .compat_setsockopt = compat_ip_setsockopt,
1877 .compat_getsockopt = compat_ip_getsockopt,
1878#endif
1879 .mtu_reduced = tcp_v4_mtu_reduced,
1880};
1881EXPORT_SYMBOL(ipv4_specific);
1882
1883#ifdef CONFIG_TCP_MD5SIG
1884static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1885 .md5_lookup = tcp_v4_md5_lookup,
1886 .calc_md5_hash = tcp_v4_md5_hash_skb,
1887 .md5_parse = tcp_v4_parse_md5_keys,
1888};
1889#endif
1890
1891 /* NOTE: A lot of fields are set to zero explicitly by the call to
1892 * sk_alloc(), so they need not be initialized here.
1893 */
1894static int tcp_v4_init_sock(struct sock *sk)
1895{
1896 struct inet_connection_sock *icsk = inet_csk(sk);
1897
1898 tcp_init_sock(sk);
1899
1900 icsk->icsk_af_ops = &ipv4_specific;
1901
1902#ifdef CONFIG_TCP_MD5SIG
1903 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
1904#endif
1905
1906 return 0;
1907}
1908
1909void tcp_v4_destroy_sock(struct sock *sk)
1910{
1911 struct tcp_sock *tp = tcp_sk(sk);
1912
1913 trace_tcp_destroy_sock(sk);
1914
1915 tcp_clear_xmit_timers(sk);
1916
1917 tcp_cleanup_congestion_control(sk);
1918
1919 tcp_cleanup_ulp(sk);
1920
1921 /* Clean up the write buffer. */
1922 tcp_write_queue_purge(sk);
1923
1924 /* Check if we want to disable active TFO */
1925 tcp_fastopen_active_disable_ofo_check(sk);
1926
1927 /* Cleans up our, hopefully empty, out_of_order_queue. */
1928 skb_rbtree_purge(&tp->out_of_order_queue);
1929
1930#ifdef CONFIG_TCP_MD5SIG
1931 /* Clean up the MD5 key list, if any */
1932 if (tp->md5sig_info) {
1933 tcp_clear_md5_list(sk);
1934 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
1935 tp->md5sig_info = NULL;
1936 }
1937#endif
1938
1939 /* Clean up a referenced TCP bind bucket. */
1940 if (inet_csk(sk)->icsk_bind_hash)
1941 inet_put_port(sk);
1942
1943 BUG_ON(tp->fastopen_rsk);
1944
1945 /* If socket is aborted during connect operation */
1946 tcp_free_fastopen_req(tp);
1947 tcp_fastopen_destroy_cipher(sk);
1948 tcp_saved_syn_free(tp);
1949
1950 sk_sockets_allocated_dec(sk);
1951}
1952EXPORT_SYMBOL(tcp_v4_destroy_sock);
1953
1954#ifdef CONFIG_PROC_FS
1955/* Proc filesystem TCP sock list dumping. */
1956
1957/*
1958 * Get the next listener socket following cur. If cur is NULL, get the first socket
1959 * starting from bucket given in st->bucket; when st->bucket is zero the
1960 * very first socket in the hash table is returned.
1961 */
1962static void *listening_get_next(struct seq_file *seq, void *cur)
1963{
1964 struct tcp_iter_state *st = seq->private;
1965 struct net *net = seq_file_net(seq);
1966 struct inet_listen_hashbucket *ilb;
1967 struct sock *sk = cur;
1968
1969 if (!sk) {
1970get_head:
1971 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1972 spin_lock(&ilb->lock);
1973 sk = sk_head(&ilb->head);
1974 st->offset = 0;
1975 goto get_sk;
1976 }
1977 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1978 ++st->num;
1979 ++st->offset;
1980
1981 sk = sk_next(sk);
1982get_sk:
1983 sk_for_each_from(sk) {
1984 if (!net_eq(sock_net(sk), net))
1985 continue;
1986 if (sk->sk_family == st->family)
1987 return sk;
1988 }
1989 spin_unlock(&ilb->lock);
1990 st->offset = 0;
1991 if (++st->bucket < INET_LHTABLE_SIZE)
1992 goto get_head;
1993 return NULL;
1994}
1995
1996static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1997{
1998 struct tcp_iter_state *st = seq->private;
1999 void *rc;
2000
2001 st->bucket = 0;
2002 st->offset = 0;
2003 rc = listening_get_next(seq, NULL);
2004
2005 while (rc && *pos) {
2006 rc = listening_get_next(seq, rc);
2007 --*pos;
2008 }
2009 return rc;
2010}
2011
2012static inline bool empty_bucket(const struct tcp_iter_state *st)
2013{
2014 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2015}
2016
2017/*
2018 * Get the first established socket, starting from the bucket given in st->bucket.
2019 * If st->bucket is zero, the very first socket in the hash is returned.
2020 */
2021static void *established_get_first(struct seq_file *seq)
2022{
2023 struct tcp_iter_state *st = seq->private;
2024 struct net *net = seq_file_net(seq);
2025 void *rc = NULL;
2026
2027 st->offset = 0;
2028 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2029 struct sock *sk;
2030 struct hlist_nulls_node *node;
2031 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2032
2033 /* Lockless fast path for the common case of empty buckets */
2034 if (empty_bucket(st))
2035 continue;
2036
2037 spin_lock_bh(lock);
2038 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2039 if (sk->sk_family != st->family ||
2040 !net_eq(sock_net(sk), net)) {
2041 continue;
2042 }
2043 rc = sk;
2044 goto out;
2045 }
2046 spin_unlock_bh(lock);
2047 }
2048out:
2049 return rc;
2050}
2051
2052static void *established_get_next(struct seq_file *seq, void *cur)
2053{
2054 struct sock *sk = cur;
2055 struct hlist_nulls_node *node;
2056 struct tcp_iter_state *st = seq->private;
2057 struct net *net = seq_file_net(seq);
2058
2059 ++st->num;
2060 ++st->offset;
2061
2062 sk = sk_nulls_next(sk);
2063
2064 sk_nulls_for_each_from(sk, node) {
2065 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2066 return sk;
2067 }
2068
2069 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2070 ++st->bucket;
2071 return established_get_first(seq);
2072}
2073
2074static void *established_get_idx(struct seq_file *seq, loff_t pos)
2075{
2076 struct tcp_iter_state *st = seq->private;
2077 void *rc;
2078
2079 st->bucket = 0;
2080 rc = established_get_first(seq);
2081
2082 while (rc && pos) {
2083 rc = established_get_next(seq, rc);
2084 --pos;
2085 }
2086 return rc;
2087}
2088
2089static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2090{
2091 void *rc;
2092 struct tcp_iter_state *st = seq->private;
2093
2094 st->state = TCP_SEQ_STATE_LISTENING;
2095 rc = listening_get_idx(seq, &pos);
2096
2097 if (!rc) {
2098 st->state = TCP_SEQ_STATE_ESTABLISHED;
2099 rc = established_get_idx(seq, pos);
2100 }
2101
2102 return rc;
2103}
2104
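/* Resume iteration at the bucket and in-bucket offset remembered from the
 * previous read, so a long listing does not rescan from the start on
 * every read() call.
 */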
2105static void *tcp_seek_last_pos(struct seq_file *seq)
2106{
2107 struct tcp_iter_state *st = seq->private;
2108 int offset = st->offset;
2109 int orig_num = st->num;
2110 void *rc = NULL;
2111
2112 switch (st->state) {
2113 case TCP_SEQ_STATE_LISTENING:
2114 if (st->bucket >= INET_LHTABLE_SIZE)
2115 break;
2116 st->state = TCP_SEQ_STATE_LISTENING;
2117 rc = listening_get_next(seq, NULL);
2118 while (offset-- && rc)
2119 rc = listening_get_next(seq, rc);
2120 if (rc)
2121 break;
2122 st->bucket = 0;
2123 st->state = TCP_SEQ_STATE_ESTABLISHED;
2124 /* Fallthrough */
2125 case TCP_SEQ_STATE_ESTABLISHED:
2126 if (st->bucket > tcp_hashinfo.ehash_mask)
2127 break;
2128 rc = established_get_first(seq);
2129 while (offset-- && rc)
2130 rc = established_get_next(seq, rc);
2131 }
2132
2133 st->num = orig_num;
2134
2135 return rc;
2136}
2137
2138static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2139{
2140 struct tcp_iter_state *st = seq->private;
2141 void *rc;
2142
2143 if (*pos && *pos == st->last_pos) {
2144 rc = tcp_seek_last_pos(seq);
2145 if (rc)
2146 goto out;
2147 }
2148
2149 st->state = TCP_SEQ_STATE_LISTENING;
2150 st->num = 0;
2151 st->bucket = 0;
2152 st->offset = 0;
2153 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2154
2155out:
2156 st->last_pos = *pos;
2157 return rc;
2158}
2159
2160static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2161{
2162 struct tcp_iter_state *st = seq->private;
2163 void *rc = NULL;
2164
2165 if (v == SEQ_START_TOKEN) {
2166 rc = tcp_get_idx(seq, 0);
2167 goto out;
2168 }
2169
2170 switch (st->state) {
2171 case TCP_SEQ_STATE_LISTENING:
2172 rc = listening_get_next(seq, v);
2173 if (!rc) {
2174 st->state = TCP_SEQ_STATE_ESTABLISHED;
2175 st->bucket = 0;
2176 st->offset = 0;
2177 rc = established_get_first(seq);
2178 }
2179 break;
2180 case TCP_SEQ_STATE_ESTABLISHED:
2181 rc = established_get_next(seq, v);
2182 break;
2183 }
2184out:
2185 ++*pos;
2186 st->last_pos = *pos;
2187 return rc;
2188}
2189
2190static void tcp_seq_stop(struct seq_file *seq, void *v)
2191{
2192 struct tcp_iter_state *st = seq->private;
2193
2194 switch (st->state) {
2195 case TCP_SEQ_STATE_LISTENING:
2196 if (v != SEQ_START_TOKEN)
2197 spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
2198 break;
2199 case TCP_SEQ_STATE_ESTABLISHED:
2200 if (v)
2201 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2202 break;
2203 }
2204}
2205
2206int tcp_seq_open(struct inode *inode, struct file *file)
2207{
2208 struct tcp_seq_afinfo *afinfo = PDE_DATA(inode);
2209 struct tcp_iter_state *s;
2210 int err;
2211
2212 err = seq_open_net(inode, file, &afinfo->seq_ops,
2213 sizeof(struct tcp_iter_state));
2214 if (err < 0)
2215 return err;
2216
2217 s = ((struct seq_file *)file->private_data)->private;
2218 s->family = afinfo->family;
2219 s->last_pos = 0;
2220 return 0;
2221}
2222EXPORT_SYMBOL(tcp_seq_open);
2223
2224int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2225{
2226 int rc = 0;
2227 struct proc_dir_entry *p;
2228
2229 afinfo->seq_ops.start = tcp_seq_start;
2230 afinfo->seq_ops.next = tcp_seq_next;
2231 afinfo->seq_ops.stop = tcp_seq_stop;
2232
2233 p = proc_create_data(afinfo->name, 0444, net->proc_net,
2234 afinfo->seq_fops, afinfo);
2235 if (!p)
2236 rc = -ENOMEM;
2237 return rc;
2238}
2239EXPORT_SYMBOL(tcp_proc_register);
2240
2241void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2242{
2243 remove_proc_entry(afinfo->name, net->proc_net);
2244}
2245EXPORT_SYMBOL(tcp_proc_unregister);
2246
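/* Format one /proc/net/tcp line for a connection that is still a
 * request_sock (reported in SYN_RECV state).
 */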
2247static void get_openreq4(const struct request_sock *req,
2248 struct seq_file *f, int i)
2249{
2250 const struct inet_request_sock *ireq = inet_rsk(req);
2251 long delta = req->rsk_timer.expires - jiffies;
2252
2253 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2254 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2255 i,
2256 ireq->ir_loc_addr,
2257 ireq->ir_num,
2258 ireq->ir_rmt_addr,
2259 ntohs(ireq->ir_rmt_port),
2260 TCP_SYN_RECV,
2261 0, 0, /* could print option size, but that is af dependent. */
2262 1, /* timers active (only the expire timer) */
2263 jiffies_delta_to_clock_t(delta),
2264 req->num_timeout,
2265 from_kuid_munged(seq_user_ns(f),
2266 sock_i_uid(req->rsk_listener)),
2267 0, /* non standard timer */
2268 0, /* open_requests have no inode */
2269 0,
2270 req);
2271}
2272
2273static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2274{
2275 int timer_active;
2276 unsigned long timer_expires;
2277 const struct tcp_sock *tp = tcp_sk(sk);
2278 const struct inet_connection_sock *icsk = inet_csk(sk);
2279 const struct inet_sock *inet = inet_sk(sk);
2280 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2281 __be32 dest = inet->inet_daddr;
2282 __be32 src = inet->inet_rcv_saddr;
2283 __u16 destp = ntohs(inet->inet_dport);
2284 __u16 srcp = ntohs(inet->inet_sport);
2285 int rx_queue;
2286 int state;
2287
2288 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2289 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2290 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2291 timer_active = 1;
2292 timer_expires = icsk->icsk_timeout;
2293 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2294 timer_active = 4;
2295 timer_expires = icsk->icsk_timeout;
2296 } else if (timer_pending(&sk->sk_timer)) {
2297 timer_active = 2;
2298 timer_expires = sk->sk_timer.expires;
2299 } else {
2300 timer_active = 0;
2301 timer_expires = jiffies;
2302 }
2303
2304 state = inet_sk_state_load(sk);
2305 if (state == TCP_LISTEN)
2306 rx_queue = sk->sk_ack_backlog;
2307 else
2308 /* Because we don't lock the socket,
2309 * we might find a transient negative value.
2310 */
2311 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2312
2313 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2314 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2315 i, src, srcp, dest, destp, state,
2316 tp->write_seq - tp->snd_una,
2317 rx_queue,
2318 timer_active,
2319 jiffies_delta_to_clock_t(timer_expires - jiffies),
2320 icsk->icsk_retransmits,
2321 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2322 icsk->icsk_probes_out,
2323 sock_i_ino(sk),
2324 refcount_read(&sk->sk_refcnt), sk,
2325 jiffies_to_clock_t(icsk->icsk_rto),
2326 jiffies_to_clock_t(icsk->icsk_ack.ato),
2327 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2328 tp->snd_cwnd,
2329 state == TCP_LISTEN ?
2330 fastopenq->max_qlen :
2331 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2332}
2333
2334static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2335 struct seq_file *f, int i)
2336{
2337 long delta = tw->tw_timer.expires - jiffies;
2338 __be32 dest, src;
2339 __u16 destp, srcp;
2340
2341 dest = tw->tw_daddr;
2342 src = tw->tw_rcv_saddr;
2343 destp = ntohs(tw->tw_dport);
2344 srcp = ntohs(tw->tw_sport);
2345
2346 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2347 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2348 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2349 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2350 refcount_read(&tw->tw_refcnt), tw);
2351}
2352
2353#define TMPSZ 150
2354
2355static int tcp4_seq_show(struct seq_file *seq, void *v)
2356{
2357 struct tcp_iter_state *st;
2358 struct sock *sk = v;
2359
2360 seq_setwidth(seq, TMPSZ - 1);
2361 if (v == SEQ_START_TOKEN) {
2362 seq_puts(seq, " sl local_address rem_address st tx_queue "
2363 "rx_queue tr tm->when retrnsmt uid timeout "
2364 "inode");
2365 goto out;
2366 }
2367 st = seq->private;
2368
2369 if (sk->sk_state == TCP_TIME_WAIT)
2370 get_timewait4_sock(v, seq, st->num);
2371 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2372 get_openreq4(v, seq, st->num);
2373 else
2374 get_tcp4_sock(v, seq, st->num);
2375out:
2376 seq_pad(seq, '\n');
2377 return 0;
2378}
2379
2380static const struct file_operations tcp_afinfo_seq_fops = {
2381 .open = tcp_seq_open,
2382 .read = seq_read,
2383 .llseek = seq_lseek,
2384 .release = seq_release_net
2385};
2386
2387static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2388 .name = "tcp",
2389 .family = AF_INET,
2390 .seq_fops = &tcp_afinfo_seq_fops,
2391 .seq_ops = {
2392 .show = tcp4_seq_show,
2393 },
2394};
2395
2396static int __net_init tcp4_proc_init_net(struct net *net)
2397{
2398 return tcp_proc_register(net, &tcp4_seq_afinfo);
2399}
2400
2401static void __net_exit tcp4_proc_exit_net(struct net *net)
2402{
2403 tcp_proc_unregister(net, &tcp4_seq_afinfo);
2404}
2405
2406static struct pernet_operations tcp4_net_ops = {
2407 .init = tcp4_proc_init_net,
2408 .exit = tcp4_proc_exit_net,
2409};
2410
2411int __init tcp4_proc_init(void)
2412{
2413 return register_pernet_subsys(&tcp4_net_ops);
2414}
2415
2416void tcp4_proc_exit(void)
2417{
2418 unregister_pernet_subsys(&tcp4_net_ops);
2419}
2420#endif /* CONFIG_PROC_FS */
2421
2422struct proto tcp_prot = {
2423 .name = "TCP",
2424 .owner = THIS_MODULE,
2425 .close = tcp_close,
2426 .pre_connect = tcp_v4_pre_connect,
2427 .connect = tcp_v4_connect,
2428 .disconnect = tcp_disconnect,
2429 .accept = inet_csk_accept,
2430 .ioctl = tcp_ioctl,
2431 .init = tcp_v4_init_sock,
2432 .destroy = tcp_v4_destroy_sock,
2433 .shutdown = tcp_shutdown,
2434 .setsockopt = tcp_setsockopt,
2435 .getsockopt = tcp_getsockopt,
2436 .keepalive = tcp_set_keepalive,
2437 .recvmsg = tcp_recvmsg,
2438 .sendmsg = tcp_sendmsg,
2439 .sendpage = tcp_sendpage,
2440 .backlog_rcv = tcp_v4_do_rcv,
2441 .release_cb = tcp_release_cb,
2442 .hash = inet_hash,
2443 .unhash = inet_unhash,
2444 .get_port = inet_csk_get_port,
2445 .enter_memory_pressure = tcp_enter_memory_pressure,
2446 .leave_memory_pressure = tcp_leave_memory_pressure,
2447 .stream_memory_free = tcp_stream_memory_free,
2448 .sockets_allocated = &tcp_sockets_allocated,
2449 .orphan_count = &tcp_orphan_count,
2450 .memory_allocated = &tcp_memory_allocated,
2451 .memory_pressure = &tcp_memory_pressure,
2452 .sysctl_mem = sysctl_tcp_mem,
2453 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
2454 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
2455 .max_header = MAX_TCP_HEADER,
2456 .obj_size = sizeof(struct tcp_sock),
2457 .slab_flags = SLAB_TYPESAFE_BY_RCU,
2458 .twsk_prot = &tcp_timewait_sock_ops,
2459 .rsk_prot = &tcp_request_sock_ops,
2460 .h.hashinfo = &tcp_hashinfo,
2461 .no_autobind = true,
2462#ifdef CONFIG_COMPAT
2463 .compat_setsockopt = compat_tcp_setsockopt,
2464 .compat_getsockopt = compat_tcp_getsockopt,
2465#endif
2466 .diag_destroy = tcp_abort,
2467};
2468EXPORT_SYMBOL(tcp_prot);
2469
2470static void __net_exit tcp_sk_exit(struct net *net)
2471{
2472 int cpu;
2473
2474 module_put(net->ipv4.tcp_congestion_control->owner);
2475
2476 for_each_possible_cpu(cpu)
2477 inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
2478 free_percpu(net->ipv4.tcp_sk);
2479}
2480
2481static int __net_init tcp_sk_init(struct net *net)
2482{
2483 int res, cpu, cnt;
2484
2485 net->ipv4.tcp_sk = alloc_percpu(struct sock *);
2486 if (!net->ipv4.tcp_sk)
2487 return -ENOMEM;
2488
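/* Create one control socket per possible CPU; these are used to send
 * RSTs and ACKs when no full socket is available (see the
 * ip_send_unicast_reply() callers above).
 */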
2489 for_each_possible_cpu(cpu) {
2490 struct sock *sk;
2491
2492 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
2493 IPPROTO_TCP, net);
2494 if (res)
2495 goto fail;
2496 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
2497 *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
2498 }
2499
2500 net->ipv4.sysctl_tcp_ecn = 2;
2501 net->ipv4.sysctl_tcp_ecn_fallback = 1;
2502
2503 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
2504 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
2505 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
2506
2507 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
2508 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
2509 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
2510
2511 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
2512 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
2513 net->ipv4.sysctl_tcp_syncookies = 1;
2514 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
2515 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
2516 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
2517 net->ipv4.sysctl_tcp_orphan_retries = 0;
2518 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
2519 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
2520 net->ipv4.sysctl_tcp_tw_reuse = 0;
2521
2522 cnt = tcp_hashinfo.ehash_mask + 1;
2523 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
2524 net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
2525
2526 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
2527 net->ipv4.sysctl_tcp_sack = 1;
2528 net->ipv4.sysctl_tcp_window_scaling = 1;
2529 net->ipv4.sysctl_tcp_timestamps = 1;
2530 net->ipv4.sysctl_tcp_early_retrans = 3;
2531 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
2532 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
2533 net->ipv4.sysctl_tcp_retrans_collapse = 1;
2534 net->ipv4.sysctl_tcp_max_reordering = 300;
2535 net->ipv4.sysctl_tcp_dsack = 1;
2536 net->ipv4.sysctl_tcp_app_win = 31;
2537 net->ipv4.sysctl_tcp_adv_win_scale = 1;
2538 net->ipv4.sysctl_tcp_frto = 2;
2539 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
2540 /* This limits the percentage of the congestion window which we
2541 * will allow a single TSO frame to consume. Building TSO frames
2542 * which are too large can cause TCP streams to be bursty.
2543 */
2544 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
2545 /* Default TSQ limit of four TSO segments */
2546 net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
2547 /* rfc5961 challenge ack rate limiting */
2548 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
2549 net->ipv4.sysctl_tcp_min_tso_segs = 2;
2550 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
2551 net->ipv4.sysctl_tcp_autocorking = 1;
2552 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
2553 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
2554 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
2555 if (net != &init_net) {
2556 memcpy(net->ipv4.sysctl_tcp_rmem,
2557 init_net.ipv4.sysctl_tcp_rmem,
2558 sizeof(init_net.ipv4.sysctl_tcp_rmem));
2559 memcpy(net->ipv4.sysctl_tcp_wmem,
2560 init_net.ipv4.sysctl_tcp_wmem,
2561 sizeof(init_net.ipv4.sysctl_tcp_wmem));
2562 }
2563 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
2564 spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
2565 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
2566 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
2567
2568 /* Reno is always built in */
2569 if (!net_eq(net, &init_net) &&
2570 try_module_get(init_net.ipv4.tcp_congestion_control->owner))
2571 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
2572 else
2573 net->ipv4.tcp_congestion_control = &tcp_reno;
2574
2575 return 0;
2576fail:
2577 tcp_sk_exit(net);
2578
2579 return res;
2580}
2581
2582static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2583{
2584 struct net *net;
2585
2586 inet_twsk_purge(&tcp_hashinfo, AF_INET);
2587
2588 list_for_each_entry(net, net_exit_list, exit_list)
2589 tcp_fastopen_ctx_destroy(net);
2590}
2591
2592static struct pernet_operations __net_initdata tcp_sk_ops = {
2593 .init = tcp_sk_init,
2594 .exit = tcp_sk_exit,
2595 .exit_batch = tcp_sk_exit_batch,
2596};
2597
2598void __init tcp_v4_init(void)
2599{
2600 if (register_pernet_subsys(&tcp_sk_ops))
2601 panic("Failed to create the TCP control socket.\n");
2602}