Linux Audio

Check our new training course

Loading...
Note: File does not exist in v5.9.
   1// SPDX-License-Identifier: GPL-2.0
   2/* Multipath TCP
   3 *
   4 * Copyright (c) 2021, Red Hat.
   5 */
   6
   7#define pr_fmt(fmt) "MPTCP: " fmt
   8
   9#include <linux/kernel.h>
  10#include <linux/module.h>
  11#include <net/sock.h>
  12#include <net/protocol.h>
  13#include <net/tcp.h>
  14#include <net/mptcp.h>
  15#include "protocol.h"
  16
  17#define MIN_INFO_OPTLEN_SIZE		16
  18#define MIN_FULL_INFO_OPTLEN_SIZE	40
  19
  20static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
  21{
  22	msk_owned_by_me(msk);
  23
  24	if (likely(!__mptcp_check_fallback(msk)))
  25		return NULL;
  26
  27	return msk->first;
  28}
  29
  30static u32 sockopt_seq_reset(const struct sock *sk)
  31{
  32	sock_owned_by_me(sk);
  33
  34	/* Highbits contain state.  Allows to distinguish sockopt_seq
  35	 * of listener and established:
  36	 * s0 = new_listener()
  37	 * sockopt(s0) - seq is 1
  38	 * s1 = accept(s0) - s1 inherits seq 1 if listener sk (s0)
  39	 * sockopt(s0) - seq increments to 2 on s0
  40	 * sockopt(s1) // seq increments to 2 on s1 (different option)
  41	 * new ssk completes join, inherits options from s0 // seq 2
  42	 * Needs sync from mptcp join logic, but ssk->seq == msk->seq
  43	 *
  44	 * Set High order bits to sk_state so ssk->seq == msk->seq test
  45	 * will fail.
  46	 */
  47
  48	return (u32)sk->sk_state << 24u;
  49}
  50
  51static void sockopt_seq_inc(struct mptcp_sock *msk)
  52{
  53	u32 seq = (msk->setsockopt_seq + 1) & 0x00ffffff;
  54
  55	msk->setsockopt_seq = sockopt_seq_reset((struct sock *)msk) + seq;
  56}
  57
  58static int mptcp_get_int_option(struct mptcp_sock *msk, sockptr_t optval,
  59				unsigned int optlen, int *val)
  60{
  61	if (optlen < sizeof(int))
  62		return -EINVAL;
  63
  64	if (copy_from_sockptr(val, optval, sizeof(*val)))
  65		return -EFAULT;
  66
  67	return 0;
  68}
  69
  70static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, int val)
  71{
  72	struct mptcp_subflow_context *subflow;
  73	struct sock *sk = (struct sock *)msk;
  74
  75	lock_sock(sk);
  76	sockopt_seq_inc(msk);
  77
  78	mptcp_for_each_subflow(msk, subflow) {
  79		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
  80		bool slow = lock_sock_fast(ssk);
  81
  82		switch (optname) {
  83		case SO_DEBUG:
  84			sock_valbool_flag(ssk, SOCK_DBG, !!val);
  85			break;
  86		case SO_KEEPALIVE:
  87			if (ssk->sk_prot->keepalive)
  88				ssk->sk_prot->keepalive(ssk, !!val);
  89			sock_valbool_flag(ssk, SOCK_KEEPOPEN, !!val);
  90			break;
  91		case SO_PRIORITY:
  92			WRITE_ONCE(ssk->sk_priority, val);
  93			break;
  94		case SO_SNDBUF:
  95		case SO_SNDBUFFORCE:
  96			ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
  97			WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
  98			mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
  99			break;
 100		case SO_RCVBUF:
 101		case SO_RCVBUFFORCE:
 102			ssk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 103			WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
 104			break;
 105		case SO_MARK:
 106			if (READ_ONCE(ssk->sk_mark) != sk->sk_mark) {
 107				WRITE_ONCE(ssk->sk_mark, sk->sk_mark);
 108				sk_dst_reset(ssk);
 109			}
 110			break;
 111		case SO_INCOMING_CPU:
 112			WRITE_ONCE(ssk->sk_incoming_cpu, val);
 113			break;
 114		}
 115
 116		subflow->setsockopt_seq = msk->setsockopt_seq;
 117		unlock_sock_fast(ssk, slow);
 118	}
 119
 120	release_sock(sk);
 121}
 122
 123static int mptcp_sol_socket_intval(struct mptcp_sock *msk, int optname, int val)
 124{
 125	sockptr_t optval = KERNEL_SOCKPTR(&val);
 126	struct sock *sk = (struct sock *)msk;
 127	int ret;
 128
 129	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
 130			      optval, sizeof(val));
 131	if (ret)
 132		return ret;
 133
 134	mptcp_sol_socket_sync_intval(msk, optname, val);
 135	return 0;
 136}
 137
 138static void mptcp_so_incoming_cpu(struct mptcp_sock *msk, int val)
 139{
 140	struct sock *sk = (struct sock *)msk;
 141
 142	WRITE_ONCE(sk->sk_incoming_cpu, val);
 143
 144	mptcp_sol_socket_sync_intval(msk, SO_INCOMING_CPU, val);
 145}
 146
 147static int mptcp_setsockopt_sol_socket_tstamp(struct mptcp_sock *msk, int optname, int val)
 148{
 149	sockptr_t optval = KERNEL_SOCKPTR(&val);
 150	struct mptcp_subflow_context *subflow;
 151	struct sock *sk = (struct sock *)msk;
 152	int ret;
 153
 154	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
 155			      optval, sizeof(val));
 156	if (ret)
 157		return ret;
 158
 159	lock_sock(sk);
 160	mptcp_for_each_subflow(msk, subflow) {
 161		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 162		bool slow = lock_sock_fast(ssk);
 163
 164		sock_set_timestamp(sk, optname, !!val);
 165		unlock_sock_fast(ssk, slow);
 166	}
 167
 168	release_sock(sk);
 169	return 0;
 170}
 171
 172static int mptcp_setsockopt_sol_socket_int(struct mptcp_sock *msk, int optname,
 173					   sockptr_t optval,
 174					   unsigned int optlen)
 175{
 176	int val, ret;
 177
 178	ret = mptcp_get_int_option(msk, optval, optlen, &val);
 179	if (ret)
 180		return ret;
 181
 182	switch (optname) {
 183	case SO_KEEPALIVE:
 184	case SO_DEBUG:
 185	case SO_MARK:
 186	case SO_PRIORITY:
 187	case SO_SNDBUF:
 188	case SO_SNDBUFFORCE:
 189	case SO_RCVBUF:
 190	case SO_RCVBUFFORCE:
 191		return mptcp_sol_socket_intval(msk, optname, val);
 192	case SO_INCOMING_CPU:
 193		mptcp_so_incoming_cpu(msk, val);
 194		return 0;
 195	case SO_TIMESTAMP_OLD:
 196	case SO_TIMESTAMP_NEW:
 197	case SO_TIMESTAMPNS_OLD:
 198	case SO_TIMESTAMPNS_NEW:
 199		return mptcp_setsockopt_sol_socket_tstamp(msk, optname, val);
 200	}
 201
 202	return -ENOPROTOOPT;
 203}
 204
 205static int mptcp_setsockopt_sol_socket_timestamping(struct mptcp_sock *msk,
 206						    int optname,
 207						    sockptr_t optval,
 208						    unsigned int optlen)
 209{
 210	struct mptcp_subflow_context *subflow;
 211	struct sock *sk = (struct sock *)msk;
 212	struct so_timestamping timestamping;
 213	int ret;
 214
 215	if (optlen == sizeof(timestamping)) {
 216		if (copy_from_sockptr(&timestamping, optval,
 217				      sizeof(timestamping)))
 218			return -EFAULT;
 219	} else if (optlen == sizeof(int)) {
 220		memset(&timestamping, 0, sizeof(timestamping));
 221
 222		if (copy_from_sockptr(&timestamping.flags, optval, sizeof(int)))
 223			return -EFAULT;
 224	} else {
 225		return -EINVAL;
 226	}
 227
 228	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname,
 229			      KERNEL_SOCKPTR(&timestamping),
 230			      sizeof(timestamping));
 231	if (ret)
 232		return ret;
 233
 234	lock_sock(sk);
 235
 236	mptcp_for_each_subflow(msk, subflow) {
 237		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 238		bool slow = lock_sock_fast(ssk);
 239
 240		sock_set_timestamping(sk, optname, timestamping);
 241		unlock_sock_fast(ssk, slow);
 242	}
 243
 244	release_sock(sk);
 245
 246	return 0;
 247}
 248
 249static int mptcp_setsockopt_sol_socket_linger(struct mptcp_sock *msk, sockptr_t optval,
 250					      unsigned int optlen)
 251{
 252	struct mptcp_subflow_context *subflow;
 253	struct sock *sk = (struct sock *)msk;
 254	struct linger ling;
 255	sockptr_t kopt;
 256	int ret;
 257
 258	if (optlen < sizeof(ling))
 259		return -EINVAL;
 260
 261	if (copy_from_sockptr(&ling, optval, sizeof(ling)))
 262		return -EFAULT;
 263
 264	kopt = KERNEL_SOCKPTR(&ling);
 265	ret = sock_setsockopt(sk->sk_socket, SOL_SOCKET, SO_LINGER, kopt, sizeof(ling));
 266	if (ret)
 267		return ret;
 268
 269	lock_sock(sk);
 270	sockopt_seq_inc(msk);
 271	mptcp_for_each_subflow(msk, subflow) {
 272		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 273		bool slow = lock_sock_fast(ssk);
 274
 275		if (!ling.l_onoff) {
 276			sock_reset_flag(ssk, SOCK_LINGER);
 277		} else {
 278			ssk->sk_lingertime = sk->sk_lingertime;
 279			sock_set_flag(ssk, SOCK_LINGER);
 280		}
 281
 282		subflow->setsockopt_seq = msk->setsockopt_seq;
 283		unlock_sock_fast(ssk, slow);
 284	}
 285
 286	release_sock(sk);
 287	return 0;
 288}
 289
 290static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
 291				       sockptr_t optval, unsigned int optlen)
 292{
 293	struct sock *sk = (struct sock *)msk;
 294	struct sock *ssk;
 295	int ret;
 296
 297	switch (optname) {
 298	case SO_REUSEPORT:
 299	case SO_REUSEADDR:
 300	case SO_BINDTODEVICE:
 301	case SO_BINDTOIFINDEX:
 302		lock_sock(sk);
 303		ssk = __mptcp_nmpc_sk(msk);
 304		if (IS_ERR(ssk)) {
 305			release_sock(sk);
 306			return PTR_ERR(ssk);
 307		}
 308
 309		ret = sk_setsockopt(ssk, SOL_SOCKET, optname, optval, optlen);
 310		if (ret == 0) {
 311			if (optname == SO_REUSEPORT)
 312				sk->sk_reuseport = ssk->sk_reuseport;
 313			else if (optname == SO_REUSEADDR)
 314				sk->sk_reuse = ssk->sk_reuse;
 315			else if (optname == SO_BINDTODEVICE)
 316				sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
 317			else if (optname == SO_BINDTOIFINDEX)
 318				sk->sk_bound_dev_if = ssk->sk_bound_dev_if;
 319		}
 320		release_sock(sk);
 321		return ret;
 322	case SO_KEEPALIVE:
 323	case SO_PRIORITY:
 324	case SO_SNDBUF:
 325	case SO_SNDBUFFORCE:
 326	case SO_RCVBUF:
 327	case SO_RCVBUFFORCE:
 328	case SO_MARK:
 329	case SO_INCOMING_CPU:
 330	case SO_DEBUG:
 331	case SO_TIMESTAMP_OLD:
 332	case SO_TIMESTAMP_NEW:
 333	case SO_TIMESTAMPNS_OLD:
 334	case SO_TIMESTAMPNS_NEW:
 335		return mptcp_setsockopt_sol_socket_int(msk, optname, optval,
 336						       optlen);
 337	case SO_TIMESTAMPING_OLD:
 338	case SO_TIMESTAMPING_NEW:
 339		return mptcp_setsockopt_sol_socket_timestamping(msk, optname,
 340								optval, optlen);
 341	case SO_LINGER:
 342		return mptcp_setsockopt_sol_socket_linger(msk, optval, optlen);
 343	case SO_RCVLOWAT:
 344	case SO_RCVTIMEO_OLD:
 345	case SO_RCVTIMEO_NEW:
 346	case SO_SNDTIMEO_OLD:
 347	case SO_SNDTIMEO_NEW:
 348	case SO_BUSY_POLL:
 349	case SO_PREFER_BUSY_POLL:
 350	case SO_BUSY_POLL_BUDGET:
 351		/* No need to copy: only relevant for msk */
 352		return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
 353	case SO_NO_CHECK:
 354	case SO_DONTROUTE:
 355	case SO_BROADCAST:
 356	case SO_BSDCOMPAT:
 357	case SO_PASSCRED:
 358	case SO_PASSPIDFD:
 359	case SO_PASSSEC:
 360	case SO_RXQ_OVFL:
 361	case SO_WIFI_STATUS:
 362	case SO_NOFCS:
 363	case SO_SELECT_ERR_QUEUE:
 364		return 0;
 365	}
 366
 367	/* SO_OOBINLINE is not supported, let's avoid the related mess
 368	 * SO_ATTACH_FILTER, SO_ATTACH_BPF, SO_ATTACH_REUSEPORT_CBPF,
 369	 * SO_DETACH_REUSEPORT_BPF, SO_DETACH_FILTER, SO_LOCK_FILTER,
 370	 * we must be careful with subflows
 371	 *
 372	 * SO_ATTACH_REUSEPORT_EBPF is not supported, at it checks
 373	 * explicitly the sk_protocol field
 374	 *
 375	 * SO_PEEK_OFF is unsupported, as it is for plain TCP
 376	 * SO_MAX_PACING_RATE is unsupported, we must be careful with subflows
 377	 * SO_CNX_ADVICE is currently unsupported, could possibly be relevant,
 378	 * but likely needs careful design
 379	 *
 380	 * SO_ZEROCOPY is currently unsupported, TODO in sndmsg
 381	 * SO_TXTIME is currently unsupported
 382	 */
 383
 384	return -EOPNOTSUPP;
 385}
 386
 387static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
 388			       sockptr_t optval, unsigned int optlen)
 389{
 390	struct sock *sk = (struct sock *)msk;
 391	int ret = -EOPNOTSUPP;
 392	struct sock *ssk;
 393
 394	switch (optname) {
 395	case IPV6_V6ONLY:
 396	case IPV6_TRANSPARENT:
 397	case IPV6_FREEBIND:
 398		lock_sock(sk);
 399		ssk = __mptcp_nmpc_sk(msk);
 400		if (IS_ERR(ssk)) {
 401			release_sock(sk);
 402			return PTR_ERR(ssk);
 403		}
 404
 405		ret = tcp_setsockopt(ssk, SOL_IPV6, optname, optval, optlen);
 406		if (ret != 0) {
 407			release_sock(sk);
 408			return ret;
 409		}
 410
 411		sockopt_seq_inc(msk);
 412
 413		switch (optname) {
 414		case IPV6_V6ONLY:
 415			sk->sk_ipv6only = ssk->sk_ipv6only;
 416			break;
 417		case IPV6_TRANSPARENT:
 418			inet_assign_bit(TRANSPARENT, sk,
 419					inet_test_bit(TRANSPARENT, ssk));
 420			break;
 421		case IPV6_FREEBIND:
 422			inet_assign_bit(FREEBIND, sk,
 423					inet_test_bit(FREEBIND, ssk));
 424			break;
 425		}
 426
 427		release_sock(sk);
 428		break;
 429	}
 430
 431	return ret;
 432}
 433
 434static bool mptcp_supported_sockopt(int level, int optname)
 435{
 436	if (level == SOL_IP) {
 437		switch (optname) {
 438		/* should work fine */
 439		case IP_FREEBIND:
 440		case IP_TRANSPARENT:
 441		case IP_BIND_ADDRESS_NO_PORT:
 442		case IP_LOCAL_PORT_RANGE:
 443
 444		/* the following are control cmsg related */
 445		case IP_PKTINFO:
 446		case IP_RECVTTL:
 447		case IP_RECVTOS:
 448		case IP_RECVOPTS:
 449		case IP_RETOPTS:
 450		case IP_PASSSEC:
 451		case IP_RECVORIGDSTADDR:
 452		case IP_CHECKSUM:
 453		case IP_RECVFRAGSIZE:
 454
 455		/* common stuff that need some love */
 456		case IP_TOS:
 457		case IP_TTL:
 458		case IP_MTU_DISCOVER:
 459		case IP_RECVERR:
 460
 461		/* possibly less common may deserve some love */
 462		case IP_MINTTL:
 463
 464		/* the following is apparently a no-op for plain TCP */
 465		case IP_RECVERR_RFC4884:
 466			return true;
 467		}
 468
 469		/* IP_OPTIONS is not supported, needs subflow care */
 470		/* IP_HDRINCL, IP_NODEFRAG are not supported, RAW specific */
 471		/* IP_MULTICAST_TTL, IP_MULTICAST_LOOP, IP_UNICAST_IF,
 472		 * IP_ADD_MEMBERSHIP, IP_ADD_SOURCE_MEMBERSHIP, IP_DROP_MEMBERSHIP,
 473		 * IP_DROP_SOURCE_MEMBERSHIP, IP_BLOCK_SOURCE, IP_UNBLOCK_SOURCE,
 474		 * MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP MCAST_JOIN_SOURCE_GROUP,
 475		 * MCAST_LEAVE_SOURCE_GROUP, MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE,
 476		 * MCAST_MSFILTER, IP_MULTICAST_ALL are not supported, better not deal
 477		 * with mcast stuff
 478		 */
 479		/* IP_IPSEC_POLICY, IP_XFRM_POLICY are nut supported, unrelated here */
 480		return false;
 481	}
 482	if (level == SOL_IPV6) {
 483		switch (optname) {
 484		case IPV6_V6ONLY:
 485
 486		/* the following are control cmsg related */
 487		case IPV6_RECVPKTINFO:
 488		case IPV6_2292PKTINFO:
 489		case IPV6_RECVHOPLIMIT:
 490		case IPV6_2292HOPLIMIT:
 491		case IPV6_RECVRTHDR:
 492		case IPV6_2292RTHDR:
 493		case IPV6_RECVHOPOPTS:
 494		case IPV6_2292HOPOPTS:
 495		case IPV6_RECVDSTOPTS:
 496		case IPV6_2292DSTOPTS:
 497		case IPV6_RECVTCLASS:
 498		case IPV6_FLOWINFO:
 499		case IPV6_RECVPATHMTU:
 500		case IPV6_RECVORIGDSTADDR:
 501		case IPV6_RECVFRAGSIZE:
 502
 503		/* the following ones need some love but are quite common */
 504		case IPV6_TCLASS:
 505		case IPV6_TRANSPARENT:
 506		case IPV6_FREEBIND:
 507		case IPV6_PKTINFO:
 508		case IPV6_2292PKTOPTIONS:
 509		case IPV6_UNICAST_HOPS:
 510		case IPV6_MTU_DISCOVER:
 511		case IPV6_MTU:
 512		case IPV6_RECVERR:
 513		case IPV6_FLOWINFO_SEND:
 514		case IPV6_FLOWLABEL_MGR:
 515		case IPV6_MINHOPCOUNT:
 516		case IPV6_DONTFRAG:
 517		case IPV6_AUTOFLOWLABEL:
 518
 519		/* the following one is a no-op for plain TCP */
 520		case IPV6_RECVERR_RFC4884:
 521			return true;
 522		}
 523
 524		/* IPV6_HOPOPTS, IPV6_RTHDRDSTOPTS, IPV6_RTHDR, IPV6_DSTOPTS are
 525		 * not supported
 526		 */
 527		/* IPV6_MULTICAST_HOPS, IPV6_MULTICAST_LOOP, IPV6_UNICAST_IF,
 528		 * IPV6_MULTICAST_IF, IPV6_ADDRFORM,
 529		 * IPV6_ADD_MEMBERSHIP, IPV6_DROP_MEMBERSHIP, IPV6_JOIN_ANYCAST,
 530		 * IPV6_LEAVE_ANYCAST, IPV6_MULTICAST_ALL, MCAST_JOIN_GROUP, MCAST_LEAVE_GROUP,
 531		 * MCAST_JOIN_SOURCE_GROUP, MCAST_LEAVE_SOURCE_GROUP,
 532		 * MCAST_BLOCK_SOURCE, MCAST_UNBLOCK_SOURCE, MCAST_MSFILTER
 533		 * are not supported better not deal with mcast
 534		 */
 535		/* IPV6_ROUTER_ALERT, IPV6_ROUTER_ALERT_ISOLATE are not supported, since are evil */
 536
 537		/* IPV6_IPSEC_POLICY, IPV6_XFRM_POLICY are not supported */
 538		/* IPV6_ADDR_PREFERENCES is not supported, we must be careful with subflows */
 539		return false;
 540	}
 541	if (level == SOL_TCP) {
 542		switch (optname) {
 543		/* the following are no-op or should work just fine */
 544		case TCP_THIN_DUPACK:
 545		case TCP_DEFER_ACCEPT:
 546
 547		/* the following need some love */
 548		case TCP_MAXSEG:
 549		case TCP_NODELAY:
 550		case TCP_THIN_LINEAR_TIMEOUTS:
 551		case TCP_CONGESTION:
 552		case TCP_CORK:
 553		case TCP_KEEPIDLE:
 554		case TCP_KEEPINTVL:
 555		case TCP_KEEPCNT:
 556		case TCP_SYNCNT:
 557		case TCP_SAVE_SYN:
 558		case TCP_LINGER2:
 559		case TCP_WINDOW_CLAMP:
 560		case TCP_QUICKACK:
 561		case TCP_USER_TIMEOUT:
 562		case TCP_TIMESTAMP:
 563		case TCP_NOTSENT_LOWAT:
 564		case TCP_TX_DELAY:
 565		case TCP_INQ:
 566		case TCP_FASTOPEN:
 567		case TCP_FASTOPEN_CONNECT:
 568		case TCP_FASTOPEN_KEY:
 569		case TCP_FASTOPEN_NO_COOKIE:
 570			return true;
 571		}
 572
 573		/* TCP_MD5SIG, TCP_MD5SIG_EXT are not supported, MD5 is not compatible with MPTCP */
 574
 575		/* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS,
 576		 * TCP_REPAIR_WINDOW are not supported, better avoid this mess
 577		 */
 578	}
 579	return false;
 580}
 581
 582static int mptcp_setsockopt_sol_tcp_congestion(struct mptcp_sock *msk, sockptr_t optval,
 583					       unsigned int optlen)
 584{
 585	struct mptcp_subflow_context *subflow;
 586	struct sock *sk = (struct sock *)msk;
 587	char name[TCP_CA_NAME_MAX];
 588	bool cap_net_admin;
 589	int ret;
 590
 591	if (optlen < 1)
 592		return -EINVAL;
 593
 594	ret = strncpy_from_sockptr(name, optval,
 595				   min_t(long, TCP_CA_NAME_MAX - 1, optlen));
 596	if (ret < 0)
 597		return -EFAULT;
 598
 599	name[ret] = 0;
 600
 601	cap_net_admin = ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN);
 602
 603	ret = 0;
 604	lock_sock(sk);
 605	sockopt_seq_inc(msk);
 606	mptcp_for_each_subflow(msk, subflow) {
 607		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 608		int err;
 609
 610		lock_sock(ssk);
 611		err = tcp_set_congestion_control(ssk, name, true, cap_net_admin);
 612		if (err < 0 && ret == 0)
 613			ret = err;
 614		subflow->setsockopt_seq = msk->setsockopt_seq;
 615		release_sock(ssk);
 616	}
 617
 618	if (ret == 0)
 619		strscpy(msk->ca_name, name, sizeof(msk->ca_name));
 620
 621	release_sock(sk);
 622	return ret;
 623}
 624
 625static int __mptcp_setsockopt_set_val(struct mptcp_sock *msk, int max,
 626				      int (*set_val)(struct sock *, int),
 627				      int *msk_val, int val)
 628{
 629	struct mptcp_subflow_context *subflow;
 630	int err = 0;
 631
 632	mptcp_for_each_subflow(msk, subflow) {
 633		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 634		int ret;
 635
 636		lock_sock(ssk);
 637		ret = set_val(ssk, val);
 638		err = err ? : ret;
 639		release_sock(ssk);
 640	}
 641
 642	if (!err) {
 643		*msk_val = val;
 644		sockopt_seq_inc(msk);
 645	}
 646
 647	return err;
 648}
 649
 650static int __mptcp_setsockopt_sol_tcp_cork(struct mptcp_sock *msk, int val)
 651{
 652	struct mptcp_subflow_context *subflow;
 653	struct sock *sk = (struct sock *)msk;
 654
 655	sockopt_seq_inc(msk);
 656	msk->cork = !!val;
 657	mptcp_for_each_subflow(msk, subflow) {
 658		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 659
 660		lock_sock(ssk);
 661		__tcp_sock_set_cork(ssk, !!val);
 662		release_sock(ssk);
 663	}
 664	if (!val)
 665		mptcp_check_and_set_pending(sk);
 666
 667	return 0;
 668}
 669
 670static int __mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, int val)
 671{
 672	struct mptcp_subflow_context *subflow;
 673	struct sock *sk = (struct sock *)msk;
 674
 675	sockopt_seq_inc(msk);
 676	msk->nodelay = !!val;
 677	mptcp_for_each_subflow(msk, subflow) {
 678		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 679
 680		lock_sock(ssk);
 681		__tcp_sock_set_nodelay(ssk, !!val);
 682		release_sock(ssk);
 683	}
 684	if (val)
 685		mptcp_check_and_set_pending(sk);
 686	return 0;
 687}
 688
 689static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname,
 690				       sockptr_t optval, unsigned int optlen)
 691{
 692	struct sock *sk = (struct sock *)msk;
 693	struct sock *ssk;
 694	int err;
 695
 696	err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
 697	if (err != 0)
 698		return err;
 699
 700	lock_sock(sk);
 701
 702	ssk = __mptcp_nmpc_sk(msk);
 703	if (IS_ERR(ssk)) {
 704		release_sock(sk);
 705		return PTR_ERR(ssk);
 706	}
 707
 708	switch (optname) {
 709	case IP_FREEBIND:
 710		inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
 711		break;
 712	case IP_TRANSPARENT:
 713		inet_assign_bit(TRANSPARENT, ssk,
 714				inet_test_bit(TRANSPARENT, sk));
 715		break;
 716	case IP_BIND_ADDRESS_NO_PORT:
 717		inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk,
 718				inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
 719		break;
 720	case IP_LOCAL_PORT_RANGE:
 721		WRITE_ONCE(inet_sk(ssk)->local_port_range,
 722			   READ_ONCE(inet_sk(sk)->local_port_range));
 723		break;
 724	default:
 725		release_sock(sk);
 726		WARN_ON_ONCE(1);
 727		return -EOPNOTSUPP;
 728	}
 729
 730	sockopt_seq_inc(msk);
 731	release_sock(sk);
 732	return 0;
 733}
 734
 735static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
 736				       sockptr_t optval, unsigned int optlen)
 737{
 738	struct mptcp_subflow_context *subflow;
 739	struct sock *sk = (struct sock *)msk;
 740	int err, val;
 741
 742	err = ip_setsockopt(sk, SOL_IP, optname, optval, optlen);
 743
 744	if (err != 0)
 745		return err;
 746
 747	lock_sock(sk);
 748	sockopt_seq_inc(msk);
 749	val = READ_ONCE(inet_sk(sk)->tos);
 750	mptcp_for_each_subflow(msk, subflow) {
 751		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 752		bool slow;
 753
 754		slow = lock_sock_fast(ssk);
 755		__ip_sock_set_tos(ssk, val);
 756		unlock_sock_fast(ssk, slow);
 757	}
 758	release_sock(sk);
 759
 760	return 0;
 761}
 762
 763static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
 764			       sockptr_t optval, unsigned int optlen)
 765{
 766	switch (optname) {
 767	case IP_FREEBIND:
 768	case IP_TRANSPARENT:
 769	case IP_BIND_ADDRESS_NO_PORT:
 770	case IP_LOCAL_PORT_RANGE:
 771		return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen);
 772	case IP_TOS:
 773		return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen);
 774	}
 775
 776	return -EOPNOTSUPP;
 777}
 778
 779static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
 780					  sockptr_t optval, unsigned int optlen)
 781{
 782	struct sock *sk = (struct sock *)msk;
 783	struct sock *ssk;
 784	int ret;
 785
 786	/* Limit to first subflow, before the connection establishment */
 787	lock_sock(sk);
 788	ssk = __mptcp_nmpc_sk(msk);
 789	if (IS_ERR(ssk)) {
 790		ret = PTR_ERR(ssk);
 791		goto unlock;
 792	}
 793
 794	ret = tcp_setsockopt(ssk, level, optname, optval, optlen);
 795
 796unlock:
 797	release_sock(sk);
 798	return ret;
 799}
 800
 801static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 802				    sockptr_t optval, unsigned int optlen)
 803{
 804	struct sock *sk = (void *)msk;
 805	int ret, val;
 806
 807	switch (optname) {
 808	case TCP_ULP:
 809		return -EOPNOTSUPP;
 810	case TCP_CONGESTION:
 811		return mptcp_setsockopt_sol_tcp_congestion(msk, optval, optlen);
 812	case TCP_DEFER_ACCEPT:
 813		/* See tcp.c: TCP_DEFER_ACCEPT does not fail */
 814		mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen);
 815		return 0;
 816	case TCP_FASTOPEN:
 817	case TCP_FASTOPEN_CONNECT:
 818	case TCP_FASTOPEN_KEY:
 819	case TCP_FASTOPEN_NO_COOKIE:
 820		return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname,
 821						      optval, optlen);
 822	}
 823
 824	ret = mptcp_get_int_option(msk, optval, optlen, &val);
 825	if (ret)
 826		return ret;
 827
 828	lock_sock(sk);
 829	switch (optname) {
 830	case TCP_INQ:
 831		if (val < 0 || val > 1)
 832			ret = -EINVAL;
 833		else
 834			msk->recvmsg_inq = !!val;
 835		break;
 836	case TCP_NOTSENT_LOWAT:
 837		WRITE_ONCE(msk->notsent_lowat, val);
 838		mptcp_write_space(sk);
 839		break;
 840	case TCP_CORK:
 841		ret = __mptcp_setsockopt_sol_tcp_cork(msk, val);
 842		break;
 843	case TCP_NODELAY:
 844		ret = __mptcp_setsockopt_sol_tcp_nodelay(msk, val);
 845		break;
 846	case TCP_KEEPIDLE:
 847		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPIDLE,
 848						 &tcp_sock_set_keepidle_locked,
 849						 &msk->keepalive_idle, val);
 850		break;
 851	case TCP_KEEPINTVL:
 852		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPINTVL,
 853						 &tcp_sock_set_keepintvl,
 854						 &msk->keepalive_intvl, val);
 855		break;
 856	case TCP_KEEPCNT:
 857		ret = __mptcp_setsockopt_set_val(msk, MAX_TCP_KEEPCNT,
 858						 &tcp_sock_set_keepcnt,
 859						 &msk->keepalive_cnt,
 860						 val);
 861		break;
 862	default:
 863		ret = -ENOPROTOOPT;
 864	}
 865
 866	release_sock(sk);
 867	return ret;
 868}
 869
 870int mptcp_setsockopt(struct sock *sk, int level, int optname,
 871		     sockptr_t optval, unsigned int optlen)
 872{
 873	struct mptcp_sock *msk = mptcp_sk(sk);
 874	struct sock *ssk;
 875
 876	pr_debug("msk=%p\n", msk);
 877
 878	if (level == SOL_SOCKET)
 879		return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
 880
 881	if (!mptcp_supported_sockopt(level, optname))
 882		return -ENOPROTOOPT;
 883
 884	/* @@ the meaning of setsockopt() when the socket is connected and
 885	 * there are multiple subflows is not yet defined. It is up to the
 886	 * MPTCP-level socket to configure the subflows until the subflow
 887	 * is in TCP fallback, when TCP socket options are passed through
 888	 * to the one remaining subflow.
 889	 */
 890	lock_sock(sk);
 891	ssk = __mptcp_tcp_fallback(msk);
 892	release_sock(sk);
 893	if (ssk)
 894		return tcp_setsockopt(ssk, level, optname, optval, optlen);
 895
 896	if (level == SOL_IP)
 897		return mptcp_setsockopt_v4(msk, optname, optval, optlen);
 898
 899	if (level == SOL_IPV6)
 900		return mptcp_setsockopt_v6(msk, optname, optval, optlen);
 901
 902	if (level == SOL_TCP)
 903		return mptcp_setsockopt_sol_tcp(msk, optname, optval, optlen);
 904
 905	return -EOPNOTSUPP;
 906}
 907
 908static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
 909					  char __user *optval, int __user *optlen)
 910{
 911	struct sock *sk = (struct sock *)msk;
 912	struct sock *ssk;
 913	int ret;
 914
 915	lock_sock(sk);
 916	ssk = msk->first;
 917	if (ssk) {
 918		ret = tcp_getsockopt(ssk, level, optname, optval, optlen);
 919		goto out;
 920	}
 921
 922	ssk = __mptcp_nmpc_sk(msk);
 923	if (IS_ERR(ssk)) {
 924		ret = PTR_ERR(ssk);
 925		goto out;
 926	}
 927
 928	ret = tcp_getsockopt(ssk, level, optname, optval, optlen);
 929
 930out:
 931	release_sock(sk);
 932	return ret;
 933}
 934
 935void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 936{
 937	struct sock *sk = (struct sock *)msk;
 938	u32 flags = 0;
 939	bool slow;
 940	u32 now;
 941
 942	memset(info, 0, sizeof(*info));
 943
 944	info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
 945	info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
 946	info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
 947	info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
 948
 949	if (inet_sk_state_load(sk) == TCP_LISTEN)
 950		return;
 951
 952	/* The following limits only make sense for the in-kernel PM */
 953	if (mptcp_pm_is_kernel(msk)) {
 954		info->mptcpi_subflows_max =
 955			mptcp_pm_get_subflows_max(msk);
 956		info->mptcpi_add_addr_signal_max =
 957			mptcp_pm_get_add_addr_signal_max(msk);
 958		info->mptcpi_add_addr_accepted_max =
 959			mptcp_pm_get_add_addr_accept_max(msk);
 960		info->mptcpi_local_addr_max =
 961			mptcp_pm_get_local_addr_max(msk);
 962	}
 963
 964	if (__mptcp_check_fallback(msk))
 965		flags |= MPTCP_INFO_FLAG_FALLBACK;
 966	if (READ_ONCE(msk->can_ack))
 967		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
 968	info->mptcpi_flags = flags;
 969
 970	slow = lock_sock_fast(sk);
 971	info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
 972	info->mptcpi_token = msk->token;
 973	info->mptcpi_write_seq = msk->write_seq;
 974	info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
 975	info->mptcpi_bytes_sent = msk->bytes_sent;
 976	info->mptcpi_bytes_received = msk->bytes_received;
 977	info->mptcpi_bytes_retrans = msk->bytes_retrans;
 978	info->mptcpi_subflows_total = info->mptcpi_subflows +
 979		__mptcp_has_initial_subflow(msk);
 980	now = tcp_jiffies32;
 981	info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent);
 982	info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv);
 983	unlock_sock_fast(sk, slow);
 984
 985	mptcp_data_lock(sk);
 986	info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv);
 987	info->mptcpi_snd_una = msk->snd_una;
 988	info->mptcpi_rcv_nxt = msk->ack_seq;
 989	info->mptcpi_bytes_acked = msk->bytes_acked;
 990	mptcp_data_unlock(sk);
 991}
 992EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
 993
 994static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen)
 995{
 996	struct mptcp_info m_info;
 997	int len;
 998
 999	if (get_user(len, optlen))
1000		return -EFAULT;
1001
1002	/* When used only to check if a fallback to TCP happened. */
1003	if (len == 0)
1004		return 0;
1005
1006	len = min_t(unsigned int, len, sizeof(struct mptcp_info));
1007
1008	mptcp_diag_fill_info(msk, &m_info);
1009
1010	if (put_user(len, optlen))
1011		return -EFAULT;
1012
1013	if (copy_to_user(optval, &m_info, len))
1014		return -EFAULT;
1015
1016	return 0;
1017}
1018
1019static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd,
1020				  char __user *optval,
1021				  u32 copied,
1022				  int __user *optlen)
1023{
1024	u32 copylen = min_t(u32, sfd->size_subflow_data, sizeof(*sfd));
1025
1026	if (copied)
1027		copied += sfd->size_subflow_data;
1028	else
1029		copied = copylen;
1030
1031	if (put_user(copied, optlen))
1032		return -EFAULT;
1033
1034	if (copy_to_user(optval, sfd, copylen))
1035		return -EFAULT;
1036
1037	return 0;
1038}
1039
1040static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd,
1041				  char __user *optval,
1042				  int __user *optlen)
1043{
1044	int len, copylen;
1045
1046	if (get_user(len, optlen))
1047		return -EFAULT;
1048
1049	/* if mptcp_subflow_data size is changed, need to adjust
1050	 * this function to deal with programs using old version.
1051	 */
1052	BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE);
1053
1054	if (len < MIN_INFO_OPTLEN_SIZE)
1055		return -EINVAL;
1056
1057	memset(sfd, 0, sizeof(*sfd));
1058
1059	copylen = min_t(unsigned int, len, sizeof(*sfd));
1060	if (copy_from_user(sfd, optval, copylen))
1061		return -EFAULT;
1062
1063	/* size_subflow_data is u32, but len is signed */
1064	if (sfd->size_subflow_data > INT_MAX ||
1065	    sfd->size_user > INT_MAX)
1066		return -EINVAL;
1067
1068	if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE ||
1069	    sfd->size_subflow_data > len)
1070		return -EINVAL;
1071
1072	if (sfd->num_subflows || sfd->size_kernel)
1073		return -EINVAL;
1074
1075	return len - sfd->size_subflow_data;
1076}
1077
1078static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval,
1079				    int __user *optlen)
1080{
1081	struct mptcp_subflow_context *subflow;
1082	struct sock *sk = (struct sock *)msk;
1083	unsigned int sfcount = 0, copied = 0;
1084	struct mptcp_subflow_data sfd;
1085	char __user *infoptr;
1086	int len;
1087
1088	len = mptcp_get_subflow_data(&sfd, optval, optlen);
1089	if (len < 0)
1090		return len;
1091
1092	sfd.size_kernel = sizeof(struct tcp_info);
1093	sfd.size_user = min_t(unsigned int, sfd.size_user,
1094			      sizeof(struct tcp_info));
1095
1096	infoptr = optval + sfd.size_subflow_data;
1097
1098	lock_sock(sk);
1099
1100	mptcp_for_each_subflow(msk, subflow) {
1101		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1102
1103		++sfcount;
1104
1105		if (len && len >= sfd.size_user) {
1106			struct tcp_info info;
1107
1108			tcp_get_info(ssk, &info);
1109
1110			if (copy_to_user(infoptr, &info, sfd.size_user)) {
1111				release_sock(sk);
1112				return -EFAULT;
1113			}
1114
1115			infoptr += sfd.size_user;
1116			copied += sfd.size_user;
1117			len -= sfd.size_user;
1118		}
1119	}
1120
1121	release_sock(sk);
1122
1123	sfd.num_subflows = sfcount;
1124
1125	if (mptcp_put_subflow_data(&sfd, optval, copied, optlen))
1126		return -EFAULT;
1127
1128	return 0;
1129}
1130
1131static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a)
1132{
1133	const struct inet_sock *inet = inet_sk(sk);
1134
1135	memset(a, 0, sizeof(*a));
1136
1137	if (sk->sk_family == AF_INET) {
1138		a->sin_local.sin_family = AF_INET;
1139		a->sin_local.sin_port = inet->inet_sport;
1140		a->sin_local.sin_addr.s_addr = inet->inet_rcv_saddr;
1141
1142		if (!a->sin_local.sin_addr.s_addr)
1143			a->sin_local.sin_addr.s_addr = inet->inet_saddr;
1144
1145		a->sin_remote.sin_family = AF_INET;
1146		a->sin_remote.sin_port = inet->inet_dport;
1147		a->sin_remote.sin_addr.s_addr = inet->inet_daddr;
1148#if IS_ENABLED(CONFIG_IPV6)
1149	} else if (sk->sk_family == AF_INET6) {
1150		const struct ipv6_pinfo *np = inet6_sk(sk);
1151
1152		if (WARN_ON_ONCE(!np))
1153			return;
1154
1155		a->sin6_local.sin6_family = AF_INET6;
1156		a->sin6_local.sin6_port = inet->inet_sport;
1157
1158		if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
1159			a->sin6_local.sin6_addr = np->saddr;
1160		else
1161			a->sin6_local.sin6_addr = sk->sk_v6_rcv_saddr;
1162
1163		a->sin6_remote.sin6_family = AF_INET6;
1164		a->sin6_remote.sin6_port = inet->inet_dport;
1165		a->sin6_remote.sin6_addr = sk->sk_v6_daddr;
1166#endif
1167	}
1168}
1169
1170static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval,
1171					  int __user *optlen)
1172{
1173	struct mptcp_subflow_context *subflow;
1174	struct sock *sk = (struct sock *)msk;
1175	unsigned int sfcount = 0, copied = 0;
1176	struct mptcp_subflow_data sfd;
1177	char __user *addrptr;
1178	int len;
1179
1180	len = mptcp_get_subflow_data(&sfd, optval, optlen);
1181	if (len < 0)
1182		return len;
1183
1184	sfd.size_kernel = sizeof(struct mptcp_subflow_addrs);
1185	sfd.size_user = min_t(unsigned int, sfd.size_user,
1186			      sizeof(struct mptcp_subflow_addrs));
1187
1188	addrptr = optval + sfd.size_subflow_data;
1189
1190	lock_sock(sk);
1191
1192	mptcp_for_each_subflow(msk, subflow) {
1193		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1194
1195		++sfcount;
1196
1197		if (len && len >= sfd.size_user) {
1198			struct mptcp_subflow_addrs a;
1199
1200			mptcp_get_sub_addrs(ssk, &a);
1201
1202			if (copy_to_user(addrptr, &a, sfd.size_user)) {
1203				release_sock(sk);
1204				return -EFAULT;
1205			}
1206
1207			addrptr += sfd.size_user;
1208			copied += sfd.size_user;
1209			len -= sfd.size_user;
1210		}
1211	}
1212
1213	release_sock(sk);
1214
1215	sfd.num_subflows = sfcount;
1216
1217	if (mptcp_put_subflow_data(&sfd, optval, copied, optlen))
1218		return -EFAULT;
1219
1220	return 0;
1221}
1222
1223static int mptcp_get_full_info(struct mptcp_full_info *mfi,
1224			       char __user *optval,
1225			       int __user *optlen)
1226{
1227	int len;
1228
1229	BUILD_BUG_ON(offsetof(struct mptcp_full_info, mptcp_info) !=
1230		     MIN_FULL_INFO_OPTLEN_SIZE);
1231
1232	if (get_user(len, optlen))
1233		return -EFAULT;
1234
1235	if (len < MIN_FULL_INFO_OPTLEN_SIZE)
1236		return -EINVAL;
1237
1238	memset(mfi, 0, sizeof(*mfi));
1239	if (copy_from_user(mfi, optval, MIN_FULL_INFO_OPTLEN_SIZE))
1240		return -EFAULT;
1241
1242	if (mfi->size_tcpinfo_kernel ||
1243	    mfi->size_sfinfo_kernel ||
1244	    mfi->num_subflows)
1245		return -EINVAL;
1246
1247	if (mfi->size_sfinfo_user > INT_MAX ||
1248	    mfi->size_tcpinfo_user > INT_MAX)
1249		return -EINVAL;
1250
1251	return len - MIN_FULL_INFO_OPTLEN_SIZE;
1252}
1253
1254static int mptcp_put_full_info(struct mptcp_full_info *mfi,
1255			       char __user *optval,
1256			       u32 copylen,
1257			       int __user *optlen)
1258{
1259	copylen += MIN_FULL_INFO_OPTLEN_SIZE;
1260	if (put_user(copylen, optlen))
1261		return -EFAULT;
1262
1263	if (copy_to_user(optval, mfi, copylen))
1264		return -EFAULT;
1265	return 0;
1266}
1267
1268static int mptcp_getsockopt_full_info(struct mptcp_sock *msk, char __user *optval,
1269				      int __user *optlen)
1270{
1271	unsigned int sfcount = 0, copylen = 0;
1272	struct mptcp_subflow_context *subflow;
1273	struct sock *sk = (struct sock *)msk;
1274	void __user *tcpinfoptr, *sfinfoptr;
1275	struct mptcp_full_info mfi;
1276	int len;
1277
1278	len = mptcp_get_full_info(&mfi, optval, optlen);
1279	if (len < 0)
1280		return len;
1281
1282	/* don't bother filling the mptcp info if there is not enough
1283	 * user-space-provided storage
1284	 */
1285	if (len > 0) {
1286		mptcp_diag_fill_info(msk, &mfi.mptcp_info);
1287		copylen += min_t(unsigned int, len, sizeof(struct mptcp_info));
1288	}
1289
1290	mfi.size_tcpinfo_kernel = sizeof(struct tcp_info);
1291	mfi.size_tcpinfo_user = min_t(unsigned int, mfi.size_tcpinfo_user,
1292				      sizeof(struct tcp_info));
1293	sfinfoptr = u64_to_user_ptr(mfi.subflow_info);
1294	mfi.size_sfinfo_kernel = sizeof(struct mptcp_subflow_info);
1295	mfi.size_sfinfo_user = min_t(unsigned int, mfi.size_sfinfo_user,
1296				     sizeof(struct mptcp_subflow_info));
1297	tcpinfoptr = u64_to_user_ptr(mfi.tcp_info);
1298
1299	lock_sock(sk);
1300	mptcp_for_each_subflow(msk, subflow) {
1301		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1302		struct mptcp_subflow_info sfinfo;
1303		struct tcp_info tcp_info;
1304
1305		if (sfcount++ >= mfi.size_arrays_user)
1306			continue;
1307
1308		/* fetch addr/tcp_info only if the user space buffers
1309		 * are wide enough
1310		 */
1311		memset(&sfinfo, 0, sizeof(sfinfo));
1312		sfinfo.id = subflow->subflow_id;
1313		if (mfi.size_sfinfo_user >
1314		    offsetof(struct mptcp_subflow_info, addrs))
1315			mptcp_get_sub_addrs(ssk, &sfinfo.addrs);
1316		if (copy_to_user(sfinfoptr, &sfinfo, mfi.size_sfinfo_user))
1317			goto fail_release;
1318
1319		if (mfi.size_tcpinfo_user) {
1320			tcp_get_info(ssk, &tcp_info);
1321			if (copy_to_user(tcpinfoptr, &tcp_info,
1322					 mfi.size_tcpinfo_user))
1323				goto fail_release;
1324		}
1325
1326		tcpinfoptr += mfi.size_tcpinfo_user;
1327		sfinfoptr += mfi.size_sfinfo_user;
1328	}
1329	release_sock(sk);
1330
1331	mfi.num_subflows = sfcount;
1332	if (mptcp_put_full_info(&mfi, optval, copylen, optlen))
1333		return -EFAULT;
1334
1335	return 0;
1336
1337fail_release:
1338	release_sock(sk);
1339	return -EFAULT;
1340}
1341
1342static int mptcp_put_int_option(struct mptcp_sock *msk, char __user *optval,
1343				int __user *optlen, int val)
1344{
1345	int len;
1346
1347	if (get_user(len, optlen))
1348		return -EFAULT;
1349	if (len < 0)
1350		return -EINVAL;
1351
1352	if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
1353		unsigned char ucval = (unsigned char)val;
1354
1355		len = 1;
1356		if (put_user(len, optlen))
1357			return -EFAULT;
1358		if (copy_to_user(optval, &ucval, 1))
1359			return -EFAULT;
1360	} else {
1361		len = min_t(unsigned int, len, sizeof(int));
1362		if (put_user(len, optlen))
1363			return -EFAULT;
1364		if (copy_to_user(optval, &val, len))
1365			return -EFAULT;
1366	}
1367
1368	return 0;
1369}
1370
1371static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
1372				    char __user *optval, int __user *optlen)
1373{
1374	struct sock *sk = (void *)msk;
1375
1376	switch (optname) {
1377	case TCP_ULP:
1378	case TCP_CONGESTION:
1379	case TCP_INFO:
1380	case TCP_CC_INFO:
1381	case TCP_DEFER_ACCEPT:
1382	case TCP_FASTOPEN:
1383	case TCP_FASTOPEN_CONNECT:
1384	case TCP_FASTOPEN_KEY:
1385	case TCP_FASTOPEN_NO_COOKIE:
1386		return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname,
1387						      optval, optlen);
1388	case TCP_INQ:
1389		return mptcp_put_int_option(msk, optval, optlen, msk->recvmsg_inq);
1390	case TCP_CORK:
1391		return mptcp_put_int_option(msk, optval, optlen, msk->cork);
1392	case TCP_NODELAY:
1393		return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
1394	case TCP_KEEPIDLE:
1395		return mptcp_put_int_option(msk, optval, optlen,
1396					    msk->keepalive_idle ? :
1397					    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_time) / HZ);
1398	case TCP_KEEPINTVL:
1399		return mptcp_put_int_option(msk, optval, optlen,
1400					    msk->keepalive_intvl ? :
1401					    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_intvl) / HZ);
1402	case TCP_KEEPCNT:
1403		return mptcp_put_int_option(msk, optval, optlen,
1404					    msk->keepalive_cnt ? :
1405					    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_keepalive_probes));
1406	case TCP_NOTSENT_LOWAT:
1407		return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
1408	case TCP_IS_MPTCP:
1409		return mptcp_put_int_option(msk, optval, optlen, 1);
1410	}
1411	return -EOPNOTSUPP;
1412}
1413
1414static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
1415			       char __user *optval, int __user *optlen)
1416{
1417	struct sock *sk = (void *)msk;
1418
1419	switch (optname) {
1420	case IP_TOS:
1421		return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
1422	case IP_BIND_ADDRESS_NO_PORT:
1423		return mptcp_put_int_option(msk, optval, optlen,
1424				inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
1425	case IP_LOCAL_PORT_RANGE:
1426		return mptcp_put_int_option(msk, optval, optlen,
1427				READ_ONCE(inet_sk(sk)->local_port_range));
1428	}
1429
1430	return -EOPNOTSUPP;
1431}
1432
1433static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname,
1434				      char __user *optval, int __user *optlen)
1435{
1436	switch (optname) {
1437	case MPTCP_INFO:
1438		return mptcp_getsockopt_info(msk, optval, optlen);
1439	case MPTCP_FULL_INFO:
1440		return mptcp_getsockopt_full_info(msk, optval, optlen);
1441	case MPTCP_TCPINFO:
1442		return mptcp_getsockopt_tcpinfo(msk, optval, optlen);
1443	case MPTCP_SUBFLOW_ADDRS:
1444		return mptcp_getsockopt_subflow_addrs(msk, optval, optlen);
1445	}
1446
1447	return -EOPNOTSUPP;
1448}
1449
1450int mptcp_getsockopt(struct sock *sk, int level, int optname,
1451		     char __user *optval, int __user *option)
1452{
1453	struct mptcp_sock *msk = mptcp_sk(sk);
1454	struct sock *ssk;
1455
1456	pr_debug("msk=%p\n", msk);
1457
1458	/* @@ the meaning of setsockopt() when the socket is connected and
1459	 * there are multiple subflows is not yet defined. It is up to the
1460	 * MPTCP-level socket to configure the subflows until the subflow
1461	 * is in TCP fallback, when socket options are passed through
1462	 * to the one remaining subflow.
1463	 */
1464	lock_sock(sk);
1465	ssk = __mptcp_tcp_fallback(msk);
1466	release_sock(sk);
1467	if (ssk)
1468		return tcp_getsockopt(ssk, level, optname, optval, option);
1469
1470	if (level == SOL_IP)
1471		return mptcp_getsockopt_v4(msk, optname, optval, option);
1472	if (level == SOL_TCP)
1473		return mptcp_getsockopt_sol_tcp(msk, optname, optval, option);
1474	if (level == SOL_MPTCP)
1475		return mptcp_getsockopt_sol_mptcp(msk, optname, optval, option);
1476	return -EOPNOTSUPP;
1477}
1478
1479static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
1480{
1481	static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK;
1482	struct sock *sk = (struct sock *)msk;
1483
1484	if (ssk->sk_prot->keepalive) {
1485		if (sock_flag(sk, SOCK_KEEPOPEN))
1486			ssk->sk_prot->keepalive(ssk, 1);
1487		else
1488			ssk->sk_prot->keepalive(ssk, 0);
1489	}
1490
1491	ssk->sk_priority = sk->sk_priority;
1492	ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
1493	ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
1494	ssk->sk_ipv6only = sk->sk_ipv6only;
1495	__ip_sock_set_tos(ssk, inet_sk(sk)->tos);
1496
1497	if (sk->sk_userlocks & tx_rx_locks) {
1498		ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
1499		if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
1500			WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
1501			mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
1502		}
1503		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1504			WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
1505	}
1506
1507	if (sock_flag(sk, SOCK_LINGER)) {
1508		ssk->sk_lingertime = sk->sk_lingertime;
1509		sock_set_flag(ssk, SOCK_LINGER);
1510	} else {
1511		sock_reset_flag(ssk, SOCK_LINGER);
1512	}
1513
1514	if (sk->sk_mark != ssk->sk_mark) {
1515		ssk->sk_mark = sk->sk_mark;
1516		sk_dst_reset(ssk);
1517	}
1518
1519	sock_valbool_flag(ssk, SOCK_DBG, sock_flag(sk, SOCK_DBG));
1520
1521	if (inet_csk(sk)->icsk_ca_ops != inet_csk(ssk)->icsk_ca_ops)
1522		tcp_set_congestion_control(ssk, msk->ca_name, false, true);
1523	__tcp_sock_set_cork(ssk, !!msk->cork);
1524	__tcp_sock_set_nodelay(ssk, !!msk->nodelay);
1525	tcp_sock_set_keepidle_locked(ssk, msk->keepalive_idle);
1526	tcp_sock_set_keepintvl(ssk, msk->keepalive_intvl);
1527	tcp_sock_set_keepcnt(ssk, msk->keepalive_cnt);
1528
1529	inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk));
1530	inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk));
1531	inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
1532	WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range));
1533}
1534
1535void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
1536{
1537	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
1538
1539	msk_owned_by_me(msk);
1540
1541	ssk->sk_rcvlowat = 0;
1542
1543	/* subflows must ignore any latency-related settings: will not affect
1544	 * the user-space - only the msk is relevant - but will foul the
1545	 * mptcp scheduler
1546	 */
1547	tcp_sk(ssk)->notsent_lowat = UINT_MAX;
1548
1549	if (READ_ONCE(subflow->setsockopt_seq) != msk->setsockopt_seq) {
1550		sync_socket_options(msk, ssk);
1551
1552		subflow->setsockopt_seq = msk->setsockopt_seq;
1553	}
1554}
1555
1556/* unfortunately this is different enough from the tcp version so
1557 * that we can't factor it out
1558 */
1559int mptcp_set_rcvlowat(struct sock *sk, int val)
1560{
1561	struct mptcp_subflow_context *subflow;
1562	int space, cap;
1563
1564	/* bpf can land here with a wrong sk type */
1565	if (sk->sk_protocol == IPPROTO_TCP)
1566		return -EINVAL;
1567
1568	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1569		cap = sk->sk_rcvbuf >> 1;
1570	else
1571		cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
1572	val = min(val, cap);
1573	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1574
1575	/* Check if we need to signal EPOLLIN right now */
1576	if (mptcp_epollin_ready(sk))
1577		sk->sk_data_ready(sk);
1578
1579	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1580		return 0;
1581
1582	space = mptcp_space_from_win(sk, val);
1583	if (space <= sk->sk_rcvbuf)
1584		return 0;
1585
1586	/* propagate the rcvbuf changes to all the subflows */
1587	WRITE_ONCE(sk->sk_rcvbuf, space);
1588	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
1589		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
1590		bool slow;
1591
1592		slow = lock_sock_fast(ssk);
1593		WRITE_ONCE(ssk->sk_rcvbuf, space);
1594		WRITE_ONCE(tcp_sk(ssk)->window_clamp, val);
1595		unlock_sock_fast(ssk, slow);
1596	}
1597	return 0;
1598}