Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.5.6.
   1/*
   2 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
   3 *
   4 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
   5 *  applies to SOCK_STREAM sockets only
   6 *  offers an alternative communication option for TCP-protocol sockets
   7 *  applicable with RoCE-cards only
   8 *
   9 *  Initial restrictions:
  10 *    - support for alternate links postponed
  11 *    - partial support for non-blocking sockets only
  12 *    - support for urgent data postponed
  13 *
  14 *  Copyright IBM Corp. 2016, 2018
  15 *
  16 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
  17 *              based on prototype from Frank Blaschka
  18 */
  19
  20#define KMSG_COMPONENT "smc"
  21#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  22
  23#include <linux/module.h>
  24#include <linux/socket.h>
  25#include <linux/workqueue.h>
  26#include <linux/in.h>
  27#include <linux/sched/signal.h>
  28
  29#include <net/sock.h>
  30#include <net/tcp.h>
  31#include <net/smc.h>
  32
  33#include "smc.h"
  34#include "smc_clc.h"
  35#include "smc_llc.h"
  36#include "smc_cdc.h"
  37#include "smc_core.h"
  38#include "smc_ib.h"
  39#include "smc_pnet.h"
  40#include "smc_tx.h"
  41#include "smc_rx.h"
  42#include "smc_close.h"
  43
  44static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
  45						 * creation
  46						 */
  47
  48struct smc_lgr_list smc_lgr_list = {		/* established link groups */
  49	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
  50	.list = LIST_HEAD_INIT(smc_lgr_list.list),
  51};
  52
  53static void smc_tcp_listen_work(struct work_struct *);
  54
  55static void smc_set_keepalive(struct sock *sk, int val)
  56{
  57	struct smc_sock *smc = smc_sk(sk);
  58
  59	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
  60}
  61
  62static struct smc_hashinfo smc_v4_hashinfo = {
  63	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
  64};
  65
  66static struct smc_hashinfo smc_v6_hashinfo = {
  67	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
  68};
  69
  70int smc_hash_sk(struct sock *sk)
  71{
  72	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
  73	struct hlist_head *head;
  74
  75	head = &h->ht;
  76
  77	write_lock_bh(&h->lock);
  78	sk_add_node(sk, head);
  79	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  80	write_unlock_bh(&h->lock);
  81
  82	return 0;
  83}
  84EXPORT_SYMBOL_GPL(smc_hash_sk);
  85
  86void smc_unhash_sk(struct sock *sk)
  87{
  88	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
  89
  90	write_lock_bh(&h->lock);
  91	if (sk_del_node_init(sk))
  92		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  93	write_unlock_bh(&h->lock);
  94}
  95EXPORT_SYMBOL_GPL(smc_unhash_sk);
  96
  97struct proto smc_proto = {
  98	.name		= "SMC",
  99	.owner		= THIS_MODULE,
 100	.keepalive	= smc_set_keepalive,
 101	.hash		= smc_hash_sk,
 102	.unhash		= smc_unhash_sk,
 103	.obj_size	= sizeof(struct smc_sock),
 104	.h.smc_hash	= &smc_v4_hashinfo,
 105	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
 106};
 107EXPORT_SYMBOL_GPL(smc_proto);
 108
 109struct proto smc_proto6 = {
 110	.name		= "SMC6",
 111	.owner		= THIS_MODULE,
 112	.keepalive	= smc_set_keepalive,
 113	.hash		= smc_hash_sk,
 114	.unhash		= smc_unhash_sk,
 115	.obj_size	= sizeof(struct smc_sock),
 116	.h.smc_hash	= &smc_v6_hashinfo,
 117	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
 118};
 119EXPORT_SYMBOL_GPL(smc_proto6);
 120
 121static int smc_release(struct socket *sock)
 122{
 123	struct sock *sk = sock->sk;
 124	struct smc_sock *smc;
 125	int rc = 0;
 126
 127	if (!sk)
 128		goto out;
 129
 130	smc = smc_sk(sk);
 131	if (sk->sk_state == SMC_LISTEN)
 132		/* smc_close_non_accepted() is called and acquires
 133		 * sock lock for child sockets again
 134		 */
 135		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
 136	else
 137		lock_sock(sk);
 138
 139	if (!smc->use_fallback) {
 140		rc = smc_close_active(smc);
 141		sock_set_flag(sk, SOCK_DEAD);
 142		sk->sk_shutdown |= SHUTDOWN_MASK;
 143	}
 144	if (smc->clcsock) {
 145		sock_release(smc->clcsock);
 146		smc->clcsock = NULL;
 147	}
 148	if (smc->use_fallback) {
 149		sock_put(sk); /* passive closing */
 150		sk->sk_state = SMC_CLOSED;
 151		sk->sk_state_change(sk);
 152	}
 153
 154	/* detach socket */
 155	sock_orphan(sk);
 156	sock->sk = NULL;
 157	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
 158		smc_conn_free(&smc->conn);
 159	release_sock(sk);
 160
 161	sk->sk_prot->unhash(sk);
 162	sock_put(sk); /* final sock_put */
 163out:
 164	return rc;
 165}
 166
 167static void smc_destruct(struct sock *sk)
 168{
 169	if (sk->sk_state != SMC_CLOSED)
 170		return;
 171	if (!sock_flag(sk, SOCK_DEAD))
 172		return;
 173
 174	sk_refcnt_debug_dec(sk);
 175}
 176
 177static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
 178				   int protocol)
 179{
 180	struct smc_sock *smc;
 181	struct proto *prot;
 182	struct sock *sk;
 183
 184	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
 185	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
 186	if (!sk)
 187		return NULL;
 188
 189	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
 190	sk->sk_state = SMC_INIT;
 191	sk->sk_destruct = smc_destruct;
 192	sk->sk_protocol = protocol;
 193	smc = smc_sk(sk);
 194	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 195	INIT_LIST_HEAD(&smc->accept_q);
 196	spin_lock_init(&smc->accept_q_lock);
 197	sk->sk_prot->hash(sk);
 198	sk_refcnt_debug_inc(sk);
 199
 200	return sk;
 201}
 202
 203static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
 204		    int addr_len)
 205{
 206	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
 207	struct sock *sk = sock->sk;
 208	struct smc_sock *smc;
 209	int rc;
 210
 211	smc = smc_sk(sk);
 212
 213	/* replicate tests from inet_bind(), to be safe wrt. future changes */
 214	rc = -EINVAL;
 215	if (addr_len < sizeof(struct sockaddr_in))
 216		goto out;
 217
 218	rc = -EAFNOSUPPORT;
 219	if (addr->sin_family != AF_INET &&
 220	    addr->sin_family != AF_INET6 &&
 221	    addr->sin_family != AF_UNSPEC)
 222		goto out;
 223	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
 224	if (addr->sin_family == AF_UNSPEC &&
 225	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
 226		goto out;
 227
 228	lock_sock(sk);
 229
 230	/* Check if socket is already active */
 231	rc = -EINVAL;
 232	if (sk->sk_state != SMC_INIT)
 233		goto out_rel;
 234
 235	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
 236	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
 237
 238out_rel:
 239	release_sock(sk);
 240out:
 241	return rc;
 242}
 243
 244static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 245				   unsigned long mask)
 246{
 247	/* options we don't get control via setsockopt for */
 248	nsk->sk_type = osk->sk_type;
 249	nsk->sk_sndbuf = osk->sk_sndbuf;
 250	nsk->sk_rcvbuf = osk->sk_rcvbuf;
 251	nsk->sk_sndtimeo = osk->sk_sndtimeo;
 252	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
 253	nsk->sk_mark = osk->sk_mark;
 254	nsk->sk_priority = osk->sk_priority;
 255	nsk->sk_rcvlowat = osk->sk_rcvlowat;
 256	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
 257	nsk->sk_err = osk->sk_err;
 258
 259	nsk->sk_flags &= ~mask;
 260	nsk->sk_flags |= osk->sk_flags & mask;
 261}
 262
 263#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
 264			     (1UL << SOCK_KEEPOPEN) | \
 265			     (1UL << SOCK_LINGER) | \
 266			     (1UL << SOCK_BROADCAST) | \
 267			     (1UL << SOCK_TIMESTAMP) | \
 268			     (1UL << SOCK_DBG) | \
 269			     (1UL << SOCK_RCVTSTAMP) | \
 270			     (1UL << SOCK_RCVTSTAMPNS) | \
 271			     (1UL << SOCK_LOCALROUTE) | \
 272			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
 273			     (1UL << SOCK_RXQ_OVFL) | \
 274			     (1UL << SOCK_WIFI_STATUS) | \
 275			     (1UL << SOCK_NOFCS) | \
 276			     (1UL << SOCK_FILTER_LOCKED))
 277/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 278 * clc socket (since smc is not called for these options from net/core)
 279 */
 280static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
 281{
 282	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
 283}
 284
 285#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
 286			     (1UL << SOCK_KEEPOPEN) | \
 287			     (1UL << SOCK_LINGER) | \
 288			     (1UL << SOCK_DBG))
 289/* copy only settings and flags relevant for smc from clc to smc socket */
 290static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 291{
 292	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 293}
 294
 295/* register a new rmb */
 296static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc)
 297{
 298	/* register memory region for new rmb */
 299	if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
 300		rmb_desc->regerr = 1;
 301		return -EFAULT;
 302	}
 303	return 0;
 304}
 305
 306static int smc_clnt_conf_first_link(struct smc_sock *smc)
 307{
 308	struct smc_link_group *lgr = smc->conn.lgr;
 309	struct smc_link *link;
 310	int rest;
 311	int rc;
 312
 313	link = &lgr->lnk[SMC_SINGLE_LINK];
 314	/* receive CONFIRM LINK request from server over RoCE fabric */
 315	rest = wait_for_completion_interruptible_timeout(
 316		&link->llc_confirm,
 317		SMC_LLC_WAIT_FIRST_TIME);
 318	if (rest <= 0) {
 319		struct smc_clc_msg_decline dclc;
 320
 321		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 322				      SMC_CLC_DECLINE);
 323		return rc;
 324	}
 325
 326	if (link->llc_confirm_rc)
 327		return SMC_CLC_DECL_RMBE_EC;
 328
 329	rc = smc_ib_modify_qp_rts(link);
 330	if (rc)
 331		return SMC_CLC_DECL_INTERR;
 332
 333	smc_wr_remember_qp_attr(link);
 334
 335	if (smc_reg_rmb(link, smc->conn.rmb_desc))
 336		return SMC_CLC_DECL_INTERR;
 337
 338	/* send CONFIRM LINK response over RoCE fabric */
 339	rc = smc_llc_send_confirm_link(link,
 340				       link->smcibdev->mac[link->ibport - 1],
 341				       &link->smcibdev->gid[link->ibport - 1],
 342				       SMC_LLC_RESP);
 343	if (rc < 0)
 344		return SMC_CLC_DECL_TCL;
 345
 346	/* receive ADD LINK request from server over RoCE fabric */
 347	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
 348							 SMC_LLC_WAIT_TIME);
 349	if (rest <= 0) {
 350		struct smc_clc_msg_decline dclc;
 351
 352		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 353				      SMC_CLC_DECLINE);
 354		return rc;
 355	}
 356
 357	/* send add link reject message, only one link supported for now */
 358	rc = smc_llc_send_add_link(link,
 359				   link->smcibdev->mac[link->ibport - 1],
 360				   &link->smcibdev->gid[link->ibport - 1],
 361				   SMC_LLC_RESP);
 362	if (rc < 0)
 363		return SMC_CLC_DECL_TCL;
 364
 365	link->state = SMC_LNK_ACTIVE;
 366
 367	return 0;
 368}
 369
 370static void smc_conn_save_peer_info(struct smc_sock *smc,
 371				    struct smc_clc_msg_accept_confirm *clc)
 372{
 373	smc->conn.peer_conn_idx = clc->conn_idx;
 374	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
 375	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
 376	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 377}
 378
 379static void smc_link_save_peer_info(struct smc_link *link,
 380				    struct smc_clc_msg_accept_confirm *clc)
 381{
 382	link->peer_qpn = ntoh24(clc->qpn);
 383	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
 384	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
 385	link->peer_psn = ntoh24(clc->psn);
 386	link->peer_mtu = clc->qp_mtu;
 387}
 388
 389/* setup for RDMA connection of client */
 390static int smc_connect_rdma(struct smc_sock *smc)
 391{
 392	struct smc_clc_msg_accept_confirm aclc;
 393	int local_contact = SMC_FIRST_CONTACT;
 394	struct smc_ib_device *smcibdev;
 395	struct smc_link *link;
 396	u8 srv_first_contact;
 397	int reason_code = 0;
 398	int rc = 0;
 399	u8 ibport;
 400
 401	sock_hold(&smc->sk); /* sock put in passive closing */
 402
 403	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
 404		/* peer has not signalled SMC-capability */
 405		smc->use_fallback = true;
 406		goto out_connected;
 407	}
 408
 409	/* IPSec connections opt out of SMC-R optimizations */
 410	if (using_ipsec(smc)) {
 411		reason_code = SMC_CLC_DECL_IPSEC;
 412		goto decline_rdma;
 413	}
 414
 415	/* PNET table look up: search active ib_device and port
 416	 * within same PNETID that also contains the ethernet device
 417	 * used for the internal TCP socket
 418	 */
 419	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
 420	if (!smcibdev) {
 421		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 422		goto decline_rdma;
 423	}
 424
 425	/* do inband token exchange */
 426	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
 427	if (reason_code < 0) {
 428		rc = reason_code;
 429		goto out_err;
 430	}
 431	if (reason_code > 0) /* configuration error */
 432		goto decline_rdma;
 433	/* receive SMC Accept CLC message */
 434	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
 435				       SMC_CLC_ACCEPT);
 436	if (reason_code < 0) {
 437		rc = reason_code;
 438		goto out_err;
 439	}
 440	if (reason_code > 0)
 441		goto decline_rdma;
 442
 443	srv_first_contact = aclc.hdr.flag;
 444	mutex_lock(&smc_create_lgr_pending);
 445	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
 446					srv_first_contact);
 447	if (local_contact < 0) {
 448		rc = local_contact;
 449		if (rc == -ENOMEM)
 450			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
 451		else if (rc == -ENOLINK)
 452			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
 453		goto decline_rdma_unlock;
 454	}
 455	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 456
 457	smc_conn_save_peer_info(smc, &aclc);
 458
 459	/* create send buffer and rmb */
 460	rc = smc_buf_create(smc);
 461	if (rc) {
 462		reason_code = SMC_CLC_DECL_MEM;
 463		goto decline_rdma_unlock;
 464	}
 465
 466	if (local_contact == SMC_FIRST_CONTACT)
 467		smc_link_save_peer_info(link, &aclc);
 468
 469	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
 470	if (rc) {
 471		reason_code = SMC_CLC_DECL_INTERR;
 472		goto decline_rdma_unlock;
 473	}
 474
 475	smc_close_init(smc);
 476	smc_rx_init(smc);
 477
 478	if (local_contact == SMC_FIRST_CONTACT) {
 479		rc = smc_ib_ready_link(link);
 480		if (rc) {
 481			reason_code = SMC_CLC_DECL_INTERR;
 482			goto decline_rdma_unlock;
 483		}
 484	} else {
 485		if (!smc->conn.rmb_desc->reused) {
 486			if (smc_reg_rmb(link, smc->conn.rmb_desc)) {
 487				reason_code = SMC_CLC_DECL_INTERR;
 488				goto decline_rdma_unlock;
 489			}
 490		}
 491	}
 492	smc_rmb_sync_sg_for_device(&smc->conn);
 493
 494	rc = smc_clc_send_confirm(smc);
 495	if (rc)
 496		goto out_err_unlock;
 497
 498	if (local_contact == SMC_FIRST_CONTACT) {
 499		/* QP confirmation over RoCE fabric */
 500		reason_code = smc_clnt_conf_first_link(smc);
 501		if (reason_code < 0) {
 502			rc = reason_code;
 503			goto out_err_unlock;
 504		}
 505		if (reason_code > 0)
 506			goto decline_rdma_unlock;
 507	}
 508
 509	mutex_unlock(&smc_create_lgr_pending);
 510	smc_tx_init(smc);
 511
 512out_connected:
 513	smc_copy_sock_settings_to_clc(smc);
 514	if (smc->sk.sk_state == SMC_INIT)
 515		smc->sk.sk_state = SMC_ACTIVE;
 516
 517	return rc ? rc : local_contact;
 518
 519decline_rdma_unlock:
 520	if (local_contact == SMC_FIRST_CONTACT)
 521		smc_lgr_forget(smc->conn.lgr);
 522	mutex_unlock(&smc_create_lgr_pending);
 523	smc_conn_free(&smc->conn);
 524decline_rdma:
 525	/* RDMA setup failed, switch back to TCP */
 526	smc->use_fallback = true;
 527	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 528		rc = smc_clc_send_decline(smc, reason_code);
 529		if (rc < 0)
 530			goto out_err;
 531	}
 532	goto out_connected;
 533
 534out_err_unlock:
 535	if (local_contact == SMC_FIRST_CONTACT)
 536		smc_lgr_forget(smc->conn.lgr);
 537	mutex_unlock(&smc_create_lgr_pending);
 538	smc_conn_free(&smc->conn);
 539out_err:
 540	if (smc->sk.sk_state == SMC_INIT)
 541		sock_put(&smc->sk); /* passive closing */
 542	return rc;
 543}
 544
 545static int smc_connect(struct socket *sock, struct sockaddr *addr,
 546		       int alen, int flags)
 547{
 548	struct sock *sk = sock->sk;
 549	struct smc_sock *smc;
 550	int rc = -EINVAL;
 551
 552	smc = smc_sk(sk);
 553
 554	/* separate smc parameter checking to be safe */
 555	if (alen < sizeof(addr->sa_family))
 556		goto out_err;
 557	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
 558		goto out_err;
 559
 560	lock_sock(sk);
 561	switch (sk->sk_state) {
 562	default:
 563		goto out;
 564	case SMC_ACTIVE:
 565		rc = -EISCONN;
 566		goto out;
 567	case SMC_INIT:
 568		rc = 0;
 569		break;
 570	}
 571
 572	smc_copy_sock_settings_to_clc(smc);
 573	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 574	rc = kernel_connect(smc->clcsock, addr, alen, flags);
 575	if (rc)
 576		goto out;
 577
 578	/* setup RDMA connection */
 579	rc = smc_connect_rdma(smc);
 580	if (rc < 0)
 581		goto out;
 582	else
 583		rc = 0; /* success cases including fallback */
 584
 585out:
 586	release_sock(sk);
 587out_err:
 588	return rc;
 589}
 590
 591static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 592{
 593	struct socket *new_clcsock = NULL;
 594	struct sock *lsk = &lsmc->sk;
 595	struct sock *new_sk;
 596	int rc;
 597
 598	release_sock(lsk);
 599	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
 600	if (!new_sk) {
 601		rc = -ENOMEM;
 602		lsk->sk_err = ENOMEM;
 603		*new_smc = NULL;
 604		lock_sock(lsk);
 605		goto out;
 606	}
 607	*new_smc = smc_sk(new_sk);
 608
 609	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
 610	lock_sock(lsk);
 611	if  (rc < 0)
 612		lsk->sk_err = -rc;
 613	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
 614		if (new_clcsock)
 615			sock_release(new_clcsock);
 616		new_sk->sk_state = SMC_CLOSED;
 617		sock_set_flag(new_sk, SOCK_DEAD);
 618		new_sk->sk_prot->unhash(new_sk);
 619		sock_put(new_sk); /* final */
 620		*new_smc = NULL;
 621		goto out;
 622	}
 623
 624	(*new_smc)->clcsock = new_clcsock;
 625out:
 626	return rc;
 627}
 628
 629/* add a just created sock to the accept queue of the listen sock as
 630 * candidate for a following socket accept call from user space
 631 */
 632static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
 633{
 634	struct smc_sock *par = smc_sk(parent);
 635
 636	sock_hold(sk); /* sock_put in smc_accept_unlink () */
 637	spin_lock(&par->accept_q_lock);
 638	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
 639	spin_unlock(&par->accept_q_lock);
 640	sk_acceptq_added(parent);
 641}
 642
 643/* remove a socket from the accept queue of its parental listening socket */
 644static void smc_accept_unlink(struct sock *sk)
 645{
 646	struct smc_sock *par = smc_sk(sk)->listen_smc;
 647
 648	spin_lock(&par->accept_q_lock);
 649	list_del_init(&smc_sk(sk)->accept_q);
 650	spin_unlock(&par->accept_q_lock);
 651	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
 652	sock_put(sk); /* sock_hold in smc_accept_enqueue */
 653}
 654
 655/* remove a sock from the accept queue to bind it to a new socket created
 656 * for a socket accept call from user space
 657 */
 658struct sock *smc_accept_dequeue(struct sock *parent,
 659				struct socket *new_sock)
 660{
 661	struct smc_sock *isk, *n;
 662	struct sock *new_sk;
 663
 664	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
 665		new_sk = (struct sock *)isk;
 666
 667		smc_accept_unlink(new_sk);
 668		if (new_sk->sk_state == SMC_CLOSED) {
 669			if (isk->clcsock) {
 670				sock_release(isk->clcsock);
 671				isk->clcsock = NULL;
 672			}
 673			new_sk->sk_prot->unhash(new_sk);
 674			sock_put(new_sk); /* final */
 675			continue;
 676		}
 677		if (new_sock)
 678			sock_graft(new_sk, new_sock);
 679		return new_sk;
 680	}
 681	return NULL;
 682}
 683
 684/* clean up for a created but never accepted sock */
 685void smc_close_non_accepted(struct sock *sk)
 686{
 687	struct smc_sock *smc = smc_sk(sk);
 688
 689	lock_sock(sk);
 690	if (!sk->sk_lingertime)
 691		/* wait for peer closing */
 692		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
 693	if (!smc->use_fallback) {
 694		smc_close_active(smc);
 695		sock_set_flag(sk, SOCK_DEAD);
 696		sk->sk_shutdown |= SHUTDOWN_MASK;
 697	}
 698	if (smc->clcsock) {
 699		struct socket *tcp;
 700
 701		tcp = smc->clcsock;
 702		smc->clcsock = NULL;
 703		sock_release(tcp);
 704	}
 705	if (smc->use_fallback) {
 706		sock_put(sk); /* passive closing */
 707		sk->sk_state = SMC_CLOSED;
 708	} else {
 709		if (sk->sk_state == SMC_CLOSED)
 710			smc_conn_free(&smc->conn);
 711	}
 712	release_sock(sk);
 713	sk->sk_prot->unhash(sk);
 714	sock_put(sk); /* final sock_put */
 715}
 716
 717static int smc_serv_conf_first_link(struct smc_sock *smc)
 718{
 719	struct smc_link_group *lgr = smc->conn.lgr;
 720	struct smc_link *link;
 721	int rest;
 722	int rc;
 723
 724	link = &lgr->lnk[SMC_SINGLE_LINK];
 725
 726	if (smc_reg_rmb(link, smc->conn.rmb_desc))
 727		return SMC_CLC_DECL_INTERR;
 728
 729	/* send CONFIRM LINK request to client over the RoCE fabric */
 730	rc = smc_llc_send_confirm_link(link,
 731				       link->smcibdev->mac[link->ibport - 1],
 732				       &link->smcibdev->gid[link->ibport - 1],
 733				       SMC_LLC_REQ);
 734	if (rc < 0)
 735		return SMC_CLC_DECL_TCL;
 736
 737	/* receive CONFIRM LINK response from client over the RoCE fabric */
 738	rest = wait_for_completion_interruptible_timeout(
 739		&link->llc_confirm_resp,
 740		SMC_LLC_WAIT_FIRST_TIME);
 741	if (rest <= 0) {
 742		struct smc_clc_msg_decline dclc;
 743
 744		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 745				      SMC_CLC_DECLINE);
 746		return rc;
 747	}
 748
 749	if (link->llc_confirm_resp_rc)
 750		return SMC_CLC_DECL_RMBE_EC;
 751
 752	/* send ADD LINK request to client over the RoCE fabric */
 753	rc = smc_llc_send_add_link(link,
 754				   link->smcibdev->mac[link->ibport - 1],
 755				   &link->smcibdev->gid[link->ibport - 1],
 756				   SMC_LLC_REQ);
 757	if (rc < 0)
 758		return SMC_CLC_DECL_TCL;
 759
 760	/* receive ADD LINK response from client over the RoCE fabric */
 761	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
 762							 SMC_LLC_WAIT_TIME);
 763	if (rest <= 0) {
 764		struct smc_clc_msg_decline dclc;
 765
 766		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 767				      SMC_CLC_DECLINE);
 768		return rc;
 769	}
 770
 771	link->state = SMC_LNK_ACTIVE;
 772
 773	return 0;
 774}
 775
 776/* setup for RDMA connection of server */
 777static void smc_listen_work(struct work_struct *work)
 778{
 779	struct smc_sock *new_smc = container_of(work, struct smc_sock,
 780						smc_listen_work);
 781	struct smc_clc_msg_proposal_prefix *pclc_prfx;
 782	struct socket *newclcsock = new_smc->clcsock;
 783	struct smc_sock *lsmc = new_smc->listen_smc;
 784	struct smc_clc_msg_accept_confirm cclc;
 785	int local_contact = SMC_REUSE_CONTACT;
 786	struct sock *newsmcsk = &new_smc->sk;
 787	struct smc_clc_msg_proposal *pclc;
 788	struct smc_ib_device *smcibdev;
 789	u8 buf[SMC_CLC_MAX_LEN];
 790	struct smc_link *link;
 791	int reason_code = 0;
 792	int rc = 0;
 793	u8 ibport;
 794
 795	/* check if peer is smc capable */
 796	if (!tcp_sk(newclcsock->sk)->syn_smc) {
 797		new_smc->use_fallback = true;
 798		goto out_connected;
 799	}
 800
 801	/* do inband token exchange -
 802	 *wait for and receive SMC Proposal CLC message
 803	 */
 804	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
 805				       SMC_CLC_PROPOSAL);
 806	if (reason_code < 0)
 807		goto out_err;
 808	if (reason_code > 0)
 809		goto decline_rdma;
 810
 811	/* IPSec connections opt out of SMC-R optimizations */
 812	if (using_ipsec(new_smc)) {
 813		reason_code = SMC_CLC_DECL_IPSEC;
 814		goto decline_rdma;
 815	}
 816
 817	/* PNET table look up: search active ib_device and port
 818	 * within same PNETID that also contains the ethernet device
 819	 * used for the internal TCP socket
 820	 */
 821	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
 822	if (!smcibdev) {
 823		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 824		goto decline_rdma;
 825	}
 826
 827	pclc = (struct smc_clc_msg_proposal *)&buf;
 828	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
 829
 830	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
 831	if (rc) {
 832		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 833		goto decline_rdma;
 834	}
 835
 836	/* allocate connection / link group */
 837	mutex_lock(&smc_create_lgr_pending);
 838	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
 839					0);
 840	if (local_contact < 0) {
 841		rc = local_contact;
 842		if (rc == -ENOMEM)
 843			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
 844		goto decline_rdma_unlock;
 845	}
 846	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 847
 848	/* create send buffer and rmb */
 849	rc = smc_buf_create(new_smc);
 850	if (rc) {
 851		reason_code = SMC_CLC_DECL_MEM;
 852		goto decline_rdma_unlock;
 853	}
 854
 855	smc_close_init(new_smc);
 856	smc_rx_init(new_smc);
 857
 858	if (local_contact != SMC_FIRST_CONTACT) {
 859		if (!new_smc->conn.rmb_desc->reused) {
 860			if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) {
 861				reason_code = SMC_CLC_DECL_INTERR;
 862				goto decline_rdma_unlock;
 863			}
 864		}
 865	}
 866	smc_rmb_sync_sg_for_device(&new_smc->conn);
 867
 868	rc = smc_clc_send_accept(new_smc, local_contact);
 869	if (rc)
 870		goto out_err_unlock;
 871
 872	/* receive SMC Confirm CLC message */
 873	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
 874				       SMC_CLC_CONFIRM);
 875	if (reason_code < 0)
 876		goto out_err_unlock;
 877	if (reason_code > 0)
 878		goto decline_rdma_unlock;
 879	smc_conn_save_peer_info(new_smc, &cclc);
 880	if (local_contact == SMC_FIRST_CONTACT)
 881		smc_link_save_peer_info(link, &cclc);
 882
 883	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
 884	if (rc) {
 885		reason_code = SMC_CLC_DECL_INTERR;
 886		goto decline_rdma_unlock;
 887	}
 888
 889	if (local_contact == SMC_FIRST_CONTACT) {
 890		rc = smc_ib_ready_link(link);
 891		if (rc) {
 892			reason_code = SMC_CLC_DECL_INTERR;
 893			goto decline_rdma_unlock;
 894		}
 895		/* QP confirmation over RoCE fabric */
 896		reason_code = smc_serv_conf_first_link(new_smc);
 897		if (reason_code < 0)
 898			/* peer is not aware of a problem */
 899			goto out_err_unlock;
 900		if (reason_code > 0)
 901			goto decline_rdma_unlock;
 902	}
 903
 904	smc_tx_init(new_smc);
 905	mutex_unlock(&smc_create_lgr_pending);
 906
 907out_connected:
 908	sk_refcnt_debug_inc(newsmcsk);
 909	if (newsmcsk->sk_state == SMC_INIT)
 910		newsmcsk->sk_state = SMC_ACTIVE;
 911enqueue:
 912	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
 913	if (lsmc->sk.sk_state == SMC_LISTEN) {
 914		smc_accept_enqueue(&lsmc->sk, newsmcsk);
 915	} else { /* no longer listening */
 916		smc_close_non_accepted(newsmcsk);
 917	}
 918	release_sock(&lsmc->sk);
 919
 920	/* Wake up accept */
 921	lsmc->sk.sk_data_ready(&lsmc->sk);
 922	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
 923	return;
 924
 925decline_rdma_unlock:
 926	if (local_contact == SMC_FIRST_CONTACT)
 927		smc_lgr_forget(new_smc->conn.lgr);
 928	mutex_unlock(&smc_create_lgr_pending);
 929decline_rdma:
 930	/* RDMA setup failed, switch back to TCP */
 931	smc_conn_free(&new_smc->conn);
 932	new_smc->use_fallback = true;
 933	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
 934		if (smc_clc_send_decline(new_smc, reason_code) < 0)
 935			goto out_err;
 936	}
 937	goto out_connected;
 938
 939out_err_unlock:
 940	if (local_contact == SMC_FIRST_CONTACT)
 941		smc_lgr_forget(new_smc->conn.lgr);
 942	mutex_unlock(&smc_create_lgr_pending);
 943out_err:
 944	if (newsmcsk->sk_state == SMC_INIT)
 945		sock_put(&new_smc->sk); /* passive closing */
 946	newsmcsk->sk_state = SMC_CLOSED;
 947	smc_conn_free(&new_smc->conn);
 948	goto enqueue; /* queue new sock with sk_err set */
 949}
 950
 951static void smc_tcp_listen_work(struct work_struct *work)
 952{
 953	struct smc_sock *lsmc = container_of(work, struct smc_sock,
 954					     tcp_listen_work);
 955	struct sock *lsk = &lsmc->sk;
 956	struct smc_sock *new_smc;
 957	int rc = 0;
 958
 959	lock_sock(lsk);
 960	while (lsk->sk_state == SMC_LISTEN) {
 961		rc = smc_clcsock_accept(lsmc, &new_smc);
 962		if (rc)
 963			goto out;
 964		if (!new_smc)
 965			continue;
 966
 967		new_smc->listen_smc = lsmc;
 968		new_smc->use_fallback = false; /* assume rdma capability first*/
 969		sock_hold(lsk); /* sock_put in smc_listen_work */
 970		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
 971		smc_copy_sock_settings_to_smc(new_smc);
 972		sock_hold(&new_smc->sk); /* sock_put in passive closing */
 973		if (!schedule_work(&new_smc->smc_listen_work))
 974			sock_put(&new_smc->sk);
 975	}
 976
 977out:
 978	release_sock(lsk);
 979	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
 980}
 981
 982static int smc_listen(struct socket *sock, int backlog)
 983{
 984	struct sock *sk = sock->sk;
 985	struct smc_sock *smc;
 986	int rc;
 987
 988	smc = smc_sk(sk);
 989	lock_sock(sk);
 990
 991	rc = -EINVAL;
 992	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
 993		goto out;
 994
 995	rc = 0;
 996	if (sk->sk_state == SMC_LISTEN) {
 997		sk->sk_max_ack_backlog = backlog;
 998		goto out;
 999	}
1000	/* some socket options are handled in core, so we could not apply
1001	 * them to the clc socket -- copy smc socket options to clc socket
1002	 */
1003	smc_copy_sock_settings_to_clc(smc);
1004	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1005
1006	rc = kernel_listen(smc->clcsock, backlog);
1007	if (rc)
1008		goto out;
1009	sk->sk_max_ack_backlog = backlog;
1010	sk->sk_ack_backlog = 0;
1011	sk->sk_state = SMC_LISTEN;
1012	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
1013	sock_hold(sk); /* sock_hold in tcp_listen_worker */
1014	if (!schedule_work(&smc->tcp_listen_work))
1015		sock_put(sk);
1016
1017out:
1018	release_sock(sk);
1019	return rc;
1020}
1021
1022static int smc_accept(struct socket *sock, struct socket *new_sock,
1023		      int flags, bool kern)
1024{
1025	struct sock *sk = sock->sk, *nsk;
1026	DECLARE_WAITQUEUE(wait, current);
1027	struct smc_sock *lsmc;
1028	long timeo;
1029	int rc = 0;
1030
1031	lsmc = smc_sk(sk);
1032	sock_hold(sk); /* sock_put below */
1033	lock_sock(sk);
1034
1035	if (lsmc->sk.sk_state != SMC_LISTEN) {
1036		rc = -EINVAL;
1037		goto out;
1038	}
1039
1040	/* Wait for an incoming connection */
1041	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1042	add_wait_queue_exclusive(sk_sleep(sk), &wait);
1043	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1044		set_current_state(TASK_INTERRUPTIBLE);
1045		if (!timeo) {
1046			rc = -EAGAIN;
1047			break;
1048		}
1049		release_sock(sk);
1050		timeo = schedule_timeout(timeo);
1051		/* wakeup by sk_data_ready in smc_listen_work() */
1052		sched_annotate_sleep();
1053		lock_sock(sk);
1054		if (signal_pending(current)) {
1055			rc = sock_intr_errno(timeo);
1056			break;
1057		}
1058	}
1059	set_current_state(TASK_RUNNING);
1060	remove_wait_queue(sk_sleep(sk), &wait);
1061
1062	if (!rc)
1063		rc = sock_error(nsk);
1064
1065out:
1066	release_sock(sk);
1067	sock_put(sk); /* sock_hold above */
1068	return rc;
1069}
1070
1071static int smc_getname(struct socket *sock, struct sockaddr *addr,
1072		       int peer)
1073{
1074	struct smc_sock *smc;
1075
1076	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1077	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1078		return -ENOTCONN;
1079
1080	smc = smc_sk(sock->sk);
1081
1082	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1083}
1084
1085static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1086{
1087	struct sock *sk = sock->sk;
1088	struct smc_sock *smc;
1089	int rc = -EPIPE;
1090
1091	smc = smc_sk(sk);
1092	lock_sock(sk);
1093	if ((sk->sk_state != SMC_ACTIVE) &&
1094	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1095	    (sk->sk_state != SMC_INIT))
1096		goto out;
1097	if (smc->use_fallback)
1098		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1099	else
1100		rc = smc_tx_sendmsg(smc, msg, len);
1101out:
1102	release_sock(sk);
1103	return rc;
1104}
1105
1106static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1107		       int flags)
1108{
1109	struct sock *sk = sock->sk;
1110	struct smc_sock *smc;
1111	int rc = -ENOTCONN;
1112
1113	smc = smc_sk(sk);
1114	lock_sock(sk);
1115	if ((sk->sk_state == SMC_INIT) ||
1116	    (sk->sk_state == SMC_LISTEN) ||
1117	    (sk->sk_state == SMC_CLOSED))
1118		goto out;
1119
1120	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1121		rc = 0;
1122		goto out;
1123	}
1124
1125	if (smc->use_fallback)
1126		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1127	else
1128		rc = smc_rx_recvmsg(smc, msg, len, flags);
1129
1130out:
1131	release_sock(sk);
1132	return rc;
1133}
1134
1135static __poll_t smc_accept_poll(struct sock *parent)
1136{
1137	struct smc_sock *isk = smc_sk(parent);
1138	__poll_t mask = 0;
1139
1140	spin_lock(&isk->accept_q_lock);
1141	if (!list_empty(&isk->accept_q))
1142		mask = EPOLLIN | EPOLLRDNORM;
1143	spin_unlock(&isk->accept_q_lock);
1144
1145	return mask;
1146}
1147
1148static __poll_t smc_poll(struct file *file, struct socket *sock,
1149			     poll_table *wait)
1150{
1151	struct sock *sk = sock->sk;
1152	__poll_t mask = 0;
1153	struct smc_sock *smc;
1154	int rc;
1155
1156	if (!sk)
1157		return EPOLLNVAL;
1158
1159	smc = smc_sk(sock->sk);
1160	sock_hold(sk);
1161	lock_sock(sk);
1162	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1163		/* delegate to CLC child sock */
1164		release_sock(sk);
1165		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1166		lock_sock(sk);
1167		sk->sk_err = smc->clcsock->sk->sk_err;
1168		if (sk->sk_err) {
1169			mask |= EPOLLERR;
1170		} else {
1171			/* if non-blocking connect finished ... */
1172			if (sk->sk_state == SMC_INIT &&
1173			    mask & EPOLLOUT &&
1174			    smc->clcsock->sk->sk_state != TCP_CLOSE) {
1175				rc = smc_connect_rdma(smc);
1176				if (rc < 0)
1177					mask |= EPOLLERR;
1178				/* success cases including fallback */
1179				mask |= EPOLLOUT | EPOLLWRNORM;
1180			}
1181		}
1182	} else {
1183		if (sk->sk_state != SMC_CLOSED) {
1184			release_sock(sk);
1185			sock_poll_wait(file, sk_sleep(sk), wait);
1186			lock_sock(sk);
1187		}
1188		if (sk->sk_err)
1189			mask |= EPOLLERR;
1190		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1191		    (sk->sk_state == SMC_CLOSED))
1192			mask |= EPOLLHUP;
1193		if (sk->sk_state == SMC_LISTEN) {
1194			/* woken up by sk_data_ready in smc_listen_work() */
1195			mask = smc_accept_poll(sk);
1196		} else {
1197			if (atomic_read(&smc->conn.sndbuf_space) ||
1198			    sk->sk_shutdown & SEND_SHUTDOWN) {
1199				mask |= EPOLLOUT | EPOLLWRNORM;
1200			} else {
1201				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1202				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1203			}
1204			if (atomic_read(&smc->conn.bytes_to_rcv))
1205				mask |= EPOLLIN | EPOLLRDNORM;
1206			if (sk->sk_shutdown & RCV_SHUTDOWN)
1207				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1208			if (sk->sk_state == SMC_APPCLOSEWAIT1)
1209				mask |= EPOLLIN;
1210		}
1211
1212	}
1213	release_sock(sk);
1214	sock_put(sk);
1215
1216	return mask;
1217}
1218
1219static int smc_shutdown(struct socket *sock, int how)
1220{
1221	struct sock *sk = sock->sk;
1222	struct smc_sock *smc;
1223	int rc = -EINVAL;
1224	int rc1 = 0;
1225
1226	smc = smc_sk(sk);
1227
1228	if ((how < SHUT_RD) || (how > SHUT_RDWR))
1229		return rc;
1230
1231	lock_sock(sk);
1232
1233	rc = -ENOTCONN;
1234	if ((sk->sk_state != SMC_LISTEN) &&
1235	    (sk->sk_state != SMC_ACTIVE) &&
1236	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1237	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1238	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1239	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1240	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
1241		goto out;
1242	if (smc->use_fallback) {
1243		rc = kernel_sock_shutdown(smc->clcsock, how);
1244		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1245		if (sk->sk_shutdown == SHUTDOWN_MASK)
1246			sk->sk_state = SMC_CLOSED;
1247		goto out;
1248	}
1249	switch (how) {
1250	case SHUT_RDWR:		/* shutdown in both directions */
1251		rc = smc_close_active(smc);
1252		break;
1253	case SHUT_WR:
1254		rc = smc_close_shutdown_write(smc);
1255		break;
1256	case SHUT_RD:
1257		rc = 0;
1258		/* nothing more to do because peer is not involved */
1259		break;
1260	}
1261	if (smc->clcsock)
1262		rc1 = kernel_sock_shutdown(smc->clcsock, how);
1263	/* map sock_shutdown_cmd constants to sk_shutdown value range */
1264	sk->sk_shutdown |= how + 1;
1265
1266out:
1267	release_sock(sk);
1268	return rc ? rc : rc1;
1269}
1270
1271static int smc_setsockopt(struct socket *sock, int level, int optname,
1272			  char __user *optval, unsigned int optlen)
1273{
1274	struct sock *sk = sock->sk;
1275	struct smc_sock *smc;
1276
1277	smc = smc_sk(sk);
1278
1279	/* generic setsockopts reaching us here always apply to the
1280	 * CLC socket
1281	 */
1282	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1283					     optval, optlen);
1284}
1285
1286static int smc_getsockopt(struct socket *sock, int level, int optname,
1287			  char __user *optval, int __user *optlen)
1288{
1289	struct smc_sock *smc;
1290
1291	smc = smc_sk(sock->sk);
1292	/* socket options apply to the CLC socket */
1293	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1294					     optval, optlen);
1295}
1296
1297static int smc_ioctl(struct socket *sock, unsigned int cmd,
1298		     unsigned long arg)
1299{
1300	struct smc_sock *smc;
1301
1302	smc = smc_sk(sock->sk);
1303	if (smc->use_fallback)
1304		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1305	else
1306		return sock_no_ioctl(sock, cmd, arg);
1307}
1308
1309static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1310			    int offset, size_t size, int flags)
1311{
1312	struct sock *sk = sock->sk;
1313	struct smc_sock *smc;
1314	int rc = -EPIPE;
1315
1316	smc = smc_sk(sk);
1317	lock_sock(sk);
1318	if (sk->sk_state != SMC_ACTIVE) {
1319		release_sock(sk);
1320		goto out;
1321	}
1322	release_sock(sk);
1323	if (smc->use_fallback)
1324		rc = kernel_sendpage(smc->clcsock, page, offset,
1325				     size, flags);
1326	else
1327		rc = sock_no_sendpage(sock, page, offset, size, flags);
1328
1329out:
1330	return rc;
1331}
1332
1333static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1334			       struct pipe_inode_info *pipe, size_t len,
1335				    unsigned int flags)
1336{
1337	struct sock *sk = sock->sk;
1338	struct smc_sock *smc;
1339	int rc = -ENOTCONN;
1340
1341	smc = smc_sk(sk);
1342	lock_sock(sk);
1343	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1344		goto out;
1345	if (smc->use_fallback) {
1346		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1347						    pipe, len, flags);
1348	} else {
1349		rc = -EOPNOTSUPP;
1350	}
1351out:
1352	release_sock(sk);
1353	return rc;
1354}
1355
1356/* must look like tcp */
1357static const struct proto_ops smc_sock_ops = {
1358	.family		= PF_SMC,
1359	.owner		= THIS_MODULE,
1360	.release	= smc_release,
1361	.bind		= smc_bind,
1362	.connect	= smc_connect,
1363	.socketpair	= sock_no_socketpair,
1364	.accept		= smc_accept,
1365	.getname	= smc_getname,
1366	.poll		= smc_poll,
1367	.ioctl		= smc_ioctl,
1368	.listen		= smc_listen,
1369	.shutdown	= smc_shutdown,
1370	.setsockopt	= smc_setsockopt,
1371	.getsockopt	= smc_getsockopt,
1372	.sendmsg	= smc_sendmsg,
1373	.recvmsg	= smc_recvmsg,
1374	.mmap		= sock_no_mmap,
1375	.sendpage	= smc_sendpage,
1376	.splice_read	= smc_splice_read,
1377};
1378
1379static int smc_create(struct net *net, struct socket *sock, int protocol,
1380		      int kern)
1381{
1382	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1383	struct smc_sock *smc;
1384	struct sock *sk;
1385	int rc;
1386
1387	rc = -ESOCKTNOSUPPORT;
1388	if (sock->type != SOCK_STREAM)
1389		goto out;
1390
1391	rc = -EPROTONOSUPPORT;
1392	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1393		goto out;
1394
1395	rc = -ENOBUFS;
1396	sock->ops = &smc_sock_ops;
1397	sk = smc_sock_alloc(net, sock, protocol);
1398	if (!sk)
1399		goto out;
1400
1401	/* create internal TCP socket for CLC handshake and fallback */
1402	smc = smc_sk(sk);
1403	smc->use_fallback = false; /* assume rdma capability first */
1404	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1405			      &smc->clcsock);
1406	if (rc) {
1407		sk_common_release(sk);
1408		goto out;
1409	}
1410	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1411	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1412
1413out:
1414	return rc;
1415}
1416
1417static const struct net_proto_family smc_sock_family_ops = {
1418	.family	= PF_SMC,
1419	.owner	= THIS_MODULE,
1420	.create	= smc_create,
1421};
1422
1423static int __init smc_init(void)
1424{
1425	int rc;
1426
1427	rc = smc_pnet_init();
1428	if (rc)
1429		return rc;
1430
1431	rc = smc_llc_init();
1432	if (rc) {
1433		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1434		goto out_pnet;
1435	}
1436
1437	rc = smc_cdc_init();
1438	if (rc) {
1439		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1440		goto out_pnet;
1441	}
1442
1443	rc = proto_register(&smc_proto, 1);
1444	if (rc) {
1445		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
1446		goto out_pnet;
1447	}
1448
1449	rc = proto_register(&smc_proto6, 1);
1450	if (rc) {
1451		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
1452		goto out_proto;
1453	}
1454
1455	rc = sock_register(&smc_sock_family_ops);
1456	if (rc) {
1457		pr_err("%s: sock_register fails with %d\n", __func__, rc);
1458		goto out_proto6;
1459	}
1460	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1461	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
1462
1463	rc = smc_ib_register_client();
1464	if (rc) {
1465		pr_err("%s: ib_register fails with %d\n", __func__, rc);
1466		goto out_sock;
1467	}
1468
1469	static_branch_enable(&tcp_have_smc);
1470	return 0;
1471
1472out_sock:
1473	sock_unregister(PF_SMC);
1474out_proto6:
1475	proto_unregister(&smc_proto6);
1476out_proto:
1477	proto_unregister(&smc_proto);
1478out_pnet:
1479	smc_pnet_exit();
1480	return rc;
1481}
1482
1483static void __exit smc_exit(void)
1484{
1485	struct smc_link_group *lgr, *lg;
1486	LIST_HEAD(lgr_freeing_list);
1487
1488	spin_lock_bh(&smc_lgr_list.lock);
1489	if (!list_empty(&smc_lgr_list.list))
1490		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1491	spin_unlock_bh(&smc_lgr_list.lock);
1492	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1493		list_del_init(&lgr->list);
1494		cancel_delayed_work_sync(&lgr->free_work);
1495		smc_lgr_free(lgr); /* free link group */
1496	}
1497	static_branch_disable(&tcp_have_smc);
1498	smc_ib_unregister_client();
1499	sock_unregister(PF_SMC);
1500	proto_unregister(&smc_proto6);
1501	proto_unregister(&smc_proto);
1502	smc_pnet_exit();
1503}
1504
1505module_init(smc_init);
1506module_exit(smc_exit);
1507
1508MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1509MODULE_DESCRIPTION("smc socket address family");
1510MODULE_LICENSE("GPL");
1511MODULE_ALIAS_NETPROTO(PF_SMC);