Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.5.6.
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
   4 *
   5 *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
   6 *  applies to SOCK_STREAM sockets only
   7 *  offers an alternative communication option for TCP-protocol sockets
   8 *  applicable with RoCE-cards only
   9 *
  10 *  Initial restrictions:
  11 *    - support for alternate links postponed
  12 *
  13 *  Copyright IBM Corp. 2016, 2018
  14 *
  15 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
  16 *              based on prototype from Frank Blaschka
  17 */
  18
  19#define KMSG_COMPONENT "smc"
  20#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  21
  22#include <linux/module.h>
  23#include <linux/socket.h>
  24#include <linux/workqueue.h>
  25#include <linux/in.h>
  26#include <linux/sched/signal.h>
  27#include <linux/if_vlan.h>
  28
  29#include <net/sock.h>
  30#include <net/tcp.h>
  31#include <net/smc.h>
  32#include <asm/ioctls.h>
  33
  34#include <net/net_namespace.h>
  35#include <net/netns/generic.h>
  36#include "smc_netns.h"
  37
  38#include "smc.h"
  39#include "smc_clc.h"
  40#include "smc_llc.h"
  41#include "smc_cdc.h"
  42#include "smc_core.h"
  43#include "smc_ib.h"
  44#include "smc_ism.h"
  45#include "smc_pnet.h"
  46#include "smc_tx.h"
  47#include "smc_rx.h"
  48#include "smc_close.h"
  49
  50static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
  51						 * creation on server
  52						 */
  53static DEFINE_MUTEX(smc_client_lgr_pending);	/* serialize link group
  54						 * creation on client
  55						 */
  56
  57static void smc_tcp_listen_work(struct work_struct *);
  58static void smc_connect_work(struct work_struct *);
  59
  60static void smc_set_keepalive(struct sock *sk, int val)
  61{
  62	struct smc_sock *smc = smc_sk(sk);
  63
  64	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
  65}
  66
  67static struct smc_hashinfo smc_v4_hashinfo = {
  68	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
  69};
  70
  71static struct smc_hashinfo smc_v6_hashinfo = {
  72	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
  73};
  74
  75int smc_hash_sk(struct sock *sk)
  76{
  77	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
  78	struct hlist_head *head;
  79
  80	head = &h->ht;
  81
  82	write_lock_bh(&h->lock);
  83	sk_add_node(sk, head);
  84	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  85	write_unlock_bh(&h->lock);
  86
  87	return 0;
  88}
  89EXPORT_SYMBOL_GPL(smc_hash_sk);
  90
  91void smc_unhash_sk(struct sock *sk)
  92{
  93	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
  94
  95	write_lock_bh(&h->lock);
  96	if (sk_del_node_init(sk))
  97		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  98	write_unlock_bh(&h->lock);
  99}
 100EXPORT_SYMBOL_GPL(smc_unhash_sk);
 101
 102struct proto smc_proto = {
 103	.name		= "SMC",
 104	.owner		= THIS_MODULE,
 105	.keepalive	= smc_set_keepalive,
 106	.hash		= smc_hash_sk,
 107	.unhash		= smc_unhash_sk,
 108	.obj_size	= sizeof(struct smc_sock),
 109	.h.smc_hash	= &smc_v4_hashinfo,
 110	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
 111};
 112EXPORT_SYMBOL_GPL(smc_proto);
 113
 114struct proto smc_proto6 = {
 115	.name		= "SMC6",
 116	.owner		= THIS_MODULE,
 117	.keepalive	= smc_set_keepalive,
 118	.hash		= smc_hash_sk,
 119	.unhash		= smc_unhash_sk,
 120	.obj_size	= sizeof(struct smc_sock),
 121	.h.smc_hash	= &smc_v6_hashinfo,
 122	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
 123};
 124EXPORT_SYMBOL_GPL(smc_proto6);
 125
 126static void smc_restore_fallback_changes(struct smc_sock *smc)
 127{
 128	smc->clcsock->file->private_data = smc->sk.sk_socket;
 129	smc->clcsock->file = NULL;
 130}
 131
 132static int __smc_release(struct smc_sock *smc)
 133{
 134	struct sock *sk = &smc->sk;
 135	int rc = 0;
 136
 137	if (!smc->use_fallback) {
 138		rc = smc_close_active(smc);
 139		sock_set_flag(sk, SOCK_DEAD);
 140		sk->sk_shutdown |= SHUTDOWN_MASK;
 141	} else {
 142		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
 143			sock_put(sk); /* passive closing */
 144		if (sk->sk_state == SMC_LISTEN) {
 145			/* wake up clcsock accept */
 146			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
 147		}
 148		sk->sk_state = SMC_CLOSED;
 149		sk->sk_state_change(sk);
 150		smc_restore_fallback_changes(smc);
 151	}
 152
 153	sk->sk_prot->unhash(sk);
 154
 155	if (sk->sk_state == SMC_CLOSED) {
 156		if (smc->clcsock) {
 157			release_sock(sk);
 158			smc_clcsock_release(smc);
 159			lock_sock(sk);
 160		}
 161		if (!smc->use_fallback)
 162			smc_conn_free(&smc->conn);
 163	}
 164
 165	return rc;
 166}
 167
 168static int smc_release(struct socket *sock)
 169{
 170	struct sock *sk = sock->sk;
 171	struct smc_sock *smc;
 172	int rc = 0;
 173
 174	if (!sk)
 175		goto out;
 176
 177	smc = smc_sk(sk);
 178
 179	/* cleanup for a dangling non-blocking connect */
 180	if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
 181		tcp_abort(smc->clcsock->sk, ECONNABORTED);
 182	flush_work(&smc->connect_work);
 183
 184	if (sk->sk_state == SMC_LISTEN)
 185		/* smc_close_non_accepted() is called and acquires
 186		 * sock lock for child sockets again
 187		 */
 188		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
 189	else
 190		lock_sock(sk);
 191
 192	rc = __smc_release(smc);
 193
 194	/* detach socket */
 195	sock_orphan(sk);
 196	sock->sk = NULL;
 197	release_sock(sk);
 198
 199	sock_put(sk); /* final sock_put */
 200out:
 201	return rc;
 202}
 203
 204static void smc_destruct(struct sock *sk)
 205{
 206	if (sk->sk_state != SMC_CLOSED)
 207		return;
 208	if (!sock_flag(sk, SOCK_DEAD))
 209		return;
 210
 211	sk_refcnt_debug_dec(sk);
 212}
 213
 214static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
 215				   int protocol)
 216{
 217	struct smc_sock *smc;
 218	struct proto *prot;
 219	struct sock *sk;
 220
 221	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
 222	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
 223	if (!sk)
 224		return NULL;
 225
 226	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
 227	sk->sk_state = SMC_INIT;
 228	sk->sk_destruct = smc_destruct;
 229	sk->sk_protocol = protocol;
 230	smc = smc_sk(sk);
 231	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 232	INIT_WORK(&smc->connect_work, smc_connect_work);
 233	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
 234	INIT_LIST_HEAD(&smc->accept_q);
 235	spin_lock_init(&smc->accept_q_lock);
 236	spin_lock_init(&smc->conn.send_lock);
 237	sk->sk_prot->hash(sk);
 238	sk_refcnt_debug_inc(sk);
 239	mutex_init(&smc->clcsock_release_lock);
 240
 241	return sk;
 242}
 243
 244static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
 245		    int addr_len)
 246{
 247	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
 248	struct sock *sk = sock->sk;
 249	struct smc_sock *smc;
 250	int rc;
 251
 252	smc = smc_sk(sk);
 253
 254	/* replicate tests from inet_bind(), to be safe wrt. future changes */
 255	rc = -EINVAL;
 256	if (addr_len < sizeof(struct sockaddr_in))
 257		goto out;
 258
 259	rc = -EAFNOSUPPORT;
 260	if (addr->sin_family != AF_INET &&
 261	    addr->sin_family != AF_INET6 &&
 262	    addr->sin_family != AF_UNSPEC)
 263		goto out;
 264	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
 265	if (addr->sin_family == AF_UNSPEC &&
 266	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
 267		goto out;
 268
 269	lock_sock(sk);
 270
 271	/* Check if socket is already active */
 272	rc = -EINVAL;
 273	if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
 274		goto out_rel;
 275
 276	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
 277	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
 278
 279out_rel:
 280	release_sock(sk);
 281out:
 282	return rc;
 283}
 284
 285static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
 286				   unsigned long mask)
 287{
 288	/* options we don't get control via setsockopt for */
 289	nsk->sk_type = osk->sk_type;
 290	nsk->sk_sndbuf = osk->sk_sndbuf;
 291	nsk->sk_rcvbuf = osk->sk_rcvbuf;
 292	nsk->sk_sndtimeo = osk->sk_sndtimeo;
 293	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
 294	nsk->sk_mark = osk->sk_mark;
 295	nsk->sk_priority = osk->sk_priority;
 296	nsk->sk_rcvlowat = osk->sk_rcvlowat;
 297	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
 298	nsk->sk_err = osk->sk_err;
 299
 300	nsk->sk_flags &= ~mask;
 301	nsk->sk_flags |= osk->sk_flags & mask;
 302}
 303
 304#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
 305			     (1UL << SOCK_KEEPOPEN) | \
 306			     (1UL << SOCK_LINGER) | \
 307			     (1UL << SOCK_BROADCAST) | \
 308			     (1UL << SOCK_TIMESTAMP) | \
 309			     (1UL << SOCK_DBG) | \
 310			     (1UL << SOCK_RCVTSTAMP) | \
 311			     (1UL << SOCK_RCVTSTAMPNS) | \
 312			     (1UL << SOCK_LOCALROUTE) | \
 313			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
 314			     (1UL << SOCK_RXQ_OVFL) | \
 315			     (1UL << SOCK_WIFI_STATUS) | \
 316			     (1UL << SOCK_NOFCS) | \
 317			     (1UL << SOCK_FILTER_LOCKED) | \
 318			     (1UL << SOCK_TSTAMP_NEW))
 319/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 320 * clc socket (since smc is not called for these options from net/core)
 321 */
 322static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
 323{
 324	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
 325}
 326
 327#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
 328			     (1UL << SOCK_KEEPOPEN) | \
 329			     (1UL << SOCK_LINGER) | \
 330			     (1UL << SOCK_DBG))
 331/* copy only settings and flags relevant for smc from clc to smc socket */
 332static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 333{
 334	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 335}
 336
 337/* register a new rmb, send confirm_rkey msg to register with peer */
 338static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
 339		       bool conf_rkey)
 340{
 341	if (!rmb_desc->wr_reg) {
 342		/* register memory region for new rmb */
 343		if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
 344			rmb_desc->regerr = 1;
 345			return -EFAULT;
 346		}
 347		rmb_desc->wr_reg = 1;
 348	}
 349	if (!conf_rkey)
 350		return 0;
 351	/* exchange confirm_rkey msg with peer */
 352	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
 353		rmb_desc->regerr = 1;
 354		return -EFAULT;
 355	}
 356	return 0;
 357}
 358
 359static int smc_clnt_conf_first_link(struct smc_sock *smc)
 360{
 361	struct net *net = sock_net(smc->clcsock->sk);
 362	struct smc_link_group *lgr = smc->conn.lgr;
 363	struct smc_link *link;
 364	int rest;
 365	int rc;
 366
 367	link = &lgr->lnk[SMC_SINGLE_LINK];
 368	/* receive CONFIRM LINK request from server over RoCE fabric */
 369	rest = wait_for_completion_interruptible_timeout(
 370		&link->llc_confirm,
 371		SMC_LLC_WAIT_FIRST_TIME);
 372	if (rest <= 0) {
 373		struct smc_clc_msg_decline dclc;
 374
 375		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 376				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
 377		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
 378	}
 379
 380	if (link->llc_confirm_rc)
 381		return SMC_CLC_DECL_RMBE_EC;
 382
 383	rc = smc_ib_modify_qp_rts(link);
 384	if (rc)
 385		return SMC_CLC_DECL_ERR_RDYLNK;
 386
 387	smc_wr_remember_qp_attr(link);
 388
 389	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
 390		return SMC_CLC_DECL_ERR_REGRMB;
 391
 392	/* send CONFIRM LINK response over RoCE fabric */
 393	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
 394	if (rc < 0)
 395		return SMC_CLC_DECL_TIMEOUT_CL;
 396
 397	/* receive ADD LINK request from server over RoCE fabric */
 398	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
 399							 SMC_LLC_WAIT_TIME);
 400	if (rest <= 0) {
 401		struct smc_clc_msg_decline dclc;
 402
 403		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 404				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
 405		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
 406	}
 407
 408	/* send add link reject message, only one link supported for now */
 409	rc = smc_llc_send_add_link(link,
 410				   link->smcibdev->mac[link->ibport - 1],
 411				   link->gid, SMC_LLC_RESP);
 412	if (rc < 0)
 413		return SMC_CLC_DECL_TIMEOUT_AL;
 414
 415	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
 416
 417	return 0;
 418}
 419
 420static void smcr_conn_save_peer_info(struct smc_sock *smc,
 421				     struct smc_clc_msg_accept_confirm *clc)
 422{
 423	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
 424
 425	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
 426	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
 427	smc->conn.peer_rmbe_size = bufsize;
 428	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 429	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
 430}
 431
 432static void smcd_conn_save_peer_info(struct smc_sock *smc,
 433				     struct smc_clc_msg_accept_confirm *clc)
 434{
 435	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
 436
 437	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
 438	smc->conn.peer_token = clc->token;
 439	/* msg header takes up space in the buffer */
 440	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
 441	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 442	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
 443}
 444
 445static void smc_conn_save_peer_info(struct smc_sock *smc,
 446				    struct smc_clc_msg_accept_confirm *clc)
 447{
 448	if (smc->conn.lgr->is_smcd)
 449		smcd_conn_save_peer_info(smc, clc);
 450	else
 451		smcr_conn_save_peer_info(smc, clc);
 452}
 453
 454static void smc_link_save_peer_info(struct smc_link *link,
 455				    struct smc_clc_msg_accept_confirm *clc)
 456{
 457	link->peer_qpn = ntoh24(clc->qpn);
 458	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
 459	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
 460	link->peer_psn = ntoh24(clc->psn);
 461	link->peer_mtu = clc->qp_mtu;
 462}
 463
 464static void smc_switch_to_fallback(struct smc_sock *smc)
 465{
 466	smc->use_fallback = true;
 467	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
 468		smc->clcsock->file = smc->sk.sk_socket->file;
 469		smc->clcsock->file->private_data = smc->clcsock;
 470	}
 471}
 472
 473/* fall back during connect */
 474static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
 475{
 476	smc_switch_to_fallback(smc);
 477	smc->fallback_rsn = reason_code;
 478	smc_copy_sock_settings_to_clc(smc);
 479	smc->connect_nonblock = 0;
 480	if (smc->sk.sk_state == SMC_INIT)
 481		smc->sk.sk_state = SMC_ACTIVE;
 482	return 0;
 483}
 484
 485/* decline and fall back during connect */
 486static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
 487{
 488	int rc;
 489
 490	if (reason_code < 0) { /* error, fallback is not possible */
 491		if (smc->sk.sk_state == SMC_INIT)
 492			sock_put(&smc->sk); /* passive closing */
 493		return reason_code;
 494	}
 495	if (reason_code != SMC_CLC_DECL_PEERDECL) {
 496		rc = smc_clc_send_decline(smc, reason_code);
 497		if (rc < 0) {
 498			if (smc->sk.sk_state == SMC_INIT)
 499				sock_put(&smc->sk); /* passive closing */
 500			return rc;
 501		}
 502	}
 503	return smc_connect_fallback(smc, reason_code);
 504}
 505
 506/* abort connecting */
 507static int smc_connect_abort(struct smc_sock *smc, int reason_code,
 508			     int local_contact)
 509{
 510	if (local_contact == SMC_FIRST_CONTACT)
 511		smc_lgr_forget(smc->conn.lgr);
 512	if (smc->conn.lgr->is_smcd)
 513		/* there is only one lgr role for SMC-D; use server lock */
 514		mutex_unlock(&smc_server_lgr_pending);
 515	else
 516		mutex_unlock(&smc_client_lgr_pending);
 517
 518	smc_conn_free(&smc->conn);
 519	smc->connect_nonblock = 0;
 520	return reason_code;
 521}
 522
 523/* check if there is a rdma device available for this connection. */
 524/* called for connect and listen */
 525static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
 526{
 527	/* PNET table look up: search active ib_device and port
 528	 * within same PNETID that also contains the ethernet device
 529	 * used for the internal TCP socket
 530	 */
 531	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
 532	if (!ini->ib_dev)
 533		return SMC_CLC_DECL_NOSMCRDEV;
 534	return 0;
 535}
 536
 537/* check if there is an ISM device available for this connection. */
 538/* called for connect and listen */
 539static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
 540{
 541	/* Find ISM device with same PNETID as connecting interface  */
 542	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
 543	if (!ini->ism_dev)
 544		return SMC_CLC_DECL_NOSMCDDEV;
 545	return 0;
 546}
 547
 548/* Check for VLAN ID and register it on ISM device just for CLC handshake */
 549static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
 550				      struct smc_init_info *ini)
 551{
 552	if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
 553		return SMC_CLC_DECL_ISMVLANERR;
 554	return 0;
 555}
 556
 557/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
 558 * used, the VLAN ID will be registered again during the connection setup.
 559 */
 560static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
 561					struct smc_init_info *ini)
 562{
 563	if (!is_smcd)
 564		return 0;
 565	if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
 566		return SMC_CLC_DECL_CNFERR;
 567	return 0;
 568}
 569
 570/* CLC handshake during connect */
 571static int smc_connect_clc(struct smc_sock *smc, int smc_type,
 572			   struct smc_clc_msg_accept_confirm *aclc,
 573			   struct smc_init_info *ini)
 574{
 575	int rc = 0;
 576
 577	/* do inband token exchange */
 578	rc = smc_clc_send_proposal(smc, smc_type, ini);
 579	if (rc)
 580		return rc;
 581	/* receive SMC Accept CLC message */
 582	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
 583				CLC_WAIT_TIME);
 584}
 585
 586/* setup for RDMA connection of client */
 587static int smc_connect_rdma(struct smc_sock *smc,
 588			    struct smc_clc_msg_accept_confirm *aclc,
 589			    struct smc_init_info *ini)
 590{
 591	struct smc_link *link;
 592	int reason_code = 0;
 593
 594	ini->is_smcd = false;
 595	ini->ib_lcl = &aclc->lcl;
 596	ini->ib_clcqpn = ntoh24(aclc->qpn);
 597	ini->srv_first_contact = aclc->hdr.flag;
 598
 599	mutex_lock(&smc_client_lgr_pending);
 600	reason_code = smc_conn_create(smc, ini);
 601	if (reason_code) {
 602		mutex_unlock(&smc_client_lgr_pending);
 603		return reason_code;
 604	}
 605	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
 606
 607	smc_conn_save_peer_info(smc, aclc);
 608
 609	/* create send buffer and rmb */
 610	if (smc_buf_create(smc, false))
 611		return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
 612					 ini->cln_first_contact);
 613
 614	if (ini->cln_first_contact == SMC_FIRST_CONTACT)
 615		smc_link_save_peer_info(link, aclc);
 616
 617	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
 618		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
 619					 ini->cln_first_contact);
 620
 621	smc_close_init(smc);
 622	smc_rx_init(smc);
 623
 624	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
 625		if (smc_ib_ready_link(link))
 626			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
 627						 ini->cln_first_contact);
 628	} else {
 629		if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
 630			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
 631						 ini->cln_first_contact);
 632	}
 633	smc_rmb_sync_sg_for_device(&smc->conn);
 634
 635	reason_code = smc_clc_send_confirm(smc);
 636	if (reason_code)
 637		return smc_connect_abort(smc, reason_code,
 638					 ini->cln_first_contact);
 639
 640	smc_tx_init(smc);
 641
 642	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
 643		/* QP confirmation over RoCE fabric */
 644		reason_code = smc_clnt_conf_first_link(smc);
 645		if (reason_code)
 646			return smc_connect_abort(smc, reason_code,
 647						 ini->cln_first_contact);
 648	}
 649	mutex_unlock(&smc_client_lgr_pending);
 650
 651	smc_copy_sock_settings_to_clc(smc);
 652	smc->connect_nonblock = 0;
 653	if (smc->sk.sk_state == SMC_INIT)
 654		smc->sk.sk_state = SMC_ACTIVE;
 655
 656	return 0;
 657}
 658
 659/* setup for ISM connection of client */
 660static int smc_connect_ism(struct smc_sock *smc,
 661			   struct smc_clc_msg_accept_confirm *aclc,
 662			   struct smc_init_info *ini)
 663{
 664	int rc = 0;
 665
 666	ini->is_smcd = true;
 667	ini->ism_gid = aclc->gid;
 668	ini->srv_first_contact = aclc->hdr.flag;
 669
 670	/* there is only one lgr role for SMC-D; use server lock */
 671	mutex_lock(&smc_server_lgr_pending);
 672	rc = smc_conn_create(smc, ini);
 673	if (rc) {
 674		mutex_unlock(&smc_server_lgr_pending);
 675		return rc;
 676	}
 677
 678	/* Create send and receive buffers */
 679	if (smc_buf_create(smc, true))
 680		return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
 681					 ini->cln_first_contact);
 682
 683	smc_conn_save_peer_info(smc, aclc);
 684	smc_close_init(smc);
 685	smc_rx_init(smc);
 686	smc_tx_init(smc);
 687
 688	rc = smc_clc_send_confirm(smc);
 689	if (rc)
 690		return smc_connect_abort(smc, rc, ini->cln_first_contact);
 691	mutex_unlock(&smc_server_lgr_pending);
 692
 693	smc_copy_sock_settings_to_clc(smc);
 694	smc->connect_nonblock = 0;
 695	if (smc->sk.sk_state == SMC_INIT)
 696		smc->sk.sk_state = SMC_ACTIVE;
 697
 698	return 0;
 699}
 700
 701/* perform steps before actually connecting */
 702static int __smc_connect(struct smc_sock *smc)
 703{
 704	bool ism_supported = false, rdma_supported = false;
 705	struct smc_clc_msg_accept_confirm aclc;
 706	struct smc_init_info ini = {0};
 707	int smc_type;
 708	int rc = 0;
 709
 710	if (smc->use_fallback)
 711		return smc_connect_fallback(smc, smc->fallback_rsn);
 712
 713	/* if peer has not signalled SMC-capability, fall back */
 714	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
 715		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
 716
 717	/* IPSec connections opt out of SMC-R optimizations */
 718	if (using_ipsec(smc))
 719		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
 720
 721	/* get vlan id from IP device */
 722	if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
 723		return smc_connect_decline_fallback(smc,
 724						    SMC_CLC_DECL_GETVLANERR);
 725
 726	/* check if there is an ism device available */
 727	if (!smc_find_ism_device(smc, &ini) &&
 728	    !smc_connect_ism_vlan_setup(smc, &ini)) {
 729		/* ISM is supported for this connection */
 730		ism_supported = true;
 731		smc_type = SMC_TYPE_D;
 732	}
 733
 734	/* check if there is a rdma device available */
 735	if (!smc_find_rdma_device(smc, &ini)) {
 736		/* RDMA is supported for this connection */
 737		rdma_supported = true;
 738		if (ism_supported)
 739			smc_type = SMC_TYPE_B; /* both */
 740		else
 741			smc_type = SMC_TYPE_R; /* only RDMA */
 742	}
 743
 744	/* if neither ISM nor RDMA are supported, fallback */
 745	if (!rdma_supported && !ism_supported)
 746		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
 747
 748	/* perform CLC handshake */
 749	rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
 750	if (rc) {
 751		smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
 752		return smc_connect_decline_fallback(smc, rc);
 753	}
 754
 755	/* depending on previous steps, connect using rdma or ism */
 756	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
 757		rc = smc_connect_rdma(smc, &aclc, &ini);
 758	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
 759		rc = smc_connect_ism(smc, &aclc, &ini);
 760	else
 761		rc = SMC_CLC_DECL_MODEUNSUPP;
 762	if (rc) {
 763		smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
 764		return smc_connect_decline_fallback(smc, rc);
 765	}
 766
 767	smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
 768	return 0;
 769}
 770
 771static void smc_connect_work(struct work_struct *work)
 772{
 773	struct smc_sock *smc = container_of(work, struct smc_sock,
 774					    connect_work);
 775	long timeo = smc->sk.sk_sndtimeo;
 776	int rc = 0;
 777
 778	if (!timeo)
 779		timeo = MAX_SCHEDULE_TIMEOUT;
 780	lock_sock(smc->clcsock->sk);
 781	if (smc->clcsock->sk->sk_err) {
 782		smc->sk.sk_err = smc->clcsock->sk->sk_err;
 783	} else if ((1 << smc->clcsock->sk->sk_state) &
 784					(TCPF_SYN_SENT | TCP_SYN_RECV)) {
 785		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
 786		if ((rc == -EPIPE) &&
 787		    ((1 << smc->clcsock->sk->sk_state) &
 788					(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
 789			rc = 0;
 790	}
 791	release_sock(smc->clcsock->sk);
 792	lock_sock(&smc->sk);
 793	if (rc != 0 || smc->sk.sk_err) {
 794		smc->sk.sk_state = SMC_CLOSED;
 795		if (rc == -EPIPE || rc == -EAGAIN)
 796			smc->sk.sk_err = EPIPE;
 797		else if (signal_pending(current))
 798			smc->sk.sk_err = -sock_intr_errno(timeo);
 799		sock_put(&smc->sk); /* passive closing */
 800		goto out;
 801	}
 802
 803	rc = __smc_connect(smc);
 804	if (rc < 0)
 805		smc->sk.sk_err = -rc;
 806
 807out:
 808	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
 809		if (smc->sk.sk_err) {
 810			smc->sk.sk_state_change(&smc->sk);
 811		} else { /* allow polling before and after fallback decision */
 812			smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
 813			smc->sk.sk_write_space(&smc->sk);
 814		}
 815	}
 816	release_sock(&smc->sk);
 817}
 818
 819static int smc_connect(struct socket *sock, struct sockaddr *addr,
 820		       int alen, int flags)
 821{
 822	struct sock *sk = sock->sk;
 823	struct smc_sock *smc;
 824	int rc = -EINVAL;
 825
 826	smc = smc_sk(sk);
 827
 828	/* separate smc parameter checking to be safe */
 829	if (alen < sizeof(addr->sa_family))
 830		goto out_err;
 831	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
 832		goto out_err;
 833
 834	lock_sock(sk);
 835	switch (sk->sk_state) {
 836	default:
 837		goto out;
 838	case SMC_ACTIVE:
 839		rc = -EISCONN;
 840		goto out;
 841	case SMC_INIT:
 842		rc = 0;
 843		break;
 844	}
 845
 846	smc_copy_sock_settings_to_clc(smc);
 847	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
 848	if (smc->connect_nonblock) {
 849		rc = -EALREADY;
 850		goto out;
 851	}
 852	rc = kernel_connect(smc->clcsock, addr, alen, flags);
 853	if (rc && rc != -EINPROGRESS)
 854		goto out;
 855
 856	sock_hold(&smc->sk); /* sock put in passive closing */
 857	if (flags & O_NONBLOCK) {
 858		if (schedule_work(&smc->connect_work))
 859			smc->connect_nonblock = 1;
 860		rc = -EINPROGRESS;
 861	} else {
 862		rc = __smc_connect(smc);
 863		if (rc < 0)
 864			goto out;
 865		else
 866			rc = 0; /* success cases including fallback */
 867	}
 868
 869out:
 870	release_sock(sk);
 871out_err:
 872	return rc;
 873}
 874
 875static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 876{
 877	struct socket *new_clcsock = NULL;
 878	struct sock *lsk = &lsmc->sk;
 879	struct sock *new_sk;
 880	int rc = -EINVAL;
 881
 882	release_sock(lsk);
 883	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
 884	if (!new_sk) {
 885		rc = -ENOMEM;
 886		lsk->sk_err = ENOMEM;
 887		*new_smc = NULL;
 888		lock_sock(lsk);
 889		goto out;
 890	}
 891	*new_smc = smc_sk(new_sk);
 892
 893	mutex_lock(&lsmc->clcsock_release_lock);
 894	if (lsmc->clcsock)
 895		rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
 896	mutex_unlock(&lsmc->clcsock_release_lock);
 897	lock_sock(lsk);
 898	if  (rc < 0)
 899		lsk->sk_err = -rc;
 900	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
 901		new_sk->sk_prot->unhash(new_sk);
 902		if (new_clcsock)
 903			sock_release(new_clcsock);
 904		new_sk->sk_state = SMC_CLOSED;
 905		sock_set_flag(new_sk, SOCK_DEAD);
 906		sock_put(new_sk); /* final */
 907		*new_smc = NULL;
 908		goto out;
 909	}
 910
 911	(*new_smc)->clcsock = new_clcsock;
 912out:
 913	return rc;
 914}
 915
 916/* add a just created sock to the accept queue of the listen sock as
 917 * candidate for a following socket accept call from user space
 918 */
 919static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
 920{
 921	struct smc_sock *par = smc_sk(parent);
 922
 923	sock_hold(sk); /* sock_put in smc_accept_unlink () */
 924	spin_lock(&par->accept_q_lock);
 925	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
 926	spin_unlock(&par->accept_q_lock);
 927	sk_acceptq_added(parent);
 928}
 929
 930/* remove a socket from the accept queue of its parental listening socket */
 931static void smc_accept_unlink(struct sock *sk)
 932{
 933	struct smc_sock *par = smc_sk(sk)->listen_smc;
 934
 935	spin_lock(&par->accept_q_lock);
 936	list_del_init(&smc_sk(sk)->accept_q);
 937	spin_unlock(&par->accept_q_lock);
 938	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
 939	sock_put(sk); /* sock_hold in smc_accept_enqueue */
 940}
 941
 942/* remove a sock from the accept queue to bind it to a new socket created
 943 * for a socket accept call from user space
 944 */
 945struct sock *smc_accept_dequeue(struct sock *parent,
 946				struct socket *new_sock)
 947{
 948	struct smc_sock *isk, *n;
 949	struct sock *new_sk;
 950
 951	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
 952		new_sk = (struct sock *)isk;
 953
 954		smc_accept_unlink(new_sk);
 955		if (new_sk->sk_state == SMC_CLOSED) {
 956			new_sk->sk_prot->unhash(new_sk);
 957			if (isk->clcsock) {
 958				sock_release(isk->clcsock);
 959				isk->clcsock = NULL;
 960			}
 961			sock_put(new_sk); /* final */
 962			continue;
 963		}
 964		if (new_sock) {
 965			sock_graft(new_sk, new_sock);
 966			if (isk->use_fallback) {
 967				smc_sk(new_sk)->clcsock->file = new_sock->file;
 968				isk->clcsock->file->private_data = isk->clcsock;
 969			}
 970		}
 971		return new_sk;
 972	}
 973	return NULL;
 974}
 975
 976/* clean up for a created but never accepted sock */
 977void smc_close_non_accepted(struct sock *sk)
 978{
 979	struct smc_sock *smc = smc_sk(sk);
 980
 981	lock_sock(sk);
 982	if (!sk->sk_lingertime)
 983		/* wait for peer closing */
 984		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
 985	__smc_release(smc);
 986	release_sock(sk);
 987	sock_put(sk); /* final sock_put */
 988}
 989
 990static int smc_serv_conf_first_link(struct smc_sock *smc)
 991{
 992	struct net *net = sock_net(smc->clcsock->sk);
 993	struct smc_link_group *lgr = smc->conn.lgr;
 994	struct smc_link *link;
 995	int rest;
 996	int rc;
 997
 998	link = &lgr->lnk[SMC_SINGLE_LINK];
 999
1000	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
1001		return SMC_CLC_DECL_ERR_REGRMB;
1002
1003	/* send CONFIRM LINK request to client over the RoCE fabric */
1004	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
1005	if (rc < 0)
1006		return SMC_CLC_DECL_TIMEOUT_CL;
1007
1008	/* receive CONFIRM LINK response from client over the RoCE fabric */
1009	rest = wait_for_completion_interruptible_timeout(
1010		&link->llc_confirm_resp,
1011		SMC_LLC_WAIT_FIRST_TIME);
1012	if (rest <= 0) {
1013		struct smc_clc_msg_decline dclc;
1014
1015		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1016				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1017		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1018	}
1019
1020	if (link->llc_confirm_resp_rc)
1021		return SMC_CLC_DECL_RMBE_EC;
1022
1023	/* send ADD LINK request to client over the RoCE fabric */
1024	rc = smc_llc_send_add_link(link,
1025				   link->smcibdev->mac[link->ibport - 1],
1026				   link->gid, SMC_LLC_REQ);
1027	if (rc < 0)
1028		return SMC_CLC_DECL_TIMEOUT_AL;
1029
1030	/* receive ADD LINK response from client over the RoCE fabric */
1031	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1032							 SMC_LLC_WAIT_TIME);
1033	if (rest <= 0) {
1034		struct smc_clc_msg_decline dclc;
1035
1036		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1037				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1038		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1039	}
1040
1041	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1042
1043	return 0;
1044}
1045
1046/* listen worker: finish */
1047static void smc_listen_out(struct smc_sock *new_smc)
1048{
1049	struct smc_sock *lsmc = new_smc->listen_smc;
1050	struct sock *newsmcsk = &new_smc->sk;
1051
1052	if (lsmc->sk.sk_state == SMC_LISTEN) {
1053		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1054		smc_accept_enqueue(&lsmc->sk, newsmcsk);
1055		release_sock(&lsmc->sk);
1056	} else { /* no longer listening */
1057		smc_close_non_accepted(newsmcsk);
1058	}
1059
1060	/* Wake up accept */
1061	lsmc->sk.sk_data_ready(&lsmc->sk);
1062	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1063}
1064
1065/* listen worker: finish in state connected */
1066static void smc_listen_out_connected(struct smc_sock *new_smc)
1067{
1068	struct sock *newsmcsk = &new_smc->sk;
1069
1070	sk_refcnt_debug_inc(newsmcsk);
1071	if (newsmcsk->sk_state == SMC_INIT)
1072		newsmcsk->sk_state = SMC_ACTIVE;
1073
1074	smc_listen_out(new_smc);
1075}
1076
1077/* listen worker: finish in error state */
1078static void smc_listen_out_err(struct smc_sock *new_smc)
1079{
1080	struct sock *newsmcsk = &new_smc->sk;
1081
1082	if (newsmcsk->sk_state == SMC_INIT)
1083		sock_put(&new_smc->sk); /* passive closing */
1084	newsmcsk->sk_state = SMC_CLOSED;
1085	smc_conn_free(&new_smc->conn);
1086
1087	smc_listen_out(new_smc);
1088}
1089
1090/* listen worker: decline and fall back if possible */
1091static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1092			       int local_contact)
1093{
1094	/* RDMA setup failed, switch back to TCP */
1095	if (local_contact == SMC_FIRST_CONTACT)
1096		smc_lgr_forget(new_smc->conn.lgr);
1097	if (reason_code < 0) { /* error, no fallback possible */
1098		smc_listen_out_err(new_smc);
1099		return;
1100	}
1101	smc_conn_free(&new_smc->conn);
1102	smc_switch_to_fallback(new_smc);
1103	new_smc->fallback_rsn = reason_code;
1104	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1105		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1106			smc_listen_out_err(new_smc);
1107			return;
1108		}
1109	}
1110	smc_listen_out_connected(new_smc);
1111}
1112
1113/* listen worker: check prefixes */
1114static int smc_listen_prfx_check(struct smc_sock *new_smc,
1115				 struct smc_clc_msg_proposal *pclc)
1116{
1117	struct smc_clc_msg_proposal_prefix *pclc_prfx;
1118	struct socket *newclcsock = new_smc->clcsock;
1119
1120	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1121	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1122		return SMC_CLC_DECL_DIFFPREFIX;
1123
1124	return 0;
1125}
1126
1127/* listen worker: initialize connection and buffers */
1128static int smc_listen_rdma_init(struct smc_sock *new_smc,
1129				struct smc_init_info *ini)
1130{
1131	int rc;
1132
1133	/* allocate connection / link group */
1134	rc = smc_conn_create(new_smc, ini);
1135	if (rc)
1136		return rc;
1137
1138	/* create send buffer and rmb */
1139	if (smc_buf_create(new_smc, false))
1140		return SMC_CLC_DECL_MEM;
1141
1142	return 0;
1143}
1144
1145/* listen worker: initialize connection and buffers for SMC-D */
1146static int smc_listen_ism_init(struct smc_sock *new_smc,
1147			       struct smc_clc_msg_proposal *pclc,
1148			       struct smc_init_info *ini)
1149{
1150	struct smc_clc_msg_smcd *pclc_smcd;
1151	int rc;
1152
1153	pclc_smcd = smc_get_clc_msg_smcd(pclc);
1154	ini->ism_gid = pclc_smcd->gid;
1155	rc = smc_conn_create(new_smc, ini);
1156	if (rc)
1157		return rc;
1158
1159	/* Check if peer can be reached via ISM device */
1160	if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1161			    new_smc->conn.lgr->vlan_id,
1162			    new_smc->conn.lgr->smcd)) {
1163		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1164			smc_lgr_forget(new_smc->conn.lgr);
1165		smc_conn_free(&new_smc->conn);
1166		return SMC_CLC_DECL_SMCDNOTALK;
1167	}
1168
1169	/* Create send and receive buffers */
1170	if (smc_buf_create(new_smc, true)) {
1171		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1172			smc_lgr_forget(new_smc->conn.lgr);
1173		smc_conn_free(&new_smc->conn);
1174		return SMC_CLC_DECL_MEM;
1175	}
1176
1177	return 0;
1178}
1179
1180/* listen worker: register buffers */
1181static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1182{
1183	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1184
1185	if (local_contact != SMC_FIRST_CONTACT) {
1186		if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1187			return SMC_CLC_DECL_ERR_REGRMB;
1188	}
1189	smc_rmb_sync_sg_for_device(&new_smc->conn);
1190
1191	return 0;
1192}
1193
1194/* listen worker: finish RDMA setup */
1195static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1196				  struct smc_clc_msg_accept_confirm *cclc,
1197				  int local_contact)
1198{
1199	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1200	int reason_code = 0;
1201
1202	if (local_contact == SMC_FIRST_CONTACT)
1203		smc_link_save_peer_info(link, cclc);
1204
1205	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1206		reason_code = SMC_CLC_DECL_ERR_RTOK;
1207		goto decline;
1208	}
1209
1210	if (local_contact == SMC_FIRST_CONTACT) {
1211		if (smc_ib_ready_link(link)) {
1212			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1213			goto decline;
1214		}
1215		/* QP confirmation over RoCE fabric */
1216		reason_code = smc_serv_conf_first_link(new_smc);
1217		if (reason_code)
1218			goto decline;
1219	}
1220	return 0;
1221
1222decline:
1223	smc_listen_decline(new_smc, reason_code, local_contact);
1224	return reason_code;
1225}
1226
1227/* setup for RDMA connection of server */
1228static void smc_listen_work(struct work_struct *work)
1229{
1230	struct smc_sock *new_smc = container_of(work, struct smc_sock,
1231						smc_listen_work);
1232	struct socket *newclcsock = new_smc->clcsock;
1233	struct smc_clc_msg_accept_confirm cclc;
1234	struct smc_clc_msg_proposal *pclc;
1235	struct smc_init_info ini = {0};
1236	bool ism_supported = false;
1237	u8 buf[SMC_CLC_MAX_LEN];
1238	int rc = 0;
1239
1240	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
1241		return smc_listen_out_err(new_smc);
1242
1243	if (new_smc->use_fallback) {
1244		smc_listen_out_connected(new_smc);
1245		return;
1246	}
1247
1248	/* check if peer is smc capable */
1249	if (!tcp_sk(newclcsock->sk)->syn_smc) {
1250		smc_switch_to_fallback(new_smc);
1251		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1252		smc_listen_out_connected(new_smc);
1253		return;
1254	}
1255
1256	/* do inband token exchange -
1257	 * wait for and receive SMC Proposal CLC message
1258	 */
1259	pclc = (struct smc_clc_msg_proposal *)&buf;
1260	rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1261			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1262	if (rc)
1263		goto out_decl;
1264
1265	/* IPSec connections opt out of SMC-R optimizations */
1266	if (using_ipsec(new_smc)) {
1267		rc = SMC_CLC_DECL_IPSEC;
1268		goto out_decl;
1269	}
1270
1271	/* check for matching IP prefix and subnet length */
1272	rc = smc_listen_prfx_check(new_smc, pclc);
1273	if (rc)
1274		goto out_decl;
1275
1276	/* get vlan id from IP device */
1277	if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
1278		rc = SMC_CLC_DECL_GETVLANERR;
1279		goto out_decl;
1280	}
1281
1282	mutex_lock(&smc_server_lgr_pending);
1283	smc_close_init(new_smc);
1284	smc_rx_init(new_smc);
1285	smc_tx_init(new_smc);
1286
1287	/* check if ISM is available */
1288	if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
1289		ini.is_smcd = true; /* prepare ISM check */
1290		rc = smc_find_ism_device(new_smc, &ini);
1291		if (!rc)
1292			rc = smc_listen_ism_init(new_smc, pclc, &ini);
1293		if (!rc)
1294			ism_supported = true;
1295		else if (pclc->hdr.path == SMC_TYPE_D)
1296			goto out_unlock; /* skip RDMA and decline */
1297	}
1298
1299	/* check if RDMA is available */
1300	if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
1301		/* prepare RDMA check */
1302		ini.is_smcd = false;
1303		ini.ism_dev = NULL;
1304		ini.ib_lcl = &pclc->lcl;
1305		rc = smc_find_rdma_device(new_smc, &ini);
1306		if (rc) {
1307			/* no RDMA device found */
1308			if (pclc->hdr.path == SMC_TYPE_B)
1309				/* neither ISM nor RDMA device found */
1310				rc = SMC_CLC_DECL_NOSMCDEV;
1311			goto out_unlock;
1312		}
1313		rc = smc_listen_rdma_init(new_smc, &ini);
1314		if (rc)
1315			goto out_unlock;
1316		rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
1317		if (rc)
1318			goto out_unlock;
1319	}
1320
1321	/* send SMC Accept CLC message */
1322	rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
1323	if (rc)
1324		goto out_unlock;
1325
1326	/* SMC-D does not need this lock any more */
1327	if (ism_supported)
1328		mutex_unlock(&smc_server_lgr_pending);
1329
1330	/* receive SMC Confirm CLC message */
1331	rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1332			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1333	if (rc) {
1334		if (!ism_supported)
1335			goto out_unlock;
1336		goto out_decl;
1337	}
1338
1339	/* finish worker */
1340	if (!ism_supported) {
1341		rc = smc_listen_rdma_finish(new_smc, &cclc,
1342					    ini.cln_first_contact);
1343		mutex_unlock(&smc_server_lgr_pending);
1344		if (rc)
1345			return;
1346	}
1347	smc_conn_save_peer_info(new_smc, &cclc);
1348	smc_listen_out_connected(new_smc);
1349	return;
1350
1351out_unlock:
1352	mutex_unlock(&smc_server_lgr_pending);
1353out_decl:
1354	smc_listen_decline(new_smc, rc, ini.cln_first_contact);
1355}
1356
1357static void smc_tcp_listen_work(struct work_struct *work)
1358{
1359	struct smc_sock *lsmc = container_of(work, struct smc_sock,
1360					     tcp_listen_work);
1361	struct sock *lsk = &lsmc->sk;
1362	struct smc_sock *new_smc;
1363	int rc = 0;
1364
1365	lock_sock(lsk);
1366	while (lsk->sk_state == SMC_LISTEN) {
1367		rc = smc_clcsock_accept(lsmc, &new_smc);
1368		if (rc)
1369			goto out;
1370		if (!new_smc)
1371			continue;
1372
1373		new_smc->listen_smc = lsmc;
1374		new_smc->use_fallback = lsmc->use_fallback;
1375		new_smc->fallback_rsn = lsmc->fallback_rsn;
1376		sock_hold(lsk); /* sock_put in smc_listen_work */
1377		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1378		smc_copy_sock_settings_to_smc(new_smc);
1379		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1380		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1381		sock_hold(&new_smc->sk); /* sock_put in passive closing */
1382		if (!schedule_work(&new_smc->smc_listen_work))
1383			sock_put(&new_smc->sk);
1384	}
1385
1386out:
1387	release_sock(lsk);
1388	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1389}
1390
1391static int smc_listen(struct socket *sock, int backlog)
1392{
1393	struct sock *sk = sock->sk;
1394	struct smc_sock *smc;
1395	int rc;
1396
1397	smc = smc_sk(sk);
1398	lock_sock(sk);
1399
1400	rc = -EINVAL;
1401	if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
1402	    smc->connect_nonblock)
1403		goto out;
1404
1405	rc = 0;
1406	if (sk->sk_state == SMC_LISTEN) {
1407		sk->sk_max_ack_backlog = backlog;
1408		goto out;
1409	}
1410	/* some socket options are handled in core, so we could not apply
1411	 * them to the clc socket -- copy smc socket options to clc socket
1412	 */
1413	smc_copy_sock_settings_to_clc(smc);
1414	if (!smc->use_fallback)
1415		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1416
1417	rc = kernel_listen(smc->clcsock, backlog);
1418	if (rc)
1419		goto out;
1420	sk->sk_max_ack_backlog = backlog;
1421	sk->sk_ack_backlog = 0;
1422	sk->sk_state = SMC_LISTEN;
1423	sock_hold(sk); /* sock_hold in tcp_listen_worker */
1424	if (!schedule_work(&smc->tcp_listen_work))
1425		sock_put(sk);
1426
1427out:
1428	release_sock(sk);
1429	return rc;
1430}
1431
/* accept() on a listening smc socket
 *
 * Returns -EINVAL if the socket is not in state SMC_LISTEN. Otherwise waits
 * (bounded by the receive timeout, or not at all with O_NONBLOCK) until
 * smc_accept_dequeue() delivers a socket for @new_sock. Returns -EAGAIN when
 * the timeout is exhausted, sock_intr_errno() when a signal is pending, or a
 * pending error of the dequeued socket. If the defer-accept option is set and
 * the call is blocking, additionally waits for data on the new socket.
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		release_sock(sk);
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			/* nothing queued and no time left to wait */
			rc = -EAGAIN;
			break;
		}
		/* the sock lock is dropped while sleeping */
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	/* rc == 0 means the loop ended with a dequeued socket in nsk */
	if (!rc)
		rc = sock_error(nsk);
	release_sock(sk);
	if (rc)
		goto out;

	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
		/* wait till data arrives on the socket */
		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
								MSEC_PER_SEC);
		if (smc_sk(nsk)->use_fallback) {
			/* fallback: data shows up on the clc socket */
			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;

			lock_sock(clcsk);
			if (skb_queue_empty(&clcsk->sk_receive_queue))
				sk_wait_data(clcsk, &timeo, NULL);
			release_sock(clcsk);
		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
			lock_sock(nsk);
			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
			release_sock(nsk);
		}
	}

out:
	sock_put(sk); /* sock_hold above */
	return rc;
}
1501
1502static int smc_getname(struct socket *sock, struct sockaddr *addr,
1503		       int peer)
1504{
1505	struct smc_sock *smc;
1506
1507	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1508	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1509		return -ENOTCONN;
1510
1511	smc = smc_sk(sock->sk);
1512
1513	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1514}
1515
1516static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1517{
1518	struct sock *sk = sock->sk;
1519	struct smc_sock *smc;
1520	int rc = -EPIPE;
1521
1522	smc = smc_sk(sk);
1523	lock_sock(sk);
1524	if ((sk->sk_state != SMC_ACTIVE) &&
1525	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1526	    (sk->sk_state != SMC_INIT))
1527		goto out;
1528
1529	if (msg->msg_flags & MSG_FASTOPEN) {
1530		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
1531			smc_switch_to_fallback(smc);
1532			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1533		} else {
1534			rc = -EINVAL;
1535			goto out;
1536		}
1537	}
1538
1539	if (smc->use_fallback)
1540		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1541	else
1542		rc = smc_tx_sendmsg(smc, msg, len);
1543out:
1544	release_sock(sk);
1545	return rc;
1546}
1547
1548static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1549		       int flags)
1550{
1551	struct sock *sk = sock->sk;
1552	struct smc_sock *smc;
1553	int rc = -ENOTCONN;
1554
1555	smc = smc_sk(sk);
1556	lock_sock(sk);
1557	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1558		/* socket was connected before, no more data to read */
1559		rc = 0;
1560		goto out;
1561	}
1562	if ((sk->sk_state == SMC_INIT) ||
1563	    (sk->sk_state == SMC_LISTEN) ||
1564	    (sk->sk_state == SMC_CLOSED))
1565		goto out;
1566
1567	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1568		rc = 0;
1569		goto out;
1570	}
1571
1572	if (smc->use_fallback) {
1573		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1574	} else {
1575		msg->msg_namelen = 0;
1576		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1577	}
1578
1579out:
1580	release_sock(sk);
1581	return rc;
1582}
1583
1584static __poll_t smc_accept_poll(struct sock *parent)
1585{
1586	struct smc_sock *isk = smc_sk(parent);
1587	__poll_t mask = 0;
1588
1589	spin_lock(&isk->accept_q_lock);
1590	if (!list_empty(&isk->accept_q))
1591		mask = EPOLLIN | EPOLLRDNORM;
1592	spin_unlock(&isk->accept_q_lock);
1593
1594	return mask;
1595}
1596
/* poll on an smc socket
 *
 * Returns EPOLLNVAL if the socket has no sock. In fallback mode the poll is
 * delegated to the clc socket and its sk_err is copied to the smc sock.
 * Otherwise the mask is built from the smc socket and connection state.
 */
static __poll_t smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	__poll_t mask = 0;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	if (smc->use_fallback) {
		/* delegate to CLC child sock */
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		sk->sk_err = smc->clcsock->sk->sk_err;
	} else {
		if (sk->sk_state != SMC_CLOSED)
			sock_poll_wait(file, sock, wait);
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask |= smc_accept_poll(sk);
		} else if (smc->use_fallback) { /* as result of connect_work()*/
			/* use_fallback is tested a second time here; it was
			 * false when this branch was entered
			 */
			mask |= smc->clcsock->ops->poll(file, smc->clcsock,
							   wait);
			sk->sk_err = smc->clcsock->sk->sk_err;
		} else {
			/* writable: send buffer space available outside
			 * SMC_INIT, or sending already shut down
			 */
			if ((sk->sk_state != SMC_INIT &&
			     atomic_read(&smc->conn.sndbuf_space)) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
			if (smc->conn.urg_state == SMC_URG_VALID)
				mask |= EPOLLPRI;
		}
	}

	return mask;
}
1649
1650static int smc_shutdown(struct socket *sock, int how)
1651{
1652	struct sock *sk = sock->sk;
1653	struct smc_sock *smc;
1654	int rc = -EINVAL;
1655	int rc1 = 0;
1656
1657	smc = smc_sk(sk);
1658
1659	if ((how < SHUT_RD) || (how > SHUT_RDWR))
1660		return rc;
1661
1662	lock_sock(sk);
1663
1664	rc = -ENOTCONN;
1665	if ((sk->sk_state != SMC_ACTIVE) &&
1666	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1667	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1668	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1669	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1670	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
1671		goto out;
1672	if (smc->use_fallback) {
1673		rc = kernel_sock_shutdown(smc->clcsock, how);
1674		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1675		if (sk->sk_shutdown == SHUTDOWN_MASK)
1676			sk->sk_state = SMC_CLOSED;
1677		goto out;
1678	}
1679	switch (how) {
1680	case SHUT_RDWR:		/* shutdown in both directions */
1681		rc = smc_close_active(smc);
1682		break;
1683	case SHUT_WR:
1684		rc = smc_close_shutdown_write(smc);
1685		break;
1686	case SHUT_RD:
1687		rc = 0;
1688		/* nothing more to do because peer is not involved */
1689		break;
1690	}
1691	if (smc->clcsock)
1692		rc1 = kernel_sock_shutdown(smc->clcsock, how);
1693	/* map sock_shutdown_cmd constants to sk_shutdown value range */
1694	sk->sk_shutdown |= how + 1;
1695
1696out:
1697	release_sock(sk);
1698	return rc ? rc : rc1;
1699}
1700
/* setsockopt on an smc socket
 *
 * The option is first applied to the clc socket; an error pending on the clc
 * socket is copied to the smc sock and reported. If that call succeeded, the
 * option value is read as an int and a few options get additional smc
 * specific handling.
 *
 * NOTE(review): the switch below dispatches on @optname only, @level is not
 * checked - confirm that options of other levels sharing these numbers are
 * meant to be handled here.
 * NOTE(review): an @optlen smaller than sizeof(int) yields -EINVAL although
 * the clc socket has already accepted the option at that point - confirm.
 */
static int smc_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int val, rc;

	smc = smc_sk(sk);

	/* generic setsockopts reaching us here always apply to the
	 * CLC socket
	 */
	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
					   optval, optlen);
	if (smc->clcsock->sk->sk_err) {
		sk->sk_err = smc->clcsock->sk->sk_err;
		sk->sk_error_report(sk);
	}
	if (rc)
		return rc;

	if (optlen < sizeof(int))
		return -EINVAL;
	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);
	switch (optname) {
	case TCP_ULP:
	case TCP_FASTOPEN:
	case TCP_FASTOPEN_CONNECT:
	case TCP_FASTOPEN_KEY:
	case TCP_FASTOPEN_NO_COOKIE:
		/* option not supported by SMC */
		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
			smc_switch_to_fallback(smc);
			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
		} else {
			/* too late to fall back unless already in fallback */
			if (!smc->use_fallback)
				rc = -EINVAL;
		}
		break;
	case TCP_NODELAY:
		if (sk->sk_state != SMC_INIT &&
		    sk->sk_state != SMC_LISTEN &&
		    sk->sk_state != SMC_CLOSED) {
			/* option switched on: run the tx worker right away */
			if (val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_CORK:
		if (sk->sk_state != SMC_INIT &&
		    sk->sk_state != SMC_LISTEN &&
		    sk->sk_state != SMC_CLOSED) {
			/* option switched off: run the tx worker right away */
			if (!val && !smc->use_fallback)
				mod_delayed_work(system_wq, &smc->conn.tx_work,
						 0);
		}
		break;
	case TCP_DEFER_ACCEPT:
		/* evaluated in smc_accept() */
		smc->sockopt_defer_accept = val;
		break;
	default:
		break;
	}
	release_sock(sk);

	return rc;
}
1771
1772static int smc_getsockopt(struct socket *sock, int level, int optname,
1773			  char __user *optval, int __user *optlen)
1774{
1775	struct smc_sock *smc;
1776
1777	smc = smc_sk(sock->sk);
1778	/* socket options apply to the CLC socket */
1779	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1780					     optval, optlen);
1781}
1782
1783static int smc_ioctl(struct socket *sock, unsigned int cmd,
1784		     unsigned long arg)
1785{
1786	union smc_host_cursor cons, urg;
1787	struct smc_connection *conn;
1788	struct smc_sock *smc;
1789	int answ;
1790
1791	smc = smc_sk(sock->sk);
1792	conn = &smc->conn;
1793	lock_sock(&smc->sk);
1794	if (smc->use_fallback) {
1795		if (!smc->clcsock) {
1796			release_sock(&smc->sk);
1797			return -EBADF;
1798		}
1799		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1800		release_sock(&smc->sk);
1801		return answ;
1802	}
1803	switch (cmd) {
1804	case SIOCINQ: /* same as FIONREAD */
1805		if (smc->sk.sk_state == SMC_LISTEN) {
1806			release_sock(&smc->sk);
1807			return -EINVAL;
1808		}
1809		if (smc->sk.sk_state == SMC_INIT ||
1810		    smc->sk.sk_state == SMC_CLOSED)
1811			answ = 0;
1812		else
1813			answ = atomic_read(&smc->conn.bytes_to_rcv);
1814		break;
1815	case SIOCOUTQ:
1816		/* output queue size (not send + not acked) */
1817		if (smc->sk.sk_state == SMC_LISTEN) {
1818			release_sock(&smc->sk);
1819			return -EINVAL;
1820		}
1821		if (smc->sk.sk_state == SMC_INIT ||
1822		    smc->sk.sk_state == SMC_CLOSED)
1823			answ = 0;
1824		else
1825			answ = smc->conn.sndbuf_desc->len -
1826					atomic_read(&smc->conn.sndbuf_space);
1827		break;
1828	case SIOCOUTQNSD:
1829		/* output queue size (not send only) */
1830		if (smc->sk.sk_state == SMC_LISTEN) {
1831			release_sock(&smc->sk);
1832			return -EINVAL;
1833		}
1834		if (smc->sk.sk_state == SMC_INIT ||
1835		    smc->sk.sk_state == SMC_CLOSED)
1836			answ = 0;
1837		else
1838			answ = smc_tx_prepared_sends(&smc->conn);
1839		break;
1840	case SIOCATMARK:
1841		if (smc->sk.sk_state == SMC_LISTEN) {
1842			release_sock(&smc->sk);
1843			return -EINVAL;
1844		}
1845		if (smc->sk.sk_state == SMC_INIT ||
1846		    smc->sk.sk_state == SMC_CLOSED) {
1847			answ = 0;
1848		} else {
1849			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1850			smc_curs_copy(&urg, &conn->urg_curs, conn);
1851			answ = smc_curs_diff(conn->rmb_desc->len,
1852					     &cons, &urg) == 1;
1853		}
1854		break;
1855	default:
1856		release_sock(&smc->sk);
1857		return -ENOIOCTLCMD;
1858	}
1859	release_sock(&smc->sk);
1860
1861	return put_user(answ, (int __user *)arg);
1862}
1863
1864static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1865			    int offset, size_t size, int flags)
1866{
1867	struct sock *sk = sock->sk;
1868	struct smc_sock *smc;
1869	int rc = -EPIPE;
1870
1871	smc = smc_sk(sk);
1872	lock_sock(sk);
1873	if (sk->sk_state != SMC_ACTIVE) {
1874		release_sock(sk);
1875		goto out;
1876	}
1877	release_sock(sk);
1878	if (smc->use_fallback)
1879		rc = kernel_sendpage(smc->clcsock, page, offset,
1880				     size, flags);
1881	else
1882		rc = sock_no_sendpage(sock, page, offset, size, flags);
1883
1884out:
1885	return rc;
1886}
1887
1888/* Map the affected portions of the rmbe into an spd, note the number of bytes
1889 * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor
1890 * updates till whenever a respective page has been fully processed.
1891 * Note that subsequent recv() calls have to wait till all splice() processing
1892 * completed.
1893 */
1894static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1895			       struct pipe_inode_info *pipe, size_t len,
1896			       unsigned int flags)
1897{
1898	struct sock *sk = sock->sk;
1899	struct smc_sock *smc;
1900	int rc = -ENOTCONN;
1901
1902	smc = smc_sk(sk);
1903	lock_sock(sk);
1904	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1905		/* socket was connected before, no more data to read */
1906		rc = 0;
1907		goto out;
1908	}
1909	if (sk->sk_state == SMC_INIT ||
1910	    sk->sk_state == SMC_LISTEN ||
1911	    sk->sk_state == SMC_CLOSED)
1912		goto out;
1913
1914	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1915		rc = 0;
1916		goto out;
1917	}
1918
1919	if (smc->use_fallback) {
1920		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1921						    pipe, len, flags);
1922	} else {
1923		if (*ppos) {
1924			rc = -ESPIPE;
1925			goto out;
1926		}
1927		if (flags & SPLICE_F_NONBLOCK)
1928			flags = MSG_DONTWAIT;
1929		else
1930			flags = 0;
1931		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1932	}
1933out:
1934	release_sock(sk);
1935
1936	return rc;
1937}
1938
/* must look like tcp
 *
 * Socket operations installed on every socket created by smc_create();
 * socketpair and mmap are not supported.
 */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
1961
1962static int smc_create(struct net *net, struct socket *sock, int protocol,
1963		      int kern)
1964{
1965	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1966	struct smc_sock *smc;
1967	struct sock *sk;
1968	int rc;
1969
1970	rc = -ESOCKTNOSUPPORT;
1971	if (sock->type != SOCK_STREAM)
1972		goto out;
1973
1974	rc = -EPROTONOSUPPORT;
1975	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1976		goto out;
1977
1978	rc = -ENOBUFS;
1979	sock->ops = &smc_sock_ops;
1980	sk = smc_sock_alloc(net, sock, protocol);
1981	if (!sk)
1982		goto out;
1983
1984	/* create internal TCP socket for CLC handshake and fallback */
1985	smc = smc_sk(sk);
1986	smc->use_fallback = false; /* assume rdma capability first */
1987	smc->fallback_rsn = 0;
1988	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1989			      &smc->clcsock);
1990	if (rc) {
1991		sk_common_release(sk);
1992		goto out;
1993	}
1994	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1995	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1996
1997out:
1998	return rc;
1999}
2000
/* protocol family PF_SMC: sockets are created by smc_create() */
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
2006
/* pernet id, referenced by smc_net_ops.id */
unsigned int smc_net_id;
2008
/* pernet init: set up the pnet part for @net, returns its result */
static __net_init int smc_net_init(struct net *net)
{
	return smc_pnet_net_init(net);
}
2013
/* pernet exit: tear down the pnet part for @net */
static void __net_exit smc_net_exit(struct net *net)
{
	smc_pnet_net_exit(net);
}
2018
/* pernet operations: init/exit handlers above, with a struct smc_net sized
 * area requested via .size and identified by smc_net_id
 */
static struct pernet_operations smc_net_ops = {
	.init = smc_net_init,
	.exit = smc_net_exit,
	.id   = &smc_net_id,
	.size = sizeof(struct smc_net),
};
2025
/* module init
 *
 * Registers, in this order: the pernet subsystem, pnet, llc, cdc, both
 * protos, the socket family and the ib client, and finally enables the
 * tcp_have_smc static branch. On failure the labels at the end undo the
 * steps done so far in reverse order; no undo call is made here for the llc
 * and cdc init steps. Returns 0 or the error of the failing step.
 */
static int __init smc_init(void)
{
	int rc;

	rc = register_pernet_subsys(&smc_net_ops);
	if (rc)
		return rc;

	rc = smc_pnet_init();
	if (rc)
		goto out_pernet_subsys;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto6, 1);
	if (rc) {
		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
		goto out_proto;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto6;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

	/* error unwind, each label falls through to the ones below */
out_sock:
	sock_unregister(PF_SMC);
out_proto6:
	proto_unregister(&smc_proto6);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
out_pernet_subsys:
	unregister_pernet_subsys(&smc_net_ops);

	return rc;
}
2092
/* module exit: call smc_core_exit(), disable the tcp_have_smc static branch
 * and unregister what smc_init() registered, in reverse order
 */
static void __exit smc_exit(void)
{
	smc_core_exit();
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto6);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
	unregister_pernet_subsys(&smc_net_ops);
}
2104
/* module entry and exit points */
module_init(smc_init);
module_exit(smc_exit);

/* module metadata; the alias is declared for protocol family PF_SMC */
MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);