Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.5.6.
  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Shared Memory Communications over RDMA (SMC-R) and RoCE
  4 *
  5 * Work Requests exploiting Infiniband API
  6 *
  7 * Work requests (WR) of type ib_post_send or ib_post_recv respectively
  8 * are submitted to either RC SQ or RC RQ respectively
  9 * (reliably connected send/receive queue)
 10 * and become work queue entries (WQEs).
 11 * While an SQ WR/WQE is pending, we track it until transmission completion.
 12 * Through a send or receive completion queue (CQ) respectively,
 13 * we get completion queue entries (CQEs) [aka work completions (WCs)].
 14 * Since the CQ callback is called from IRQ context, we split work by using
 15 * bottom halves implemented by tasklets.
 16 *
 17 * SMC uses this to exchange LLC (link layer control)
 18 * and CDC (connection data control) messages.
 19 *
 20 * Copyright IBM Corp. 2016
 21 *
 22 * Author(s):  Steffen Maier <maier@linux.vnet.ibm.com>
 23 */
 24
 25#include <linux/atomic.h>
 26#include <linux/hashtable.h>
 27#include <linux/wait.h>
 28#include <rdma/ib_verbs.h>
 29#include <asm/div64.h>
 30
 31#include "smc.h"
 32#include "smc_wr.h"
 33
 34#define SMC_WR_MAX_POLL_CQE 10	/* max. # of compl. queue elements in 1 poll */
 35
 36#define SMC_WR_RX_HASH_BITS 4
 37static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
 38static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
 39
struct smc_wr_tx_pend {	/* control data for a pending send request */
	u64			wr_id;		/* work request id sent */
	smc_wr_tx_handler	handler;	/* called on send completion */
	enum ib_wc_status	wc_status;	/* CQE status */
	struct smc_link		*link;		/* link the WR was posted on */
	u32			idx;		/* slot index; == wr_tx_cnt for
						 * the single large v2 slot
						 */
	struct smc_wr_tx_pend_priv priv;	/* handler-private context */
	u8			compl_requested; /* wake wr_tx_compl[idx] on
						  * completion
						  */
};
 49
 50/******************************** send queue *********************************/
 51
 52/*------------------------------- completion --------------------------------*/
 53
 54/* returns true if at least one tx work request is pending on the given link */
 55static inline bool smc_wr_is_tx_pend(struct smc_link *link)
 56{
 57	return !bitmap_empty(link->wr_tx_mask, link->wr_tx_cnt);
 58}
 59
/* wait till all pending tx work requests on the given link are completed */
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
{
	/* woken via wake_up(&link->wr_tx_wait) whenever a send slot is
	 * released (completion processing or explicit put)
	 */
	wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
}
 65
 66static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
 67{
 68	u32 i;
 69
 70	for (i = 0; i < link->wr_tx_cnt; i++) {
 71		if (link->wr_tx_pends[i].wr_id == wr_id)
 72			return i;
 73	}
 74	return link->wr_tx_cnt;
 75}
 76
/* Process one send CQE: match it to its pending slot, release the slot,
 * then invoke the registered completion handler.
 * Runs from the send tasklet (bottom half).
 */
static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
{
	struct smc_wr_tx_pend pnd_snd;
	struct smc_link *link;
	u32 pnd_snd_idx;

	link = wc->qp->qp_context;

	/* memory region registrations complete on the same CQ; they only
	 * update the reg state and wake the waiter in smc_wr_reg_send()
	 */
	if (wc->opcode == IB_WC_REG_MR) {
		if (wc->status)
			link->wr_reg_state = FAILED;
		else
			link->wr_reg_state = CONFIRMED;
		smc_wr_wakeup_reg_wait(link);
		return;
	}

	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
	if (pnd_snd_idx == link->wr_tx_cnt) {
		/* no regular slot matched; check the single large v2 slot */
		if (link->lgr->smc_version != SMC_V2 ||
		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
			return;
		link->wr_tx_v2_pend->wc_status = wc->status;
		/* keep a local copy so the handler can run after release */
		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(link->wr_tx_v2_pend, 0,
		       sizeof(*link->wr_tx_v2_pend));
		memset(link->lgr->wr_tx_buf_v2, 0,
		       sizeof(*link->lgr->wr_tx_buf_v2));
	} else {
		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
			complete(&link->wr_tx_compl[pnd_snd_idx]);
		/* keep a local copy so the handler can run after release */
		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
		       sizeof(pnd_snd));
		/* clear the full struct smc_wr_tx_pend including .priv */
		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
			return; /* slot was already released */
	}

	if (wc->status) {
		if (link->lgr->smc_version == SMC_V2) {
			memset(link->wr_tx_v2_pend, 0,
			       sizeof(*link->wr_tx_v2_pend));
			memset(link->lgr->wr_tx_buf_v2, 0,
			       sizeof(*link->lgr->wr_tx_buf_v2));
		}
		/* terminate link */
		smcr_link_down_cond_sched(link);
	}
	if (pnd_snd.handler)
		pnd_snd.handler(&pnd_snd.priv, link, wc->status);
	wake_up(&link->wr_tx_wait);
}
135
/* tasklet (bottom half) draining the send CQ of one RoCE device */
static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int i = 0, rc;
	int polled = 0;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
		/* re-arm CQ notification on the first pass only */
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_send,
					 IB_CQ_NEXT_COMP |
					 IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		for (i = 0; i < rc; i++)
			smc_wr_tx_process_cqe(&wc[i]);
	} while (rc > 0);
	/* drain once more to close the race between the final empty poll
	 * and re-arming the notification
	 */
	if (polled == 1)
		goto again;
}
161
162void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
163{
164	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
165
166	tasklet_schedule(&dev->send_tasklet);
167}
168
169/*---------------------------- request submission ---------------------------*/
170
/* Claim a free send slot. Returns 0 with *idx set on success,
 * -ENOLINK if the link cannot send, -EBUSY if all slots are taken;
 * *idx is link->wr_tx_cnt in both error cases.
 */
static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
{
	*idx = link->wr_tx_cnt;
	if (!smc_link_sendable(link))
		return -ENOLINK;
	for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
		/* atomically claim the slot; keep scanning if another
		 * claimant set the bit first
		 */
		if (!test_and_set_bit(*idx, link->wr_tx_mask))
			return 0;
	}
	*idx = link->wr_tx_cnt;
	return -EBUSY;
}
183
184/**
185 * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
186 *			and sets info for pending transmit tracking
187 * @link:		Pointer to smc_link used to later send the message.
188 * @handler:		Send completion handler function pointer.
189 * @wr_buf:		Out value returns pointer to message buffer.
190 * @wr_rdma_buf:	Out value returns pointer to rdma work request.
191 * @wr_pend_priv:	Out value returns pointer serving as handler context.
192 *
193 * Return: 0 on success, or -errno on error.
194 */
int smc_wr_tx_get_free_slot(struct smc_link *link,
			    smc_wr_tx_handler handler,
			    struct smc_wr_buf **wr_buf,
			    struct smc_rdma_wr **wr_rdma_buf,
			    struct smc_wr_tx_pend_priv **wr_pend_priv)
{
	struct smc_link_group *lgr = smc_get_lgr(link);
	struct smc_wr_tx_pend *wr_pend;
	u32 idx = link->wr_tx_cnt;	/* invalid until a slot is claimed */
	struct ib_send_wr *wr_ib;
	u64 wr_id;
	int rc;

	*wr_buf = NULL;
	*wr_pend_priv = NULL;
	if (in_softirq() || lgr->terminating) {
		/* must not sleep here; try once and report -EBUSY if full */
		rc = smc_wr_tx_get_free_slot_index(link, &idx);
		if (rc)
			return rc;
	} else {
		/* process context: sleep until a slot frees up, the link
		 * becomes unusable, or the timeout expires
		 */
		rc = wait_event_interruptible_timeout(
			link->wr_tx_wait,
			!smc_link_sendable(link) ||
			lgr->terminating ||
			(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
			SMC_WR_TX_WAIT_FREE_SLOT_TIME);
		if (!rc) {
			/* timeout - terminate link */
			smcr_link_down_cond_sched(link);
			return -EPIPE;
		}
		/* idx unchanged means we woke without claiming a slot */
		if (idx == link->wr_tx_cnt)
			return -EPIPE;
	}
	/* fill in pending-tracking data and the send WR for this slot */
	wr_id = smc_wr_tx_get_next_wr_id(link);
	wr_pend = &link->wr_tx_pends[idx];
	wr_pend->wr_id = wr_id;
	wr_pend->handler = handler;
	wr_pend->link = link;
	wr_pend->idx = idx;
	wr_ib = &link->wr_tx_ibs[idx];
	wr_ib->wr_id = wr_id;
	*wr_buf = &link->wr_tx_bufs[idx];
	if (wr_rdma_buf)
		*wr_rdma_buf = &link->wr_tx_rdmas[idx];
	*wr_pend_priv = &wr_pend->priv;
	return 0;
}
243
244int smc_wr_tx_get_v2_slot(struct smc_link *link,
245			  smc_wr_tx_handler handler,
246			  struct smc_wr_v2_buf **wr_buf,
247			  struct smc_wr_tx_pend_priv **wr_pend_priv)
248{
249	struct smc_wr_tx_pend *wr_pend;
250	struct ib_send_wr *wr_ib;
251	u64 wr_id;
252
253	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
254		return -EBUSY;
255
256	*wr_buf = NULL;
257	*wr_pend_priv = NULL;
258	wr_id = smc_wr_tx_get_next_wr_id(link);
259	wr_pend = link->wr_tx_v2_pend;
260	wr_pend->wr_id = wr_id;
261	wr_pend->handler = handler;
262	wr_pend->link = link;
263	wr_pend->idx = link->wr_tx_cnt;
264	wr_ib = link->wr_tx_v2_ib;
265	wr_ib->wr_id = wr_id;
266	*wr_buf = link->lgr->wr_tx_buf_v2;
267	*wr_pend_priv = &wr_pend->priv;
268	return 0;
269}
270
271int smc_wr_tx_put_slot(struct smc_link *link,
272		       struct smc_wr_tx_pend_priv *wr_pend_priv)
273{
274	struct smc_wr_tx_pend *pend;
275
276	pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
277	if (pend->idx < link->wr_tx_cnt) {
278		u32 idx = pend->idx;
279
280		/* clear the full struct smc_wr_tx_pend including .priv */
281		memset(&link->wr_tx_pends[idx], 0,
282		       sizeof(link->wr_tx_pends[idx]));
283		memset(&link->wr_tx_bufs[idx], 0,
284		       sizeof(link->wr_tx_bufs[idx]));
285		test_and_clear_bit(idx, link->wr_tx_mask);
286		wake_up(&link->wr_tx_wait);
287		return 1;
288	} else if (link->lgr->smc_version == SMC_V2 &&
289		   pend->idx == link->wr_tx_cnt) {
290		/* Large v2 buffer */
291		memset(&link->wr_tx_v2_pend, 0,
292		       sizeof(link->wr_tx_v2_pend));
293		memset(&link->lgr->wr_tx_buf_v2, 0,
294		       sizeof(link->lgr->wr_tx_buf_v2));
295		return 1;
296	}
297
298	return 0;
299}
300
301/* Send prepared WR slot via ib_post_send.
302 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
303 */
304int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
305{
306	struct smc_wr_tx_pend *pend;
307	int rc;
308
309	ib_req_notify_cq(link->smcibdev->roce_cq_send,
310			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
311	pend = container_of(priv, struct smc_wr_tx_pend, priv);
312	rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
313	if (rc) {
314		smc_wr_tx_put_slot(link, priv);
315		smcr_link_down_cond_sched(link);
316	}
317	return rc;
318}
319
320int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
321		      int len)
322{
323	int rc;
324
325	link->wr_tx_v2_ib->sg_list[0].length = len;
326	ib_req_notify_cq(link->smcibdev->roce_cq_send,
327			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
328	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
329	if (rc) {
330		smc_wr_tx_put_slot(link, priv);
331		smcr_link_down_cond_sched(link);
332	}
333	return rc;
334}
335
336/* Send prepared WR slot via ib_post_send and wait for send completion
337 * notification.
338 * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
339 */
340int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
341			unsigned long timeout)
342{
343	struct smc_wr_tx_pend *pend;
344	u32 pnd_idx;
345	int rc;
346
347	pend = container_of(priv, struct smc_wr_tx_pend, priv);
348	pend->compl_requested = 1;
349	pnd_idx = pend->idx;
350	init_completion(&link->wr_tx_compl[pnd_idx]);
351
352	rc = smc_wr_tx_send(link, priv);
353	if (rc)
354		return rc;
355	/* wait for completion by smc_wr_tx_process_cqe() */
356	rc = wait_for_completion_interruptible_timeout(
357					&link->wr_tx_compl[pnd_idx], timeout);
358	if (rc <= 0)
359		rc = -ENODATA;
360	if (rc > 0)
361		rc = 0;
362	return rc;
363}
364
/* Register a memory region and wait for result. */
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
	int rc;

	ib_req_notify_cq(link->smcibdev->roce_cq_send,
			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	link->wr_reg_state = POSTED;
	/* wr_id carries the MR pointer; completion is detected in
	 * smc_wr_tx_process_cqe() via opcode IB_WC_REG_MR
	 */
	link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
	link->wr_reg.mr = mr;
	link->wr_reg.key = mr->rkey;
	rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
	if (rc)
		return rc;

	/* pin the link's reg resources while we sleep */
	percpu_ref_get(&link->wr_reg_refs);
	rc = wait_event_interruptible_timeout(link->wr_reg_wait,
					      (link->wr_reg_state != POSTED),
					      SMC_WR_REG_MR_WAIT_TIME);
	percpu_ref_put(&link->wr_reg_refs);
	if (!rc) {
		/* timeout - terminate link */
		smcr_link_down_cond_sched(link);
		return -EPIPE;
	}
	if (rc == -ERESTARTSYS)
		return -EINTR;
	/* map the final registration state to a return code */
	switch (link->wr_reg_state) {
	case CONFIRMED:
		rc = 0;
		break;
	case FAILED:
		rc = -EIO;
		break;
	case POSTED:
		rc = -EPIPE;
		break;
	}
	return rc;
}
405
406/****************************** receive queue ********************************/
407
408int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
409{
410	struct smc_wr_rx_handler *h_iter;
411	int rc = 0;
412
413	spin_lock(&smc_wr_rx_hash_lock);
414	hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
415		if (h_iter->type == handler->type) {
416			rc = -EEXIST;
417			goto out_unlock;
418		}
419	}
420	hash_add(smc_wr_rx_hash, &handler->list, handler->type);
421out_unlock:
422	spin_unlock(&smc_wr_rx_hash_lock);
423	return rc;
424}
425
/* Demultiplex a received work request based on the message type to its handler.
 * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
 * and not being modified any more afterwards so we don't need to lock it.
 */
static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
{
	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
	struct smc_wr_rx_handler *handler;
	struct smc_wr_rx_hdr *wr_rx;
	u64 temp_wr_id;
	u32 index;

	if (wc->byte_len < sizeof(*wr_rx))
		return; /* short message */
	/* do_div() leaves the quotient in temp_wr_id and returns the
	 * remainder, i.e. the rx buffer slot for this wr_id
	 */
	temp_wr_id = wc->wr_id;
	index = do_div(temp_wr_id, link->wr_rx_cnt);
	wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
	/* dispatch by the message's type byte */
	hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
		if (handler->type == wr_rx->type)
			handler->handler(wc, wr_rx);
	}
}
448
/* process a batch of receive CQEs polled by the rx tasklet */
static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
{
	struct smc_link *link;
	int i;

	for (i = 0; i < num; i++) {
		link = wc[i].qp->qp_context;
		link->wr_rx_id_compl = wc[i].wr_id;
		if (wc[i].status == IB_WC_SUCCESS) {
			link->wr_rx_tstamp = jiffies;
			smc_wr_rx_demultiplex(&wc[i]);
			smc_wr_rx_post(link); /* refill WR RX */
		} else {
			/* handle status errors */
			switch (wc[i].status) {
			case IB_WC_RETRY_EXC_ERR:
			case IB_WC_RNR_RETRY_EXC_ERR:
			case IB_WC_WR_FLUSH_ERR:
				/* fatal for the link; do not refill */
				smcr_link_down_cond_sched(link);
				if (link->wr_rx_id_compl == link->wr_rx_id)
					wake_up(&link->wr_rx_empty_wait);
				break;
			default:
				smc_wr_rx_post(link); /* refill WR RX */
				break;
			}
		}
	}
}
478
/* tasklet (bottom half) draining the receive CQ of one RoCE device */
static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
	struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
	struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
	int polled = 0;
	int rc;

again:
	polled++;
	do {
		memset(&wc, 0, sizeof(wc));
		rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
		/* re-arm CQ notification on the first pass only */
		if (polled == 1) {
			ib_req_notify_cq(dev->roce_cq_recv,
					 IB_CQ_SOLICITED_MASK
					 | IB_CQ_REPORT_MISSED_EVENTS);
		}
		if (!rc)
			break;
		smc_wr_rx_process_cqes(&wc[0], rc);
	} while (rc > 0);
	/* drain once more to close the race between the final empty poll
	 * and re-arming the notification
	 */
	if (polled == 1)
		goto again;
}
503
504void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
505{
506	struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
507
508	tasklet_schedule(&dev->recv_tasklet);
509}
510
511int smc_wr_rx_post_init(struct smc_link *link)
512{
513	u32 i;
514	int rc = 0;
515
516	for (i = 0; i < link->wr_rx_cnt; i++)
517		rc = smc_wr_rx_post(link);
518	return rc;
519}
520
521/***************************** init, exit, misc ******************************/
522
/* query and cache the QP attributes of a link, then derive the usable
 * numbers of send and receive WRs from the QP capabilities
 */
void smc_wr_remember_qp_attr(struct smc_link *lnk)
{
	struct ib_qp_attr *attr = &lnk->qp_attr;
	struct ib_qp_init_attr init_attr;

	memset(attr, 0, sizeof(*attr));
	memset(&init_attr, 0, sizeof(init_attr));
	ib_query_qp(lnk->roce_qp, attr,
		    IB_QP_STATE |
		    IB_QP_CUR_STATE |
		    IB_QP_PKEY_INDEX |
		    IB_QP_PORT |
		    IB_QP_QKEY |
		    IB_QP_AV |
		    IB_QP_PATH_MTU |
		    IB_QP_TIMEOUT |
		    IB_QP_RETRY_CNT |
		    IB_QP_RNR_RETRY |
		    IB_QP_RQ_PSN |
		    IB_QP_ALT_PATH |
		    IB_QP_MIN_RNR_TIMER |
		    IB_QP_SQ_PSN |
		    IB_QP_PATH_MIG_STATE |
		    IB_QP_CAP |
		    IB_QP_DEST_QPN,
		    &init_attr);

	/* cap the configured counts by what the QP actually supports */
	lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
			       lnk->qp_attr.cap.max_send_wr);
	lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
			       lnk->qp_attr.cap.max_recv_wr);
}
555
/* initialize the send/recv work request templates and scatter-gather
 * entries for all slots of a link; called after the DMA buffers are mapped
 */
static void smc_wr_init_sge(struct smc_link *lnk)
{
	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
	bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
	u32 i;

	for (i = 0; i < lnk->wr_tx_cnt; i++) {
		/* inline sends reference the kernel buffer directly,
		 * otherwise the pre-mapped DMA address of the slot
		 */
		lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey =
			lnk->roce_pd->local_dma_lkey;
		lnk->wr_tx_ibs[i].next = NULL;
		lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
		lnk->wr_tx_ibs[i].num_sge = 1;
		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
		lnk->wr_tx_ibs[i].send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
		if (send_inline)
			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
	}

	/* single large send WR used for SMC-Rv2 LLC messages */
	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;

		lnk->wr_tx_v2_ib->next = NULL;
		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
		lnk->wr_tx_v2_ib->num_sge = 1;
		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
		lnk->wr_tx_v2_ib->send_flags =
			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
	}

	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
	 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
	 * and the same buffer for all sges. When a larger message arrived then
	 * the content of the first small sge is copied to the beginning of
	 * the larger spillover buffer, allowing easy data mapping.
	 */
	for (i = 0; i < lnk->wr_rx_cnt; i++) {
		int x = i * sges_per_buf;

		lnk->wr_rx_sges[x].addr =
			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
		if (lnk->lgr->smc_version == SMC_V2) {
			lnk->wr_rx_sges[x + 1].addr =
					lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].length =
					SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
			lnk->wr_rx_sges[x + 1].lkey =
					lnk->roce_pd->local_dma_lkey;
		}
		lnk->wr_rx_ibs[i].next = NULL;
		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
	}
	/* template for memory region registration WRs */
	lnk->wr_reg.wr.next = NULL;
	lnk->wr_reg.wr.num_sge = 0;
	lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
	lnk->wr_reg.wr.opcode = IB_WR_REG_MR;
	lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
}
635
/* Drain WR activity on a link and unmap its DMA buffers.
 * A link without an associated device is left untouched.
 */
void smc_wr_free_link(struct smc_link *lnk)
{
	struct ib_device *ibdev;

	if (!lnk->smcibdev)
		return;
	ibdev = lnk->smcibdev->ibdev;

	smc_wr_drain_cq(lnk);
	smc_wr_wakeup_reg_wait(lnk);
	smc_wr_wakeup_tx_wait(lnk);

	smc_wr_tx_wait_no_pending_sends(lnk);
	/* wait until all in-flight users of the reg/tx resources are gone */
	percpu_ref_kill(&lnk->wr_reg_refs);
	wait_for_completion(&lnk->reg_ref_comp);
	percpu_ref_exit(&lnk->wr_reg_refs);
	percpu_ref_kill(&lnk->wr_tx_refs);
	wait_for_completion(&lnk->tx_ref_comp);
	percpu_ref_exit(&lnk->wr_tx_refs);

	/* unmap only what was actually mapped; reset addresses so a
	 * repeated call is harmless
	 */
	if (lnk->wr_rx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_dma_addr = 0;
	}
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
				    DMA_TO_DEVICE);
		lnk->wr_tx_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
}
681
682void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
683{
684	if (lgr->smc_version < SMC_V2)
685		return;
686
687	kfree(lgr->wr_rx_buf_v2);
688	lgr->wr_rx_buf_v2 = NULL;
689	kfree(lgr->wr_tx_buf_v2);
690	lgr->wr_tx_buf_v2 = NULL;
691}
692
693void smc_wr_free_link_mem(struct smc_link *lnk)
694{
695	kfree(lnk->wr_tx_v2_ib);
696	lnk->wr_tx_v2_ib = NULL;
697	kfree(lnk->wr_tx_v2_sge);
698	lnk->wr_tx_v2_sge = NULL;
699	kfree(lnk->wr_tx_v2_pend);
700	lnk->wr_tx_v2_pend = NULL;
701	kfree(lnk->wr_tx_compl);
702	lnk->wr_tx_compl = NULL;
703	kfree(lnk->wr_tx_pends);
704	lnk->wr_tx_pends = NULL;
705	bitmap_free(lnk->wr_tx_mask);
706	lnk->wr_tx_mask = NULL;
707	kfree(lnk->wr_tx_sges);
708	lnk->wr_tx_sges = NULL;
709	kfree(lnk->wr_tx_rdma_sges);
710	lnk->wr_tx_rdma_sges = NULL;
711	kfree(lnk->wr_rx_sges);
712	lnk->wr_rx_sges = NULL;
713	kfree(lnk->wr_tx_rdmas);
714	lnk->wr_tx_rdmas = NULL;
715	kfree(lnk->wr_rx_ibs);
716	lnk->wr_rx_ibs = NULL;
717	kfree(lnk->wr_tx_ibs);
718	lnk->wr_tx_ibs = NULL;
719	kfree(lnk->wr_tx_bufs);
720	lnk->wr_tx_bufs = NULL;
721	kfree(lnk->wr_rx_bufs);
722	lnk->wr_rx_bufs = NULL;
723}
724
725int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
726{
727	if (lgr->smc_version < SMC_V2)
728		return 0;
729
730	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
731	if (!lgr->wr_rx_buf_v2)
732		return -ENOMEM;
733	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
734	if (!lgr->wr_tx_buf_v2) {
735		kfree(lgr->wr_rx_buf_v2);
736		return -ENOMEM;
737	}
738	return 0;
739}
740
/* Allocate all per-link WR arrays: SMC_WR_BUF_CNT send slots,
 * SMC_WR_BUF_CNT * 3 receive slots, plus the single large v2 slot for
 * SMC-Rv2 link groups. On failure everything allocated so far is freed
 * via the goto chain and -ENOMEM is returned.
 */
int smc_wr_alloc_link_mem(struct smc_link *link)
{
	/* SMC-Rv2 receive buffers carry a second (spillover) sge */
	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;

	/* allocate link related memory */
	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
	if (!link->wr_tx_bufs)
		goto no_mem;
	link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
				   GFP_KERNEL);
	if (!link->wr_rx_bufs)
		goto no_mem_wr_tx_bufs;
	link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_tx_ibs)
		goto no_mem_wr_rx_bufs;
	link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
				  sizeof(link->wr_rx_ibs[0]),
				  GFP_KERNEL);
	if (!link->wr_rx_ibs)
		goto no_mem_wr_tx_ibs;
	link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_rdmas[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_rdmas)
		goto no_mem_wr_rx_ibs;
	link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT,
					sizeof(link->wr_tx_rdma_sges[0]),
					GFP_KERNEL);
	if (!link->wr_tx_rdma_sges)
		goto no_mem_wr_tx_rdmas;
	link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
				   GFP_KERNEL);
	if (!link->wr_tx_sges)
		goto no_mem_wr_tx_rdma_sges;
	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
				   GFP_KERNEL);
	if (!link->wr_rx_sges)
		goto no_mem_wr_tx_sges;
	link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL);
	if (!link->wr_tx_mask)
		goto no_mem_wr_rx_sges;
	link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_pends[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_pends)
		goto no_mem_wr_tx_mask;
	link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT,
				    sizeof(link->wr_tx_compl[0]),
				    GFP_KERNEL);
	if (!link->wr_tx_compl)
		goto no_mem_wr_tx_pends;

	/* the large v2 send slot exists only for SMC-Rv2 link groups */
	if (link->lgr->smc_version == SMC_V2) {
		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
					    GFP_KERNEL);
		if (!link->wr_tx_v2_ib)
			goto no_mem_tx_compl;
		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
					     GFP_KERNEL);
		if (!link->wr_tx_v2_sge)
			goto no_mem_v2_ib;
		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
					      GFP_KERNEL);
		if (!link->wr_tx_v2_pend)
			goto no_mem_v2_sge;
	}
	return 0;

	/* unwind in reverse allocation order */
no_mem_v2_sge:
	kfree(link->wr_tx_v2_sge);
no_mem_v2_ib:
	kfree(link->wr_tx_v2_ib);
no_mem_tx_compl:
	kfree(link->wr_tx_compl);
no_mem_wr_tx_pends:
	kfree(link->wr_tx_pends);
no_mem_wr_tx_mask:
	kfree(link->wr_tx_mask);
no_mem_wr_rx_sges:
	kfree(link->wr_rx_sges);
no_mem_wr_tx_sges:
	kfree(link->wr_tx_sges);
no_mem_wr_tx_rdma_sges:
	kfree(link->wr_tx_rdma_sges);
no_mem_wr_tx_rdmas:
	kfree(link->wr_tx_rdmas);
no_mem_wr_rx_ibs:
	kfree(link->wr_rx_ibs);
no_mem_wr_tx_ibs:
	kfree(link->wr_tx_ibs);
no_mem_wr_rx_bufs:
	kfree(link->wr_rx_bufs);
no_mem_wr_tx_bufs:
	kfree(link->wr_tx_bufs);
no_mem:
	return -ENOMEM;
}
840
/* stop the per-device completion-processing tasklets */
void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
{
	tasklet_kill(&smcibdev->recv_tasklet);
	tasklet_kill(&smcibdev->send_tasklet);
}
846
/* set up the per-device tasklets for rx and tx completion processing */
void smc_wr_add_dev(struct smc_ib_device *smcibdev)
{
	tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
	tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
}
852
/* percpu_ref release callback: last wr_tx_refs reference was dropped,
 * unblock the waiter in smc_wr_free_link()
 */
static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
{
	struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs);

	complete(&lnk->tx_ref_comp);
}
859
/* percpu_ref release callback: last wr_reg_refs reference was dropped,
 * unblock the waiter in smc_wr_free_link()
 */
static void smcr_wr_reg_refs_free(struct percpu_ref *ref)
{
	struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs);

	complete(&lnk->reg_ref_comp);
}
866
/* Map the link's WR buffers for DMA and initialize the waitqueues and
 * refcounts used by WR processing; partial setup is undone on error.
 */
int smc_wr_create_link(struct smc_link *lnk)
{
	struct ib_device *ibdev = lnk->smcibdev->ibdev;
	int rc = 0;

	smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
	lnk->wr_rx_id = 0;
	lnk->wr_rx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_rx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
		DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
		lnk->wr_rx_dma_addr = 0;
		rc = -EIO;
		goto out;
	}
	/* the link group wide v2 buffers get a per-link DMA mapping */
	if (lnk->lgr->smc_version == SMC_V2) {
		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
			lnk->wr_rx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
			DMA_TO_DEVICE);
		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
			lnk->wr_tx_v2_dma_addr = 0;
			rc = -EIO;
			goto dma_unmap;
		}
	}
	lnk->wr_tx_dma_addr = ib_dma_map_single(
		ibdev, lnk->wr_tx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
		DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
		rc = -EIO;
		goto dma_unmap;
	}
	smc_wr_init_sge(lnk);
	bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT);
	init_waitqueue_head(&lnk->wr_tx_wait);
	rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL);
	if (rc)
		goto dma_unmap;
	init_completion(&lnk->tx_ref_comp);
	init_waitqueue_head(&lnk->wr_reg_wait);
	rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL);
	if (rc)
		goto cancel_ref;
	init_completion(&lnk->reg_ref_comp);
	init_waitqueue_head(&lnk->wr_rx_empty_wait);
	return rc;

cancel_ref:
	percpu_ref_exit(&lnk->wr_tx_refs);
dma_unmap:
	/* v2 mappings are conditional; zero addr means "not mapped" */
	if (lnk->wr_rx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_FROM_DEVICE);
		lnk->wr_rx_v2_dma_addr = 0;
	}
	if (lnk->wr_tx_v2_dma_addr) {
		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
				    SMC_WR_BUF_V2_SIZE,
				    DMA_TO_DEVICE);
		lnk->wr_tx_v2_dma_addr = 0;
	}
	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
			    DMA_FROM_DEVICE);
	lnk->wr_rx_dma_addr = 0;
out:
	return rc;
}