   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2/*
   3 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
   4 * Copyright(c) 2015 - 2018 Intel Corporation.
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/types.h>
   9#include <linux/device.h>
  10#include <linux/dmapool.h>
  11#include <linux/slab.h>
  12#include <linux/list.h>
  13#include <linux/highmem.h>
  14#include <linux/io.h>
  15#include <linux/uio.h>
  16#include <linux/rbtree.h>
  17#include <linux/spinlock.h>
  18#include <linux/delay.h>
  19#include <linux/kthread.h>
  20#include <linux/mmu_context.h>
  21#include <linux/module.h>
  22#include <linux/vmalloc.h>
  23#include <linux/string.h>
  24
  25#include "hfi.h"
  26#include "sdma.h"
  27#include "user_sdma.h"
  28#include "verbs.h"  /* for the headers */
  29#include "common.h" /* for struct hfi1_tid_info */
  30#include "trace.h"
  31
  32static uint hfi1_sdma_comp_ring_size = 128;
  33module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  34MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
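/*
 * For illustration only (assuming the usual "hfi1" module name): the
 * completion ring size can be overridden at load time, e.g.
 * "modprobe hfi1 sdma_comp_size=256" or "hfi1.sdma_comp_size=256" on the
 * kernel command line. A value of 0 is rejected by
 * hfi1_user_sdma_alloc_queues() below.
 */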
  35
  36static unsigned initial_pkt_count = 8;
  37
  38static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  39static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  40static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  41static void user_sdma_free_request(struct user_sdma_request *req);
  42static int check_header_template(struct user_sdma_request *req,
  43				 struct hfi1_pkt_header *hdr, u32 lrhlen,
  44				 u32 datalen);
  45static int set_txreq_header(struct user_sdma_request *req,
  46			    struct user_sdma_txreq *tx, u32 datalen);
  47static int set_txreq_header_ahg(struct user_sdma_request *req,
  48				struct user_sdma_txreq *tx, u32 len);
  49static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  50				  struct hfi1_user_sdma_comp_q *cq,
  51				  u16 idx, enum hfi1_sdma_comp_state state,
  52				  int ret);
  53static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  54static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  55
  56static int defer_packet_queue(
  57	struct sdma_engine *sde,
  58	struct iowait_work *wait,
  59	struct sdma_txreq *txreq,
  60	uint seq,
  61	bool pkts_sent);
  62static void activate_packet_queue(struct iowait *wait, int reason);
  63
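/*
 * Rough sketch of the deferral contract implemented below: the SDMA layer
 * calls defer_packet_queue() when a tx cannot be posted. It returns -EAGAIN
 * if the engine has since made progress (the caller can simply retry), or
 * -EBUSY after marking the queue SDMA_PKT_Q_DEFERRED and parking it on the
 * engine's dmawait list; the caller then waits until activate_packet_queue()
 * flips the state back to SDMA_PKT_Q_ACTIVE and wakes the waiter.
 */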
  64static int defer_packet_queue(
  65	struct sdma_engine *sde,
  66	struct iowait_work *wait,
  67	struct sdma_txreq *txreq,
  68	uint seq,
  69	bool pkts_sent)
  70{
  71	struct hfi1_user_sdma_pkt_q *pq =
  72		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  73
  74	write_seqlock(&sde->waitlock);
  75	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
  76	if (sdma_progress(sde, seq, txreq))
  77		goto eagain;
  78	/*
  79	 * We are assuming that if the list is enqueued somewhere, it
   80	 * is on the dmawait list, since that is the only place where
  81	 * it is supposed to be enqueued.
  82	 */
  83	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
  84	if (list_empty(&pq->busy.list)) {
  85		pq->busy.lock = &sde->waitlock;
  86		iowait_get_priority(&pq->busy);
  87		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
  88	}
  89	write_sequnlock(&sde->waitlock);
  90	return -EBUSY;
  91eagain:
  92	write_sequnlock(&sde->waitlock);
  93	return -EAGAIN;
  94}
  95
  96static void activate_packet_queue(struct iowait *wait, int reason)
  97{
  98	struct hfi1_user_sdma_pkt_q *pq =
  99		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 100
 101	trace_hfi1_usdma_activate(pq, wait, reason);
 102	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 103	wake_up(&wait->wait_dma);
  104}
 105
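/*
 * Per-context setup, roughly: one packet queue (pq) with a request array and
 * an in-use bitmap sized by sdma_comp_size, a slab cache for tx requests, and
 * a completion queue (cq) whose entries are allocated with vmalloc_user() so
 * that user space can presumably mmap the completion ring and poll it
 * directly.
 */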
 106int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 107				struct hfi1_filedata *fd)
 108{
 109	int ret = -ENOMEM;
 110	char buf[64];
 111	struct hfi1_devdata *dd;
 112	struct hfi1_user_sdma_comp_q *cq;
 113	struct hfi1_user_sdma_pkt_q *pq;
 114
 115	if (!uctxt || !fd)
 116		return -EBADF;
 117
 118	if (!hfi1_sdma_comp_ring_size)
 119		return -EINVAL;
 120
 121	dd = uctxt->dd;
 122
 123	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 124	if (!pq)
 125		return -ENOMEM;
 126	pq->dd = dd;
 127	pq->ctxt = uctxt->ctxt;
 128	pq->subctxt = fd->subctxt;
 129	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 130	atomic_set(&pq->n_reqs, 0);
 131	init_waitqueue_head(&pq->wait);
 132	atomic_set(&pq->n_locked, 0);
 133
 134	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 135		    activate_packet_queue, NULL, NULL);
 136	pq->reqidx = 0;
 137
 138	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 139			   sizeof(*pq->reqs),
 140			   GFP_KERNEL);
 141	if (!pq->reqs)
 142		goto pq_reqs_nomem;
 143
 144	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
 145	if (!pq->req_in_use)
 146		goto pq_reqs_no_in_use;
 147
 148	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 149		 fd->subctxt);
 150	pq->txreq_cache = kmem_cache_create(buf,
 151					    sizeof(struct user_sdma_txreq),
 152					    L1_CACHE_BYTES,
 153					    SLAB_HWCACHE_ALIGN,
 154					    NULL);
 155	if (!pq->txreq_cache) {
 156		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 157			   uctxt->ctxt);
 158		goto pq_txreq_nomem;
 159	}
 160
 161	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 162	if (!cq)
 163		goto cq_nomem;
 164
 165	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 166				 * hfi1_sdma_comp_ring_size));
 167	if (!cq->comps)
 168		goto cq_comps_nomem;
 169
 170	cq->nentries = hfi1_sdma_comp_ring_size;
 171
 172	ret = hfi1_init_system_pinning(pq);
 173	if (ret)
 174		goto pq_mmu_fail;
 175
 176	rcu_assign_pointer(fd->pq, pq);
 177	fd->cq = cq;
 178
 179	return 0;
 180
 181pq_mmu_fail:
 182	vfree(cq->comps);
 183cq_comps_nomem:
 184	kfree(cq);
 185cq_nomem:
 186	kmem_cache_destroy(pq->txreq_cache);
 187pq_txreq_nomem:
 188	bitmap_free(pq->req_in_use);
 189pq_reqs_no_in_use:
 190	kfree(pq->reqs);
 191pq_reqs_nomem:
 192	kfree(pq);
 193
 194	return ret;
 195}
 196
 197static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
 198{
 199	unsigned long flags;
 200	seqlock_t *lock = pq->busy.lock;
 201
 202	if (!lock)
 203		return;
 204	write_seqlock_irqsave(lock, flags);
 205	if (!list_empty(&pq->busy.list)) {
 206		list_del_init(&pq->busy.list);
 207		pq->busy.lock = NULL;
 208	}
 209	write_sequnlock_irqrestore(lock, flags);
 210}
 211
 212int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 213			       struct hfi1_ctxtdata *uctxt)
 214{
 215	struct hfi1_user_sdma_pkt_q *pq;
 216
 217	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 218
 219	spin_lock(&fd->pq_rcu_lock);
 220	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
 221				    lockdep_is_held(&fd->pq_rcu_lock));
 222	if (pq) {
 223		rcu_assign_pointer(fd->pq, NULL);
 224		spin_unlock(&fd->pq_rcu_lock);
 225		synchronize_srcu(&fd->pq_srcu);
 226		/* at this point there can be no more new requests */
 227		iowait_sdma_drain(&pq->busy);
 228		/* Wait until all requests have been freed. */
 229		wait_event_interruptible(
 230			pq->wait,
 231			!atomic_read(&pq->n_reqs));
 232		kfree(pq->reqs);
 233		hfi1_free_system_pinning(pq);
 234		bitmap_free(pq->req_in_use);
 235		kmem_cache_destroy(pq->txreq_cache);
 236		flush_pq_iowait(pq);
 237		kfree(pq);
 238	} else {
 239		spin_unlock(&fd->pq_rcu_lock);
 240	}
 241	if (fd->cq) {
 242		vfree(fd->cq->comps);
 243		kfree(fd->cq);
 244		fd->cq = NULL;
 245	}
 246	return 0;
 247}
 248
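/*
 * Worked example for the hash below (values are illustrative only): for
 * dlid 0x1234 the hash is (0x12 ^ 0x34) & 0xFF = 0x26. The first DLID to hit
 * a slot claims the next selector value (wrapping at 128 because of the
 * "& 0x7F"), and later DLIDs with the same hash reuse that selector, so
 * traffic to a given destination tends to stick to the same engine.
 */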
 249static u8 dlid_to_selector(u16 dlid)
 250{
 251	static u8 mapping[256];
 252	static int initialized;
 253	static u8 next;
 254	int hash;
 255
 256	if (!initialized) {
 257		memset(mapping, 0xFF, 256);
 258		initialized = 1;
 259	}
 260
 261	hash = ((dlid >> 8) ^ dlid) & 0xFF;
 262	if (mapping[hash] == 0xFF) {
 263		mapping[hash] = next;
 264		next = (next + 1) & 0x7F;
 265	}
 266
 267	return mapping[hash];
 268}
 269
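/*
 * Sketch of the request layout as parsed below (not a formal ABI statement):
 * iovec[0] carries a struct sdma_req_info immediately followed by the
 * hfi1_pkt_header template, the following vectors carry payload, and for
 * EXPECTED requests the final vector holds the TID array.
 */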
 270/**
 271 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 272 * @fd: valid file descriptor
 273 * @iovec: array of io vectors to process
 274 * @dim: overall iovec array size
 275 * @count: number of io vector array entries processed
 276 */
 277int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 278				   struct iovec *iovec, unsigned long dim,
 279				   unsigned long *count)
 280{
 281	int ret = 0, i;
 282	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 283	struct hfi1_user_sdma_pkt_q *pq =
 284		srcu_dereference(fd->pq, &fd->pq_srcu);
 285	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 286	struct hfi1_devdata *dd = pq->dd;
 287	unsigned long idx = 0;
 288	u8 pcount = initial_pkt_count;
 289	struct sdma_req_info info;
 290	struct user_sdma_request *req;
 291	u8 opcode, sc, vl;
 292	u16 pkey;
 293	u32 slid;
 294	u16 dlid;
 295	u32 selector;
 296
 297	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 298		hfi1_cdbg(
 299		   SDMA,
 300		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 301		   dd->unit, uctxt->ctxt, fd->subctxt,
 302		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 303		return -EINVAL;
 304	}
 305	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 306	if (ret) {
 307		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 308			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 309		return -EFAULT;
 310	}
 311
 312	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 313				     (u16 *)&info);
 314	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
 315		hfi1_cdbg(SDMA,
 316			  "[%u:%u:%u:%u] Invalid comp index",
 317			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 318		return -EINVAL;
 319	}
 320
 321	/*
  322	 * Sanity check the header io vector count.  We need at least one
  323	 * vector (the header), and the count cannot exceed the actual io vector count.
 324	 */
 325	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
 326		hfi1_cdbg(SDMA,
 327			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
 328			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
 329			  req_iovcnt(info.ctrl), dim);
 330		return -EINVAL;
 331	}
 332
 333	if (!info.fragsize) {
 334		hfi1_cdbg(SDMA,
 335			  "[%u:%u:%u:%u] Request does not specify fragsize",
 336			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 337		return -EINVAL;
 338	}
 339
 340	/* Try to claim the request. */
 341	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
 342		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
 343			  dd->unit, uctxt->ctxt, fd->subctxt,
 344			  info.comp_idx);
 345		return -EBADSLT;
 346	}
 347	/*
 348	 * All safety checks have been done and this request has been claimed.
 349	 */
 350	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
 351					     info.comp_idx);
 352	req = pq->reqs + info.comp_idx;
 353	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 354	req->data_len  = 0;
 355	req->pq = pq;
 356	req->cq = cq;
 357	req->ahg_idx = -1;
 358	req->iov_idx = 0;
 359	req->sent = 0;
 360	req->seqnum = 0;
 361	req->seqcomp = 0;
 362	req->seqsubmitted = 0;
 363	req->tids = NULL;
 364	req->has_error = 0;
 365	INIT_LIST_HEAD(&req->txps);
 366
 367	memcpy(&req->info, &info, sizeof(info));
 368
 369	/* The request is initialized, count it */
 370	atomic_inc(&pq->n_reqs);
 371
 372	if (req_opcode(info.ctrl) == EXPECTED) {
  373		/* an expected request must have TID info and at least one data vector */
 374		if (req->data_iovs < 2) {
 375			SDMA_DBG(req,
 376				 "Not enough vectors for expected request");
 377			ret = -EINVAL;
 378			goto free_req;
 379		}
 380		req->data_iovs--;
 381	}
 382
 383	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 384		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 385			 MAX_VECTORS_PER_REQ);
 386		ret = -EINVAL;
 387		goto free_req;
 388	}
 389
 390	/* Copy the header from the user buffer */
 391	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 392			     sizeof(req->hdr));
 393	if (ret) {
 394		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 395		ret = -EFAULT;
 396		goto free_req;
 397	}
 398
 399	/* If Static rate control is not enabled, sanitize the header. */
 400	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 401		req->hdr.pbc[2] = 0;
 402
 403	/* Validate the opcode. Do not trust packets from user space blindly. */
 404	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 405	if ((opcode & USER_OPCODE_CHECK_MASK) !=
 406	     USER_OPCODE_CHECK_VAL) {
 407		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 408		ret = -EINVAL;
 409		goto free_req;
 410	}
 411	/*
  412	 * Validate the VL. Do not trust packets from user space blindly.
  413	 * The VL comes from the PBC, the SC comes from the LRH, and the VL
  414	 * needs to match the SC-to-VL lookup.
 415	 */
 416	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 417	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 418	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 419	if (vl >= dd->pport->vls_operational ||
 420	    vl != sc_to_vlt(dd, sc)) {
 421		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 422		ret = -EINVAL;
 423		goto free_req;
 424	}
 425
 426	/* Checking P_KEY for requests from user-space */
 427	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
 428	slid = be16_to_cpu(req->hdr.lrh[3]);
 429	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
 430		ret = -EINVAL;
 431		goto free_req;
 432	}
 433
 434	/*
  435	 * We should also check BTH.lnh. If it says the next header is a GRH,
  436	 * the RXE parsing will be off and will land in the middle of the KDETH
  437	 * header or miss it entirely.
 438	 */
 439	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 440		SDMA_DBG(req, "User tried to pass in a GRH");
 441		ret = -EINVAL;
 442		goto free_req;
 443	}
 444
 445	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 446	/*
 447	 * Calculate the initial TID offset based on the values of
 448	 * KDETH.OFFSET and KDETH.OM that are passed in.
 449	 */
 450	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 451		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 452		 KDETH_OM_LARGE : KDETH_OM_SMALL);
 453	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
 454					       info.comp_idx, req->tidoffset);
 455	idx++;
 456
 457	/* Save all the IO vector structures */
 458	for (i = 0; i < req->data_iovs; i++) {
 459		req->iovs[i].offset = 0;
 460		INIT_LIST_HEAD(&req->iovs[i].list);
 461		memcpy(&req->iovs[i].iov,
 462		       iovec + idx++,
 463		       sizeof(req->iovs[i].iov));
 464		if (req->iovs[i].iov.iov_len == 0) {
 465			ret = -EINVAL;
 466			goto free_req;
 467		}
 468		req->data_len += req->iovs[i].iov.iov_len;
 469	}
 470	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
 471					 info.comp_idx, req->data_len);
 472	if (pcount > req->info.npkts)
 473		pcount = req->info.npkts;
 474	/*
  475	 * Copy any TID info.
  476	 * User space will provide the TID info only when the
  477	 * request type is EXPECTED. This is true even if there is
  478	 * only one packet in the request and the header is already
  479	 * set up. The reason the single-TID case still goes through
  480	 * here is that the driver needs to perform safety checks.
 481	 */
 482	if (req_opcode(req->info.ctrl) == EXPECTED) {
 483		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 484		u32 *tmp;
 485
 486		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 487			ret = -EINVAL;
 488			goto free_req;
 489		}
 490
 491		/*
 492		 * We have to copy all of the tids because they may vary
 493		 * in size and, therefore, the TID count might not be
 494		 * equal to the pkt count. However, there is no way to
 495		 * tell at this point.
 496		 */
 497		tmp = memdup_array_user(iovec[idx].iov_base,
 498					ntids, sizeof(*req->tids));
 499		if (IS_ERR(tmp)) {
 500			ret = PTR_ERR(tmp);
 501			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 502				 ntids, ret);
 503			goto free_req;
 504		}
 505		req->tids = tmp;
 506		req->n_tids = ntids;
 507		req->tididx = 0;
 508		idx++;
 509	}
 510
 511	dlid = be16_to_cpu(req->hdr.lrh[1]);
 512	selector = dlid_to_selector(dlid);
 513	selector += uctxt->ctxt + fd->subctxt;
 514	req->sde = sdma_select_user_engine(dd, selector, vl);
 515
 516	if (!req->sde || !sdma_running(req->sde)) {
 517		ret = -ECOMM;
 518		goto free_req;
 519	}
 520
 521	/* We don't need an AHG entry if the request contains only one packet */
 522	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
 523		req->ahg_idx = sdma_ahg_alloc(req->sde);
 524
 525	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 526	pq->state = SDMA_PKT_Q_ACTIVE;
 527
 528	/*
 529	 * This is a somewhat blocking send implementation.
 530	 * The driver will block the caller until all packets of the
 531	 * request have been submitted to the SDMA engine. However, it
 532	 * will not wait for send completions.
 533	 */
 534	while (req->seqsubmitted != req->info.npkts) {
 535		ret = user_sdma_send_pkts(req, pcount);
 536		if (ret < 0) {
 537			int we_ret;
 538
 539			if (ret != -EBUSY)
 540				goto free_req;
 541			we_ret = wait_event_interruptible_timeout(
 542				pq->busy.wait_dma,
 543				pq->state == SDMA_PKT_Q_ACTIVE,
 544				msecs_to_jiffies(
 545					SDMA_IOWAIT_TIMEOUT));
 546			trace_hfi1_usdma_we(pq, we_ret);
 547			if (we_ret <= 0)
 548				flush_pq_iowait(pq);
 549		}
 550	}
 551	*count += idx;
 552	return 0;
 553free_req:
 554	/*
  555	 * If seqsubmitted == npkts, the completion routine controls the
  556	 * final state.  If seqsubmitted < npkts, wait for any
  557	 * outstanding packets to finish before cleaning up.
 558	 */
 559	if (req->seqsubmitted < req->info.npkts) {
 560		if (req->seqsubmitted)
 561			wait_event(pq->busy.wait_dma,
 562				   (req->seqcomp == req->seqsubmitted - 1));
 563		user_sdma_free_request(req);
 564		pq_update(pq);
 565		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
 566	}
 567	return ret;
 568}
 569
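/*
 * Illustration of the first-packet math in compute_data_length(), assuming
 * an 8-byte PBC: get_lrh_len() yields lrhlen = sizeof(hdr) - 8 + 4 + datalen
 * and LRH.PktLen holds lrhlen >> 2, so (lrh[2] << 2) - (sizeof(hdr) - 4)
 * recovers datalen. The remaining packets simply take
 * min(fragsize, bytes left).
 */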
 570static inline u32 compute_data_length(struct user_sdma_request *req,
 571				      struct user_sdma_txreq *tx)
 572{
 573	/*
 574	 * Determine the proper size of the packet data.
 575	 * The size of the data of the first packet is in the header
 576	 * template. However, it includes the header and ICRC, which need
 577	 * to be subtracted.
  578	 * The minimum representable packet data length in a header is 4 bytes;
  579	 * therefore, when the requested data length is less than 4 bytes, there
  580	 * is only one packet and its data length is equal to the
  581	 * request's data length.
  582	 * The size of the remaining packets is the minimum of the frag
  583	 * size (MTU) and the remaining data in the request.
 584	 */
 585	u32 len;
 586
 587	if (!req->seqnum) {
 588		if (req->data_len < sizeof(u32))
 589			len = req->data_len;
 590		else
 591			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 592			       (sizeof(tx->hdr) - 4));
 593	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
 594		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 595			PAGE_SIZE;
 596		/*
 597		 * Get the data length based on the remaining space in the
 598		 * TID pair.
 599		 */
 600		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 601		/* If we've filled up the TID pair, move to the next one. */
 602		if (unlikely(!len) && ++req->tididx < req->n_tids &&
 603		    req->tids[req->tididx]) {
 604			tidlen = EXP_TID_GET(req->tids[req->tididx],
 605					     LEN) * PAGE_SIZE;
 606			req->tidoffset = 0;
 607			len = min_t(u32, tidlen, req->info.fragsize);
 608		}
 609		/*
 610		 * Since the TID pairs map entire pages, make sure that we
  611		 * are not going to try to send more data than we have
 612		 * remaining.
 613		 */
 614		len = min(len, req->data_len - req->sent);
 615	} else {
 616		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 617	}
 618	trace_hfi1_sdma_user_compute_length(req->pq->dd,
 619					    req->pq->ctxt,
 620					    req->pq->subctxt,
 621					    req->info.comp_idx,
 622					    len);
 623	return len;
 624}
 625
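/*
 * pad_len() rounds a length up to the next 4-byte boundary, e.g.
 * pad_len(5) == 8, pad_len(6) == 8 and pad_len(8) == 8; get_lrh_len() then
 * adds the header (minus the PBC) and the 4-byte ICRC on top of that.
 */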
 626static inline u32 pad_len(u32 len)
 627{
 628	if (len & (sizeof(u32) - 1))
 629		len += sizeof(u32) - (len & (sizeof(u32) - 1));
 630	return len;
 631}
 632
 633static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 634{
 635	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
 636	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 637}
 638
 639static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 640			       struct user_sdma_txreq *tx,
 641			       u32 datalen)
 642{
 643	int ret;
 644	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 645	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 646	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 647
 648	/*
 649	 * Copy the request header into the tx header
 650	 * because the HW needs a cacheline-aligned
 651	 * address.
  652	 * This copy could be optimized out if the hdr
 653	 * member of user_sdma_request were also
 654	 * cacheline aligned.
 655	 */
 656	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 657	if (PBC2LRH(pbclen) != lrhlen) {
 658		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 659		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 660	}
 661	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 662	if (ret)
 663		return ret;
 664	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 665			      sizeof(tx->hdr) + datalen, req->ahg_idx,
 666			      0, NULL, 0, user_sdma_txreq_cb);
 667	if (ret)
 668		return ret;
 669	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 670	if (ret)
 671		sdma_txclean(pq->dd, &tx->txreq);
 672	return ret;
 673}
 674
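/*
 * Rough shape of the send loop below: build up to maxpkts tx requests, each
 * header either written in full via set_txreq_header() or expressed as AHG
 * field updates via set_txreq_header_ahg(), attach the user pages backing the
 * payload, then hand the whole list to the engine in one sdma_send_txlist()
 * call and credit req->seqsubmitted with however many packets were accepted.
 */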
 675static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 676{
 677	int ret = 0;
 678	u16 count;
 679	unsigned npkts = 0;
 680	struct user_sdma_txreq *tx = NULL;
 681	struct hfi1_user_sdma_pkt_q *pq = NULL;
 682	struct user_sdma_iovec *iovec = NULL;
 683
 684	if (!req->pq)
 685		return -EINVAL;
 686
 687	pq = req->pq;
 688
 689	/* If tx completion has reported an error, we are done. */
 690	if (READ_ONCE(req->has_error))
 691		return -EFAULT;
 692
 693	/*
 694	 * Check if we might have sent the entire request already
 695	 */
 696	if (unlikely(req->seqnum == req->info.npkts)) {
 697		if (!list_empty(&req->txps))
 698			goto dosend;
 699		return ret;
 700	}
 701
 702	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 703		maxpkts = req->info.npkts - req->seqnum;
 704
 705	while (npkts < maxpkts) {
 706		u32 datalen = 0;
 707
 708		/*
 709		 * Check whether any of the completions have come back
 710		 * with errors. If so, we are not going to process any
 711		 * more packets from this request.
 712		 */
 713		if (READ_ONCE(req->has_error))
 714			return -EFAULT;
 715
 716		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 717		if (!tx)
 718			return -ENOMEM;
 719
 720		tx->flags = 0;
 721		tx->req = req;
 722		INIT_LIST_HEAD(&tx->list);
 723
 724		/*
 725		 * For the last packet set the ACK request
 726		 * and disable header suppression.
 727		 */
 728		if (req->seqnum == req->info.npkts - 1)
 729			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 730				      TXREQ_FLAGS_REQ_DISABLE_SH);
 731
 732		/*
  733		 * Calculate the payload size - this is the minimum of the
  734		 * fragment (MTU) size and the remaining bytes in the request,
  735		 * but only if we have payload data.
 736		 */
 737		if (req->data_len) {
 738			iovec = &req->iovs[req->iov_idx];
 739			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 740				if (++req->iov_idx == req->data_iovs) {
 741					ret = -EFAULT;
 742					goto free_tx;
 743				}
 744				iovec = &req->iovs[req->iov_idx];
 745				WARN_ON(iovec->offset);
 746			}
 747
 748			datalen = compute_data_length(req, tx);
 749
 750			/*
  751			 * Disable header suppression for payloads <= 8 DWs.
  752			 * If there is an uncorrectable error in the receive
  753			 * data FIFO when the received payload size is less than
  754			 * or equal to 8 DWs, then RxDmaDataFifoRdUncErr is
  755			 * not reported; RHF.EccErr is set instead if the header
  756			 * is not suppressed.
 757			 */
 758			if (!datalen) {
 759				SDMA_DBG(req,
 760					 "Request has data but pkt len is 0");
 761				ret = -EFAULT;
 762				goto free_tx;
 763			} else if (datalen <= 32) {
 764				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 765			}
 766		}
 767
 768		if (req->ahg_idx >= 0) {
 769			if (!req->seqnum) {
 770				ret = user_sdma_txadd_ahg(req, tx, datalen);
 771				if (ret)
 772					goto free_tx;
 773			} else {
 774				int changes;
 775
 776				changes = set_txreq_header_ahg(req, tx,
 777							       datalen);
 778				if (changes < 0) {
 779					ret = changes;
 780					goto free_tx;
 781				}
 782			}
 783		} else {
 784			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 785					  datalen, user_sdma_txreq_cb);
 786			if (ret)
 787				goto free_tx;
 788			/*
 789			 * Modify the header for this packet. This only needs
 790			 * to be done if we are not going to use AHG. Otherwise,
 791			 * the HW will do it based on the changes we gave it
 792			 * during sdma_txinit_ahg().
 793			 */
 794			ret = set_txreq_header(req, tx, datalen);
 795			if (ret)
 796				goto free_txreq;
 797		}
 798
 799		req->koffset += datalen;
 800		if (req_opcode(req->info.ctrl) == EXPECTED)
 801			req->tidoffset += datalen;
 802		req->sent += datalen;
 803		while (datalen) {
 804			ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
 805							    &datalen);
 806			if (ret)
 807				goto free_txreq;
 808			iovec = &req->iovs[req->iov_idx];
 809		}
 810		list_add_tail(&tx->txreq.list, &req->txps);
 811		/*
 812		 * It is important to increment this here as it is used to
 813		 * generate the BTH.PSN and, therefore, can't be bulk-updated
 814		 * outside of the loop.
 815		 */
 816		tx->seqnum = req->seqnum++;
 817		npkts++;
 818	}
 819dosend:
 820	ret = sdma_send_txlist(req->sde,
 821			       iowait_get_ib_work(&pq->busy),
 822			       &req->txps, &count);
 823	req->seqsubmitted += count;
 824	if (req->seqsubmitted == req->info.npkts) {
 825		/*
 826		 * The txreq has already been submitted to the HW queue
 827		 * so we can free the AHG entry now. Corruption will not
 828		 * happen due to the sequential manner in which
 829		 * descriptors are processed.
 830		 */
 831		if (req->ahg_idx >= 0)
 832			sdma_ahg_free(req->sde, req->ahg_idx);
 833	}
 834	return ret;
 835
 836free_txreq:
 837	sdma_txclean(pq->dd, &tx->txreq);
 838free_tx:
 839	kmem_cache_free(pq->txreq_cache, tx);
 840	return ret;
 841}
 842
 843static int check_header_template(struct user_sdma_request *req,
 844				 struct hfi1_pkt_header *hdr, u32 lrhlen,
 845				 u32 datalen)
 846{
 847	/*
 848	 * Perform safety checks for any type of packet:
  849	 *    - transfer size is a multiple of 64 bytes
  850	 *    - packet length is a multiple of 4 bytes
  851	 *    - packet length is not larger than the MTU size
 852	 *
 853	 * These checks are only done for the first packet of the
 854	 * transfer since the header is "given" to us by user space.
 855	 * For the remainder of the packets we compute the values.
 856	 */
 857	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
 858	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
 859		return -EINVAL;
 860
 861	if (req_opcode(req->info.ctrl) == EXPECTED) {
 862		/*
 863		 * The header is checked only on the first packet. Furthermore,
 864		 * we ensure that at least one TID entry is copied when the
 865		 * request is submitted. Therefore, we don't have to verify that
 866		 * tididx points to something sane.
 867		 */
 868		u32 tidval = req->tids[req->tididx],
 869			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
 870			tididx = EXP_TID_GET(tidval, IDX),
 871			tidctrl = EXP_TID_GET(tidval, CTRL),
 872			tidoff;
 873		__le32 kval = hdr->kdeth.ver_tid_offset;
 874
 875		tidoff = KDETH_GET(kval, OFFSET) *
 876			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 877			   KDETH_OM_LARGE : KDETH_OM_SMALL);
 878		/*
 879		 * Expected receive packets have the following
 880		 * additional checks:
 881		 *     - offset is not larger than the TID size
 882		 *     - TIDCtrl values match between header and TID array
 883		 *     - TID indexes match between header and TID array
 884		 */
 885		if ((tidoff + datalen > tidlen) ||
 886		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
 887		    KDETH_GET(kval, TID) != tididx)
 888			return -EINVAL;
 889	}
 890	return 0;
 891}
 892
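/*
 * Illustration for set_pkt_bth_psn() below: the PSN mask is 31 bits with
 * EXTENDED_PSN and 24 bits otherwise. Eager traffic just advances the masked
 * PSN by "frags"; expected traffic advances only the low
 * HFI1_KDETH_BTH_SEQ_MASK bits, preserving the generation bits above them
 * (e.g. with a hypothetical 0x7ff sequence mask, psn 0x1800 plus 3 frags
 * becomes 0x1803 while the generation part 0x1800 is untouched).
 */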
 893/*
  894 * Correctly set the BTH.PSN field based on the type of
  895 * transfer - eager packets can just increment the PSN, but
  896 * expected packets encode generation and sequence in the
  897 * BTH.PSN field, so just incrementing will result in errors.
 898 */
 899static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
 900{
 901	u32 val = be32_to_cpu(bthpsn),
 902		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
 903			0xffffffull),
 904		psn = val & mask;
 905	if (expct)
 906		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
 907			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
 908	else
 909		psn = psn + frags;
 910	return psn & mask;
 911}
 912
 913static int set_txreq_header(struct user_sdma_request *req,
 914			    struct user_sdma_txreq *tx, u32 datalen)
 915{
 916	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 917	struct hfi1_pkt_header *hdr = &tx->hdr;
 918	u8 omfactor; /* KDETH.OM */
 919	u16 pbclen;
 920	int ret;
 921	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
 922
 923	/* Copy the header template to the request before modification */
 924	memcpy(hdr, &req->hdr, sizeof(*hdr));
 925
 926	/*
 927	 * Check if the PBC and LRH length are mismatched. If so
 928	 * adjust both in the header.
 929	 */
 930	pbclen = le16_to_cpu(hdr->pbc[0]);
 931	if (PBC2LRH(pbclen) != lrhlen) {
 932		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 933		hdr->pbc[0] = cpu_to_le16(pbclen);
 934		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
 935		/*
 936		 * Third packet
 937		 * This is the first packet in the sequence that has
 938		 * a "static" size that can be used for the rest of
 939		 * the packets (besides the last one).
 940		 */
 941		if (unlikely(req->seqnum == 2)) {
 942			/*
 943			 * From this point on the lengths in both the
 944			 * PBC and LRH are the same until the last
 945			 * packet.
 946			 * Adjust the template so we don't have to update
 947			 * every packet
  948			 * every packet.
 949			req->hdr.pbc[0] = hdr->pbc[0];
 950			req->hdr.lrh[2] = hdr->lrh[2];
 951		}
 952	}
 953	/*
 954	 * We only have to modify the header if this is not the
 955	 * first packet in the request. Otherwise, we use the
 956	 * header given to us.
 957	 */
 958	if (unlikely(!req->seqnum)) {
 959		ret = check_header_template(req, hdr, lrhlen, datalen);
 960		if (ret)
 961			return ret;
 962		goto done;
 963	}
 964
 965	hdr->bth[2] = cpu_to_be32(
 966		set_pkt_bth_psn(hdr->bth[2],
 967				(req_opcode(req->info.ctrl) == EXPECTED),
 968				req->seqnum));
 969
 970	/* Set ACK request on last packet */
 971	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 972		hdr->bth[2] |= cpu_to_be32(1UL << 31);
 973
 974	/* Set the new offset */
 975	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
 976	/* Expected packets have to fill in the new TID information */
 977	if (req_opcode(req->info.ctrl) == EXPECTED) {
 978		tidval = req->tids[req->tididx];
 979		/*
 980		 * If the offset puts us at the end of the current TID,
 981		 * advance everything.
 982		 */
 983		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
 984					 PAGE_SIZE)) {
 985			req->tidoffset = 0;
 986			/*
  987			 * Since we don't copy all the TIDs at once,
  988			 * we have to check again.
 989			 */
 990			if (++req->tididx > req->n_tids - 1 ||
 991			    !req->tids[req->tididx]) {
 992				return -EINVAL;
 993			}
 994			tidval = req->tids[req->tididx];
 995		}
 996		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
 997			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
 998			KDETH_OM_SMALL_SHIFT;
 999		/* Set KDETH.TIDCtrl based on value for this TID. */
1000		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1001			  EXP_TID_GET(tidval, CTRL));
1002		/* Set KDETH.TID based on value for this TID */
1003		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1004			  EXP_TID_GET(tidval, IDX));
1005		/* Clear KDETH.SH when DISABLE_SH flag is set */
1006		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1007			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1008		/*
1009		 * Set the KDETH.OFFSET and KDETH.OM based on size of
1010		 * transfer.
1011		 */
1012		trace_hfi1_sdma_user_tid_info(
1013			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1014			req->tidoffset, req->tidoffset >> omfactor,
1015			omfactor != KDETH_OM_SMALL_SHIFT);
1016		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1017			  req->tidoffset >> omfactor);
1018		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1019			  omfactor != KDETH_OM_SMALL_SHIFT);
1020	}
1021done:
1022	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1023				    req->info.comp_idx, hdr, tidval);
1024	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1025}
1026
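/*
 * Sketch of the AHG path below: instead of copying a full header per packet,
 * ahg[] accumulates small field-update descriptors (built by ahg_header_set(),
 * which returns the running entry count or a negative value on overflow) and
 * the hardware applies them to the header it already holds - the PBC/LRH
 * lengths, BTH.PSN and BTH.A, KDETH.Offset and, for expected traffic, the
 * KDETH TID fields.
 */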
1027static int set_txreq_header_ahg(struct user_sdma_request *req,
1028				struct user_sdma_txreq *tx, u32 datalen)
1029{
1030	u32 ahg[AHG_KDETH_ARRAY_SIZE];
1031	int idx = 0;
1032	u8 omfactor; /* KDETH.OM */
1033	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1034	struct hfi1_pkt_header *hdr = &req->hdr;
1035	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1036	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1037	size_t array_size = ARRAY_SIZE(ahg);
1038
1039	if (PBC2LRH(pbclen) != lrhlen) {
1040		/* PBC.PbcLengthDWs */
1041		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1042				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1043		if (idx < 0)
1044			return idx;
1045		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1046		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1047				     (__force u16)cpu_to_be16(lrhlen >> 2));
1048		if (idx < 0)
1049			return idx;
1050	}
1051
1052	/*
1053	 * Do the common updates
1054	 */
1055	/* BTH.PSN and BTH.A */
1056	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1057		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1058	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1059		val32 |= 1UL << 31;
1060	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1061			     (__force u16)cpu_to_be16(val32 >> 16));
1062	if (idx < 0)
1063		return idx;
1064	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1065			     (__force u16)cpu_to_be16(val32 & 0xffff));
1066	if (idx < 0)
1067		return idx;
1068	/* KDETH.Offset */
1069	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1070			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
1071	if (idx < 0)
1072		return idx;
1073	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1074			     (__force u16)cpu_to_le16(req->koffset >> 16));
1075	if (idx < 0)
1076		return idx;
1077	if (req_opcode(req->info.ctrl) == EXPECTED) {
1078		__le16 val;
1079
1080		tidval = req->tids[req->tididx];
1081
1082		/*
1083		 * If the offset puts us at the end of the current TID,
1084		 * advance everything.
1085		 */
1086		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1087					 PAGE_SIZE)) {
1088			req->tidoffset = 0;
1089			/*
 1090			 * Since we don't copy all the TIDs at once,
 1091			 * we have to check again.
1092			 */
1093			if (++req->tididx > req->n_tids - 1 ||
1094			    !req->tids[req->tididx])
1095				return -EINVAL;
1096			tidval = req->tids[req->tididx];
1097		}
1098		omfactor = ((EXP_TID_GET(tidval, LEN) *
1099				  PAGE_SIZE) >=
1100				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1101				 KDETH_OM_SMALL_SHIFT;
1102		/* KDETH.OM and KDETH.OFFSET (TID) */
1103		idx = ahg_header_set(
1104				ahg, idx, array_size, 7, 0, 16,
1105				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1106				((req->tidoffset >> omfactor)
1107				& 0x7fff)));
1108		if (idx < 0)
1109			return idx;
1110		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1111		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1112				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1113
1114		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1115			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1116						      INTR) <<
1117					    AHG_KDETH_INTR_SHIFT));
1118		} else {
1119			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1120			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1121			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1122						      INTR) <<
1123					     AHG_KDETH_INTR_SHIFT));
1124		}
1125
1126		idx = ahg_header_set(ahg, idx, array_size,
1127				     7, 16, 14, (__force u16)val);
1128		if (idx < 0)
1129			return idx;
1130	}
1131
1132	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1133					req->info.comp_idx, req->sde->this_idx,
1134					req->ahg_idx, ahg, idx, tidval);
1135	sdma_txinit_ahg(&tx->txreq,
1136			SDMA_TXREQ_F_USE_AHG,
1137			datalen, req->ahg_idx, idx,
1138			ahg, sizeof(req->hdr),
1139			user_sdma_txreq_cb);
1140
1141	return idx;
1142}
1143
1144/**
1145 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1146 * @txreq: valid sdma tx request
1147 * @status: success/failure of request
1148 *
1149 * Called when the SDMA progress state machine gets notification that
1150 * the SDMA descriptors for this tx request have been processed by the
1151 * DMA engine. Called in interrupt context.
1152 * Only do work on completed sequences.
1153 */
1154static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1155{
1156	struct user_sdma_txreq *tx =
1157		container_of(txreq, struct user_sdma_txreq, txreq);
1158	struct user_sdma_request *req;
1159	struct hfi1_user_sdma_pkt_q *pq;
1160	struct hfi1_user_sdma_comp_q *cq;
1161	enum hfi1_sdma_comp_state state = COMPLETE;
1162
1163	if (!tx->req)
1164		return;
1165
1166	req = tx->req;
1167	pq = req->pq;
1168	cq = req->cq;
1169
1170	if (status != SDMA_TXREQ_S_OK) {
1171		SDMA_DBG(req, "SDMA completion with error %d",
1172			 status);
1173		WRITE_ONCE(req->has_error, 1);
1174		state = ERROR;
1175	}
1176
1177	req->seqcomp = tx->seqnum;
1178	kmem_cache_free(pq->txreq_cache, tx);
1179
 1180	/* sequence isn't complete yet?  Then we are done for now */
1181	if (req->seqcomp != req->info.npkts - 1)
1182		return;
1183
1184	user_sdma_free_request(req);
1185	set_comp_state(pq, cq, req->info.comp_idx, state, status);
1186	pq_update(pq);
1187}
1188
1189static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1190{
1191	if (atomic_dec_and_test(&pq->n_reqs))
1192		wake_up(&pq->wait);
1193}
1194
1195static void user_sdma_free_request(struct user_sdma_request *req)
1196{
1197	if (!list_empty(&req->txps)) {
1198		struct sdma_txreq *t, *p;
1199
1200		list_for_each_entry_safe(t, p, &req->txps, list) {
1201			struct user_sdma_txreq *tx =
1202				container_of(t, struct user_sdma_txreq, txreq);
1203			list_del_init(&t->list);
1204			sdma_txclean(req->pq->dd, t);
1205			kmem_cache_free(req->pq->txreq_cache, tx);
1206		}
1207	}
1208
1209	kfree(req->tids);
1210	clear_bit(req->info.comp_idx, req->pq->req_in_use);
1211}
1212
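/*
 * Ordering note for set_comp_state() below: errcode is written before status
 * and separated from it by smp_wmb(), so a user-space reader that presumably
 * polls the mmapped completion ring and observes status == ERROR can, with a
 * pairing read barrier on its side, also see the matching errcode rather than
 * a stale value.
 */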
1213static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1214				  struct hfi1_user_sdma_comp_q *cq,
1215				  u16 idx, enum hfi1_sdma_comp_state state,
1216				  int ret)
1217{
1218	if (state == ERROR)
1219		cq->comps[idx].errcode = -ret;
1220	smp_wmb(); /* make sure errcode is visible first */
1221	cq->comps[idx].status = state;
1222	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1223					idx, state, ret);
1224}
v6.9.4
   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2/*
   3 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
   4 * Copyright(c) 2015 - 2018 Intel Corporation.
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/types.h>
   9#include <linux/device.h>
  10#include <linux/dmapool.h>
  11#include <linux/slab.h>
  12#include <linux/list.h>
  13#include <linux/highmem.h>
  14#include <linux/io.h>
  15#include <linux/uio.h>
  16#include <linux/rbtree.h>
  17#include <linux/spinlock.h>
  18#include <linux/delay.h>
  19#include <linux/kthread.h>
  20#include <linux/mmu_context.h>
  21#include <linux/module.h>
  22#include <linux/vmalloc.h>
  23#include <linux/string.h>
  24
  25#include "hfi.h"
  26#include "sdma.h"
  27#include "user_sdma.h"
  28#include "verbs.h"  /* for the headers */
  29#include "common.h" /* for struct hfi1_tid_info */
  30#include "trace.h"
  31
  32static uint hfi1_sdma_comp_ring_size = 128;
  33module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  34MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  35
  36static unsigned initial_pkt_count = 8;
  37
  38static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  39static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  40static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  41static void user_sdma_free_request(struct user_sdma_request *req);
  42static int check_header_template(struct user_sdma_request *req,
  43				 struct hfi1_pkt_header *hdr, u32 lrhlen,
  44				 u32 datalen);
  45static int set_txreq_header(struct user_sdma_request *req,
  46			    struct user_sdma_txreq *tx, u32 datalen);
  47static int set_txreq_header_ahg(struct user_sdma_request *req,
  48				struct user_sdma_txreq *tx, u32 len);
  49static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  50				  struct hfi1_user_sdma_comp_q *cq,
  51				  u16 idx, enum hfi1_sdma_comp_state state,
  52				  int ret);
  53static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  54static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  55
  56static int defer_packet_queue(
  57	struct sdma_engine *sde,
  58	struct iowait_work *wait,
  59	struct sdma_txreq *txreq,
  60	uint seq,
  61	bool pkts_sent);
  62static void activate_packet_queue(struct iowait *wait, int reason);
  63
  64static int defer_packet_queue(
  65	struct sdma_engine *sde,
  66	struct iowait_work *wait,
  67	struct sdma_txreq *txreq,
  68	uint seq,
  69	bool pkts_sent)
  70{
  71	struct hfi1_user_sdma_pkt_q *pq =
  72		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  73
  74	write_seqlock(&sde->waitlock);
  75	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
  76	if (sdma_progress(sde, seq, txreq))
  77		goto eagain;
  78	/*
  79	 * We are assuming that if the list is enqueued somewhere, it
  80	 * is to the dmawait list since that is the only place where
  81	 * it is supposed to be enqueued.
  82	 */
  83	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
  84	if (list_empty(&pq->busy.list)) {
  85		pq->busy.lock = &sde->waitlock;
  86		iowait_get_priority(&pq->busy);
  87		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
  88	}
  89	write_sequnlock(&sde->waitlock);
  90	return -EBUSY;
  91eagain:
  92	write_sequnlock(&sde->waitlock);
  93	return -EAGAIN;
  94}
  95
  96static void activate_packet_queue(struct iowait *wait, int reason)
  97{
  98	struct hfi1_user_sdma_pkt_q *pq =
  99		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 100
 101	trace_hfi1_usdma_activate(pq, wait, reason);
 102	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 103	wake_up(&wait->wait_dma);
 104};
 105
 106int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 107				struct hfi1_filedata *fd)
 108{
 109	int ret = -ENOMEM;
 110	char buf[64];
 111	struct hfi1_devdata *dd;
 112	struct hfi1_user_sdma_comp_q *cq;
 113	struct hfi1_user_sdma_pkt_q *pq;
 114
 115	if (!uctxt || !fd)
 116		return -EBADF;
 117
 118	if (!hfi1_sdma_comp_ring_size)
 119		return -EINVAL;
 120
 121	dd = uctxt->dd;
 122
 123	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 124	if (!pq)
 125		return -ENOMEM;
 126	pq->dd = dd;
 127	pq->ctxt = uctxt->ctxt;
 128	pq->subctxt = fd->subctxt;
 129	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 130	atomic_set(&pq->n_reqs, 0);
 131	init_waitqueue_head(&pq->wait);
 132	atomic_set(&pq->n_locked, 0);
 133
 134	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 135		    activate_packet_queue, NULL, NULL);
 136	pq->reqidx = 0;
 137
 138	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 139			   sizeof(*pq->reqs),
 140			   GFP_KERNEL);
 141	if (!pq->reqs)
 142		goto pq_reqs_nomem;
 143
 144	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
 145	if (!pq->req_in_use)
 146		goto pq_reqs_no_in_use;
 147
 148	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 149		 fd->subctxt);
 150	pq->txreq_cache = kmem_cache_create(buf,
 151					    sizeof(struct user_sdma_txreq),
 152					    L1_CACHE_BYTES,
 153					    SLAB_HWCACHE_ALIGN,
 154					    NULL);
 155	if (!pq->txreq_cache) {
 156		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 157			   uctxt->ctxt);
 158		goto pq_txreq_nomem;
 159	}
 160
 161	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 162	if (!cq)
 163		goto cq_nomem;
 164
 165	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 166				 * hfi1_sdma_comp_ring_size));
 167	if (!cq->comps)
 168		goto cq_comps_nomem;
 169
 170	cq->nentries = hfi1_sdma_comp_ring_size;
 171
 172	ret = hfi1_init_system_pinning(pq);
 173	if (ret)
 174		goto pq_mmu_fail;
 175
 176	rcu_assign_pointer(fd->pq, pq);
 177	fd->cq = cq;
 178
 179	return 0;
 180
 181pq_mmu_fail:
 182	vfree(cq->comps);
 183cq_comps_nomem:
 184	kfree(cq);
 185cq_nomem:
 186	kmem_cache_destroy(pq->txreq_cache);
 187pq_txreq_nomem:
 188	bitmap_free(pq->req_in_use);
 189pq_reqs_no_in_use:
 190	kfree(pq->reqs);
 191pq_reqs_nomem:
 192	kfree(pq);
 193
 194	return ret;
 195}
 196
 197static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
 198{
 199	unsigned long flags;
 200	seqlock_t *lock = pq->busy.lock;
 201
 202	if (!lock)
 203		return;
 204	write_seqlock_irqsave(lock, flags);
 205	if (!list_empty(&pq->busy.list)) {
 206		list_del_init(&pq->busy.list);
 207		pq->busy.lock = NULL;
 208	}
 209	write_sequnlock_irqrestore(lock, flags);
 210}
 211
 212int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 213			       struct hfi1_ctxtdata *uctxt)
 214{
 215	struct hfi1_user_sdma_pkt_q *pq;
 216
 217	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 218
 219	spin_lock(&fd->pq_rcu_lock);
 220	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
 221				    lockdep_is_held(&fd->pq_rcu_lock));
 222	if (pq) {
 223		rcu_assign_pointer(fd->pq, NULL);
 224		spin_unlock(&fd->pq_rcu_lock);
 225		synchronize_srcu(&fd->pq_srcu);
 226		/* at this point there can be no more new requests */
 227		iowait_sdma_drain(&pq->busy);
 228		/* Wait until all requests have been freed. */
 229		wait_event_interruptible(
 230			pq->wait,
 231			!atomic_read(&pq->n_reqs));
 232		kfree(pq->reqs);
 233		hfi1_free_system_pinning(pq);
 234		bitmap_free(pq->req_in_use);
 235		kmem_cache_destroy(pq->txreq_cache);
 236		flush_pq_iowait(pq);
 237		kfree(pq);
 238	} else {
 239		spin_unlock(&fd->pq_rcu_lock);
 240	}
 241	if (fd->cq) {
 242		vfree(fd->cq->comps);
 243		kfree(fd->cq);
 244		fd->cq = NULL;
 245	}
 246	return 0;
 247}
 248
 249static u8 dlid_to_selector(u16 dlid)
 250{
 251	static u8 mapping[256];
 252	static int initialized;
 253	static u8 next;
 254	int hash;
 255
 256	if (!initialized) {
 257		memset(mapping, 0xFF, 256);
 258		initialized = 1;
 259	}
 260
 261	hash = ((dlid >> 8) ^ dlid) & 0xFF;
 262	if (mapping[hash] == 0xFF) {
 263		mapping[hash] = next;
 264		next = (next + 1) & 0x7F;
 265	}
 266
 267	return mapping[hash];
 268}
 269
 270/**
 271 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 272 * @fd: valid file descriptor
 273 * @iovec: array of io vectors to process
 274 * @dim: overall iovec array size
 275 * @count: number of io vector array entries processed
 276 */
 277int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 278				   struct iovec *iovec, unsigned long dim,
 279				   unsigned long *count)
 280{
 281	int ret = 0, i;
 282	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 283	struct hfi1_user_sdma_pkt_q *pq =
 284		srcu_dereference(fd->pq, &fd->pq_srcu);
 285	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 286	struct hfi1_devdata *dd = pq->dd;
 287	unsigned long idx = 0;
 288	u8 pcount = initial_pkt_count;
 289	struct sdma_req_info info;
 290	struct user_sdma_request *req;
 291	u8 opcode, sc, vl;
 292	u16 pkey;
 293	u32 slid;
 294	u16 dlid;
 295	u32 selector;
 296
 297	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 298		hfi1_cdbg(
 299		   SDMA,
 300		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 301		   dd->unit, uctxt->ctxt, fd->subctxt,
 302		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 303		return -EINVAL;
 304	}
 305	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 306	if (ret) {
 307		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 308			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 309		return -EFAULT;
 310	}
 311
 312	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 313				     (u16 *)&info);
 314	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
 315		hfi1_cdbg(SDMA,
 316			  "[%u:%u:%u:%u] Invalid comp index",
 317			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 318		return -EINVAL;
 319	}
 320
 321	/*
 322	 * Sanity check the header io vector count.  Need at least 1 vector
 323	 * (header) and cannot be larger than the actual io vector count.
 324	 */
 325	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
 326		hfi1_cdbg(SDMA,
 327			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
 328			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
 329			  req_iovcnt(info.ctrl), dim);
 330		return -EINVAL;
 331	}
 332
 333	if (!info.fragsize) {
 334		hfi1_cdbg(SDMA,
 335			  "[%u:%u:%u:%u] Request does not specify fragsize",
 336			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 337		return -EINVAL;
 338	}
 339
 340	/* Try to claim the request. */
 341	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
 342		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
 343			  dd->unit, uctxt->ctxt, fd->subctxt,
 344			  info.comp_idx);
 345		return -EBADSLT;
 346	}
 347	/*
 348	 * All safety checks have been done and this request has been claimed.
 349	 */
 350	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
 351					     info.comp_idx);
 352	req = pq->reqs + info.comp_idx;
 353	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 354	req->data_len  = 0;
 355	req->pq = pq;
 356	req->cq = cq;
 357	req->ahg_idx = -1;
 358	req->iov_idx = 0;
 359	req->sent = 0;
 360	req->seqnum = 0;
 361	req->seqcomp = 0;
 362	req->seqsubmitted = 0;
 363	req->tids = NULL;
 364	req->has_error = 0;
 365	INIT_LIST_HEAD(&req->txps);
 366
 367	memcpy(&req->info, &info, sizeof(info));
 368
 369	/* The request is initialized, count it */
 370	atomic_inc(&pq->n_reqs);
 371
 372	if (req_opcode(info.ctrl) == EXPECTED) {
 373		/* expected must have a TID info and at least one data vector */
 374		if (req->data_iovs < 2) {
 375			SDMA_DBG(req,
 376				 "Not enough vectors for expected request");
 377			ret = -EINVAL;
 378			goto free_req;
 379		}
 380		req->data_iovs--;
 381	}
 382
 383	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 384		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 385			 MAX_VECTORS_PER_REQ);
 386		ret = -EINVAL;
 387		goto free_req;
 388	}
 389
 390	/* Copy the header from the user buffer */
 391	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 392			     sizeof(req->hdr));
 393	if (ret) {
 394		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 395		ret = -EFAULT;
 396		goto free_req;
 397	}
 398
 399	/* If Static rate control is not enabled, sanitize the header. */
 400	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 401		req->hdr.pbc[2] = 0;
 402
 403	/* Validate the opcode. Do not trust packets from user space blindly. */
 404	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 405	if ((opcode & USER_OPCODE_CHECK_MASK) !=
 406	     USER_OPCODE_CHECK_VAL) {
 407		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 408		ret = -EINVAL;
 409		goto free_req;
 410	}
 411	/*
 412	 * Validate the vl. Do not trust packets from user space blindly.
 413	 * VL comes from PBC, SC comes from LRH, and the VL needs to
 414	 * match the SC look up.
 415	 */
 416	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 417	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 418	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 419	if (vl >= dd->pport->vls_operational ||
 420	    vl != sc_to_vlt(dd, sc)) {
 421		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 422		ret = -EINVAL;
 423		goto free_req;
 424	}
 425
 426	/* Checking P_KEY for requests from user-space */
 427	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
 428	slid = be16_to_cpu(req->hdr.lrh[3]);
 429	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
 430		ret = -EINVAL;
 431		goto free_req;
 432	}
 433
 434	/*
 435	 * Also should check the BTH.lnh. If it says the next header is GRH then
 436	 * the RXE parsing will be off and will land in the middle of the KDETH
 437	 * or miss it entirely.
 438	 */
 439	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 440		SDMA_DBG(req, "User tried to pass in a GRH");
 441		ret = -EINVAL;
 442		goto free_req;
 443	}
 444
 445	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 446	/*
 447	 * Calculate the initial TID offset based on the values of
 448	 * KDETH.OFFSET and KDETH.OM that are passed in.
 449	 */
 450	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 451		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 452		 KDETH_OM_LARGE : KDETH_OM_SMALL);
 453	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
 454					       info.comp_idx, req->tidoffset);
 455	idx++;
 456
 457	/* Save all the IO vector structures */
 458	for (i = 0; i < req->data_iovs; i++) {
 459		req->iovs[i].offset = 0;
 460		INIT_LIST_HEAD(&req->iovs[i].list);
 461		memcpy(&req->iovs[i].iov,
 462		       iovec + idx++,
 463		       sizeof(req->iovs[i].iov));
 464		if (req->iovs[i].iov.iov_len == 0) {
 465			ret = -EINVAL;
 466			goto free_req;
 467		}
 468		req->data_len += req->iovs[i].iov.iov_len;
 469	}
 470	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
 471					 info.comp_idx, req->data_len);
 472	if (pcount > req->info.npkts)
 473		pcount = req->info.npkts;
 474	/*
 475	 * Copy any TID info
 476	 * User space will provide the TID info only when the
 477	 * request type is EXPECTED. This is true even if there is
 478	 * only one packet in the request and the header is already
 479	 * setup. The reason for the singular TID case is that the
 480	 * driver needs to perform safety checks.
 481	 */
 482	if (req_opcode(req->info.ctrl) == EXPECTED) {
 483		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 484		u32 *tmp;
 485
 486		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 487			ret = -EINVAL;
 488			goto free_req;
 489		}
 490
 491		/*
 492		 * We have to copy all of the tids because they may vary
 493		 * in size and, therefore, the TID count might not be
 494		 * equal to the pkt count. However, there is no way to
 495		 * tell at this point.
 496		 */
 497		tmp = memdup_array_user(iovec[idx].iov_base,
 498					ntids, sizeof(*req->tids));
 499		if (IS_ERR(tmp)) {
 500			ret = PTR_ERR(tmp);
 501			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 502				 ntids, ret);
 503			goto free_req;
 504		}
 505		req->tids = tmp;
 506		req->n_tids = ntids;
 507		req->tididx = 0;
 508		idx++;
 509	}
 510
 511	dlid = be16_to_cpu(req->hdr.lrh[1]);
 512	selector = dlid_to_selector(dlid);
 513	selector += uctxt->ctxt + fd->subctxt;
 514	req->sde = sdma_select_user_engine(dd, selector, vl);
 515
 516	if (!req->sde || !sdma_running(req->sde)) {
 517		ret = -ECOMM;
 518		goto free_req;
 519	}
 520
 521	/* We don't need an AHG entry if the request contains only one packet */
 522	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
 523		req->ahg_idx = sdma_ahg_alloc(req->sde);
 524
 525	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 526	pq->state = SDMA_PKT_Q_ACTIVE;
 527
 528	/*
 529	 * This is a somewhat blocking send implementation.
 530	 * The driver will block the caller until all packets of the
 531	 * request have been submitted to the SDMA engine. However, it
 532	 * will not wait for send completions.
 533	 */
 534	while (req->seqsubmitted != req->info.npkts) {
 535		ret = user_sdma_send_pkts(req, pcount);
 536		if (ret < 0) {
 537			int we_ret;
 538
 539			if (ret != -EBUSY)
 540				goto free_req;
 541			we_ret = wait_event_interruptible_timeout(
 542				pq->busy.wait_dma,
 543				pq->state == SDMA_PKT_Q_ACTIVE,
 544				msecs_to_jiffies(
 545					SDMA_IOWAIT_TIMEOUT));
 546			trace_hfi1_usdma_we(pq, we_ret);
 547			if (we_ret <= 0)
 548				flush_pq_iowait(pq);
 549		}
 550	}
 551	*count += idx;
 552	return 0;
 553free_req:
 554	/*
  555	 * If seqsubmitted == npkts, the completion routine
  556	 * controls the final state.  If seqsubmitted < npkts, wait for any
 557	 * outstanding packets to finish before cleaning up.
 558	 */
 559	if (req->seqsubmitted < req->info.npkts) {
 560		if (req->seqsubmitted)
 561			wait_event(pq->busy.wait_dma,
 562				   (req->seqcomp == req->seqsubmitted - 1));
 563		user_sdma_free_request(req);
 564		pq_update(pq);
 565		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
 566	}
 567	return ret;
 568}
 569
 570static inline u32 compute_data_length(struct user_sdma_request *req,
 571				      struct user_sdma_txreq *tx)
 572{
 573	/*
 574	 * Determine the proper size of the packet data.
 575	 * The size of the data of the first packet is in the header
 576	 * template. However, it includes the header and ICRC, which need
 577	 * to be subtracted.
  578	 * The minimum representable packet data length in a header is 4 bytes,
  579	 * therefore, when the requested data length is less than 4 bytes, there
  580	 * is only one packet, and the packet data length is equal to the
  581	 * request data length.
  582	 * The size of the remaining packets is the minimum of the frag
  583	 * size (MTU) and the remaining data in the request.
 584	 */
 585	u32 len;
 586
 587	if (!req->seqnum) {
 588		if (req->data_len < sizeof(u32))
 589			len = req->data_len;
 590		else
 591			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 592			       (sizeof(tx->hdr) - 4));
 593	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
 594		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 595			PAGE_SIZE;
 596		/*
 597		 * Get the data length based on the remaining space in the
 598		 * TID pair.
 599		 */
 600		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 601		/* If we've filled up the TID pair, move to the next one. */
 602		if (unlikely(!len) && ++req->tididx < req->n_tids &&
 603		    req->tids[req->tididx]) {
 604			tidlen = EXP_TID_GET(req->tids[req->tididx],
 605					     LEN) * PAGE_SIZE;
 606			req->tidoffset = 0;
 607			len = min_t(u32, tidlen, req->info.fragsize);
 608		}
 609		/*
 610		 * Since the TID pairs map entire pages, make sure that we
  611		 * are not going to try to send more data than we have
 612		 * remaining.
 613		 */
 614		len = min(len, req->data_len - req->sent);
 615	} else {
 616		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 617	}
 618	trace_hfi1_sdma_user_compute_length(req->pq->dd,
 619					    req->pq->ctxt,
 620					    req->pq->subctxt,
 621					    req->info.comp_idx,
 622					    len);
 623	return len;
 624}
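/*
 * Editor's note (illustrative, not part of the driver): the first-packet
 * branch above inverts get_lrh_len().  LRH.PktLen counts the LRH through
 * the ICRC in dwords (everything after the PBC), so
 *
 *	4 * PktLen == (sizeof(hdr) - sizeof(pbc)) + 4 + payload
 *
 * and, assuming the 8-byte PBC used here, that rearranges to
 * payload == 4 * PktLen - (sizeof(hdr) - 4), which is exactly the
 * expression used when req->seqnum == 0.
 */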
 625
 626static inline u32 pad_len(u32 len)
 627{
 628	if (len & (sizeof(u32) - 1))
 629		len += sizeof(u32) - (len & (sizeof(u32) - 1));
 630	return len;
 631}
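/*
 * Illustrative usage (editor's example): pad_len() rounds a length up to
 * the next 4-byte boundary, e.g. pad_len(5) == 8 and pad_len(8) == 8;
 * it is equivalent to ALIGN(len, sizeof(u32)).
 */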
 632
 633static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 634{
 635	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
 636	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 637}
 638
 639static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 640			       struct user_sdma_txreq *tx,
 641			       u32 datalen)
 642{
 643	int ret;
 644	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 645	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 646	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 647
 648	/*
 649	 * Copy the request header into the tx header
 650	 * because the HW needs a cacheline-aligned
 651	 * address.
  652	 * This copy could be optimized out if the hdr
 653	 * member of user_sdma_request were also
 654	 * cacheline aligned.
 655	 */
 656	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 657	if (PBC2LRH(pbclen) != lrhlen) {
 658		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 659		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 660	}
 661	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 662	if (ret)
 663		return ret;
 664	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 665			      sizeof(tx->hdr) + datalen, req->ahg_idx,
 666			      0, NULL, 0, user_sdma_txreq_cb);
 667	if (ret)
 668		return ret;
 669	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 670	if (ret)
 671		sdma_txclean(pq->dd, &tx->txreq);
 672	return ret;
 673}
 674
 675static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 676{
 677	int ret = 0;
 678	u16 count;
 679	unsigned npkts = 0;
 680	struct user_sdma_txreq *tx = NULL;
 681	struct hfi1_user_sdma_pkt_q *pq = NULL;
 682	struct user_sdma_iovec *iovec = NULL;
 683
 684	if (!req->pq)
 685		return -EINVAL;
 686
 687	pq = req->pq;
 688
 689	/* If tx completion has reported an error, we are done. */
 690	if (READ_ONCE(req->has_error))
 691		return -EFAULT;
 692
 693	/*
 694	 * Check if we might have sent the entire request already
 695	 */
 696	if (unlikely(req->seqnum == req->info.npkts)) {
 697		if (!list_empty(&req->txps))
 698			goto dosend;
 699		return ret;
 700	}
 701
 702	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 703		maxpkts = req->info.npkts - req->seqnum;
 704
 705	while (npkts < maxpkts) {
 706		u32 datalen = 0;
 707
 708		/*
 709		 * Check whether any of the completions have come back
 710		 * with errors. If so, we are not going to process any
 711		 * more packets from this request.
 712		 */
 713		if (READ_ONCE(req->has_error))
 714			return -EFAULT;
 715
 716		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 717		if (!tx)
 718			return -ENOMEM;
 719
 720		tx->flags = 0;
 721		tx->req = req;
 722		INIT_LIST_HEAD(&tx->list);
 723
 724		/*
 725		 * For the last packet set the ACK request
 726		 * and disable header suppression.
 727		 */
 728		if (req->seqnum == req->info.npkts - 1)
 729			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 730				      TXREQ_FLAGS_REQ_DISABLE_SH);
 731
 732		/*
  733		 * Calculate the payload size - this is the minimum of the
  734		 * fragment (MTU) size and the remaining bytes in the request,
  735		 * but only if we have payload data.
 736		 */
 737		if (req->data_len) {
 738			iovec = &req->iovs[req->iov_idx];
 739			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 740				if (++req->iov_idx == req->data_iovs) {
 741					ret = -EFAULT;
 742					goto free_tx;
 743				}
 744				iovec = &req->iovs[req->iov_idx];
 745				WARN_ON(iovec->offset);
 746			}
 747
 748			datalen = compute_data_length(req, tx);
 749
 750			/*
  751			 * Disable header suppression if the payload is <= 8 DWs.
  752			 * If there is an uncorrectable error in the receive
  753			 * data FIFO when the received payload size is less than
  754			 * or equal to 8 DWs, then RxDmaDataFifoRdUncErr is not
  755			 * reported. Instead, RHF.EccErr is set if the header is
  756			 * not suppressed.
 757			 */
 758			if (!datalen) {
 759				SDMA_DBG(req,
 760					 "Request has data but pkt len is 0");
 761				ret = -EFAULT;
 762				goto free_tx;
 763			} else if (datalen <= 32) {
 764				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 765			}
 766		}
 767
 768		if (req->ahg_idx >= 0) {
 769			if (!req->seqnum) {
 770				ret = user_sdma_txadd_ahg(req, tx, datalen);
 771				if (ret)
 772					goto free_tx;
 773			} else {
 774				int changes;
 775
 776				changes = set_txreq_header_ahg(req, tx,
 777							       datalen);
 778				if (changes < 0) {
 779					ret = changes;
 780					goto free_tx;
 781				}
 782			}
 783		} else {
 784			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 785					  datalen, user_sdma_txreq_cb);
 786			if (ret)
 787				goto free_tx;
 788			/*
 789			 * Modify the header for this packet. This only needs
 790			 * to be done if we are not going to use AHG. Otherwise,
 791			 * the HW will do it based on the changes we gave it
 792			 * during sdma_txinit_ahg().
 793			 */
 794			ret = set_txreq_header(req, tx, datalen);
 795			if (ret)
 796				goto free_txreq;
 797		}
 798
 799		req->koffset += datalen;
 800		if (req_opcode(req->info.ctrl) == EXPECTED)
 801			req->tidoffset += datalen;
 802		req->sent += datalen;
 803		while (datalen) {
 804			ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
 805							    &datalen);
 806			if (ret)
 807				goto free_txreq;
 808			iovec = &req->iovs[req->iov_idx];
 809		}
 810		list_add_tail(&tx->txreq.list, &req->txps);
 811		/*
 812		 * It is important to increment this here as it is used to
 813		 * generate the BTH.PSN and, therefore, can't be bulk-updated
 814		 * outside of the loop.
 815		 */
 816		tx->seqnum = req->seqnum++;
 817		npkts++;
 818	}
 819dosend:
 820	ret = sdma_send_txlist(req->sde,
 821			       iowait_get_ib_work(&pq->busy),
 822			       &req->txps, &count);
 823	req->seqsubmitted += count;
 824	if (req->seqsubmitted == req->info.npkts) {
 825		/*
 826		 * The txreq has already been submitted to the HW queue
 827		 * so we can free the AHG entry now. Corruption will not
 828		 * happen due to the sequential manner in which
 829		 * descriptors are processed.
 830		 */
 831		if (req->ahg_idx >= 0)
 832			sdma_ahg_free(req->sde, req->ahg_idx);
 833	}
 834	return ret;
 835
 836free_txreq:
 837	sdma_txclean(pq->dd, &tx->txreq);
 838free_tx:
 839	kmem_cache_free(pq->txreq_cache, tx);
 840	return ret;
 841}
 842
 843static int check_header_template(struct user_sdma_request *req,
 844				 struct hfi1_pkt_header *hdr, u32 lrhlen,
 845				 u32 datalen)
 846{
 847	/*
 848	 * Perform safety checks for any type of packet:
  849	 *    - transfer size is a multiple of 64 bytes
  850	 *    - packet length is a multiple of 4 bytes
 851	 *    - packet length is not larger than MTU size
 852	 *
 853	 * These checks are only done for the first packet of the
 854	 * transfer since the header is "given" to us by user space.
 855	 * For the remainder of the packets we compute the values.
 856	 */
 857	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
 858	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
 859		return -EINVAL;
 860
 861	if (req_opcode(req->info.ctrl) == EXPECTED) {
 862		/*
 863		 * The header is checked only on the first packet. Furthermore,
 864		 * we ensure that at least one TID entry is copied when the
 865		 * request is submitted. Therefore, we don't have to verify that
 866		 * tididx points to something sane.
 867		 */
 868		u32 tidval = req->tids[req->tididx],
 869			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
 870			tididx = EXP_TID_GET(tidval, IDX),
 871			tidctrl = EXP_TID_GET(tidval, CTRL),
 872			tidoff;
 873		__le32 kval = hdr->kdeth.ver_tid_offset;
 874
 875		tidoff = KDETH_GET(kval, OFFSET) *
 876			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 877			   KDETH_OM_LARGE : KDETH_OM_SMALL);
 878		/*
 879		 * Expected receive packets have the following
 880		 * additional checks:
 881		 *     - offset is not larger than the TID size
 882		 *     - TIDCtrl values match between header and TID array
 883		 *     - TID indexes match between header and TID array
 884		 */
 885		if ((tidoff + datalen > tidlen) ||
 886		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
 887		    KDETH_GET(kval, TID) != tididx)
 888			return -EINVAL;
 889	}
 890	return 0;
 891}
 892
 893/*
  894 * Correctly set the BTH.PSN field based on the type of
  895 * transfer: eager packets can just increment the PSN, but
  896 * expected packets encode generation and sequence in the
  897 * BTH.PSN field, so just incrementing will result in errors.
 898 */
 899static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
 900{
 901	u32 val = be32_to_cpu(bthpsn),
 902		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
 903			0xffffffull),
 904		psn = val & mask;
 905	if (expct)
 906		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
 907			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
 908	else
 909		psn = psn + frags;
 910	return psn & mask;
 911}
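/*
 * Editor's illustrative example (not part of the driver), assuming
 * HFI1_KDETH_BTH_SEQ_MASK covers the low 11 bits (0x7ff): for an expected
 * transfer with psn == 0x4807fe and frags == 4, only the sequence bits
 * wrap, (0x7fe + 4) & 0x7ff == 0x002, so the result is 0x480002 with the
 * generation bits above bit 10 preserved; an eager transfer simply returns
 * (psn + frags) & mask.
 */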
 912
 913static int set_txreq_header(struct user_sdma_request *req,
 914			    struct user_sdma_txreq *tx, u32 datalen)
 915{
 916	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 917	struct hfi1_pkt_header *hdr = &tx->hdr;
 918	u8 omfactor; /* KDETH.OM */
 919	u16 pbclen;
 920	int ret;
 921	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
 922
 923	/* Copy the header template to the request before modification */
 924	memcpy(hdr, &req->hdr, sizeof(*hdr));
 925
 926	/*
  927	 * Check if the PBC and LRH lengths are mismatched. If so,
  928	 * adjust both in the header.
 929	 */
 930	pbclen = le16_to_cpu(hdr->pbc[0]);
 931	if (PBC2LRH(pbclen) != lrhlen) {
 932		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 933		hdr->pbc[0] = cpu_to_le16(pbclen);
 934		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
 935		/*
 936		 * Third packet
 937		 * This is the first packet in the sequence that has
 938		 * a "static" size that can be used for the rest of
 939		 * the packets (besides the last one).
 940		 */
 941		if (unlikely(req->seqnum == 2)) {
 942			/*
 943			 * From this point on the lengths in both the
 944			 * PBC and LRH are the same until the last
 945			 * packet.
 946			 * Adjust the template so we don't have to update
 947			 * every packet
 948			 */
 949			req->hdr.pbc[0] = hdr->pbc[0];
 950			req->hdr.lrh[2] = hdr->lrh[2];
 951		}
 952	}
 953	/*
 954	 * We only have to modify the header if this is not the
 955	 * first packet in the request. Otherwise, we use the
 956	 * header given to us.
 957	 */
 958	if (unlikely(!req->seqnum)) {
 959		ret = check_header_template(req, hdr, lrhlen, datalen);
 960		if (ret)
 961			return ret;
 962		goto done;
 963	}
 964
 965	hdr->bth[2] = cpu_to_be32(
 966		set_pkt_bth_psn(hdr->bth[2],
 967				(req_opcode(req->info.ctrl) == EXPECTED),
 968				req->seqnum));
 969
 970	/* Set ACK request on last packet */
 971	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 972		hdr->bth[2] |= cpu_to_be32(1UL << 31);
 973
 974	/* Set the new offset */
 975	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
 976	/* Expected packets have to fill in the new TID information */
 977	if (req_opcode(req->info.ctrl) == EXPECTED) {
 978		tidval = req->tids[req->tididx];
 979		/*
 980		 * If the offset puts us at the end of the current TID,
 981		 * advance everything.
 982		 */
 983		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
 984					 PAGE_SIZE)) {
 985			req->tidoffset = 0;
 986			/*
  987			 * Since we don't copy all the TIDs all at once,
 988			 * we have to check again.
 989			 */
 990			if (++req->tididx > req->n_tids - 1 ||
 991			    !req->tids[req->tididx]) {
 992				return -EINVAL;
 993			}
 994			tidval = req->tids[req->tididx];
 995		}
 996		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
 997			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
 998			KDETH_OM_SMALL_SHIFT;
 999		/* Set KDETH.TIDCtrl based on value for this TID. */
1000		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1001			  EXP_TID_GET(tidval, CTRL));
1002		/* Set KDETH.TID based on value for this TID */
1003		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1004			  EXP_TID_GET(tidval, IDX));
1005		/* Clear KDETH.SH when DISABLE_SH flag is set */
1006		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1007			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1008		/*
 1009		 * Set the KDETH.OFFSET and KDETH.OM based on the size of
 1010		 * the transfer.
1011		 */
1012		trace_hfi1_sdma_user_tid_info(
1013			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1014			req->tidoffset, req->tidoffset >> omfactor,
1015			omfactor != KDETH_OM_SMALL_SHIFT);
1016		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1017			  req->tidoffset >> omfactor);
1018		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1019			  omfactor != KDETH_OM_SMALL_SHIFT);
1020	}
1021done:
1022	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1023				    req->info.comp_idx, hdr, tidval);
1024	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1025}
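/*
 * Editor's illustrative example (not part of the driver), assuming 4KB
 * pages, KDETH_OM_MAX_SIZE == 32KB, and offset shifts of 2 (small, 4-byte
 * units) and 6 (large, 64-byte units): a TID pair covering 8 pages (32KB)
 * selects the large offset mode, so a tidoffset of 8192 bytes is encoded
 * as KDETH.OFFSET == 8192 >> 6 == 128 with KDETH.OM == 1, whereas a 16KB
 * TID pair would use the small mode and encode it as 8192 >> 2 == 2048.
 */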
1026
1027static int set_txreq_header_ahg(struct user_sdma_request *req,
1028				struct user_sdma_txreq *tx, u32 datalen)
1029{
1030	u32 ahg[AHG_KDETH_ARRAY_SIZE];
1031	int idx = 0;
1032	u8 omfactor; /* KDETH.OM */
1033	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1034	struct hfi1_pkt_header *hdr = &req->hdr;
1035	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1036	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1037	size_t array_size = ARRAY_SIZE(ahg);
1038
1039	if (PBC2LRH(pbclen) != lrhlen) {
1040		/* PBC.PbcLengthDWs */
1041		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1042				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1043		if (idx < 0)
1044			return idx;
1045		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1046		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1047				     (__force u16)cpu_to_be16(lrhlen >> 2));
1048		if (idx < 0)
1049			return idx;
1050	}
1051
1052	/*
1053	 * Do the common updates
1054	 */
1055	/* BTH.PSN and BTH.A */
1056	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1057		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1058	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1059		val32 |= 1UL << 31;
1060	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1061			     (__force u16)cpu_to_be16(val32 >> 16));
1062	if (idx < 0)
1063		return idx;
1064	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1065			     (__force u16)cpu_to_be16(val32 & 0xffff));
1066	if (idx < 0)
1067		return idx;
1068	/* KDETH.Offset */
1069	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1070			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
1071	if (idx < 0)
1072		return idx;
1073	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1074			     (__force u16)cpu_to_le16(req->koffset >> 16));
1075	if (idx < 0)
1076		return idx;
1077	if (req_opcode(req->info.ctrl) == EXPECTED) {
1078		__le16 val;
1079
1080		tidval = req->tids[req->tididx];
1081
1082		/*
1083		 * If the offset puts us at the end of the current TID,
1084		 * advance everything.
1085		 */
1086		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1087					 PAGE_SIZE)) {
1088			req->tidoffset = 0;
1089			/*
 1090			 * Since we don't copy all the TIDs all at once,
1091			 * we have to check again.
1092			 */
1093			if (++req->tididx > req->n_tids - 1 ||
1094			    !req->tids[req->tididx])
1095				return -EINVAL;
1096			tidval = req->tids[req->tididx];
1097		}
1098		omfactor = ((EXP_TID_GET(tidval, LEN) *
1099				  PAGE_SIZE) >=
1100				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1101				 KDETH_OM_SMALL_SHIFT;
1102		/* KDETH.OM and KDETH.OFFSET (TID) */
1103		idx = ahg_header_set(
1104				ahg, idx, array_size, 7, 0, 16,
1105				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1106				((req->tidoffset >> omfactor)
1107				& 0x7fff)));
1108		if (idx < 0)
1109			return idx;
1110		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1111		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1112				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1113
1114		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1115			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1116						      INTR) <<
1117					    AHG_KDETH_INTR_SHIFT));
1118		} else {
1119			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1120			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1121			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1122						      INTR) <<
1123					     AHG_KDETH_INTR_SHIFT));
1124		}
1125
1126		idx = ahg_header_set(ahg, idx, array_size,
1127				     7, 16, 14, (__force u16)val);
1128		if (idx < 0)
1129			return idx;
1130	}
1131
1132	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1133					req->info.comp_idx, req->sde->this_idx,
1134					req->ahg_idx, ahg, idx, tidval);
1135	sdma_txinit_ahg(&tx->txreq,
1136			SDMA_TXREQ_F_USE_AHG,
1137			datalen, req->ahg_idx, idx,
1138			ahg, sizeof(req->hdr),
1139			user_sdma_txreq_cb);
1140
1141	return idx;
1142}
1143
1144/**
1145 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1146 * @txreq: valid sdma tx request
1147 * @status: success/failure of request
1148 *
1149 * Called when the SDMA progress state machine gets notification that
1150 * the SDMA descriptors for this tx request have been processed by the
1151 * DMA engine. Called in interrupt context.
1152 * Only do work on completed sequences.
1153 */
1154static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1155{
1156	struct user_sdma_txreq *tx =
1157		container_of(txreq, struct user_sdma_txreq, txreq);
1158	struct user_sdma_request *req;
1159	struct hfi1_user_sdma_pkt_q *pq;
1160	struct hfi1_user_sdma_comp_q *cq;
1161	enum hfi1_sdma_comp_state state = COMPLETE;
1162
1163	if (!tx->req)
1164		return;
1165
1166	req = tx->req;
1167	pq = req->pq;
1168	cq = req->cq;
1169
1170	if (status != SDMA_TXREQ_S_OK) {
1171		SDMA_DBG(req, "SDMA completion with error %d",
1172			 status);
1173		WRITE_ONCE(req->has_error, 1);
1174		state = ERROR;
1175	}
1176
1177	req->seqcomp = tx->seqnum;
1178	kmem_cache_free(pq->txreq_cache, tx);
1179
 1180	/* If the sequence isn't complete yet, we are done */
1181	if (req->seqcomp != req->info.npkts - 1)
1182		return;
1183
1184	user_sdma_free_request(req);
1185	set_comp_state(pq, cq, req->info.comp_idx, state, status);
1186	pq_update(pq);
1187}
1188
1189static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1190{
1191	if (atomic_dec_and_test(&pq->n_reqs))
1192		wake_up(&pq->wait);
1193}
1194
1195static void user_sdma_free_request(struct user_sdma_request *req)
1196{
1197	if (!list_empty(&req->txps)) {
1198		struct sdma_txreq *t, *p;
1199
1200		list_for_each_entry_safe(t, p, &req->txps, list) {
1201			struct user_sdma_txreq *tx =
1202				container_of(t, struct user_sdma_txreq, txreq);
1203			list_del_init(&t->list);
1204			sdma_txclean(req->pq->dd, t);
1205			kmem_cache_free(req->pq->txreq_cache, tx);
1206		}
1207	}
1208
1209	kfree(req->tids);
1210	clear_bit(req->info.comp_idx, req->pq->req_in_use);
1211}
1212
1213static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1214				  struct hfi1_user_sdma_comp_q *cq,
1215				  u16 idx, enum hfi1_sdma_comp_state state,
1216				  int ret)
1217{
1218	if (state == ERROR)
1219		cq->comps[idx].errcode = -ret;
1220	smp_wmb(); /* make sure errcode is visible first */
1221	cq->comps[idx].status = state;
1222	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1223					idx, state, ret);
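	/*
	 * Editor's note (illustrative, not part of the driver): the smp_wmb()
	 * above orders the errcode store before the status store, so a
	 * consumer of the completion ring that sees status == ERROR and then
	 * reads errcode needs a matching read barrier between the two loads,
	 * e.g.:
	 *
	 *	state = READ_ONCE(comps[idx].status);
	 *	smp_rmb();	// pairs with smp_wmb() in set_comp_state()
	 *	err = READ_ONCE(comps[idx].errcode);
	 */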
1224}