v6.8
   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2/*
   3 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
   4 * Copyright(c) 2015 - 2018 Intel Corporation.
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/types.h>
   9#include <linux/device.h>
  10#include <linux/dmapool.h>
  11#include <linux/slab.h>
  12#include <linux/list.h>
  13#include <linux/highmem.h>
  14#include <linux/io.h>
  15#include <linux/uio.h>
  16#include <linux/rbtree.h>
  17#include <linux/spinlock.h>
  18#include <linux/delay.h>
  19#include <linux/kthread.h>
  20#include <linux/mmu_context.h>
  21#include <linux/module.h>
  22#include <linux/vmalloc.h>
  23#include <linux/string.h>
  24
  25#include "hfi.h"
   26#include "sdma.h"
  27#include "user_sdma.h"
  28#include "verbs.h"  /* for the headers */
  29#include "common.h" /* for struct hfi1_tid_info */
  30#include "trace.h"
  31
  32static uint hfi1_sdma_comp_ring_size = 128;
  33module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  34MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  35
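/*
 * Number of packets handed to user_sdma_send_pkts() per call; capped at the
 * request's total packet count in hfi1_user_sdma_process_request().
 */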
  36static unsigned initial_pkt_count = 8;
  37
  38static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  39static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  40static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
   41static void user_sdma_free_request(struct user_sdma_request *req);
  42static int check_header_template(struct user_sdma_request *req,
  43				 struct hfi1_pkt_header *hdr, u32 lrhlen,
  44				 u32 datalen);
  45static int set_txreq_header(struct user_sdma_request *req,
  46			    struct user_sdma_txreq *tx, u32 datalen);
  47static int set_txreq_header_ahg(struct user_sdma_request *req,
  48				struct user_sdma_txreq *tx, u32 len);
  49static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  50				  struct hfi1_user_sdma_comp_q *cq,
  51				  u16 idx, enum hfi1_sdma_comp_state state,
  52				  int ret);
  53static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  54static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  55
  56static int defer_packet_queue(
  57	struct sdma_engine *sde,
  58	struct iowait_work *wait,
  59	struct sdma_txreq *txreq,
  60	uint seq,
  61	bool pkts_sent);
   62static void activate_packet_queue(struct iowait *wait, int reason);
  63
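/*
 * iowait "sleep" callback, registered via iowait_init(): invoked when an
 * SDMA engine cannot accept the txreq. Unless the engine has since made
 * progress (-EAGAIN), mark the packet queue deferred and park it on the
 * engine's dmawait list, returning -EBUSY so the caller backs off.
 */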
  64static int defer_packet_queue(
  65	struct sdma_engine *sde,
  66	struct iowait_work *wait,
  67	struct sdma_txreq *txreq,
  68	uint seq,
  69	bool pkts_sent)
  70{
  71	struct hfi1_user_sdma_pkt_q *pq =
  72		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  73
  74	write_seqlock(&sde->waitlock);
  75	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
  76	if (sdma_progress(sde, seq, txreq))
  77		goto eagain;
  78	/*
  79	 * We are assuming that if the list is enqueued somewhere, it
  80	 * is to the dmawait list since that is the only place where
  81	 * it is supposed to be enqueued.
  82	 */
  83	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
  84	if (list_empty(&pq->busy.list)) {
  85		pq->busy.lock = &sde->waitlock;
  86		iowait_get_priority(&pq->busy);
  87		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
  88	}
  89	write_sequnlock(&sde->waitlock);
  90	return -EBUSY;
  91eagain:
  92	write_sequnlock(&sde->waitlock);
  93	return -EAGAIN;
  94}
  95
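/*
 * iowait "wakeup" callback: descriptors are available again, so mark the
 * packet queue active and wake any sender blocked on busy.wait_dma.
 */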
  96static void activate_packet_queue(struct iowait *wait, int reason)
  97{
  98	struct hfi1_user_sdma_pkt_q *pq =
  99		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 100
 101	trace_hfi1_usdma_activate(pq, wait, reason);
 102	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 103	wake_up(&wait->wait_dma);
 104};
 105
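/*
 * Allocate per-context user SDMA state: the packet queue (request array,
 * in-use bitmap, txreq slab cache) and the completion queue, whose entries
 * are allocated with vmalloc_user() so they can be mapped to user space.
 */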
 106int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 107				struct hfi1_filedata *fd)
 108{
 109	int ret = -ENOMEM;
 110	char buf[64];
 111	struct hfi1_devdata *dd;
 112	struct hfi1_user_sdma_comp_q *cq;
 113	struct hfi1_user_sdma_pkt_q *pq;
 114
 115	if (!uctxt || !fd)
 116		return -EBADF;
 117
 118	if (!hfi1_sdma_comp_ring_size)
 119		return -EINVAL;
 120
 121	dd = uctxt->dd;
 122
 123	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 124	if (!pq)
 125		return -ENOMEM;
 126	pq->dd = dd;
 127	pq->ctxt = uctxt->ctxt;
 128	pq->subctxt = fd->subctxt;
 129	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 130	atomic_set(&pq->n_reqs, 0);
 131	init_waitqueue_head(&pq->wait);
 132	atomic_set(&pq->n_locked, 0);
 133
 134	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 135		    activate_packet_queue, NULL, NULL);
 136	pq->reqidx = 0;
 137
 138	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 139			   sizeof(*pq->reqs),
 140			   GFP_KERNEL);
 141	if (!pq->reqs)
 142		goto pq_reqs_nomem;
 143
 144	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
 145	if (!pq->req_in_use)
 146		goto pq_reqs_no_in_use;
 147
 148	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 149		 fd->subctxt);
 150	pq->txreq_cache = kmem_cache_create(buf,
 151					    sizeof(struct user_sdma_txreq),
 152					    L1_CACHE_BYTES,
 153					    SLAB_HWCACHE_ALIGN,
 154					    NULL);
 155	if (!pq->txreq_cache) {
 156		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 157			   uctxt->ctxt);
 158		goto pq_txreq_nomem;
 159	}
 160
 161	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 162	if (!cq)
 163		goto cq_nomem;
 164
 165	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 166				 * hfi1_sdma_comp_ring_size));
 167	if (!cq->comps)
 168		goto cq_comps_nomem;
 169
 170	cq->nentries = hfi1_sdma_comp_ring_size;
 171
 172	ret = hfi1_init_system_pinning(pq);
  173	if (ret)
  174		goto pq_mmu_fail;
  175
 176	rcu_assign_pointer(fd->pq, pq);
 177	fd->cq = cq;
 178
 179	return 0;
 180
 181pq_mmu_fail:
 182	vfree(cq->comps);
 183cq_comps_nomem:
 184	kfree(cq);
 185cq_nomem:
 186	kmem_cache_destroy(pq->txreq_cache);
 187pq_txreq_nomem:
 188	bitmap_free(pq->req_in_use);
 189pq_reqs_no_in_use:
 190	kfree(pq->reqs);
 191pq_reqs_nomem:
 192	kfree(pq);
 193
 194	return ret;
 195}
 196
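/* Remove the packet queue from an SDMA engine's dmawait list, if queued. */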
 197static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
 198{
 199	unsigned long flags;
 200	seqlock_t *lock = pq->busy.lock;
 201
 202	if (!lock)
 203		return;
 204	write_seqlock_irqsave(lock, flags);
 205	if (!list_empty(&pq->busy.list)) {
 206		list_del_init(&pq->busy.list);
 207		pq->busy.lock = NULL;
 208	}
 209	write_sequnlock_irqrestore(lock, flags);
 210}
 211
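/*
 * Per-context teardown: detach the packet queue under RCU, drain in-flight
 * SDMA work, wait for all outstanding requests to complete, then free the
 * packet queue and the completion ring.
 */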
 212int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 213			       struct hfi1_ctxtdata *uctxt)
 214{
 215	struct hfi1_user_sdma_pkt_q *pq;
 216
 217	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 218
 219	spin_lock(&fd->pq_rcu_lock);
 220	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
 221				    lockdep_is_held(&fd->pq_rcu_lock));
 222	if (pq) {
 223		rcu_assign_pointer(fd->pq, NULL);
 224		spin_unlock(&fd->pq_rcu_lock);
 225		synchronize_srcu(&fd->pq_srcu);
  226		/* at this point there can be no more new requests */
 227		iowait_sdma_drain(&pq->busy);
 228		/* Wait until all requests have been freed. */
 229		wait_event_interruptible(
 230			pq->wait,
 231			!atomic_read(&pq->n_reqs));
 232		kfree(pq->reqs);
 233		hfi1_free_system_pinning(pq);
 234		bitmap_free(pq->req_in_use);
 235		kmem_cache_destroy(pq->txreq_cache);
 236		flush_pq_iowait(pq);
 237		kfree(pq);
 238	} else {
 239		spin_unlock(&fd->pq_rcu_lock);
 240	}
 241	if (fd->cq) {
 242		vfree(fd->cq->comps);
 243		kfree(fd->cq);
 244		fd->cq = NULL;
 245	}
 246	return 0;
 247}
 248
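/*
 * Map a destination LID to a small selector value, assigned lazily on first
 * use; the caller combines it with the context numbers to pick an SDMA engine.
 */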
 249static u8 dlid_to_selector(u16 dlid)
 250{
 251	static u8 mapping[256];
 252	static int initialized;
 253	static u8 next;
 254	int hash;
 255
 256	if (!initialized) {
 257		memset(mapping, 0xFF, 256);
 258		initialized = 1;
 259	}
 260
 261	hash = ((dlid >> 8) ^ dlid) & 0xFF;
 262	if (mapping[hash] == 0xFF) {
 263		mapping[hash] = next;
 264		next = (next + 1) & 0x7F;
 265	}
 266
 267	return mapping[hash];
 268}
 269
 270/**
 271 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 272 * @fd: valid file descriptor
 273 * @iovec: array of io vectors to process
 274 * @dim: overall iovec array size
 275 * @count: number of io vector array entries processed
 276 */
 277int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 278				   struct iovec *iovec, unsigned long dim,
 279				   unsigned long *count)
 280{
 281	int ret = 0, i;
 282	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 283	struct hfi1_user_sdma_pkt_q *pq =
 284		srcu_dereference(fd->pq, &fd->pq_srcu);
 285	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 286	struct hfi1_devdata *dd = pq->dd;
 287	unsigned long idx = 0;
 288	u8 pcount = initial_pkt_count;
 289	struct sdma_req_info info;
 290	struct user_sdma_request *req;
 291	u8 opcode, sc, vl;
 292	u16 pkey;
 293	u32 slid;
 294	u16 dlid;
 295	u32 selector;
 296
 297	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 298		hfi1_cdbg(
 299		   SDMA,
 300		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 301		   dd->unit, uctxt->ctxt, fd->subctxt,
 302		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 303		return -EINVAL;
 304	}
 305	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 306	if (ret) {
 307		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 308			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 309		return -EFAULT;
 310	}
 311
 312	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 313				     (u16 *)&info);
 314	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
 315		hfi1_cdbg(SDMA,
 316			  "[%u:%u:%u:%u] Invalid comp index",
 317			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 318		return -EINVAL;
 319	}
 320
 321	/*
 322	 * Sanity check the header io vector count.  Need at least 1 vector
 323	 * (header) and cannot be larger than the actual io vector count.
 324	 */
 325	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
 326		hfi1_cdbg(SDMA,
 327			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
 328			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
 329			  req_iovcnt(info.ctrl), dim);
 330		return -EINVAL;
 331	}
 332
 333	if (!info.fragsize) {
 334		hfi1_cdbg(SDMA,
 335			  "[%u:%u:%u:%u] Request does not specify fragsize",
 336			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 337		return -EINVAL;
 338	}
 339
 340	/* Try to claim the request. */
 341	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
 342		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
 343			  dd->unit, uctxt->ctxt, fd->subctxt,
 344			  info.comp_idx);
 345		return -EBADSLT;
 346	}
 347	/*
 348	 * All safety checks have been done and this request has been claimed.
 349	 */
 350	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
 351					     info.comp_idx);
 352	req = pq->reqs + info.comp_idx;
 353	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 354	req->data_len  = 0;
 355	req->pq = pq;
 356	req->cq = cq;
 357	req->ahg_idx = -1;
 358	req->iov_idx = 0;
 359	req->sent = 0;
 360	req->seqnum = 0;
 361	req->seqcomp = 0;
 362	req->seqsubmitted = 0;
 363	req->tids = NULL;
 364	req->has_error = 0;
 365	INIT_LIST_HEAD(&req->txps);
 366
 367	memcpy(&req->info, &info, sizeof(info));
 368
 369	/* The request is initialized, count it */
 370	atomic_inc(&pq->n_reqs);
 371
 372	if (req_opcode(info.ctrl) == EXPECTED) {
 373		/* expected must have a TID info and at least one data vector */
 374		if (req->data_iovs < 2) {
 375			SDMA_DBG(req,
 376				 "Not enough vectors for expected request");
 377			ret = -EINVAL;
 378			goto free_req;
 379		}
 380		req->data_iovs--;
 381	}
 382
 383	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 384		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 385			 MAX_VECTORS_PER_REQ);
 386		ret = -EINVAL;
 387		goto free_req;
 388	}
 389
 390	/* Copy the header from the user buffer */
 391	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 392			     sizeof(req->hdr));
 393	if (ret) {
 394		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 395		ret = -EFAULT;
 396		goto free_req;
 397	}
 398
 399	/* If Static rate control is not enabled, sanitize the header. */
 400	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 401		req->hdr.pbc[2] = 0;
 402
 403	/* Validate the opcode. Do not trust packets from user space blindly. */
 404	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 405	if ((opcode & USER_OPCODE_CHECK_MASK) !=
 406	     USER_OPCODE_CHECK_VAL) {
 407		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 408		ret = -EINVAL;
 409		goto free_req;
 410	}
 411	/*
 412	 * Validate the vl. Do not trust packets from user space blindly.
 413	 * VL comes from PBC, SC comes from LRH, and the VL needs to
 414	 * match the SC look up.
 415	 */
 416	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 417	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 418	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 419	if (vl >= dd->pport->vls_operational ||
 420	    vl != sc_to_vlt(dd, sc)) {
 421		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 422		ret = -EINVAL;
 423		goto free_req;
 424	}
 425
 426	/* Checking P_KEY for requests from user-space */
 427	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
 428	slid = be16_to_cpu(req->hdr.lrh[3]);
 429	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
 430		ret = -EINVAL;
 431		goto free_req;
 432	}
 433
 434	/*
  435	 * We should also check the BTH.lnh. If it says the next header is GRH then
 436	 * the RXE parsing will be off and will land in the middle of the KDETH
 437	 * or miss it entirely.
 438	 */
 439	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 440		SDMA_DBG(req, "User tried to pass in a GRH");
 441		ret = -EINVAL;
 442		goto free_req;
 443	}
 444
 445	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 446	/*
 447	 * Calculate the initial TID offset based on the values of
 448	 * KDETH.OFFSET and KDETH.OM that are passed in.
 449	 */
 450	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 451		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 452		 KDETH_OM_LARGE : KDETH_OM_SMALL);
 453	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
 454					       info.comp_idx, req->tidoffset);
 455	idx++;
 456
 457	/* Save all the IO vector structures */
 458	for (i = 0; i < req->data_iovs; i++) {
 459		req->iovs[i].offset = 0;
 460		INIT_LIST_HEAD(&req->iovs[i].list);
 461		memcpy(&req->iovs[i].iov,
 462		       iovec + idx++,
 463		       sizeof(req->iovs[i].iov));
 464		if (req->iovs[i].iov.iov_len == 0) {
  465			ret = -EINVAL;
 466			goto free_req;
 467		}
 468		req->data_len += req->iovs[i].iov.iov_len;
 469	}
 470	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
 471					 info.comp_idx, req->data_len);
 472	if (pcount > req->info.npkts)
 473		pcount = req->info.npkts;
 474	/*
 475	 * Copy any TID info
 476	 * User space will provide the TID info only when the
 477	 * request type is EXPECTED. This is true even if there is
 478	 * only one packet in the request and the header is already
  479	 * set up. The reason for the singular TID case is that the
 480	 * driver needs to perform safety checks.
 481	 */
 482	if (req_opcode(req->info.ctrl) == EXPECTED) {
 483		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 484		u32 *tmp;
 485
 486		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 487			ret = -EINVAL;
 488			goto free_req;
 489		}
 490
 491		/*
 492		 * We have to copy all of the tids because they may vary
 493		 * in size and, therefore, the TID count might not be
 494		 * equal to the pkt count. However, there is no way to
 495		 * tell at this point.
 496		 */
 497		tmp = memdup_array_user(iovec[idx].iov_base,
 498					ntids, sizeof(*req->tids));
 499		if (IS_ERR(tmp)) {
 500			ret = PTR_ERR(tmp);
 501			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 502				 ntids, ret);
 503			goto free_req;
 504		}
 505		req->tids = tmp;
 506		req->n_tids = ntids;
 507		req->tididx = 0;
 508		idx++;
 509	}
 510
 511	dlid = be16_to_cpu(req->hdr.lrh[1]);
 512	selector = dlid_to_selector(dlid);
 513	selector += uctxt->ctxt + fd->subctxt;
 514	req->sde = sdma_select_user_engine(dd, selector, vl);
 515
 516	if (!req->sde || !sdma_running(req->sde)) {
 517		ret = -ECOMM;
 518		goto free_req;
 519	}
 520
 521	/* We don't need an AHG entry if the request contains only one packet */
 522	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
 523		req->ahg_idx = sdma_ahg_alloc(req->sde);
 524
 525	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 526	pq->state = SDMA_PKT_Q_ACTIVE;
 527
 528	/*
 529	 * This is a somewhat blocking send implementation.
 530	 * The driver will block the caller until all packets of the
 531	 * request have been submitted to the SDMA engine. However, it
 532	 * will not wait for send completions.
 533	 */
 534	while (req->seqsubmitted != req->info.npkts) {
 535		ret = user_sdma_send_pkts(req, pcount);
 536		if (ret < 0) {
 537			int we_ret;
 538
 539			if (ret != -EBUSY)
 540				goto free_req;
 541			we_ret = wait_event_interruptible_timeout(
 542				pq->busy.wait_dma,
 543				pq->state == SDMA_PKT_Q_ACTIVE,
 544				msecs_to_jiffies(
 545					SDMA_IOWAIT_TIMEOUT));
 546			trace_hfi1_usdma_we(pq, we_ret);
 547			if (we_ret <= 0)
 548				flush_pq_iowait(pq);
 549		}
 550	}
 551	*count += idx;
 552	return 0;
 553free_req:
 554	/*
  555	 * If seqsubmitted == npkts, the completion routine
  556	 * controls the final state.  If seqsubmitted < npkts, wait for any
 557	 * outstanding packets to finish before cleaning up.
 558	 */
 559	if (req->seqsubmitted < req->info.npkts) {
 560		if (req->seqsubmitted)
 561			wait_event(pq->busy.wait_dma,
 562				   (req->seqcomp == req->seqsubmitted - 1));
 563		user_sdma_free_request(req);
 564		pq_update(pq);
 565		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
 566	}
 567	return ret;
 568}
 569
 570static inline u32 compute_data_length(struct user_sdma_request *req,
 571				      struct user_sdma_txreq *tx)
 572{
 573	/*
 574	 * Determine the proper size of the packet data.
 575	 * The size of the data of the first packet is in the header
 576	 * template. However, it includes the header and ICRC, which need
 577	 * to be subtracted.
 578	 * The minimum representable packet data length in a header is 4 bytes,
  579	 * therefore, when the requested data length is less than 4 bytes, there's
  580	 * only one packet, and the packet data length is equal to the
  581	 * request data length.
 582	 * The size of the remaining packets is the minimum of the frag
 583	 * size (MTU) or remaining data in the request.
 584	 */
 585	u32 len;
 586
 587	if (!req->seqnum) {
 588		if (req->data_len < sizeof(u32))
 589			len = req->data_len;
 590		else
 591			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 592			       (sizeof(tx->hdr) - 4));
 593	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
 594		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 595			PAGE_SIZE;
 596		/*
 597		 * Get the data length based on the remaining space in the
 598		 * TID pair.
 599		 */
 600		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 601		/* If we've filled up the TID pair, move to the next one. */
 602		if (unlikely(!len) && ++req->tididx < req->n_tids &&
 603		    req->tids[req->tididx]) {
 604			tidlen = EXP_TID_GET(req->tids[req->tididx],
 605					     LEN) * PAGE_SIZE;
 606			req->tidoffset = 0;
 607			len = min_t(u32, tidlen, req->info.fragsize);
 608		}
 609		/*
 610		 * Since the TID pairs map entire pages, make sure that we
  611		 * are not going to try to send more data than we have
 612		 * remaining.
 613		 */
 614		len = min(len, req->data_len - req->sent);
 615	} else {
 616		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 617	}
 618	trace_hfi1_sdma_user_compute_length(req->pq->dd,
 619					    req->pq->ctxt,
 620					    req->pq->subctxt,
 621					    req->info.comp_idx,
 622					    len);
 623	return len;
 624}
 625
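/* Round a payload length up to the next 4-byte (dword) boundary. */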
 626static inline u32 pad_len(u32 len)
 627{
 628	if (len & (sizeof(u32) - 1))
 629		len += sizeof(u32) - (len & (sizeof(u32) - 1));
 630	return len;
 631}
 632
 633static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 634{
 635	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
 636	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 637}
 638
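/*
 * First-packet setup when AHG is in use: copy and fix up the header
 * template, validate it, and attach it to the txreq (SDMA_TXREQ_F_AHG_COPY)
 * so later packets only need the deltas built by set_txreq_header_ahg().
 */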
 639static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 640			       struct user_sdma_txreq *tx,
 641			       u32 datalen)
 642{
 643	int ret;
 644	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 645	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 646	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 647
 648	/*
 649	 * Copy the request header into the tx header
 650	 * because the HW needs a cacheline-aligned
 651	 * address.
 652	 * This copy can be optimized out if the hdr
 653	 * member of user_sdma_request were also
 654	 * cacheline aligned.
 655	 */
 656	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 657	if (PBC2LRH(pbclen) != lrhlen) {
 658		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 659		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 660	}
 661	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 662	if (ret)
 663		return ret;
 664	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 665			      sizeof(tx->hdr) + datalen, req->ahg_idx,
 666			      0, NULL, 0, user_sdma_txreq_cb);
 667	if (ret)
 668		return ret;
 669	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 670	if (ret)
 671		sdma_txclean(pq->dd, &tx->txreq);
 672	return ret;
 673}
 674
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
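/*
 * Build and queue up to @maxpkts packets for @req: allocate a txreq per
 * packet, size its payload, construct the header (directly or via AHG),
 * attach the user pages, then hand the whole list to the SDMA engine with
 * sdma_send_txlist().
 */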
 675static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 676{
 677	int ret = 0;
 678	u16 count;
 679	unsigned npkts = 0;
 680	struct user_sdma_txreq *tx = NULL;
 681	struct hfi1_user_sdma_pkt_q *pq = NULL;
 682	struct user_sdma_iovec *iovec = NULL;
 683
 684	if (!req->pq)
 685		return -EINVAL;
 686
 687	pq = req->pq;
 688
 689	/* If tx completion has reported an error, we are done. */
 690	if (READ_ONCE(req->has_error))
 691		return -EFAULT;
 692
 693	/*
 694	 * Check if we might have sent the entire request already
 695	 */
 696	if (unlikely(req->seqnum == req->info.npkts)) {
 697		if (!list_empty(&req->txps))
 698			goto dosend;
 699		return ret;
 700	}
 701
 702	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 703		maxpkts = req->info.npkts - req->seqnum;
 704
 705	while (npkts < maxpkts) {
  706		u32 datalen = 0;
 707
 708		/*
 709		 * Check whether any of the completions have come back
 710		 * with errors. If so, we are not going to process any
 711		 * more packets from this request.
 712		 */
 713		if (READ_ONCE(req->has_error))
 714			return -EFAULT;
 715
 716		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 717		if (!tx)
 718			return -ENOMEM;
 719
 720		tx->flags = 0;
 721		tx->req = req;
 722		INIT_LIST_HEAD(&tx->list);
 723
 724		/*
 725		 * For the last packet set the ACK request
 726		 * and disable header suppression.
 727		 */
 728		if (req->seqnum == req->info.npkts - 1)
 729			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 730				      TXREQ_FLAGS_REQ_DISABLE_SH);
 731
 732		/*
 733		 * Calculate the payload size - this is min of the fragment
 734		 * (MTU) size or the remaining bytes in the request but only
 735		 * if we have payload data.
 736		 */
 737		if (req->data_len) {
 738			iovec = &req->iovs[req->iov_idx];
 739			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 740				if (++req->iov_idx == req->data_iovs) {
 741					ret = -EFAULT;
 742					goto free_tx;
 743				}
 744				iovec = &req->iovs[req->iov_idx];
 745				WARN_ON(iovec->offset);
 746			}
 747
 748			datalen = compute_data_length(req, tx);
 749
 750			/*
  751			 * Disable header suppression when the payload is <= 8 DWs.
  752			 * If there is an uncorrectable error in the receive
  753			 * data FIFO when the received payload size is less than
  754			 * or equal to 8 DWs, then RxDmaDataFifoRdUncErr is not
  755			 * reported. RHF.EccErr is set if the header is not
  756			 * suppressed.
 757			 */
 758			if (!datalen) {
 759				SDMA_DBG(req,
 760					 "Request has data but pkt len is 0");
 761				ret = -EFAULT;
 762				goto free_tx;
 763			} else if (datalen <= 32) {
 764				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 765			}
 766		}
 767
 768		if (req->ahg_idx >= 0) {
 769			if (!req->seqnum) {
 770				ret = user_sdma_txadd_ahg(req, tx, datalen);
 771				if (ret)
 772					goto free_tx;
 773			} else {
 774				int changes;
 775
 776				changes = set_txreq_header_ahg(req, tx,
 777							       datalen);
 778				if (changes < 0) {
 779					ret = changes;
 780					goto free_tx;
 781				}
 782			}
 783		} else {
 784			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 785					  datalen, user_sdma_txreq_cb);
 786			if (ret)
 787				goto free_tx;
 788			/*
 789			 * Modify the header for this packet. This only needs
 790			 * to be done if we are not going to use AHG. Otherwise,
 791			 * the HW will do it based on the changes we gave it
 792			 * during sdma_txinit_ahg().
 793			 */
 794			ret = set_txreq_header(req, tx, datalen);
 795			if (ret)
 796				goto free_txreq;
 797		}
 798
 799		req->koffset += datalen;
 800		if (req_opcode(req->info.ctrl) == EXPECTED)
 801			req->tidoffset += datalen;
 802		req->sent += datalen;
 803		while (datalen) {
 804			ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
  805							    &datalen);
 806			if (ret)
 807				goto free_txreq;
 808			iovec = &req->iovs[req->iov_idx];
  809		}
 810		list_add_tail(&tx->txreq.list, &req->txps);
 811		/*
 812		 * It is important to increment this here as it is used to
 813		 * generate the BTH.PSN and, therefore, can't be bulk-updated
 814		 * outside of the loop.
 815		 */
 816		tx->seqnum = req->seqnum++;
 817		npkts++;
 818	}
 819dosend:
 820	ret = sdma_send_txlist(req->sde,
 821			       iowait_get_ib_work(&pq->busy),
 822			       &req->txps, &count);
 823	req->seqsubmitted += count;
 824	if (req->seqsubmitted == req->info.npkts) {
 825		/*
 826		 * The txreq has already been submitted to the HW queue
 827		 * so we can free the AHG entry now. Corruption will not
 828		 * happen due to the sequential manner in which
 829		 * descriptors are processed.
 830		 */
 831		if (req->ahg_idx >= 0)
 832			sdma_ahg_free(req->sde, req->ahg_idx);
 833	}
 834	return ret;
 835
 836free_txreq:
 837	sdma_txclean(pq->dd, &tx->txreq);
 838free_tx:
 839	kmem_cache_free(pq->txreq_cache, tx);
 840	return ret;
 841}
  842
 843static int check_header_template(struct user_sdma_request *req,
 844				 struct hfi1_pkt_header *hdr, u32 lrhlen,
 845				 u32 datalen)
 846{
 847	/*
 848	 * Perform safety checks for any type of packet:
  849	 *    - transfer size is multiple of 64 bytes
 850	 *    - packet length is multiple of 4 bytes
 851	 *    - packet length is not larger than MTU size
 852	 *
 853	 * These checks are only done for the first packet of the
 854	 * transfer since the header is "given" to us by user space.
 855	 * For the remainder of the packets we compute the values.
 856	 */
 857	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
 858	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
 859		return -EINVAL;
 860
 861	if (req_opcode(req->info.ctrl) == EXPECTED) {
 862		/*
 863		 * The header is checked only on the first packet. Furthermore,
 864		 * we ensure that at least one TID entry is copied when the
 865		 * request is submitted. Therefore, we don't have to verify that
 866		 * tididx points to something sane.
 867		 */
 868		u32 tidval = req->tids[req->tididx],
 869			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
 870			tididx = EXP_TID_GET(tidval, IDX),
 871			tidctrl = EXP_TID_GET(tidval, CTRL),
 872			tidoff;
 873		__le32 kval = hdr->kdeth.ver_tid_offset;
 874
 875		tidoff = KDETH_GET(kval, OFFSET) *
 876			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 877			   KDETH_OM_LARGE : KDETH_OM_SMALL);
 878		/*
 879		 * Expected receive packets have the following
 880		 * additional checks:
 881		 *     - offset is not larger than the TID size
 882		 *     - TIDCtrl values match between header and TID array
 883		 *     - TID indexes match between header and TID array
 884		 */
 885		if ((tidoff + datalen > tidlen) ||
 886		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
 887		    KDETH_GET(kval, TID) != tididx)
 888			return -EINVAL;
 889	}
 890	return 0;
 891}
 892
 893/*
 894 * Correctly set the BTH.PSN field based on type of
 895 * transfer - eager packets can just increment the PSN but
 896 * expected packets encode generation and sequence in the
 897 * BTH.PSN field so just incrementing will result in errors.
 898 */
 899static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
 900{
 901	u32 val = be32_to_cpu(bthpsn),
 902		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
 903			0xffffffull),
 904		psn = val & mask;
 905	if (expct)
 906		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
 907			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
 908	else
 909		psn = psn + frags;
 910	return psn & mask;
 911}
 912
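/*
 * Build the full header for a non-AHG txreq: start from the request's
 * template, then patch the PBC/LRH lengths, BTH.PSN and ACK bit, the KDETH
 * offset and, for expected receives, the TID fields.
 */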
 913static int set_txreq_header(struct user_sdma_request *req,
 914			    struct user_sdma_txreq *tx, u32 datalen)
 915{
 916	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 917	struct hfi1_pkt_header *hdr = &tx->hdr;
 918	u8 omfactor; /* KDETH.OM */
 919	u16 pbclen;
 920	int ret;
 921	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
 922
 923	/* Copy the header template to the request before modification */
 924	memcpy(hdr, &req->hdr, sizeof(*hdr));
 925
 926	/*
 927	 * Check if the PBC and LRH length are mismatched. If so
 928	 * adjust both in the header.
 929	 */
 930	pbclen = le16_to_cpu(hdr->pbc[0]);
 931	if (PBC2LRH(pbclen) != lrhlen) {
 932		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 933		hdr->pbc[0] = cpu_to_le16(pbclen);
 934		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
 935		/*
 936		 * Third packet
 937		 * This is the first packet in the sequence that has
 938		 * a "static" size that can be used for the rest of
 939		 * the packets (besides the last one).
 940		 */
 941		if (unlikely(req->seqnum == 2)) {
 942			/*
 943			 * From this point on the lengths in both the
 944			 * PBC and LRH are the same until the last
 945			 * packet.
 946			 * Adjust the template so we don't have to update
 947			 * every packet
 948			 */
 949			req->hdr.pbc[0] = hdr->pbc[0];
 950			req->hdr.lrh[2] = hdr->lrh[2];
 951		}
 952	}
 953	/*
 954	 * We only have to modify the header if this is not the
 955	 * first packet in the request. Otherwise, we use the
 956	 * header given to us.
 957	 */
 958	if (unlikely(!req->seqnum)) {
 959		ret = check_header_template(req, hdr, lrhlen, datalen);
 960		if (ret)
 961			return ret;
 962		goto done;
 963	}
 964
 965	hdr->bth[2] = cpu_to_be32(
 966		set_pkt_bth_psn(hdr->bth[2],
 967				(req_opcode(req->info.ctrl) == EXPECTED),
 968				req->seqnum));
 969
 970	/* Set ACK request on last packet */
 971	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 972		hdr->bth[2] |= cpu_to_be32(1UL << 31);
 973
 974	/* Set the new offset */
 975	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
 976	/* Expected packets have to fill in the new TID information */
 977	if (req_opcode(req->info.ctrl) == EXPECTED) {
 978		tidval = req->tids[req->tididx];
 979		/*
 980		 * If the offset puts us at the end of the current TID,
 981		 * advance everything.
 982		 */
 983		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
 984					 PAGE_SIZE)) {
 985			req->tidoffset = 0;
 986			/*
  987			 * Since we don't copy all the TIDs all at once,
  988			 * we have to check again.
 989			 */
 990			if (++req->tididx > req->n_tids - 1 ||
 991			    !req->tids[req->tididx]) {
 992				return -EINVAL;
 993			}
 994			tidval = req->tids[req->tididx];
 995		}
 996		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
 997			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
 998			KDETH_OM_SMALL_SHIFT;
 999		/* Set KDETH.TIDCtrl based on value for this TID. */
1000		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1001			  EXP_TID_GET(tidval, CTRL));
1002		/* Set KDETH.TID based on value for this TID */
1003		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1004			  EXP_TID_GET(tidval, IDX));
1005		/* Clear KDETH.SH when DISABLE_SH flag is set */
1006		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1007			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1008		/*
1009		 * Set the KDETH.OFFSET and KDETH.OM based on size of
1010		 * transfer.
1011		 */
1012		trace_hfi1_sdma_user_tid_info(
1013			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1014			req->tidoffset, req->tidoffset >> omfactor,
1015			omfactor != KDETH_OM_SMALL_SHIFT);
1016		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1017			  req->tidoffset >> omfactor);
1018		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1019			  omfactor != KDETH_OM_SMALL_SHIFT);
1020	}
1021done:
1022	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1023				    req->info.comp_idx, hdr, tidval);
1024	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1025}
1026
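/*
 * Build only the per-packet header deltas for AHG: the words that change
 * between packets (lengths, PSN/ACK, KDETH offset, TID fields) are handed
 * to the hardware instead of rewriting the whole header.
 */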
1027static int set_txreq_header_ahg(struct user_sdma_request *req,
1028				struct user_sdma_txreq *tx, u32 datalen)
1029{
1030	u32 ahg[AHG_KDETH_ARRAY_SIZE];
1031	int idx = 0;
1032	u8 omfactor; /* KDETH.OM */
1033	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1034	struct hfi1_pkt_header *hdr = &req->hdr;
1035	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1036	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1037	size_t array_size = ARRAY_SIZE(ahg);
1038
1039	if (PBC2LRH(pbclen) != lrhlen) {
1040		/* PBC.PbcLengthDWs */
1041		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1042				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1043		if (idx < 0)
1044			return idx;
1045		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1046		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1047				     (__force u16)cpu_to_be16(lrhlen >> 2));
1048		if (idx < 0)
1049			return idx;
1050	}
1051
1052	/*
1053	 * Do the common updates
1054	 */
1055	/* BTH.PSN and BTH.A */
1056	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1057		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1058	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1059		val32 |= 1UL << 31;
1060	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1061			     (__force u16)cpu_to_be16(val32 >> 16));
1062	if (idx < 0)
1063		return idx;
1064	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1065			     (__force u16)cpu_to_be16(val32 & 0xffff));
1066	if (idx < 0)
1067		return idx;
1068	/* KDETH.Offset */
1069	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1070			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
1071	if (idx < 0)
1072		return idx;
1073	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1074			     (__force u16)cpu_to_le16(req->koffset >> 16));
1075	if (idx < 0)
1076		return idx;
1077	if (req_opcode(req->info.ctrl) == EXPECTED) {
1078		__le16 val;
1079
1080		tidval = req->tids[req->tididx];
1081
1082		/*
1083		 * If the offset puts us at the end of the current TID,
1084		 * advance everything.
1085		 */
1086		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1087					 PAGE_SIZE)) {
1088			req->tidoffset = 0;
1089			/*
 1090			 * Since we don't copy all the TIDs all at once,
1091			 * we have to check again.
1092			 */
1093			if (++req->tididx > req->n_tids - 1 ||
1094			    !req->tids[req->tididx])
1095				return -EINVAL;
1096			tidval = req->tids[req->tididx];
1097		}
1098		omfactor = ((EXP_TID_GET(tidval, LEN) *
1099				  PAGE_SIZE) >=
1100				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1101				 KDETH_OM_SMALL_SHIFT;
1102		/* KDETH.OM and KDETH.OFFSET (TID) */
1103		idx = ahg_header_set(
1104				ahg, idx, array_size, 7, 0, 16,
1105				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1106				((req->tidoffset >> omfactor)
1107				& 0x7fff)));
1108		if (idx < 0)
1109			return idx;
1110		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1111		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1112				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1113
1114		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1115			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1116						      INTR) <<
1117					    AHG_KDETH_INTR_SHIFT));
1118		} else {
1119			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1120			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1121			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1122						      INTR) <<
1123					     AHG_KDETH_INTR_SHIFT));
1124		}
1125
1126		idx = ahg_header_set(ahg, idx, array_size,
1127				     7, 16, 14, (__force u16)val);
1128		if (idx < 0)
1129			return idx;
1130	}
1131
1132	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1133					req->info.comp_idx, req->sde->this_idx,
1134					req->ahg_idx, ahg, idx, tidval);
1135	sdma_txinit_ahg(&tx->txreq,
1136			SDMA_TXREQ_F_USE_AHG,
1137			datalen, req->ahg_idx, idx,
1138			ahg, sizeof(req->hdr),
1139			user_sdma_txreq_cb);
1140
1141	return idx;
1142}
1143
1144/**
1145 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1146 * @txreq: valid sdma tx request
1147 * @status: success/failure of request
1148 *
1149 * Called when the SDMA progress state machine gets notification that
1150 * the SDMA descriptors for this tx request have been processed by the
1151 * DMA engine. Called in interrupt context.
1152 * Only do work on completed sequences.
1153 */
1154static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1155{
1156	struct user_sdma_txreq *tx =
1157		container_of(txreq, struct user_sdma_txreq, txreq);
1158	struct user_sdma_request *req;
1159	struct hfi1_user_sdma_pkt_q *pq;
1160	struct hfi1_user_sdma_comp_q *cq;
1161	enum hfi1_sdma_comp_state state = COMPLETE;
1162
1163	if (!tx->req)
1164		return;
1165
1166	req = tx->req;
1167	pq = req->pq;
1168	cq = req->cq;
1169
1170	if (status != SDMA_TXREQ_S_OK) {
1171		SDMA_DBG(req, "SDMA completion with error %d",
1172			 status);
1173		WRITE_ONCE(req->has_error, 1);
1174		state = ERROR;
1175	}
1176
1177	req->seqcomp = tx->seqnum;
1178	kmem_cache_free(pq->txreq_cache, tx);
1179
1180	/* sequence isn't complete?  We are done */
1181	if (req->seqcomp != req->info.npkts - 1)
1182		return;
1183
1184	user_sdma_free_request(req);
1185	set_comp_state(pq, cq, req->info.comp_idx, state, status);
1186	pq_update(pq);
1187}
1188
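/* Drop one request from the queue count; wake the waiter when it reaches zero. */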
1189static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1190{
1191	if (atomic_dec_and_test(&pq->n_reqs))
1192		wake_up(&pq->wait);
1193}
1194
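/* Clean up any unsubmitted txreqs and the TID array, then release the slot. */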
1195static void user_sdma_free_request(struct user_sdma_request *req)
 1196{
1197	if (!list_empty(&req->txps)) {
1198		struct sdma_txreq *t, *p;
1199
1200		list_for_each_entry_safe(t, p, &req->txps, list) {
1201			struct user_sdma_txreq *tx =
1202				container_of(t, struct user_sdma_txreq, txreq);
1203			list_del_init(&t->list);
1204			sdma_txclean(req->pq->dd, t);
1205			kmem_cache_free(req->pq->txreq_cache, tx);
1206		}
1207	}
 1208
1209	kfree(req->tids);
1210	clear_bit(req->info.comp_idx, req->pq->req_in_use);
1211}
1212
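/*
 * Publish the final state of a request in the user-visible completion ring;
 * the error code must be visible before the status update (hence smp_wmb()).
 */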
1213static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1214				  struct hfi1_user_sdma_comp_q *cq,
1215				  u16 idx, enum hfi1_sdma_comp_state state,
1216				  int ret)
1217{
1218	if (state == ERROR)
1219		cq->comps[idx].errcode = -ret;
1220	smp_wmb(); /* make sure errcode is visible first */
1221	cq->comps[idx].status = state;
1222	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
 1223					idx, state, ret);
1224}
v6.2
   1// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
   2/*
   3 * Copyright(c) 2020 - Cornelis Networks, Inc.
   4 * Copyright(c) 2015 - 2018 Intel Corporation.
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/types.h>
   9#include <linux/device.h>
  10#include <linux/dmapool.h>
  11#include <linux/slab.h>
  12#include <linux/list.h>
  13#include <linux/highmem.h>
  14#include <linux/io.h>
  15#include <linux/uio.h>
  16#include <linux/rbtree.h>
  17#include <linux/spinlock.h>
  18#include <linux/delay.h>
  19#include <linux/kthread.h>
  20#include <linux/mmu_context.h>
  21#include <linux/module.h>
  22#include <linux/vmalloc.h>
  23#include <linux/string.h>
  24
  25#include "hfi.h"
  26#include "sdma.h"
  27#include "mmu_rb.h"
  28#include "user_sdma.h"
  29#include "verbs.h"  /* for the headers */
  30#include "common.h" /* for struct hfi1_tid_info */
  31#include "trace.h"
  32
  33static uint hfi1_sdma_comp_ring_size = 128;
  34module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  35MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  36
  37static unsigned initial_pkt_count = 8;
  38
  39static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  40static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  41static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  42static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
  43static int pin_vector_pages(struct user_sdma_request *req,
  44			    struct user_sdma_iovec *iovec);
  45static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
  46			       unsigned start, unsigned npages);
  47static int check_header_template(struct user_sdma_request *req,
  48				 struct hfi1_pkt_header *hdr, u32 lrhlen,
  49				 u32 datalen);
  50static int set_txreq_header(struct user_sdma_request *req,
  51			    struct user_sdma_txreq *tx, u32 datalen);
  52static int set_txreq_header_ahg(struct user_sdma_request *req,
  53				struct user_sdma_txreq *tx, u32 len);
  54static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  55				  struct hfi1_user_sdma_comp_q *cq,
  56				  u16 idx, enum hfi1_sdma_comp_state state,
  57				  int ret);
  58static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  59static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  60
  61static int defer_packet_queue(
  62	struct sdma_engine *sde,
  63	struct iowait_work *wait,
  64	struct sdma_txreq *txreq,
  65	uint seq,
  66	bool pkts_sent);
  67static void activate_packet_queue(struct iowait *wait, int reason);
  68static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
  69			   unsigned long len);
  70static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
  71static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
  72			 void *arg2, bool *stop);
  73static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
  74static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
  75
  76static struct mmu_rb_ops sdma_rb_ops = {
  77	.filter = sdma_rb_filter,
  78	.insert = sdma_rb_insert,
  79	.evict = sdma_rb_evict,
  80	.remove = sdma_rb_remove,
  81	.invalidate = sdma_rb_invalidate
  82};
  83
  84static int defer_packet_queue(
  85	struct sdma_engine *sde,
  86	struct iowait_work *wait,
  87	struct sdma_txreq *txreq,
  88	uint seq,
  89	bool pkts_sent)
  90{
  91	struct hfi1_user_sdma_pkt_q *pq =
  92		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  93
  94	write_seqlock(&sde->waitlock);
  95	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
  96	if (sdma_progress(sde, seq, txreq))
  97		goto eagain;
  98	/*
  99	 * We are assuming that if the list is enqueued somewhere, it
 100	 * is to the dmawait list since that is the only place where
 101	 * it is supposed to be enqueued.
 102	 */
 103	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
 104	if (list_empty(&pq->busy.list)) {
 105		pq->busy.lock = &sde->waitlock;
 106		iowait_get_priority(&pq->busy);
 107		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
 108	}
 109	write_sequnlock(&sde->waitlock);
 110	return -EBUSY;
 111eagain:
 112	write_sequnlock(&sde->waitlock);
 113	return -EAGAIN;
 114}
 115
 116static void activate_packet_queue(struct iowait *wait, int reason)
 117{
 118	struct hfi1_user_sdma_pkt_q *pq =
 119		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 120
 121	trace_hfi1_usdma_activate(pq, wait, reason);
 122	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 123	wake_up(&wait->wait_dma);
 124};
 125
 126int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 127				struct hfi1_filedata *fd)
 128{
 129	int ret = -ENOMEM;
 130	char buf[64];
 131	struct hfi1_devdata *dd;
 132	struct hfi1_user_sdma_comp_q *cq;
 133	struct hfi1_user_sdma_pkt_q *pq;
 134
 135	if (!uctxt || !fd)
 136		return -EBADF;
 137
 138	if (!hfi1_sdma_comp_ring_size)
 139		return -EINVAL;
 140
 141	dd = uctxt->dd;
 142
 143	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 144	if (!pq)
 145		return -ENOMEM;
 146	pq->dd = dd;
 147	pq->ctxt = uctxt->ctxt;
 148	pq->subctxt = fd->subctxt;
 149	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 150	atomic_set(&pq->n_reqs, 0);
 151	init_waitqueue_head(&pq->wait);
 152	atomic_set(&pq->n_locked, 0);
 153
 154	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 155		    activate_packet_queue, NULL, NULL);
 156	pq->reqidx = 0;
 157
 158	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 159			   sizeof(*pq->reqs),
 160			   GFP_KERNEL);
 161	if (!pq->reqs)
 162		goto pq_reqs_nomem;
 163
 164	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
 165	if (!pq->req_in_use)
 166		goto pq_reqs_no_in_use;
 167
 168	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 169		 fd->subctxt);
 170	pq->txreq_cache = kmem_cache_create(buf,
 171					    sizeof(struct user_sdma_txreq),
 172					    L1_CACHE_BYTES,
 173					    SLAB_HWCACHE_ALIGN,
 174					    NULL);
 175	if (!pq->txreq_cache) {
 176		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 177			   uctxt->ctxt);
 178		goto pq_txreq_nomem;
 179	}
 180
 181	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 182	if (!cq)
 183		goto cq_nomem;
 184
 185	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 186				 * hfi1_sdma_comp_ring_size));
 187	if (!cq->comps)
 188		goto cq_comps_nomem;
 189
 190	cq->nentries = hfi1_sdma_comp_ring_size;
 191
 192	ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
 193				   &pq->handler);
 194	if (ret) {
 195		dd_dev_err(dd, "Failed to register with MMU %d", ret);
 196		goto pq_mmu_fail;
 197	}
 198
 199	rcu_assign_pointer(fd->pq, pq);
 200	fd->cq = cq;
 201
 202	return 0;
 203
 204pq_mmu_fail:
 205	vfree(cq->comps);
 206cq_comps_nomem:
 207	kfree(cq);
 208cq_nomem:
 209	kmem_cache_destroy(pq->txreq_cache);
 210pq_txreq_nomem:
 211	bitmap_free(pq->req_in_use);
 212pq_reqs_no_in_use:
 213	kfree(pq->reqs);
 214pq_reqs_nomem:
 215	kfree(pq);
 216
 217	return ret;
 218}
 219
 220static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
 221{
 222	unsigned long flags;
 223	seqlock_t *lock = pq->busy.lock;
 224
 225	if (!lock)
 226		return;
 227	write_seqlock_irqsave(lock, flags);
 228	if (!list_empty(&pq->busy.list)) {
 229		list_del_init(&pq->busy.list);
 230		pq->busy.lock = NULL;
 231	}
 232	write_sequnlock_irqrestore(lock, flags);
 233}
 234
 235int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 236			       struct hfi1_ctxtdata *uctxt)
 237{
 238	struct hfi1_user_sdma_pkt_q *pq;
 239
 240	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 241
 242	spin_lock(&fd->pq_rcu_lock);
 243	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
 244				    lockdep_is_held(&fd->pq_rcu_lock));
 245	if (pq) {
 246		rcu_assign_pointer(fd->pq, NULL);
 247		spin_unlock(&fd->pq_rcu_lock);
 248		synchronize_srcu(&fd->pq_srcu);
 249		/* at this point there can be no more new requests */
 250		if (pq->handler)
 251			hfi1_mmu_rb_unregister(pq->handler);
 252		iowait_sdma_drain(&pq->busy);
 253		/* Wait until all requests have been freed. */
 254		wait_event_interruptible(
 255			pq->wait,
 256			!atomic_read(&pq->n_reqs));
 257		kfree(pq->reqs);
 
 258		bitmap_free(pq->req_in_use);
 259		kmem_cache_destroy(pq->txreq_cache);
 260		flush_pq_iowait(pq);
 261		kfree(pq);
 262	} else {
 263		spin_unlock(&fd->pq_rcu_lock);
 264	}
 265	if (fd->cq) {
 266		vfree(fd->cq->comps);
 267		kfree(fd->cq);
 268		fd->cq = NULL;
 269	}
 270	return 0;
 271}
 272
 273static u8 dlid_to_selector(u16 dlid)
 274{
 275	static u8 mapping[256];
 276	static int initialized;
 277	static u8 next;
 278	int hash;
 279
 280	if (!initialized) {
 281		memset(mapping, 0xFF, 256);
 282		initialized = 1;
 283	}
 284
 285	hash = ((dlid >> 8) ^ dlid) & 0xFF;
 286	if (mapping[hash] == 0xFF) {
 287		mapping[hash] = next;
 288		next = (next + 1) & 0x7F;
 289	}
 290
 291	return mapping[hash];
 292}
 293
 294/**
 295 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 296 * @fd: valid file descriptor
 297 * @iovec: array of io vectors to process
 298 * @dim: overall iovec array size
 299 * @count: number of io vector array entries processed
 300 */
 301int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 302				   struct iovec *iovec, unsigned long dim,
 303				   unsigned long *count)
 304{
 305	int ret = 0, i;
 306	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 307	struct hfi1_user_sdma_pkt_q *pq =
 308		srcu_dereference(fd->pq, &fd->pq_srcu);
 309	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 310	struct hfi1_devdata *dd = pq->dd;
 311	unsigned long idx = 0;
 312	u8 pcount = initial_pkt_count;
 313	struct sdma_req_info info;
 314	struct user_sdma_request *req;
 315	u8 opcode, sc, vl;
 316	u16 pkey;
 317	u32 slid;
 318	u16 dlid;
 319	u32 selector;
 320
 321	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 322		hfi1_cdbg(
 323		   SDMA,
 324		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 325		   dd->unit, uctxt->ctxt, fd->subctxt,
 326		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 327		return -EINVAL;
 328	}
 329	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 330	if (ret) {
 331		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 332			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 333		return -EFAULT;
 334	}
 335
 336	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 337				     (u16 *)&info);
 338	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
 339		hfi1_cdbg(SDMA,
 340			  "[%u:%u:%u:%u] Invalid comp index",
 341			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 342		return -EINVAL;
 343	}
 344
 345	/*
 346	 * Sanity check the header io vector count.  Need at least 1 vector
 347	 * (header) and cannot be larger than the actual io vector count.
 348	 */
 349	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
 350		hfi1_cdbg(SDMA,
 351			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
 352			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
 353			  req_iovcnt(info.ctrl), dim);
 354		return -EINVAL;
 355	}
 356
 357	if (!info.fragsize) {
 358		hfi1_cdbg(SDMA,
 359			  "[%u:%u:%u:%u] Request does not specify fragsize",
 360			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 361		return -EINVAL;
 362	}
 363
 364	/* Try to claim the request. */
 365	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
 366		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
 367			  dd->unit, uctxt->ctxt, fd->subctxt,
 368			  info.comp_idx);
 369		return -EBADSLT;
 370	}
 371	/*
 372	 * All safety checks have been done and this request has been claimed.
 373	 */
 374	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
 375					     info.comp_idx);
 376	req = pq->reqs + info.comp_idx;
 377	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 378	req->data_len  = 0;
 379	req->pq = pq;
 380	req->cq = cq;
 381	req->ahg_idx = -1;
 382	req->iov_idx = 0;
 383	req->sent = 0;
 384	req->seqnum = 0;
 385	req->seqcomp = 0;
 386	req->seqsubmitted = 0;
 387	req->tids = NULL;
 388	req->has_error = 0;
 389	INIT_LIST_HEAD(&req->txps);
 390
 391	memcpy(&req->info, &info, sizeof(info));
 392
 393	/* The request is initialized, count it */
 394	atomic_inc(&pq->n_reqs);
 395
 396	if (req_opcode(info.ctrl) == EXPECTED) {
 397		/* expected must have a TID info and at least one data vector */
 398		if (req->data_iovs < 2) {
 399			SDMA_DBG(req,
 400				 "Not enough vectors for expected request");
 401			ret = -EINVAL;
 402			goto free_req;
 403		}
 404		req->data_iovs--;
 405	}
 406
 407	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 408		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 409			 MAX_VECTORS_PER_REQ);
 410		ret = -EINVAL;
 411		goto free_req;
 412	}
 
 413	/* Copy the header from the user buffer */
 414	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 415			     sizeof(req->hdr));
 416	if (ret) {
 417		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 418		ret = -EFAULT;
 419		goto free_req;
 420	}
 421
 422	/* If Static rate control is not enabled, sanitize the header. */
 423	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 424		req->hdr.pbc[2] = 0;
 425
 426	/* Validate the opcode. Do not trust packets from user space blindly. */
 427	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 428	if ((opcode & USER_OPCODE_CHECK_MASK) !=
 429	     USER_OPCODE_CHECK_VAL) {
 430		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 431		ret = -EINVAL;
 432		goto free_req;
 433	}
 434	/*
 435	 * Validate the vl. Do not trust packets from user space blindly.
 436	 * VL comes from PBC, SC comes from LRH, and the VL needs to
 437	 * match the SC look up.
 438	 */
 439	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 440	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 441	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 442	if (vl >= dd->pport->vls_operational ||
 443	    vl != sc_to_vlt(dd, sc)) {
 444		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 445		ret = -EINVAL;
 446		goto free_req;
 447	}
 448
 449	/* Checking P_KEY for requests from user-space */
 450	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
 451	slid = be16_to_cpu(req->hdr.lrh[3]);
 452	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
 453		ret = -EINVAL;
 454		goto free_req;
 455	}
 456
 457	/*
 458	 * Also should check the BTH.lnh. If it says the next header is GRH then
 459	 * the RXE parsing will be off and will land in the middle of the KDETH
 460	 * or miss it entirely.
 461	 */
 462	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 463		SDMA_DBG(req, "User tried to pass in a GRH");
 464		ret = -EINVAL;
 465		goto free_req;
 466	}
 467
 468	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 469	/*
 470	 * Calculate the initial TID offset based on the values of
 471	 * KDETH.OFFSET and KDETH.OM that are passed in.
 472	 */
 473	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 474		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 475		 KDETH_OM_LARGE : KDETH_OM_SMALL);
 476	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
 477					       info.comp_idx, req->tidoffset);
 478	idx++;
 479
 480	/* Save all the IO vector structures */
 481	for (i = 0; i < req->data_iovs; i++) {
 482		req->iovs[i].offset = 0;
 483		INIT_LIST_HEAD(&req->iovs[i].list);
 484		memcpy(&req->iovs[i].iov,
 485		       iovec + idx++,
 486		       sizeof(req->iovs[i].iov));
 487		ret = pin_vector_pages(req, &req->iovs[i]);
 488		if (ret) {
 489			req->data_iovs = i;
 490			goto free_req;
 491		}
 492		req->data_len += req->iovs[i].iov.iov_len;
 493	}
 494	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
 495					 info.comp_idx, req->data_len);
 496	if (pcount > req->info.npkts)
 497		pcount = req->info.npkts;
 498	/*
 499	 * Copy any TID info
 500	 * User space will provide the TID info only when the
 501	 * request type is EXPECTED. This is true even if there is
 502	 * only one packet in the request and the header is already
 503	 * setup. The reason for the singular TID case is that the
 504	 * driver needs to perform safety checks.
 505	 */
 506	if (req_opcode(req->info.ctrl) == EXPECTED) {
 507		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 508		u32 *tmp;
 509
 510		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 511			ret = -EINVAL;
 512			goto free_req;
 513		}
 514
 515		/*
 516		 * We have to copy all of the tids because they may vary
 517		 * in size and, therefore, the TID count might not be
 518		 * equal to the pkt count. However, there is no way to
 519		 * tell at this point.
 520		 */
 521		tmp = memdup_user(iovec[idx].iov_base,
 522				  ntids * sizeof(*req->tids));
 523		if (IS_ERR(tmp)) {
 524			ret = PTR_ERR(tmp);
 525			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 526				 ntids, ret);
 527			goto free_req;
 528		}
 529		req->tids = tmp;
 530		req->n_tids = ntids;
 531		req->tididx = 0;
 532		idx++;
 533	}
 534
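    	/*
    	 * Spread requests across engines: derive a selector from the
    	 * destination LID plus the context/subcontext, and let
    	 * sdma_select_user_engine() map that selector and VL to an engine.
    	 */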
 535	dlid = be16_to_cpu(req->hdr.lrh[1]);
 536	selector = dlid_to_selector(dlid);
 537	selector += uctxt->ctxt + fd->subctxt;
 538	req->sde = sdma_select_user_engine(dd, selector, vl);
 539
 540	if (!req->sde || !sdma_running(req->sde)) {
 541		ret = -ECOMM;
 542		goto free_req;
 543	}
 544
 545	/* We don't need an AHG entry if the request contains only one packet */
 546	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
 547		req->ahg_idx = sdma_ahg_alloc(req->sde);
 548
 549	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 550	pq->state = SDMA_PKT_Q_ACTIVE;
 551
 552	/*
 553	 * This is a somewhat blocking send implementation.
 554	 * The driver will block the caller until all packets of the
 555	 * request have been submitted to the SDMA engine. However, it
 556	 * will not wait for send completions.
 557	 */
 558	while (req->seqsubmitted != req->info.npkts) {
 559		ret = user_sdma_send_pkts(req, pcount);
 560		if (ret < 0) {
 561			int we_ret;
 562
 563			if (ret != -EBUSY)
 564				goto free_req;
 565			we_ret = wait_event_interruptible_timeout(
 566				pq->busy.wait_dma,
 567				pq->state == SDMA_PKT_Q_ACTIVE,
 568				msecs_to_jiffies(
 569					SDMA_IOWAIT_TIMEOUT));
 570			trace_hfi1_usdma_we(pq, we_ret);
 571			if (we_ret <= 0)
 572				flush_pq_iowait(pq);
 573		}
 574	}
 575	*count += idx;
 576	return 0;
 577free_req:
 578	/*
 579	 * If seqsubmitted == npkts, the completion routine
 580	 * controls the final state.  If seqsubmitted < npkts, wait for any
 581	 * outstanding packets to finish before cleaning up.
 582	 */
 583	if (req->seqsubmitted < req->info.npkts) {
 584		if (req->seqsubmitted)
 585			wait_event(pq->busy.wait_dma,
 586				   (req->seqcomp == req->seqsubmitted - 1));
 587		user_sdma_free_request(req, true);
 588		pq_update(pq);
 589		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
 590	}
 591	return ret;
 592}
 593
 594static inline u32 compute_data_length(struct user_sdma_request *req,
 595				      struct user_sdma_txreq *tx)
 596{
 597	/*
 598	 * Determine the proper size of the packet data.
 599	 * The size of the data of the first packet is in the header
 600	 * template. However, it includes the header and ICRC, which need
 601	 * to be subtracted.
 602	 * The minimum representable packet data length in a header is 4 bytes,
 603	 * therefore, when the data length request is less than 4 bytes, there's
 604	 * only one packet, and the packet data length is equal to that of the
 605	 * request data length.
 606	 * The size of the remaining packets is the minimum of the frag
 607	 * size (MTU) or remaining data in the request.
 608	 */
 609	u32 len;
 610
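    	/*
    	 * First packet: LRH.PktLen is in dwords and covers everything after
    	 * the PBC, including the 4-byte ICRC, so convert it to bytes and
    	 * strip the header (minus PBC) and ICRC to leave only the payload.
    	 */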
 611	if (!req->seqnum) {
 612		if (req->data_len < sizeof(u32))
 613			len = req->data_len;
 614		else
 615			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 616			       (sizeof(tx->hdr) - 4));
 617	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
 618		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 619			PAGE_SIZE;
 620		/*
 621		 * Get the data length based on the remaining space in the
 622		 * TID pair.
 623		 */
 624		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 625		/* If we've filled up the TID pair, move to the next one. */
 626		if (unlikely(!len) && ++req->tididx < req->n_tids &&
 627		    req->tids[req->tididx]) {
 628			tidlen = EXP_TID_GET(req->tids[req->tididx],
 629					     LEN) * PAGE_SIZE;
 630			req->tidoffset = 0;
 631			len = min_t(u32, tidlen, req->info.fragsize);
 632		}
 633		/*
 634		 * Since the TID pairs map entire pages, make sure that we
 635		 * are not going to try to send more data than we have
 636		 * remaining.
 637		 */
 638		len = min(len, req->data_len - req->sent);
 639	} else {
 640		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 641	}
 642	trace_hfi1_sdma_user_compute_length(req->pq->dd,
 643					    req->pq->ctxt,
 644					    req->pq->subctxt,
 645					    req->info.comp_idx,
 646					    len);
 647	return len;
 648}
 649
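    /*
     * Round a payload length up to the next multiple of 4 bytes (dword),
     * e.g. pad_len(5) == 8 and pad_len(8) == 8.
     */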
 650static inline u32 pad_len(u32 len)
 651{
 652	if (len & (sizeof(u32) - 1))
 653		len += sizeof(u32) - (len & (sizeof(u32) - 1));
 654	return len;
 655}
 656
 657static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 658{
 659	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
 660	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 661}
 662
 663static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 664			       struct user_sdma_txreq *tx,
 665			       u32 datalen)
 666{
 667	int ret;
 668	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 669	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 670	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 671
 672	/*
 673	 * Copy the request header into the tx header
 674	 * because the HW needs a cacheline-aligned
 675	 * address.
 676	 * This copy could be optimized out if the hdr
 677	 * member of user_sdma_request were also
 678	 * cacheline aligned.
 679	 */
 680	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 681	if (PBC2LRH(pbclen) != lrhlen) {
 682		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 683		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 684	}
 685	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 686	if (ret)
 687		return ret;
 688	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 689			      sizeof(tx->hdr) + datalen, req->ahg_idx,
 690			      0, NULL, 0, user_sdma_txreq_cb);
 691	if (ret)
 692		return ret;
 693	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 694	if (ret)
 695		sdma_txclean(pq->dd, &tx->txreq);
 696	return ret;
 697}
 698
 699static int user_sdma_txadd(struct user_sdma_request *req,
 700			   struct user_sdma_txreq *tx,
 701			   struct user_sdma_iovec *iovec, u32 datalen,
 702			   u32 *queued_ptr, u32 *data_sent_ptr,
 703			   u64 *iov_offset_ptr)
 704{
 705	int ret;
 706	unsigned int pageidx, len;
 707	unsigned long base, offset;
 708	u64 iov_offset = *iov_offset_ptr;
 709	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
 710	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 711
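    	/*
    	 * Work out which pinned page the current position falls in and the
    	 * offset within it, then cap this descriptor at the smallest of the
    	 * fragment size, the space left in the page, and the data still
    	 * owed for this packet.
    	 */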
 712	base = (unsigned long)iovec->iov.iov_base;
 713	offset = offset_in_page(base + iovec->offset + iov_offset);
 714	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
 715		   PAGE_SHIFT);
 716	len = offset + req->info.fragsize > PAGE_SIZE ?
 717		PAGE_SIZE - offset : req->info.fragsize;
 718	len = min((datalen - queued), len);
 719	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
 720			      offset, len);
 721	if (ret) {
 722		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
 723		return ret;
 724	}
 725	iov_offset += len;
 726	queued += len;
 727	data_sent += len;
 728	if (unlikely(queued < datalen && pageidx == iovec->npages &&
 729		     req->iov_idx < req->data_iovs - 1)) {
 730		iovec->offset += iov_offset;
 731		iovec = &req->iovs[++req->iov_idx];
 732		iov_offset = 0;
 733	}
 734
 735	*queued_ptr = queued;
 736	*data_sent_ptr = data_sent;
 737	*iov_offset_ptr = iov_offset;
 738	return ret;
 739}
 740
 741static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 742{
 743	int ret = 0;
 744	u16 count;
 745	unsigned npkts = 0;
 746	struct user_sdma_txreq *tx = NULL;
 747	struct hfi1_user_sdma_pkt_q *pq = NULL;
 748	struct user_sdma_iovec *iovec = NULL;
 749
 750	if (!req->pq)
 751		return -EINVAL;
 752
 753	pq = req->pq;
 754
 755	/* If tx completion has reported an error, we are done. */
 756	if (READ_ONCE(req->has_error))
 757		return -EFAULT;
 758
 759	/*
 760	 * Check if we might have sent the entire request already
 761	 */
 762	if (unlikely(req->seqnum == req->info.npkts)) {
 763		if (!list_empty(&req->txps))
 764			goto dosend;
 765		return ret;
 766	}
 767
 768	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 769		maxpkts = req->info.npkts - req->seqnum;
 770
 771	while (npkts < maxpkts) {
 772		u32 datalen = 0, queued = 0, data_sent = 0;
 773		u64 iov_offset = 0;
 774
 775		/*
 776		 * Check whether any of the completions have come back
 777		 * with errors. If so, we are not going to process any
 778		 * more packets from this request.
 779		 */
 780		if (READ_ONCE(req->has_error))
 781			return -EFAULT;
 782
 783		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 784		if (!tx)
 785			return -ENOMEM;
 786
 787		tx->flags = 0;
 788		tx->req = req;
 789		INIT_LIST_HEAD(&tx->list);
 790
 791		/*
 792		 * For the last packet set the ACK request
 793		 * and disable header suppression.
 794		 */
 795		if (req->seqnum == req->info.npkts - 1)
 796			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 797				      TXREQ_FLAGS_REQ_DISABLE_SH);
 798
 799		/*
 800		 * Calculate the payload size - this is min of the fragment
 801		 * (MTU) size or the remaining bytes in the request but only
 802		 * if we have payload data.
 803		 */
 804		if (req->data_len) {
 805			iovec = &req->iovs[req->iov_idx];
 806			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 807				if (++req->iov_idx == req->data_iovs) {
 808					ret = -EFAULT;
 809					goto free_tx;
 810				}
 811				iovec = &req->iovs[req->iov_idx];
 812				WARN_ON(iovec->offset);
 813			}
 814
 815			datalen = compute_data_length(req, tx);
 816
 817			/*
 818			 * Disable header suppression for the payload <= 8DWS.
 819			 * If there is an uncorrectable error in the receive
 820			 * data FIFO when the received payload size is less than
 821			 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is
 822			 * not reported. Instead, RHF.EccErr is set if the header
 823			 * is not suppressed.
 824			 */
 825			if (!datalen) {
 826				SDMA_DBG(req,
 827					 "Request has data but pkt len is 0");
 828				ret = -EFAULT;
 829				goto free_tx;
 830			} else if (datalen <= 32) {
 831				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 832			}
 833		}
 834
 835		if (req->ahg_idx >= 0) {
 836			if (!req->seqnum) {
 837				ret = user_sdma_txadd_ahg(req, tx, datalen);
 838				if (ret)
 839					goto free_tx;
 840			} else {
 841				int changes;
 842
 843				changes = set_txreq_header_ahg(req, tx,
 844							       datalen);
 845				if (changes < 0) {
 846					ret = changes;
 847					goto free_tx;
 848				}
 849			}
 850		} else {
 851			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 852					  datalen, user_sdma_txreq_cb);
 853			if (ret)
 854				goto free_tx;
 855			/*
 856			 * Modify the header for this packet. This only needs
 857			 * to be done if we are not going to use AHG. Otherwise,
 858			 * the HW will do it based on the changes we gave it
 859			 * during sdma_txinit_ahg().
 860			 */
 861			ret = set_txreq_header(req, tx, datalen);
 862			if (ret)
 863				goto free_txreq;
 864		}
 865
 866		/*
 867		 * If the request contains any data vectors, add up to
 868		 * fragsize bytes to the descriptor.
 869		 */
 870		while (queued < datalen &&
 871		       (req->sent + data_sent) < req->data_len) {
 872			ret = user_sdma_txadd(req, tx, iovec, datalen,
 873					      &queued, &data_sent, &iov_offset);
 874			if (ret)
 875				goto free_txreq;
 876		}
 877		/*
 878		 * The txreq was built successfully, so we can update
 879		 * the counters.
 880		 */
 881		req->koffset += datalen;
 882		if (req_opcode(req->info.ctrl) == EXPECTED)
 883			req->tidoffset += datalen;
 884		req->sent += data_sent;
 885		if (req->data_len)
 886			iovec->offset += iov_offset;
 887		list_add_tail(&tx->txreq.list, &req->txps);
 888		/*
 889		 * It is important to increment this here as it is used to
 890		 * generate the BTH.PSN and, therefore, can't be bulk-updated
 891		 * outside of the loop.
 892		 */
 893		tx->seqnum = req->seqnum++;
 894		npkts++;
 895	}
 896dosend:
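    	/*
    	 * Hand the accumulated txreq list to the SDMA engine. count reports
    	 * how many were actually queued, which is what seqsubmitted tracks.
    	 */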
 897	ret = sdma_send_txlist(req->sde,
 898			       iowait_get_ib_work(&pq->busy),
 899			       &req->txps, &count);
 900	req->seqsubmitted += count;
 901	if (req->seqsubmitted == req->info.npkts) {
 902		/*
 903		 * The txreq has already been submitted to the HW queue
 904		 * so we can free the AHG entry now. Corruption will not
 905		 * happen due to the sequential manner in which
 906		 * descriptors are processed.
 907		 */
 908		if (req->ahg_idx >= 0)
 909			sdma_ahg_free(req->sde, req->ahg_idx);
 910	}
 911	return ret;
 912
 913free_txreq:
 914	sdma_txclean(pq->dd, &tx->txreq);
 915free_tx:
 916	kmem_cache_free(pq->txreq_cache, tx);
 917	return ret;
 918}
 919
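    /*
     * Ask the MMU rb-tree handler to evict unused cached entries until at
     * least npages worth of pinned pages have been released (or no more
     * unused entries remain); return how many pages were actually cleared.
     */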
 920static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
 921{
 922	struct evict_data evict_data;
 923
 924	evict_data.cleared = 0;
 925	evict_data.target = npages;
 926	hfi1_mmu_rb_evict(pq->handler, &evict_data);
 927	return evict_data.cleared;
 928}
 929
 930static int pin_sdma_pages(struct user_sdma_request *req,
 931			  struct user_sdma_iovec *iovec,
 932			  struct sdma_mmu_node *node,
 933			  int npages)
 934{
 935	int pinned, cleared;
 936	struct page **pages;
 937	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 938
 939	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 940	if (!pages)
 941		return -ENOMEM;
 942	memcpy(pages, node->pages, node->npages * sizeof(*pages));
 943
 944	npages -= node->npages;
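    	/*
    	 * If we cannot pin npages more pages without exceeding the
    	 * pinned-page limit, try to evict unused cache entries and
    	 * re-check.
    	 */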
 945retry:
 946	if (!hfi1_can_pin_pages(pq->dd, current->mm,
 947				atomic_read(&pq->n_locked), npages)) {
 948		cleared = sdma_cache_evict(pq, npages);
 949		if (cleared >= npages)
 950			goto retry;
 951	}
 952	pinned = hfi1_acquire_user_pages(current->mm,
 953					 ((unsigned long)iovec->iov.iov_base +
 954					 (node->npages * PAGE_SIZE)), npages, 0,
 955					 pages + node->npages);
 956	if (pinned < 0) {
 957		kfree(pages);
 958		return pinned;
 959	}
 960	if (pinned != npages) {
 961		unpin_vector_pages(current->mm, pages, node->npages, pinned);
 962		return -EFAULT;
 963	}
 964	kfree(node->pages);
 965	node->rb.len = iovec->iov.iov_len;
 966	node->pages = pages;
 967	atomic_add(pinned, &pq->n_locked);
 968	return pinned;
 969}
 970
 971static void unpin_sdma_pages(struct sdma_mmu_node *node)
 972{
 973	if (node->npages) {
 974		unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
 975				   node->npages);
 976		atomic_sub(node->npages, &node->pq->n_locked);
 977	}
 978}
 979
 980static int pin_vector_pages(struct user_sdma_request *req,
 981			    struct user_sdma_iovec *iovec)
 982{
 983	int ret = 0, pinned, npages;
 984	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 985	struct sdma_mmu_node *node = NULL;
 986	struct mmu_rb_node *rb_node;
 987	struct iovec *iov;
 988	bool extracted;
 989
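    	/*
    	 * Look this iovec up in the pin cache. A node that is found but not
    	 * extracted from the tree is reused as-is (just take a reference);
    	 * a node that is extracted gets extended with newly pinned pages
    	 * and re-inserted below.
    	 */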
 990	extracted =
 991		hfi1_mmu_rb_remove_unless_exact(pq->handler,
 992						(unsigned long)
 993						iovec->iov.iov_base,
 994						iovec->iov.iov_len, &rb_node);
 995	if (rb_node) {
 996		node = container_of(rb_node, struct sdma_mmu_node, rb);
 997		if (!extracted) {
 998			atomic_inc(&node->refcount);
 999			iovec->pages = node->pages;
1000			iovec->npages = node->npages;
1001			iovec->node = node;
1002			return 0;
1003		}
1004	}
1005
1006	if (!node) {
1007		node = kzalloc(sizeof(*node), GFP_KERNEL);
1008		if (!node)
1009			return -ENOMEM;
1010
1011		node->rb.addr = (unsigned long)iovec->iov.iov_base;
1012		node->pq = pq;
1013		atomic_set(&node->refcount, 0);
1014	}
1015
1016	iov = &iovec->iov;
1017	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
1018	if (node->npages < npages) {
1019		pinned = pin_sdma_pages(req, iovec, node, npages);
1020		if (pinned < 0) {
1021			ret = pinned;
1022			goto bail;
1023		}
1024		node->npages += pinned;
1025		npages = node->npages;
1026	}
1027	iovec->pages = node->pages;
1028	iovec->npages = npages;
1029	iovec->node = node;
1030
1031	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
1032	if (ret) {
1033		iovec->node = NULL;
1034		goto bail;
1035	}
1036	return 0;
1037bail:
1038	unpin_sdma_pages(node);
1039	kfree(node);
1040	return ret;
1041}
1042
1043static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
1044			       unsigned start, unsigned npages)
1045{
1046	hfi1_release_user_pages(mm, pages + start, npages, false);
1047	kfree(pages);
1048}
1049
1050static int check_header_template(struct user_sdma_request *req,
1051				 struct hfi1_pkt_header *hdr, u32 lrhlen,
1052				 u32 datalen)
1053{
1054	/*
1055	 * Perform safety checks for any type of packet:
1056	 *    - transfer size is a multiple of 64 bytes
1057	 *    - packet length is a multiple of 4 bytes
1058	 *    - packet length is not larger than MTU size
1059	 *
1060	 * These checks are only done for the first packet of the
1061	 * transfer since the header is "given" to us by user space.
1062	 * For the remainder of the packets we compute the values.
1063	 */
1064	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
1065	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
1066		return -EINVAL;
1067
1068	if (req_opcode(req->info.ctrl) == EXPECTED) {
1069		/*
1070		 * The header is checked only on the first packet. Furthermore,
1071		 * we ensure that at least one TID entry is copied when the
1072		 * request is submitted. Therefore, we don't have to verify that
1073		 * tididx points to something sane.
1074		 */
1075		u32 tidval = req->tids[req->tididx],
1076			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
1077			tididx = EXP_TID_GET(tidval, IDX),
1078			tidctrl = EXP_TID_GET(tidval, CTRL),
1079			tidoff;
1080		__le32 kval = hdr->kdeth.ver_tid_offset;
1081
1082		tidoff = KDETH_GET(kval, OFFSET) *
1083			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
1084			   KDETH_OM_LARGE : KDETH_OM_SMALL);
1085		/*
1086		 * Expected receive packets have the following
1087		 * additional checks:
1088		 *     - offset is not larger than the TID size
1089		 *     - TIDCtrl values match between header and TID array
1090		 *     - TID indexes match between header and TID array
1091		 */
1092		if ((tidoff + datalen > tidlen) ||
1093		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
1094		    KDETH_GET(kval, TID) != tididx)
1095			return -EINVAL;
1096	}
1097	return 0;
1098}
1099
1100/*
1101 * Correctly set the BTH.PSN field based on type of
1102 * transfer - eager packets can just increment the PSN but
1103 * expected packets encode generation and sequence in the
1104 * BTH.PSN field so just incrementing will result in errors.
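     * For expected packets only the low HFI1_KDETH_BTH_SEQ_MASK bits of the
     * PSN advance; the generation bits above the sequence mask are preserved.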
1105 */
1106static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
1107{
1108	u32 val = be32_to_cpu(bthpsn),
1109		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
1110			0xffffffull),
1111		psn = val & mask;
1112	if (expct)
1113		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
1114			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
1115	else
1116		psn = psn + frags;
1117	return psn & mask;
1118}
1119
1120static int set_txreq_header(struct user_sdma_request *req,
1121			    struct user_sdma_txreq *tx, u32 datalen)
1122{
1123	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1124	struct hfi1_pkt_header *hdr = &tx->hdr;
1125	u8 omfactor; /* KDETH.OM */
1126	u16 pbclen;
1127	int ret;
1128	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1129
1130	/* Copy the header template to the request before modification */
1131	memcpy(hdr, &req->hdr, sizeof(*hdr));
1132
1133	/*
1134	 * Check if the PBC and LRH length are mismatched. If so
1135	 * adjust both in the header.
1136	 */
1137	pbclen = le16_to_cpu(hdr->pbc[0]);
1138	if (PBC2LRH(pbclen) != lrhlen) {
1139		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
1140		hdr->pbc[0] = cpu_to_le16(pbclen);
1141		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
1142		/*
1143		 * Third packet
1144		 * This is the first packet in the sequence that has
1145		 * a "static" size that can be used for the rest of
1146		 * the packets (besides the last one).
1147		 */
1148		if (unlikely(req->seqnum == 2)) {
1149			/*
1150			 * From this point on the lengths in both the
1151			 * PBC and LRH are the same until the last
1152			 * packet.
1153			 * Adjust the template so we don't have to update
1154			 * every packet
1155			 */
1156			req->hdr.pbc[0] = hdr->pbc[0];
1157			req->hdr.lrh[2] = hdr->lrh[2];
1158		}
1159	}
1160	/*
1161	 * We only have to modify the header if this is not the
1162	 * first packet in the request. Otherwise, we use the
1163	 * header given to us.
1164	 */
1165	if (unlikely(!req->seqnum)) {
1166		ret = check_header_template(req, hdr, lrhlen, datalen);
1167		if (ret)
1168			return ret;
1169		goto done;
1170	}
1171
1172	hdr->bth[2] = cpu_to_be32(
1173		set_pkt_bth_psn(hdr->bth[2],
1174				(req_opcode(req->info.ctrl) == EXPECTED),
1175				req->seqnum));
1176
1177	/* Set ACK request on last packet */
1178	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1179		hdr->bth[2] |= cpu_to_be32(1UL << 31);
1180
1181	/* Set the new offset */
1182	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
1183	/* Expected packets have to fill in the new TID information */
1184	if (req_opcode(req->info.ctrl) == EXPECTED) {
1185		tidval = req->tids[req->tididx];
1186		/*
1187		 * If the offset puts us at the end of the current TID,
1188		 * advance everything.
1189		 */
1190		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1191					 PAGE_SIZE)) {
1192			req->tidoffset = 0;
1193			/*
1194			 * Since we don't copy all the TIDs all at once,
1195			 * we have to check again.
1196			 */
1197			if (++req->tididx > req->n_tids - 1 ||
1198			    !req->tids[req->tididx]) {
1199				return -EINVAL;
1200			}
1201			tidval = req->tids[req->tididx];
1202		}
1203		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
1204			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
1205			KDETH_OM_SMALL_SHIFT;
1206		/* Set KDETH.TIDCtrl based on value for this TID. */
1207		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1208			  EXP_TID_GET(tidval, CTRL));
1209		/* Set KDETH.TID based on value for this TID */
1210		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1211			  EXP_TID_GET(tidval, IDX));
1212		/* Clear KDETH.SH when DISABLE_SH flag is set */
1213		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1214			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1215		/*
1216		 * Set the KDETH.OFFSET and KDETH.OM based on size of
1217		 * transfer.
1218		 */
1219		trace_hfi1_sdma_user_tid_info(
1220			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1221			req->tidoffset, req->tidoffset >> omfactor,
1222			omfactor != KDETH_OM_SMALL_SHIFT);
1223		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1224			  req->tidoffset >> omfactor);
1225		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1226			  omfactor != KDETH_OM_SMALL_SHIFT);
1227	}
1228done:
1229	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1230				    req->info.comp_idx, hdr, tidval);
1231	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1232}
1233
1234static int set_txreq_header_ahg(struct user_sdma_request *req,
1235				struct user_sdma_txreq *tx, u32 datalen)
1236{
1237	u32 ahg[AHG_KDETH_ARRAY_SIZE];
1238	int idx = 0;
1239	u8 omfactor; /* KDETH.OM */
1240	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1241	struct hfi1_pkt_header *hdr = &req->hdr;
1242	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1243	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1244	size_t array_size = ARRAY_SIZE(ahg);
1245
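    	/*
    	 * Build the AHG update list. Each ahg_header_set() call appends one
    	 * update descriptor (header dword index, bit offset, field width,
    	 * new value) for the field named in the comment next to it, which
    	 * the hardware applies to its stored copy of the header.
    	 */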
1246	if (PBC2LRH(pbclen) != lrhlen) {
1247		/* PBC.PbcLengthDWs */
1248		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1249				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1250		if (idx < 0)
1251			return idx;
1252		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1253		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1254				     (__force u16)cpu_to_be16(lrhlen >> 2));
1255		if (idx < 0)
1256			return idx;
1257	}
1258
1259	/*
1260	 * Do the common updates
1261	 */
1262	/* BTH.PSN and BTH.A */
1263	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1264		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1265	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1266		val32 |= 1UL << 31;
1267	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1268			     (__force u16)cpu_to_be16(val32 >> 16));
1269	if (idx < 0)
1270		return idx;
1271	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1272			     (__force u16)cpu_to_be16(val32 & 0xffff));
1273	if (idx < 0)
1274		return idx;
1275	/* KDETH.Offset */
1276	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1277			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
1278	if (idx < 0)
1279		return idx;
1280	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1281			     (__force u16)cpu_to_le16(req->koffset >> 16));
1282	if (idx < 0)
1283		return idx;
1284	if (req_opcode(req->info.ctrl) == EXPECTED) {
1285		__le16 val;
1286
1287		tidval = req->tids[req->tididx];
1288
1289		/*
1290		 * If the offset puts us at the end of the current TID,
1291		 * advance everything.
1292		 */
1293		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1294					 PAGE_SIZE)) {
1295			req->tidoffset = 0;
1296			/*
1297			 * Since we don't copy all the TIDs all at once,
1298			 * we have to check again.
1299			 */
1300			if (++req->tididx > req->n_tids - 1 ||
1301			    !req->tids[req->tididx])
1302				return -EINVAL;
1303			tidval = req->tids[req->tididx];
1304		}
1305		omfactor = ((EXP_TID_GET(tidval, LEN) *
1306				  PAGE_SIZE) >=
1307				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1308				 KDETH_OM_SMALL_SHIFT;
1309		/* KDETH.OM and KDETH.OFFSET (TID) */
1310		idx = ahg_header_set(
1311				ahg, idx, array_size, 7, 0, 16,
1312				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1313				((req->tidoffset >> omfactor)
1314				& 0x7fff)));
1315		if (idx < 0)
1316			return idx;
1317		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1318		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1319				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1320
1321		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1322			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1323						      INTR) <<
1324					    AHG_KDETH_INTR_SHIFT));
1325		} else {
1326			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1327			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1328			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1329						      INTR) <<
1330					     AHG_KDETH_INTR_SHIFT));
1331		}
1332
1333		idx = ahg_header_set(ahg, idx, array_size,
1334				     7, 16, 14, (__force u16)val);
1335		if (idx < 0)
1336			return idx;
1337	}
1338
1339	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1340					req->info.comp_idx, req->sde->this_idx,
1341					req->ahg_idx, ahg, idx, tidval);
1342	sdma_txinit_ahg(&tx->txreq,
1343			SDMA_TXREQ_F_USE_AHG,
1344			datalen, req->ahg_idx, idx,
1345			ahg, sizeof(req->hdr),
1346			user_sdma_txreq_cb);
1347
1348	return idx;
1349}
1350
1351/**
1352 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1353 * @txreq: valid sdma tx request
1354 * @status: success/failure of request
1355 *
1356 * Called when the SDMA progress state machine gets notification that
1357 * the SDMA descriptors for this tx request have been processed by the
1358 * DMA engine. Called in interrupt context.
1359 * Only do work on completed sequences.
1360 */
1361static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1362{
1363	struct user_sdma_txreq *tx =
1364		container_of(txreq, struct user_sdma_txreq, txreq);
1365	struct user_sdma_request *req;
1366	struct hfi1_user_sdma_pkt_q *pq;
1367	struct hfi1_user_sdma_comp_q *cq;
1368	enum hfi1_sdma_comp_state state = COMPLETE;
1369
1370	if (!tx->req)
1371		return;
1372
1373	req = tx->req;
1374	pq = req->pq;
1375	cq = req->cq;
1376
1377	if (status != SDMA_TXREQ_S_OK) {
1378		SDMA_DBG(req, "SDMA completion with error %d",
1379			 status);
1380		WRITE_ONCE(req->has_error, 1);
1381		state = ERROR;
1382	}
1383
1384	req->seqcomp = tx->seqnum;
1385	kmem_cache_free(pq->txreq_cache, tx);
1386
1387	/* If the sequence isn't complete yet, there is nothing more to do */
1388	if (req->seqcomp != req->info.npkts - 1)
1389		return;
1390
1391	user_sdma_free_request(req, false);
1392	set_comp_state(pq, cq, req->info.comp_idx, state, status);
1393	pq_update(pq);
1394}
1395
1396static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1397{
1398	if (atomic_dec_and_test(&pq->n_reqs))
1399		wake_up(&pq->wait);
1400}
1401
1402static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
1403{
1404	int i;
1405
1406	if (!list_empty(&req->txps)) {
1407		struct sdma_txreq *t, *p;
1408
1409		list_for_each_entry_safe(t, p, &req->txps, list) {
1410			struct user_sdma_txreq *tx =
1411				container_of(t, struct user_sdma_txreq, txreq);
1412			list_del_init(&t->list);
1413			sdma_txclean(req->pq->dd, t);
1414			kmem_cache_free(req->pq->txreq_cache, tx);
1415		}
1416	}
1417
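    	/*
    	 * Drop the references on the cached page-pin nodes: either remove
    	 * them from the MMU rb tree (which ends up unpinning the pages via
    	 * the rb-tree remove op) or just decrement their refcount.
    	 */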
1418	for (i = 0; i < req->data_iovs; i++) {
1419		struct sdma_mmu_node *node = req->iovs[i].node;
1420
1421		if (!node)
1422			continue;
1423
1424		req->iovs[i].node = NULL;
1425
1426		if (unpin)
1427			hfi1_mmu_rb_remove(req->pq->handler,
1428					   &node->rb);
1429		else
1430			atomic_dec(&node->refcount);
1431	}
1432
1433	kfree(req->tids);
1434	clear_bit(req->info.comp_idx, req->pq->req_in_use);
1435}
1436
1437static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1438				  struct hfi1_user_sdma_comp_q *cq,
1439				  u16 idx, enum hfi1_sdma_comp_state state,
1440				  int ret)
1441{
1442	if (state == ERROR)
1443		cq->comps[idx].errcode = -ret;
1444	smp_wmb(); /* make sure errcode is visible first */
1445	cq->comps[idx].status = state;
1446	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1447					idx, state, ret);
1448}
1449
1450static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
1451			   unsigned long len)
1452{
1453	return (bool)(node->addr == addr);
1454}
1455
1456static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
1457{
1458	struct sdma_mmu_node *node =
1459		container_of(mnode, struct sdma_mmu_node, rb);
1460
1461	atomic_inc(&node->refcount);
1462	return 0;
1463}
1464
1465/*
1466 * Return 1 to remove the node from the rb tree and call the remove op.
1467 *
1468 * Called with the rb tree lock held.
1469 */
1470static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
1471			 void *evict_arg, bool *stop)
1472{
1473	struct sdma_mmu_node *node =
1474		container_of(mnode, struct sdma_mmu_node, rb);
1475	struct evict_data *evict_data = evict_arg;
1476
1477	/* is this node still being used? */
1478	if (atomic_read(&node->refcount))
1479		return 0; /* keep this node */
1480
1481	/* this node will be evicted, add its pages to our count */
1482	evict_data->cleared += node->npages;
1483
1484	/* have enough pages been cleared? */
1485	if (evict_data->cleared >= evict_data->target)
1486		*stop = true;
1487
1488	return 1; /* remove this node */
1489}
1490
1491static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
1492{
1493	struct sdma_mmu_node *node =
1494		container_of(mnode, struct sdma_mmu_node, rb);
1495
1496	unpin_sdma_pages(node);
1497	kfree(node);
1498}
1499
1500static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
1501{
1502	struct sdma_mmu_node *node =
1503		container_of(mnode, struct sdma_mmu_node, rb);
1504
1505	if (!atomic_read(&node->refcount))
1506		return 1;
1507	return 0;
1508}