v6.8
   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2/*
   3 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
   4 * Copyright(c) 2015 - 2018 Intel Corporation.
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/types.h>
   9#include <linux/device.h>
  10#include <linux/dmapool.h>
  11#include <linux/slab.h>
  12#include <linux/list.h>
  13#include <linux/highmem.h>
  14#include <linux/io.h>
  15#include <linux/uio.h>
  16#include <linux/rbtree.h>
  17#include <linux/spinlock.h>
  18#include <linux/delay.h>
  19#include <linux/kthread.h>
  20#include <linux/mmu_context.h>
  21#include <linux/module.h>
  22#include <linux/vmalloc.h>
  23#include <linux/string.h>
  24
  25#include "hfi.h"
  26#include "sdma.h"
  27#include "user_sdma.h"
  28#include "verbs.h"  /* for the headers */
  29#include "common.h" /* for struct hfi1_tid_info */
  30#include "trace.h"
  31
  32static uint hfi1_sdma_comp_ring_size = 128;
  33module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  34MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
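/*
 * Usage note: since this is a regular module parameter, the completion
 * ring size can be overridden at load time, e.g. "modprobe hfi1
 * sdma_comp_size=256", or "hfi1.sdma_comp_size=256" on the kernel command
 * line (illustrative values).
 */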
  35
  36static unsigned initial_pkt_count = 8;
  37
  38static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  39static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  40static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  41static void user_sdma_free_request(struct user_sdma_request *req);
  42static int check_header_template(struct user_sdma_request *req,
  43				 struct hfi1_pkt_header *hdr, u32 lrhlen,
  44				 u32 datalen);
  45static int set_txreq_header(struct user_sdma_request *req,
  46			    struct user_sdma_txreq *tx, u32 datalen);
  47static int set_txreq_header_ahg(struct user_sdma_request *req,
  48				struct user_sdma_txreq *tx, u32 len);
  49static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  50				  struct hfi1_user_sdma_comp_q *cq,
  51				  u16 idx, enum hfi1_sdma_comp_state state,
  52				  int ret);
  53static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  54static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  55
  56static int defer_packet_queue(
  57	struct sdma_engine *sde,
  58	struct iowait_work *wait,
  59	struct sdma_txreq *txreq,
  60	uint seq,
  61	bool pkts_sent);
  62static void activate_packet_queue(struct iowait *wait, int reason);
  63
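/*
 * iowait "sleep" callback, registered through iowait_init() below.  Called
 * when the SDMA engine cannot accept more descriptors for this queue: if
 * the engine has since made progress, return -EAGAIN so the caller simply
 * retries; otherwise mark the packet queue deferred, park its iowait on
 * the engine's dmawait list, and return -EBUSY.
 */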
  64static int defer_packet_queue(
  65	struct sdma_engine *sde,
  66	struct iowait_work *wait,
  67	struct sdma_txreq *txreq,
  68	uint seq,
  69	bool pkts_sent)
  70{
  71	struct hfi1_user_sdma_pkt_q *pq =
  72		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  73
  74	write_seqlock(&sde->waitlock);
  75	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
  76	if (sdma_progress(sde, seq, txreq))
  77		goto eagain;
  78	/*
  79	 * We are assuming that if the list is enqueued somewhere, it
  80	 * is to the dmawait list since that is the only place where
  81	 * it is supposed to be enqueued.
  82	 */
  83	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
  84	if (list_empty(&pq->busy.list)) {
  85		pq->busy.lock = &sde->waitlock;
  86		iowait_get_priority(&pq->busy);
  87		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
  88	}
  89	write_sequnlock(&sde->waitlock);
  90	return -EBUSY;
  91eagain:
  92	write_sequnlock(&sde->waitlock);
  93	return -EAGAIN;
  94}
  95
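/*
 * iowait "wakeup" callback: the engine can take descriptors again, so mark
 * the queue active and wake any sender blocked in
 * hfi1_user_sdma_process_request().
 */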
  96static void activate_packet_queue(struct iowait *wait, int reason)
  97{
  98	struct hfi1_user_sdma_pkt_q *pq =
  99		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 100
 101	trace_hfi1_usdma_activate(pq, wait, reason);
 102	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 103	wake_up(&wait->wait_dma);
 104};
 105
 106int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 107				struct hfi1_filedata *fd)
 108{
 109	int ret = -ENOMEM;
 110	char buf[64];
 111	struct hfi1_devdata *dd;
 112	struct hfi1_user_sdma_comp_q *cq;
 113	struct hfi1_user_sdma_pkt_q *pq;
 114
 115	if (!uctxt || !fd)
 116		return -EBADF;
 117
 118	if (!hfi1_sdma_comp_ring_size)
 119		return -EINVAL;
 120
 121	dd = uctxt->dd;
 122
 123	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 124	if (!pq)
 125		return -ENOMEM;
 126	pq->dd = dd;
 127	pq->ctxt = uctxt->ctxt;
 128	pq->subctxt = fd->subctxt;
 129	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 130	atomic_set(&pq->n_reqs, 0);
 131	init_waitqueue_head(&pq->wait);
 132	atomic_set(&pq->n_locked, 0);
 133
 134	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 135		    activate_packet_queue, NULL, NULL);
 136	pq->reqidx = 0;
 137
 138	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 139			   sizeof(*pq->reqs),
 140			   GFP_KERNEL);
 141	if (!pq->reqs)
 142		goto pq_reqs_nomem;
 143
 144	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
 145	if (!pq->req_in_use)
 146		goto pq_reqs_no_in_use;
 147
 148	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 149		 fd->subctxt);
 150	pq->txreq_cache = kmem_cache_create(buf,
 151					    sizeof(struct user_sdma_txreq),
 152					    L1_CACHE_BYTES,
 153					    SLAB_HWCACHE_ALIGN,
 154					    NULL);
 155	if (!pq->txreq_cache) {
 156		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 157			   uctxt->ctxt);
 158		goto pq_txreq_nomem;
 159	}
 160
 161	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 162	if (!cq)
 163		goto cq_nomem;
 164
 165	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 166				 * hfi1_sdma_comp_ring_size));
 167	if (!cq->comps)
 168		goto cq_comps_nomem;
 169
 170	cq->nentries = hfi1_sdma_comp_ring_size;
 171
 172	ret = hfi1_init_system_pinning(pq);
 173	if (ret)
 174		goto pq_mmu_fail;
 175
 176	rcu_assign_pointer(fd->pq, pq);
 177	fd->cq = cq;
 178
 179	return 0;
 180
 181pq_mmu_fail:
 182	vfree(cq->comps);
 183cq_comps_nomem:
 184	kfree(cq);
 185cq_nomem:
 186	kmem_cache_destroy(pq->txreq_cache);
 187pq_txreq_nomem:
 188	bitmap_free(pq->req_in_use);
 189pq_reqs_no_in_use:
 190	kfree(pq->reqs);
 191pq_reqs_nomem:
 192	kfree(pq);
 193
 194	return ret;
 195}
 196
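/*
 * Remove the packet queue's iowait from whichever engine wait list it is
 * parked on (under that engine's waitlock), so the queue can be torn down
 * or retried without leaving a stale busy entry behind.
 */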
 197static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
 198{
 199	unsigned long flags;
 200	seqlock_t *lock = pq->busy.lock;
 201
 202	if (!lock)
 203		return;
 204	write_seqlock_irqsave(lock, flags);
 205	if (!list_empty(&pq->busy.list)) {
 206		list_del_init(&pq->busy.list);
 207		pq->busy.lock = NULL;
 208	}
 209	write_sequnlock_irqrestore(lock, flags);
 210}
 211
 212int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 213			       struct hfi1_ctxtdata *uctxt)
 214{
 215	struct hfi1_user_sdma_pkt_q *pq;
 216
 217	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 218
 219	spin_lock(&fd->pq_rcu_lock);
 220	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
 221				    lockdep_is_held(&fd->pq_rcu_lock));
 222	if (pq) {
 223		rcu_assign_pointer(fd->pq, NULL);
 224		spin_unlock(&fd->pq_rcu_lock);
 225		synchronize_srcu(&fd->pq_srcu);
 226		/* at this point there can be no more new requests */
 227		iowait_sdma_drain(&pq->busy);
 228		/* Wait until all requests have been freed. */
 229		wait_event_interruptible(
 230			pq->wait,
 231			!atomic_read(&pq->n_reqs));
 232		kfree(pq->reqs);
 233		hfi1_free_system_pinning(pq);
 234		bitmap_free(pq->req_in_use);
 235		kmem_cache_destroy(pq->txreq_cache);
 236		flush_pq_iowait(pq);
 237		kfree(pq);
 238	} else {
 239		spin_unlock(&fd->pq_rcu_lock);
 240	}
 241	if (fd->cq) {
 242		vfree(fd->cq->comps);
 243		kfree(fd->cq);
 244		fd->cq = NULL;
 245	}
 246	return 0;
 247}
 248
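/*
 * Fold a destination LID into a small selector used to spread requests
 * across SDMA engines.  The DLID is hashed into one of 256 buckets; the
 * first use of a bucket assigns it the next selector value round-robin.
 */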
 249static u8 dlid_to_selector(u16 dlid)
 250{
 251	static u8 mapping[256];
 252	static int initialized;
 253	static u8 next;
 254	int hash;
 255
 256	if (!initialized) {
 257		memset(mapping, 0xFF, 256);
 258		initialized = 1;
 259	}
 260
 261	hash = ((dlid >> 8) ^ dlid) & 0xFF;
 262	if (mapping[hash] == 0xFF) {
 263		mapping[hash] = next;
 264		next = (next + 1) & 0x7F;
 265	}
 266
 267	return mapping[hash];
 268}
 269
 270/**
 271 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 272 * @fd: valid file descriptor
 273 * @iovec: array of io vectors to process
 274 * @dim: overall iovec array size
 275 * @count: number of io vector array entries processed
 276 */
 277int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 278				   struct iovec *iovec, unsigned long dim,
 279				   unsigned long *count)
 280{
 281	int ret = 0, i;
 282	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 283	struct hfi1_user_sdma_pkt_q *pq =
 284		srcu_dereference(fd->pq, &fd->pq_srcu);
 285	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 286	struct hfi1_devdata *dd = pq->dd;
 287	unsigned long idx = 0;
 288	u8 pcount = initial_pkt_count;
 289	struct sdma_req_info info;
 290	struct user_sdma_request *req;
 291	u8 opcode, sc, vl;
 292	u16 pkey;
 293	u32 slid;
 294	u16 dlid;
 295	u32 selector;
 296
 297	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 298		hfi1_cdbg(
 299		   SDMA,
 300		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 301		   dd->unit, uctxt->ctxt, fd->subctxt,
 302		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 303		return -EINVAL;
 304	}
 305	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 306	if (ret) {
 307		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 308			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 309		return -EFAULT;
 310	}
 311
 312	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 313				     (u16 *)&info);
 314	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
 315		hfi1_cdbg(SDMA,
 316			  "[%u:%u:%u:%u] Invalid comp index",
 317			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 318		return -EINVAL;
 319	}
 320
 321	/*
 322	 * Sanity check the header io vector count.  Need at least 1 vector
 323	 * (header) and cannot be larger than the actual io vector count.
 324	 */
 325	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
 326		hfi1_cdbg(SDMA,
 327			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
 328			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
 329			  req_iovcnt(info.ctrl), dim);
 330		return -EINVAL;
 331	}
 332
 333	if (!info.fragsize) {
 334		hfi1_cdbg(SDMA,
 335			  "[%u:%u:%u:%u] Request does not specify fragsize",
 336			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 337		return -EINVAL;
 338	}
 339
 340	/* Try to claim the request. */
 341	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
 342		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
 343			  dd->unit, uctxt->ctxt, fd->subctxt,
 344			  info.comp_idx);
 345		return -EBADSLT;
 346	}
 347	/*
 348	 * All safety checks have been done and this request has been claimed.
 349	 */
 350	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
 351					     info.comp_idx);
 352	req = pq->reqs + info.comp_idx;
 353	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 354	req->data_len  = 0;
 355	req->pq = pq;
 356	req->cq = cq;
 357	req->ahg_idx = -1;
 358	req->iov_idx = 0;
 359	req->sent = 0;
 360	req->seqnum = 0;
 361	req->seqcomp = 0;
 362	req->seqsubmitted = 0;
 363	req->tids = NULL;
 364	req->has_error = 0;
 365	INIT_LIST_HEAD(&req->txps);
 366
 367	memcpy(&req->info, &info, sizeof(info));
 368
 369	/* The request is initialized, count it */
 370	atomic_inc(&pq->n_reqs);
 371
 372	if (req_opcode(info.ctrl) == EXPECTED) {
 373		/* expected must have a TID info and at least one data vector */
 374		if (req->data_iovs < 2) {
 375			SDMA_DBG(req,
 376				 "Not enough vectors for expected request");
 377			ret = -EINVAL;
 378			goto free_req;
 379		}
 380		req->data_iovs--;
 381	}
 382
 383	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 384		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 385			 MAX_VECTORS_PER_REQ);
 386		ret = -EINVAL;
 387		goto free_req;
 388	}
 389
 390	/* Copy the header from the user buffer */
 391	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 392			     sizeof(req->hdr));
 393	if (ret) {
 394		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 395		ret = -EFAULT;
 396		goto free_req;
 397	}
 398
 399	/* If Static rate control is not enabled, sanitize the header. */
 400	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 401		req->hdr.pbc[2] = 0;
 402
 403	/* Validate the opcode. Do not trust packets from user space blindly. */
 404	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 405	if ((opcode & USER_OPCODE_CHECK_MASK) !=
 406	     USER_OPCODE_CHECK_VAL) {
 407		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 408		ret = -EINVAL;
 409		goto free_req;
 410	}
 411	/*
 412	 * Validate the vl. Do not trust packets from user space blindly.
 413	 * VL comes from PBC, SC comes from LRH, and the VL needs to
 414	 * match the SC look up.
 415	 */
 416	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 417	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 418	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 419	if (vl >= dd->pport->vls_operational ||
 420	    vl != sc_to_vlt(dd, sc)) {
 421		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 422		ret = -EINVAL;
 423		goto free_req;
 424	}
 425
 426	/* Checking P_KEY for requests from user-space */
 427	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
 428	slid = be16_to_cpu(req->hdr.lrh[3]);
 429	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
 430		ret = -EINVAL;
 431		goto free_req;
 432	}
 433
 434	/*
 435	 * Also should check the BTH.lnh. If it says the next header is GRH then
 436	 * the RXE parsing will be off and will land in the middle of the KDETH
 437	 * or miss it entirely.
 438	 */
 439	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 440		SDMA_DBG(req, "User tried to pass in a GRH");
 441		ret = -EINVAL;
 442		goto free_req;
 443	}
 444
 445	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 446	/*
 447	 * Calculate the initial TID offset based on the values of
 448	 * KDETH.OFFSET and KDETH.OM that are passed in.
 449	 */
 450	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 451		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 452		 KDETH_OM_LARGE : KDETH_OM_SMALL);
 453	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
 454					       info.comp_idx, req->tidoffset);
 455	idx++;
 456
 457	/* Save all the IO vector structures */
 458	for (i = 0; i < req->data_iovs; i++) {
 459		req->iovs[i].offset = 0;
 460		INIT_LIST_HEAD(&req->iovs[i].list);
 461		memcpy(&req->iovs[i].iov,
 462		       iovec + idx++,
 463		       sizeof(req->iovs[i].iov));
 464		if (req->iovs[i].iov.iov_len == 0) {
 465			ret = -EINVAL;
 466			goto free_req;
 467		}
 468		req->data_len += req->iovs[i].iov.iov_len;
 469	}
 470	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
 471					 info.comp_idx, req->data_len);
 472	if (pcount > req->info.npkts)
 473		pcount = req->info.npkts;
 474	/*
 475	 * Copy any TID info
 476	 * User space will provide the TID info only when the
 477	 * request type is EXPECTED. This is true even if there is
 478	 * only one packet in the request and the header is already
 479	 * setup. The reason for the singular TID case is that the
 480	 * driver needs to perform safety checks.
 481	 */
 482	if (req_opcode(req->info.ctrl) == EXPECTED) {
 483		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 484		u32 *tmp;
 485
 486		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 487			ret = -EINVAL;
 488			goto free_req;
 489		}
 490
 491		/*
 492		 * We have to copy all of the tids because they may vary
 493		 * in size and, therefore, the TID count might not be
 494		 * equal to the pkt count. However, there is no way to
 495		 * tell at this point.
 496		 */
 497		tmp = memdup_array_user(iovec[idx].iov_base,
 498					ntids, sizeof(*req->tids));
 499		if (IS_ERR(tmp)) {
 500			ret = PTR_ERR(tmp);
 501			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 502				 ntids, ret);
 503			goto free_req;
 504		}
 505		req->tids = tmp;
 506		req->n_tids = ntids;
 507		req->tididx = 0;
 508		idx++;
 509	}
 510
 511	dlid = be16_to_cpu(req->hdr.lrh[1]);
 512	selector = dlid_to_selector(dlid);
 513	selector += uctxt->ctxt + fd->subctxt;
 514	req->sde = sdma_select_user_engine(dd, selector, vl);
 515
 516	if (!req->sde || !sdma_running(req->sde)) {
 517		ret = -ECOMM;
 518		goto free_req;
 519	}
 520
 521	/* We don't need an AHG entry if the request contains only one packet */
 522	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
 523		req->ahg_idx = sdma_ahg_alloc(req->sde);
 524
 525	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 526	pq->state = SDMA_PKT_Q_ACTIVE;
 527
 528	/*
 529	 * This is a somewhat blocking send implementation.
 530	 * The driver will block the caller until all packets of the
 531	 * request have been submitted to the SDMA engine. However, it
 532	 * will not wait for send completions.
 533	 */
 534	while (req->seqsubmitted != req->info.npkts) {
 535		ret = user_sdma_send_pkts(req, pcount);
 536		if (ret < 0) {
 537			int we_ret;
 538
 539			if (ret != -EBUSY)
 540				goto free_req;
 541			we_ret = wait_event_interruptible_timeout(
 542				pq->busy.wait_dma,
 543				pq->state == SDMA_PKT_Q_ACTIVE,
 544				msecs_to_jiffies(
 545					SDMA_IOWAIT_TIMEOUT));
 546			trace_hfi1_usdma_we(pq, we_ret);
 547			if (we_ret <= 0)
 548				flush_pq_iowait(pq);
 549		}
 550	}
 551	*count += idx;
 552	return 0;
 553free_req:
 554	/*
  555	 * If seqsubmitted == npkts, the completion routine
  556	 * controls the final state.  If seqsubmitted < npkts, wait for any
 557	 * outstanding packets to finish before cleaning up.
 558	 */
 559	if (req->seqsubmitted < req->info.npkts) {
 560		if (req->seqsubmitted)
 561			wait_event(pq->busy.wait_dma,
 562				   (req->seqcomp == req->seqsubmitted - 1));
 563		user_sdma_free_request(req);
 564		pq_update(pq);
 565		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
 566	}
 567	return ret;
 568}
 569
 570static inline u32 compute_data_length(struct user_sdma_request *req,
 571				      struct user_sdma_txreq *tx)
 572{
 573	/*
 574	 * Determine the proper size of the packet data.
 575	 * The size of the data of the first packet is in the header
 576	 * template. However, it includes the header and ICRC, which need
 577	 * to be subtracted.
  578	 * The minimum representable packet data length in a header is 4 bytes;
  579	 * therefore, when the requested data length is less than 4 bytes, there
  580	 * is only one packet and its data length equals the requested data
  581	 * length.
 582	 * The size of the remaining packets is the minimum of the frag
 583	 * size (MTU) or remaining data in the request.
 584	 */
 585	u32 len;
 586
 587	if (!req->seqnum) {
 588		if (req->data_len < sizeof(u32))
 589			len = req->data_len;
 590		else
 591			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 592			       (sizeof(tx->hdr) - 4));
 593	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
 594		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 595			PAGE_SIZE;
 596		/*
 597		 * Get the data length based on the remaining space in the
 598		 * TID pair.
 599		 */
 600		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 601		/* If we've filled up the TID pair, move to the next one. */
 602		if (unlikely(!len) && ++req->tididx < req->n_tids &&
 603		    req->tids[req->tididx]) {
 604			tidlen = EXP_TID_GET(req->tids[req->tididx],
 605					     LEN) * PAGE_SIZE;
 606			req->tidoffset = 0;
 607			len = min_t(u32, tidlen, req->info.fragsize);
 608		}
 609		/*
 610		 * Since the TID pairs map entire pages, make sure that we
  611		 * are not going to try to send more data than we have
 612		 * remaining.
 613		 */
 614		len = min(len, req->data_len - req->sent);
 615	} else {
 616		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 617	}
 618	trace_hfi1_sdma_user_compute_length(req->pq->dd,
 619					    req->pq->ctxt,
 620					    req->pq->subctxt,
 621					    req->info.comp_idx,
 622					    len);
 623	return len;
 624}
 625
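/* Round a payload length up to the next 4-byte boundary, e.g. pad_len(5) == 8. */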
 626static inline u32 pad_len(u32 len)
 627{
 628	if (len & (sizeof(u32) - 1))
 629		len += sizeof(u32) - (len & (sizeof(u32) - 1));
 630	return len;
 631}
 632
 633static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 634{
 635	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
 636	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 637}
 638
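/*
 * Build the first packet of an AHG-capable request: copy the request
 * header into the txreq (adjusting the PBC length if it does not match the
 * LRH length), validate it against the header template rules, and
 * initialize the txreq in AHG "copy" mode so that later packets only need
 * per-field header updates (see set_txreq_header_ahg()).
 */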
 639static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 640			       struct user_sdma_txreq *tx,
 641			       u32 datalen)
 642{
 643	int ret;
 644	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 645	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 646	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 647
 648	/*
 649	 * Copy the request header into the tx header
 650	 * because the HW needs a cacheline-aligned
 651	 * address.
 652	 * This copy can be optimized out if the hdr
 653	 * member of user_sdma_request were also
 654	 * cacheline aligned.
 655	 */
 656	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 657	if (PBC2LRH(pbclen) != lrhlen) {
 658		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 659		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 660	}
 661	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 662	if (ret)
 663		return ret;
 664	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 665			      sizeof(tx->hdr) + datalen, req->ahg_idx,
 666			      0, NULL, 0, user_sdma_txreq_cb);
 667	if (ret)
 668		return ret;
 669	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 670	if (ret)
 671		sdma_txclean(pq->dd, &tx->txreq);
 672	return ret;
 673}
 674
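/*
 * Build and submit up to @maxpkts packets for @req: allocate a txreq per
 * packet, set up its header (directly, or via AHG updates for packets
 * after the first), attach the user payload pages, and hand the whole list
 * to the SDMA engine with sdma_send_txlist().  A full descriptor ring
 * surfaces as -EBUSY, which the caller handles by sleeping until the
 * packet queue is reactivated.
 */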
 675static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 676{
 677	int ret = 0;
 678	u16 count;
 679	unsigned npkts = 0;
 680	struct user_sdma_txreq *tx = NULL;
 681	struct hfi1_user_sdma_pkt_q *pq = NULL;
 682	struct user_sdma_iovec *iovec = NULL;
 683
 684	if (!req->pq)
 685		return -EINVAL;
 686
 687	pq = req->pq;
 688
 689	/* If tx completion has reported an error, we are done. */
 690	if (READ_ONCE(req->has_error))
 691		return -EFAULT;
 692
 693	/*
 694	 * Check if we might have sent the entire request already
 695	 */
 696	if (unlikely(req->seqnum == req->info.npkts)) {
 697		if (!list_empty(&req->txps))
 698			goto dosend;
 699		return ret;
 700	}
 701
 702	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 703		maxpkts = req->info.npkts - req->seqnum;
 704
 705	while (npkts < maxpkts) {
 706		u32 datalen = 0;
 707
 708		/*
 709		 * Check whether any of the completions have come back
 710		 * with errors. If so, we are not going to process any
 711		 * more packets from this request.
 712		 */
 713		if (READ_ONCE(req->has_error))
 714			return -EFAULT;
 715
 716		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 717		if (!tx)
 718			return -ENOMEM;
 719
 720		tx->flags = 0;
 721		tx->req = req;
 722		INIT_LIST_HEAD(&tx->list);
 723
 724		/*
 725		 * For the last packet set the ACK request
 726		 * and disable header suppression.
 727		 */
 728		if (req->seqnum == req->info.npkts - 1)
 729			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 730				      TXREQ_FLAGS_REQ_DISABLE_SH);
 731
 732		/*
 733		 * Calculate the payload size - this is min of the fragment
 734		 * (MTU) size or the remaining bytes in the request but only
 735		 * if we have payload data.
 736		 */
 737		if (req->data_len) {
 738			iovec = &req->iovs[req->iov_idx];
 739			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 740				if (++req->iov_idx == req->data_iovs) {
 741					ret = -EFAULT;
 742					goto free_tx;
 743				}
 744				iovec = &req->iovs[req->iov_idx];
 745				WARN_ON(iovec->offset);
 746			}
 747
 748			datalen = compute_data_length(req, tx);
 749
 750			/*
 751			 * Disable header suppression for the payload <= 8DWS.
 752			 * If there is an uncorrectable error in the receive
 753			 * data FIFO when the received payload size is less than
 754			 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is
  755			 * not reported.  Instead, RHF.EccErr is set if the header
 756			 * is not suppressed.
 757			 */
 758			if (!datalen) {
 759				SDMA_DBG(req,
 760					 "Request has data but pkt len is 0");
 761				ret = -EFAULT;
 762				goto free_tx;
 763			} else if (datalen <= 32) {
 764				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 765			}
 766		}
 767
 768		if (req->ahg_idx >= 0) {
 769			if (!req->seqnum) {
 770				ret = user_sdma_txadd_ahg(req, tx, datalen);
 771				if (ret)
 772					goto free_tx;
 773			} else {
 774				int changes;
 775
 776				changes = set_txreq_header_ahg(req, tx,
 777							       datalen);
 778				if (changes < 0) {
 779					ret = changes;
 780					goto free_tx;
 781				}
 782			}
 783		} else {
 784			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 785					  datalen, user_sdma_txreq_cb);
 786			if (ret)
 787				goto free_tx;
 788			/*
 789			 * Modify the header for this packet. This only needs
 790			 * to be done if we are not going to use AHG. Otherwise,
 791			 * the HW will do it based on the changes we gave it
 792			 * during sdma_txinit_ahg().
 793			 */
 794			ret = set_txreq_header(req, tx, datalen);
 795			if (ret)
 796				goto free_txreq;
 797		}
 798
 799		req->koffset += datalen;
 800		if (req_opcode(req->info.ctrl) == EXPECTED)
 801			req->tidoffset += datalen;
 802		req->sent += datalen;
 803		while (datalen) {
 804			ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
 805							    &datalen);
 806			if (ret)
 807				goto free_txreq;
 808			iovec = &req->iovs[req->iov_idx];
 809		}
 810		list_add_tail(&tx->txreq.list, &req->txps);
 811		/*
 812		 * It is important to increment this here as it is used to
 813		 * generate the BTH.PSN and, therefore, can't be bulk-updated
 814		 * outside of the loop.
 815		 */
 816		tx->seqnum = req->seqnum++;
 817		npkts++;
 818	}
 819dosend:
 820	ret = sdma_send_txlist(req->sde,
 821			       iowait_get_ib_work(&pq->busy),
 822			       &req->txps, &count);
 823	req->seqsubmitted += count;
 824	if (req->seqsubmitted == req->info.npkts) {
 825		/*
 826		 * The txreq has already been submitted to the HW queue
 827		 * so we can free the AHG entry now. Corruption will not
 828		 * happen due to the sequential manner in which
 829		 * descriptors are processed.
 830		 */
 831		if (req->ahg_idx >= 0)
 832			sdma_ahg_free(req->sde, req->ahg_idx);
 833	}
 834	return ret;
 835
 836free_txreq:
 837	sdma_txclean(pq->dd, &tx->txreq);
 838free_tx:
 839	kmem_cache_free(pq->txreq_cache, tx);
 840	return ret;
 841}
 842
 843static int check_header_template(struct user_sdma_request *req,
 844				 struct hfi1_pkt_header *hdr, u32 lrhlen,
 845				 u32 datalen)
 846{
 847	/*
 848	 * Perform safety checks for any type of packet:
 849	 *    - transfer size is multiple of 64bytes
 850	 *    - packet length is multiple of 4 bytes
 851	 *    - packet length is not larger than MTU size
 852	 *
 853	 * These checks are only done for the first packet of the
 854	 * transfer since the header is "given" to us by user space.
 855	 * For the remainder of the packets we compute the values.
 856	 */
 857	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
 858	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
 859		return -EINVAL;
 860
 861	if (req_opcode(req->info.ctrl) == EXPECTED) {
 862		/*
 863		 * The header is checked only on the first packet. Furthermore,
 864		 * we ensure that at least one TID entry is copied when the
 865		 * request is submitted. Therefore, we don't have to verify that
 866		 * tididx points to something sane.
 867		 */
 868		u32 tidval = req->tids[req->tididx],
 869			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
 870			tididx = EXP_TID_GET(tidval, IDX),
 871			tidctrl = EXP_TID_GET(tidval, CTRL),
 872			tidoff;
 873		__le32 kval = hdr->kdeth.ver_tid_offset;
 874
 875		tidoff = KDETH_GET(kval, OFFSET) *
 876			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 877			   KDETH_OM_LARGE : KDETH_OM_SMALL);
 878		/*
 879		 * Expected receive packets have the following
 880		 * additional checks:
 881		 *     - offset is not larger than the TID size
 882		 *     - TIDCtrl values match between header and TID array
 883		 *     - TID indexes match between header and TID array
 884		 */
 885		if ((tidoff + datalen > tidlen) ||
 886		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
 887		    KDETH_GET(kval, TID) != tididx)
 888			return -EINVAL;
 889	}
 890	return 0;
 891}
 892
 893/*
 894 * Correctly set the BTH.PSN field based on type of
 895 * transfer - eager packets can just increment the PSN but
 896 * expected packets encode generation and sequence in the
 897 * BTH.PSN field so just incrementing will result in errors.
 898 */
 899static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
 900{
 901	u32 val = be32_to_cpu(bthpsn),
 902		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
 903			0xffffffull),
 904		psn = val & mask;
 905	if (expct)
 906		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
 907			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
 908	else
 909		psn = psn + frags;
 910	return psn & mask;
 911}
 912
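/*
 * Build the complete header for a non-AHG packet: start from the request's
 * header template, fix up the PBC/LRH lengths if needed, advance BTH.PSN
 * and KDETH.Offset, refresh the TID fields for expected receives, and
 * attach the header to the txreq.
 */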
 913static int set_txreq_header(struct user_sdma_request *req,
 914			    struct user_sdma_txreq *tx, u32 datalen)
 915{
 916	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 917	struct hfi1_pkt_header *hdr = &tx->hdr;
 918	u8 omfactor; /* KDETH.OM */
 919	u16 pbclen;
 920	int ret;
 921	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
 922
 923	/* Copy the header template to the request before modification */
 924	memcpy(hdr, &req->hdr, sizeof(*hdr));
 925
 926	/*
 927	 * Check if the PBC and LRH length are mismatched. If so
 928	 * adjust both in the header.
 929	 */
 930	pbclen = le16_to_cpu(hdr->pbc[0]);
 931	if (PBC2LRH(pbclen) != lrhlen) {
 932		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 933		hdr->pbc[0] = cpu_to_le16(pbclen);
 934		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
 935		/*
 936		 * Third packet
 937		 * This is the first packet in the sequence that has
 938		 * a "static" size that can be used for the rest of
 939		 * the packets (besides the last one).
 940		 */
 941		if (unlikely(req->seqnum == 2)) {
 942			/*
 943			 * From this point on the lengths in both the
 944			 * PBC and LRH are the same until the last
 945			 * packet.
 946			 * Adjust the template so we don't have to update
 947			 * every packet
 948			 */
 949			req->hdr.pbc[0] = hdr->pbc[0];
 950			req->hdr.lrh[2] = hdr->lrh[2];
 951		}
 952	}
 953	/*
 954	 * We only have to modify the header if this is not the
 955	 * first packet in the request. Otherwise, we use the
 956	 * header given to us.
 957	 */
 958	if (unlikely(!req->seqnum)) {
 959		ret = check_header_template(req, hdr, lrhlen, datalen);
 960		if (ret)
 961			return ret;
 962		goto done;
 963	}
 964
 965	hdr->bth[2] = cpu_to_be32(
 966		set_pkt_bth_psn(hdr->bth[2],
 967				(req_opcode(req->info.ctrl) == EXPECTED),
 968				req->seqnum));
 969
 970	/* Set ACK request on last packet */
 971	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 972		hdr->bth[2] |= cpu_to_be32(1UL << 31);
 973
 974	/* Set the new offset */
 975	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
 976	/* Expected packets have to fill in the new TID information */
 977	if (req_opcode(req->info.ctrl) == EXPECTED) {
 978		tidval = req->tids[req->tididx];
 979		/*
 980		 * If the offset puts us at the end of the current TID,
 981		 * advance everything.
 982		 */
 983		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
 984					 PAGE_SIZE)) {
 985			req->tidoffset = 0;
 986			/*
  987			 * Since we don't copy all the TIDs all at once,
 988			 * we have to check again.
 989			 */
 990			if (++req->tididx > req->n_tids - 1 ||
 991			    !req->tids[req->tididx]) {
 992				return -EINVAL;
 993			}
 994			tidval = req->tids[req->tididx];
 995		}
 996		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
 997			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
 998			KDETH_OM_SMALL_SHIFT;
 999		/* Set KDETH.TIDCtrl based on value for this TID. */
1000		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1001			  EXP_TID_GET(tidval, CTRL));
1002		/* Set KDETH.TID based on value for this TID */
1003		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1004			  EXP_TID_GET(tidval, IDX));
1005		/* Clear KDETH.SH when DISABLE_SH flag is set */
1006		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1007			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1008		/*
1009		 * Set the KDETH.OFFSET and KDETH.OM based on size of
1010		 * transfer.
1011		 */
1012		trace_hfi1_sdma_user_tid_info(
1013			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1014			req->tidoffset, req->tidoffset >> omfactor,
1015			omfactor != KDETH_OM_SMALL_SHIFT);
1016		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1017			  req->tidoffset >> omfactor);
1018		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1019			  omfactor != KDETH_OM_SMALL_SHIFT);
1020	}
1021done:
1022	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1023				    req->info.comp_idx, hdr, tidval);
1024	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1025}
1026
1027static int set_txreq_header_ahg(struct user_sdma_request *req,
1028				struct user_sdma_txreq *tx, u32 datalen)
1029{
1030	u32 ahg[AHG_KDETH_ARRAY_SIZE];
1031	int idx = 0;
1032	u8 omfactor; /* KDETH.OM */
1033	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1034	struct hfi1_pkt_header *hdr = &req->hdr;
1035	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1036	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1037	size_t array_size = ARRAY_SIZE(ahg);
1038
1039	if (PBC2LRH(pbclen) != lrhlen) {
1040		/* PBC.PbcLengthDWs */
1041		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1042				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1043		if (idx < 0)
1044			return idx;
1045		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1046		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1047				     (__force u16)cpu_to_be16(lrhlen >> 2));
1048		if (idx < 0)
1049			return idx;
1050	}
1051
1052	/*
1053	 * Do the common updates
1054	 */
1055	/* BTH.PSN and BTH.A */
1056	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1057		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1058	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1059		val32 |= 1UL << 31;
1060	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1061			     (__force u16)cpu_to_be16(val32 >> 16));
1062	if (idx < 0)
1063		return idx;
1064	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1065			     (__force u16)cpu_to_be16(val32 & 0xffff));
1066	if (idx < 0)
1067		return idx;
1068	/* KDETH.Offset */
1069	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1070			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
1071	if (idx < 0)
1072		return idx;
1073	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1074			     (__force u16)cpu_to_le16(req->koffset >> 16));
1075	if (idx < 0)
1076		return idx;
1077	if (req_opcode(req->info.ctrl) == EXPECTED) {
1078		__le16 val;
1079
1080		tidval = req->tids[req->tididx];
1081
1082		/*
1083		 * If the offset puts us at the end of the current TID,
1084		 * advance everything.
1085		 */
1086		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1087					 PAGE_SIZE)) {
1088			req->tidoffset = 0;
1089			/*
 1090			 * Since we don't copy all the TIDs all at once,
1091			 * we have to check again.
1092			 */
1093			if (++req->tididx > req->n_tids - 1 ||
1094			    !req->tids[req->tididx])
1095				return -EINVAL;
1096			tidval = req->tids[req->tididx];
1097		}
1098		omfactor = ((EXP_TID_GET(tidval, LEN) *
1099				  PAGE_SIZE) >=
1100				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1101				 KDETH_OM_SMALL_SHIFT;
1102		/* KDETH.OM and KDETH.OFFSET (TID) */
1103		idx = ahg_header_set(
1104				ahg, idx, array_size, 7, 0, 16,
1105				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1106				((req->tidoffset >> omfactor)
1107				& 0x7fff)));
1108		if (idx < 0)
1109			return idx;
1110		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1111		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1112				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1113
1114		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1115			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1116						      INTR) <<
1117					    AHG_KDETH_INTR_SHIFT));
1118		} else {
1119			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1120			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1121			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1122						      INTR) <<
1123					     AHG_KDETH_INTR_SHIFT));
1124		}
1125
1126		idx = ahg_header_set(ahg, idx, array_size,
1127				     7, 16, 14, (__force u16)val);
1128		if (idx < 0)
1129			return idx;
1130	}
1131
1132	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1133					req->info.comp_idx, req->sde->this_idx,
1134					req->ahg_idx, ahg, idx, tidval);
1135	sdma_txinit_ahg(&tx->txreq,
1136			SDMA_TXREQ_F_USE_AHG,
1137			datalen, req->ahg_idx, idx,
1138			ahg, sizeof(req->hdr),
1139			user_sdma_txreq_cb);
1140
1141	return idx;
1142}
1143
1144/**
1145 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1146 * @txreq: valid sdma tx request
1147 * @status: success/failure of request
1148 *
1149 * Called when the SDMA progress state machine gets notification that
1150 * the SDMA descriptors for this tx request have been processed by the
1151 * DMA engine. Called in interrupt context.
1152 * Only do work on completed sequences.
1153 */
1154static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1155{
1156	struct user_sdma_txreq *tx =
1157		container_of(txreq, struct user_sdma_txreq, txreq);
1158	struct user_sdma_request *req;
1159	struct hfi1_user_sdma_pkt_q *pq;
1160	struct hfi1_user_sdma_comp_q *cq;
1161	enum hfi1_sdma_comp_state state = COMPLETE;
1162
1163	if (!tx->req)
1164		return;
1165
1166	req = tx->req;
1167	pq = req->pq;
1168	cq = req->cq;
1169
1170	if (status != SDMA_TXREQ_S_OK) {
1171		SDMA_DBG(req, "SDMA completion with error %d",
1172			 status);
1173		WRITE_ONCE(req->has_error, 1);
1174		state = ERROR;
1175	}
1176
1177	req->seqcomp = tx->seqnum;
1178	kmem_cache_free(pq->txreq_cache, tx);
1179
1180	/* sequence isn't complete?  We are done */
1181	if (req->seqcomp != req->info.npkts - 1)
1182		return;
1183
1184	user_sdma_free_request(req);
1185	set_comp_state(pq, cq, req->info.comp_idx, state, status);
1186	pq_update(pq);
1187}
1188
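/*
 * Drop the outstanding-request count; the last completion wakes anyone
 * blocked in hfi1_user_sdma_free_queues() waiting for the queue to drain.
 */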
1189static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1190{
1191	if (atomic_dec_and_test(&pq->n_reqs))
1192		wake_up(&pq->wait);
1193}
1194
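/*
 * Release everything still attached to a request: clean and free any
 * txreqs left on its list, free the copied TID array, and return the
 * completion-ring slot by clearing its in-use bit.
 */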
1195static void user_sdma_free_request(struct user_sdma_request *req)
1196{
1197	if (!list_empty(&req->txps)) {
1198		struct sdma_txreq *t, *p;
1199
1200		list_for_each_entry_safe(t, p, &req->txps, list) {
1201			struct user_sdma_txreq *tx =
1202				container_of(t, struct user_sdma_txreq, txreq);
1203			list_del_init(&t->list);
1204			sdma_txclean(req->pq->dd, t);
1205			kmem_cache_free(req->pq->txreq_cache, tx);
1206		}
1207	}
1208
1209	kfree(req->tids);
1210	clear_bit(req->info.comp_idx, req->pq->req_in_use);
1211}
1212
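/*
 * Publish the final state of a request in the user-visible completion
 * ring.  The error code must be visible before the status word, hence the
 * smp_wmb() between the two stores.
 */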
1213static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1214				  struct hfi1_user_sdma_comp_q *cq,
1215				  u16 idx, enum hfi1_sdma_comp_state state,
1216				  int ret)
1217{
1218	if (state == ERROR)
1219		cq->comps[idx].errcode = -ret;
1220	smp_wmb(); /* make sure errcode is visible first */
1221	cq->comps[idx].status = state;
1222	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1223					idx, state, ret);
1224}
v5.9
 
   1/*
 
   2 * Copyright(c) 2015 - 2018 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
 
  47#include <linux/mm.h>
  48#include <linux/types.h>
  49#include <linux/device.h>
  50#include <linux/dmapool.h>
  51#include <linux/slab.h>
  52#include <linux/list.h>
  53#include <linux/highmem.h>
  54#include <linux/io.h>
  55#include <linux/uio.h>
  56#include <linux/rbtree.h>
  57#include <linux/spinlock.h>
  58#include <linux/delay.h>
  59#include <linux/kthread.h>
  60#include <linux/mmu_context.h>
  61#include <linux/module.h>
  62#include <linux/vmalloc.h>
  63#include <linux/string.h>
  64
  65#include "hfi.h"
  66#include "sdma.h"
  67#include "mmu_rb.h"
  68#include "user_sdma.h"
  69#include "verbs.h"  /* for the headers */
  70#include "common.h" /* for struct hfi1_tid_info */
  71#include "trace.h"
  72
  73static uint hfi1_sdma_comp_ring_size = 128;
  74module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  75MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  76
  77static unsigned initial_pkt_count = 8;
  78
  79static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  80static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  81static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  82static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
  83static int pin_vector_pages(struct user_sdma_request *req,
  84			    struct user_sdma_iovec *iovec);
  85static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
  86			       unsigned start, unsigned npages);
  87static int check_header_template(struct user_sdma_request *req,
  88				 struct hfi1_pkt_header *hdr, u32 lrhlen,
  89				 u32 datalen);
  90static int set_txreq_header(struct user_sdma_request *req,
  91			    struct user_sdma_txreq *tx, u32 datalen);
  92static int set_txreq_header_ahg(struct user_sdma_request *req,
  93				struct user_sdma_txreq *tx, u32 len);
  94static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  95				  struct hfi1_user_sdma_comp_q *cq,
  96				  u16 idx, enum hfi1_sdma_comp_state state,
  97				  int ret);
  98static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  99static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
 100
 101static int defer_packet_queue(
 102	struct sdma_engine *sde,
 103	struct iowait_work *wait,
 104	struct sdma_txreq *txreq,
 105	uint seq,
 106	bool pkts_sent);
 107static void activate_packet_queue(struct iowait *wait, int reason);
 108static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
 109			   unsigned long len);
 110static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
 111static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
 112			 void *arg2, bool *stop);
 113static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
 114static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
 115
 116static struct mmu_rb_ops sdma_rb_ops = {
 117	.filter = sdma_rb_filter,
 118	.insert = sdma_rb_insert,
 119	.evict = sdma_rb_evict,
 120	.remove = sdma_rb_remove,
 121	.invalidate = sdma_rb_invalidate
 122};
 123
 124static int defer_packet_queue(
 125	struct sdma_engine *sde,
 126	struct iowait_work *wait,
 127	struct sdma_txreq *txreq,
 128	uint seq,
 129	bool pkts_sent)
 130{
 131	struct hfi1_user_sdma_pkt_q *pq =
 132		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
 133
 134	write_seqlock(&sde->waitlock);
 
 135	if (sdma_progress(sde, seq, txreq))
 136		goto eagain;
 137	/*
 138	 * We are assuming that if the list is enqueued somewhere, it
 139	 * is to the dmawait list since that is the only place where
 140	 * it is supposed to be enqueued.
 141	 */
 142	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
 143	if (list_empty(&pq->busy.list)) {
 144		pq->busy.lock = &sde->waitlock;
 145		iowait_get_priority(&pq->busy);
 146		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
 147	}
 148	write_sequnlock(&sde->waitlock);
 149	return -EBUSY;
 150eagain:
 151	write_sequnlock(&sde->waitlock);
 152	return -EAGAIN;
 153}
 154
 155static void activate_packet_queue(struct iowait *wait, int reason)
 156{
 157	struct hfi1_user_sdma_pkt_q *pq =
 158		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 159	pq->busy.lock = NULL;
 
 160	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 161	wake_up(&wait->wait_dma);
 162};
 163
 164int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 165				struct hfi1_filedata *fd)
 166{
 167	int ret = -ENOMEM;
 168	char buf[64];
 169	struct hfi1_devdata *dd;
 170	struct hfi1_user_sdma_comp_q *cq;
 171	struct hfi1_user_sdma_pkt_q *pq;
 172
 173	if (!uctxt || !fd)
 174		return -EBADF;
 175
 176	if (!hfi1_sdma_comp_ring_size)
 177		return -EINVAL;
 178
 179	dd = uctxt->dd;
 180
 181	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 182	if (!pq)
 183		return -ENOMEM;
 184	pq->dd = dd;
 185	pq->ctxt = uctxt->ctxt;
 186	pq->subctxt = fd->subctxt;
 187	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 188	atomic_set(&pq->n_reqs, 0);
 189	init_waitqueue_head(&pq->wait);
 190	atomic_set(&pq->n_locked, 0);
 191	pq->mm = fd->mm;
 192
 193	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 194		    activate_packet_queue, NULL, NULL);
 195	pq->reqidx = 0;
 196
 197	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 198			   sizeof(*pq->reqs),
 199			   GFP_KERNEL);
 200	if (!pq->reqs)
 201		goto pq_reqs_nomem;
 202
 203	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
 204				 sizeof(*pq->req_in_use),
 205				 GFP_KERNEL);
 206	if (!pq->req_in_use)
 207		goto pq_reqs_no_in_use;
 208
 209	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 210		 fd->subctxt);
 211	pq->txreq_cache = kmem_cache_create(buf,
 212					    sizeof(struct user_sdma_txreq),
 213					    L1_CACHE_BYTES,
 214					    SLAB_HWCACHE_ALIGN,
 215					    NULL);
 216	if (!pq->txreq_cache) {
 217		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 218			   uctxt->ctxt);
 219		goto pq_txreq_nomem;
 220	}
 221
 222	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 223	if (!cq)
 224		goto cq_nomem;
 225
 226	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 227				 * hfi1_sdma_comp_ring_size));
 228	if (!cq->comps)
 229		goto cq_comps_nomem;
 230
 231	cq->nentries = hfi1_sdma_comp_ring_size;
 232
 233	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
 234				   &pq->handler);
 235	if (ret) {
 236		dd_dev_err(dd, "Failed to register with MMU %d", ret);
 237		goto pq_mmu_fail;
 238	}
 239
 240	rcu_assign_pointer(fd->pq, pq);
 241	fd->cq = cq;
 242
 243	return 0;
 244
 245pq_mmu_fail:
 246	vfree(cq->comps);
 247cq_comps_nomem:
 248	kfree(cq);
 249cq_nomem:
 250	kmem_cache_destroy(pq->txreq_cache);
 251pq_txreq_nomem:
 252	kfree(pq->req_in_use);
 253pq_reqs_no_in_use:
 254	kfree(pq->reqs);
 255pq_reqs_nomem:
 256	kfree(pq);
 257
 258	return ret;
 259}
 260
 261static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
 262{
 263	unsigned long flags;
 264	seqlock_t *lock = pq->busy.lock;
 265
 266	if (!lock)
 267		return;
 268	write_seqlock_irqsave(lock, flags);
 269	if (!list_empty(&pq->busy.list)) {
 270		list_del_init(&pq->busy.list);
 271		pq->busy.lock = NULL;
 272	}
 273	write_sequnlock_irqrestore(lock, flags);
 274}
 275
 276int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 277			       struct hfi1_ctxtdata *uctxt)
 278{
 279	struct hfi1_user_sdma_pkt_q *pq;
 280
 281	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 282
 283	spin_lock(&fd->pq_rcu_lock);
 284	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
 285				    lockdep_is_held(&fd->pq_rcu_lock));
 286	if (pq) {
 287		rcu_assign_pointer(fd->pq, NULL);
 288		spin_unlock(&fd->pq_rcu_lock);
 289		synchronize_srcu(&fd->pq_srcu);
 290		/* at this point there can be no more new requests */
 291		if (pq->handler)
 292			hfi1_mmu_rb_unregister(pq->handler);
 293		iowait_sdma_drain(&pq->busy);
 294		/* Wait until all requests have been freed. */
 295		wait_event_interruptible(
 296			pq->wait,
 297			!atomic_read(&pq->n_reqs));
 298		kfree(pq->reqs);
 299		kfree(pq->req_in_use);
 
 300		kmem_cache_destroy(pq->txreq_cache);
 301		flush_pq_iowait(pq);
 302		kfree(pq);
 303	} else {
 304		spin_unlock(&fd->pq_rcu_lock);
 305	}
 306	if (fd->cq) {
 307		vfree(fd->cq->comps);
 308		kfree(fd->cq);
 309		fd->cq = NULL;
 310	}
 311	return 0;
 312}
 313
 314static u8 dlid_to_selector(u16 dlid)
 315{
 316	static u8 mapping[256];
 317	static int initialized;
 318	static u8 next;
 319	int hash;
 320
 321	if (!initialized) {
 322		memset(mapping, 0xFF, 256);
 323		initialized = 1;
 324	}
 325
 326	hash = ((dlid >> 8) ^ dlid) & 0xFF;
 327	if (mapping[hash] == 0xFF) {
 328		mapping[hash] = next;
 329		next = (next + 1) & 0x7F;
 330	}
 331
 332	return mapping[hash];
 333}
 334
 335/**
 336 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 337 * @fd: valid file descriptor
 338 * @iovec: array of io vectors to process
 339 * @dim: overall iovec array size
 340 * @count: number of io vector array entries processed
 341 */
 342int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 343				   struct iovec *iovec, unsigned long dim,
 344				   unsigned long *count)
 345{
 346	int ret = 0, i;
 347	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 348	struct hfi1_user_sdma_pkt_q *pq =
 349		srcu_dereference(fd->pq, &fd->pq_srcu);
 350	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 351	struct hfi1_devdata *dd = pq->dd;
 352	unsigned long idx = 0;
 353	u8 pcount = initial_pkt_count;
 354	struct sdma_req_info info;
 355	struct user_sdma_request *req;
 356	u8 opcode, sc, vl;
 357	u16 pkey;
 358	u32 slid;
 359	u16 dlid;
 360	u32 selector;
 361
 362	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 363		hfi1_cdbg(
 364		   SDMA,
 365		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 366		   dd->unit, uctxt->ctxt, fd->subctxt,
 367		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 368		return -EINVAL;
 369	}
 370	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 371	if (ret) {
 372		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 373			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 374		return -EFAULT;
 375	}
 376
 377	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 378				     (u16 *)&info);
 379	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
 380		hfi1_cdbg(SDMA,
 381			  "[%u:%u:%u:%u] Invalid comp index",
 382			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 383		return -EINVAL;
 384	}
 385
 386	/*
 387	 * Sanity check the header io vector count.  Need at least 1 vector
 388	 * (header) and cannot be larger than the actual io vector count.
 389	 */
 390	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
 391		hfi1_cdbg(SDMA,
 392			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
 393			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
 394			  req_iovcnt(info.ctrl), dim);
 395		return -EINVAL;
 396	}
 397
 398	if (!info.fragsize) {
 399		hfi1_cdbg(SDMA,
 400			  "[%u:%u:%u:%u] Request does not specify fragsize",
 401			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 402		return -EINVAL;
 403	}
 404
 405	/* Try to claim the request. */
 406	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
 407		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
 408			  dd->unit, uctxt->ctxt, fd->subctxt,
 409			  info.comp_idx);
 410		return -EBADSLT;
 411	}
 412	/*
 413	 * All safety checks have been done and this request has been claimed.
 414	 */
 415	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
 416					     info.comp_idx);
 417	req = pq->reqs + info.comp_idx;
 418	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 419	req->data_len  = 0;
 420	req->pq = pq;
 421	req->cq = cq;
 422	req->ahg_idx = -1;
 423	req->iov_idx = 0;
 424	req->sent = 0;
 425	req->seqnum = 0;
 426	req->seqcomp = 0;
 427	req->seqsubmitted = 0;
 428	req->tids = NULL;
 429	req->has_error = 0;
 430	INIT_LIST_HEAD(&req->txps);
 431
 432	memcpy(&req->info, &info, sizeof(info));
 433
 434	/* The request is initialized, count it */
 435	atomic_inc(&pq->n_reqs);
 436
 437	if (req_opcode(info.ctrl) == EXPECTED) {
 438		/* expected must have a TID info and at least one data vector */
 439		if (req->data_iovs < 2) {
 440			SDMA_DBG(req,
 441				 "Not enough vectors for expected request");
 442			ret = -EINVAL;
 443			goto free_req;
 444		}
 445		req->data_iovs--;
 446	}
 447
 448	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 449		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 450			 MAX_VECTORS_PER_REQ);
 451		ret = -EINVAL;
 452		goto free_req;
 453	}
 
 454	/* Copy the header from the user buffer */
 455	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 456			     sizeof(req->hdr));
 457	if (ret) {
 458		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 459		ret = -EFAULT;
 460		goto free_req;
 461	}
 462
 463	/* If Static rate control is not enabled, sanitize the header. */
 464	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 465		req->hdr.pbc[2] = 0;
 466
 467	/* Validate the opcode. Do not trust packets from user space blindly. */
 468	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 469	if ((opcode & USER_OPCODE_CHECK_MASK) !=
 470	     USER_OPCODE_CHECK_VAL) {
 471		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 472		ret = -EINVAL;
 473		goto free_req;
 474	}
 475	/*
 476	 * Validate the vl. Do not trust packets from user space blindly.
 477	 * VL comes from PBC, SC comes from LRH, and the VL needs to
 478	 * match the SC look up.
 479	 */
 480	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 481	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 482	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 483	if (vl >= dd->pport->vls_operational ||
 484	    vl != sc_to_vlt(dd, sc)) {
 485		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 486		ret = -EINVAL;
 487		goto free_req;
 488	}
 489
 490	/* Checking P_KEY for requests from user-space */
 491	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
 492	slid = be16_to_cpu(req->hdr.lrh[3]);
 493	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
 494		ret = -EINVAL;
 495		goto free_req;
 496	}
 497
  498	/*
  499	 * We should also check BTH.LNH. If it says the next header is a GRH,
  500	 * then the RXE parsing will be off and will land in the middle of the
  501	 * KDETH or miss it entirely.
  502	 */
 503	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 504		SDMA_DBG(req, "User tried to pass in a GRH");
 505		ret = -EINVAL;
 506		goto free_req;
 507	}
 508
 509	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 510	/*
 511	 * Calculate the initial TID offset based on the values of
 512	 * KDETH.OFFSET and KDETH.OM that are passed in.
 513	 */
 514	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 515		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 516		 KDETH_OM_LARGE : KDETH_OM_SMALL);
 517	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
 518					       info.comp_idx, req->tidoffset);
 519	idx++;
 520
 521	/* Save all the IO vector structures */
 522	for (i = 0; i < req->data_iovs; i++) {
 523		req->iovs[i].offset = 0;
 524		INIT_LIST_HEAD(&req->iovs[i].list);
 525		memcpy(&req->iovs[i].iov,
 526		       iovec + idx++,
 527		       sizeof(req->iovs[i].iov));
 528		ret = pin_vector_pages(req, &req->iovs[i]);
 529		if (ret) {
 530			req->data_iovs = i;
 531			goto free_req;
 532		}
 533		req->data_len += req->iovs[i].iov.iov_len;
 534	}
 535	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
 536					 info.comp_idx, req->data_len);
 537	if (pcount > req->info.npkts)
 538		pcount = req->info.npkts;
  539	/*
  540	 * Copy any TID info.
  541	 * User space will provide the TID info only when the
  542	 * request type is EXPECTED. This is true even if there is
  543	 * only one packet in the request and the header is already
  544	 * set up. The reason for the singular TID case is that the
  545	 * driver still needs to perform safety checks.
  546	 */
 547	if (req_opcode(req->info.ctrl) == EXPECTED) {
 548		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 549		u32 *tmp;
 550
 551		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 552			ret = -EINVAL;
 553			goto free_req;
 554		}
 555
 556		/*
 557		 * We have to copy all of the tids because they may vary
 558		 * in size and, therefore, the TID count might not be
 559		 * equal to the pkt count. However, there is no way to
 560		 * tell at this point.
 561		 */
 562		tmp = memdup_user(iovec[idx].iov_base,
 563				  ntids * sizeof(*req->tids));
 564		if (IS_ERR(tmp)) {
 565			ret = PTR_ERR(tmp);
 566			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 567				 ntids, ret);
 568			goto free_req;
 569		}
 570		req->tids = tmp;
 571		req->n_tids = ntids;
 572		req->tididx = 0;
 573		idx++;
 574	}
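	/*
	 * Illustrative summary of the vector layout consumed above (derived
	 * from this function, not a separate ABI statement): the first
	 * consumed vector carries the sdma_req_info followed by the header
	 * template, the next req->data_iovs vectors carry payload, and
	 * EXPECTED requests append one more vector holding the TID array.
	 * idx now points one past the last consumed vector and is added to
	 * *count on success.
	 */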
 575
 576	dlid = be16_to_cpu(req->hdr.lrh[1]);
 577	selector = dlid_to_selector(dlid);
 578	selector += uctxt->ctxt + fd->subctxt;
 579	req->sde = sdma_select_user_engine(dd, selector, vl);
 580
 581	if (!req->sde || !sdma_running(req->sde)) {
 582		ret = -ECOMM;
 583		goto free_req;
 584	}
 585
 586	/* We don't need an AHG entry if the request contains only one packet */
 587	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
 588		req->ahg_idx = sdma_ahg_alloc(req->sde);
 589
 590	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 591	pq->state = SDMA_PKT_Q_ACTIVE;
 592
 593	/*
 594	 * This is a somewhat blocking send implementation.
 595	 * The driver will block the caller until all packets of the
 596	 * request have been submitted to the SDMA engine. However, it
 597	 * will not wait for send completions.
 598	 */
 599	while (req->seqsubmitted != req->info.npkts) {
 600		ret = user_sdma_send_pkts(req, pcount);
 601		if (ret < 0) {
 602			if (ret != -EBUSY)
 603				goto free_req;
 604			if (wait_event_interruptible_timeout(
 605				pq->busy.wait_dma,
 606				pq->state == SDMA_PKT_Q_ACTIVE,
 607				msecs_to_jiffies(
 608					SDMA_IOWAIT_TIMEOUT)) <= 0)
 609				flush_pq_iowait(pq);
 610		}
 611	}
 612	*count += idx;
 613	return 0;
 614free_req:
 615	/*
  616	 * If seqsubmitted == npkts, the completion routine controls the
  617	 * final state.  If seqsubmitted < npkts, wait for any outstanding
  618	 * packets to finish before cleaning up.
 619	 */
 620	if (req->seqsubmitted < req->info.npkts) {
 621		if (req->seqsubmitted)
 622			wait_event(pq->busy.wait_dma,
 623				   (req->seqcomp == req->seqsubmitted - 1));
 624		user_sdma_free_request(req, true);
 625		pq_update(pq);
 626		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
 627	}
 628	return ret;
 629}
 630
 631static inline u32 compute_data_length(struct user_sdma_request *req,
 632				      struct user_sdma_txreq *tx)
 633{
 634	/*
 635	 * Determine the proper size of the packet data.
 636	 * The size of the data of the first packet is in the header
 637	 * template. However, it includes the header and ICRC, which need
 638	 * to be subtracted.
  639	 * The minimum representable packet data length in a header is 4 bytes;
  640	 * therefore, when the requested data length is less than 4 bytes, there
  641	 * is only one packet, and the packet data length is equal to the
  642	 * request data length.
 643	 * The size of the remaining packets is the minimum of the frag
 644	 * size (MTU) or remaining data in the request.
 645	 */
 646	u32 len;
 647
 648	if (!req->seqnum) {
 649		if (req->data_len < sizeof(u32))
 650			len = req->data_len;
 651		else
 652			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 653			       (sizeof(tx->hdr) - 4));
 654	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
 655		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 656			PAGE_SIZE;
 657		/*
 658		 * Get the data length based on the remaining space in the
 659		 * TID pair.
 660		 */
 661		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 662		/* If we've filled up the TID pair, move to the next one. */
 663		if (unlikely(!len) && ++req->tididx < req->n_tids &&
 664		    req->tids[req->tididx]) {
 665			tidlen = EXP_TID_GET(req->tids[req->tididx],
 666					     LEN) * PAGE_SIZE;
 667			req->tidoffset = 0;
 668			len = min_t(u32, tidlen, req->info.fragsize);
 669		}
  670		/*
  671		 * Since the TID pairs map entire pages, make sure that we
  672		 * are not going to try to send more data than we have
  673		 * remaining.
  674		 */
 675		len = min(len, req->data_len - req->sent);
 676	} else {
 677		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 678	}
 679	trace_hfi1_sdma_user_compute_length(req->pq->dd,
 680					    req->pq->ctxt,
 681					    req->pq->subctxt,
 682					    req->info.comp_idx,
 683					    len);
 684	return len;
 685}
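/*
 * Worked example for the EXPECTED branch above (assuming 4 KiB pages): a
 * TID pair spanning two pages gives tidlen = 8192.  With fragsize = 4096
 * and tidoffset = 6144, len = min(8192 - 6144, 4096) = 2048, i.e. the
 * packet is shortened so that it does not cross into the next TID pair.
 */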
 686
 687static inline u32 pad_len(u32 len)
 688{
 689	if (len & (sizeof(u32) - 1))
 690		len += sizeof(u32) - (len & (sizeof(u32) - 1));
 691	return len;
 692}
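/*
 * pad_len() rounds a length up to the next multiple of 4 bytes, e.g.
 * (for illustration) 5 -> 8, 7 -> 8, 8 -> 8 and 0 -> 0.
 */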
 693
 694static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 695{
 696	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
 697	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 698}
 699
 700static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 701			       struct user_sdma_txreq *tx,
 702			       u32 datalen)
 703{
 704	int ret;
 705	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 706	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 707	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 708
  709	/*
  710	 * Copy the request header into the tx header
  711	 * because the HW needs a cacheline-aligned
  712	 * address.
  713	 * This copy could be avoided if the hdr
  714	 * member of user_sdma_request were also
  715	 * cacheline aligned.
  716	 */
 717	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 718	if (PBC2LRH(pbclen) != lrhlen) {
 719		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 720		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 721	}
 722	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 723	if (ret)
 724		return ret;
 725	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 726			      sizeof(tx->hdr) + datalen, req->ahg_idx,
 727			      0, NULL, 0, user_sdma_txreq_cb);
 728	if (ret)
 729		return ret;
 730	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 731	if (ret)
 732		sdma_txclean(pq->dd, &tx->txreq);
 733	return ret;
 734}
 735
 736static int user_sdma_txadd(struct user_sdma_request *req,
 737			   struct user_sdma_txreq *tx,
 738			   struct user_sdma_iovec *iovec, u32 datalen,
 739			   u32 *queued_ptr, u32 *data_sent_ptr,
 740			   u64 *iov_offset_ptr)
 741{
 742	int ret;
 743	unsigned int pageidx, len;
 744	unsigned long base, offset;
 745	u64 iov_offset = *iov_offset_ptr;
 746	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
 747	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 748
 749	base = (unsigned long)iovec->iov.iov_base;
 750	offset = offset_in_page(base + iovec->offset + iov_offset);
 751	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
 752		   PAGE_SHIFT);
 753	len = offset + req->info.fragsize > PAGE_SIZE ?
 754		PAGE_SIZE - offset : req->info.fragsize;
 755	len = min((datalen - queued), len);
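	/*
	 * Example of the arithmetic above (assuming 4 KiB pages): with
	 * iov_base ending in 0x100 and iovec->offset + iov_offset = 0x2000,
	 * offset = 0x100 and pageidx = 2, so the fragment comes from
	 * pages[2], limited to PAGE_SIZE - 0x100 or fragsize, and then
	 * capped by the bytes still needed for this packet.
	 */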
 756	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
 757			      offset, len);
 758	if (ret) {
 759		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
 760		return ret;
 761	}
 762	iov_offset += len;
 763	queued += len;
 764	data_sent += len;
 765	if (unlikely(queued < datalen && pageidx == iovec->npages &&
 766		     req->iov_idx < req->data_iovs - 1)) {
 767		iovec->offset += iov_offset;
 768		iovec = &req->iovs[++req->iov_idx];
 769		iov_offset = 0;
 770	}
 771
 772	*queued_ptr = queued;
 773	*data_sent_ptr = data_sent;
 774	*iov_offset_ptr = iov_offset;
 775	return ret;
 776}
 777
 778static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 779{
 780	int ret = 0;
 781	u16 count;
 782	unsigned npkts = 0;
 783	struct user_sdma_txreq *tx = NULL;
 784	struct hfi1_user_sdma_pkt_q *pq = NULL;
 785	struct user_sdma_iovec *iovec = NULL;
 786
 787	if (!req->pq)
 788		return -EINVAL;
 789
 790	pq = req->pq;
 791
 792	/* If tx completion has reported an error, we are done. */
 793	if (READ_ONCE(req->has_error))
 794		return -EFAULT;
 795
 796	/*
 797	 * Check if we might have sent the entire request already
 798	 */
 799	if (unlikely(req->seqnum == req->info.npkts)) {
 800		if (!list_empty(&req->txps))
 801			goto dosend;
 802		return ret;
 803	}
 804
 805	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 806		maxpkts = req->info.npkts - req->seqnum;
 807
 808	while (npkts < maxpkts) {
 809		u32 datalen = 0, queued = 0, data_sent = 0;
 810		u64 iov_offset = 0;
 811
 812		/*
 813		 * Check whether any of the completions have come back
 814		 * with errors. If so, we are not going to process any
 815		 * more packets from this request.
 816		 */
 817		if (READ_ONCE(req->has_error))
 818			return -EFAULT;
 819
 820		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 821		if (!tx)
 822			return -ENOMEM;
 823
 824		tx->flags = 0;
 825		tx->req = req;
 826		INIT_LIST_HEAD(&tx->list);
 827
 828		/*
 829		 * For the last packet set the ACK request
 830		 * and disable header suppression.
 831		 */
 832		if (req->seqnum == req->info.npkts - 1)
 833			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 834				      TXREQ_FLAGS_REQ_DISABLE_SH);
 835
  836		/*
  837		 * Calculate the payload size - this is the minimum of the
  838		 * fragment (MTU) size and the remaining bytes in the request,
  839		 * but only if we have payload data.
  840		 */
 841		if (req->data_len) {
 842			iovec = &req->iovs[req->iov_idx];
 843			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 844				if (++req->iov_idx == req->data_iovs) {
 845					ret = -EFAULT;
 846					goto free_tx;
 847				}
 848				iovec = &req->iovs[req->iov_idx];
 849				WARN_ON(iovec->offset);
 850			}
 851
 852			datalen = compute_data_length(req, tx);
 853
  854			/*
  855			 * Disable header suppression for payloads <= 8 DWs.
  856			 * If there is an uncorrectable error in the receive
  857			 * data FIFO when the received payload size is less than
  858			 * or equal to 8 DWs, the RxDmaDataFifoRdUncErr is not
  859			 * reported. Instead, RHF.EccErr is set when the header
  860			 * is not suppressed.
  861			 */
 862			if (!datalen) {
 863				SDMA_DBG(req,
 864					 "Request has data but pkt len is 0");
 865				ret = -EFAULT;
 866				goto free_tx;
 867			} else if (datalen <= 32) {
 868				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 869			}
 870		}
 871
 872		if (req->ahg_idx >= 0) {
 873			if (!req->seqnum) {
 874				ret = user_sdma_txadd_ahg(req, tx, datalen);
 875				if (ret)
 876					goto free_tx;
 877			} else {
 878				int changes;
 879
 880				changes = set_txreq_header_ahg(req, tx,
 881							       datalen);
 882				if (changes < 0) {
 883					ret = changes;
 884					goto free_tx;
 885				}
 886			}
 887		} else {
 888			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 889					  datalen, user_sdma_txreq_cb);
 890			if (ret)
 891				goto free_tx;
 892			/*
 893			 * Modify the header for this packet. This only needs
 894			 * to be done if we are not going to use AHG. Otherwise,
 895			 * the HW will do it based on the changes we gave it
 896			 * during sdma_txinit_ahg().
 897			 */
 898			ret = set_txreq_header(req, tx, datalen);
 899			if (ret)
 900				goto free_txreq;
 901		}
 902
 903		/*
 904		 * If the request contains any data vectors, add up to
 905		 * fragsize bytes to the descriptor.
 906		 */
 907		while (queued < datalen &&
 908		       (req->sent + data_sent) < req->data_len) {
 909			ret = user_sdma_txadd(req, tx, iovec, datalen,
 910					      &queued, &data_sent, &iov_offset);
 911			if (ret)
 912				goto free_txreq;
 913		}
  914		/*
  915		 * The txreq was built successfully, so we can update
  916		 * the counters.
  917		 */
 918		req->koffset += datalen;
 919		if (req_opcode(req->info.ctrl) == EXPECTED)
 920			req->tidoffset += datalen;
 921		req->sent += data_sent;
 922		if (req->data_len)
 923			iovec->offset += iov_offset;
 924		list_add_tail(&tx->txreq.list, &req->txps);
 925		/*
 926		 * It is important to increment this here as it is used to
 927		 * generate the BTH.PSN and, therefore, can't be bulk-updated
 928		 * outside of the loop.
 929		 */
 930		tx->seqnum = req->seqnum++;
 931		npkts++;
 932	}
 933dosend:
 934	ret = sdma_send_txlist(req->sde,
 935			       iowait_get_ib_work(&pq->busy),
 936			       &req->txps, &count);
 937	req->seqsubmitted += count;
 938	if (req->seqsubmitted == req->info.npkts) {
 939		/*
 940		 * The txreq has already been submitted to the HW queue
 941		 * so we can free the AHG entry now. Corruption will not
 942		 * happen due to the sequential manner in which
 943		 * descriptors are processed.
 944		 */
 945		if (req->ahg_idx >= 0)
 946			sdma_ahg_free(req->sde, req->ahg_idx);
 947	}
 948	return ret;
 949
 950free_txreq:
 951	sdma_txclean(pq->dd, &tx->txreq);
 952free_tx:
 953	kmem_cache_free(pq->txreq_cache, tx);
 954	return ret;
 955}
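/*
 * In short, user_sdma_send_pkts() builds up to maxpkts txreqs, queues them
 * on req->txps and then hands the whole list to the engine in a single
 * sdma_send_txlist() call; req->seqsubmitted only advances by the count
 * the engine actually accepted, so the caller retries on -EBUSY.
 */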
 956
 957static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
 958{
 959	struct evict_data evict_data;
 960
 961	evict_data.cleared = 0;
 962	evict_data.target = npages;
 963	hfi1_mmu_rb_evict(pq->handler, &evict_data);
 964	return evict_data.cleared;
 965}
 966
 967static int pin_sdma_pages(struct user_sdma_request *req,
 968			  struct user_sdma_iovec *iovec,
 969			  struct sdma_mmu_node *node,
 970			  int npages)
 971{
 972	int pinned, cleared;
 973	struct page **pages;
 974	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 975
 976	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 977	if (!pages)
 978		return -ENOMEM;
 979	memcpy(pages, node->pages, node->npages * sizeof(*pages));
 980
 981	npages -= node->npages;
 982retry:
 983	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
 984				atomic_read(&pq->n_locked), npages)) {
 985		cleared = sdma_cache_evict(pq, npages);
 986		if (cleared >= npages)
 987			goto retry;
 988	}
 989	pinned = hfi1_acquire_user_pages(pq->mm,
 990					 ((unsigned long)iovec->iov.iov_base +
 991					 (node->npages * PAGE_SIZE)), npages, 0,
 992					 pages + node->npages);
 993	if (pinned < 0) {
 994		kfree(pages);
 995		return pinned;
 996	}
 997	if (pinned != npages) {
 998		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
 999		return -EFAULT;
1000	}
1001	kfree(node->pages);
1002	node->rb.len = iovec->iov.iov_len;
1003	node->pages = pages;
1004	atomic_add(pinned, &pq->n_locked);
1005	return pinned;
1006}
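/*
 * Note on the retry loop above: if the pin would exceed the allowed
 * pinned-page budget, sdma_cache_evict() is asked to drop unused cache
 * entries; the check is repeated only when enough pages were cleared,
 * otherwise hfi1_acquire_user_pages() is attempted anyway and may fail.
 */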
1007
1008static void unpin_sdma_pages(struct sdma_mmu_node *node)
1009{
1010	if (node->npages) {
1011		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
1012		atomic_sub(node->npages, &node->pq->n_locked);
1013	}
1014}
1015
1016static int pin_vector_pages(struct user_sdma_request *req,
1017			    struct user_sdma_iovec *iovec)
1018{
1019	int ret = 0, pinned, npages;
1020	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1021	struct sdma_mmu_node *node = NULL;
1022	struct mmu_rb_node *rb_node;
1023	struct iovec *iov;
1024	bool extracted;
1025
1026	extracted =
1027		hfi1_mmu_rb_remove_unless_exact(pq->handler,
1028						(unsigned long)
1029						iovec->iov.iov_base,
1030						iovec->iov.iov_len, &rb_node);
1031	if (rb_node) {
1032		node = container_of(rb_node, struct sdma_mmu_node, rb);
1033		if (!extracted) {
1034			atomic_inc(&node->refcount);
1035			iovec->pages = node->pages;
1036			iovec->npages = node->npages;
1037			iovec->node = node;
1038			return 0;
1039		}
1040	}
1041
1042	if (!node) {
1043		node = kzalloc(sizeof(*node), GFP_KERNEL);
1044		if (!node)
1045			return -ENOMEM;
1046
1047		node->rb.addr = (unsigned long)iovec->iov.iov_base;
1048		node->pq = pq;
1049		atomic_set(&node->refcount, 0);
1050	}
1051
1052	iov = &iovec->iov;
1053	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
1054	if (node->npages < npages) {
1055		pinned = pin_sdma_pages(req, iovec, node, npages);
1056		if (pinned < 0) {
1057			ret = pinned;
1058			goto bail;
1059		}
1060		node->npages += pinned;
1061		npages = node->npages;
1062	}
1063	iovec->pages = node->pages;
1064	iovec->npages = npages;
1065	iovec->node = node;
1066
1067	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
1068	if (ret) {
1069		iovec->node = NULL;
1070		goto bail;
1071	}
1072	return 0;
1073bail:
1074	unpin_sdma_pages(node);
1075	kfree(node);
1076	return ret;
1077}
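/*
 * Cache behaviour sketch for pin_vector_pages(): a matching node left in
 * the mmu rb tree is reused with only a refcount bump; a node extracted
 * because it is too short is extended by pinning the missing tail pages
 * and re-inserted; a miss allocates a fresh node covering the whole iovec.
 */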
1078
1079static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
1080			       unsigned start, unsigned npages)
1081{
1082	hfi1_release_user_pages(mm, pages + start, npages, false);
1083	kfree(pages);
1084}
1085
1086static int check_header_template(struct user_sdma_request *req,
1087				 struct hfi1_pkt_header *hdr, u32 lrhlen,
1088				 u32 datalen)
1089{
1090	/*
1091	 * Perform safety checks for any type of packet:
 1092	 *    - transfer size is a multiple of 64 bytes
 1093	 *    - packet length is a multiple of 4 bytes
 1094	 *    - packet length is not larger than the MTU size
1095	 *
1096	 * These checks are only done for the first packet of the
1097	 * transfer since the header is "given" to us by user space.
1098	 * For the remainder of the packets we compute the values.
1099	 */
1100	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
1101	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
1102		return -EINVAL;
1103
1104	if (req_opcode(req->info.ctrl) == EXPECTED) {
1105		/*
1106		 * The header is checked only on the first packet. Furthermore,
1107		 * we ensure that at least one TID entry is copied when the
1108		 * request is submitted. Therefore, we don't have to verify that
1109		 * tididx points to something sane.
1110		 */
1111		u32 tidval = req->tids[req->tididx],
1112			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
1113			tididx = EXP_TID_GET(tidval, IDX),
1114			tidctrl = EXP_TID_GET(tidval, CTRL),
1115			tidoff;
1116		__le32 kval = hdr->kdeth.ver_tid_offset;
1117
1118		tidoff = KDETH_GET(kval, OFFSET) *
1119			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
1120			   KDETH_OM_LARGE : KDETH_OM_SMALL);
1121		/*
1122		 * Expected receive packets have the following
1123		 * additional checks:
1124		 *     - offset is not larger than the TID size
1125		 *     - TIDCtrl values match between header and TID array
1126		 *     - TID indexes match between header and TID array
1127		 */
1128		if ((tidoff + datalen > tidlen) ||
1129		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
1130		    KDETH_GET(kval, TID) != tididx)
1131			return -EINVAL;
1132	}
1133	return 0;
1134}
1135
 1136/*
 1137 * Correctly set the BTH.PSN field based on the type of
 1138 * transfer: eager packets can simply increment the PSN, but
 1139 * expected packets encode generation and sequence in the
 1140 * BTH.PSN field, so simply incrementing will result in errors.
 1141 */
1142static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
1143{
1144	u32 val = be32_to_cpu(bthpsn),
1145		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
1146			0xffffffull),
1147		psn = val & mask;
1148	if (expct)
1149		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
1150			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
1151	else
1152		psn = psn + frags;
1153	return psn & mask;
1154}
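/*
 * Illustration of the expected-path math above: only the bits covered by
 * HFI1_KDETH_BTH_SEQ_MASK advance by "frags", while the generation bits
 * above the sequence mask are carried over unchanged, so a sequence wrap
 * never bumps the generation from the send side.
 */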
1155
1156static int set_txreq_header(struct user_sdma_request *req,
1157			    struct user_sdma_txreq *tx, u32 datalen)
1158{
1159	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1160	struct hfi1_pkt_header *hdr = &tx->hdr;
1161	u8 omfactor; /* KDETH.OM */
1162	u16 pbclen;
1163	int ret;
1164	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1165
1166	/* Copy the header template to the request before modification */
1167	memcpy(hdr, &req->hdr, sizeof(*hdr));
1168
 1169	/*
 1170	 * Check if the PBC and LRH lengths are mismatched. If so,
 1171	 * adjust both in the header.
 1172	 */
1173	pbclen = le16_to_cpu(hdr->pbc[0]);
1174	if (PBC2LRH(pbclen) != lrhlen) {
1175		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
1176		hdr->pbc[0] = cpu_to_le16(pbclen);
1177		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
1178		/*
1179		 * Third packet
1180		 * This is the first packet in the sequence that has
1181		 * a "static" size that can be used for the rest of
1182		 * the packets (besides the last one).
1183		 */
1184		if (unlikely(req->seqnum == 2)) {
1185			/*
1186			 * From this point on the lengths in both the
1187			 * PBC and LRH are the same until the last
1188			 * packet.
 1189			 * Adjust the template so we don't have to update
 1190			 * every packet.
 1191			 */
1192			req->hdr.pbc[0] = hdr->pbc[0];
1193			req->hdr.lrh[2] = hdr->lrh[2];
1194		}
1195	}
1196	/*
1197	 * We only have to modify the header if this is not the
1198	 * first packet in the request. Otherwise, we use the
1199	 * header given to us.
1200	 */
1201	if (unlikely(!req->seqnum)) {
1202		ret = check_header_template(req, hdr, lrhlen, datalen);
1203		if (ret)
1204			return ret;
1205		goto done;
1206	}
1207
1208	hdr->bth[2] = cpu_to_be32(
1209		set_pkt_bth_psn(hdr->bth[2],
1210				(req_opcode(req->info.ctrl) == EXPECTED),
1211				req->seqnum));
1212
1213	/* Set ACK request on last packet */
1214	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1215		hdr->bth[2] |= cpu_to_be32(1UL << 31);
1216
1217	/* Set the new offset */
1218	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
1219	/* Expected packets have to fill in the new TID information */
1220	if (req_opcode(req->info.ctrl) == EXPECTED) {
1221		tidval = req->tids[req->tididx];
1222		/*
1223		 * If the offset puts us at the end of the current TID,
1224		 * advance everything.
1225		 */
1226		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1227					 PAGE_SIZE)) {
1228			req->tidoffset = 0;
 1229			/*
 1230			 * Since we don't copy all the TIDs at once,
 1231			 * we have to check again.
 1232			 */
1233			if (++req->tididx > req->n_tids - 1 ||
1234			    !req->tids[req->tididx]) {
1235				return -EINVAL;
1236			}
1237			tidval = req->tids[req->tididx];
1238		}
1239		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
1240			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
1241			KDETH_OM_SMALL_SHIFT;
1242		/* Set KDETH.TIDCtrl based on value for this TID. */
1243		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1244			  EXP_TID_GET(tidval, CTRL));
1245		/* Set KDETH.TID based on value for this TID */
1246		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1247			  EXP_TID_GET(tidval, IDX));
1248		/* Clear KDETH.SH when DISABLE_SH flag is set */
1249		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1250			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1251		/*
1252		 * Set the KDETH.OFFSET and KDETH.OM based on size of
1253		 * transfer.
1254		 */
1255		trace_hfi1_sdma_user_tid_info(
1256			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1257			req->tidoffset, req->tidoffset >> omfactor,
1258			omfactor != KDETH_OM_SMALL_SHIFT);
1259		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1260			  req->tidoffset >> omfactor);
1261		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1262			  omfactor != KDETH_OM_SMALL_SHIFT);
1263	}
1264done:
1265	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1266				    req->info.comp_idx, hdr, tidval);
1267	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1268}
1269
1270static int set_txreq_header_ahg(struct user_sdma_request *req,
1271				struct user_sdma_txreq *tx, u32 datalen)
1272{
1273	u32 ahg[AHG_KDETH_ARRAY_SIZE];
1274	int idx = 0;
1275	u8 omfactor; /* KDETH.OM */
1276	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1277	struct hfi1_pkt_header *hdr = &req->hdr;
1278	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1279	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1280	size_t array_size = ARRAY_SIZE(ahg);
1281
1282	if (PBC2LRH(pbclen) != lrhlen) {
1283		/* PBC.PbcLengthDWs */
1284		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1285				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1286		if (idx < 0)
1287			return idx;
1288		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1289		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1290				     (__force u16)cpu_to_be16(lrhlen >> 2));
1291		if (idx < 0)
1292			return idx;
1293	}
1294
1295	/*
1296	 * Do the common updates
1297	 */
1298	/* BTH.PSN and BTH.A */
1299	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1300		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1301	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1302		val32 |= 1UL << 31;
1303	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1304			     (__force u16)cpu_to_be16(val32 >> 16));
1305	if (idx < 0)
1306		return idx;
1307	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1308			     (__force u16)cpu_to_be16(val32 & 0xffff));
1309	if (idx < 0)
1310		return idx;
1311	/* KDETH.Offset */
1312	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1313			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
1314	if (idx < 0)
1315		return idx;
1316	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1317			     (__force u16)cpu_to_le16(req->koffset >> 16));
1318	if (idx < 0)
1319		return idx;
1320	if (req_opcode(req->info.ctrl) == EXPECTED) {
1321		__le16 val;
1322
1323		tidval = req->tids[req->tididx];
1324
1325		/*
1326		 * If the offset puts us at the end of the current TID,
1327		 * advance everything.
1328		 */
1329		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1330					 PAGE_SIZE)) {
1331			req->tidoffset = 0;
 1332			/*
 1333			 * Since we don't copy all the TIDs at once,
 1334			 * we have to check again.
 1335			 */
1336			if (++req->tididx > req->n_tids - 1 ||
1337			    !req->tids[req->tididx])
1338				return -EINVAL;
1339			tidval = req->tids[req->tididx];
1340		}
1341		omfactor = ((EXP_TID_GET(tidval, LEN) *
1342				  PAGE_SIZE) >=
1343				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1344				 KDETH_OM_SMALL_SHIFT;
1345		/* KDETH.OM and KDETH.OFFSET (TID) */
1346		idx = ahg_header_set(
1347				ahg, idx, array_size, 7, 0, 16,
1348				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1349				((req->tidoffset >> omfactor)
1350				& 0x7fff)));
1351		if (idx < 0)
1352			return idx;
1353		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1354		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1355				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1356
1357		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1358			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1359						      INTR) <<
1360					    AHG_KDETH_INTR_SHIFT));
1361		} else {
1362			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1363			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1364			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1365						      INTR) <<
1366					     AHG_KDETH_INTR_SHIFT));
1367		}
1368
1369		idx = ahg_header_set(ahg, idx, array_size,
1370				     7, 16, 14, (__force u16)val);
1371		if (idx < 0)
1372			return idx;
1373	}
1374
1375	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1376					req->info.comp_idx, req->sde->this_idx,
1377					req->ahg_idx, ahg, idx, tidval);
1378	sdma_txinit_ahg(&tx->txreq,
1379			SDMA_TXREQ_F_USE_AHG,
1380			datalen, req->ahg_idx, idx,
1381			ahg, sizeof(req->hdr),
1382			user_sdma_txreq_cb);
1383
1384	return idx;
1385}
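/*
 * The AHG path above only describes the per-packet deltas (PBC/LRH
 * lengths, BTH.PSN/ACK, KDETH offset and TID fields); the hardware applies
 * them to the header image saved at the AHG index, so the full header does
 * not have to be rewritten for every packet as in set_txreq_header().
 */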
1386
1387/**
1388 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1389 * @txreq: valid sdma tx request
1390 * @status: success/failure of request
1391 *
1392 * Called when the SDMA progress state machine gets notification that
1393 * the SDMA descriptors for this tx request have been processed by the
1394 * DMA engine. Called in interrupt context.
1395 * Only do work on completed sequences.
1396 */
1397static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1398{
1399	struct user_sdma_txreq *tx =
1400		container_of(txreq, struct user_sdma_txreq, txreq);
1401	struct user_sdma_request *req;
1402	struct hfi1_user_sdma_pkt_q *pq;
1403	struct hfi1_user_sdma_comp_q *cq;
1404	enum hfi1_sdma_comp_state state = COMPLETE;
1405
1406	if (!tx->req)
1407		return;
1408
1409	req = tx->req;
1410	pq = req->pq;
1411	cq = req->cq;
1412
1413	if (status != SDMA_TXREQ_S_OK) {
1414		SDMA_DBG(req, "SDMA completion with error %d",
1415			 status);
1416		WRITE_ONCE(req->has_error, 1);
1417		state = ERROR;
1418	}
1419
1420	req->seqcomp = tx->seqnum;
1421	kmem_cache_free(pq->txreq_cache, tx);
1422
1423	/* sequence isn't complete?  We are done */
1424	if (req->seqcomp != req->info.npkts - 1)
1425		return;
1426
1427	user_sdma_free_request(req, false);
1428	set_comp_state(pq, cq, req->info.comp_idx, state, status);
1429	pq_update(pq);
1430}
1431
1432static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1433{
1434	if (atomic_dec_and_test(&pq->n_reqs))
1435		wake_up(&pq->wait);
1436}
1437
1438static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
1439{
1440	int i;
1441
1442	if (!list_empty(&req->txps)) {
1443		struct sdma_txreq *t, *p;
1444
1445		list_for_each_entry_safe(t, p, &req->txps, list) {
1446			struct user_sdma_txreq *tx =
1447				container_of(t, struct user_sdma_txreq, txreq);
1448			list_del_init(&t->list);
1449			sdma_txclean(req->pq->dd, t);
1450			kmem_cache_free(req->pq->txreq_cache, tx);
1451		}
1452	}
1453
1454	for (i = 0; i < req->data_iovs; i++) {
1455		struct sdma_mmu_node *node = req->iovs[i].node;
1456
1457		if (!node)
1458			continue;
1459
1460		req->iovs[i].node = NULL;
1461
1462		if (unpin)
1463			hfi1_mmu_rb_remove(req->pq->handler,
1464					   &node->rb);
1465		else
1466			atomic_dec(&node->refcount);
1467	}
1468
1469	kfree(req->tids);
1470	clear_bit(req->info.comp_idx, req->pq->req_in_use);
1471}
1472
1473static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1474				  struct hfi1_user_sdma_comp_q *cq,
1475				  u16 idx, enum hfi1_sdma_comp_state state,
1476				  int ret)
1477{
1478	if (state == ERROR)
1479		cq->comps[idx].errcode = -ret;
1480	smp_wmb(); /* make sure errcode is visible first */
1481	cq->comps[idx].status = state;
1482	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1483					idx, state, ret);
1484}
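/*
 * The smp_wmb() in set_comp_state() orders the errcode store before the
 * status store; a consumer that observes the ERROR status (and issues a
 * matching read barrier) is therefore guaranteed to see a valid errcode.
 */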
1485
1486static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
1487			   unsigned long len)
1488{
1489	return (bool)(node->addr == addr);
1490}
1491
1492static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
1493{
1494	struct sdma_mmu_node *node =
1495		container_of(mnode, struct sdma_mmu_node, rb);
1496
1497	atomic_inc(&node->refcount);
1498	return 0;
1499}
1500
1501/*
1502 * Return 1 to remove the node from the rb tree and call the remove op.
1503 *
1504 * Called with the rb tree lock held.
1505 */
1506static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
1507			 void *evict_arg, bool *stop)
1508{
1509	struct sdma_mmu_node *node =
1510		container_of(mnode, struct sdma_mmu_node, rb);
1511	struct evict_data *evict_data = evict_arg;
1512
1513	/* is this node still being used? */
1514	if (atomic_read(&node->refcount))
1515		return 0; /* keep this node */
1516
1517	/* this node will be evicted, add its pages to our count */
1518	evict_data->cleared += node->npages;
1519
1520	/* have enough pages been cleared? */
1521	if (evict_data->cleared >= evict_data->target)
1522		*stop = true;
1523
1524	return 1; /* remove this node */
1525}
1526
1527static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
1528{
1529	struct sdma_mmu_node *node =
1530		container_of(mnode, struct sdma_mmu_node, rb);
1531
1532	unpin_sdma_pages(node);
1533	kfree(node);
1534}
1535
1536static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
1537{
1538	struct sdma_mmu_node *node =
1539		container_of(mnode, struct sdma_mmu_node, rb);
1540
1541	if (!atomic_read(&node->refcount))
1542		return 1;
1543	return 0;
1544}