v6.8
   1// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
   2/*
   3 * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
   4 * Copyright(c) 2015 - 2018 Intel Corporation.
   5 */
   6
   7#include <linux/mm.h>
   8#include <linux/types.h>
   9#include <linux/device.h>
  10#include <linux/dmapool.h>
  11#include <linux/slab.h>
  12#include <linux/list.h>
  13#include <linux/highmem.h>
  14#include <linux/io.h>
  15#include <linux/uio.h>
  16#include <linux/rbtree.h>
  17#include <linux/spinlock.h>
  18#include <linux/delay.h>
  19#include <linux/kthread.h>
  20#include <linux/mmu_context.h>
  21#include <linux/module.h>
  22#include <linux/vmalloc.h>
  23#include <linux/string.h>
  24
  25#include "hfi.h"
  26#include "sdma.h"
  27#include "user_sdma.h"
  28#include "verbs.h"  /* for the headers */
  29#include "common.h" /* for struct hfi1_tid_info */
  30#include "trace.h"
  31
  32static uint hfi1_sdma_comp_ring_size = 128;
  33module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  34MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  35
  36static unsigned initial_pkt_count = 8;
  37
  38static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  39static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  40static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  41static void user_sdma_free_request(struct user_sdma_request *req);
  42static int check_header_template(struct user_sdma_request *req,
  43				 struct hfi1_pkt_header *hdr, u32 lrhlen,
  44				 u32 datalen);
  45static int set_txreq_header(struct user_sdma_request *req,
  46			    struct user_sdma_txreq *tx, u32 datalen);
  47static int set_txreq_header_ahg(struct user_sdma_request *req,
  48				struct user_sdma_txreq *tx, u32 len);
  49static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  50				  struct hfi1_user_sdma_comp_q *cq,
  51				  u16 idx, enum hfi1_sdma_comp_state state,
  52				  int ret);
  53static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  54static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
  55
  56static int defer_packet_queue(
  57	struct sdma_engine *sde,
  58	struct iowait_work *wait,
  59	struct sdma_txreq *txreq,
  60	uint seq,
  61	bool pkts_sent);
  62static void activate_packet_queue(struct iowait *wait, int reason);
  63
  64static int defer_packet_queue(
  65	struct sdma_engine *sde,
  66	struct iowait_work *wait,
  67	struct sdma_txreq *txreq,
  68	uint seq,
  69	bool pkts_sent)
  70{
  71	struct hfi1_user_sdma_pkt_q *pq =
  72		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
  73
  74	write_seqlock(&sde->waitlock);
  75	trace_hfi1_usdma_defer(pq, sde, &pq->busy);
  76	if (sdma_progress(sde, seq, txreq))
  77		goto eagain;
  78	/*
  79	 * We are assuming that if the list is enqueued somewhere, it
  80	 * is to the dmawait list since that is the only place where
  81	 * it is supposed to be enqueued.
  82	 */
  83	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
  84	if (list_empty(&pq->busy.list)) {
  85		pq->busy.lock = &sde->waitlock;
  86		iowait_get_priority(&pq->busy);
  87		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
  88	}
  89	write_sequnlock(&sde->waitlock);
  90	return -EBUSY;
  91eagain:
  92	write_sequnlock(&sde->waitlock);
  93	return -EAGAIN;
  94}
  95
  96static void activate_packet_queue(struct iowait *wait, int reason)
  97{
  98	struct hfi1_user_sdma_pkt_q *pq =
  99		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 100
 101	trace_hfi1_usdma_activate(pq, wait, reason);
 102	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 103	wake_up(&wait->wait_dma);
 104};
 105
 106int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 107				struct hfi1_filedata *fd)
 108{
 109	int ret = -ENOMEM;
 110	char buf[64];
 111	struct hfi1_devdata *dd;
 112	struct hfi1_user_sdma_comp_q *cq;
 113	struct hfi1_user_sdma_pkt_q *pq;
 114
 115	if (!uctxt || !fd)
 116		return -EBADF;
 117
 118	if (!hfi1_sdma_comp_ring_size)
 119		return -EINVAL;
 120
 121	dd = uctxt->dd;
 122
 123	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 124	if (!pq)
 125		return -ENOMEM;
 126	pq->dd = dd;
 127	pq->ctxt = uctxt->ctxt;
 128	pq->subctxt = fd->subctxt;
 129	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 130	atomic_set(&pq->n_reqs, 0);
 131	init_waitqueue_head(&pq->wait);
 132	atomic_set(&pq->n_locked, 0);
 133
 134	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 135		    activate_packet_queue, NULL, NULL);
 136	pq->reqidx = 0;
 137
 138	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 139			   sizeof(*pq->reqs),
 140			   GFP_KERNEL);
 141	if (!pq->reqs)
 142		goto pq_reqs_nomem;
 143
 144	pq->req_in_use = bitmap_zalloc(hfi1_sdma_comp_ring_size, GFP_KERNEL);
 145	if (!pq->req_in_use)
 146		goto pq_reqs_no_in_use;
 147
 148	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 149		 fd->subctxt);
 150	pq->txreq_cache = kmem_cache_create(buf,
 151					    sizeof(struct user_sdma_txreq),
 152					    L1_CACHE_BYTES,
 153					    SLAB_HWCACHE_ALIGN,
 154					    NULL);
 155	if (!pq->txreq_cache) {
 156		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 157			   uctxt->ctxt);
 158		goto pq_txreq_nomem;
 159	}
 160
 161	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 162	if (!cq)
 163		goto cq_nomem;
 164
 165	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 166				 * hfi1_sdma_comp_ring_size));
 167	if (!cq->comps)
 168		goto cq_comps_nomem;
 169
 170	cq->nentries = hfi1_sdma_comp_ring_size;
 171
 172	ret = hfi1_init_system_pinning(pq);
 173	if (ret)
 174		goto pq_mmu_fail;
 175
 176	rcu_assign_pointer(fd->pq, pq);
 177	fd->cq = cq;
 178
 179	return 0;
 180
 181pq_mmu_fail:
 182	vfree(cq->comps);
 183cq_comps_nomem:
 184	kfree(cq);
 185cq_nomem:
 186	kmem_cache_destroy(pq->txreq_cache);
 187pq_txreq_nomem:
 188	bitmap_free(pq->req_in_use);
 189pq_reqs_no_in_use:
 190	kfree(pq->reqs);
 191pq_reqs_nomem:
 192	kfree(pq);
 193
 194	return ret;
 195}
 196
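/*
 * flush_pq_iowait() drops the packet queue's iowait entry from the SDMA
 * engine's dmawait list, if it is still queued there, and clears the saved
 * waitlock pointer so a later wakeup from that engine cannot touch it.
 */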
 197static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq)
 198{
 199	unsigned long flags;
 200	seqlock_t *lock = pq->busy.lock;
 201
 202	if (!lock)
 203		return;
 204	write_seqlock_irqsave(lock, flags);
 205	if (!list_empty(&pq->busy.list)) {
 206		list_del_init(&pq->busy.list);
 207		pq->busy.lock = NULL;
 208	}
 209	write_sequnlock_irqrestore(lock, flags);
 210}
 211
 212int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 213			       struct hfi1_ctxtdata *uctxt)
 214{
 215	struct hfi1_user_sdma_pkt_q *pq;
 216
 217	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 218
 219	spin_lock(&fd->pq_rcu_lock);
 220	pq = srcu_dereference_check(fd->pq, &fd->pq_srcu,
 221				    lockdep_is_held(&fd->pq_rcu_lock));
 222	if (pq) {
 223		rcu_assign_pointer(fd->pq, NULL);
 224		spin_unlock(&fd->pq_rcu_lock);
 225		synchronize_srcu(&fd->pq_srcu);
 226		/* at this point there can be no more new requests */
 227		iowait_sdma_drain(&pq->busy);
 228		/* Wait until all requests have been freed. */
 229		wait_event_interruptible(
 230			pq->wait,
 231			!atomic_read(&pq->n_reqs));
 232		kfree(pq->reqs);
 233		hfi1_free_system_pinning(pq);
 234		bitmap_free(pq->req_in_use);
 235		kmem_cache_destroy(pq->txreq_cache);
 236		flush_pq_iowait(pq);
 237		kfree(pq);
 238	} else {
 239		spin_unlock(&fd->pq_rcu_lock);
 240	}
 241	if (fd->cq) {
 242		vfree(fd->cq->comps);
 243		kfree(fd->cq);
 244		fd->cq = NULL;
 245	}
 246	return 0;
 247}
 248
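/*
 * dlid_to_selector() folds a 16-bit DLID into an 8-bit hash and lazily
 * assigns each new hash bucket the next selector value in the range
 * 0..127. The same DLID therefore always yields the same selector, which
 * (together with the context numbers added by the caller) keeps a given
 * flow on a consistent SDMA engine choice.
 */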
 249static u8 dlid_to_selector(u16 dlid)
 250{
 251	static u8 mapping[256];
 252	static int initialized;
 253	static u8 next;
 254	int hash;
 255
 256	if (!initialized) {
 257		memset(mapping, 0xFF, 256);
 258		initialized = 1;
 259	}
 260
 261	hash = ((dlid >> 8) ^ dlid) & 0xFF;
 262	if (mapping[hash] == 0xFF) {
 263		mapping[hash] = next;
 264		next = (next + 1) & 0x7F;
 265	}
 266
 267	return mapping[hash];
 268}
 269
 270/**
 271 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 272 * @fd: valid file descriptor
 273 * @iovec: array of io vectors to process
 274 * @dim: overall iovec array size
 275 * @count: number of io vector array entries processed
 276 */
 277int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 278				   struct iovec *iovec, unsigned long dim,
 279				   unsigned long *count)
 280{
 281	int ret = 0, i;
 282	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 283	struct hfi1_user_sdma_pkt_q *pq =
 284		srcu_dereference(fd->pq, &fd->pq_srcu);
 285	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 286	struct hfi1_devdata *dd = pq->dd;
 287	unsigned long idx = 0;
 288	u8 pcount = initial_pkt_count;
 289	struct sdma_req_info info;
 290	struct user_sdma_request *req;
 291	u8 opcode, sc, vl;
 292	u16 pkey;
 293	u32 slid;
 294	u16 dlid;
 295	u32 selector;
 296
 297	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 298		hfi1_cdbg(
 299		   SDMA,
 300		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 301		   dd->unit, uctxt->ctxt, fd->subctxt,
 302		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 303		return -EINVAL;
 304	}
 305	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 306	if (ret) {
 307		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 308			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 309		return -EFAULT;
 310	}
 311
 312	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 313				     (u16 *)&info);
 314	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
 315		hfi1_cdbg(SDMA,
 316			  "[%u:%u:%u:%u] Invalid comp index",
 317			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 318		return -EINVAL;
 319	}
 320
 321	/*
 322	 * Sanity check the header io vector count.  Need at least 1 vector
 323	 * (header) and cannot be larger than the actual io vector count.
 324	 */
 325	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
 326		hfi1_cdbg(SDMA,
 327			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
 328			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
 329			  req_iovcnt(info.ctrl), dim);
 330		return -EINVAL;
 331	}
 332
 333	if (!info.fragsize) {
 334		hfi1_cdbg(SDMA,
 335			  "[%u:%u:%u:%u] Request does not specify fragsize",
 336			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 337		return -EINVAL;
 338	}
 339
 340	/* Try to claim the request. */
 341	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
 342		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
 343			  dd->unit, uctxt->ctxt, fd->subctxt,
 344			  info.comp_idx);
 345		return -EBADSLT;
 346	}
 347	/*
 348	 * All safety checks have been done and this request has been claimed.
 349	 */
 350	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
 351					     info.comp_idx);
 352	req = pq->reqs + info.comp_idx;
 353	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 354	req->data_len  = 0;
 355	req->pq = pq;
 356	req->cq = cq;
 357	req->ahg_idx = -1;
 358	req->iov_idx = 0;
 359	req->sent = 0;
 360	req->seqnum = 0;
 361	req->seqcomp = 0;
 362	req->seqsubmitted = 0;
 363	req->tids = NULL;
 364	req->has_error = 0;
 365	INIT_LIST_HEAD(&req->txps);
 366
 367	memcpy(&req->info, &info, sizeof(info));
 368
 369	/* The request is initialized, count it */
 370	atomic_inc(&pq->n_reqs);
 371
 372	if (req_opcode(info.ctrl) == EXPECTED) {
 373		/* expected must have a TID info and at least one data vector */
 374		if (req->data_iovs < 2) {
 375			SDMA_DBG(req,
 376				 "Not enough vectors for expected request");
 377			ret = -EINVAL;
 378			goto free_req;
 379		}
 380		req->data_iovs--;
 381	}
 382
 383	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 384		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 385			 MAX_VECTORS_PER_REQ);
 386		ret = -EINVAL;
 387		goto free_req;
 388	}
 389
 390	/* Copy the header from the user buffer */
 391	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 392			     sizeof(req->hdr));
 393	if (ret) {
 394		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 395		ret = -EFAULT;
 396		goto free_req;
 397	}
 398
 399	/* If Static rate control is not enabled, sanitize the header. */
 400	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 401		req->hdr.pbc[2] = 0;
 402
 403	/* Validate the opcode. Do not trust packets from user space blindly. */
 404	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 405	if ((opcode & USER_OPCODE_CHECK_MASK) !=
 406	     USER_OPCODE_CHECK_VAL) {
 407		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 408		ret = -EINVAL;
 409		goto free_req;
 410	}
 411	/*
 412	 * Validate the vl. Do not trust packets from user space blindly.
 413	 * VL comes from PBC, SC comes from LRH, and the VL needs to
 414	 * match the SC look up.
 415	 */
 416	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 417	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 418	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 419	if (vl >= dd->pport->vls_operational ||
 420	    vl != sc_to_vlt(dd, sc)) {
 421		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 422		ret = -EINVAL;
 423		goto free_req;
 424	}
 425
 426	/* Checking P_KEY for requests from user-space */
 427	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
 428	slid = be16_to_cpu(req->hdr.lrh[3]);
 429	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
 430		ret = -EINVAL;
 431		goto free_req;
 432	}
 433
 434	/*
 435	 * Also should check the BTH.lnh. If it says the next header is GRH then
 436	 * the RXE parsing will be off and will land in the middle of the KDETH
 437	 * or miss it entirely.
 438	 */
 439	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 440		SDMA_DBG(req, "User tried to pass in a GRH");
 441		ret = -EINVAL;
 442		goto free_req;
 443	}
 444
 445	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 446	/*
 447	 * Calculate the initial TID offset based on the values of
 448	 * KDETH.OFFSET and KDETH.OM that are passed in.
 449	 */
 450	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 451		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 452		 KDETH_OM_LARGE : KDETH_OM_SMALL);
 453	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
 454					       info.comp_idx, req->tidoffset);
 455	idx++;
 456
 457	/* Save all the IO vector structures */
 458	for (i = 0; i < req->data_iovs; i++) {
 459		req->iovs[i].offset = 0;
 460		INIT_LIST_HEAD(&req->iovs[i].list);
 461		memcpy(&req->iovs[i].iov,
 462		       iovec + idx++,
 463		       sizeof(req->iovs[i].iov));
 464		if (req->iovs[i].iov.iov_len == 0) {
 465			ret = -EINVAL;
 466			goto free_req;
 467		}
 468		req->data_len += req->iovs[i].iov.iov_len;
 469	}
 470	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
 471					 info.comp_idx, req->data_len);
 472	if (pcount > req->info.npkts)
 473		pcount = req->info.npkts;
 474	/*
 475	 * Copy any TID info
 476	 * User space will provide the TID info only when the
 477	 * request type is EXPECTED. This is true even if there is
 478	 * only one packet in the request and the header is already
 479	 * setup. The reason for the singular TID case is that the
 480	 * driver needs to perform safety checks.
 481	 */
 482	if (req_opcode(req->info.ctrl) == EXPECTED) {
 483		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 484		u32 *tmp;
 485
 486		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 487			ret = -EINVAL;
 488			goto free_req;
 489		}
 490
 491		/*
 492		 * We have to copy all of the tids because they may vary
 493		 * in size and, therefore, the TID count might not be
 494		 * equal to the pkt count. However, there is no way to
 495		 * tell at this point.
 496		 */
 497		tmp = memdup_array_user(iovec[idx].iov_base,
 498					ntids, sizeof(*req->tids));
 499		if (IS_ERR(tmp)) {
 500			ret = PTR_ERR(tmp);
 501			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 502				 ntids, ret);
 503			goto free_req;
 504		}
 505		req->tids = tmp;
 506		req->n_tids = ntids;
 507		req->tididx = 0;
 508		idx++;
 509	}
 510
 511	dlid = be16_to_cpu(req->hdr.lrh[1]);
 512	selector = dlid_to_selector(dlid);
 513	selector += uctxt->ctxt + fd->subctxt;
 514	req->sde = sdma_select_user_engine(dd, selector, vl);
 515
 516	if (!req->sde || !sdma_running(req->sde)) {
 517		ret = -ECOMM;
 518		goto free_req;
 519	}
 520
 521	/* We don't need an AHG entry if the request contains only one packet */
 522	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
 523		req->ahg_idx = sdma_ahg_alloc(req->sde);
 524
 525	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 526	pq->state = SDMA_PKT_Q_ACTIVE;
 527
 528	/*
 529	 * This is a somewhat blocking send implementation.
 530	 * The driver will block the caller until all packets of the
 531	 * request have been submitted to the SDMA engine. However, it
 532	 * will not wait for send completions.
 533	 */
 534	while (req->seqsubmitted != req->info.npkts) {
 535		ret = user_sdma_send_pkts(req, pcount);
 536		if (ret < 0) {
 537			int we_ret;
 538
 539			if (ret != -EBUSY)
 540				goto free_req;
 541			we_ret = wait_event_interruptible_timeout(
 542				pq->busy.wait_dma,
 543				pq->state == SDMA_PKT_Q_ACTIVE,
 544				msecs_to_jiffies(
 545					SDMA_IOWAIT_TIMEOUT));
 546			trace_hfi1_usdma_we(pq, we_ret);
 547			if (we_ret <= 0)
 548				flush_pq_iowait(pq);
 549		}
 550	}
 551	*count += idx;
 552	return 0;
 553free_req:
 554	/*
  555	 * If seqsubmitted == npkts, the completion routine
  556	 * controls the final state.  If seqsubmitted < npkts, wait for any
 557	 * outstanding packets to finish before cleaning up.
 558	 */
 559	if (req->seqsubmitted < req->info.npkts) {
 560		if (req->seqsubmitted)
 561			wait_event(pq->busy.wait_dma,
 562				   (req->seqcomp == req->seqsubmitted - 1));
 563		user_sdma_free_request(req);
 564		pq_update(pq);
 565		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
 566	}
 567	return ret;
 568}
 569
 570static inline u32 compute_data_length(struct user_sdma_request *req,
 571				      struct user_sdma_txreq *tx)
 572{
 573	/*
 574	 * Determine the proper size of the packet data.
 575	 * The size of the data of the first packet is in the header
 576	 * template. However, it includes the header and ICRC, which need
 577	 * to be subtracted.
 578	 * The minimum representable packet data length in a header is 4 bytes,
  579	 * therefore, when the request data length is less than 4 bytes, there is
  580	 * only one packet, and the packet data length is equal to the
  581	 * request data length.
 582	 * The size of the remaining packets is the minimum of the frag
 583	 * size (MTU) or remaining data in the request.
 584	 */
 585	u32 len;
 586
 587	if (!req->seqnum) {
 588		if (req->data_len < sizeof(u32))
 589			len = req->data_len;
 590		else
 591			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 592			       (sizeof(tx->hdr) - 4));
 593	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
 594		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 595			PAGE_SIZE;
 596		/*
 597		 * Get the data length based on the remaining space in the
 598		 * TID pair.
 599		 */
 600		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 601		/* If we've filled up the TID pair, move to the next one. */
 602		if (unlikely(!len) && ++req->tididx < req->n_tids &&
 603		    req->tids[req->tididx]) {
 604			tidlen = EXP_TID_GET(req->tids[req->tididx],
 605					     LEN) * PAGE_SIZE;
 606			req->tidoffset = 0;
 607			len = min_t(u32, tidlen, req->info.fragsize);
 608		}
 609		/*
 610		 * Since the TID pairs map entire pages, make sure that we
  611		 * are not going to try to send more data than we have
 612		 * remaining.
 613		 */
 614		len = min(len, req->data_len - req->sent);
 615	} else {
 616		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 617	}
 618	trace_hfi1_sdma_user_compute_length(req->pq->dd,
 619					    req->pq->ctxt,
 620					    req->pq->subctxt,
 621					    req->info.comp_idx,
 622					    len);
 623	return len;
 624}
 625
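/*
 * pad_len() rounds a payload length up to the next 4-byte (dword)
 * boundary, e.g. pad_len(5) == 8 and pad_len(8) == 8.
 */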
 626static inline u32 pad_len(u32 len)
 627{
 628	if (len & (sizeof(u32) - 1))
 629		len += sizeof(u32) - (len & (sizeof(u32) - 1));
 630	return len;
 631}
 632
 633static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 634{
 635	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
 636	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 637}
 638
 639static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 640			       struct user_sdma_txreq *tx,
 641			       u32 datalen)
 642{
 643	int ret;
 644	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 645	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 646	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 647
 648	/*
 649	 * Copy the request header into the tx header
 650	 * because the HW needs a cacheline-aligned
 651	 * address.
 652	 * This copy can be optimized out if the hdr
 653	 * member of user_sdma_request were also
 654	 * cacheline aligned.
 655	 */
 656	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 657	if (PBC2LRH(pbclen) != lrhlen) {
 658		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 659		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 660	}
 661	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 662	if (ret)
 663		return ret;
 664	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 665			      sizeof(tx->hdr) + datalen, req->ahg_idx,
 666			      0, NULL, 0, user_sdma_txreq_cb);
 667	if (ret)
 668		return ret;
 669	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 670	if (ret)
 671		sdma_txclean(pq->dd, &tx->txreq);
 672	return ret;
 673}
 674
 675static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 676{
 677	int ret = 0;
 678	u16 count;
 679	unsigned npkts = 0;
 680	struct user_sdma_txreq *tx = NULL;
 681	struct hfi1_user_sdma_pkt_q *pq = NULL;
 682	struct user_sdma_iovec *iovec = NULL;
 683
 684	if (!req->pq)
 685		return -EINVAL;
 686
 687	pq = req->pq;
 688
 689	/* If tx completion has reported an error, we are done. */
 690	if (READ_ONCE(req->has_error))
 691		return -EFAULT;
 692
 693	/*
 694	 * Check if we might have sent the entire request already
 695	 */
 696	if (unlikely(req->seqnum == req->info.npkts)) {
 697		if (!list_empty(&req->txps))
 698			goto dosend;
 699		return ret;
 700	}
 701
 702	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 703		maxpkts = req->info.npkts - req->seqnum;
 704
 705	while (npkts < maxpkts) {
 706		u32 datalen = 0;
 707
 708		/*
 709		 * Check whether any of the completions have come back
 710		 * with errors. If so, we are not going to process any
 711		 * more packets from this request.
 712		 */
 713		if (READ_ONCE(req->has_error))
 714			return -EFAULT;
 715
 716		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 717		if (!tx)
 718			return -ENOMEM;
 719
 720		tx->flags = 0;
 721		tx->req = req;
 722		INIT_LIST_HEAD(&tx->list);
 723
 724		/*
 725		 * For the last packet set the ACK request
 726		 * and disable header suppression.
 727		 */
 728		if (req->seqnum == req->info.npkts - 1)
 729			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 730				      TXREQ_FLAGS_REQ_DISABLE_SH);
 731
 732		/*
 733		 * Calculate the payload size - this is min of the fragment
 734		 * (MTU) size or the remaining bytes in the request but only
 735		 * if we have payload data.
 736		 */
 737		if (req->data_len) {
 738			iovec = &req->iovs[req->iov_idx];
 739			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 740				if (++req->iov_idx == req->data_iovs) {
 741					ret = -EFAULT;
 742					goto free_tx;
 743				}
 744				iovec = &req->iovs[req->iov_idx];
 745				WARN_ON(iovec->offset);
 746			}
 747
 748			datalen = compute_data_length(req, tx);
 749
 750			/*
 751			 * Disable header suppression for the payload <= 8DWS.
 752			 * If there is an uncorrectable error in the receive
 753			 * data FIFO when the received payload size is less than
  754			 * or equal to 8DWS, then the RxDmaDataFifoRdUncErr is
  755			 * not reported. Instead, RHF.EccErr is set if the header
  756			 * is not suppressed.
 757			 */
 758			if (!datalen) {
 759				SDMA_DBG(req,
 760					 "Request has data but pkt len is 0");
 761				ret = -EFAULT;
 762				goto free_tx;
 763			} else if (datalen <= 32) {
 764				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 765			}
 766		}
 767
 768		if (req->ahg_idx >= 0) {
 769			if (!req->seqnum) {
 770				ret = user_sdma_txadd_ahg(req, tx, datalen);
 771				if (ret)
 772					goto free_tx;
 773			} else {
 774				int changes;
 775
 776				changes = set_txreq_header_ahg(req, tx,
 777							       datalen);
 778				if (changes < 0) {
 779					ret = changes;
 780					goto free_tx;
 781				}
 782			}
 783		} else {
 784			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 785					  datalen, user_sdma_txreq_cb);
 786			if (ret)
 787				goto free_tx;
 788			/*
 789			 * Modify the header for this packet. This only needs
 790			 * to be done if we are not going to use AHG. Otherwise,
 791			 * the HW will do it based on the changes we gave it
 792			 * during sdma_txinit_ahg().
 793			 */
 794			ret = set_txreq_header(req, tx, datalen);
 795			if (ret)
 796				goto free_txreq;
 797		}
 798
 799		req->koffset += datalen;
 800		if (req_opcode(req->info.ctrl) == EXPECTED)
 801			req->tidoffset += datalen;
 802		req->sent += datalen;
 803		while (datalen) {
 804			ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
 805							    &datalen);
 806			if (ret)
 807				goto free_txreq;
 808			iovec = &req->iovs[req->iov_idx];
 809		}
 810		list_add_tail(&tx->txreq.list, &req->txps);
 811		/*
 812		 * It is important to increment this here as it is used to
 813		 * generate the BTH.PSN and, therefore, can't be bulk-updated
 814		 * outside of the loop.
 815		 */
 816		tx->seqnum = req->seqnum++;
 817		npkts++;
 818	}
 819dosend:
 820	ret = sdma_send_txlist(req->sde,
 821			       iowait_get_ib_work(&pq->busy),
 822			       &req->txps, &count);
 823	req->seqsubmitted += count;
 824	if (req->seqsubmitted == req->info.npkts) {
 825		/*
 826		 * The txreq has already been submitted to the HW queue
 827		 * so we can free the AHG entry now. Corruption will not
 828		 * happen due to the sequential manner in which
 829		 * descriptors are processed.
 830		 */
 831		if (req->ahg_idx >= 0)
 832			sdma_ahg_free(req->sde, req->ahg_idx);
 833	}
 834	return ret;
 835
 836free_txreq:
 837	sdma_txclean(pq->dd, &tx->txreq);
 838free_tx:
 839	kmem_cache_free(pq->txreq_cache, tx);
 840	return ret;
 841}
 842
 843static int check_header_template(struct user_sdma_request *req,
 844				 struct hfi1_pkt_header *hdr, u32 lrhlen,
 845				 u32 datalen)
 846{
 847	/*
 848	 * Perform safety checks for any type of packet:
  849	 *    - transfer size is a multiple of 64 bytes
  850	 *    - packet length is a multiple of 4 bytes
 851	 *    - packet length is not larger than MTU size
 852	 *
 853	 * These checks are only done for the first packet of the
 854	 * transfer since the header is "given" to us by user space.
 855	 * For the remainder of the packets we compute the values.
 856	 */
 857	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
 858	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
 859		return -EINVAL;
 860
 861	if (req_opcode(req->info.ctrl) == EXPECTED) {
 862		/*
 863		 * The header is checked only on the first packet. Furthermore,
 864		 * we ensure that at least one TID entry is copied when the
 865		 * request is submitted. Therefore, we don't have to verify that
 866		 * tididx points to something sane.
 867		 */
 868		u32 tidval = req->tids[req->tididx],
 869			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
 870			tididx = EXP_TID_GET(tidval, IDX),
 871			tidctrl = EXP_TID_GET(tidval, CTRL),
 872			tidoff;
 873		__le32 kval = hdr->kdeth.ver_tid_offset;
 874
 875		tidoff = KDETH_GET(kval, OFFSET) *
 876			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 877			   KDETH_OM_LARGE : KDETH_OM_SMALL);
 878		/*
 879		 * Expected receive packets have the following
 880		 * additional checks:
 881		 *     - offset is not larger than the TID size
 882		 *     - TIDCtrl values match between header and TID array
 883		 *     - TID indexes match between header and TID array
 884		 */
 885		if ((tidoff + datalen > tidlen) ||
 886		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
 887		    KDETH_GET(kval, TID) != tididx)
 888			return -EINVAL;
 889	}
 890	return 0;
 891}
 892
 893/*
 894 * Correctly set the BTH.PSN field based on type of
 895 * transfer - eager packets can just increment the PSN but
 896 * expected packets encode generation and sequence in the
 897 * BTH.PSN field so just incrementing will result in errors.
 898 */
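/*
 * For example, on an expected (TID) transfer only the low bits covered by
 * HFI1_KDETH_BTH_SEQ_MASK advance by "frags", preserving the generation
 * bits above them, while an eager transfer simply adds "frags" to the
 * whole (masked) PSN.
 */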
 899static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
 900{
 901	u32 val = be32_to_cpu(bthpsn),
 902		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
 903			0xffffffull),
 904		psn = val & mask;
 905	if (expct)
 906		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
 907			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
 908	else
 909		psn = psn + frags;
 910	return psn & mask;
 911}
 912
 913static int set_txreq_header(struct user_sdma_request *req,
 914			    struct user_sdma_txreq *tx, u32 datalen)
 915{
 916	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 917	struct hfi1_pkt_header *hdr = &tx->hdr;
 918	u8 omfactor; /* KDETH.OM */
 919	u16 pbclen;
 920	int ret;
 921	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
 922
 923	/* Copy the header template to the request before modification */
 924	memcpy(hdr, &req->hdr, sizeof(*hdr));
 925
 926	/*
 927	 * Check if the PBC and LRH length are mismatched. If so
 928	 * adjust both in the header.
 929	 */
 930	pbclen = le16_to_cpu(hdr->pbc[0]);
 931	if (PBC2LRH(pbclen) != lrhlen) {
 932		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 933		hdr->pbc[0] = cpu_to_le16(pbclen);
 934		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
 935		/*
 936		 * Third packet
 937		 * This is the first packet in the sequence that has
 938		 * a "static" size that can be used for the rest of
 939		 * the packets (besides the last one).
 940		 */
 941		if (unlikely(req->seqnum == 2)) {
 942			/*
 943			 * From this point on the lengths in both the
 944			 * PBC and LRH are the same until the last
 945			 * packet.
 946			 * Adjust the template so we don't have to update
 947			 * every packet
 948			 */
 949			req->hdr.pbc[0] = hdr->pbc[0];
 950			req->hdr.lrh[2] = hdr->lrh[2];
 951		}
 952	}
 953	/*
 954	 * We only have to modify the header if this is not the
 955	 * first packet in the request. Otherwise, we use the
 956	 * header given to us.
 957	 */
 958	if (unlikely(!req->seqnum)) {
 959		ret = check_header_template(req, hdr, lrhlen, datalen);
 960		if (ret)
 961			return ret;
 962		goto done;
 963	}
 964
 965	hdr->bth[2] = cpu_to_be32(
 966		set_pkt_bth_psn(hdr->bth[2],
 967				(req_opcode(req->info.ctrl) == EXPECTED),
 968				req->seqnum));
 969
 970	/* Set ACK request on last packet */
 971	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
 972		hdr->bth[2] |= cpu_to_be32(1UL << 31);
 973
 974	/* Set the new offset */
 975	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
 976	/* Expected packets have to fill in the new TID information */
 977	if (req_opcode(req->info.ctrl) == EXPECTED) {
 978		tidval = req->tids[req->tididx];
 979		/*
 980		 * If the offset puts us at the end of the current TID,
 981		 * advance everything.
 982		 */
 983		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
 984					 PAGE_SIZE)) {
 985			req->tidoffset = 0;
 986			/*
 987			 * Since we don't copy all the TIDs, all at once,
  988			 * Since we don't copy all the TIDs all at once,
 989			 */
 990			if (++req->tididx > req->n_tids - 1 ||
 991			    !req->tids[req->tididx]) {
 992				return -EINVAL;
 993			}
 994			tidval = req->tids[req->tididx];
 995		}
 996		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
 997			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
 998			KDETH_OM_SMALL_SHIFT;
 999		/* Set KDETH.TIDCtrl based on value for this TID. */
1000		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1001			  EXP_TID_GET(tidval, CTRL));
1002		/* Set KDETH.TID based on value for this TID */
1003		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1004			  EXP_TID_GET(tidval, IDX));
1005		/* Clear KDETH.SH when DISABLE_SH flag is set */
1006		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1007			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1008		/*
1009		 * Set the KDETH.OFFSET and KDETH.OM based on size of
1010		 * transfer.
1011		 */
1012		trace_hfi1_sdma_user_tid_info(
1013			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1014			req->tidoffset, req->tidoffset >> omfactor,
1015			omfactor != KDETH_OM_SMALL_SHIFT);
1016		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1017			  req->tidoffset >> omfactor);
1018		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1019			  omfactor != KDETH_OM_SMALL_SHIFT);
1020	}
1021done:
1022	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1023				    req->info.comp_idx, hdr, tidval);
1024	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1025}
1026
1027static int set_txreq_header_ahg(struct user_sdma_request *req,
1028				struct user_sdma_txreq *tx, u32 datalen)
1029{
1030	u32 ahg[AHG_KDETH_ARRAY_SIZE];
1031	int idx = 0;
1032	u8 omfactor; /* KDETH.OM */
1033	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1034	struct hfi1_pkt_header *hdr = &req->hdr;
1035	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1036	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1037	size_t array_size = ARRAY_SIZE(ahg);
1038
1039	if (PBC2LRH(pbclen) != lrhlen) {
1040		/* PBC.PbcLengthDWs */
1041		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1042				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1043		if (idx < 0)
1044			return idx;
1045		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1046		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1047				     (__force u16)cpu_to_be16(lrhlen >> 2));
1048		if (idx < 0)
1049			return idx;
1050	}
1051
1052	/*
1053	 * Do the common updates
1054	 */
1055	/* BTH.PSN and BTH.A */
1056	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1057		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1058	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1059		val32 |= 1UL << 31;
1060	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1061			     (__force u16)cpu_to_be16(val32 >> 16));
1062	if (idx < 0)
1063		return idx;
1064	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1065			     (__force u16)cpu_to_be16(val32 & 0xffff));
1066	if (idx < 0)
1067		return idx;
1068	/* KDETH.Offset */
1069	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1070			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
1071	if (idx < 0)
1072		return idx;
1073	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1074			     (__force u16)cpu_to_le16(req->koffset >> 16));
1075	if (idx < 0)
1076		return idx;
1077	if (req_opcode(req->info.ctrl) == EXPECTED) {
1078		__le16 val;
1079
1080		tidval = req->tids[req->tididx];
1081
1082		/*
1083		 * If the offset puts us at the end of the current TID,
1084		 * advance everything.
1085		 */
1086		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1087					 PAGE_SIZE)) {
1088			req->tidoffset = 0;
1089			/*
1090			 * Since we don't copy all the TIDs, all at once,
1091			 * we have to check again.
1092			 */
1093			if (++req->tididx > req->n_tids - 1 ||
1094			    !req->tids[req->tididx])
1095				return -EINVAL;
1096			tidval = req->tids[req->tididx];
1097		}
1098		omfactor = ((EXP_TID_GET(tidval, LEN) *
1099				  PAGE_SIZE) >=
1100				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1101				 KDETH_OM_SMALL_SHIFT;
1102		/* KDETH.OM and KDETH.OFFSET (TID) */
1103		idx = ahg_header_set(
1104				ahg, idx, array_size, 7, 0, 16,
1105				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1106				((req->tidoffset >> omfactor)
1107				& 0x7fff)));
1108		if (idx < 0)
1109			return idx;
1110		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1111		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1112				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1113
1114		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1115			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1116						      INTR) <<
1117					    AHG_KDETH_INTR_SHIFT));
1118		} else {
1119			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1120			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1121			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1122						      INTR) <<
1123					     AHG_KDETH_INTR_SHIFT));
1124		}
1125
1126		idx = ahg_header_set(ahg, idx, array_size,
1127				     7, 16, 14, (__force u16)val);
1128		if (idx < 0)
1129			return idx;
1130	}
1131
1132	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1133					req->info.comp_idx, req->sde->this_idx,
1134					req->ahg_idx, ahg, idx, tidval);
1135	sdma_txinit_ahg(&tx->txreq,
1136			SDMA_TXREQ_F_USE_AHG,
1137			datalen, req->ahg_idx, idx,
1138			ahg, sizeof(req->hdr),
1139			user_sdma_txreq_cb);
1140
1141	return idx;
1142}
1143
1144/**
1145 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1146 * @txreq: valid sdma tx request
1147 * @status: success/failure of request
1148 *
1149 * Called when the SDMA progress state machine gets notification that
1150 * the SDMA descriptors for this tx request have been processed by the
1151 * DMA engine. Called in interrupt context.
1152 * Only do work on completed sequences.
1153 */
1154static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1155{
1156	struct user_sdma_txreq *tx =
1157		container_of(txreq, struct user_sdma_txreq, txreq);
1158	struct user_sdma_request *req;
1159	struct hfi1_user_sdma_pkt_q *pq;
1160	struct hfi1_user_sdma_comp_q *cq;
1161	enum hfi1_sdma_comp_state state = COMPLETE;
1162
1163	if (!tx->req)
1164		return;
1165
1166	req = tx->req;
1167	pq = req->pq;
1168	cq = req->cq;
1169
1170	if (status != SDMA_TXREQ_S_OK) {
1171		SDMA_DBG(req, "SDMA completion with error %d",
1172			 status);
1173		WRITE_ONCE(req->has_error, 1);
1174		state = ERROR;
1175	}
1176
1177	req->seqcomp = tx->seqnum;
1178	kmem_cache_free(pq->txreq_cache, tx);
1179
1180	/* sequence isn't complete?  We are done */
1181	if (req->seqcomp != req->info.npkts - 1)
1182		return;
1183
1184	user_sdma_free_request(req);
1185	set_comp_state(pq, cq, req->info.comp_idx, state, status);
1186	pq_update(pq);
1187}
1188
1189static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1190{
1191	if (atomic_dec_and_test(&pq->n_reqs))
1192		wake_up(&pq->wait);
1193}
1194
1195static void user_sdma_free_request(struct user_sdma_request *req)
1196{
1197	if (!list_empty(&req->txps)) {
1198		struct sdma_txreq *t, *p;
1199
1200		list_for_each_entry_safe(t, p, &req->txps, list) {
1201			struct user_sdma_txreq *tx =
1202				container_of(t, struct user_sdma_txreq, txreq);
1203			list_del_init(&t->list);
1204			sdma_txclean(req->pq->dd, t);
1205			kmem_cache_free(req->pq->txreq_cache, tx);
1206		}
1207	}
1208
1209	kfree(req->tids);
1210	clear_bit(req->info.comp_idx, req->pq->req_in_use);
1211}
1212
1213static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1214				  struct hfi1_user_sdma_comp_q *cq,
1215				  u16 idx, enum hfi1_sdma_comp_state state,
1216				  int ret)
1217{
1218	if (state == ERROR)
1219		cq->comps[idx].errcode = -ret;
1220	smp_wmb(); /* make sure errcode is visible first */
1221	cq->comps[idx].status = state;
1222	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1223					idx, state, ret);
1224}
v5.4
 
   1/*
 
   2 * Copyright(c) 2015 - 2018 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
 
  47#include <linux/mm.h>
  48#include <linux/types.h>
  49#include <linux/device.h>
  50#include <linux/dmapool.h>
  51#include <linux/slab.h>
  52#include <linux/list.h>
  53#include <linux/highmem.h>
  54#include <linux/io.h>
  55#include <linux/uio.h>
  56#include <linux/rbtree.h>
  57#include <linux/spinlock.h>
  58#include <linux/delay.h>
  59#include <linux/kthread.h>
  60#include <linux/mmu_context.h>
  61#include <linux/module.h>
  62#include <linux/vmalloc.h>
  63#include <linux/string.h>
  64
  65#include "hfi.h"
  66#include "sdma.h"
  67#include "mmu_rb.h"
  68#include "user_sdma.h"
  69#include "verbs.h"  /* for the headers */
  70#include "common.h" /* for struct hfi1_tid_info */
  71#include "trace.h"
  72
  73static uint hfi1_sdma_comp_ring_size = 128;
  74module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
  75MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
  76
  77static unsigned initial_pkt_count = 8;
  78
  79static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
  80static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
  81static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
  82static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
  83static int pin_vector_pages(struct user_sdma_request *req,
  84			    struct user_sdma_iovec *iovec);
  85static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
  86			       unsigned start, unsigned npages);
  87static int check_header_template(struct user_sdma_request *req,
  88				 struct hfi1_pkt_header *hdr, u32 lrhlen,
  89				 u32 datalen);
  90static int set_txreq_header(struct user_sdma_request *req,
  91			    struct user_sdma_txreq *tx, u32 datalen);
  92static int set_txreq_header_ahg(struct user_sdma_request *req,
  93				struct user_sdma_txreq *tx, u32 len);
  94static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
  95				  struct hfi1_user_sdma_comp_q *cq,
  96				  u16 idx, enum hfi1_sdma_comp_state state,
  97				  int ret);
  98static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
  99static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
 100
 101static int defer_packet_queue(
 102	struct sdma_engine *sde,
 103	struct iowait_work *wait,
 104	struct sdma_txreq *txreq,
 105	uint seq,
 106	bool pkts_sent);
 107static void activate_packet_queue(struct iowait *wait, int reason);
 108static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
 109			   unsigned long len);
 110static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
 111static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
 112			 void *arg2, bool *stop);
 113static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
 114static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
 115
 116static struct mmu_rb_ops sdma_rb_ops = {
 117	.filter = sdma_rb_filter,
 118	.insert = sdma_rb_insert,
 119	.evict = sdma_rb_evict,
 120	.remove = sdma_rb_remove,
 121	.invalidate = sdma_rb_invalidate
 122};
 123
 124static int defer_packet_queue(
 125	struct sdma_engine *sde,
 126	struct iowait_work *wait,
 127	struct sdma_txreq *txreq,
 128	uint seq,
 129	bool pkts_sent)
 130{
 131	struct hfi1_user_sdma_pkt_q *pq =
 132		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
 133
 134	write_seqlock(&sde->waitlock);
 
 135	if (sdma_progress(sde, seq, txreq))
 136		goto eagain;
 137	/*
 138	 * We are assuming that if the list is enqueued somewhere, it
 139	 * is to the dmawait list since that is the only place where
 140	 * it is supposed to be enqueued.
 141	 */
 142	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
 143	if (list_empty(&pq->busy.list)) {
 
 144		iowait_get_priority(&pq->busy);
 145		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
 146	}
 147	write_sequnlock(&sde->waitlock);
 148	return -EBUSY;
 149eagain:
 150	write_sequnlock(&sde->waitlock);
 151	return -EAGAIN;
 152}
 153
 154static void activate_packet_queue(struct iowait *wait, int reason)
 155{
 156	struct hfi1_user_sdma_pkt_q *pq =
 157		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
 
 
 158	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
 159	wake_up(&wait->wait_dma);
 160};
 161
 162int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
 163				struct hfi1_filedata *fd)
 164{
 165	int ret = -ENOMEM;
 166	char buf[64];
 167	struct hfi1_devdata *dd;
 168	struct hfi1_user_sdma_comp_q *cq;
 169	struct hfi1_user_sdma_pkt_q *pq;
 170
 171	if (!uctxt || !fd)
 172		return -EBADF;
 173
 174	if (!hfi1_sdma_comp_ring_size)
 175		return -EINVAL;
 176
 177	dd = uctxt->dd;
 178
 179	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
 180	if (!pq)
 181		return -ENOMEM;
 182
 183	pq->dd = dd;
 184	pq->ctxt = uctxt->ctxt;
 185	pq->subctxt = fd->subctxt;
 186	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
 187	atomic_set(&pq->n_reqs, 0);
 188	init_waitqueue_head(&pq->wait);
 189	atomic_set(&pq->n_locked, 0);
 190	pq->mm = fd->mm;
 191
 192	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 193		    activate_packet_queue, NULL, NULL);
 194	pq->reqidx = 0;
 195
 196	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
 197			   sizeof(*pq->reqs),
 198			   GFP_KERNEL);
 199	if (!pq->reqs)
 200		goto pq_reqs_nomem;
 201
 202	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
 203				 sizeof(*pq->req_in_use),
 204				 GFP_KERNEL);
 205	if (!pq->req_in_use)
 206		goto pq_reqs_no_in_use;
 207
 208	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
 209		 fd->subctxt);
 210	pq->txreq_cache = kmem_cache_create(buf,
 211					    sizeof(struct user_sdma_txreq),
 212					    L1_CACHE_BYTES,
 213					    SLAB_HWCACHE_ALIGN,
 214					    NULL);
 215	if (!pq->txreq_cache) {
 216		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
 217			   uctxt->ctxt);
 218		goto pq_txreq_nomem;
 219	}
 220
 221	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
 222	if (!cq)
 223		goto cq_nomem;
 224
 225	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
 226				 * hfi1_sdma_comp_ring_size));
 227	if (!cq->comps)
 228		goto cq_comps_nomem;
 229
 230	cq->nentries = hfi1_sdma_comp_ring_size;
 231
 232	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
 233				   &pq->handler);
 234	if (ret) {
 235		dd_dev_err(dd, "Failed to register with MMU %d", ret);
 236		goto pq_mmu_fail;
 237	}
 238
 239	fd->pq = pq;
 240	fd->cq = cq;
 241
 242	return 0;
 243
 244pq_mmu_fail:
 245	vfree(cq->comps);
 246cq_comps_nomem:
 247	kfree(cq);
 248cq_nomem:
 249	kmem_cache_destroy(pq->txreq_cache);
 250pq_txreq_nomem:
 251	kfree(pq->req_in_use);
 252pq_reqs_no_in_use:
 253	kfree(pq->reqs);
 254pq_reqs_nomem:
 255	kfree(pq);
 256
 257	return ret;
 258}
 259
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 260int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 261			       struct hfi1_ctxtdata *uctxt)
 262{
 263	struct hfi1_user_sdma_pkt_q *pq;
 264
 265	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);
 266
 267	pq = fd->pq;
 
 
 268	if (pq) {
 269		if (pq->handler)
 270			hfi1_mmu_rb_unregister(pq->handler);
 
 
 271		iowait_sdma_drain(&pq->busy);
 272		/* Wait until all requests have been freed. */
 273		wait_event_interruptible(
 274			pq->wait,
 275			!atomic_read(&pq->n_reqs));
 276		kfree(pq->reqs);
 277		kfree(pq->req_in_use);
 
 278		kmem_cache_destroy(pq->txreq_cache);
 
 279		kfree(pq);
 280		fd->pq = NULL;
 
 281	}
 282	if (fd->cq) {
 283		vfree(fd->cq->comps);
 284		kfree(fd->cq);
 285		fd->cq = NULL;
 286	}
 287	return 0;
 288}
 289
 290static u8 dlid_to_selector(u16 dlid)
 291{
 292	static u8 mapping[256];
 293	static int initialized;
 294	static u8 next;
 295	int hash;
 296
 297	if (!initialized) {
 298		memset(mapping, 0xFF, 256);
 299		initialized = 1;
 300	}
 301
 302	hash = ((dlid >> 8) ^ dlid) & 0xFF;
 303	if (mapping[hash] == 0xFF) {
 304		mapping[hash] = next;
 305		next = (next + 1) & 0x7F;
 306	}
 307
 308	return mapping[hash];
 309}
 310
 311/**
 312 * hfi1_user_sdma_process_request() - Process and start a user sdma request
 313 * @fd: valid file descriptor
 314 * @iovec: array of io vectors to process
 315 * @dim: overall iovec array size
 316 * @count: number of io vector array entries processed
 317 */
 318int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 319				   struct iovec *iovec, unsigned long dim,
 320				   unsigned long *count)
 321{
 322	int ret = 0, i;
 323	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 324	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
 
 325	struct hfi1_user_sdma_comp_q *cq = fd->cq;
 326	struct hfi1_devdata *dd = pq->dd;
 327	unsigned long idx = 0;
 328	u8 pcount = initial_pkt_count;
 329	struct sdma_req_info info;
 330	struct user_sdma_request *req;
 331	u8 opcode, sc, vl;
 332	u16 pkey;
 333	u32 slid;
 334	u16 dlid;
 335	u32 selector;
 336
 337	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
 338		hfi1_cdbg(
 339		   SDMA,
 340		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
 341		   dd->unit, uctxt->ctxt, fd->subctxt,
 342		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
 343		return -EINVAL;
 344	}
 345	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
 346	if (ret) {
 347		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
 348			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
 349		return -EFAULT;
 350	}
 351
 352	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
 353				     (u16 *)&info);
 354	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
 355		hfi1_cdbg(SDMA,
 356			  "[%u:%u:%u:%u] Invalid comp index",
 357			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 358		return -EINVAL;
 359	}
 360
 361	/*
 362	 * Sanity check the header io vector count.  Need at least 1 vector
 363	 * (header) and cannot be larger than the actual io vector count.
 364	 */
 365	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
 366		hfi1_cdbg(SDMA,
 367			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
 368			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
 369			  req_iovcnt(info.ctrl), dim);
 370		return -EINVAL;
 371	}
 372
 373	if (!info.fragsize) {
 374		hfi1_cdbg(SDMA,
 375			  "[%u:%u:%u:%u] Request does not specify fragsize",
 376			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
 377		return -EINVAL;
 378	}
 379
 380	/* Try to claim the request. */
 381	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
 382		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
 383			  dd->unit, uctxt->ctxt, fd->subctxt,
 384			  info.comp_idx);
 385		return -EBADSLT;
 386	}
 387	/*
 388	 * All safety checks have been done and this request has been claimed.
 389	 */
 390	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
 391					     info.comp_idx);
 392	req = pq->reqs + info.comp_idx;
 393	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
 394	req->data_len  = 0;
 395	req->pq = pq;
 396	req->cq = cq;
 397	req->ahg_idx = -1;
 398	req->iov_idx = 0;
 399	req->sent = 0;
 400	req->seqnum = 0;
 401	req->seqcomp = 0;
 402	req->seqsubmitted = 0;
 403	req->tids = NULL;
 404	req->has_error = 0;
 405	INIT_LIST_HEAD(&req->txps);
 406
 407	memcpy(&req->info, &info, sizeof(info));
 408
 409	/* The request is initialized, count it */
 410	atomic_inc(&pq->n_reqs);
 411
 412	if (req_opcode(info.ctrl) == EXPECTED) {
 413		/* expected must have a TID info and at least one data vector */
 414		if (req->data_iovs < 2) {
 415			SDMA_DBG(req,
 416				 "Not enough vectors for expected request");
 417			ret = -EINVAL;
 418			goto free_req;
 419		}
 420		req->data_iovs--;
 421	}
 422
 423	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
 424		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
 425			 MAX_VECTORS_PER_REQ);
 426		ret = -EINVAL;
 427		goto free_req;
 428	}
 
 429	/* Copy the header from the user buffer */
 430	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
 431			     sizeof(req->hdr));
 432	if (ret) {
 433		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
 434		ret = -EFAULT;
 435		goto free_req;
 436	}
 437
 438	/* If Static rate control is not enabled, sanitize the header. */
 439	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
 440		req->hdr.pbc[2] = 0;
 441
 442	/* Validate the opcode. Do not trust packets from user space blindly. */
 443	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
 444	if ((opcode & USER_OPCODE_CHECK_MASK) !=
 445	     USER_OPCODE_CHECK_VAL) {
 446		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
 447		ret = -EINVAL;
 448		goto free_req;
 449	}
 450	/*
 451	 * Validate the vl. Do not trust packets from user space blindly.
 452	 * VL comes from PBC, SC comes from LRH, and the VL needs to
 453	 * match the SC look up.
 454	 */
 455	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
 456	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
 457	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
 458	if (vl >= dd->pport->vls_operational ||
 459	    vl != sc_to_vlt(dd, sc)) {
 460		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
 461		ret = -EINVAL;
 462		goto free_req;
 463	}
 464
 465	/* Checking P_KEY for requests from user-space */
 466	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
 467	slid = be16_to_cpu(req->hdr.lrh[3]);
 468	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
 469		ret = -EINVAL;
 470		goto free_req;
 471	}
 472
 473	/*
 474	 * Also should check the BTH.lnh. If it says the next header is GRH then
 475	 * the RXE parsing will be off and will land in the middle of the KDETH
 476	 * or miss it entirely.
 477	 */
 478	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
 479		SDMA_DBG(req, "User tried to pass in a GRH");
 480		ret = -EINVAL;
 481		goto free_req;
 482	}
 483
 484	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
 485	/*
 486	 * Calculate the initial TID offset based on the values of
 487	 * KDETH.OFFSET and KDETH.OM that are passed in.
 488	 */
 489	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
 490		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
 491		 KDETH_OM_LARGE : KDETH_OM_SMALL);
 492	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
 493					       info.comp_idx, req->tidoffset);
 494	idx++;
 495
 496	/* Save all the IO vector structures */
 497	for (i = 0; i < req->data_iovs; i++) {
 498		req->iovs[i].offset = 0;
 499		INIT_LIST_HEAD(&req->iovs[i].list);
 500		memcpy(&req->iovs[i].iov,
 501		       iovec + idx++,
 502		       sizeof(req->iovs[i].iov));
 503		ret = pin_vector_pages(req, &req->iovs[i]);
 504		if (ret) {
 505			req->data_iovs = i;
 506			goto free_req;
 507		}
 508		req->data_len += req->iovs[i].iov.iov_len;
 509	}
 510	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
 511					 info.comp_idx, req->data_len);
 512	if (pcount > req->info.npkts)
 513		pcount = req->info.npkts;
 514	/*
 515	 * Copy any TID info
 516	 * User space will provide the TID info only when the
 517	 * request type is EXPECTED. This is true even if there is
 518	 * only one packet in the request and the header is already
 519	 * set up. The reason for the single-TID case is that the
 520	 * driver needs to perform safety checks.
 521	 */
 522	if (req_opcode(req->info.ctrl) == EXPECTED) {
 523		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
 524		u32 *tmp;
 525
 526		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
 527			ret = -EINVAL;
 528			goto free_req;
 529		}
 530
 531		/*
 532		 * We have to copy all of the tids because they may vary
 533		 * in size and, therefore, the TID count might not be
 534		 * equal to the pkt count. However, there is no way to
 535		 * tell at this point.
 536		 */
 537		tmp = memdup_user(iovec[idx].iov_base,
 538				  ntids * sizeof(*req->tids));
 539		if (IS_ERR(tmp)) {
 540			ret = PTR_ERR(tmp);
 541			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
 542				 ntids, ret);
 543			goto free_req;
 544		}
 545		req->tids = tmp;
 546		req->n_tids = ntids;
 547		req->tididx = 0;
 548		idx++;
 549	}
 550
 551	dlid = be16_to_cpu(req->hdr.lrh[1]);
 552	selector = dlid_to_selector(dlid);
 553	selector += uctxt->ctxt + fd->subctxt;
 554	req->sde = sdma_select_user_engine(dd, selector, vl);
 555
 556	if (!req->sde || !sdma_running(req->sde)) {
 557		ret = -ECOMM;
 558		goto free_req;
 559	}
 560
 561	/* We don't need an AHG entry if the request contains only one packet */
 562	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
 563		req->ahg_idx = sdma_ahg_alloc(req->sde);
 564
 565	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
 566	pq->state = SDMA_PKT_Q_ACTIVE;
 567	/* Send the first N packets in the request to buy us some time */
 568	ret = user_sdma_send_pkts(req, pcount);
 569	if (unlikely(ret < 0 && ret != -EBUSY))
 570		goto free_req;
 571
 572	/*
 573	 * This is a somewhat blocking send implementation.
 574	 * The driver will block the caller until all packets of the
 575	 * request have been submitted to the SDMA engine. However, it
 576	 * will not wait for send completions.
 577	 */
 578	while (req->seqsubmitted != req->info.npkts) {
 579		ret = user_sdma_send_pkts(req, pcount);
 580		if (ret < 0) {
 581			if (ret != -EBUSY)
 582				goto free_req;
 583			wait_event_interruptible_timeout(
 584				pq->busy.wait_dma,
 585				(pq->state == SDMA_PKT_Q_ACTIVE),
 586				msecs_to_jiffies(
 587					SDMA_IOWAIT_TIMEOUT));
 588		}
 589	}
 590	*count += idx;
 591	return 0;
 592free_req:
 593	/*
 594	 * If seqsubmitted == npkts, the completion routine
 595	 * controls the final state.  If seqsubmitted < npkts, wait for any
 596	 * outstanding packets to finish before cleaning up.
 597	 */
 598	if (req->seqsubmitted < req->info.npkts) {
 599		if (req->seqsubmitted)
 600			wait_event(pq->busy.wait_dma,
 601				   (req->seqcomp == req->seqsubmitted - 1));
 602		user_sdma_free_request(req, true);
 603		pq_update(pq);
 604		set_comp_state(pq, cq, info.comp_idx, ERROR, ret);
 605	}
 606	return ret;
 607}
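
/*
 * Illustrative sketch (not part of the driver): the life of a request
 * submitted above, seen through the completion state it exposes.  Only
 * submission is (somewhat) blocking; completion is reported
 * asynchronously through the completion queue entry at info.comp_idx.
 *
 *   set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);   // accepted
 *   ret = user_sdma_send_pkts(req, pcount);             // prime the engine
 *   while (req->seqsubmitted != req->info.npkts)
 *           ret = user_sdma_send_pkts(req, pcount);     // queue the rest
 *   // user_sdma_txreq_cb() later moves the entry to COMPLETE or ERROR
 */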
 608
 609static inline u32 compute_data_length(struct user_sdma_request *req,
 610				      struct user_sdma_txreq *tx)
 611{
 612	/*
 613	 * Determine the proper size of the packet data.
 614	 * The size of the data of the first packet is in the header
 615	 * template. However, it includes the header and ICRC, which need
 616	 * to be subtracted.
 617	 * The minimum representable packet data length in a header is 4 bytes;
 618	 * therefore, when the requested data length is less than 4 bytes, there
 619	 * is only one packet, and its data length is equal to the request's
 620	 * data length.
 621	 * The size of the remaining packets is the minimum of the frag
 622	 * size (MTU) or remaining data in the request.
 623	 */
 624	u32 len;
 625
 626	if (!req->seqnum) {
 627		if (req->data_len < sizeof(u32))
 628			len = req->data_len;
 629		else
 630			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
 631			       (sizeof(tx->hdr) - 4));
 632	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
 633		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
 634			PAGE_SIZE;
 635		/*
 636		 * Get the data length based on the remaining space in the
 637		 * TID pair.
 638		 */
 639		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
 640		/* If we've filled up the TID pair, move to the next one. */
 641		if (unlikely(!len) && ++req->tididx < req->n_tids &&
 642		    req->tids[req->tididx]) {
 643			tidlen = EXP_TID_GET(req->tids[req->tididx],
 644					     LEN) * PAGE_SIZE;
 645			req->tidoffset = 0;
 646			len = min_t(u32, tidlen, req->info.fragsize);
 647		}
 648		/*
 649		 * Since the TID pairs map entire pages, make sure that we
 650		 * are not going to try to send more data than we have
 651		 * remaining.
 652		 */
 653		len = min(len, req->data_len - req->sent);
 654	} else {
 655		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
 656	}
 657	trace_hfi1_sdma_user_compute_length(req->pq->dd,
 658					    req->pq->ctxt,
 659					    req->pq->subctxt,
 660					    req->info.comp_idx,
 661					    len);
 662	return len;
 663}
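
/*
 * Illustrative sketch (not part of the driver): for an EXPECTED transfer
 * the per-packet length is bounded by what is left in the current TID
 * pair.  For example, with a TID pair covering two 4 KiB pages
 * (tidlen = 8192), fragsize = 4096 and tidoffset = 6144:
 *
 *   len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); // 2048
 *   len = min(len, req->data_len - req->sent);  // never exceed the request
 *
 * so this packet carries 2048 bytes and the next packet starts on the
 * next TID pair with tidoffset reset to 0.
 */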
 664
 665static inline u32 pad_len(u32 len)
 666{
 667	if (len & (sizeof(u32) - 1))
 668		len += sizeof(u32) - (len & (sizeof(u32) - 1));
 669	return len;
 670}
 671
 672static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
 673{
 674	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
 675	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
 676}
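
/*
 * Illustrative sketch (not part of the driver): pad_len() rounds the
 * payload up to a dword boundary and get_lrh_len() turns a payload size
 * into the LRH byte count (header minus PBC, plus the 4-byte ICRC).
 * For example:
 *
 *   pad_len(5);                                  // -> 8
 *   pad_len(8);                                  // -> 8 (already aligned)
 *   lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 *   // LRH.PktLen (hdr.lrh[2]) then carries lrhlen >> 2, i.e. dwords
 */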
 677
 678static int user_sdma_txadd_ahg(struct user_sdma_request *req,
 679			       struct user_sdma_txreq *tx,
 680			       u32 datalen)
 681{
 682	int ret;
 683	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
 684	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
 685	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 686
 687	/*
 688	 * Copy the request header into the tx header
 689	 * because the HW needs a cacheline-aligned
 690	 * address.
 691	 * This copy could be optimized out if the hdr
 692	 * member of user_sdma_request were also
 693	 * cacheline aligned.
 694	 */
 695	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
 696	if (PBC2LRH(pbclen) != lrhlen) {
 697		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
 698		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
 699	}
 700	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
 701	if (ret)
 702		return ret;
 703	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
 704			      sizeof(tx->hdr) + datalen, req->ahg_idx,
 705			      0, NULL, 0, user_sdma_txreq_cb);
 706	if (ret)
 707		return ret;
 708	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
 709	if (ret)
 710		sdma_txclean(pq->dd, &tx->txreq);
 711	return ret;
 712}
 713
 714static int user_sdma_txadd(struct user_sdma_request *req,
 715			   struct user_sdma_txreq *tx,
 716			   struct user_sdma_iovec *iovec, u32 datalen,
 717			   u32 *queued_ptr, u32 *data_sent_ptr,
 718			   u64 *iov_offset_ptr)
 719{
 720	int ret;
 721	unsigned int pageidx, len;
 722	unsigned long base, offset;
 723	u64 iov_offset = *iov_offset_ptr;
 724	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
 725	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 726
 727	base = (unsigned long)iovec->iov.iov_base;
 728	offset = offset_in_page(base + iovec->offset + iov_offset);
 729	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
 730		   PAGE_SHIFT);
 731	len = offset + req->info.fragsize > PAGE_SIZE ?
 732		PAGE_SIZE - offset : req->info.fragsize;
 733	len = min((datalen - queued), len);
 734	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
 735			      offset, len);
 736	if (ret) {
 737		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
 738		return ret;
 739	}
 740	iov_offset += len;
 741	queued += len;
 742	data_sent += len;
 743	if (unlikely(queued < datalen && pageidx == iovec->npages &&
 744		     req->iov_idx < req->data_iovs - 1)) {
 745		iovec->offset += iov_offset;
 746		iovec = &req->iovs[++req->iov_idx];
 747		iov_offset = 0;
 748	}
 749
 750	*queued_ptr = queued;
 751	*data_sent_ptr = data_sent;
 752	*iov_offset_ptr = iov_offset;
 753	return ret;
 754}
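
/*
 * Illustrative sketch (not part of the driver): the page index and
 * offset above are plain arithmetic within the pinned iovec.  Assuming
 * a 4 KiB PAGE_SIZE, a page-aligned iov_base, iovec->offset = 5000 and
 * iov_offset = 0:
 *
 *   offset  = offset_in_page(base + 5000 + 0);            // 904
 *   pageidx = ((5000 + 0 + base) - (base & PAGE_MASK))
 *             >> PAGE_SHIFT;                              // 1
 *
 * i.e. the descriptor points 904 bytes into the second pinned page, and
 * len is clipped so the fragment never crosses that page boundary.
 */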
 755
 756static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 757{
 758	int ret = 0;
 759	u16 count;
 760	unsigned npkts = 0;
 761	struct user_sdma_txreq *tx = NULL;
 762	struct hfi1_user_sdma_pkt_q *pq = NULL;
 763	struct user_sdma_iovec *iovec = NULL;
 764
 765	if (!req->pq)
 766		return -EINVAL;
 767
 768	pq = req->pq;
 769
 770	/* If tx completion has reported an error, we are done. */
 771	if (READ_ONCE(req->has_error))
 772		return -EFAULT;
 773
 774	/*
 775	 * Check if we might have sent the entire request already
 776	 */
 777	if (unlikely(req->seqnum == req->info.npkts)) {
 778		if (!list_empty(&req->txps))
 779			goto dosend;
 780		return ret;
 781	}
 782
 783	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
 784		maxpkts = req->info.npkts - req->seqnum;
 785
 786	while (npkts < maxpkts) {
 787		u32 datalen = 0, queued = 0, data_sent = 0;
 788		u64 iov_offset = 0;
 789
 790		/*
 791		 * Check whether any of the completions have come back
 792		 * with errors. If so, we are not going to process any
 793		 * more packets from this request.
 794		 */
 795		if (READ_ONCE(req->has_error))
 796			return -EFAULT;
 797
 798		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
 799		if (!tx)
 800			return -ENOMEM;
 801
 802		tx->flags = 0;
 803		tx->req = req;
 804		INIT_LIST_HEAD(&tx->list);
 805
 806		/*
 807		 * For the last packet set the ACK request
 808		 * and disable header suppression.
 809		 */
 810		if (req->seqnum == req->info.npkts - 1)
 811			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
 812				      TXREQ_FLAGS_REQ_DISABLE_SH);
 813
 814		/*
 815		 * Calculate the payload size - this is the minimum of the
 816		 * fragment (MTU) size and the remaining bytes in the request,
 817		 * but only if we have payload data.
 818		 */
 819		if (req->data_len) {
 820			iovec = &req->iovs[req->iov_idx];
 821			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 822				if (++req->iov_idx == req->data_iovs) {
 823					ret = -EFAULT;
 824					goto free_tx;
 825				}
 826				iovec = &req->iovs[req->iov_idx];
 827				WARN_ON(iovec->offset);
 828			}
 829
 830			datalen = compute_data_length(req, tx);
 831
 832			/*
 833			 * Disable header suppression for payloads of 8 DWs or less.
 834			 * If there is an uncorrectable error in the receive
 835			 * data FIFO when the received payload size is less than
 836			 * or equal to 8 DWs, then RxDmaDataFifoRdUncErr is
 837			 * not reported; RHF.EccErr is only set when the header
 838			 * is not suppressed.
 839			 */
 840			if (!datalen) {
 841				SDMA_DBG(req,
 842					 "Request has data but pkt len is 0");
 843				ret = -EFAULT;
 844				goto free_tx;
 845			} else if (datalen <= 32) {
 846				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
 847			}
 848		}
 849
 850		if (req->ahg_idx >= 0) {
 851			if (!req->seqnum) {
 852				ret = user_sdma_txadd_ahg(req, tx, datalen);
 853				if (ret)
 854					goto free_tx;
 855			} else {
 856				int changes;
 857
 858				changes = set_txreq_header_ahg(req, tx,
 859							       datalen);
 860				if (changes < 0) {
 861					ret = changes;
 862					goto free_tx;
 863				}
 864			}
 865		} else {
 866			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
 867					  datalen, user_sdma_txreq_cb);
 868			if (ret)
 869				goto free_tx;
 870			/*
 871			 * Modify the header for this packet. This only needs
 872			 * to be done if we are not going to use AHG. Otherwise,
 873			 * the HW will do it based on the changes we gave it
 874			 * during sdma_txinit_ahg().
 875			 */
 876			ret = set_txreq_header(req, tx, datalen);
 877			if (ret)
 878				goto free_txreq;
 879		}
 880
 881		/*
 882		 * If the request contains any data vectors, add up to
 883		 * fragsize bytes to the descriptor.
 884		 */
 885		while (queued < datalen &&
 886		       (req->sent + data_sent) < req->data_len) {
 887			ret = user_sdma_txadd(req, tx, iovec, datalen,
 888					      &queued, &data_sent, &iov_offset);
 889			if (ret)
 890				goto free_txreq;
 891		}
 892		/*
 893		 * The txreq was built successfully, so we can update
 894		 * the request counters before queuing it for submission.
 895		 */
 896		req->koffset += datalen;
 897		if (req_opcode(req->info.ctrl) == EXPECTED)
 898			req->tidoffset += datalen;
 899		req->sent += data_sent;
 900		if (req->data_len)
 901			iovec->offset += iov_offset;
 902		list_add_tail(&tx->txreq.list, &req->txps);
 903		/*
 904		 * It is important to increment this here as it is used to
 905		 * generate the BTH.PSN and, therefore, can't be bulk-updated
 906		 * outside of the loop.
 907		 */
 908		tx->seqnum = req->seqnum++;
 909		npkts++;
 910	}
 911dosend:
 912	ret = sdma_send_txlist(req->sde,
 913			       iowait_get_ib_work(&pq->busy),
 914			       &req->txps, &count);
 915	req->seqsubmitted += count;
 916	if (req->seqsubmitted == req->info.npkts) {
 917		/*
 918		 * The txreq has already been submitted to the HW queue
 919		 * so we can free the AHG entry now. Corruption will not
 920		 * happen due to the sequential manner in which
 921		 * descriptors are processed.
 922		 */
 923		if (req->ahg_idx >= 0)
 924			sdma_ahg_free(req->sde, req->ahg_idx);
 925	}
 926	return ret;
 927
 928free_txreq:
 929	sdma_txclean(pq->dd, &tx->txreq);
 930free_tx:
 931	kmem_cache_free(pq->txreq_cache, tx);
 932	return ret;
 933}
 934
 935static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
 936{
 937	struct evict_data evict_data;
 938
 939	evict_data.cleared = 0;
 940	evict_data.target = npages;
 941	hfi1_mmu_rb_evict(pq->handler, &evict_data);
 942	return evict_data.cleared;
 943}
 944
 945static int pin_sdma_pages(struct user_sdma_request *req,
 946			  struct user_sdma_iovec *iovec,
 947			  struct sdma_mmu_node *node,
 948			  int npages)
 949{
 950	int pinned, cleared;
 951	struct page **pages;
 952	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 953
 954	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 955	if (!pages)
 956		return -ENOMEM;
 957	memcpy(pages, node->pages, node->npages * sizeof(*pages));
 958
 959	npages -= node->npages;
 960retry:
 961	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
 962				atomic_read(&pq->n_locked), npages)) {
 963		cleared = sdma_cache_evict(pq, npages);
 964		if (cleared >= npages)
 965			goto retry;
 966	}
 967	pinned = hfi1_acquire_user_pages(pq->mm,
 968					 ((unsigned long)iovec->iov.iov_base +
 969					 (node->npages * PAGE_SIZE)), npages, 0,
 970					 pages + node->npages);
 971	if (pinned < 0) {
 972		kfree(pages);
 973		return pinned;
 974	}
 975	if (pinned != npages) {
 976		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
 977		return -EFAULT;
 978	}
 979	kfree(node->pages);
 980	node->rb.len = iovec->iov.iov_len;
 981	node->pages = pages;
 982	atomic_add(pinned, &pq->n_locked);
 983	return pinned;
 984}
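
/*
 * Illustrative sketch (not part of the driver): the pin/evict interplay
 * above is a "make room, then retry" loop over the per-queue n_locked
 * accounting.  Roughly:
 *
 *   while (!hfi1_can_pin_pages(pq->dd, pq->mm,
 *                              atomic_read(&pq->n_locked), npages)) {
 *           if (sdma_cache_evict(pq, npages) < npages)
 *                   break;  // could not free enough; attempt the pin anyway
 *   }
 *
 * If eviction cannot free enough pages, the pin is still attempted and
 * hfi1_acquire_user_pages() is allowed to succeed or fail on its own.
 */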
 985
 986static void unpin_sdma_pages(struct sdma_mmu_node *node)
 987{
 988	if (node->npages) {
 989		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
 990		atomic_sub(node->npages, &node->pq->n_locked);
 991	}
 992}
 993
 994static int pin_vector_pages(struct user_sdma_request *req,
 995			    struct user_sdma_iovec *iovec)
 996{
 997	int ret = 0, pinned, npages;
 998	struct hfi1_user_sdma_pkt_q *pq = req->pq;
 999	struct sdma_mmu_node *node = NULL;
1000	struct mmu_rb_node *rb_node;
1001	struct iovec *iov;
1002	bool extracted;
1003
1004	extracted =
1005		hfi1_mmu_rb_remove_unless_exact(pq->handler,
1006						(unsigned long)
1007						iovec->iov.iov_base,
1008						iovec->iov.iov_len, &rb_node);
1009	if (rb_node) {
1010		node = container_of(rb_node, struct sdma_mmu_node, rb);
1011		if (!extracted) {
1012			atomic_inc(&node->refcount);
1013			iovec->pages = node->pages;
1014			iovec->npages = node->npages;
1015			iovec->node = node;
1016			return 0;
1017		}
1018	}
1019
1020	if (!node) {
1021		node = kzalloc(sizeof(*node), GFP_KERNEL);
1022		if (!node)
1023			return -ENOMEM;
1024
1025		node->rb.addr = (unsigned long)iovec->iov.iov_base;
1026		node->pq = pq;
1027		atomic_set(&node->refcount, 0);
1028	}
1029
1030	iov = &iovec->iov;
1031	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
1032	if (node->npages < npages) {
1033		pinned = pin_sdma_pages(req, iovec, node, npages);
1034		if (pinned < 0) {
1035			ret = pinned;
1036			goto bail;
1037		}
1038		node->npages += pinned;
1039		npages = node->npages;
1040	}
1041	iovec->pages = node->pages;
1042	iovec->npages = npages;
1043	iovec->node = node;
1044
1045	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
1046	if (ret) {
1047		iovec->node = NULL;
1048		goto bail;
1049	}
1050	return 0;
1051bail:
1052	unpin_sdma_pages(node);
1053	kfree(node);
1054	return ret;
1055}
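
/*
 * Illustrative sketch (not part of the driver): the rb-tree cache lookup
 * above has three outcomes for a user buffer:
 *
 *   - a node is found and left in place: take a reference and reuse it,
 *         atomic_inc(&node->refcount);
 *         iovec->pages  = node->pages;
 *         iovec->npages = node->npages;
 *   - a node is found but extracted: pin_sdma_pages() keeps the pages
 *     that are already pinned and pins only the missing tail before the
 *     grown node is re-inserted;
 *   - no node exists: a fresh node is allocated, fully pinned and inserted.
 */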
1056
1057static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
1058			       unsigned start, unsigned npages)
1059{
1060	hfi1_release_user_pages(mm, pages + start, npages, false);
1061	kfree(pages);
1062}
1063
1064static int check_header_template(struct user_sdma_request *req,
1065				 struct hfi1_pkt_header *hdr, u32 lrhlen,
1066				 u32 datalen)
1067{
1068	/*
1069	 * Perform safety checks for any type of packet:
1070	 *    - transfer size is a multiple of 64 bytes
1071	 *    - packet length is a multiple of 4 bytes
1072	 *    - packet length is not larger than MTU size
1073	 *
1074	 * These checks are only done for the first packet of the
1075	 * transfer since the header is "given" to us by user space.
1076	 * For the remainder of the packets we compute the values.
1077	 */
1078	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
1079	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
1080		return -EINVAL;
1081
1082	if (req_opcode(req->info.ctrl) == EXPECTED) {
1083		/*
1084		 * The header is checked only on the first packet. Furthermore,
1085		 * we ensure that at least one TID entry is copied when the
1086		 * request is submitted. Therefore, we don't have to verify that
1087		 * tididx points to something sane.
1088		 */
1089		u32 tidval = req->tids[req->tididx],
1090			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
1091			tididx = EXP_TID_GET(tidval, IDX),
1092			tidctrl = EXP_TID_GET(tidval, CTRL),
1093			tidoff;
1094		__le32 kval = hdr->kdeth.ver_tid_offset;
1095
1096		tidoff = KDETH_GET(kval, OFFSET) *
1097			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
1098			   KDETH_OM_LARGE : KDETH_OM_SMALL);
1099		/*
1100		 * Expected receive packets have the following
1101		 * additional checks:
1102		 *     - offset is not larger than the TID size
1103		 *     - TIDCtrl values match between header and TID array
1104		 *     - TID indexes match between header and TID array
1105		 */
1106		if ((tidoff + datalen > tidlen) ||
1107		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
1108		    KDETH_GET(kval, TID) != tididx)
1109			return -EINVAL;
1110	}
1111	return 0;
1112}
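
/*
 * Illustrative sketch (not part of the driver), assuming
 * PIO_BLOCK_SIZE == 64: the generic checks above reject, for instance,
 *
 *   req->info.fragsize = 4096;   // OK:       4096 % 64 == 0
 *   req->info.fragsize = 4100;   // rejected: not a 64-byte multiple
 *   lrhlen = 0x7e;               // rejected: lrhlen & 0x3 != 0
 *
 * while the EXPECTED-only checks make sure the user-supplied KDETH
 * TID/TIDCtrl/offset fields agree with the TID array the driver copied.
 */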
1113
1114/*
1115 * Correctly set the BTH.PSN field based on type of
1116 * transfer - eager packets can just increment the PSN but
1117 * expected packets encode generation and sequence in the
1118 * BTH.PSN field so just incrementing will result in errors.
1119 */
1120static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
1121{
1122	u32 val = be32_to_cpu(bthpsn),
1123		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
1124			0xffffffull),
1125		psn = val & mask;
1126	if (expct)
1127		psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
1128			((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
1129	else
1130		psn = psn + frags;
1131	return psn & mask;
1132}
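
/*
 * Illustrative sketch (not part of the driver), assuming
 * HFI1_KDETH_BTH_SEQ_MASK == 0x7ff: eager PSNs simply advance, while
 * expected (KDETH) PSNs keep the generation bits above the sequence
 * mask and wrap only the sequence portion.
 *
 *   set_pkt_bth_psn(cpu_to_be32(0x12345), 0, 3);  // eager:    0x12348
 *   set_pkt_bth_psn(cpu_to_be32(0x127fe), 1, 3);  // expected: 0x12001
 *                                                 // seq wraps, gen kept
 */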
1133
1134static int set_txreq_header(struct user_sdma_request *req,
1135			    struct user_sdma_txreq *tx, u32 datalen)
1136{
1137	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1138	struct hfi1_pkt_header *hdr = &tx->hdr;
1139	u8 omfactor; /* KDETH.OM */
1140	u16 pbclen;
1141	int ret;
1142	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1143
1144	/* Copy the header template to the request before modification */
1145	memcpy(hdr, &req->hdr, sizeof(*hdr));
1146
1147	/*
1148	 * Check if the PBC and LRH lengths are mismatched. If so,
1149	 * adjust both in the header.
1150	 */
1151	pbclen = le16_to_cpu(hdr->pbc[0]);
1152	if (PBC2LRH(pbclen) != lrhlen) {
1153		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
1154		hdr->pbc[0] = cpu_to_le16(pbclen);
1155		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
1156		/*
1157		 * Third packet of the request:
1158		 * this is the first packet in the sequence that has
1159		 * a "static" size that can be used for the rest of
1160		 * the packets (besides the last one).
1161		 */
1162		if (unlikely(req->seqnum == 2)) {
1163			/*
1164			 * From this point on the lengths in both the
1165			 * PBC and LRH are the same until the last
1166			 * packet.
1167			 * Adjust the template so we don't have to update
1168			 * every packet
1169			 */
1170			req->hdr.pbc[0] = hdr->pbc[0];
1171			req->hdr.lrh[2] = hdr->lrh[2];
1172		}
1173	}
1174	/*
1175	 * We only have to modify the header if this is not the
1176	 * first packet in the request. Otherwise, we use the
1177	 * header given to us.
1178	 */
1179	if (unlikely(!req->seqnum)) {
1180		ret = check_header_template(req, hdr, lrhlen, datalen);
1181		if (ret)
1182			return ret;
1183		goto done;
1184	}
1185
1186	hdr->bth[2] = cpu_to_be32(
1187		set_pkt_bth_psn(hdr->bth[2],
1188				(req_opcode(req->info.ctrl) == EXPECTED),
1189				req->seqnum));
1190
1191	/* Set ACK request on last packet */
1192	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1193		hdr->bth[2] |= cpu_to_be32(1UL << 31);
1194
1195	/* Set the new offset */
1196	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
1197	/* Expected packets have to fill in the new TID information */
1198	if (req_opcode(req->info.ctrl) == EXPECTED) {
1199		tidval = req->tids[req->tididx];
1200		/*
1201		 * If the offset puts us at the end of the current TID,
1202		 * advance everything.
1203		 */
1204		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1205					 PAGE_SIZE)) {
1206			req->tidoffset = 0;
1207			/*
1208			 * Since we don't copy all the TIDs at once,
1209			 * we have to check again.
1210			 */
1211			if (++req->tididx > req->n_tids - 1 ||
1212			    !req->tids[req->tididx]) {
1213				return -EINVAL;
1214			}
1215			tidval = req->tids[req->tididx];
1216		}
1217		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
1218			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
1219			KDETH_OM_SMALL_SHIFT;
1220		/* Set KDETH.TIDCtrl based on value for this TID. */
1221		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1222			  EXP_TID_GET(tidval, CTRL));
1223		/* Set KDETH.TID based on value for this TID */
1224		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1225			  EXP_TID_GET(tidval, IDX));
1226		/* Clear KDETH.SH when DISABLE_SH flag is set */
1227		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
1228			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1229		/*
1230		 * Set the KDETH.OFFSET and KDETH.OM based on size of
1231		 * transfer.
1232		 */
1233		trace_hfi1_sdma_user_tid_info(
1234			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
1235			req->tidoffset, req->tidoffset >> omfactor,
1236			omfactor != KDETH_OM_SMALL_SHIFT);
1237		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1238			  req->tidoffset >> omfactor);
1239		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1240			  omfactor != KDETH_OM_SMALL_SHIFT);
1241	}
1242done:
1243	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1244				    req->info.comp_idx, hdr, tidval);
1245	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1246}
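
/*
 * Illustrative sketch (not part of the driver), assuming
 * KDETH_OM_SMALL_SHIFT == 2 (4-byte units) and KDETH_OM_LARGE_SHIFT == 6
 * (64-byte units): KDETH.OFFSET is expressed in units of the offset
 * multiplier selected by KDETH.OM, so a byte offset of 8192 into a large
 * TID pair is encoded as 128 * 64:
 *
 *   omfactor = KDETH_OM_LARGE_SHIFT;
 *   KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, 8192 >> omfactor); // 128
 *   KDETH_SET(hdr->kdeth.ver_tid_offset, OM, 1);     // large multiplier
 */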
1247
1248static int set_txreq_header_ahg(struct user_sdma_request *req,
1249				struct user_sdma_txreq *tx, u32 datalen)
1250{
1251	u32 ahg[AHG_KDETH_ARRAY_SIZE];
1252	int idx = 0;
1253	u8 omfactor; /* KDETH.OM */
1254	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1255	struct hfi1_pkt_header *hdr = &req->hdr;
1256	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1257	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));
1258	size_t array_size = ARRAY_SIZE(ahg);
1259
1260	if (PBC2LRH(pbclen) != lrhlen) {
1261		/* PBC.PbcLengthDWs */
1262		idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12,
1263				     (__force u16)cpu_to_le16(LRH2PBC(lrhlen)));
1264		if (idx < 0)
1265			return idx;
1266		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1267		idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16,
1268				     (__force u16)cpu_to_be16(lrhlen >> 2));
1269		if (idx < 0)
1270			return idx;
1271	}
1272
1273	/*
1274	 * Do the common updates
1275	 */
1276	/* BTH.PSN and BTH.A */
1277	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1278		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1279	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
1280		val32 |= 1UL << 31;
1281	idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16,
1282			     (__force u16)cpu_to_be16(val32 >> 16));
1283	if (idx < 0)
1284		return idx;
1285	idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16,
1286			     (__force u16)cpu_to_be16(val32 & 0xffff));
1287	if (idx < 0)
1288		return idx;
1289	/* KDETH.Offset */
1290	idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16,
1291			     (__force u16)cpu_to_le16(req->koffset & 0xffff));
1292	if (idx < 0)
1293		return idx;
1294	idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16,
1295			     (__force u16)cpu_to_le16(req->koffset >> 16));
1296	if (idx < 0)
1297		return idx;
1298	if (req_opcode(req->info.ctrl) == EXPECTED) {
1299		__le16 val;
1300
1301		tidval = req->tids[req->tididx];
1302
1303		/*
1304		 * If the offset puts us at the end of the current TID,
1305		 * advance everything.
1306		 */
1307		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1308					 PAGE_SIZE)) {
1309			req->tidoffset = 0;
1310			/*
1311			 * Since we don't copy all the TIDs at once,
1312			 * we have to check again.
1313			 */
1314			if (++req->tididx > req->n_tids - 1 ||
1315			    !req->tids[req->tididx])
1316				return -EINVAL;
1317			tidval = req->tids[req->tididx];
1318		}
1319		omfactor = ((EXP_TID_GET(tidval, LEN) *
1320				  PAGE_SIZE) >=
1321				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
1322				 KDETH_OM_SMALL_SHIFT;
1323		/* KDETH.OM and KDETH.OFFSET (TID) */
1324		idx = ahg_header_set(
1325				ahg, idx, array_size, 7, 0, 16,
1326				((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
1327				((req->tidoffset >> omfactor)
1328				& 0x7fff)));
1329		if (idx < 0)
1330			return idx;
1331		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
1332		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1333				   (EXP_TID_GET(tidval, IDX) & 0x3ff));
1334
1335		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
1336			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1337						      INTR) <<
1338					    AHG_KDETH_INTR_SHIFT));
1339		} else {
1340			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
1341			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
1342			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
1343						      INTR) <<
1344					     AHG_KDETH_INTR_SHIFT));
1345		}
1346
1347		idx = ahg_header_set(ahg, idx, array_size,
1348				     7, 16, 14, (__force u16)val);
1349		if (idx < 0)
1350			return idx;
1351	}
1352
1353	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1354					req->info.comp_idx, req->sde->this_idx,
1355					req->ahg_idx, ahg, idx, tidval);
1356	sdma_txinit_ahg(&tx->txreq,
1357			SDMA_TXREQ_F_USE_AHG,
1358			datalen, req->ahg_idx, idx,
1359			ahg, sizeof(req->hdr),
1360			user_sdma_txreq_cb);
1361
1362	return idx;
1363}
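
/*
 * Illustrative sketch (not part of the driver): with AHG the engine
 * reuses the header stored by the first packet and only the 16-bit
 * fields listed in ahg[] are patched per packet.  A minimal update that
 * only advances KDETH.Offset would mirror the two calls above:
 *
 *   int idx = 0;
 *
 *   idx = ahg_header_set(ahg, idx, ARRAY_SIZE(ahg), 15, 0, 16,
 *                        (__force u16)cpu_to_le16(req->koffset & 0xffff));
 *   idx = ahg_header_set(ahg, idx, ARRAY_SIZE(ahg), 15, 16, 16,
 *                        (__force u16)cpu_to_le16(req->koffset >> 16));
 *
 * where (dw, bit, width) select which half-word of header dword 15 is
 * replaced, and the final idx (the number of updates) is what gets
 * handed to sdma_txinit_ahg().
 */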
1364
1365/**
1366 * user_sdma_txreq_cb() - SDMA tx request completion callback.
1367 * @txreq: valid sdma tx request
1368 * @status: success/failure of request
1369 *
1370 * Called when the SDMA progress state machine gets notification that
1371 * the SDMA descriptors for this tx request have been processed by the
1372 * DMA engine. Called in interrupt context.
1373 * Only do work on completed sequences.
1374 */
1375static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1376{
1377	struct user_sdma_txreq *tx =
1378		container_of(txreq, struct user_sdma_txreq, txreq);
1379	struct user_sdma_request *req;
1380	struct hfi1_user_sdma_pkt_q *pq;
1381	struct hfi1_user_sdma_comp_q *cq;
1382	enum hfi1_sdma_comp_state state = COMPLETE;
1383
1384	if (!tx->req)
1385		return;
1386
1387	req = tx->req;
1388	pq = req->pq;
1389	cq = req->cq;
1390
1391	if (status != SDMA_TXREQ_S_OK) {
1392		SDMA_DBG(req, "SDMA completion with error %d",
1393			 status);
1394		WRITE_ONCE(req->has_error, 1);
1395		state = ERROR;
1396	}
1397
1398	req->seqcomp = tx->seqnum;
1399	kmem_cache_free(pq->txreq_cache, tx);
1400
1401	/* If the whole sequence hasn't completed yet, we are done for now */
1402	if (req->seqcomp != req->info.npkts - 1)
1403		return;
1404
1405	user_sdma_free_request(req, false);
1406	set_comp_state(pq, cq, req->info.comp_idx, state, status);
1407	pq_update(pq);
1408}
1409
1410static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1411{
1412	if (atomic_dec_and_test(&pq->n_reqs))
1413		wake_up(&pq->wait);
1414}
1415
1416static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
1417{
1418	int i;
1419
1420	if (!list_empty(&req->txps)) {
1421		struct sdma_txreq *t, *p;
1422
1423		list_for_each_entry_safe(t, p, &req->txps, list) {
1424			struct user_sdma_txreq *tx =
1425				container_of(t, struct user_sdma_txreq, txreq);
1426			list_del_init(&t->list);
1427			sdma_txclean(req->pq->dd, t);
1428			kmem_cache_free(req->pq->txreq_cache, tx);
1429		}
1430	}
1431
1432	for (i = 0; i < req->data_iovs; i++) {
1433		struct sdma_mmu_node *node = req->iovs[i].node;
1434
1435		if (!node)
1436			continue;
1437
1438		req->iovs[i].node = NULL;
1439
1440		if (unpin)
1441			hfi1_mmu_rb_remove(req->pq->handler,
1442					   &node->rb);
1443		else
1444			atomic_dec(&node->refcount);
1445	}
1446
1447	kfree(req->tids);
1448	clear_bit(req->info.comp_idx, req->pq->req_in_use);
1449}
1450
1451static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1452				  struct hfi1_user_sdma_comp_q *cq,
1453				  u16 idx, enum hfi1_sdma_comp_state state,
1454				  int ret)
1455{
1456	if (state == ERROR)
1457		cq->comps[idx].errcode = -ret;
1458	smp_wmb(); /* make sure errcode is visible first */
1459	cq->comps[idx].status = state;
1460	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1461					idx, state, ret);
1462}
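
/*
 * Illustrative sketch (not part of the driver): the smp_wmb() above is
 * assumed to pair with a read barrier on the consumer side of the
 * completion ring, e.g. roughly:
 *
 *   state = READ_ONCE(cq->comps[idx].status);
 *   smp_rmb();   // or an equivalent acquire on the user-space side
 *   if (state == ERROR)
 *           err = cq->comps[idx].errcode;
 *
 * so that any reader observing the new status also observes the errcode
 * written before it.
 */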
1463
1464static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
1465			   unsigned long len)
1466{
1467	return (bool)(node->addr == addr);
1468}
1469
1470static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
1471{
1472	struct sdma_mmu_node *node =
1473		container_of(mnode, struct sdma_mmu_node, rb);
1474
1475	atomic_inc(&node->refcount);
1476	return 0;
1477}
1478
1479/*
1480 * Return 1 to remove the node from the rb tree and call the remove op.
1481 *
1482 * Called with the rb tree lock held.
1483 */
1484static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
1485			 void *evict_arg, bool *stop)
1486{
1487	struct sdma_mmu_node *node =
1488		container_of(mnode, struct sdma_mmu_node, rb);
1489	struct evict_data *evict_data = evict_arg;
1490
1491	/* is this node still being used? */
1492	if (atomic_read(&node->refcount))
1493		return 0; /* keep this node */
1494
1495	/* this node will be evicted, add its pages to our count */
1496	evict_data->cleared += node->npages;
1497
1498	/* have enough pages been cleared? */
1499	if (evict_data->cleared >= evict_data->target)
1500		*stop = true;
1501
1502	return 1; /* remove this node */
1503}
1504
1505static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
1506{
1507	struct sdma_mmu_node *node =
1508		container_of(mnode, struct sdma_mmu_node, rb);
1509
1510	unpin_sdma_pages(node);
1511	kfree(node);
1512}
1513
1514static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
1515{
1516	struct sdma_mmu_node *node =
1517		container_of(mnode, struct sdma_mmu_node, rb);
1518
1519	if (!atomic_read(&node->refcount))
1520		return 1;
1521	return 0;
1522}