   1/*
   2 * Copyright(c) 2016 Intel Corporation.
   3 *
   4 * This file is provided under a dual BSD/GPLv2 license.  When using or
   5 * redistributing this file, you may do so under either license.
   6 *
   7 * GPL LICENSE SUMMARY
   8 *
   9 * This program is free software; you can redistribute it and/or modify
  10 * it under the terms of version 2 of the GNU General Public License as
  11 * published by the Free Software Foundation.
  12 *
  13 * This program is distributed in the hope that it will be useful, but
  14 * WITHOUT ANY WARRANTY; without even the implied warranty of
  15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 * General Public License for more details.
  17 *
  18 * BSD LICENSE
  19 *
  20 * Redistribution and use in source and binary forms, with or without
  21 * modification, are permitted provided that the following conditions
  22 * are met:
  23 *
  24 *  - Redistributions of source code must retain the above copyright
  25 *    notice, this list of conditions and the following disclaimer.
  26 *  - Redistributions in binary form must reproduce the above copyright
  27 *    notice, this list of conditions and the following disclaimer in
  28 *    the documentation and/or other materials provided with the
  29 *    distribution.
  30 *  - Neither the name of Intel Corporation nor the names of its
  31 *    contributors may be used to endorse or promote products derived
  32 *    from this software without specific prior written permission.
  33 *
  34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  45 *
  46 */
  47
  48#include <linux/slab.h>
  49#include <linux/vmalloc.h>
  50#include <rdma/ib_umem.h>
  51#include <rdma/rdma_vt.h>
  52#include "vt.h"
  53#include "mr.h"
  54#include "trace.h"
  55
  56/**
  57 * rvt_driver_mr_init - Init MR resources per driver
  58 * @rdi: rvt dev struct
  59 *
   60 * Do any initialization needed when a driver registers with rdmavt.
  61 *
  62 * Return: 0 on success or errno on failure
  63 */
  64int rvt_driver_mr_init(struct rvt_dev_info *rdi)
  65{
  66	unsigned int lkey_table_size = rdi->dparms.lkey_table_size;
  67	unsigned lk_tab_size;
  68	int i;
  69
  70	/*
   71	 * The top lkey_table_size bits are used to index the
  72	 * table.  The lower 8 bits can be owned by the user (copied from
  73	 * the LKEY).  The remaining bits act as a generation number or tag.
  74	 */
  75	if (!lkey_table_size)
  76		return -EINVAL;
  77
  78	spin_lock_init(&rdi->lkey_table.lock);
  79
  80	/* ensure generation is at least 4 bits */
  81	if (lkey_table_size > RVT_MAX_LKEY_TABLE_BITS) {
  82		rvt_pr_warn(rdi, "lkey bits %u too large, reduced to %u\n",
  83			    lkey_table_size, RVT_MAX_LKEY_TABLE_BITS);
  84		rdi->dparms.lkey_table_size = RVT_MAX_LKEY_TABLE_BITS;
  85		lkey_table_size = rdi->dparms.lkey_table_size;
  86	}
  87	rdi->lkey_table.max = 1 << lkey_table_size;
  88	rdi->lkey_table.shift = 32 - lkey_table_size;
  89	lk_tab_size = rdi->lkey_table.max * sizeof(*rdi->lkey_table.table);
  90	rdi->lkey_table.table = (struct rvt_mregion __rcu **)
  91			       vmalloc_node(lk_tab_size, rdi->dparms.node);
  92	if (!rdi->lkey_table.table)
  93		return -ENOMEM;
  94
  95	RCU_INIT_POINTER(rdi->dma_mr, NULL);
  96	for (i = 0; i < rdi->lkey_table.max; i++)
  97		RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL);
  98
  99	return 0;
 100}
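
/*
 * Illustrative example: assuming a driver registers with
 * dparms.lkey_table_size = 16, the sizing above works out to
 *
 *	rdi->lkey_table.max   = 1 << 16;	65536 table slots
 *	rdi->lkey_table.shift = 32 - 16;	index = lkey >> 16
 *	lk_tab_size = 65536 * sizeof(struct rvt_mregion __rcu *)
 *	            = 512 KiB on a 64-bit kernel
 */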
 101
 102/**
  103 * rvt_mr_exit - clean up MR
  104 * @rdi: rvt dev structure
  105 *
  106 * Called when a driver has unregistered or perhaps failed to register with us.
 107 */
 108void rvt_mr_exit(struct rvt_dev_info *rdi)
 109{
 110	if (rdi->dma_mr)
 111		rvt_pr_err(rdi, "DMA MR not null!\n");
 112
 113	vfree(rdi->lkey_table.table);
 114}
 115
 116static void rvt_deinit_mregion(struct rvt_mregion *mr)
 117{
 118	int i = mr->mapsz;
 119
 120	mr->mapsz = 0;
 121	while (i)
 122		kfree(mr->map[--i]);
 123	percpu_ref_exit(&mr->refcount);
 124}
 125
 126static void __rvt_mregion_complete(struct percpu_ref *ref)
 127{
 128	struct rvt_mregion *mr = container_of(ref, struct rvt_mregion,
 129					      refcount);
 130
 131	complete(&mr->comp);
 132}
 133
 134static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd,
 135			    int count, unsigned int percpu_flags)
 136{
 137	int m, i = 0;
 138	struct rvt_dev_info *dev = ib_to_rvt(pd->device);
 139
 140	mr->mapsz = 0;
 141	m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
 142	for (; i < m; i++) {
 143		mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL,
 144					  dev->dparms.node);
 145		if (!mr->map[i])
 146			goto bail;
 147		mr->mapsz++;
 148	}
 149	init_completion(&mr->comp);
  150	/* the initial reference accounts for the ptr returned to the caller */
 151	if (percpu_ref_init(&mr->refcount, &__rvt_mregion_complete,
 152			    percpu_flags, GFP_KERNEL))
 153		goto bail;
 154
 155	atomic_set(&mr->lkey_invalid, 0);
 156	mr->pd = pd;
 157	mr->max_segs = count;
 158	return 0;
 159bail:
 160	rvt_deinit_mregion(mr);
 161	return -ENOMEM;
 162}
 163
 164/**
 165 * rvt_alloc_lkey - allocate an lkey
 166 * @mr: memory region that this lkey protects
 167 * @dma_region: 0->normal key, 1->restricted DMA key
 168 *
 169 * Returns 0 if successful, otherwise returns -errno.
 170 *
 171 * Increments mr reference count as required.
 172 *
  173 * Sets the lkey field of mr for non-dma regions.
 174 *
 175 */
 176static int rvt_alloc_lkey(struct rvt_mregion *mr, int dma_region)
 177{
 178	unsigned long flags;
 179	u32 r;
 180	u32 n;
 181	int ret = 0;
 182	struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device);
 183	struct rvt_lkey_table *rkt = &dev->lkey_table;
 184
 185	rvt_get_mr(mr);
 186	spin_lock_irqsave(&rkt->lock, flags);
 187
 188	/* special case for dma_mr lkey == 0 */
 189	if (dma_region) {
 190		struct rvt_mregion *tmr;
 191
 192		tmr = rcu_access_pointer(dev->dma_mr);
 193		if (!tmr) {
 194			mr->lkey_published = 1;
  195			/* Ensure published is written first */
 196			rcu_assign_pointer(dev->dma_mr, mr);
 197			rvt_get_mr(mr);
 198		}
 199		goto success;
 200	}
 201
 202	/* Find the next available LKEY */
 203	r = rkt->next;
 204	n = r;
 205	for (;;) {
 206		if (!rcu_access_pointer(rkt->table[r]))
 207			break;
 208		r = (r + 1) & (rkt->max - 1);
 209		if (r == n)
 210			goto bail;
 211	}
 212	rkt->next = (r + 1) & (rkt->max - 1);
 213	/*
  214	 * Make sure lkey is never zero, which is reserved to indicate an
  215	 * unrestricted LKEY.
 216	 */
 217	rkt->gen++;
 218	/*
 219	 * bits are capped to ensure enough bits for generation number
 220	 */
 221	mr->lkey = (r << (32 - dev->dparms.lkey_table_size)) |
 222		((((1 << (24 - dev->dparms.lkey_table_size)) - 1) & rkt->gen)
 223		 << 8);
 224	if (mr->lkey == 0) {
 225		mr->lkey |= 1 << 8;
 226		rkt->gen++;
 227	}
 228	mr->lkey_published = 1;
  229	/* Ensure published is written first */
 230	rcu_assign_pointer(rkt->table[r], mr);
 231success:
 232	spin_unlock_irqrestore(&rkt->lock, flags);
 233out:
 234	return ret;
 235bail:
 236	rvt_put_mr(mr);
 237	spin_unlock_irqrestore(&rkt->lock, flags);
 238	ret = -ENOMEM;
 239	goto out;
 240}
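
/*
 * Illustrative example: with a hypothetical lkey_table_size of 16,
 * table slot r = 5 and generation rkt->gen = 3, the lkey composed
 * above is
 *
 *	lkey = (5 << (32 - 16)) |
 *	       ((((1 << (24 - 16)) - 1) & 3) << 8)
 *	     = 0x00050000 | 0x00000300 = 0x00050300
 *
 * i.e. the table index lands in the top 16 bits, the generation tag in
 * the middle bits, and the low 8 bits stay available to the consumer.
 */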
 241
 242/**
 243 * rvt_free_lkey - free an lkey
 244 * @mr: mr to free from tables
 245 */
 246static void rvt_free_lkey(struct rvt_mregion *mr)
 247{
 248	unsigned long flags;
 249	u32 lkey = mr->lkey;
 250	u32 r;
 251	struct rvt_dev_info *dev = ib_to_rvt(mr->pd->device);
 252	struct rvt_lkey_table *rkt = &dev->lkey_table;
 253	int freed = 0;
 254
 255	spin_lock_irqsave(&rkt->lock, flags);
 256	if (!lkey) {
 257		if (mr->lkey_published) {
 258			mr->lkey_published = 0;
  259			/* ensure published is written before pointer */
 260			rcu_assign_pointer(dev->dma_mr, NULL);
 261			rvt_put_mr(mr);
 262		}
 263	} else {
 264		if (!mr->lkey_published)
 265			goto out;
 266		r = lkey >> (32 - dev->dparms.lkey_table_size);
 267		mr->lkey_published = 0;
  268		/* ensure published is written before pointer */
 269		rcu_assign_pointer(rkt->table[r], NULL);
 270	}
 271	freed++;
 272out:
 273	spin_unlock_irqrestore(&rkt->lock, flags);
 274	if (freed)
 275		percpu_ref_kill(&mr->refcount);
 276}
 277
 278static struct rvt_mr *__rvt_alloc_mr(int count, struct ib_pd *pd)
 279{
 280	struct rvt_mr *mr;
 281	int rval = -ENOMEM;
 282	int m;
 283
 284	/* Allocate struct plus pointers to first level page tables. */
 285	m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ;
 286	mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL);
 287	if (!mr)
 288		goto bail;
 289
 290	rval = rvt_init_mregion(&mr->mr, pd, count, 0);
 291	if (rval)
 292		goto bail;
 293	/*
 294	 * ib_reg_phys_mr() will initialize mr->ibmr except for
 295	 * lkey and rkey.
 296	 */
 297	rval = rvt_alloc_lkey(&mr->mr, 0);
 298	if (rval)
 299		goto bail_mregion;
 300	mr->ibmr.lkey = mr->mr.lkey;
 301	mr->ibmr.rkey = mr->mr.lkey;
 302done:
 303	return mr;
 304
 305bail_mregion:
 306	rvt_deinit_mregion(&mr->mr);
 307bail:
 308	kfree(mr);
 309	mr = ERR_PTR(rval);
 310	goto done;
 311}
 312
 313static void __rvt_free_mr(struct rvt_mr *mr)
 314{
 315	rvt_free_lkey(&mr->mr);
 316	rvt_deinit_mregion(&mr->mr);
 317	kfree(mr);
 318}
 319
 320/**
 321 * rvt_get_dma_mr - get a DMA memory region
 322 * @pd: protection domain for this memory region
 323 * @acc: access flags
 324 *
 325 * Return: the memory region on success, otherwise returns an errno.
 326 * Note that all DMA addresses should be created via the functions in
 327 * struct dma_virt_ops.
 328 */
 329struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc)
 330{
 331	struct rvt_mr *mr;
 332	struct ib_mr *ret;
 333	int rval;
 334
 335	if (ibpd_to_rvtpd(pd)->user)
 336		return ERR_PTR(-EPERM);
 337
 338	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 339	if (!mr) {
 340		ret = ERR_PTR(-ENOMEM);
 341		goto bail;
 342	}
 343
 344	rval = rvt_init_mregion(&mr->mr, pd, 0, 0);
 345	if (rval) {
 346		ret = ERR_PTR(rval);
 347		goto bail;
 348	}
 349
 350	rval = rvt_alloc_lkey(&mr->mr, 1);
 351	if (rval) {
 352		ret = ERR_PTR(rval);
 353		goto bail_mregion;
 354	}
 355
 356	mr->mr.access_flags = acc;
 357	ret = &mr->ibmr;
 358done:
 359	return ret;
 360
 361bail_mregion:
 362	rvt_deinit_mregion(&mr->mr);
 363bail:
 364	kfree(mr);
 365	goto done;
 366}
 367
 368/**
 369 * rvt_reg_user_mr - register a userspace memory region
 370 * @pd: protection domain for this memory region
 371 * @start: starting userspace address
 372 * @length: length of region to register
 373 * @mr_access_flags: access flags for this memory region
 374 * @udata: unused by the driver
 375 *
 376 * Return: the memory region on success, otherwise returns an errno.
 377 */
 378struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 379			      u64 virt_addr, int mr_access_flags,
 380			      struct ib_udata *udata)
 381{
 382	struct rvt_mr *mr;
 383	struct ib_umem *umem;
 384	struct scatterlist *sg;
 385	int n, m, entry;
 386	struct ib_mr *ret;
 387
 388	if (length == 0)
 389		return ERR_PTR(-EINVAL);
 390
 391	umem = ib_umem_get(pd->uobject->context, start, length,
 392			   mr_access_flags, 0);
 393	if (IS_ERR(umem))
  394		return ERR_CAST(umem);
 395
 396	n = umem->nmap;
 397
 398	mr = __rvt_alloc_mr(n, pd);
 399	if (IS_ERR(mr)) {
 400		ret = (struct ib_mr *)mr;
 401		goto bail_umem;
 402	}
 403
 404	mr->mr.user_base = start;
 405	mr->mr.iova = virt_addr;
 406	mr->mr.length = length;
 407	mr->mr.offset = ib_umem_offset(umem);
 408	mr->mr.access_flags = mr_access_flags;
 409	mr->umem = umem;
 410
 411	mr->mr.page_shift = umem->page_shift;
 412	m = 0;
 413	n = 0;
 414	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
 415		void *vaddr;
 416
 417		vaddr = page_address(sg_page(sg));
 418		if (!vaddr) {
 419			ret = ERR_PTR(-EINVAL);
 420			goto bail_inval;
 421		}
 422		mr->mr.map[m]->segs[n].vaddr = vaddr;
 423		mr->mr.map[m]->segs[n].length = BIT(umem->page_shift);
 424		trace_rvt_mr_user_seg(&mr->mr, m, n, vaddr,
 425				      BIT(umem->page_shift));
 426		n++;
 427		if (n == RVT_SEGSZ) {
 428			m++;
 429			n = 0;
 430		}
 431	}
 432	return &mr->ibmr;
 433
 434bail_inval:
 435	__rvt_free_mr(mr);
 436
 437bail_umem:
 438	ib_umem_release(umem);
 439
 440	return ret;
 441}
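
/*
 * Illustrative note: the loop above fills the two-level map later
 * walked by rvt_lkey_ok()/rvt_rkey_ok(), where map[m] is a chunk of
 * RVT_SEGSZ segments and n indexes inside the chunk, i.e. for the
 * s-th page:
 *
 *	m = s / RVT_SEGSZ;
 *	n = s % RVT_SEGSZ;
 *	mr->mr.map[m]->segs[n].vaddr  = page_address(sg_page(sg));
 *	mr->mr.map[m]->segs[n].length = BIT(umem->page_shift);
 */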
 442
 443/**
 444 * rvt_dereg_clean_qp_cb - callback from iterator
  445 * @qp: the qp
  446 * @v: the mregion (as u64)
  447 *
  448 * This routine is called back for every QP; for QPs in the
  449 * same PD as the MR it calls rvt_qp_mr_clean() to potentially
  450 * clean up references.
 451 */
 452static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v)
 453{
 454	struct rvt_mregion *mr = (struct rvt_mregion *)v;
 455
 456	/* skip PDs that are not ours */
 457	if (mr->pd != qp->ibqp.pd)
 458		return;
 459	rvt_qp_mr_clean(qp, mr->lkey);
 460}
 461
 462/**
 463 * rvt_dereg_clean_qps - find QPs for reference cleanup
  464 * @mr: the MR that is being deregistered
 465 *
 466 * This routine iterates RC QPs looking for references
 467 * to the lkey noted in mr.
 468 */
 469static void rvt_dereg_clean_qps(struct rvt_mregion *mr)
 470{
 471	struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
 472
 473	rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb);
 474}
 475
 476/**
 477 * rvt_check_refs - check references
  478 * @mr: the mregion
  479 * @t: the caller identification
  480 *
  481 * This routine checks whether an MR still holds references
  482 * while it is being de-registered.
  483 *
  484 * If the count is non-zero, the code calls a clean routine and then
  485 * waits up to a timeout for the count to drop to zero.
 486 */
 487static int rvt_check_refs(struct rvt_mregion *mr, const char *t)
 488{
 489	unsigned long timeout;
 490	struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
 491
 492	if (mr->lkey) {
 493		/* avoid dma mr */
 494		rvt_dereg_clean_qps(mr);
 495		/* @mr was indexed on rcu protected @lkey_table */
 496		synchronize_rcu();
 497	}
 498
 499	timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ);
 500	if (!timeout) {
 501		rvt_pr_err(rdi,
 502			   "%s timeout mr %p pd %p lkey %x refcount %ld\n",
 503			   t, mr, mr->pd, mr->lkey,
 504			   atomic_long_read(&mr->refcount.count));
 505		rvt_get_mr(mr);
 506		return -EBUSY;
 507	}
 508	return 0;
 509}
 510
 511/**
  512 * rvt_mr_has_lkey - does this MR match the given lkey
  513 * @mr: the mregion
  514 * @lkey: the lkey
 515 */
 516bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey)
 517{
 518	return mr && lkey == mr->lkey;
 519}
 520
 521/**
  522 * rvt_ss_has_lkey - is the lkey referenced by the SGE state
  523 * @ss: the SGE state
  524 * @lkey: the lkey to look for
  525 *
  526 * This code tests whether an MR with the given lkey is referenced
  527 * anywhere in the indicated SGE state.
 528 */
 529bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey)
 530{
 531	int i;
 532	bool rval = false;
 533
 534	if (!ss->num_sge)
 535		return rval;
 536	/* first one */
 537	rval = rvt_mr_has_lkey(ss->sge.mr, lkey);
 538	/* any others */
 539	for (i = 0; !rval && i < ss->num_sge - 1; i++)
 540		rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey);
 541	return rval;
 542}
 543
 544/**
 545 * rvt_dereg_mr - unregister and free a memory region
 546 * @ibmr: the memory region to free
  547 *
 549 * Note that this is called to free MRs created by rvt_get_dma_mr()
 550 * or rvt_reg_user_mr().
 551 *
 552 * Returns 0 on success.
 553 */
 554int rvt_dereg_mr(struct ib_mr *ibmr)
 555{
 556	struct rvt_mr *mr = to_imr(ibmr);
 557	int ret;
 558
 559	rvt_free_lkey(&mr->mr);
 560
 561	rvt_put_mr(&mr->mr); /* will set completion if last */
 562	ret = rvt_check_refs(&mr->mr, __func__);
 563	if (ret)
 564		goto out;
 565	rvt_deinit_mregion(&mr->mr);
 566	if (mr->umem)
 567		ib_umem_release(mr->umem);
 568	kfree(mr);
 569out:
 570	return ret;
 571}
 572
 573/**
  574 * rvt_alloc_mr - Allocate a memory region usable with ib_map_mr_sg()
 575 * @pd: protection domain for this memory region
 576 * @mr_type: mem region type
 577 * @max_num_sg: Max number of segments allowed
 578 *
 579 * Return: the memory region on success, otherwise return an errno.
 580 */
 581struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
 582			   enum ib_mr_type mr_type,
 583			   u32 max_num_sg)
 584{
 585	struct rvt_mr *mr;
 586
 587	if (mr_type != IB_MR_TYPE_MEM_REG)
 588		return ERR_PTR(-EINVAL);
 589
 590	mr = __rvt_alloc_mr(max_num_sg, pd);
 591	if (IS_ERR(mr))
 592		return (struct ib_mr *)mr;
 593
 594	return &mr->ibmr;
 595}
 596
 597/**
 598 * rvt_set_page - page assignment function called by ib_sg_to_pages
 599 * @ibmr: memory region
 600 * @addr: dma address of mapped page
 601 *
 602 * Return: 0 on success
 603 */
 604static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
 605{
 606	struct rvt_mr *mr = to_imr(ibmr);
 607	u32 ps = 1 << mr->mr.page_shift;
 608	u32 mapped_segs = mr->mr.length >> mr->mr.page_shift;
 609	int m, n;
 610
 611	if (unlikely(mapped_segs == mr->mr.max_segs))
 612		return -ENOMEM;
 613
 614	if (mr->mr.length == 0) {
 615		mr->mr.user_base = addr;
 616		mr->mr.iova = addr;
 617	}
 618
 619	m = mapped_segs / RVT_SEGSZ;
 620	n = mapped_segs % RVT_SEGSZ;
 621	mr->mr.map[m]->segs[n].vaddr = (void *)addr;
 622	mr->mr.map[m]->segs[n].length = ps;
 623	trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
 624	mr->mr.length += ps;
 625
 626	return 0;
 627}
 628
 629/**
  630 * rvt_map_mr_sg - map an sg list to the memory region
 631 * @ibmr: memory region
 632 * @sg: dma mapped scatterlist
 633 * @sg_nents: number of entries in sg
 634 * @sg_offset: offset in bytes into sg
 635 *
 636 * Return: number of sg elements mapped to the memory region
 637 */
 638int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 639		  int sg_nents, unsigned int *sg_offset)
 640{
 641	struct rvt_mr *mr = to_imr(ibmr);
 642
 643	mr->mr.length = 0;
 644	mr->mr.page_shift = PAGE_SHIFT;
 645	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
 646			      rvt_set_page);
 647}
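
/*
 * Minimal usage sketch (caller and function name assumed, for
 * illustration only): a ULP with an already DMA-mapped scatterlist
 * normally reaches rvt_map_mr_sg() through the ib_map_mr_sg() verb:
 *
 *	int example_map(struct ib_mr *ibmr, struct scatterlist *sg,
 *			int sg_nents)
 *	{
 *		int n = ib_map_mr_sg(ibmr, sg, sg_nents, NULL, PAGE_SIZE);
 *
 *		if (n != sg_nents)
 *			return n < 0 ? n : -EINVAL;
 *		return 0;
 *	}
 *
 * ib_map_mr_sg() then calls back into rvt_set_page() for each page.
 */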
 648
 649/**
 650 * rvt_fast_reg_mr - fast register physical MR
 651 * @qp: the queue pair where the work request comes from
 652 * @ibmr: the memory region to be registered
 653 * @key: updated key for this memory region
 654 * @access: access flags for this memory region
 655 *
 656 * Returns 0 on success.
 657 */
 658int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key,
 659		    int access)
 660{
 661	struct rvt_mr *mr = to_imr(ibmr);
 662
 663	if (qp->ibqp.pd != mr->mr.pd)
 664		return -EACCES;
 665
 666	/* not applicable to dma MR or user MR */
 667	if (!mr->mr.lkey || mr->umem)
 668		return -EINVAL;
 669
 670	if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00))
 671		return -EINVAL;
 672
 673	ibmr->lkey = key;
 674	ibmr->rkey = key;
 675	mr->mr.lkey = key;
 676	mr->mr.access_flags = access;
 677	atomic_set(&mr->mr.lkey_invalid, 0);
 678
 679	return 0;
 680}
 681EXPORT_SYMBOL(rvt_fast_reg_mr);
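
/*
 * Usage sketch (assumed driver code, for illustration only): a rdmavt
 * driver processing an IB_WR_REG_MR work request would typically call
 *
 *	const struct ib_reg_wr *reg = reg_wr(wr);
 *	int ret = rvt_fast_reg_mr(qp, reg->mr, reg->key, reg->access);
 *
 * and complete the WQE with a protection error status if ret is
 * non-zero.
 */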
 682
 683/**
 684 * rvt_invalidate_rkey - invalidate an MR rkey
 685 * @qp: queue pair associated with the invalidate op
 686 * @rkey: rkey to invalidate
 687 *
 688 * Returns 0 on success.
 689 */
 690int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey)
 691{
 692	struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
 693	struct rvt_lkey_table *rkt = &dev->lkey_table;
 694	struct rvt_mregion *mr;
 695
 696	if (rkey == 0)
 697		return -EINVAL;
 698
 699	rcu_read_lock();
 700	mr = rcu_dereference(
 701		rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]);
 702	if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
 703		goto bail;
 704
 705	atomic_set(&mr->lkey_invalid, 1);
 706	rcu_read_unlock();
 707	return 0;
 708
 709bail:
 710	rcu_read_unlock();
 711	return -EINVAL;
 712}
 713EXPORT_SYMBOL(rvt_invalidate_rkey);
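
/*
 * Usage sketch (assumed driver code, for illustration only): when a
 * driver executes an IB_WR_LOCAL_INV work request, or receives a Send
 * with Invalidate, it would typically do
 *
 *	if (rvt_invalidate_rkey(qp, wr->ex.invalidate_rkey))
 *		status = IB_WC_LOC_PROT_ERR;
 *
 * so that later lookups fail the lkey_invalid test in rvt_lkey_ok()
 * and rvt_rkey_ok().
 */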
 714
 715/**
 716 * rvt_alloc_fmr - allocate a fast memory region
 717 * @pd: the protection domain for this memory region
 718 * @mr_access_flags: access flags for this memory region
 719 * @fmr_attr: fast memory region attributes
 720 *
 721 * Return: the memory region on success, otherwise returns an errno.
 722 */
 723struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
 724			     struct ib_fmr_attr *fmr_attr)
 725{
 726	struct rvt_fmr *fmr;
 727	int m;
 728	struct ib_fmr *ret;
 729	int rval = -ENOMEM;
 730
 731	/* Allocate struct plus pointers to first level page tables. */
 732	m = (fmr_attr->max_pages + RVT_SEGSZ - 1) / RVT_SEGSZ;
 733	fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL);
 734	if (!fmr)
 735		goto bail;
 736
 737	rval = rvt_init_mregion(&fmr->mr, pd, fmr_attr->max_pages,
 738				PERCPU_REF_INIT_ATOMIC);
 739	if (rval)
 740		goto bail;
 741
 742	/*
 743	 * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
 744	 * rkey.
 745	 */
 746	rval = rvt_alloc_lkey(&fmr->mr, 0);
 747	if (rval)
 748		goto bail_mregion;
 749	fmr->ibfmr.rkey = fmr->mr.lkey;
 750	fmr->ibfmr.lkey = fmr->mr.lkey;
 751	/*
 752	 * Resources are allocated but no valid mapping (RKEY can't be
 753	 * used).
 754	 */
 755	fmr->mr.access_flags = mr_access_flags;
 756	fmr->mr.max_segs = fmr_attr->max_pages;
 757	fmr->mr.page_shift = fmr_attr->page_shift;
 758
 759	ret = &fmr->ibfmr;
 760done:
 761	return ret;
 762
 763bail_mregion:
 764	rvt_deinit_mregion(&fmr->mr);
 765bail:
 766	kfree(fmr);
 767	ret = ERR_PTR(rval);
 768	goto done;
 769}
 770
 771/**
 772 * rvt_map_phys_fmr - set up a fast memory region
 773 * @ibfmr: the fast memory region to set up
 774 * @page_list: the list of pages to associate with the fast memory region
 775 * @list_len: the number of pages to associate with the fast memory region
 776 * @iova: the virtual address of the start of the fast memory region
 777 *
 778 * This may be called from interrupt context.
 779 *
 780 * Return: 0 on success
 781 */
 782
 783int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
 784		     int list_len, u64 iova)
 785{
 786	struct rvt_fmr *fmr = to_ifmr(ibfmr);
 787	struct rvt_lkey_table *rkt;
 788	unsigned long flags;
 789	int m, n;
 790	unsigned long i;
 791	u32 ps;
 792	struct rvt_dev_info *rdi = ib_to_rvt(ibfmr->device);
 793
 794	i = atomic_long_read(&fmr->mr.refcount.count);
 795	if (i > 2)
 796		return -EBUSY;
 797
 798	if (list_len > fmr->mr.max_segs)
 799		return -EINVAL;
 800
 801	rkt = &rdi->lkey_table;
 802	spin_lock_irqsave(&rkt->lock, flags);
 803	fmr->mr.user_base = iova;
 804	fmr->mr.iova = iova;
 805	ps = 1 << fmr->mr.page_shift;
 806	fmr->mr.length = list_len * ps;
 807	m = 0;
 808	n = 0;
 809	for (i = 0; i < list_len; i++) {
 810		fmr->mr.map[m]->segs[n].vaddr = (void *)page_list[i];
 811		fmr->mr.map[m]->segs[n].length = ps;
 812		trace_rvt_mr_fmr_seg(&fmr->mr, m, n, (void *)page_list[i], ps);
 813		if (++n == RVT_SEGSZ) {
 814			m++;
 815			n = 0;
 816		}
 817	}
 818	spin_unlock_irqrestore(&rkt->lock, flags);
 819	return 0;
 820}
 821
 822/**
 823 * rvt_unmap_fmr - unmap fast memory regions
 824 * @fmr_list: the list of fast memory regions to unmap
 825 *
 826 * Return: 0 on success.
 827 */
 828int rvt_unmap_fmr(struct list_head *fmr_list)
 829{
 830	struct rvt_fmr *fmr;
 831	struct rvt_lkey_table *rkt;
 832	unsigned long flags;
 833	struct rvt_dev_info *rdi;
 834
 835	list_for_each_entry(fmr, fmr_list, ibfmr.list) {
 836		rdi = ib_to_rvt(fmr->ibfmr.device);
 837		rkt = &rdi->lkey_table;
 838		spin_lock_irqsave(&rkt->lock, flags);
 839		fmr->mr.user_base = 0;
 840		fmr->mr.iova = 0;
 841		fmr->mr.length = 0;
 842		spin_unlock_irqrestore(&rkt->lock, flags);
 843	}
 844	return 0;
 845}
 846
 847/**
 848 * rvt_dealloc_fmr - deallocate a fast memory region
 849 * @ibfmr: the fast memory region to deallocate
 850 *
 851 * Return: 0 on success.
 852 */
 853int rvt_dealloc_fmr(struct ib_fmr *ibfmr)
 854{
 855	struct rvt_fmr *fmr = to_ifmr(ibfmr);
 856	int ret = 0;
 857
 858	rvt_free_lkey(&fmr->mr);
 859	rvt_put_mr(&fmr->mr); /* will set completion if last */
 860	ret = rvt_check_refs(&fmr->mr, __func__);
 861	if (ret)
 862		goto out;
 863	rvt_deinit_mregion(&fmr->mr);
 864	kfree(fmr);
 865out:
 866	return ret;
 867}
 868
 869/**
 870 * rvt_sge_adjacent - is isge compressible
 871 * @last_sge: last outgoing SGE written
 872 * @sge: SGE to check
 873 *
  874 * If adjacent, last_sge is updated to include the added length.
 875 *
 876 * Return: true if isge is adjacent to last sge
 877 */
 878static inline bool rvt_sge_adjacent(struct rvt_sge *last_sge,
 879				    struct ib_sge *sge)
 880{
 881	if (last_sge && sge->lkey == last_sge->mr->lkey &&
 882	    ((uint64_t)(last_sge->vaddr + last_sge->length) == sge->addr)) {
 883		if (sge->lkey) {
 884			if (unlikely((sge->addr - last_sge->mr->user_base +
 885			      sge->length > last_sge->mr->length)))
 886				return false; /* overrun, caller will catch */
 887		} else {
 888			last_sge->length += sge->length;
 889		}
 890		last_sge->sge_length += sge->length;
 891		trace_rvt_sge_adjacent(last_sge, sge);
 892		return true;
 893	}
 894	return false;
 895}
 896
 897/**
 898 * rvt_lkey_ok - check IB SGE for validity and initialize
 899 * @rkt: table containing lkey to check SGE against
 900 * @pd: protection domain
 901 * @isge: outgoing internal SGE
 902 * @last_sge: last outgoing SGE written
 903 * @sge: SGE to check
 904 * @acc: access flags
 905 *
 906 * Check the IB SGE for validity and initialize our internal version
 907 * of it.
 908 *
 909 * Increments the reference count when a new sge is stored.
 910 *
  911 * Return: 0 if compressed, 1 if added, otherwise returns -errno.
 912 */
 913int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd,
 914		struct rvt_sge *isge, struct rvt_sge *last_sge,
 915		struct ib_sge *sge, int acc)
 916{
 917	struct rvt_mregion *mr;
 918	unsigned n, m;
 919	size_t off;
 920
 921	/*
 922	 * We use LKEY == zero for kernel virtual addresses
 923	 * (see rvt_get_dma_mr() and dma_virt_ops).
 924	 */
 925	if (sge->lkey == 0) {
 926		struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device);
 927
 928		if (pd->user)
 929			return -EINVAL;
 930		if (rvt_sge_adjacent(last_sge, sge))
 931			return 0;
 932		rcu_read_lock();
 933		mr = rcu_dereference(dev->dma_mr);
 934		if (!mr)
 935			goto bail;
 936		rvt_get_mr(mr);
 937		rcu_read_unlock();
 938
 939		isge->mr = mr;
 940		isge->vaddr = (void *)sge->addr;
 941		isge->length = sge->length;
 942		isge->sge_length = sge->length;
 943		isge->m = 0;
 944		isge->n = 0;
 945		goto ok;
 946	}
 947	if (rvt_sge_adjacent(last_sge, sge))
 948		return 0;
 949	rcu_read_lock();
 950	mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]);
 951	if (!mr)
 952		goto bail;
 953	rvt_get_mr(mr);
 954	if (!READ_ONCE(mr->lkey_published))
 955		goto bail_unref;
 956
 957	if (unlikely(atomic_read(&mr->lkey_invalid) ||
 958		     mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
 959		goto bail_unref;
 960
 961	off = sge->addr - mr->user_base;
 962	if (unlikely(sge->addr < mr->user_base ||
 963		     off + sge->length > mr->length ||
 964		     (mr->access_flags & acc) != acc))
 965		goto bail_unref;
 966	rcu_read_unlock();
 967
 968	off += mr->offset;
 969	if (mr->page_shift) {
 970		/*
  971		 * Page sizes are a uniform power of 2, so no loop is necessary.
  972		 * entries_spanned_by_off is the number of times the loop below
  973		 * would have executed.
  974		 */
 975		size_t entries_spanned_by_off;
 976
 977		entries_spanned_by_off = off >> mr->page_shift;
 978		off -= (entries_spanned_by_off << mr->page_shift);
 979		m = entries_spanned_by_off / RVT_SEGSZ;
 980		n = entries_spanned_by_off % RVT_SEGSZ;
 981	} else {
 982		m = 0;
 983		n = 0;
 984		while (off >= mr->map[m]->segs[n].length) {
 985			off -= mr->map[m]->segs[n].length;
 986			n++;
 987			if (n >= RVT_SEGSZ) {
 988				m++;
 989				n = 0;
 990			}
 991		}
 992	}
 993	isge->mr = mr;
 994	isge->vaddr = mr->map[m]->segs[n].vaddr + off;
 995	isge->length = mr->map[m]->segs[n].length - off;
 996	isge->sge_length = sge->length;
 997	isge->m = m;
 998	isge->n = n;
 999ok:
1000	trace_rvt_sge_new(isge, sge);
1001	return 1;
1002bail_unref:
1003	rvt_put_mr(mr);
1004bail:
1005	rcu_read_unlock();
1006	return -EINVAL;
1007}
1008EXPORT_SYMBOL(rvt_lkey_ok);
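
/*
 * Usage sketch (variable names assumed, for illustration only): when a
 * post-send path builds the internal SGE list for a WQE it loops over
 * the ib_sge array roughly as follows:
 *
 *	for (i = 0; i < wr->num_sge; i++) {
 *		ret = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge,
 *				  &wr->sg_list[i], acc);
 *		if (ret < 0)
 *			goto bail_unwind;	// bad lkey, drop refs taken
 *		if (ret) {			// a new SGE was stored
 *			last_sge = &wqe->sg_list[j];
 *			j++;
 *		}
 *		// ret == 0: compressed into last_sge
 *	}
 */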
1009
1010/**
1011 * rvt_rkey_ok - check the IB virtual address, length, and RKEY
1012 * @qp: qp for validation
1013 * @sge: SGE state
1014 * @len: length of data
1015 * @vaddr: virtual address to place data
1016 * @rkey: rkey to check
1017 * @acc: access flags
1018 *
1019 * Return: 1 if successful, otherwise 0.
1020 *
1021 * increments the reference count upon success
1022 */
1023int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge,
1024		u32 len, u64 vaddr, u32 rkey, int acc)
1025{
1026	struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device);
1027	struct rvt_lkey_table *rkt = &dev->lkey_table;
1028	struct rvt_mregion *mr;
1029	unsigned n, m;
1030	size_t off;
1031
1032	/*
1033	 * We use RKEY == zero for kernel virtual addresses
1034	 * (see rvt_get_dma_mr() and dma_virt_ops).
1035	 */
1036	rcu_read_lock();
1037	if (rkey == 0) {
1038		struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd);
1039		struct rvt_dev_info *rdi = ib_to_rvt(pd->ibpd.device);
1040
1041		if (pd->user)
1042			goto bail;
1043		mr = rcu_dereference(rdi->dma_mr);
1044		if (!mr)
1045			goto bail;
1046		rvt_get_mr(mr);
1047		rcu_read_unlock();
1048
1049		sge->mr = mr;
1050		sge->vaddr = (void *)vaddr;
1051		sge->length = len;
1052		sge->sge_length = len;
1053		sge->m = 0;
1054		sge->n = 0;
1055		goto ok;
1056	}
1057
1058	mr = rcu_dereference(rkt->table[rkey >> rkt->shift]);
1059	if (!mr)
1060		goto bail;
1061	rvt_get_mr(mr);
 1062	/* ensure mr read is before test */
1063	if (!READ_ONCE(mr->lkey_published))
1064		goto bail_unref;
1065	if (unlikely(atomic_read(&mr->lkey_invalid) ||
1066		     mr->lkey != rkey || qp->ibqp.pd != mr->pd))
1067		goto bail_unref;
1068
1069	off = vaddr - mr->iova;
1070	if (unlikely(vaddr < mr->iova || off + len > mr->length ||
1071		     (mr->access_flags & acc) == 0))
1072		goto bail_unref;
1073	rcu_read_unlock();
1074
1075	off += mr->offset;
1076	if (mr->page_shift) {
1077		/*
 1078		 * Page sizes are a uniform power of 2, so no loop is necessary.
 1079		 * entries_spanned_by_off is the number of times the loop below
 1080		 * would have executed.
 1081		 */
1082		size_t entries_spanned_by_off;
1083
1084		entries_spanned_by_off = off >> mr->page_shift;
1085		off -= (entries_spanned_by_off << mr->page_shift);
1086		m = entries_spanned_by_off / RVT_SEGSZ;
1087		n = entries_spanned_by_off % RVT_SEGSZ;
1088	} else {
1089		m = 0;
1090		n = 0;
1091		while (off >= mr->map[m]->segs[n].length) {
1092			off -= mr->map[m]->segs[n].length;
1093			n++;
1094			if (n >= RVT_SEGSZ) {
1095				m++;
1096				n = 0;
1097			}
1098		}
1099	}
1100	sge->mr = mr;
1101	sge->vaddr = mr->map[m]->segs[n].vaddr + off;
1102	sge->length = mr->map[m]->segs[n].length - off;
1103	sge->sge_length = len;
1104	sge->m = m;
1105	sge->n = n;
1106ok:
1107	return 1;
1108bail_unref:
1109	rvt_put_mr(mr);
1110bail:
1111	rcu_read_unlock();
1112	return 0;
1113}
1114EXPORT_SYMBOL(rvt_rkey_ok);
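
/*
 * Usage sketch (reth_* values assumed to come from the parsed RETH
 * header, for illustration only): a driver validating an incoming
 * RDMA WRITE request typically does
 *
 *	if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, reth_len,
 *				  reth_vaddr, reth_rkey,
 *				  IB_ACCESS_REMOTE_WRITE)))
 *		goto nak_acc;	// respond with an access NAK
 *
 * and drops the MR reference taken here when the receive SGE is
 * released.
 */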