   1/*
   2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
   3 * Copyright (c) 2020, Intel Corporation. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34
  35#include <linux/kref.h>
  36#include <linux/random.h>
  37#include <linux/debugfs.h>
  38#include <linux/export.h>
  39#include <linux/delay.h>
  40#include <linux/dma-buf.h>
  41#include <linux/dma-resv.h>
  42#include <rdma/ib_umem_odp.h>
  43#include "dm.h"
  44#include "mlx5_ib.h"
  45#include "umr.h"
  46#include "data_direct.h"
  47
  48enum {
  49	MAX_PENDING_REG_MR = 8,
  50};
  51
  52#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
  53#define MLX5_UMR_ALIGN 2048
  54
  55static void
  56create_mkey_callback(int status, struct mlx5_async_work *context);
  57static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
  58				     u64 iova, int access_flags,
  59				     unsigned int page_size, bool populate,
  60				     int access_mode);
  61static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
  62
  63static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
  64					  struct ib_pd *pd)
  65{
  66	struct mlx5_ib_dev *dev = to_mdev(pd->device);
  67
  68	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
  69	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
  70	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
  71	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
  72	MLX5_SET(mkc, mkc, lr, 1);
  73
  74	if (acc & IB_ACCESS_RELAXED_ORDERING) {
  75		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
  76			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
  77
  78		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
  79		    (MLX5_CAP_GEN(dev->mdev,
  80				  relaxed_ordering_read_pci_enabled) &&
  81		     pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
  82			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
  83	}
  84
  85	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
  86	MLX5_SET(mkc, mkc, qpn, 0xffffff);
  87	MLX5_SET64(mkc, mkc, start_addr, start_addr);
  88}
  89
  90static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
  91{
  92	u8 key = atomic_inc_return(&dev->mkey_var);
  93	void *mkc;
  94
  95	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
  96	MLX5_SET(mkc, mkc, mkey_7_0, key);
  97	*mkey = key;
  98}
  99
 100static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
 101			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
 102{
 103	int ret;
 104
 105	assign_mkey_variant(dev, &mkey->key, in);
 106	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
 107	if (!ret)
 108		init_waitqueue_head(&mkey->wait);
 109
 110	return ret;
 111}
 112
 113static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
 114{
 115	struct mlx5_ib_dev *dev = async_create->ent->dev;
 116	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 117	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);
 118
 119	MLX5_SET(create_mkey_in, async_create->in, opcode,
 120		 MLX5_CMD_OP_CREATE_MKEY);
 121	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
 122	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
 123				async_create->out, outlen, create_mkey_callback,
 124				&async_create->cb_work);
 125}
 126
 127static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
 128static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 129
 130static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 131{
 132	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
 133
 134	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
 135}
 136
 137static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
 138{
 139	if (status == -ENXIO) /* core driver is not available */
 140		return;
 141
 142	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
 143	if (status != -EREMOTEIO) /* driver specific failure */
 144		return;
 145
 146	/* Failed in FW, print cmd out failure details */
 147	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
 148}
 149
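/*
 * Cached mkeys are kept per cache entry in a LIFO stack backed by a list
 * of pages (struct mlx5_mkeys_page), each holding NUM_MKEYS_PER_PAGE
 * keys. mkeys_queue.ci counts the stored mkeys; the next free slot is at
 * index ci % NUM_MKEYS_PER_PAGE in the last page of pages_list. Pages are
 * allocated on demand in push_mkey_locked() and, apart from the first
 * page, freed again once emptied in pop_mkey_locked(). Both helpers
 * expect mkeys_queue.lock to be held.
 */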
 150static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
 151{
 152	unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
 153	struct mlx5_mkeys_page *page;
 154
 155	lockdep_assert_held(&ent->mkeys_queue.lock);
 156	if (ent->mkeys_queue.ci >=
 157	    ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
 158		page = kzalloc(sizeof(*page), GFP_ATOMIC);
 159		if (!page)
 160			return -ENOMEM;
 161		ent->mkeys_queue.num_pages++;
 162		list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
 163	} else {
 164		page = list_last_entry(&ent->mkeys_queue.pages_list,
 165				       struct mlx5_mkeys_page, list);
 166	}
 167
 168	page->mkeys[tmp] = mkey;
 169	ent->mkeys_queue.ci++;
 170	return 0;
 171}
 172
 173static int pop_mkey_locked(struct mlx5_cache_ent *ent)
 174{
 175	unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
 176	struct mlx5_mkeys_page *last_page;
 177	u32 mkey;
 178
 179	lockdep_assert_held(&ent->mkeys_queue.lock);
 180	last_page = list_last_entry(&ent->mkeys_queue.pages_list,
 181				    struct mlx5_mkeys_page, list);
 182	mkey = last_page->mkeys[tmp];
 183	last_page->mkeys[tmp] = 0;
 184	ent->mkeys_queue.ci--;
 185	if (ent->mkeys_queue.num_pages > 1 && !tmp) {
 186		list_del(&last_page->list);
 187		ent->mkeys_queue.num_pages--;
 188		kfree(last_page);
 189	}
 190	return mkey;
 191}
 192
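/*
 * Completion handler for the asynchronous CREATE_MKEY issued by
 * mlx5_ib_create_mkey_cb(). On failure it sets fill_delay and arms the
 * delay timer so that cache refilling backs off for about a second. On
 * success it combines the firmware mkey index with the variant byte
 * chosen in assign_mkey_variant(), pushes the key onto the entry's stack
 * and re-evaluates the water marks.
 */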
 193static void create_mkey_callback(int status, struct mlx5_async_work *context)
 194{
 195	struct mlx5r_async_create_mkey *mkey_out =
 196		container_of(context, struct mlx5r_async_create_mkey, cb_work);
 197	struct mlx5_cache_ent *ent = mkey_out->ent;
 198	struct mlx5_ib_dev *dev = ent->dev;
 199	unsigned long flags;
 200
 201	if (status) {
 202		create_mkey_warn(dev, status, mkey_out->out);
 203		kfree(mkey_out);
 204		spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
 205		ent->pending--;
 206		WRITE_ONCE(dev->fill_delay, 1);
 207		spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
 208		mod_timer(&dev->delay_timer, jiffies + HZ);
 209		return;
 210	}
 211
 212	mkey_out->mkey |= mlx5_idx_to_mkey(
 213		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
 214	WRITE_ONCE(dev->cache.last_add, jiffies);
 215
 216	spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
 217	push_mkey_locked(ent, mkey_out->mkey);
 218	ent->pending--;
 219	/* If we are doing fill_to_high_water then keep going. */
 220	queue_adjust_cache_locked(ent);
 221	spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
 222	kfree(mkey_out);
 223}
 224
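/*
 * translations_octword_size is expressed in 16-byte octowords
 * (MLX5_IB_UMR_OCTOWORD): one octoword holds two 8-byte MTT entries or a
 * single 16-byte KLM/KSM entry, so e.g. 8 MTT descriptors need 4
 * octowords while 8 KSM descriptors need 8.
 */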
 225static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
 226{
 227	int ret = 0;
 228
 229	switch (access_mode) {
 230	case MLX5_MKC_ACCESS_MODE_MTT:
 231		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
 232						   sizeof(struct mlx5_mtt));
 233		break;
 234	case MLX5_MKC_ACCESS_MODE_KSM:
 235		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
 236						   sizeof(struct mlx5_klm));
 237		break;
 238	default:
 239		WARN_ON(1);
 240	}
 241	return ret;
 242}
 243
 244static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
 245{
 246	set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
 247				      ent->dev->umrc.pd);
 248	MLX5_SET(mkc, mkc, free, 1);
 249	MLX5_SET(mkc, mkc, umr_en, 1);
 250	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
 251	MLX5_SET(mkc, mkc, access_mode_4_2,
 252		(ent->rb_key.access_mode >> 2) & 0x7);
 253	MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);
 254
 255	MLX5_SET(mkc, mkc, translations_octword_size,
 256		 get_mkc_octo_size(ent->rb_key.access_mode,
 257				   ent->rb_key.ndescs));
 258	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
 259}
 260
 261/* Asynchronously schedule new MRs to be populated in the cache. */
 262static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 263{
 264	struct mlx5r_async_create_mkey *async_create;
 265	void *mkc;
 266	int err = 0;
 267	int i;
 268
 269	for (i = 0; i < num; i++) {
 270		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
 271				       GFP_KERNEL);
 272		if (!async_create)
 273			return -ENOMEM;
 274		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
 275				   memory_key_mkey_entry);
 276		set_cache_mkc(ent, mkc);
 277		async_create->ent = ent;
 278
 279		spin_lock_irq(&ent->mkeys_queue.lock);
 280		if (ent->pending >= MAX_PENDING_REG_MR) {
 281			err = -EAGAIN;
 282			goto free_async_create;
 283		}
 284		ent->pending++;
 285		spin_unlock_irq(&ent->mkeys_queue.lock);
 286
 287		err = mlx5_ib_create_mkey_cb(async_create);
 288		if (err) {
 289			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
 290			goto err_create_mkey;
 291		}
 292	}
 293
 294	return 0;
 295
 296err_create_mkey:
 297	spin_lock_irq(&ent->mkeys_queue.lock);
 298	ent->pending--;
 299free_async_create:
 300	spin_unlock_irq(&ent->mkeys_queue.lock);
 301	kfree(async_create);
 302	return err;
 303}
 304
  305/* Synchronously create an MR in the cache */
 306static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
 307{
 308	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 309	void *mkc;
 310	u32 *in;
 311	int err;
 312
 313	in = kzalloc(inlen, GFP_KERNEL);
 314	if (!in)
 315		return -ENOMEM;
 316	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 317	set_cache_mkc(ent, mkc);
 318
 319	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
 320	if (err)
 321		goto free_in;
 322
 323	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
 324free_in:
 325	kfree(in);
 326	return err;
 327}
 328
 329static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 330{
 331	u32 mkey;
 332
 333	lockdep_assert_held(&ent->mkeys_queue.lock);
 334	if (!ent->mkeys_queue.ci)
 335		return;
 336	mkey = pop_mkey_locked(ent);
 337	spin_unlock_irq(&ent->mkeys_queue.lock);
 338	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
 339	spin_lock_irq(&ent->mkeys_queue.lock);
 340}
 341
 342static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
 343				bool limit_fill)
 344	__acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
 345{
 346	int err;
 347
 348	lockdep_assert_held(&ent->mkeys_queue.lock);
 349
 350	while (true) {
 351		if (limit_fill)
 352			target = ent->limit * 2;
 353		if (target == ent->pending + ent->mkeys_queue.ci)
 354			return 0;
 355		if (target > ent->pending + ent->mkeys_queue.ci) {
 356			u32 todo = target - (ent->pending + ent->mkeys_queue.ci);
 357
 358			spin_unlock_irq(&ent->mkeys_queue.lock);
 359			err = add_keys(ent, todo);
 360			if (err == -EAGAIN)
 361				usleep_range(3000, 5000);
 362			spin_lock_irq(&ent->mkeys_queue.lock);
 363			if (err) {
 364				if (err != -EAGAIN)
 365					return err;
 366			} else
 367				return 0;
 368		} else {
 369			remove_cache_mr_locked(ent);
 370		}
 371	}
 372}
 373
 374static ssize_t size_write(struct file *filp, const char __user *buf,
 375			  size_t count, loff_t *pos)
 376{
 377	struct mlx5_cache_ent *ent = filp->private_data;
 378	u32 target;
 379	int err;
 380
 381	err = kstrtou32_from_user(buf, count, 0, &target);
 382	if (err)
 383		return err;
 384
  385	/*
  386	 * Target is the new value of total_mrs the user requests; however, we
  387	 * cannot free MRs that are in use. Compute the target value for stored
  388	 * mkeys.
  389	 */
 390	spin_lock_irq(&ent->mkeys_queue.lock);
 391	if (target < ent->in_use) {
 392		err = -EINVAL;
 393		goto err_unlock;
 394	}
 395	target = target - ent->in_use;
 396	if (target < ent->limit || target > ent->limit*2) {
 397		err = -EINVAL;
 398		goto err_unlock;
 399	}
 400	err = resize_available_mrs(ent, target, false);
 401	if (err)
 402		goto err_unlock;
 403	spin_unlock_irq(&ent->mkeys_queue.lock);
 404
 405	return count;
 406
 407err_unlock:
 408	spin_unlock_irq(&ent->mkeys_queue.lock);
 409	return err;
 410}
 411
 412static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 413			 loff_t *pos)
 414{
 415	struct mlx5_cache_ent *ent = filp->private_data;
 416	char lbuf[20];
 417	int err;
 418
 419	err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
 420		       ent->mkeys_queue.ci + ent->in_use);
 421	if (err < 0)
 422		return err;
 423
 424	return simple_read_from_buffer(buf, count, pos, lbuf, err);
 425}
 426
 427static const struct file_operations size_fops = {
 428	.owner	= THIS_MODULE,
 429	.open	= simple_open,
 430	.write	= size_write,
 431	.read	= size_read,
 432};
 433
 434static ssize_t limit_write(struct file *filp, const char __user *buf,
 435			   size_t count, loff_t *pos)
 436{
 437	struct mlx5_cache_ent *ent = filp->private_data;
 438	u32 var;
 439	int err;
 440
 441	err = kstrtou32_from_user(buf, count, 0, &var);
 442	if (err)
 443		return err;
 444
  445	/*
  446	 * Upon set we immediately fill the cache to the high water mark implied
  447	 * by the limit.
  448	 */
 449	spin_lock_irq(&ent->mkeys_queue.lock);
 450	ent->limit = var;
 451	err = resize_available_mrs(ent, 0, true);
 452	spin_unlock_irq(&ent->mkeys_queue.lock);
 453	if (err)
 454		return err;
 455	return count;
 456}
 457
 458static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
 459			  loff_t *pos)
 460{
 461	struct mlx5_cache_ent *ent = filp->private_data;
 462	char lbuf[20];
 463	int err;
 464
 465	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
 466	if (err < 0)
 467		return err;
 468
 469	return simple_read_from_buffer(buf, count, pos, lbuf, err);
 470}
 471
 472static const struct file_operations limit_fops = {
 473	.owner	= THIS_MODULE,
 474	.open	= simple_open,
 475	.write	= limit_write,
 476	.read	= limit_read,
 477};
 478
 479static bool someone_adding(struct mlx5_mkey_cache *cache)
 480{
 481	struct mlx5_cache_ent *ent;
 482	struct rb_node *node;
 483	bool ret;
 484
 485	mutex_lock(&cache->rb_lock);
 486	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
 487		ent = rb_entry(node, struct mlx5_cache_ent, node);
 488		spin_lock_irq(&ent->mkeys_queue.lock);
 489		ret = ent->mkeys_queue.ci < ent->limit;
 490		spin_unlock_irq(&ent->mkeys_queue.lock);
 491		if (ret) {
 492			mutex_unlock(&cache->rb_lock);
 493			return true;
 494		}
 495	}
 496	mutex_unlock(&cache->rb_lock);
 497	return false;
 498}
 499
  500/*
  501 * Check if the bucket is outside the high/low water mark and schedule an async
  502 * update. The cache refill has hysteresis: once the low water mark is hit, it is
  503 * refilled up to the high mark.
  504 */
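/*
 * For example, with limit = L: dropping below L stored mkeys starts an
 * asynchronous refill that keeps going until about 2 * L keys are stored
 * or in flight; climbing above 2 * L schedules destruction of the
 * surplus. In between, nothing is scheduled unless a refill is already
 * in progress (fill_to_high_water is set).
 */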
 505static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 506{
 507	lockdep_assert_held(&ent->mkeys_queue.lock);
 508
 509	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
 510		return;
 511	if (ent->mkeys_queue.ci < ent->limit) {
 512		ent->fill_to_high_water = true;
 513		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
 514	} else if (ent->fill_to_high_water &&
 515		   ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
  516		/*
  517		 * Once we start populating due to hitting a low water mark,
  518		 * continue until we pass the high water mark.
  519		 */
 520		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
 521	} else if (ent->mkeys_queue.ci == 2 * ent->limit) {
 522		ent->fill_to_high_water = false;
 523	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
 524		/* Queue deletion of excess entries */
 525		ent->fill_to_high_water = false;
 526		if (ent->pending)
 527			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
 528					   msecs_to_jiffies(1000));
 529		else
 530			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
 531	}
 532}
 533
 534static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
 535{
 536	u32 mkey;
 537
 538	spin_lock_irq(&ent->mkeys_queue.lock);
 539	while (ent->mkeys_queue.ci) {
 540		mkey = pop_mkey_locked(ent);
 541		spin_unlock_irq(&ent->mkeys_queue.lock);
 542		mlx5_core_destroy_mkey(dev->mdev, mkey);
 543		spin_lock_irq(&ent->mkeys_queue.lock);
 544	}
 545	ent->tmp_cleanup_scheduled = false;
 546	spin_unlock_irq(&ent->mkeys_queue.lock);
 547}
 548
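/*
 * Deferred worker for a persistent cache entry: while filling towards the
 * high water mark it adds keys one at a time (the completion callback
 * reschedules the work), and once the entry holds more than 2 * limit
 * mkeys it slowly destroys the surplus, backing off when the system is
 * busy.
 */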
 549static void __cache_work_func(struct mlx5_cache_ent *ent)
 550{
 551	struct mlx5_ib_dev *dev = ent->dev;
 552	struct mlx5_mkey_cache *cache = &dev->cache;
 553	int err;
 554
 555	spin_lock_irq(&ent->mkeys_queue.lock);
 556	if (ent->disabled)
 557		goto out;
 558
 559	if (ent->fill_to_high_water &&
 560	    ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
 561	    !READ_ONCE(dev->fill_delay)) {
 562		spin_unlock_irq(&ent->mkeys_queue.lock);
 563		err = add_keys(ent, 1);
 564		spin_lock_irq(&ent->mkeys_queue.lock);
 565		if (ent->disabled)
 566			goto out;
 567		if (err) {
 568			/*
 569			 * EAGAIN only happens if there are pending MRs, so we
 570			 * will be rescheduled when storing them. The only
 571			 * failure path here is ENOMEM.
 572			 */
 573			if (err != -EAGAIN) {
 574				mlx5_ib_warn(
 575					dev,
 576					"add keys command failed, err %d\n",
 577					err);
 578				queue_delayed_work(cache->wq, &ent->dwork,
 579						   msecs_to_jiffies(1000));
 580			}
 581		}
 582	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
 583		bool need_delay;
 584
  585		/*
  586		 * The remove_cache_mr_locked() logic is performed as a garbage
  587		 * collection task. Such a task is intended to run when no other
  588		 * active processes are running.
  589		 *
  590		 * need_resched() returns true if there are user tasks to be
  591		 * activated in the near future.
  592		 *
  593		 * In that case, don't execute remove_cache_mr_locked() and
  594		 * postpone the garbage collection work to the next cycle in
  595		 * order to free CPU resources to other tasks.
  596		 */
 597		spin_unlock_irq(&ent->mkeys_queue.lock);
 598		need_delay = need_resched() || someone_adding(cache) ||
 599			     !time_after(jiffies,
 600					 READ_ONCE(cache->last_add) + 300 * HZ);
 601		spin_lock_irq(&ent->mkeys_queue.lock);
 602		if (ent->disabled)
 603			goto out;
 604		if (need_delay) {
 605			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
 606			goto out;
 607		}
 608		remove_cache_mr_locked(ent);
 609		queue_adjust_cache_locked(ent);
 610	}
 611out:
 612	spin_unlock_irq(&ent->mkeys_queue.lock);
 613}
 614
 615static void delayed_cache_work_func(struct work_struct *work)
 616{
 617	struct mlx5_cache_ent *ent;
 618
 619	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
 620	/* temp entries are never filled, only cleaned */
 621	if (ent->is_tmp)
 622		clean_keys(ent->dev, ent);
 623	else
 624		__cache_work_func(ent);
 625}
 626
 627static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
 628			     struct mlx5r_cache_rb_key key2)
 629{
 630	int res;
 631
 632	res = key1.ats - key2.ats;
 633	if (res)
 634		return res;
 635
 636	res = key1.access_mode - key2.access_mode;
 637	if (res)
 638		return res;
 639
 640	res = key1.access_flags - key2.access_flags;
 641	if (res)
 642		return res;
 643
  644	/*
  645	 * Keep ndescs last in the compare order since the find function
  646	 * searches for an exact match on all properties and only the closest
  647	 * match in size.
  648	 */
 649	return key1.ndescs - key2.ndescs;
 650}
 651
 652static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
 653				 struct mlx5_cache_ent *ent)
 654{
 655	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
 656	struct mlx5_cache_ent *cur;
 657	int cmp;
 658
 659	/* Figure out where to put new node */
 660	while (*new) {
 661		cur = rb_entry(*new, struct mlx5_cache_ent, node);
 662		parent = *new;
 663		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
 664		if (cmp > 0)
 665			new = &((*new)->rb_left);
 666		if (cmp < 0)
 667			new = &((*new)->rb_right);
 668		if (cmp == 0)
 669			return -EEXIST;
 670	}
 671
 672	/* Add new node and rebalance tree. */
 673	rb_link_node(&ent->node, parent, new);
 674	rb_insert_color(&ent->node, &cache->rb_root);
 675
 676	return 0;
 677}
 678
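/*
 * Look up a cache entry for the given key. An exact match is returned
 * when it exists; otherwise the smallest entry that compares greater is
 * considered, and it is only used if it matches in everything but ndescs
 * and its ndescs does not exceed max(2 * requested,
 * MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS).
 */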
 679static struct mlx5_cache_ent *
 680mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
 681			   struct mlx5r_cache_rb_key rb_key)
 682{
 683	struct rb_node *node = dev->cache.rb_root.rb_node;
 684	struct mlx5_cache_ent *cur, *smallest = NULL;
 685	u64 ndescs_limit;
 686	int cmp;
 687
 688	/*
 689	 * Find the smallest ent with order >= requested_order.
 690	 */
 691	while (node) {
 692		cur = rb_entry(node, struct mlx5_cache_ent, node);
 693		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
 694		if (cmp > 0) {
 695			smallest = cur;
 696			node = node->rb_left;
 697		}
 698		if (cmp < 0)
 699			node = node->rb_right;
 700		if (cmp == 0)
 701			return cur;
 702	}
 703
  704	/*
  705	 * Limit the usage of mkeys larger than twice the required size while
  706	 * also allowing the usage of the smallest cache entry for small MRs.
  707	 */
 708	ndescs_limit = max_t(u64, rb_key.ndescs * 2,
 709			     MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);
 710
 711	return (smallest &&
 712		smallest->rb_key.access_mode == rb_key.access_mode &&
 713		smallest->rb_key.access_flags == rb_key.access_flags &&
 714		smallest->rb_key.ats == rb_key.ats &&
 715		smallest->rb_key.ndescs <= ndescs_limit) ?
 716		       smallest :
 717		       NULL;
 718}
 719
 720static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 721					struct mlx5_cache_ent *ent,
 722					int access_flags)
 723{
 724	struct mlx5_ib_mr *mr;
 725	int err;
 726
 727	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 728	if (!mr)
 729		return ERR_PTR(-ENOMEM);
 730
 731	spin_lock_irq(&ent->mkeys_queue.lock);
 732	ent->in_use++;
 733
 734	if (!ent->mkeys_queue.ci) {
 735		queue_adjust_cache_locked(ent);
 736		ent->miss++;
 737		spin_unlock_irq(&ent->mkeys_queue.lock);
 738		err = create_cache_mkey(ent, &mr->mmkey.key);
 739		if (err) {
 740			spin_lock_irq(&ent->mkeys_queue.lock);
 741			ent->in_use--;
 742			spin_unlock_irq(&ent->mkeys_queue.lock);
 743			kfree(mr);
 744			return ERR_PTR(err);
 745		}
 746	} else {
 747		mr->mmkey.key = pop_mkey_locked(ent);
 748		queue_adjust_cache_locked(ent);
 749		spin_unlock_irq(&ent->mkeys_queue.lock);
 750	}
 751	mr->mmkey.cache_ent = ent;
 752	mr->mmkey.type = MLX5_MKEY_MR;
 753	mr->mmkey.rb_key = ent->rb_key;
 754	mr->mmkey.cacheable = true;
 755	init_waitqueue_head(&mr->mmkey.wait);
 756	return mr;
 757}
 758
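/*
 * Return the subset of the requested access flags that UMR cannot modify
 * later on this device, based on the relevant capability bits. These
 * flags are part of the cache key because an mkey taken from the cache
 * must already have them set correctly.
 */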
 759static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
 760					 int access_flags)
 761{
 762	int ret = 0;
 763
 764	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
 765	    MLX5_CAP_GEN(dev->mdev, atomic) &&
 766	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
 767		ret |= IB_ACCESS_REMOTE_ATOMIC;
 768
 769	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
 770	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
 771	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
 772		ret |= IB_ACCESS_RELAXED_ORDERING;
 773
 774	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
 775	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
 776	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
 777	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
 778		ret |= IB_ACCESS_RELAXED_ORDERING;
 779
 780	return ret;
 781}
 782
 783struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 784				       int access_flags, int access_mode,
 785				       int ndescs)
 786{
 787	struct mlx5r_cache_rb_key rb_key = {
 788		.ndescs = ndescs,
 789		.access_mode = access_mode,
 790		.access_flags = get_unchangeable_access_flags(dev, access_flags)
 791	};
 792	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);
 793
 794	if (!ent)
 795		return ERR_PTR(-EOPNOTSUPP);
 796
 797	return _mlx5_mr_cache_alloc(dev, ent, access_flags);
 798}
 799
 800static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 801{
 802	if (!mlx5_debugfs_root || dev->is_rep)
 803		return;
 804
 805	debugfs_remove_recursive(dev->cache.fs_root);
 806	dev->cache.fs_root = NULL;
 807}
 808
 809static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
 810					    struct mlx5_cache_ent *ent)
 811{
 812	int order = order_base_2(ent->rb_key.ndescs);
 813	struct dentry *dir;
 814
 815	if (!mlx5_debugfs_root || dev->is_rep)
 816		return;
 817
 818	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
 819		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;
 820
 821	sprintf(ent->name, "%d", order);
 822	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
 823	debugfs_create_file("size", 0600, dir, ent, &size_fops);
 824	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
 825	debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
 826	debugfs_create_u32("miss", 0600, dir, &ent->miss);
 827}
 828
 829static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
 830{
 831	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
 832	struct mlx5_mkey_cache *cache = &dev->cache;
 833
 834	if (!mlx5_debugfs_root || dev->is_rep)
 835		return;
 836
 837	cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
 838}
 839
 840static void delay_time_func(struct timer_list *t)
 841{
 842	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 843
 844	WRITE_ONCE(dev->fill_delay, 0);
 845}
 846
 847static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
 848{
 849	struct mlx5_mkeys_page *page;
 850
 851	page = kzalloc(sizeof(*page), GFP_KERNEL);
 852	if (!page)
 853		return -ENOMEM;
 854	INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
 855	spin_lock_init(&ent->mkeys_queue.lock);
 856	list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
 857	ent->mkeys_queue.num_pages++;
 858	return 0;
 859}
 860
 861static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
 862{
 863	struct mlx5_mkeys_page *page;
 864
 865	WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
 866	page = list_last_entry(&ent->mkeys_queue.pages_list,
 867			       struct mlx5_mkeys_page, list);
 868	list_del(&page->list);
 869	kfree(page);
 870}
 871
 872struct mlx5_cache_ent *
 873mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
 874			      struct mlx5r_cache_rb_key rb_key,
 875			      bool persistent_entry)
 876{
 877	struct mlx5_cache_ent *ent;
 878	int order;
 879	int ret;
 880
 881	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
 882	if (!ent)
 883		return ERR_PTR(-ENOMEM);
 884
 885	ret = mlx5r_mkeys_init(ent);
 886	if (ret)
 887		goto mkeys_err;
 888	ent->rb_key = rb_key;
 889	ent->dev = dev;
 890	ent->is_tmp = !persistent_entry;
 891
 892	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 893
 894	ret = mlx5_cache_ent_insert(&dev->cache, ent);
 895	if (ret)
 896		goto ent_insert_err;
 897
 898	if (persistent_entry) {
 899		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
 900			order = MLX5_IMR_KSM_CACHE_ENTRY;
 901		else
 902			order = order_base_2(rb_key.ndescs) - 2;
 903
 904		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
 905		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
 906		    mlx5r_umr_can_load_pas(dev, 0))
 907			ent->limit = dev->mdev->profile.mr_cache[order].limit;
 908		else
 909			ent->limit = 0;
 910
 911		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
 912	}
 913
 914	return ent;
 915ent_insert_err:
 916	mlx5r_mkeys_uninit(ent);
 917mkeys_err:
 918	kfree(ent);
 919	return ERR_PTR(ret);
 920}
 921
 922int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 923{
 924	struct mlx5_mkey_cache *cache = &dev->cache;
 925	struct rb_root *root = &dev->cache.rb_root;
 926	struct mlx5r_cache_rb_key rb_key = {
 927		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
 928	};
 929	struct mlx5_cache_ent *ent;
 930	struct rb_node *node;
 931	int ret;
 932	int i;
 933
 934	mutex_init(&dev->slow_path_mutex);
 935	mutex_init(&dev->cache.rb_lock);
 936	dev->cache.rb_root = RB_ROOT;
 937	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 938	if (!cache->wq) {
 939		mlx5_ib_warn(dev, "failed to create work queue\n");
 940		return -ENOMEM;
 941	}
 942
 943	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 944	timer_setup(&dev->delay_timer, delay_time_func, 0);
 945	mlx5_mkey_cache_debugfs_init(dev);
 946	mutex_lock(&cache->rb_lock);
 947	for (i = 0; i <= mkey_cache_max_order(dev); i++) {
 948		rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
 949		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
 950		if (IS_ERR(ent)) {
 951			ret = PTR_ERR(ent);
 952			goto err;
 953		}
 954	}
 955
 956	ret = mlx5_odp_init_mkey_cache(dev);
 957	if (ret)
 958		goto err;
 959
 960	mutex_unlock(&cache->rb_lock);
 961	for (node = rb_first(root); node; node = rb_next(node)) {
 962		ent = rb_entry(node, struct mlx5_cache_ent, node);
 963		spin_lock_irq(&ent->mkeys_queue.lock);
 964		queue_adjust_cache_locked(ent);
 965		spin_unlock_irq(&ent->mkeys_queue.lock);
 966	}
 967
 968	return 0;
 969
 970err:
 971	mutex_unlock(&cache->rb_lock);
 972	mlx5_mkey_cache_debugfs_cleanup(dev);
 973	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
 974	return ret;
 975}
 976
 977void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 978{
 979	struct rb_root *root = &dev->cache.rb_root;
 980	struct mlx5_cache_ent *ent;
 981	struct rb_node *node;
 982
 983	if (!dev->cache.wq)
 984		return;
 985
 986	mutex_lock(&dev->cache.rb_lock);
 987	for (node = rb_first(root); node; node = rb_next(node)) {
 988		ent = rb_entry(node, struct mlx5_cache_ent, node);
 989		spin_lock_irq(&ent->mkeys_queue.lock);
 990		ent->disabled = true;
 991		spin_unlock_irq(&ent->mkeys_queue.lock);
 992		cancel_delayed_work(&ent->dwork);
 993	}
 994	mutex_unlock(&dev->cache.rb_lock);
 995
  996	/*
  997	 * Now that all entries are disabled and will not reschedule work on
  998	 * the WQ, flush it together with all async commands.
  999	 */
1000	flush_workqueue(dev->cache.wq);
1001
1002	mlx5_mkey_cache_debugfs_cleanup(dev);
1003	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
1004
1005	/* At this point all entries are disabled and have no concurrent work. */
1006	mutex_lock(&dev->cache.rb_lock);
1007	node = rb_first(root);
1008	while (node) {
1009		ent = rb_entry(node, struct mlx5_cache_ent, node);
1010		node = rb_next(node);
1011		clean_keys(dev, ent);
1012		rb_erase(&ent->node, root);
1013		mlx5r_mkeys_uninit(ent);
1014		kfree(ent);
1015	}
1016	mutex_unlock(&dev->cache.rb_lock);
1017
1018	destroy_workqueue(dev->cache.wq);
1019	del_timer_sync(&dev->delay_timer);
1020}
1021
1022struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
1023{
1024	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1025	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1026	struct mlx5_ib_mr *mr;
1027	void *mkc;
1028	u32 *in;
1029	int err;
1030
1031	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1032	if (!mr)
1033		return ERR_PTR(-ENOMEM);
1034
1035	in = kzalloc(inlen, GFP_KERNEL);
1036	if (!in) {
1037		err = -ENOMEM;
1038		goto err_free;
1039	}
1040
1041	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1042
1043	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
1044	MLX5_SET(mkc, mkc, length64, 1);
1045	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
1046				      pd);
1047	MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
1048
1049	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1050	if (err)
1051		goto err_in;
1052
1053	kfree(in);
1054	mr->mmkey.type = MLX5_MKEY_MR;
1055	mr->ibmr.lkey = mr->mmkey.key;
1056	mr->ibmr.rkey = mr->mmkey.key;
1057	mr->umem = NULL;
1058
1059	return &mr->ibmr;
1060
1061err_in:
1062	kfree(in);
1063
1064err_free:
1065	kfree(mr);
1066
1067	return ERR_PTR(err);
1068}
1069
1070static int get_octo_len(u64 addr, u64 len, int page_shift)
1071{
1072	u64 page_size = 1ULL << page_shift;
1073	u64 offset;
1074	int npages;
1075
1076	offset = addr & (page_size - 1);
1077	npages = ALIGN(len + offset, page_size) >> page_shift;
1078	return (npages + 1) / 2;
1079}
1080
1081static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
1082{
1083	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
1084		return MKEY_CACHE_LAST_STD_ENTRY;
1085	return MLX5_MAX_UMR_SHIFT;
1086}
1087
1088static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
1089			  u64 length, int access_flags, u64 iova)
1090{
1091	mr->ibmr.lkey = mr->mmkey.key;
1092	mr->ibmr.rkey = mr->mmkey.key;
1093	mr->ibmr.length = length;
1094	mr->ibmr.device = &dev->ib_dev;
1095	mr->ibmr.iova = iova;
1096	mr->access_flags = access_flags;
1097}
1098
1099static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
1100						  u64 iova)
1101{
1102	/*
1103	 * The alignment of iova has already been checked upon entering
1104	 * UVERBS_METHOD_REG_DMABUF_MR
1105	 */
1106	umem->iova = iova;
1107	return PAGE_SIZE;
1108}
1109
1110static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
1111					     struct ib_umem *umem, u64 iova,
1112					     int access_flags, int access_mode)
1113{
1114	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1115	struct mlx5r_cache_rb_key rb_key = {};
1116	struct mlx5_cache_ent *ent;
1117	struct mlx5_ib_mr *mr;
1118	unsigned int page_size;
1119
1120	if (umem->is_dmabuf)
1121		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
1122	else
1123		page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
1124	if (WARN_ON(!page_size))
1125		return ERR_PTR(-EINVAL);
1126
1127	rb_key.access_mode = access_mode;
1128	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
1129	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
1130	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
1131	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
1132	/*
1133	 * If the MR can't come from the cache then synchronously create an uncached
1134	 * one.
1135	 */
1136	if (!ent) {
1137		mutex_lock(&dev->slow_path_mutex);
1138		mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
1139		mutex_unlock(&dev->slow_path_mutex);
1140		if (IS_ERR(mr))
1141			return mr;
1142		mr->mmkey.rb_key = rb_key;
1143		mr->mmkey.cacheable = true;
1144		return mr;
1145	}
1146
1147	mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
1148	if (IS_ERR(mr))
1149		return mr;
1150
1151	mr->ibmr.pd = pd;
1152	mr->umem = umem;
1153	mr->page_shift = order_base_2(page_size);
1154	set_mr_fields(dev, mr, umem->length, access_flags, iova);
1155
1156	return mr;
1157}
1158
1159static struct ib_mr *
1160reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
1161			    u32 crossed_lkey)
1162{
1163	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1164	int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
1165	struct mlx5_ib_mr *mr;
1166	void *mkc;
1167	int inlen;
1168	u32 *in;
1169	int err;
1170
1171	if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
1172		return ERR_PTR(-EOPNOTSUPP);
1173
1174	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1175	if (!mr)
1176		return ERR_PTR(-ENOMEM);
1177
1178	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1179	in = kvzalloc(inlen, GFP_KERNEL);
1180	if (!in) {
1181		err = -ENOMEM;
1182		goto err_1;
1183	}
1184
1185	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1186	MLX5_SET(mkc, mkc, crossing_target_vhca_id,
1187		 MLX5_CAP_GEN(dev->mdev, vhca_id));
1188	MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
1189	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
1190	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
1191
 1192	/* For this crossing mkey the IOVA should be 0 and the length should be IOVA + len */
1193	set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
1194	MLX5_SET64(mkc, mkc, len, iova + length);
1195
1196	MLX5_SET(mkc, mkc, free, 0);
1197	MLX5_SET(mkc, mkc, umr_en, 0);
1198	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1199	if (err)
1200		goto err_2;
1201
1202	mr->mmkey.type = MLX5_MKEY_MR;
1203	set_mr_fields(dev, mr, length, access_flags, iova);
1204	mr->ibmr.pd = pd;
1205	kvfree(in);
1206	mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);
1207
1208	return &mr->ibmr;
1209err_2:
1210	kvfree(in);
1211err_1:
1212	kfree(mr);
1213	return ERR_PTR(err);
1214}
1215
 1216/*
 1217 * Create an MR directly with the CREATE_MKEY command, bypassing the mkey
 1218 * cache; when populate is false the mkey is created in the free state.
 1219 */
1220static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
1221				     u64 iova, int access_flags,
1222				     unsigned int page_size, bool populate,
1223				     int access_mode)
1224{
1225	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1226	struct mlx5_ib_mr *mr;
1227	__be64 *pas;
1228	void *mkc;
1229	int inlen;
1230	u32 *in;
1231	int err;
1232	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
1233		(access_mode == MLX5_MKC_ACCESS_MODE_MTT);
1234	bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1235
1236	if (!page_size)
1237		return ERR_PTR(-EINVAL);
1238	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1239	if (!mr)
1240		return ERR_PTR(-ENOMEM);
1241
1242	mr->ibmr.pd = pd;
1243	mr->access_flags = access_flags;
1244	mr->page_shift = order_base_2(page_size);
1245
1246	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1247	if (populate)
1248		inlen += sizeof(*pas) *
1249			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
1250	in = kvzalloc(inlen, GFP_KERNEL);
1251	if (!in) {
1252		err = -ENOMEM;
1253		goto err_1;
1254	}
1255	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
1256	if (populate) {
1257		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
1258			err = -EINVAL;
1259			goto err_2;
1260		}
1261		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
1262				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
1263	}
1264
1265	/* The pg_access bit allows setting the access flags
1266	 * in the page list submitted with the command.
1267	 */
1268	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
1269
1270	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1271	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
1272				      populate ? pd : dev->umrc.pd);
 1273	/* In case of a data direct flow, overwrite the pdn field with the internal kernel PD */
1274	if (umem->is_dmabuf && ksm_mode)
1275		MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);
1276
1277	MLX5_SET(mkc, mkc, free, !populate);
1278	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
1279	MLX5_SET(mkc, mkc, umr_en, 1);
1280
1281	MLX5_SET64(mkc, mkc, len, umem->length);
1282	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
1283	if (ksm_mode)
1284		MLX5_SET(mkc, mkc, translations_octword_size,
1285			 get_octo_len(iova, umem->length, mr->page_shift) * 2);
1286	else
1287		MLX5_SET(mkc, mkc, translations_octword_size,
1288			 get_octo_len(iova, umem->length, mr->page_shift));
1289	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
1290	if (mlx5_umem_needs_ats(dev, umem, access_flags))
1291		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
1292	if (populate) {
1293		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
1294			 get_octo_len(iova, umem->length, mr->page_shift));
1295	}
1296
1297	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1298	if (err) {
1299		mlx5_ib_warn(dev, "create mkey failed\n");
1300		goto err_2;
1301	}
1302	mr->mmkey.type = MLX5_MKEY_MR;
1303	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
1304	mr->umem = umem;
1305	set_mr_fields(dev, mr, umem->length, access_flags, iova);
1306	kvfree(in);
1307
1308	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
1309
1310	return mr;
1311
1312err_2:
1313	kvfree(in);
1314err_1:
1315	kfree(mr);
1316	return ERR_PTR(err);
1317}
1318
1319static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
1320				       u64 length, int acc, int mode)
1321{
1322	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1323	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1324	struct mlx5_ib_mr *mr;
1325	void *mkc;
1326	u32 *in;
1327	int err;
1328
1329	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1330	if (!mr)
1331		return ERR_PTR(-ENOMEM);
1332
1333	in = kzalloc(inlen, GFP_KERNEL);
1334	if (!in) {
1335		err = -ENOMEM;
1336		goto err_free;
1337	}
1338
1339	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1340
1341	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
1342	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
1343	MLX5_SET64(mkc, mkc, len, length);
1344	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
1345
1346	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
1347	if (err)
1348		goto err_in;
1349
1350	kfree(in);
1351
1352	set_mr_fields(dev, mr, length, acc, start_addr);
1353
1354	return &mr->ibmr;
1355
1356err_in:
1357	kfree(in);
1358
1359err_free:
1360	kfree(mr);
1361
1362	return ERR_PTR(err);
1363}
1364
1365int mlx5_ib_advise_mr(struct ib_pd *pd,
1366		      enum ib_uverbs_advise_mr_advice advice,
1367		      u32 flags,
1368		      struct ib_sge *sg_list,
1369		      u32 num_sge,
1370		      struct uverbs_attr_bundle *attrs)
1371{
1372	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
1373	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1374	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1375		return -EOPNOTSUPP;
1376
1377	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
1378					 sg_list, num_sge);
1379}
1380
1381struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
1382				struct ib_dm_mr_attr *attr,
1383				struct uverbs_attr_bundle *attrs)
1384{
1385	struct mlx5_ib_dm *mdm = to_mdm(dm);
1386	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
1387	u64 start_addr = mdm->dev_addr + attr->offset;
1388	int mode;
1389
1390	switch (mdm->type) {
1391	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
1392		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
1393			return ERR_PTR(-EINVAL);
1394
1395		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
1396		start_addr -= pci_resource_start(dev->pdev, 0);
1397		break;
1398	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
1399	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
1400	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
1401	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
1402		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
1403			return ERR_PTR(-EINVAL);
1404
1405		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
1406		break;
1407	default:
1408		return ERR_PTR(-EINVAL);
1409	}
1410
1411	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
1412				 attr->access_flags, mode);
1413}
1414
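/*
 * Regular (non-ODP, non-dmabuf) user MR registration: when UMR can load
 * the page list, allocate a cacheable MR (its mkey starts out disabled)
 * and enable it by writing the PAS via UMR; otherwise fall back to a
 * synchronous CREATE_MKEY with the page list inlined (populate == true).
 */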
1415static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
1416				    u64 iova, int access_flags)
1417{
1418	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1419	struct mlx5_ib_mr *mr = NULL;
1420	bool xlt_with_umr;
1421	int err;
1422
1423	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
1424	if (xlt_with_umr) {
1425		mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
1426					MLX5_MKC_ACCESS_MODE_MTT);
1427	} else {
1428		unsigned int page_size =
1429			mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
1430
1431		mutex_lock(&dev->slow_path_mutex);
1432		mr = reg_create(pd, umem, iova, access_flags, page_size,
1433				true, MLX5_MKC_ACCESS_MODE_MTT);
1434		mutex_unlock(&dev->slow_path_mutex);
1435	}
1436	if (IS_ERR(mr)) {
1437		ib_umem_release(umem);
1438		return ERR_CAST(mr);
1439	}
1440
1441	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1442
1443	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1444
1445	if (xlt_with_umr) {
1446		/*
1447		 * If the MR was created with reg_create then it will be
1448		 * configured properly but left disabled. It is safe to go ahead
1449		 * and configure it again via UMR while enabling it.
1450		 */
1451		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
1452		if (err) {
1453			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1454			return ERR_PTR(err);
1455		}
1456	}
1457	return &mr->ibmr;
1458}
1459
1460static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
1461					u64 iova, int access_flags,
1462					struct ib_udata *udata)
1463{
1464	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1465	struct ib_umem_odp *odp;
1466	struct mlx5_ib_mr *mr;
1467	int err;
1468
1469	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1470		return ERR_PTR(-EOPNOTSUPP);
1471
1472	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
1473	if (err)
1474		return ERR_PTR(err);
1475	if (!start && length == U64_MAX) {
1476		if (iova != 0)
1477			return ERR_PTR(-EINVAL);
1478		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
1479			return ERR_PTR(-EINVAL);
1480
1481		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
1482		if (IS_ERR(mr))
1483			return ERR_CAST(mr);
1484		return &mr->ibmr;
1485	}
1486
1487	/* ODP requires xlt update via umr to work. */
1488	if (!mlx5r_umr_can_load_pas(dev, length))
1489		return ERR_PTR(-EINVAL);
1490
1491	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
1492			      &mlx5_mn_ops);
1493	if (IS_ERR(odp))
1494		return ERR_CAST(odp);
1495
1496	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
1497				MLX5_MKC_ACCESS_MODE_MTT);
1498	if (IS_ERR(mr)) {
1499		ib_umem_release(&odp->umem);
1500		return ERR_CAST(mr);
1501	}
1502	xa_init(&mr->implicit_children);
1503
1504	odp->private = mr;
1505	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1506	if (err)
1507		goto err_dereg_mr;
1508
1509	err = mlx5_ib_init_odp_mr(mr);
1510	if (err)
1511		goto err_dereg_mr;
1512	return &mr->ibmr;
1513
1514err_dereg_mr:
1515	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
1516	return ERR_PTR(err);
1517}
1518
1519struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1520				  u64 iova, int access_flags,
1521				  struct ib_udata *udata)
1522{
1523	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1524	struct ib_umem *umem;
1525	int err;
1526
1527	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
1528		return ERR_PTR(-EOPNOTSUPP);
1529
1530	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1531		    start, iova, length, access_flags);
1532
1533	err = mlx5r_umr_resource_init(dev);
1534	if (err)
1535		return ERR_PTR(err);
1536
1537	if (access_flags & IB_ACCESS_ON_DEMAND)
1538		return create_user_odp_mr(pd, start, length, iova, access_flags,
1539					  udata);
1540	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
1541	if (IS_ERR(umem))
1542		return ERR_CAST(umem);
1543	return create_real_mr(pd, umem, iova, access_flags);
1544}
1545
1546static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
1547{
1548	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
1549	struct mlx5_ib_mr *mr = umem_dmabuf->private;
1550
1551	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);
1552
1553	if (!umem_dmabuf->sgt || !mr)
1554		return;
1555
1556	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
1557	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
1558}
1559
1560static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
1561	.allow_peer2peer = 1,
1562	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
1563};
1564
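/*
 * Register a dmabuf-backed MR. In the default mode the dmabuf is attached
 * with mlx5_ib_dmabuf_attach_ops, so the exporter may move the buffer and
 * move_notify will zap the mkey and unmap the pages. In pinned (KSM,
 * data-direct) mode the pages are pinned against the supplied dma_device
 * instead and the MR is marked data_direct.
 */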
1565static struct ib_mr *
1566reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
1567		   u64 offset, u64 length, u64 virt_addr,
1568		   int fd, int access_flags, int access_mode)
1569{
1570	bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
1571	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1572	struct mlx5_ib_mr *mr = NULL;
1573	struct ib_umem_dmabuf *umem_dmabuf;
1574	int err;
1575
1576	err = mlx5r_umr_resource_init(dev);
1577	if (err)
1578		return ERR_PTR(err);
1579
1580	if (!pinned_mode)
1581		umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
1582						 offset, length, fd,
1583						 access_flags,
1584						 &mlx5_ib_dmabuf_attach_ops);
1585	else
1586		umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
1587				dma_device, offset, length,
1588				fd, access_flags);
1589
1590	if (IS_ERR(umem_dmabuf)) {
1591		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
1592			    PTR_ERR(umem_dmabuf));
1593		return ERR_CAST(umem_dmabuf);
1594	}
1595
1596	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
1597				access_flags, access_mode);
1598	if (IS_ERR(mr)) {
1599		ib_umem_release(&umem_dmabuf->umem);
1600		return ERR_CAST(mr);
1601	}
1602
1603	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1604
1605	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
1606	umem_dmabuf->private = mr;
1607	if (!pinned_mode) {
1608		err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
1609		if (err)
1610			goto err_dereg_mr;
1611	} else {
1612		mr->data_direct = true;
1613	}
1614
1615	err = mlx5_ib_init_dmabuf_mr(mr);
1616	if (err)
1617		goto err_dereg_mr;
1618	return &mr->ibmr;
1619
1620err_dereg_mr:
1621	__mlx5_ib_dereg_mr(&mr->ibmr);
1622	return ERR_PTR(err);
1623}
1624
1625static struct ib_mr *
1626reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
1627				  u64 length, u64 virt_addr,
1628				  int fd, int access_flags)
1629{
1630	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1631	struct mlx5_data_direct_dev *data_direct_dev;
1632	struct ib_mr *crossing_mr;
1633	struct ib_mr *crossed_mr;
1634	int ret = 0;
1635
 1636	/* Per HW behaviour, the IOVA must be page aligned in KSM mode */
1637	if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
1638		return ERR_PTR(-EOPNOTSUPP);
1639
1640	mutex_lock(&dev->data_direct_lock);
1641	data_direct_dev = dev->data_direct_dev;
1642	if (!data_direct_dev) {
1643		ret = -EINVAL;
1644		goto end;
1645	}
1646
1647	/* The device's 'data direct mkey' was created without RO flags to
1648	 * simplify things and allow for a single mkey per device.
1649	 * Since RO is not a must, mask it out accordingly.
1650	 */
1651	access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
1652	crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
1653					offset, length, virt_addr, fd,
1654					access_flags, MLX5_MKC_ACCESS_MODE_KSM);
1655	if (IS_ERR(crossed_mr)) {
1656		ret = PTR_ERR(crossed_mr);
1657		goto end;
1658	}
1659
1660	mutex_lock(&dev->slow_path_mutex);
1661	crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
1662						  crossed_mr->lkey);
1663	mutex_unlock(&dev->slow_path_mutex);
1664	if (IS_ERR(crossing_mr)) {
1665		__mlx5_ib_dereg_mr(crossed_mr);
1666		ret = PTR_ERR(crossing_mr);
1667		goto end;
1668	}
1669
1670	list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
1671	to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
1672	to_mmr(crossing_mr)->data_direct = true;
1673end:
1674	mutex_unlock(&dev->data_direct_lock);
1675	return ret ? ERR_PTR(ret) : crossing_mr;
1676}
1677
1678struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
1679					 u64 length, u64 virt_addr,
1680					 int fd, int access_flags,
1681					 struct uverbs_attr_bundle *attrs)
1682{
1683	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1684	int mlx5_access_flags = 0;
1685	int err;
1686
1687	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
1688	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
1689		return ERR_PTR(-EOPNOTSUPP);
1690
1691	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
1692		err = uverbs_get_flags32(&mlx5_access_flags, attrs,
1693					 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
1694					 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
1695		if (err)
1696			return ERR_PTR(err);
1697	}
1698
1699	mlx5_ib_dbg(dev,
1700		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
1701		    offset, virt_addr, length, fd, access_flags, mlx5_access_flags);
1702
1703	/* dmabuf requires xlt update via umr to work. */
1704	if (!mlx5r_umr_can_load_pas(dev, length))
1705		return ERR_PTR(-EINVAL);
1706
1707	if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
1708		return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr,
1709							 fd, access_flags);
1710
1711	return reg_user_mr_dmabuf(pd, pd->device->dma_device,
1712				  offset, length, virt_addr,
1713				  fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
1714}
1715
1716/*
1717 * True if the change in access flags can be done via UMR, only some access
1718 * flags can be updated.
1719 */
1720static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
1721				     unsigned int current_access_flags,
1722				     unsigned int target_access_flags)
1723{
1724	unsigned int diffs = current_access_flags ^ target_access_flags;
1725
1726	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
1727		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
1728		      IB_ACCESS_REMOTE_ATOMIC))
1729		return false;
1730	return mlx5r_umr_can_reconfig(dev, current_access_flags,
1731				      target_access_flags);
1732}
1733
1734static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
1735				  struct ib_umem *new_umem,
1736				  int new_access_flags, u64 iova,
1737				  unsigned long *page_size)
1738{
1739	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1740
1741	/* We only track the allocated sizes of MRs from the cache */
1742	if (!mr->mmkey.cache_ent)
1743		return false;
1744	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
1745		return false;
1746
1747	*page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
1748	if (WARN_ON(!*page_size))
1749		return false;
1750	return (mr->mmkey.cache_ent->rb_key.ndescs) >=
1751	       ib_umem_num_dma_blocks(new_umem, *page_size);
1752}
1753
1754static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
1755			 int access_flags, int flags, struct ib_umem *new_umem,
1756			 u64 iova, unsigned long page_size)
1757{
1758	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1759	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
1760	struct ib_umem *old_umem = mr->umem;
1761	int err;
1762
 1763	/*
 1764	 * To keep everything simple the MR is revoked before we start to mess
 1765	 * with it. This ensures the change is atomic relative to any use of the
 1766	 * MR.
 1767	 */
1768	err = mlx5r_umr_revoke_mr(mr);
1769	if (err)
1770		return err;
1771
1772	if (flags & IB_MR_REREG_PD) {
1773		mr->ibmr.pd = pd;
1774		upd_flags |= MLX5_IB_UPD_XLT_PD;
1775	}
1776	if (flags & IB_MR_REREG_ACCESS) {
1777		mr->access_flags = access_flags;
1778		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
1779	}
1780
1781	mr->ibmr.iova = iova;
1782	mr->ibmr.length = new_umem->length;
1783	mr->page_shift = order_base_2(page_size);
1784	mr->umem = new_umem;
1785	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
1786	if (err) {
 1787		/*
 1788		 * The MR is revoked at this point, so it is safe to free
 1789		 * new_umem.
 1790		 */
1791		mr->umem = old_umem;
1792		return err;
1793	}
1794
1795	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
1796	ib_umem_release(old_umem);
1797	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
1798	return 0;
1799}
1800
1801struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1802				    u64 length, u64 iova, int new_access_flags,
1803				    struct ib_pd *new_pd,
1804				    struct ib_udata *udata)
1805{
1806	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1807	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1808	int err;
1809
1810	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
1811		return ERR_PTR(-EOPNOTSUPP);
1812
1813	mlx5_ib_dbg(
1814		dev,
1815		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
1816		start, iova, length, new_access_flags);
1817
1818	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
1819		return ERR_PTR(-EOPNOTSUPP);
1820
1821	if (!(flags & IB_MR_REREG_ACCESS))
1822		new_access_flags = mr->access_flags;
1823	if (!(flags & IB_MR_REREG_PD))
1824		new_pd = ib_mr->pd;
1825
1826	if (!(flags & IB_MR_REREG_TRANS)) {
1827		struct ib_umem *umem;
1828
1829		/* Fast path for PD/access change */
1830		if (can_use_umr_rereg_access(dev, mr->access_flags,
1831					     new_access_flags)) {
1832			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
1833							new_access_flags);
1834			if (err)
1835				return ERR_PTR(err);
1836			return NULL;
1837		}
1838		/* DM or ODP MRs don't have a normal umem, so we can't reuse it */
1839		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1840			goto recreate;
1841
1842		/*
1843		 * Only one active MR can refer to a umem at a time; revoke
1844		 * the old MR before assigning the umem to the new one.
1845		 */
1846		err = mlx5r_umr_revoke_mr(mr);
1847		if (err)
1848			return ERR_PTR(err);
1849		umem = mr->umem;
1850		mr->umem = NULL;
1851		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);
1852
1853		return create_real_mr(new_pd, umem, mr->ibmr.iova,
1854				      new_access_flags);
1855	}
1856
1857	/*
1858	 * DM doesn't have a PAS list, so we can't reuse it; ODP/dmabuf MRs
1859	 * do, but the logic around releasing the umem is different.
1860	 */
1861	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
1862		goto recreate;
1863
1864	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
1865	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
1866		struct ib_umem *new_umem;
1867		unsigned long page_size;
1868
1869		new_umem = ib_umem_get(&dev->ib_dev, start, length,
1870				       new_access_flags);
1871		if (IS_ERR(new_umem))
1872			return ERR_CAST(new_umem);
1873
1874		/* Fast path for PAS change */
1875		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
1876					  &page_size)) {
1877			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
1878					    new_umem, iova, page_size);
1879			if (err) {
1880				ib_umem_release(new_umem);
1881				return ERR_PTR(err);
1882			}
1883			return NULL;
1884		}
1885		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
1886	}
1887
1888	/*
1889	 * Everything else has no state we can preserve; just create a new MR
1890	 * from scratch.
1891	 */
1892recreate:
1893	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
1894				   new_access_flags, udata);
1895}
1896
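/*
 * Allocate the driver-private descriptor buffer (MTTs or KLMs) for a kernel
 * MR and DMA-map it. Descriptors must be MLX5_UMR_ALIGN (2048 byte) aligned,
 * so the allocation is padded and PTR_ALIGN() is applied; the padding is
 * capped where kmalloc's natural alignment for power-of-two sizes already
 * guarantees the alignment (e.g. 512 bytes of descriptors typically end up
 * as one 2048-byte, 2048-byte-aligned allocation).
 */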
1897static int
1898mlx5_alloc_priv_descs(struct ib_device *device,
1899		      struct mlx5_ib_mr *mr,
1900		      int ndescs,
1901		      int desc_size)
1902{
1903	struct mlx5_ib_dev *dev = to_mdev(device);
1904	struct device *ddev = &dev->mdev->pdev->dev;
1905	int size = ndescs * desc_size;
1906	int add_size;
1907	int ret;
1908
1909	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
1910	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
1911		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));
1912
1913		add_size = min_t(int, end - size, add_size);
1914	}
1915
1916	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1917	if (!mr->descs_alloc)
1918		return -ENOMEM;
1919
1920	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1921
1922	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
1923	if (dma_mapping_error(ddev, mr->desc_map)) {
1924		ret = -ENOMEM;
1925		goto err;
1926	}
1927
1928	return 0;
1929err:
1930	kfree(mr->descs_alloc);
1931
1932	return ret;
1933}
1934
1935static void
1936mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1937{
1938	if (!mr->umem && !mr->data_direct &&
1939	    mr->ibmr.type != IB_MR_TYPE_DM && mr->descs) {
1940		struct ib_device *device = mr->ibmr.device;
1941		int size = mr->max_descs * mr->desc_size;
1942		struct mlx5_ib_dev *dev = to_mdev(device);
1943
1944		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
1945				 DMA_TO_DEVICE);
1946		kfree(mr->descs_alloc);
1947		mr->descs = NULL;
1948	}
1949}
1950
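/*
 * Return a revoked, cacheable mkey to the mkey cache: push it back to the
 * entry it was taken from, or find/create a (possibly temporary) cache
 * entry matching its rb_key and store it there for later reuse.
 */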
1951static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
1952				    struct mlx5_ib_mr *mr)
1953{
1954	struct mlx5_mkey_cache *cache = &dev->cache;
1955	struct mlx5_cache_ent *ent;
1956	int ret;
1957
1958	if (mr->mmkey.cache_ent) {
1959		spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1960		mr->mmkey.cache_ent->in_use--;
1961		goto end;
1962	}
1963
1964	mutex_lock(&cache->rb_lock);
1965	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
1966	if (ent) {
1967		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
1968			if (ent->disabled) {
1969				mutex_unlock(&cache->rb_lock);
1970				return -EOPNOTSUPP;
1971			}
1972			mr->mmkey.cache_ent = ent;
1973			spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1974			mutex_unlock(&cache->rb_lock);
1975			goto end;
1976		}
1977	}
1978
1979	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
1980	mutex_unlock(&cache->rb_lock);
1981	if (IS_ERR(ent))
1982		return PTR_ERR(ent);
1983
1984	mr->mmkey.cache_ent = ent;
1985	spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1986
1987end:
1988	ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
1989	spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
1990	return ret;
1991}
1992
1993static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
1994{
1995	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1996	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
1997	int err;
1998
1999	lockdep_assert_held(&dev->data_direct_lock);
2000	mr->revoked = true;
2001	err = mlx5r_umr_revoke_mr(mr);
2002	if (WARN_ON(err))
2003		return err;
2004
2005	ib_umem_dmabuf_revoke(umem_dmabuf);
2006	return 0;
2007}
2008
2009void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
2010{
2011	struct mlx5_ib_mr *mr, *next;
2012
2013	lockdep_assert_held(&dev->data_direct_lock);
2014
2015	list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
2016		list_del(&mr->dd_node);
2017		mlx5_ib_revoke_data_direct_mr(mr);
2018	}
2019}
2020
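/*
 * Detach the mkey from any HW use ("stop DMA"). Cacheable mkeys are revoked
 * and parked back in the mkey cache for reuse; everything else is destroyed.
 * ODP and non-pinned dmabuf MRs are handled under their respective locks so
 * page-fault handlers cannot race with the teardown.
 */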
2021static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
2022{
2023	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
2024	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
2025	bool is_odp = is_odp_mr(mr);
2026	bool is_odp_dma_buf = is_dmabuf_mr(mr) &&
2027			!to_ib_umem_dmabuf(mr->umem)->pinned;
2028	int ret = 0;
2029
2030	if (is_odp)
2031		mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2032
2033	if (is_odp_dma_buf)
2034		dma_resv_lock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv, NULL);
2035
2036	if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
2037		ent = mr->mmkey.cache_ent;
2038		/* Upon storing to a clean temp entry, schedule its cleanup */
2039		spin_lock_irq(&ent->mkeys_queue.lock);
2040		if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
2041			mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
2042					 msecs_to_jiffies(30 * 1000));
2043			ent->tmp_cleanup_scheduled = true;
2044		}
2045		spin_unlock_irq(&ent->mkeys_queue.lock);
2046		goto out;
2047	}
2048
2049	if (ent) {
2050		spin_lock_irq(&ent->mkeys_queue.lock);
2051		ent->in_use--;
2052		mr->mmkey.cache_ent = NULL;
2053		spin_unlock_irq(&ent->mkeys_queue.lock);
2054	}
2055	ret = destroy_mkey(dev, mr);
2056out:
2057	if (is_odp) {
2058		if (!ret)
2059			to_ib_umem_odp(mr->umem)->private = NULL;
2060		mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
2061	}
2062
2063	if (is_odp_dma_buf) {
2064		if (!ret)
2065			to_ib_umem_dmabuf(mr->umem)->private = NULL;
2066		dma_resv_unlock(to_ib_umem_dmabuf(mr->umem)->attach->dmabuf->resv);
2067	}
2068
2069	return ret;
2070}
2071
2072static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
2073{
2074	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2075	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
2076	int rc;
2077
2078	/*
2079	 * Any async use of the MR must hold the refcount; once the refcount
2080	 * goes to zero, no other thread (ODP page faults, prefetch, UMR
2081	 * activity, etc.) can touch the mkey, so it is safe to destroy it.
2082	 */
2083	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2084	    refcount_read(&mr->mmkey.usecount) != 0 &&
2085	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
2086		mlx5r_deref_wait_odp_mkey(&mr->mmkey);
2087
2088	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
2089		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2090			   mr->sig, NULL, GFP_KERNEL);
2091
2092		if (mr->mtt_mr) {
2093			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2094			if (rc)
2095				return rc;
2096			mr->mtt_mr = NULL;
2097		}
2098		if (mr->klm_mr) {
2099			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2100			if (rc)
2101				return rc;
2102			mr->klm_mr = NULL;
2103		}
2104
2105		if (mlx5_core_destroy_psv(dev->mdev,
2106					  mr->sig->psv_memory.psv_idx))
2107			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2108				     mr->sig->psv_memory.psv_idx);
2109		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2110			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2111				     mr->sig->psv_wire.psv_idx);
2112		kfree(mr->sig);
2113		mr->sig = NULL;
2114	}
2115
2116	/* Stop DMA */
2117	rc = mlx5_revoke_mr(mr);
2118	if (rc)
2119		return rc;
2120
2121	if (mr->umem) {
2122		bool is_odp = is_odp_mr(mr);
2123
2124		if (!is_odp)
2125			atomic_sub(ib_umem_num_pages(mr->umem),
2126				   &dev->mdev->priv.reg_pages);
2127		ib_umem_release(mr->umem);
2128		if (is_odp)
2129			mlx5_ib_free_odp_mr(mr);
2130	}
2131
2132	if (!mr->mmkey.cache_ent)
2133		mlx5_free_priv_descs(mr);
2134
2135	kfree(mr);
2136	return 0;
2137}
2138
2139static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
2140					struct mlx5_ib_mr *mr)
2141{
2142	struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
2143	int ret;
2144
2145	ret = __mlx5_ib_dereg_mr(&mr->ibmr);
2146	if (ret)
2147		return ret;
2148
2149	mutex_lock(&dev->data_direct_lock);
2150	if (!dd_crossed_mr->revoked)
2151		list_del(&dd_crossed_mr->dd_node);
2152
2153	ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
2154	mutex_unlock(&dev->data_direct_lock);
2155	return ret;
2156}
2157
2158int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
2159{
2160	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2161	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
2162
2163	if (mr->data_direct)
2164		return dereg_crossing_data_direct_mr(dev, mr);
2165
2166	return __mlx5_ib_dereg_mr(ibmr);
2167}
2168
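/*
 * Fill the mkey context for a kernel-owned MR that is created in the "free"
 * state and configured later through UMR (used by the fast-registration MR
 * types below).
 */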
2169static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
2170				   int access_mode, int page_shift)
2171{
2172	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2173	void *mkc;
2174
2175	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2176
2177	/* This is only used from the kernel, so setting the PD is OK. */
2178	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
2179	MLX5_SET(mkc, mkc, free, 1);
2180	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2181	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
2182	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
2183	MLX5_SET(mkc, mkc, umr_en, 1);
2184	MLX5_SET(mkc, mkc, log_page_size, page_shift);
2185	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
2186	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2187		MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
2188}
2189
2190static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2191				  int ndescs, int desc_size, int page_shift,
2192				  int access_mode, u32 *in, int inlen)
2193{
2194	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2195	int err;
2196
2197	mr->access_mode = access_mode;
2198	mr->desc_size = desc_size;
2199	mr->max_descs = ndescs;
2200
2201	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
2202	if (err)
2203		return err;
2204
2205	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
2206
2207	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
2208	if (err)
2209		goto err_free_descs;
2210
2211	mr->mmkey.type = MLX5_MKEY_MR;
2212	mr->ibmr.lkey = mr->mmkey.key;
2213	mr->ibmr.rkey = mr->mmkey.key;
2214
2215	return 0;
2216
2217err_free_descs:
2218	mlx5_free_priv_descs(mr);
2219	return err;
2220}
2221
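/*
 * Allocate an internal MTT- or KLM-based MR that an integrity MR uses to
 * map its data and protection-information scatterlists.
 */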
2222static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
2223				u32 max_num_sg, u32 max_num_meta_sg,
2224				int desc_size, int access_mode)
2225{
2226	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2227	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
2228	int page_shift = 0;
2229	struct mlx5_ib_mr *mr;
2230	u32 *in;
2231	int err;
2232
2233	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2234	if (!mr)
2235		return ERR_PTR(-ENOMEM);
2236
2237	mr->ibmr.pd = pd;
2238	mr->ibmr.device = pd->device;
2239
2240	in = kzalloc(inlen, GFP_KERNEL);
2241	if (!in) {
2242		err = -ENOMEM;
2243		goto err_free;
2244	}
2245
2246	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
2247		page_shift = PAGE_SHIFT;
2248
2249	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
2250				     access_mode, in, inlen);
2251	if (err)
2252		goto err_free_in;
2253
2254	mr->umem = NULL;
2255	kfree(in);
2256
2257	return mr;
2258
2259err_free_in:
2260	kfree(in);
2261err_free:
2262	kfree(mr);
2263	return ERR_PTR(err);
2264}
2265
2266static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2267				    int ndescs, u32 *in, int inlen)
2268{
2269	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
2270				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
2271				      inlen);
2272}
2273
2274static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2275				    int ndescs, u32 *in, int inlen)
2276{
2277	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
2278				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2279}
2280
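/*
 * Build an IB_MR_TYPE_INTEGRITY MR: create the memory and wire PSVs, the
 * internal KLM and MTT pi MRs, and the signature-enabled (BSF) mkey that
 * ties them together.
 */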
2281static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
2282				      int max_num_sg, int max_num_meta_sg,
2283				      u32 *in, int inlen)
2284{
2285	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2286	u32 psv_index[2];
2287	void *mkc;
2288	int err;
2289
2290	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
2291	if (!mr->sig)
2292		return -ENOMEM;
2293
2294	/* create mem & wire PSVs */
2295	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
2296	if (err)
2297		goto err_free_sig;
2298
2299	mr->sig->psv_memory.psv_idx = psv_index[0];
2300	mr->sig->psv_wire.psv_idx = psv_index[1];
2301
2302	mr->sig->sig_status_checked = true;
2303	mr->sig->sig_err_exists = false;
2304	/* Arm SIGERR on the next UMR */
2305	++mr->sig->sigerr_count;
2306	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2307					 sizeof(struct mlx5_klm),
2308					 MLX5_MKC_ACCESS_MODE_KLMS);
2309	if (IS_ERR(mr->klm_mr)) {
2310		err = PTR_ERR(mr->klm_mr);
2311		goto err_destroy_psv;
2312	}
2313	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
2314					 sizeof(struct mlx5_mtt),
2315					 MLX5_MKC_ACCESS_MODE_MTT);
2316	if (IS_ERR(mr->mtt_mr)) {
2317		err = PTR_ERR(mr->mtt_mr);
2318		goto err_free_klm_mr;
2319	}
2320
2321	/* Set bsf descriptors for mkey */
2322	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2323	MLX5_SET(mkc, mkc, bsf_en, 1);
2324	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
2325
2326	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
2327				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
2328	if (err)
2329		goto err_free_mtt_mr;
2330
2331	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
2332			      mr->sig, GFP_KERNEL));
2333	if (err)
2334		goto err_free_descs;
2335	return 0;
2336
2337err_free_descs:
2338	destroy_mkey(dev, mr);
2339	mlx5_free_priv_descs(mr);
2340err_free_mtt_mr:
2341	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
2342	mr->mtt_mr = NULL;
2343err_free_klm_mr:
2344	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
2345	mr->klm_mr = NULL;
2346err_destroy_psv:
2347	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
2348		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
2349			     mr->sig->psv_memory.psv_idx);
2350	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
2351		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
2352			     mr->sig->psv_wire.psv_idx);
2353err_free_sig:
2354	kfree(mr->sig);
2355
2356	return err;
2357}
2358
2359static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
2360					enum ib_mr_type mr_type, u32 max_num_sg,
2361					u32 max_num_meta_sg)
2362{
2363	struct mlx5_ib_dev *dev = to_mdev(pd->device);
2364	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2365	int ndescs = ALIGN(max_num_sg, 4);
2366	struct mlx5_ib_mr *mr;
2367	u32 *in;
2368	int err;
2369
2370	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
2371	if (!mr)
2372		return ERR_PTR(-ENOMEM);
2373
2374	in = kzalloc(inlen, GFP_KERNEL);
2375	if (!in) {
2376		err = -ENOMEM;
2377		goto err_free;
2378	}
2379
2380	mr->ibmr.device = pd->device;
2381	mr->umem = NULL;
2382
2383	switch (mr_type) {
2384	case IB_MR_TYPE_MEM_REG:
2385		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
2386		break;
2387	case IB_MR_TYPE_SG_GAPS:
2388		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
2389		break;
2390	case IB_MR_TYPE_INTEGRITY:
2391		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
2392						 max_num_meta_sg, in, inlen);
2393		break;
2394	default:
2395		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
2396		err = -EINVAL;
2397	}
2398
2399	if (err)
2400		goto err_free_in;
2401
2402	kfree(in);
2403
2404	return &mr->ibmr;
2405
2406err_free_in:
2407	kfree(in);
2408err_free:
2409	kfree(mr);
2410	return ERR_PTR(err);
2411}
2412
2413struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
2414			       u32 max_num_sg)
2415{
2416	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
2417}
2418
2419struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
2420					 u32 max_num_sg, u32 max_num_meta_sg)
2421{
2422	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
2423				  max_num_meta_sg);
2424}
2425
2426int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
2427{
2428	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
2429	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
2430	struct mlx5_ib_mw *mw = to_mmw(ibmw);
2431	unsigned int ndescs;
2432	u32 *in = NULL;
2433	void *mkc;
2434	int err;
2435	struct mlx5_ib_alloc_mw req = {};
2436	struct {
2437		__u32	comp_mask;
2438		__u32	response_length;
2439	} resp = {};
2440
2441	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
2442	if (err)
2443		return err;
2444
2445	if (req.comp_mask || req.reserved1 || req.reserved2)
2446		return -EOPNOTSUPP;
2447
2448	if (udata->inlen > sizeof(req) &&
2449	    !ib_is_udata_cleared(udata, sizeof(req),
2450				 udata->inlen - sizeof(req)))
2451		return -EOPNOTSUPP;
2452
2453	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
2454
2455	in = kzalloc(inlen, GFP_KERNEL);
2456	if (!in)
2457		return -ENOMEM;
2458
2459	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
2460
2461	MLX5_SET(mkc, mkc, free, 1);
2462	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
2463	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
2464	MLX5_SET(mkc, mkc, umr_en, 1);
2465	MLX5_SET(mkc, mkc, lr, 1);
2466	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
2467	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
2468	MLX5_SET(mkc, mkc, qpn, 0xffffff);
2469
2470	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
2471	if (err)
2472		goto free;
2473
2474	mw->mmkey.type = MLX5_MKEY_MW;
2475	ibmw->rkey = mw->mmkey.key;
2476	mw->mmkey.ndescs = ndescs;
2477
2478	resp.response_length =
2479		min(offsetofend(typeof(resp), response_length), udata->outlen);
2480	if (resp.response_length) {
2481		err = ib_copy_to_udata(udata, &resp, resp.response_length);
2482		if (err)
2483			goto free_mkey;
2484	}
2485
2486	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
2487		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
2488		if (err)
2489			goto free_mkey;
2490	}
2491
2492	kfree(in);
2493	return 0;
2494
2495free_mkey:
2496	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
2497free:
2498	kfree(in);
2499	return err;
2500}
2501
2502int mlx5_ib_dealloc_mw(struct ib_mw *mw)
2503{
2504	struct mlx5_ib_dev *dev = to_mdev(mw->device);
2505	struct mlx5_ib_mw *mmw = to_mmw(mw);
2506
2507	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
2508	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
2509		/*
2510		 * pagefault_single_data_segment() may be accessing mmw
2511		 * if the user bound an ODP MR to this MW.
2512		 */
2513		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);
2514
2515	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
2516}
2517
2518int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
2519			    struct ib_mr_status *mr_status)
2520{
2521	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
2522	int ret = 0;
2523
2524	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
2525		pr_err("Invalid status check mask\n");
2526		ret = -EINVAL;
2527		goto done;
2528	}
2529
2530	mr_status->fail_status = 0;
2531	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
2532		if (!mmr->sig) {
2533			ret = -EINVAL;
2534			pr_err("signature status check requested on a non-signature enabled MR\n");
2535			goto done;
2536		}
2537
2538		mmr->sig->sig_status_checked = true;
2539		if (!mmr->sig->sig_err_exists)
2540			goto done;
2541
2542		if (ibmr->lkey == mmr->sig->err_item.key)
2543			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
2544			       sizeof(mr_status->sig_err));
2545		else {
2546			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
2547			mr_status->sig_err.sig_err_offset = 0;
2548			mr_status->sig_err.key = mmr->sig->err_item.key;
2549		}
2550
2551		mmr->sig->sig_err_exists = false;
2552		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
2553	}
2554
2555done:
2556	return ret;
2557}
2558
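/*
 * Fast path for integrity mapping: when the data (and optionally the
 * metadata) is a single DMA-contiguous SG entry, record its address and
 * length directly so it can be described with local_dma_lkey and no
 * descriptor list is needed.
 */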
2559static int
2560mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2561			int data_sg_nents, unsigned int *data_sg_offset,
2562			struct scatterlist *meta_sg, int meta_sg_nents,
2563			unsigned int *meta_sg_offset)
2564{
2565	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2566	unsigned int sg_offset = 0;
2567	int n = 0;
2568
2569	mr->meta_length = 0;
2570	if (data_sg_nents == 1) {
2571		n++;
2572		mr->mmkey.ndescs = 1;
2573		if (data_sg_offset)
2574			sg_offset = *data_sg_offset;
2575		mr->data_length = sg_dma_len(data_sg) - sg_offset;
2576		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
2577		if (meta_sg_nents == 1) {
2578			n++;
2579			mr->meta_ndescs = 1;
2580			if (meta_sg_offset)
2581				sg_offset = *meta_sg_offset;
2582			else
2583				sg_offset = 0;
2584			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
2585			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
2586		}
2587		ibmr->length = mr->data_length + mr->meta_length;
2588	}
2589
2590	return n;
2591}
2592
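/*
 * Convert the data (and optional metadata) scatterlists into KLM
 * descriptors referencing the PD's local_dma_lkey, updating the MR's iova,
 * lengths and descriptor counts as it goes.
 */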
2593static int
2594mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
2595		   struct scatterlist *sgl,
2596		   unsigned short sg_nents,
2597		   unsigned int *sg_offset_p,
2598		   struct scatterlist *meta_sgl,
2599		   unsigned short meta_sg_nents,
2600		   unsigned int *meta_sg_offset_p)
2601{
2602	struct scatterlist *sg = sgl;
2603	struct mlx5_klm *klms = mr->descs;
2604	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
2605	u32 lkey = mr->ibmr.pd->local_dma_lkey;
2606	int i, j = 0;
2607
2608	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
2609	mr->ibmr.length = 0;
2610
2611	for_each_sg(sgl, sg, sg_nents, i) {
2612		if (unlikely(i >= mr->max_descs))
2613			break;
2614		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
2615		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
2616		klms[i].key = cpu_to_be32(lkey);
2617		mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2618
2619		sg_offset = 0;
2620	}
2621
2622	if (sg_offset_p)
2623		*sg_offset_p = sg_offset;
2624
2625	mr->mmkey.ndescs = i;
2626	mr->data_length = mr->ibmr.length;
2627
2628	if (meta_sg_nents) {
2629		sg = meta_sgl;
2630		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
2631		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
2632			if (unlikely(i + j >= mr->max_descs))
2633				break;
2634			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
2635						     sg_offset);
2636			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
2637							 sg_offset);
2638			klms[i + j].key = cpu_to_be32(lkey);
2639			mr->ibmr.length += sg_dma_len(sg) - sg_offset;
2640
2641			sg_offset = 0;
2642		}
2643		if (meta_sg_offset_p)
2644			*meta_sg_offset_p = sg_offset;
2645
2646		mr->meta_ndescs = j;
2647		mr->meta_length = mr->ibmr.length - mr->data_length;
2648	}
2649
2650	return i + j;
2651}
2652
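/*
 * ib_sg_to_pages() callback: write one MTT entry (page address plus
 * read/write enable bits) per page into the descriptor buffer.
 */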
2653static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
2654{
2655	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2656	__be64 *descs;
2657
2658	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
2659		return -ENOMEM;
2660
2661	descs = mr->descs;
2662	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2663
2664	return 0;
2665}
2666
2667static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
2668{
2669	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2670	__be64 *descs;
2671
2672	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
2673		return -ENOMEM;
2674
2675	descs = mr->descs;
2676	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
2677		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
2678
2679	return 0;
2680}
2681
2682static int
2683mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2684			 int data_sg_nents, unsigned int *data_sg_offset,
2685			 struct scatterlist *meta_sg, int meta_sg_nents,
2686			 unsigned int *meta_sg_offset)
2687{
2688	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2689	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
2690	int n;
2691
2692	pi_mr->mmkey.ndescs = 0;
2693	pi_mr->meta_ndescs = 0;
2694	pi_mr->meta_length = 0;
2695
2696	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2697				   pi_mr->desc_size * pi_mr->max_descs,
2698				   DMA_TO_DEVICE);
2699
2700	pi_mr->ibmr.page_size = ibmr->page_size;
2701	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
2702			   mlx5_set_page);
2703	if (n != data_sg_nents)
2704		return n;
2705
2706	pi_mr->data_iova = pi_mr->ibmr.iova;
2707	pi_mr->data_length = pi_mr->ibmr.length;
2708	pi_mr->ibmr.length = pi_mr->data_length;
2709	ibmr->length = pi_mr->data_length;
2710
2711	if (meta_sg_nents) {
2712		u64 page_mask = ~((u64)ibmr->page_size - 1);
2713		u64 iova = pi_mr->data_iova;
2714
2715		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
2716				    meta_sg_offset, mlx5_set_page_pi);
2717
2718		pi_mr->meta_length = pi_mr->ibmr.length;
2719		/*
2720		 * The PI address for the HW is the offset of the metadata
2721		 * address relative to the first data page address. It equals
2722		 * the first data page address + the size of the data pages +
2723		 * the metadata offset within the first metadata page.
2724		 */
2725		pi_mr->pi_iova = (iova & page_mask) +
2726				 pi_mr->mmkey.ndescs * ibmr->page_size +
2727				 (pi_mr->ibmr.iova & ~page_mask);
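		/*
		 * Illustrative example (hypothetical values): with 4K pages,
		 * data starting at iova 0x201000 and spanning 3 pages, and
		 * metadata starting at offset 0x10 into its first page,
		 * pi_iova = 0x201000 + 3 * 0x1000 + 0x10 = 0x204010.
		 */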
2728		/*
2729		 * In order to use one MTT MR for both data and metadata, we
2730		 * also register the gaps between the end of the data and the
2731		 * start of the metadata (the sig MR verifies that the HW
2732		 * accesses the right addresses). This mapping is safe because
2733		 * we use an internal mkey for the registration.
2734		 */
2735		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
2736		pi_mr->ibmr.iova = iova;
2737		ibmr->length += pi_mr->meta_length;
2738	}
2739
2740	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2741				      pi_mr->desc_size * pi_mr->max_descs,
2742				      DMA_TO_DEVICE);
2743
2744	return n;
2745}
2746
2747static int
2748mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2749			 int data_sg_nents, unsigned int *data_sg_offset,
2750			 struct scatterlist *meta_sg, int meta_sg_nents,
2751			 unsigned int *meta_sg_offset)
2752{
2753	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2754	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
2755	int n;
2756
2757	pi_mr->mmkey.ndescs = 0;
2758	pi_mr->meta_ndescs = 0;
2759	pi_mr->meta_length = 0;
2760
2761	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
2762				   pi_mr->desc_size * pi_mr->max_descs,
2763				   DMA_TO_DEVICE);
2764
2765	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
2766			       meta_sg, meta_sg_nents, meta_sg_offset);
2767
2768	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
2769				      pi_mr->desc_size * pi_mr->max_descs,
2770				      DMA_TO_DEVICE);
2771
2772	/* This is a zero-based memory region */
2773	pi_mr->data_iova = 0;
2774	pi_mr->ibmr.iova = 0;
2775	pi_mr->pi_iova = pi_mr->data_length;
2776	ibmr->length = pi_mr->ibmr.length;
2777
2778	return n;
2779}
2780
2781int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
2782			 int data_sg_nents, unsigned int *data_sg_offset,
2783			 struct scatterlist *meta_sg, int meta_sg_nents,
2784			 unsigned int *meta_sg_offset)
2785{
2786	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2787	struct mlx5_ib_mr *pi_mr = NULL;
2788	int n;
2789
2790	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);
2791
2792	mr->mmkey.ndescs = 0;
2793	mr->data_length = 0;
2794	mr->data_iova = 0;
2795	mr->meta_ndescs = 0;
2796	mr->pi_iova = 0;
2797	/*
2798	 * As a performance optimization, avoid a UMR operation to register
2799	 * the data/metadata buffers when possible: first try to map the sg
2800	 * lists to PA descriptors using local_dma_lkey, and fall back to
2801	 * UMR only on failure.
2802	 */
2803	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2804				    data_sg_offset, meta_sg, meta_sg_nents,
2805				    meta_sg_offset);
2806	if (n == data_sg_nents + meta_sg_nents)
2807		goto out;
2808	/*
2809	 * As a performance optimization, avoid mapping the sg lists to KLM
2810	 * descriptors when possible: first try to map them to MTT
2811	 * descriptors and fall back to KLM only on failure.
2812	 * The HW works more efficiently with MTT descriptors (especially
2813	 * under high load), so use KLM (indirect access) only when it is
2814	 * mandatory.
2815	 */
2816	pi_mr = mr->mtt_mr;
2817	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2818				     data_sg_offset, meta_sg, meta_sg_nents,
2819				     meta_sg_offset);
2820	if (n == data_sg_nents + meta_sg_nents)
2821		goto out;
2822
2823	pi_mr = mr->klm_mr;
2824	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
2825				     data_sg_offset, meta_sg, meta_sg_nents,
2826				     meta_sg_offset);
2827	if (unlikely(n != data_sg_nents + meta_sg_nents))
2828		return -ENOMEM;
2829
2830out:
2831	/* This is a zero-based memory region */
2832	ibmr->iova = 0;
2833	mr->pi_mr = pi_mr;
2834	if (pi_mr)
2835		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
2836	else
2837		ibmr->sig_attrs->meta_length = mr->meta_length;
2838
2839	return 0;
2840}
2841
2842int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
2843		      unsigned int *sg_offset)
2844{
2845	struct mlx5_ib_mr *mr = to_mmr(ibmr);
2846	int n;
2847
2848	mr->mmkey.ndescs = 0;
2849
2850	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
2851				   mr->desc_size * mr->max_descs,
2852				   DMA_TO_DEVICE);
2853
2854	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
2855		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
2856				       NULL);
2857	else
2858		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
2859				mlx5_set_page);
2860
2861	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
2862				      mr->desc_size * mr->max_descs,
2863				      DMA_TO_DEVICE);
2864
2865	return n;
2866}