Linux Audio

Check our new training course

Loading...
v6.8
  1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
  2/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */
  3
  4#include <rdma/ib_umem_odp.h>
  5#include "mlx5_ib.h"
  6#include "umr.h"
  7#include "wr.h"
  8
  9/*
 10 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
 11 * work on kernel modules memory
 12 */
 13void *xlt_emergency_page;
 14static DEFINE_MUTEX(xlt_emergency_page_mutex);
 15
 16static __be64 get_umr_enable_mr_mask(void)
 17{
 18	u64 result;
 19
 20	result = MLX5_MKEY_MASK_KEY |
 21		 MLX5_MKEY_MASK_FREE;
 22
 23	return cpu_to_be64(result);
 24}
 25
 26static __be64 get_umr_disable_mr_mask(void)
 27{
 28	u64 result;
 29
 30	result = MLX5_MKEY_MASK_FREE;
 31
 32	return cpu_to_be64(result);
 33}
 34
 35static __be64 get_umr_update_translation_mask(void)
 36{
 37	u64 result;
 38
 39	result = MLX5_MKEY_MASK_LEN |
 40		 MLX5_MKEY_MASK_PAGE_SIZE |
 41		 MLX5_MKEY_MASK_START_ADDR;
 42
 43	return cpu_to_be64(result);
 44}
 45
 46static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev)
 47{
 48	u64 result;
 49
 50	result = MLX5_MKEY_MASK_LR |
 51		 MLX5_MKEY_MASK_LW |
 52		 MLX5_MKEY_MASK_RR |
 53		 MLX5_MKEY_MASK_RW;
 54
 55	if (MLX5_CAP_GEN(dev->mdev, atomic))
 56		result |= MLX5_MKEY_MASK_A;
 57
 58	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
 59		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
 60
 61	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
 62		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
 63
 64	return cpu_to_be64(result);
 65}
 66
 67static __be64 get_umr_update_pd_mask(void)
 68{
 69	u64 result;
 70
 71	result = MLX5_MKEY_MASK_PD;
 72
 73	return cpu_to_be64(result);
 74}
 75
 76static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
 77{
 78	if (mask & MLX5_MKEY_MASK_PAGE_SIZE &&
 79	    MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
 80		return -EPERM;
 81
 82	if (mask & MLX5_MKEY_MASK_A &&
 83	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
 84		return -EPERM;
 85
 86	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE &&
 87	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
 88		return -EPERM;
 89
 90	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ &&
 91	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
 92		return -EPERM;
 93
 94	return 0;
 95}
 96
 97enum {
 98	MAX_UMR_WR = 128,
 99};
100
101static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
102{
103	struct ib_qp_attr attr = {};
104	int ret;
105
106	attr.qp_state = IB_QPS_INIT;
107	attr.port_num = 1;
108	ret = ib_modify_qp(qp, &attr,
109			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT);
110	if (ret) {
111		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
112		return ret;
113	}
114
115	memset(&attr, 0, sizeof(attr));
116	attr.qp_state = IB_QPS_RTR;
117
118	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
119	if (ret) {
120		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
121		return ret;
122	}
123
124	memset(&attr, 0, sizeof(attr));
125	attr.qp_state = IB_QPS_RTS;
126	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
127	if (ret) {
128		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
129		return ret;
130	}
131
132	return 0;
133}
134
135int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
136{
137	struct ib_qp_init_attr init_attr = {};
138	struct ib_pd *pd;
139	struct ib_cq *cq;
140	struct ib_qp *qp;
141	int ret;
142
143	pd = ib_alloc_pd(&dev->ib_dev, 0);
144	if (IS_ERR(pd)) {
145		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
146		return PTR_ERR(pd);
147	}
148
149	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
150	if (IS_ERR(cq)) {
151		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
152		ret = PTR_ERR(cq);
153		goto destroy_pd;
154	}
155
156	init_attr.send_cq = cq;
157	init_attr.recv_cq = cq;
158	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
159	init_attr.cap.max_send_wr = MAX_UMR_WR;
160	init_attr.cap.max_send_sge = 1;
161	init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
162	init_attr.port_num = 1;
163	qp = ib_create_qp(pd, &init_attr);
164	if (IS_ERR(qp)) {
165		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
166		ret = PTR_ERR(qp);
167		goto destroy_cq;
168	}
169
170	ret = mlx5r_umr_qp_rst2rts(dev, qp);
171	if (ret)
172		goto destroy_qp;
173
174	dev->umrc.qp = qp;
175	dev->umrc.cq = cq;
176	dev->umrc.pd = pd;
177
178	sema_init(&dev->umrc.sem, MAX_UMR_WR);
179	mutex_init(&dev->umrc.lock);
180	dev->umrc.state = MLX5_UMR_STATE_ACTIVE;
181
182	return 0;
183
184destroy_qp:
185	ib_destroy_qp(qp);
186destroy_cq:
187	ib_free_cq(cq);
188destroy_pd:
189	ib_dealloc_pd(pd);
190	return ret;
191}
192
193void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
194{
195	if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
196		return;
197	ib_destroy_qp(dev->umrc.qp);
198	ib_free_cq(dev->umrc.cq);
199	ib_dealloc_pd(dev->umrc.pd);
200}
201
202static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
203{
204	struct umr_common *umrc = &dev->umrc;
205	struct ib_qp_attr attr;
206	int err;
207
208	attr.qp_state = IB_QPS_RESET;
209	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
210	if (err) {
211		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
212		goto err;
213	}
214
215	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
216	if (err)
217		goto err;
218
219	umrc->state = MLX5_UMR_STATE_ACTIVE;
220	return 0;
221
222err:
223	umrc->state = MLX5_UMR_STATE_ERR;
224	return err;
225}
226
227static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
228			       struct mlx5r_umr_wqe *wqe, bool with_data)
229{
230	unsigned int wqe_size =
231		with_data ? sizeof(struct mlx5r_umr_wqe) :
232			    sizeof(struct mlx5r_umr_wqe) -
233				    sizeof(struct mlx5_wqe_data_seg);
234	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
235	struct mlx5_core_dev *mdev = dev->mdev;
236	struct mlx5_ib_qp *qp = to_mqp(ibqp);
237	struct mlx5_wqe_ctrl_seg *ctrl;
238	union {
239		struct ib_cqe *ib_cqe;
240		u64 wr_id;
241	} id;
242	void *cur_edge, *seg;
243	unsigned long flags;
244	unsigned int idx;
245	int size, err;
246
247	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
248		return -EIO;
249
250	spin_lock_irqsave(&qp->sq.lock, flags);
251
252	err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0,
253			      cpu_to_be32(mkey), false, false);
254	if (WARN_ON(err))
255		goto out;
256
257	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
258
259	mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size);
260
261	id.ib_cqe = cqe;
262	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
263			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
264
265	mlx5r_ring_db(qp, 1, ctrl);
266
267out:
268	spin_unlock_irqrestore(&qp->sq.lock, flags);
269
270	return err;
271}
272
273static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
274{
275	struct mlx5_ib_umr_context *context =
276		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
277
278	context->status = wc->status;
279	complete(&context->done);
280}
281
282static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context)
283{
284	context->cqe.done = mlx5r_umr_done;
285	init_completion(&context->done);
286}
287
288static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
289				   struct mlx5r_umr_wqe *wqe, bool with_data)
290{
291	struct umr_common *umrc = &dev->umrc;
292	struct mlx5r_umr_context umr_context;
293	int err;
294
295	err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
296	if (WARN_ON(err))
297		return err;
298
299	mlx5r_umr_init_context(&umr_context);
300
301	down(&umrc->sem);
302	while (true) {
303		mutex_lock(&umrc->lock);
304		if (umrc->state == MLX5_UMR_STATE_ERR) {
305			mutex_unlock(&umrc->lock);
306			err = -EFAULT;
307			break;
308		}
309
310		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
311			mutex_unlock(&umrc->lock);
312			usleep_range(3000, 5000);
313			continue;
314		}
315
316		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
317					  with_data);
318		mutex_unlock(&umrc->lock);
319		if (err) {
320			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
321				     err);
322			break;
323		}
324
325		wait_for_completion(&umr_context.done);
326
327		if (umr_context.status == IB_WC_SUCCESS)
328			break;
329
330		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
331			continue;
332
333		WARN_ON_ONCE(1);
334		mlx5_ib_warn(dev,
335			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
336			umr_context.status, mkey);
337		mutex_lock(&umrc->lock);
338		err = mlx5r_umr_recover(dev);
339		mutex_unlock(&umrc->lock);
340		if (err)
341			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
342				     err);
343		err = -EFAULT;
344		break;
345	}
346	up(&umrc->sem);
347	return err;
348}
349
350/**
351 * mlx5r_umr_revoke_mr - Fence all DMA on the MR
352 * @mr: The MR to fence
353 *
354 * Upon return the NIC will not be doing any DMA to the pages under the MR,
355 * and any DMA in progress will be completed. Failure of this function
356 * indicates the HW has failed catastrophically.
357 */
358int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
359{
360	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
361	struct mlx5r_umr_wqe wqe = {};
362
363	if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
364		return 0;
365
366	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
367	wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask();
368	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
369
370	MLX5_SET(mkc, &wqe.mkey_seg, free, 1);
371	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn);
372	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
373	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
374		 mlx5_mkey_variant(mr->mmkey.key));
375
376	return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
377}
378
379static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev,
380				       struct mlx5_mkey_seg *seg,
381				       unsigned int access_flags)
382{
383	bool ro_read = (access_flags & IB_ACCESS_RELAXED_ORDERING) &&
384		       (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
385			pcie_relaxed_ordering_enabled(dev->mdev->pdev));
386
387	MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
388	MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
389	MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
390	MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
391	MLX5_SET(mkc, seg, lr, 1);
392	MLX5_SET(mkc, seg, relaxed_ordering_write,
393		 !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
394	MLX5_SET(mkc, seg, relaxed_ordering_read, ro_read);
395}
396
397int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
398			      int access_flags)
399{
400	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
401	struct mlx5r_umr_wqe wqe = {};
402	int err;
403
404	wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
405	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
406	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE;
407	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
408
409	mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags);
410	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn);
411	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
412	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
413		 mlx5_mkey_variant(mr->mmkey.key));
414
415	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
416	if (err)
417		return err;
418
419	mr->access_flags = access_flags;
420	return 0;
421}
422
/* Upper bound on the XLT buffer size that mlx5r_umr_alloc_xlt() tries first. */
#define MLX5_MAX_UMR_CHUNK \
	((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT)
/* Second size tried (64 KiB) when the first allocation attempt fails. */
#define MLX5_SPARE_UMR_CHUNK 0x10000
426
427/*
428 * Allocate a temporary buffer to hold the per-page information to transfer to
429 * HW. For efficiency this should be as large as it can be, but buffer
430 * allocation failure is not allowed, so try smaller sizes.
431 */
432static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
433{
434	const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size;
435	size_t size;
436	void *res = NULL;
437
438	static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0);
439
440	/*
441	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
442	 * allocation can't trigger any kind of reclaim.
443	 */
444	might_sleep();
445
446	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
447
448	/*
449	 * If the system already has a suitable high order page then just use
450	 * that, but don't try hard to create one. This max is about 1M, so a
451	 * free x86 huge page will satisfy it.
452	 */
453	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
454		     MLX5_MAX_UMR_CHUNK);
455	*nents = size / ent_size;
456	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
457				       get_order(size));
458	if (res)
459		return res;
460
461	if (size > MLX5_SPARE_UMR_CHUNK) {
462		size = MLX5_SPARE_UMR_CHUNK;
463		*nents = size / ent_size;
464		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
465					       get_order(size));
466		if (res)
467			return res;
468	}
469
470	*nents = PAGE_SIZE / ent_size;
471	res = (void *)__get_free_page(gfp_mask);
472	if (res)
473		return res;
474
475	mutex_lock(&xlt_emergency_page_mutex);
476	memset(xlt_emergency_page, 0, PAGE_SIZE);
477	return xlt_emergency_page;
478}
479
480static void mlx5r_umr_free_xlt(void *xlt, size_t length)
481{
482	if (xlt == xlt_emergency_page) {
483		mutex_unlock(&xlt_emergency_page_mutex);
484		return;
485	}
486
487	free_pages((unsigned long)xlt, get_order(length));
488}
489
490static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
491				     struct ib_sge *sg)
492{
493	struct device *ddev = &dev->mdev->pdev->dev;
494
495	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
496	mlx5r_umr_free_xlt(xlt, sg->length);
497}
498
499/*
500 * Create an XLT buffer ready for submission.
501 */
502static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg,
503				  size_t nents, size_t ent_size,
504				  unsigned int flags)
505{
506	struct device *ddev = &dev->mdev->pdev->dev;
507	dma_addr_t dma;
508	void *xlt;
509
510	xlt = mlx5r_umr_alloc_xlt(&nents, ent_size,
511				 flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
512								  GFP_KERNEL);
513	sg->length = nents * ent_size;
514	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
515	if (dma_mapping_error(ddev, dma)) {
516		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
517		mlx5r_umr_free_xlt(xlt, sg->length);
518		return NULL;
519	}
520	sg->addr = dma;
521	sg->lkey = dev->umrc.pd->local_dma_lkey;
522
523	return xlt;
524}
525
526static void
527mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
528				  unsigned int flags, struct ib_sge *sg)
529{
530	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
531		/* fail if free */
532		ctrl_seg->flags = MLX5_UMR_CHECK_FREE;
533	else
534		/* fail if not free */
535		ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE;
536	ctrl_seg->xlt_octowords =
537		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
538}
539
540static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev,
541					      struct mlx5_mkey_seg *mkey_seg,
542					      struct mlx5_ib_mr *mr,
543					      unsigned int page_shift)
544{
545	mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags);
546	MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
547	MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova);
548	MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length);
549	MLX5_SET(mkc, mkey_seg, log_page_size, page_shift);
550	MLX5_SET(mkc, mkey_seg, qpn, 0xffffff);
551	MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key));
552}
553
554static void
555mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg,
556				  struct ib_sge *sg)
557{
558	data_seg->byte_count = cpu_to_be32(sg->length);
559	data_seg->lkey = cpu_to_be32(sg->lkey);
560	data_seg->addr = cpu_to_be64(sg->addr);
561}
562
563static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
564				    u64 offset)
565{
566	u64 octo_offset = mlx5r_umr_get_xlt_octo(offset);
567
568	ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff);
569	ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16);
570	ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
571}
572
573static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
574				       struct mlx5r_umr_wqe *wqe,
575				       struct mlx5_ib_mr *mr, struct ib_sge *sg,
576				       unsigned int flags)
577{
578	bool update_pd_access, update_translation;
579
580	if (flags & MLX5_IB_UPD_XLT_ENABLE)
581		wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask();
582
583	update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE ||
584			   flags & MLX5_IB_UPD_XLT_PD ||
585			   flags & MLX5_IB_UPD_XLT_ACCESS;
586
587	if (update_pd_access) {
588		wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev);
589		wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
590	}
591
592	update_translation =
593		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
594
595	if (update_translation) {
596		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
597		if (!mr->ibmr.length)
598			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
599	}
600
601	wqe->ctrl_seg.xlt_octowords =
602		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
603	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
604}
605
606/*
607 * Send the DMA list to the HW for a normal MR using UMR.
608 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
609 * flag may be used.
610 */
611int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
612{
613	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
614	struct device *ddev = &dev->mdev->pdev->dev;
615	struct mlx5r_umr_wqe wqe = {};
616	struct ib_block_iter biter;
617	struct mlx5_mtt *cur_mtt;
618	size_t orig_sg_length;
619	struct mlx5_mtt *mtt;
620	size_t final_size;
621	struct ib_sge sg;
622	u64 offset = 0;
623	int err = 0;
624
625	if (WARN_ON(mr->umem->is_odp))
626		return -EINVAL;
627
628	mtt = mlx5r_umr_create_xlt(
629		dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
630		sizeof(*mtt), flags);
631	if (!mtt)
632		return -ENOMEM;
633
634	orig_sg_length = sg.length;
635
636	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
637	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
638					  mr->page_shift);
639	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
640
641	cur_mtt = mtt;
642	rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
643		if (cur_mtt == (void *)mtt + sg.length) {
644			dma_sync_single_for_device(ddev, sg.addr, sg.length,
645						   DMA_TO_DEVICE);
646
647			err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe,
648						       true);
649			if (err)
650				goto err;
651			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
652						DMA_TO_DEVICE);
653			offset += sg.length;
654			mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
655
656			cur_mtt = mtt;
657		}
658
659		cur_mtt->ptag =
660			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
661				    MLX5_IB_MTT_PRESENT);
662
663		if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
664			cur_mtt->ptag = 0;
665
666		cur_mtt++;
667	}
668
669	final_size = (void *)cur_mtt - (void *)mtt;
670	sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
671	memset(cur_mtt, 0, sg.length - final_size);
672	mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
673
674	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
675	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
676
677err:
678	sg.length = orig_sg_length;
679	mlx5r_umr_unmap_free_xlt(dev, mtt, &sg);
680	return err;
681}
682
683static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
684{
685	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
686}
687
688int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
689			 int page_shift, int flags)
690{
691	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
692			       ? sizeof(struct mlx5_klm)
693			       : sizeof(struct mlx5_mtt);
694	const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size;
695	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
696	struct device *ddev = &dev->mdev->pdev->dev;
697	const int page_mask = page_align - 1;
698	struct mlx5r_umr_wqe wqe = {};
699	size_t pages_mapped = 0;
700	size_t pages_to_map = 0;
701	size_t size_to_map = 0;
702	size_t orig_sg_length;
703	size_t pages_iter;
704	struct ib_sge sg;
705	int err = 0;
706	void *xlt;
707
708	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
709	    !umr_can_use_indirect_mkey(dev))
710		return -EPERM;
711
712	if (WARN_ON(!mr->umem->is_odp))
713		return -EINVAL;
714
715	/* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes,
716	 * so we need to align the offset and length accordingly
717	 */
718	if (idx & page_mask) {
719		npages += idx & page_mask;
720		idx &= ~page_mask;
721	}
722	pages_to_map = ALIGN(npages, page_align);
723
724	xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags);
725	if (!xlt)
726		return -ENOMEM;
727
728	pages_iter = sg.length / desc_size;
729	orig_sg_length = sg.length;
730
731	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
732		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
733		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
734
735		pages_to_map = min_t(size_t, pages_to_map, max_pages);
736	}
737
738	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
739	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift);
740	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
741
742	for (pages_mapped = 0;
743	     pages_mapped < pages_to_map && !err;
744	     pages_mapped += pages_iter, idx += pages_iter) {
745		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
746		size_to_map = npages * desc_size;
747		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
748					DMA_TO_DEVICE);
749		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
750		dma_sync_single_for_device(ddev, sg.addr, sg.length,
751					   DMA_TO_DEVICE);
752		sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
753
754		if (pages_mapped + pages_iter >= pages_to_map)
755			mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
756		mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size);
757		err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
758	}
759	sg.length = orig_sg_length;
760	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
761	return err;
762}
v6.9.4
  1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
  2/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */
  3
  4#include <rdma/ib_umem_odp.h>
  5#include "mlx5_ib.h"
  6#include "umr.h"
  7#include "wr.h"
  8
  9/*
 10 * We can't use an array for xlt_emergency_page because dma_map_single doesn't
 11 * work on kernel modules memory
 12 */
 13void *xlt_emergency_page;
 14static DEFINE_MUTEX(xlt_emergency_page_mutex);
 15
 16static __be64 get_umr_enable_mr_mask(void)
 17{
 18	u64 result;
 19
 20	result = MLX5_MKEY_MASK_KEY |
 21		 MLX5_MKEY_MASK_FREE;
 22
 23	return cpu_to_be64(result);
 24}
 25
 26static __be64 get_umr_disable_mr_mask(void)
 27{
 28	u64 result;
 29
 30	result = MLX5_MKEY_MASK_FREE;
 31
 32	return cpu_to_be64(result);
 33}
 34
 35static __be64 get_umr_update_translation_mask(void)
 36{
 37	u64 result;
 38
 39	result = MLX5_MKEY_MASK_LEN |
 40		 MLX5_MKEY_MASK_PAGE_SIZE |
 41		 MLX5_MKEY_MASK_START_ADDR;
 42
 43	return cpu_to_be64(result);
 44}
 45
 46static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev)
 47{
 48	u64 result;
 49
 50	result = MLX5_MKEY_MASK_LR |
 51		 MLX5_MKEY_MASK_LW |
 52		 MLX5_MKEY_MASK_RR |
 53		 MLX5_MKEY_MASK_RW;
 54
 55	if (MLX5_CAP_GEN(dev->mdev, atomic))
 56		result |= MLX5_MKEY_MASK_A;
 57
 58	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
 59		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
 60
 61	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
 62		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
 63
 64	return cpu_to_be64(result);
 65}
 66
 67static __be64 get_umr_update_pd_mask(void)
 68{
 69	u64 result;
 70
 71	result = MLX5_MKEY_MASK_PD;
 72
 73	return cpu_to_be64(result);
 74}
 75
 76static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
 77{
 78	if (mask & MLX5_MKEY_MASK_PAGE_SIZE &&
 79	    MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
 80		return -EPERM;
 81
 82	if (mask & MLX5_MKEY_MASK_A &&
 83	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
 84		return -EPERM;
 85
 86	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE &&
 87	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
 88		return -EPERM;
 89
 90	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ &&
 91	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
 92		return -EPERM;
 93
 94	return 0;
 95}
 96
 97enum {
 98	MAX_UMR_WR = 128,
 99};
100
101static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp)
102{
103	struct ib_qp_attr attr = {};
104	int ret;
105
106	attr.qp_state = IB_QPS_INIT;
107	attr.port_num = 1;
108	ret = ib_modify_qp(qp, &attr,
109			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT);
110	if (ret) {
111		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
112		return ret;
113	}
114
115	memset(&attr, 0, sizeof(attr));
116	attr.qp_state = IB_QPS_RTR;
117
118	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
119	if (ret) {
120		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
121		return ret;
122	}
123
124	memset(&attr, 0, sizeof(attr));
125	attr.qp_state = IB_QPS_RTS;
126	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
127	if (ret) {
128		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
129		return ret;
130	}
131
132	return 0;
133}
134
135int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev)
136{
137	struct ib_qp_init_attr init_attr = {};
138	struct ib_pd *pd;
139	struct ib_cq *cq;
140	struct ib_qp *qp;
141	int ret;
142
143	pd = ib_alloc_pd(&dev->ib_dev, 0);
144	if (IS_ERR(pd)) {
145		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
146		return PTR_ERR(pd);
147	}
148
149	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
150	if (IS_ERR(cq)) {
151		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
152		ret = PTR_ERR(cq);
153		goto destroy_pd;
154	}
155
156	init_attr.send_cq = cq;
157	init_attr.recv_cq = cq;
158	init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
159	init_attr.cap.max_send_wr = MAX_UMR_WR;
160	init_attr.cap.max_send_sge = 1;
161	init_attr.qp_type = MLX5_IB_QPT_REG_UMR;
162	init_attr.port_num = 1;
163	qp = ib_create_qp(pd, &init_attr);
164	if (IS_ERR(qp)) {
165		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
166		ret = PTR_ERR(qp);
167		goto destroy_cq;
168	}
169
170	ret = mlx5r_umr_qp_rst2rts(dev, qp);
171	if (ret)
172		goto destroy_qp;
173
174	dev->umrc.qp = qp;
175	dev->umrc.cq = cq;
176	dev->umrc.pd = pd;
177
178	sema_init(&dev->umrc.sem, MAX_UMR_WR);
179	mutex_init(&dev->umrc.lock);
180	dev->umrc.state = MLX5_UMR_STATE_ACTIVE;
181
182	return 0;
183
184destroy_qp:
185	ib_destroy_qp(qp);
186destroy_cq:
187	ib_free_cq(cq);
188destroy_pd:
189	ib_dealloc_pd(pd);
190	return ret;
191}
192
193void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev)
194{
195	if (dev->umrc.state == MLX5_UMR_STATE_UNINIT)
196		return;
197	ib_destroy_qp(dev->umrc.qp);
198	ib_free_cq(dev->umrc.cq);
199	ib_dealloc_pd(dev->umrc.pd);
200}
201
202static int mlx5r_umr_recover(struct mlx5_ib_dev *dev)
203{
204	struct umr_common *umrc = &dev->umrc;
205	struct ib_qp_attr attr;
206	int err;
207
208	attr.qp_state = IB_QPS_RESET;
209	err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE);
210	if (err) {
211		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
212		goto err;
213	}
214
215	err = mlx5r_umr_qp_rst2rts(dev, umrc->qp);
216	if (err)
217		goto err;
218
219	umrc->state = MLX5_UMR_STATE_ACTIVE;
220	return 0;
221
222err:
223	umrc->state = MLX5_UMR_STATE_ERR;
224	return err;
225}
226
227static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe,
228			       struct mlx5r_umr_wqe *wqe, bool with_data)
229{
230	unsigned int wqe_size =
231		with_data ? sizeof(struct mlx5r_umr_wqe) :
232			    sizeof(struct mlx5r_umr_wqe) -
233				    sizeof(struct mlx5_wqe_data_seg);
234	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
235	struct mlx5_core_dev *mdev = dev->mdev;
236	struct mlx5_ib_qp *qp = to_mqp(ibqp);
237	struct mlx5_wqe_ctrl_seg *ctrl;
238	union {
239		struct ib_cqe *ib_cqe;
240		u64 wr_id;
241	} id;
242	void *cur_edge, *seg;
243	unsigned long flags;
244	unsigned int idx;
245	int size, err;
246
247	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
248		return -EIO;
249
250	spin_lock_irqsave(&qp->sq.lock, flags);
251
252	err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0,
253			      cpu_to_be32(mkey), false, false);
254	if (WARN_ON(err))
255		goto out;
256
257	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
258
259	mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size);
260
261	id.ib_cqe = cqe;
262	mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0,
263			 MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR);
264
265	mlx5r_ring_db(qp, 1, ctrl);
266
267out:
268	spin_unlock_irqrestore(&qp->sq.lock, flags);
269
270	return err;
271}
272
273static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc)
274{
275	struct mlx5_ib_umr_context *context =
276		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
277
278	context->status = wc->status;
279	complete(&context->done);
280}
281
282static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context)
283{
284	context->cqe.done = mlx5r_umr_done;
285	init_completion(&context->done);
286}
287
288static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey,
289				   struct mlx5r_umr_wqe *wqe, bool with_data)
290{
291	struct umr_common *umrc = &dev->umrc;
292	struct mlx5r_umr_context umr_context;
293	int err;
294
295	err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask));
296	if (WARN_ON(err))
297		return err;
298
299	mlx5r_umr_init_context(&umr_context);
300
301	down(&umrc->sem);
302	while (true) {
303		mutex_lock(&umrc->lock);
304		if (umrc->state == MLX5_UMR_STATE_ERR) {
305			mutex_unlock(&umrc->lock);
306			err = -EFAULT;
307			break;
308		}
309
310		if (umrc->state == MLX5_UMR_STATE_RECOVER) {
311			mutex_unlock(&umrc->lock);
312			usleep_range(3000, 5000);
313			continue;
314		}
315
316		err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe,
317					  with_data);
318		mutex_unlock(&umrc->lock);
319		if (err) {
320			mlx5_ib_warn(dev, "UMR post send failed, err %d\n",
321				     err);
322			break;
323		}
324
325		wait_for_completion(&umr_context.done);
326
327		if (umr_context.status == IB_WC_SUCCESS)
328			break;
329
330		if (umr_context.status == IB_WC_WR_FLUSH_ERR)
331			continue;
332
333		WARN_ON_ONCE(1);
334		mlx5_ib_warn(dev,
335			"reg umr failed (%u). Trying to recover and resubmit the flushed WQEs, mkey = %u\n",
336			umr_context.status, mkey);
337		mutex_lock(&umrc->lock);
338		err = mlx5r_umr_recover(dev);
339		mutex_unlock(&umrc->lock);
340		if (err)
341			mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n",
342				     err);
343		err = -EFAULT;
344		break;
345	}
346	up(&umrc->sem);
347	return err;
348}
349
350/**
351 * mlx5r_umr_revoke_mr - Fence all DMA on the MR
352 * @mr: The MR to fence
353 *
354 * Upon return the NIC will not be doing any DMA to the pages under the MR,
355 * and any DMA in progress will be completed. Failure of this function
356 * indicates the HW has failed catastrophically.
357 */
358int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr)
359{
360	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
361	struct mlx5r_umr_wqe wqe = {};
362
363	if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
364		return 0;
365
366	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
367	wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask();
368	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
369
370	MLX5_SET(mkc, &wqe.mkey_seg, free, 1);
371	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn);
372	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
373	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
374		 mlx5_mkey_variant(mr->mmkey.key));
375
376	return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
377}
378
379static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev,
380				       struct mlx5_mkey_seg *seg,
381				       unsigned int access_flags)
382{
383	bool ro_read = (access_flags & IB_ACCESS_RELAXED_ORDERING) &&
384		       (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
385			pcie_relaxed_ordering_enabled(dev->mdev->pdev));
386
387	MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
388	MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
389	MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
390	MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
391	MLX5_SET(mkc, seg, lr, 1);
392	MLX5_SET(mkc, seg, relaxed_ordering_write,
393		 !!(access_flags & IB_ACCESS_RELAXED_ORDERING));
394	MLX5_SET(mkc, seg, relaxed_ordering_read, ro_read);
395}
396
397int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd,
398			      int access_flags)
399{
400	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
401	struct mlx5r_umr_wqe wqe = {};
402	int err;
403
404	wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev);
405	wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
406	wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE;
407	wqe.ctrl_seg.flags |= MLX5_UMR_INLINE;
408
409	mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags);
410	MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn);
411	MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff);
412	MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0,
413		 mlx5_mkey_variant(mr->mmkey.key));
414
415	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false);
416	if (err)
417		return err;
418
419	mr->access_flags = access_flags;
420	return 0;
421}
422
/*
 * Buffer sizes, in bytes, tried by mlx5r_umr_alloc_xlt():
 * MLX5_MAX_UMR_CHUNK caps the first (largest) attempt, and
 * MLX5_SPARE_UMR_CHUNK (64 KiB) is the size of the second attempt.
 */
#define MLX5_SPARE_UMR_CHUNK 0x10000
#define MLX5_MAX_UMR_CHUNK                                                     \
	((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_FLEX_ALIGNMENT)
426
427/*
428 * Allocate a temporary buffer to hold the per-page information to transfer to
429 * HW. For efficiency this should be as large as it can be, but buffer
430 * allocation failure is not allowed, so try smaller sizes.
431 */
432static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask)
433{
434	const size_t xlt_chunk_align = MLX5_UMR_FLEX_ALIGNMENT / ent_size;
435	size_t size;
436	void *res = NULL;
437
438	static_assert(PAGE_SIZE % MLX5_UMR_FLEX_ALIGNMENT == 0);
439
440	/*
441	 * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the
442	 * allocation can't trigger any kind of reclaim.
443	 */
444	might_sleep();
445
446	gfp_mask |= __GFP_ZERO | __GFP_NORETRY;
447
448	/*
449	 * If the system already has a suitable high order page then just use
450	 * that, but don't try hard to create one. This max is about 1M, so a
451	 * free x86 huge page will satisfy it.
452	 */
453	size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align),
454		     MLX5_MAX_UMR_CHUNK);
455	*nents = size / ent_size;
456	res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
457				       get_order(size));
458	if (res)
459		return res;
460
461	if (size > MLX5_SPARE_UMR_CHUNK) {
462		size = MLX5_SPARE_UMR_CHUNK;
463		*nents = size / ent_size;
464		res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN,
465					       get_order(size));
466		if (res)
467			return res;
468	}
469
470	*nents = PAGE_SIZE / ent_size;
471	res = (void *)__get_free_page(gfp_mask);
472	if (res)
473		return res;
474
475	mutex_lock(&xlt_emergency_page_mutex);
476	memset(xlt_emergency_page, 0, PAGE_SIZE);
477	return xlt_emergency_page;
478}
479
480static void mlx5r_umr_free_xlt(void *xlt, size_t length)
481{
482	if (xlt == xlt_emergency_page) {
483		mutex_unlock(&xlt_emergency_page_mutex);
484		return;
485	}
486
487	free_pages((unsigned long)xlt, get_order(length));
488}
489
490static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt,
491				     struct ib_sge *sg)
492{
493	struct device *ddev = &dev->mdev->pdev->dev;
494
495	dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE);
496	mlx5r_umr_free_xlt(xlt, sg->length);
497}
498
499/*
500 * Create an XLT buffer ready for submission.
501 */
502static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg,
503				  size_t nents, size_t ent_size,
504				  unsigned int flags)
505{
506	struct device *ddev = &dev->mdev->pdev->dev;
507	dma_addr_t dma;
508	void *xlt;
509
510	xlt = mlx5r_umr_alloc_xlt(&nents, ent_size,
511				 flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC :
512								  GFP_KERNEL);
513	sg->length = nents * ent_size;
514	dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE);
515	if (dma_mapping_error(ddev, dma)) {
516		mlx5_ib_err(dev, "unable to map DMA during XLT update.\n");
517		mlx5r_umr_free_xlt(xlt, sg->length);
518		return NULL;
519	}
520	sg->addr = dma;
521	sg->lkey = dev->umrc.pd->local_dma_lkey;
522
523	return xlt;
524}
525
526static void
527mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
528				  unsigned int flags, struct ib_sge *sg)
529{
530	if (!(flags & MLX5_IB_UPD_XLT_ENABLE))
531		/* fail if free */
532		ctrl_seg->flags = MLX5_UMR_CHECK_FREE;
533	else
534		/* fail if not free */
535		ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE;
536	ctrl_seg->xlt_octowords =
537		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
538}
539
540static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev,
541					      struct mlx5_mkey_seg *mkey_seg,
542					      struct mlx5_ib_mr *mr,
543					      unsigned int page_shift)
544{
545	mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags);
546	MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn);
547	MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova);
548	MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length);
549	MLX5_SET(mkc, mkey_seg, log_page_size, page_shift);
550	MLX5_SET(mkc, mkey_seg, qpn, 0xffffff);
551	MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key));
552}
553
554static void
555mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg,
556				  struct ib_sge *sg)
557{
558	data_seg->byte_count = cpu_to_be32(sg->length);
559	data_seg->lkey = cpu_to_be32(sg->lkey);
560	data_seg->addr = cpu_to_be64(sg->addr);
561}
562
563static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg,
564				    u64 offset)
565{
566	u64 octo_offset = mlx5r_umr_get_xlt_octo(offset);
567
568	ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff);
569	ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16);
570	ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
571}
572
573static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev,
574				       struct mlx5r_umr_wqe *wqe,
575				       struct mlx5_ib_mr *mr, struct ib_sge *sg,
576				       unsigned int flags)
577{
578	bool update_pd_access, update_translation;
579
580	if (flags & MLX5_IB_UPD_XLT_ENABLE)
581		wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask();
582
583	update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE ||
584			   flags & MLX5_IB_UPD_XLT_PD ||
585			   flags & MLX5_IB_UPD_XLT_ACCESS;
586
587	if (update_pd_access) {
588		wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev);
589		wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask();
590	}
591
592	update_translation =
593		flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR;
594
595	if (update_translation) {
596		wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask();
597		if (!mr->ibmr.length)
598			MLX5_SET(mkc, &wqe->mkey_seg, length64, 1);
599	}
600
601	wqe->ctrl_seg.xlt_octowords =
602		cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length));
603	wqe->data_seg.byte_count = cpu_to_be32(sg->length);
604}
605
606/*
607 * Send the DMA list to the HW for a normal MR using UMR.
608 * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP
609 * flag may be used.
610 */
611int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags)
612{
613	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
614	struct device *ddev = &dev->mdev->pdev->dev;
615	struct mlx5r_umr_wqe wqe = {};
616	struct ib_block_iter biter;
617	struct mlx5_mtt *cur_mtt;
618	size_t orig_sg_length;
619	struct mlx5_mtt *mtt;
620	size_t final_size;
621	struct ib_sge sg;
622	u64 offset = 0;
623	int err = 0;
624
625	if (WARN_ON(mr->umem->is_odp))
626		return -EINVAL;
627
628	mtt = mlx5r_umr_create_xlt(
629		dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift),
630		sizeof(*mtt), flags);
631	if (!mtt)
632		return -ENOMEM;
633
634	orig_sg_length = sg.length;
635
636	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
637	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr,
638					  mr->page_shift);
639	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
640
641	cur_mtt = mtt;
642	rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) {
643		if (cur_mtt == (void *)mtt + sg.length) {
644			dma_sync_single_for_device(ddev, sg.addr, sg.length,
645						   DMA_TO_DEVICE);
646
647			err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe,
648						       true);
649			if (err)
650				goto err;
651			dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
652						DMA_TO_DEVICE);
653			offset += sg.length;
654			mlx5r_umr_update_offset(&wqe.ctrl_seg, offset);
655
656			cur_mtt = mtt;
657		}
658
659		cur_mtt->ptag =
660			cpu_to_be64(rdma_block_iter_dma_address(&biter) |
661				    MLX5_IB_MTT_PRESENT);
662
663		if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP))
664			cur_mtt->ptag = 0;
665
666		cur_mtt++;
667	}
668
669	final_size = (void *)cur_mtt - (void *)mtt;
670	sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT);
671	memset(cur_mtt, 0, sg.length - final_size);
672	mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
673
674	dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE);
675	err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
676
677err:
678	sg.length = orig_sg_length;
679	mlx5r_umr_unmap_free_xlt(dev, mtt, &sg);
680	return err;
681}
682
683static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
684{
685	return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled);
686}
687
688int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
689			 int page_shift, int flags)
690{
691	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
692			       ? sizeof(struct mlx5_klm)
693			       : sizeof(struct mlx5_mtt);
694	const int page_align = MLX5_UMR_FLEX_ALIGNMENT / desc_size;
695	struct mlx5_ib_dev *dev = mr_to_mdev(mr);
696	struct device *ddev = &dev->mdev->pdev->dev;
697	const int page_mask = page_align - 1;
698	struct mlx5r_umr_wqe wqe = {};
699	size_t pages_mapped = 0;
700	size_t pages_to_map = 0;
701	size_t size_to_map = 0;
702	size_t orig_sg_length;
703	size_t pages_iter;
704	struct ib_sge sg;
705	int err = 0;
706	void *xlt;
707
708	if ((flags & MLX5_IB_UPD_XLT_INDIRECT) &&
709	    !umr_can_use_indirect_mkey(dev))
710		return -EPERM;
711
712	if (WARN_ON(!mr->umem->is_odp))
713		return -EINVAL;
714
715	/* UMR copies MTTs in units of MLX5_UMR_FLEX_ALIGNMENT bytes,
716	 * so we need to align the offset and length accordingly
717	 */
718	if (idx & page_mask) {
719		npages += idx & page_mask;
720		idx &= ~page_mask;
721	}
722	pages_to_map = ALIGN(npages, page_align);
723
724	xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags);
725	if (!xlt)
726		return -ENOMEM;
727
728	pages_iter = sg.length / desc_size;
729	orig_sg_length = sg.length;
730
731	if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
732		struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
733		size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
734
735		pages_to_map = min_t(size_t, pages_to_map, max_pages);
736	}
737
738	mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg);
739	mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift);
740	mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg);
741
742	for (pages_mapped = 0;
743	     pages_mapped < pages_to_map && !err;
744	     pages_mapped += pages_iter, idx += pages_iter) {
745		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
746		size_to_map = npages * desc_size;
747		dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
748					DMA_TO_DEVICE);
749		mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
750		dma_sync_single_for_device(ddev, sg.addr, sg.length,
751					   DMA_TO_DEVICE);
752		sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);
753
754		if (pages_mapped + pages_iter >= pages_to_map)
755			mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags);
756		mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size);
757		err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true);
758	}
759	sg.length = orig_sg_length;
760	mlx5r_umr_unmap_free_xlt(dev, xlt, &sg);
761	return err;
762}