   1/*
   2 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
   3 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34#include <linux/log2.h>
  35#include <linux/slab.h>
  36#include <linux/netdevice.h>
  37
  38#include <rdma/ib_cache.h>
  39#include <rdma/ib_pack.h>
  40#include <rdma/ib_addr.h>
  41
  42#include <linux/mlx4/qp.h>
  43
  44#include "mlx4_ib.h"
  45#include "user.h"
  46
  47enum {
  48	MLX4_IB_ACK_REQ_FREQ	= 8,
  49};
  50
  51enum {
  52	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
  53	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
  54	MLX4_IB_LINK_TYPE_IB		= 0,
  55	MLX4_IB_LINK_TYPE_ETH		= 1
  56};
  57
  58enum {
  59	/*
  60	 * Largest possible UD header: send with GRH and immediate
  61	 * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
  62	 * tag.  (LRH would only use 8 bytes, so Ethernet is the
  63	 * biggest case)
  64	 */
  65	MLX4_IB_UD_HEADER_SIZE		= 82,
  66	MLX4_IB_LSO_HEADER_SPARE	= 128,
  67};
  68
  69enum {
  70	MLX4_IB_IBOE_ETHERTYPE		= 0x8915
  71};
  72
  73struct mlx4_ib_sqp {
  74	struct mlx4_ib_qp	qp;
  75	int			pkey_index;
  76	u32			qkey;
  77	u32			send_psn;
  78	struct ib_ud_header	ud_header;
  79	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
  80};
  81
  82enum {
  83	MLX4_IB_MIN_SQ_STRIDE	= 6,
  84	MLX4_IB_CACHE_LINE_SIZE	= 64,
  85};
  86
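/*
 * Map IB work request opcodes to the hardware opcodes used in the WQE
 * control segment.
 */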
  87static const __be32 mlx4_ib_opcode[] = {
  88	[IB_WR_SEND]				= cpu_to_be32(MLX4_OPCODE_SEND),
  89	[IB_WR_LSO]				= cpu_to_be32(MLX4_OPCODE_LSO),
  90	[IB_WR_SEND_WITH_IMM]			= cpu_to_be32(MLX4_OPCODE_SEND_IMM),
  91	[IB_WR_RDMA_WRITE]			= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
  92	[IB_WR_RDMA_WRITE_WITH_IMM]		= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
  93	[IB_WR_RDMA_READ]			= cpu_to_be32(MLX4_OPCODE_RDMA_READ),
  94	[IB_WR_ATOMIC_CMP_AND_SWP]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
  95	[IB_WR_ATOMIC_FETCH_AND_ADD]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
  96	[IB_WR_SEND_WITH_INV]			= cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
  97	[IB_WR_LOCAL_INV]			= cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
  98	[IB_WR_FAST_REG_MR]			= cpu_to_be32(MLX4_OPCODE_FMR),
  99	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
 100	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
 101};
 102
 103static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
 104{
 105	return container_of(mqp, struct mlx4_ib_sqp, qp);
 106}
 107
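/*
 * The four QPNs starting at sqp_start are the special QPs: QP0 for
 * ports 1 and 2, followed by QP1 (GSI) for ports 1 and 2.
 */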
 108static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 109{
 110	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
 111		qp->mqp.qpn <= dev->dev->caps.sqp_start + 3;
 112}
 113
 114static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 115{
 116	return qp->mqp.qpn >= dev->dev->caps.sqp_start &&
 117		qp->mqp.qpn <= dev->dev->caps.sqp_start + 1;
 118}
 119
 120static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
 121{
 122	return mlx4_buf_offset(&qp->buf, offset);
 123}
 124
 125static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
 126{
 127	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
 128}
 129
 130static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
 131{
 132	return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
 133}
 134
 135/*
 136 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 137 * first four bytes of every 64 byte chunk with
  138 *     0x7FFFFFFF | (invalid_ownership_value << 31).
 139 *
 140 * When the max work request size is less than or equal to the WQE
 141 * basic block size, as an optimization, we can stamp all WQEs with
 142 * 0xffffffff, and skip the very first chunk of each WQE.
 143 */
 144static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
 145{
 146	__be32 *wqe;
 147	int i;
 148	int s;
 149	int ind;
 150	void *buf;
 151	__be32 stamp;
 152	struct mlx4_wqe_ctrl_seg *ctrl;
 153
 154	if (qp->sq_max_wqes_per_wr > 1) {
 155		s = roundup(size, 1U << qp->sq.wqe_shift);
 156		for (i = 0; i < s; i += 64) {
 157			ind = (i >> qp->sq.wqe_shift) + n;
 158			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
 159						       cpu_to_be32(0xffffffff);
 160			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
 161			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
 162			*wqe = stamp;
 163		}
 164	} else {
 165		ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
 166		s = (ctrl->fence_size & 0x3f) << 4;
 167		for (i = 64; i < s; i += 64) {
 168			wqe = buf + i;
 169			*wqe = cpu_to_be32(0xffffffff);
 170		}
 171	}
 172}
 173
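/*
 * Post a NOP work request of the given size at send queue index n,
 * with the NEC bit set so that it does not generate an error completion.
 */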
 174static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
 175{
 176	struct mlx4_wqe_ctrl_seg *ctrl;
 177	struct mlx4_wqe_inline_seg *inl;
 178	void *wqe;
 179	int s;
 180
 181	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
 182	s = sizeof(struct mlx4_wqe_ctrl_seg);
 183
 184	if (qp->ibqp.qp_type == IB_QPT_UD) {
 185		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
 186		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
 187		memset(dgram, 0, sizeof *dgram);
 188		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
 189		s += sizeof(struct mlx4_wqe_datagram_seg);
 190	}
 191
 192	/* Pad the remainder of the WQE with an inline data segment. */
 193	if (size > s) {
 194		inl = wqe + s;
 195		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
 196	}
 197	ctrl->srcrb_flags = 0;
 198	ctrl->fence_size = size / 16;
 199	/*
 200	 * Make sure descriptor is fully written before setting ownership bit
 201	 * (because HW can start executing as soon as we do).
 202	 */
 203	wmb();
 204
 205	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
 206		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
 207
 208	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
 209}
 210
 211/* Post NOP WQE to prevent wrap-around in the middle of WR */
 212static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
 213{
 214	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
 215	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
 216		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
 217		ind += s;
 218	}
 219	return ind;
 220}
 221
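/*
 * Dispatch an asynchronous event from the firmware to the QP's event
 * handler, translating the mlx4 event type into the IB event type.
 */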
 222static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
 223{
 224	struct ib_event event;
 225	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
 226
 227	if (type == MLX4_EVENT_TYPE_PATH_MIG)
 228		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
 229
 230	if (ibqp->event_handler) {
 231		event.device     = ibqp->device;
 232		event.element.qp = ibqp;
 233		switch (type) {
 234		case MLX4_EVENT_TYPE_PATH_MIG:
 235			event.event = IB_EVENT_PATH_MIG;
 236			break;
 237		case MLX4_EVENT_TYPE_COMM_EST:
 238			event.event = IB_EVENT_COMM_EST;
 239			break;
 240		case MLX4_EVENT_TYPE_SQ_DRAINED:
 241			event.event = IB_EVENT_SQ_DRAINED;
 242			break;
 243		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
 244			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
 245			break;
 246		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
 247			event.event = IB_EVENT_QP_FATAL;
 248			break;
 249		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
 250			event.event = IB_EVENT_PATH_MIG_ERR;
 251			break;
 252		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
 253			event.event = IB_EVENT_QP_REQ_ERR;
 254			break;
 255		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
 256			event.event = IB_EVENT_QP_ACCESS_ERR;
 257			break;
 258		default:
 259			printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
 260			       "on QP %06x\n", type, qp->qpn);
 261			return;
 262		}
 263
 264		ibqp->event_handler(&event, ibqp->qp_context);
 265	}
 266}
 267
 268static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
 269{
 270	/*
 271	 * UD WQEs must have a datagram segment.
 272	 * RC and UC WQEs might have a remote address segment.
 273	 * MLX WQEs need two extra inline data segments (for the UD
 274	 * header and space for the ICRC).
 275	 */
 276	switch (type) {
 277	case IB_QPT_UD:
 278		return sizeof (struct mlx4_wqe_ctrl_seg) +
 279			sizeof (struct mlx4_wqe_datagram_seg) +
 280			((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
 281	case IB_QPT_UC:
 282		return sizeof (struct mlx4_wqe_ctrl_seg) +
 283			sizeof (struct mlx4_wqe_raddr_seg);
 284	case IB_QPT_RC:
 285		return sizeof (struct mlx4_wqe_ctrl_seg) +
 286			sizeof (struct mlx4_wqe_atomic_seg) +
 287			sizeof (struct mlx4_wqe_raddr_seg);
 288	case IB_QPT_SMI:
 289	case IB_QPT_GSI:
 290		return sizeof (struct mlx4_wqe_ctrl_seg) +
 291			ALIGN(MLX4_IB_UD_HEADER_SIZE +
 292			      DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
 293					   MLX4_INLINE_ALIGN) *
 294			      sizeof (struct mlx4_wqe_inline_seg),
 295			      sizeof (struct mlx4_wqe_data_seg)) +
 296			ALIGN(4 +
 297			      sizeof (struct mlx4_wqe_inline_seg),
 298			      sizeof (struct mlx4_wqe_data_seg));
 299	default:
 300		return sizeof (struct mlx4_wqe_ctrl_seg);
 301	}
 302}
 303
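/*
 * Validate the requested receive queue size against device limits and
 * work out the RQ WQE count, stride and max gather entries.
 */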
 304static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 305		       int is_user, int has_srq, struct mlx4_ib_qp *qp)
 306{
 307	/* Sanity check RQ size before proceeding */
 308	if (cap->max_recv_wr  > dev->dev->caps.max_wqes  ||
 309	    cap->max_recv_sge > dev->dev->caps.max_rq_sg)
 310		return -EINVAL;
 311
 312	if (has_srq) {
 313		/* QPs attached to an SRQ should have no RQ */
 314		if (cap->max_recv_wr)
 315			return -EINVAL;
 316
 317		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
 318	} else {
 319		/* HW requires >= 1 RQ entry with >= 1 gather entry */
 320		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
 321			return -EINVAL;
 322
 323		qp->rq.wqe_cnt	 = roundup_pow_of_two(max(1U, cap->max_recv_wr));
 324		qp->rq.max_gs	 = roundup_pow_of_two(max(1U, cap->max_recv_sge));
 325		qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
 326	}
 327
 328	cap->max_recv_wr  = qp->rq.max_post = qp->rq.wqe_cnt;
 329	cap->max_recv_sge = qp->rq.max_gs;
 330
 331	return 0;
 332}
 333
 334static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 335			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
 336{
 337	int s;
 338
 339	/* Sanity check SQ size before proceeding */
 340	if (cap->max_send_wr	 > dev->dev->caps.max_wqes  ||
 341	    cap->max_send_sge	 > dev->dev->caps.max_sq_sg ||
 342	    cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
 343	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
 344		return -EINVAL;
 345
 346	/*
 347	 * For MLX transport we need 2 extra S/G entries:
 348	 * one for the header and one for the checksum at the end
 349	 */
 350	if ((type == IB_QPT_SMI || type == IB_QPT_GSI) &&
 351	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
 352		return -EINVAL;
 353
 354	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
 355		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
 356		send_wqe_overhead(type, qp->flags);
 357
 358	if (s > dev->dev->caps.max_sq_desc_sz)
 359		return -EINVAL;
 360
 361	/*
 362	 * Hermon supports shrinking WQEs, such that a single work
 363	 * request can include multiple units of 1 << wqe_shift.  This
 364	 * way, work requests can differ in size, and do not have to
 365	 * be a power of 2 in size, saving memory and speeding up send
 366	 * WR posting.  Unfortunately, if we do this then the
 367	 * wqe_index field in CQEs can't be used to look up the WR ID
 368	 * anymore, so we do this only if selective signaling is off.
 369	 *
 370	 * Further, on 32-bit platforms, we can't use vmap() to make
 371	 * the QP buffer virtually contiguous.  Thus we have to use
 372	 * constant-sized WRs to make sure a WR is always fully within
 373	 * a single page-sized chunk.
 374	 *
 375	 * Finally, we use NOP work requests to pad the end of the
 376	 * work queue, to avoid wrap-around in the middle of WR.  We
 377	 * set NEC bit to avoid getting completions with error for
 378	 * these NOP WRs, but since NEC is only supported starting
 379	 * with firmware 2.2.232, we use constant-sized WRs for older
 380	 * firmware.
 381	 *
 382	 * And, since MLX QPs only support SEND, we use constant-sized
 383	 * WRs in this case.
 384	 *
 385	 * We look for the smallest value of wqe_shift such that the
 386	 * resulting number of wqes does not exceed device
 387	 * capabilities.
 388	 *
 389	 * We set WQE size to at least 64 bytes, this way stamping
 390	 * invalidates each WQE.
 391	 */
 392	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
 393	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
 394	    type != IB_QPT_SMI && type != IB_QPT_GSI)
 395		qp->sq.wqe_shift = ilog2(64);
 396	else
 397		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
 398
 399	for (;;) {
 400		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
 401
 402		/*
 403		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
 404		 * allow HW to prefetch.
 405		 */
 406		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
 407		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
 408						    qp->sq_max_wqes_per_wr +
 409						    qp->sq_spare_wqes);
 410
 411		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
 412			break;
 413
 414		if (qp->sq_max_wqes_per_wr <= 1)
 415			return -EINVAL;
 416
 417		++qp->sq.wqe_shift;
 418	}
 419
 420	qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
 421			     (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
 422			 send_wqe_overhead(type, qp->flags)) /
 423		sizeof (struct mlx4_wqe_data_seg);
 424
 425	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
 426		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
 427	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
 428		qp->rq.offset = 0;
 429		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 430	} else {
 431		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
 432		qp->sq.offset = 0;
 433	}
 434
 435	cap->max_send_wr  = qp->sq.max_post =
 436		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
 437	cap->max_send_sge = min(qp->sq.max_gs,
 438				min(dev->dev->caps.max_sq_sg,
 439				    dev->dev->caps.max_rq_sg));
 440	/* We don't support inline sends for kernel QPs (yet) */
 441	cap->max_inline_data = 0;
 442
 443	return 0;
 444}
 445
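/*
 * Check and apply the send queue geometry chosen by userspace in the
 * create_qp command.
 */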
 446static int set_user_sq_size(struct mlx4_ib_dev *dev,
 447			    struct mlx4_ib_qp *qp,
 448			    struct mlx4_ib_create_qp *ucmd)
 449{
 450	/* Sanity check SQ size before proceeding */
 451	if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes	 ||
 452	    ucmd->log_sq_stride >
 453		ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
 454	    ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
 455		return -EINVAL;
 456
 457	qp->sq.wqe_cnt   = 1 << ucmd->log_sq_bb_count;
 458	qp->sq.wqe_shift = ucmd->log_sq_stride;
 459
 460	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
 461		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
 462
 463	return 0;
 464}
 465
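/*
 * Common creation path for userspace and kernel QPs: size the work
 * queues, set up the buffer, doorbell record and MTT entries, and
 * allocate the QPN and the low-level mlx4_qp object.
 */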
 466static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 467			    struct ib_qp_init_attr *init_attr,
 468			    struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
 469{
 470	int qpn;
 471	int err;
 472
 473	mutex_init(&qp->mutex);
 474	spin_lock_init(&qp->sq.lock);
 475	spin_lock_init(&qp->rq.lock);
 476	INIT_LIST_HEAD(&qp->gid_list);
 477
 478	qp->state	 = IB_QPS_RESET;
 479	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
 480		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 481
 482	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
 483	if (err)
 484		goto err;
 485
 486	if (pd->uobject) {
 487		struct mlx4_ib_create_qp ucmd;
 488
 489		if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
 490			err = -EFAULT;
 491			goto err;
 492		}
 493
 494		qp->sq_no_prefetch = ucmd.sq_no_prefetch;
 495
 496		err = set_user_sq_size(dev, qp, &ucmd);
 497		if (err)
 498			goto err;
 499
 500		qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
 501				       qp->buf_size, 0, 0);
 502		if (IS_ERR(qp->umem)) {
 503			err = PTR_ERR(qp->umem);
 504			goto err;
 505		}
 506
 507		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
 508				    ilog2(qp->umem->page_size), &qp->mtt);
 509		if (err)
 510			goto err_buf;
 511
 512		err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
 513		if (err)
 514			goto err_mtt;
 515
 516		if (!init_attr->srq) {
 517			err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
 518						  ucmd.db_addr, &qp->db);
 519			if (err)
 520				goto err_mtt;
 521		}
 522	} else {
 523		qp->sq_no_prefetch = 0;
 524
 525		if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
 526			qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
 527
 528		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
 529			qp->flags |= MLX4_IB_QP_LSO;
 530
 531		err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);
 532		if (err)
 533			goto err;
 534
 535		if (!init_attr->srq) {
 536			err = mlx4_db_alloc(dev->dev, &qp->db, 0);
 537			if (err)
 538				goto err;
 539
 540			*qp->db.db = 0;
 541		}
 542
 543		if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) {
 544			err = -ENOMEM;
 545			goto err_db;
 546		}
 547
 548		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
 549				    &qp->mtt);
 550		if (err)
 551			goto err_buf;
 552
 553		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
 554		if (err)
 555			goto err_mtt;
 556
 557		qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL);
 558		qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL);
 559
 560		if (!qp->sq.wrid || !qp->rq.wrid) {
 561			err = -ENOMEM;
 562			goto err_wrid;
 563		}
 564	}
 565
 566	if (sqpn) {
 567		qpn = sqpn;
 568	} else {
 569		err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn);
 570		if (err)
 571			goto err_wrid;
 572	}
 573
 574	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
 575	if (err)
 576		goto err_qpn;
 577
 578	/*
 579	 * Hardware wants QPN written in big-endian order (after
 580	 * shifting) for send doorbell.  Precompute this value to save
 581	 * a little bit when posting sends.
 582	 */
 583	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
 584
 585	qp->mqp.event = mlx4_ib_qp_event;
 586
 587	return 0;
 588
 589err_qpn:
 590	if (!sqpn)
 591		mlx4_qp_release_range(dev->dev, qpn, 1);
 592
 593err_wrid:
 594	if (pd->uobject) {
 595		if (!init_attr->srq)
 596			mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context),
 597					      &qp->db);
 598	} else {
 599		kfree(qp->sq.wrid);
 600		kfree(qp->rq.wrid);
 601	}
 602
 603err_mtt:
 604	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 605
 606err_buf:
 607	if (pd->uobject)
 608		ib_umem_release(qp->umem);
 609	else
 610		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
 611
 612err_db:
 613	if (!pd->uobject && !init_attr->srq)
 614		mlx4_db_free(dev->dev, &qp->db);
 615
 616err:
 617	return err;
 618}
 619
 620static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
 621{
 622	switch (state) {
 623	case IB_QPS_RESET:	return MLX4_QP_STATE_RST;
 624	case IB_QPS_INIT:	return MLX4_QP_STATE_INIT;
 625	case IB_QPS_RTR:	return MLX4_QP_STATE_RTR;
 626	case IB_QPS_RTS:	return MLX4_QP_STATE_RTS;
 627	case IB_QPS_SQD:	return MLX4_QP_STATE_SQD;
 628	case IB_QPS_SQE:	return MLX4_QP_STATE_SQER;
 629	case IB_QPS_ERR:	return MLX4_QP_STATE_ERR;
 630	default:		return -1;
 631	}
 632}
 633
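/*
 * Lock a QP's send and receive CQs.  When the two CQs differ they are
 * always taken in order of increasing CQ number, so two callers
 * locking the same pair cannot deadlock.
 */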
 634static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
 635	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
 636{
 637	if (send_cq == recv_cq) {
 638		spin_lock_irq(&send_cq->lock);
 639		__acquire(&recv_cq->lock);
 640	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
 641		spin_lock_irq(&send_cq->lock);
 642		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
 643	} else {
 644		spin_lock_irq(&recv_cq->lock);
 645		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
 646	}
 647}
 648
 649static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
 650	__releases(&send_cq->lock) __releases(&recv_cq->lock)
 651{
 652	if (send_cq == recv_cq) {
 653		__release(&recv_cq->lock);
 654		spin_unlock_irq(&send_cq->lock);
 655	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
 656		spin_unlock(&recv_cq->lock);
 657		spin_unlock_irq(&send_cq->lock);
 658	} else {
 659		spin_unlock(&send_cq->lock);
 660		spin_unlock_irq(&recv_cq->lock);
 661	}
 662}
 663
 664static void del_gid_entries(struct mlx4_ib_qp *qp)
 665{
 666	struct mlx4_ib_gid_entry *ge, *tmp;
 667
 668	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
 669		list_del(&ge->list);
 670		kfree(ge);
 671	}
 672}
 673
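/*
 * Tear down a QP built by create_qp_common: move it to RESET, purge
 * any CQEs it owns (kernel QPs only), then release the hardware QP,
 * its QPN, MTTs, buffers and doorbell record.
 */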
 674static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 675			      int is_user)
 676{
 677	struct mlx4_ib_cq *send_cq, *recv_cq;
 678
 679	if (qp->state != IB_QPS_RESET)
 680		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
 681				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
 682			printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
 683			       qp->mqp.qpn);
 684
 685	send_cq = to_mcq(qp->ibqp.send_cq);
 686	recv_cq = to_mcq(qp->ibqp.recv_cq);
 687
 688	mlx4_ib_lock_cqs(send_cq, recv_cq);
 689
 690	if (!is_user) {
 691		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
 692				 qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
 693		if (send_cq != recv_cq)
 694			__mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
 695	}
 696
 697	mlx4_qp_remove(dev->dev, &qp->mqp);
 698
 699	mlx4_ib_unlock_cqs(send_cq, recv_cq);
 700
 701	mlx4_qp_free(dev->dev, &qp->mqp);
 702
 703	if (!is_sqp(dev, qp))
 704		mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
 705
 706	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 707
 708	if (is_user) {
 709		if (!qp->ibqp.srq)
 710			mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
 711					      &qp->db);
 712		ib_umem_release(qp->umem);
 713	} else {
 714		kfree(qp->sq.wrid);
 715		kfree(qp->rq.wrid);
 716		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
 717		if (!qp->ibqp.srq)
 718			mlx4_db_free(dev->dev, &qp->db);
 719	}
 720
 721	del_gid_entries(qp);
 722}
 723
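/*
 * Verbs entry point for QP creation.  Special (SMI/GSI) QPs may only
 * be created by the kernel and get a fixed QPN derived from sqp_start
 * and the port number.
 */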
 724struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 725				struct ib_qp_init_attr *init_attr,
 726				struct ib_udata *udata)
 727{
 728	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 729	struct mlx4_ib_sqp *sqp;
 730	struct mlx4_ib_qp *qp;
 731	int err;
 732
 733	/*
 734	 * We only support LSO and multicast loopback blocking, and
 735	 * only for kernel UD QPs.
 736	 */
 737	if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
 738					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
 739		return ERR_PTR(-EINVAL);
 740
 741	if (init_attr->create_flags &&
 742	    (pd->uobject || init_attr->qp_type != IB_QPT_UD))
 743		return ERR_PTR(-EINVAL);
 744
 745	switch (init_attr->qp_type) {
 746	case IB_QPT_RC:
 747	case IB_QPT_UC:
 748	case IB_QPT_UD:
 749	{
 750		qp = kzalloc(sizeof *qp, GFP_KERNEL);
 751		if (!qp)
 752			return ERR_PTR(-ENOMEM);
 753
 754		err = create_qp_common(dev, pd, init_attr, udata, 0, qp);
 755		if (err) {
 756			kfree(qp);
 757			return ERR_PTR(err);
 758		}
 759
 760		qp->ibqp.qp_num = qp->mqp.qpn;
 761
 762		break;
 763	}
 764	case IB_QPT_SMI:
 765	case IB_QPT_GSI:
 766	{
 767		/* Userspace is not allowed to create special QPs: */
 768		if (pd->uobject)
 769			return ERR_PTR(-EINVAL);
 770
 771		sqp = kzalloc(sizeof *sqp, GFP_KERNEL);
 772		if (!sqp)
 773			return ERR_PTR(-ENOMEM);
 774
 775		qp = &sqp->qp;
 776
 777		err = create_qp_common(dev, pd, init_attr, udata,
 778				       dev->dev->caps.sqp_start +
 779				       (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) +
 780				       init_attr->port_num - 1,
 781				       qp);
 782		if (err) {
 783			kfree(sqp);
 784			return ERR_PTR(err);
 785		}
 786
 787		qp->port	= init_attr->port_num;
 788		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
 789
 790		break;
 791	}
 792	default:
 793		/* Don't support raw QPs */
 794		return ERR_PTR(-EINVAL);
 795	}
 796
 797	return &qp->ibqp;
 798}
 799
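/*
 * Verbs entry point for QP destruction; destroying QP0 also closes
 * the port it was serving.
 */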
 800int mlx4_ib_destroy_qp(struct ib_qp *qp)
 801{
 802	struct mlx4_ib_dev *dev = to_mdev(qp->device);
 803	struct mlx4_ib_qp *mqp = to_mqp(qp);
 804
 805	if (is_qp0(dev, mqp))
 806		mlx4_CLOSE_PORT(dev->dev, mqp->port);
 807
 808	destroy_qp_common(dev, mqp, !!qp->pd->uobject);
 809
 810	if (is_sqp(dev, mqp))
 811		kfree(to_msqp(mqp));
 812	else
 813		kfree(mqp);
 814
 815	return 0;
 816}
 817
 818static int to_mlx4_st(enum ib_qp_type type)
 819{
 820	switch (type) {
 821	case IB_QPT_RC:		return MLX4_QP_ST_RC;
 822	case IB_QPT_UC:		return MLX4_QP_ST_UC;
 823	case IB_QPT_UD:		return MLX4_QP_ST_UD;
 824	case IB_QPT_SMI:
 825	case IB_QPT_GSI:	return MLX4_QP_ST_MLX;
 826	default:		return -1;
 827	}
 828}
 829
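/*
 * Compute the RRE/RAE/RWE bits of the QP context from the new or
 * currently cached access flags and destination RD atomic depth.
 */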
 830static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
 831				   int attr_mask)
 832{
 833	u8 dest_rd_atomic;
 834	u32 access_flags;
 835	u32 hw_access_flags = 0;
 836
 837	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
 838		dest_rd_atomic = attr->max_dest_rd_atomic;
 839	else
 840		dest_rd_atomic = qp->resp_depth;
 841
 842	if (attr_mask & IB_QP_ACCESS_FLAGS)
 843		access_flags = attr->qp_access_flags;
 844	else
 845		access_flags = qp->atomic_rd_en;
 846
 847	if (!dest_rd_atomic)
 848		access_flags &= IB_ACCESS_REMOTE_WRITE;
 849
 850	if (access_flags & IB_ACCESS_REMOTE_READ)
 851		hw_access_flags |= MLX4_QP_BIT_RRE;
 852	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
 853		hw_access_flags |= MLX4_QP_BIT_RAE;
 854	if (access_flags & IB_ACCESS_REMOTE_WRITE)
 855		hw_access_flags |= MLX4_QP_BIT_RWE;
 856
 857	return cpu_to_be32(hw_access_flags);
 858}
 859
 860static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
 861			    int attr_mask)
 862{
 863	if (attr_mask & IB_QP_PKEY_INDEX)
 864		sqp->pkey_index = attr->pkey_index;
 865	if (attr_mask & IB_QP_QKEY)
 866		sqp->qkey = attr->qkey;
 867	if (attr_mask & IB_QP_SQ_PSN)
 868		sqp->send_psn = attr->sq_psn;
 869}
 870
 871static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
 872{
 873	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
 874}
 875
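/*
 * Fill in a hardware address path from an address handle, for both IB
 * and Ethernet (IBoE) ports; on IBoE this also resolves the
 * destination MAC and the VLAN index.
 */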
 876static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 877			 struct mlx4_qp_path *path, u8 port)
 878{
 879	int err;
 880	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
 881		IB_LINK_LAYER_ETHERNET;
 882	u8 mac[6];
 883	int is_mcast;
 884	u16 vlan_tag;
 885	int vidx;
 886
 887	path->grh_mylmc     = ah->src_path_bits & 0x7f;
 888	path->rlid	    = cpu_to_be16(ah->dlid);
 889	if (ah->static_rate) {
 890		path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
 891		while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
 892		       !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
 893			--path->static_rate;
 894	} else
 895		path->static_rate = 0;
 896
 897	if (ah->ah_flags & IB_AH_GRH) {
 898		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
 899			printk(KERN_ERR "sgid_index (%u) too large. max is %d\n",
 900			       ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
 901			return -1;
 902		}
 903
 904		path->grh_mylmc |= 1 << 7;
 905		path->mgid_index = ah->grh.sgid_index;
 906		path->hop_limit  = ah->grh.hop_limit;
 907		path->tclass_flowlabel =
 908			cpu_to_be32((ah->grh.traffic_class << 20) |
 909				    (ah->grh.flow_label));
 910		memcpy(path->rgid, ah->grh.dgid.raw, 16);
 911	}
 912
 913	if (is_eth) {
 914		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
 915			((port - 1) << 6) | ((ah->sl & 7) << 3) | ((ah->sl & 8) >> 1);
 916
 917		if (!(ah->ah_flags & IB_AH_GRH))
 918			return -1;
 919
 920		err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port);
 921		if (err)
 922			return err;
 923
 924		memcpy(path->dmac, mac, 6);
 925		path->ackto = MLX4_IB_LINK_TYPE_ETH;
 926		/* use index 0 into MAC table for IBoE */
 927		path->grh_mylmc &= 0x80;
 928
 929		vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]);
 930		if (vlan_tag < 0x1000) {
 931			if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx))
 932				return -ENOENT;
 933
 934			path->vlan_index = vidx;
 935			path->fl = 1 << 6;
 936		}
 937	} else
 938		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
 939			((port - 1) << 6) | ((ah->sl & 0xf) << 2);
 940
 941	return 0;
 942}
 943
 944static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 945{
 946	struct mlx4_ib_gid_entry *ge, *tmp;
 947
 948	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
 949		if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
 950			ge->added = 1;
 951			ge->port = qp->port;
 952		}
 953	}
 954}
 955
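/*
 * Build the mlx4_qp_context for the requested state transition and
 * pass it to the firmware, then update the driver's cached QP state.
 */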
 956static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 957			       const struct ib_qp_attr *attr, int attr_mask,
 958			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
 959{
 960	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
 961	struct mlx4_ib_qp *qp = to_mqp(ibqp);
 962	struct mlx4_qp_context *context;
 963	enum mlx4_qp_optpar optpar = 0;
 964	int sqd_event;
 965	int err = -EINVAL;
 966
 967	context = kzalloc(sizeof *context, GFP_KERNEL);
 968	if (!context)
 969		return -ENOMEM;
 970
 971	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
 972				     (to_mlx4_st(ibqp->qp_type) << 16));
 973
 974	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
 975		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
 976	else {
 977		optpar |= MLX4_QP_OPTPAR_PM_STATE;
 978		switch (attr->path_mig_state) {
 979		case IB_MIG_MIGRATED:
 980			context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
 981			break;
 982		case IB_MIG_REARM:
 983			context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
 984			break;
 985		case IB_MIG_ARMED:
 986			context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
 987			break;
 988		}
 989	}
 990
 991	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
 992		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
 993	else if (ibqp->qp_type == IB_QPT_UD) {
 994		if (qp->flags & MLX4_IB_QP_LSO)
 995			context->mtu_msgmax = (IB_MTU_4096 << 5) |
 996					      ilog2(dev->dev->caps.max_gso_sz);
 997		else
 998			context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
 999	} else if (attr_mask & IB_QP_PATH_MTU) {
1000		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
1001			printk(KERN_ERR "path MTU (%u) is invalid\n",
1002			       attr->path_mtu);
1003			goto out;
1004		}
1005		context->mtu_msgmax = (attr->path_mtu << 5) |
1006			ilog2(dev->dev->caps.max_msg_sz);
1007	}
1008
1009	if (qp->rq.wqe_cnt)
1010		context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
1011	context->rq_size_stride |= qp->rq.wqe_shift - 4;
1012
1013	if (qp->sq.wqe_cnt)
1014		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
1015	context->sq_size_stride |= qp->sq.wqe_shift - 4;
1016
1017	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1018		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
1019
1020	if (qp->ibqp.uobject)
1021		context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
1022	else
1023		context->usr_page = cpu_to_be32(dev->priv_uar.index);
1024
1025	if (attr_mask & IB_QP_DEST_QPN)
1026		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
1027
1028	if (attr_mask & IB_QP_PORT) {
1029		if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
1030		    !(attr_mask & IB_QP_AV)) {
1031			mlx4_set_sched(&context->pri_path, attr->port_num);
1032			optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
1033		}
1034	}
1035
1036	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
1037		if (dev->counters[qp->port - 1] != -1) {
1038			context->pri_path.counter_index =
1039						dev->counters[qp->port - 1];
1040			optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
1041		} else
1042			context->pri_path.counter_index = 0xff;
1043	}
1044
1045	if (attr_mask & IB_QP_PKEY_INDEX) {
1046		context->pri_path.pkey_index = attr->pkey_index;
1047		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
1048	}
1049
1050	if (attr_mask & IB_QP_AV) {
1051		if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path,
1052				  attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
1053			goto out;
1054
1055		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
1056			   MLX4_QP_OPTPAR_SCHED_QUEUE);
1057	}
1058
1059	if (attr_mask & IB_QP_TIMEOUT) {
1060		context->pri_path.ackto |= attr->timeout << 3;
1061		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
1062	}
1063
1064	if (attr_mask & IB_QP_ALT_PATH) {
1065		if (attr->alt_port_num == 0 ||
1066		    attr->alt_port_num > dev->dev->caps.num_ports)
1067			goto out;
1068
1069		if (attr->alt_pkey_index >=
1070		    dev->dev->caps.pkey_table_len[attr->alt_port_num])
1071			goto out;
1072
1073		if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
1074				  attr->alt_port_num))
1075			goto out;
1076
1077		context->alt_path.pkey_index = attr->alt_pkey_index;
1078		context->alt_path.ackto = attr->alt_timeout << 3;
1079		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
1080	}
1081
1082	context->pd	    = cpu_to_be32(to_mpd(ibqp->pd)->pdn);
1083	context->params1    = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
1084
1085	/* Set "fast registration enabled" for all kernel QPs */
1086	if (!qp->ibqp.uobject)
1087		context->params1 |= cpu_to_be32(1 << 11);
1088
1089	if (attr_mask & IB_QP_RNR_RETRY) {
1090		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
1091		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
1092	}
1093
1094	if (attr_mask & IB_QP_RETRY_CNT) {
1095		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
1096		optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
1097	}
1098
1099	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
1100		if (attr->max_rd_atomic)
1101			context->params1 |=
1102				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
1103		optpar |= MLX4_QP_OPTPAR_SRA_MAX;
1104	}
1105
1106	if (attr_mask & IB_QP_SQ_PSN)
1107		context->next_send_psn = cpu_to_be32(attr->sq_psn);
1108
1109	context->cqn_send = cpu_to_be32(to_mcq(ibqp->send_cq)->mcq.cqn);
1110
1111	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
1112		if (attr->max_dest_rd_atomic)
1113			context->params2 |=
1114				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
1115		optpar |= MLX4_QP_OPTPAR_RRA_MAX;
1116	}
1117
1118	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
1119		context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
1120		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
1121	}
1122
1123	if (ibqp->srq)
1124		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
1125
1126	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
1127		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
1128		optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
1129	}
1130	if (attr_mask & IB_QP_RQ_PSN)
1131		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
1132
1133	context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn);
1134
1135	if (attr_mask & IB_QP_QKEY) {
1136		context->qkey = cpu_to_be32(attr->qkey);
1137		optpar |= MLX4_QP_OPTPAR_Q_KEY;
1138	}
1139
1140	if (ibqp->srq)
1141		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
1142
1143	if (!ibqp->srq && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1144		context->db_rec_addr = cpu_to_be64(qp->db.dma);
1145
1146	if (cur_state == IB_QPS_INIT &&
1147	    new_state == IB_QPS_RTR  &&
1148	    (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
1149	     ibqp->qp_type == IB_QPT_UD)) {
1150		context->pri_path.sched_queue = (qp->port - 1) << 6;
1151		if (is_qp0(dev, qp))
1152			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
1153		else
1154			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
1155	}
1156
1157	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
1158	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
1159		sqd_event = 1;
1160	else
1161		sqd_event = 0;
1162
1163	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1164		context->rlkey |= (1 << 4);
1165
1166	/*
1167	 * Before passing a kernel QP to the HW, make sure that the
1168	 * ownership bits of the send queue are set and the SQ
1169	 * headroom is stamped so that the hardware doesn't start
1170	 * processing stale work requests.
1171	 */
1172	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
1173		struct mlx4_wqe_ctrl_seg *ctrl;
1174		int i;
1175
1176		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
1177			ctrl = get_send_wqe(qp, i);
1178			ctrl->owner_opcode = cpu_to_be32(1 << 31);
1179			if (qp->sq_max_wqes_per_wr == 1)
1180				ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
1181
1182			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
1183		}
1184	}
1185
1186	err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
1187			     to_mlx4_state(new_state), context, optpar,
1188			     sqd_event, &qp->mqp);
1189	if (err)
1190		goto out;
1191
1192	qp->state = new_state;
1193
1194	if (attr_mask & IB_QP_ACCESS_FLAGS)
1195		qp->atomic_rd_en = attr->qp_access_flags;
1196	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1197		qp->resp_depth = attr->max_dest_rd_atomic;
1198	if (attr_mask & IB_QP_PORT) {
1199		qp->port = attr->port_num;
1200		update_mcg_macs(dev, qp);
1201	}
1202	if (attr_mask & IB_QP_ALT_PATH)
1203		qp->alt_port = attr->alt_port_num;
1204
1205	if (is_sqp(dev, qp))
1206		store_sqp_attrs(to_msqp(qp), attr, attr_mask);
1207
1208	/*
1209	 * If we moved QP0 to RTR, bring the IB link up; if we moved
1210	 * QP0 to RESET or ERROR, bring the link back down.
1211	 */
1212	if (is_qp0(dev, qp)) {
1213		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
1214			if (mlx4_INIT_PORT(dev->dev, qp->port))
1215				printk(KERN_WARNING "INIT_PORT failed for port %d\n",
1216				       qp->port);
1217
1218		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
1219		    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
1220			mlx4_CLOSE_PORT(dev->dev, qp->port);
1221	}
1222
1223	/*
1224	 * If we moved a kernel QP to RESET, clean up all old CQ
1225	 * entries and reinitialize the QP.
1226	 */
1227	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
1228		mlx4_ib_cq_clean(to_mcq(ibqp->recv_cq), qp->mqp.qpn,
1229				 ibqp->srq ? to_msrq(ibqp->srq): NULL);
1230		if (ibqp->send_cq != ibqp->recv_cq)
1231			mlx4_ib_cq_clean(to_mcq(ibqp->send_cq), qp->mqp.qpn, NULL);
1232
1233		qp->rq.head = 0;
1234		qp->rq.tail = 0;
1235		qp->sq.head = 0;
1236		qp->sq.tail = 0;
1237		qp->sq_next_wqe = 0;
1238		if (!ibqp->srq)
1239			*qp->db.db  = 0;
1240	}
1241
1242out:
1243	kfree(context);
1244	return err;
1245}
1246
1247int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1248		      int attr_mask, struct ib_udata *udata)
1249{
1250	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
1251	struct mlx4_ib_qp *qp = to_mqp(ibqp);
1252	enum ib_qp_state cur_state, new_state;
1253	int err = -EINVAL;
1254
1255	mutex_lock(&qp->mutex);
1256
1257	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
1258	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
1259
1260	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
1261		goto out;
1262
1263	if ((attr_mask & IB_QP_PORT) &&
1264	    (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) {
1265		goto out;
1266	}
1267
1268	if (attr_mask & IB_QP_PKEY_INDEX) {
1269		int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
1270		if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p])
1271			goto out;
1272	}
1273
1274	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
1275	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
1276		goto out;
1277	}
1278
1279	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
1280	    attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
1281		goto out;
1282	}
1283
1284	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
1285		err = 0;
1286		goto out;
1287	}
1288
1289	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
1290
1291out:
1292	mutex_unlock(&qp->mutex);
1293	return err;
1294}
1295
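/*
 * Build the UD header for a send on an MLX (QP0/QP1) QP and copy it
 * into inline segments following the MLX segment of the WQE,
 * returning the total segment length in *mlx_seg_len.
 */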
1296static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
1297			    void *wqe, unsigned *mlx_seg_len)
1298{
1299	struct ib_device *ib_dev = sqp->qp.ibqp.device;
1300	struct mlx4_wqe_mlx_seg *mlx = wqe;
1301	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
1302	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
1303	union ib_gid sgid;
1304	u16 pkey;
1305	int send_size;
1306	int header_size;
1307	int spc;
1308	int i;
1309	int is_eth;
1310	int is_vlan = 0;
1311	int is_grh;
1312	u16 vlan;
1313
1314	send_size = 0;
1315	for (i = 0; i < wr->num_sge; ++i)
1316		send_size += wr->sg_list[i].length;
1317
1318	is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
1319	is_grh = mlx4_ib_ah_grh_present(ah);
1320	if (is_eth) {
1321		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
1322				  ah->av.ib.gid_index, &sgid);
1323		vlan = rdma_get_vlan_id(&sgid);
1324		is_vlan = vlan < 0x1000;
1325	}
1326	ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
1327
1328	if (!is_eth) {
1329		sqp->ud_header.lrh.service_level =
1330			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
1331		sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
1332		sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
1333	}
1334
1335	if (is_grh) {
1336		sqp->ud_header.grh.traffic_class =
1337			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
1338		sqp->ud_header.grh.flow_label    =
1339			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
1340		sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit;
1341		ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
1342				  ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
1343		memcpy(sqp->ud_header.grh.destination_gid.raw,
1344		       ah->av.ib.dgid, 16);
1345	}
1346
1347	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
1348
1349	if (!is_eth) {
1350		mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
1351					  (sqp->ud_header.lrh.destination_lid ==
1352					   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
1353					  (sqp->ud_header.lrh.service_level << 8));
1354		mlx->rlid = sqp->ud_header.lrh.destination_lid;
1355	}
1356
1357	switch (wr->opcode) {
1358	case IB_WR_SEND:
1359		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
1360		sqp->ud_header.immediate_present = 0;
1361		break;
1362	case IB_WR_SEND_WITH_IMM:
1363		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
1364		sqp->ud_header.immediate_present = 1;
1365		sqp->ud_header.immediate_data    = wr->ex.imm_data;
1366		break;
1367	default:
1368		return -EINVAL;
1369	}
1370
1371	if (is_eth) {
1372		u8 *smac;
1373
1374		memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
1375		/* FIXME: cache smac value? */
1376		smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr;
1377		memcpy(sqp->ud_header.eth.smac_h, smac, 6);
1378		if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
1379			mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
1380		if (!is_vlan) {
1381			sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
1382		} else {
1383			u16 pcp;
1384
1385			sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
1386			pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13;
1387			sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
1388		}
1389	} else {
1390		sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
1391		if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
1392			sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
1393	}
1394	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
1395	if (!sqp->qp.ibqp.qp_num)
1396		ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
1397	else
1398		ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
1399	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
1400	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
1401	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
1402	sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
1403					       sqp->qkey : wr->wr.ud.remote_qkey);
1404	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
1405
1406	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
1407
1408	if (0) {
1409		printk(KERN_ERR "built UD header of size %d:\n", header_size);
1410		for (i = 0; i < header_size / 4; ++i) {
1411			if (i % 8 == 0)
1412				printk("  [%02x] ", i * 4);
1413			printk(" %08x",
1414			       be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
1415			if ((i + 1) % 8 == 0)
1416				printk("\n");
1417		}
1418		printk("\n");
1419	}
1420
1421	/*
1422	 * Inline data segments may not cross a 64 byte boundary.  If
1423	 * our UD header is bigger than the space available up to the
1424	 * next 64 byte boundary in the WQE, use two inline data
1425	 * segments to hold the UD header.
1426	 */
1427	spc = MLX4_INLINE_ALIGN -
1428		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
1429	if (header_size <= spc) {
1430		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
1431		memcpy(inl + 1, sqp->header_buf, header_size);
1432		i = 1;
1433	} else {
1434		inl->byte_count = cpu_to_be32(1 << 31 | spc);
1435		memcpy(inl + 1, sqp->header_buf, spc);
1436
1437		inl = (void *) (inl + 1) + spc;
1438		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
1439		/*
1440		 * Need a barrier here to make sure all the data is
1441		 * visible before the byte_count field is set.
1442		 * Otherwise the HCA prefetcher could grab the 64-byte
1443		 * chunk with this inline segment and get a valid (!=
1444		 * 0xffffffff) byte count but stale data, and end up
1445		 * generating a packet with bad headers.
1446		 *
1447		 * The first inline segment's byte_count field doesn't
1448		 * need a barrier, because it comes after a
1449		 * control/MLX segment and therefore is at an offset
1450		 * of 16 mod 64.
1451		 */
1452		wmb();
1453		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
1454		i = 2;
1455	}
1456
1457	*mlx_seg_len =
1458		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
1459	return 0;
1460}
1461
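/*
 * Return nonzero if posting nreq more work requests would overflow the
 * work queue; head and tail are re-read under the CQ lock to pick up
 * completions that raced with the caller.
 */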
1462static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
1463{
1464	unsigned cur;
1465	struct mlx4_ib_cq *cq;
1466
1467	cur = wq->head - wq->tail;
1468	if (likely(cur + nreq < wq->max_post))
1469		return 0;
1470
1471	cq = to_mcq(ib_cq);
1472	spin_lock(&cq->lock);
1473	cur = wq->head - wq->tail;
1474	spin_unlock(&cq->lock);
1475
1476	return cur + nreq >= wq->max_post;
1477}
1478
1479static __be32 convert_access(int acc)
1480{
1481	return (acc & IB_ACCESS_REMOTE_ATOMIC ? cpu_to_be32(MLX4_WQE_FMR_PERM_ATOMIC)       : 0) |
1482	       (acc & IB_ACCESS_REMOTE_WRITE  ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_WRITE) : 0) |
1483	       (acc & IB_ACCESS_REMOTE_READ   ? cpu_to_be32(MLX4_WQE_FMR_PERM_REMOTE_READ)  : 0) |
1484	       (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  : 0) |
1485		cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
1486}
1487
1488static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
1489{
1490	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
1491	int i;
1492
1493	for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i)
1494		mfrpl->mapped_page_list[i] =
1495			cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] |
1496				    MLX4_MTT_FLAG_PRESENT);
1497
1498	fseg->flags		= convert_access(wr->wr.fast_reg.access_flags);
1499	fseg->mem_key		= cpu_to_be32(wr->wr.fast_reg.rkey);
1500	fseg->buf_list		= cpu_to_be64(mfrpl->map);
1501	fseg->start_addr	= cpu_to_be64(wr->wr.fast_reg.iova_start);
1502	fseg->reg_len		= cpu_to_be64(wr->wr.fast_reg.length);
1503	fseg->offset		= 0; /* XXX -- is this just for ZBVA? */
1504	fseg->page_size		= cpu_to_be32(wr->wr.fast_reg.page_shift);
1505	fseg->reserved[0]	= 0;
1506	fseg->reserved[1]	= 0;
1507}
1508
1509static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
1510{
1511	iseg->flags	= 0;
1512	iseg->mem_key	= cpu_to_be32(rkey);
1513	iseg->guest_id	= 0;
1514	iseg->pa	= 0;
1515}
1516
1517static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
1518					  u64 remote_addr, u32 rkey)
1519{
1520	rseg->raddr    = cpu_to_be64(remote_addr);
1521	rseg->rkey     = cpu_to_be32(rkey);
1522	rseg->reserved = 0;
1523}
1524
1525static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
1526{
1527	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
1528		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
1529		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
1530	} else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
1531		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
1532		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add_mask);
1533	} else {
1534		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
1535		aseg->compare  = 0;
1536	}
1537
1538}
1539
1540static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
1541				  struct ib_send_wr *wr)
1542{
1543	aseg->swap_add		= cpu_to_be64(wr->wr.atomic.swap);
1544	aseg->swap_add_mask	= cpu_to_be64(wr->wr.atomic.swap_mask);
1545	aseg->compare		= cpu_to_be64(wr->wr.atomic.compare_add);
1546	aseg->compare_mask	= cpu_to_be64(wr->wr.atomic.compare_add_mask);
1547}
1548
1549static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
1550			     struct ib_send_wr *wr, __be16 *vlan)
1551{
1552	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
1553	dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
1554	dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
1555	dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
1556	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
1557	*vlan = dseg->vlan;
1558}
1559
1560static void set_mlx_icrc_seg(void *dseg)
1561{
1562	u32 *t = dseg;
1563	struct mlx4_wqe_inline_seg *iseg = dseg;
1564
1565	t[1] = 0;
1566
1567	/*
1568	 * Need a barrier here before writing the byte_count field to
1569	 * make sure that all the data is visible before the
1570	 * byte_count field is set.  Otherwise, if the segment begins
1571	 * a new cacheline, the HCA prefetcher could grab the 64-byte
1572	 * chunk and get a valid (!= * 0xffffffff) byte count but
1573	 * stale data, and end up sending the wrong data.
1574	 */
1575	wmb();
1576
1577	iseg->byte_count = cpu_to_be32((1 << 31) | 4);
1578}
1579
1580static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
1581{
1582	dseg->lkey       = cpu_to_be32(sg->lkey);
1583	dseg->addr       = cpu_to_be64(sg->addr);
1584
1585	/*
1586	 * Need a barrier here before writing the byte_count field to
1587	 * make sure that all the data is visible before the
1588	 * byte_count field is set.  Otherwise, if the segment begins
1589	 * a new cacheline, the HCA prefetcher could grab the 64-byte
1590	 * chunk and get a valid (!= * 0xffffffff) byte count but
1591	 * stale data, and end up sending the wrong data.
1592	 */
1593	wmb();
1594
1595	dseg->byte_count = cpu_to_be32(sg->length);
1596}
1597
1598static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
1599{
1600	dseg->byte_count = cpu_to_be32(sg->length);
1601	dseg->lkey       = cpu_to_be32(sg->lkey);
1602	dseg->addr       = cpu_to_be64(sg->addr);
1603}
1604
1605static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
1606			 struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
1607			 __be32 *lso_hdr_sz, __be32 *blh)
1608{
1609	unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
1610
1611	if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
1612		*blh = cpu_to_be32(1 << 6);
1613
1614	if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
1615		     wr->num_sge > qp->sq.max_gs - (halign >> 4)))
1616		return -EINVAL;
1617
1618	memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
1619
1620	*lso_hdr_sz  = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
1621				   wr->wr.ud.hlen);
1622	*lso_seg_len = halign;
1623	return 0;
1624}
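/*
 * Worked example (hypothetical values): with wr->wr.ud.mss = 2048 and
 * wr->wr.ud.hlen = 48, lso_hdr_sz encodes (2048 - 48) << 16 | 48, i.e.
 * the payload MSS in the upper 16 bits and the inlined header length in
 * the lower 16 bits, while halign rounds sizeof(*wqe) + 48 up to the
 * next multiple of 16 so the following segment stays aligned.
 */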
1625
1626static __be32 send_ieth(struct ib_send_wr *wr)
1627{
1628	switch (wr->opcode) {
1629	case IB_WR_SEND_WITH_IMM:
1630	case IB_WR_RDMA_WRITE_WITH_IMM:
1631		return wr->ex.imm_data;
1632
1633	case IB_WR_SEND_WITH_INV:
1634		return cpu_to_be32(wr->ex.invalidate_rkey);
1635
1636	default:
1637		return 0;
1638	}
1639}
1640
1641int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
1642		      struct ib_send_wr **bad_wr)
1643{
1644	struct mlx4_ib_qp *qp = to_mqp(ibqp);
1645	void *wqe;
1646	struct mlx4_wqe_ctrl_seg *ctrl;
1647	struct mlx4_wqe_data_seg *dseg;
1648	unsigned long flags;
1649	int nreq;
1650	int err = 0;
1651	unsigned ind;
1652	int uninitialized_var(stamp);
1653	int uninitialized_var(size);
1654	unsigned uninitialized_var(seglen);
1655	__be32 dummy;
1656	__be32 *lso_wqe;
1657	__be32 uninitialized_var(lso_hdr_sz);
1658	__be32 blh;
1659	int i;
1660	__be16 vlan = cpu_to_be16(0xffff);
1661
1662	spin_lock_irqsave(&qp->sq.lock, flags);
1663
1664	ind = qp->sq_next_wqe;
1665
1666	for (nreq = 0; wr; ++nreq, wr = wr->next) {
1667		lso_wqe = &dummy;
1668		blh = 0;
1669
1670		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
1671			err = -ENOMEM;
1672			*bad_wr = wr;
1673			goto out;
1674		}
1675
1676		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
1677			err = -EINVAL;
1678			*bad_wr = wr;
1679			goto out;
1680		}
1681
1682		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
1683		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
1684
1685		ctrl->srcrb_flags =
1686			(wr->send_flags & IB_SEND_SIGNALED ?
1687			 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
1688			(wr->send_flags & IB_SEND_SOLICITED ?
1689			 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
1690			((wr->send_flags & IB_SEND_IP_CSUM) ?
1691			 cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
1692				     MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
1693			qp->sq_signal_bits;
1694
1695		ctrl->imm = send_ieth(wr);
1696
1697		wqe += sizeof *ctrl;
1698		size = sizeof *ctrl / 16;
1699
1700		switch (ibqp->qp_type) {
1701		case IB_QPT_RC:
1702		case IB_QPT_UC:
1703			switch (wr->opcode) {
1704			case IB_WR_ATOMIC_CMP_AND_SWP:
1705			case IB_WR_ATOMIC_FETCH_AND_ADD:
1706			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
1707				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
1708					      wr->wr.atomic.rkey);
1709				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
1710
1711				set_atomic_seg(wqe, wr);
1712				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
1713
1714				size += (sizeof (struct mlx4_wqe_raddr_seg) +
1715					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
1716
1717				break;
1718
1719			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
1720				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
1721					      wr->wr.atomic.rkey);
1722				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
1723
1724				set_masked_atomic_seg(wqe, wr);
1725				wqe  += sizeof (struct mlx4_wqe_masked_atomic_seg);
1726
1727				size += (sizeof (struct mlx4_wqe_raddr_seg) +
1728					 sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
1729
1730				break;
1731
1732			case IB_WR_RDMA_READ:
1733			case IB_WR_RDMA_WRITE:
1734			case IB_WR_RDMA_WRITE_WITH_IMM:
1735				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
1736					      wr->wr.rdma.rkey);
1737				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
1738				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
1739				break;
1740
1741			case IB_WR_LOCAL_INV:
1742				ctrl->srcrb_flags |=
1743					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
1744				set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
1745				wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
1746				size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
1747				break;
1748
1749			case IB_WR_FAST_REG_MR:
1750				ctrl->srcrb_flags |=
1751					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
1752				set_fmr_seg(wqe, wr);
1753				wqe  += sizeof (struct mlx4_wqe_fmr_seg);
1754				size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
1755				break;
1756
1757			default:
1758				/* No extra segments required for sends */
1759				break;
1760			}
1761			break;
1762
1763		case IB_QPT_UD:
1764			set_datagram_seg(wqe, wr, &vlan);
1765			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
1766			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
1767
1768			if (wr->opcode == IB_WR_LSO) {
1769				err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
1770				if (unlikely(err)) {
1771					*bad_wr = wr;
1772					goto out;
1773				}
1774				lso_wqe = (__be32 *) wqe;
1775				wqe  += seglen;
1776				size += seglen / 16;
1777			}
1778			break;
1779
1780		case IB_QPT_SMI:
1781		case IB_QPT_GSI:
1782			err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
1783			if (unlikely(err)) {
1784				*bad_wr = wr;
1785				goto out;
1786			}
1787			wqe  += seglen;
1788			size += seglen / 16;
1789			break;
1790
1791		default:
1792			break;
1793		}
1794
1795		/*
1796		 * Write data segments in reverse order, so as to
1797		 * overwrite cacheline stamp last within each
1798		 * cacheline.  This avoids issues with WQE
1799		 * prefetching.
1800		 */
1801
1802		dseg = wqe;
1803		dseg += wr->num_sge - 1;
1804		size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
1805
1806		/* Add one more inline data segment for ICRC for MLX sends */
1807		if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI ||
1808			     qp->ibqp.qp_type == IB_QPT_GSI)) {
1809			set_mlx_icrc_seg(dseg + 1);
1810			size += sizeof (struct mlx4_wqe_data_seg) / 16;
1811		}
1812
1813		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
1814			set_data_seg(dseg, wr->sg_list + i);
1815
1816		/*
1817		 * Possibly overwrite stamping in cacheline with LSO
1818		 * segment only after making sure all data segments
1819		 * are written.
1820		 */
1821		wmb();
1822		*lso_wqe = lso_hdr_sz;
1823
1824		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
1825				    MLX4_WQE_CTRL_FENCE : 0) | size;
1826
1827		if (be16_to_cpu(vlan) < 0x1000) {
1828			ctrl->ins_vlan = 1 << 6;
1829			ctrl->vlan_tag = vlan;
1830		}
1831
1832		/*
1833		 * Make sure descriptor is fully written before
1834		 * setting ownership bit (because HW can start
1835		 * executing as soon as we do).
1836		 */
1837		wmb();
1838
1839		if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
1840			err = -EINVAL;
1841			goto out;
1842		}
1843
1844		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
1845			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
1846
1847		stamp = ind + qp->sq_spare_wqes;
1848		ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
1849
1850		/*
1851		 * We can improve latency by not stamping the last
1852		 * send queue WQE until after ringing the doorbell, so
1853		 * only stamp here if there are still more WQEs to post.
1854		 *
1855		 * Same optimization applies to padding with NOP wqe
1856		 * in case of WQE shrinking (used to prevent wrap-around
1857		 * in the middle of WR).
1858		 */
1859		if (wr->next) {
1860			stamp_send_wqe(qp, stamp, size * 16);
1861			ind = pad_wraparound(qp, ind);
1862		}
1863	}
1864
1865out:
1866	if (likely(nreq)) {
1867		qp->sq.head += nreq;
1868
1869		/*
1870		 * Make sure that descriptors are written before
1871		 * doorbell record.
1872		 */
1873		wmb();
1874
1875		writel(qp->doorbell_qpn,
1876		       to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
1877
1878		/*
1879		 * Make sure doorbells don't leak out of SQ spinlock
1880		 * and reach the HCA out of order.
1881		 */
1882		mmiowb();
1883
1884		stamp_send_wqe(qp, stamp, size * 16);
1885
1886		ind = pad_wraparound(qp, ind);
1887		qp->sq_next_wqe = ind;
1888	}
1889
1890	spin_unlock_irqrestore(&qp->sq.lock, flags);
1891
1892	return err;
1893}
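/*
 * Usage sketch (not part of this driver): how a kernel consumer might
 * post a single signaled RDMA WRITE through this verb, assuming the
 * pre-4.x ib_send_wr layout used in this version.  The function name
 * and the address/key arguments are illustrative placeholders.
 */
static int example_post_rdma_write(struct ib_qp *qp, u64 local_addr,
				   u32 lkey, u32 length,
				   u64 remote_addr, u32 rkey)
{
	struct ib_sge sge = {
		.addr   = local_addr,
		.length = length,
		.lkey   = lkey,
	};
	struct ib_send_wr wr = {
		.wr_id      = 1,
		.sg_list    = &sge,
		.num_sge    = 1,
		.opcode     = IB_WR_RDMA_WRITE,
		.send_flags = IB_SEND_SIGNALED,
	};
	struct ib_send_wr *bad_wr;

	wr.wr.rdma.remote_addr = remote_addr;
	wr.wr.rdma.rkey        = rkey;

	/* Dispatches to mlx4_ib_post_send(), which builds the WQE above. */
	return ib_post_send(qp, &wr, &bad_wr);
}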
1894
1895int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
1896		      struct ib_recv_wr **bad_wr)
1897{
1898	struct mlx4_ib_qp *qp = to_mqp(ibqp);
1899	struct mlx4_wqe_data_seg *scat;
1900	unsigned long flags;
1901	int err = 0;
1902	int nreq;
1903	int ind;
1904	int i;
1905
1906	spin_lock_irqsave(&qp->rq.lock, flags);
1907
1908	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
1909
1910	for (nreq = 0; wr; ++nreq, wr = wr->next) {
1911		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
1912			err = -ENOMEM;
1913			*bad_wr = wr;
1914			goto out;
1915		}
1916
1917		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
1918			err = -EINVAL;
1919			*bad_wr = wr;
1920			goto out;
1921		}
1922
1923		scat = get_recv_wqe(qp, ind);
1924
1925		for (i = 0; i < wr->num_sge; ++i)
1926			__set_data_seg(scat + i, wr->sg_list + i);
1927
1928		if (i < qp->rq.max_gs) {
1929			scat[i].byte_count = 0;
1930			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
1931			scat[i].addr       = 0;
1932		}
1933
1934		qp->rq.wrid[ind] = wr->wr_id;
1935
1936		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
1937	}
1938
1939out:
1940	if (likely(nreq)) {
1941		qp->rq.head += nreq;
1942
1943		/*
1944		 * Make sure that descriptors are written before
1945		 * doorbell record.
1946		 */
1947		wmb();
1948
1949		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
1950	}
1951
1952	spin_unlock_irqrestore(&qp->rq.lock, flags);
1953
1954	return err;
1955}
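/*
 * Companion sketch (illustrative only): posting a single receive
 * buffer.  mlx4_ib_post_recv() writes one scatter entry per SGE and,
 * as seen above, terminates a short list with MLX4_INVALID_LKEY.
 */
static int example_post_recv(struct ib_qp *qp, u64 buf_addr, u32 lkey,
			     u32 length)
{
	struct ib_sge sge = {
		.addr   = buf_addr,
		.length = length,
		.lkey   = lkey,
	};
	struct ib_recv_wr wr = {
		.wr_id   = 2,
		.sg_list = &sge,
		.num_sge = 1,
	};
	struct ib_recv_wr *bad_wr;

	return ib_post_recv(qp, &wr, &bad_wr);
}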
1956
1957static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
1958{
1959	switch (mlx4_state) {
1960	case MLX4_QP_STATE_RST:      return IB_QPS_RESET;
1961	case MLX4_QP_STATE_INIT:     return IB_QPS_INIT;
1962	case MLX4_QP_STATE_RTR:      return IB_QPS_RTR;
1963	case MLX4_QP_STATE_RTS:      return IB_QPS_RTS;
1964	case MLX4_QP_STATE_SQ_DRAINING:
1965	case MLX4_QP_STATE_SQD:      return IB_QPS_SQD;
1966	case MLX4_QP_STATE_SQER:     return IB_QPS_SQE;
1967	case MLX4_QP_STATE_ERR:      return IB_QPS_ERR;
1968	default:		     return -1;
1969	}
1970}
1971
1972static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
1973{
1974	switch (mlx4_mig_state) {
1975	case MLX4_QP_PM_ARMED:		return IB_MIG_ARMED;
1976	case MLX4_QP_PM_REARM:		return IB_MIG_REARM;
1977	case MLX4_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
1978	default: return -1;
1979	}
1980}
1981
1982static int to_ib_qp_access_flags(int mlx4_flags)
1983{
1984	int ib_flags = 0;
1985
1986	if (mlx4_flags & MLX4_QP_BIT_RRE)
1987		ib_flags |= IB_ACCESS_REMOTE_READ;
1988	if (mlx4_flags & MLX4_QP_BIT_RWE)
1989		ib_flags |= IB_ACCESS_REMOTE_WRITE;
1990	if (mlx4_flags & MLX4_QP_BIT_RAE)
1991		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
1992
1993	return ib_flags;
1994}
1995
1996static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
1997				struct mlx4_qp_path *path)
1998{
1999	struct mlx4_dev *dev = ibdev->dev;
2000	int is_eth;
2001
2002	memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
2003	ib_ah_attr->port_num	  = path->sched_queue & 0x40 ? 2 : 1;
2004
2005	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
2006		return;
2007
2008	is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) ==
2009		IB_LINK_LAYER_ETHERNET;
2010	if (is_eth)
2011		ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) |
2012		((path->sched_queue & 4) << 1);
2013	else
2014		ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
2015
2016	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
2017	ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
2018	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
2019	ib_ah_attr->ah_flags      = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
2020	if (ib_ah_attr->ah_flags) {
2021		ib_ah_attr->grh.sgid_index = path->mgid_index;
2022		ib_ah_attr->grh.hop_limit  = path->hop_limit;
2023		ib_ah_attr->grh.traffic_class =
2024			(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
2025		ib_ah_attr->grh.flow_label =
2026			be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
2027		memcpy(ib_ah_attr->grh.dgid.raw,
2028			path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
2029	}
2030}
2031
2032int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
2033		     struct ib_qp_init_attr *qp_init_attr)
2034{
2035	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
2036	struct mlx4_ib_qp *qp = to_mqp(ibqp);
2037	struct mlx4_qp_context context;
2038	int mlx4_state;
2039	int err = 0;
2040
2041	mutex_lock(&qp->mutex);
2042
2043	if (qp->state == IB_QPS_RESET) {
2044		qp_attr->qp_state = IB_QPS_RESET;
2045		goto done;
2046	}
2047
2048	err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
2049	if (err) {
2050		err = -EINVAL;
2051		goto out;
2052	}
2053
2054	mlx4_state = be32_to_cpu(context.flags) >> 28;
2055
2056	qp->state		     = to_ib_qp_state(mlx4_state);
2057	qp_attr->qp_state	     = qp->state;
2058	qp_attr->path_mtu	     = context.mtu_msgmax >> 5;
2059	qp_attr->path_mig_state	     =
2060		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
2061	qp_attr->qkey		     = be32_to_cpu(context.qkey);
2062	qp_attr->rq_psn		     = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
2063	qp_attr->sq_psn		     = be32_to_cpu(context.next_send_psn) & 0xffffff;
2064	qp_attr->dest_qp_num	     = be32_to_cpu(context.remote_qpn) & 0xffffff;
2065	qp_attr->qp_access_flags     =
2066		to_ib_qp_access_flags(be32_to_cpu(context.params2));
2067
2068	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
2069		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
2070		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
2071		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
2072		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
2073	}
2074
2075	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
2076	if (qp_attr->qp_state == IB_QPS_INIT)
2077		qp_attr->port_num = qp->port;
2078	else
2079		qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
2080
2081	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
2082	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
2083
2084	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
2085
2086	qp_attr->max_dest_rd_atomic =
2087		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
2088	qp_attr->min_rnr_timer	    =
2089		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
2090	qp_attr->timeout	    = context.pri_path.ackto >> 3;
2091	qp_attr->retry_cnt	    = (be32_to_cpu(context.params1) >> 16) & 0x7;
2092	qp_attr->rnr_retry	    = (be32_to_cpu(context.params1) >> 13) & 0x7;
2093	qp_attr->alt_timeout	    = context.alt_path.ackto >> 3;
2094
2095done:
2096	qp_attr->cur_qp_state	     = qp_attr->qp_state;
2097	qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
2098	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
2099
2100	if (!ibqp->uobject) {
2101		qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
2102		qp_attr->cap.max_send_sge = qp->sq.max_gs;
2103	} else {
2104		qp_attr->cap.max_send_wr  = 0;
2105		qp_attr->cap.max_send_sge = 0;
2106	}
2107
2108	/*
2109	 * We don't support inline sends for kernel QPs (yet), and we
2110	 * don't know what userspace's value should be.
2111	 */
2112	qp_attr->cap.max_inline_data = 0;
2113
2114	qp_init_attr->cap	     = qp_attr->cap;
2115
2116	qp_init_attr->create_flags = 0;
2117	if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
2118		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
2119
2120	if (qp->flags & MLX4_IB_QP_LSO)
2121		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
2122
2123out:
2124	mutex_unlock(&qp->mutex);
2125	return err;
2126}
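/*
 * Sketch of a caller retrieving the current state via the standard
 * verb; the attribute mask is advisory here, since mlx4_ib_query_qp()
 * fills in the whole attribute structure regardless.  Illustrative
 * only.
 */
static int example_query_state(struct ib_qp *qp, enum ib_qp_state *state)
{
	struct ib_qp_attr attr;
	struct ib_qp_init_attr init_attr;
	int err;

	err = ib_query_qp(qp, &attr, IB_QP_STATE, &init_attr);
	if (!err)
		*state = attr.qp_state;
	return err;
}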
2127
v5.9
   1/*
   2 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
   3 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
   4 *
   5 * This software is available to you under a choice of one of two
   6 * licenses.  You may choose to be licensed under the terms of the GNU
   7 * General Public License (GPL) Version 2, available from the file
   8 * COPYING in the main directory of this source tree, or the
   9 * OpenIB.org BSD license below:
  10 *
  11 *     Redistribution and use in source and binary forms, with or
  12 *     without modification, are permitted provided that the following
  13 *     conditions are met:
  14 *
  15 *      - Redistributions of source code must retain the above
  16 *        copyright notice, this list of conditions and the following
  17 *        disclaimer.
  18 *
  19 *      - Redistributions in binary form must reproduce the above
  20 *        copyright notice, this list of conditions and the following
  21 *        disclaimer in the documentation and/or other materials
  22 *        provided with the distribution.
  23 *
  24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  31 * SOFTWARE.
  32 */
  33
  34#include <linux/log2.h>
  35#include <linux/etherdevice.h>
  36#include <net/ip.h>
  37#include <linux/slab.h>
  38#include <linux/netdevice.h>
  39
  40#include <rdma/ib_cache.h>
  41#include <rdma/ib_pack.h>
  42#include <rdma/ib_addr.h>
  43#include <rdma/ib_mad.h>
  44#include <rdma/uverbs_ioctl.h>
  45
  46#include <linux/mlx4/driver.h>
  47#include <linux/mlx4/qp.h>
  48
  49#include "mlx4_ib.h"
  50#include <rdma/mlx4-abi.h>
  51
  52static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
  53			     struct mlx4_ib_cq *recv_cq);
  54static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
  55			       struct mlx4_ib_cq *recv_cq);
  56static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
  57			      struct ib_udata *udata);
  58
  59enum {
  60	MLX4_IB_ACK_REQ_FREQ	= 8,
  61};
  62
  63enum {
  64	MLX4_IB_DEFAULT_SCHED_QUEUE	= 0x83,
  65	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
  66	MLX4_IB_LINK_TYPE_IB		= 0,
  67	MLX4_IB_LINK_TYPE_ETH		= 1
  68};
  69
  70enum {
  71	/*
  72	 * Largest possible UD header: send with GRH and immediate
  73	 * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
  74	 * tag.  (LRH would only use 8 bytes, so Ethernet is the
  75	 * biggest case)
  76	 */
  77	MLX4_IB_UD_HEADER_SIZE		= 82,
  78	MLX4_IB_LSO_HEADER_SPARE	= 128,
  79};
  80
  81struct mlx4_ib_sqp {
  82	struct mlx4_ib_qp	qp;
  83	int			pkey_index;
  84	u32			qkey;
  85	u32			send_psn;
  86	struct ib_ud_header	ud_header;
  87	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
  88	struct ib_qp		*roce_v2_gsi;
  89};
  90
  91enum {
  92	MLX4_IB_MIN_SQ_STRIDE	= 6,
  93	MLX4_IB_CACHE_LINE_SIZE	= 64,
  94};
  95
  96enum {
  97	MLX4_RAW_QP_MTU		= 7,
  98	MLX4_RAW_QP_MSGMAX	= 31,
  99};
 100
 101#ifndef ETH_ALEN
 102#define ETH_ALEN        6
 103#endif
 104
 105static const __be32 mlx4_ib_opcode[] = {
 106	[IB_WR_SEND]				= cpu_to_be32(MLX4_OPCODE_SEND),
 107	[IB_WR_LSO]				= cpu_to_be32(MLX4_OPCODE_LSO),
 108	[IB_WR_SEND_WITH_IMM]			= cpu_to_be32(MLX4_OPCODE_SEND_IMM),
 109	[IB_WR_RDMA_WRITE]			= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
 110	[IB_WR_RDMA_WRITE_WITH_IMM]		= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
 111	[IB_WR_RDMA_READ]			= cpu_to_be32(MLX4_OPCODE_RDMA_READ),
 112	[IB_WR_ATOMIC_CMP_AND_SWP]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
 113	[IB_WR_ATOMIC_FETCH_AND_ADD]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
 114	[IB_WR_SEND_WITH_INV]			= cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
 115	[IB_WR_LOCAL_INV]			= cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
 116	[IB_WR_REG_MR]				= cpu_to_be32(MLX4_OPCODE_FMR),
 117	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
 118	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
 119};
 120
 121enum mlx4_ib_source_type {
 122	MLX4_IB_QP_SRC	= 0,
 123	MLX4_IB_RWQ_SRC	= 1,
 124};
 125
 126static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
 127{
 128	return container_of(mqp, struct mlx4_ib_sqp, qp);
 129}
 130
 131static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 132{
 133	if (!mlx4_is_master(dev->dev))
 134		return 0;
 135
 136	return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn &&
 137	       qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn +
 138		8 * MLX4_MFUNC_MAX;
 139}
 140
 141static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 142{
 143	int proxy_sqp = 0;
 144	int real_sqp = 0;
 145	int i;
 146	/* PPF or Native -- real SQP */
 147	real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
 148		    qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
 149		    qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3);
 150	if (real_sqp)
 151		return 1;
 152	/* VF or PF -- proxy SQP */
 153	if (mlx4_is_mfunc(dev->dev)) {
 154		for (i = 0; i < dev->dev->caps.num_ports; i++) {
 155			if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy ||
 156			    qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp1_proxy) {
 157				proxy_sqp = 1;
 158				break;
 159			}
 160		}
 161	}
 162	if (proxy_sqp)
 163		return 1;
 164
 165	return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP);
 166}
 167
 168/* used for INIT/CLOSE port logic */
 169static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 170{
 171	int proxy_qp0 = 0;
 172	int real_qp0 = 0;
 173	int i;
 174	/* PPF or Native -- real QP0 */
 175	real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
 176		    qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
 177		    qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1);
 178	if (real_qp0)
 179		return 1;
 180	/* VF or PF -- proxy QP0 */
 181	if (mlx4_is_mfunc(dev->dev)) {
 182		for (i = 0; i < dev->dev->caps.num_ports; i++) {
 183			if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy) {
 184				proxy_qp0 = 1;
 185				break;
 186			}
 187		}
 188	}
 189	return proxy_qp0;
 190}
 191
 192static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
 193{
 194	return mlx4_buf_offset(&qp->buf, offset);
 195}
 196
 197static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
 198{
 199	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
 200}
 201
 202static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
 203{
 204	return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
 205}
 206
 207/*
 208 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 209 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 210 * the very first chunk of the WQE.
 211 */
 212static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
 213{
 214	__be32 *wqe;
 215	int i;
 216	int s;
 217	void *buf;
 218	struct mlx4_wqe_ctrl_seg *ctrl;
 219
 220	buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
 221	ctrl = (struct mlx4_wqe_ctrl_seg *)buf;
 222	s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
 223	for (i = 64; i < s; i += 64) {
 224		wqe = buf + i;
 225		*wqe = cpu_to_be32(0xffffffff);
 226	}
 227}
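/*
 * Worked example: a WQE whose ctrl->qpn_vlan.fence_size is 8 spans
 * 8 * 16 = 128 bytes, so the loop above stamps 0xffffffff only at
 * offset 64; the first 64-byte chunk, which holds the control segment
 * and its ownership bit, is deliberately left untouched.
 */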
 228
 229static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
 230{
 231	struct ib_event event;
 232	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
 233
 234	if (type == MLX4_EVENT_TYPE_PATH_MIG)
 235		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
 236
 237	if (ibqp->event_handler) {
 238		event.device     = ibqp->device;
 239		event.element.qp = ibqp;
 240		switch (type) {
 241		case MLX4_EVENT_TYPE_PATH_MIG:
 242			event.event = IB_EVENT_PATH_MIG;
 243			break;
 244		case MLX4_EVENT_TYPE_COMM_EST:
 245			event.event = IB_EVENT_COMM_EST;
 246			break;
 247		case MLX4_EVENT_TYPE_SQ_DRAINED:
 248			event.event = IB_EVENT_SQ_DRAINED;
 249			break;
 250		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
 251			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
 252			break;
 253		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
 254			event.event = IB_EVENT_QP_FATAL;
 255			break;
 256		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
 257			event.event = IB_EVENT_PATH_MIG_ERR;
 258			break;
 259		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
 260			event.event = IB_EVENT_QP_REQ_ERR;
 261			break;
 262		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
 263			event.event = IB_EVENT_QP_ACCESS_ERR;
 264			break;
 265		default:
 266			pr_warn("Unexpected event type %d "
 267			       "on QP %06x\n", type, qp->qpn);
 268			return;
 269		}
 270
 271		ibqp->event_handler(&event, ibqp->qp_context);
 272	}
 273}
 274
 275static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type)
 276{
 277	pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n",
 278			    type, qp->qpn);
 279}
 280
 281static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
 282{
 283	/*
 284	 * UD WQEs must have a datagram segment.
 285	 * RC and UC WQEs might have a remote address segment.
 286	 * MLX WQEs need two extra inline data segments (for the UD
 287	 * header and space for the ICRC).
 288	 */
 289	switch (type) {
 290	case MLX4_IB_QPT_UD:
 291		return sizeof (struct mlx4_wqe_ctrl_seg) +
 292			sizeof (struct mlx4_wqe_datagram_seg) +
 293			((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
 294	case MLX4_IB_QPT_PROXY_SMI_OWNER:
 295	case MLX4_IB_QPT_PROXY_SMI:
 296	case MLX4_IB_QPT_PROXY_GSI:
 297		return sizeof (struct mlx4_wqe_ctrl_seg) +
 298			sizeof (struct mlx4_wqe_datagram_seg) + 64;
 299	case MLX4_IB_QPT_TUN_SMI_OWNER:
 300	case MLX4_IB_QPT_TUN_GSI:
 301		return sizeof (struct mlx4_wqe_ctrl_seg) +
 302			sizeof (struct mlx4_wqe_datagram_seg);
 303
 304	case MLX4_IB_QPT_UC:
 305		return sizeof (struct mlx4_wqe_ctrl_seg) +
 306			sizeof (struct mlx4_wqe_raddr_seg);
 307	case MLX4_IB_QPT_RC:
 308		return sizeof (struct mlx4_wqe_ctrl_seg) +
 309			sizeof (struct mlx4_wqe_masked_atomic_seg) +
 310			sizeof (struct mlx4_wqe_raddr_seg);
 311	case MLX4_IB_QPT_SMI:
 312	case MLX4_IB_QPT_GSI:
 313		return sizeof (struct mlx4_wqe_ctrl_seg) +
 314			ALIGN(MLX4_IB_UD_HEADER_SIZE +
 315			      DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
 316					   MLX4_INLINE_ALIGN) *
 317			      sizeof (struct mlx4_wqe_inline_seg),
 318			      sizeof (struct mlx4_wqe_data_seg)) +
 319			ALIGN(4 +
 320			      sizeof (struct mlx4_wqe_inline_seg),
 321			      sizeof (struct mlx4_wqe_data_seg));
 322	default:
 323		return sizeof (struct mlx4_wqe_ctrl_seg);
 324	}
 325}
 326
 327static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 328		       bool is_user, bool has_rq, struct mlx4_ib_qp *qp,
 329		       u32 inl_recv_sz)
 330{
 331	/* Sanity check RQ size before proceeding */
 332	if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
 333	    cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
 334		return -EINVAL;
 335
 336	if (!has_rq) {
 337		if (cap->max_recv_wr || inl_recv_sz)
 338			return -EINVAL;
 339
 340		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
 341	} else {
 342		u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg *
 343			sizeof(struct mlx4_wqe_data_seg);
 344		u32 wqe_size;
 345
 346		/* HW requires >= 1 RQ entry with >= 1 gather entry */
 347		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge ||
 348				inl_recv_sz > max_inl_recv_sz))
 349			return -EINVAL;
 350
 351		qp->rq.wqe_cnt	 = roundup_pow_of_two(max(1U, cap->max_recv_wr));
 352		qp->rq.max_gs	 = roundup_pow_of_two(max(1U, cap->max_recv_sge));
 353		wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg);
 354		qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz));
 355	}
 356
 357	/* leave userspace return values as they were, so as not to break ABI */
 358	if (is_user) {
 359		cap->max_recv_wr  = qp->rq.max_post = qp->rq.wqe_cnt;
 360		cap->max_recv_sge = qp->rq.max_gs;
 361	} else {
 362		cap->max_recv_wr  = qp->rq.max_post =
 363			min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
 364		cap->max_recv_sge = min(qp->rq.max_gs,
 365					min(dev->dev->caps.max_sq_sg,
 366					    dev->dev->caps.max_rq_sg));
 367	}
 368
 369	return 0;
 370}
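/*
 * Sizing example (hypothetical request): max_recv_wr = 100 and
 * max_recv_sge = 3 with no inline receive give wqe_cnt = 128 and
 * max_gs = 4, so each RQ WQE is 4 * sizeof(struct mlx4_wqe_data_seg) =
 * 64 bytes and wqe_shift = 6.
 */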
 371
 372static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 373			      enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
 374{
 375	int s;
 376
 377	/* Sanity check SQ size before proceeding */
 378	if (cap->max_send_wr  > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
 379	    cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
 380	    cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
 381	    sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
 382		return -EINVAL;
 383
 384	/*
 385	 * For MLX transport we need 2 extra S/G entries:
 386	 * one for the header and one for the checksum at the end
 387	 */
 388	if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
 389	     type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
 390	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
 391		return -EINVAL;
 392
 393	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
 394		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
 395		send_wqe_overhead(type, qp->flags);
 396
 397	if (s > dev->dev->caps.max_sq_desc_sz)
 398		return -EINVAL;
 399
 400	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
 401
 402	/*
 403	 * We need to leave 2 KB + 1 WR of headroom in the SQ to
 404	 * allow HW to prefetch.
 405	 */
 406	qp->sq_spare_wqes = MLX4_IB_SQ_HEADROOM(qp->sq.wqe_shift);
 407	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr +
 408					    qp->sq_spare_wqes);
 409
 410	qp->sq.max_gs =
 411		(min(dev->dev->caps.max_sq_desc_sz,
 412		     (1 << qp->sq.wqe_shift)) -
 413		 send_wqe_overhead(type, qp->flags)) /
 414		sizeof (struct mlx4_wqe_data_seg);
 415
 416	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
 417		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
 418	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
 419		qp->rq.offset = 0;
 420		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 421	} else {
 422		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
 423		qp->sq.offset = 0;
 424	}
 425
 426	cap->max_send_wr  = qp->sq.max_post =
 427		qp->sq.wqe_cnt - qp->sq_spare_wqes;
 428	cap->max_send_sge = min(qp->sq.max_gs,
 429				min(dev->dev->caps.max_sq_sg,
 430				    dev->dev->caps.max_rq_sg));
 431	/* We don't support inline sends for kernel QPs (yet) */
 432	cap->max_inline_data = 0;
 433
 434	return 0;
 435}
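/*
 * Example: if s above works out to 96 bytes for a given request, the
 * SQ stride is rounded up to 128 bytes (wqe_shift = 7), and sq.max_gs
 * becomes however many 16-byte data segments still fit in one stride
 * after the per-transport overhead is subtracted.
 */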
 436
 437static int set_user_sq_size(struct mlx4_ib_dev *dev,
 438			    struct mlx4_ib_qp *qp,
 439			    struct mlx4_ib_create_qp *ucmd)
 440{
 441	/* Sanity check SQ size before proceeding */
 442	if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes	 ||
 443	    ucmd->log_sq_stride >
 444		ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
 445	    ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
 446		return -EINVAL;
 447
 448	qp->sq.wqe_cnt   = 1 << ucmd->log_sq_bb_count;
 449	qp->sq.wqe_shift = ucmd->log_sq_stride;
 450
 451	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
 452		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
 453
 454	return 0;
 455}
 456
 457static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
 458{
 459	int i;
 460
 461	qp->sqp_proxy_rcv =
 462		kmalloc_array(qp->rq.wqe_cnt, sizeof(struct mlx4_ib_buf),
 463			      GFP_KERNEL);
 464	if (!qp->sqp_proxy_rcv)
 465		return -ENOMEM;
 466	for (i = 0; i < qp->rq.wqe_cnt; i++) {
 467		qp->sqp_proxy_rcv[i].addr =
 468			kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr),
 469				GFP_KERNEL);
 470		if (!qp->sqp_proxy_rcv[i].addr)
 471			goto err;
 472		qp->sqp_proxy_rcv[i].map =
 473			ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr,
 474					  sizeof (struct mlx4_ib_proxy_sqp_hdr),
 475					  DMA_FROM_DEVICE);
 476		if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) {
 477			kfree(qp->sqp_proxy_rcv[i].addr);
 478			goto err;
 479		}
 480	}
 481	return 0;
 482
 483err:
 484	while (i > 0) {
 485		--i;
 486		ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
 487				    sizeof (struct mlx4_ib_proxy_sqp_hdr),
 488				    DMA_FROM_DEVICE);
 489		kfree(qp->sqp_proxy_rcv[i].addr);
 490	}
 491	kfree(qp->sqp_proxy_rcv);
 492	qp->sqp_proxy_rcv = NULL;
 493	return -ENOMEM;
 494}
 495
 496static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
 497{
 498	int i;
 499
 500	for (i = 0; i < qp->rq.wqe_cnt; i++) {
 501		ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
 502				    sizeof (struct mlx4_ib_proxy_sqp_hdr),
 503				    DMA_FROM_DEVICE);
 504		kfree(qp->sqp_proxy_rcv[i].addr);
 505	}
 506	kfree(qp->sqp_proxy_rcv);
 507}
 508
 509static bool qp_has_rq(struct ib_qp_init_attr *attr)
 510{
 511	if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
 512		return false;
 513
 514	return !attr->srq;
 515}
 516
 517static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn)
 518{
 519	int i;
 520	for (i = 0; i < dev->caps.num_ports; i++) {
 521		if (qpn == dev->caps.spec_qps[i].qp0_proxy)
 522			return !!dev->caps.spec_qps[i].qp0_qkey;
 523	}
 524	return 0;
 525}
 526
 527static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev,
 528				    struct mlx4_ib_qp *qp)
 529{
 530	mutex_lock(&dev->counters_table[qp->port - 1].mutex);
 531	mlx4_counter_free(dev->dev, qp->counter_index->index);
 532	list_del(&qp->counter_index->list);
 533	mutex_unlock(&dev->counters_table[qp->port - 1].mutex);
 534
 535	kfree(qp->counter_index);
 536	qp->counter_index = NULL;
 537}
 538
 539static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx,
 540		      struct ib_qp_init_attr *init_attr,
 541		      struct mlx4_ib_create_qp_rss *ucmd)
 542{
 543	rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num |
 544		(init_attr->rwq_ind_tbl->log_ind_tbl_size << 24);
 545
 546	if ((ucmd->rx_hash_function == MLX4_IB_RX_HASH_FUNC_TOEPLITZ) &&
 547	    (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) {
 548		memcpy(rss_ctx->rss_key, ucmd->rx_hash_key,
 549		       MLX4_EN_RSS_KEY_SIZE);
 550	} else {
 551		pr_debug("RX Hash function is not supported\n");
 552		return (-EOPNOTSUPP);
 553	}
 554
 555	if (ucmd->rx_hash_fields_mask & ~(MLX4_IB_RX_HASH_SRC_IPV4	|
 556					  MLX4_IB_RX_HASH_DST_IPV4	|
 557					  MLX4_IB_RX_HASH_SRC_IPV6	|
 558					  MLX4_IB_RX_HASH_DST_IPV6	|
 559					  MLX4_IB_RX_HASH_SRC_PORT_TCP	|
 560					  MLX4_IB_RX_HASH_DST_PORT_TCP	|
 561					  MLX4_IB_RX_HASH_SRC_PORT_UDP	|
 562					  MLX4_IB_RX_HASH_DST_PORT_UDP  |
 563					  MLX4_IB_RX_HASH_INNER)) {
 564		pr_debug("RX Hash fields_mask has unsupported mask (0x%llx)\n",
 565			 ucmd->rx_hash_fields_mask);
 566		return (-EOPNOTSUPP);
 567	}
 568
 569	if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) &&
 570	    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) {
 571		rss_ctx->flags = MLX4_RSS_IPV4;
 572	} else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) ||
 573		   (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) {
 574		pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n");
 575		return (-EOPNOTSUPP);
 576	}
 577
 578	if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) &&
 579	    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) {
 580		rss_ctx->flags |= MLX4_RSS_IPV6;
 581	} else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) ||
 582		   (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) {
 583		pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n");
 584		return (-EOPNOTSUPP);
 585	}
 586
 587	if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) &&
 588	    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) {
 589		if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) {
 590			pr_debug("RX Hash fields_mask for UDP is not supported\n");
 591			return (-EOPNOTSUPP);
 592		}
 593
 594		if (rss_ctx->flags & MLX4_RSS_IPV4)
 595			rss_ctx->flags |= MLX4_RSS_UDP_IPV4;
 596		if (rss_ctx->flags & MLX4_RSS_IPV6)
 597			rss_ctx->flags |= MLX4_RSS_UDP_IPV6;
 598		if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) {
 599			pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n");
 600			return (-EOPNOTSUPP);
 601		}
 602	} else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) ||
 603		   (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) {
 604		pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n");
 605		return (-EOPNOTSUPP);
 606	}
 607
 608	if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) &&
 609	    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) {
 610		if (rss_ctx->flags & MLX4_RSS_IPV4)
 611			rss_ctx->flags |= MLX4_RSS_TCP_IPV4;
 612		if (rss_ctx->flags & MLX4_RSS_IPV6)
 613			rss_ctx->flags |= MLX4_RSS_TCP_IPV6;
 614		if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) {
 615			pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n");
 616			return (-EOPNOTSUPP);
 617		}
 618	} else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) ||
 619		   (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) {
 620		pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n");
 621		return (-EOPNOTSUPP);
 622	}
 623
 624	if (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_INNER) {
 625		if (dev->dev->caps.tunnel_offload_mode ==
 626		    MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
 627			/*
 628			 * Hash according to inner headers if exist, otherwise
 629			 * according to outer headers.
 630			 */
 631			rss_ctx->flags |= MLX4_RSS_BY_INNER_HEADERS_IPONLY;
 632		} else {
 633			pr_debug("RSS Hash for inner headers isn't supported\n");
 634			return (-EOPNOTSUPP);
 635		}
 636	}
 637
 638	return 0;
 639}
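/*
 * Example of a fields mask this function accepts (hypothetical user
 * request, with the Toeplitz hash function selected): hashing on the
 * full IPv4/TCP 4-tuple, i.e.
 *	MLX4_IB_RX_HASH_SRC_IPV4 | MLX4_IB_RX_HASH_DST_IPV4 |
 *	MLX4_IB_RX_HASH_SRC_PORT_TCP | MLX4_IB_RX_HASH_DST_PORT_TCP
 * which yields rss_ctx->flags = MLX4_RSS_IPV4 | MLX4_RSS_TCP_IPV4.
 */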
 640
 641static int create_qp_rss(struct mlx4_ib_dev *dev,
 642			 struct ib_qp_init_attr *init_attr,
 643			 struct mlx4_ib_create_qp_rss *ucmd,
 644			 struct mlx4_ib_qp *qp)
 645{
 646	int qpn;
 647	int err;
 648
 649	qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
 650
 651	err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage);
 652	if (err)
 653		return err;
 654
 655	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
 656	if (err)
 657		goto err_qpn;
 658
 659	mutex_init(&qp->mutex);
 660
 661	INIT_LIST_HEAD(&qp->gid_list);
 662	INIT_LIST_HEAD(&qp->steering_rules);
 663
 664	qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET;
 665	qp->state = IB_QPS_RESET;
 666
 667	/* Set dummy send resources to be compatible with HV and PRM */
 668	qp->sq_no_prefetch = 1;
 669	qp->sq.wqe_cnt = 1;
 670	qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
 671	qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE;
 672	qp->mtt = (to_mqp(
 673		   (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt;
 674
 675	qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL);
 676	if (!qp->rss_ctx) {
 677		err = -ENOMEM;
 678		goto err_qp_alloc;
 679	}
 680
 681	err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd);
 682	if (err)
 683		goto err;
 684
 685	return 0;
 686
 687err:
 688	kfree(qp->rss_ctx);
 689
 690err_qp_alloc:
 691	mlx4_qp_remove(dev->dev, &qp->mqp);
 692	mlx4_qp_free(dev->dev, &qp->mqp);
 693
 694err_qpn:
 695	mlx4_qp_release_range(dev->dev, qpn, 1);
 696	return err;
 697}
 698
 699static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd,
 700					    struct ib_qp_init_attr *init_attr,
 701					    struct ib_udata *udata)
 702{
 703	struct mlx4_ib_qp *qp;
 704	struct mlx4_ib_create_qp_rss ucmd = {};
 705	size_t required_cmd_sz;
 706	int err;
 707
 708	if (!udata) {
 709		pr_debug("RSS QP with NULL udata\n");
 710		return ERR_PTR(-EINVAL);
 711	}
 712
 713	if (udata->outlen)
 714		return ERR_PTR(-EOPNOTSUPP);
 715
 716	required_cmd_sz = offsetof(typeof(ucmd), reserved1) +
 717					sizeof(ucmd.reserved1);
 718	if (udata->inlen < required_cmd_sz) {
 719		pr_debug("invalid inlen\n");
 720		return ERR_PTR(-EINVAL);
 721	}
 722
 723	if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
 724		pr_debug("copy failed\n");
 725		return ERR_PTR(-EFAULT);
 726	}
 727
 728	if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)))
 729		return ERR_PTR(-EOPNOTSUPP);
 730
 731	if (ucmd.comp_mask || ucmd.reserved1)
 732		return ERR_PTR(-EOPNOTSUPP);
 733
 734	if (udata->inlen > sizeof(ucmd) &&
 735	    !ib_is_udata_cleared(udata, sizeof(ucmd),
 736				 udata->inlen - sizeof(ucmd))) {
 737		pr_debug("inlen is not supported\n");
 738		return ERR_PTR(-EOPNOTSUPP);
 739	}
 740
 741	if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
 742		pr_debug("RSS QP with unsupported QP type %d\n",
 743			 init_attr->qp_type);
 744		return ERR_PTR(-EOPNOTSUPP);
 745	}
 746
 747	if (init_attr->create_flags) {
 748		pr_debug("RSS QP doesn't support create flags\n");
 749		return ERR_PTR(-EOPNOTSUPP);
 750	}
 751
 752	if (init_attr->send_cq || init_attr->cap.max_send_wr) {
 753		pr_debug("RSS QP with unsupported send attributes\n");
 754		return ERR_PTR(-EOPNOTSUPP);
 755	}
 756
 757	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 758	if (!qp)
 759		return ERR_PTR(-ENOMEM);
 760
 761	qp->pri.vid = 0xFFFF;
 762	qp->alt.vid = 0xFFFF;
 763
 764	err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp);
 765	if (err) {
 766		kfree(qp);
 767		return ERR_PTR(err);
 768	}
 769
 770	qp->ibqp.qp_num = qp->mqp.qpn;
 771
 772	return &qp->ibqp;
 773}
 774
 775/*
 776 * This function allocates a WQN from a range that is consecutive and aligned
 777 * to its size.  If the range is full, it creates a new range and allocates
 778 * the WQN from it; the new range is then used for subsequent allocations.
 779 */
 780static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context,
 781			     struct mlx4_ib_qp *qp, int range_size, int *wqn)
 782{
 783	struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device);
 784	struct mlx4_wqn_range *range;
 785	int err = 0;
 786
 787	mutex_lock(&context->wqn_ranges_mutex);
 788
 789	range = list_first_entry_or_null(&context->wqn_ranges_list,
 790					 struct mlx4_wqn_range, list);
 791
 792	if (!range || (range->refcount == range->size) || range->dirty) {
 793		range = kzalloc(sizeof(*range), GFP_KERNEL);
 794		if (!range) {
 795			err = -ENOMEM;
 796			goto out;
 797		}
 798
 799		err = mlx4_qp_reserve_range(dev->dev, range_size,
 800					    range_size, &range->base_wqn, 0,
 801					    qp->mqp.usage);
 802		if (err) {
 803			kfree(range);
 804			goto out;
 805		}
 806
 807		range->size = range_size;
 808		list_add(&range->list, &context->wqn_ranges_list);
 809	} else if (range_size != 1) {
 810		/*
 811		 * Requesting a new range (>1) while the last range is still
 812		 * open is not valid.
 813		 */
 814		err = -EINVAL;
 815		goto out;
 816	}
 817
 818	qp->wqn_range = range;
 819
 820	*wqn = range->base_wqn + range->refcount;
 821
 822	range->refcount++;
 823
 824out:
 825	mutex_unlock(&context->wqn_ranges_mutex);
 826
 827	return err;
 828}
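/*
 * Example: a first WQ created with range_size = 8 reserves eight
 * consecutive WQNs; the next seven WQ creations with range_size = 1
 * are then served from that same open range, and only a full (or
 * dirty) range triggers a new reservation.
 */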
 829
 830static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context,
 831				struct mlx4_ib_qp *qp, bool dirty_release)
 832{
 833	struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device);
 834	struct mlx4_wqn_range *range;
 835
 836	mutex_lock(&context->wqn_ranges_mutex);
 837
 838	range = qp->wqn_range;
 839
 840	range->refcount--;
 841	if (!range->refcount) {
 842		mlx4_qp_release_range(dev->dev, range->base_wqn,
 843				      range->size);
 844		list_del(&range->list);
 845		kfree(range);
 846	} else if (dirty_release) {
 847	/*
 848	 * A range in which one of the WQNs has been destroyed cannot be
 849	 * reused for further WQN allocations.
 850	 * The next created WQ will allocate a new range.
 851	 */
 852		range->dirty = true;
 853	}
 854
 855	mutex_unlock(&context->wqn_ranges_mutex);
 856}
 857
 858static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
 859		     struct ib_udata *udata, struct mlx4_ib_qp *qp)
 860{
 861	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 862	int qpn;
 863	int err;
 864	struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
 865		udata, struct mlx4_ib_ucontext, ibucontext);
 866	struct mlx4_ib_cq *mcq;
 867	unsigned long flags;
 868	int range_size;
 869	struct mlx4_ib_create_wq wq;
 870	size_t copy_len;
 871	int shift;
 872	int n;
 873
 874	qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET;
 875
 876	mutex_init(&qp->mutex);
 877	spin_lock_init(&qp->sq.lock);
 878	spin_lock_init(&qp->rq.lock);
 879	INIT_LIST_HEAD(&qp->gid_list);
 880	INIT_LIST_HEAD(&qp->steering_rules);
 881
 882	qp->state = IB_QPS_RESET;
 883
 884	copy_len = min(sizeof(struct mlx4_ib_create_wq), udata->inlen);
 885
 886	if (ib_copy_from_udata(&wq, udata, copy_len)) {
 887		err = -EFAULT;
 888		goto err;
 889	}
 890
 891	if (wq.comp_mask || wq.reserved[0] || wq.reserved[1] ||
 892	    wq.reserved[2]) {
 893		pr_debug("user command isn't supported\n");
 894		err = -EOPNOTSUPP;
 895		goto err;
 896	}
 897
 898	if (wq.log_range_size > ilog2(dev->dev->caps.max_rss_tbl_sz)) {
 899		pr_debug("WQN range size must be equal or smaller than %d\n",
 900			 dev->dev->caps.max_rss_tbl_sz);
 901		err = -EOPNOTSUPP;
 902		goto err;
 903	}
 904	range_size = 1 << wq.log_range_size;
 905
 906	if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS)
 907		qp->flags |= MLX4_IB_QP_SCATTER_FCS;
 908
 909	err = set_rq_size(dev, &init_attr->cap, true, true, qp, qp->inl_recv_sz);
 910	if (err)
 911		goto err;
 912
 913	qp->sq_no_prefetch = 1;
 914	qp->sq.wqe_cnt = 1;
 915	qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
 916	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
 917		       (qp->sq.wqe_cnt << qp->sq.wqe_shift);
 918
 919	qp->umem = ib_umem_get(pd->device, wq.buf_addr, qp->buf_size, 0);
 920	if (IS_ERR(qp->umem)) {
 921		err = PTR_ERR(qp->umem);
 922		goto err;
 923	}
 924
 925	n = ib_umem_page_count(qp->umem);
 926	shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
 927	err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
 928
 929	if (err)
 930		goto err_buf;
 931
 932	err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
 933	if (err)
 934		goto err_mtt;
 935
 936	err = mlx4_ib_db_map_user(udata, wq.db_addr, &qp->db);
 937	if (err)
 938		goto err_mtt;
 939	qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
 940
 941	err = mlx4_ib_alloc_wqn(context, qp, range_size, &qpn);
 942	if (err)
 943		goto err_wrid;
 944
 945	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
 946	if (err)
 947		goto err_qpn;
 948
 949	/*
 950	 * Hardware wants QPN written in big-endian order (after
 951	 * shifting) for send doorbell.  Precompute this value to save
 952	 * a little bit when posting sends.
 953	 */
 954	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
 955
 956	qp->mqp.event = mlx4_ib_wq_event;
 957
 958	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
 959	mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
 960			 to_mcq(init_attr->recv_cq));
 961	/* Maintain device to QPs access, needed for further handling
 962	 * via reset flow
 963	 */
 964	list_add_tail(&qp->qps_list, &dev->qp_list);
 965	/* Maintain CQ to QPs access, needed for further handling
 966	 * via reset flow
 967	 */
 968	mcq = to_mcq(init_attr->send_cq);
 969	list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
 970	mcq = to_mcq(init_attr->recv_cq);
 971	list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
 972	mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
 973			   to_mcq(init_attr->recv_cq));
 974	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
 975	return 0;
 976
 977err_qpn:
 978	mlx4_ib_release_wqn(context, qp, 0);
 979err_wrid:
 980	mlx4_ib_db_unmap_user(context, &qp->db);
 981
 982err_mtt:
 983	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 984err_buf:
 985	ib_umem_release(qp->umem);
 986err:
 987	return err;
 988}
 989
 990static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
 991			    struct ib_udata *udata, int sqpn,
 992			    struct mlx4_ib_qp **caller_qp)
 993{
 994	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 995	int qpn;
 996	int err;
 997	struct mlx4_ib_sqp *sqp = NULL;
 998	struct mlx4_ib_qp *qp;
 999	struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
1000		udata, struct mlx4_ib_ucontext, ibucontext);
1001	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
1002	struct mlx4_ib_cq *mcq;
1003	unsigned long flags;
1004
1005	/* When tunneling special qps, we use a plain UD qp */
1006	if (sqpn) {
1007		if (mlx4_is_mfunc(dev->dev) &&
1008		    (!mlx4_is_master(dev->dev) ||
1009		     !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
1010			if (init_attr->qp_type == IB_QPT_GSI)
1011				qp_type = MLX4_IB_QPT_PROXY_GSI;
1012			else {
1013				if (mlx4_is_master(dev->dev) ||
1014				    qp0_enabled_vf(dev->dev, sqpn))
1015					qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
1016				else
1017					qp_type = MLX4_IB_QPT_PROXY_SMI;
1018			}
1019		}
1020		qpn = sqpn;
1021		/* add extra sg entry for tunneling */
1022		init_attr->cap.max_recv_sge++;
1023	} else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) {
1024		struct mlx4_ib_qp_tunnel_init_attr *tnl_init =
1025			container_of(init_attr,
1026				     struct mlx4_ib_qp_tunnel_init_attr, init_attr);
1027		if ((tnl_init->proxy_qp_type != IB_QPT_SMI &&
1028		     tnl_init->proxy_qp_type != IB_QPT_GSI)   ||
1029		    !mlx4_is_master(dev->dev))
1030			return -EINVAL;
1031		if (tnl_init->proxy_qp_type == IB_QPT_GSI)
1032			qp_type = MLX4_IB_QPT_TUN_GSI;
1033		else if (tnl_init->slave == mlx4_master_func_num(dev->dev) ||
1034			 mlx4_vf_smi_enabled(dev->dev, tnl_init->slave,
1035					     tnl_init->port))
1036			qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
1037		else
1038			qp_type = MLX4_IB_QPT_TUN_SMI;
1039		/* we are definitely in the PPF here, since we are creating
1040		 * tunnel QPs. base_tunnel_sqpn is therefore valid. */
1041		qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave
1042			+ tnl_init->proxy_qp_type * 2 + tnl_init->port - 1;
1043		sqpn = qpn;
1044	}
1045
1046	if (!*caller_qp) {
1047		if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||
1048		    (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
1049				MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
1050			sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL);
1051			if (!sqp)
1052				return -ENOMEM;
1053			qp = &sqp->qp;
1054		} else {
1055			qp = kzalloc(sizeof(struct mlx4_ib_qp), GFP_KERNEL);
1056			if (!qp)
1057				return -ENOMEM;
1058		}
1059		qp->pri.vid = 0xFFFF;
1060		qp->alt.vid = 0xFFFF;
1061	} else
1062		qp = *caller_qp;
1063
1064	qp->mlx4_ib_qp_type = qp_type;
1065
1066	mutex_init(&qp->mutex);
1067	spin_lock_init(&qp->sq.lock);
1068	spin_lock_init(&qp->rq.lock);
1069	INIT_LIST_HEAD(&qp->gid_list);
1070	INIT_LIST_HEAD(&qp->steering_rules);
1071
1072	qp->state = IB_QPS_RESET;
1073	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
1074		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
1075
1076	if (udata) {
1077		struct mlx4_ib_create_qp ucmd;
1078		size_t copy_len;
1079		int shift;
1080		int n;
1081
1082		copy_len = sizeof(struct mlx4_ib_create_qp);
1083
1084		if (ib_copy_from_udata(&ucmd, udata, copy_len)) {
1085			err = -EFAULT;
1086			goto err;
1087		}
1088
1089		qp->inl_recv_sz = ucmd.inl_recv_sz;
1090
1091		if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) {
1092			if (!(dev->dev->caps.flags &
1093			      MLX4_DEV_CAP_FLAG_FCS_KEEP)) {
1094				pr_debug("scatter FCS is unsupported\n");
1095				err = -EOPNOTSUPP;
1096				goto err;
1097			}
1098
1099			qp->flags |= MLX4_IB_QP_SCATTER_FCS;
1100		}
1101
1102		err = set_rq_size(dev, &init_attr->cap, udata,
1103				  qp_has_rq(init_attr), qp, qp->inl_recv_sz);
1104		if (err)
1105			goto err;
1106
1107		qp->sq_no_prefetch = ucmd.sq_no_prefetch;
1108
1109		err = set_user_sq_size(dev, qp, &ucmd);
1110		if (err)
1111			goto err;
1112
1113		qp->umem =
1114			ib_umem_get(pd->device, ucmd.buf_addr, qp->buf_size, 0);
1115		if (IS_ERR(qp->umem)) {
1116			err = PTR_ERR(qp->umem);
1117			goto err;
1118		}
1119
1120		n = ib_umem_page_count(qp->umem);
1121		shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
1122		err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
1123
1124		if (err)
1125			goto err_buf;
1126
1127		err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
1128		if (err)
1129			goto err_mtt;
1130
1131		if (qp_has_rq(init_attr)) {
1132			err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &qp->db);
1133			if (err)
1134				goto err_mtt;
1135		}
1136		qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
1137	} else {
1138		err = set_rq_size(dev, &init_attr->cap, udata,
1139				  qp_has_rq(init_attr), qp, 0);
1140		if (err)
1141			goto err;
1142
1143		qp->sq_no_prefetch = 0;
1144
1145		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
1146			qp->flags |= MLX4_IB_QP_LSO;
1147
1148		if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
1149			if (dev->steering_support ==
1150			    MLX4_STEERING_MODE_DEVICE_MANAGED)
1151				qp->flags |= MLX4_IB_QP_NETIF;
1152			else
1153				goto err;
1154		}
1155
1156		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
1157		if (err)
1158			goto err;
1159
1160		if (qp_has_rq(init_attr)) {
1161			err = mlx4_db_alloc(dev->dev, &qp->db, 0);
1162			if (err)
1163				goto err;
1164
1165			*qp->db.db = 0;
1166		}
1167
1168		if (mlx4_buf_alloc(dev->dev, qp->buf_size,  PAGE_SIZE * 2,
1169				   &qp->buf)) {
1170			err = -ENOMEM;
1171			goto err_db;
1172		}
1173
1174		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
1175				    &qp->mtt);
1176		if (err)
1177			goto err_buf;
1178
1179		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
1180		if (err)
1181			goto err_mtt;
1182
1183		qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt,
1184					     sizeof(u64), GFP_KERNEL);
1185		qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt,
1186					     sizeof(u64), GFP_KERNEL);
1187		if (!qp->sq.wrid || !qp->rq.wrid) {
1188			err = -ENOMEM;
1189			goto err_wrid;
1190		}
1191		qp->mqp.usage = MLX4_RES_USAGE_DRIVER;
1192	}
1193
1194	if (sqpn) {
1195		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
1196		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
1197			if (alloc_proxy_bufs(pd->device, qp)) {
1198				err = -ENOMEM;
1199				goto err_wrid;
1200			}
1201		}
1202	} else {
1203		/* Raw packet QPNs may not have bits 6,7 set in their qp_num;
1204		 * otherwise, the WQE BlueFlame setup flow wrongly causes
1205		 * VLAN insertion. */
1206		if (init_attr->qp_type == IB_QPT_RAW_PACKET)
1207			err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn,
1208						    (init_attr->cap.max_send_wr ?
1209						     MLX4_RESERVE_ETH_BF_QP : 0) |
1210						    (init_attr->cap.max_recv_wr ?
1211						     MLX4_RESERVE_A0_QP : 0),
1212						    qp->mqp.usage);
1213		else
1214			if (qp->flags & MLX4_IB_QP_NETIF)
1215				err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
1216			else
1217				err = mlx4_qp_reserve_range(dev->dev, 1, 1,
1218							    &qpn, 0, qp->mqp.usage);
1219		if (err)
1220			goto err_proxy;
1221	}
1222
1223	if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
1224		qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
1225
1226	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
1227	if (err)
1228		goto err_qpn;
1229
1230	if (init_attr->qp_type == IB_QPT_XRC_TGT)
1231		qp->mqp.qpn |= (1 << 23);
1232
1233	/*
1234	 * Hardware wants QPN written in big-endian order (after
1235	 * shifting) for send doorbell.  Precompute this value to save
1236	 * a little bit when posting sends.
1237	 */
1238	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
1239
1240	qp->mqp.event = mlx4_ib_qp_event;
1241
1242	if (!*caller_qp)
1243		*caller_qp = qp;
1244
1245	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
1246	mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
1247			 to_mcq(init_attr->recv_cq));
1248	/* Maintain device to QPs access, needed for further handling
1249	 * via reset flow
1250	 */
1251	list_add_tail(&qp->qps_list, &dev->qp_list);
1252	/* Maintain CQ to QPs access, needed for further handling
1253	 * via reset flow
1254	 */
1255	mcq = to_mcq(init_attr->send_cq);
1256	list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
1257	mcq = to_mcq(init_attr->recv_cq);
1258	list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
1259	mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
1260			   to_mcq(init_attr->recv_cq));
1261	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
1262	return 0;
1263
1264err_qpn:
1265	if (!sqpn) {
1266		if (qp->flags & MLX4_IB_QP_NETIF)
1267			mlx4_ib_steer_qp_free(dev, qpn, 1);
1268		else
1269			mlx4_qp_release_range(dev->dev, qpn, 1);
1270	}
1271err_proxy:
1272	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
1273		free_proxy_bufs(pd->device, qp);
1274err_wrid:
1275	if (udata) {
1276		if (qp_has_rq(init_attr))
1277			mlx4_ib_db_unmap_user(context, &qp->db);
1278	} else {
1279		kvfree(qp->sq.wrid);
1280		kvfree(qp->rq.wrid);
1281	}
1282
1283err_mtt:
1284	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
1285
1286err_buf:
1287		if (!qp->umem)
1288		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
1289	ib_umem_release(qp->umem);
1290
1291err_db:
1292	if (!udata && qp_has_rq(init_attr))
1293		mlx4_db_free(dev->dev, &qp->db);
1294
1295err:
1296	if (!sqp && !*caller_qp)
1297		kfree(qp);
1298	kfree(sqp);
1299
1300	return err;
1301}
1302
1303static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
1304{
1305	switch (state) {
1306	case IB_QPS_RESET:	return MLX4_QP_STATE_RST;
1307	case IB_QPS_INIT:	return MLX4_QP_STATE_INIT;
1308	case IB_QPS_RTR:	return MLX4_QP_STATE_RTR;
1309	case IB_QPS_RTS:	return MLX4_QP_STATE_RTS;
1310	case IB_QPS_SQD:	return MLX4_QP_STATE_SQD;
1311	case IB_QPS_SQE:	return MLX4_QP_STATE_SQER;
1312	case IB_QPS_ERR:	return MLX4_QP_STATE_ERR;
1313	default:		return -1;
1314	}
1315}
1316
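/*
 * Lock the send and receive CQs of a QP.  When they are different CQs,
 * always take the lock of the CQ with the lower CQN first so that all
 * callers acquire the two locks in the same order and cannot deadlock.
 */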
1317static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
1318	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
1319{
1320	if (send_cq == recv_cq) {
1321		spin_lock(&send_cq->lock);
1322		__acquire(&recv_cq->lock);
1323	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
1324		spin_lock(&send_cq->lock);
1325		spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
1326	} else {
1327		spin_lock(&recv_cq->lock);
1328		spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
1329	}
1330}
1331
1332static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
1333	__releases(&send_cq->lock) __releases(&recv_cq->lock)
1334{
1335	if (send_cq == recv_cq) {
1336		__release(&recv_cq->lock);
1337		spin_unlock(&send_cq->lock);
1338	} else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
1339		spin_unlock(&recv_cq->lock);
1340		spin_unlock(&send_cq->lock);
1341	} else {
1342		spin_unlock(&send_cq->lock);
1343		spin_unlock(&recv_cq->lock);
1344	}
1345}
1346
1347static void del_gid_entries(struct mlx4_ib_qp *qp)
1348{
1349	struct mlx4_ib_gid_entry *ge, *tmp;
1350
1351	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
1352		list_del(&ge->list);
1353		kfree(ge);
1354	}
1355}
1356
1357static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp)
1358{
1359	if (qp->ibqp.qp_type == IB_QPT_XRC_TGT)
1360		return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd);
1361	else
1362		return to_mpd(qp->ibqp.pd);
1363}
1364
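/*
 * Return the send and receive CQs used by a QP or WQ.  XRC target QPs
 * use the CQ of their XRC domain, and XRC initiator QPs have no receive
 * CQ, so the send CQ is returned for both in those cases.
 */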
1365static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src,
1366		    struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq)
1367{
1368	switch (qp->ibqp.qp_type) {
1369	case IB_QPT_XRC_TGT:
1370		*send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq);
1371		*recv_cq = *send_cq;
1372		break;
1373	case IB_QPT_XRC_INI:
1374		*send_cq = to_mcq(qp->ibqp.send_cq);
1375		*recv_cq = *send_cq;
1376		break;
1377	default:
1378		*recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) :
1379						     to_mcq(qp->ibwq.cq);
1380		*send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) :
1381						     *recv_cq;
1382		break;
1383	}
1384}
1385
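/*
 * Destroy an RSS QP: drop the RSS reference held on each child WQ, move
 * the QP to RESET if it is not there already, and release its hardware
 * resources, reserved QPN range and RSS context.
 */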
1386static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
1387{
1388	if (qp->state != IB_QPS_RESET) {
1389		int i;
1390
1391		for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size);
1392		     i++) {
1393			struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i];
1394			struct mlx4_ib_qp *wq =	to_mqp((struct ib_qp *)ibwq);
1395
1396			mutex_lock(&wq->mutex);
1397
1398			wq->rss_usecnt--;
1399
1400			mutex_unlock(&wq->mutex);
1401		}
1402
1403		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
1404				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
1405			pr_warn("modify QP %06x to RESET failed.\n",
1406				qp->mqp.qpn);
1407	}
1408
1409	mlx4_qp_remove(dev->dev, &qp->mqp);
1410	mlx4_qp_free(dev->dev, &qp->mqp);
1411	mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
1412	del_gid_entries(qp);
1413	kfree(qp->rss_ctx);
1414}
1415
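/*
 * Common teardown for non-RSS QPs: move the QP to RESET, drop any
 * MAC/VLAN registrations it holds, unlink it from the device and CQ
 * lists under the reset-flow lock, clean its CQEs for kernel QPs, and
 * free the MTT, buffers, doorbell record and work-request ID arrays.
 */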
1416static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
1417			      enum mlx4_ib_source_type src,
1418			      struct ib_udata *udata)
1419{
1420	struct mlx4_ib_cq *send_cq, *recv_cq;
1421	unsigned long flags;
1422
1423	if (qp->state != IB_QPS_RESET) {
1424		if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
1425				   MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
1426			pr_warn("modify QP %06x to RESET failed.\n",
1427			       qp->mqp.qpn);
1428		if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
1429			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
1430			qp->pri.smac = 0;
1431			qp->pri.smac_port = 0;
1432		}
1433		if (qp->alt.smac) {
1434			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
1435			qp->alt.smac = 0;
1436		}
1437		if (qp->pri.vid < 0x1000) {
1438			mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
1439			qp->pri.vid = 0xFFFF;
1440			qp->pri.candidate_vid = 0xFFFF;
1441			qp->pri.update_vid = 0;
1442		}
1443		if (qp->alt.vid < 0x1000) {
1444			mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
1445			qp->alt.vid = 0xFFFF;
1446			qp->alt.candidate_vid = 0xFFFF;
1447			qp->alt.update_vid = 0;
1448		}
1449	}
1450
1451	get_cqs(qp, src, &send_cq, &recv_cq);
1452
1453	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
1454	mlx4_ib_lock_cqs(send_cq, recv_cq);
1455
1456	/* del from lists under both locks above to protect reset flow paths */
1457	list_del(&qp->qps_list);
1458	list_del(&qp->cq_send_list);
1459	list_del(&qp->cq_recv_list);
1460	if (!udata) {
1461		__mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
1462				 qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
1463		if (send_cq != recv_cq)
1464			__mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
1465	}
1466
1467	mlx4_qp_remove(dev->dev, &qp->mqp);
1468
1469	mlx4_ib_unlock_cqs(send_cq, recv_cq);
1470	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
1471
1472	mlx4_qp_free(dev->dev, &qp->mqp);
1473
1474	if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) {
1475		if (qp->flags & MLX4_IB_QP_NETIF)
1476			mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
1477		else if (src == MLX4_IB_RWQ_SRC)
1478			mlx4_ib_release_wqn(
1479				rdma_udata_to_drv_context(
1480					udata,
1481					struct mlx4_ib_ucontext,
1482					ibucontext),
1483				qp, 1);
1484		else
1485			mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
1486	}
1487
1488	mlx4_mtt_cleanup(dev->dev, &qp->mtt);
1489
1490	if (udata) {
1491		if (qp->rq.wqe_cnt) {
1492			struct mlx4_ib_ucontext *mcontext =
1493				rdma_udata_to_drv_context(
1494					udata,
1495					struct mlx4_ib_ucontext,
1496					ibucontext);
1497
1498			mlx4_ib_db_unmap_user(mcontext, &qp->db);
1499		}
1500	} else {
1501		kvfree(qp->sq.wrid);
1502		kvfree(qp->rq.wrid);
1503		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
1504		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
1505			free_proxy_bufs(&dev->ib_dev, qp);
1506		mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
1507		if (qp->rq.wqe_cnt)
1508			mlx4_db_free(dev->dev, &qp->db);
1509	}
1510	ib_umem_release(qp->umem);
1511
1512	del_gid_entries(qp);
1513}
1514
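/*
 * Return the special QP number to use for an SMI/GSI QP: the real
 * QP0/QP1 range on native devices or on the PPF, and the per-port
 * proxy QP0/QP1 numbers on other functions under SR-IOV.
 */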
1515	static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
1516{
1517	/* Native or PPF */
1518	if (!mlx4_is_mfunc(dev->dev) ||
1519	    (mlx4_is_master(dev->dev) &&
1520	     attr->create_flags & MLX4_IB_SRIOV_SQP)) {
1521		return  dev->dev->phys_caps.base_sqpn +
1522			(attr->qp_type == IB_QPT_SMI ? 0 : 2) +
1523			attr->port_num - 1;
1524	}
1525	/* PF or VF -- creating proxies */
1526	if (attr->qp_type == IB_QPT_SMI)
1527		return dev->dev->caps.spec_qps[attr->port_num - 1].qp0_proxy;
1528	else
1529		return dev->dev->caps.spec_qps[attr->port_num - 1].qp1_proxy;
1530}
1531
1532static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
1533					struct ib_qp_init_attr *init_attr,
1534					struct ib_udata *udata)
1535{
1536	struct mlx4_ib_qp *qp = NULL;
1537	int err;
1538	int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
1539	u16 xrcdn = 0;
1540
1541	if (init_attr->rwq_ind_tbl)
1542		return _mlx4_ib_create_qp_rss(pd, init_attr, udata);
1543
1544	/*
1545	 * Only the create flags checked below are supported; userspace may
1546	 * only request multicast loopback blocking.
1547	 */
1548	if (init_attr->create_flags & ~(MLX4_IB_QP_LSO |
1549					MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
1550					MLX4_IB_SRIOV_TUNNEL_QP |
1551					MLX4_IB_SRIOV_SQP |
1552					MLX4_IB_QP_NETIF |
1553					MLX4_IB_QP_CREATE_ROCE_V2_GSI))
1554		return ERR_PTR(-EINVAL);
1555
1556	if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
1557		if (init_attr->qp_type != IB_QPT_UD)
1558			return ERR_PTR(-EINVAL);
1559	}
1560
1561	if (init_attr->create_flags) {
1562		if (udata && init_attr->create_flags & ~(sup_u_create_flags))
1563			return ERR_PTR(-EINVAL);
1564
1565		if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP |
1566						 MLX4_IB_QP_CREATE_ROCE_V2_GSI  |
1567						 MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) &&
1568		     init_attr->qp_type != IB_QPT_UD) ||
1569		    (init_attr->create_flags & MLX4_IB_SRIOV_SQP &&
1570		     init_attr->qp_type > IB_QPT_GSI) ||
1571		    (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI &&
1572		     init_attr->qp_type != IB_QPT_GSI))
1573			return ERR_PTR(-EINVAL);
1574	}
1575
1576	switch (init_attr->qp_type) {
1577	case IB_QPT_XRC_TGT:
1578		pd = to_mxrcd(init_attr->xrcd)->pd;
1579		xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
1580		init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq;
1581		fallthrough;
1582	case IB_QPT_XRC_INI:
1583		if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
1584			return ERR_PTR(-ENOSYS);
1585		init_attr->recv_cq = init_attr->send_cq;
1586		fallthrough;
1587	case IB_QPT_RC:
1588	case IB_QPT_UC:
1589	case IB_QPT_RAW_PACKET:
1590		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
1591		if (!qp)
1592			return ERR_PTR(-ENOMEM);
1593		qp->pri.vid = 0xFFFF;
1594		qp->alt.vid = 0xFFFF;
1595		fallthrough;
1596	case IB_QPT_UD:
1597	{
1598		err = create_qp_common(pd, init_attr, udata, 0, &qp);
1599		if (err) {
1600			kfree(qp);
1601			return ERR_PTR(err);
1602		}
1603
1604		qp->ibqp.qp_num = qp->mqp.qpn;
1605		qp->xrcdn = xrcdn;
1606
1607		break;
1608	}
1609	case IB_QPT_SMI:
1610	case IB_QPT_GSI:
1611	{
1612		int sqpn;
1613
1614		/* Userspace is not allowed to create special QPs: */
1615		if (udata)
1616			return ERR_PTR(-EINVAL);
1617		if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
1618			int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev,
1619							1, 1, &sqpn, 0,
1620							MLX4_RES_USAGE_DRIVER);
1621
1622			if (res)
1623				return ERR_PTR(res);
1624		} else {
1625			sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
1626		}
1627
1628		err = create_qp_common(pd, init_attr, udata, sqpn, &qp);
1629		if (err)
1630			return ERR_PTR(err);
1631
1632		qp->port	= init_attr->port_num;
1633		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 :
1634			init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1;
1635		break;
1636	}
1637	default:
1638		/* Other (legacy raw) QP types are not supported */
1639		return ERR_PTR(-EOPNOTSUPP);
1640	}
1641
1642	return &qp->ibqp;
1643}
1644
1645struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
1646				struct ib_qp_init_attr *init_attr,
1647				struct ib_udata *udata) {
1648	struct ib_device *device = pd ? pd->device : init_attr->xrcd->device;
1649	struct ib_qp *ibqp;
1650	struct mlx4_ib_dev *dev = to_mdev(device);
1651
1652	ibqp = _mlx4_ib_create_qp(pd, init_attr, udata);
1653
1654	if (!IS_ERR(ibqp) &&
1655	    (init_attr->qp_type == IB_QPT_GSI) &&
1656	    !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) {
1657		struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp)));
1658		int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num);
1659
1660		if (is_eth &&
1661		    dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
1662			init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI;
1663			sqp->roce_v2_gsi = ib_create_qp(pd, init_attr);
1664
1665			if (IS_ERR(sqp->roce_v2_gsi)) {
1666				pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi));
1667				sqp->roce_v2_gsi = NULL;
1668			} else {
1669				sqp = to_msqp(to_mqp(sqp->roce_v2_gsi));
1670				sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP;
1671			}
1672
1673			init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;
1674		}
1675	}
1676	return ibqp;
1677}
1678
1679static int _mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
1680{
1681	struct mlx4_ib_dev *dev = to_mdev(qp->device);
1682	struct mlx4_ib_qp *mqp = to_mqp(qp);
1683
1684	if (is_qp0(dev, mqp))
1685		mlx4_CLOSE_PORT(dev->dev, mqp->port);
1686
1687	if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI &&
1688	    dev->qp1_proxy[mqp->port - 1] == mqp) {
1689		mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]);
1690		dev->qp1_proxy[mqp->port - 1] = NULL;
1691		mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]);
1692	}
1693
1694	if (mqp->counter_index)
1695		mlx4_ib_free_qp_counter(dev, mqp);
1696
1697	if (qp->rwq_ind_tbl) {
1698		destroy_qp_rss(dev, mqp);
1699	} else {
1700		destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, udata);
1701	}
1702
1703	if (is_sqp(dev, mqp))
1704		kfree(to_msqp(mqp));
1705	else
1706		kfree(mqp);
1707
1708	return 0;
1709}
1710
1711int mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
1712{
1713	struct mlx4_ib_qp *mqp = to_mqp(qp);
1714
1715	if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
1716		struct mlx4_ib_sqp *sqp = to_msqp(mqp);
1717
1718		if (sqp->roce_v2_gsi)
1719			ib_destroy_qp(sqp->roce_v2_gsi);
1720	}
1721
1722	return _mlx4_ib_destroy_qp(qp, udata);
1723}
1724
1725static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
1726{
1727	switch (type) {
1728	case MLX4_IB_QPT_RC:		return MLX4_QP_ST_RC;
1729	case MLX4_IB_QPT_UC:		return MLX4_QP_ST_UC;
1730	case MLX4_IB_QPT_UD:		return MLX4_QP_ST_UD;
1731	case MLX4_IB_QPT_XRC_INI:
1732	case MLX4_IB_QPT_XRC_TGT:	return MLX4_QP_ST_XRC;
1733	case MLX4_IB_QPT_SMI:
1734	case MLX4_IB_QPT_GSI:
1735	case MLX4_IB_QPT_RAW_PACKET:	return MLX4_QP_ST_MLX;
1736
1737	case MLX4_IB_QPT_PROXY_SMI_OWNER:
1738	case MLX4_IB_QPT_TUN_SMI_OWNER:	return (mlx4_is_mfunc(dev->dev) ?
1739						MLX4_QP_ST_MLX : -1);
1740	case MLX4_IB_QPT_PROXY_SMI:
1741	case MLX4_IB_QPT_TUN_SMI:
1742	case MLX4_IB_QPT_PROXY_GSI:
1743	case MLX4_IB_QPT_TUN_GSI:	return (mlx4_is_mfunc(dev->dev) ?
1744						MLX4_QP_ST_UD : -1);
1745	default:			return -1;
1746	}
1747}
1748
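/*
 * Translate the IB remote access flags into the RRE/RAE/RWE bits of the
 * QP context.  Values that are not being modified are taken from the
 * ones cached in the QP, and when the responder depth is zero only
 * remote write access can remain enabled.
 */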
1749static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
1750				   int attr_mask)
1751{
1752	u8 dest_rd_atomic;
1753	u32 access_flags;
1754	u32 hw_access_flags = 0;
1755
1756	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1757		dest_rd_atomic = attr->max_dest_rd_atomic;
1758	else
1759		dest_rd_atomic = qp->resp_depth;
1760
1761	if (attr_mask & IB_QP_ACCESS_FLAGS)
1762		access_flags = attr->qp_access_flags;
1763	else
1764		access_flags = qp->atomic_rd_en;
1765
1766	if (!dest_rd_atomic)
1767		access_flags &= IB_ACCESS_REMOTE_WRITE;
1768
1769	if (access_flags & IB_ACCESS_REMOTE_READ)
1770		hw_access_flags |= MLX4_QP_BIT_RRE;
1771	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
1772		hw_access_flags |= MLX4_QP_BIT_RAE;
1773	if (access_flags & IB_ACCESS_REMOTE_WRITE)
1774		hw_access_flags |= MLX4_QP_BIT_RWE;
1775
1776	return cpu_to_be32(hw_access_flags);
1777}
1778
1779static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
1780			    int attr_mask)
1781{
1782	if (attr_mask & IB_QP_PKEY_INDEX)
1783		sqp->pkey_index = attr->pkey_index;
1784	if (attr_mask & IB_QP_QKEY)
1785		sqp->qkey = attr->qkey;
1786	if (attr_mask & IB_QP_SQ_PSN)
1787		sqp->send_psn = attr->sq_psn;
1788}
1789
1790static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
1791{
1792	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
1793}
1794
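/*
 * Fill in a QP context path entry from an address handle: LID, rate and
 * GRH fields for IB links and, for RoCE, the scheduling queue, DMAC and
 * SMAC/VLAN indices.  Newly registered MACs and VLANs are recorded as
 * candidates and only committed once the modify-QP command succeeds.
 */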
1795static int _mlx4_set_path(struct mlx4_ib_dev *dev,
1796			  const struct rdma_ah_attr *ah,
1797			  u64 smac, u16 vlan_tag, struct mlx4_qp_path *path,
1798			  struct mlx4_roce_smac_vlan_info *smac_info, u8 port)
1799	{
1800	int vidx;
1801	int smac_index;
1802	int err;
1803
1804	path->grh_mylmc = rdma_ah_get_path_bits(ah) & 0x7f;
1805	path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah));
1806	if (rdma_ah_get_static_rate(ah)) {
1807		path->static_rate = rdma_ah_get_static_rate(ah) +
1808				    MLX4_STAT_RATE_OFFSET;
1809		while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
1810		       !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
1811			--path->static_rate;
1812	} else
1813		path->static_rate = 0;
1814
1815	if (rdma_ah_get_ah_flags(ah) & IB_AH_GRH) {
1816		const struct ib_global_route *grh = rdma_ah_read_grh(ah);
1817		int real_sgid_index =
1818			mlx4_ib_gid_index_to_real_index(dev, grh->sgid_attr);
1819
1820		if (real_sgid_index < 0)
1821			return real_sgid_index;
1822		if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
1823			pr_err("sgid_index (%u) too large. max is %d\n",
1824			       real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
1825			return -1;
1826		}
1827
1828		path->grh_mylmc |= 1 << 7;
1829		path->mgid_index = real_sgid_index;
1830		path->hop_limit  = grh->hop_limit;
1831		path->tclass_flowlabel =
1832			cpu_to_be32((grh->traffic_class << 20) |
1833				    (grh->flow_label));
1834		memcpy(path->rgid, grh->dgid.raw, 16);
1835	}
1836
1837	if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) {
1838		if (!(rdma_ah_get_ah_flags(ah) & IB_AH_GRH))
1839			return -1;
1840
1841		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
1842			((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 7) << 3);
1843
1844		path->feup |= MLX4_FEUP_FORCE_ETH_UP;
1845		if (vlan_tag < 0x1000) {
1846			if (smac_info->vid < 0x1000) {
1847				/* both valid vlan ids */
1848				if (smac_info->vid != vlan_tag) {
1849					/* different VIDs.  unreg old and reg new */
1850					err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
1851					if (err)
1852						return err;
1853					smac_info->candidate_vid = vlan_tag;
1854					smac_info->candidate_vlan_index = vidx;
1855					smac_info->candidate_vlan_port = port;
1856					smac_info->update_vid = 1;
1857					path->vlan_index = vidx;
1858				} else {
1859					path->vlan_index = smac_info->vlan_index;
1860				}
1861			} else {
1862				/* no current vlan tag in qp */
1863				err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
1864				if (err)
1865					return err;
1866				smac_info->candidate_vid = vlan_tag;
1867				smac_info->candidate_vlan_index = vidx;
1868				smac_info->candidate_vlan_port = port;
1869				smac_info->update_vid = 1;
1870				path->vlan_index = vidx;
1871			}
1872			path->feup |= MLX4_FVL_FORCE_ETH_VLAN;
1873			path->fl = 1 << 6;
1874		} else {
1875			/* no new VLAN was requested; unregister the QP's current VLAN once modify-qp succeeds */
1876			if (smac_info->vid < 0x1000) {
1877				smac_info->candidate_vid = 0xFFFF;
1878				smac_info->update_vid = 1;
1879			}
1880		}
1881
1882		/* Get the smac_index for RoCE use.
1883		 * If no SMAC has been assigned yet, register one.
1884		 * If one was already assigned but the new MAC differs,
1885		 * unregister the old one and register the new one.
1886		 */
1887		if ((!smac_info->smac && !smac_info->smac_port) ||
1888		    smac_info->smac != smac) {
1889			/* register the candidate SMAC now; the old one is unregistered after modify-qp succeeds */
1890			smac_index = mlx4_register_mac(dev->dev, port, smac);
1891			if (smac_index >= 0) {
1892				smac_info->candidate_smac_index = smac_index;
1893				smac_info->candidate_smac = smac;
1894				smac_info->candidate_smac_port = port;
1895			} else {
1896				return -EINVAL;
1897			}
1898		} else {
1899			smac_index = smac_info->smac_index;
1900		}
1901		memcpy(path->dmac, ah->roce.dmac, 6);
1902		path->ackto = MLX4_IB_LINK_TYPE_ETH;
1903		/* put MAC table smac index for IBoE */
1904		path->grh_mylmc = (u8) (smac_index) | 0x80;
1905	} else {
1906		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
1907			((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 0xf) << 2);
1908	}
1909
1910	return 0;
1911}
1912
1913static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp,
1914			 enum ib_qp_attr_mask qp_attr_mask,
1915			 struct mlx4_ib_qp *mqp,
1916			 struct mlx4_qp_path *path, u8 port,
1917			 u16 vlan_id, u8 *smac)
1918{
1919	return _mlx4_set_path(dev, &qp->ah_attr,
1920			      mlx4_mac_to_u64(smac),
1921			      vlan_id,
1922			      path, &mqp->pri, port);
1923}
1924
1925static int mlx4_set_alt_path(struct mlx4_ib_dev *dev,
1926			     const struct ib_qp_attr *qp,
1927			     enum ib_qp_attr_mask qp_attr_mask,
1928			     struct mlx4_ib_qp *mqp,
1929			     struct mlx4_qp_path *path, u8 port)
1930{
1931	return _mlx4_set_path(dev, &qp->alt_ah_attr,
1932			      0,
1933			      0xffff,
1934			      path, &mqp->alt, port);
1935}
1936
1937static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
1938{
1939	struct mlx4_ib_gid_entry *ge, *tmp;
1940
1941	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
1942		if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
1943			ge->added = 1;
1944			ge->port = qp->port;
1945		}
1946	}
1947}
1948
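/*
 * Make sure an Ethernet UD QP has a source MAC index in its primary
 * path: if none has been assigned yet, register the port's MAC as a
 * candidate SMAC and program its index into the QP context.
 */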
1949static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev,
1950				    struct mlx4_ib_qp *qp,
1951				    struct mlx4_qp_context *context)
1952{
1953	u64 u64_mac;
1954	int smac_index;
1955
1956	u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]);
1957
1958	context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
1959	if (!qp->pri.smac && !qp->pri.smac_port) {
1960		smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
1961		if (smac_index >= 0) {
1962			qp->pri.candidate_smac_index = smac_index;
1963			qp->pri.candidate_smac = u64_mac;
1964			qp->pri.candidate_smac_port = qp->port;
1965			context->pri_path.grh_mylmc = 0x80 | (u8) smac_index;
1966		} else {
1967			return -ENOENT;
1968		}
1969	}
1970	return 0;
1971}
1972
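/*
 * Allocate a dedicated flow counter used for loopback source checking
 * on RoCE QPs created with multicast loopback blocking, when the device
 * supports the LB_SRC_CHK capability.  Other QPs need no counter here.
 */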
1973static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
1974{
1975	struct counter_index *new_counter_index;
1976	int err;
1977	u32 tmp_idx;
1978
1979	if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) !=
1980	    IB_LINK_LAYER_ETHERNET ||
1981	    !(qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) ||
1982	    !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK))
1983		return 0;
1984
1985	err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER);
1986	if (err)
1987		return err;
1988
1989	new_counter_index = kmalloc(sizeof(*new_counter_index), GFP_KERNEL);
1990	if (!new_counter_index) {
1991		mlx4_counter_free(dev->dev, tmp_idx);
1992		return -ENOMEM;
1993	}
1994
1995	new_counter_index->index = tmp_idx;
1996	new_counter_index->allocated = 1;
1997	qp->counter_index = new_counter_index;
1998
1999	mutex_lock(&dev->counters_table[qp->port - 1].mutex);
2000	list_add_tail(&new_counter_index->list,
2001		      &dev->counters_table[qp->port - 1].counters_list);
2002	mutex_unlock(&dev->counters_table[qp->port - 1].mutex);
2003
2004	return 0;
2005}
2006
2007enum {
2008	MLX4_QPC_ROCE_MODE_1 = 0,
2009	MLX4_QPC_ROCE_MODE_2 = 2,
2010	MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff
2011};
2012
2013static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
2014{
2015	switch (gid_type) {
2016	case IB_GID_TYPE_ROCE:
2017		return MLX4_QPC_ROCE_MODE_1;
2018	case IB_GID_TYPE_ROCE_UDP_ENCAP:
2019		return MLX4_QPC_ROCE_MODE_2;
2020	default:
2021		return MLX4_QPC_ROCE_MODE_UNDEFINED;
2022	}
2023}
2024
2025	/*
2026	 * Go over all of the RSS QP's child WQs and apply their HW state according to
2027	 * their logical state, if this RSS QP is the first RSS QP associated with the WQ.
2028	 */
2029static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num,
2030			    struct ib_udata *udata)
2031{
2032	int err = 0;
2033	int i;
2034
2035	for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
2036		struct ib_wq *ibwq = ind_tbl->ind_tbl[i];
2037		struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
2038
2039		mutex_lock(&wq->mutex);
2040
2041		/* mlx4_ib restriction:
2042		 * a WQ is associated with the port of the RSS QP it is
2043		 * associated with.
2044		 * If the WQ is already associated with a different port by
2045		 * another RSS QP, return a failure.
2046		 */
2047		if ((wq->rss_usecnt > 0) && (wq->port != port_num)) {
2048			err = -EINVAL;
2049			mutex_unlock(&wq->mutex);
2050			break;
2051		}
2052		wq->port = port_num;
2053		if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) {
2054			err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY, udata);
2055			if (err) {
2056				mutex_unlock(&wq->mutex);
2057				break;
2058			}
2059		}
2060		wq->rss_usecnt++;
2061
2062		mutex_unlock(&wq->mutex);
2063	}
2064
2065	if (i && err) {
2066		int j;
2067
2068		for (j = (i - 1); j >= 0; j--) {
2069			struct ib_wq *ibwq = ind_tbl->ind_tbl[j];
2070			struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
2071
2072			mutex_lock(&wq->mutex);
2073
2074			if ((wq->rss_usecnt == 1) &&
2075			    (ibwq->state == IB_WQS_RDY))
2076				if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET,
2077						       udata))
2078					pr_warn("failed to reverse WQN=0x%06x\n",
2079						ibwq->wq_num);
2080			wq->rss_usecnt--;
2081
2082			mutex_unlock(&wq->mutex);
2083		}
2084	}
2085
2086	return err;
2087}
2088
2089static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl,
2090				struct ib_udata *udata)
2091{
2092	int i;
2093
2094	for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
2095		struct ib_wq *ibwq = ind_tbl->ind_tbl[i];
2096		struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
2097
2098		mutex_lock(&wq->mutex);
2099
2100		if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY))
2101			if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET, udata))
2102				pr_warn("failed to reverse WQN=%x\n",
2103					ibwq->wq_num);
2104		wq->rss_usecnt--;
2105
2106		mutex_unlock(&wq->mutex);
2107	}
2108}
2109
2110static void fill_qp_rss_context(struct mlx4_qp_context *context,
2111				struct mlx4_ib_qp *qp)
2112{
2113	struct mlx4_rss_context *rss_context;
2114
2115	rss_context = (void *)context + offsetof(struct mlx4_qp_context,
2116			pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
2117
2118	rss_context->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz);
2119	rss_context->default_qpn =
2120		cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff);
2121	if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6))
2122		rss_context->base_qpn_udp = rss_context->default_qpn;
2123	rss_context->flags = qp->rss_ctx->flags;
2124	/* Currently only Toeplitz hashing is supported */
2125	rss_context->hash_fn = MLX4_RSS_HASH_TOP;
2126
2127	memcpy(rss_context->rss_key, qp->rss_ctx->rss_key,
2128	       MLX4_EN_RSS_KEY_SIZE);
2129}
2130
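/*
 * Build a mlx4 QP context for the requested transition and attribute
 * changes and issue the firmware modify-QP command.  Works for both
 * regular QPs and WQs (raw packet QPs used for RSS), and commits or
 * rolls back candidate MAC/VLAN registrations based on the result.
 */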
2131static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
2132			       const struct ib_qp_attr *attr, int attr_mask,
2133			       enum ib_qp_state cur_state,
2134			       enum ib_qp_state new_state,
2135			       struct ib_udata *udata)
2136{
2137	struct ib_srq  *ibsrq;
2138	const struct ib_gid_attr *gid_attr = NULL;
2139	struct ib_rwq_ind_table *rwq_ind_tbl;
2140	enum ib_qp_type qp_type;
2141	struct mlx4_ib_dev *dev;
2142	struct mlx4_ib_qp *qp;
2143	struct mlx4_ib_pd *pd;
2144	struct mlx4_ib_cq *send_cq, *recv_cq;
2145	struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
2146		udata, struct mlx4_ib_ucontext, ibucontext);
2147	struct mlx4_qp_context *context;
2148	enum mlx4_qp_optpar optpar = 0;
2149	int sqd_event;
2150	int steer_qp = 0;
2151	int err = -EINVAL;
2152	int counter_index;
2153
2154	if (src_type == MLX4_IB_RWQ_SRC) {
2155		struct ib_wq *ibwq;
2156
2157		ibwq	    = (struct ib_wq *)src;
2158		ibsrq	    = NULL;
2159		rwq_ind_tbl = NULL;
2160		qp_type     = IB_QPT_RAW_PACKET;
2161		qp	    = to_mqp((struct ib_qp *)ibwq);
2162		dev	    = to_mdev(ibwq->device);
2163		pd	    = to_mpd(ibwq->pd);
2164	} else {
2165		struct ib_qp *ibqp;
2166
2167		ibqp	    = (struct ib_qp *)src;
2168		ibsrq	    = ibqp->srq;
2169		rwq_ind_tbl = ibqp->rwq_ind_tbl;
2170		qp_type     = ibqp->qp_type;
2171		qp	    = to_mqp(ibqp);
2172		dev	    = to_mdev(ibqp->device);
2173		pd	    = get_pd(qp);
2174	}
2175
2176	/* APM is not supported under RoCE */
2177	if (attr_mask & IB_QP_ALT_PATH &&
2178	    rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
2179	    IB_LINK_LAYER_ETHERNET)
2180		return -ENOTSUPP;
2181
2182	context = kzalloc(sizeof *context, GFP_KERNEL);
2183	if (!context)
2184		return -ENOMEM;
2185
2186	context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
2187				     (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
2188
2189	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
2190		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
2191	else {
2192		optpar |= MLX4_QP_OPTPAR_PM_STATE;
2193		switch (attr->path_mig_state) {
2194		case IB_MIG_MIGRATED:
2195			context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
2196			break;
2197		case IB_MIG_REARM:
2198			context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
2199			break;
2200		case IB_MIG_ARMED:
2201			context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
2202			break;
2203		}
2204	}
2205
2206	if (qp->inl_recv_sz)
2207		context->param3 |= cpu_to_be32(1 << 25);
2208
2209	if (qp->flags & MLX4_IB_QP_SCATTER_FCS)
2210		context->param3 |= cpu_to_be32(1 << 29);
2211
2212	if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI)
2213		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
2214	else if (qp_type == IB_QPT_RAW_PACKET)
2215		context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
2216	else if (qp_type == IB_QPT_UD) {
2217		if (qp->flags & MLX4_IB_QP_LSO)
2218			context->mtu_msgmax = (IB_MTU_4096 << 5) |
2219					      ilog2(dev->dev->caps.max_gso_sz);
2220		else
2221			context->mtu_msgmax = (IB_MTU_4096 << 5) | 13;
2222	} else if (attr_mask & IB_QP_PATH_MTU) {
2223		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
2224			pr_err("path MTU (%u) is invalid\n",
2225			       attr->path_mtu);
2226			goto out;
2227		}
2228		context->mtu_msgmax = (attr->path_mtu << 5) |
2229			ilog2(dev->dev->caps.max_msg_sz);
2230	}
2231
2232	if (!rwq_ind_tbl) { /* per the PRM, the RSS receive side must be left as zeros */
2233		if (qp->rq.wqe_cnt)
2234			context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
2235		context->rq_size_stride |= qp->rq.wqe_shift - 4;
2236	}
2237
2238	if (qp->sq.wqe_cnt)
2239		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
2240	context->sq_size_stride |= qp->sq.wqe_shift - 4;
2241
2242	if (new_state == IB_QPS_RESET && qp->counter_index)
2243		mlx4_ib_free_qp_counter(dev, qp);
2244
2245	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
2246		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
2247		context->xrcd = cpu_to_be32((u32) qp->xrcdn);
2248		if (qp_type == IB_QPT_RAW_PACKET)
2249			context->param3 |= cpu_to_be32(1 << 30);
2250	}
2251
2252	if (ucontext)
2253		context->usr_page = cpu_to_be32(
2254			mlx4_to_hw_uar_index(dev->dev, ucontext->uar.index));
2255	else
2256		context->usr_page = cpu_to_be32(
2257			mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
2258
2259	if (attr_mask & IB_QP_DEST_QPN)
2260		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
2261
2262	if (attr_mask & IB_QP_PORT) {
2263		if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
2264		    !(attr_mask & IB_QP_AV)) {
2265			mlx4_set_sched(&context->pri_path, attr->port_num);
2266			optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
2267		}
2268	}
2269
2270	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
2271		err = create_qp_lb_counter(dev, qp);
2272		if (err)
2273			goto out;
2274
2275		counter_index =
2276			dev->counters_table[qp->port - 1].default_counter;
2277		if (qp->counter_index)
2278			counter_index = qp->counter_index->index;
2279
2280		if (counter_index != -1) {
2281			context->pri_path.counter_index = counter_index;
2282			optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
2283			if (qp->counter_index) {
2284				context->pri_path.fl |=
2285					MLX4_FL_ETH_SRC_CHECK_MC_LB;
2286				context->pri_path.vlan_control |=
2287					MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER;
2288			}
2289		} else
2290			context->pri_path.counter_index =
2291				MLX4_SINK_COUNTER_INDEX(dev->dev);
2292
2293		if (qp->flags & MLX4_IB_QP_NETIF) {
2294			mlx4_ib_steer_qp_reg(dev, qp, 1);
2295			steer_qp = 1;
2296		}
2297
2298		if (qp_type == IB_QPT_GSI) {
2299			enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
2300				IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE;
2301			u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
2302
2303			context->rlkey_roce_mode |= (qpc_roce_mode << 6);
2304		}
2305	}
2306
2307	if (attr_mask & IB_QP_PKEY_INDEX) {
2308		if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
2309			context->pri_path.disable_pkey_check = 0x40;
2310		context->pri_path.pkey_index = attr->pkey_index;
2311		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
2312	}
2313
2314	if (attr_mask & IB_QP_AV) {
2315		u8 port_num = mlx4_is_bonded(dev->dev) ? 1 :
2316			attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
2317		u16 vlan = 0xffff;
2318		u8 smac[ETH_ALEN];
2319		int is_eth =
2320			rdma_cap_eth_ah(&dev->ib_dev, port_num) &&
2321			rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH;
2322
2323		if (is_eth) {
2324			gid_attr = attr->ah_attr.grh.sgid_attr;
2325			err = rdma_read_gid_l2_fields(gid_attr, &vlan,
2326						      &smac[0]);
2327			if (err)
2328				goto out;
2329		}
2330
2331		if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
2332				  port_num, vlan, smac))
2333			goto out;
2334
2335		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
2336			   MLX4_QP_OPTPAR_SCHED_QUEUE);
2337
2338		if (is_eth &&
2339		    (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) {
2340			u8 qpc_roce_mode = gid_type_to_qpc(gid_attr->gid_type);
2341
2342			if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) {
2343				err = -EINVAL;
2344				goto out;
2345			}
2346			context->rlkey_roce_mode |= (qpc_roce_mode << 6);
2347		}
2348
2349	}
2350
2351	if (attr_mask & IB_QP_TIMEOUT) {
2352		context->pri_path.ackto |= attr->timeout << 3;
2353		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
2354	}
2355
2356	if (attr_mask & IB_QP_ALT_PATH) {
2357		if (attr->alt_port_num == 0 ||
2358		    attr->alt_port_num > dev->dev->caps.num_ports)
2359			goto out;
2360
2361		if (attr->alt_pkey_index >=
2362		    dev->dev->caps.pkey_table_len[attr->alt_port_num])
2363			goto out;
2364
2365		if (mlx4_set_alt_path(dev, attr, attr_mask, qp,
2366				      &context->alt_path,
2367				      attr->alt_port_num))
2368			goto out;
2369
2370		context->alt_path.pkey_index = attr->alt_pkey_index;
2371		context->alt_path.ackto = attr->alt_timeout << 3;
2372		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
2373	}
2374
2375	context->pd = cpu_to_be32(pd->pdn);
2376
2377	if (!rwq_ind_tbl) {
2378		context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
2379		get_cqs(qp, src_type, &send_cq, &recv_cq);
2380	} else { /* Set dummy CQs to be compatible with HV and PRM */
2381		send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq);
2382		recv_cq = send_cq;
2383	}
2384	context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
2385	context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
2386
2387	/* Set "fast registration enabled" for all kernel QPs */
2388	if (!ucontext)
2389		context->params1 |= cpu_to_be32(1 << 11);
2390
2391	if (attr_mask & IB_QP_RNR_RETRY) {
2392		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
2393		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
2394	}
2395
2396	if (attr_mask & IB_QP_RETRY_CNT) {
2397		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
2398		optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
2399	}
2400
2401	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
2402		if (attr->max_rd_atomic)
2403			context->params1 |=
2404				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
2405		optpar |= MLX4_QP_OPTPAR_SRA_MAX;
2406	}
2407
2408	if (attr_mask & IB_QP_SQ_PSN)
2409		context->next_send_psn = cpu_to_be32(attr->sq_psn);
2410
2411	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
2412		if (attr->max_dest_rd_atomic)
2413			context->params2 |=
2414				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
2415		optpar |= MLX4_QP_OPTPAR_RRA_MAX;
2416	}
2417
2418	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
2419		context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
2420		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
2421	}
2422
2423	if (ibsrq)
2424		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
2425
2426	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
2427		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
2428		optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
2429	}
2430	if (attr_mask & IB_QP_RQ_PSN)
2431		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
2432
2433	/* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */
2434	if (attr_mask & IB_QP_QKEY) {
2435		if (qp->mlx4_ib_qp_type &
2436		    (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
2437			context->qkey = cpu_to_be32(IB_QP_SET_QKEY);
2438		else {
2439			if (mlx4_is_mfunc(dev->dev) &&
2440			    !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) &&
2441			    (attr->qkey & MLX4_RESERVED_QKEY_MASK) ==
2442			    MLX4_RESERVED_QKEY_BASE) {
2443				pr_err("Cannot use reserved QKEY"
2444				       " 0x%x (range 0xffff0000..0xffffffff"
2445				       " is reserved)\n", attr->qkey);
2446				err = -EINVAL;
2447				goto out;
2448			}
2449			context->qkey = cpu_to_be32(attr->qkey);
2450		}
2451		optpar |= MLX4_QP_OPTPAR_Q_KEY;
2452	}
2453
2454	if (ibsrq)
2455		context->srqn = cpu_to_be32(1 << 24 |
2456					    to_msrq(ibsrq)->msrq.srqn);
2457
2458	if (qp->rq.wqe_cnt &&
2459	    cur_state == IB_QPS_RESET &&
2460	    new_state == IB_QPS_INIT)
2461		context->db_rec_addr = cpu_to_be64(qp->db.dma);
2462
2463	if (cur_state == IB_QPS_INIT &&
2464	    new_state == IB_QPS_RTR  &&
2465	    (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI ||
2466	     qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) {
2467		context->pri_path.sched_queue = (qp->port - 1) << 6;
2468		if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
2469		    qp->mlx4_ib_qp_type &
2470		    (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) {
2471			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
2472			if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI)
2473				context->pri_path.fl = 0x80;
2474		} else {
2475			if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
2476				context->pri_path.fl = 0x80;
2477			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
2478		}
2479		if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
2480		    IB_LINK_LAYER_ETHERNET) {
2481			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI ||
2482			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI)
2483				context->pri_path.feup = 1 << 7; /* don't fsm */
2484			/* handle smac_index */
2485			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
2486			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
2487			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
2488				err = handle_eth_ud_smac_index(dev, qp, context);
2489				if (err) {
2490					err = -EINVAL;
2491					goto out;
2492				}
2493				if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
2494					dev->qp1_proxy[qp->port - 1] = qp;
2495			}
2496		}
2497	}
2498
2499	if (qp_type == IB_QPT_RAW_PACKET) {
2500		context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
2501					MLX4_IB_LINK_TYPE_ETH;
2502		if (dev->dev->caps.tunnel_offload_mode ==  MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
2503			/* set QP to receive both tunneled & non-tunneled packets */
2504			if (!rwq_ind_tbl)
2505				context->srqn = cpu_to_be32(7 << 28);
2506		}
2507	}
2508
2509	if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
2510		int is_eth = rdma_port_get_link_layer(
2511				&dev->ib_dev, qp->port) ==
2512				IB_LINK_LAYER_ETHERNET;
2513		if (is_eth) {
2514			context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH;
2515			optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH;
2516		}
2517	}
2518
2519	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
2520	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
2521		sqd_event = 1;
2522	else
2523		sqd_event = 0;
2524
2525	if (!ucontext &&
2526	    cur_state == IB_QPS_RESET &&
2527	    new_state == IB_QPS_INIT)
2528		context->rlkey_roce_mode |= (1 << 4);
2529
2530	/*
2531	 * Before passing a kernel QP to the HW, make sure that the
2532	 * ownership bits of the send queue are set and the SQ
2533	 * headroom is stamped so that the hardware doesn't start
2534	 * processing stale work requests.
2535	 */
2536	if (!ucontext &&
2537	    cur_state == IB_QPS_RESET &&
2538	    new_state == IB_QPS_INIT) {
2539		struct mlx4_wqe_ctrl_seg *ctrl;
2540		int i;
2541
2542		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
2543			ctrl = get_send_wqe(qp, i);
2544			ctrl->owner_opcode = cpu_to_be32(1 << 31);
2545			ctrl->qpn_vlan.fence_size =
2546				1 << (qp->sq.wqe_shift - 4);
2547			stamp_send_wqe(qp, i);
2548		}
2549	}
2550
2551	if (rwq_ind_tbl	&&
2552	    cur_state == IB_QPS_RESET &&
2553	    new_state == IB_QPS_INIT) {
2554		fill_qp_rss_context(context, qp);
2555		context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET);
2556	}
2557
2558	err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
2559			     to_mlx4_state(new_state), context, optpar,
2560			     sqd_event, &qp->mqp);
2561	if (err)
2562		goto out;
2563
2564	qp->state = new_state;
2565
2566	if (attr_mask & IB_QP_ACCESS_FLAGS)
2567		qp->atomic_rd_en = attr->qp_access_flags;
2568	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
2569		qp->resp_depth = attr->max_dest_rd_atomic;
2570	if (attr_mask & IB_QP_PORT) {
2571		qp->port = attr->port_num;
2572		update_mcg_macs(dev, qp);
2573	}
2574	if (attr_mask & IB_QP_ALT_PATH)
2575		qp->alt_port = attr->alt_port_num;
2576
2577	if (is_sqp(dev, qp))
2578		store_sqp_attrs(to_msqp(qp), attr, attr_mask);
2579
2580	/*
2581	 * If we moved QP0 to RTR, bring the IB link up; if we moved
2582	 * QP0 to RESET or ERROR, bring the link back down.
2583	 */
2584	if (is_qp0(dev, qp)) {
2585		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
2586			if (mlx4_INIT_PORT(dev->dev, qp->port))
2587				pr_warn("INIT_PORT failed for port %d\n",
2588				       qp->port);
2589
2590		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
2591		    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
2592			mlx4_CLOSE_PORT(dev->dev, qp->port);
2593	}
2594
2595	/*
2596	 * If we moved a kernel QP to RESET, clean up all old CQ
2597	 * entries and reinitialize the QP.
2598	 */
2599	if (new_state == IB_QPS_RESET) {
2600		if (!ucontext) {
2601			mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
2602					 ibsrq ? to_msrq(ibsrq) : NULL);
2603			if (send_cq != recv_cq)
2604				mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
2605
2606			qp->rq.head = 0;
2607			qp->rq.tail = 0;
2608			qp->sq.head = 0;
2609			qp->sq.tail = 0;
2610			qp->sq_next_wqe = 0;
2611			if (qp->rq.wqe_cnt)
2612				*qp->db.db  = 0;
2613
2614			if (qp->flags & MLX4_IB_QP_NETIF)
2615				mlx4_ib_steer_qp_reg(dev, qp, 0);
2616		}
2617		if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
2618			mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
2619			qp->pri.smac = 0;
2620			qp->pri.smac_port = 0;
2621		}
2622		if (qp->alt.smac) {
2623			mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
2624			qp->alt.smac = 0;
2625		}
2626		if (qp->pri.vid < 0x1000) {
2627			mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
2628			qp->pri.vid = 0xFFFF;
2629			qp->pri.candidate_vid = 0xFFFF;
2630			qp->pri.update_vid = 0;
2631		}
2632
2633		if (qp->alt.vid < 0x1000) {
2634			mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
2635			qp->alt.vid = 0xFFFF;
2636			qp->alt.candidate_vid = 0xFFFF;
2637			qp->alt.update_vid = 0;
2638		}
2639	}
2640out:
2641	if (err && qp->counter_index)
2642		mlx4_ib_free_qp_counter(dev, qp);
2643	if (err && steer_qp)
2644		mlx4_ib_steer_qp_reg(dev, qp, 0);
2645	kfree(context);
2646	if (qp->pri.candidate_smac ||
2647	    (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) {
2648		if (err) {
2649			mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac);
2650		} else {
2651			if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port))
2652				mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
2653			qp->pri.smac = qp->pri.candidate_smac;
2654			qp->pri.smac_index = qp->pri.candidate_smac_index;
2655			qp->pri.smac_port = qp->pri.candidate_smac_port;
2656		}
2657		qp->pri.candidate_smac = 0;
2658		qp->pri.candidate_smac_index = 0;
2659		qp->pri.candidate_smac_port = 0;
2660	}
2661	if (qp->alt.candidate_smac) {
2662		if (err) {
2663			mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac);
2664		} else {
2665			if (qp->alt.smac)
2666				mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
2667			qp->alt.smac = qp->alt.candidate_smac;
2668			qp->alt.smac_index = qp->alt.candidate_smac_index;
2669			qp->alt.smac_port = qp->alt.candidate_smac_port;
2670		}
2671		qp->alt.candidate_smac = 0;
2672		qp->alt.candidate_smac_index = 0;
2673		qp->alt.candidate_smac_port = 0;
2674	}
2675
2676	if (qp->pri.update_vid) {
2677		if (err) {
2678			if (qp->pri.candidate_vid < 0x1000)
2679				mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port,
2680						     qp->pri.candidate_vid);
2681		} else {
2682			if (qp->pri.vid < 0x1000)
2683				mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port,
2684						     qp->pri.vid);
2685			qp->pri.vid = qp->pri.candidate_vid;
2686			qp->pri.vlan_port = qp->pri.candidate_vlan_port;
2687			qp->pri.vlan_index =  qp->pri.candidate_vlan_index;
2688		}
2689		qp->pri.candidate_vid = 0xFFFF;
2690		qp->pri.update_vid = 0;
2691	}
2692
2693	if (qp->alt.update_vid) {
2694		if (err) {
2695			if (qp->alt.candidate_vid < 0x1000)
2696				mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port,
2697						     qp->alt.candidate_vid);
2698		} else {
2699			if (qp->alt.vid < 0x1000)
2700				mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port,
2701						     qp->alt.vid);
2702			qp->alt.vid = qp->alt.candidate_vid;
2703			qp->alt.vlan_port = qp->alt.candidate_vlan_port;
2704			qp->alt.vlan_index =  qp->alt.candidate_vlan_index;
2705		}
2706		qp->alt.candidate_vid = 0xFFFF;
2707		qp->alt.update_vid = 0;
2708	}
2709
2710	return err;
2711}
2712
2713enum {
2714	MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE	|
2715					      IB_QP_PORT),
2716};
2717
2718static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
2719			      int attr_mask, struct ib_udata *udata)
2720{
2721	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
2722	struct mlx4_ib_qp *qp = to_mqp(ibqp);
2723	enum ib_qp_state cur_state, new_state;
2724	int err = -EINVAL;
2725	mutex_lock(&qp->mutex);
2726
2727	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
2728	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
2729
2730	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
2731				attr_mask)) {
2732		pr_debug("qpn 0x%x: invalid attribute mask specified "
2733			 "for transition %d to %d. qp_type %d,"
2734			 " attr_mask 0x%x\n",
2735			 ibqp->qp_num, cur_state, new_state,
2736			 ibqp->qp_type, attr_mask);
2737		goto out;
2738	}
2739
2740	if (ibqp->rwq_ind_tbl) {
2741		if (!(((cur_state == IB_QPS_RESET) &&
2742		       (new_state == IB_QPS_INIT)) ||
2743		      ((cur_state == IB_QPS_INIT)  &&
2744		       (new_state == IB_QPS_RTR)))) {
2745			pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n",
2746				 ibqp->qp_num, cur_state, new_state);
2747
2748			err = -EOPNOTSUPP;
2749			goto out;
2750		}
2751
2752		if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) {
2753			pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n",
2754				 ibqp->qp_num, attr_mask, cur_state, new_state);
2755
2756			err = -EOPNOTSUPP;
2757			goto out;
2758		}
2759	}
2760
2761	if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) {
2762		if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) {
2763			if ((ibqp->qp_type == IB_QPT_RC) ||
2764			    (ibqp->qp_type == IB_QPT_UD) ||
2765			    (ibqp->qp_type == IB_QPT_UC) ||
2766			    (ibqp->qp_type == IB_QPT_RAW_PACKET) ||
2767			    (ibqp->qp_type == IB_QPT_XRC_INI)) {
2768				attr->port_num = mlx4_ib_bond_next_port(dev);
2769			}
2770		} else {
2771			/* no sense in changing port_num
2772			 * when ports are bonded */
2773			attr_mask &= ~IB_QP_PORT;
2774		}
2775	}
2776
2777	if ((attr_mask & IB_QP_PORT) &&
2778	    (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
2779		pr_debug("qpn 0x%x: invalid port number (%d) specified "
2780			 "for transition %d to %d. qp_type %d\n",
2781			 ibqp->qp_num, attr->port_num, cur_state,
2782			 new_state, ibqp->qp_type);
2783		goto out;
2784	}
2785
2786	if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) &&
2787	    (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) !=
2788	     IB_LINK_LAYER_ETHERNET))
2789		goto out;
2790
2791	if (attr_mask & IB_QP_PKEY_INDEX) {
2792		int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
2793		if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
2794			pr_debug("qpn 0x%x: invalid pkey index (%d) specified "
2795				 "for transition %d to %d. qp_type %d\n",
2796				 ibqp->qp_num, attr->pkey_index, cur_state,
2797				 new_state, ibqp->qp_type);
2798			goto out;
2799		}
2800	}
2801
2802	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
2803	    attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
2804		pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. "
2805			 "Transition %d to %d. qp_type %d\n",
2806			 ibqp->qp_num, attr->max_rd_atomic, cur_state,
2807			 new_state, ibqp->qp_type);
2808		goto out;
2809	}
2810
2811	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
2812	    attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
2813		pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
2814			 "Transition %d to %d. qp_type %d\n",
2815			 ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
2816			 new_state, ibqp->qp_type);
2817		goto out;
2818	}
2819
2820	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
2821		err = 0;
2822		goto out;
2823	}
2824
2825	if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) {
2826		err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num,
2827				       udata);
2828		if (err)
2829			goto out;
2830	}
2831
2832	err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask,
2833				  cur_state, new_state, udata);
2834
2835	if (ibqp->rwq_ind_tbl && err)
2836		bring_down_rss_rwqs(ibqp->rwq_ind_tbl, udata);
2837
2838	if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
2839		attr->port_num = 1;
2840
2841out:
2842	mutex_unlock(&qp->mutex);
2843	return err;
2844}
2845
2846int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
2847		      int attr_mask, struct ib_udata *udata)
2848{
2849	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
2850	int ret;
2851
2852	ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata);
2853
2854	if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
2855		struct mlx4_ib_sqp *sqp = to_msqp(mqp);
2856		int err = 0;
2857
2858		if (sqp->roce_v2_gsi)
2859			err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask);
2860		if (err)
2861			pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n",
2862			       err);
2863	}
2864	return ret;
2865}
2866
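/* Look up the QP0 qkey that matches a proxy or tunnel QP0 QPN on a VF. */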
2867static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
2868{
2869	int i;
2870	for (i = 0; i < dev->caps.num_ports; i++) {
2871		if (qpn == dev->caps.spec_qps[i].qp0_proxy ||
2872		    qpn == dev->caps.spec_qps[i].qp0_tunnel) {
2873			*qkey = dev->caps.spec_qps[i].qp0_qkey;
2874			return 0;
2875		}
2876	}
2877	return -EINVAL;
2878}
2879
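/*
 * Build the MLX and inline segments for a QP0 send on a proxy or tunnel
 * QP under SR-IOV: construct a UD header addressed to the peer special
 * QP and copy it into one or two inline data segments of the WQE.
 */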
2880static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
2881				  const struct ib_ud_wr *wr,
2882				  void *wqe, unsigned *mlx_seg_len)
2883{
2884	struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device);
2885	struct ib_device *ib_dev = &mdev->ib_dev;
2886	struct mlx4_wqe_mlx_seg *mlx = wqe;
2887	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
2888	struct mlx4_ib_ah *ah = to_mah(wr->ah);
2889	u16 pkey;
2890	u32 qkey;
2891	int send_size;
2892	int header_size;
2893	int spc;
2894	int err;
2895	int i;
2896
2897	if (wr->wr.opcode != IB_WR_SEND)
2898		return -EINVAL;
2899
2900	send_size = 0;
2901
2902	for (i = 0; i < wr->wr.num_sge; ++i)
2903		send_size += wr->wr.sg_list[i].length;
2904
2905	/* for proxy-qp0 sends, need to add in size of tunnel header */
2906	/* for tunnel-qp0 sends, tunnel header is already in s/g list */
2907	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
2908		send_size += sizeof (struct mlx4_ib_tunnel_header);
2909
2910	ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header);
2911
2912	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
2913		sqp->ud_header.lrh.service_level =
2914			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
2915		sqp->ud_header.lrh.destination_lid =
2916			cpu_to_be16(ah->av.ib.g_slid & 0x7f);
2917		sqp->ud_header.lrh.source_lid =
2918			cpu_to_be16(ah->av.ib.g_slid & 0x7f);
2919	}
2920
2921	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
2922
2923	/* force loopback */
2924	mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR);
2925	mlx->rlid = sqp->ud_header.lrh.destination_lid;
2926
2927	sqp->ud_header.lrh.virtual_lane    = 0;
2928	sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
2929	err = ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey);
2930	if (err)
2931		return err;
2932	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
2933	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
2934		sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn);
2935	else
2936		sqp->ud_header.bth.destination_qpn =
2937			cpu_to_be32(mdev->dev->caps.spec_qps[sqp->qp.port - 1].qp0_tunnel);
2938
2939	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
2940	if (mlx4_is_master(mdev->dev)) {
2941		if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
2942			return -EINVAL;
2943	} else {
2944		if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
2945			return -EINVAL;
2946	}
2947	sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
2948	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
2949
2950	sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
2951	sqp->ud_header.immediate_present = 0;
2952
2953	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
2954
2955	/*
2956	 * Inline data segments may not cross a 64 byte boundary.  If
2957	 * our UD header is bigger than the space available up to the
2958	 * next 64 byte boundary in the WQE, use two inline data
2959	 * segments to hold the UD header.
2960	 */
2961	spc = MLX4_INLINE_ALIGN -
2962	      ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
2963	if (header_size <= spc) {
2964		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
2965		memcpy(inl + 1, sqp->header_buf, header_size);
2966		i = 1;
2967	} else {
2968		inl->byte_count = cpu_to_be32(1 << 31 | spc);
2969		memcpy(inl + 1, sqp->header_buf, spc);
2970
2971		inl = (void *) (inl + 1) + spc;
2972		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
2973		/*
2974		 * Need a barrier here to make sure all the data is
2975		 * visible before the byte_count field is set.
2976		 * Otherwise the HCA prefetcher could grab the 64-byte
2977		 * chunk with this inline segment and get a valid (!=
2978		 * 0xffffffff) byte count but stale data, and end up
2979		 * generating a packet with bad headers.
2980		 *
2981		 * The first inline segment's byte_count field doesn't
2982		 * need a barrier, because it comes after a
2983		 * control/MLX segment and therefore is at an offset
2984		 * of 16 mod 64.
2985		 */
2986		wmb();
2987		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
2988		i = 2;
2989	}
2990
2991	*mlx_seg_len =
2992	ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
2993	return 0;
2994}
2995
2996static u8 sl_to_vl(struct mlx4_ib_dev *dev, u8 sl, int port_num)
2997{
2998	union sl2vl_tbl_to_u64 tmp_vltab;
2999	u8 vl;
3000
3001	if (sl > 15)
3002		return 0xf;
3003	tmp_vltab.sl64 = atomic64_read(&dev->sl2vl[port_num - 1]);
3004	vl = tmp_vltab.sl8[sl >> 1];
3005	if (sl & 1)
3006		vl &= 0x0f;
3007	else
3008		vl >>= 4;
3009	return vl;
3010}
3011
3012static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num,
3013				int index, union ib_gid *gid,
3014				enum ib_gid_type *gid_type)
3015{
3016	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
3017	struct mlx4_port_gid_table *port_gid_table;
3018	unsigned long flags;
3019
3020	port_gid_table = &iboe->gids[port_num - 1];
3021	spin_lock_irqsave(&iboe->lock, flags);
3022	memcpy(gid, &port_gid_table->gids[index].gid, sizeof(*gid));
3023	*gid_type = port_gid_table->gids[index].gid_type;
3024	spin_unlock_irqrestore(&iboe->lock, flags);
3025	if (rdma_is_zero_gid(gid))
3026		return -ENOENT;
3027
3028	return 0;
3029}
3030
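/*
 * build_mlx_header() constructs the wire header for MLX (QP0/QP1)
 * sends - LRH/BTH/DETH for IB ports, or Ethernet (optionally with
 * VLAN, IPv4/IPv6 and UDP for RoCE v2) - and writes it into the WQE
 * as one or two inline data segments.
 */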
3031#define MLX4_ROCEV2_QP1_SPORT 0xC000
3032static int build_mlx_header(struct mlx4_ib_sqp *sqp, const struct ib_ud_wr *wr,
3033			    void *wqe, unsigned *mlx_seg_len)
3034{
3035	struct ib_device *ib_dev = sqp->qp.ibqp.device;
3036	struct mlx4_ib_dev *ibdev = to_mdev(ib_dev);
3037	struct mlx4_wqe_mlx_seg *mlx = wqe;
3038	struct mlx4_wqe_ctrl_seg *ctrl = wqe;
3039	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
3040	struct mlx4_ib_ah *ah = to_mah(wr->ah);
3041	union ib_gid sgid;
3042	u16 pkey;
3043	int send_size;
3044	int header_size;
3045	int spc;
3046	int i;
3047	int err = 0;
3048	u16 vlan = 0xffff;
3049	bool is_eth;
3050	bool is_vlan = false;
3051	bool is_grh;
3052	bool is_udp = false;
3053	int ip_version = 0;
3054
3055	send_size = 0;
3056	for (i = 0; i < wr->wr.num_sge; ++i)
3057		send_size += wr->wr.sg_list[i].length;
3058
3059	is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
3060	is_grh = mlx4_ib_ah_grh_present(ah);
3061	if (is_eth) {
3062		enum ib_gid_type gid_type;
3063		if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
3064			/* When multi-function is enabled, the ib_core gid
3065			 * indexes don't necessarily match the hw ones, so
3066			 * we must use our own cache */
3067			err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev,
3068							   be32_to_cpu(ah->av.ib.port_pd) >> 24,
3069							   ah->av.ib.gid_index, &sgid.raw[0]);
3070			if (err)
3071				return err;
3072		} else  {
3073			err = fill_gid_by_hw_index(ibdev, sqp->qp.port,
3074					    ah->av.ib.gid_index,
3075					    &sgid, &gid_type);
3076			if (!err) {
3077				is_udp = gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
3078				if (is_udp) {
3079					if (ipv6_addr_v4mapped((struct in6_addr *)&sgid))
3080						ip_version = 4;
3081					else
3082						ip_version = 6;
3083					is_grh = false;
3084				}
3085			} else {
3086				return err;
3087			}
3088		}
3089		if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
3090			vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
3091			is_vlan = true;
3092		}
3093	}
3094	err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
3095			  ip_version, is_udp, 0, &sqp->ud_header);
3096	if (err)
3097		return err;
3098
3099	if (!is_eth) {
3100		sqp->ud_header.lrh.service_level =
3101			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
3102		sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
3103		sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
3104	}
3105
3106	if (is_grh || (ip_version == 6)) {
3107		sqp->ud_header.grh.traffic_class =
3108			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
3109		sqp->ud_header.grh.flow_label    =
3110			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
3111		sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit;
3112		if (is_eth) {
3113			memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
3114		} else {
3115			if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
3116				/* When multi-function is enabled, the ib_core gid
3117				 * indexes don't necessarily match the hw ones, so
3118				 * we must use our own cache
3119				 */
3120				sqp->ud_header.grh.source_gid.global.subnet_prefix =
3121					cpu_to_be64(atomic64_read(&(to_mdev(ib_dev)->sriov.
3122								    demux[sqp->qp.port - 1].
3123								    subnet_prefix)));
3124				sqp->ud_header.grh.source_gid.global.interface_id =
3125					to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
3126						       guid_cache[ah->av.ib.gid_index];
3127			} else {
3128				sqp->ud_header.grh.source_gid =
3129					ah->ibah.sgid_attr->gid;
3130			}
3131		}
3132		memcpy(sqp->ud_header.grh.destination_gid.raw,
3133		       ah->av.ib.dgid, 16);
3134	}
3135
3136	if (ip_version == 4) {
3137		sqp->ud_header.ip4.tos =
3138			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
3139		sqp->ud_header.ip4.id = 0;
3140		sqp->ud_header.ip4.frag_off = htons(IP_DF);
3141		sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit;
3142
3143		memcpy(&sqp->ud_header.ip4.saddr,
3144		       sgid.raw + 12, 4);
3145		memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4);
3146		sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header);
3147	}
3148
3149	if (is_udp) {
3150		sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT);
3151		sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT);
3152		sqp->ud_header.udp.csum = 0;
3153	}
3154
3155	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
3156
3157	if (!is_eth) {
3158		mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
3159					  (sqp->ud_header.lrh.destination_lid ==
3160					   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
3161					  (sqp->ud_header.lrh.service_level << 8));
3162		if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
3163			mlx->flags |= cpu_to_be32(0x1); /* force loopback */
3164		mlx->rlid = sqp->ud_header.lrh.destination_lid;
3165	}
3166
3167	switch (wr->wr.opcode) {
3168	case IB_WR_SEND:
3169		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY;
3170		sqp->ud_header.immediate_present = 0;
3171		break;
3172	case IB_WR_SEND_WITH_IMM:
3173		sqp->ud_header.bth.opcode	 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
3174		sqp->ud_header.immediate_present = 1;
3175		sqp->ud_header.immediate_data    = wr->wr.ex.imm_data;
3176		break;
3177	default:
3178		return -EINVAL;
3179	}
3180
3181	if (is_eth) {
3182		struct in6_addr in6;
3183		u16 ether_type;
3184		u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
3185
3186		ether_type = (!is_udp) ? ETH_P_IBOE :
3187			(ip_version == 4 ? ETH_P_IP : ETH_P_IPV6);
3188
3189		mlx->sched_prio = cpu_to_be16(pcp);
3190
3191		ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac);
3192		memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
3193		memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
3194		memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
3195		memcpy(&in6, sgid.raw, sizeof(in6));
3196
3197
3198		if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
3199			mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
3200		if (!is_vlan) {
3201			sqp->ud_header.eth.type = cpu_to_be16(ether_type);
3202		} else {
3203			sqp->ud_header.vlan.type = cpu_to_be16(ether_type);
3204			sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
3205		}
3206	} else {
3207		sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 :
3208							sl_to_vl(to_mdev(ib_dev),
3209								 sqp->ud_header.lrh.service_level,
3210								 sqp->qp.port);
3211		if (sqp->qp.ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15)
3212			return -EINVAL;
3213		if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
3214			sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
3215	}
3216	sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
3217	if (!sqp->qp.ibqp.qp_num)
3218		err = ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index,
3219					 &pkey);
3220	else
3221		err = ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->pkey_index,
3222					 &pkey);
3223	if (err)
3224		return err;
3225
3226	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
3227	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn);
3228	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
3229	sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ?
3230					       sqp->qkey : wr->remote_qkey);
3231	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
3232
3233	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
3234
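	/* Debug aid, compiled out: dumps the packed UD header as 32-bit words. */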
3235	if (0) {
3236		pr_err("built UD header of size %d:\n", header_size);
3237		for (i = 0; i < header_size / 4; ++i) {
3238			if (i % 8 == 0)
3239				pr_err("  [%02x] ", i * 4);
3240			pr_cont(" %08x",
3241				be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
3242			if ((i + 1) % 8 == 0)
3243				pr_cont("\n");
3244		}
3245		pr_err("\n");
3246	}
3247
3248	/*
3249	 * Inline data segments may not cross a 64 byte boundary.  If
3250	 * our UD header is bigger than the space available up to the
3251	 * next 64 byte boundary in the WQE, use two inline data
3252	 * segments to hold the UD header.
3253	 */
3254	spc = MLX4_INLINE_ALIGN -
3255		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
3256	if (header_size <= spc) {
3257		inl->byte_count = cpu_to_be32(1 << 31 | header_size);
3258		memcpy(inl + 1, sqp->header_buf, header_size);
3259		i = 1;
3260	} else {
3261		inl->byte_count = cpu_to_be32(1 << 31 | spc);
3262		memcpy(inl + 1, sqp->header_buf, spc);
3263
3264		inl = (void *) (inl + 1) + spc;
3265		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
3266		/*
3267		 * Need a barrier here to make sure all the data is
3268		 * visible before the byte_count field is set.
3269		 * Otherwise the HCA prefetcher could grab the 64-byte
3270		 * chunk with this inline segment and get a valid (!=
3271		 * 0xffffffff) byte count but stale data, and end up
3272		 * generating a packet with bad headers.
3273		 *
3274		 * The first inline segment's byte_count field doesn't
3275		 * need a barrier, because it comes after a
3276		 * control/MLX segment and therefore is at an offset
3277		 * of 16 mod 64.
3278		 */
3279		wmb();
3280		inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
3281		i = 2;
3282	}
3283
3284	*mlx_seg_len =
3285		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
3286	return 0;
3287}
3288
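/*
 * Work queue overflow check: the lockless fast path compares
 * head - tail against max_post; on apparent overflow the count is
 * re-read under the CQ lock to synchronize with completion
 * processing, which advances the tail.
 */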
3289static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
3290{
3291	unsigned cur;
3292	struct mlx4_ib_cq *cq;
3293
3294	cur = wq->head - wq->tail;
3295	if (likely(cur + nreq < wq->max_post))
3296		return 0;
3297
3298	cq = to_mcq(ib_cq);
3299	spin_lock(&cq->lock);
3300	cur = wq->head - wq->tail;
3301	spin_unlock(&cq->lock);
3302
3303	return cur + nreq >= wq->max_post;
3304}
3305
3306static __be32 convert_access(int acc)
3307{
3308	return (acc & IB_ACCESS_REMOTE_ATOMIC ?
3309		cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC)       : 0) |
3310	       (acc & IB_ACCESS_REMOTE_WRITE  ?
3311		cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) |
3312	       (acc & IB_ACCESS_REMOTE_READ   ?
3313		cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ)  : 0) |
3314	       (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  : 0) |
3315		cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
3316}
3317
3318static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg,
3319			const struct ib_reg_wr *wr)
3320{
3321	struct mlx4_ib_mr *mr = to_mmr(wr->mr);
3322
3323	fseg->flags		= convert_access(wr->access);
3324	fseg->mem_key		= cpu_to_be32(wr->key);
3325	fseg->buf_list		= cpu_to_be64(mr->page_map);
3326	fseg->start_addr	= cpu_to_be64(mr->ibmr.iova);
3327	fseg->reg_len		= cpu_to_be64(mr->ibmr.length);
3328	fseg->offset		= 0; /* XXX -- is this just for ZBVA? */
3329	fseg->page_size		= cpu_to_be32(ilog2(mr->ibmr.page_size));
3330	fseg->reserved[0]	= 0;
3331	fseg->reserved[1]	= 0;
3332}
3333
3334static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
3335{
3336	memset(iseg, 0, sizeof(*iseg));
3337	iseg->mem_key = cpu_to_be32(rkey);
3338}
3339
3340static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
3341					  u64 remote_addr, u32 rkey)
3342{
3343	rseg->raddr    = cpu_to_be64(remote_addr);
3344	rseg->rkey     = cpu_to_be32(rkey);
3345	rseg->reserved = 0;
3346}
3347
3348static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg,
3349			   const struct ib_atomic_wr *wr)
3350{
3351	if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
3352		aseg->swap_add = cpu_to_be64(wr->swap);
3353		aseg->compare  = cpu_to_be64(wr->compare_add);
3354	} else if (wr->wr.opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
3355		aseg->swap_add = cpu_to_be64(wr->compare_add);
3356		aseg->compare  = cpu_to_be64(wr->compare_add_mask);
3357	} else {
3358		aseg->swap_add = cpu_to_be64(wr->compare_add);
3359		aseg->compare  = 0;
3360	}
3361
3362}
3363
3364static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
3365				  const struct ib_atomic_wr *wr)
3366{
3367	aseg->swap_add		= cpu_to_be64(wr->swap);
3368	aseg->swap_add_mask	= cpu_to_be64(wr->swap_mask);
3369	aseg->compare		= cpu_to_be64(wr->compare_add);
3370	aseg->compare_mask	= cpu_to_be64(wr->compare_add_mask);
3371}
3372
3373static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
3374			     const struct ib_ud_wr *wr)
3375{
3376	memcpy(dseg->av, &to_mah(wr->ah)->av, sizeof (struct mlx4_av));
3377	dseg->dqpn = cpu_to_be32(wr->remote_qpn);
3378	dseg->qkey = cpu_to_be32(wr->remote_qkey);
3379	dseg->vlan = to_mah(wr->ah)->av.eth.vlan;
3380	memcpy(dseg->mac, to_mah(wr->ah)->av.eth.mac, 6);
3381}
3382
3383static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
3384				    struct mlx4_wqe_datagram_seg *dseg,
3385				    const struct ib_ud_wr *wr,
3386				    enum mlx4_ib_qp_type qpt)
3387{
3388	union mlx4_ext_av *av = &to_mah(wr->ah)->av;
3389	struct mlx4_av sqp_av = {0};
3390	int port = *((u8 *) &av->ib.port_pd) & 0x3;
3391
3392	/* force loopback */
3393	sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
3394	sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */
3395	sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel &
3396			cpu_to_be32(0xf0000000);
3397
3398	memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
3399	if (qpt == MLX4_IB_QPT_PROXY_GSI)
3400		dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp1_tunnel);
3401	else
3402		dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp0_tunnel);
3403	/* Use the QKEY from the QP context, which is set by the master */
3404	dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
3405}
3406
3407static void build_tunnel_header(const struct ib_ud_wr *wr, void *wqe,
3408				unsigned *mlx_seg_len)
3409{
3410	struct mlx4_wqe_inline_seg *inl = wqe;
3411	struct mlx4_ib_tunnel_header hdr;
3412	struct mlx4_ib_ah *ah = to_mah(wr->ah);
3413	int spc;
3414	int i;
3415
3416	memcpy(&hdr.av, &ah->av, sizeof hdr.av);
3417	hdr.remote_qpn = cpu_to_be32(wr->remote_qpn);
3418	hdr.pkey_index = cpu_to_be16(wr->pkey_index);
3419	hdr.qkey = cpu_to_be32(wr->remote_qkey);
3420	memcpy(hdr.mac, ah->av.eth.mac, 6);
3421	hdr.vlan = ah->av.eth.vlan;
3422
3423	spc = MLX4_INLINE_ALIGN -
3424		((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
3425	if (sizeof (hdr) <= spc) {
3426		memcpy(inl + 1, &hdr, sizeof (hdr));
3427		wmb();
3428		inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr));
3429		i = 1;
3430	} else {
3431		memcpy(inl + 1, &hdr, spc);
3432		wmb();
3433		inl->byte_count = cpu_to_be32(1 << 31 | spc);
3434
3435		inl = (void *) (inl + 1) + spc;
3436		memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
3437		wmb();
3438		inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc));
3439		i = 2;
3440	}
3441
3442	*mlx_seg_len =
3443		ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16);
3444}
3445
3446static void set_mlx_icrc_seg(void *dseg)
3447{
3448	u32 *t = dseg;
3449	struct mlx4_wqe_inline_seg *iseg = dseg;
3450
3451	t[1] = 0;
3452
3453	/*
3454	 * Need a barrier here before writing the byte_count field to
3455	 * make sure that all the data is visible before the
3456	 * byte_count field is set.  Otherwise, if the segment begins
3457	 * a new cacheline, the HCA prefetcher could grab the 64-byte
3458	 * chunk and get a valid (!= 0xffffffff) byte count but
3459	 * stale data, and end up sending the wrong data.
3460	 */
3461	wmb();
3462
3463	iseg->byte_count = cpu_to_be32((1 << 31) | 4);
3464}
3465
3466static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
3467{
3468	dseg->lkey       = cpu_to_be32(sg->lkey);
3469	dseg->addr       = cpu_to_be64(sg->addr);
3470
3471	/*
3472	 * Need a barrier here before writing the byte_count field to
3473	 * make sure that all the data is visible before the
3474	 * byte_count field is set.  Otherwise, if the segment begins
3475	 * a new cacheline, the HCA prefetcher could grab the 64-byte
3476	 * chunk and get a valid (!= 0xffffffff) byte count but
3477	 * stale data, and end up sending the wrong data.
3478	 */
3479	wmb();
3480
3481	dseg->byte_count = cpu_to_be32(sg->length);
3482}
3483
3484static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
3485{
3486	dseg->byte_count = cpu_to_be32(sg->length);
3487	dseg->lkey       = cpu_to_be32(sg->lkey);
3488	dseg->addr       = cpu_to_be64(sg->addr);
3489}
3490
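/*
 * The LSO segment carries the packet headers inline, padded to a
 * 16-byte multiple.  If the padded header spills past one 64-byte
 * cache line, the blh flag (presumably "big LSO header") is set and
 * later folded into the WQE owner/opcode word.
 */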
3491static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe,
3492			 const struct ib_ud_wr *wr, struct mlx4_ib_qp *qp,
3493			 unsigned *lso_seg_len, __be32 *lso_hdr_sz, __be32 *blh)
3494{
3495	unsigned halign = ALIGN(sizeof *wqe + wr->hlen, 16);
3496
3497	if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
3498		*blh = cpu_to_be32(1 << 6);
3499
3500	if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
3501		     wr->wr.num_sge > qp->sq.max_gs - (halign >> 4)))
3502		return -EINVAL;
3503
3504	memcpy(wqe->header, wr->header, wr->hlen);
3505
3506	*lso_hdr_sz  = cpu_to_be32(wr->mss << 16 | wr->hlen);
3507	*lso_seg_len = halign;
3508	return 0;
3509}
3510
3511static __be32 send_ieth(const struct ib_send_wr *wr)
3512{
3513	switch (wr->opcode) {
3514	case IB_WR_SEND_WITH_IMM:
3515	case IB_WR_RDMA_WRITE_WITH_IMM:
3516		return wr->ex.imm_data;
3517
3518	case IB_WR_SEND_WITH_INV:
3519		return cpu_to_be32(wr->ex.invalidate_rkey);
3520
3521	default:
3522		return 0;
3523	}
3524}
3525
3526static void add_zero_len_inline(void *wqe)
3527{
3528	struct mlx4_wqe_inline_seg *inl = wqe;
3529	memset(wqe, 0, 16);
3530	inl->byte_count = cpu_to_be32(1 << 31);
3531}
3532
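/*
 * Post-send flow: for each work request the control segment is
 * written first, then the opcode-specific segments, then the data
 * segments in reverse order; the owner/opcode word is only set after
 * a write barrier, and the doorbell is rung once for the whole chain
 * before the final WQE is stamped.
 */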
3533static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
3534			      const struct ib_send_wr **bad_wr, bool drain)
3535{
3536	struct mlx4_ib_qp *qp = to_mqp(ibqp);
3537	void *wqe;
3538	struct mlx4_wqe_ctrl_seg *ctrl;
3539	struct mlx4_wqe_data_seg *dseg;
3540	unsigned long flags;
3541	int nreq;
3542	int err = 0;
3543	unsigned ind;
3544	int size;
3545	unsigned seglen;
3546	__be32 dummy;
3547	__be32 *lso_wqe;
3548	__be32 lso_hdr_sz;
3549	__be32 blh;
3550	int i;
3551	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
3552
3553	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
3554		struct mlx4_ib_sqp *sqp = to_msqp(qp);
3555
3556		if (sqp->roce_v2_gsi) {
3557			struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah);
3558			enum ib_gid_type gid_type;
3559			union ib_gid gid;
3560
3561			if (!fill_gid_by_hw_index(mdev, sqp->qp.port,
3562					   ah->av.ib.gid_index,
3563					   &gid, &gid_type))
3564				qp = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ?
3565						to_mqp(sqp->roce_v2_gsi) : qp;
3566			else
3567				pr_err("Failed to get gid at index %d. RoCEv2 will not work properly\n",
3568				       ah->av.ib.gid_index);
3569		}
3570	}
3571
3572	spin_lock_irqsave(&qp->sq.lock, flags);
3573	if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR &&
3574	    !drain) {
3575		err = -EIO;
3576		*bad_wr = wr;
3577		nreq = 0;
3578		goto out;
3579	}
3580
3581	ind = qp->sq_next_wqe;
3582
3583	for (nreq = 0; wr; ++nreq, wr = wr->next) {
3584		lso_wqe = &dummy;
3585		blh = 0;
3586
3587		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
3588			err = -ENOMEM;
3589			*bad_wr = wr;
3590			goto out;
3591		}
3592
3593		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
3594			err = -EINVAL;
3595			*bad_wr = wr;
3596			goto out;
3597		}
3598
3599		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
3600		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
3601
3602		ctrl->srcrb_flags =
3603			(wr->send_flags & IB_SEND_SIGNALED ?
3604			 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
3605			(wr->send_flags & IB_SEND_SOLICITED ?
3606			 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
3607			((wr->send_flags & IB_SEND_IP_CSUM) ?
3608			 cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
3609				     MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
3610			qp->sq_signal_bits;
3611
3612		ctrl->imm = send_ieth(wr);
3613
3614		wqe += sizeof *ctrl;
3615		size = sizeof *ctrl / 16;
3616
3617		switch (qp->mlx4_ib_qp_type) {
3618		case MLX4_IB_QPT_RC:
3619		case MLX4_IB_QPT_UC:
3620			switch (wr->opcode) {
3621			case IB_WR_ATOMIC_CMP_AND_SWP:
3622			case IB_WR_ATOMIC_FETCH_AND_ADD:
3623			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
3624				set_raddr_seg(wqe, atomic_wr(wr)->remote_addr,
3625					      atomic_wr(wr)->rkey);
3626				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
3627
3628				set_atomic_seg(wqe, atomic_wr(wr));
3629				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
3630
3631				size += (sizeof (struct mlx4_wqe_raddr_seg) +
3632					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
3633
3634				break;
3635
3636			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
3637				set_raddr_seg(wqe, atomic_wr(wr)->remote_addr,
3638					      atomic_wr(wr)->rkey);
3639				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
3640
3641				set_masked_atomic_seg(wqe, atomic_wr(wr));
3642				wqe  += sizeof (struct mlx4_wqe_masked_atomic_seg);
3643
3644				size += (sizeof (struct mlx4_wqe_raddr_seg) +
3645					 sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
3646
3647				break;
3648
3649			case IB_WR_RDMA_READ:
3650			case IB_WR_RDMA_WRITE:
3651			case IB_WR_RDMA_WRITE_WITH_IMM:
3652				set_raddr_seg(wqe, rdma_wr(wr)->remote_addr,
3653					      rdma_wr(wr)->rkey);
3654				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
3655				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
3656				break;
3657
3658			case IB_WR_LOCAL_INV:
3659				ctrl->srcrb_flags |=
3660					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
3661				set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
3662				wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
3663				size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
3664				break;
3665
3666			case IB_WR_REG_MR:
3667				ctrl->srcrb_flags |=
3668					cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
3669				set_reg_seg(wqe, reg_wr(wr));
3670				wqe  += sizeof(struct mlx4_wqe_fmr_seg);
3671				size += sizeof(struct mlx4_wqe_fmr_seg) / 16;
3672				break;
3673
3674			default:
3675				/* No extra segments required for sends */
3676				break;
3677			}
3678			break;
3679
3680		case MLX4_IB_QPT_TUN_SMI_OWNER:
3681			err =  build_sriov_qp0_header(to_msqp(qp), ud_wr(wr),
3682					ctrl, &seglen);
3683			if (unlikely(err)) {
3684				*bad_wr = wr;
3685				goto out;
3686			}
3687			wqe  += seglen;
3688			size += seglen / 16;
3689			break;
3690		case MLX4_IB_QPT_TUN_SMI:
3691		case MLX4_IB_QPT_TUN_GSI:
3692			/* this is a UD qp used in MAD responses to slaves. */
3693			set_datagram_seg(wqe, ud_wr(wr));
3694			/* set the forced-loopback bit in the data seg av */
3695			*(__be32 *) wqe |= cpu_to_be32(0x80000000);
3696			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
3697			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
3698			break;
3699		case MLX4_IB_QPT_UD:
3700			set_datagram_seg(wqe, ud_wr(wr));
3701			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
3702			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
3703
3704			if (wr->opcode == IB_WR_LSO) {
3705				err = build_lso_seg(wqe, ud_wr(wr), qp, &seglen,
3706						&lso_hdr_sz, &blh);
3707				if (unlikely(err)) {
3708					*bad_wr = wr;
3709					goto out;
3710				}
3711				lso_wqe = (__be32 *) wqe;
3712				wqe  += seglen;
3713				size += seglen / 16;
3714			}
3715			break;
3716
3717		case MLX4_IB_QPT_PROXY_SMI_OWNER:
3718			err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr),
3719					ctrl, &seglen);
3720			if (unlikely(err)) {
3721				*bad_wr = wr;
3722				goto out;
3723			}
3724			wqe  += seglen;
3725			size += seglen / 16;
3726			/* to start tunnel header on a cache-line boundary */
3727			add_zero_len_inline(wqe);
3728			wqe += 16;
3729			size++;
3730			build_tunnel_header(ud_wr(wr), wqe, &seglen);
3731			wqe  += seglen;
3732			size += seglen / 16;
3733			break;
3734		case MLX4_IB_QPT_PROXY_SMI:
3735		case MLX4_IB_QPT_PROXY_GSI:
3736			/* If we are tunneling special qps, this is a UD qp.
3737			 * In this case we first add a UD segment targeting
3738			 * the tunnel qp, and then add a header with address
3739			 * information */
3740			set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe,
3741						ud_wr(wr),
3742						qp->mlx4_ib_qp_type);
3743			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
3744			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
3745			build_tunnel_header(ud_wr(wr), wqe, &seglen);
3746			wqe  += seglen;
3747			size += seglen / 16;
3748			break;
3749
3750		case MLX4_IB_QPT_SMI:
3751		case MLX4_IB_QPT_GSI:
3752			err = build_mlx_header(to_msqp(qp), ud_wr(wr), ctrl,
3753					&seglen);
3754			if (unlikely(err)) {
3755				*bad_wr = wr;
3756				goto out;
3757			}
3758			wqe  += seglen;
3759			size += seglen / 16;
3760			break;
3761
3762		default:
3763			break;
3764		}
3765
3766		/*
3767		 * Write data segments in reverse order, so as to
3768		 * overwrite cacheline stamp last within each
3769		 * cacheline.  This avoids issues with WQE
3770		 * prefetching.
3771		 */
3772
3773		dseg = wqe;
3774		dseg += wr->num_sge - 1;
3775		size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
3776
3777		/* Add one more inline data segment for ICRC for MLX sends */
3778		if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
3779			     qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI ||
3780			     qp->mlx4_ib_qp_type &
3781			     (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) {
3782			set_mlx_icrc_seg(dseg + 1);
3783			size += sizeof (struct mlx4_wqe_data_seg) / 16;
3784		}
3785
3786		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
3787			set_data_seg(dseg, wr->sg_list + i);
3788
3789		/*
3790		 * Possibly overwrite stamping in cacheline with LSO
3791		 * segment only after making sure all data segments
3792		 * are written.
3793		 */
3794		wmb();
3795		*lso_wqe = lso_hdr_sz;
3796
3797		ctrl->qpn_vlan.fence_size = (wr->send_flags & IB_SEND_FENCE ?
3798					     MLX4_WQE_CTRL_FENCE : 0) | size;
3799
3800		/*
3801		 * Make sure descriptor is fully written before
3802		 * setting ownership bit (because HW can start
3803		 * executing as soon as we do).
3804		 */
3805		wmb();
3806
3807		if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
3808			*bad_wr = wr;
3809			err = -EINVAL;
3810			goto out;
3811		}
3812
3813		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
3814			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
3815
3816		/*
3817		 * We can improve latency by not stamping the last
3818		 * send queue WQE until after ringing the doorbell, so
3819		 * only stamp here if there are still more WQEs to post.
3820		 */
3821		if (wr->next)
3822			stamp_send_wqe(qp, ind + qp->sq_spare_wqes);
3823		ind++;
3824	}
3825
3826out:
3827	if (likely(nreq)) {
3828		qp->sq.head += nreq;
3829
3830		/*
3831		 * Make sure that descriptors are written before
3832		 * doorbell record.
3833		 */
3834		wmb();
3835
3836		writel_relaxed(qp->doorbell_qpn,
3837			to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
3838
3839		stamp_send_wqe(qp, ind + qp->sq_spare_wqes - 1);
3840
3841		qp->sq_next_wqe = ind;
3842	}
3843
3844	spin_unlock_irqrestore(&qp->sq.lock, flags);
3845
3846	return err;
3847}
3848
3849int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
3850		      const struct ib_send_wr **bad_wr)
3851{
3852	return _mlx4_ib_post_send(ibqp, wr, bad_wr, false);
3853}
3854
3855static int _mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
3856			      const struct ib_recv_wr **bad_wr, bool drain)
3857{
3858	struct mlx4_ib_qp *qp = to_mqp(ibqp);
3859	struct mlx4_wqe_data_seg *scat;
3860	unsigned long flags;
3861	int err = 0;
3862	int nreq;
3863	int ind;
3864	int max_gs;
3865	int i;
3866	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
3867
3868	max_gs = qp->rq.max_gs;
3869	spin_lock_irqsave(&qp->rq.lock, flags);
3870
3871	if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR &&
3872	    !drain) {
3873		err = -EIO;
3874		*bad_wr = wr;
3875		nreq = 0;
3876		goto out;
3877	}
3878
3879	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
3880
3881	for (nreq = 0; wr; ++nreq, wr = wr->next) {
3882		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
3883			err = -ENOMEM;
3884			*bad_wr = wr;
3885			goto out;
3886		}
3887
3888		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
3889			err = -EINVAL;
3890			*bad_wr = wr;
3891			goto out;
3892		}
3893
3894		scat = get_recv_wqe(qp, ind);
3895
3896		if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
3897		    MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
3898			ib_dma_sync_single_for_device(ibqp->device,
3899						      qp->sqp_proxy_rcv[ind].map,
3900						      sizeof (struct mlx4_ib_proxy_sqp_hdr),
3901						      DMA_FROM_DEVICE);
3902			scat->byte_count =
3903				cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr));
3904			/* use the DMA lkey from the upper-layer entry */
3905			scat->lkey = cpu_to_be32(wr->sg_list->lkey);
3906			scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map);
3907			scat++;
3908			max_gs--;
3909		}
3910
3911		for (i = 0; i < wr->num_sge; ++i)
3912			__set_data_seg(scat + i, wr->sg_list + i);
3913
3914		if (i < max_gs) {
3915			scat[i].byte_count = 0;
3916			scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
3917			scat[i].addr       = 0;
3918		}
3919
3920		qp->rq.wrid[ind] = wr->wr_id;
3921
3922		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
3923	}
3924
3925out:
3926	if (likely(nreq)) {
3927		qp->rq.head += nreq;
3928
3929		/*
3930		 * Make sure that descriptors are written before
3931		 * doorbell record.
3932		 */
3933		wmb();
3934
3935		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
3936	}
3937
3938	spin_unlock_irqrestore(&qp->rq.lock, flags);
3939
3940	return err;
3941}
3942
3943int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
3944		      const struct ib_recv_wr **bad_wr)
3945{
3946	return _mlx4_ib_post_recv(ibqp, wr, bad_wr, false);
3947}
3948
3949static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
3950{
3951	switch (mlx4_state) {
3952	case MLX4_QP_STATE_RST:      return IB_QPS_RESET;
3953	case MLX4_QP_STATE_INIT:     return IB_QPS_INIT;
3954	case MLX4_QP_STATE_RTR:      return IB_QPS_RTR;
3955	case MLX4_QP_STATE_RTS:      return IB_QPS_RTS;
3956	case MLX4_QP_STATE_SQ_DRAINING:
3957	case MLX4_QP_STATE_SQD:      return IB_QPS_SQD;
3958	case MLX4_QP_STATE_SQER:     return IB_QPS_SQE;
3959	case MLX4_QP_STATE_ERR:      return IB_QPS_ERR;
3960	default:		     return -1;
3961	}
3962}
3963
3964static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
3965{
3966	switch (mlx4_mig_state) {
3967	case MLX4_QP_PM_ARMED:		return IB_MIG_ARMED;
3968	case MLX4_QP_PM_REARM:		return IB_MIG_REARM;
3969	case MLX4_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
3970	default: return -1;
3971	}
3972}
3973
3974static int to_ib_qp_access_flags(int mlx4_flags)
3975{
3976	int ib_flags = 0;
3977
3978	if (mlx4_flags & MLX4_QP_BIT_RRE)
3979		ib_flags |= IB_ACCESS_REMOTE_READ;
3980	if (mlx4_flags & MLX4_QP_BIT_RWE)
3981		ib_flags |= IB_ACCESS_REMOTE_WRITE;
3982	if (mlx4_flags & MLX4_QP_BIT_RAE)
3983		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
3984
3985	return ib_flags;
3986}
3987
3988static void to_rdma_ah_attr(struct mlx4_ib_dev *ibdev,
3989			    struct rdma_ah_attr *ah_attr,
3990			    struct mlx4_qp_path *path)
3991{
3992	struct mlx4_dev *dev = ibdev->dev;
3993	u8 port_num = path->sched_queue & 0x40 ? 2 : 1;
3994
3995	memset(ah_attr, 0, sizeof(*ah_attr));
3996	if (port_num == 0 || port_num > dev->caps.num_ports)
3997		return;
3998	ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port_num);
3999
4000	if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE)
4001		rdma_ah_set_sl(ah_attr, ((path->sched_queue >> 3) & 0x7) |
4002			       ((path->sched_queue & 4) << 1));
4003	else
4004		rdma_ah_set_sl(ah_attr, (path->sched_queue >> 2) & 0xf);
4005	rdma_ah_set_port_num(ah_attr, port_num);
4006
4007	rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid));
4008	rdma_ah_set_path_bits(ah_attr, path->grh_mylmc & 0x7f);
4009	rdma_ah_set_static_rate(ah_attr,
4010				path->static_rate ? path->static_rate - 5 : 0);
4011	if (path->grh_mylmc & (1 << 7)) {
4012		rdma_ah_set_grh(ah_attr, NULL,
4013				be32_to_cpu(path->tclass_flowlabel) & 0xfffff,
4014				path->mgid_index,
4015				path->hop_limit,
4016				(be32_to_cpu(path->tclass_flowlabel)
4017				 >> 20) & 0xff);
4018		rdma_ah_set_dgid_raw(ah_attr, path->rgid);
4019	}
4020}
4021
4022int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
4023		     struct ib_qp_init_attr *qp_init_attr)
4024{
4025	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
4026	struct mlx4_ib_qp *qp = to_mqp(ibqp);
4027	struct mlx4_qp_context context;
4028	int mlx4_state;
4029	int err = 0;
4030
4031	if (ibqp->rwq_ind_tbl)
4032		return -EOPNOTSUPP;
4033
4034	mutex_lock(&qp->mutex);
4035
4036	if (qp->state == IB_QPS_RESET) {
4037		qp_attr->qp_state = IB_QPS_RESET;
4038		goto done;
4039	}
4040
4041	err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
4042	if (err) {
4043		err = -EINVAL;
4044		goto out;
4045	}
4046
4047	mlx4_state = be32_to_cpu(context.flags) >> 28;
4048
4049	qp->state		     = to_ib_qp_state(mlx4_state);
4050	qp_attr->qp_state	     = qp->state;
4051	qp_attr->path_mtu	     = context.mtu_msgmax >> 5;
4052	qp_attr->path_mig_state	     =
4053		to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
4054	qp_attr->qkey		     = be32_to_cpu(context.qkey);
4055	qp_attr->rq_psn		     = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
4056	qp_attr->sq_psn		     = be32_to_cpu(context.next_send_psn) & 0xffffff;
4057	qp_attr->dest_qp_num	     = be32_to_cpu(context.remote_qpn) & 0xffffff;
4058	qp_attr->qp_access_flags     =
4059		to_ib_qp_access_flags(be32_to_cpu(context.params2));
4060
4061	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
4062		to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
4063		to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
4064		qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
4065		qp_attr->alt_port_num	=
4066			rdma_ah_get_port_num(&qp_attr->alt_ah_attr);
4067	}
4068
4069	qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
4070	if (qp_attr->qp_state == IB_QPS_INIT)
4071		qp_attr->port_num = qp->port;
4072	else
4073		qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
4074
4075	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
4076	qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
4077
4078	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
4079
4080	qp_attr->max_dest_rd_atomic =
4081		1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
4082	qp_attr->min_rnr_timer	    =
4083		(be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
4084	qp_attr->timeout	    = context.pri_path.ackto >> 3;
4085	qp_attr->retry_cnt	    = (be32_to_cpu(context.params1) >> 16) & 0x7;
4086	qp_attr->rnr_retry	    = (be32_to_cpu(context.params1) >> 13) & 0x7;
4087	qp_attr->alt_timeout	    = context.alt_path.ackto >> 3;
4088
4089done:
4090	qp_attr->cur_qp_state	     = qp_attr->qp_state;
4091	qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
4092	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
4093
4094	if (!ibqp->uobject) {
4095		qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
4096		qp_attr->cap.max_send_sge = qp->sq.max_gs;
4097	} else {
4098		qp_attr->cap.max_send_wr  = 0;
4099		qp_attr->cap.max_send_sge = 0;
4100	}
4101
4102	/*
4103	 * We don't support inline sends for kernel QPs (yet), and we
4104	 * don't know what userspace's value should be.
4105	 */
4106	qp_attr->cap.max_inline_data = 0;
4107
4108	qp_init_attr->cap	     = qp_attr->cap;
4109
4110	qp_init_attr->create_flags = 0;
4111	if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
4112		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
4113
4114	if (qp->flags & MLX4_IB_QP_LSO)
4115		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
4116
4117	if (qp->flags & MLX4_IB_QP_NETIF)
4118		qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP;
4119
4120	qp_init_attr->sq_sig_type =
4121		qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?
4122		IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
4123
4124out:
4125	mutex_unlock(&qp->mutex);
4126	return err;
4127}
4128
4129struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
4130				struct ib_wq_init_attr *init_attr,
4131				struct ib_udata *udata)
4132{
4133	struct mlx4_dev *dev = to_mdev(pd->device)->dev;
4134	struct ib_qp_init_attr ib_qp_init_attr = {};
4135	struct mlx4_ib_qp *qp;
4136	struct mlx4_ib_create_wq ucmd;
4137	int err, required_cmd_sz;
4138
4139	if (!udata)
4140		return ERR_PTR(-EINVAL);
4141
4142	required_cmd_sz = offsetof(typeof(ucmd), comp_mask) +
4143			  sizeof(ucmd.comp_mask);
4144	if (udata->inlen < required_cmd_sz) {
4145		pr_debug("invalid inlen\n");
4146		return ERR_PTR(-EINVAL);
4147	}
4148
4149	if (udata->inlen > sizeof(ucmd) &&
4150	    !ib_is_udata_cleared(udata, sizeof(ucmd),
4151				 udata->inlen - sizeof(ucmd))) {
4152		pr_debug("inlen is not supported\n");
4153		return ERR_PTR(-EOPNOTSUPP);
4154	}
4155
4156	if (udata->outlen)
4157		return ERR_PTR(-EOPNOTSUPP);
4158
4159	if (init_attr->wq_type != IB_WQT_RQ) {
4160		pr_debug("unsupported wq type %d\n", init_attr->wq_type);
4161		return ERR_PTR(-EOPNOTSUPP);
4162	}
4163
4164	if (init_attr->create_flags & ~IB_WQ_FLAGS_SCATTER_FCS ||
4165	    !(dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP)) {
4166		pr_debug("unsupported create_flags %u\n",
4167			 init_attr->create_flags);
4168		return ERR_PTR(-EOPNOTSUPP);
4169	}
4170
4171	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
4172	if (!qp)
4173		return ERR_PTR(-ENOMEM);
4174
4175	qp->pri.vid = 0xFFFF;
4176	qp->alt.vid = 0xFFFF;
4177
4178	ib_qp_init_attr.qp_context = init_attr->wq_context;
4179	ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET;
4180	ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr;
4181	ib_qp_init_attr.cap.max_recv_sge = init_attr->max_sge;
4182	ib_qp_init_attr.recv_cq = init_attr->cq;
4183	ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */
4184
4185	if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS)
4186		ib_qp_init_attr.create_flags |= IB_QP_CREATE_SCATTER_FCS;
4187
4188	err = create_rq(pd, &ib_qp_init_attr, udata, qp);
4189	if (err) {
4190		kfree(qp);
4191		return ERR_PTR(err);
4192	}
4193
4194	qp->ibwq.event_handler = init_attr->event_handler;
4195	qp->ibwq.wq_num = qp->mqp.qpn;
4196	qp->ibwq.state = IB_WQS_RESET;
4197
4198	return &qp->ibwq;
4199}
4200
4201static int ib_wq2qp_state(enum ib_wq_state state)
4202{
4203	switch (state) {
4204	case IB_WQS_RESET:
4205		return IB_QPS_RESET;
4206	case IB_WQS_RDY:
4207		return IB_QPS_RTR;
4208	default:
4209		return IB_QPS_ERR;
4210	}
4211}
4212
4213static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
4214			      struct ib_udata *udata)
4215{
4216	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
4217	enum ib_qp_state qp_cur_state;
4218	enum ib_qp_state qp_new_state;
4219	int attr_mask;
4220	int err;
4221
4222	/* ib_qp.state represents the WQ HW state, while ib_wq.state
4223	 * represents the WQ's logical state.
4224	 */
4225	qp_cur_state = qp->state;
4226	qp_new_state = ib_wq2qp_state(new_state);
4227
4228	if (ib_wq2qp_state(new_state) == qp_cur_state)
4229		return 0;
4230
4231	if (new_state == IB_WQS_RDY) {
4232		struct ib_qp_attr attr = {};
4233
4234		attr.port_num = qp->port;
4235		attr_mask = IB_QP_PORT;
4236
4237		err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr,
4238					  attr_mask, IB_QPS_RESET, IB_QPS_INIT,
4239					  udata);
4240		if (err) {
4241			pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n",
4242				 ibwq->wq_num);
4243			return err;
4244		}
4245
4246		qp_cur_state = IB_QPS_INIT;
4247	}
4248
4249	attr_mask = 0;
4250	err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask,
4251				  qp_cur_state,  qp_new_state, udata);
4252
4253	if (err && (qp_cur_state == IB_QPS_INIT)) {
4254		qp_new_state = IB_QPS_RESET;
4255		if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL,
4256					attr_mask, IB_QPS_INIT, IB_QPS_RESET,
4257					udata)) {
4258			pr_warn("WQN=0x%06x failed; reverting the HW QP to RESET also failed\n",
4259				ibwq->wq_num);
4260			qp_new_state = IB_QPS_INIT;
4261		}
4262	}
4263
4264	qp->state = qp_new_state;
4265
4266	return err;
4267}
4268
4269int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
4270		      u32 wq_attr_mask, struct ib_udata *udata)
4271{
4272	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
4273	struct mlx4_ib_modify_wq ucmd = {};
4274	size_t required_cmd_sz;
4275	enum ib_wq_state cur_state, new_state;
4276	int err = 0;
4277
4278	required_cmd_sz = offsetof(typeof(ucmd), reserved) +
4279				   sizeof(ucmd.reserved);
4280	if (udata->inlen < required_cmd_sz)
4281		return -EINVAL;
4282
4283	if (udata->inlen > sizeof(ucmd) &&
4284	    !ib_is_udata_cleared(udata, sizeof(ucmd),
4285				 udata->inlen - sizeof(ucmd)))
4286		return -EOPNOTSUPP;
4287
4288	if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)))
4289		return -EFAULT;
4290
4291	if (ucmd.comp_mask || ucmd.reserved)
4292		return -EOPNOTSUPP;
4293
4294	if (wq_attr_mask & IB_WQ_FLAGS)
4295		return -EOPNOTSUPP;
4296
4297	cur_state = wq_attr_mask & IB_WQ_CUR_STATE ? wq_attr->curr_wq_state :
4298						     ibwq->state;
4299	new_state = wq_attr_mask & IB_WQ_STATE ? wq_attr->wq_state : cur_state;
4300
4301	if (cur_state  < IB_WQS_RESET || cur_state  > IB_WQS_ERR ||
4302	    new_state < IB_WQS_RESET || new_state > IB_WQS_ERR)
4303		return -EINVAL;
4304
4305	if ((new_state == IB_WQS_RDY) && (cur_state == IB_WQS_ERR))
4306		return -EINVAL;
4307
4308	if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET))
4309		return -EINVAL;
4310
4311	/* Need to protect against the parent RSS QP, which may also modify
4312	 * the WQ state.
4313	 */
4314	mutex_lock(&qp->mutex);
4315
4316	/* The HW state can be updated only if an RSS QP has already been
4317	 * associated with this WQ, so that its port can be applied to the WQ.
4318	 */
4319	if (qp->rss_usecnt)
4320		err = _mlx4_ib_modify_wq(ibwq, new_state, udata);
4321
4322	if (!err)
4323		ibwq->state = new_state;
4324
4325	mutex_unlock(&qp->mutex);
4326
4327	return err;
4328}
4329
4330void mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
4331{
4332	struct mlx4_ib_dev *dev = to_mdev(ibwq->device);
4333	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
4334
4335	if (qp->counter_index)
4336		mlx4_ib_free_qp_counter(dev, qp);
4337
4338	destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, udata);
4339
4340	kfree(qp);
4341}
4342
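/*
 * No firmware command is issued here; this only checks that the
 * indirection table's WQNs are consecutive and that the base WQN is
 * aligned to the table size, which is presumably what the RSS QP
 * created on top of this table relies on.
 */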
4343struct ib_rwq_ind_table
4344*mlx4_ib_create_rwq_ind_table(struct ib_device *device,
4345			      struct ib_rwq_ind_table_init_attr *init_attr,
4346			      struct ib_udata *udata)
4347{
4348	struct ib_rwq_ind_table *rwq_ind_table;
4349	struct mlx4_ib_create_rwq_ind_tbl_resp resp = {};
4350	unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size;
4351	unsigned int base_wqn;
4352	size_t min_resp_len;
4353	int i;
4354	int err;
4355
4356	if (udata->inlen > 0 &&
4357	    !ib_is_udata_cleared(udata, 0,
4358				 udata->inlen))
4359		return ERR_PTR(-EOPNOTSUPP);
4360
4361	min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
4362	if (udata->outlen && udata->outlen < min_resp_len)
4363		return ERR_PTR(-EINVAL);
4364
4365	if (ind_tbl_size >
4366	    device->attrs.rss_caps.max_rwq_indirection_table_size) {
4367		pr_debug("ind_tbl_size = %d is bigger than supported = %d\n",
4368			 ind_tbl_size,
4369			 device->attrs.rss_caps.max_rwq_indirection_table_size);
4370		return ERR_PTR(-EINVAL);
4371	}
4372
4373	base_wqn = init_attr->ind_tbl[0]->wq_num;
4374
4375	if (base_wqn % ind_tbl_size) {
4376		pr_debug("WQN=0x%x isn't aligned with indirection table size\n",
4377			 base_wqn);
4378		return ERR_PTR(-EINVAL);
4379	}
4380
4381	for (i = 1; i < ind_tbl_size; i++) {
4382		if (++base_wqn != init_attr->ind_tbl[i]->wq_num) {
4383			pr_debug("indirection table's WQNs aren't consecutive\n");
4384			return ERR_PTR(-EINVAL);
4385		}
4386	}
4387
4388	rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL);
4389	if (!rwq_ind_table)
4390		return ERR_PTR(-ENOMEM);
4391
4392	if (udata->outlen) {
4393		resp.response_length = offsetof(typeof(resp), response_length) +
4394					sizeof(resp.response_length);
4395		err = ib_copy_to_udata(udata, &resp, resp.response_length);
4396		if (err)
4397			goto err;
4398	}
4399
4400	return rwq_ind_table;
4401
4402err:
4403	kfree(rwq_ind_table);
4404	return ERR_PTR(err);
4405}
4406
4407int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
4408{
4409	kfree(ib_rwq_ind_tbl);
4410	return 0;
4411}
4412
4413struct mlx4_ib_drain_cqe {
4414	struct ib_cqe cqe;
4415	struct completion done;
4416};
4417
4418static void mlx4_ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
4419{
4420	struct mlx4_ib_drain_cqe *cqe = container_of(wc->wr_cqe,
4421						     struct mlx4_ib_drain_cqe,
4422						     cqe);
4423
4424	complete(&cqe->done);
4425}
4426
4427/* This function returns only once the drain WR has completed */
4428static void handle_drain_completion(struct ib_cq *cq,
4429				    struct mlx4_ib_drain_cqe *sdrain,
4430				    struct mlx4_ib_dev *dev)
4431{
4432	struct mlx4_dev *mdev = dev->dev;
4433
4434	if (cq->poll_ctx == IB_POLL_DIRECT) {
4435		while (wait_for_completion_timeout(&sdrain->done, HZ / 10) <= 0)
4436			ib_process_cq_direct(cq, -1);
4437		return;
4438	}
4439
4440	if (mdev->persist->state == MLX4_DEVICE_STATE_INTERNAL_ERROR) {
4441		struct mlx4_ib_cq *mcq = to_mcq(cq);
4442		bool triggered = false;
4443		unsigned long flags;
4444
4445		spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
4446		/* Make sure that the CQ handler won't run if it hasn't run yet */
4447		if (!mcq->mcq.reset_notify_added)
4448			mcq->mcq.reset_notify_added = 1;
4449		else
4450			triggered = true;
4451		spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
4452
4453		if (triggered) {
4454			/* Wait for any scheduled/running task to finish */
4455			switch (cq->poll_ctx) {
4456			case IB_POLL_SOFTIRQ:
4457				irq_poll_disable(&cq->iop);
4458				irq_poll_enable(&cq->iop);
4459				break;
4460			case IB_POLL_WORKQUEUE:
4461				cancel_work_sync(&cq->work);
4462				break;
4463			default:
4464				WARN_ON_ONCE(1);
4465			}
4466		}
4467
4468		/* Run the CQ handler - this makes sure that the drain WR will
4469		 * be processed if it hasn't been processed yet.
4470		 */
4471		mcq->mcq.comp(&mcq->mcq);
4472	}
4473
4474	wait_for_completion(&sdrain->done);
4475}
4476
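/*
 * mlx4_ib_drain_sq()/mlx4_ib_drain_rq() move the QP to the ERR state,
 * post a single marker WR (a zero-length RDMA WRITE on the SQ, an
 * empty WR on the RQ) carrying a drain CQE, and wait via
 * handle_drain_completion() until that CQE completes.
 */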
4477void mlx4_ib_drain_sq(struct ib_qp *qp)
4478{
4479	struct ib_cq *cq = qp->send_cq;
4480	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
4481	struct mlx4_ib_drain_cqe sdrain;
4482	const struct ib_send_wr *bad_swr;
4483	struct ib_rdma_wr swr = {
4484		.wr = {
4485			.next = NULL,
4486			{ .wr_cqe	= &sdrain.cqe, },
4487			.opcode	= IB_WR_RDMA_WRITE,
4488		},
4489	};
4490	int ret;
4491	struct mlx4_ib_dev *dev = to_mdev(qp->device);
4492	struct mlx4_dev *mdev = dev->dev;
4493
4494	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
4495	if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) {
4496		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
4497		return;
4498	}
4499
4500	sdrain.cqe.done = mlx4_ib_drain_qp_done;
4501	init_completion(&sdrain.done);
4502
4503	ret = _mlx4_ib_post_send(qp, &swr.wr, &bad_swr, true);
4504	if (ret) {
4505		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
4506		return;
4507	}
4508
4509	handle_drain_completion(cq, &sdrain, dev);
4510}
4511
4512void mlx4_ib_drain_rq(struct ib_qp *qp)
4513{
4514	struct ib_cq *cq = qp->recv_cq;
4515	struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
4516	struct mlx4_ib_drain_cqe rdrain;
4517	struct ib_recv_wr rwr = {};
4518	const struct ib_recv_wr *bad_rwr;
4519	int ret;
4520	struct mlx4_ib_dev *dev = to_mdev(qp->device);
4521	struct mlx4_dev *mdev = dev->dev;
4522
4523	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
4524	if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) {
4525		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
4526		return;
4527	}
4528
4529	rwr.wr_cqe = &rdrain.cqe;
4530	rdrain.cqe.done = mlx4_ib_drain_qp_done;
4531	init_completion(&rdrain.done);
4532
4533	ret = _mlx4_ib_post_recv(qp, &rwr, &bad_rwr, true);
4534	if (ret) {
4535		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
4536		return;
4537	}
4538
4539	handle_drain_completion(cq, &rdrain, dev);
4540}