v4.6
/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
#define _LINUX_SUNRPC_XPRT_RDMA_H

#include <linux/wait.h>			/* wait_queue_head_t, etc */
#include <linux/spinlock.h>		/* spinlock_t, etc */
#include <linux/atomic.h>		/* atomic_t, etc */
#include <linux/workqueue.h>		/* struct work_struct */

#include <rdma/rdma_cm.h>		/* RDMA connection api */
#include <rdma/ib_verbs.h>		/* RDMA verbs api */

#include <linux/sunrpc/clnt.h>		/* rpc_xprt */
#include <linux/sunrpc/rpc_rdma.h>	/* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h>	/* xprt parameters */

#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */

#define RPCRDMA_BIND_TO		(60U * HZ)
#define RPCRDMA_INIT_REEST_TO	(5U * HZ)
#define RPCRDMA_MAX_REEST_TO	(30U * HZ)
#define RPCRDMA_IDLE_DISC_TO	(5U * 60 * HZ)

/*
 * Interface Adapter -- one per transport instance
 */
struct rpcrdma_ia {
	const struct rpcrdma_memreg_ops	*ri_ops;
	rwlock_t		ri_qplock;
	struct ib_device	*ri_device;
	struct rdma_cm_id	*ri_id;
	struct ib_pd		*ri_pd;
	struct ib_mr		*ri_dma_mr;
	struct completion	ri_done;
	int			ri_async_rc;
	unsigned int		ri_max_frmr_depth;
	struct ib_qp_attr	ri_qp_attr;
	struct ib_qp_init_attr	ri_qp_init_attr;
};

/*
 * RDMA Endpoint -- one per transport instance
 */

struct rpcrdma_ep {
	atomic_t		rep_cqcount;
	int			rep_cqinit;
	int			rep_connected;
	struct ib_qp_init_attr	rep_attr;
	wait_queue_head_t	rep_connect_wait;
	struct rdma_conn_param	rep_remote_cma;
	struct sockaddr_storage	rep_remote_addr;
	struct delayed_work	rep_connect_worker;
};

#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
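
/* Illustrative sketch (not part of the original header): one plausible way
 * the INIT_CQCOUNT/DECR_CQCOUNT macros are consumed when posting a send.
 * Signaling only every rep_cqinit-th Work Request bounds how many
 * unsignaled sends can be outstanding on the completion queue. The helper
 * name rpcrdma_demo_signal_policy is hypothetical.
 */
static inline void
rpcrdma_demo_signal_policy(struct rpcrdma_ep *ep, struct ib_send_wr *wr)
{
	if (DECR_CQCOUNT(ep) <= 0) {
		/* counter exhausted: request a completion and reset */
		wr->send_flags |= IB_SEND_SIGNALED;
		INIT_CQCOUNT(ep);
	} else {
		wr->send_flags &= ~IB_SEND_SIGNALED;
	}
}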

/* Pre-allocate extra Work Requests for handling backward receives
 * and sends. This is a fixed value because the Work Queues are
 * allocated when the forward channel is set up.
 */
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
#define RPCRDMA_BACKWARD_WRS		(8)
#else
#define RPCRDMA_BACKWARD_WRS		(0)
#endif

/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
 *
 * The below structure appears at the front of a large region of kmalloc'd
 * memory, which always starts on a good alignment boundary.
 */

struct rpcrdma_regbuf {
	size_t			rg_size;
	struct rpcrdma_req	*rg_owner;
	struct ib_sge		rg_iov;
	__be32			rg_base[0] __attribute__ ((aligned(256)));
};

static inline u64
rdmab_addr(struct rpcrdma_regbuf *rb)
{
	return rb->rg_iov.addr;
}

static inline u32
rdmab_length(struct rpcrdma_regbuf *rb)
{
	return rb->rg_iov.length;
}

static inline u32
rdmab_lkey(struct rpcrdma_regbuf *rb)
{
	return rb->rg_iov.lkey;
}

static inline struct rpcrdma_msg *
rdmab_to_msg(struct rpcrdma_regbuf *rb)
{
	return (struct rpcrdma_msg *)rb->rg_base;
}
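
/* Illustrative sketch (not part of the original header): how the rdmab_*
 * accessors above might be used to fill an ib_sge describing the first
 * "len" bytes of a registered buffer before posting it. The helper name
 * rpcrdma_demo_fill_sge is hypothetical.
 */
static inline void
rpcrdma_demo_fill_sge(struct ib_sge *sge, struct rpcrdma_regbuf *rb, u32 len)
{
	sge->addr = rdmab_addr(rb);		/* DMA address of rg_base */
	sge->lkey = rdmab_lkey(rb);		/* local key from registration */
	sge->length = min_t(u32, len, rdmab_length(rb));
}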

#define RPCRDMA_DEF_GFP		(GFP_NOIO | __GFP_NOWARN)

/*
 * struct rpcrdma_rep -- this structure encapsulates state required to recv
 * and complete a reply, asynchronously. It needs several pieces of
 * state:
 *   o recv buffer (posted to provider)
 *   o ib_sge (also donated to provider)
 *   o status of reply (length, success or not)
 *   o bookkeeping state to get run by tasklet (list, etc)
 *
 * These are allocated during initialization, per-transport instance;
 * however, the tasklet execution list itself is global, as it should
 * always be pretty short.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 */

#define RPCRDMA_MAX_DATA_SEGS	((1 * 1024 * 1024) / PAGE_SIZE)
#define RPCRDMA_MAX_SEGS	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */

struct rpcrdma_buffer;

struct rpcrdma_rep {
	struct ib_cqe		rr_cqe;
	unsigned int		rr_len;
	struct ib_device	*rr_device;
	struct rpcrdma_xprt	*rr_rxprt;
	struct work_struct	rr_work;
	struct list_head	rr_list;
	struct rpcrdma_regbuf	*rr_rdmabuf;
};
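
/* Illustrative sketch (not part of the original header, and it assumes the
 * rpcrdma_recv_buffer_put() and rpcrdma_ep_post_recv() declarations that
 * appear later in this file are in scope): one plausible recycle path for
 * a rpcrdma_rep once its reply has been consumed -- either return it to
 * the buffer pool or hand its receive buffer straight back to the
 * provider. The helper name rpcrdma_demo_recycle_rep is hypothetical.
 */
static inline int
rpcrdma_demo_recycle_rep(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
			 struct rpcrdma_rep *rep, bool repost)
{
	if (!repost) {
		rpcrdma_recv_buffer_put(rep);	/* back to rb_recv_bufs */
		return 0;
	}
	/* donate the receive buffer to the provider again */
	return rpcrdma_ep_post_recv(ia, ep, rep);
}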

#define RPCRDMA_BAD_LEN		(~0U)

/*
 * struct rpcrdma_mw - external memory region metadata
 *
 * An external memory region is any buffer or page that is registered
 * on the fly (i.e., not pre-registered).
 *
 * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During
 * call_allocate, rpcrdma_buffer_get() assigns one to each segment in
 * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep
 * track of registration metadata while each RPC is pending.
 * rpcrdma_deregister_external() uses this metadata to unmap and
 * release these resources when an RPC is complete.
 */
enum rpcrdma_frmr_state {
	FRMR_IS_INVALID,	/* ready to be used */
	FRMR_IS_VALID,		/* in use */
	FRMR_IS_STALE,		/* failed completion */
};

struct rpcrdma_frmr {
	struct scatterlist		*sg;
	int				sg_nents;
	struct ib_mr			*fr_mr;
	struct ib_cqe			fr_cqe;
	enum rpcrdma_frmr_state		fr_state;
	struct completion		fr_linv_done;
	struct work_struct		fr_work;
	struct rpcrdma_xprt		*fr_xprt;
	union {
		struct ib_reg_wr	fr_regwr;
		struct ib_send_wr	fr_invwr;
	};
};

struct rpcrdma_fmr {
	struct ib_fmr		*fmr;
	u64			*physaddrs;
};

struct rpcrdma_mw {
	union {
		struct rpcrdma_fmr	fmr;
		struct rpcrdma_frmr	frmr;
	};
	struct list_head	mw_list;
	struct list_head	mw_all;
};
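
/* Illustrative sketch (not part of the original header, and it assumes the
 * rpcrdma_get_mw()/rpcrdma_put_mw() declarations later in this file are in
 * scope): the checkout/checkin lifecycle the comment above describes. A
 * registration strategy takes a free MW off rb_mws for the duration of an
 * RPC and returns it when the registration is torn down. The helper name
 * rpcrdma_demo_mw_cycle is hypothetical.
 */
static inline void
rpcrdma_demo_mw_cycle(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_mw *mw;

	mw = rpcrdma_get_mw(r_xprt);	/* checkout: may return NULL */
	if (!mw)
		return;
	/* ... mw carries FRMR/FMR state while an RPC is pending ... */
	rpcrdma_put_mw(r_xprt, mw);	/* checkin: back onto rb_mws */
}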

/*
 * struct rpcrdma_req -- structure central to the request/reply sequence.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 *
 * It includes pre-registered buffer memory for send AND recv.
 * The recv buffer, however, is not owned by this structure, and
 * is "donated" to the hardware when a recv is posted. When a
 * reply is handled, the recv buffer used is given back to the
 * struct rpcrdma_req associated with the request.
 *
 * In addition to the basic memory, this structure includes an array
 * of iovs for send operations. The reason is that the iovs passed to
 * ib_post_{send,recv} must not be modified until the work request
 * completes.
 *
 * NOTES:
 *   o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
 *     marshal. The number needed varies depending on the iov lists that
 *     are passed to us, the memory registration mode we are in, and if
 *     physical addressing is used, the layout.
 */

struct rpcrdma_mr_seg {		/* chunk descriptors */
	struct rpcrdma_mw *rl_mw;	/* registered MR */
	u64		mr_base;	/* registration result */
	u32		mr_rkey;	/* registration result */
	u32		mr_len;		/* length of chunk or segment */
	int		mr_nsegs;	/* number of segments in chunk or 0 */
	enum dma_data_direction	mr_dir;	/* segment mapping direction */
	dma_addr_t	mr_dma;		/* segment mapping address */
	size_t		mr_dmalen;	/* segment mapping length */
	struct page	*mr_page;	/* owning page, if any */
	char		*mr_offset;	/* kva if no page, else offset */
};

#define RPCRDMA_MAX_IOVS	(2)

struct rpcrdma_req {
	struct list_head	rl_free;
	unsigned int		rl_niovs;
	unsigned int		rl_nchunks;
	unsigned int		rl_connect_cookie;
	struct rpcrdma_buffer	*rl_buffer;
	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
	struct rpcrdma_regbuf	*rl_rdmabuf;
	struct rpcrdma_regbuf	*rl_sendbuf;
	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];

	struct ib_cqe		rl_cqe;
	struct list_head	rl_all;
	bool			rl_backchannel;
};

static inline struct rpcrdma_req *
rpcr_to_rdmar(struct rpc_rqst *rqst)
{
	void *buffer = rqst->rq_buffer;
	struct rpcrdma_regbuf *rb;

	rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
	return rb->rg_owner;
}
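
/* Illustrative sketch (not part of the original header): how the two
 * rl_send_iov slots might be filled for an inline call -- slot 0 covers
 * the RPC/RDMA header in rl_rdmabuf, slot 1 the marshaled RPC message in
 * rl_sendbuf -- which is why RPCRDMA_MAX_IOVS is 2 here. The helper name
 * rpcrdma_demo_build_send_iovs and its length parameters are hypothetical.
 */
static inline void
rpcrdma_demo_build_send_iovs(struct rpcrdma_req *req, u32 hdrlen, u32 rpclen)
{
	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
	req->rl_send_iov[0].length = hdrlen;
	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);

	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
	req->rl_send_iov[1].length = rpclen;
	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);

	req->rl_niovs = 2;	/* both iovs must stay stable until the
				 * send Work Request completes */
}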

/*
 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
 * inline requests/replies, and client/server credits.
 *
 * One of these is associated with a transport instance
 */
struct rpcrdma_buffer {
	spinlock_t		rb_mwlock;	/* protect rb_mws list */
	struct list_head	rb_mws;
	struct list_head	rb_all;
	char			*rb_pool;

	spinlock_t		rb_lock;	/* protect buf lists */
	struct list_head	rb_send_bufs;
	struct list_head	rb_recv_bufs;
	u32			rb_max_requests;
	atomic_t		rb_credits;	/* most recent credit grant */

	u32			rb_bc_srv_max_requests;
	spinlock_t		rb_reqslock;	/* protect rb_allreqs */
	struct list_head	rb_allreqs;

	u32			rb_bc_max_requests;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)

/*
 * Internal structure for transport instance creation. This
 * exists primarily for modularity.
 *
 * This data should be set with mount options
 */
struct rpcrdma_create_data_internal {
	struct sockaddr_storage	addr;	/* RDMA server address */
	unsigned int	max_requests;	/* max requests (slots) in flight */
	unsigned int	rsize;		/* mount rsize - max read hdr+data */
	unsigned int	wsize;		/* mount wsize - max write hdr+data */
	unsigned int	inline_rsize;	/* max non-rdma read data payload */
	unsigned int	inline_wsize;	/* max non-rdma write data payload */
	unsigned int	padding;	/* non-rdma write header padding */
};

#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
	(rpcx_to_rdmad(rq->rq_xprt).inline_rsize)

#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
	(rpcx_to_rdmad(rq->rq_xprt).inline_wsize)

#define RPCRDMA_INLINE_PAD_VALUE(rq)\
	rpcx_to_rdmad(rq->rq_xprt).padding
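
/* Illustrative sketch (not part of the original header, and it relies on
 * the rpcx_to_rdmad() helper defined further down this file): the kind of
 * marshaling decision the thresholds above feed. If the outgoing call fits
 * under the inline write threshold it can travel in a single RDMA SEND;
 * otherwise the payload must be moved via chunks. The helper name
 * rpcrdma_demo_use_inline is hypothetical.
 */
static inline bool
rpcrdma_demo_use_inline(struct rpc_rqst *rqst)
{
	return rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
}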

/*
 * Statistics for RPCRDMA
 */
struct rpcrdma_stats {
	unsigned long		read_chunk_count;
	unsigned long		write_chunk_count;
	unsigned long		reply_chunk_count;

	unsigned long long	total_rdma_request;
	unsigned long long	total_rdma_reply;

	unsigned long long	pullup_copy_count;
	unsigned long long	fixup_copy_count;
	unsigned long		hardway_register_count;
	unsigned long		failed_marshal_count;
	unsigned long		bad_reply_count;
	unsigned long		nomsg_call_count;
	unsigned long		bcall_count;
};

/*
 * Per-registration mode operations
 */
struct rpcrdma_xprt;
struct rpcrdma_memreg_ops {
	int		(*ro_map)(struct rpcrdma_xprt *,
				  struct rpcrdma_mr_seg *, int, bool);
	void		(*ro_unmap_sync)(struct rpcrdma_xprt *,
					 struct rpcrdma_req *);
	int		(*ro_unmap)(struct rpcrdma_xprt *,
				    struct rpcrdma_mr_seg *);
	int		(*ro_open)(struct rpcrdma_ia *,
				   struct rpcrdma_ep *,
				   struct rpcrdma_create_data_internal *);
	size_t		(*ro_maxpages)(struct rpcrdma_xprt *);
	int		(*ro_init)(struct rpcrdma_xprt *);
	void		(*ro_destroy)(struct rpcrdma_buffer *);
	const char	*ro_displayname;
};
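
/* Illustrative sketch (not part of the original header): a registration
 * strategy (FRWR, FMR, or physical) is selected once at IA open time and
 * invoked indirectly through ri_ops, so callers never test which strategy
 * is active at each call site. Something like the following maps one
 * chunk through whichever strategy the interface adapter chose; the
 * helper name rpcrdma_demo_map_chunk is hypothetical.
 */
static inline int
rpcrdma_demo_map_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ia *ia,
		       struct rpcrdma_mr_seg *seg, int nsegs, bool writing)
{
	/* e.g. rpcrdma_frwr_memreg_ops.ro_map() when FRWR is in use */
	return ia->ri_ops->ro_map(r_xprt, seg, nsegs, writing);
}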

extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops;
extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops;

/*
 * RPCRDMA transport -- encapsulates the structures above for
 * integration with RPC.
 *
 * The contained structures are embedded, not pointers,
 * for convenience. This structure need not be visible externally.
 *
 * It is allocated and initialized during mount, and released
 * during unmount.
 */
struct rpcrdma_xprt {
	struct rpc_xprt		rx_xprt;
	struct rpcrdma_ia	rx_ia;
	struct rpcrdma_ep	rx_ep;
	struct rpcrdma_buffer	rx_buf;
	struct rpcrdma_create_data_internal rx_data;
	struct delayed_work	rx_connect_worker;
	struct rpcrdma_stats	rx_stats;
};

#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, rx_xprt)
#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)

/* Setting this to 0 ensures interoperability with early servers.
 * Setting this to 1 enhances certain unaligned read/write performance.
 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;

/*
 * Interface Adapter calls - xprtrdma/verbs.c
 */
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);

/*
 * Endpoint calls - xprtrdma/verbs.c
 */
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);

int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_req *);
int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_rep *);

/*
 * Buffer calls - xprtrdma/verbs.c
 */
struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);

struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);

struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
					    size_t, gfp_t);
void rpcrdma_free_regbuf(struct rpcrdma_ia *,
			 struct rpcrdma_regbuf *);

unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);

int frwr_alloc_recovery_wq(void);
void frwr_destroy_recovery_wq(void);

int rpcrdma_alloc_wq(void);
void rpcrdma_destroy_wq(void);

/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

void rpcrdma_mapping_error(struct rpcrdma_mr_seg *);

static inline enum dma_data_direction
rpcrdma_data_dir(bool writing)
{
	return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
}

static inline void
rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg,
		enum dma_data_direction direction)
{
	seg->mr_dir = direction;
	seg->mr_dmalen = seg->mr_len;

	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);

	if (ib_dma_mapping_error(device, seg->mr_dma))
		rpcrdma_mapping_error(seg);
}

static inline void
rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(device,
				  seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(device,
				    seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}
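
/* Illustrative sketch (not part of the original header): the expected
 * pairing of the two helpers above around a registration. "writing" is
 * true when the server will write into this memory (e.g. an NFS READ),
 * so rpcrdma_data_dir() yields DMA_FROM_DEVICE; mapping and unmapping
 * must use the same direction, which is why mr_dir is cached in the
 * segment. The helper name rpcrdma_demo_map_cycle is hypothetical.
 */
static inline void
rpcrdma_demo_map_cycle(struct ib_device *device, struct rpcrdma_mr_seg *seg,
		       bool writing)
{
	rpcrdma_map_one(device, seg, rpcrdma_data_dir(writing));
	/* ... hardware accesses seg->mr_dma while the RPC is pending ... */
	rpcrdma_unmap_one(device, seg);
}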

/*
 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
 */
void rpcrdma_connect_worker(struct work_struct *);
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);

/*
 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
 */
int rpcrdma_marshal_req(struct rpc_rqst *);

/* RPC/RDMA module init - xprtrdma/transport.c
 */
extern unsigned int xprt_rdma_max_inline_read;
void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);

/* Backchannel calls - xprtrdma/backchannel.c
 */
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
int xprt_rdma_bc_up(struct svc_serv *, struct net *);
int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

extern struct xprt_class xprt_rdma_bc;

#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */
v3.5.6
/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _LINUX_SUNRPC_XPRT_RDMA_H
#define _LINUX_SUNRPC_XPRT_RDMA_H

#include <linux/wait.h>			/* wait_queue_head_t, etc */
#include <linux/spinlock.h>		/* spinlock_t, etc */
#include <linux/atomic.h>		/* atomic_t, etc */

#include <rdma/rdma_cm.h>		/* RDMA connection api */
#include <rdma/ib_verbs.h>		/* RDMA verbs api */

#include <linux/sunrpc/clnt.h>		/* rpc_xprt */
#include <linux/sunrpc/rpc_rdma.h>	/* RPC/RDMA protocol */
#include <linux/sunrpc/xprtrdma.h>	/* xprt parameters */

#define RDMA_RESOLVE_TIMEOUT	(5000)	/* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX	(2)	/* retries if no listener backlog */

/*
 * Interface Adapter -- one per transport instance
 */
struct rpcrdma_ia {
	struct rdma_cm_id	*ri_id;
	struct ib_pd		*ri_pd;
	struct ib_mr		*ri_bind_mem;
	u32			ri_dma_lkey;
	int			ri_have_dma_lkey;
	struct completion	ri_done;
	int			ri_async_rc;
	enum rpcrdma_memreg	ri_memreg_strategy;
};

/*
 * RDMA Endpoint -- one per transport instance
 */

struct rpcrdma_ep {
	atomic_t		rep_cqcount;
	int			rep_cqinit;
	int			rep_connected;
	struct rpcrdma_ia	*rep_ia;
	struct ib_cq		*rep_cq;
	struct ib_qp_init_attr	rep_attr;
	wait_queue_head_t	rep_connect_wait;
	struct ib_sge		rep_pad;	/* holds zeroed pad */
	struct ib_mr		*rep_pad_mr;	/* holds zeroed pad */
	void			(*rep_func)(struct rpcrdma_ep *);
	struct rpc_xprt		*rep_xprt;	/* for rep_func */
	struct rdma_conn_param	rep_remote_cma;
	struct sockaddr_storage	rep_remote_addr;
};

#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)

/*
 * struct rpcrdma_rep -- this structure encapsulates state required to recv
 * and complete a reply, asynchronously. It needs several pieces of
 * state:
 *   o recv buffer (posted to provider)
 *   o ib_sge (also donated to provider)
 *   o status of reply (length, success or not)
 *   o bookkeeping state to get run by tasklet (list, etc)
 *
 * These are allocated during initialization, per-transport instance;
 * however, the tasklet execution list itself is global, as it should
 * always be pretty short.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 */

/* temporary static scatter/gather max */
#define RPCRDMA_MAX_DATA_SEGS	(64)	/* max scatter/gather */
#define RPCRDMA_MAX_SEGS	(RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
#define MAX_RPCRDMAHDR	(\
	/* max supported RPC/RDMA header */ \
	sizeof(struct rpcrdma_msg) + (2 * sizeof(u32)) + \
	(sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS) + sizeof(u32))
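
/* Illustrative expansion (not part of the original header) of one plausible
 * reading of what the MAX_RPCRDMAHDR bound accounts for. The function below
 * simply restates the macro so the terms can be annotated; its name,
 * rpcrdma_demo_max_hdr, is hypothetical.
 */
static inline size_t
rpcrdma_demo_max_hdr(void)
{
	return sizeof(struct rpcrdma_msg)	/* fixed transport header */
	    + 2 * sizeof(u32)			/* two more 32-bit words in
						 * the chunk list encoding */
	    + sizeof(struct rpcrdma_read_chunk) * RPCRDMA_MAX_SEGS
						/* worst case: every segment
						 * becomes a read chunk */
	    + sizeof(u32);			/* final list terminator */
}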

struct rpcrdma_buffer;

struct rpcrdma_rep {
	unsigned int	rr_len;		/* actual received reply length */
	struct rpcrdma_buffer *rr_buffer; /* home base for this structure */
	struct rpc_xprt	*rr_xprt;	/* needed for request/reply matching */
	void (*rr_func)(struct rpcrdma_rep *);/* called by tasklet in softint */
	struct list_head rr_list;	/* tasklet list */
	wait_queue_head_t rr_unbind;	/* optional unbind wait */
	struct ib_sge	rr_iov;		/* for posting */
	struct ib_mr	*rr_handle;	/* handle for mem in rr_iov */
	char	rr_base[MAX_RPCRDMAHDR]; /* minimal inline receive buffer */
};

/*
 * struct rpcrdma_req -- structure central to the request/reply sequence.
 *
 * N of these are associated with a transport instance, and stored in
 * struct rpcrdma_buffer. N is the max number of outstanding requests.
 *
 * It includes pre-registered buffer memory for send AND recv.
 * The recv buffer, however, is not owned by this structure, and
 * is "donated" to the hardware when a recv is posted. When a
 * reply is handled, the recv buffer used is given back to the
 * struct rpcrdma_req associated with the request.
 *
 * In addition to the basic memory, this structure includes an array
 * of iovs for send operations. The reason is that the iovs passed to
 * ib_post_{send,recv} must not be modified until the work request
 * completes.
 *
 * NOTES:
 *   o RPCRDMA_MAX_SEGS is the max number of addressable chunk elements we
 *     marshal. The number needed varies depending on the iov lists that
 *     are passed to us, the memory registration mode we are in, and if
 *     physical addressing is used, the layout.
 */

struct rpcrdma_mr_seg {		/* chunk descriptors */
	union {				/* chunk memory handles */
		struct ib_mr	*rl_mr;		/* if registered directly */
		struct rpcrdma_mw {		/* if registered from region */
			union {
				struct ib_mw	*mw;
				struct ib_fmr	*fmr;
				struct {
					struct ib_fast_reg_page_list *fr_pgl;
					struct ib_mr *fr_mr;
					enum { FRMR_IS_INVALID, FRMR_IS_VALID } state;
				} frmr;
			} r;
			struct list_head mw_list;
		} *rl_mw;
	} mr_chunk;
	u64		mr_base;	/* registration result */
	u32		mr_rkey;	/* registration result */
	u32		mr_len;		/* length of chunk or segment */
	int		mr_nsegs;	/* number of segments in chunk or 0 */
	enum dma_data_direction	mr_dir;	/* segment mapping direction */
	dma_addr_t	mr_dma;		/* segment mapping address */
	size_t		mr_dmalen;	/* segment mapping length */
	struct page	*mr_page;	/* owning page, if any */
	char		*mr_offset;	/* kva if no page, else offset */
};

struct rpcrdma_req {
	size_t		rl_size;	/* actual length of buffer */
	unsigned int	rl_niovs;	/* 0, 2 or 4 */
	unsigned int	rl_nchunks;	/* non-zero if chunks */
	unsigned int	rl_connect_cookie;	/* retry detection */
	struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
	struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */
	struct ib_sge	rl_send_iov[4];	/* for active requests */
	struct ib_sge	rl_iov;		/* for posting */
	struct ib_mr	*rl_handle;	/* handle for mem in rl_iov */
	char		rl_base[MAX_RPCRDMAHDR]; /* start of actual buffer */
	__u32		rl_xdr_buf[0];	/* start of returned rpc rq_buffer */
};
#define rpcr_to_rdmar(r) \
	container_of((r)->rq_buffer, struct rpcrdma_req, rl_xdr_buf[0])

/*
 * struct rpcrdma_buffer -- holds list/queue of pre-registered memory for
 * inline requests/replies, and client/server credits.
 *
 * One of these is associated with a transport instance
 */
struct rpcrdma_buffer {
	spinlock_t	rb_lock;	/* protects indexes */
	atomic_t	rb_credits;	/* most recent server credits */
	unsigned long	rb_cwndscale;	/* cached framework rpc_cwndscale */
	int		rb_max_requests;/* client max requests */
	struct list_head rb_mws;	/* optional memory windows/fmrs/frmrs */
	int		rb_send_index;
	struct rpcrdma_req	**rb_send_bufs;
	int		rb_recv_index;
	struct rpcrdma_rep	**rb_recv_bufs;
	char		*rb_pool;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)

/*
 * Internal structure for transport instance creation. This
 * exists primarily for modularity.
 *
 * This data should be set with mount options
 */
struct rpcrdma_create_data_internal {
	struct sockaddr_storage	addr;	/* RDMA server address */
	unsigned int	max_requests;	/* max requests (slots) in flight */
	unsigned int	rsize;		/* mount rsize - max read hdr+data */
	unsigned int	wsize;		/* mount wsize - max write hdr+data */
	unsigned int	inline_rsize;	/* max non-rdma read data payload */
	unsigned int	inline_wsize;	/* max non-rdma write data payload */
	unsigned int	padding;	/* non-rdma write header padding */
};

#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_rsize)

#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
	(rpcx_to_rdmad(rq->rq_task->tk_xprt).inline_wsize)

#define RPCRDMA_INLINE_PAD_VALUE(rq)\
	rpcx_to_rdmad(rq->rq_task->tk_xprt).padding

/*
 * Statistics for RPCRDMA
 */
struct rpcrdma_stats {
	unsigned long		read_chunk_count;
	unsigned long		write_chunk_count;
	unsigned long		reply_chunk_count;

	unsigned long long	total_rdma_request;
	unsigned long long	total_rdma_reply;

	unsigned long long	pullup_copy_count;
	unsigned long long	fixup_copy_count;
	unsigned long		hardway_register_count;
	unsigned long		failed_marshal_count;
	unsigned long		bad_reply_count;
};

/*
 * RPCRDMA transport -- encapsulates the structures above for
 * integration with RPC.
 *
 * The contained structures are embedded, not pointers,
 * for convenience. This structure need not be visible externally.
 *
 * It is allocated and initialized during mount, and released
 * during unmount.
 */
struct rpcrdma_xprt {
	struct rpc_xprt		xprt;
	struct rpcrdma_ia	rx_ia;
	struct rpcrdma_ep	rx_ep;
	struct rpcrdma_buffer	rx_buf;
	struct rpcrdma_create_data_internal rx_data;
	struct delayed_work	rdma_connect;
	struct rpcrdma_stats	rx_stats;
};

#define rpcx_to_rdmax(x) container_of(x, struct rpcrdma_xprt, xprt)
#define rpcx_to_rdmad(x) (rpcx_to_rdmax(x)->rx_data)

/* Setting this to 0 ensures interoperability with early servers.
 * Setting this to 1 enhances certain unaligned read/write performance.
 * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */
extern int xprt_rdma_pad_optimize;

/*
 * Interface Adapter calls - xprtrdma/verbs.c
 */
int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int);
void rpcrdma_ia_close(struct rpcrdma_ia *);

/*
 * Endpoint calls - xprtrdma/verbs.c
 */
int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
int rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);

int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_req *);
int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
				struct rpcrdma_rep *);

/*
 * Buffer calls - xprtrdma/verbs.c
 */
int rpcrdma_buffer_create(struct rpcrdma_buffer *, struct rpcrdma_ep *,
				struct rpcrdma_ia *,
				struct rpcrdma_create_data_internal *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);

struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
void rpcrdma_buffer_put(struct rpcrdma_req *);
void rpcrdma_recv_buffer_get(struct rpcrdma_req *);
void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);

int rpcrdma_register_internal(struct rpcrdma_ia *, void *, int,
				struct ib_mr **, struct ib_sge *);
int rpcrdma_deregister_internal(struct rpcrdma_ia *,
				struct ib_mr *, struct ib_sge *);

int rpcrdma_register_external(struct rpcrdma_mr_seg *,
				int, int, struct rpcrdma_xprt *);
int rpcrdma_deregister_external(struct rpcrdma_mr_seg *,
				struct rpcrdma_xprt *, void *);

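/* Illustrative sketch (not part of the original header): the expected
 * pairing of the external registration calls above. A chunk of "nsegs"
 * segments is registered while its RPC is pending and deregistered at
 * completion; "writing" selects the mapping direction. The helper name
 * rpcrdma_demo_register_cycle is hypothetical, and passing NULL as the
 * third argument to rpcrdma_deregister_external is a simplifying
 * assumption.
 */
static inline int
rpcrdma_demo_register_cycle(struct rpcrdma_mr_seg *seg, int nsegs,
			    int writing, struct rpcrdma_xprt *r_xprt)
{
	int n;

	n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
	if (n < 0)
		return n;	/* registration failed */
	/* ... the RPC runs; the server reads or writes the chunk ... */
	return rpcrdma_deregister_external(seg, r_xprt, NULL);
}
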
/*
 * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c
 */
void rpcrdma_conn_func(struct rpcrdma_ep *);
void rpcrdma_reply_handler(struct rpcrdma_rep *);

/*
 * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
 */
int rpcrdma_marshal_req(struct rpc_rqst *);

/* Temporary NFS request map cache. Created in svc_rdma.c  */
extern struct kmem_cache *svc_rdma_map_cachep;
/* WR context cache. Created in svc_rdma.c  */
extern struct kmem_cache *svc_rdma_ctxt_cachep;
/* Workqueue created in svc_rdma.c */
extern struct workqueue_struct *svc_rdma_wq;

#endif				/* _LINUX_SUNRPC_XPRT_RDMA_H */