v3.1
  1/******************************************************************************
  2 *
  3 * Back-end of the driver for virtual block devices. This portion of the
  4 * driver exports a 'unified' block-device interface that can be accessed
  5 * by any operating system that implements a compatible front end. A
  6 * reference front-end implementation can be found in:
  7 *  drivers/block/xen-blkfront.c
  8 *
  9 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 10 * Copyright (c) 2005, Christopher Clark
 11 *
 12 * This program is free software; you can redistribute it and/or
 13 * modify it under the terms of the GNU General Public License version 2
 14 * as published by the Free Software Foundation; or, when distributed
 15 * separately from the Linux kernel or incorporated into other
 16 * software packages, subject to the following license:
 17 *
 18 * Permission is hereby granted, free of charge, to any person obtaining a copy
 19 * of this source file (the "Software"), to deal in the Software without
 20 * restriction, including without limitation the rights to use, copy, modify,
 21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 22 * and to permit persons to whom the Software is furnished to do so, subject to
 23 * the following conditions:
 24 *
 25 * The above copyright notice and this permission notice shall be included in
 26 * all copies or substantial portions of the Software.
 27 *
 28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 34 * IN THE SOFTWARE.
 35 */
 36
 37#include <linux/spinlock.h>
 38#include <linux/kthread.h>
 39#include <linux/list.h>
 40#include <linux/delay.h>
 41#include <linux/freezer.h>
 42
 43#include <xen/events.h>
 44#include <xen/page.h>
 45#include <asm/xen/hypervisor.h>
 46#include <asm/xen/hypercall.h>
 47#include "common.h"
 48
 49/*
 50 * These are rather arbitrary. They are fairly large because adjacent requests
 51 * pulled from a communication ring are quite likely to end up being part of
 52 * the same scatter/gather request at the disc.
 53 *
 54 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 55 *
 56 * This will increase the chances of being able to write whole tracks.
 57 * 64 should be enough to keep us competitive with Linux.
 58 */
 59static int xen_blkif_reqs = 64;
 60module_param_named(reqs, xen_blkif_reqs, int, 0);
 61MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
 62
 63/* Run-time switchable: /sys/module/blkback/parameters/ */
 64static unsigned int log_stats;
 65module_param(log_stats, int, 0644);
 66
 67/*
 68 * Each outstanding request that we've passed to the lower device layers has a
 69 * 'pending_req' allocated to it. Each bio that completes decrements
 70 * the pendcnt towards zero. When it hits zero, the specified domain has a
 71 * response queued for it, with the saved 'id' passed back.
 72 */
 73struct pending_req {
 74	struct xen_blkif	*blkif;
 75	u64			id;
 76	int			nr_pages;
 77	atomic_t		pendcnt;
 78	unsigned short		operation;
 79	int			status;
 80	struct list_head	free_list;
 81};
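/*
 * For example, a request that dispatch_rw_block_io() splits into three
 * bios starts with pendcnt == 3; each end_block_io_op() completion
 * decrements it, and the third completion triggers the grant unmap and
 * the response on the ring (see __end_block_io_op()).
 */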
 82
 83#define BLKBACK_INVALID_HANDLE (~0)
 84
 85struct xen_blkbk {
 86	struct pending_req	*pending_reqs;
 87	/* List of all 'pending_req' available */
 88	struct list_head	pending_free;
 89	/* And its spinlock. */
 90	spinlock_t		pending_free_lock;
 91	wait_queue_head_t	pending_free_wq;
 92	/* The list of all pages that are available. */
 93	struct page		**pending_pages;
 94	/* And the grant handles that are available. */
 95	grant_handle_t		*pending_grant_handles;
 96};
 97
 98static struct xen_blkbk *blkbk;
 99
100/*
101 * Little helpful macro to figure out the index and virtual address of the
102 * pending_pages[..]. For each 'pending_req' we have up to
103 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
104 * 10 and would index in the pending_pages[..].
105 */
106static inline int vaddr_pagenr(struct pending_req *req, int seg)
107{
108	return (req - blkbk->pending_reqs) *
109		BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
110}
111
112#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
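/*
 * Worked example of the index math: with BLKIF_MAX_SEGMENTS_PER_REQUEST
 * equal to 11, segment 5 of the third pending_req (req - pending_reqs == 2)
 * gives vaddr_pagenr() == 2 * 11 + 5 == 27, i.e. pending_pages[27] and
 * pending_grant_handles[27].
 */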
113
114static inline unsigned long vaddr(struct pending_req *req, int seg)
115{
116	unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
117	return (unsigned long)pfn_to_kaddr(pfn);
118}
119
120#define pending_handle(_req, _seg) \
121	(blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
122
123
124static int do_block_io_op(struct xen_blkif *blkif);
125static int dispatch_rw_block_io(struct xen_blkif *blkif,
126				struct blkif_request *req,
127				struct pending_req *pending_req);
128static void make_response(struct xen_blkif *blkif, u64 id,
129			  unsigned short op, int st);
130
131/*
132 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
133 */
134static struct pending_req *alloc_req(void)
135{
136	struct pending_req *req = NULL;
137	unsigned long flags;
138
139	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
140	if (!list_empty(&blkbk->pending_free)) {
141		req = list_entry(blkbk->pending_free.next, struct pending_req,
142				 free_list);
143		list_del(&req->free_list);
144	}
145	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
146	return req;
147}
148
149/*
150 * Return the 'pending_req' structure back to the freepool. We also
151 * wake up the thread if it was waiting for a free page.
152 */
153static void free_req(struct pending_req *req)
154{
155	unsigned long flags;
156	int was_empty;
157
158	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
159	was_empty = list_empty(&blkbk->pending_free);
160	list_add(&req->free_list, &blkbk->pending_free);
161	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
162	if (was_empty)
163		wake_up(&blkbk->pending_free_wq);
164}
165
166/*
167 * Routines for managing virtual block devices (vbds).
168 */
169static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
170			     int operation)
171{
172	struct xen_vbd *vbd = &blkif->vbd;
173	int rc = -EACCES;
174
175	if ((operation != READ) && vbd->readonly)
176		goto out;
177
178	if (likely(req->nr_sects)) {
179		blkif_sector_t end = req->sector_number + req->nr_sects;
180
181		if (unlikely(end < req->sector_number))
182			goto out;
183		if (unlikely(end > vbd_sz(vbd)))
184			goto out;
185	}
186
187	req->dev  = vbd->pdevice;
188	req->bdev = vbd->bdev;
189	rc = 0;
190
191 out:
192	return rc;
193}
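/*
 * The two range checks above reject both requests that run past the end of
 * the vbd and requests whose 64-bit end wraps around: e.g. sector_number ==
 * 0xffffffffffffff00 with nr_sects == 0x200 yields end < sector_number and
 * is refused before any bdev is handed back.
 */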
194
195static void xen_vbd_resize(struct xen_blkif *blkif)
196{
197	struct xen_vbd *vbd = &blkif->vbd;
198	struct xenbus_transaction xbt;
199	int err;
200	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
201	unsigned long long new_size = vbd_sz(vbd);
202
203	pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
204		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
205	pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
206	vbd->size = new_size;
207again:
208	err = xenbus_transaction_start(&xbt);
209	if (err) {
210		pr_warn(DRV_PFX "Error starting transaction");
211		return;
212	}
213	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
214			    (unsigned long long)vbd_sz(vbd));
215	if (err) {
216		pr_warn(DRV_PFX "Error writing new size");
217		goto abort;
218	}
219	/*
220	 * Write the current state; we will use this to synchronize
221	 * the front-end. If the current state is "connected" the
222	 * front-end will get the new size information online.
223	 */
224	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
225	if (err) {
226		pr_warn(DRV_PFX "Error writing the state");
227		goto abort;
228	}
229
230	err = xenbus_transaction_end(xbt, 0);
231	if (err == -EAGAIN)
232		goto again;
233	if (err)
234		pr_warn(DRV_PFX "Error ending transaction");
235	return;
236abort:
237	xenbus_transaction_end(xbt, 1);
238}
239
240/*
241 * Notification from the guest OS.
242 */
243static void blkif_notify_work(struct xen_blkif *blkif)
244{
245	blkif->waiting_reqs = 1;
246	wake_up(&blkif->wq);
247}
248
249irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
250{
251	blkif_notify_work(dev_id);
252	return IRQ_HANDLED;
253}
254
255/*
256 * SCHEDULER FUNCTIONS
257 */
258
259static void print_stats(struct xen_blkif *blkif)
260{
261	pr_info("xen-blkback (%s): oo %3d  |  rd %4d  |  wr %4d  |  f %4d\n",
262		 current->comm, blkif->st_oo_req,
263		 blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req);
264	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
265	blkif->st_rd_req = 0;
266	blkif->st_wr_req = 0;
267	blkif->st_oo_req = 0;
268}
269
270int xen_blkif_schedule(void *arg)
271{
272	struct xen_blkif *blkif = arg;
273	struct xen_vbd *vbd = &blkif->vbd;
274
275	xen_blkif_get(blkif);
276
277	while (!kthread_should_stop()) {
278		if (try_to_freeze())
279			continue;
280		if (unlikely(vbd->size != vbd_sz(vbd)))
281			xen_vbd_resize(blkif);
282
283		wait_event_interruptible(
284			blkif->wq,
285			blkif->waiting_reqs || kthread_should_stop());
286		wait_event_interruptible(
287			blkbk->pending_free_wq,
288			!list_empty(&blkbk->pending_free) ||
289			kthread_should_stop());
290
291		blkif->waiting_reqs = 0;
292		smp_mb(); /* clear flag *before* checking for work */
293
294		if (do_block_io_op(blkif))
295			blkif->waiting_reqs = 1;
296
297		if (log_stats && time_after(jiffies, blkif->st_print))
298			print_stats(blkif);
299	}
300
301	if (log_stats)
302		print_stats(blkif);
303
304	blkif->xenblkd = NULL;
305	xen_blkif_put(blkif);
306
307	return 0;
308}
309
310struct seg_buf {
311	unsigned long buf;
312	unsigned int nsec;
313};
314/*
315 * Unmap the grant references, and also remove the M2P over-rides
316 * used in the 'pending_req'.
317 */
318static void xen_blkbk_unmap(struct pending_req *req)
319{
320	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
321	unsigned int i, invcount = 0;
322	grant_handle_t handle;
323	int ret;
324
325	for (i = 0; i < req->nr_pages; i++) {
326		handle = pending_handle(req, i);
327		if (handle == BLKBACK_INVALID_HANDLE)
328			continue;
329		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
330				    GNTMAP_host_map, handle);
331		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
332		invcount++;
333	}
334
335	ret = HYPERVISOR_grant_table_op(
336		GNTTABOP_unmap_grant_ref, unmap, invcount);
337	BUG_ON(ret);
338	/*
339	 * Note, we use invcount, not req->nr_pages, so we can't index
340	 * using vaddr(req, i).
341	 */
342	for (i = 0; i < invcount; i++) {
343		ret = m2p_remove_override(
344			virt_to_page(unmap[i].host_addr), false);
345		if (ret) {
346			pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n",
347				 (unsigned long)unmap[i].host_addr);
348			continue;
349		}
350	}
351}
352
353static int xen_blkbk_map(struct blkif_request *req,
354			 struct pending_req *pending_req,
355			 struct seg_buf seg[])
356{
357	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
358	int i;
359	int nseg = req->nr_segments;
360	int ret = 0;
361
362	/*
363	 * Fill out preq.nr_sects with the proper number of sectors, and set up
364	 * map[..] with the PFN of the page in our domain with the
365	 * corresponding grant reference for each page.
366	 */
367	for (i = 0; i < nseg; i++) {
368		uint32_t flags;
369
370		flags = GNTMAP_host_map;
371		if (pending_req->operation != BLKIF_OP_READ)
372			flags |= GNTMAP_readonly;
373		gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
374				  req->u.rw.seg[i].gref,
375				  pending_req->blkif->domid);
376	}
377
378	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
379	BUG_ON(ret);
380
381	/*
382	 * Now swizzle the MFN in our domain with the MFN from the other domain
383	 * so that when we access vaddr(pending_req,i) it has the contents of
384	 * the page from the other domain.
385	 */
386	for (i = 0; i < nseg; i++) {
387		if (unlikely(map[i].status != 0)) {
388			pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
389			map[i].handle = BLKBACK_INVALID_HANDLE;
390			ret |= 1;
391		}
392
393		pending_handle(pending_req, i) = map[i].handle;
394
395		if (ret)
396			continue;
397
398		ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
399			blkbk->pending_page(pending_req, i), false);
400		if (ret) {
401			pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
402				 (unsigned long)map[i].dev_bus_addr, ret);
403			/* We could switch over to GNTTABOP_copy */
404			continue;
405		}
406
407		seg[i].buf  = map[i].dev_bus_addr |
408			(req->u.rw.seg[i].first_sect << 9);
409	}
410	return ret;
411}
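/*
 * Note on seg[i].buf as filled in above: it is the bus address of the
 * granted page OR'd with the byte offset of the first sector
 * (first_sect << 9). dispatch_rw_block_io() later recovers the in-page
 * offset with (seg[i].buf & ~PAGE_MASK) when building the bios.
 */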
412
413/*
414 * Completion callback on the bio's. Called as bh->b_end_io()
415 */
416
417static void __end_block_io_op(struct pending_req *pending_req, int error)
418{
419	/* An error fails the entire request. */
420	if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
421	    (error == -EOPNOTSUPP)) {
422		pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
423		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
424		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
425	} else if (error) {
426		pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
427			 " error=%d\n", error);
428		pending_req->status = BLKIF_RSP_ERROR;
429	}
430
431	/*
432	 * If all of the bio's have completed it is time to unmap
433	 * the grant references associated with 'request' and provide
434	 * the proper response on the ring.
435	 */
436	if (atomic_dec_and_test(&pending_req->pendcnt)) {
437		xen_blkbk_unmap(pending_req);
438		make_response(pending_req->blkif, pending_req->id,
439			      pending_req->operation, pending_req->status);
440		xen_blkif_put(pending_req->blkif);
441		free_req(pending_req);
442	}
443}
444
445/*
446 * bio callback.
447 */
448static void end_block_io_op(struct bio *bio, int error)
449{
450	__end_block_io_op(bio->bi_private, error);
451	bio_put(bio);
452}
453
454
455
456/*
457 * Function to copy the 'struct blkif_request' from the ring buffer
458 * (which has the sectors we want, number of them, grant references, etc),
459 * and transmute it to the block API to hand it over to the proper block disk.
460 */
461static int
462__do_block_io_op(struct xen_blkif *blkif)
463{
464	union blkif_back_rings *blk_rings = &blkif->blk_rings;
465	struct blkif_request req;
466	struct pending_req *pending_req;
467	RING_IDX rc, rp;
468	int more_to_do = 0;
469
470	rc = blk_rings->common.req_cons;
471	rp = blk_rings->common.sring->req_prod;
472	rmb(); /* Ensure we see queued requests up to 'rp'. */
473
474	while (rc != rp) {
475
476		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
477			break;
478
479		if (kthread_should_stop()) {
480			more_to_do = 1;
481			break;
482		}
483
484		pending_req = alloc_req();
485		if (NULL == pending_req) {
486			blkif->st_oo_req++;
487			more_to_do = 1;
488			break;
489		}
490
491		switch (blkif->blk_protocol) {
492		case BLKIF_PROTOCOL_NATIVE:
493			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
494			break;
495		case BLKIF_PROTOCOL_X86_32:
496			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
497			break;
498		case BLKIF_PROTOCOL_X86_64:
499			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
500			break;
501		default:
502			BUG();
503		}
504		blk_rings->common.req_cons = ++rc; /* before make_response() */
505
506		/* Apply all sanity checks to /private copy/ of request. */
507		barrier();
508
509		if (dispatch_rw_block_io(blkif, &req, pending_req))
510			break;
511
512		/* Yield point for this unbounded loop. */
513		cond_resched();
514	}
515
516	return more_to_do;
517}
518
519static int
520do_block_io_op(struct xen_blkif *blkif)
521{
522	union blkif_back_rings *blk_rings = &blkif->blk_rings;
523	int more_to_do;
524
525	do {
526		more_to_do = __do_block_io_op(blkif);
527		if (more_to_do)
528			break;
529
530		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
531	} while (more_to_do);
532
533	return more_to_do;
534}
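/*
 * RING_FINAL_CHECK_FOR_REQUESTS() re-arms the ring's req_event and then
 * looks once more for requests that raced in while we were draining, so a
 * request posted just as __do_block_io_op() returned is either picked up by
 * another loop iteration here or triggers a fresh event notification.
 */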
535
536/*
537 * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
538 * and call the 'submit_bio' to pass it to the underlying storage.
539 */
540static int dispatch_rw_block_io(struct xen_blkif *blkif,
541				struct blkif_request *req,
542				struct pending_req *pending_req)
543{
544	struct phys_req preq;
545	struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
546	unsigned int nseg;
547	struct bio *bio = NULL;
548	struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
549	int i, nbio = 0;
550	int operation;
551	struct blk_plug plug;
552
553	switch (req->operation) {
554	case BLKIF_OP_READ:
555		blkif->st_rd_req++;
556		operation = READ;
557		break;
558	case BLKIF_OP_WRITE:
559		blkif->st_wr_req++;
560		operation = WRITE_ODIRECT;
561		break;
562	case BLKIF_OP_FLUSH_DISKCACHE:
563		blkif->st_f_req++;
564		operation = WRITE_FLUSH;
565		break;
566	case BLKIF_OP_WRITE_BARRIER:
567	default:
568		operation = 0; /* make gcc happy */
569		goto fail_response;
570		break;
571	}
572
573	/* Check that the number of segments is sane. */
574	nseg = req->nr_segments;
575	if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
576	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
577		pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
578			 nseg);
579		/* Haven't submitted any bio's yet. */
580		goto fail_response;
581	}
582
583	preq.dev           = req->handle;
584	preq.sector_number = req->u.rw.sector_number;
585	preq.nr_sects      = 0;
586
587	pending_req->blkif     = blkif;
588	pending_req->id        = req->id;
589	pending_req->operation = req->operation;
590	pending_req->status    = BLKIF_RSP_OKAY;
591	pending_req->nr_pages  = nseg;
592
593	for (i = 0; i < nseg; i++) {
594		seg[i].nsec = req->u.rw.seg[i].last_sect -
595			req->u.rw.seg[i].first_sect + 1;
596		if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
597		    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
598			goto fail_response;
599		preq.nr_sects += seg[i].nsec;
600
601	}
602
603	if (xen_vbd_translate(&preq, blkif, operation) != 0) {
604		pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
605			 operation == READ ? "read" : "write",
606			 preq.sector_number,
607			 preq.sector_number + preq.nr_sects, preq.dev);
608		goto fail_response;
609	}
610
611	/*
612	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
613	 * is set there.
614	 */
615	for (i = 0; i < nseg; i++) {
616		if (((int)preq.sector_number|(int)seg[i].nsec) &
617		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
618			pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
619				 blkif->domid);
620			goto fail_response;
621		}
622	}
623
624	/*
625	 * If we have failed at this point, we need to undo the M2P override,
626	 * set gnttab_set_unmap_op on all of the grant references and perform
627	 * the hypercall to unmap the grants - that is all done in
628	 * xen_blkbk_unmap.
629	 */
630	if (xen_blkbk_map(req, pending_req, seg))
631		goto fail_flush;
632
633	/* This corresponding xen_blkif_put is done in __end_block_io_op */
634	xen_blkif_get(blkif);
635
636	for (i = 0; i < nseg; i++) {
637		while ((bio == NULL) ||
638		       (bio_add_page(bio,
639				     blkbk->pending_page(pending_req, i),
640				     seg[i].nsec << 9,
641				     seg[i].buf & ~PAGE_MASK) == 0)) {
642
643			bio = bio_alloc(GFP_KERNEL, nseg-i);
644			if (unlikely(bio == NULL))
645				goto fail_put_bio;
646
647			biolist[nbio++] = bio;
648			bio->bi_bdev    = preq.bdev;
649			bio->bi_private = pending_req;
650			bio->bi_end_io  = end_block_io_op;
651			bio->bi_sector  = preq.sector_number;
652		}
653
654		preq.sector_number += seg[i].nsec;
655	}
656
657	/* This will be hit if the operation was a flush. */
658	if (!bio) {
659		BUG_ON(operation != WRITE_FLUSH);
660
661		bio = bio_alloc(GFP_KERNEL, 0);
662		if (unlikely(bio == NULL))
663			goto fail_put_bio;
664
665		biolist[nbio++] = bio;
666		bio->bi_bdev    = preq.bdev;
667		bio->bi_private = pending_req;
668		bio->bi_end_io  = end_block_io_op;
669	}
670
671	/*
672	 * We set the full count up front so that the last submit_bio does not have to call
673	 * atomic_inc.
674	 */
675	atomic_set(&pending_req->pendcnt, nbio);
676
677	/* Get a reference count for the disk queue and start sending I/O */
678	blk_start_plug(&plug);
679
680	for (i = 0; i < nbio; i++)
681		submit_bio(operation, biolist[i]);
682
683	/* Let the I/Os go.. */
684	blk_finish_plug(&plug);
685
686	if (operation == READ)
687		blkif->st_rd_sect += preq.nr_sects;
688	else if (operation == WRITE || operation == WRITE_FLUSH)
689		blkif->st_wr_sect += preq.nr_sects;
690
691	return 0;
692
693 fail_flush:
694	xen_blkbk_unmap(pending_req);
695 fail_response:
696	/* Haven't submitted any bio's yet. */
697	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
698	free_req(pending_req);
699	msleep(1); /* back off a bit */
700	return -EIO;
701
702 fail_put_bio:
703	for (i = 0; i < nbio; i++)
704		bio_put(biolist[i]);
705	__end_block_io_op(pending_req, -EINVAL);
706	msleep(1); /* back off a bit */
707	return -EIO;
708}
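/*
 * Summary of the request-type mapping above: BLKIF_OP_READ -> READ,
 * BLKIF_OP_WRITE -> WRITE_ODIRECT, BLKIF_OP_FLUSH_DISKCACHE -> WRITE_FLUSH
 * (with an empty bio when there are no segments); BLKIF_OP_WRITE_BARRIER
 * and anything unknown are rejected with BLKIF_RSP_ERROR.
 */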
709
710
711
712/*
713 * Put a response on the ring on how the operation fared.
714 */
715static void make_response(struct xen_blkif *blkif, u64 id,
716			  unsigned short op, int st)
717{
718	struct blkif_response  resp;
719	unsigned long     flags;
720	union blkif_back_rings *blk_rings = &blkif->blk_rings;
721	int notify;
722
723	resp.id        = id;
724	resp.operation = op;
725	resp.status    = st;
726
727	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
728	/* Place on the response ring for the relevant domain. */
729	switch (blkif->blk_protocol) {
730	case BLKIF_PROTOCOL_NATIVE:
731		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
732		       &resp, sizeof(resp));
733		break;
734	case BLKIF_PROTOCOL_X86_32:
735		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
736		       &resp, sizeof(resp));
737		break;
738	case BLKIF_PROTOCOL_X86_64:
739		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
740		       &resp, sizeof(resp));
741		break;
742	default:
743		BUG();
744	}
745	blk_rings->common.rsp_prod_pvt++;
746	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
747	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
748	if (notify)
749		notify_remote_via_irq(blkif->irq);
750}
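/*
 * Responses are written at rsp_prod_pvt and only made visible to the
 * frontend by RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(), which also reports
 * whether the frontend asked to be interrupted; the event channel is
 * kicked only in that case.
 */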
751
752static int __init xen_blkif_init(void)
753{
754	int i, mmap_pages;
755	int rc = 0;
756
757	if (!xen_pv_domain())
758		return -ENODEV;
759
760	blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
761	if (!blkbk) {
762		pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
763		return -ENOMEM;
764	}
765
766	mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
767
768	blkbk->pending_reqs          = kmalloc(sizeof(blkbk->pending_reqs[0]) *
769					xen_blkif_reqs, GFP_KERNEL);
770	blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) *
771					mmap_pages, GFP_KERNEL);
772	blkbk->pending_pages         = kzalloc(sizeof(blkbk->pending_pages[0]) *
773					mmap_pages, GFP_KERNEL);
774
775	if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
776	    !blkbk->pending_pages) {
777		rc = -ENOMEM;
778		goto out_of_memory;
779	}
780
781	for (i = 0; i < mmap_pages; i++) {
782		blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
783		blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
784		if (blkbk->pending_pages[i] == NULL) {
785			rc = -ENOMEM;
786			goto out_of_memory;
787		}
788	}
789	rc = xen_blkif_interface_init();
790	if (rc)
791		goto failed_init;
792
793	memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs));
794
795	INIT_LIST_HEAD(&blkbk->pending_free);
796	spin_lock_init(&blkbk->pending_free_lock);
797	init_waitqueue_head(&blkbk->pending_free_wq);
798
799	for (i = 0; i < xen_blkif_reqs; i++)
800		list_add_tail(&blkbk->pending_reqs[i].free_list,
801			      &blkbk->pending_free);
802
803	rc = xen_blkif_xenbus_init();
804	if (rc)
805		goto failed_init;
806
807	return 0;
808
809 out_of_memory:
810	pr_alert(DRV_PFX "%s: out of memory\n", __func__);
811 failed_init:
812	kfree(blkbk->pending_reqs);
813	kfree(blkbk->pending_grant_handles);
814	if (blkbk->pending_pages) {
815		for (i = 0; i < mmap_pages; i++) {
816			if (blkbk->pending_pages[i])
817				__free_page(blkbk->pending_pages[i]);
818		}
819		kfree(blkbk->pending_pages);
820	}
821	kfree(blkbk);
822	blkbk = NULL;
823	return rc;
824}
825
826module_init(xen_blkif_init);
827
828MODULE_LICENSE("Dual BSD/GPL");
829MODULE_ALIAS("xen-backend:vbd");
v6.13.7
   1/******************************************************************************
   2 *
   3 * Back-end of the driver for virtual block devices. This portion of the
   4 * driver exports a 'unified' block-device interface that can be accessed
   5 * by any operating system that implements a compatible front end. A
   6 * reference front-end implementation can be found in:
   7 *  drivers/block/xen-blkfront.c
   8 *
   9 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
  10 * Copyright (c) 2005, Christopher Clark
  11 *
  12 * This program is free software; you can redistribute it and/or
  13 * modify it under the terms of the GNU General Public License version 2
  14 * as published by the Free Software Foundation; or, when distributed
  15 * separately from the Linux kernel or incorporated into other
  16 * software packages, subject to the following license:
  17 *
  18 * Permission is hereby granted, free of charge, to any person obtaining a copy
  19 * of this source file (the "Software"), to deal in the Software without
  20 * restriction, including without limitation the rights to use, copy, modify,
  21 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
  22 * and to permit persons to whom the Software is furnished to do so, subject to
  23 * the following conditions:
  24 *
  25 * The above copyright notice and this permission notice shall be included in
  26 * all copies or substantial portions of the Software.
  27 *
  28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  34 * IN THE SOFTWARE.
  35 */
  36
  37#define pr_fmt(fmt) "xen-blkback: " fmt
  38
  39#include <linux/spinlock.h>
  40#include <linux/kthread.h>
  41#include <linux/list.h>
  42#include <linux/delay.h>
  43#include <linux/freezer.h>
  44#include <linux/bitmap.h>
  45
  46#include <xen/events.h>
  47#include <xen/page.h>
  48#include <xen/xen.h>
  49#include <asm/xen/hypervisor.h>
  50#include <asm/xen/hypercall.h>
  51#include <xen/balloon.h>
  52#include <xen/grant_table.h>
  53#include "common.h"
  54
  55/*
  56 * Maximum number of unused free pages to keep in the internal buffer.
  57 * Setting this to a value too low will reduce memory used in each backend,
  58 * but can have a performance penalty.
  59 *
  60 * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
  61 * be set to a lower value that might degrade performance on some intensive
  62 * IO workloads.
  63 */
  64
  65static int max_buffer_pages = 1024;
  66module_param_named(max_buffer_pages, max_buffer_pages, int, 0644);
  67MODULE_PARM_DESC(max_buffer_pages,
  68"Maximum number of free pages to keep in each block backend buffer");
  69
  70/*
  71 * Maximum number of grants to map persistently in blkback. For maximum
  72 * performance this should be the total numbers of grants that can be used
   73 * to fill the ring, but since this might become too high, especially with
  74 * the use of indirect descriptors, we set it to a value that provides good
  75 * performance without using too much memory.
  76 *
  77 * When the list of persistent grants is full we clean it up using a LRU
  78 * algorithm.
  79 */
  80
  81static int max_pgrants = 1056;
  82module_param_named(max_persistent_grants, max_pgrants, int, 0644);
  83MODULE_PARM_DESC(max_persistent_grants,
  84                 "Maximum number of grants to map persistently");
  85
  86/*
  87 * How long a persistent grant is allowed to remain allocated without being in
  88 * use. The time is in seconds, 0 means indefinitely long.
  89 */
  90
  91static unsigned int pgrant_timeout = 60;
  92module_param_named(persistent_grant_unused_seconds, pgrant_timeout,
  93		   uint, 0644);
  94MODULE_PARM_DESC(persistent_grant_unused_seconds,
  95		 "Time in seconds an unused persistent grant is allowed to "
  96		 "remain allocated. Default is 60, 0 means unlimited.");
  97
  98/*
  99 * Maximum number of rings/queues blkback supports, allow as many queues as there
 100 * are CPUs if user has not specified a value.
 101 */
 102unsigned int xenblk_max_queues;
 103module_param_named(max_queues, xenblk_max_queues, uint, 0644);
 104MODULE_PARM_DESC(max_queues,
 105		 "Maximum number of hardware queues per virtual disk." \
 106		 "By default it is the number of online CPUs.");
 107
 108/*
 109 * Maximum order of pages to be used for the shared ring between front and
 110 * backend, 4KB page granularity is used.
 111 */
 112unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
 113module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
 114MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 115/*
 116 * The LRU mechanism to clean the lists of persistent grants needs to
 117 * be executed periodically. The time interval between consecutive executions
 118 * of the purge mechanism is set in ms.
 119 */
 120#define LRU_INTERVAL 100
 121
 122/*
 123 * When the persistent grants list is full we will remove unused grants
 124 * from the list. The percent number of grants to be removed at each LRU
 125 * execution.
 126 */
 127#define LRU_PERCENT_CLEAN 5
 128
 129/* Run-time switchable: /sys/module/blkback/parameters/ */
 130static unsigned int log_stats;
 131module_param(log_stats, int, 0644);
 132
 133#define BLKBACK_INVALID_HANDLE (~0)
 134
 135static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt)
 136{
 137	return pgrant_timeout && (jiffies - persistent_gnt->last_used >=
 138			HZ * pgrant_timeout);
 139}
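/*
 * Example: with the default pgrant_timeout of 60, a persistent grant whose
 * last_used stamp is more than 60 * HZ jiffies in the past is considered
 * timed out; setting persistent_grant_unused_seconds to 0 disables the
 * timeout entirely.
 */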
 140
 141#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
 142
 143static int do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags);
 144static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
 145				struct blkif_request *req,
 146				struct pending_req *pending_req);
 147static void make_response(struct xen_blkif_ring *ring, u64 id,
 148			  unsigned short op, int st);
 149
 150#define foreach_grant_safe(pos, n, rbtree, node) \
 151	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
 152	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
 153	     &(pos)->node != NULL; \
 154	     (pos) = container_of(n, typeof(*(pos)), node), \
 155	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
 156
 157
 158/*
 159 * We don't need locking around the persistent grant helpers
  160 * because blkback uses a single thread for each backend, so we
  161 * can be sure that these functions will never be called recursively.
 162 *
 163 * The only exception to that is put_persistent_grant, that can be called
 164 * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
 165 * bit operations to modify the flags of a persistent grant and to count
 166 * the number of used grants.
 167 */
 168static int add_persistent_gnt(struct xen_blkif_ring *ring,
 169			       struct persistent_gnt *persistent_gnt)
 170{
 171	struct rb_node **new = NULL, *parent = NULL;
 172	struct persistent_gnt *this;
 173	struct xen_blkif *blkif = ring->blkif;
 174
 175	if (ring->persistent_gnt_c >= max_pgrants) {
 176		if (!blkif->vbd.overflow_max_grants)
 177			blkif->vbd.overflow_max_grants = 1;
 178		return -EBUSY;
 179	}
 180	/* Figure out where to put new node */
 181	new = &ring->persistent_gnts.rb_node;
 182	while (*new) {
 183		this = container_of(*new, struct persistent_gnt, node);
 184
 185		parent = *new;
 186		if (persistent_gnt->gnt < this->gnt)
 187			new = &((*new)->rb_left);
 188		else if (persistent_gnt->gnt > this->gnt)
 189			new = &((*new)->rb_right);
 190		else {
 191			pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
 192			return -EINVAL;
 193		}
 194	}
 195
 196	persistent_gnt->active = true;
 197	/* Add new node and rebalance tree. */
 198	rb_link_node(&(persistent_gnt->node), parent, new);
 199	rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
 200	ring->persistent_gnt_c++;
 201	atomic_inc(&ring->persistent_gnt_in_use);
 202	return 0;
 203}
 204
 205static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
 206						 grant_ref_t gref)
 207{
 208	struct persistent_gnt *data;
 209	struct rb_node *node = NULL;
 210
 211	node = ring->persistent_gnts.rb_node;
 212	while (node) {
 213		data = container_of(node, struct persistent_gnt, node);
 214
 215		if (gref < data->gnt)
 216			node = node->rb_left;
 217		else if (gref > data->gnt)
 218			node = node->rb_right;
 219		else {
 220			if (data->active) {
 221				pr_alert_ratelimited("requesting a grant already in use\n");
 222				return NULL;
 223			}
 224			data->active = true;
 225			atomic_inc(&ring->persistent_gnt_in_use);
 226			return data;
 227		}
 228	}
 229	return NULL;
 230}
 231
 232static void put_persistent_gnt(struct xen_blkif_ring *ring,
 233                               struct persistent_gnt *persistent_gnt)
 234{
 235	if (!persistent_gnt->active)
 236		pr_alert_ratelimited("freeing a grant already unused\n");
 237	persistent_gnt->last_used = jiffies;
 238	persistent_gnt->active = false;
 239	atomic_dec(&ring->persistent_gnt_in_use);
 240}
 241
 242static void free_persistent_gnts(struct xen_blkif_ring *ring)
 243{
 244	struct rb_root *root = &ring->persistent_gnts;
 245	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 246	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 247	struct persistent_gnt *persistent_gnt;
 248	struct rb_node *n;
 249	int segs_to_unmap = 0;
 250	struct gntab_unmap_queue_data unmap_data;
 251
 252	if (RB_EMPTY_ROOT(root))
 253		return;
 254
 255	unmap_data.pages = pages;
 256	unmap_data.unmap_ops = unmap;
 257	unmap_data.kunmap_ops = NULL;
 258
 259	foreach_grant_safe(persistent_gnt, n, root, node) {
 260		BUG_ON(persistent_gnt->handle ==
 261			BLKBACK_INVALID_HANDLE);
 262		gnttab_set_unmap_op(&unmap[segs_to_unmap],
 263			(unsigned long) pfn_to_kaddr(page_to_pfn(
 264				persistent_gnt->page)),
 265			GNTMAP_host_map,
 266			persistent_gnt->handle);
 267
 268		pages[segs_to_unmap] = persistent_gnt->page;
 269
 270		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
 271			!rb_next(&persistent_gnt->node)) {
 272
 273			unmap_data.count = segs_to_unmap;
 274			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
 275
 276			gnttab_page_cache_put(&ring->free_pages, pages,
 277					      segs_to_unmap);
 278			segs_to_unmap = 0;
 279		}
 280
 281		rb_erase(&persistent_gnt->node, root);
 282		kfree(persistent_gnt);
 283		ring->persistent_gnt_c--;
 284	}
 285
 286	BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
 287	BUG_ON(ring->persistent_gnt_c != 0);
 288}
 289
 290void xen_blkbk_unmap_purged_grants(struct work_struct *work)
 291{
 292	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 293	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 294	struct persistent_gnt *persistent_gnt;
 295	int segs_to_unmap = 0;
 296	struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
 297	struct gntab_unmap_queue_data unmap_data;
 298
 299	unmap_data.pages = pages;
 300	unmap_data.unmap_ops = unmap;
 301	unmap_data.kunmap_ops = NULL;
 302
 303	while(!list_empty(&ring->persistent_purge_list)) {
 304		persistent_gnt = list_first_entry(&ring->persistent_purge_list,
 305		                                  struct persistent_gnt,
 306		                                  remove_node);
 307		list_del(&persistent_gnt->remove_node);
 308
 309		gnttab_set_unmap_op(&unmap[segs_to_unmap],
 310			vaddr(persistent_gnt->page),
 311			GNTMAP_host_map,
 312			persistent_gnt->handle);
 313
 314		pages[segs_to_unmap] = persistent_gnt->page;
 315
 316		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
 317			unmap_data.count = segs_to_unmap;
 318			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
 319			gnttab_page_cache_put(&ring->free_pages, pages,
 320					      segs_to_unmap);
 321			segs_to_unmap = 0;
 322		}
 323		kfree(persistent_gnt);
 324	}
 325	if (segs_to_unmap > 0) {
 326		unmap_data.count = segs_to_unmap;
 327		BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
 328		gnttab_page_cache_put(&ring->free_pages, pages, segs_to_unmap);
 329	}
 330}
 331
 332static void purge_persistent_gnt(struct xen_blkif_ring *ring)
 333{
 334	struct persistent_gnt *persistent_gnt;
 335	struct rb_node *n;
 336	unsigned int num_clean, total;
 337	bool scan_used = false;
 338	struct rb_root *root;
 339
 340	if (work_busy(&ring->persistent_purge_work)) {
 341		pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
 342		goto out;
 343	}
 344
 345	if (ring->persistent_gnt_c < max_pgrants ||
 346	    (ring->persistent_gnt_c == max_pgrants &&
 347	    !ring->blkif->vbd.overflow_max_grants)) {
 348		num_clean = 0;
 349	} else {
 350		num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN;
 351		num_clean = ring->persistent_gnt_c - max_pgrants + num_clean;
 352		num_clean = min(ring->persistent_gnt_c, num_clean);
 353		pr_debug("Going to purge at least %u persistent grants\n",
 354			 num_clean);
 355	}
 356
 357	/*
 358	 * At this point, we can assure that there will be no calls
 359         * to get_persistent_grant (because we are executing this code from
 360         * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
 361         * which means that the number of currently used grants will go down,
 362         * but never up, so we will always be able to remove the requested
 363         * number of grants.
 364	 */
 365
 366	total = 0;
 367
 368	BUG_ON(!list_empty(&ring->persistent_purge_list));
 369	root = &ring->persistent_gnts;
 370purge_list:
 371	foreach_grant_safe(persistent_gnt, n, root, node) {
 372		BUG_ON(persistent_gnt->handle ==
 373			BLKBACK_INVALID_HANDLE);
 374
 375		if (persistent_gnt->active)
 376			continue;
 377		if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
 378			continue;
 379		if (scan_used && total >= num_clean)
 380			continue;
 381
 382		rb_erase(&persistent_gnt->node, root);
 383		list_add(&persistent_gnt->remove_node,
 384			 &ring->persistent_purge_list);
 385		total++;
 386	}
 387	/*
 388	 * Check whether we also need to start cleaning
 389	 * grants that were used since last purge in order to cope
 390	 * with the requested num
 391	 */
 392	if (!scan_used && total < num_clean) {
 393		pr_debug("Still missing %u purged frames\n", num_clean - total);
 394		scan_used = true;
 395		goto purge_list;
 396	}
 397
 398	if (total) {
 399		ring->persistent_gnt_c -= total;
 400		ring->blkif->vbd.overflow_max_grants = 0;
 401
 402		/* We can defer this work */
 403		schedule_work(&ring->persistent_purge_work);
 404		pr_debug("Purged %u/%u\n", num_clean, total);
 405	}
 406
 407out:
 408	return;
 409}
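/*
 * Worked example of the sizing above: with the default max_pgrants of 1056,
 * LRU_PERCENT_CLEAN of 5 gives (1056 / 100) * 5 == 50, so a purge pass tries
 * to reclaim at least 50 grants plus however many the ring currently holds
 * beyond max_pgrants, capped at persistent_gnt_c.
 */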
 410
 411/*
 412 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
 413 */
 414static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
 415{
 416	struct pending_req *req = NULL;
 417	unsigned long flags;
 418
 419	spin_lock_irqsave(&ring->pending_free_lock, flags);
 420	if (!list_empty(&ring->pending_free)) {
 421		req = list_entry(ring->pending_free.next, struct pending_req,
 422				 free_list);
 423		list_del(&req->free_list);
 424	}
 425	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
 426	return req;
 427}
 428
 429/*
 430 * Return the 'pending_req' structure back to the freepool. We also
 431 * wake up the thread if it was waiting for a free page.
 432 */
 433static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
 434{
 435	unsigned long flags;
 436	int was_empty;
 437
 438	spin_lock_irqsave(&ring->pending_free_lock, flags);
 439	was_empty = list_empty(&ring->pending_free);
 440	list_add(&req->free_list, &ring->pending_free);
 441	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
 442	if (was_empty)
 443		wake_up(&ring->pending_free_wq);
 444}
 445
 446/*
 447 * Routines for managing virtual block devices (vbds).
 448 */
 449static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
 450			     enum req_op operation)
 451{
 452	struct xen_vbd *vbd = &blkif->vbd;
 453	int rc = -EACCES;
 454
 455	if ((operation != REQ_OP_READ) && vbd->readonly)
 456		goto out;
 457
 458	if (likely(req->nr_sects)) {
 459		blkif_sector_t end = req->sector_number + req->nr_sects;
 460
 461		if (unlikely(end < req->sector_number))
 462			goto out;
 463		if (unlikely(end > vbd_sz(vbd)))
 464			goto out;
 465	}
 466
 467	req->dev  = vbd->pdevice;
 468	req->bdev = file_bdev(vbd->bdev_file);
 469	rc = 0;
 470
 471 out:
 472	return rc;
 473}
 474
 475static void xen_vbd_resize(struct xen_blkif *blkif)
 476{
 477	struct xen_vbd *vbd = &blkif->vbd;
 478	struct xenbus_transaction xbt;
 479	int err;
 480	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
 481	unsigned long long new_size = vbd_sz(vbd);
 482
 483	pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
 484		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
 485	pr_info("VBD Resize: new size %llu\n", new_size);
 486	vbd->size = new_size;
 487again:
 488	err = xenbus_transaction_start(&xbt);
 489	if (err) {
 490		pr_warn("Error starting transaction\n");
 491		return;
 492	}
 493	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
 494			    (unsigned long long)vbd_sz(vbd));
 495	if (err) {
 496		pr_warn("Error writing new size\n");
 497		goto abort;
 498	}
 499	/*
 500	 * Write the current state; we will use this to synchronize
 501	 * the front-end. If the current state is "connected" the
 502	 * front-end will get the new size information online.
 503	 */
 504	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
 505	if (err) {
 506		pr_warn("Error writing the state\n");
 507		goto abort;
 508	}
 509
 510	err = xenbus_transaction_end(xbt, 0);
 511	if (err == -EAGAIN)
 512		goto again;
 513	if (err)
 514		pr_warn("Error ending transaction\n");
 515	return;
 516abort:
 517	xenbus_transaction_end(xbt, 1);
 518}
 519
 520/*
 521 * Notification from the guest OS.
 522 */
 523static void blkif_notify_work(struct xen_blkif_ring *ring)
 524{
 525	ring->waiting_reqs = 1;
 526	wake_up(&ring->wq);
 527}
 528
 529irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
 530{
 531	blkif_notify_work(dev_id);
 532	return IRQ_HANDLED;
 533}
 534
 535/*
 536 * SCHEDULER FUNCTIONS
 537 */
 538
 539static void print_stats(struct xen_blkif_ring *ring)
 540{
 541	pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
 542		 "  |  ds %4llu | pg: %4u/%4d\n",
 543		 current->comm, ring->st_oo_req,
 544		 ring->st_rd_req, ring->st_wr_req,
 545		 ring->st_f_req, ring->st_ds_req,
 546		 ring->persistent_gnt_c, max_pgrants);
 547	ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
 548	ring->st_rd_req = 0;
 549	ring->st_wr_req = 0;
 550	ring->st_oo_req = 0;
 551	ring->st_ds_req = 0;
 552}
 553
 554int xen_blkif_schedule(void *arg)
 555{
 556	struct xen_blkif_ring *ring = arg;
 557	struct xen_blkif *blkif = ring->blkif;
 558	struct xen_vbd *vbd = &blkif->vbd;
 559	unsigned long timeout;
 560	int ret;
 561	bool do_eoi;
 562	unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
 563
 564	set_freezable();
 565	while (!kthread_should_stop()) {
 566		if (try_to_freeze())
 567			continue;
 568		if (unlikely(vbd->size != vbd_sz(vbd)))
 569			xen_vbd_resize(blkif);
 570
 571		timeout = msecs_to_jiffies(LRU_INTERVAL);
 572
 573		timeout = wait_event_interruptible_timeout(
 574			ring->wq,
 575			ring->waiting_reqs || kthread_should_stop(),
 576			timeout);
 577		if (timeout == 0)
 578			goto purge_gnt_list;
 579		timeout = wait_event_interruptible_timeout(
 580			ring->pending_free_wq,
 581			!list_empty(&ring->pending_free) ||
 582			kthread_should_stop(),
 583			timeout);
 584		if (timeout == 0)
 585			goto purge_gnt_list;
 586
 587		do_eoi = ring->waiting_reqs;
 588
 589		ring->waiting_reqs = 0;
 590		smp_mb(); /* clear flag *before* checking for work */
 591
 592		ret = do_block_io_op(ring, &eoi_flags);
 593		if (ret > 0)
 594			ring->waiting_reqs = 1;
 595		if (ret == -EACCES)
 596			wait_event_interruptible(ring->shutdown_wq,
 597						 kthread_should_stop());
 598
 599		if (do_eoi && !ring->waiting_reqs) {
 600			xen_irq_lateeoi(ring->irq, eoi_flags);
 601			eoi_flags |= XEN_EOI_FLAG_SPURIOUS;
 602		}
 603
 604purge_gnt_list:
 605		if (blkif->vbd.feature_gnt_persistent &&
 606		    time_after(jiffies, ring->next_lru)) {
 607			purge_persistent_gnt(ring);
 608			ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
 609		}
 610
 611		/* Shrink the free pages pool if it is too large. */
 612		if (time_before(jiffies, blkif->buffer_squeeze_end))
 613			gnttab_page_cache_shrink(&ring->free_pages, 0);
 614		else
 615			gnttab_page_cache_shrink(&ring->free_pages,
 616						 max_buffer_pages);
 617
 618		if (log_stats && time_after(jiffies, ring->st_print))
 619			print_stats(ring);
 620	}
 621
 622	/* Drain pending purge work */
 623	flush_work(&ring->persistent_purge_work);
 624
 625	if (log_stats)
 626		print_stats(ring);
 627
 628	ring->xenblkd = NULL;
 629
 630	return 0;
 631}
 632
 633/*
 634 * Remove persistent grants and empty the pool of free pages
 635 */
 636void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
 637{
 638	/* Free all persistent grant pages */
 639	free_persistent_gnts(ring);
 640
 641	/* Since we are shutting down remove all pages from the buffer */
 642	gnttab_page_cache_shrink(&ring->free_pages, 0 /* All */);
 643}
 644
 645static unsigned int xen_blkbk_unmap_prepare(
 646	struct xen_blkif_ring *ring,
 647	struct grant_page **pages,
 648	unsigned int num,
 649	struct gnttab_unmap_grant_ref *unmap_ops,
 650	struct page **unmap_pages)
 651{
 652	unsigned int i, invcount = 0;
 653
 654	for (i = 0; i < num; i++) {
 655		if (pages[i]->persistent_gnt != NULL) {
 656			put_persistent_gnt(ring, pages[i]->persistent_gnt);
 657			continue;
 658		}
 659		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
 660			continue;
 661		unmap_pages[invcount] = pages[i]->page;
 662		gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
 663				    GNTMAP_host_map, pages[i]->handle);
 664		pages[i]->handle = BLKBACK_INVALID_HANDLE;
 665		invcount++;
 666	}
 667
 668	return invcount;
 669}
 670
 671static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
 672{
 673	struct pending_req *pending_req = (struct pending_req *)(data->data);
 674	struct xen_blkif_ring *ring = pending_req->ring;
 675	struct xen_blkif *blkif = ring->blkif;
 676
 677	/* BUG_ON used to reproduce existing behaviour,
 678	   but is this the best way to deal with this? */
 679	BUG_ON(result);
 680
 681	gnttab_page_cache_put(&ring->free_pages, data->pages, data->count);
 682	make_response(ring, pending_req->id,
 683		      pending_req->operation, pending_req->status);
 684	free_req(ring, pending_req);
 685	/*
 686	 * Make sure the request is freed before releasing blkif,
 687	 * or there could be a race between free_req and the
 688	 * cleanup done in xen_blkif_free during shutdown.
 689	 *
 690	 * NB: The fact that we might try to wake up pending_free_wq
 691	 * before drain_complete (in case there's a drain going on)
  692	 * is not a problem with our current implementation
 693	 * because we can assure there's no thread waiting on
 694	 * pending_free_wq if there's a drain going on, but it has
 695	 * to be taken into account if the current model is changed.
 696	 */
 697	if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
 698		complete(&blkif->drain_complete);
 699	}
 700	xen_blkif_put(blkif);
 701}
 702
 703static void xen_blkbk_unmap_and_respond(struct pending_req *req)
 704{
 705	struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
 706	struct xen_blkif_ring *ring = req->ring;
 707	struct grant_page **pages = req->segments;
 708	unsigned int invcount;
 709
 710	invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
 711					   req->unmap, req->unmap_pages);
 712
 713	work->data = req;
 714	work->done = xen_blkbk_unmap_and_respond_callback;
 715	work->unmap_ops = req->unmap;
 716	work->kunmap_ops = NULL;
 717	work->pages = req->unmap_pages;
 718	work->count = invcount;
 719
 720	gnttab_unmap_refs_async(&req->gnttab_unmap_data);
 721}
 722
 723
 724/*
 725 * Unmap the grant references.
 726 *
 727 * This could accumulate ops up to the batch size to reduce the number
 728 * of hypercalls, but since this is only used in error paths there's
 729 * no real need.
 730 */
 731static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
 732                            struct grant_page *pages[],
 733                            int num)
 734{
 735	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 736	struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 737	unsigned int invcount = 0;
 738	int ret;
 739
 740	while (num) {
 741		unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
 742
 743		invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
 744						   unmap, unmap_pages);
 745		if (invcount) {
 746			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
 747			BUG_ON(ret);
 748			gnttab_page_cache_put(&ring->free_pages, unmap_pages,
 749					      invcount);
 750		}
 751		pages += batch;
 752		num -= batch;
 753	}
 754}
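/*
 * Example of the batching above: unmapping 40 grant pages proceeds in
 * batches of at most BLKIF_MAX_SEGMENTS_PER_REQUEST (11, 11, 11, 7).
 * Persistent grants in a batch are only marked unused via
 * put_persistent_gnt() and stay mapped; the rest are unmapped and their
 * pages returned to the free_pages cache.
 */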
 755
 756static int xen_blkbk_map(struct xen_blkif_ring *ring,
 757			 struct grant_page *pages[],
 758			 int num, bool ro)
 759{
 760	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 761	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 762	struct persistent_gnt *persistent_gnt = NULL;
 763	phys_addr_t addr = 0;
 764	int i, seg_idx, new_map_idx;
 765	int segs_to_map = 0;
 766	int ret = 0;
 767	int last_map = 0, map_until = 0;
 768	int use_persistent_gnts;
 769	struct xen_blkif *blkif = ring->blkif;
 770
 771	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
 772
 773	/*
  774	 * Fill out preq.nr_sects with the proper number of sectors, and set up
  775	 * map[..] with the PFN of the page in our domain with the
  776	 * corresponding grant reference for each page.
 777	 */
 778again:
 779	for (i = map_until; i < num; i++) {
 780		uint32_t flags;
 781
 782		if (use_persistent_gnts) {
 783			persistent_gnt = get_persistent_gnt(
 784				ring,
 785				pages[i]->gref);
 786		}
 787
 788		if (persistent_gnt) {
 789			/*
 790			 * We are using persistent grants and
 791			 * the grant is already mapped
 792			 */
 793			pages[i]->page = persistent_gnt->page;
 794			pages[i]->persistent_gnt = persistent_gnt;
 795		} else {
 796			if (gnttab_page_cache_get(&ring->free_pages,
 797						  &pages[i]->page)) {
 798				gnttab_page_cache_put(&ring->free_pages,
 799						      pages_to_gnt,
 800						      segs_to_map);
 801				ret = -ENOMEM;
 802				goto out;
 803			}
 804			addr = vaddr(pages[i]->page);
 805			pages_to_gnt[segs_to_map] = pages[i]->page;
 806			pages[i]->persistent_gnt = NULL;
 807			flags = GNTMAP_host_map;
 808			if (!use_persistent_gnts && ro)
 809				flags |= GNTMAP_readonly;
 810			gnttab_set_map_op(&map[segs_to_map++], addr,
 811					  flags, pages[i]->gref,
 812					  blkif->domid);
 813		}
 814		map_until = i + 1;
 815		if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
 816			break;
 817	}
 818
 819	if (segs_to_map)
 820		ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
 821
 822	/*
 823	 * Now swizzle the MFN in our domain with the MFN from the other domain
 824	 * so that when we access vaddr(pending_req,i) it has the contents of
 825	 * the page from the other domain.
 826	 */
 827	for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
 828		if (!pages[seg_idx]->persistent_gnt) {
 829			/* This is a newly mapped grant */
 830			BUG_ON(new_map_idx >= segs_to_map);
 831			if (unlikely(map[new_map_idx].status != 0)) {
 832				pr_debug("invalid buffer -- could not remap it\n");
 833				gnttab_page_cache_put(&ring->free_pages,
 834						      &pages[seg_idx]->page, 1);
 835				pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
 836				ret |= !ret;
 837				goto next;
 838			}
 839			pages[seg_idx]->handle = map[new_map_idx].handle;
 840		} else {
 841			continue;
 842		}
 843		if (use_persistent_gnts &&
 844		    ring->persistent_gnt_c < max_pgrants) {
 845			/*
 846			 * We are using persistent grants, the grant is
 847			 * not mapped but we might have room for it.
 848			 */
 849			persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
 850				                 GFP_KERNEL);
 851			if (!persistent_gnt) {
 852				/*
 853				 * If we don't have enough memory to
 854				 * allocate the persistent_gnt struct
  855				 * map this grant non-persistently
 856				 */
 857				goto next;
 858			}
 859			persistent_gnt->gnt = map[new_map_idx].ref;
 860			persistent_gnt->handle = map[new_map_idx].handle;
 861			persistent_gnt->page = pages[seg_idx]->page;
 862			if (add_persistent_gnt(ring,
 863			                       persistent_gnt)) {
 864				kfree(persistent_gnt);
 865				persistent_gnt = NULL;
 866				goto next;
 867			}
 868			pages[seg_idx]->persistent_gnt = persistent_gnt;
 869			pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
 870				 persistent_gnt->gnt, ring->persistent_gnt_c,
 871				 max_pgrants);
 872			goto next;
 873		}
 874		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
 875			blkif->vbd.overflow_max_grants = 1;
 876			pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
 877			         blkif->domid, blkif->vbd.handle);
 878		}
 879		/*
 880		 * We could not map this grant persistently, so use it as
 881		 * a non-persistent grant.
 882		 */
 883next:
 884		new_map_idx++;
 885	}
 886	segs_to_map = 0;
 887	last_map = map_until;
 888	if (!ret && map_until != num)
 889		goto again;
 890
 891out:
 892	for (i = last_map; i < num; i++) {
 893		/* Don't zap current batch's valid persistent grants. */
 894		if (i >= map_until)
 895			pages[i]->persistent_gnt = NULL;
 896		pages[i]->handle = BLKBACK_INVALID_HANDLE;
 897	}
 898
 899	return ret;
 900}
 901
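/*
 * Map the data segments of a regular request.  Grants are mapped read-only
 * for anything but a BLKIF_OP_READ, since the backend only needs to write
 * into guest pages when returning read data (persistent grants ignore the
 * ro hint and are always mapped read/write).
 */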
 902static int xen_blkbk_map_seg(struct pending_req *pending_req)
 903{
 904	int rc;
 905
 906	rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
 907			   pending_req->nr_segs,
 908	                   (pending_req->operation != BLKIF_OP_READ));
 909
 910	return rc;
 911}
 912
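/*
 * An indirect request stores its segment descriptors in extra grant pages.
 * Map those pages, copy out each segment's grant reference and sector range
 * (validating the bounds), then unmap the descriptor pages again.
 */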
 913static int xen_blkbk_parse_indirect(struct blkif_request *req,
 914				    struct pending_req *pending_req,
 915				    struct seg_buf seg[],
 916				    struct phys_req *preq)
 917{
 918	struct grant_page **pages = pending_req->indirect_pages;
 919	struct xen_blkif_ring *ring = pending_req->ring;
 920	int indirect_grefs, rc, n, nseg, i;
 921	struct blkif_request_segment *segments = NULL;
 922
 923	nseg = pending_req->nr_segs;
 924	indirect_grefs = INDIRECT_PAGES(nseg);
 925	BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
 926
 927	for (i = 0; i < indirect_grefs; i++)
 928		pages[i]->gref = req->u.indirect.indirect_grefs[i];
 929
 930	rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
 931	if (rc)
 932		goto unmap;
 933
 934	for (n = 0; n < nseg; n++) {
 935		uint8_t first_sect, last_sect;
 936
 937		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
 938			/* Map indirect segments */
 939			if (segments)
 940				kunmap_atomic(segments);
 941			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
 942		}
 943		i = n % SEGS_PER_INDIRECT_FRAME;
 944
 945		pending_req->segments[n]->gref = segments[i].gref;
 946
 947		first_sect = READ_ONCE(segments[i].first_sect);
 948		last_sect = READ_ONCE(segments[i].last_sect);
 949		if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
 950			rc = -EINVAL;
 951			goto unmap;
 952		}
 953
 954		seg[n].nsec = last_sect - first_sect + 1;
 955		seg[n].offset = first_sect << 9;
 956		preq->nr_sects += seg[n].nsec;
 957	}
 958
 959unmap:
 960	if (segments)
 961		kunmap_atomic(segments);
 962	xen_blkbk_unmap(ring, pages, indirect_grefs);
 963	return rc;
 964}
 965
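/*
 * Handle a BLKIF_OP_DISCARD request: translate the sector range against the
 * vbd and forward it to the underlying device as a discard (or a secure
 * erase when both sides support it).  The response is pushed directly from
 * here; no pending_req or bio is involved.
 */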
 966static int dispatch_discard_io(struct xen_blkif_ring *ring,
 967				struct blkif_request *req)
 968{
 969	int err = 0;
 970	int status = BLKIF_RSP_OKAY;
 971	struct xen_blkif *blkif = ring->blkif;
 972	struct block_device *bdev = file_bdev(blkif->vbd.bdev_file);
 973	struct phys_req preq;
 974
 975	xen_blkif_get(blkif);
 976
 977	preq.sector_number = req->u.discard.sector_number;
 978	preq.nr_sects      = req->u.discard.nr_sectors;
 979
 980	err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
 981	if (err) {
 982		pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
 983			preq.sector_number,
 984			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
 985		goto fail_response;
 986	}
 987	ring->st_ds_req++;
 988
 989	if (blkif->vbd.discard_secure &&
 990	    (req->u.discard.flag & BLKIF_DISCARD_SECURE))
 991		err = blkdev_issue_secure_erase(bdev,
 992				req->u.discard.sector_number,
 993				req->u.discard.nr_sectors, GFP_KERNEL);
 994	else
 995		err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
 996				req->u.discard.nr_sectors, GFP_KERNEL);
 997
 998fail_response:
 999	if (err == -EOPNOTSUPP) {
1000		pr_debug("discard op failed, not supported\n");
1001		status = BLKIF_RSP_EOPNOTSUPP;
1002	} else if (err)
1003		status = BLKIF_RSP_ERROR;
1004
1005	make_response(ring, req->u.discard.id, req->operation, status);
1006	xen_blkif_put(blkif);
1007	return err;
1008}
1009
1010static int dispatch_other_io(struct xen_blkif_ring *ring,
1011			     struct blkif_request *req,
1012			     struct pending_req *pending_req)
1013{
1014	free_req(ring, pending_req);
1015	make_response(ring, req->u.other.id, req->operation,
1016		      BLKIF_RSP_EOPNOTSUPP);
1017	return -EIO;
1018}
1019
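/*
 * Wait until all I/O in flight on this ring has completed.  Used to
 * emulate BLKIF_OP_WRITE_BARRIER: drain first, then issue the flush.
 */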
1020static void xen_blk_drain_io(struct xen_blkif_ring *ring)
1021{
1022	struct xen_blkif *blkif = ring->blkif;
1023
1024	atomic_set(&blkif->drain, 1);
1025	do {
1026		if (atomic_read(&ring->inflight) == 0)
1027			break;
1028		wait_for_completion_interruptible_timeout(
1029				&blkif->drain_complete, HZ);
1030
1031		if (!atomic_read(&blkif->drain))
1032			break;
1033	} while (!kthread_should_stop());
1034	atomic_set(&blkif->drain, 0);
1035}
1036
1037static void __end_block_io_op(struct pending_req *pending_req,
1038		blk_status_t error)
1039{
1040	/* An error fails the entire request. */
1041	if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE &&
1042	    error == BLK_STS_NOTSUPP) {
1043		pr_debug("flush diskcache op failed, not supported\n");
1044		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
1045		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1046	} else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER &&
1047		   error == BLK_STS_NOTSUPP) {
1048		pr_debug("write barrier op failed, not supported\n");
1049		xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
1050		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
1051	} else if (error) {
1052		pr_debug("Buffer not up-to-date at end of operation, error=%d\n",
1053			 error);
1054		pending_req->status = BLKIF_RSP_ERROR;
1055	}
1056
1057	/*
1058 * If all of the bios have completed, it is time to unmap
1059	 * the grant references associated with 'request' and provide
1060	 * the proper response on the ring.
1061	 */
1062	if (atomic_dec_and_test(&pending_req->pendcnt))
1063		xen_blkbk_unmap_and_respond(pending_req);
1064}
1065
1066/*
1067 * bio callback.
1068 */
1069static void end_block_io_op(struct bio *bio)
1070{
1071	__end_block_io_op(bio->bi_private, bio->bi_status);
1072	bio_put(bio);
1073}
1074
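/*
 * Copy a request from the shared ring into a private, native-layout copy.
 * The source lives in memory the frontend can still write to, so the
 * fields used for validation are read exactly once (READ_ONCE).
 */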
1075static void blkif_get_x86_32_req(struct blkif_request *dst,
1076				 const struct blkif_x86_32_request *src)
1077{
1078	unsigned int i, n;
1079
1080	dst->operation = READ_ONCE(src->operation);
1081
1082	switch (dst->operation) {
1083	case BLKIF_OP_READ:
1084	case BLKIF_OP_WRITE:
1085	case BLKIF_OP_WRITE_BARRIER:
1086	case BLKIF_OP_FLUSH_DISKCACHE:
1087		dst->u.rw.nr_segments = READ_ONCE(src->u.rw.nr_segments);
1088		dst->u.rw.handle = src->u.rw.handle;
1089		dst->u.rw.id = src->u.rw.id;
1090		dst->u.rw.sector_number = src->u.rw.sector_number;
1091		n = min_t(unsigned int, BLKIF_MAX_SEGMENTS_PER_REQUEST,
1092			  dst->u.rw.nr_segments);
1093		for (i = 0; i < n; i++)
1094			dst->u.rw.seg[i] = src->u.rw.seg[i];
1095		break;
1096
1097	case BLKIF_OP_DISCARD:
1098		dst->u.discard.flag = src->u.discard.flag;
1099		dst->u.discard.id = src->u.discard.id;
1100		dst->u.discard.sector_number = src->u.discard.sector_number;
1101		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
1102		break;
1103
1104	case BLKIF_OP_INDIRECT:
1105		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
1106		dst->u.indirect.nr_segments =
1107			READ_ONCE(src->u.indirect.nr_segments);
1108		dst->u.indirect.handle = src->u.indirect.handle;
1109		dst->u.indirect.id = src->u.indirect.id;
1110		dst->u.indirect.sector_number = src->u.indirect.sector_number;
1111		n = min(MAX_INDIRECT_PAGES,
1112			INDIRECT_PAGES(dst->u.indirect.nr_segments));
1113		for (i = 0; i < n; i++)
1114			dst->u.indirect.indirect_grefs[i] =
1115				src->u.indirect.indirect_grefs[i];
1116		break;
1117
1118	default:
1119		/*
1120		 * Don't know how to translate this op. Only get the
1121		 * ID so failure can be reported to the frontend.
1122		 */
1123		dst->u.other.id = src->u.other.id;
1124		break;
1125	}
1126}
1127
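/* Same as blkif_get_x86_32_req(), but for the 64-bit protocol layout. */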
1128static void blkif_get_x86_64_req(struct blkif_request *dst,
1129				 const struct blkif_x86_64_request *src)
1130{
1131	unsigned int i, n;
1132
1133	dst->operation = READ_ONCE(src->operation);
1134
1135	switch (dst->operation) {
1136	case BLKIF_OP_READ:
1137	case BLKIF_OP_WRITE:
1138	case BLKIF_OP_WRITE_BARRIER:
1139	case BLKIF_OP_FLUSH_DISKCACHE:
1140		dst->u.rw.nr_segments = READ_ONCE(src->u.rw.nr_segments);
1141		dst->u.rw.handle = src->u.rw.handle;
1142		dst->u.rw.id = src->u.rw.id;
1143		dst->u.rw.sector_number = src->u.rw.sector_number;
1144		n = min_t(unsigned int, BLKIF_MAX_SEGMENTS_PER_REQUEST,
1145			  dst->u.rw.nr_segments);
1146		for (i = 0; i < n; i++)
1147			dst->u.rw.seg[i] = src->u.rw.seg[i];
1148		break;
1149
1150	case BLKIF_OP_DISCARD:
1151		dst->u.discard.flag = src->u.discard.flag;
1152		dst->u.discard.id = src->u.discard.id;
1153		dst->u.discard.sector_number = src->u.discard.sector_number;
1154		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
1155		break;
1156
1157	case BLKIF_OP_INDIRECT:
1158		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
1159		dst->u.indirect.nr_segments =
1160			READ_ONCE(src->u.indirect.nr_segments);
1161		dst->u.indirect.handle = src->u.indirect.handle;
1162		dst->u.indirect.id = src->u.indirect.id;
1163		dst->u.indirect.sector_number = src->u.indirect.sector_number;
1164		n = min(MAX_INDIRECT_PAGES,
1165			INDIRECT_PAGES(dst->u.indirect.nr_segments));
1166		for (i = 0; i < n; i++)
1167			dst->u.indirect.indirect_grefs[i] =
1168				src->u.indirect.indirect_grefs[i];
1169		break;
1170
1171	default:
1172		/*
1173		 * Don't know how to translate this op. Only get the
1174		 * ID so failure can be reported to the frontend.
1175		 */
1176		dst->u.other.id = src->u.other.id;
1177		break;
1178	}
1179}
1180
1181/*
1182 * Copy the 'struct blkif_request' from the ring buffer (which has the
1183 * sectors we want, the number of them, grant references, etc.) and
1184 * transmute it into the block API to hand it over to the proper block disk.
1185 */
1186static int
1187__do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
1188{
1189	union blkif_back_rings *blk_rings = &ring->blk_rings;
1190	struct blkif_request req;
1191	struct pending_req *pending_req;
1192	RING_IDX rc, rp;
1193	int more_to_do = 0;
1194
1195	rc = blk_rings->common.req_cons;
1196	rp = blk_rings->common.sring->req_prod;
1197	rmb(); /* Ensure we see queued requests up to 'rp'. */
1198
1199	if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1200		rc = blk_rings->common.rsp_prod_pvt;
1201		pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
1202			rp, rc, rp - rc, ring->blkif->vbd.pdevice);
1203		return -EACCES;
1204	}
1205	while (rc != rp) {
1206
1207		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
1208			break;
1209
1210		/* We've seen a request, so clear spurious eoi flag. */
1211		*eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;
1212
1213		if (kthread_should_stop()) {
1214			more_to_do = 1;
1215			break;
1216		}
1217
1218		pending_req = alloc_req(ring);
1219		if (!pending_req) {
1220			ring->st_oo_req++;
1221			more_to_do = 1;
1222			break;
1223		}
1224
1225		switch (ring->blkif->blk_protocol) {
1226		case BLKIF_PROTOCOL_NATIVE:
1227			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
1228			break;
1229		case BLKIF_PROTOCOL_X86_32:
1230			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
1231			break;
1232		case BLKIF_PROTOCOL_X86_64:
1233			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
1234			break;
1235		default:
1236			BUG();
1237		}
1238		blk_rings->common.req_cons = ++rc; /* before make_response() */
1239
1240		/* Apply all sanity checks to /private copy/ of request. */
1241		barrier();
1242
1243		switch (req.operation) {
1244		case BLKIF_OP_READ:
1245		case BLKIF_OP_WRITE:
1246		case BLKIF_OP_WRITE_BARRIER:
1247		case BLKIF_OP_FLUSH_DISKCACHE:
1248		case BLKIF_OP_INDIRECT:
1249			if (dispatch_rw_block_io(ring, &req, pending_req))
1250				goto done;
1251			break;
1252		case BLKIF_OP_DISCARD:
1253			free_req(ring, pending_req);
1254			if (dispatch_discard_io(ring, &req))
1255				goto done;
1256			break;
1257		default:
1258			if (dispatch_other_io(ring, &req, pending_req))
1259				goto done;
1260			break;
1261		}
1262
1263		/* Yield point for this unbounded loop. */
1264		cond_resched();
1265	}
1266done:
1267	return more_to_do;
1268}
1269
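/*
 * Keep consuming requests until the ring is really empty:
 * RING_FINAL_CHECK_FOR_REQUESTS closes the race with a frontend that
 * queues another request just as __do_block_io_op finishes.
 */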
1270static int
1271do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
1272{
1273	union blkif_back_rings *blk_rings = &ring->blk_rings;
1274	int more_to_do;
1275
1276	do {
1277		more_to_do = __do_block_io_op(ring, eoi_flags);
1278		if (more_to_do)
1279			break;
1280
1281		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
1282	} while (more_to_do);
1283
1284	return more_to_do;
1285}
1286/*
1287 * Transmute the 'struct blkif_request' into a proper 'struct bio' and
1288 * call submit_bio() to pass it to the underlying storage.
1289 */
1290static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
1291				struct blkif_request *req,
1292				struct pending_req *pending_req)
1293{
1294	struct phys_req preq;
1295	struct seg_buf *seg = pending_req->seg;
1296	unsigned int nseg;
1297	struct bio *bio = NULL;
1298	struct bio **biolist = pending_req->biolist;
1299	int i, nbio = 0;
1300	enum req_op operation;
1301	blk_opf_t operation_flags = 0;
1302	struct blk_plug plug;
1303	bool drain = false;
1304	struct grant_page **pages = pending_req->segments;
1305	unsigned short req_operation;
1306
1307	req_operation = req->operation == BLKIF_OP_INDIRECT ?
1308			req->u.indirect.indirect_op : req->operation;
1309
1310	if ((req->operation == BLKIF_OP_INDIRECT) &&
1311	    (req_operation != BLKIF_OP_READ) &&
1312	    (req_operation != BLKIF_OP_WRITE)) {
1313		pr_debug("Invalid indirect operation (%u)\n", req_operation);
1314		goto fail_response;
1315	}
1316
1317	switch (req_operation) {
1318	case BLKIF_OP_READ:
1319		ring->st_rd_req++;
1320		operation = REQ_OP_READ;
1321		break;
1322	case BLKIF_OP_WRITE:
1323		ring->st_wr_req++;
1324		operation = REQ_OP_WRITE;
1325		operation_flags = REQ_SYNC | REQ_IDLE;
1326		break;
1327	case BLKIF_OP_WRITE_BARRIER:
1328		drain = true;
1329		fallthrough;
1330	case BLKIF_OP_FLUSH_DISKCACHE:
1331		ring->st_f_req++;
1332		operation = REQ_OP_WRITE;
1333		operation_flags = REQ_PREFLUSH;
1334		break;
1335	default:
1336		operation = 0; /* make gcc happy */
1337		goto fail_response;
1338		break;
1339	}
1340
1341	/* Check that the number of segments is sane. */
1342	nseg = req->operation == BLKIF_OP_INDIRECT ?
1343	       req->u.indirect.nr_segments : req->u.rw.nr_segments;
1344
1345	if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
1346	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
1347		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
1348	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
1349		     (nseg > MAX_INDIRECT_SEGMENTS))) {
1350		pr_debug("Bad number of segments in request (%d)\n", nseg);
1351		/* Haven't submitted any bios yet. */
1352		goto fail_response;
1353	}
1354
1355	preq.nr_sects      = 0;
1356
1357	pending_req->ring      = ring;
1358	pending_req->id        = req->u.rw.id;
1359	pending_req->operation = req_operation;
1360	pending_req->status    = BLKIF_RSP_OKAY;
1361	pending_req->nr_segs   = nseg;
1362
1363	if (req->operation != BLKIF_OP_INDIRECT) {
1364		preq.dev               = req->u.rw.handle;
1365		preq.sector_number     = req->u.rw.sector_number;
1366		for (i = 0; i < nseg; i++) {
1367			pages[i]->gref = req->u.rw.seg[i].gref;
1368			seg[i].nsec = req->u.rw.seg[i].last_sect -
1369				req->u.rw.seg[i].first_sect + 1;
1370			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
1371			if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
1372			    (req->u.rw.seg[i].last_sect <
1373			     req->u.rw.seg[i].first_sect))
1374				goto fail_response;
1375			preq.nr_sects += seg[i].nsec;
1376		}
1377	} else {
1378		preq.dev               = req->u.indirect.handle;
1379		preq.sector_number     = req->u.indirect.sector_number;
1380		if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
1381			goto fail_response;
1382	}
1383
1384	if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
1385		pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
1386			 operation == REQ_OP_READ ? "read" : "write",
1387			 preq.sector_number,
1388			 preq.sector_number + preq.nr_sects,
1389			 ring->blkif->vbd.pdevice);
1390		goto fail_response;
1391	}
1392
1393	/*
1394	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
1395	 * is set there.
1396	 */
1397	for (i = 0; i < nseg; i++) {
1398		if (((int)preq.sector_number | (int)seg[i].nsec) &
1399		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
1400			pr_debug("Misaligned I/O request from domain %d\n",
1401				 ring->blkif->domid);
1402			goto fail_response;
1403		}
1404	}
1405
1406	/* Wait on all outstanding I/Os and, once they have completed,
1407	 * issue the flush.
1408	 */
1409	if (drain)
1410		xen_blk_drain_io(pending_req->ring);
1411
1412	/*
1413	 * If we have failed at this point, we need to undo the M2P override,
1414	 * set gnttab_set_unmap_op on all of the grant references and perform
1415	 * the hypercall to unmap the grants - that is all done in
1416	 * xen_blkbk_unmap.
1417	 */
1418	if (xen_blkbk_map_seg(pending_req))
1419		goto fail_flush;
1420
1421	/*
1422	 * The corresponding xen_blkif_put is done in __end_block_io_op once
1423	 * the last bio for this request has completed.
1424	 */
1425	xen_blkif_get(ring->blkif);
1426	atomic_inc(&ring->inflight);
1427
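	/*
	 * Pack the segments into as few bios as possible: keep adding pages
	 * to the current bio and only allocate a new one when bio_add_page()
	 * refuses (or there is no bio yet).
	 */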
1428	for (i = 0; i < nseg; i++) {
1429		while ((bio == NULL) ||
1430		       (bio_add_page(bio,
1431				     pages[i]->page,
1432				     seg[i].nsec << 9,
1433				     seg[i].offset) == 0)) {
1434			bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i),
1435					operation | operation_flags,
1436					GFP_KERNEL);
1437			biolist[nbio++] = bio;
1438			bio->bi_private = pending_req;
1439			bio->bi_end_io  = end_block_io_op;
1440			bio->bi_iter.bi_sector  = preq.sector_number;
1441		}
1442
1443		preq.sector_number += seg[i].nsec;
1444	}
1445
1446	/* This will be hit if the operation was a flush. */
1447	if (!bio) {
1448		BUG_ON(operation_flags != REQ_PREFLUSH);
1449
1450		bio = bio_alloc(preq.bdev, 0, operation | operation_flags,
1451				GFP_KERNEL);
1452		biolist[nbio++] = bio;
1453		bio->bi_private = pending_req;
1454		bio->bi_end_io  = end_block_io_op;
1455	}
1456
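	/*
	 * Arm the completion count before submitting anything; the bio that
	 * drops pendcnt to zero unmaps the grants and sends the response.
	 */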
1457	atomic_set(&pending_req->pendcnt, nbio);
1458	blk_start_plug(&plug);
1459
1460	for (i = 0; i < nbio; i++)
1461		submit_bio(biolist[i]);
1462
1463	/* Let the I/Os go. */
1464	blk_finish_plug(&plug);
1465
1466	if (operation == REQ_OP_READ)
1467		ring->st_rd_sect += preq.nr_sects;
1468	else if (operation == REQ_OP_WRITE)
1469		ring->st_wr_sect += preq.nr_sects;
1470
1471	return 0;
1472
1473 fail_flush:
1474	xen_blkbk_unmap(ring, pending_req->segments,
1475	                pending_req->nr_segs);
1476 fail_response:
1477	/* Haven't submitted any bios yet. */
1478	make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
1479	free_req(ring, pending_req);
1480	msleep(1); /* back off a bit */
1481	return -EIO;
1482}
1483
1484
1485
1486/*
1487 * Put a response on the ring on how the operation fared.
1488 */
1489static void make_response(struct xen_blkif_ring *ring, u64 id,
1490			  unsigned short op, int st)
1491{
1492	struct blkif_response *resp;
1493	unsigned long     flags;
1494	union blkif_back_rings *blk_rings;
1495	int notify;
1496
1497	spin_lock_irqsave(&ring->blk_ring_lock, flags);
1498	blk_rings = &ring->blk_rings;
1499	/* Place on the response ring for the relevant domain. */
1500	switch (ring->blkif->blk_protocol) {
1501	case BLKIF_PROTOCOL_NATIVE:
1502		resp = RING_GET_RESPONSE(&blk_rings->native,
1503					 blk_rings->native.rsp_prod_pvt);
1504		break;
1505	case BLKIF_PROTOCOL_X86_32:
1506		resp = RING_GET_RESPONSE(&blk_rings->x86_32,
1507					 blk_rings->x86_32.rsp_prod_pvt);
1508		break;
1509	case BLKIF_PROTOCOL_X86_64:
1510		resp = RING_GET_RESPONSE(&blk_rings->x86_64,
1511					 blk_rings->x86_64.rsp_prod_pvt);
1512		break;
1513	default:
1514		BUG();
1515	}
1516
1517	resp->id        = id;
1518	resp->operation = op;
1519	resp->status    = st;
1520
1521	blk_rings->common.rsp_prod_pvt++;
1522	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
1523	spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
1524	if (notify)
1525		notify_remote_via_irq(ring->irq);
1526}
1527
1528static int __init xen_blkif_init(void)
1529{
1530	int rc = 0;
1531
1532	if (!xen_domain())
1533		return -ENODEV;
1534
1535	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
1536		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
1537			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
1538		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
1539	}
1540
1541	if (xenblk_max_queues == 0)
1542		xenblk_max_queues = num_online_cpus();
1543
1544	rc = xen_blkif_interface_init();
1545	if (rc)
1546		goto failed_init;
1547
1548	rc = xen_blkif_xenbus_init();
1549	if (rc)
1550		goto failed_init;
1551
1552 failed_init:
1553	return rc;
1554}
1555
1556module_init(xen_blkif_init);
1557
1558static void __exit xen_blkif_fini(void)
1559{
1560	xen_blkif_xenbus_fini();
1561	xen_blkif_interface_fini();
1562}
1563
1564module_exit(xen_blkif_fini);
1565
1566MODULE_DESCRIPTION("Virtual block device back-end driver");
1567MODULE_LICENSE("Dual BSD/GPL");
1568MODULE_ALIAS("xen-backend:vbd");