   1
   2/*
   3   rbd.c -- Export ceph rados objects as a Linux block device
   4
   5
   6   based on drivers/block/osdblk.c:
   7
   8   Copyright 2009 Red Hat, Inc.
   9
  10   This program is free software; you can redistribute it and/or modify
  11   it under the terms of the GNU General Public License as published by
  12   the Free Software Foundation.
  13
  14   This program is distributed in the hope that it will be useful,
  15   but WITHOUT ANY WARRANTY; without even the implied warranty of
  16   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17   GNU General Public License for more details.
  18
  19   You should have received a copy of the GNU General Public License
  20   along with this program; see the file COPYING.  If not, write to
  21   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  22
  23
  24
  25   For usage instructions, please refer to:
  26
  27                 Documentation/ABI/testing/sysfs-bus-rbd
  28
  29 */
  30
  31#include <linux/ceph/libceph.h>
  32#include <linux/ceph/osd_client.h>
  33#include <linux/ceph/mon_client.h>
  34#include <linux/ceph/decode.h>
  35#include <linux/parser.h>
  36#include <linux/bsearch.h>
  37
  38#include <linux/kernel.h>
  39#include <linux/device.h>
  40#include <linux/module.h>
  41#include <linux/fs.h>
  42#include <linux/blkdev.h>
  43#include <linux/slab.h>
  44#include <linux/idr.h>
  45
  46#include "rbd_types.h"
  47
  48#define RBD_DEBUG	/* Activate rbd_assert() calls */
  49
  50/*
  51 * The basic unit of block I/O is a sector.  It is interpreted in a
  52 * number of contexts in Linux (blk, bio, genhd), but the default is
  53 * universally 512 bytes.  These symbols are just slightly more
  54 * meaningful than the bare numbers they represent.
  55 */
  56#define	SECTOR_SHIFT	9
  57#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
  58
  59/*
  60 * Increment the given counter and return its updated value.
  61 * If the counter is already 0 it will not be incremented.
  62 * If the counter is already at its maximum value returns
  63 * -EINVAL without updating it.
  64 */
  65static int atomic_inc_return_safe(atomic_t *v)
  66{
  67	unsigned int counter;
  68
  69	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
  70	if (counter <= (unsigned int)INT_MAX)
  71		return (int)counter;
  72
  73	atomic_dec(v);
  74
  75	return -EINVAL;
  76}
  77
  78/* Decrement the counter.  Return the resulting value, or -EINVAL */
  79static int atomic_dec_return_safe(atomic_t *v)
  80{
  81	int counter;
  82
  83	counter = atomic_dec_return(v);
  84	if (counter >= 0)
  85		return counter;
  86
  87	atomic_inc(v);
  88
  89	return -EINVAL;
  90}
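/*
 * Together these give a saturating counter: the increment is a no-op
 * when the counter is 0 and is undone past INT_MAX, the decrement is
 * undone if it would go negative, and the undo cases read as -EINVAL.
 */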
  91
  92#define RBD_DRV_NAME "rbd"
  93
  94#define RBD_MINORS_PER_MAJOR		256
  95#define RBD_SINGLE_MAJOR_PART_SHIFT	4
  96
  97#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
  98#define RBD_MAX_SNAP_NAME_LEN	\
  99			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
 100
 101#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
 102
 103#define RBD_SNAP_HEAD_NAME	"-"
 104
 105#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
 106
 107/* This allows a single page to hold an image name sent by OSD */
 108#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
 109#define RBD_IMAGE_ID_LEN_MAX	64
 110
 111#define RBD_OBJ_PREFIX_LEN_MAX	64
 112
 113/* Feature bits */
 114
 115#define RBD_FEATURE_LAYERING	(1<<0)
 116#define RBD_FEATURE_STRIPINGV2	(1<<1)
 117#define RBD_FEATURES_ALL \
 118	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
 119
 120/* Features supported by this (client software) implementation. */
 121
 122#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
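/*
 * Feature bits outside RBD_FEATURES_SUPPORTED are treated as
 * incompatible: probing an image that advertises them fails rather
 * than mapping the image with semantics this client can't honor.
 */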
 123
 124/*
 125 * An RBD device name will be "rbd#", where the "rbd" comes from
 126 * RBD_DRV_NAME above, and # is a unique integer identifier.
 127 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 128 * enough to hold all possible device names.
 129 */
 130#define DEV_NAME_LEN		32
 131#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
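/*
 * (5 * sizeof (int)) / 2 + 1 conservatively bounds the number of
 * decimal digits an int can take: each byte needs at most ~2.5 digits,
 * so a 4-byte int gets 11 characters, enough for its 10 digits plus a
 * sign.
 */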
 132
 133/*
 134 * block device image metadata (in-memory version)
 135 */
 136struct rbd_image_header {
 137	/* These six fields never change for a given rbd image */
 138	char *object_prefix;
 139	__u8 obj_order;
 140	__u8 crypt_type;
 141	__u8 comp_type;
 142	u64 stripe_unit;
 143	u64 stripe_count;
 144	u64 features;		/* Might be changeable someday? */
 145
 146	/* The remaining fields need to be updated occasionally */
 147	u64 image_size;
 148	struct ceph_snap_context *snapc;
 149	char *snap_names;	/* format 1 only */
 150	u64 *snap_sizes;	/* format 1 only */
 151};
 152
 153/*
 154 * An rbd image specification.
 155 *
 156 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 157 * identify an image.  Each rbd_dev structure includes a pointer to
 158 * an rbd_spec structure that encapsulates this identity.
 159 *
 160 * Each of the id's in an rbd_spec has an associated name.  For a
 161 * user-mapped image, the names are supplied and the id's associated
 162 * with them are looked up.  For a layered image, a parent image is
 163 * defined by the tuple, and the names are looked up.
 164 *
 165 * An rbd_dev structure contains a parent_spec pointer which is
 166 * non-null if the image it represents is a child in a layered
 167 * image.  This pointer will refer to the rbd_spec structure used
 168 * by the parent rbd_dev for its own identity (i.e., the structure
 169 * is shared between the parent and child).
 170 *
 171 * Since these structures are populated once, during the discovery
 172 * phase of image construction, they are effectively immutable so
 173 * we make no effort to synchronize access to them.
 174 *
 175 * Note that code herein does not assume the image name is known (it
 176 * could be a null pointer).
 177 */
 178struct rbd_spec {
 179	u64		pool_id;
 180	const char	*pool_name;
 181
 182	const char	*image_id;
 183	const char	*image_name;
 184
 185	u64		snap_id;
 186	const char	*snap_name;
 187
 188	struct kref	kref;
 189};
 190
 191/*
 192 * an instance of the client.  multiple devices may share an rbd client.
 193 */
 194struct rbd_client {
 195	struct ceph_client	*client;
 196	struct kref		kref;
 197	struct list_head	node;
 198};
 199
 200struct rbd_img_request;
 201typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
 202
 203#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
 204
 205struct rbd_obj_request;
 206typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
 207
 208enum obj_request_type {
 209	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
 210};
 211
 212enum obj_req_flags {
 213	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
 214	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
 215	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
 216	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
 217};
 218
 219struct rbd_obj_request {
 220	const char		*object_name;
 221	u64			offset;		/* object start byte */
 222	u64			length;		/* bytes from offset */
 223	unsigned long		flags;
 224
 225	/*
 226	 * An object request associated with an image will have its
 227	 * img_data flag set; a standalone object request will not.
 228	 *
 229	 * A standalone object request will have which == BAD_WHICH
 230	 * and a null obj_request pointer.
 231	 *
 232	 * An object request initiated in support of a layered image
 233	 * object (to check for its existence before a write) will
 234	 * have which == BAD_WHICH and a non-null obj_request pointer.
 235	 *
 236	 * Finally, an object request for rbd image data will have
 237	 * which != BAD_WHICH, and will have a non-null img_request
 238	 * pointer.  The value of which will be in the range
 239	 * 0..(img_request->obj_request_count-1).
 240	 */
 241	union {
 242		struct rbd_obj_request	*obj_request;	/* STAT op */
 243		struct {
 244			struct rbd_img_request	*img_request;
 245			u64			img_offset;
 246			/* links for img_request->obj_requests list */
 247			struct list_head	links;
 248		};
 249	};
 250	u32			which;		/* posn image request list */
 251
 252	enum obj_request_type	type;
 253	union {
 254		struct bio	*bio_list;
 255		struct {
 256			struct page	**pages;
 257			u32		page_count;
 258		};
 259	};
 260	struct page		**copyup_pages;
 261	u32			copyup_page_count;
 262
 263	struct ceph_osd_request	*osd_req;
 264
 265	u64			xferred;	/* bytes transferred */
 266	int			result;
 267
 268	rbd_obj_callback_t	callback;
 269	struct completion	completion;
 270
 271	struct kref		kref;
 272};
 273
 274enum img_req_flags {
 275	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
 276	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
 277	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
 278};
 279
 280struct rbd_img_request {
 281	struct rbd_device	*rbd_dev;
 282	u64			offset;	/* starting image byte offset */
 283	u64			length;	/* byte count from offset */
 284	unsigned long		flags;
 285	union {
 286		u64			snap_id;	/* for reads */
 287		struct ceph_snap_context *snapc;	/* for writes */
 288	};
 289	union {
 290		struct request		*rq;		/* block request */
 291		struct rbd_obj_request	*obj_request;	/* obj req initiator */
 292	};
 293	struct page		**copyup_pages;
 294	u32			copyup_page_count;
 295	spinlock_t		completion_lock;/* protects next_completion */
 296	u32			next_completion;
 297	rbd_img_callback_t	callback;
 298	u64			xferred;/* aggregate bytes transferred */
 299	int			result;	/* first nonzero obj_request result */
 300
 301	u32			obj_request_count;
 302	struct list_head	obj_requests;	/* rbd_obj_request structs */
 303
 304	struct kref		kref;
 305};
 306
 307#define for_each_obj_request(ireq, oreq) \
 308	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
 309#define for_each_obj_request_from(ireq, oreq) \
 310	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
 311#define for_each_obj_request_safe(ireq, oreq, n) \
 312	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
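/*
 * Helpers for walking an image request's list of object requests.
 * The _safe variant iterates in reverse and allows the current entry
 * to be removed, which is what the teardown paths rely on.
 */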
 313
 314struct rbd_mapping {
 315	u64                     size;
 316	u64                     features;
 317	bool			read_only;
 318};
 319
 320/*
 321 * a single device
 322 */
 323struct rbd_device {
 324	int			dev_id;		/* blkdev unique id */
 325
 326	int			major;		/* blkdev assigned major */
 327	int			minor;
 328	struct gendisk		*disk;		/* blkdev's gendisk and rq */
 329
 330	u32			image_format;	/* Either 1 or 2 */
 331	struct rbd_client	*rbd_client;
 332
 333	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 334
 335	spinlock_t		lock;		/* queue, flags, open_count */
 336
 337	struct rbd_image_header	header;
 338	unsigned long		flags;		/* possibly lock protected */
 339	struct rbd_spec		*spec;
 340
 341	char			*header_name;
 342
 343	struct ceph_file_layout	layout;
 344
 345	struct ceph_osd_event   *watch_event;
 346	struct rbd_obj_request	*watch_request;
 347
 348	struct rbd_spec		*parent_spec;
 349	u64			parent_overlap;
 350	atomic_t		parent_ref;
 351	struct rbd_device	*parent;
 352
 353	/* protects updating the header */
 354	struct rw_semaphore     header_rwsem;
 355
 356	struct rbd_mapping	mapping;
 357
 358	struct list_head	node;
 359
 360	/* sysfs related */
 361	struct device		dev;
 362	unsigned long		open_count;	/* protected by lock */
 363};
 364
 365/*
 366 * Flag bits for rbd_dev->flags.  If atomicity is required,
 367 * rbd_dev->lock is used to protect access.
 368 *
 369 * Currently, only the "removing" flag (which is coupled with the
 370 * "open_count" field) requires atomic access.
 371 */
 372enum rbd_dev_flags {
 373	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
 374	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
 375};
 376
 377static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
 378
 379static LIST_HEAD(rbd_dev_list);    /* devices */
 380static DEFINE_SPINLOCK(rbd_dev_list_lock);
 381
 382static LIST_HEAD(rbd_client_list);		/* clients */
 383static DEFINE_SPINLOCK(rbd_client_list_lock);
 384
 385/* Slab caches for frequently-allocated structures */
 386
 387static struct kmem_cache	*rbd_img_request_cache;
 388static struct kmem_cache	*rbd_obj_request_cache;
 389static struct kmem_cache	*rbd_segment_name_cache;
 390
 391static int rbd_major;
 392static DEFINE_IDA(rbd_dev_id_ida);
 393
 394/*
  395 * Default to false for now, as single-major requires version >= 0.75 of
  396 * the userspace rbd utility.
 397 */
 398static bool single_major = false;
 399module_param(single_major, bool, S_IRUGO);
 400MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
 401
 402static int rbd_img_request_submit(struct rbd_img_request *img_request);
 403
 404static void rbd_dev_device_release(struct device *dev);
 405
 406static ssize_t rbd_add(struct bus_type *bus, const char *buf,
 407		       size_t count);
 408static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
 409			  size_t count);
 410static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
 411				    size_t count);
 412static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
 413				       size_t count);
 414static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
 415static void rbd_spec_put(struct rbd_spec *spec);
 416
 417static int rbd_dev_id_to_minor(int dev_id)
 418{
 419	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
 420}
 421
 422static int minor_to_rbd_dev_id(int minor)
 423{
 424	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
 425}
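/*
 * With RBD_SINGLE_MAJOR_PART_SHIFT == 4, each device owns a block of
 * 16 minors (whole disk plus 15 partitions) under the shared major:
 * dev_id 3 maps to minor 48, and minors 48..63 map back to dev_id 3.
 */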
 426
 427static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
 428static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
 429static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
 430static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
 431
 432static struct attribute *rbd_bus_attrs[] = {
 433	&bus_attr_add.attr,
 434	&bus_attr_remove.attr,
 435	&bus_attr_add_single_major.attr,
 436	&bus_attr_remove_single_major.attr,
 437	NULL,
 438};
 439
 440static umode_t rbd_bus_is_visible(struct kobject *kobj,
 441				  struct attribute *attr, int index)
 442{
 443	if (!single_major &&
 444	    (attr == &bus_attr_add_single_major.attr ||
 445	     attr == &bus_attr_remove_single_major.attr))
 446		return 0;
 447
 448	return attr->mode;
 449}
 450
 451static const struct attribute_group rbd_bus_group = {
 452	.attrs = rbd_bus_attrs,
 453	.is_visible = rbd_bus_is_visible,
 454};
 455__ATTRIBUTE_GROUPS(rbd_bus);
 456
 457static struct bus_type rbd_bus_type = {
 458	.name		= "rbd",
 459	.bus_groups	= rbd_bus_groups,
 460};
 461
 462static void rbd_root_dev_release(struct device *dev)
 463{
 464}
 465
 466static struct device rbd_root_dev = {
 467	.init_name =    "rbd",
 468	.release =      rbd_root_dev_release,
 469};
 470
 471static __printf(2, 3)
 472void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 473{
 474	struct va_format vaf;
 475	va_list args;
 476
 477	va_start(args, fmt);
 478	vaf.fmt = fmt;
 479	vaf.va = &args;
 480
 481	if (!rbd_dev)
 482		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 483	else if (rbd_dev->disk)
 484		printk(KERN_WARNING "%s: %s: %pV\n",
 485			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 486	else if (rbd_dev->spec && rbd_dev->spec->image_name)
 487		printk(KERN_WARNING "%s: image %s: %pV\n",
 488			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 489	else if (rbd_dev->spec && rbd_dev->spec->image_id)
 490		printk(KERN_WARNING "%s: id %s: %pV\n",
 491			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 492	else	/* punt */
 493		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 494			RBD_DRV_NAME, rbd_dev, &vaf);
 495	va_end(args);
 496}
 497
 498#ifdef RBD_DEBUG
 499#define rbd_assert(expr)						\
 500		if (unlikely(!(expr))) {				\
 501			printk(KERN_ERR "\nAssertion failure in %s() "	\
 502						"at line %d:\n\n"	\
 503					"\trbd_assert(%s);\n\n",	\
 504					__func__, __LINE__, #expr);	\
 505			BUG();						\
 506		}
 507#else /* !RBD_DEBUG */
 508#  define rbd_assert(expr)	((void) 0)
 509#endif /* !RBD_DEBUG */
 510
 511static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
 512static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 513static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 514
 515static int rbd_dev_refresh(struct rbd_device *rbd_dev);
 516static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
 517static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
 518static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 519					u64 snap_id);
 520static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 521				u8 *order, u64 *snap_size);
 522static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 523		u64 *snap_features);
 524static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
 525
 526static int rbd_open(struct block_device *bdev, fmode_t mode)
 527{
 528	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 529	bool removing = false;
 530
 531	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
 532		return -EROFS;
 533
 534	spin_lock_irq(&rbd_dev->lock);
 535	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 536		removing = true;
 537	else
 538		rbd_dev->open_count++;
 539	spin_unlock_irq(&rbd_dev->lock);
 540	if (removing)
 541		return -ENOENT;
 542
 543	(void) get_device(&rbd_dev->dev);
 544	set_device_ro(bdev, rbd_dev->mapping.read_only);
 545
 546	return 0;
 547}
 548
 549static void rbd_release(struct gendisk *disk, fmode_t mode)
 550{
 551	struct rbd_device *rbd_dev = disk->private_data;
 552	unsigned long open_count_before;
 553
 554	spin_lock_irq(&rbd_dev->lock);
 555	open_count_before = rbd_dev->open_count--;
 556	spin_unlock_irq(&rbd_dev->lock);
 557	rbd_assert(open_count_before > 0);
 558
 559	put_device(&rbd_dev->dev);
 560}
 561
 562static const struct block_device_operations rbd_bd_ops = {
 563	.owner			= THIS_MODULE,
 564	.open			= rbd_open,
 565	.release		= rbd_release,
 566};
 567
 568/*
 569 * Initialize an rbd client instance.  Success or not, this function
 570 * consumes ceph_opts.  Caller holds client_mutex.
 571 */
 572static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 573{
 574	struct rbd_client *rbdc;
 575	int ret = -ENOMEM;
 576
 577	dout("%s:\n", __func__);
 578	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 579	if (!rbdc)
 580		goto out_opt;
 581
 582	kref_init(&rbdc->kref);
 583	INIT_LIST_HEAD(&rbdc->node);
 584
 585	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
 586	if (IS_ERR(rbdc->client))
 587		goto out_rbdc;
 588	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 589
 590	ret = ceph_open_session(rbdc->client);
 591	if (ret < 0)
 592		goto out_client;
 593
 594	spin_lock(&rbd_client_list_lock);
 595	list_add_tail(&rbdc->node, &rbd_client_list);
 596	spin_unlock(&rbd_client_list_lock);
 597
 598	dout("%s: rbdc %p\n", __func__, rbdc);
 599
 600	return rbdc;
 601out_client:
 602	ceph_destroy_client(rbdc->client);
 603out_rbdc:
 604	kfree(rbdc);
 605out_opt:
 606	if (ceph_opts)
 607		ceph_destroy_options(ceph_opts);
 608	dout("%s: error %d\n", __func__, ret);
 609
 610	return ERR_PTR(ret);
 611}
 612
 613static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 614{
 615	kref_get(&rbdc->kref);
 616
 617	return rbdc;
 618}
 619
 620/*
 621 * Find a ceph client with specific addr and configuration.  If
 622 * found, bump its reference count.
 623 */
 624static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 625{
 626	struct rbd_client *client_node;
 627	bool found = false;
 628
 629	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 630		return NULL;
 631
 632	spin_lock(&rbd_client_list_lock);
 633	list_for_each_entry(client_node, &rbd_client_list, node) {
 634		if (!ceph_compare_options(ceph_opts, client_node->client)) {
 635			__rbd_get_client(client_node);
 636
 637			found = true;
 638			break;
 639		}
 640	}
 641	spin_unlock(&rbd_client_list_lock);
 642
 643	return found ? client_node : NULL;
 644}
 645
 646/*
 647 * mount options
 648 */
 649enum {
 650	Opt_last_int,
 651	/* int args above */
 652	Opt_last_string,
 653	/* string args above */
 654	Opt_read_only,
 655	Opt_read_write,
 656	/* Boolean args above */
 657	Opt_last_bool,
 658};
 659
 660static match_table_t rbd_opts_tokens = {
 661	/* int args above */
 662	/* string args above */
 663	{Opt_read_only, "read_only"},
 664	{Opt_read_only, "ro"},		/* Alternate spelling */
 665	{Opt_read_write, "read_write"},
 666	{Opt_read_write, "rw"},		/* Alternate spelling */
 667	/* Boolean args above */
 668	{-1, NULL}
 669};
 670
 671struct rbd_options {
 672	bool	read_only;
 673};
 674
 675#define RBD_READ_ONLY_DEFAULT	false
 676
 677static int parse_rbd_opts_token(char *c, void *private)
 678{
 679	struct rbd_options *rbd_opts = private;
 680	substring_t argstr[MAX_OPT_ARGS];
 681	int token, intval, ret;
 682
 683	token = match_token(c, rbd_opts_tokens, argstr);
 684	if (token < 0)
 685		return -EINVAL;
 686
 687	if (token < Opt_last_int) {
 688		ret = match_int(&argstr[0], &intval);
 689		if (ret < 0) {
 690			pr_err("bad mount option arg (not int) "
 691			       "at '%s'\n", c);
 692			return ret;
 693		}
 694		dout("got int token %d val %d\n", token, intval);
 695	} else if (token > Opt_last_int && token < Opt_last_string) {
 696		dout("got string token %d val %s\n", token,
 697		     argstr[0].from);
 698	} else if (token > Opt_last_string && token < Opt_last_bool) {
 699		dout("got Boolean token %d\n", token);
 700	} else {
 701		dout("got token %d\n", token);
 702	}
 703
 704	switch (token) {
 705	case Opt_read_only:
 706		rbd_opts->read_only = true;
 707		break;
 708	case Opt_read_write:
 709		rbd_opts->read_only = false;
 710		break;
 711	default:
 712		rbd_assert(false);
 713		break;
 714	}
 715	return 0;
 716}
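/*
 * This is intended as the "extra token" callback handed to
 * ceph_parse_options() when a map request is parsed, so any option
 * libceph itself doesn't recognize (e.g. "ro") lands here and updates
 * the per-mapping rbd_options.
 */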
 717
 718/*
 719 * Get a ceph client with specific addr and configuration, if one does
 720 * not exist create it.  Either way, ceph_opts is consumed by this
 721 * function.
 722 */
 723static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
 724{
 725	struct rbd_client *rbdc;
 726
 727	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
 728	rbdc = rbd_client_find(ceph_opts);
 729	if (rbdc)	/* using an existing client */
 730		ceph_destroy_options(ceph_opts);
 731	else
 732		rbdc = rbd_client_create(ceph_opts);
 733	mutex_unlock(&client_mutex);
 734
 735	return rbdc;
 736}
 737
 738/*
 739 * Destroy ceph client
 740 *
 741 * Caller must hold rbd_client_list_lock.
 742 */
 743static void rbd_client_release(struct kref *kref)
 744{
 745	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 746
 747	dout("%s: rbdc %p\n", __func__, rbdc);
 748	spin_lock(&rbd_client_list_lock);
 749	list_del(&rbdc->node);
 750	spin_unlock(&rbd_client_list_lock);
 751
 752	ceph_destroy_client(rbdc->client);
 753	kfree(rbdc);
 754}
 755
 756/*
 757 * Drop reference to ceph client node. If it's not referenced anymore, release
 758 * it.
 759 */
 760static void rbd_put_client(struct rbd_client *rbdc)
 761{
 762	if (rbdc)
 763		kref_put(&rbdc->kref, rbd_client_release);
 764}
 765
 766static bool rbd_image_format_valid(u32 image_format)
 767{
 768	return image_format == 1 || image_format == 2;
 769}
 770
 771static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
 772{
 773	size_t size;
 774	u32 snap_count;
 775
 776	/* The header has to start with the magic rbd header text */
 777	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
 778		return false;
 779
 780	/* The bio layer requires at least sector-sized I/O */
 781
 782	if (ondisk->options.order < SECTOR_SHIFT)
 783		return false;
 784
 785	/* If we use u64 in a few spots we may be able to loosen this */
 786
 787	if (ondisk->options.order > 8 * sizeof (int) - 1)
 788		return false;
 789
 790	/*
 791	 * The size of a snapshot header has to fit in a size_t, and
 792	 * that limits the number of snapshots.
 793	 */
 794	snap_count = le32_to_cpu(ondisk->snap_count);
 795	size = SIZE_MAX - sizeof (struct ceph_snap_context);
 796	if (snap_count > size / sizeof (__le64))
 797		return false;
 798
 799	/*
  800	 * Not only that, but the size of the entire snapshot
 801	 * header must also be representable in a size_t.
 802	 */
 803	size -= snap_count * sizeof (__le64);
 804	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
 805		return false;
 806
 807	return true;
 808}
 809
 810/*
 811 * Fill an rbd image header with information from the given format 1
 812 * on-disk header.
 813 */
 814static int rbd_header_from_disk(struct rbd_device *rbd_dev,
 815				 struct rbd_image_header_ondisk *ondisk)
 816{
 817	struct rbd_image_header *header = &rbd_dev->header;
 818	bool first_time = header->object_prefix == NULL;
 819	struct ceph_snap_context *snapc;
 820	char *object_prefix = NULL;
 821	char *snap_names = NULL;
 822	u64 *snap_sizes = NULL;
 823	u32 snap_count;
 824	size_t size;
 825	int ret = -ENOMEM;
 826	u32 i;
 827
 828	/* Allocate this now to avoid having to handle failure below */
 829
 830	if (first_time) {
 831		size_t len;
 832
 833		len = strnlen(ondisk->object_prefix,
 834				sizeof (ondisk->object_prefix));
 835		object_prefix = kmalloc(len + 1, GFP_KERNEL);
 836		if (!object_prefix)
 837			return -ENOMEM;
 838		memcpy(object_prefix, ondisk->object_prefix, len);
 839		object_prefix[len] = '\0';
 840	}
 841
 842	/* Allocate the snapshot context and fill it in */
 843
 844	snap_count = le32_to_cpu(ondisk->snap_count);
 845	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
 846	if (!snapc)
 847		goto out_err;
 848	snapc->seq = le64_to_cpu(ondisk->snap_seq);
 849	if (snap_count) {
 850		struct rbd_image_snap_ondisk *snaps;
 851		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
 852
 853		/* We'll keep a copy of the snapshot names... */
 854
 855		if (snap_names_len > (u64)SIZE_MAX)
 856			goto out_2big;
 857		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
 858		if (!snap_names)
 859			goto out_err;
 860
 861		/* ...as well as the array of their sizes. */
 862
 863		size = snap_count * sizeof (*header->snap_sizes);
 864		snap_sizes = kmalloc(size, GFP_KERNEL);
 865		if (!snap_sizes)
 866			goto out_err;
 867
 868		/*
 869		 * Copy the names, and fill in each snapshot's id
 870		 * and size.
 871		 *
 872		 * Note that rbd_dev_v1_header_info() guarantees the
 873		 * ondisk buffer we're working with has
 874		 * snap_names_len bytes beyond the end of the
  875		 * snapshot id array, so this memcpy() is safe.
 876		 */
 877		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
 878		snaps = ondisk->snaps;
 879		for (i = 0; i < snap_count; i++) {
 880			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
 881			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
 882		}
 883	}
 884
 885	/* We won't fail any more, fill in the header */
 886
 887	if (first_time) {
 888		header->object_prefix = object_prefix;
 889		header->obj_order = ondisk->options.order;
 890		header->crypt_type = ondisk->options.crypt_type;
 891		header->comp_type = ondisk->options.comp_type;
 892		/* The rest aren't used for format 1 images */
 893		header->stripe_unit = 0;
 894		header->stripe_count = 0;
 895		header->features = 0;
 896	} else {
 897		ceph_put_snap_context(header->snapc);
 898		kfree(header->snap_names);
 899		kfree(header->snap_sizes);
 900	}
 901
 902	/* The remaining fields always get updated (when we refresh) */
 903
 904	header->image_size = le64_to_cpu(ondisk->image_size);
 905	header->snapc = snapc;
 906	header->snap_names = snap_names;
 907	header->snap_sizes = snap_sizes;
 908
 909	/* Make sure mapping size is consistent with header info */
 910
 911	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
 912		if (rbd_dev->mapping.size != header->image_size)
 913			rbd_dev->mapping.size = header->image_size;
 914
 915	return 0;
 916out_2big:
 917	ret = -EIO;
 918out_err:
 919	kfree(snap_sizes);
 920	kfree(snap_names);
 921	ceph_put_snap_context(snapc);
 922	kfree(object_prefix);
 923
 924	return ret;
 925}
 926
 927static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
 928{
 929	const char *snap_name;
 930
 931	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 932
 933	/* Skip over names until we find the one we are looking for */
 934
 935	snap_name = rbd_dev->header.snap_names;
 936	while (which--)
 937		snap_name += strlen(snap_name) + 1;
 938
 939	return kstrdup(snap_name, GFP_KERNEL);
 940}
 941
 942/*
 943 * Snapshot id comparison function for use with qsort()/bsearch().
 944 * Note that result is for snapshots in *descending* order.
 945 */
 946static int snapid_compare_reverse(const void *s1, const void *s2)
 947{
 948	u64 snap_id1 = *(u64 *)s1;
 949	u64 snap_id2 = *(u64 *)s2;
 950
 951	if (snap_id1 < snap_id2)
 952		return 1;
 953	return snap_id1 == snap_id2 ? 0 : -1;
 954}
 955
 956/*
 957 * Search a snapshot context to see if the given snapshot id is
 958 * present.
 959 *
 960 * Returns the position of the snapshot id in the array if it's found,
 961 * or BAD_SNAP_INDEX otherwise.
 962 *
  963 * Note: The snapshot array is kept sorted (by the osd) in
 964 * reverse order, highest snapshot id first.
 965 */
 966static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
 967{
 968	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
 969	u64 *found;
 970
 971	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
 972				sizeof (snap_id), snapid_compare_reverse);
 973
 974	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
 975}
 976
 977static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
 978					u64 snap_id)
 979{
 980	u32 which;
 981	const char *snap_name;
 982
 983	which = rbd_dev_snap_index(rbd_dev, snap_id);
 984	if (which == BAD_SNAP_INDEX)
 985		return ERR_PTR(-ENOENT);
 986
 987	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
 988	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
 989}
 990
 991static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
 992{
 993	if (snap_id == CEPH_NOSNAP)
 994		return RBD_SNAP_HEAD_NAME;
 995
 996	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 997	if (rbd_dev->image_format == 1)
 998		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
 999
1000	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1001}
1002
1003static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1004				u64 *snap_size)
1005{
1006	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1007	if (snap_id == CEPH_NOSNAP) {
1008		*snap_size = rbd_dev->header.image_size;
1009	} else if (rbd_dev->image_format == 1) {
1010		u32 which;
1011
1012		which = rbd_dev_snap_index(rbd_dev, snap_id);
1013		if (which == BAD_SNAP_INDEX)
1014			return -ENOENT;
1015
1016		*snap_size = rbd_dev->header.snap_sizes[which];
1017	} else {
1018		u64 size = 0;
1019		int ret;
1020
1021		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1022		if (ret)
1023			return ret;
1024
1025		*snap_size = size;
1026	}
1027	return 0;
1028}
1029
1030static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1031			u64 *snap_features)
1032{
1033	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1034	if (snap_id == CEPH_NOSNAP) {
1035		*snap_features = rbd_dev->header.features;
1036	} else if (rbd_dev->image_format == 1) {
1037		*snap_features = 0;	/* No features for format 1 */
1038	} else {
1039		u64 features = 0;
1040		int ret;
1041
1042		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1043		if (ret)
1044			return ret;
1045
1046		*snap_features = features;
1047	}
1048	return 0;
1049}
1050
1051static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1052{
1053	u64 snap_id = rbd_dev->spec->snap_id;
1054	u64 size = 0;
1055	u64 features = 0;
1056	int ret;
1057
1058	ret = rbd_snap_size(rbd_dev, snap_id, &size);
1059	if (ret)
1060		return ret;
1061	ret = rbd_snap_features(rbd_dev, snap_id, &features);
1062	if (ret)
1063		return ret;
1064
1065	rbd_dev->mapping.size = size;
1066	rbd_dev->mapping.features = features;
1067
1068	return 0;
1069}
1070
1071static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1072{
1073	rbd_dev->mapping.size = 0;
1074	rbd_dev->mapping.features = 0;
1075}
1076
1077static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1078{
1079	char *name;
1080	u64 segment;
1081	int ret;
1082	char *name_format;
1083
1084	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
1085	if (!name)
1086		return NULL;
1087	segment = offset >> rbd_dev->header.obj_order;
1088	name_format = "%s.%012llx";
1089	if (rbd_dev->image_format == 2)
1090		name_format = "%s.%016llx";
1091	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
1092			rbd_dev->header.object_prefix, segment);
1093	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
1094		pr_err("error formatting segment name for #%llu (%d)\n",
1095			segment, ret);
1096		kfree(name);
1097		name = NULL;
1098	}
1099
1100	return name;
1101}
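/*
 * Example: with object_prefix "rb.0.1234.6b8b4567" and obj_order 22
 * (4 MB objects), image offset 0x900000 falls in segment 2, so the
 * object name becomes "rb.0.1234.6b8b4567.000000000002"; format 2
 * images use 16 hex digits for the segment number instead of 12.
 */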
1102
1103static void rbd_segment_name_free(const char *name)
1104{
1105	/* The explicit cast here is needed to drop the const qualifier */
1106
1107	kmem_cache_free(rbd_segment_name_cache, (void *)name);
1108}
1109
1110static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1111{
1112	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1113
1114	return offset & (segment_size - 1);
1115}
1116
1117static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1118				u64 offset, u64 length)
1119{
1120	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1121
1122	offset &= segment_size - 1;
1123
1124	rbd_assert(length <= U64_MAX - offset);
1125	if (offset + length > segment_size)
1126		length = segment_size - offset;
1127
1128	return length;
1129}
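/*
 * Example: with obj_order 22 (4 MB segments), an I/O at offset
 * 0x3ff000 for 0x2000 bytes has segment offset 0x3ff000 and gets its
 * length clipped to 0x1000, i.e. requests are split at segment
 * boundaries.
 */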
1130
1131/*
1132 * returns the size of an object in the image
1133 */
1134static u64 rbd_obj_bytes(struct rbd_image_header *header)
1135{
1136	return 1 << header->obj_order;
1137}
1138
1139/*
1140 * bio helpers
1141 */
1142
1143static void bio_chain_put(struct bio *chain)
1144{
1145	struct bio *tmp;
1146
1147	while (chain) {
1148		tmp = chain;
1149		chain = chain->bi_next;
1150		bio_put(tmp);
1151	}
1152}
1153
1154/*
1155 * zeros a bio chain, starting at specific offset
1156 */
1157static void zero_bio_chain(struct bio *chain, int start_ofs)
1158{
1159	struct bio_vec bv;
1160	struct bvec_iter iter;
1161	unsigned long flags;
1162	void *buf;
1163	int pos = 0;
1164
1165	while (chain) {
1166		bio_for_each_segment(bv, chain, iter) {
1167			if (pos + bv.bv_len > start_ofs) {
1168				int remainder = max(start_ofs - pos, 0);
1169				buf = bvec_kmap_irq(&bv, &flags);
1170				memset(buf + remainder, 0,
1171				       bv.bv_len - remainder);
1172				flush_dcache_page(bv.bv_page);
1173				bvec_kunmap_irq(buf, &flags);
1174			}
1175			pos += bv.bv_len;
1176		}
1177
1178		chain = chain->bi_next;
1179	}
1180}
1181
1182/*
1183 * similar to zero_bio_chain(), zeros data defined by a page array,
1184 * starting at the given byte offset from the start of the array and
1185 * continuing up to the given end offset.  The pages array is
1186 * assumed to be big enough to hold all bytes up to the end.
1187 */
1188static void zero_pages(struct page **pages, u64 offset, u64 end)
1189{
1190	struct page **page = &pages[offset >> PAGE_SHIFT];
1191
1192	rbd_assert(end > offset);
1193	rbd_assert(end - offset <= (u64)SIZE_MAX);
1194	while (offset < end) {
1195		size_t page_offset;
1196		size_t length;
1197		unsigned long flags;
1198		void *kaddr;
1199
1200		page_offset = offset & ~PAGE_MASK;
1201		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1202		local_irq_save(flags);
1203		kaddr = kmap_atomic(*page);
1204		memset(kaddr + page_offset, 0, length);
1205		flush_dcache_page(*page);
1206		kunmap_atomic(kaddr);
1207		local_irq_restore(flags);
1208
1209		offset += length;
1210		page++;
1211	}
1212}
1213
1214/*
1215 * Clone a portion of a bio, starting at the given byte offset
1216 * and continuing for the number of bytes indicated.
1217 */
1218static struct bio *bio_clone_range(struct bio *bio_src,
1219					unsigned int offset,
1220					unsigned int len,
1221					gfp_t gfpmask)
1222{
1223	struct bio *bio;
1224
1225	bio = bio_clone(bio_src, gfpmask);
1226	if (!bio)
1227		return NULL;	/* ENOMEM */
1228
1229	bio_advance(bio, offset);
1230	bio->bi_iter.bi_size = len;
1231
1232	return bio;
1233}
1234
1235/*
1236 * Clone a portion of a bio chain, starting at the given byte offset
1237 * into the first bio in the source chain and continuing for the
1238 * number of bytes indicated.  The result is another bio chain of
1239 * exactly the given length, or a null pointer on error.
1240 *
1241 * The bio_src and offset parameters are both in-out.  On entry they
1242 * refer to the first source bio and the offset into that bio where
1243 * the start of data to be cloned is located.
1244 *
1245 * On return, bio_src is updated to refer to the bio in the source
 1246 * chain that contains the first un-cloned byte, and *offset will
1247 * contain the offset of that byte within that bio.
1248 */
1249static struct bio *bio_chain_clone_range(struct bio **bio_src,
1250					unsigned int *offset,
1251					unsigned int len,
1252					gfp_t gfpmask)
1253{
1254	struct bio *bi = *bio_src;
1255	unsigned int off = *offset;
1256	struct bio *chain = NULL;
1257	struct bio **end;
1258
1259	/* Build up a chain of clone bios up to the limit */
1260
1261	if (!bi || off >= bi->bi_iter.bi_size || !len)
1262		return NULL;		/* Nothing to clone */
1263
1264	end = &chain;
1265	while (len) {
1266		unsigned int bi_size;
1267		struct bio *bio;
1268
1269		if (!bi) {
1270			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1271			goto out_err;	/* EINVAL; ran out of bio's */
1272		}
1273		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1274		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1275		if (!bio)
1276			goto out_err;	/* ENOMEM */
1277
1278		*end = bio;
1279		end = &bio->bi_next;
1280
1281		off += bi_size;
1282		if (off == bi->bi_iter.bi_size) {
1283			bi = bi->bi_next;
1284			off = 0;
1285		}
1286		len -= bi_size;
1287	}
1288	*bio_src = bi;
1289	*offset = off;
1290
1291	return chain;
1292out_err:
1293	bio_chain_put(chain);
1294
1295	return NULL;
1296}
1297
1298/*
1299 * The default/initial value for all object request flags is 0.  For
1300 * each flag, once its value is set to 1 it is never reset to 0
1301 * again.
1302 */
1303static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1304{
1305	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1306		struct rbd_device *rbd_dev;
1307
1308		rbd_dev = obj_request->img_request->rbd_dev;
1309		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1310			obj_request);
1311	}
1312}
1313
1314static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1315{
1316	smp_mb();
1317	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1318}
1319
1320static void obj_request_done_set(struct rbd_obj_request *obj_request)
1321{
1322	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1323		struct rbd_device *rbd_dev = NULL;
1324
1325		if (obj_request_img_data_test(obj_request))
1326			rbd_dev = obj_request->img_request->rbd_dev;
1327		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1328			obj_request);
1329	}
1330}
1331
1332static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1333{
1334	smp_mb();
1335	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1336}
1337
1338/*
1339 * This sets the KNOWN flag after (possibly) setting the EXISTS
1340 * flag.  The latter is set based on the "exists" value provided.
1341 *
1342 * Note that for our purposes once an object exists it never goes
 1343 * away again.  It's possible that the responses from two existence
1344 * checks are separated by the creation of the target object, and
1345 * the first ("doesn't exist") response arrives *after* the second
1346 * ("does exist").  In that case we ignore the second one.
1347 */
1348static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1349				bool exists)
1350{
1351	if (exists)
1352		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1353	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1354	smp_mb();
1355}
1356
1357static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1358{
1359	smp_mb();
1360	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1361}
1362
1363static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1364{
1365	smp_mb();
1366	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1367}
1368
1369static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1370{
1371	dout("%s: obj %p (was %d)\n", __func__, obj_request,
1372		atomic_read(&obj_request->kref.refcount));
1373	kref_get(&obj_request->kref);
1374}
1375
1376static void rbd_obj_request_destroy(struct kref *kref);
1377static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1378{
1379	rbd_assert(obj_request != NULL);
1380	dout("%s: obj %p (was %d)\n", __func__, obj_request,
1381		atomic_read(&obj_request->kref.refcount));
1382	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1383}
1384
1385static bool img_request_child_test(struct rbd_img_request *img_request);
1386static void rbd_parent_request_destroy(struct kref *kref);
1387static void rbd_img_request_destroy(struct kref *kref);
1388static void rbd_img_request_put(struct rbd_img_request *img_request)
1389{
1390	rbd_assert(img_request != NULL);
1391	dout("%s: img %p (was %d)\n", __func__, img_request,
1392		atomic_read(&img_request->kref.refcount));
1393	if (img_request_child_test(img_request))
1394		kref_put(&img_request->kref, rbd_parent_request_destroy);
1395	else
1396		kref_put(&img_request->kref, rbd_img_request_destroy);
1397}
1398
1399static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1400					struct rbd_obj_request *obj_request)
1401{
1402	rbd_assert(obj_request->img_request == NULL);
1403
1404	/* Image request now owns object's original reference */
1405	obj_request->img_request = img_request;
1406	obj_request->which = img_request->obj_request_count;
1407	rbd_assert(!obj_request_img_data_test(obj_request));
1408	obj_request_img_data_set(obj_request);
1409	rbd_assert(obj_request->which != BAD_WHICH);
1410	img_request->obj_request_count++;
1411	list_add_tail(&obj_request->links, &img_request->obj_requests);
1412	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1413		obj_request->which);
1414}
1415
1416static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1417					struct rbd_obj_request *obj_request)
1418{
1419	rbd_assert(obj_request->which != BAD_WHICH);
1420
1421	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1422		obj_request->which);
1423	list_del(&obj_request->links);
1424	rbd_assert(img_request->obj_request_count > 0);
1425	img_request->obj_request_count--;
1426	rbd_assert(obj_request->which == img_request->obj_request_count);
1427	obj_request->which = BAD_WHICH;
1428	rbd_assert(obj_request_img_data_test(obj_request));
1429	rbd_assert(obj_request->img_request == img_request);
1430	obj_request->img_request = NULL;
1431	obj_request->callback = NULL;
1432	rbd_obj_request_put(obj_request);
1433}
1434
1435static bool obj_request_type_valid(enum obj_request_type type)
1436{
1437	switch (type) {
1438	case OBJ_REQUEST_NODATA:
1439	case OBJ_REQUEST_BIO:
1440	case OBJ_REQUEST_PAGES:
1441		return true;
1442	default:
1443		return false;
1444	}
1445}
1446
1447static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1448				struct rbd_obj_request *obj_request)
1449{
1450	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1451
1452	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1453}
1454
1455static void rbd_img_request_complete(struct rbd_img_request *img_request)
1456{
1457
1458	dout("%s: img %p\n", __func__, img_request);
1459
1460	/*
1461	 * If no error occurred, compute the aggregate transfer
1462	 * count for the image request.  We could instead use
1463	 * atomic64_cmpxchg() to update it as each object request
1464	 * completes; not clear which way is better off hand.
1465	 */
1466	if (!img_request->result) {
1467		struct rbd_obj_request *obj_request;
1468		u64 xferred = 0;
1469
1470		for_each_obj_request(img_request, obj_request)
1471			xferred += obj_request->xferred;
1472		img_request->xferred = xferred;
1473	}
1474
1475	if (img_request->callback)
1476		img_request->callback(img_request);
1477	else
1478		rbd_img_request_put(img_request);
1479}
1480
1481/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1482
1483static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1484{
1485	dout("%s: obj %p\n", __func__, obj_request);
1486
1487	return wait_for_completion_interruptible(&obj_request->completion);
1488}
1489
1490/*
1491 * The default/initial value for all image request flags is 0.  Each
1492 * is conditionally set to 1 at image request initialization time
 1493 * and currently never changes thereafter.
1494 */
1495static void img_request_write_set(struct rbd_img_request *img_request)
1496{
1497	set_bit(IMG_REQ_WRITE, &img_request->flags);
1498	smp_mb();
1499}
1500
1501static bool img_request_write_test(struct rbd_img_request *img_request)
1502{
1503	smp_mb();
1504	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1505}
1506
1507static void img_request_child_set(struct rbd_img_request *img_request)
1508{
1509	set_bit(IMG_REQ_CHILD, &img_request->flags);
1510	smp_mb();
1511}
1512
1513static void img_request_child_clear(struct rbd_img_request *img_request)
1514{
1515	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1516	smp_mb();
1517}
1518
1519static bool img_request_child_test(struct rbd_img_request *img_request)
1520{
1521	smp_mb();
1522	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1523}
1524
1525static void img_request_layered_set(struct rbd_img_request *img_request)
1526{
1527	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1528	smp_mb();
1529}
1530
1531static void img_request_layered_clear(struct rbd_img_request *img_request)
1532{
1533	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1534	smp_mb();
1535}
1536
1537static bool img_request_layered_test(struct rbd_img_request *img_request)
1538{
1539	smp_mb();
1540	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1541}
1542
1543static void
1544rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1545{
1546	u64 xferred = obj_request->xferred;
1547	u64 length = obj_request->length;
1548
1549	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1550		obj_request, obj_request->img_request, obj_request->result,
1551		xferred, length);
1552	/*
1553	 * ENOENT means a hole in the image.  We zero-fill the entire
1554	 * length of the request.  A short read also implies zero-fill
1555	 * to the end of the request.  An error requires the whole
1556	 * length of the request to be reported finished with an error
1557	 * to the block layer.  In each case we update the xferred
1558	 * count to indicate the whole request was satisfied.
1559	 */
1560	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
1561	if (obj_request->result == -ENOENT) {
1562		if (obj_request->type == OBJ_REQUEST_BIO)
1563			zero_bio_chain(obj_request->bio_list, 0);
1564		else
1565			zero_pages(obj_request->pages, 0, length);
1566		obj_request->result = 0;
1567	} else if (xferred < length && !obj_request->result) {
1568		if (obj_request->type == OBJ_REQUEST_BIO)
1569			zero_bio_chain(obj_request->bio_list, xferred);
1570		else
1571			zero_pages(obj_request->pages, xferred, length);
1572	}
1573	obj_request->xferred = length;
1574	obj_request_done_set(obj_request);
1575}
1576
1577static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1578{
1579	dout("%s: obj %p cb %p\n", __func__, obj_request,
1580		obj_request->callback);
1581	if (obj_request->callback)
1582		obj_request->callback(obj_request);
1583	else
1584		complete_all(&obj_request->completion);
1585}
1586
1587static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
1588{
1589	dout("%s: obj %p\n", __func__, obj_request);
1590	obj_request_done_set(obj_request);
1591}
1592
1593static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1594{
1595	struct rbd_img_request *img_request = NULL;
1596	struct rbd_device *rbd_dev = NULL;
1597	bool layered = false;
1598
1599	if (obj_request_img_data_test(obj_request)) {
1600		img_request = obj_request->img_request;
1601		layered = img_request && img_request_layered_test(img_request);
1602		rbd_dev = img_request->rbd_dev;
1603	}
1604
1605	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1606		obj_request, img_request, obj_request->result,
1607		obj_request->xferred, obj_request->length);
1608	if (layered && obj_request->result == -ENOENT &&
1609			obj_request->img_offset < rbd_dev->parent_overlap)
1610		rbd_img_parent_read(obj_request);
1611	else if (img_request)
1612		rbd_img_obj_request_read_callback(obj_request);
1613	else
1614		obj_request_done_set(obj_request);
1615}
1616
1617static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1618{
1619	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1620		obj_request->result, obj_request->length);
1621	/*
1622	 * There is no such thing as a successful short write.  Set
1623	 * it to our originally-requested length.
1624	 */
1625	obj_request->xferred = obj_request->length;
1626	obj_request_done_set(obj_request);
1627}
1628
1629/*
1630 * For a simple stat call there's nothing to do.  We'll do more if
1631 * this is part of a write sequence for a layered image.
1632 */
1633static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
1634{
1635	dout("%s: obj %p\n", __func__, obj_request);
1636	obj_request_done_set(obj_request);
1637}
1638
1639static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1640				struct ceph_msg *msg)
1641{
1642	struct rbd_obj_request *obj_request = osd_req->r_priv;
1643	u16 opcode;
1644
1645	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1646	rbd_assert(osd_req == obj_request->osd_req);
1647	if (obj_request_img_data_test(obj_request)) {
1648		rbd_assert(obj_request->img_request);
1649		rbd_assert(obj_request->which != BAD_WHICH);
1650	} else {
1651		rbd_assert(obj_request->which == BAD_WHICH);
1652	}
1653
1654	if (osd_req->r_result < 0)
1655		obj_request->result = osd_req->r_result;
1656
1657	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1658
1659	/*
1660	 * We support a 64-bit length, but ultimately it has to be
1661	 * passed to blk_end_request(), which takes an unsigned int.
1662	 */
1663	obj_request->xferred = osd_req->r_reply_op_len[0];
1664	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1665
1666	opcode = osd_req->r_ops[0].op;
1667	switch (opcode) {
1668	case CEPH_OSD_OP_READ:
1669		rbd_osd_read_callback(obj_request);
1670		break;
1671	case CEPH_OSD_OP_SETALLOCHINT:
1672		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
1673		/* fall through */
1674	case CEPH_OSD_OP_WRITE:
1675		rbd_osd_write_callback(obj_request);
1676		break;
1677	case CEPH_OSD_OP_STAT:
1678		rbd_osd_stat_callback(obj_request);
1679		break;
1680	case CEPH_OSD_OP_CALL:
1681	case CEPH_OSD_OP_NOTIFY_ACK:
1682	case CEPH_OSD_OP_WATCH:
1683		rbd_osd_trivial_callback(obj_request);
1684		break;
1685	default:
1686		rbd_warn(NULL, "%s: unsupported op %hu\n",
1687			obj_request->object_name, (unsigned short) opcode);
1688		break;
1689	}
1690
1691	if (obj_request_done_test(obj_request))
1692		rbd_obj_request_complete(obj_request);
1693}
1694
1695static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1696{
1697	struct rbd_img_request *img_request = obj_request->img_request;
1698	struct ceph_osd_request *osd_req = obj_request->osd_req;
1699	u64 snap_id;
1700
1701	rbd_assert(osd_req != NULL);
1702
1703	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1704	ceph_osdc_build_request(osd_req, obj_request->offset,
1705			NULL, snap_id, NULL);
1706}
1707
1708static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1709{
1710	struct rbd_img_request *img_request = obj_request->img_request;
1711	struct ceph_osd_request *osd_req = obj_request->osd_req;
1712	struct ceph_snap_context *snapc;
1713	struct timespec mtime = CURRENT_TIME;
1714
1715	rbd_assert(osd_req != NULL);
1716
1717	snapc = img_request ? img_request->snapc : NULL;
1718	ceph_osdc_build_request(osd_req, obj_request->offset,
1719			snapc, CEPH_NOSNAP, &mtime);
1720}
1721
1722/*
1723 * Create an osd request.  A read request has one osd op (read).
1724 * A write request has either one (watch) or two (hint+write) osd ops.
1725 * (All rbd data writes are prefixed with an allocation hint op, but
1726 * technically osd watch is a write request, hence this distinction.)
1727 */
1728static struct ceph_osd_request *rbd_osd_req_create(
1729					struct rbd_device *rbd_dev,
1730					bool write_request,
1731					unsigned int num_ops,
1732					struct rbd_obj_request *obj_request)
1733{
1734	struct ceph_snap_context *snapc = NULL;
1735	struct ceph_osd_client *osdc;
1736	struct ceph_osd_request *osd_req;
1737
1738	if (obj_request_img_data_test(obj_request)) {
1739		struct rbd_img_request *img_request = obj_request->img_request;
1740
1741		rbd_assert(write_request ==
1742				img_request_write_test(img_request));
1743		if (write_request)
1744			snapc = img_request->snapc;
1745	}
1746
1747	rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
1748
1749	/* Allocate and initialize the request, for the num_ops ops */
1750
1751	osdc = &rbd_dev->rbd_client->client->osdc;
1752	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1753					  GFP_ATOMIC);
1754	if (!osd_req)
1755		return NULL;	/* ENOMEM */
1756
1757	if (write_request)
1758		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1759	else
1760		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1761
1762	osd_req->r_callback = rbd_osd_req_callback;
1763	osd_req->r_priv = obj_request;
1764
1765	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
1766	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1767
1768	return osd_req;
1769}
1770
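/*
 * Editorial sketch, not part of the driver: how a caller sizes the osd
 * request for data I/O under the convention documented above -- a data
 * write carries an allocation hint op plus the write op, while a read
 * carries only the read op.  The helper name is made up;
 * rbd_osd_req_create() and struct rbd_obj_request are the real symbols.
 */
static struct ceph_osd_request *
example_alloc_data_osd_req(struct rbd_device *rbd_dev, bool write_request,
			   struct rbd_obj_request *obj_request)
{
	/* SETALLOCHINT + WRITE for data writes, READ alone for reads */
	unsigned int num_ops = write_request ? 2 : 1;

	return rbd_osd_req_create(rbd_dev, write_request, num_ops,
				  obj_request);
}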
1771/*
1772 * Create a copyup osd request based on the information in the
1773 * object request supplied.  A copyup request has three osd ops,
1774 * a copyup method call, a hint op, and a write op.
1775 */
1776static struct ceph_osd_request *
1777rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1778{
1779	struct rbd_img_request *img_request;
1780	struct ceph_snap_context *snapc;
1781	struct rbd_device *rbd_dev;
1782	struct ceph_osd_client *osdc;
1783	struct ceph_osd_request *osd_req;
1784
1785	rbd_assert(obj_request_img_data_test(obj_request));
1786	img_request = obj_request->img_request;
1787	rbd_assert(img_request);
1788	rbd_assert(img_request_write_test(img_request));
1789
1790	/* Allocate and initialize the request, for the three ops */
1791
1792	snapc = img_request->snapc;
1793	rbd_dev = img_request->rbd_dev;
1794	osdc = &rbd_dev->rbd_client->client->osdc;
1795	osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
1796	if (!osd_req)
1797		return NULL;	/* ENOMEM */
1798
1799	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1800	osd_req->r_callback = rbd_osd_req_callback;
1801	osd_req->r_priv = obj_request;
1802
1803	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
1804	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1805
1806	return osd_req;
1807}
1808
1809
1810static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
 
1811{
1812	ceph_osdc_put_request(osd_req);
1813}
1814
1815/* object_name is assumed to be a non-null pointer and NUL-terminated */
1816
1817static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1818						u64 offset, u64 length,
1819						enum obj_request_type type)
1820{
1821	struct rbd_obj_request *obj_request;
1822	size_t size;
1823	char *name;
1824
1825	rbd_assert(obj_request_type_valid(type));
1826
1827	size = strlen(object_name) + 1;
1828	name = kmalloc(size, GFP_KERNEL);
1829	if (!name)
1830		return NULL;
 
1831
1832	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1833	if (!obj_request) {
1834		kfree(name);
1835		return NULL;
1836	}
1837
1838	obj_request->object_name = memcpy(name, object_name, size);
1839	obj_request->offset = offset;
1840	obj_request->length = length;
1841	obj_request->flags = 0;
1842	obj_request->which = BAD_WHICH;
1843	obj_request->type = type;
1844	INIT_LIST_HEAD(&obj_request->links);
1845	init_completion(&obj_request->completion);
1846	kref_init(&obj_request->kref);
1847
1848	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1849		offset, length, (int)type, obj_request);
 
1850
1851	return obj_request;
1852}
1853
1854static void rbd_obj_request_destroy(struct kref *kref)
1855{
1856	struct rbd_obj_request *obj_request;
 
1857
1858	obj_request = container_of(kref, struct rbd_obj_request, kref);
1859
1860	dout("%s: obj %p\n", __func__, obj_request);
1861
1862	rbd_assert(obj_request->img_request == NULL);
1863	rbd_assert(obj_request->which == BAD_WHICH);
1864
1865	if (obj_request->osd_req)
1866		rbd_osd_req_destroy(obj_request->osd_req);
 
1867
1868	rbd_assert(obj_request_type_valid(obj_request->type));
1869	switch (obj_request->type) {
1870	case OBJ_REQUEST_NODATA:
1871		break;		/* Nothing to do */
1872	case OBJ_REQUEST_BIO:
1873		if (obj_request->bio_list)
1874			bio_chain_put(obj_request->bio_list);
1875		break;
1876	case OBJ_REQUEST_PAGES:
1877		if (obj_request->pages)
1878			ceph_release_page_vector(obj_request->pages,
1879						obj_request->page_count);
1880		break;
1881	}
1882
1883	kfree(obj_request->object_name);
1884	obj_request->object_name = NULL;
1885	kmem_cache_free(rbd_obj_request_cache, obj_request);
1886}
1887
1888/* It's OK to call this for a device with no parent */
1889
1890static void rbd_spec_put(struct rbd_spec *spec);
1891static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1892{
1893	rbd_dev_remove_parent(rbd_dev);
1894	rbd_spec_put(rbd_dev->parent_spec);
1895	rbd_dev->parent_spec = NULL;
1896	rbd_dev->parent_overlap = 0;
1897}
1898
1899/*
1900 * Parent image reference counting is used to determine when an
1901 * image's parent fields can be safely torn down--after there are no
1902 * more in-flight requests to the parent image.  When the last
1903 * reference is dropped, cleaning them up is safe.
1904 */
1905static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1906{
1907	int counter;
1908
1909	if (!rbd_dev->parent_spec)
1910		return;
1911
1912	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1913	if (counter > 0)
1914		return;
1915
1916	/* Last reference; clean up parent data structures */
 
1917
1918	if (!counter)
1919		rbd_dev_unparent(rbd_dev);
1920	else
1921		rbd_warn(rbd_dev, "parent reference underflow\n");
1922}
1923
1924/*
1925 * If an image has a non-zero parent overlap, get a reference to its
1926 * parent.
1927 *
1928 * We must get the reference before checking for the overlap to
1929 * coordinate properly with zeroing the parent overlap in
1930 * rbd_dev_v2_parent_info() when an image gets flattened.  We
1931 * drop it again if there is no overlap.
1932 *
1933 * Returns true if the rbd device has a parent with a non-zero
1934 * overlap and a reference for it was successfully taken, or
1935 * false otherwise.
1936 */
1937static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1938{
1939	int counter;
 
1940
1941	if (!rbd_dev->parent_spec)
1942		return false;
1943
1944	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1945	if (counter > 0 && rbd_dev->parent_overlap)
1946		return true;
1947
1948	/* Image was flattened, but parent is not yet torn down */
1949
1950	if (counter < 0)
1951		rbd_warn(rbd_dev, "parent reference overflow\n");
1952
1953	return false;
1954}
1955
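/*
 * Editorial sketch, not part of the driver: the intended pairing of
 * rbd_dev_parent_get() and rbd_dev_parent_put() around work that relies
 * on the parent image.  The function name is made up; the get/put
 * helpers are the real ones defined above.
 */
static int example_with_parent_ref(struct rbd_device *rbd_dev)
{
	if (!rbd_dev_parent_get(rbd_dev))
		return -ENOENT;	/* no parent, or the image was flattened */

	/* ... issue requests that dereference rbd_dev->parent here ... */

	rbd_dev_parent_put(rbd_dev);

	return 0;
}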
1956/*
1957 * Caller is responsible for filling in the list of object requests
1958 * that comprises the image request, and the Linux request pointer
1959 * (if there is one).
 
1960 */
1961static struct rbd_img_request *rbd_img_request_create(
1962					struct rbd_device *rbd_dev,
1963					u64 offset, u64 length,
1964					bool write_request)
1965{
1966	struct rbd_img_request *img_request;
 
1967
1968	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1969	if (!img_request)
1970		return NULL;
1971
1972	if (write_request) {
1973		down_read(&rbd_dev->header_rwsem);
1974		ceph_get_snap_context(rbd_dev->header.snapc);
1975		up_read(&rbd_dev->header_rwsem);
1976	}
1977
1978	img_request->rq = NULL;
1979	img_request->rbd_dev = rbd_dev;
1980	img_request->offset = offset;
1981	img_request->length = length;
1982	img_request->flags = 0;
1983	if (write_request) {
1984		img_request_write_set(img_request);
1985		img_request->snapc = rbd_dev->header.snapc;
1986	} else {
1987		img_request->snap_id = rbd_dev->spec->snap_id;
1988	}
1989	if (rbd_dev_parent_get(rbd_dev))
1990		img_request_layered_set(img_request);
1991	spin_lock_init(&img_request->completion_lock);
1992	img_request->next_completion = 0;
1993	img_request->callback = NULL;
1994	img_request->result = 0;
1995	img_request->obj_request_count = 0;
1996	INIT_LIST_HEAD(&img_request->obj_requests);
1997	kref_init(&img_request->kref);
1998
1999	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
2000		write_request ? "write" : "read", offset, length,
2001		img_request);
2002
2003	return img_request;
2004}
2005
2006static void rbd_img_request_destroy(struct kref *kref)
2007{
2008	struct rbd_img_request *img_request;
2009	struct rbd_obj_request *obj_request;
2010	struct rbd_obj_request *next_obj_request;
 
2011
2012	img_request = container_of(kref, struct rbd_img_request, kref);
2013
2014	dout("%s: img %p\n", __func__, img_request);
2015
2016	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2017		rbd_img_obj_request_del(img_request, obj_request);
2018	rbd_assert(img_request->obj_request_count == 0);
2019
2020	if (img_request_layered_test(img_request)) {
2021		img_request_layered_clear(img_request);
2022		rbd_dev_parent_put(img_request->rbd_dev);
2023	}
2024
2025	if (img_request_write_test(img_request))
2026		ceph_put_snap_context(img_request->snapc);
2027
2028	kmem_cache_free(rbd_img_request_cache, img_request);
2029}
2030
2031static struct rbd_img_request *rbd_parent_request_create(
2032					struct rbd_obj_request *obj_request,
2033					u64 img_offset, u64 length)
2034{
2035	struct rbd_img_request *parent_request;
2036	struct rbd_device *rbd_dev;
2037
2038	rbd_assert(obj_request->img_request);
2039	rbd_dev = obj_request->img_request->rbd_dev;
2040
2041	parent_request = rbd_img_request_create(rbd_dev->parent,
2042						img_offset, length, false);
2043	if (!parent_request)
2044		return NULL;
2045
2046	img_request_child_set(parent_request);
2047	rbd_obj_request_get(obj_request);
2048	parent_request->obj_request = obj_request;
2049
2050	return parent_request;
2051}
2052
2053static void rbd_parent_request_destroy(struct kref *kref)
2054{
2055	struct rbd_img_request *parent_request;
2056	struct rbd_obj_request *orig_request;
2057
2058	parent_request = container_of(kref, struct rbd_img_request, kref);
2059	orig_request = parent_request->obj_request;
2060
2061	parent_request->obj_request = NULL;
2062	rbd_obj_request_put(orig_request);
2063	img_request_child_clear(parent_request);
2064
2065	rbd_img_request_destroy(kref);
2066}
2067
2068static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2069{
2070	struct rbd_img_request *img_request;
2071	unsigned int xferred;
2072	int result;
2073	bool more;
 
2074
2075	rbd_assert(obj_request_img_data_test(obj_request));
2076	img_request = obj_request->img_request;
2077
2078	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
2079	xferred = (unsigned int)obj_request->xferred;
2080	result = obj_request->result;
2081	if (result) {
2082		struct rbd_device *rbd_dev = img_request->rbd_dev;
2083
2084		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
2085			img_request_write_test(img_request) ? "write" : "read",
2086			obj_request->length, obj_request->img_offset,
2087			obj_request->offset);
2088		rbd_warn(rbd_dev, "  result %d xferred %x\n",
2089			result, xferred);
2090		if (!img_request->result)
2091			img_request->result = result;
2092	}
2093
2094	/* Image object requests don't own their page array */
2095
2096	if (obj_request->type == OBJ_REQUEST_PAGES) {
2097		obj_request->pages = NULL;
2098		obj_request->page_count = 0;
2099	}
2100
2101	if (img_request_child_test(img_request)) {
2102		rbd_assert(img_request->obj_request != NULL);
2103		more = obj_request->which < img_request->obj_request_count - 1;
2104	} else {
2105		rbd_assert(img_request->rq != NULL);
2106		more = blk_end_request(img_request->rq, result, xferred);
2107	}
2108
2109	return more;
2110}
2111
2112static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
2113{
2114	struct rbd_img_request *img_request;
2115	u32 which = obj_request->which;
2116	bool more = true;
2117
2118	rbd_assert(obj_request_img_data_test(obj_request));
2119	img_request = obj_request->img_request;
2120
2121	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
2122	rbd_assert(img_request != NULL);
2123	rbd_assert(img_request->obj_request_count > 0);
2124	rbd_assert(which != BAD_WHICH);
2125	rbd_assert(which < img_request->obj_request_count);
2126
2127	spin_lock_irq(&img_request->completion_lock);
2128	if (which != img_request->next_completion)
2129		goto out;
2130
2131	for_each_obj_request_from(img_request, obj_request) {
2132		rbd_assert(more);
2133		rbd_assert(which < img_request->obj_request_count);
2134
2135		if (!obj_request_done_test(obj_request))
2136			break;
2137		more = rbd_img_obj_end_request(obj_request);
2138		which++;
2139	}
2140
2141	rbd_assert(more ^ (which == img_request->obj_request_count));
2142	img_request->next_completion = which;
2143out:
2144	spin_unlock_irq(&img_request->completion_lock);
2145
2146	if (!more)
2147		rbd_img_request_complete(img_request);
2148}
2149
2150/*
2151 * Split up an image request into one or more object requests, each
2152 * to a different object.  The "type" parameter indicates whether
2153 * "data_desc" is the pointer to the head of a list of bio
2154 * structures, or the base of a page array.  In either case this
2155 * function assumes data_desc describes memory sufficient to hold
2156 * all data described by the image request.
2157 */
2158static int rbd_img_request_fill(struct rbd_img_request *img_request,
2159					enum obj_request_type type,
2160					void *data_desc)
2161{
2162	struct rbd_device *rbd_dev = img_request->rbd_dev;
2163	struct rbd_obj_request *obj_request = NULL;
2164	struct rbd_obj_request *next_obj_request;
2165	bool write_request = img_request_write_test(img_request);
2166	struct bio *bio_list = NULL;
2167	unsigned int bio_offset = 0;
2168	struct page **pages = NULL;
2169	u64 img_offset;
2170	u64 resid;
2171	u16 opcode;
2172
2173	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2174		(int)type, data_desc);
2175
2176	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
2177	img_offset = img_request->offset;
2178	resid = img_request->length;
2179	rbd_assert(resid > 0);
2180
2181	if (type == OBJ_REQUEST_BIO) {
2182		bio_list = data_desc;
2183		rbd_assert(img_offset ==
2184			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
2185	} else {
2186		rbd_assert(type == OBJ_REQUEST_PAGES);
2187		pages = data_desc;
2188	}
2189
2190	while (resid) {
2191		struct ceph_osd_request *osd_req;
2192		const char *object_name;
2193		u64 offset;
2194		u64 length;
2195		unsigned int which = 0;
2196
2197		object_name = rbd_segment_name(rbd_dev, img_offset);
2198		if (!object_name)
2199			goto out_unwind;
2200		offset = rbd_segment_offset(rbd_dev, img_offset);
2201		length = rbd_segment_length(rbd_dev, img_offset, resid);
2202		obj_request = rbd_obj_request_create(object_name,
2203						offset, length, type);
2204		/* object request has its own copy of the object name */
2205		rbd_segment_name_free(object_name);
2206		if (!obj_request)
2207			goto out_unwind;
2208
2209		/*
2210		 * set obj_request->img_request before creating the
2211		 * osd_request so that it gets the right snapc
2212		 */
2213		rbd_img_obj_request_add(img_request, obj_request);
2214
2215		if (type == OBJ_REQUEST_BIO) {
2216			unsigned int clone_size;
2217
2218			rbd_assert(length <= (u64)UINT_MAX);
2219			clone_size = (unsigned int)length;
2220			obj_request->bio_list =
2221					bio_chain_clone_range(&bio_list,
2222								&bio_offset,
2223								clone_size,
2224								GFP_ATOMIC);
2225			if (!obj_request->bio_list)
2226				goto out_unwind;
2227		} else {
2228			unsigned int page_count;
2229
2230			obj_request->pages = pages;
2231			page_count = (u32)calc_pages_for(offset, length);
2232			obj_request->page_count = page_count;
2233			if ((offset + length) & ~PAGE_MASK)
2234				page_count--;	/* more on last page */
2235			pages += page_count;
2236		}
2237
2238		osd_req = rbd_osd_req_create(rbd_dev, write_request,
2239					     (write_request ? 2 : 1),
2240					     obj_request);
2241		if (!osd_req)
2242			goto out_unwind;
2243		obj_request->osd_req = osd_req;
2244		obj_request->callback = rbd_img_obj_callback;
2245
2246		if (write_request) {
2247			osd_req_op_alloc_hint_init(osd_req, which,
2248					     rbd_obj_bytes(&rbd_dev->header),
2249					     rbd_obj_bytes(&rbd_dev->header));
2250			which++;
2251		}
2252
2253		osd_req_op_extent_init(osd_req, which, opcode, offset, length,
2254				       0, 0);
2255		if (type == OBJ_REQUEST_BIO)
2256			osd_req_op_extent_osd_data_bio(osd_req, which,
2257					obj_request->bio_list, length);
2258		else
2259			osd_req_op_extent_osd_data_pages(osd_req, which,
2260					obj_request->pages, length,
2261					offset & ~PAGE_MASK, false, false);
2262
2263		if (write_request)
2264			rbd_osd_req_format_write(obj_request);
2265		else
2266			rbd_osd_req_format_read(obj_request);
2267
2268		obj_request->img_offset = img_offset;
2269
2270		img_offset += length;
2271		resid -= length;
2272	}
2273
2274	return 0;
2275
2276out_unwind:
2277	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2278		rbd_img_obj_request_del(img_request, obj_request);
2279
2280	return -ENOMEM;
2281}
2282
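/*
 * Editorial sketch, not part of the driver: how the while (resid) loop
 * in rbd_img_request_fill() above walks an image extent object by
 * object.  With 4 MiB objects, a 6 MiB request starting at image offset
 * 3 MiB produces three object requests: 1 MiB at object offset 3 MiB,
 * 4 MiB at offset 0, and 1 MiB at offset 0.  The function name is made
 * up; rbd_segment_offset() and rbd_segment_length() are the real
 * helpers used by the fill loop.
 */
static void example_walk_segments(struct rbd_device *rbd_dev,
				  u64 img_offset, u64 resid)
{
	while (resid) {
		u64 offset = rbd_segment_offset(rbd_dev, img_offset);
		u64 length = rbd_segment_length(rbd_dev, img_offset, resid);

		dout("segment: %llu bytes at object offset %llu\n",
		     length, offset);

		img_offset += length;
		resid -= length;
	}
}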
2283static void
2284rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2285{
2286	struct rbd_img_request *img_request;
2287	struct rbd_device *rbd_dev;
2288	struct page **pages;
2289	u32 page_count;
2290
2291	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2292	rbd_assert(obj_request_img_data_test(obj_request));
2293	img_request = obj_request->img_request;
2294	rbd_assert(img_request);
2295
2296	rbd_dev = img_request->rbd_dev;
2297	rbd_assert(rbd_dev);
2298
2299	pages = obj_request->copyup_pages;
2300	rbd_assert(pages != NULL);
2301	obj_request->copyup_pages = NULL;
2302	page_count = obj_request->copyup_page_count;
2303	rbd_assert(page_count);
2304	obj_request->copyup_page_count = 0;
2305	ceph_release_page_vector(pages, page_count);
2306
2307	/*
2308	 * We want the transfer count to reflect the size of the
2309	 * original write request.  There is no such thing as a
2310	 * successful short write, so if the request was successful
2311	 * we can just set it to the originally-requested length.
2312	 */
2313	if (!obj_request->result)
2314		obj_request->xferred = obj_request->length;
2315
2316	/* Finish up with the normal image object callback */
2317
2318	rbd_img_obj_callback(obj_request);
2319}
2320
2321static void
2322rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2323{
2324	struct rbd_obj_request *orig_request;
2325	struct ceph_osd_request *osd_req;
2326	struct ceph_osd_client *osdc;
2327	struct rbd_device *rbd_dev;
2328	struct page **pages;
2329	u32 page_count;
2330	int img_result;
2331	u64 parent_length;
2332	u64 offset;
2333	u64 length;
2334
2335	rbd_assert(img_request_child_test(img_request));
2336
2337	/* First get what we need from the image request */
2338
2339	pages = img_request->copyup_pages;
2340	rbd_assert(pages != NULL);
2341	img_request->copyup_pages = NULL;
2342	page_count = img_request->copyup_page_count;
2343	rbd_assert(page_count);
2344	img_request->copyup_page_count = 0;
2345
2346	orig_request = img_request->obj_request;
2347	rbd_assert(orig_request != NULL);
2348	rbd_assert(obj_request_type_valid(orig_request->type));
2349	img_result = img_request->result;
2350	parent_length = img_request->length;
2351	rbd_assert(parent_length == img_request->xferred);
2352	rbd_img_request_put(img_request);
2353
2354	rbd_assert(orig_request->img_request);
2355	rbd_dev = orig_request->img_request->rbd_dev;
2356	rbd_assert(rbd_dev);
 
2357
2358	/*
2359	 * If the overlap has become 0 (most likely because the
2360	 * image has been flattened) we need to free the pages
2361	 * and re-submit the original write request.
2362	 */
2363	if (!rbd_dev->parent_overlap) {
2364		struct ceph_osd_client *osdc;
2365
2366		ceph_release_page_vector(pages, page_count);
2367		osdc = &rbd_dev->rbd_client->client->osdc;
2368		img_result = rbd_obj_request_submit(osdc, orig_request);
2369		if (!img_result)
2370			return;
2371	}
2372
2373	if (img_result)
2374		goto out_err;
2375
2376	/*
2377	 * The original osd request is of no use to us any more.
2378	 * We need a new one that can hold the three ops in a copyup
2379	 * request.  Allocate the new copyup osd request for the
2380	 * original request, and release the old one.
2381	 */
2382	img_result = -ENOMEM;
2383	osd_req = rbd_osd_req_create_copyup(orig_request);
2384	if (!osd_req)
2385		goto out_err;
2386	rbd_osd_req_destroy(orig_request->osd_req);
2387	orig_request->osd_req = osd_req;
2388	orig_request->copyup_pages = pages;
2389	orig_request->copyup_page_count = page_count;
2390
2391	/* Initialize the copyup op */
2392
2393	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2394	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
2395						false, false);
2396
2397	/* Then the hint op */
2398
2399	osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
2400				   rbd_obj_bytes(&rbd_dev->header));
2401
2402	/* And the original write request op */
2403
2404	offset = orig_request->offset;
2405	length = orig_request->length;
2406	osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
2407					offset, length, 0, 0);
2408	if (orig_request->type == OBJ_REQUEST_BIO)
2409		osd_req_op_extent_osd_data_bio(osd_req, 2,
2410					orig_request->bio_list, length);
2411	else
2412		osd_req_op_extent_osd_data_pages(osd_req, 2,
2413					orig_request->pages, length,
2414					offset & ~PAGE_MASK, false, false);
2415
2416	rbd_osd_req_format_write(orig_request);
2417
2418	/* All set, send it off. */
2419
2420	orig_request->callback = rbd_img_obj_copyup_callback;
2421	osdc = &rbd_dev->rbd_client->client->osdc;
2422	img_result = rbd_obj_request_submit(osdc, orig_request);
2423	if (!img_result)
2424		return;
2425out_err:
2426	/* Record the error code and complete the request */
2427
2428	orig_request->result = img_result;
2429	orig_request->xferred = 0;
2430	obj_request_done_set(orig_request);
2431	rbd_obj_request_complete(orig_request);
2432}
2433
2434/*
2435 * Read from the parent image the range of data that covers the
2436 * entire target of the given object request.  This is used for
2437 * satisfying a layered image write request when the target of an
2438 * object request from the image request does not exist.
2439 *
2440 * A page array big enough to hold the returned data is allocated
2441 * and supplied to rbd_img_request_fill() as the "data descriptor."
2442 * When the read completes, this page array will be transferred to
2443 * the original object request for the copyup operation.
2444 *
2445 * If an error occurs, record it as the result of the original
2446 * object request and mark it done so it gets completed.
2447 */
2448static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
 
2449{
2450	struct rbd_img_request *img_request = NULL;
2451	struct rbd_img_request *parent_request = NULL;
2452	struct rbd_device *rbd_dev;
2453	u64 img_offset;
2454	u64 length;
2455	struct page **pages = NULL;
2456	u32 page_count;
2457	int result;
2458
2459	rbd_assert(obj_request_img_data_test(obj_request));
2460	rbd_assert(obj_request_type_valid(obj_request->type));
2461
2462	img_request = obj_request->img_request;
2463	rbd_assert(img_request != NULL);
2464	rbd_dev = img_request->rbd_dev;
2465	rbd_assert(rbd_dev->parent != NULL);
2466
2467	/*
2468	 * Determine the byte range covered by the object in the
2469	 * child image to which the original request was to be sent.
2470	 */
2471	img_offset = obj_request->img_offset - obj_request->offset;
2472	length = (u64)1 << rbd_dev->header.obj_order;
2473
2474	/*
2475	 * There is no defined parent data beyond the parent
2476	 * overlap, so limit what we read at that boundary if
2477	 * necessary.
2478	 */
2479	if (img_offset + length > rbd_dev->parent_overlap) {
2480		rbd_assert(img_offset < rbd_dev->parent_overlap);
2481		length = rbd_dev->parent_overlap - img_offset;
2482	}
2483
2484	/*
2485	 * Allocate a page array big enough to receive the data read
2486	 * from the parent.
2487	 */
2488	page_count = (u32)calc_pages_for(0, length);
2489	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2490	if (IS_ERR(pages)) {
2491		result = PTR_ERR(pages);
2492		pages = NULL;
2493		goto out_err;
2494	}
2495
2496	result = -ENOMEM;
2497	parent_request = rbd_parent_request_create(obj_request,
2498						img_offset, length);
2499	if (!parent_request)
2500		goto out_err;
 
2501
2502	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2503	if (result)
2504		goto out_err;
2505	parent_request->copyup_pages = pages;
2506	parent_request->copyup_page_count = page_count;
2507
2508	parent_request->callback = rbd_img_obj_parent_read_full_callback;
2509	result = rbd_img_request_submit(parent_request);
2510	if (!result)
2511		return 0;
2512
2513	parent_request->copyup_pages = NULL;
2514	parent_request->copyup_page_count = 0;
2515	parent_request->obj_request = NULL;
2516	rbd_obj_request_put(obj_request);
2517out_err:
2518	if (pages)
2519		ceph_release_page_vector(pages, page_count);
2520	if (parent_request)
2521		rbd_img_request_put(parent_request);
2522	obj_request->result = result;
2523	obj_request->xferred = 0;
2524	obj_request_done_set(obj_request);
2525
2526	return result;
2527}
2528
2529static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
 
2530{
2531	struct rbd_obj_request *orig_request;
2532	struct rbd_device *rbd_dev;
2533	int result;
2534
2535	rbd_assert(!obj_request_img_data_test(obj_request));
2536
2537	/*
2538	 * All we need from the object request is the original
2539	 * request and the result of the STAT op.  Grab those, then
2540	 * we're done with the request.
2541	 */
2542	orig_request = obj_request->obj_request;
2543	obj_request->obj_request = NULL;
2544	rbd_obj_request_put(orig_request);
2545	rbd_assert(orig_request);
2546	rbd_assert(orig_request->img_request);
2547
2548	result = obj_request->result;
2549	obj_request->result = 0;
2550
2551	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2552		obj_request, orig_request, result,
2553		obj_request->xferred, obj_request->length);
2554	rbd_obj_request_put(obj_request);
2555
2556	/*
2557	 * If the overlap has become 0 (most likely because the
2558	 * image has been flattened) we need to free the pages
2559	 * and re-submit the original write request.
2560	 */
2561	rbd_dev = orig_request->img_request->rbd_dev;
2562	if (!rbd_dev->parent_overlap) {
2563		struct ceph_osd_client *osdc;
2564
2565		osdc = &rbd_dev->rbd_client->client->osdc;
2566		result = rbd_obj_request_submit(osdc, orig_request);
2567		if (!result)
2568			return;
2569	}
2570
2571	/*
2572	 * Our only purpose here is to determine whether the object
2573	 * exists, and we don't want to treat the non-existence as
2574	 * an error.  If something else comes back, transfer the
2575	 * error to the original request and complete it now.
2576	 */
2577	if (!result) {
2578		obj_request_existence_set(orig_request, true);
2579	} else if (result == -ENOENT) {
2580		obj_request_existence_set(orig_request, false);
2581	} else if (result) {
2582		orig_request->result = result;
2583		goto out;
2584	}
2585
2586	/*
2587	 * Resubmit the original request now that we have recorded
2588	 * whether the target object exists.
2589	 */
2590	orig_request->result = rbd_img_obj_request_submit(orig_request);
2591out:
2592	if (orig_request->result)
2593		rbd_obj_request_complete(orig_request);
2594}
2595
2596static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2597{
2598	struct rbd_obj_request *stat_request;
2599	struct rbd_device *rbd_dev;
2600	struct ceph_osd_client *osdc;
2601	struct page **pages = NULL;
2602	u32 page_count;
2603	size_t size;
2604	int ret;
2605
2606	/*
2607	 * The response data for a STAT call consists of:
2608	 *     le64 length;
2609	 *     struct {
2610	 *         le32 tv_sec;
2611	 *         le32 tv_nsec;
2612	 *     } mtime;
2613	 */
2614	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2615	page_count = (u32)calc_pages_for(0, size);
2616	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2617	if (IS_ERR(pages))
2618		return PTR_ERR(pages);
2619
2620	ret = -ENOMEM;
2621	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2622							OBJ_REQUEST_PAGES);
2623	if (!stat_request)
2624		goto out;
2625
2626	rbd_obj_request_get(obj_request);
2627	stat_request->obj_request = obj_request;
2628	stat_request->pages = pages;
2629	stat_request->page_count = page_count;
2630
2631	rbd_assert(obj_request->img_request);
2632	rbd_dev = obj_request->img_request->rbd_dev;
2633	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2634						   stat_request);
2635	if (!stat_request->osd_req)
2636		goto out;
2637	stat_request->callback = rbd_img_obj_exists_callback;
2638
2639	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2640	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2641					false, false);
2642	rbd_osd_req_format_read(stat_request);
2643
2644	osdc = &rbd_dev->rbd_client->client->osdc;
2645	ret = rbd_obj_request_submit(osdc, stat_request);
2646out:
2647	if (ret)
2648		rbd_obj_request_put(obj_request);
2649
2650	return ret;
2651}
2652
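/*
 * Editorial sketch, not part of the driver: decoding the STAT reply
 * whose layout is described in rbd_img_obj_exists_submit() above, once
 * the data has been copied out of the page vector.  The struct and
 * function names are made up; ceph_decode_64()/ceph_decode_32() are the
 * real helpers from <linux/ceph/decode.h>.
 */
struct example_stat_reply {
	u64 length;
	u32 tv_sec;
	u32 tv_nsec;
};

static void example_decode_stat_reply(void *buf, struct example_stat_reply *r)
{
	void *p = buf;

	r->length = ceph_decode_64(&p);		/* le64 object size */
	r->tv_sec = ceph_decode_32(&p);		/* le32 mtime seconds */
	r->tv_nsec = ceph_decode_32(&p);	/* le32 mtime nanoseconds */
}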
2653static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2654{
2655	struct rbd_img_request *img_request;
2656	struct rbd_device *rbd_dev;
2657	bool known;
 
2658
2659	rbd_assert(obj_request_img_data_test(obj_request));
2660
2661	img_request = obj_request->img_request;
2662	rbd_assert(img_request);
2663	rbd_dev = img_request->rbd_dev;
 
2664
2665	/*
2666	 * Only writes to layered images need special handling.
2667	 * Reads and non-layered writes are simple object requests.
2668	 * Layered writes that start beyond the end of the overlap
2669	 * with the parent have no parent data, so they too are
2670	 * simple object requests.  Finally, if the target object is
2671	 * known to already exist, its parent data has already been
2672	 * copied, so a write to the object can also be handled as a
2673	 * simple object request.
2674	 */
2675	if (!img_request_write_test(img_request) ||
2676		!img_request_layered_test(img_request) ||
2677		rbd_dev->parent_overlap <= obj_request->img_offset ||
2678		((known = obj_request_known_test(obj_request)) &&
2679			obj_request_exists_test(obj_request))) {
2680
2681		struct rbd_device *rbd_dev;
2682		struct ceph_osd_client *osdc;
 
2683
2684		rbd_dev = obj_request->img_request->rbd_dev;
2685		osdc = &rbd_dev->rbd_client->client->osdc;
2686
2687		return rbd_obj_request_submit(osdc, obj_request);
2688	}
2689
2690	/*
2691	 * It's a layered write.  The target object might exist but
2692	 * we may not know that yet.  If we know it doesn't exist,
2693	 * start by reading the data for the full target object from
2694	 * the parent so we can use it for a copyup to the target.
2695	 */
2696	if (known)
2697		return rbd_img_obj_parent_read_full(obj_request);
2698
2699	/* We don't know whether the target exists.  Go find out. */
2700
2701	return rbd_img_obj_exists_submit(obj_request);
2702}
2703
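/*
 * Editorial sketch, not part of the driver: the "simple object request"
 * test from rbd_img_obj_request_submit() above, pulled out as a
 * predicate (without the cached "known" side effect).  The helper name
 * is made up; every test it performs is the real one.
 */
static bool example_obj_request_is_simple(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct rbd_device *rbd_dev = img_request->rbd_dev;

	return !img_request_write_test(img_request) ||
	       !img_request_layered_test(img_request) ||
	       rbd_dev->parent_overlap <= obj_request->img_offset ||
	       (obj_request_known_test(obj_request) &&
		obj_request_exists_test(obj_request));
}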
2704static int rbd_img_request_submit(struct rbd_img_request *img_request)
2705{
2706	struct rbd_obj_request *obj_request;
2707	struct rbd_obj_request *next_obj_request;
2708
2709	dout("%s: img %p\n", __func__, img_request);
2710	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2711		int ret;
2712
2713		ret = rbd_img_obj_request_submit(obj_request);
 
2714		if (ret)
2715			return ret;
2716	}
2717
2718	return 0;
2719}
2720
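/*
 * Editorial sketch, not part of the driver: the create/fill/submit
 * life cycle of an image request, mirroring what rbd_request_fn() does
 * further down.  The function name is made up; the three rbd_img_request
 * calls are the real ones.
 */
static int example_submit_image_io(struct rbd_device *rbd_dev,
				   struct bio *bio, u64 offset, u64 length,
				   bool write_request)
{
	struct rbd_img_request *img_request;
	int ret;

	img_request = rbd_img_request_create(rbd_dev, offset, length,
					     write_request);
	if (!img_request)
		return -ENOMEM;

	ret = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, bio);
	if (!ret)
		ret = rbd_img_request_submit(img_request);
	if (ret)
		rbd_img_request_put(img_request);

	return ret;
}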
2721static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2722{
2723	struct rbd_obj_request *obj_request;
2724	struct rbd_device *rbd_dev;
2725	u64 obj_end;
2726	u64 img_xferred;
2727	int img_result;
2728
2729	rbd_assert(img_request_child_test(img_request));
2730
2731	/* First get what we need from the image request and release it */
2732
2733	obj_request = img_request->obj_request;
2734	img_xferred = img_request->xferred;
2735	img_result = img_request->result;
2736	rbd_img_request_put(img_request);
2737
2738	/*
2739	 * If the overlap has become 0 (most likely because the
2740	 * image has been flattened) we need to re-submit the
2741	 * original request.
2742	 */
2743	rbd_assert(obj_request);
2744	rbd_assert(obj_request->img_request);
2745	rbd_dev = obj_request->img_request->rbd_dev;
2746	if (!rbd_dev->parent_overlap) {
2747		struct ceph_osd_client *osdc;
2748
2749		osdc = &rbd_dev->rbd_client->client->osdc;
2750		img_result = rbd_obj_request_submit(osdc, obj_request);
2751		if (!img_result)
2752			return;
2753	}
2754
2755	obj_request->result = img_result;
2756	if (obj_request->result)
2757		goto out;
2758
2759	/*
2760	 * We need to zero anything beyond the parent overlap
2761	 * boundary.  Since rbd_img_obj_request_read_callback()
2762	 * will zero anything beyond the end of a short read, an
2763	 * easy way to do this is to pretend the data from the
2764	 * parent came up short--ending at the overlap boundary.
2765	 */
2766	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2767	obj_end = obj_request->img_offset + obj_request->length;
2768	if (obj_end > rbd_dev->parent_overlap) {
2769		u64 xferred = 0;
2770
2771		if (obj_request->img_offset < rbd_dev->parent_overlap)
2772			xferred = rbd_dev->parent_overlap -
2773					obj_request->img_offset;
2774
2775		obj_request->xferred = min(img_xferred, xferred);
2776	} else {
2777		obj_request->xferred = img_xferred;
2778	}
2779out:
2780	rbd_img_obj_request_read_callback(obj_request);
2781	rbd_obj_request_complete(obj_request);
2782}
2783
2784static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
 
2785{
2786	struct rbd_img_request *img_request;
2787	int result;
2788
2789	rbd_assert(obj_request_img_data_test(obj_request));
2790	rbd_assert(obj_request->img_request != NULL);
2791	rbd_assert(obj_request->result == (s32) -ENOENT);
2792	rbd_assert(obj_request_type_valid(obj_request->type));
2793
2794	/* rbd_read_finish(obj_request, obj_request->length); */
2795	img_request = rbd_parent_request_create(obj_request,
2796						obj_request->img_offset,
2797						obj_request->length);
2798	result = -ENOMEM;
2799	if (!img_request)
2800		goto out_err;
2801
2802	if (obj_request->type == OBJ_REQUEST_BIO)
2803		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2804						obj_request->bio_list);
2805	else
2806		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
2807						obj_request->pages);
2808	if (result)
2809		goto out_err;
2810
2811	img_request->callback = rbd_img_parent_read_callback;
2812	result = rbd_img_request_submit(img_request);
2813	if (result)
2814		goto out_err;
 
2815
2816	return;
2817out_err:
2818	if (img_request)
2819		rbd_img_request_put(img_request);
2820	obj_request->result = result;
2821	obj_request->xferred = 0;
2822	obj_request_done_set(obj_request);
2823}
2824
2825static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2826{
2827	struct rbd_obj_request *obj_request;
2828	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2829	int ret;
2830
2831	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2832							OBJ_REQUEST_NODATA);
2833	if (!obj_request)
2834		return -ENOMEM;
2835
2836	ret = -ENOMEM;
2837	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2838						  obj_request);
2839	if (!obj_request->osd_req)
2840		goto out;
2841
2842	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2843					notify_id, 0, 0);
2844	rbd_osd_req_format_read(obj_request);
2845
2846	ret = rbd_obj_request_submit(osdc, obj_request);
2847	if (ret)
2848		goto out;
2849	ret = rbd_obj_request_wait(obj_request);
2850out:
2851	rbd_obj_request_put(obj_request);
2852
2853	return ret;
2854}
2855
2856static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
 
2857{
2858	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2859	int ret;
2860
2861	if (!rbd_dev)
2862		return;
2863
2864	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2865		rbd_dev->header_name, (unsigned long long)notify_id,
2866		(unsigned int)opcode);
2867	ret = rbd_dev_refresh(rbd_dev);
2868	if (ret)
2869		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2870
2871	rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2872}
2873
2874/*
2875 * Request sync osd watch/unwatch.  The value of "start" determines
2876 * whether a watch request is being initiated or torn down.
2877 */
2878static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
2879{
2880	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2881	struct rbd_obj_request *obj_request;
2882	int ret;
2883
2884	rbd_assert(start ^ !!rbd_dev->watch_event);
2885	rbd_assert(start ^ !!rbd_dev->watch_request);
2886
2887	if (start) {
2888		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
2889						&rbd_dev->watch_event);
2890		if (ret < 0)
2891			return ret;
2892		rbd_assert(rbd_dev->watch_event != NULL);
2893	}
2894
2895	ret = -ENOMEM;
2896	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2897							OBJ_REQUEST_NODATA);
2898	if (!obj_request)
2899		goto out_cancel;
2900
2901	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
2902						  obj_request);
2903	if (!obj_request->osd_req)
2904		goto out_cancel;
2905
2906	if (start)
2907		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
2908	else
2909		ceph_osdc_unregister_linger_request(osdc,
2910					rbd_dev->watch_request->osd_req);
2911
2912	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2913				rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
2914	rbd_osd_req_format_write(obj_request);
2915
2916	ret = rbd_obj_request_submit(osdc, obj_request);
2917	if (ret)
2918		goto out_cancel;
2919	ret = rbd_obj_request_wait(obj_request);
2920	if (ret)
2921		goto out_cancel;
2922	ret = obj_request->result;
2923	if (ret)
2924		goto out_cancel;
2925
2926	/*
2927	 * A watch request is set to linger, so the underlying osd
2928	 * request won't go away until we unregister it.  We retain
2929	 * a pointer to the object request during that time (in
2930	 * rbd_dev->watch_request), so we'll keep a reference to
2931	 * it.  We'll drop that reference (below) after we've
2932	 * unregistered it.
2933	 */
2934	if (start) {
2935		rbd_dev->watch_request = obj_request;
2936
2937		return 0;
2938	}
 
2939
2940	/* We have successfully torn down the watch request */
2941
2942	rbd_obj_request_put(rbd_dev->watch_request);
2943	rbd_dev->watch_request = NULL;
2944out_cancel:
2945	/* Cancel the event if we're tearing down, or on error */
2946	ceph_osdc_cancel_event(rbd_dev->watch_event);
2947	rbd_dev->watch_event = NULL;
2948	if (obj_request)
2949		rbd_obj_request_put(obj_request);
2950
2951	return ret;
2952}
2953
2954static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
2955{
2956	return __rbd_dev_header_watch_sync(rbd_dev, true);
2957}
2958
2959static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
2960{
2961	int ret;
2962
2963	ret = __rbd_dev_header_watch_sync(rbd_dev, false);
2964	if (ret) {
2965		rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
2966			 ret);
2967	}
2968}
2969
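/*
 * Editorial sketch, not part of the driver: how the watch helpers above
 * pair up over the lifetime of a mapping.  The function name is made up;
 * the two *_watch_sync() calls are the real ones, and rbd_watch_cb()
 * handles header-change notifications while the watch is armed.
 */
static int example_map_lifetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_header_watch_sync(rbd_dev);	/* at map time */
	if (ret)
		return ret;

	/* ... device is mapped and serving I/O ... */

	rbd_dev_header_unwatch_sync(rbd_dev);		/* at unmap time */

	return 0;
}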
2970/*
2971 * Synchronous osd object method call.  Returns the number of bytes
2972 * returned in the inbound buffer, or a negative error code.
2973 */
2974static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2975			     const char *object_name,
2976			     const char *class_name,
2977			     const char *method_name,
2978			     const void *outbound,
2979			     size_t outbound_size,
2980			     void *inbound,
2981			     size_t inbound_size)
2982{
2983	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2984	struct rbd_obj_request *obj_request;
2985	struct page **pages;
2986	u32 page_count;
2987	int ret;
2988
2989	/*
2990	 * Method calls are ultimately read operations.  The result
2991	 * should be placed into the inbound buffer provided.  They
2992	 * also supply outbound data--parameters for the object
2993	 * method.  Currently if this is present it will be a
2994	 * snapshot id.
2995	 */
2996	page_count = (u32)calc_pages_for(0, inbound_size);
2997	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2998	if (IS_ERR(pages))
2999		return PTR_ERR(pages);
3000
3001	ret = -ENOMEM;
3002	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
3003							OBJ_REQUEST_PAGES);
3004	if (!obj_request)
3005		goto out;
3006
3007	obj_request->pages = pages;
3008	obj_request->page_count = page_count;
3009
3010	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3011						  obj_request);
3012	if (!obj_request->osd_req)
3013		goto out;
3014
3015	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
3016					class_name, method_name);
3017	if (outbound_size) {
3018		struct ceph_pagelist *pagelist;
3019
3020		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
3021		if (!pagelist)
3022			goto out;
3023
3024		ceph_pagelist_init(pagelist);
3025		ceph_pagelist_append(pagelist, outbound, outbound_size);
3026		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
3027						pagelist);
3028	}
3029	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3030					obj_request->pages, inbound_size,
3031					0, false, false);
3032	rbd_osd_req_format_read(obj_request);
3033
3034	ret = rbd_obj_request_submit(osdc, obj_request);
3035	if (ret)
3036		goto out;
3037	ret = rbd_obj_request_wait(obj_request);
3038	if (ret)
3039		goto out;
3040
3041	ret = obj_request->result;
3042	if (ret < 0)
3043		goto out;
3044
3045	rbd_assert(obj_request->xferred < (u64)INT_MAX);
3046	ret = (int)obj_request->xferred;
3047	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
3048out:
3049	if (obj_request)
3050		rbd_obj_request_put(obj_request);
3051	else
3052		ceph_release_page_vector(pages, page_count);
3053
3054	return ret;
3055}
3056
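/*
 * Editorial sketch, not part of the driver: a typical caller of
 * rbd_obj_method_sync(), modeled on the "get_size" class method call
 * used for format 2 images elsewhere in this file.  The function name
 * is made up; the method call arguments follow the real convention
 * (snapshot id out, packed order/size reply in).
 */
static int example_get_size(struct rbd_device *rbd_dev, u64 snap_id,
			    u64 *image_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				  "rbd", "get_size",
				  &snapid, sizeof (snapid),
				  &size_buf, sizeof (size_buf));
	if (ret < 0)
		return ret;
	if (ret < (int)sizeof (size_buf))
		return -ERANGE;

	*image_size = le64_to_cpu(size_buf.size);

	return 0;
}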
3057static void rbd_request_fn(struct request_queue *q)
3058		__releases(q->queue_lock) __acquires(q->queue_lock)
3059{
3060	struct rbd_device *rbd_dev = q->queuedata;
3061	bool read_only = rbd_dev->mapping.read_only;
3062	struct request *rq;
3063	int result;
3064
3065	while ((rq = blk_fetch_request(q))) {
3066		bool write_request = rq_data_dir(rq) == WRITE;
3067		struct rbd_img_request *img_request;
3068		u64 offset;
3069		u64 length;
3070
3071		/* Ignore any non-FS requests that filter through. */
3072
3073		if (rq->cmd_type != REQ_TYPE_FS) {
3074			dout("%s: non-fs request type %d\n", __func__,
3075				(int) rq->cmd_type);
3076			__blk_end_request_all(rq, 0);
3077			continue;
3078		}
3079
3080		/* Ignore/skip any zero-length requests */
3081
3082		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
3083		length = (u64) blk_rq_bytes(rq);
3084
3085		if (!length) {
3086			dout("%s: zero-length request\n", __func__);
3087			__blk_end_request_all(rq, 0);
3088			continue;
3089		}
3090
3091		spin_unlock_irq(q->queue_lock);
3092
3093		/* Disallow writes to a read-only device */
3094
3095		if (write_request) {
3096			result = -EROFS;
3097			if (read_only)
3098				goto end_request;
3099			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3100		}
3101
3102		/*
3103		 * Quit early if the mapped snapshot no longer
3104		 * exists.  It's still possible the snapshot will
3105		 * have disappeared by the time our request arrives
3106		 * at the osd, but there's no sense in sending it if
3107		 * we already know.
3108		 */
3109		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3110			dout("request for non-existent snapshot");
3111			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3112			result = -ENXIO;
3113			goto end_request;
3114		}
3115
3116		result = -EINVAL;
3117		if (offset && length > U64_MAX - offset + 1) {
3118			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3119				offset, length);
3120			goto end_request;	/* Shouldn't happen */
3121		}
3122
3123		result = -EIO;
3124		if (offset + length > rbd_dev->mapping.size) {
3125			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
3126				offset, length, rbd_dev->mapping.size);
3127			goto end_request;
3128		}
3129
3130		result = -ENOMEM;
3131		img_request = rbd_img_request_create(rbd_dev, offset, length,
3132							write_request);
3133		if (!img_request)
3134			goto end_request;
3135
3136		img_request->rq = rq;
3137
3138		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3139						rq->bio);
3140		if (!result)
3141			result = rbd_img_request_submit(img_request);
3142		if (result)
3143			rbd_img_request_put(img_request);
3144end_request:
3145		spin_lock_irq(q->queue_lock);
3146		if (result < 0) {
3147			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
3148				write_request ? "write" : "read",
3149				length, offset, result);
3150
3151			__blk_end_request_all(rq, result);
3152		}
3153	}
3154}
 
3155
3156/*
3157 * a queue callback. Makes sure that we don't create a bio that spans across
3158 * multiple osd objects.  One exception is single-page bios, which we
3159 * handle later in bio_chain_clone_range()
3160 */
3161static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3162			  struct bio_vec *bvec)
3163{
3164	struct rbd_device *rbd_dev = q->queuedata;
3165	sector_t sector_offset;
3166	sector_t sectors_per_obj;
3167	sector_t obj_sector_offset;
3168	int ret;
3169
3170	/*
3171	 * Find how far into its rbd object the bio's start sector falls,
3172	 * working from the sector offset relative to the enclosing device
3173	 * rather than to the partition.
3174	 */
3175	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3176	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3177	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3178
3179	/*
3180	 * Compute the number of bytes from that offset to the end
3181	 * of the object.  Account for what's already used by the bio.
3182	 */
3183	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3184	if (ret > bmd->bi_size)
3185		ret -= bmd->bi_size;
3186	else
3187		ret = 0;
3188
3189	/*
3190	 * Don't send back more than was asked for.  And if the bio
3191	 * was empty, let the whole thing through because:  "Note
3192	 * that a block device *must* allow a single page to be
3193	 * added to an empty bio."
3194	 */
3195	rbd_assert(bvec->bv_len <= PAGE_SIZE);
3196	if (ret > (int) bvec->bv_len || !bmd->bi_size)
3197		ret = (int) bvec->bv_len;
3198
3199	return ret;
3200}
3201
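/*
 * Editorial sketch, not part of the driver: the object-boundary
 * arithmetic rbd_merge_bvec() performs above, pulled out as a helper
 * that returns how many bytes remain between a device-relative sector
 * and the end of its rbd object.  For 4 MiB objects (obj_order 22)
 * there are 8192 sectors per object.  The function name is made up.
 */
static u64 example_bytes_to_object_end(struct rbd_device *rbd_dev,
				       sector_t dev_sector)
{
	sector_t sectors_per_obj =
			1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector_t obj_sector_offset = dev_sector & (sectors_per_obj - 1);

	return (u64)(sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
}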
3202static void rbd_free_disk(struct rbd_device *rbd_dev)
 
3203{
3204	struct gendisk *disk = rbd_dev->disk;
 
3205
3206	if (!disk)
3207		return;
 
3208
3209	rbd_dev->disk = NULL;
3210	if (disk->flags & GENHD_FL_UP) {
3211		del_gendisk(disk);
3212		if (disk->queue)
3213			blk_cleanup_queue(disk->queue);
3214	}
3215	put_disk(disk);
3216}
3217
3218static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3219				const char *object_name,
3220				u64 offset, u64 length, void *buf)
 
3221
3222{
3223	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3224	struct rbd_obj_request *obj_request;
3225	struct page **pages = NULL;
3226	u32 page_count;
3227	size_t size;
3228	int ret;
3229
3230	page_count = (u32) calc_pages_for(offset, length);
3231	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3232	if (IS_ERR(pages))
3233		return PTR_ERR(pages);
3234
3235	ret = -ENOMEM;
3236	obj_request = rbd_obj_request_create(object_name, offset, length,
3237							OBJ_REQUEST_PAGES);
3238	if (!obj_request)
3239		goto out;
3240
3241	obj_request->pages = pages;
3242	obj_request->page_count = page_count;
 
3243
3244	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3245						  obj_request);
3246	if (!obj_request->osd_req)
3247		goto out;
 
3248
3249	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3250					offset, length, 0, 0);
3251	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3252					obj_request->pages,
3253					obj_request->length,
3254					obj_request->offset & ~PAGE_MASK,
3255					false, false);
3256	rbd_osd_req_format_read(obj_request);
3257
3258	ret = rbd_obj_request_submit(osdc, obj_request);
3259	if (ret)
3260		goto out;
3261	ret = rbd_obj_request_wait(obj_request);
3262	if (ret)
3263		goto out;
3264
3265	ret = obj_request->result;
3266	if (ret < 0)
3267		goto out;
3268
3269	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
3270	size = (size_t) obj_request->xferred;
3271	ceph_copy_from_page_vector(pages, buf, 0, size);
3272	rbd_assert(size <= (size_t)INT_MAX);
3273	ret = (int)size;
3274out:
3275	if (obj_request)
3276		rbd_obj_request_put(obj_request);
3277	else
3278		ceph_release_page_vector(pages, page_count);
3279
3280	return ret;
3281}
3282
3283/*
3284 * Read the complete header for the given rbd device.  On successful
3285 * return, the rbd_dev->header field will contain up-to-date
3286 * information about the image.
3287 */
3288static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
3289{
3290	struct rbd_image_header_ondisk *ondisk = NULL;
3291	u32 snap_count = 0;
3292	u64 names_size = 0;
3293	u32 want_count;
3294	int ret;
3295
3296	/*
3297	 * The complete header will include an array of its 64-bit
3298	 * snapshot ids, followed by the names of those snapshots as
3299	 * a contiguous block of NUL-terminated strings.  Note that
3300	 * the number of snapshots could change by the time we read
3301	 * it in, in which case we re-read it.
3302	 */
3303	do {
3304		size_t size;
3305
3306		kfree(ondisk);
3307
3308		size = sizeof (*ondisk);
3309		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3310		size += names_size;
3311		ondisk = kmalloc(size, GFP_KERNEL);
3312		if (!ondisk)
3313			return -ENOMEM;
3314
3315		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
3316				       0, size, ondisk);
3317		if (ret < 0)
3318			goto out;
3319		if ((size_t)ret < size) {
3320			ret = -ENXIO;
3321			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3322				size, ret);
3323			goto out;
3324		}
3325		if (!rbd_dev_ondisk_valid(ondisk)) {
3326			ret = -ENXIO;
3327			rbd_warn(rbd_dev, "invalid header");
3328			goto out;
3329		}
3330
3331		names_size = le64_to_cpu(ondisk->snap_names_len);
3332		want_count = snap_count;
3333		snap_count = le32_to_cpu(ondisk->snap_count);
3334	} while (snap_count != want_count);
3335
3336	ret = rbd_header_from_disk(rbd_dev, ondisk);
3337out:
3338	kfree(ondisk);
3339
3340	return ret;
3341}
3342
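/*
 * Editorial sketch, not part of the driver: the on-disk size computed
 * by the re-read loop in rbd_dev_v1_header_info() above.  The function
 * name is made up; rbd_image_header_ondisk and rbd_image_snap_ondisk
 * are the real on-disk structures from rbd_types.h.
 */
static size_t example_v1_header_size(u32 snap_count, u64 names_size)
{
	return sizeof (struct rbd_image_header_ondisk) +
	       snap_count * sizeof (struct rbd_image_snap_ondisk) +
	       names_size;
}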
3343/*
3344 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3345 * has disappeared from the (just updated) snapshot context.
3346 */
3347static void rbd_exists_validate(struct rbd_device *rbd_dev)
3348{
3349	u64 snap_id;
3350
3351	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3352		return;
3353
3354	snap_id = rbd_dev->spec->snap_id;
3355	if (snap_id == CEPH_NOSNAP)
3356		return;
3357
3358	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3359		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3360}
3361
3362static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3363{
3364	sector_t size;
3365	bool removing;
3366
3367	/*
3368	 * Don't hold the lock while doing disk operations,
3369	 * or lock ordering will conflict with the bdev mutex via:
3370	 * rbd_add() -> blkdev_get() -> rbd_open()
3371	 */
3372	spin_lock_irq(&rbd_dev->lock);
3373	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
3374	spin_unlock_irq(&rbd_dev->lock);
3375	/*
3376	 * If the device is being removed, rbd_dev->disk has
3377	 * been destroyed, so don't try to update its size
3378	 */
3379	if (!removing) {
 
3380		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3381		dout("setting size to %llu sectors", (unsigned long long)size);
3382		set_capacity(rbd_dev->disk, size);
3383		revalidate_disk(rbd_dev->disk);
3384	}
3385}
3386
3387static int rbd_dev_refresh(struct rbd_device *rbd_dev)
3388{
3389	u64 mapping_size;
3390	int ret;
3391
3392	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3393	down_write(&rbd_dev->header_rwsem);
3394	mapping_size = rbd_dev->mapping.size;
3395	if (rbd_dev->image_format == 1)
3396		ret = rbd_dev_v1_header_info(rbd_dev);
3397	else
3398		ret = rbd_dev_v2_header_info(rbd_dev);
3399
3400	/* If it's a mapped snapshot, validate its EXISTS flag */
3401
3402	rbd_exists_validate(rbd_dev);
3403	up_write(&rbd_dev->header_rwsem);
3404
3405	if (mapping_size != rbd_dev->mapping.size) {
3406		rbd_dev_update_size(rbd_dev);
3407	}
3408
3409	return ret;
3410}
3411
3412static int rbd_init_disk(struct rbd_device *rbd_dev)
3413{
3414	struct gendisk *disk;
3415	struct request_queue *q;
3416	u64 segment_size;
3417
3418	/* create gendisk info */
3419	disk = alloc_disk(single_major ?
3420			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3421			  RBD_MINORS_PER_MAJOR);
3422	if (!disk)
3423		return -ENOMEM;
3424
3425	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3426		 rbd_dev->dev_id);
3427	disk->major = rbd_dev->major;
3428	disk->first_minor = rbd_dev->minor;
3429	if (single_major)
3430		disk->flags |= GENHD_FL_EXT_DEVT;
3431	disk->fops = &rbd_bd_ops;
3432	disk->private_data = rbd_dev;
3433
3434	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3435	if (!q)
3436		goto out_disk;
3437
3438	/* We use the default size, but let's be explicit about it. */
3439	blk_queue_physical_block_size(q, SECTOR_SIZE);
3440
3441	/* set io sizes to object size */
3442	segment_size = rbd_obj_bytes(&rbd_dev->header);
3443	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3444	blk_queue_max_segment_size(q, segment_size);
3445	blk_queue_io_min(q, segment_size);
3446	blk_queue_io_opt(q, segment_size);
3447
3448	blk_queue_merge_bvec(q, rbd_merge_bvec);
3449	disk->queue = q;
3450
3451	q->queuedata = rbd_dev;
3452
3453	rbd_dev->disk = disk;
3454
3455	return 0;
3456out_disk:
3457	put_disk(disk);
3458
3459	return -ENOMEM;
3460}
3461
3462/*
3463  sysfs
3464*/
3465
3466static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3467{
3468	return container_of(dev, struct rbd_device, dev);
3469}
3470
3471static ssize_t rbd_size_show(struct device *dev,
3472			     struct device_attribute *attr, char *buf)
3473{
3474	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3475
3476	return sprintf(buf, "%llu\n",
3477		(unsigned long long)rbd_dev->mapping.size);
3478}
3479
3480/*
3481 * Note this shows the features for whatever's mapped, which is not
3482 * necessarily the base image.
3483 */
3484static ssize_t rbd_features_show(struct device *dev,
3485			     struct device_attribute *attr, char *buf)
3486{
3487	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3488
3489	return sprintf(buf, "0x%016llx\n",
3490			(unsigned long long)rbd_dev->mapping.features);
3491}
3492
3493static ssize_t rbd_major_show(struct device *dev,
3494			      struct device_attribute *attr, char *buf)
3495{
3496	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3497
3498	if (rbd_dev->major)
3499		return sprintf(buf, "%d\n", rbd_dev->major);
3500
3501	return sprintf(buf, "(none)\n");
3502}
3503
3504static ssize_t rbd_minor_show(struct device *dev,
3505			      struct device_attribute *attr, char *buf)
3506{
3507	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3508
3509	return sprintf(buf, "%d\n", rbd_dev->minor);
3510}
3511
3512static ssize_t rbd_client_id_show(struct device *dev,
3513				  struct device_attribute *attr, char *buf)
3514{
3515	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3516
3517	return sprintf(buf, "client%lld\n",
3518			ceph_client_id(rbd_dev->rbd_client->client));
3519}
3520
3521static ssize_t rbd_pool_show(struct device *dev,
3522			     struct device_attribute *attr, char *buf)
3523{
3524	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3525
3526	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3527}
3528
3529static ssize_t rbd_pool_id_show(struct device *dev,
3530			     struct device_attribute *attr, char *buf)
3531{
3532	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3533
3534	return sprintf(buf, "%llu\n",
3535			(unsigned long long) rbd_dev->spec->pool_id);
3536}
3537
3538static ssize_t rbd_name_show(struct device *dev,
3539			     struct device_attribute *attr, char *buf)
3540{
3541	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3542
3543	if (rbd_dev->spec->image_name)
3544		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3545
3546	return sprintf(buf, "(unknown)\n");
3547}
3548
3549static ssize_t rbd_image_id_show(struct device *dev,
3550			     struct device_attribute *attr, char *buf)
3551{
3552	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3553
3554	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3555}
3556
3557/*
3558 * Shows the name of the currently-mapped snapshot (or
3559 * RBD_SNAP_HEAD_NAME for the base image).
3560 */
3561static ssize_t rbd_snap_show(struct device *dev,
3562			     struct device_attribute *attr,
3563			     char *buf)
3564{
3565	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3566
3567	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3568}
3569
3570/*
3571 * For an rbd v2 image, shows the pool id, image id, and snapshot id
3572 * for the parent image.  If there is no parent, simply shows
3573 * "(no parent image)".
3574 */
3575static ssize_t rbd_parent_show(struct device *dev,
3576			     struct device_attribute *attr,
3577			     char *buf)
3578{
3579	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3580	struct rbd_spec *spec = rbd_dev->parent_spec;
3581	int count;
3582	char *bufp = buf;
3583
3584	if (!spec)
3585		return sprintf(buf, "(no parent image)\n");
3586
3587	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3588			(unsigned long long) spec->pool_id, spec->pool_name);
3589	if (count < 0)
3590		return count;
3591	bufp += count;
3592
3593	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3594			spec->image_name ? spec->image_name : "(unknown)");
3595	if (count < 0)
3596		return count;
3597	bufp += count;
3598
3599	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3600			(unsigned long long) spec->snap_id, spec->snap_name);
3601	if (count < 0)
3602		return count;
3603	bufp += count;
3604
3605	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3606	if (count < 0)
3607		return count;
3608	bufp += count;
3609
3610	return (ssize_t) (bufp - buf);
3611}
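/*
 * Example of the resulting "parent" attribute contents (all ids, names
 * and sizes below are illustrative):
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 1014b2ae8944a
 *	image_name parent-image
 *	snap_id 4
 *	snap_name base
 *	overlap 10737418240
 */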
3612
3613static ssize_t rbd_image_refresh(struct device *dev,
3614				 struct device_attribute *attr,
3615				 const char *buf,
3616				 size_t size)
3617{
3618	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3619	int ret;
3620
3621	ret = rbd_dev_refresh(rbd_dev);
3622	if (ret)
3623		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3624
3625	return ret < 0 ? ret : size;
3626}
3627
3628static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
3629static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3630static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3631static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
3632static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3633static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
3634static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3635static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3636static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3637static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3638static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
3639static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3640
3641static struct attribute *rbd_attrs[] = {
3642	&dev_attr_size.attr,
3643	&dev_attr_features.attr,
3644	&dev_attr_major.attr,
3645	&dev_attr_minor.attr,
3646	&dev_attr_client_id.attr,
3647	&dev_attr_pool.attr,
3648	&dev_attr_pool_id.attr,
3649	&dev_attr_name.attr,
3650	&dev_attr_image_id.attr,
3651	&dev_attr_current_snap.attr,
3652	&dev_attr_parent.attr,
3653	&dev_attr_refresh.attr,
3654	NULL
3655};
3656
3657static struct attribute_group rbd_attr_group = {
3658	.attrs = rbd_attrs,
3659};
3660
3661static const struct attribute_group *rbd_attr_groups[] = {
3662	&rbd_attr_group,
3663	NULL
3664};
3665
3666static void rbd_sysfs_dev_release(struct device *dev)
3667{
3668}
3669
3670static struct device_type rbd_device_type = {
3671	.name		= "rbd",
3672	.groups		= rbd_attr_groups,
3673	.release	= rbd_sysfs_dev_release,
3674};
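/*
 * These attributes appear under /sys/bus/rbd/devices/<dev_id>/, as
 * described in Documentation/ABI/testing/sysfs-bus-rbd.  Illustrative
 * usage (the device id and values are examples only):
 *
 *	$ cat /sys/bus/rbd/devices/0/size
 *	10737418240
 *	$ cat /sys/bus/rbd/devices/0/current_snap
 *	-
 *	# echo 1 > /sys/bus/rbd/devices/0/refresh
 */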
3675
3676static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3677{
3678	kref_get(&spec->kref);
3679
3680	return spec;
3681}
3682
3683static void rbd_spec_free(struct kref *kref);
3684static void rbd_spec_put(struct rbd_spec *spec)
3685{
3686	if (spec)
3687		kref_put(&spec->kref, rbd_spec_free);
3688}
3689
3690static struct rbd_spec *rbd_spec_alloc(void)
3691{
3692	struct rbd_spec *spec;
3693
3694	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3695	if (!spec)
3696		return NULL;
3697	kref_init(&spec->kref);
3698
3699	return spec;
3700}
3701
3702static void rbd_spec_free(struct kref *kref)
3703{
3704	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3705
3706	kfree(spec->pool_name);
3707	kfree(spec->image_id);
3708	kfree(spec->image_name);
3709	kfree(spec->snap_name);
3710	kfree(spec);
3711}
3712
3713static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3714				struct rbd_spec *spec)
3715{
3716	struct rbd_device *rbd_dev;
3717
3718	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3719	if (!rbd_dev)
3720		return NULL;
3721
3722	spin_lock_init(&rbd_dev->lock);
3723	rbd_dev->flags = 0;
3724	atomic_set(&rbd_dev->parent_ref, 0);
3725	INIT_LIST_HEAD(&rbd_dev->node);
3726	init_rwsem(&rbd_dev->header_rwsem);
3727
3728	rbd_dev->spec = spec;
3729	rbd_dev->rbd_client = rbdc;
3730
3731	/* Initialize the layout used for all rbd requests */
3732
3733	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3734	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3735	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3736	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3737
3738	return rbd_dev;
3739}
3740
3741static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3742{
3743	rbd_put_client(rbd_dev->rbd_client);
3744	rbd_spec_put(rbd_dev->spec);
3745	kfree(rbd_dev);
3746}
3747
3748/*
3749 * Get the size and object order for an image snapshot, or if
3750 * snap_id is CEPH_NOSNAP, gets this information for the base
3751 * image.
3752 */
3753static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3754				u8 *order, u64 *snap_size)
3755{
3756	__le64 snapid = cpu_to_le64(snap_id);
3757	int ret;
3758	struct {
3759		u8 order;
3760		__le64 size;
3761	} __attribute__ ((packed)) size_buf = { 0 };
3762
3763	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3764				"rbd", "get_size",
3765				&snapid, sizeof (snapid),
3766				&size_buf, sizeof (size_buf));
3767	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3768	if (ret < 0)
3769		return ret;
3770	if (ret < sizeof (size_buf))
3771		return -ERANGE;
3772
3773	if (order) {
3774		*order = size_buf.order;
3775		dout("  order %u", (unsigned int)*order);
3776	}
3777	*snap_size = le64_to_cpu(size_buf.size);
3778
3779	dout("  snap_id 0x%016llx snap_size = %llu\n",
3780		(unsigned long long)snap_id,
3781		(unsigned long long)*snap_size);
3782
3783	return 0;
3784}
3785
3786static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3787{
3788	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3789					&rbd_dev->header.obj_order,
3790					&rbd_dev->header.image_size);
3791}
3792
3793static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3794{
3795	void *reply_buf;
3796	int ret;
3797	void *p;
3798
3799	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3800	if (!reply_buf)
3801		return -ENOMEM;
3802
3803	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3804				"rbd", "get_object_prefix", NULL, 0,
3805				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
3806	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3807	if (ret < 0)
3808		goto out;
3809
3810	p = reply_buf;
3811	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3812						p + ret, NULL, GFP_NOIO);
3813	ret = 0;
3814
3815	if (IS_ERR(rbd_dev->header.object_prefix)) {
3816		ret = PTR_ERR(rbd_dev->header.object_prefix);
3817		rbd_dev->header.object_prefix = NULL;
3818	} else {
3819		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
3820	}
3821out:
3822	kfree(reply_buf);
3823
3824	return ret;
3825}
3826
3827static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3828		u64 *snap_features)
3829{
3830	__le64 snapid = cpu_to_le64(snap_id);
3831	struct {
3832		__le64 features;
3833		__le64 incompat;
3834	} __attribute__ ((packed)) features_buf = { 0 };
3835	u64 incompat;
3836	int ret;
3837
3838	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3839				"rbd", "get_features",
3840				&snapid, sizeof (snapid),
3841				&features_buf, sizeof (features_buf));
3842	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3843	if (ret < 0)
3844		return ret;
3845	if (ret < sizeof (features_buf))
3846		return -ERANGE;
3847
3848	incompat = le64_to_cpu(features_buf.incompat);
3849	if (incompat & ~RBD_FEATURES_SUPPORTED)
3850		return -ENXIO;
3851
3852	*snap_features = le64_to_cpu(features_buf.features);
3853
3854	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3855		(unsigned long long)snap_id,
3856		(unsigned long long)*snap_features,
3857		(unsigned long long)le64_to_cpu(features_buf.incompat));
3858
3859	return 0;
3860}
3861
3862static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3863{
3864	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3865						&rbd_dev->header.features);
3866}
3867
3868static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3869{
3870	struct rbd_spec *parent_spec;
3871	size_t size;
3872	void *reply_buf = NULL;
3873	__le64 snapid;
3874	void *p;
3875	void *end;
3876	u64 pool_id;
3877	char *image_id;
3878	u64 snap_id;
3879	u64 overlap;
3880	int ret;
3881
3882	parent_spec = rbd_spec_alloc();
3883	if (!parent_spec)
3884		return -ENOMEM;
3885
3886	size = sizeof (__le64) +				/* pool_id */
3887		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
3888		sizeof (__le64) +				/* snap_id */
3889		sizeof (__le64);				/* overlap */
3890	reply_buf = kmalloc(size, GFP_KERNEL);
3891	if (!reply_buf) {
3892		ret = -ENOMEM;
3893		goto out_err;
3894	}
3895
3896	snapid = cpu_to_le64(CEPH_NOSNAP);
3897	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3898				"rbd", "get_parent",
3899				&snapid, sizeof (snapid),
3900				reply_buf, size);
3901	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3902	if (ret < 0)
3903		goto out_err;
3904
3905	p = reply_buf;
3906	end = reply_buf + ret;
3907	ret = -ERANGE;
3908	ceph_decode_64_safe(&p, end, pool_id, out_err);
3909	if (pool_id == CEPH_NOPOOL) {
3910		/*
3911		 * Either the parent never existed, or we have
3912		 * record of it but the image got flattened so it no
3913		 * longer has a parent.  When the parent of a
3914		 * layered image disappears we immediately set the
3915		 * overlap to 0.  The effect of this is that all new
3916		 * requests will be treated as if the image had no
3917		 * parent.
3918		 */
3919		if (rbd_dev->parent_overlap) {
3920			rbd_dev->parent_overlap = 0;
3921			smp_mb();
3922			rbd_dev_parent_put(rbd_dev);
3923			pr_info("%s: clone image has been flattened\n",
3924				rbd_dev->disk->disk_name);
3925		}
3926
3927		goto out;	/* No parent?  No problem. */
3928	}
3929
3930	/* The ceph file layout needs to fit pool id in 32 bits */
3931
3932	ret = -EIO;
3933	if (pool_id > (u64)U32_MAX) {
3934		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3935			(unsigned long long)pool_id, U32_MAX);
3936		goto out_err;
3937	}
3938
3939	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3940	if (IS_ERR(image_id)) {
3941		ret = PTR_ERR(image_id);
3942		goto out_err;
3943	}
3944	ceph_decode_64_safe(&p, end, snap_id, out_err);
3945	ceph_decode_64_safe(&p, end, overlap, out_err);
3946
3947	/*
3948	 * The parent won't change (except when the clone is
3949	 * flattened, already handled that).  So we only need to
3950	 * record the parent spec if we have not already done so.
3951	 */
3952	if (!rbd_dev->parent_spec) {
3953		parent_spec->pool_id = pool_id;
3954		parent_spec->image_id = image_id;
3955		parent_spec->snap_id = snap_id;
3956		rbd_dev->parent_spec = parent_spec;
3957		parent_spec = NULL;	/* rbd_dev now owns this */
3958	}
3959
3960	/*
3961	 * We always update the parent overlap.  If it's zero we
3962	 * treat it specially.
3963	 */
3964	rbd_dev->parent_overlap = overlap;
3965	smp_mb();
3966	if (!overlap) {
3967
3968		/* A null parent_spec indicates it's the initial probe */
3969
3970		if (parent_spec) {
3971			/*
3972			 * The overlap has become zero, so the clone
3973			 * must have been resized down to 0 at some
3974			 * point.  Treat this the same as a flatten.
3975			 */
3976			rbd_dev_parent_put(rbd_dev);
3977			pr_info("%s: clone image now standalone\n",
3978				rbd_dev->disk->disk_name);
3979		} else {
3980			/*
3981			 * For the initial probe, if we find the
3982			 * overlap is zero we just pretend there was
3983			 * no parent image.
3984			 */
3985			rbd_warn(rbd_dev, "ignoring parent of "
3986						"clone with overlap 0\n");
3987		}
3988	}
3989out:
3990	ret = 0;
3991out_err:
3992	kfree(reply_buf);
3993	rbd_spec_put(parent_spec);
3994
3995	return ret;
3996}
3997
3998static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3999{
4000	struct {
4001		__le64 stripe_unit;
4002		__le64 stripe_count;
4003	} __attribute__ ((packed)) striping_info_buf = { 0 };
4004	size_t size = sizeof (striping_info_buf);
4005	void *p;
4006	u64 obj_size;
4007	u64 stripe_unit;
4008	u64 stripe_count;
4009	int ret;
4010
4011	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4012				"rbd", "get_stripe_unit_count", NULL, 0,
4013				(char *)&striping_info_buf, size);
4014	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4015	if (ret < 0)
4016		return ret;
4017	if (ret < size)
4018		return -ERANGE;
4019
4020	/*
4021	 * We don't actually support the "fancy striping" feature
4022	 * (STRIPINGV2) yet, but if the striping sizes are the
4023	 * defaults the behavior is the same as before.  So find
4024	 * out, and only fail if the image has non-default values.
4025	 */
4026	ret = -EINVAL;
4027	obj_size = (u64)1 << rbd_dev->header.obj_order;
4028	p = &striping_info_buf;
4029	stripe_unit = ceph_decode_64(&p);
4030	if (stripe_unit != obj_size) {
4031		rbd_warn(rbd_dev, "unsupported stripe unit "
4032				"(got %llu want %llu)",
4033				stripe_unit, obj_size);
4034		return -EINVAL;
4035	}
4036	stripe_count = ceph_decode_64(&p);
4037	if (stripe_count != 1) {
4038		rbd_warn(rbd_dev, "unsupported stripe count "
4039				"(got %llu want 1)", stripe_count);
4040		return -EINVAL;
4041	}
4042	rbd_dev->header.stripe_unit = stripe_unit;
4043	rbd_dev->header.stripe_count = stripe_count;
4044
4045	return 0;
4046}
4047
4048static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4049{
4050	size_t image_id_size;
4051	char *image_id;
4052	void *p;
4053	void *end;
4054	size_t size;
4055	void *reply_buf = NULL;
4056	size_t len = 0;
4057	char *image_name = NULL;
4058	int ret;
4059
4060	rbd_assert(!rbd_dev->spec->image_name);
4061
4062	len = strlen(rbd_dev->spec->image_id);
4063	image_id_size = sizeof (__le32) + len;
4064	image_id = kmalloc(image_id_size, GFP_KERNEL);
4065	if (!image_id)
4066		return NULL;
4067
4068	p = image_id;
4069	end = image_id + image_id_size;
4070	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
4071
4072	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4073	reply_buf = kmalloc(size, GFP_KERNEL);
4074	if (!reply_buf)
4075		goto out;
4076
4077	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
4078				"rbd", "dir_get_name",
4079				image_id, image_id_size,
4080				reply_buf, size);
4081	if (ret < 0)
4082		goto out;
4083	p = reply_buf;
4084	end = reply_buf + ret;
4085
4086	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4087	if (IS_ERR(image_name))
4088		image_name = NULL;
4089	else
4090		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4091out:
4092	kfree(reply_buf);
4093	kfree(image_id);
4094
4095	return image_name;
4096}
4097
4098static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4099{
4100	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4101	const char *snap_name;
4102	u32 which = 0;
4103
4104	/* Skip over names until we find the one we are looking for */
4105
4106	snap_name = rbd_dev->header.snap_names;
4107	while (which < snapc->num_snaps) {
4108		if (!strcmp(name, snap_name))
4109			return snapc->snaps[which];
4110		snap_name += strlen(snap_name) + 1;
4111		which++;
4112	}
4113	return CEPH_NOSNAP;
4114}
4115
4116static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4117{
4118	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4119	u32 which;
4120	bool found = false;
4121	u64 snap_id;
4122
4123	for (which = 0; !found && which < snapc->num_snaps; which++) {
4124		const char *snap_name;
4125
4126		snap_id = snapc->snaps[which];
4127		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4128		if (IS_ERR(snap_name)) {
4129			/* ignore no-longer existing snapshots */
4130			if (PTR_ERR(snap_name) == -ENOENT)
4131				continue;
4132			else
4133				break;
4134		}
4135		found = !strcmp(name, snap_name);
4136		kfree(snap_name);
4137	}
4138	return found ? snap_id : CEPH_NOSNAP;
4139}
4140
4141/*
4142 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4143 * no snapshot by that name is found, or if an error occurs.
4144 */
4145static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4146{
4147	if (rbd_dev->image_format == 1)
4148		return rbd_v1_snap_id_by_name(rbd_dev, name);
4149
4150	return rbd_v2_snap_id_by_name(rbd_dev, name);
4151}
4152
4153/*
4154 * When an rbd image has a parent image, it is identified by the
4155 * pool, image, and snapshot ids (not names).  This function fills
4156 * in the names for those ids.  (It's OK if we can't figure out the
4157 * name for an image id, but the pool and snapshot ids should always
4158 * exist and have names.)  All names in an rbd spec are dynamically
4159 * allocated.
4160 *
4161 * When an image being mapped (not a parent) is probed, we have the
4162 * pool name and pool id, image name and image id, and the snapshot
4163 * name.  The only thing we're missing is the snapshot id.
4164 */
4165static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
4166{
4167	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4168	struct rbd_spec *spec = rbd_dev->spec;
4169	const char *pool_name;
4170	const char *image_name;
4171	const char *snap_name;
4172	int ret;
4173
4174	/*
4175	 * An image being mapped will have the pool name (etc.), but
4176	 * we need to look up the snapshot id.
4177	 */
4178	if (spec->pool_name) {
4179		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4180			u64 snap_id;
4181
4182			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4183			if (snap_id == CEPH_NOSNAP)
4184				return -ENOENT;
4185			spec->snap_id = snap_id;
4186		} else {
4187			spec->snap_id = CEPH_NOSNAP;
4188		}
4189
4190		return 0;
4191	}
4192
4193	/* Get the pool name; we have to make our own copy of this */
4194
4195	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4196	if (!pool_name) {
4197		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4198		return -EIO;
4199	}
4200	pool_name = kstrdup(pool_name, GFP_KERNEL);
4201	if (!pool_name)
4202		return -ENOMEM;
4203
4204	/* Fetch the image name; tolerate failure here */
4205
4206	image_name = rbd_dev_image_name(rbd_dev);
4207	if (!image_name)
4208		rbd_warn(rbd_dev, "unable to get image name");
4209
4210	/* Look up the snapshot name, and make a copy */
4211
4212	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4213	if (IS_ERR(snap_name)) {
4214		ret = PTR_ERR(snap_name);
4215		goto out_err;
4216	}
4217
4218	spec->pool_name = pool_name;
4219	spec->image_name = image_name;
4220	spec->snap_name = snap_name;
4221
4222	return 0;
4223out_err:
4224	kfree(image_name);
4225	kfree(pool_name);
4226
4227	return ret;
4228}
4229
4230static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
4231{
4232	size_t size;
4233	int ret;
4234	void *reply_buf;
4235	void *p;
4236	void *end;
4237	u64 seq;
4238	u32 snap_count;
4239	struct ceph_snap_context *snapc;
4240	u32 i;
4241
4242	/*
4243	 * We'll need room for the seq value (maximum snapshot id),
4244	 * snapshot count, and array of that many snapshot ids.
4245	 * For now we have a fixed upper limit on the number we're
4246	 * prepared to receive.
4247	 */
4248	size = sizeof (__le64) + sizeof (__le32) +
4249			RBD_MAX_SNAP_COUNT * sizeof (__le64);
4250	reply_buf = kzalloc(size, GFP_KERNEL);
4251	if (!reply_buf)
4252		return -ENOMEM;
4253
4254	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4255				"rbd", "get_snapcontext", NULL, 0,
4256				reply_buf, size);
4257	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4258	if (ret < 0)
4259		goto out;
4260
4261	p = reply_buf;
4262	end = reply_buf + ret;
4263	ret = -ERANGE;
4264	ceph_decode_64_safe(&p, end, seq, out);
4265	ceph_decode_32_safe(&p, end, snap_count, out);
4266
4267	/*
4268	 * Make sure the reported number of snapshot ids wouldn't go
4269	 * beyond the end of our buffer.  But before checking that,
4270	 * make sure the computed size of the snapshot context we
4271	 * allocate is representable in a size_t.
4272	 */
4273	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4274				 / sizeof (u64)) {
4275		ret = -EINVAL;
4276		goto out;
4277	}
4278	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4279		goto out;
4280	ret = 0;
4281
4282	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
4283	if (!snapc) {
4284		ret = -ENOMEM;
4285		goto out;
4286	}
4287	snapc->seq = seq;
4288	for (i = 0; i < snap_count; i++)
4289		snapc->snaps[i] = ceph_decode_64(&p);
4290
4291	ceph_put_snap_context(rbd_dev->header.snapc);
4292	rbd_dev->header.snapc = snapc;
4293
4294	dout("  snap context seq = %llu, snap_count = %u\n",
4295		(unsigned long long)seq, (unsigned int)snap_count);
4296out:
4297	kfree(reply_buf);
4298
4299	return ret;
4300}
4301
4302static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4303					u64 snap_id)
4304{
4305	size_t size;
4306	void *reply_buf;
4307	__le64 snapid;
4308	int ret;
4309	void *p;
4310	void *end;
4311	char *snap_name;
4312
4313	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4314	reply_buf = kmalloc(size, GFP_KERNEL);
4315	if (!reply_buf)
4316		return ERR_PTR(-ENOMEM);
4317
4318	snapid = cpu_to_le64(snap_id);
4319	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4320				"rbd", "get_snapshot_name",
4321				&snapid, sizeof (snapid),
4322				reply_buf, size);
4323	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4324	if (ret < 0) {
4325		snap_name = ERR_PTR(ret);
4326		goto out;
4327	}
4328
4329	p = reply_buf;
4330	end = reply_buf + ret;
4331	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4332	if (IS_ERR(snap_name))
4333		goto out;
4334
4335	dout("  snap_id 0x%016llx snap_name = %s\n",
4336		(unsigned long long)snap_id, snap_name);
4337out:
4338	kfree(reply_buf);
4339
4340	return snap_name;
4341}
4342
4343static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4344{
4345	bool first_time = rbd_dev->header.object_prefix == NULL;
4346	int ret;
4347
4348	ret = rbd_dev_v2_image_size(rbd_dev);
4349	if (ret)
4350		return ret;
4351
4352	if (first_time) {
4353		ret = rbd_dev_v2_header_onetime(rbd_dev);
4354		if (ret)
4355			return ret;
4356	}
4357
4358	/*
4359	 * If the image supports layering, get the parent info.  We
4360	 * need to probe the first time regardless.  Thereafter we
4361	 * only need to if there's a parent, to see if it has
4362	 * disappeared due to the mapped image getting flattened.
4363	 */
4364	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4365			(first_time || rbd_dev->parent_spec)) {
4366		bool warn;
4367
4368		ret = rbd_dev_v2_parent_info(rbd_dev);
4369		if (ret)
4370			return ret;
4371
4372		/*
4373		 * Print a warning if this is the initial probe and
4374		 * the image has a parent.  Don't print it if the
4375		 * image now being probed is itself a parent.  We
4376		 * can tell at this point because we won't know its
4377		 * pool name yet (just its pool id).
4378		 */
4379		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4380		if (first_time && warn)
4381			rbd_warn(rbd_dev, "WARNING: kernel layering "
4382					"is EXPERIMENTAL!");
4383	}
4384
4385	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
4386		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
4387			rbd_dev->mapping.size = rbd_dev->header.image_size;
4388
4389	ret = rbd_dev_v2_snap_context(rbd_dev);
4390	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4391
4392	return ret;
4393}
4394
4395static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4396{
4397	struct device *dev;
4398	int ret;
4399
4400	dev = &rbd_dev->dev;
4401	dev->bus = &rbd_bus_type;
4402	dev->type = &rbd_device_type;
4403	dev->parent = &rbd_root_dev;
4404	dev->release = rbd_dev_device_release;
4405	dev_set_name(dev, "%d", rbd_dev->dev_id);
4406	ret = device_register(dev);
4407
4408	return ret;
4409}
4410
4411static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4412{
4413	device_unregister(&rbd_dev->dev);
4414}
4415
4416/*
4417 * Get a unique rbd identifier for the given new rbd_dev, and add
4418 * the rbd_dev to the global list.
4419 */
4420static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4421{
4422	int new_dev_id;
4423
4424	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
4425				    0, minor_to_rbd_dev_id(1 << MINORBITS),
4426				    GFP_KERNEL);
4427	if (new_dev_id < 0)
4428		return new_dev_id;
4429
4430	rbd_dev->dev_id = new_dev_id;
4431
4432	spin_lock(&rbd_dev_list_lock);
4433	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4434	spin_unlock(&rbd_dev_list_lock);
4435
4436	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4437
4438	return 0;
4439}
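/*
 * In single_major mode the id picked here also fixes the device's minor
 * range: rbd_dev_id_to_minor() shifts the id by RBD_SINGLE_MAJOR_PART_SHIFT
 * (4), so device id N gets minors N*16 .. N*16+15 (the whole disk plus up
 * to 15 partitions), and the upper bound passed to ida_simple_get() above
 * keeps the largest resulting minor below 1 << MINORBITS.
 */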
4440
4441/*
4442 * Remove an rbd_dev from the global list, and record that its
4443 * identifier is no longer in use.
4444 */
4445static void rbd_dev_id_put(struct rbd_device *rbd_dev)
4446{
4447	spin_lock(&rbd_dev_list_lock);
4448	list_del_init(&rbd_dev->node);
4449	spin_unlock(&rbd_dev_list_lock);
4450
4451	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4452
4453	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4454}
4455
4456/*
4457 * Skips over white space at *buf, and updates *buf to point to the
4458 * first found non-space character (if any). Returns the length of
4459 * the token (string of non-white space characters) found.  Note
4460 * that *buf must be terminated with '\0'.
4461 */
4462static inline size_t next_token(const char **buf)
4463{
4464	/*
4465	 * These are the characters that produce nonzero for
4466	 * isspace() in the "C" and "POSIX" locales.
4467	 */
4468	const char *spaces = " \f\n\r\t\v";
4469
4470	*buf += strspn(*buf, spaces);	/* Find start of token */
4471
4472	return strcspn(*buf, spaces);	/* Return token length */
4473}
4474
4475/*
4476 * Finds the next token in *buf, and if the provided token buffer is
4477 * big enough, copies the found token into it.  The result, if
4478 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
4479 * must be terminated with '\0' on entry.
4480 *
4481 * Returns the length of the token found (not including the '\0').
4482 * Return value will be 0 if no token is found, and it will be >=
4483 * token_size if the token would not fit.
4484 *
4485 * The *buf pointer will be updated to point beyond the end of the
4486 * found token.  Note that this occurs even if the token buffer is
4487 * too small to hold it.
4488 */
4489static inline size_t copy_token(const char **buf,
4490				char *token,
4491				size_t token_size)
4492{
4493	size_t len;
4494
4495	len = next_token(buf);
4496	if (len < token_size) {
4497		memcpy(token, *buf, len);
4498		*(token + len) = '\0';
4499	}
4500	*buf += len;
4501
4502	return len;
4503}
4504
4505/*
4506 * Finds the next token in *buf, dynamically allocates a buffer big
4507 * enough to hold a copy of it, and copies the token into the new
4508 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4509 * that a duplicate buffer is created even for a zero-length token.
4510 *
4511 * Returns a pointer to the newly-allocated duplicate, or a null
4512 * pointer if memory for the duplicate was not available.  If
4513 * the lenp argument is a non-null pointer, the length of the token
4514 * (not including the '\0') is returned in *lenp.
4515 *
4516 * If successful, the *buf pointer will be updated to point beyond
4517 * the end of the found token.
4518 *
4519 * Note: uses GFP_KERNEL for allocation.
4520 */
4521static inline char *dup_token(const char **buf, size_t *lenp)
4522{
4523	char *dup;
4524	size_t len;
4525
4526	len = next_token(buf);
4527	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4528	if (!dup)
4529		return NULL;
4530	*(dup + len) = '\0';
4531	*buf += len;
4532
4533	if (lenp)
4534		*lenp = len;
4535
4536	return dup;
4537}
4538
4539/*
4540 * Parse the options provided for an "rbd add" (i.e., rbd image
4541 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4542 * and the data written is passed here via a NUL-terminated buffer.
4543 * Returns 0 if successful or an error code otherwise.
4544 *
4545 * The information extracted from these options is recorded in
4546 * the other parameters which return dynamically-allocated
4547 * structures:
4548 *  ceph_opts
4549 *      The address of a pointer that will refer to a ceph options
4550 *      structure.  Caller must release the returned pointer using
4551 *      ceph_destroy_options() when it is no longer needed.
4552 *  rbd_opts
4553 *	Address of an rbd options pointer.  Fully initialized by
4554 *	this function; caller must release with kfree().
4555 *  spec
4556 *	Address of an rbd image specification pointer.  Fully
4557 *	initialized by this function based on parsed options.
4558 *	Caller must release with rbd_spec_put().
4559 *
4560 * The options passed take this form:
4561 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
4562 * where:
4563 *  <mon_addrs>
4564 *      A comma-separated list of one or more monitor addresses.
4565 *      A monitor address is an ip address, optionally followed
4566 *      by a port number (separated by a colon).
4567 *        I.e.:  ip1[:port1][,ip2[:port2]...]
4568 *  <options>
4569 *      A comma-separated list of ceph and/or rbd options.
4570 *  <pool_name>
4571 *      The name of the rados pool containing the rbd image.
4572 *  <image_name>
4573 *      The name of the image in that pool to map.
4574 *  <snap_name>
4575 *      An optional snapshot name.  If provided, the mapping will
4576 *      present data from the image at the time that snapshot was
4577 *      created.  The image head is used if no snapshot name is
4578 *      provided.  Snapshot mappings are always read-only.
4579 */
4580static int rbd_add_parse_args(const char *buf,
4581				struct ceph_options **ceph_opts,
4582				struct rbd_options **opts,
4583				struct rbd_spec **rbd_spec)
4584{
4585	size_t len;
4586	char *options;
4587	const char *mon_addrs;
4588	char *snap_name;
4589	size_t mon_addrs_size;
4590	struct rbd_spec *spec = NULL;
4591	struct rbd_options *rbd_opts = NULL;
4592	struct ceph_options *copts;
4593	int ret;
4594
4595	/* The first four tokens are required */
4596
4597	len = next_token(&buf);
4598	if (!len) {
4599		rbd_warn(NULL, "no monitor address(es) provided");
4600		return -EINVAL;
4601	}
4602	mon_addrs = buf;
4603	mon_addrs_size = len + 1;
4604	buf += len;
4605
4606	ret = -EINVAL;
4607	options = dup_token(&buf, NULL);
4608	if (!options)
4609		return -ENOMEM;
4610	if (!*options) {
4611		rbd_warn(NULL, "no options provided");
4612		goto out_err;
4613	}
4614
4615	spec = rbd_spec_alloc();
4616	if (!spec)
4617		goto out_mem;
4618
4619	spec->pool_name = dup_token(&buf, NULL);
4620	if (!spec->pool_name)
4621		goto out_mem;
4622	if (!*spec->pool_name) {
4623		rbd_warn(NULL, "no pool name provided");
4624		goto out_err;
4625	}
4626
4627	spec->image_name = dup_token(&buf, NULL);
4628	if (!spec->image_name)
4629		goto out_mem;
4630	if (!*spec->image_name) {
4631		rbd_warn(NULL, "no image name provided");
4632		goto out_err;
4633	}
4634
4635	/*
4636	 * Snapshot name is optional; default is to use "-"
4637	 * (indicating the head/no snapshot).
4638	 */
4639	len = next_token(&buf);
4640	if (!len) {
4641		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4642		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4643	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4644		ret = -ENAMETOOLONG;
4645		goto out_err;
4646	}
4647	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4648	if (!snap_name)
4649		goto out_mem;
4650	*(snap_name + len) = '\0';
4651	spec->snap_name = snap_name;
4652
4653	/* Initialize all rbd options to the defaults */
4654
4655	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4656	if (!rbd_opts)
4657		goto out_mem;
4658
4659	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4660
4661	copts = ceph_parse_options(options, mon_addrs,
4662					mon_addrs + mon_addrs_size - 1,
4663					parse_rbd_opts_token, rbd_opts);
4664	if (IS_ERR(copts)) {
4665		ret = PTR_ERR(copts);
4666		goto out_err;
4667	}
4668	kfree(options);
4669
4670	*ceph_opts = copts;
4671	*opts = rbd_opts;
4672	*rbd_spec = spec;
4673
4674	return 0;
4675out_mem:
4676	ret = -ENOMEM;
4677out_err:
4678	kfree(rbd_opts);
4679	rbd_spec_put(spec);
4680	kfree(options);
4681
4682	return ret;
4683}
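/*
 * An illustrative "add" line in the format parsed above (the monitor
 * address, key, pool, image and snapshot names are examples only):
 *
 *	# echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo snap1" \
 *		> /sys/bus/rbd/add
 *
 * The trailing token is optional; when omitted the image head ("-") is
 * mapped, and when present it is matched against the image's snapshot
 * names.
 */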
4684
4685/*
4686 * An rbd format 2 image has a unique identifier, distinct from the
4687 * name given to it by the user.  Internally, that identifier is
4688 * what's used to specify the names of objects related to the image.
4689 *
4690 * A special "rbd id" object is used to map an rbd image name to its
4691 * id.  If that object doesn't exist, then there is no v2 rbd image
4692 * with the supplied name.
4693 *
4694 * This function will record the given rbd_dev's image_id field if
4695 * it can be determined, and in that case will return 0.  If any
4696 * errors occur a negative errno will be returned and the rbd_dev's
4697 * image_id field will be unchanged (and should be NULL).
4698 */
4699static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4700{
4701	int ret;
4702	size_t size;
4703	char *object_name;
4704	void *response;
4705	char *image_id;
4706
4707	/*
4708	 * When probing a parent image, the image id is already
4709	 * known (and the image name likely is not).  There's no
4710	 * need to fetch the image id again in this case.  We
4711	 * do still need to set the image format though.
4712	 */
4713	if (rbd_dev->spec->image_id) {
4714		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4715
4716		return 0;
4717	}
4718
4719	/*
4720	 * First, see if the format 2 image id file exists, and if
4721	 * so, get the image's persistent id from it.
4722	 */
4723	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4724	object_name = kmalloc(size, GFP_NOIO);
4725	if (!object_name)
4726		return -ENOMEM;
4727	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4728	dout("rbd id object name is %s\n", object_name);
4729
4730	/* Response will be an encoded string, which includes a length */
4731
4732	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4733	response = kzalloc(size, GFP_NOIO);
4734	if (!response) {
4735		ret = -ENOMEM;
4736		goto out;
4737	}
4738
4739	/* If it doesn't exist we'll assume it's a format 1 image */
4740
4741	ret = rbd_obj_method_sync(rbd_dev, object_name,
4742				"rbd", "get_id", NULL, 0,
4743				response, RBD_IMAGE_ID_LEN_MAX);
4744	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4745	if (ret == -ENOENT) {
4746		image_id = kstrdup("", GFP_KERNEL);
4747		ret = image_id ? 0 : -ENOMEM;
4748		if (!ret)
4749			rbd_dev->image_format = 1;
4750	} else if (ret > sizeof (__le32)) {
4751		void *p = response;
4752
4753		image_id = ceph_extract_encoded_string(&p, p + ret,
4754						NULL, GFP_NOIO);
4755		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4756		if (!ret)
4757			rbd_dev->image_format = 2;
4758	} else {
4759		ret = -EINVAL;
4760	}
4761
4762	if (!ret) {
4763		rbd_dev->spec->image_id = image_id;
4764		dout("image_id is %s\n", image_id);
4765	}
4766out:
4767	kfree(response);
4768	kfree(object_name);
4769
4770	return ret;
4771}
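/*
 * Example of the lookup above: for an image named "foo" the id object
 * consulted is "rbd_id.foo" (RBD_ID_PREFIX comes from rbd_types.h).  A
 * format 1 image has no such object, which is why -ENOENT is taken to
 * mean "format 1" rather than being treated as an error.
 */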
4772
4773/*
4774 * Undo whatever state changes are made by v1 or v2 header info
4775 * call.
4776 */
4777static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4778{
4779	struct rbd_image_header	*header;
4780
4781	/* Drop parent reference unless it's already been done (or none) */
4782
4783	if (rbd_dev->parent_overlap)
4784		rbd_dev_parent_put(rbd_dev);
4785
4786	/* Free dynamic fields from the header, then zero it out */
4787
4788	header = &rbd_dev->header;
4789	ceph_put_snap_context(header->snapc);
4790	kfree(header->snap_sizes);
4791	kfree(header->snap_names);
4792	kfree(header->object_prefix);
4793	memset(header, 0, sizeof (*header));
4794}
4795
4796static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4797{
4798	int ret;
4799
4800	ret = rbd_dev_v2_object_prefix(rbd_dev);
4801	if (ret)
4802		goto out_err;
4803
4804	/*
4805	 * Get and check the features for the image.  Currently the
4806	 * features are assumed to never change.
4807	 */
4808	ret = rbd_dev_v2_features(rbd_dev);
4809	if (ret)
4810		goto out_err;
4811
4812	/* If the image supports fancy striping, get its parameters */
4813
4814	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4815		ret = rbd_dev_v2_striping_info(rbd_dev);
4816		if (ret < 0)
4817			goto out_err;
4818	}
4819	/* No support for crypto and compression types in format 2 images */
4820
4821	return 0;
4822out_err:
4823	rbd_dev->header.features = 0;
4824	kfree(rbd_dev->header.object_prefix);
4825	rbd_dev->header.object_prefix = NULL;
4826
4827	return ret;
4828}
4829
4830static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
4831{
4832	struct rbd_device *parent = NULL;
4833	struct rbd_spec *parent_spec;
4834	struct rbd_client *rbdc;
4835	int ret;
4836
4837	if (!rbd_dev->parent_spec)
4838		return 0;
4839	/*
4840	 * We need to pass a reference to the client and the parent
4841	 * spec when creating the parent rbd_dev.  Images related by
4842	 * parent/child relationships always share both.
4843	 */
4844	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4845	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4846
4847	ret = -ENOMEM;
4848	parent = rbd_dev_create(rbdc, parent_spec);
4849	if (!parent)
4850		goto out_err;
4851
4852	ret = rbd_dev_image_probe(parent, false);
4853	if (ret < 0)
4854		goto out_err;
4855	rbd_dev->parent = parent;
4856	atomic_set(&rbd_dev->parent_ref, 1);
4857
4858	return 0;
4859out_err:
4860	if (parent) {
4861		rbd_dev_unparent(rbd_dev);
4862		kfree(rbd_dev->header_name);
4863		rbd_dev_destroy(parent);
4864	} else {
4865		rbd_put_client(rbdc);
4866		rbd_spec_put(parent_spec);
4867	}
4868
4869	return ret;
4870}
4871
4872static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4873{
4874	int ret;
4875
4876	/* Get an id and fill in device name. */
4877
4878	ret = rbd_dev_id_get(rbd_dev);
4879	if (ret)
4880		return ret;
4881
4882	BUILD_BUG_ON(DEV_NAME_LEN
4883			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4884	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4885
4886	/* Record our major and minor device numbers. */
4887
4888	if (!single_major) {
4889		ret = register_blkdev(0, rbd_dev->name);
4890		if (ret < 0)
4891			goto err_out_id;
4892
4893		rbd_dev->major = ret;
4894		rbd_dev->minor = 0;
4895	} else {
4896		rbd_dev->major = rbd_major;
4897		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
4898	}
4899
4900	/* Set up the blkdev mapping. */
4901
4902	ret = rbd_init_disk(rbd_dev);
4903	if (ret)
4904		goto err_out_blkdev;
4905
4906	ret = rbd_dev_mapping_set(rbd_dev);
4907	if (ret)
4908		goto err_out_disk;
4909	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4910
4911	ret = rbd_bus_add_dev(rbd_dev);
4912	if (ret)
4913		goto err_out_mapping;
4914
4915	/* Everything's ready.  Announce the disk to the world. */
4916
4917	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4918	add_disk(rbd_dev->disk);
4919
4920	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4921		(unsigned long long) rbd_dev->mapping.size);
4922
4923	return ret;
4924
4925err_out_mapping:
4926	rbd_dev_mapping_clear(rbd_dev);
4927err_out_disk:
4928	rbd_free_disk(rbd_dev);
4929err_out_blkdev:
4930	if (!single_major)
4931		unregister_blkdev(rbd_dev->major, rbd_dev->name);
4932err_out_id:
4933	rbd_dev_id_put(rbd_dev);
4934	rbd_dev_mapping_clear(rbd_dev);
4935
4936	return ret;
4937}
4938
4939static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4940{
4941	struct rbd_spec *spec = rbd_dev->spec;
4942	size_t size;
4943
4944	/* Record the header object name for this rbd image. */
4945
4946	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4947
4948	if (rbd_dev->image_format == 1)
4949		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4950	else
4951		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4952
4953	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4954	if (!rbd_dev->header_name)
4955		return -ENOMEM;
4956
4957	if (rbd_dev->image_format == 1)
4958		sprintf(rbd_dev->header_name, "%s%s",
4959			spec->image_name, RBD_SUFFIX);
4960	else
4961		sprintf(rbd_dev->header_name, "%s%s",
4962			RBD_HEADER_PREFIX, spec->image_id);
4963	return 0;
4964}
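/*
 * Illustrative header object names produced above (RBD_SUFFIX and
 * RBD_HEADER_PREFIX come from rbd_types.h):
 *
 *	format 1, image name "foo"		-> "foo.rbd"
 *	format 2, image id "1014b2ae8944a"	-> "rbd_header.1014b2ae8944a"
 */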
4965
4966static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4967{
4968	rbd_dev_unprobe(rbd_dev);
4969	kfree(rbd_dev->header_name);
4970	rbd_dev->header_name = NULL;
4971	rbd_dev->image_format = 0;
4972	kfree(rbd_dev->spec->image_id);
4973	rbd_dev->spec->image_id = NULL;
4974
4975	rbd_dev_destroy(rbd_dev);
4976}
4977
4978/*
4979 * Probe for the existence of the header object for the given rbd
4980 * device.  If this image is the one being mapped (i.e., not a
4981 * parent), initiate a watch on its header object before using that
4982 * object to get detailed information about the rbd image.
4983 */
4984static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
4985{
4986	int ret;
4987
4988	/*
4989	 * Get the id from the image id object.  Unless there's an
4990	 * error, rbd_dev->spec->image_id will be filled in with
4991	 * a dynamically-allocated string, and rbd_dev->image_format
4992	 * will be set to either 1 or 2.
4993	 */
4994	ret = rbd_dev_image_id(rbd_dev);
4995	if (ret)
4996		return ret;
4997	rbd_assert(rbd_dev->spec->image_id);
4998	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4999
5000	ret = rbd_dev_header_name(rbd_dev);
5001	if (ret)
5002		goto err_out_format;
5003
5004	if (mapping) {
5005		ret = rbd_dev_header_watch_sync(rbd_dev);
5006		if (ret)
5007			goto out_header_name;
5008	}
5009
5010	if (rbd_dev->image_format == 1)
5011		ret = rbd_dev_v1_header_info(rbd_dev);
5012	else
5013		ret = rbd_dev_v2_header_info(rbd_dev);
5014	if (ret)
5015		goto err_out_watch;
5016
5017	ret = rbd_dev_spec_update(rbd_dev);
5018	if (ret)
5019		goto err_out_probe;
5020
5021	ret = rbd_dev_probe_parent(rbd_dev);
5022	if (ret)
5023		goto err_out_probe;
5024
5025	dout("discovered format %u image, header name is %s\n",
5026		rbd_dev->image_format, rbd_dev->header_name);
5027
5028	return 0;
5029err_out_probe:
5030	rbd_dev_unprobe(rbd_dev);
5031err_out_watch:
5032	if (mapping)
5033		rbd_dev_header_unwatch_sync(rbd_dev);
5034out_header_name:
5035	kfree(rbd_dev->header_name);
5036	rbd_dev->header_name = NULL;
5037err_out_format:
5038	rbd_dev->image_format = 0;
5039	kfree(rbd_dev->spec->image_id);
5040	rbd_dev->spec->image_id = NULL;
5041
5042	dout("probe failed, returning %d\n", ret);
5043
5044	return ret;
5045}
5046
5047static ssize_t do_rbd_add(struct bus_type *bus,
5048			  const char *buf,
5049			  size_t count)
5050{
5051	struct rbd_device *rbd_dev = NULL;
5052	struct ceph_options *ceph_opts = NULL;
5053	struct rbd_options *rbd_opts = NULL;
5054	struct rbd_spec *spec = NULL;
5055	struct rbd_client *rbdc;
5056	struct ceph_osd_client *osdc;
5057	bool read_only;
5058	int rc = -ENOMEM;
5059
5060	if (!try_module_get(THIS_MODULE))
5061		return -ENODEV;
5062
5063	/* parse add command */
5064	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5065	if (rc < 0)
5066		goto err_out_module;
5067	read_only = rbd_opts->read_only;
5068	kfree(rbd_opts);
5069	rbd_opts = NULL;	/* done with this */
5070
5071	rbdc = rbd_get_client(ceph_opts);
5072	if (IS_ERR(rbdc)) {
5073		rc = PTR_ERR(rbdc);
5074		goto err_out_args;
5075	}
5076
5077	/* pick the pool */
5078	osdc = &rbdc->client->osdc;
5079	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
5080	if (rc < 0)
5081		goto err_out_client;
5082	spec->pool_id = (u64)rc;
5083
5084	/* The ceph file layout needs to fit pool id in 32 bits */
5085
5086	if (spec->pool_id > (u64)U32_MAX) {
5087		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5088				(unsigned long long)spec->pool_id, U32_MAX);
5089		rc = -EIO;
5090		goto err_out_client;
5091	}
5092
5093	rbd_dev = rbd_dev_create(rbdc, spec);
5094	if (!rbd_dev)
5095		goto err_out_client;
5096	rbdc = NULL;		/* rbd_dev now owns this */
5097	spec = NULL;		/* rbd_dev now owns this */
5098
5099	rc = rbd_dev_image_probe(rbd_dev, true);
5100	if (rc < 0)
5101		goto err_out_rbd_dev;
5102
5103	/* If we are mapping a snapshot it must be marked read-only */
5104
5105	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
5106		read_only = true;
5107	rbd_dev->mapping.read_only = read_only;
5108
5109	rc = rbd_dev_device_setup(rbd_dev);
5110	if (rc) {
5111		/*
5112		 * rbd_dev_header_unwatch_sync() can't be moved into
5113		 * rbd_dev_image_release() without refactoring, see
5114		 * commit 1f3ef78861ac.
5115		 */
5116		rbd_dev_header_unwatch_sync(rbd_dev);
5117		rbd_dev_image_release(rbd_dev);
5118		goto err_out_module;
5119	}
5120
5121	return count;
5122
5123err_out_rbd_dev:
5124	rbd_dev_destroy(rbd_dev);
5125err_out_client:
5126	rbd_put_client(rbdc);
5127err_out_args:
5128	rbd_spec_put(spec);
5129err_out_module:
5130	module_put(THIS_MODULE);
5131
5132	dout("Error adding device %s\n", buf);
5133
5134	return (ssize_t)rc;
5135}
5136
5137static ssize_t rbd_add(struct bus_type *bus,
5138		       const char *buf,
5139		       size_t count)
5140{
5141	if (single_major)
5142		return -EINVAL;
5143
5144	return do_rbd_add(bus, buf, count);
5145}
5146
5147static ssize_t rbd_add_single_major(struct bus_type *bus,
5148				    const char *buf,
5149				    size_t count)
5150{
5151	return do_rbd_add(bus, buf, count);
5152}
5153
5154static void rbd_dev_device_release(struct device *dev)
5155{
5156	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5157
5158	rbd_free_disk(rbd_dev);
5159	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5160	rbd_dev_mapping_clear(rbd_dev);
5161	if (!single_major)
5162		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5163	rbd_dev_id_put(rbd_dev);
5164	rbd_dev_mapping_clear(rbd_dev);
5165}
5166
5167static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5168{
5169	while (rbd_dev->parent) {
5170		struct rbd_device *first = rbd_dev;
5171		struct rbd_device *second = first->parent;
5172		struct rbd_device *third;
5173
5174		/*
5175		 * Follow to the parent with no grandparent and
5176		 * remove it.
5177		 */
5178		while (second && (third = second->parent)) {
5179			first = second;
5180			second = third;
5181		}
5182		rbd_assert(second);
5183		rbd_dev_image_release(second);
5184		first->parent = NULL;
5185		first->parent_overlap = 0;
5186
5187		rbd_assert(first->parent_spec);
5188		rbd_spec_put(first->parent_spec);
5189		first->parent_spec = NULL;
5190	}
5191}
5192
5193static ssize_t do_rbd_remove(struct bus_type *bus,
5194			     const char *buf,
5195			     size_t count)
5196{
5197	struct rbd_device *rbd_dev = NULL;
5198	struct list_head *tmp;
5199	int dev_id;
5200	unsigned long ul;
5201	bool already = false;
5202	int ret;
5203
5204	ret = kstrtoul(buf, 10, &ul);
5205	if (ret)
5206		return ret;
5207
5208	/* convert to int; abort if we lost anything in the conversion */
5209	dev_id = (int)ul;
5210	if (dev_id != ul)
5211		return -EINVAL;
5212
5213	ret = -ENOENT;
5214	spin_lock(&rbd_dev_list_lock);
5215	list_for_each(tmp, &rbd_dev_list) {
5216		rbd_dev = list_entry(tmp, struct rbd_device, node);
5217		if (rbd_dev->dev_id == dev_id) {
5218			ret = 0;
5219			break;
5220		}
5221	}
5222	if (!ret) {
5223		spin_lock_irq(&rbd_dev->lock);
5224		if (rbd_dev->open_count)
5225			ret = -EBUSY;
5226		else
5227			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5228							&rbd_dev->flags);
5229		spin_unlock_irq(&rbd_dev->lock);
5230	}
5231	spin_unlock(&rbd_dev_list_lock);
5232	if (ret < 0 || already)
5233		return ret;
5234
5235	rbd_dev_header_unwatch_sync(rbd_dev);
5236	/*
5237	 * flush remaining watch callbacks - these must be complete
5238	 * before the osd_client is shutdown
5239	 */
5240	dout("%s: flushing notifies", __func__);
5241	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5242
5243	/*
5244	 * Don't free anything from rbd_dev->disk until after all
5245	 * notifies are completely processed. Otherwise
5246	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
5247	 * in a potential use after free of rbd_dev->disk or rbd_dev.
5248	 */
5249	rbd_bus_del_dev(rbd_dev);
5250	rbd_dev_image_release(rbd_dev);
5251	module_put(THIS_MODULE);
5252
5253	return count;
5254}
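/*
 * Unmapping mirrors the "add" interface; for example, removing the
 * device with id 0 (assuming nothing holds it open):
 *
 *	# echo 0 > /sys/bus/rbd/remove
 *
 * A mapping with a non-zero open_count fails with -EBUSY, as checked
 * above.
 */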
5255
5256static ssize_t rbd_remove(struct bus_type *bus,
5257			  const char *buf,
5258			  size_t count)
5259{
5260	if (single_major)
5261		return -EINVAL;
5262
5263	return do_rbd_remove(bus, buf, count);
5264}
5265
5266static ssize_t rbd_remove_single_major(struct bus_type *bus,
5267				       const char *buf,
5268				       size_t count)
5269{
5270	return do_rbd_remove(bus, buf, count);
5271}
5272
5273/*
5274 * create control files in sysfs
5275 * /sys/bus/rbd/...
5276 */
5277static int rbd_sysfs_init(void)
5278{
5279	int ret;
5280
5281	ret = device_register(&rbd_root_dev);
5282	if (ret < 0)
5283		return ret;
5284
5285	ret = bus_register(&rbd_bus_type);
5286	if (ret < 0)
5287		device_unregister(&rbd_root_dev);
5288
5289	return ret;
5290}
5291
5292static void rbd_sysfs_cleanup(void)
5293{
5294	bus_unregister(&rbd_bus_type);
5295	device_unregister(&rbd_root_dev);
5296}
5297
5298static int rbd_slab_init(void)
5299{
5300	rbd_assert(!rbd_img_request_cache);
5301	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5302					sizeof (struct rbd_img_request),
5303					__alignof__(struct rbd_img_request),
5304					0, NULL);
5305	if (!rbd_img_request_cache)
5306		return -ENOMEM;
5307
5308	rbd_assert(!rbd_obj_request_cache);
5309	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5310					sizeof (struct rbd_obj_request),
5311					__alignof__(struct rbd_obj_request),
5312					0, NULL);
5313	if (!rbd_obj_request_cache)
5314		goto out_err;
5315
5316	rbd_assert(!rbd_segment_name_cache);
5317	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
5318					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
5319	if (rbd_segment_name_cache)
5320		return 0;
5321out_err:
5322	if (rbd_obj_request_cache) {
5323		kmem_cache_destroy(rbd_obj_request_cache);
5324		rbd_obj_request_cache = NULL;
5325	}
5326
5327	kmem_cache_destroy(rbd_img_request_cache);
5328	rbd_img_request_cache = NULL;
5329
5330	return -ENOMEM;
5331}
5332
5333static void rbd_slab_exit(void)
5334{
5335	rbd_assert(rbd_segment_name_cache);
5336	kmem_cache_destroy(rbd_segment_name_cache);
5337	rbd_segment_name_cache = NULL;
5338
5339	rbd_assert(rbd_obj_request_cache);
5340	kmem_cache_destroy(rbd_obj_request_cache);
5341	rbd_obj_request_cache = NULL;
5342
5343	rbd_assert(rbd_img_request_cache);
5344	kmem_cache_destroy(rbd_img_request_cache);
5345	rbd_img_request_cache = NULL;
5346}
5347
5348static int __init rbd_init(void)
5349{
5350	int rc;
5351
5352	if (!libceph_compatible(NULL)) {
5353		rbd_warn(NULL, "libceph incompatibility (quitting)");
5354		return -EINVAL;
5355	}
5356
5357	rc = rbd_slab_init();
5358	if (rc)
5359		return rc;
5360
5361	if (single_major) {
5362		rbd_major = register_blkdev(0, RBD_DRV_NAME);
5363		if (rbd_major < 0) {
5364			rc = rbd_major;
5365			goto err_out_slab;
5366		}
5367	}
5368
5369	rc = rbd_sysfs_init();
5370	if (rc)
5371		goto err_out_blkdev;
5372
5373	if (single_major)
5374		pr_info("loaded (major %d)\n", rbd_major);
5375	else
5376		pr_info("loaded\n");
5377
5378	return 0;
5379
5380err_out_blkdev:
5381	if (single_major)
5382		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5383err_out_slab:
5384	rbd_slab_exit();
5385	return rc;
5386}
5387
5388static void __exit rbd_exit(void)
5389{
5390	rbd_sysfs_cleanup();
5391	if (single_major)
5392		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5393	rbd_slab_exit();
5394}
5395
5396module_init(rbd_init);
5397module_exit(rbd_exit);
5398
5399MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5400MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5401MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5402/* following authorship retained from original osdblk.c */
5403MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5404
5405MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
5406MODULE_LICENSE("GPL");
v5.4
   1
   2/*
   3   rbd.c -- Export ceph rados objects as a Linux block device
   4
   5
   6   based on drivers/block/osdblk.c:
   7
   8   Copyright 2009 Red Hat, Inc.
   9
  10   This program is free software; you can redistribute it and/or modify
  11   it under the terms of the GNU General Public License as published by
  12   the Free Software Foundation.
  13
  14   This program is distributed in the hope that it will be useful,
  15   but WITHOUT ANY WARRANTY; without even the implied warranty of
  16   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17   GNU General Public License for more details.
  18
  19   You should have received a copy of the GNU General Public License
  20   along with this program; see the file COPYING.  If not, write to
  21   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  22
  23
  24
  25   For usage instructions, please refer to:
  26
  27                 Documentation/ABI/testing/sysfs-bus-rbd
  28
  29 */
  30
  31#include <linux/ceph/libceph.h>
  32#include <linux/ceph/osd_client.h>
  33#include <linux/ceph/mon_client.h>
  34#include <linux/ceph/cls_lock_client.h>
  35#include <linux/ceph/striper.h>
  36#include <linux/ceph/decode.h>
  37#include <linux/parser.h>
  38#include <linux/bsearch.h>
  39
  40#include <linux/kernel.h>
  41#include <linux/device.h>
  42#include <linux/module.h>
  43#include <linux/blk-mq.h>
  44#include <linux/fs.h>
  45#include <linux/blkdev.h>
  46#include <linux/slab.h>
  47#include <linux/idr.h>
  48#include <linux/workqueue.h>
  49
  50#include "rbd_types.h"
  51
  52#define RBD_DEBUG	/* Activate rbd_assert() calls */
  53
  54/*
  55 * Increment the given counter and return its updated value.
  56 * If the counter is already 0 it will not be incremented.
  57 * If the counter is already at its maximum value returns
  58 * -EINVAL without updating it.
  59 */
  60static int atomic_inc_return_safe(atomic_t *v)
  61{
  62	unsigned int counter;
  63
  64	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
  65	if (counter <= (unsigned int)INT_MAX)
  66		return (int)counter;
  67
  68	atomic_dec(v);
  69
  70	return -EINVAL;
  71}
  72
  73/* Decrement the counter.  Return the resulting value, or -EINVAL */
  74static int atomic_dec_return_safe(atomic_t *v)
  75{
  76	int counter;
  77
  78	counter = atomic_dec_return(v);
  79	if (counter >= 0)
  80		return counter;
  81
  82	atomic_inc(v);
  83
  84	return -EINVAL;
  85}
  86
  87#define RBD_DRV_NAME "rbd"
  88
  89#define RBD_MINORS_PER_MAJOR		256
  90#define RBD_SINGLE_MAJOR_PART_SHIFT	4
  91
  92#define RBD_MAX_PARENT_CHAIN_LEN	16
  93
  94#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
  95#define RBD_MAX_SNAP_NAME_LEN	\
  96			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
  97
  98#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
  99
 100#define RBD_SNAP_HEAD_NAME	"-"
 101
 102#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
 103
 104/* This allows a single page to hold an image name sent by OSD */
 105#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
 106#define RBD_IMAGE_ID_LEN_MAX	64
 107
 108#define RBD_OBJ_PREFIX_LEN_MAX	64
 109
 110#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
 111#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
 112
 113/* Feature bits */
 114
 115#define RBD_FEATURE_LAYERING		(1ULL<<0)
 116#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
 117#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
 118#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
 119#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
 120#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
 121#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
 122#define RBD_FEATURE_OPERATIONS		(1ULL<<8)
 123
 124#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
 125				 RBD_FEATURE_STRIPINGV2 |	\
 126				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
 127				 RBD_FEATURE_OBJECT_MAP |	\
 128				 RBD_FEATURE_FAST_DIFF |	\
 129				 RBD_FEATURE_DEEP_FLATTEN |	\
 130				 RBD_FEATURE_DATA_POOL |	\
 131				 RBD_FEATURE_OPERATIONS)
 132
 133/* Features supported by this (client software) implementation. */
 134
 135#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
 136
 137/*
 138 * An RBD device name will be "rbd#", where the "rbd" comes from
 139 * RBD_DRV_NAME above, and # is a unique integer identifier.
 140 */
 141#define DEV_NAME_LEN		32
 142
 143/*
 144 * block device image metadata (in-memory version)
 145 */
 146struct rbd_image_header {
 147	/* These six fields never change for a given rbd image */
 148	char *object_prefix;
 149	__u8 obj_order;
 150	u64 stripe_unit;
 151	u64 stripe_count;
 152	s64 data_pool_id;
 153	u64 features;		/* Might be changeable someday? */
 154
 155	/* The remaining fields need to be updated occasionally */
 156	u64 image_size;
 157	struct ceph_snap_context *snapc;
 158	char *snap_names;	/* format 1 only */
 159	u64 *snap_sizes;	/* format 1 only */
 160};
 161
 162/*
 163 * An rbd image specification.
 164 *
 165 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 166 * identify an image.  Each rbd_dev structure includes a pointer to
 167 * an rbd_spec structure that encapsulates this identity.
 168 *
 169 * Each of the id's in an rbd_spec has an associated name.  For a
 170 * user-mapped image, the names are supplied and the id's associated
 171 * with them are looked up.  For a layered image, a parent image is
 172 * defined by the tuple, and the names are looked up.
 173 *
 174 * An rbd_dev structure contains a parent_spec pointer which is
 175 * non-null if the image it represents is a child in a layered
 176 * image.  This pointer will refer to the rbd_spec structure used
 177 * by the parent rbd_dev for its own identity (i.e., the structure
 178 * is shared between the parent and child).
 179 *
 180 * Since these structures are populated once, during the discovery
 181 * phase of image construction, they are effectively immutable so
 182 * we make no effort to synchronize access to them.
 183 *
 184 * Note that code herein does not assume the image name is known (it
 185 * could be a null pointer).
 186 */
 187struct rbd_spec {
 188	u64		pool_id;
 189	const char	*pool_name;
 190	const char	*pool_ns;	/* NULL if default, never "" */
 191
 192	const char	*image_id;
 193	const char	*image_name;
 194
 195	u64		snap_id;
 196	const char	*snap_name;
 197
 198	struct kref	kref;
 199};
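/*
 * For example, a mapping of image "foo" in pool "rbd" at HEAD would
 * carry pool_name "rbd", image_name "foo", snap_name RBD_SNAP_HEAD_NAME
 * ("-") and snap_id CEPH_NOSNAP, with pool_id and image_id filled in by
 * the lookups described above.
 */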
 200
 201/*
 202 * an instance of the client.  multiple devices may share an rbd client.
 203 */
 204struct rbd_client {
 205	struct ceph_client	*client;
 206	struct kref		kref;
 207	struct list_head	node;
 208};
 209
 210struct pending_result {
 211	int			result;		/* first nonzero result */
 212	int			num_pending;
 213};
 214
 215struct rbd_img_request;
 216
 217enum obj_request_type {
 218	OBJ_REQUEST_NODATA = 1,
 219	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
 220	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
 221	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
 222};
 223
 224enum obj_operation_type {
 225	OBJ_OP_READ = 1,
 226	OBJ_OP_WRITE,
 227	OBJ_OP_DISCARD,
 228	OBJ_OP_ZEROOUT,
 229};
 230
 231#define RBD_OBJ_FLAG_DELETION			(1U << 0)
 232#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
 233#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
 234#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
 235#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
 236
 237enum rbd_obj_read_state {
 238	RBD_OBJ_READ_START = 1,
 239	RBD_OBJ_READ_OBJECT,
 240	RBD_OBJ_READ_PARENT,
 241};
 242
 243/*
 244 * Writes go through the following state machine to deal with
 245 * layering:
 246 *
 247 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 248 *            .                 |                                    .
 249 *            .                 v                                    .
 250 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 251 *            .                 |                    .               .
 252 *            .                 v                    v (deep-copyup  .
 253 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 254 * flattened) v                 |                    .               .
 255 *            .                 v                    .               .
 256 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 257 *                              |                        not needed) v
 258 *                              v                                    .
 259 *                            done . . . . . . . . . . . . . . . . . .
 260 *                              ^
 261 *                              |
 262 *                     RBD_OBJ_WRITE_FLAT
 263 *
 264 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 265 * assert_exists guard is needed or not (in some cases it's not needed
 266 * even if there is a parent).
 267 */
 268enum rbd_obj_write_state {
 269	RBD_OBJ_WRITE_START = 1,
 270	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
 271	RBD_OBJ_WRITE_OBJECT,
 272	__RBD_OBJ_WRITE_COPYUP,
 273	RBD_OBJ_WRITE_COPYUP,
 274	RBD_OBJ_WRITE_POST_OBJECT_MAP,
 275};
 276
 277enum rbd_obj_copyup_state {
 278	RBD_OBJ_COPYUP_START = 1,
 279	RBD_OBJ_COPYUP_READ_PARENT,
 280	__RBD_OBJ_COPYUP_OBJECT_MAPS,
 281	RBD_OBJ_COPYUP_OBJECT_MAPS,
 282	__RBD_OBJ_COPYUP_WRITE_OBJECT,
 283	RBD_OBJ_COPYUP_WRITE_OBJECT,
 284};
 285
 286struct rbd_obj_request {
 287	struct ceph_object_extent ex;
 288	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
 289	union {
 290		enum rbd_obj_read_state	 read_state;	/* for reads */
 291		enum rbd_obj_write_state write_state;	/* for writes */
 292	};
 293
 294	struct rbd_img_request	*img_request;
 295	struct ceph_file_extent	*img_extents;
 296	u32			num_img_extents;
 297
 298	union {
 299		struct ceph_bio_iter	bio_pos;
 300		struct {
 301			struct ceph_bvec_iter	bvec_pos;
 302			u32			bvec_count;
 303			u32			bvec_idx;
 304		};
 305	};
 306
 307	enum rbd_obj_copyup_state copyup_state;
 308	struct bio_vec		*copyup_bvecs;
 309	u32			copyup_bvec_count;
 310
 311	struct list_head	osd_reqs;	/* w/ r_private_item */
 312
 313	struct mutex		state_mutex;
 314	struct pending_result	pending;
 315	struct kref		kref;
 316};
 317
 318enum img_req_flags {
 319	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
 320	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
 321};
 322
 323enum rbd_img_state {
 324	RBD_IMG_START = 1,
 325	RBD_IMG_EXCLUSIVE_LOCK,
 326	__RBD_IMG_OBJECT_REQUESTS,
 327	RBD_IMG_OBJECT_REQUESTS,
 328};
 329
 330struct rbd_img_request {
 331	struct rbd_device	*rbd_dev;
 332	enum obj_operation_type	op_type;
 333	enum obj_request_type	data_type;
 334	unsigned long		flags;
 335	enum rbd_img_state	state;
 336	union {
 337		u64			snap_id;	/* for reads */
 338		struct ceph_snap_context *snapc;	/* for writes */
 339	};
 340	union {
 341		struct request		*rq;		/* block request */
 342		struct rbd_obj_request	*obj_request;	/* obj req initiator */
 343	};
 344
 345	struct list_head	lock_item;
 346	struct list_head	object_extents;	/* obj_req.ex structs */
 347
 348	struct mutex		state_mutex;
 349	struct pending_result	pending;
 350	struct work_struct	work;
 351	int			work_result;
 352	struct kref		kref;
 353};
 354
 355#define for_each_obj_request(ireq, oreq) \
 356	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
 357#define for_each_obj_request_safe(ireq, oreq, n) \
 358	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
 359
 360enum rbd_watch_state {
 361	RBD_WATCH_STATE_UNREGISTERED,
 362	RBD_WATCH_STATE_REGISTERED,
 363	RBD_WATCH_STATE_ERROR,
 364};
 365
 366enum rbd_lock_state {
 367	RBD_LOCK_STATE_UNLOCKED,
 368	RBD_LOCK_STATE_LOCKED,
 369	RBD_LOCK_STATE_RELEASING,
 370};
 371
 372/* WatchNotify::ClientId */
 373struct rbd_client_id {
 374	u64 gid;
 375	u64 handle;
 376};
 377
 378struct rbd_mapping {
 379	u64                     size;
 380	u64                     features;
 381};
 382
 383/*
 384 * a single device
 385 */
 386struct rbd_device {
 387	int			dev_id;		/* blkdev unique id */
 388
 389	int			major;		/* blkdev assigned major */
 390	int			minor;
 391	struct gendisk		*disk;		/* blkdev's gendisk and rq */
 392
 393	u32			image_format;	/* Either 1 or 2 */
 394	struct rbd_client	*rbd_client;
 395
 396	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
 397
 398	spinlock_t		lock;		/* queue, flags, open_count */
 399
 400	struct rbd_image_header	header;
 401	unsigned long		flags;		/* possibly lock protected */
 402	struct rbd_spec		*spec;
 403	struct rbd_options	*opts;
 404	char			*config_info;	/* add{,_single_major} string */
 405
 406	struct ceph_object_id	header_oid;
 407	struct ceph_object_locator header_oloc;
 408
 409	struct ceph_file_layout	layout;		/* used for all rbd requests */
 410
 411	struct mutex		watch_mutex;
 412	enum rbd_watch_state	watch_state;
 413	struct ceph_osd_linger_request *watch_handle;
 414	u64			watch_cookie;
 415	struct delayed_work	watch_dwork;
 416
 417	struct rw_semaphore	lock_rwsem;
 418	enum rbd_lock_state	lock_state;
 419	char			lock_cookie[32];
 420	struct rbd_client_id	owner_cid;
 421	struct work_struct	acquired_lock_work;
 422	struct work_struct	released_lock_work;
 423	struct delayed_work	lock_dwork;
 424	struct work_struct	unlock_work;
 425	spinlock_t		lock_lists_lock;
 426	struct list_head	acquiring_list;
 427	struct list_head	running_list;
 428	struct completion	acquire_wait;
 429	int			acquire_err;
 430	struct completion	releasing_wait;
 431
 432	spinlock_t		object_map_lock;
 433	u8			*object_map;
 434	u64			object_map_size;	/* in objects */
 435	u64			object_map_flags;
 436
 437	struct workqueue_struct	*task_wq;
 438
 439	struct rbd_spec		*parent_spec;
 440	u64			parent_overlap;
 441	atomic_t		parent_ref;
 442	struct rbd_device	*parent;
 443
 444	/* Block layer tags. */
 445	struct blk_mq_tag_set	tag_set;
 446
 447	/* protects updating the header */
 448	struct rw_semaphore     header_rwsem;
 449
 450	struct rbd_mapping	mapping;
 451
 452	struct list_head	node;
 453
 454	/* sysfs related */
 455	struct device		dev;
 456	unsigned long		open_count;	/* protected by lock */
 457};
 458
 459/*
 460 * Flag bits for rbd_dev->flags:
 461 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 462 *   by rbd_dev->lock
 463 */
 464enum rbd_dev_flags {
 465	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
 466	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
 467};
 468
 469static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
 470
 471static LIST_HEAD(rbd_dev_list);    /* devices */
 472static DEFINE_SPINLOCK(rbd_dev_list_lock);
 473
 474static LIST_HEAD(rbd_client_list);		/* clients */
 475static DEFINE_SPINLOCK(rbd_client_list_lock);
 476
 477/* Slab caches for frequently-allocated structures */
 478
 479static struct kmem_cache	*rbd_img_request_cache;
 480static struct kmem_cache	*rbd_obj_request_cache;
 481
 482static int rbd_major;
 483static DEFINE_IDA(rbd_dev_id_ida);
 484
 485static struct workqueue_struct *rbd_wq;
 486
 487static struct ceph_snap_context rbd_empty_snapc = {
 488	.nref = REFCOUNT_INIT(1),
 489};
 490
 491/*
 492 * single-major requires >= 0.75 version of userspace rbd utility.
 493 */
 494static bool single_major = true;
 495module_param(single_major, bool, 0444);
 496MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
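/*
 * With single_major enabled (the default), all mapped images share the
 * one major number registered at module load and are told apart only by
 * their minor ranges (see rbd_dev_id_to_minor() below); with it disabled,
 * each image registers its own block device major.
 */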
 497
 498static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
 499static ssize_t remove_store(struct bus_type *bus, const char *buf,
 500			    size_t count);
 501static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
 502				      size_t count);
 503static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
 504					 size_t count);
 505static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
 506
 507static int rbd_dev_id_to_minor(int dev_id)
 508{
 509	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
 510}
 511
 512static int minor_to_rbd_dev_id(int minor)
 513{
 514	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
 515}
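/*
 * For example, with RBD_SINGLE_MAJOR_PART_SHIFT = 4, dev_id 3 maps to
 * minor 48 and owns minors 48..63 for its partitions, and any minor in
 * that range maps back to dev_id 3.
 */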
 516
 517static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
 518{
 519	lockdep_assert_held(&rbd_dev->lock_rwsem);
 520
 521	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
 522	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
 523}
 524
 525static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
 526{
 527	bool is_lock_owner;
 528
 529	down_read(&rbd_dev->lock_rwsem);
 530	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
 531	up_read(&rbd_dev->lock_rwsem);
 532	return is_lock_owner;
 533}
 534
 535static ssize_t supported_features_show(struct bus_type *bus, char *buf)
 536{
 537	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
 538}
 539
 540static BUS_ATTR_WO(add);
 541static BUS_ATTR_WO(remove);
 542static BUS_ATTR_WO(add_single_major);
 543static BUS_ATTR_WO(remove_single_major);
 544static BUS_ATTR_RO(supported_features);
 545
 546static struct attribute *rbd_bus_attrs[] = {
 547	&bus_attr_add.attr,
 548	&bus_attr_remove.attr,
 549	&bus_attr_add_single_major.attr,
 550	&bus_attr_remove_single_major.attr,
 551	&bus_attr_supported_features.attr,
 552	NULL,
 553};
 554
 555static umode_t rbd_bus_is_visible(struct kobject *kobj,
 556				  struct attribute *attr, int index)
 557{
 558	if (!single_major &&
 559	    (attr == &bus_attr_add_single_major.attr ||
 560	     attr == &bus_attr_remove_single_major.attr))
 561		return 0;
 562
 563	return attr->mode;
 564}
 565
 566static const struct attribute_group rbd_bus_group = {
 567	.attrs = rbd_bus_attrs,
 568	.is_visible = rbd_bus_is_visible,
 569};
 570__ATTRIBUTE_GROUPS(rbd_bus);
 571
 572static struct bus_type rbd_bus_type = {
 573	.name		= "rbd",
 574	.bus_groups	= rbd_bus_groups,
 575};
 576
 577static void rbd_root_dev_release(struct device *dev)
 578{
 579}
 580
 581static struct device rbd_root_dev = {
 582	.init_name =    "rbd",
 583	.release =      rbd_root_dev_release,
 584};
 585
 586static __printf(2, 3)
 587void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
 588{
 589	struct va_format vaf;
 590	va_list args;
 591
 592	va_start(args, fmt);
 593	vaf.fmt = fmt;
 594	vaf.va = &args;
 595
 596	if (!rbd_dev)
 597		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
 598	else if (rbd_dev->disk)
 599		printk(KERN_WARNING "%s: %s: %pV\n",
 600			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
 601	else if (rbd_dev->spec && rbd_dev->spec->image_name)
 602		printk(KERN_WARNING "%s: image %s: %pV\n",
 603			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
 604	else if (rbd_dev->spec && rbd_dev->spec->image_id)
 605		printk(KERN_WARNING "%s: id %s: %pV\n",
 606			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
 607	else	/* punt */
 608		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
 609			RBD_DRV_NAME, rbd_dev, &vaf);
 610	va_end(args);
 611}
 612
 613#ifdef RBD_DEBUG
 614#define rbd_assert(expr)						\
 615		if (unlikely(!(expr))) {				\
 616			printk(KERN_ERR "\nAssertion failure in %s() "	\
 617						"at line %d:\n\n"	\
 618					"\trbd_assert(%s);\n\n",	\
 619					__func__, __LINE__, #expr);	\
 620			BUG();						\
 621		}
 622#else /* !RBD_DEBUG */
 623#  define rbd_assert(expr)	((void) 0)
 624#endif /* !RBD_DEBUG */
 625
 626static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 627
 628static int rbd_dev_refresh(struct rbd_device *rbd_dev);
 629static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
 630static int rbd_dev_header_info(struct rbd_device *rbd_dev);
 631static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
 632static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 633					u64 snap_id);
 634static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 635				u8 *order, u64 *snap_size);
 636static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 637		u64 *snap_features);
 638static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
 639
 640static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
 641static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
 642
 643/*
 644 * Return true if nothing else is pending.
 645 */
 646static bool pending_result_dec(struct pending_result *pending, int *result)
 647{
 648	rbd_assert(pending->num_pending > 0);
 649
 650	if (*result && !pending->result)
 651		pending->result = *result;
 652	if (--pending->num_pending)
 653		return false;
 654
 655	*result = pending->result;
 656	return true;
 657}
 658
 659static int rbd_open(struct block_device *bdev, fmode_t mode)
 660{
 661	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 662	bool removing = false;
 663
 664	spin_lock_irq(&rbd_dev->lock);
 665	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
 666		removing = true;
 667	else
 668		rbd_dev->open_count++;
 669	spin_unlock_irq(&rbd_dev->lock);
 670	if (removing)
 671		return -ENOENT;
 672
 673	(void) get_device(&rbd_dev->dev);
 674
 675	return 0;
 676}
 677
 678static void rbd_release(struct gendisk *disk, fmode_t mode)
 679{
 680	struct rbd_device *rbd_dev = disk->private_data;
 681	unsigned long open_count_before;
 682
 683	spin_lock_irq(&rbd_dev->lock);
 684	open_count_before = rbd_dev->open_count--;
 685	spin_unlock_irq(&rbd_dev->lock);
 686	rbd_assert(open_count_before > 0);
 687
 688	put_device(&rbd_dev->dev);
 689}
 690
 691static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
 692{
 693	int ro;
 694
 695	if (get_user(ro, (int __user *)arg))
 696		return -EFAULT;
 697
 698	/* Snapshots can't be marked read-write */
 699	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
 700		return -EROFS;
 701
 702	/* Let blkdev_roset() handle it */
 703	return -ENOTTY;
 704}
 705
 706static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
 707			unsigned int cmd, unsigned long arg)
 708{
 709	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
 710	int ret;
 711
 712	switch (cmd) {
 713	case BLKROSET:
 714		ret = rbd_ioctl_set_ro(rbd_dev, arg);
 715		break;
 716	default:
 717		ret = -ENOTTY;
 718	}
 719
 720	return ret;
 721}
 722
 723#ifdef CONFIG_COMPAT
 724static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
 725				unsigned int cmd, unsigned long arg)
 726{
 727	return rbd_ioctl(bdev, mode, cmd, arg);
 728}
 729#endif /* CONFIG_COMPAT */
 730
 731static const struct block_device_operations rbd_bd_ops = {
 732	.owner			= THIS_MODULE,
 733	.open			= rbd_open,
 734	.release		= rbd_release,
 735	.ioctl			= rbd_ioctl,
 736#ifdef CONFIG_COMPAT
 737	.compat_ioctl		= rbd_compat_ioctl,
 738#endif
 739};
 740
 741/*
 742 * Initialize an rbd client instance.  Success or not, this function
 743 * consumes ceph_opts.  Caller holds client_mutex.
 744 */
 745static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
 746{
 747	struct rbd_client *rbdc;
 748	int ret = -ENOMEM;
 749
 750	dout("%s:\n", __func__);
 751	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
 752	if (!rbdc)
 753		goto out_opt;
 754
 755	kref_init(&rbdc->kref);
 756	INIT_LIST_HEAD(&rbdc->node);
 757
 758	rbdc->client = ceph_create_client(ceph_opts, rbdc);
 759	if (IS_ERR(rbdc->client))
 760		goto out_rbdc;
 761	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
 762
 763	ret = ceph_open_session(rbdc->client);
 764	if (ret < 0)
 765		goto out_client;
 766
 767	spin_lock(&rbd_client_list_lock);
 768	list_add_tail(&rbdc->node, &rbd_client_list);
 769	spin_unlock(&rbd_client_list_lock);
 770
 771	dout("%s: rbdc %p\n", __func__, rbdc);
 772
 773	return rbdc;
 774out_client:
 775	ceph_destroy_client(rbdc->client);
 776out_rbdc:
 777	kfree(rbdc);
 778out_opt:
 779	if (ceph_opts)
 780		ceph_destroy_options(ceph_opts);
 781	dout("%s: error %d\n", __func__, ret);
 782
 783	return ERR_PTR(ret);
 784}
 785
 786static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 787{
 788	kref_get(&rbdc->kref);
 789
 790	return rbdc;
 791}
 792
 793/*
 794 * Find a ceph client with specific addr and configuration.  If
 795 * found, bump its reference count.
 796 */
 797static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
 798{
 799	struct rbd_client *client_node;
 800	bool found = false;
 801
 802	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
 803		return NULL;
 804
 805	spin_lock(&rbd_client_list_lock);
 806	list_for_each_entry(client_node, &rbd_client_list, node) {
 807		if (!ceph_compare_options(ceph_opts, client_node->client)) {
 808			__rbd_get_client(client_node);
 809
 810			found = true;
 811			break;
 812		}
 813	}
 814	spin_unlock(&rbd_client_list_lock);
 815
 816	return found ? client_node : NULL;
 817}
 818
 819/*
 820 * (Per device) rbd map options
 821 */
 822enum {
 823	Opt_queue_depth,
 824	Opt_alloc_size,
 825	Opt_lock_timeout,
 826	Opt_last_int,
 827	/* int args above */
 828	Opt_pool_ns,
 829	Opt_last_string,
 830	/* string args above */
 831	Opt_read_only,
 832	Opt_read_write,
 833	Opt_lock_on_read,
 834	Opt_exclusive,
 835	Opt_notrim,
 836	Opt_err
 837};
 838
 839static match_table_t rbd_opts_tokens = {
 840	{Opt_queue_depth, "queue_depth=%d"},
 841	{Opt_alloc_size, "alloc_size=%d"},
 842	{Opt_lock_timeout, "lock_timeout=%d"},
 843	/* int args above */
 844	{Opt_pool_ns, "_pool_ns=%s"},
 845	/* string args above */
 846	{Opt_read_only, "read_only"},
 847	{Opt_read_only, "ro"},		/* Alternate spelling */
 848	{Opt_read_write, "read_write"},
 849	{Opt_read_write, "rw"},		/* Alternate spelling */
 850	{Opt_lock_on_read, "lock_on_read"},
 851	{Opt_exclusive, "exclusive"},
 852	{Opt_notrim, "notrim"},
 853	{Opt_err, NULL}
 854};
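/*
 * A sketch of how this table is used: the option string supplied with a
 * mapping, e.g. "queue_depth=128,alloc_size=65536,lock_on_read,notrim",
 * is split on commas by the libceph option parser, and each token that
 * libceph itself does not recognize is passed to parse_rbd_opts_token()
 * below for matching against these patterns.
 */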
 855
 856struct rbd_options {
 857	int	queue_depth;
 858	int	alloc_size;
 859	unsigned long	lock_timeout;
 860	bool	read_only;
 861	bool	lock_on_read;
 862	bool	exclusive;
 863	bool	trim;
 864};
 865
 866#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
 867#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
 868#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
 869#define RBD_READ_ONLY_DEFAULT	false
 870#define RBD_LOCK_ON_READ_DEFAULT false
 871#define RBD_EXCLUSIVE_DEFAULT	false
 872#define RBD_TRIM_DEFAULT	true
 873
 874struct parse_rbd_opts_ctx {
 875	struct rbd_spec		*spec;
 876	struct rbd_options	*opts;
 877};
 878
 879static int parse_rbd_opts_token(char *c, void *private)
 880{
 881	struct parse_rbd_opts_ctx *pctx = private;
 882	substring_t argstr[MAX_OPT_ARGS];
 883	int token, intval, ret;
 884
 885	token = match_token(c, rbd_opts_tokens, argstr);
 886	if (token < Opt_last_int) {
 887		ret = match_int(&argstr[0], &intval);
 888		if (ret < 0) {
 889			pr_err("bad option arg (not int) at '%s'\n", c);
 890			return ret;
 891		}
 892		dout("got int token %d val %d\n", token, intval);
 893	} else if (token > Opt_last_int && token < Opt_last_string) {
 894		dout("got string token %d val %s\n", token, argstr[0].from);
 895	} else {
 896		dout("got token %d\n", token);
 897	}
 898
 899	switch (token) {
 900	case Opt_queue_depth:
 901		if (intval < 1) {
 902			pr_err("queue_depth out of range\n");
 903			return -EINVAL;
 904		}
 905		pctx->opts->queue_depth = intval;
 906		break;
 907	case Opt_alloc_size:
 908		if (intval < SECTOR_SIZE) {
 909			pr_err("alloc_size out of range\n");
 910			return -EINVAL;
 911		}
 912		if (!is_power_of_2(intval)) {
 913			pr_err("alloc_size must be a power of 2\n");
 914			return -EINVAL;
 915		}
 916		pctx->opts->alloc_size = intval;
 917		break;
 918	case Opt_lock_timeout:
 919		/* 0 is "wait forever" (i.e. infinite timeout) */
 920		if (intval < 0 || intval > INT_MAX / 1000) {
 921			pr_err("lock_timeout out of range\n");
 922			return -EINVAL;
 923		}
 924		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
 925		break;
 926	case Opt_pool_ns:
 927		kfree(pctx->spec->pool_ns);
 928		pctx->spec->pool_ns = match_strdup(argstr);
 929		if (!pctx->spec->pool_ns)
 930			return -ENOMEM;
 931		break;
 932	case Opt_read_only:
 933		pctx->opts->read_only = true;
 934		break;
 935	case Opt_read_write:
 936		pctx->opts->read_only = false;
 937		break;
 938	case Opt_lock_on_read:
 939		pctx->opts->lock_on_read = true;
 940		break;
 941	case Opt_exclusive:
 942		pctx->opts->exclusive = true;
 943		break;
 944	case Opt_notrim:
 945		pctx->opts->trim = false;
 946		break;
 947	default:
 948		/* libceph prints "bad option" msg */
 949		return -EINVAL;
 950	}
 951
 952	return 0;
 953}
 954
 955static char* obj_op_name(enum obj_operation_type op_type)
 956{
 957	switch (op_type) {
 958	case OBJ_OP_READ:
 959		return "read";
 960	case OBJ_OP_WRITE:
 961		return "write";
 962	case OBJ_OP_DISCARD:
 963		return "discard";
 964	case OBJ_OP_ZEROOUT:
 965		return "zeroout";
 966	default:
 967		return "???";
 968	}
 969}
 970
 971/*
 972 * Destroy ceph client
 973 *
 974 * Caller must hold rbd_client_list_lock.
 975 */
 976static void rbd_client_release(struct kref *kref)
 977{
 978	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
 979
 980	dout("%s: rbdc %p\n", __func__, rbdc);
 981	spin_lock(&rbd_client_list_lock);
 982	list_del(&rbdc->node);
 983	spin_unlock(&rbd_client_list_lock);
 984
 985	ceph_destroy_client(rbdc->client);
 986	kfree(rbdc);
 987}
 988
 989/*
 990 * Drop reference to ceph client node. If it's not referenced anymore, release
 991 * it.
 992 */
 993static void rbd_put_client(struct rbd_client *rbdc)
 994{
 995	if (rbdc)
 996		kref_put(&rbdc->kref, rbd_client_release);
 997}
 998
 999/*
1000 * Get a ceph client with specific addr and configuration, if one does
1001 * not exist create it.  Either way, ceph_opts is consumed by this
1002 * function.
1003 */
1004static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
1005{
1006	struct rbd_client *rbdc;
1007	int ret;
1008
1009	mutex_lock(&client_mutex);
1010	rbdc = rbd_client_find(ceph_opts);
1011	if (rbdc) {
1012		ceph_destroy_options(ceph_opts);
1013
1014		/*
1015		 * Using an existing client.  Make sure ->pg_pools is up to
1016		 * date before we look up the pool id in do_rbd_add().
1017		 */
1018		ret = ceph_wait_for_latest_osdmap(rbdc->client,
1019					rbdc->client->options->mount_timeout);
1020		if (ret) {
1021			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
1022			rbd_put_client(rbdc);
1023			rbdc = ERR_PTR(ret);
1024		}
1025	} else {
1026		rbdc = rbd_client_create(ceph_opts);
1027	}
1028	mutex_unlock(&client_mutex);
1029
1030	return rbdc;
1031}
1032
1033static bool rbd_image_format_valid(u32 image_format)
1034{
1035	return image_format == 1 || image_format == 2;
1036}
1037
1038static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
1039{
1040	size_t size;
1041	u32 snap_count;
1042
1043	/* The header has to start with the magic rbd header text */
1044	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
1045		return false;
1046
1047	/* The bio layer requires at least sector-sized I/O */
1048
1049	if (ondisk->options.order < SECTOR_SHIFT)
1050		return false;
1051
1052	/* If we use u64 in a few spots we may be able to loosen this */
1053
1054	if (ondisk->options.order > 8 * sizeof (int) - 1)
1055		return false;
1056
1057	/*
1058	 * The size of a snapshot header has to fit in a size_t, and
1059	 * that limits the number of snapshots.
1060	 */
1061	snap_count = le32_to_cpu(ondisk->snap_count);
1062	size = SIZE_MAX - sizeof (struct ceph_snap_context);
1063	if (snap_count > size / sizeof (__le64))
1064		return false;
1065
1066	/*
1067	 * Not only that, but the size of the entire snapshot
1068	 * header must also be representable in a size_t.
1069	 */
1070	size -= snap_count * sizeof (__le64);
1071	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1072		return false;
1073
1074	return true;
1075}
1076
1077/*
1078 * returns the size of an object in the image
1079 */
1080static u32 rbd_obj_bytes(struct rbd_image_header *header)
1081{
1082	return 1U << header->obj_order;
1083}
1084
1085static void rbd_init_layout(struct rbd_device *rbd_dev)
1086{
1087	if (rbd_dev->header.stripe_unit == 0 ||
1088	    rbd_dev->header.stripe_count == 0) {
1089		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1090		rbd_dev->header.stripe_count = 1;
1091	}
1092
1093	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1094	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1095	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
1096	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
1097			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1098	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1099}
1100
1101/*
1102 * Fill an rbd image header with information from the given format 1
1103 * on-disk header.
1104 */
1105static int rbd_header_from_disk(struct rbd_device *rbd_dev,
1106				 struct rbd_image_header_ondisk *ondisk)
1107{
1108	struct rbd_image_header *header = &rbd_dev->header;
1109	bool first_time = header->object_prefix == NULL;
1110	struct ceph_snap_context *snapc;
1111	char *object_prefix = NULL;
1112	char *snap_names = NULL;
1113	u64 *snap_sizes = NULL;
1114	u32 snap_count;
1115	int ret = -ENOMEM;
1116	u32 i;
1117
1118	/* Allocate this now to avoid having to handle failure below */
1119
1120	if (first_time) {
1121		object_prefix = kstrndup(ondisk->object_prefix,
1122					 sizeof(ondisk->object_prefix),
1123					 GFP_KERNEL);
1124		if (!object_prefix)
1125			return -ENOMEM;
1126	}
1127
1128	/* Allocate the snapshot context and fill it in */
1129
1130	snap_count = le32_to_cpu(ondisk->snap_count);
1131	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1132	if (!snapc)
1133		goto out_err;
1134	snapc->seq = le64_to_cpu(ondisk->snap_seq);
1135	if (snap_count) {
1136		struct rbd_image_snap_ondisk *snaps;
1137		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1138
1139		/* We'll keep a copy of the snapshot names... */
1140
1141		if (snap_names_len > (u64)SIZE_MAX)
1142			goto out_2big;
1143		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1144		if (!snap_names)
1145			goto out_err;
1146
1147		/* ...as well as the array of their sizes. */
1148		snap_sizes = kmalloc_array(snap_count,
1149					   sizeof(*header->snap_sizes),
1150					   GFP_KERNEL);
1151		if (!snap_sizes)
1152			goto out_err;
1153
1154		/*
1155		 * Copy the names, and fill in each snapshot's id
1156		 * and size.
1157		 *
1158		 * Note that rbd_dev_v1_header_info() guarantees the
1159		 * ondisk buffer we're working with has
1160		 * snap_names_len bytes beyond the end of the
1161		 * snapshot id array, so this memcpy() is safe.
1162		 */
1163		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1164		snaps = ondisk->snaps;
1165		for (i = 0; i < snap_count; i++) {
1166			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1167			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1168		}
1169	}
1170
1171	/* We won't fail any more, fill in the header */
1172
1173	if (first_time) {
1174		header->object_prefix = object_prefix;
1175		header->obj_order = ondisk->options.order;
1176		rbd_init_layout(rbd_dev);
1177	} else {
1178		ceph_put_snap_context(header->snapc);
1179		kfree(header->snap_names);
1180		kfree(header->snap_sizes);
1181	}
1182
1183	/* The remaining fields always get updated (when we refresh) */
1184
1185	header->image_size = le64_to_cpu(ondisk->image_size);
1186	header->snapc = snapc;
1187	header->snap_names = snap_names;
1188	header->snap_sizes = snap_sizes;
1189
1190	return 0;
1191out_2big:
1192	ret = -EIO;
1193out_err:
1194	kfree(snap_sizes);
1195	kfree(snap_names);
1196	ceph_put_snap_context(snapc);
1197	kfree(object_prefix);
1198
1199	return ret;
1200}
1201
1202static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1203{
1204	const char *snap_name;
1205
1206	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1207
1208	/* Skip over names until we find the one we are looking for */
1209
1210	snap_name = rbd_dev->header.snap_names;
1211	while (which--)
1212		snap_name += strlen(snap_name) + 1;
1213
1214	return kstrdup(snap_name, GFP_KERNEL);
1215}
1216
1217/*
1218 * Snapshot id comparison function for use with qsort()/bsearch().
1219 * Note that result is for snapshots in *descending* order.
1220 */
1221static int snapid_compare_reverse(const void *s1, const void *s2)
1222{
1223	u64 snap_id1 = *(u64 *)s1;
1224	u64 snap_id2 = *(u64 *)s2;
1225
1226	if (snap_id1 < snap_id2)
1227		return 1;
1228	return snap_id1 == snap_id2 ? 0 : -1;
1229}
1230
1231/*
1232 * Search a snapshot context to see if the given snapshot id is
1233 * present.
1234 *
1235 * Returns the position of the snapshot id in the array if it's found,
1236 * or BAD_SNAP_INDEX otherwise.
1237 *
1238 * Note: The snapshot array is kept sorted (by the osd) in
1239 * reverse order, highest snapshot id first.
1240 */
1241static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1242{
1243	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
1244	u64 *found;
1245
1246	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1247				sizeof (snap_id), snapid_compare_reverse);
1248
1249	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
1250}
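/*
 * For example, with snapc->snaps = { 12, 7, 3 } (newest first), looking
 * up snap_id 7 returns index 1, while snap_id 5 is not present and
 * yields BAD_SNAP_INDEX.
 */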
1251
1252static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1253					u64 snap_id)
1254{
1255	u32 which;
1256	const char *snap_name;
1257
1258	which = rbd_dev_snap_index(rbd_dev, snap_id);
1259	if (which == BAD_SNAP_INDEX)
1260		return ERR_PTR(-ENOENT);
1261
1262	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1263	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
1264}
1265
1266static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1267{
1268	if (snap_id == CEPH_NOSNAP)
1269		return RBD_SNAP_HEAD_NAME;
1270
1271	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1272	if (rbd_dev->image_format == 1)
1273		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
1274
1275	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1276}
1277
1278static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1279				u64 *snap_size)
1280{
1281	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1282	if (snap_id == CEPH_NOSNAP) {
1283		*snap_size = rbd_dev->header.image_size;
1284	} else if (rbd_dev->image_format == 1) {
1285		u32 which;
1286
1287		which = rbd_dev_snap_index(rbd_dev, snap_id);
1288		if (which == BAD_SNAP_INDEX)
1289			return -ENOENT;
1290
1291		*snap_size = rbd_dev->header.snap_sizes[which];
1292	} else {
1293		u64 size = 0;
1294		int ret;
1295
1296		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1297		if (ret)
1298			return ret;
1299
1300		*snap_size = size;
1301	}
1302	return 0;
1303}
1304
1305static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1306			u64 *snap_features)
1307{
1308	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1309	if (snap_id == CEPH_NOSNAP) {
1310		*snap_features = rbd_dev->header.features;
1311	} else if (rbd_dev->image_format == 1) {
1312		*snap_features = 0;	/* No features for format 1 */
1313	} else {
1314		u64 features = 0;
1315		int ret;
1316
1317		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1318		if (ret)
1319			return ret;
1320
1321		*snap_features = features;
1322	}
1323	return 0;
1324}
1325
1326static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1327{
1328	u64 snap_id = rbd_dev->spec->snap_id;
1329	u64 size = 0;
1330	u64 features = 0;
1331	int ret;
1332
1333	ret = rbd_snap_size(rbd_dev, snap_id, &size);
1334	if (ret)
1335		return ret;
1336	ret = rbd_snap_features(rbd_dev, snap_id, &features);
1337	if (ret)
1338		return ret;
1339
1340	rbd_dev->mapping.size = size;
1341	rbd_dev->mapping.features = features;
1342
1343	return 0;
1344}
1345
1346static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1347{
1348	rbd_dev->mapping.size = 0;
1349	rbd_dev->mapping.features = 0;
1350}
1351
1352static void zero_bvec(struct bio_vec *bv)
1353{
1354	void *buf;
1355	unsigned long flags;
1356
1357	buf = bvec_kmap_irq(bv, &flags);
1358	memset(buf, 0, bv->bv_len);
1359	flush_dcache_page(bv->bv_page);
1360	bvec_kunmap_irq(buf, &flags);
1361}
1362
1363static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1364{
1365	struct ceph_bio_iter it = *bio_pos;
1366
1367	ceph_bio_iter_advance(&it, off);
1368	ceph_bio_iter_advance_step(&it, bytes, ({
1369		zero_bvec(&bv);
1370	}));
1371}
1372
1373static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1374{
1375	struct ceph_bvec_iter it = *bvec_pos;
1376
1377	ceph_bvec_iter_advance(&it, off);
1378	ceph_bvec_iter_advance_step(&it, bytes, ({
1379		zero_bvec(&bv);
1380	}));
1381}
1382
1383/*
1384 * Zero a range in @obj_req data buffer defined by a bio (list) or
1385 * (private) bio_vec array.
1386 *
1387 * @off is relative to the start of the data buffer.
1388 */
1389static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1390			       u32 bytes)
1391{
1392	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1393
1394	switch (obj_req->img_request->data_type) {
1395	case OBJ_REQUEST_BIO:
1396		zero_bios(&obj_req->bio_pos, off, bytes);
1397		break;
1398	case OBJ_REQUEST_BVECS:
1399	case OBJ_REQUEST_OWN_BVECS:
1400		zero_bvecs(&obj_req->bvec_pos, off, bytes);
1401		break;
1402	default:
1403		BUG();
1404	}
1405}
1406
1407static void rbd_obj_request_destroy(struct kref *kref);
1408static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1409{
1410	rbd_assert(obj_request != NULL);
1411	dout("%s: obj %p (was %d)\n", __func__, obj_request,
1412		kref_read(&obj_request->kref));
1413	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1414}
1415
1416static void rbd_img_request_destroy(struct kref *kref);
1417static void rbd_img_request_put(struct rbd_img_request *img_request)
1418{
1419	rbd_assert(img_request != NULL);
1420	dout("%s: img %p (was %d)\n", __func__, img_request,
1421		kref_read(&img_request->kref));
1422	kref_put(&img_request->kref, rbd_img_request_destroy);
1423}
1424
1425static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1426					struct rbd_obj_request *obj_request)
1427{
1428	rbd_assert(obj_request->img_request == NULL);
1429
1430	/* Image request now owns object's original reference */
1431	obj_request->img_request = img_request;
1432	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1433}
1434
1435static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1436					struct rbd_obj_request *obj_request)
1437{
1438	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1439	list_del(&obj_request->ex.oe_item);
1440	rbd_assert(obj_request->img_request == img_request);
1441	rbd_obj_request_put(obj_request);
1442}
1443
1444static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1445{
1446	struct rbd_obj_request *obj_req = osd_req->r_priv;
1447
1448	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1449	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1450	     obj_req->ex.oe_off, obj_req->ex.oe_len);
1451	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1452}
1453
1454/*
1455 * The default/initial value for all image request flags is 0.  Each
1456 * is conditionally set to 1 at image request initialization time
1457 * and currently never change thereafter.
1458 */
1459static void img_request_layered_set(struct rbd_img_request *img_request)
1460{
1461	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1462	smp_mb();
1463}
1464
1465static void img_request_layered_clear(struct rbd_img_request *img_request)
1466{
1467	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1468	smp_mb();
1469}
1470
1471static bool img_request_layered_test(struct rbd_img_request *img_request)
1472{
1473	smp_mb();
1474	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1475}
1476
1477static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1478{
1479	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1480
1481	return !obj_req->ex.oe_off &&
1482	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
1483}
1484
1485static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1486{
1487	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1488
1489	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
1490					rbd_dev->layout.object_size;
1491}
1492
1493/*
1494 * Must be called after rbd_obj_calc_img_extents().
1495 */
1496static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1497{
1498	if (!obj_req->num_img_extents ||
1499	    (rbd_obj_is_entire(obj_req) &&
1500	     !obj_req->img_request->snapc->num_snaps))
1501		return false;
1502
1503	return true;
1504}
1505
1506static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1507{
1508	return ceph_file_extents_bytes(obj_req->img_extents,
1509				       obj_req->num_img_extents);
1510}
1511
1512static bool rbd_img_is_write(struct rbd_img_request *img_req)
1513{
1514	switch (img_req->op_type) {
1515	case OBJ_OP_READ:
1516		return false;
1517	case OBJ_OP_WRITE:
1518	case OBJ_OP_DISCARD:
1519	case OBJ_OP_ZEROOUT:
1520		return true;
1521	default:
1522		BUG();
1523	}
1524}
1525
1526static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1527{
1528	struct rbd_obj_request *obj_req = osd_req->r_priv;
1529	int result;
1530
1531	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1532	     osd_req->r_result, obj_req);
1533
1534	/*
1535	 * Writes aren't allowed to return a data payload.  In some
1536	 * guarded write cases (e.g. stat + zero on an empty object)
1537	 * a stat response makes it through, but we don't care.
1538	 */
1539	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1540		result = 0;
1541	else
1542		result = osd_req->r_result;
1543
1544	rbd_obj_handle_request(obj_req, result);
1545}
1546
1547static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1548{
1549	struct rbd_obj_request *obj_request = osd_req->r_priv;
1550
1551	osd_req->r_flags = CEPH_OSD_FLAG_READ;
1552	osd_req->r_snapid = obj_request->img_request->snap_id;
1553}
1554
1555static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
1556{
1557	struct rbd_obj_request *obj_request = osd_req->r_priv;
1558
1559	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1560	ktime_get_real_ts64(&osd_req->r_mtime);
1561	osd_req->r_data_offset = obj_request->ex.oe_off;
1562}
1563
1564static struct ceph_osd_request *
1565__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1566			  struct ceph_snap_context *snapc, int num_ops)
1567{
1568	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1569	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1570	struct ceph_osd_request *req;
1571	const char *name_format = rbd_dev->image_format == 1 ?
1572				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1573	int ret;
1574
1575	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1576	if (!req)
1577		return ERR_PTR(-ENOMEM);
1578
1579	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1580	req->r_callback = rbd_osd_req_callback;
1581	req->r_priv = obj_req;
1582
1583	/*
1584	 * Data objects may be stored in a separate pool, but always in
1585	 * the same namespace in that pool as the header in its pool.
1586	 */
1587	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1588	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1589
1590	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1591			       rbd_dev->header.object_prefix,
1592			       obj_req->ex.oe_objno);
1593	if (ret)
1594		return ERR_PTR(ret);
1595
1596	return req;
1597}
1598
1599static struct ceph_osd_request *
1600rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1601{
1602	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1603					 num_ops);
1604}
1605
1606static struct rbd_obj_request *rbd_obj_request_create(void)
1607{
1608	struct rbd_obj_request *obj_request;
1609
1610	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
1611	if (!obj_request)
1612		return NULL;
1613
1614	ceph_object_extent_init(&obj_request->ex);
1615	INIT_LIST_HEAD(&obj_request->osd_reqs);
1616	mutex_init(&obj_request->state_mutex);
1617	kref_init(&obj_request->kref);
1618
1619	dout("%s %p\n", __func__, obj_request);
1620	return obj_request;
1621}
1622
1623static void rbd_obj_request_destroy(struct kref *kref)
1624{
1625	struct rbd_obj_request *obj_request;
1626	struct ceph_osd_request *osd_req;
1627	u32 i;
1628
1629	obj_request = container_of(kref, struct rbd_obj_request, kref);
1630
1631	dout("%s: obj %p\n", __func__, obj_request);
1632
1633	while (!list_empty(&obj_request->osd_reqs)) {
1634		osd_req = list_first_entry(&obj_request->osd_reqs,
1635				    struct ceph_osd_request, r_private_item);
1636		list_del_init(&osd_req->r_private_item);
1637		ceph_osdc_put_request(osd_req);
1638	}
1639
1640	switch (obj_request->img_request->data_type) {
1641	case OBJ_REQUEST_NODATA:
1642	case OBJ_REQUEST_BIO:
1643	case OBJ_REQUEST_BVECS:
1644		break;		/* Nothing to do */
1645	case OBJ_REQUEST_OWN_BVECS:
1646		kfree(obj_request->bvec_pos.bvecs);
1647		break;
1648	default:
1649		BUG();
1650	}
1651
1652	kfree(obj_request->img_extents);
1653	if (obj_request->copyup_bvecs) {
1654		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1655			if (obj_request->copyup_bvecs[i].bv_page)
1656				__free_page(obj_request->copyup_bvecs[i].bv_page);
1657		}
1658		kfree(obj_request->copyup_bvecs);
1659	}
1660
1661	kmem_cache_free(rbd_obj_request_cache, obj_request);
1662}
1663
1664/* It's OK to call this for a device with no parent */
1665
1666static void rbd_spec_put(struct rbd_spec *spec);
1667static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1668{
1669	rbd_dev_remove_parent(rbd_dev);
1670	rbd_spec_put(rbd_dev->parent_spec);
1671	rbd_dev->parent_spec = NULL;
1672	rbd_dev->parent_overlap = 0;
1673}
1674
1675/*
1676 * Parent image reference counting is used to determine when an
1677 * image's parent fields can be safely torn down--after there are no
1678 * more in-flight requests to the parent image.  When the last
1679 * reference is dropped, cleaning them up is safe.
1680 */
1681static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1682{
1683	int counter;
1684
1685	if (!rbd_dev->parent_spec)
1686		return;
1687
1688	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1689	if (counter > 0)
1690		return;
1691
1692	/* Last reference; clean up parent data structures */
1693
1694	if (!counter)
1695		rbd_dev_unparent(rbd_dev);
1696	else
1697		rbd_warn(rbd_dev, "parent reference underflow");
1698}
1699
1700/*
1701 * If an image has a non-zero parent overlap, get a reference to its
1702 * parent.
1703 *
1704 * Returns true if the rbd device has a parent with a non-zero
1705 * overlap and a reference for it was successfully taken, or
1706 * false otherwise.
1707 */
1708static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1709{
1710	int counter = 0;
1711
1712	if (!rbd_dev->parent_spec)
1713		return false;
1714
1715	down_read(&rbd_dev->header_rwsem);
1716	if (rbd_dev->parent_overlap)
1717		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1718	up_read(&rbd_dev->header_rwsem);
1719
1720	if (counter < 0)
1721		rbd_warn(rbd_dev, "parent reference overflow");
1722
1723	return counter > 0;
1724}
1725
1726/*
1727 * Caller is responsible for filling in the list of object requests
1728 * that comprises the image request, and the Linux request pointer
1729 * (if there is one).
1730 */
1731static struct rbd_img_request *rbd_img_request_create(
1732					struct rbd_device *rbd_dev,
1733					enum obj_operation_type op_type,
1734					struct ceph_snap_context *snapc)
1735{
1736	struct rbd_img_request *img_request;
1737
1738	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
1739	if (!img_request)
1740		return NULL;
1741
1742	img_request->rbd_dev = rbd_dev;
1743	img_request->op_type = op_type;
1744	if (!rbd_img_is_write(img_request))
1745		img_request->snap_id = rbd_dev->spec->snap_id;
1746	else
1747		img_request->snapc = snapc;
1748
1749	if (rbd_dev_parent_get(rbd_dev))
1750		img_request_layered_set(img_request);
1751
1752	INIT_LIST_HEAD(&img_request->lock_item);
1753	INIT_LIST_HEAD(&img_request->object_extents);
1754	mutex_init(&img_request->state_mutex);
1755	kref_init(&img_request->kref);
1756
1757	return img_request;
1758}
1759
1760static void rbd_img_request_destroy(struct kref *kref)
1761{
1762	struct rbd_img_request *img_request;
1763	struct rbd_obj_request *obj_request;
1764	struct rbd_obj_request *next_obj_request;
1765
1766	img_request = container_of(kref, struct rbd_img_request, kref);
1767
1768	dout("%s: img %p\n", __func__, img_request);
1769
1770	WARN_ON(!list_empty(&img_request->lock_item));
1771	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1772		rbd_img_obj_request_del(img_request, obj_request);
1773
1774	if (img_request_layered_test(img_request)) {
1775		img_request_layered_clear(img_request);
1776		rbd_dev_parent_put(img_request->rbd_dev);
1777	}
1778
1779	if (rbd_img_is_write(img_request))
1780		ceph_put_snap_context(img_request->snapc);
1781
1782	kmem_cache_free(rbd_img_request_cache, img_request);
1783}
1784
1785#define BITS_PER_OBJ	2
1786#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
1787#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
1788
1789static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
1790				   u64 *index, u8 *shift)
1791{
1792	u32 off;
1793
1794	rbd_assert(objno < rbd_dev->object_map_size);
1795	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
1796	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
1797}
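/*
 * For example, objno 5 gives index = 1 and off = 1, so shift =
 * (4 - 1 - 1) * 2 = 4 and that object's two state bits occupy bits 5:4
 * of object_map[1]; objno 0 lands in the two most significant bits of
 * object_map[0].
 */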
1798
1799static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1800{
1801	u64 index;
1802	u8 shift;
1803
1804	lockdep_assert_held(&rbd_dev->object_map_lock);
1805	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1806	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
1807}
1808
1809static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
1810{
1811	u64 index;
1812	u8 shift;
1813	u8 *p;
1814
1815	lockdep_assert_held(&rbd_dev->object_map_lock);
1816	rbd_assert(!(val & ~OBJ_MASK));
1817
1818	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1819	p = &rbd_dev->object_map[index];
1820	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
1821}
1822
1823static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1824{
1825	u8 state;
1826
1827	spin_lock(&rbd_dev->object_map_lock);
1828	state = __rbd_object_map_get(rbd_dev, objno);
1829	spin_unlock(&rbd_dev->object_map_lock);
1830	return state;
1831}
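/*
 * The two bits per object encode OBJECT_NONEXISTENT, OBJECT_EXISTS,
 * OBJECT_PENDING or OBJECT_EXISTS_CLEAN (defined in rbd_types.h);
 * rbd_object_map_may_exist() below only needs to rule out
 * OBJECT_NONEXISTENT.
 */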
1832
1833static bool use_object_map(struct rbd_device *rbd_dev)
1834{
1835	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1836		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
1837}
1838
1839static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
1840{
1841	u8 state;
1842
1843	/* fall back to default logic if object map is disabled or invalid */
1844	if (!use_object_map(rbd_dev))
1845		return true;
1846
1847	state = rbd_object_map_get(rbd_dev, objno);
1848	return state != OBJECT_NONEXISTENT;
1849}
1850
1851static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1852				struct ceph_object_id *oid)
1853{
1854	if (snap_id == CEPH_NOSNAP)
1855		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1856				rbd_dev->spec->image_id);
1857	else
1858		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1859				rbd_dev->spec->image_id, snap_id);
1860}
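/*
 * For example, assuming RBD_OBJECT_MAP_PREFIX is "rbd_object_map.", an
 * image with id "abc123" uses "rbd_object_map.abc123" for HEAD and
 * "rbd_object_map.abc123.0000000000000004" for snapshot id 4.
 */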
1861
1862static int rbd_object_map_lock(struct rbd_device *rbd_dev)
1863{
1864	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1865	CEPH_DEFINE_OID_ONSTACK(oid);
1866	u8 lock_type;
1867	char *lock_tag;
1868	struct ceph_locker *lockers;
1869	u32 num_lockers;
1870	bool broke_lock = false;
1871	int ret;
1872
1873	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1874
1875again:
1876	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1877			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1878	if (ret != -EBUSY || broke_lock) {
1879		if (ret == -EEXIST)
1880			ret = 0; /* already locked by myself */
1881		if (ret)
1882			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1883		return ret;
1884	}
1885
1886	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1887				 RBD_LOCK_NAME, &lock_type, &lock_tag,
1888				 &lockers, &num_lockers);
1889	if (ret) {
1890		if (ret == -ENOENT)
1891			goto again;
1892
1893		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
1894		return ret;
1895	}
1896
1897	kfree(lock_tag);
1898	if (num_lockers == 0)
1899		goto again;
1900
1901	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1902		 ENTITY_NAME(lockers[0].id.name));
1903
1904	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1905				  RBD_LOCK_NAME, lockers[0].id.cookie,
1906				  &lockers[0].id.name);
1907	ceph_free_lockers(lockers, num_lockers);
1908	if (ret) {
1909		if (ret == -ENOENT)
1910			goto again;
1911
1912		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1913		return ret;
1914	}
1915
1916	broke_lock = true;
1917	goto again;
1918}
1919
1920static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
1921{
1922	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1923	CEPH_DEFINE_OID_ONSTACK(oid);
1924	int ret;
1925
1926	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1927
1928	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1929			      "");
1930	if (ret && ret != -ENOENT)
1931		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1932}
1933
1934static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
1935{
1936	u8 struct_v;
1937	u32 struct_len;
1938	u32 header_len;
1939	void *header_end;
1940	int ret;
1941
1942	ceph_decode_32_safe(p, end, header_len, e_inval);
1943	header_end = *p + header_len;
1944
1945	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1946				  &struct_len);
1947	if (ret)
1948		return ret;
1949
1950	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
1951
1952	*p = header_end;
1953	return 0;
1954
1955e_inval:
1956	return -EINVAL;
1957}
1958
1959static int __rbd_object_map_load(struct rbd_device *rbd_dev)
1960{
1961	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1962	CEPH_DEFINE_OID_ONSTACK(oid);
1963	struct page **pages;
1964	void *p, *end;
1965	size_t reply_len;
1966	u64 num_objects;
1967	u64 object_map_bytes;
1968	u64 object_map_size;
1969	int num_pages;
1970	int ret;
1971
1972	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
1973
1974	num_objects = ceph_get_num_objects(&rbd_dev->layout,
1975					   rbd_dev->mapping.size);
1976	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1977					    BITS_PER_BYTE);
1978	num_pages = calc_pages_for(0, object_map_bytes) + 1;
1979	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1980	if (IS_ERR(pages))
1981		return PTR_ERR(pages);
1982
1983	reply_len = num_pages * PAGE_SIZE;
1984	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1985	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1986			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1987			     NULL, 0, pages, &reply_len);
1988	if (ret)
1989		goto out;
1990
1991	p = page_address(pages[0]);
1992	end = p + min(reply_len, (size_t)PAGE_SIZE);
1993	ret = decode_object_map_header(&p, end, &object_map_size);
1994	if (ret)
1995		goto out;
1996
1997	if (object_map_size != num_objects) {
1998		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
1999			 object_map_size, num_objects);
2000		ret = -EINVAL;
2001		goto out;
2002	}
2003
2004	if (offset_in_page(p) + object_map_bytes > reply_len) {
2005		ret = -EINVAL;
2006		goto out;
2007	}
2008
2009	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
2010	if (!rbd_dev->object_map) {
2011		ret = -ENOMEM;
2012		goto out;
2013	}
2014
2015	rbd_dev->object_map_size = object_map_size;
2016	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
2017				   offset_in_page(p), object_map_bytes);
2018
2019out:
2020	ceph_release_page_vector(pages, num_pages);
2021	return ret;
2022}
2023
2024static void rbd_object_map_free(struct rbd_device *rbd_dev)
2025{
2026	kvfree(rbd_dev->object_map);
2027	rbd_dev->object_map = NULL;
2028	rbd_dev->object_map_size = 0;
2029}
2030
2031static int rbd_object_map_load(struct rbd_device *rbd_dev)
2032{
2033	int ret;
2034
2035	ret = __rbd_object_map_load(rbd_dev);
2036	if (ret)
2037		return ret;
2038
2039	ret = rbd_dev_v2_get_flags(rbd_dev);
2040	if (ret) {
2041		rbd_object_map_free(rbd_dev);
2042		return ret;
2043	}
2044
2045	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
2046		rbd_warn(rbd_dev, "object map is invalid");
2047
2048	return 0;
2049}
2050
2051static int rbd_object_map_open(struct rbd_device *rbd_dev)
2052{
2053	int ret;
2054
2055	ret = rbd_object_map_lock(rbd_dev);
2056	if (ret)
2057		return ret;
2058
2059	ret = rbd_object_map_load(rbd_dev);
2060	if (ret) {
2061		rbd_object_map_unlock(rbd_dev);
2062		return ret;
2063	}
2064
2065	return 0;
2066}
2067
2068static void rbd_object_map_close(struct rbd_device *rbd_dev)
2069{
2070	rbd_object_map_free(rbd_dev);
2071	rbd_object_map_unlock(rbd_dev);
2072}
2073
2074/*
2075 * This function needs snap_id (or more precisely just something to
2076 * distinguish between HEAD and snapshot object maps), new_state and
2077 * current_state that were passed to rbd_object_map_update().
2078 *
2079 * To avoid allocating and stashing a context we piggyback on the OSD
2080 * request.  A HEAD update has two ops (assert_locked).  For new_state
2081 * and current_state we decode our own object_map_update op, encoded in
2082 * rbd_cls_object_map_update().
2083 */
2084static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
2085					struct ceph_osd_request *osd_req)
2086{
2087	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2088	struct ceph_osd_data *osd_data;
2089	u64 objno;
2090	u8 state, new_state, uninitialized_var(current_state);
2091	bool has_current_state;
2092	void *p;
2093
2094	if (osd_req->r_result)
2095		return osd_req->r_result;
2096
2097	/*
2098	 * Nothing to do for a snapshot object map.
2099	 */
2100	if (osd_req->r_num_ops == 1)
2101		return 0;
2102
2103	/*
2104	 * Update in-memory HEAD object map.
2105	 */
2106	rbd_assert(osd_req->r_num_ops == 2);
2107	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
2108	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
2109
2110	p = page_address(osd_data->pages[0]);
2111	objno = ceph_decode_64(&p);
2112	rbd_assert(objno == obj_req->ex.oe_objno);
2113	rbd_assert(ceph_decode_64(&p) == objno + 1);
2114	new_state = ceph_decode_8(&p);
2115	has_current_state = ceph_decode_8(&p);
2116	if (has_current_state)
2117		current_state = ceph_decode_8(&p);
2118
2119	spin_lock(&rbd_dev->object_map_lock);
2120	state = __rbd_object_map_get(rbd_dev, objno);
2121	if (!has_current_state || current_state == state ||
2122	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
2123		__rbd_object_map_set(rbd_dev, objno, new_state);
2124	spin_unlock(&rbd_dev->object_map_lock);
2125
2126	return 0;
2127}
2128
2129static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
2130{
2131	struct rbd_obj_request *obj_req = osd_req->r_priv;
2132	int result;
2133
2134	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
2135	     osd_req->r_result, obj_req);
2136
2137	result = rbd_object_map_update_finish(obj_req, osd_req);
2138	rbd_obj_handle_request(obj_req, result);
2139}
2140
2141static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
2142{
2143	u8 state = rbd_object_map_get(rbd_dev, objno);
2144
2145	if (state == new_state ||
2146	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2147	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2148		return false;
2149
2150	return true;
2151}
2152
2153static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2154				     int which, u64 objno, u8 new_state,
2155				     const u8 *current_state)
2156{
2157	struct page **pages;
2158	void *p, *start;
2159	int ret;
2160
2161	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2162	if (ret)
2163		return ret;
2164
2165	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2166	if (IS_ERR(pages))
2167		return PTR_ERR(pages);
2168
2169	p = start = page_address(pages[0]);
2170	ceph_encode_64(&p, objno);
2171	ceph_encode_64(&p, objno + 1);
2172	ceph_encode_8(&p, new_state);
2173	if (current_state) {
2174		ceph_encode_8(&p, 1);
2175		ceph_encode_8(&p, *current_state);
2176	} else {
2177		ceph_encode_8(&p, 0);
2178	}
2179
2180	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2181					  false, true);
2182	return 0;
2183}
2184
2185/*
2186 * Return:
2187 *   0 - object map update sent
2188 *   1 - object map update isn't needed
2189 *  <0 - error
2190 */
2191static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2192				 u8 new_state, const u8 *current_state)
2193{
2194	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2195	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2196	struct ceph_osd_request *req;
2197	int num_ops = 1;
2198	int which = 0;
2199	int ret;
2200
2201	if (snap_id == CEPH_NOSNAP) {
2202		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2203			return 1;
2204
2205		num_ops++; /* assert_locked */
2206	}
2207
2208	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2209	if (!req)
2210		return -ENOMEM;
2211
2212	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2213	req->r_callback = rbd_object_map_callback;
2214	req->r_priv = obj_req;
2215
2216	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2217	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2218	req->r_flags = CEPH_OSD_FLAG_WRITE;
2219	ktime_get_real_ts64(&req->r_mtime);
2220
2221	if (snap_id == CEPH_NOSNAP) {
2222		/*
2223		 * Protect against possible race conditions during lock
2224		 * ownership transitions.
2225		 */
2226		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2227					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2228		if (ret)
2229			return ret;
2230	}
2231
2232	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2233					new_state, current_state);
2234	if (ret)
2235		return ret;
2236
2237	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2238	if (ret)
2239		return ret;
2240
2241	ceph_osdc_start_request(osdc, req, false);
2242	return 0;
2243}
2244
2245static void prune_extents(struct ceph_file_extent *img_extents,
2246			  u32 *num_img_extents, u64 overlap)
2247{
2248	u32 cnt = *num_img_extents;
2249
2250	/* drop extents completely beyond the overlap */
2251	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2252		cnt--;
2253
2254	if (cnt) {
2255		struct ceph_file_extent *ex = &img_extents[cnt - 1];
2256
2257		/* trim final overlapping extent */
2258		if (ex->fe_off + ex->fe_len > overlap)
2259			ex->fe_len = overlap - ex->fe_off;
2260	}
2261
2262	*num_img_extents = cnt;
2263}
2264
2265/*
2266 * Determine the byte range(s) covered by either just the object extent
2267 * or the entire object in the parent image.
2268 */
2269static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2270				    bool entire)
2271{
2272	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2273	int ret;
2274
2275	if (!rbd_dev->parent_overlap)
2276		return 0;
2277
2278	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2279				  entire ? 0 : obj_req->ex.oe_off,
2280				  entire ? rbd_dev->layout.object_size :
2281							obj_req->ex.oe_len,
2282				  &obj_req->img_extents,
2283				  &obj_req->num_img_extents);
2284	if (ret)
2285		return ret;
2286
2287	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2288		      rbd_dev->parent_overlap);
2289	return 0;
2290}
2291
2292static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
2293{
2294	struct rbd_obj_request *obj_req = osd_req->r_priv;
2295
2296	switch (obj_req->img_request->data_type) {
2297	case OBJ_REQUEST_BIO:
2298		osd_req_op_extent_osd_data_bio(osd_req, which,
2299					       &obj_req->bio_pos,
2300					       obj_req->ex.oe_len);
2301		break;
2302	case OBJ_REQUEST_BVECS:
2303	case OBJ_REQUEST_OWN_BVECS:
2304		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
2305							obj_req->ex.oe_len);
2306		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2307		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
2308						    &obj_req->bvec_pos);
2309		break;
2310	default:
2311		BUG();
2312	}
2313}
2314
2315static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
2316{
2317	struct page **pages;
2318
2319	/*
2320	 * The response data for a STAT call consists of:
2321	 *     le64 length;
2322	 *     struct {
2323	 *         le32 tv_sec;
2324	 *         le32 tv_nsec;
2325	 *     } mtime;
2326	 */
2327	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2328	if (IS_ERR(pages))
2329		return PTR_ERR(pages);
2330
2331	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2332	osd_req_op_raw_data_in_pages(osd_req, which, pages,
2333				     8 + sizeof(struct ceph_timespec),
2334				     0, false, true);
2335	return 0;
2336}
2337
2338static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2339				u32 bytes)
2340{
2341	struct rbd_obj_request *obj_req = osd_req->r_priv;
2342	int ret;
2343
2344	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2345	if (ret)
2346		return ret;
2347
2348	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2349					  obj_req->copyup_bvec_count, bytes);
2350	return 0;
2351}
2352
2353static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
2354{
2355	obj_req->read_state = RBD_OBJ_READ_START;
2356	return 0;
2357}
2358
2359static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2360				      int which)
2361{
2362	struct rbd_obj_request *obj_req = osd_req->r_priv;
2363	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2364	u16 opcode;
2365
2366	if (!use_object_map(rbd_dev) ||
2367	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2368		osd_req_op_alloc_hint_init(osd_req, which++,
2369					   rbd_dev->layout.object_size,
2370					   rbd_dev->layout.object_size);
2371	}
2372
2373	if (rbd_obj_is_entire(obj_req))
2374		opcode = CEPH_OSD_OP_WRITEFULL;
2375	else
2376		opcode = CEPH_OSD_OP_WRITE;
2377
2378	osd_req_op_extent_init(osd_req, which, opcode,
2379			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2380	rbd_osd_setup_data(osd_req, which);
2381}
2382
2383static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
2384{
2385	int ret;
2386
2387	/* reverse map the entire object onto the parent */
2388	ret = rbd_obj_calc_img_extents(obj_req, true);
2389	if (ret)
2390		return ret;
2391
2392	if (rbd_obj_copyup_enabled(obj_req))
2393		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2394
2395	obj_req->write_state = RBD_OBJ_WRITE_START;
2396	return 0;
2397}
2398
2399static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2400{
2401	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2402					  CEPH_OSD_OP_ZERO;
2403}
2404
2405static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2406					int which)
2407{
2408	struct rbd_obj_request *obj_req = osd_req->r_priv;
2409
2410	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2411		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2412		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
2413	} else {
2414		osd_req_op_extent_init(osd_req, which,
2415				       truncate_or_zero_opcode(obj_req),
2416				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2417				       0, 0);
2418	}
2419}
2420
2421static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
2422{
2423	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2424	u64 off, next_off;
2425	int ret;
2426
2427	/*
2428	 * Align the range to alloc_size boundary and punt on discards
2429	 * that are too small to free up any space.
2430	 *
2431	 * alloc_size == object_size && is_tail() is a special case for
2432	 * filestore with filestore_punch_hole = false, needed to allow
2433	 * truncate (in addition to delete).
2434	 */
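	/*
	 * For example (illustrative numbers): with alloc_size = 64K, a
	 * discard of 10K~100K rounds to off = 64K, next_off = 64K, so
	 * off >= next_off and the request is dropped as too small,
	 * while 10K~200K is trimmed to 64K~128K.
	 */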
2435	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2436	    !rbd_obj_is_tail(obj_req)) {
2437		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2438		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2439				      rbd_dev->opts->alloc_size);
2440		if (off >= next_off)
2441			return 1;
2442
2443		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2444		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2445		     off, next_off - off);
2446		obj_req->ex.oe_off = off;
2447		obj_req->ex.oe_len = next_off - off;
2448	}
2449
2450	/* reverse map the entire object onto the parent */
2451	ret = rbd_obj_calc_img_extents(obj_req, true);
2452	if (ret)
2453		return ret;
2454
2455	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2456	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2457		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2458
2459	obj_req->write_state = RBD_OBJ_WRITE_START;
2460	return 0;
2461}
2462
2463static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2464					int which)
2465{
2466	struct rbd_obj_request *obj_req = osd_req->r_priv;
2467	u16 opcode;
2468
2469	if (rbd_obj_is_entire(obj_req)) {
2470		if (obj_req->num_img_extents) {
2471			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2472				osd_req_op_init(osd_req, which++,
2473						CEPH_OSD_OP_CREATE, 0);
2474			opcode = CEPH_OSD_OP_TRUNCATE;
2475		} else {
2476			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2477			osd_req_op_init(osd_req, which++,
2478					CEPH_OSD_OP_DELETE, 0);
2479			opcode = 0;
2480		}
2481	} else {
2482		opcode = truncate_or_zero_opcode(obj_req);
2483	}
2484
2485	if (opcode)
2486		osd_req_op_extent_init(osd_req, which, opcode,
2487				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2488				       0, 0);
2489}
2490
2491static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
2492{
2493	int ret;
2494
2495	/* reverse map the entire object onto the parent */
2496	ret = rbd_obj_calc_img_extents(obj_req, true);
2497	if (ret)
2498		return ret;
2499
2500	if (rbd_obj_copyup_enabled(obj_req))
2501		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2502	if (!obj_req->num_img_extents) {
2503		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2504		if (rbd_obj_is_entire(obj_req))
2505			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2506	}
2507
2508	obj_req->write_state = RBD_OBJ_WRITE_START;
2509	return 0;
2510}
2511
2512static int count_write_ops(struct rbd_obj_request *obj_req)
2513{
2514	struct rbd_img_request *img_req = obj_req->img_request;
2515
2516	switch (img_req->op_type) {
2517	case OBJ_OP_WRITE:
2518		if (!use_object_map(img_req->rbd_dev) ||
2519		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2520			return 2; /* setallochint + write/writefull */
2521
2522		return 1; /* write/writefull */
2523	case OBJ_OP_DISCARD:
2524		return 1; /* delete/truncate/zero */
2525	case OBJ_OP_ZEROOUT:
2526		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2527		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2528			return 2; /* create + truncate */
2529
2530		return 1; /* delete/truncate/zero */
2531	default:
2532		BUG();
2533	}
2534}
2535
2536static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2537				    int which)
2538{
2539	struct rbd_obj_request *obj_req = osd_req->r_priv;
2540
2541	switch (obj_req->img_request->op_type) {
2542	case OBJ_OP_WRITE:
2543		__rbd_osd_setup_write_ops(osd_req, which);
2544		break;
2545	case OBJ_OP_DISCARD:
2546		__rbd_osd_setup_discard_ops(osd_req, which);
2547		break;
2548	case OBJ_OP_ZEROOUT:
2549		__rbd_osd_setup_zeroout_ops(osd_req, which);
2550		break;
2551	default:
2552		BUG();
2553	}
2554}
2555
2556/*
2557 * Prune the list of object requests (adjust offset and/or length, drop
2558 * redundant requests).  Prepare object request state machines and image
2559 * request state machine for execution.
2560 */
2561static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2562{
2563	struct rbd_obj_request *obj_req, *next_obj_req;
2564	int ret;
2565
2566	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
2567		switch (img_req->op_type) {
2568		case OBJ_OP_READ:
2569			ret = rbd_obj_init_read(obj_req);
2570			break;
2571		case OBJ_OP_WRITE:
2572			ret = rbd_obj_init_write(obj_req);
2573			break;
2574		case OBJ_OP_DISCARD:
2575			ret = rbd_obj_init_discard(obj_req);
2576			break;
2577		case OBJ_OP_ZEROOUT:
2578			ret = rbd_obj_init_zeroout(obj_req);
2579			break;
2580		default:
2581			BUG();
2582		}
2583		if (ret < 0)
2584			return ret;
2585		if (ret > 0) {
2586			rbd_img_obj_request_del(img_req, obj_req);
2587			continue;
2588		}
2589	}
2590
2591	img_req->state = RBD_IMG_START;
2592	return 0;
2593}
2594
2595union rbd_img_fill_iter {
2596	struct ceph_bio_iter	bio_iter;
2597	struct ceph_bvec_iter	bvec_iter;
2598};
2599
2600struct rbd_img_fill_ctx {
2601	enum obj_request_type	pos_type;
2602	union rbd_img_fill_iter	*pos;
2603	union rbd_img_fill_iter	iter;
2604	ceph_object_extent_fn_t	set_pos_fn;
2605	ceph_object_extent_fn_t	count_fn;
2606	ceph_object_extent_fn_t	copy_fn;
2607};
2608
2609static struct ceph_object_extent *alloc_object_extent(void *arg)
2610{
2611	struct rbd_img_request *img_req = arg;
2612	struct rbd_obj_request *obj_req;
2613
2614	obj_req = rbd_obj_request_create();
2615	if (!obj_req)
2616		return NULL;
2617
2618	rbd_img_obj_request_add(img_req, obj_req);
2619	return &obj_req->ex;
2620}
2621
2622/*
2623 * While su != os && sc == 1 is technically not fancy (it's the same
2624 * layout as su == os && sc == 1), we can't use the nocopy path for it
2625 * because ->set_pos_fn() should be called only once per object.
2626 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2627 * treat su != os && sc == 1 as fancy.
2628 */
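/*
 * For instance (illustrative): su = 1M, os = 4M, sc = 1 results in four
 * action_fn calls for a fully covered 4M object, so it goes through the
 * copy path even though the data is not really striped across objects.
 */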
2629static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2630{
2631	return l->stripe_unit != l->object_size;
2632}
2633
2634static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2635				       struct ceph_file_extent *img_extents,
2636				       u32 num_img_extents,
2637				       struct rbd_img_fill_ctx *fctx)
2638{
2639	u32 i;
2640	int ret;
2641
2642	img_req->data_type = fctx->pos_type;
2643
2644	/*
2645	 * Create object requests and set each object request's starting
2646	 * position in the provided bio (list) or bio_vec array.
2647	 */
2648	fctx->iter = *fctx->pos;
2649	for (i = 0; i < num_img_extents; i++) {
2650		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2651					   img_extents[i].fe_off,
2652					   img_extents[i].fe_len,
2653					   &img_req->object_extents,
2654					   alloc_object_extent, img_req,
2655					   fctx->set_pos_fn, &fctx->iter);
2656		if (ret)
2657			return ret;
2658	}
2659
2660	return __rbd_img_fill_request(img_req);
2661}
2662
2663/*
2664 * Map a list of image extents to a list of object extents, create the
2665 * corresponding object requests (normally each to a different object,
2666 * but not always) and add them to @img_req.  For each object request,
2667 * set up its data descriptor to point to the corresponding chunk(s) of
2668 * @fctx->pos data buffer.
2669 *
2670 * Because ceph_file_to_extents() will merge adjacent object extents
2671 * together, each object request's data descriptor may point to multiple
2672 * different chunks of @fctx->pos data buffer.
2673 *
2674 * @fctx->pos data buffer is assumed to be large enough.
2675 */
2676static int rbd_img_fill_request(struct rbd_img_request *img_req,
2677				struct ceph_file_extent *img_extents,
2678				u32 num_img_extents,
2679				struct rbd_img_fill_ctx *fctx)
2680{
2681	struct rbd_device *rbd_dev = img_req->rbd_dev;
2682	struct rbd_obj_request *obj_req;
2683	u32 i;
2684	int ret;
2685
2686	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2687	    !rbd_layout_is_fancy(&rbd_dev->layout))
2688		return rbd_img_fill_request_nocopy(img_req, img_extents,
2689						   num_img_extents, fctx);
2690
2691	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2692
2693	/*
2694	 * Create object requests and determine ->bvec_count for each object
2695	 * request.  Note that ->bvec_count sum over all object requests may
2696	 * be greater than the number of bio_vecs in the provided bio (list)
2697	 * or bio_vec array because when mapped, those bio_vecs can straddle
2698	 * stripe unit boundaries.
2699	 */
2700	fctx->iter = *fctx->pos;
2701	for (i = 0; i < num_img_extents; i++) {
2702		ret = ceph_file_to_extents(&rbd_dev->layout,
2703					   img_extents[i].fe_off,
2704					   img_extents[i].fe_len,
2705					   &img_req->object_extents,
2706					   alloc_object_extent, img_req,
2707					   fctx->count_fn, &fctx->iter);
2708		if (ret)
2709			return ret;
2710	}
2711
2712	for_each_obj_request(img_req, obj_req) {
2713		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2714					      sizeof(*obj_req->bvec_pos.bvecs),
2715					      GFP_NOIO);
2716		if (!obj_req->bvec_pos.bvecs)
2717			return -ENOMEM;
2718	}
2719
2720	/*
2721	 * Fill in each object request's private bio_vec array, splitting and
2722	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2723	 */
2724	fctx->iter = *fctx->pos;
2725	for (i = 0; i < num_img_extents; i++) {
2726		ret = ceph_iterate_extents(&rbd_dev->layout,
2727					   img_extents[i].fe_off,
2728					   img_extents[i].fe_len,
2729					   &img_req->object_extents,
2730					   fctx->copy_fn, &fctx->iter);
2731		if (ret)
2732			return ret;
2733	}
2734
2735	return __rbd_img_fill_request(img_req);
2736}
2737
2738static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2739			       u64 off, u64 len)
2740{
2741	struct ceph_file_extent ex = { off, len };
2742	union rbd_img_fill_iter dummy;
2743	struct rbd_img_fill_ctx fctx = {
2744		.pos_type = OBJ_REQUEST_NODATA,
2745		.pos = &dummy,
2746	};
2747
2748	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2749}
2750
2751static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2752{
2753	struct rbd_obj_request *obj_req =
2754	    container_of(ex, struct rbd_obj_request, ex);
2755	struct ceph_bio_iter *it = arg;
2756
2757	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2758	obj_req->bio_pos = *it;
2759	ceph_bio_iter_advance(it, bytes);
2760}
2761
2762static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2763{
2764	struct rbd_obj_request *obj_req =
2765	    container_of(ex, struct rbd_obj_request, ex);
2766	struct ceph_bio_iter *it = arg;
2767
2768	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2769	ceph_bio_iter_advance_step(it, bytes, ({
2770		obj_req->bvec_count++;
2771	}));
2772
2773}
2774
2775static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2776{
2777	struct rbd_obj_request *obj_req =
2778	    container_of(ex, struct rbd_obj_request, ex);
2779	struct ceph_bio_iter *it = arg;
2780
2781	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2782	ceph_bio_iter_advance_step(it, bytes, ({
2783		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2784		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2785	}));
2786}
2787
2788static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2789				   struct ceph_file_extent *img_extents,
2790				   u32 num_img_extents,
2791				   struct ceph_bio_iter *bio_pos)
2792{
2793	struct rbd_img_fill_ctx fctx = {
2794		.pos_type = OBJ_REQUEST_BIO,
2795		.pos = (union rbd_img_fill_iter *)bio_pos,
2796		.set_pos_fn = set_bio_pos,
2797		.count_fn = count_bio_bvecs,
2798		.copy_fn = copy_bio_bvecs,
2799	};
2800
2801	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2802				    &fctx);
2803}
2804
2805static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2806				 u64 off, u64 len, struct bio *bio)
2807{
2808	struct ceph_file_extent ex = { off, len };
2809	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2810
2811	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2812}
2813
2814static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2815{
2816	struct rbd_obj_request *obj_req =
2817	    container_of(ex, struct rbd_obj_request, ex);
2818	struct ceph_bvec_iter *it = arg;
2819
2820	obj_req->bvec_pos = *it;
2821	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2822	ceph_bvec_iter_advance(it, bytes);
2823}
2824
2825static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2826{
2827	struct rbd_obj_request *obj_req =
2828	    container_of(ex, struct rbd_obj_request, ex);
2829	struct ceph_bvec_iter *it = arg;
2830
2831	ceph_bvec_iter_advance_step(it, bytes, ({
2832		obj_req->bvec_count++;
2833	}));
2834}
2835
2836static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2837{
2838	struct rbd_obj_request *obj_req =
2839	    container_of(ex, struct rbd_obj_request, ex);
2840	struct ceph_bvec_iter *it = arg;
2841
2842	ceph_bvec_iter_advance_step(it, bytes, ({
2843		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2844		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2845	}));
2846}
2847
2848static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2849				     struct ceph_file_extent *img_extents,
2850				     u32 num_img_extents,
2851				     struct ceph_bvec_iter *bvec_pos)
2852{
2853	struct rbd_img_fill_ctx fctx = {
2854		.pos_type = OBJ_REQUEST_BVECS,
2855		.pos = (union rbd_img_fill_iter *)bvec_pos,
2856		.set_pos_fn = set_bvec_pos,
2857		.count_fn = count_bvecs,
2858		.copy_fn = copy_bvecs,
2859	};
2860
2861	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2862				    &fctx);
2863}
2864
2865static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2866				   struct ceph_file_extent *img_extents,
2867				   u32 num_img_extents,
2868				   struct bio_vec *bvecs)
2869{
2870	struct ceph_bvec_iter it = {
2871		.bvecs = bvecs,
2872		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2873							     num_img_extents) },
2874	};
2875
2876	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2877					 &it);
2878}
2879
2880static void rbd_img_handle_request_work(struct work_struct *work)
2881{
2882	struct rbd_img_request *img_req =
2883	    container_of(work, struct rbd_img_request, work);
2884
2885	rbd_img_handle_request(img_req, img_req->work_result);
2886}
2887
2888static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2889{
2890	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2891	img_req->work_result = result;
2892	queue_work(rbd_wq, &img_req->work);
2893}
2894
2895static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2896{
2897	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2898
2899	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2900		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2901		return true;
2902	}
2903
2904	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2905	     obj_req->ex.oe_objno);
2906	return false;
2907}
2908
2909static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2910{
2911	struct ceph_osd_request *osd_req;
2912	int ret;
2913
2914	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2915	if (IS_ERR(osd_req))
2916		return PTR_ERR(osd_req);
2917
2918	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2919			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2920	rbd_osd_setup_data(osd_req, 0);
2921	rbd_osd_format_read(osd_req);
2922
2923	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2924	if (ret)
2925		return ret;
2926
2927	rbd_osd_submit(osd_req);
2928	return 0;
2929}
2930
2931static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2932{
2933	struct rbd_img_request *img_req = obj_req->img_request;
2934	struct rbd_img_request *child_img_req;
2935	int ret;
2936
2937	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2938					       OBJ_OP_READ, NULL);
2939	if (!child_img_req)
2940		return -ENOMEM;
2941
2942	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2943	child_img_req->obj_request = obj_req;
2944
2945	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2946	     obj_req);
2947
2948	if (!rbd_img_is_write(img_req)) {
2949		switch (img_req->data_type) {
2950		case OBJ_REQUEST_BIO:
2951			ret = __rbd_img_fill_from_bio(child_img_req,
2952						      obj_req->img_extents,
2953						      obj_req->num_img_extents,
2954						      &obj_req->bio_pos);
2955			break;
2956		case OBJ_REQUEST_BVECS:
2957		case OBJ_REQUEST_OWN_BVECS:
2958			ret = __rbd_img_fill_from_bvecs(child_img_req,
2959						      obj_req->img_extents,
2960						      obj_req->num_img_extents,
2961						      &obj_req->bvec_pos);
2962			break;
2963		default:
2964			BUG();
2965		}
2966	} else {
2967		ret = rbd_img_fill_from_bvecs(child_img_req,
2968					      obj_req->img_extents,
2969					      obj_req->num_img_extents,
2970					      obj_req->copyup_bvecs);
2971	}
2972	if (ret) {
2973		rbd_img_request_put(child_img_req);
2974		return ret;
2975	}
2976
2977	/* avoid parent chain recursion */
2978	rbd_img_schedule(child_img_req, 0);
2979	return 0;
2980}
2981
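/*
 * Annotation: object read state machine.  START submits the OSD read
 * (or skips it if the object map says the object cannot exist), OBJECT
 * handles the result -- possibly redirecting to the parent image via a
 * child image request -- and PARENT zero-fills anything past the parent
 * overlap.  Returns true once the object request is complete.
 */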
2982static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
2983{
2984	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2985	int ret;
2986
2987again:
2988	switch (obj_req->read_state) {
2989	case RBD_OBJ_READ_START:
2990		rbd_assert(!*result);
2991
2992		if (!rbd_obj_may_exist(obj_req)) {
2993			*result = -ENOENT;
2994			obj_req->read_state = RBD_OBJ_READ_OBJECT;
2995			goto again;
2996		}
2997
2998		ret = rbd_obj_read_object(obj_req);
2999		if (ret) {
3000			*result = ret;
3001			return true;
3002		}
3003		obj_req->read_state = RBD_OBJ_READ_OBJECT;
3004		return false;
3005	case RBD_OBJ_READ_OBJECT:
3006		if (*result == -ENOENT && rbd_dev->parent_overlap) {
3007			/* reverse map this object extent onto the parent */
3008			ret = rbd_obj_calc_img_extents(obj_req, false);
3009			if (ret) {
3010				*result = ret;
3011				return true;
3012			}
3013			if (obj_req->num_img_extents) {
3014				ret = rbd_obj_read_from_parent(obj_req);
3015				if (ret) {
3016					*result = ret;
3017					return true;
3018				}
3019				obj_req->read_state = RBD_OBJ_READ_PARENT;
3020				return false;
3021			}
3022		}
3023
3024		/*
3025		 * -ENOENT means a hole in the image -- zero-fill the entire
3026		 * length of the request.  A short read also implies zero-fill
3027		 * to the end of the request.
3028		 */
3029		if (*result == -ENOENT) {
3030			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
3031			*result = 0;
3032		} else if (*result >= 0) {
3033			if (*result < obj_req->ex.oe_len)
3034				rbd_obj_zero_range(obj_req, *result,
3035						obj_req->ex.oe_len - *result);
3036			else
3037				rbd_assert(*result == obj_req->ex.oe_len);
3038			*result = 0;
3039		}
3040		return true;
3041	case RBD_OBJ_READ_PARENT:
3042		/*
3043		 * The parent image is read only up to the overlap -- zero-fill
3044		 * from the overlap to the end of the request.
3045		 */
3046		if (!*result) {
3047			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
3048
3049			if (obj_overlap < obj_req->ex.oe_len)
3050				rbd_obj_zero_range(obj_req, obj_overlap,
3051					    obj_req->ex.oe_len - obj_overlap);
3052		}
3053		return true;
3054	default:
3055		BUG();
3056	}
3057}
3058
3059static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
3060{
3061	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3062
3063	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
3064		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
3065
3066	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
3067	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
3068		dout("%s %p noop for nonexistent\n", __func__, obj_req);
3069		return true;
3070	}
3071
3072	return false;
3073}
3074
3075/*
3076 * Return:
3077 *   0 - object map update sent
3078 *   1 - object map update isn't needed
3079 *  <0 - error
3080 */
3081static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
3082{
3083	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3084	u8 new_state;
3085
3086	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3087		return 1;
3088
3089	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3090		new_state = OBJECT_PENDING;
3091	else
3092		new_state = OBJECT_EXISTS;
3093
3094	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3095}
3096
3097static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3098{
3099	struct ceph_osd_request *osd_req;
3100	int num_ops = count_write_ops(obj_req);
3101	int which = 0;
3102	int ret;
3103
3104	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3105		num_ops++; /* stat */
3106
3107	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3108	if (IS_ERR(osd_req))
3109		return PTR_ERR(osd_req);
3110
3111	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3112		ret = rbd_osd_setup_stat(osd_req, which++);
3113		if (ret)
3114			return ret;
3115	}
3116
3117	rbd_osd_setup_write_ops(osd_req, which);
3118	rbd_osd_format_write(osd_req);
3119
3120	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3121	if (ret)
3122		return ret;
3123
3124	rbd_osd_submit(osd_req);
3125	return 0;
3126}
3127
3128/*
3129 * copyup_bvecs pages are never highmem pages
3130 */
3131static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3132{
3133	struct ceph_bvec_iter it = {
3134		.bvecs = bvecs,
3135		.iter = { .bi_size = bytes },
3136	};
3137
3138	ceph_bvec_iter_advance_step(&it, bytes, ({
3139		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3140			       bv.bv_len))
3141			return false;
3142	}));
3143	return true;
3144}
3145
3146#define MODS_ONLY	U32_MAX
3147
3148static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3149				      u32 bytes)
3150{
3151	struct ceph_osd_request *osd_req;
3152	int ret;
3153
3154	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3155	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
3156
3157	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3158	if (IS_ERR(osd_req))
3159		return PTR_ERR(osd_req);
3160
3161	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3162	if (ret)
3163		return ret;
3164
3165	rbd_osd_format_write(osd_req);
3166
3167	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3168	if (ret)
3169		return ret;
3170
3171	rbd_osd_submit(osd_req);
3172	return 0;
3173}
3174
3175static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3176					u32 bytes)
3177{
3178	struct ceph_osd_request *osd_req;
3179	int num_ops = count_write_ops(obj_req);
3180	int which = 0;
3181	int ret;
3182
3183	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3184
3185	if (bytes != MODS_ONLY)
3186		num_ops++; /* copyup */
3187
3188	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3189	if (IS_ERR(osd_req))
3190		return PTR_ERR(osd_req);
3191
3192	if (bytes != MODS_ONLY) {
3193		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
3194		if (ret)
3195			return ret;
3196	}
3197
3198	rbd_osd_setup_write_ops(osd_req, which);
3199	rbd_osd_format_write(osd_req);
3200
3201	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3202	if (ret)
3203		return ret;
3204
3205	rbd_osd_submit(osd_req);
3206	return 0;
3207}
3208
3209static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3210{
3211	u32 i;
3212
3213	rbd_assert(!obj_req->copyup_bvecs);
3214	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3215	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3216					sizeof(*obj_req->copyup_bvecs),
3217					GFP_NOIO);
3218	if (!obj_req->copyup_bvecs)
3219		return -ENOMEM;
3220
3221	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3222		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3223
3224		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3225		if (!obj_req->copyup_bvecs[i].bv_page)
3226			return -ENOMEM;
3227
3228		obj_req->copyup_bvecs[i].bv_offset = 0;
3229		obj_req->copyup_bvecs[i].bv_len = len;
3230		obj_overlap -= len;
3231	}
3232
3233	rbd_assert(!obj_overlap);
3234	return 0;
3235}
3236
3237/*
3238 * The target object doesn't exist.  Read the data for the entire
3239 * target object up to the overlap point (if any) from the parent,
3240 * so we can use it for a copyup.
3241 */
3242static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
3243{
3244	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3245	int ret;
3246
3247	rbd_assert(obj_req->num_img_extents);
3248	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3249		      rbd_dev->parent_overlap);
3250	if (!obj_req->num_img_extents) {
3251		/*
3252		 * The overlap has become 0 (most likely because the
3253		 * image has been flattened).  Re-submit the original write
3254		 * request -- pass MODS_ONLY since the copyup isn't needed
3255		 * anymore.
3256		 */
3257		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
3258	}
3259
3260	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
3261	if (ret)
3262		return ret;
3263
3264	return rbd_obj_read_from_parent(obj_req);
3265}
3266
3267static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
3268{
3269	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3270	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3271	u8 new_state;
3272	u32 i;
3273	int ret;
3274
3275	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3276
3277	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3278		return;
3279
3280	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3281		return;
3282
3283	for (i = 0; i < snapc->num_snaps; i++) {
3284		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3285		    i + 1 < snapc->num_snaps)
3286			new_state = OBJECT_EXISTS_CLEAN;
3287		else
3288			new_state = OBJECT_EXISTS;
3289
3290		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3291					    new_state, NULL);
3292		if (ret < 0) {
3293			obj_req->pending.result = ret;
3294			return;
3295		}
3296
3297		rbd_assert(!ret);
3298		obj_req->pending.num_pending++;
3299	}
3300}
3301
3302static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3303{
3304	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3305	int ret;
3306
3307	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3308
3309	/*
3310	 * Only send non-zero copyup data to save some I/O and network
3311	 * bandwidth -- zero copyup data is equivalent to the object not
3312	 * existing.
3313	 */
3314	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3315		bytes = 0;
3316
3317	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3318		/*
3319		 * Send a copyup request with an empty snapshot context to
3320		 * deep-copyup the object through all existing snapshots.
3321		 * A second request with the current snapshot context will be
3322		 * sent for the actual modification.
3323		 */
3324		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3325		if (ret) {
3326			obj_req->pending.result = ret;
3327			return;
3328		}
3329
3330		obj_req->pending.num_pending++;
3331		bytes = MODS_ONLY;
3332	}
3333
3334	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3335	if (ret) {
3336		obj_req->pending.result = ret;
3337		return;
3338	}
3339
3340	obj_req->pending.num_pending++;
3341}
3342
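/*
 * Annotation: copyup state machine.  READ_PARENT brings in the parent
 * data (and notes if it is all zeros), OBJECT_MAPS marks the object as
 * existing in every affected snapshot object map, and WRITE_OBJECT
 * issues the actual copyup + modification -- a deep-copyup with an
 * empty snapshot context first if snapshots exist and the data is
 * non-zero, then the modification under the current snapshot context.
 */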
3343static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3344{
3345	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3346	int ret;
3347
3348again:
3349	switch (obj_req->copyup_state) {
3350	case RBD_OBJ_COPYUP_START:
3351		rbd_assert(!*result);
3352
3353		ret = rbd_obj_copyup_read_parent(obj_req);
3354		if (ret) {
3355			*result = ret;
3356			return true;
3357		}
3358		if (obj_req->num_img_extents)
3359			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3360		else
3361			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3362		return false;
3363	case RBD_OBJ_COPYUP_READ_PARENT:
3364		if (*result)
3365			return true;
3366
3367		if (is_zero_bvecs(obj_req->copyup_bvecs,
3368				  rbd_obj_img_extents_bytes(obj_req))) {
3369			dout("%s %p detected zeros\n", __func__, obj_req);
3370			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3371		}
3372
3373		rbd_obj_copyup_object_maps(obj_req);
3374		if (!obj_req->pending.num_pending) {
3375			*result = obj_req->pending.result;
3376			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3377			goto again;
3378		}
3379		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3380		return false;
3381	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3382		if (!pending_result_dec(&obj_req->pending, result))
3383			return false;
3384		/* fall through */
3385	case RBD_OBJ_COPYUP_OBJECT_MAPS:
3386		if (*result) {
3387			rbd_warn(rbd_dev, "snap object map update failed: %d",
3388				 *result);
3389			return true;
3390		}
3391
3392		rbd_obj_copyup_write_object(obj_req);
3393		if (!obj_req->pending.num_pending) {
3394			*result = obj_req->pending.result;
3395			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3396			goto again;
3397		}
3398		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3399		return false;
3400	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3401		if (!pending_result_dec(&obj_req->pending, result))
3402			return false;
3403		/* fall through */
3404	case RBD_OBJ_COPYUP_WRITE_OBJECT:
3405		return true;
3406	default:
3407		BUG();
3408	}
3409}
3410
3411/*
3412 * Return:
3413 *   0 - object map update sent
3414 *   1 - object map update isn't needed
3415 *  <0 - error
3416 */
3417static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
3418{
3419	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3420	u8 current_state = OBJECT_PENDING;
3421
3422	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3423		return 1;
3424
3425	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3426		return 1;
3427
3428	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3429				     &current_state);
3430}
3431
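/*
 * Annotation: object write state machine.  START checks for a no-op,
 * PRE_OBJECT_MAP marks the object EXISTS (or PENDING for deletions),
 * OBJECT issues the write/discard/zeroout OSD request, COPYUP kicks in
 * on -ENOENT when a layered write needs parent data, and
 * POST_OBJECT_MAP moves deleted objects back to NONEXISTENT.
 */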
3432static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3433{
3434	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3435	int ret;
3436
3437again:
3438	switch (obj_req->write_state) {
3439	case RBD_OBJ_WRITE_START:
3440		rbd_assert(!*result);
3441
3442		if (rbd_obj_write_is_noop(obj_req))
3443			return true;
3444
3445		ret = rbd_obj_write_pre_object_map(obj_req);
3446		if (ret < 0) {
3447			*result = ret;
3448			return true;
3449		}
3450		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3451		if (ret > 0)
3452			goto again;
3453		return false;
3454	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3455		if (*result) {
3456			rbd_warn(rbd_dev, "pre object map update failed: %d",
3457				 *result);
3458			return true;
3459		}
3460		ret = rbd_obj_write_object(obj_req);
3461		if (ret) {
3462			*result = ret;
3463			return true;
3464		}
3465		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3466		return false;
3467	case RBD_OBJ_WRITE_OBJECT:
3468		if (*result == -ENOENT) {
3469			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3470				*result = 0;
3471				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3472				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3473				goto again;
3474			}
3475			/*
3476			 * On a non-existent object:
3477			 *   delete returns -ENOENT, truncate/zero returns 0
3478			 */
3479			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3480				*result = 0;
3481		}
3482		if (*result)
3483			return true;
3484
3485		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3486		goto again;
3487	case __RBD_OBJ_WRITE_COPYUP:
3488		if (!rbd_obj_advance_copyup(obj_req, result))
3489			return false;
3490		/* fall through */
3491	case RBD_OBJ_WRITE_COPYUP:
3492		if (*result) {
3493			rbd_warn(rbd_dev, "copyup failed: %d", *result);
3494			return true;
3495		}
3496		ret = rbd_obj_write_post_object_map(obj_req);
3497		if (ret < 0) {
3498			*result = ret;
3499			return true;
3500		}
3501		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3502		if (ret > 0)
3503			goto again;
3504		return false;
3505	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3506		if (*result)
3507			rbd_warn(rbd_dev, "post object map update failed: %d",
3508				 *result);
3509		return true;
3510	default:
3511		BUG();
3512	}
3513}
3514
3515/*
3516 * Return true if @obj_req is completed.
3517 */
3518static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3519				     int *result)
3520{
3521	struct rbd_img_request *img_req = obj_req->img_request;
3522	struct rbd_device *rbd_dev = img_req->rbd_dev;
3523	bool done;
3524
3525	mutex_lock(&obj_req->state_mutex);
3526	if (!rbd_img_is_write(img_req))
3527		done = rbd_obj_advance_read(obj_req, result);
3528	else
3529		done = rbd_obj_advance_write(obj_req, result);
3530	mutex_unlock(&obj_req->state_mutex);
3531
3532	if (done && *result) {
3533		rbd_assert(*result < 0);
3534		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3535			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3536			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3537	}
3538	return done;
3539}
3540
3541/*
3542 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3543 * recursion.
3544 */
3545static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3546{
3547	if (__rbd_obj_handle_request(obj_req, &result))
3548		rbd_img_handle_request(obj_req->img_request, result);
3549}
3550
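/*
 * Annotation: the exclusive lock is only taken for the mapped HEAD of
 * an image with the exclusive-lock feature enabled -- always for
 * writes, and for reads only when lock_on_read is set or the object map
 * is in use.
 */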
3551static bool need_exclusive_lock(struct rbd_img_request *img_req)
3552{
3553	struct rbd_device *rbd_dev = img_req->rbd_dev;
3554
3555	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3556		return false;
3557
3558	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
3559		return false;
3560
3561	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
3562	if (rbd_dev->opts->lock_on_read ||
3563	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3564		return true;
3565
3566	return rbd_img_is_write(img_req);
3567}
3568
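/*
 * Annotation: register an image request with the lock state.  If the
 * lock is not held yet, the request is parked on acquiring_list until
 * the lock is acquired; otherwise it goes straight onto running_list.
 * Returns whether the lock is currently held.
 */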
3569static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3570{
3571	struct rbd_device *rbd_dev = img_req->rbd_dev;
3572	bool locked;
3573
3574	lockdep_assert_held(&rbd_dev->lock_rwsem);
3575	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3576	spin_lock(&rbd_dev->lock_lists_lock);
3577	rbd_assert(list_empty(&img_req->lock_item));
3578	if (!locked)
3579		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3580	else
3581		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3582	spin_unlock(&rbd_dev->lock_lists_lock);
3583	return locked;
3584}
3585
3586static void rbd_lock_del_request(struct rbd_img_request *img_req)
3587{
3588	struct rbd_device *rbd_dev = img_req->rbd_dev;
3589	bool need_wakeup;
3590
3591	lockdep_assert_held(&rbd_dev->lock_rwsem);
3592	spin_lock(&rbd_dev->lock_lists_lock);
3593	rbd_assert(!list_empty(&img_req->lock_item));
3594	list_del_init(&img_req->lock_item);
3595	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3596		       list_empty(&rbd_dev->running_list));
3597	spin_unlock(&rbd_dev->lock_lists_lock);
3598	if (need_wakeup)
3599		complete(&rbd_dev->releasing_wait);
3600}
3601
3602static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3603{
3604	struct rbd_device *rbd_dev = img_req->rbd_dev;
3605
3606	if (!need_exclusive_lock(img_req))
3607		return 1;
3608
3609	if (rbd_lock_add_request(img_req))
3610		return 1;
3611
3612	if (rbd_dev->opts->exclusive) {
3613		WARN_ON(1); /* lock got released? */
3614		return -EROFS;
3615	}
3616
3617	/*
3618	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3619	 * and cancel_delayed_work() in wake_lock_waiters().
3620	 */
3621	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3622	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3623	return 0;
3624}
3625
3626static void rbd_img_object_requests(struct rbd_img_request *img_req)
3627{
3628	struct rbd_obj_request *obj_req;
3629
3630	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3631
3632	for_each_obj_request(img_req, obj_req) {
3633		int result = 0;
3634
3635		if (__rbd_obj_handle_request(obj_req, &result)) {
3636			if (result) {
3637				img_req->pending.result = result;
3638				return;
3639			}
3640		} else {
3641			img_req->pending.num_pending++;
3642		}
3643	}
3644}
3645
3646static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3647{
3648	struct rbd_device *rbd_dev = img_req->rbd_dev;
3649	int ret;
3650
3651again:
3652	switch (img_req->state) {
3653	case RBD_IMG_START:
3654		rbd_assert(!*result);
3655
3656		ret = rbd_img_exclusive_lock(img_req);
3657		if (ret < 0) {
3658			*result = ret;
3659			return true;
3660		}
3661		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3662		if (ret > 0)
3663			goto again;
3664		return false;
3665	case RBD_IMG_EXCLUSIVE_LOCK:
3666		if (*result)
3667			return true;
3668
3669		rbd_assert(!need_exclusive_lock(img_req) ||
3670			   __rbd_is_lock_owner(rbd_dev));
3671
3672		rbd_img_object_requests(img_req);
3673		if (!img_req->pending.num_pending) {
3674			*result = img_req->pending.result;
3675			img_req->state = RBD_IMG_OBJECT_REQUESTS;
3676			goto again;
3677		}
3678		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3679		return false;
3680	case __RBD_IMG_OBJECT_REQUESTS:
3681		if (!pending_result_dec(&img_req->pending, result))
3682			return false;
3683		/* fall through */
3684	case RBD_IMG_OBJECT_REQUESTS:
3685		return true;
3686	default:
3687		BUG();
3688	}
3689}
3690
3691/*
3692 * Return true if @img_req is completed.
3693 */
3694static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3695				     int *result)
3696{
3697	struct rbd_device *rbd_dev = img_req->rbd_dev;
3698	bool done;
3699
3700	if (need_exclusive_lock(img_req)) {
3701		down_read(&rbd_dev->lock_rwsem);
3702		mutex_lock(&img_req->state_mutex);
3703		done = rbd_img_advance(img_req, result);
3704		if (done)
3705			rbd_lock_del_request(img_req);
3706		mutex_unlock(&img_req->state_mutex);
3707		up_read(&rbd_dev->lock_rwsem);
3708	} else {
3709		mutex_lock(&img_req->state_mutex);
3710		done = rbd_img_advance(img_req, result);
3711		mutex_unlock(&img_req->state_mutex);
3712	}
3713
3714	if (done && *result) {
3715		rbd_assert(*result < 0);
3716		rbd_warn(rbd_dev, "%s%s result %d",
3717		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3718		      obj_op_name(img_req->op_type), *result);
3719	}
3720	return done;
3721}
3722
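/*
 * Annotation: completion entry point for image requests.  For a child
 * (parent read) request, the result is fed back into the object request
 * that spawned it and, if that object request completes its own image
 * request as well, the loop continues with that request instead of
 * recursing up the parent chain.
 */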
3723static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3724{
3725again:
3726	if (!__rbd_img_handle_request(img_req, &result))
3727		return;
3728
3729	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
3730		struct rbd_obj_request *obj_req = img_req->obj_request;
3731
3732		rbd_img_request_put(img_req);
3733		if (__rbd_obj_handle_request(obj_req, &result)) {
3734			img_req = obj_req->img_request;
3735			goto again;
3736		}
3737	} else {
3738		struct request *rq = img_req->rq;
3739
3740		rbd_img_request_put(img_req);
3741		blk_mq_end_request(rq, errno_to_blk_status(result));
3742	}
3743}
3744
3745static const struct rbd_client_id rbd_empty_cid;
3746
3747static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3748			  const struct rbd_client_id *rhs)
3749{
3750	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3751}
3752
3753static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3754{
3755	struct rbd_client_id cid;
3756
3757	mutex_lock(&rbd_dev->watch_mutex);
3758	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3759	cid.handle = rbd_dev->watch_cookie;
3760	mutex_unlock(&rbd_dev->watch_mutex);
3761	return cid;
3762}
3763
3764/*
3765 * lock_rwsem must be held for write
3766 */
3767static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3768			      const struct rbd_client_id *cid)
3769{
3770	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3771	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3772	     cid->gid, cid->handle);
3773	rbd_dev->owner_cid = *cid; /* struct */
3774}
3775
3776static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3777{
3778	mutex_lock(&rbd_dev->watch_mutex);
3779	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3780	mutex_unlock(&rbd_dev->watch_mutex);
3781}
3782
3783static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3784{
3785	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3786
3787	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3788	strcpy(rbd_dev->lock_cookie, cookie);
3789	rbd_set_owner_cid(rbd_dev, &cid);
3790	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3791}
3792
3793/*
3794 * lock_rwsem must be held for write
3795 */
3796static int rbd_lock(struct rbd_device *rbd_dev)
3797{
3798	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3799	char cookie[32];
3800	int ret;
3801
3802	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3803		rbd_dev->lock_cookie[0] != '\0');
3804
3805	format_lock_cookie(rbd_dev, cookie);
3806	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3807			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3808			    RBD_LOCK_TAG, "", 0);
3809	if (ret)
3810		return ret;
3811
3812	__rbd_lock(rbd_dev, cookie);
3813	return 0;
3814}
3815
3816/*
3817 * lock_rwsem must be held for write
3818 */
3819static void rbd_unlock(struct rbd_device *rbd_dev)
3820{
3821	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3822	int ret;
3823
3824	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3825		rbd_dev->lock_cookie[0] == '\0');
3826
3827	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3828			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3829	if (ret && ret != -ENOENT)
3830		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3831
3832	/* treat errors as the image is unlocked */
3833	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3834	rbd_dev->lock_cookie[0] = '\0';
3835	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3836	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3837}
3838
3839static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3840				enum rbd_notify_op notify_op,
3841				struct page ***preply_pages,
3842				size_t *preply_len)
3843{
3844	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3845	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3846	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3847	int buf_size = sizeof(buf);
3848	void *p = buf;
3849
3850	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3851
3852	/* encode *LockPayload NotifyMessage (op + ClientId) */
3853	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3854	ceph_encode_32(&p, notify_op);
3855	ceph_encode_64(&p, cid.gid);
3856	ceph_encode_64(&p, cid.handle);
3857
3858	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3859				&rbd_dev->header_oloc, buf, buf_size,
3860				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3861}
3862
3863static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3864			       enum rbd_notify_op notify_op)
3865{
3866	struct page **reply_pages;
3867	size_t reply_len;
3868
3869	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3870	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3871}
3872
3873static void rbd_notify_acquired_lock(struct work_struct *work)
3874{
3875	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3876						  acquired_lock_work);
3877
3878	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3879}
3880
3881static void rbd_notify_released_lock(struct work_struct *work)
3882{
3883	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3884						  released_lock_work);
3885
3886	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3887}
3888
3889static int rbd_request_lock(struct rbd_device *rbd_dev)
3890{
3891	struct page **reply_pages;
3892	size_t reply_len;
3893	bool lock_owner_responded = false;
3894	int ret;
3895
3896	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3897
3898	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3899				   &reply_pages, &reply_len);
3900	if (ret && ret != -ETIMEDOUT) {
3901		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3902		goto out;
3903	}
3904
3905	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3906		void *p = page_address(reply_pages[0]);
3907		void *const end = p + reply_len;
3908		u32 n;
3909
3910		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3911		while (n--) {
3912			u8 struct_v;
3913			u32 len;
3914
3915			ceph_decode_need(&p, end, 8 + 8, e_inval);
3916			p += 8 + 8; /* skip gid and cookie */
3917
3918			ceph_decode_32_safe(&p, end, len, e_inval);
3919			if (!len)
3920				continue;
3921
3922			if (lock_owner_responded) {
3923				rbd_warn(rbd_dev,
3924					 "duplicate lock owners detected");
3925				ret = -EIO;
3926				goto out;
3927			}
3928
3929			lock_owner_responded = true;
3930			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3931						  &struct_v, &len);
3932			if (ret) {
3933				rbd_warn(rbd_dev,
3934					 "failed to decode ResponseMessage: %d",
3935					 ret);
3936				goto e_inval;
3937			}
3938
3939			ret = ceph_decode_32(&p);
3940		}
3941	}
3942
3943	if (!lock_owner_responded) {
3944		rbd_warn(rbd_dev, "no lock owners detected");
3945		ret = -ETIMEDOUT;
3946	}
3947
3948out:
3949	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3950	return ret;
3951
3952e_inval:
3953	ret = -EINVAL;
3954	goto out;
3955}
3956
3957/*
3958 * Called from either the image request state machine(s) or from
3959 * rbd_add_acquire_lock() (i.e. "rbd map").
3960 */
3961static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3962{
3963	struct rbd_img_request *img_req;
3964
3965	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3966	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3967
3968	cancel_delayed_work(&rbd_dev->lock_dwork);
3969	if (!completion_done(&rbd_dev->acquire_wait)) {
3970		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3971			   list_empty(&rbd_dev->running_list));
3972		rbd_dev->acquire_err = result;
3973		complete_all(&rbd_dev->acquire_wait);
3974		return;
3975	}
3976
3977	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3978		mutex_lock(&img_req->state_mutex);
3979		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3980		rbd_img_schedule(img_req, result);
3981		mutex_unlock(&img_req->state_mutex);
3982	}
3983
3984	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3985}
3986
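/*
 * Fetch information about the current holder(s) of the header lock.
 * Anything not matching rbd's exclusive-lock convention (wrong tag,
 * shared lock type or unrecognized cookie) is rejected with -EBUSY.
 * *num_lockers == 0 means the lock is no longer held.
 */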
3987static int get_lock_owner_info(struct rbd_device *rbd_dev,
3988			       struct ceph_locker **lockers, u32 *num_lockers)
3989{
3990	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3991	u8 lock_type;
3992	char *lock_tag;
3993	int ret;
3994
3995	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3996
3997	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3998				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3999				 &lock_type, &lock_tag, lockers, num_lockers);
4000	if (ret)
4001		return ret;
4002
4003	if (*num_lockers == 0) {
4004		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
4005		goto out;
4006	}
4007
4008	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
4009		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
4010			 lock_tag);
4011		ret = -EBUSY;
4012		goto out;
4013	}
4014
4015	if (lock_type == CEPH_CLS_LOCK_SHARED) {
4016		rbd_warn(rbd_dev, "shared lock type detected");
4017		ret = -EBUSY;
4018		goto out;
4019	}
4020
4021	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
4022		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
4023		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
4024			 (*lockers)[0].id.cookie);
4025		ret = -EBUSY;
4026		goto out;
4027	}
4028
4029out:
4030	kfree(lock_tag);
4031	return ret;
4032}
4033
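/*
 * Returns 1 if the given locker still has a watch established on the
 * header object (i.e. the lock owner is presumed alive), 0 if it does
 * not, or a negative error code.
 */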
4034static int find_watcher(struct rbd_device *rbd_dev,
4035			const struct ceph_locker *locker)
4036{
4037	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4038	struct ceph_watch_item *watchers;
4039	u32 num_watchers;
4040	u64 cookie;
4041	int i;
4042	int ret;
4043
4044	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
4045				      &rbd_dev->header_oloc, &watchers,
4046				      &num_watchers);
4047	if (ret)
4048		return ret;
4049
4050	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
4051	for (i = 0; i < num_watchers; i++) {
4052		if (!memcmp(&watchers[i].addr, &locker->info.addr,
4053			    sizeof(locker->info.addr)) &&
4054		    watchers[i].cookie == cookie) {
4055			struct rbd_client_id cid = {
4056				.gid = le64_to_cpu(watchers[i].name.num),
4057				.handle = cookie,
4058			};
4059
4060			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
4061			     rbd_dev, cid.gid, cid.handle);
4062			rbd_set_owner_cid(rbd_dev, &cid);
4063			ret = 1;
4064			goto out;
4065		}
4066	}
4067
4068	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
4069	ret = 0;
4070out:
4071	kfree(watchers);
4072	return ret;
4073}
4074
4075/*
4076 * lock_rwsem must be held for write
4077 */
4078static int rbd_try_lock(struct rbd_device *rbd_dev)
4079{
4080	struct ceph_client *client = rbd_dev->rbd_client->client;
4081	struct ceph_locker *lockers;
4082	u32 num_lockers;
4083	int ret;
4084
4085	for (;;) {
4086		ret = rbd_lock(rbd_dev);
4087		if (ret != -EBUSY)
4088			return ret;
4089
4090		/* determine if the current lock holder is still alive */
4091		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4092		if (ret)
4093			return ret;
4094
4095		if (num_lockers == 0)
4096			goto again;
4097
4098		ret = find_watcher(rbd_dev, lockers);
4099		if (ret)
4100			goto out; /* request lock or error */
4101
4102		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4103			 ENTITY_NAME(lockers[0].id.name));
4104
4105		ret = ceph_monc_blacklist_add(&client->monc,
4106					      &lockers[0].info.addr);
4107		if (ret) {
4108			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
4109				 ENTITY_NAME(lockers[0].id.name), ret);
4110			goto out;
4111		}
4112
4113		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4114					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4115					  lockers[0].id.cookie,
4116					  &lockers[0].id.name);
4117		if (ret && ret != -ENOENT)
4118			goto out;
4119
4120again:
4121		ceph_free_lockers(lockers, num_lockers);
4122	}
4123
4124out:
4125	ceph_free_lockers(lockers, num_lockers);
4126	return ret;
4127}
4128
4129static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4130{
4131	int ret;
4132
4133	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4134		ret = rbd_object_map_open(rbd_dev);
4135		if (ret)
4136			return ret;
4137	}
4138
4139	return 0;
4140}
4141
4142/*
4143 * Return:
4144 *   0 - lock acquired
4145 *   1 - caller should call rbd_request_lock()
4146 *  <0 - error
4147 */
4148static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4149{
4150	int ret;
4151
4152	down_read(&rbd_dev->lock_rwsem);
4153	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4154	     rbd_dev->lock_state);
4155	if (__rbd_is_lock_owner(rbd_dev)) {
4156		up_read(&rbd_dev->lock_rwsem);
4157		return 0;
4158	}
4159
4160	up_read(&rbd_dev->lock_rwsem);
4161	down_write(&rbd_dev->lock_rwsem);
4162	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4163	     rbd_dev->lock_state);
4164	if (__rbd_is_lock_owner(rbd_dev)) {
4165		up_write(&rbd_dev->lock_rwsem);
4166		return 0;
4167	}
4168
4169	ret = rbd_try_lock(rbd_dev);
4170	if (ret < 0) {
4171		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4172		if (ret == -EBLACKLISTED)
4173			goto out;
4174
4175		ret = 1; /* request lock anyway */
4176	}
4177	if (ret > 0) {
4178		up_write(&rbd_dev->lock_rwsem);
4179		return ret;
4180	}
4181
4182	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4183	rbd_assert(list_empty(&rbd_dev->running_list));
4184
4185	ret = rbd_post_acquire_action(rbd_dev);
4186	if (ret) {
4187		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4188		/*
4189		 * Can't stay in RBD_LOCK_STATE_LOCKED because
4190		 * rbd_lock_add_request() would let the request through,
4191		 * assuming that e.g. object map is locked and loaded.
4192		 */
4193		rbd_unlock(rbd_dev);
4194	}
4195
4196out:
4197	wake_lock_waiters(rbd_dev, ret);
4198	up_write(&rbd_dev->lock_rwsem);
4199	return ret;
4200}
4201
4202static void rbd_acquire_lock(struct work_struct *work)
4203{
4204	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4205					    struct rbd_device, lock_dwork);
4206	int ret;
4207
4208	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4209again:
4210	ret = rbd_try_acquire_lock(rbd_dev);
4211	if (ret <= 0) {
4212		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4213		return;
4214	}
4215
4216	ret = rbd_request_lock(rbd_dev);
4217	if (ret == -ETIMEDOUT) {
4218		goto again; /* treat this as a dead client */
4219	} else if (ret == -EROFS) {
4220		rbd_warn(rbd_dev, "peer will not release lock");
4221		down_write(&rbd_dev->lock_rwsem);
4222		wake_lock_waiters(rbd_dev, ret);
4223		up_write(&rbd_dev->lock_rwsem);
4224	} else if (ret < 0) {
4225		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4226		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4227				 RBD_RETRY_DELAY);
4228	} else {
4229		/*
4230		 * lock owner acked, but resend if we don't see them
4231		 * release the lock
4232		 */
4233		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
4234		     rbd_dev);
4235		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4236		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4237	}
4238}
4239
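/*
 * Transition to RBD_LOCK_STATE_RELEASING and wait for in-flight image
 * requests on running_list to drain.  Returns true if the lock is
 * still ours and can now be released.
 *
 * lock_rwsem must be held for write on entry; it is temporarily
 * dropped while waiting and is held for write again on return.
 */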
4240static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4241{
4242	bool need_wait;
4243
4244	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4245	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4246
4247	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4248		return false;
4249
4250	/*
4251	 * Ensure that all in-flight IO is flushed.
4252	 */
4253	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4254	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4255	need_wait = !list_empty(&rbd_dev->running_list);
4256	downgrade_write(&rbd_dev->lock_rwsem);
4257	if (need_wait)
4258		wait_for_completion(&rbd_dev->releasing_wait);
4259	up_read(&rbd_dev->lock_rwsem);
4260
4261	down_write(&rbd_dev->lock_rwsem);
4262	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4263		return false;
4264
4265	rbd_assert(list_empty(&rbd_dev->running_list));
4266	return true;
4267}
4268
4269static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4270{
4271	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4272		rbd_object_map_close(rbd_dev);
4273}
4274
4275static void __rbd_release_lock(struct rbd_device *rbd_dev)
4276{
4277	rbd_assert(list_empty(&rbd_dev->running_list));
4278
4279	rbd_pre_release_action(rbd_dev);
4280	rbd_unlock(rbd_dev);
4281}
4282
4283/*
4284 * lock_rwsem must be held for write
4285 */
4286static void rbd_release_lock(struct rbd_device *rbd_dev)
4287{
4288	if (!rbd_quiesce_lock(rbd_dev))
4289		return;
4290
4291	__rbd_release_lock(rbd_dev);
4292
4293	/*
4294	 * Give others a chance to grab the lock - we would re-acquire
4295	 * almost immediately if we got new IO while draining the running
4296	 * list otherwise.  We need to ack our own notifications, so this
4297	 * lock_dwork will be requeued from rbd_handle_released_lock() by
4298	 * way of maybe_kick_acquire().
4299	 */
4300	cancel_delayed_work(&rbd_dev->lock_dwork);
4301}
4302
4303static void rbd_release_lock_work(struct work_struct *work)
4304{
4305	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4306						  unlock_work);
4307
4308	down_write(&rbd_dev->lock_rwsem);
4309	rbd_release_lock(rbd_dev);
4310	up_write(&rbd_dev->lock_rwsem);
4311}
4312
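/*
 * If we are not the lock owner and there are image requests waiting
 * for the lock (or an acquire attempt is already pending), reschedule
 * lock_dwork to run immediately.
 */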
4313static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4314{
4315	bool have_requests;
4316
4317	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4318	if (__rbd_is_lock_owner(rbd_dev))
4319		return;
4320
4321	spin_lock(&rbd_dev->lock_lists_lock);
4322	have_requests = !list_empty(&rbd_dev->acquiring_list);
4323	spin_unlock(&rbd_dev->lock_lists_lock);
4324	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4325		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4326		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4327	}
4328}
4329
4330static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4331				     void **p)
4332{
4333	struct rbd_client_id cid = { 0 };
4334
4335	if (struct_v >= 2) {
4336		cid.gid = ceph_decode_64(p);
4337		cid.handle = ceph_decode_64(p);
4338	}
4339
4340	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4341	     cid.handle);
4342	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4343		down_write(&rbd_dev->lock_rwsem);
4344		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4345			/*
4346			 * we already know that the remote client is
4347			 * the owner
4348			 */
4349			up_write(&rbd_dev->lock_rwsem);
4350			return;
4351		}
4352
4353		rbd_set_owner_cid(rbd_dev, &cid);
4354		downgrade_write(&rbd_dev->lock_rwsem);
4355	} else {
4356		down_read(&rbd_dev->lock_rwsem);
4357	}
4358
4359	maybe_kick_acquire(rbd_dev);
4360	up_read(&rbd_dev->lock_rwsem);
4361}
4362
4363static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4364				     void **p)
4365{
4366	struct rbd_client_id cid = { 0 };
4367
4368	if (struct_v >= 2) {
4369		cid.gid = ceph_decode_64(p);
4370		cid.handle = ceph_decode_64(p);
4371	}
4372
4373	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4374	     cid.handle);
4375	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4376		down_write(&rbd_dev->lock_rwsem);
4377		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4378			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
4379			     __func__, rbd_dev, cid.gid, cid.handle,
4380			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4381			up_write(&rbd_dev->lock_rwsem);
4382			return;
4383		}
4384
4385		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4386		downgrade_write(&rbd_dev->lock_rwsem);
4387	} else {
4388		down_read(&rbd_dev->lock_rwsem);
4389	}
4390
4391	maybe_kick_acquire(rbd_dev);
4392	up_read(&rbd_dev->lock_rwsem);
4393}
4394
4395/*
4396 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4397 * ResponseMessage is needed.
4398 */
4399static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4400				   void **p)
4401{
4402	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4403	struct rbd_client_id cid = { 0 };
4404	int result = 1;
4405
4406	if (struct_v >= 2) {
4407		cid.gid = ceph_decode_64(p);
4408		cid.handle = ceph_decode_64(p);
4409	}
4410
4411	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4412	     cid.handle);
4413	if (rbd_cid_equal(&cid, &my_cid))
4414		return result;
4415
4416	down_read(&rbd_dev->lock_rwsem);
4417	if (__rbd_is_lock_owner(rbd_dev)) {
4418		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4419		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4420			goto out_unlock;
4421
4422		/*
4423		 * encode ResponseMessage(0) so the peer can detect
4424		 * a missing owner
4425		 */
4426		result = 0;
4427
4428		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4429			if (!rbd_dev->opts->exclusive) {
4430				dout("%s rbd_dev %p queueing unlock_work\n",
4431				     __func__, rbd_dev);
4432				queue_work(rbd_dev->task_wq,
4433					   &rbd_dev->unlock_work);
4434			} else {
4435				/* refuse to release the lock */
4436				result = -EROFS;
4437			}
4438		}
4439	}
4440
4441out_unlock:
4442	up_read(&rbd_dev->lock_rwsem);
4443	return result;
4444}
4445
4446static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4447				     u64 notify_id, u64 cookie, s32 *result)
4448{
4449	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4450	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4451	int buf_size = sizeof(buf);
4452	int ret;
4453
4454	if (result) {
4455		void *p = buf;
4456
4457		/* encode ResponseMessage */
4458		ceph_start_encoding(&p, 1, 1,
4459				    buf_size - CEPH_ENCODING_START_BLK_LEN);
4460		ceph_encode_32(&p, *result);
4461	} else {
4462		buf_size = 0;
4463	}
4464
4465	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4466				   &rbd_dev->header_oloc, notify_id, cookie,
4467				   buf, buf_size);
4468	if (ret)
4469		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4470}
4471
4472static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4473				   u64 cookie)
4474{
4475	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4476	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4477}
4478
4479static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4480					  u64 notify_id, u64 cookie, s32 result)
4481{
4482	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4483	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4484}
4485
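/*
 * Watch/notify callback: decode the NotifyMessage, dispatch on
 * notify_op and acknowledge the notification.
 */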
4486static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4487			 u64 notifier_id, void *data, size_t data_len)
4488{
4489	struct rbd_device *rbd_dev = arg;
4490	void *p = data;
4491	void *const end = p + data_len;
4492	u8 struct_v = 0;
4493	u32 len;
4494	u32 notify_op;
4495	int ret;
4496
4497	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4498	     __func__, rbd_dev, cookie, notify_id, data_len);
4499	if (data_len) {
4500		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4501					  &struct_v, &len);
4502		if (ret) {
4503			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4504				 ret);
4505			return;
4506		}
4507
4508		notify_op = ceph_decode_32(&p);
4509	} else {
4510		/* legacy notification for header updates */
4511		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4512		len = 0;
4513	}
4514
4515	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4516	switch (notify_op) {
4517	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4518		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4519		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4520		break;
4521	case RBD_NOTIFY_OP_RELEASED_LOCK:
4522		rbd_handle_released_lock(rbd_dev, struct_v, &p);
4523		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4524		break;
4525	case RBD_NOTIFY_OP_REQUEST_LOCK:
4526		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4527		if (ret <= 0)
4528			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4529						      cookie, ret);
4530		else
4531			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4532		break;
4533	case RBD_NOTIFY_OP_HEADER_UPDATE:
4534		ret = rbd_dev_refresh(rbd_dev);
4535		if (ret)
4536			rbd_warn(rbd_dev, "refresh failed: %d", ret);
4537
4538		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4539		break;
4540	default:
4541		if (rbd_is_lock_owner(rbd_dev))
4542			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4543						      cookie, -EOPNOTSUPP);
4544		else
4545			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4546		break;
4547	}
4548}
4549
4550static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4551
4552static void rbd_watch_errcb(void *arg, u64 cookie, int err)
4553{
4554	struct rbd_device *rbd_dev = arg;
4555
4556	rbd_warn(rbd_dev, "encountered watch error: %d", err);
4557
4558	down_write(&rbd_dev->lock_rwsem);
4559	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4560	up_write(&rbd_dev->lock_rwsem);
4561
4562	mutex_lock(&rbd_dev->watch_mutex);
4563	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4564		__rbd_unregister_watch(rbd_dev);
4565		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
4566
4567		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
4568	}
4569	mutex_unlock(&rbd_dev->watch_mutex);
4570}
4571
4572/*
4573 * watch_mutex must be locked
4574 */
4575static int __rbd_register_watch(struct rbd_device *rbd_dev)
4576{
4577	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4578	struct ceph_osd_linger_request *handle;
4579
4580	rbd_assert(!rbd_dev->watch_handle);
4581	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4582
4583	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4584				 &rbd_dev->header_oloc, rbd_watch_cb,
4585				 rbd_watch_errcb, rbd_dev);
4586	if (IS_ERR(handle))
4587		return PTR_ERR(handle);
4588
4589	rbd_dev->watch_handle = handle;
4590	return 0;
4591}
4592
4593/*
4594 * watch_mutex must be locked
4595 */
4596static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4597{
4598	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4599	int ret;
4600
4601	rbd_assert(rbd_dev->watch_handle);
4602	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4603
4604	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4605	if (ret)
4606		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4607
4608	rbd_dev->watch_handle = NULL;
4609}
4610
4611static int rbd_register_watch(struct rbd_device *rbd_dev)
4612{
4613	int ret;
4614
4615	mutex_lock(&rbd_dev->watch_mutex);
4616	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4617	ret = __rbd_register_watch(rbd_dev);
4618	if (ret)
4619		goto out;
4620
4621	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4622	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4623
4624out:
4625	mutex_unlock(&rbd_dev->watch_mutex);
4626	return ret;
4627}
4628
4629static void cancel_tasks_sync(struct rbd_device *rbd_dev)
4630{
4631	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4632
4633	cancel_work_sync(&rbd_dev->acquired_lock_work);
4634	cancel_work_sync(&rbd_dev->released_lock_work);
4635	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4636	cancel_work_sync(&rbd_dev->unlock_work);
4637}
4638
4639static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4640{
4641	cancel_tasks_sync(rbd_dev);
4642
4643	mutex_lock(&rbd_dev->watch_mutex);
4644	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4645		__rbd_unregister_watch(rbd_dev);
4646	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4647	mutex_unlock(&rbd_dev->watch_mutex);
4648
4649	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4650	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4651}
4652
4653/*
4654 * lock_rwsem must be held for write
4655 */
4656static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4657{
4658	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4659	char cookie[32];
4660	int ret;
4661
4662	if (!rbd_quiesce_lock(rbd_dev))
4663		return;
4664
4665	format_lock_cookie(rbd_dev, cookie);
4666	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4667				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4668				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4669				  RBD_LOCK_TAG, cookie);
4670	if (ret) {
4671		if (ret != -EOPNOTSUPP)
4672			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4673				 ret);
4674
4675		/*
4676		 * Lock cookie cannot be updated on older OSDs, so do
4677		 * a manual release and queue an acquire.
4678		 */
4679		__rbd_release_lock(rbd_dev);
4680		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4681	} else {
4682		__rbd_lock(rbd_dev, cookie);
4683		wake_lock_waiters(rbd_dev, 0);
4684	}
4685}
4686
4687static void rbd_reregister_watch(struct work_struct *work)
4688{
4689	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4690					    struct rbd_device, watch_dwork);
4691	int ret;
4692
4693	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4694
4695	mutex_lock(&rbd_dev->watch_mutex);
4696	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4697		mutex_unlock(&rbd_dev->watch_mutex);
4698		return;
4699	}
4700
4701	ret = __rbd_register_watch(rbd_dev);
4702	if (ret) {
4703		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
4704		if (ret != -EBLACKLISTED && ret != -ENOENT) {
4705			queue_delayed_work(rbd_dev->task_wq,
4706					   &rbd_dev->watch_dwork,
4707					   RBD_RETRY_DELAY);
4708			mutex_unlock(&rbd_dev->watch_mutex);
4709			return;
4710		}
4711
4712		mutex_unlock(&rbd_dev->watch_mutex);
4713		down_write(&rbd_dev->lock_rwsem);
4714		wake_lock_waiters(rbd_dev, ret);
4715		up_write(&rbd_dev->lock_rwsem);
4716		return;
4717	}
4718
4719	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4720	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4721	mutex_unlock(&rbd_dev->watch_mutex);
4722
4723	down_write(&rbd_dev->lock_rwsem);
4724	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4725		rbd_reacquire_lock(rbd_dev);
4726	up_write(&rbd_dev->lock_rwsem);
4727
4728	ret = rbd_dev_refresh(rbd_dev);
4729	if (ret)
4730		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
4731}
4732
4733/*
4734 * Synchronous osd object method call.  Returns the number of bytes
4735 * returned in the inbound buffer, or a negative error code.
4736 */
4737static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4738			     struct ceph_object_id *oid,
4739			     struct ceph_object_locator *oloc,
4740			     const char *method_name,
4741			     const void *outbound,
4742			     size_t outbound_size,
4743			     void *inbound,
4744			     size_t inbound_size)
4745{
4746	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4747	struct page *req_page = NULL;
4748	struct page *reply_page;
4749	int ret;
4750
4751	/*
4752	 * Method calls are ultimately read operations.  The result
4753	 * should be placed into the inbound buffer provided.  They
4754	 * may also supply outbound data--parameters for the object
4755	 * method.  Currently if this is present it will be a
4756	 * snapshot id.
4757	 */
4758	if (outbound) {
4759		if (outbound_size > PAGE_SIZE)
4760			return -E2BIG;
4761
4762		req_page = alloc_page(GFP_KERNEL);
4763		if (!req_page)
4764			return -ENOMEM;
4765
4766		memcpy(page_address(req_page), outbound, outbound_size);
4767	}
4768
4769	reply_page = alloc_page(GFP_KERNEL);
4770	if (!reply_page) {
4771		if (req_page)
4772			__free_page(req_page);
4773		return -ENOMEM;
4774	}
4775
4776	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4777			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
4778			     &reply_page, &inbound_size);
4779	if (!ret) {
4780		memcpy(inbound, page_address(reply_page), inbound_size);
4781		ret = inbound_size;
4782	}
4783
4784	if (req_page)
4785		__free_page(req_page);
4786	__free_page(reply_page);
4787	return ret;
4788}
4789
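/*
 * Per-request work function: validate the block layer request,
 * translate it into an image request and kick off the image request
 * state machine.
 */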
4790static void rbd_queue_workfn(struct work_struct *work)
4791{
4792	struct request *rq = blk_mq_rq_from_pdu(work);
4793	struct rbd_device *rbd_dev = rq->q->queuedata;
4794	struct rbd_img_request *img_request;
4795	struct ceph_snap_context *snapc = NULL;
4796	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4797	u64 length = blk_rq_bytes(rq);
4798	enum obj_operation_type op_type;
4799	u64 mapping_size;
4800	int result;
4801
4802	switch (req_op(rq)) {
4803	case REQ_OP_DISCARD:
4804		op_type = OBJ_OP_DISCARD;
4805		break;
4806	case REQ_OP_WRITE_ZEROES:
4807		op_type = OBJ_OP_ZEROOUT;
4808		break;
4809	case REQ_OP_WRITE:
4810		op_type = OBJ_OP_WRITE;
4811		break;
4812	case REQ_OP_READ:
4813		op_type = OBJ_OP_READ;
4814		break;
4815	default:
4816		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
4817		result = -EIO;
4818		goto err;
4819	}
4820
4821	/* Ignore/skip any zero-length requests */
4822
4823	if (!length) {
4824		dout("%s: zero-length request\n", __func__);
4825		result = 0;
4826		goto err_rq;
4827	}
4828
4829	if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
4830		rbd_warn(rbd_dev, "%s on read-only snapshot",
4831			 obj_op_name(op_type));
4832		result = -EIO;
4833		goto err;
4834	}
4835
4836	/*
4837	 * Quit early if the mapped snapshot no longer exists.  It's
4838	 * still possible the snapshot will have disappeared by the
4839	 * time our request arrives at the osd, but there's no sense in
4840	 * sending it if we already know.
4841	 */
4842	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4843		dout("request for non-existent snapshot");
4844		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4845		result = -ENXIO;
4846		goto err_rq;
4847	}
4848
4849	if (offset && length > U64_MAX - offset + 1) {
4850		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4851			 length);
4852		result = -EINVAL;
4853		goto err_rq;	/* Shouldn't happen */
4854	}
4855
4856	blk_mq_start_request(rq);
4857
4858	down_read(&rbd_dev->header_rwsem);
4859	mapping_size = rbd_dev->mapping.size;
4860	if (op_type != OBJ_OP_READ) {
4861		snapc = rbd_dev->header.snapc;
4862		ceph_get_snap_context(snapc);
4863	}
4864	up_read(&rbd_dev->header_rwsem);
4865
4866	if (offset + length > mapping_size) {
4867		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4868			 length, mapping_size);
4869		result = -EIO;
4870		goto err_rq;
4871	}
4872
4873	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
4874	if (!img_request) {
4875		result = -ENOMEM;
4876		goto err_rq;
4877	}
4878	img_request->rq = rq;
4879	snapc = NULL; /* img_request consumes a ref */
4880
4881	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4882	     img_request, obj_op_name(op_type), offset, length);
4883
4884	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
4885		result = rbd_img_fill_nodata(img_request, offset, length);
4886	else
4887		result = rbd_img_fill_from_bio(img_request, offset, length,
4888					       rq->bio);
4889	if (result)
4890		goto err_img_request;
4891
4892	rbd_img_handle_request(img_request, 0);
4893	return;
4894
4895err_img_request:
4896	rbd_img_request_put(img_request);
4897err_rq:
4898	if (result)
4899		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
4900			 obj_op_name(op_type), length, offset, result);
4901	ceph_put_snap_context(snapc);
4902err:
4903	blk_mq_end_request(rq, errno_to_blk_status(result));
4904}
4905
4906static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
4907		const struct blk_mq_queue_data *bd)
4908{
4909	struct request *rq = bd->rq;
4910	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4911
4912	queue_work(rbd_wq, work);
4913	return BLK_STS_OK;
4914}
4915
4916static void rbd_free_disk(struct rbd_device *rbd_dev)
4917{
4918	blk_cleanup_queue(rbd_dev->disk->queue);
4919	blk_mq_free_tag_set(&rbd_dev->tag_set);
4920	put_disk(rbd_dev->disk);
4921	rbd_dev->disk = NULL;
4922}
4923
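/*
 * Synchronously read up to @buf_len bytes from the given object into
 * @buf.  Returns the number of bytes read, or a negative error code.
 */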
4924static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4925			     struct ceph_object_id *oid,
4926			     struct ceph_object_locator *oloc,
4927			     void *buf, int buf_len)
4928
4929{
4930	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4931	struct ceph_osd_request *req;
4932	struct page **pages;
4933	int num_pages = calc_pages_for(0, buf_len);
4934	int ret;
4935
4936	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4937	if (!req)
4938		return -ENOMEM;
4939
4940	ceph_oid_copy(&req->r_base_oid, oid);
4941	ceph_oloc_copy(&req->r_base_oloc, oloc);
4942	req->r_flags = CEPH_OSD_FLAG_READ;
4943
4944	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4945	if (IS_ERR(pages)) {
4946		ret = PTR_ERR(pages);
4947		goto out_req;
4948	}
4949
4950	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4951	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4952					 true);
4953
4954	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4955	if (ret)
4956		goto out_req;
4957
4958	ceph_osdc_start_request(osdc, req, false);
4959	ret = ceph_osdc_wait_request(osdc, req);
4960	if (ret >= 0)
4961		ceph_copy_from_page_vector(pages, buf, 0, ret);
4962
4963out_req:
4964	ceph_osdc_put_request(req);
4965	return ret;
4966}
4967
4968/*
4969 * Read the complete header for the given rbd device.  On successful
4970 * return, the rbd_dev->header field will contain up-to-date
4971 * information about the image.
4972 */
4973static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
4974{
4975	struct rbd_image_header_ondisk *ondisk = NULL;
4976	u32 snap_count = 0;
4977	u64 names_size = 0;
4978	u32 want_count;
4979	int ret;
4980
4981	/*
4982	 * The complete header will include an array of its 64-bit
4983	 * snapshot ids, followed by the names of those snapshots as
4984	 * a contiguous block of NUL-terminated strings.  Note that
4985	 * the number of snapshots could change by the time we read
4986	 * it in, in which case we re-read it.
4987	 */
4988	do {
4989		size_t size;
4990
4991		kfree(ondisk);
4992
4993		size = sizeof (*ondisk);
4994		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4995		size += names_size;
4996		ondisk = kmalloc(size, GFP_KERNEL);
4997		if (!ondisk)
4998			return -ENOMEM;
4999
5000		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
5001					&rbd_dev->header_oloc, ondisk, size);
5002		if (ret < 0)
5003			goto out;
5004		if ((size_t)ret < size) {
5005			ret = -ENXIO;
5006			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
5007				size, ret);
5008			goto out;
5009		}
5010		if (!rbd_dev_ondisk_valid(ondisk)) {
5011			ret = -ENXIO;
5012			rbd_warn(rbd_dev, "invalid header");
5013			goto out;
5014		}
5015
5016		names_size = le64_to_cpu(ondisk->snap_names_len);
5017		want_count = snap_count;
5018		snap_count = le32_to_cpu(ondisk->snap_count);
5019	} while (snap_count != want_count);
5020
5021	ret = rbd_header_from_disk(rbd_dev, ondisk);
5022out:
5023	kfree(ondisk);
5024
5025	return ret;
5026}
5027
5028/*
5029 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
5030 * has disappeared from the (just updated) snapshot context.
5031 */
5032static void rbd_exists_validate(struct rbd_device *rbd_dev)
5033{
5034	u64 snap_id;
5035
5036	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
5037		return;
5038
5039	snap_id = rbd_dev->spec->snap_id;
5040	if (snap_id == CEPH_NOSNAP)
5041		return;
5042
5043	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
5044		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5045}
5046
5047static void rbd_dev_update_size(struct rbd_device *rbd_dev)
5048{
5049	sector_t size;
5050
5051	/*
5052	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
5053	 * try to update its size.  If REMOVING is set, updating size
5054	 * is just useless work since the device can't be opened.
5055	 */
5056	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
5057	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
5058		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
5059		dout("setting size to %llu sectors", (unsigned long long)size);
5060		set_capacity(rbd_dev->disk, size);
5061		revalidate_disk(rbd_dev->disk);
5062	}
5063}
5064
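/*
 * Re-read the image header (and parent info, if any), revalidate the
 * mapped snapshot and propagate any size change to the block device.
 */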
5065static int rbd_dev_refresh(struct rbd_device *rbd_dev)
5066{
5067	u64 mapping_size;
5068	int ret;
5069
5070	down_write(&rbd_dev->header_rwsem);
5071	mapping_size = rbd_dev->mapping.size;
5072
5073	ret = rbd_dev_header_info(rbd_dev);
5074	if (ret)
5075		goto out;
5076
5077	/*
5078	 * If there is a parent, see if it has disappeared due to the
5079	 * mapped image getting flattened.
5080	 */
5081	if (rbd_dev->parent) {
5082		ret = rbd_dev_v2_parent_info(rbd_dev);
5083		if (ret)
5084			goto out;
5085	}
5086
5087	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
5088		rbd_dev->mapping.size = rbd_dev->header.image_size;
5089	} else {
5090		/* validate mapped snapshot's EXISTS flag */
5091		rbd_exists_validate(rbd_dev);
5092	}
5093
5094out:
5095	up_write(&rbd_dev->header_rwsem);
5096	if (!ret && mapping_size != rbd_dev->mapping.size)
5097		rbd_dev_update_size(rbd_dev);
5098
5099	return ret;
5100}
5101
5102static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
5103		unsigned int hctx_idx, unsigned int numa_node)
5104{
5105	struct work_struct *work = blk_mq_rq_to_pdu(rq);
5106
5107	INIT_WORK(work, rbd_queue_workfn);
5108	return 0;
5109}
5110
5111static const struct blk_mq_ops rbd_mq_ops = {
5112	.queue_rq	= rbd_queue_rq,
5113	.init_request	= rbd_init_request,
5114};
5115
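/*
 * Allocate the gendisk and blk-mq tag set for the mapped image, create
 * the request queue and apply rbd's queue limits.
 */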
5116static int rbd_init_disk(struct rbd_device *rbd_dev)
5117{
5118	struct gendisk *disk;
5119	struct request_queue *q;
5120	unsigned int objset_bytes =
5121	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
5122	int err;
5123
5124	/* create gendisk info */
5125	disk = alloc_disk(single_major ?
5126			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
5127			  RBD_MINORS_PER_MAJOR);
5128	if (!disk)
5129		return -ENOMEM;
5130
5131	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
5132		 rbd_dev->dev_id);
5133	disk->major = rbd_dev->major;
5134	disk->first_minor = rbd_dev->minor;
5135	if (single_major)
5136		disk->flags |= GENHD_FL_EXT_DEVT;
5137	disk->fops = &rbd_bd_ops;
5138	disk->private_data = rbd_dev;
5139
5140	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
5141	rbd_dev->tag_set.ops = &rbd_mq_ops;
5142	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
5143	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
5144	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
5145	rbd_dev->tag_set.nr_hw_queues = 1;
5146	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
5147
5148	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
5149	if (err)
5150		goto out_disk;
5151
5152	q = blk_mq_init_queue(&rbd_dev->tag_set);
5153	if (IS_ERR(q)) {
5154		err = PTR_ERR(q);
5155		goto out_tag_set;
5156	}
5157
5158	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
5159	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
5160
5161	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
5162	q->limits.max_sectors = queue_max_hw_sectors(q);
5163	blk_queue_max_segments(q, USHRT_MAX);
5164	blk_queue_max_segment_size(q, UINT_MAX);
5165	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
5166	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
5167
5168	if (rbd_dev->opts->trim) {
5169		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
5170		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
5171		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5172		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5173	}
5174
5175	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
5176		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
5177
5178	/*
5179	 * disk_release() expects a queue ref from add_disk() and will
5180	 * put it.  Hold an extra ref until add_disk() is called.
5181	 */
5182	WARN_ON(!blk_get_queue(q));
5183	disk->queue = q;
5184	q->queuedata = rbd_dev;
5185
5186	rbd_dev->disk = disk;
5187
5188	return 0;
5189out_tag_set:
5190	blk_mq_free_tag_set(&rbd_dev->tag_set);
5191out_disk:
5192	put_disk(disk);
5193	return err;
5194}
5195
5196/*
5197  sysfs
5198*/
5199
5200static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5201{
5202	return container_of(dev, struct rbd_device, dev);
5203}
5204
5205static ssize_t rbd_size_show(struct device *dev,
5206			     struct device_attribute *attr, char *buf)
5207{
5208	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5209
5210	return sprintf(buf, "%llu\n",
5211		(unsigned long long)rbd_dev->mapping.size);
5212}
5213
5214/*
5215 * Note this shows the features for whatever's mapped, which is not
5216 * necessarily the base image.
5217 */
5218static ssize_t rbd_features_show(struct device *dev,
5219			     struct device_attribute *attr, char *buf)
5220{
5221	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5222
5223	return sprintf(buf, "0x%016llx\n",
5224			(unsigned long long)rbd_dev->mapping.features);
5225}
5226
5227static ssize_t rbd_major_show(struct device *dev,
5228			      struct device_attribute *attr, char *buf)
5229{
5230	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5231
5232	if (rbd_dev->major)
5233		return sprintf(buf, "%d\n", rbd_dev->major);
5234
5235	return sprintf(buf, "(none)\n");
5236}
5237
5238static ssize_t rbd_minor_show(struct device *dev,
5239			      struct device_attribute *attr, char *buf)
5240{
5241	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5242
5243	return sprintf(buf, "%d\n", rbd_dev->minor);
5244}
5245
5246static ssize_t rbd_client_addr_show(struct device *dev,
5247				    struct device_attribute *attr, char *buf)
5248{
5249	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5250	struct ceph_entity_addr *client_addr =
5251	    ceph_client_addr(rbd_dev->rbd_client->client);
5252
5253	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5254		       le32_to_cpu(client_addr->nonce));
5255}
5256
5257static ssize_t rbd_client_id_show(struct device *dev,
5258				  struct device_attribute *attr, char *buf)
5259{
5260	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5261
5262	return sprintf(buf, "client%lld\n",
5263		       ceph_client_gid(rbd_dev->rbd_client->client));
5264}
5265
5266static ssize_t rbd_cluster_fsid_show(struct device *dev,
5267				     struct device_attribute *attr, char *buf)
5268{
5269	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5270
5271	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5272}
5273
5274static ssize_t rbd_config_info_show(struct device *dev,
5275				    struct device_attribute *attr, char *buf)
5276{
5277	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5278
5279	return sprintf(buf, "%s\n", rbd_dev->config_info);
5280}
5281
5282static ssize_t rbd_pool_show(struct device *dev,
5283			     struct device_attribute *attr, char *buf)
5284{
5285	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5286
5287	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5288}
5289
5290static ssize_t rbd_pool_id_show(struct device *dev,
5291			     struct device_attribute *attr, char *buf)
5292{
5293	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5294
5295	return sprintf(buf, "%llu\n",
5296			(unsigned long long) rbd_dev->spec->pool_id);
5297}
5298
5299static ssize_t rbd_pool_ns_show(struct device *dev,
5300				struct device_attribute *attr, char *buf)
5301{
5302	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5303
5304	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5305}
5306
5307static ssize_t rbd_name_show(struct device *dev,
5308			     struct device_attribute *attr, char *buf)
5309{
5310	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5311
5312	if (rbd_dev->spec->image_name)
5313		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5314
5315	return sprintf(buf, "(unknown)\n");
5316}
5317
5318static ssize_t rbd_image_id_show(struct device *dev,
5319			     struct device_attribute *attr, char *buf)
5320{
5321	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5322
5323	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5324}
5325
5326/*
5327 * Shows the name of the currently-mapped snapshot (or
5328 * RBD_SNAP_HEAD_NAME for the base image).
5329 */
5330static ssize_t rbd_snap_show(struct device *dev,
5331			     struct device_attribute *attr,
5332			     char *buf)
5333{
5334	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5335
5336	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5337}
5338
5339static ssize_t rbd_snap_id_show(struct device *dev,
5340				struct device_attribute *attr, char *buf)
5341{
5342	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5343
5344	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5345}
5346
5347/*
5348 * For a v2 image, shows the chain of parent images, separated by empty
5349 * lines.  For v1 images or if there is no parent, shows "(no parent
5350 * image)".
5351 */
5352static ssize_t rbd_parent_show(struct device *dev,
5353			       struct device_attribute *attr,
5354			       char *buf)
5355{
5356	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5357	ssize_t count = 0;
5358
5359	if (!rbd_dev->parent)
5360		return sprintf(buf, "(no parent image)\n");
5361
5362	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5363		struct rbd_spec *spec = rbd_dev->parent_spec;
5364
5365		count += sprintf(&buf[count], "%s"
5366			    "pool_id %llu\npool_name %s\n"
5367			    "pool_ns %s\n"
5368			    "image_id %s\nimage_name %s\n"
5369			    "snap_id %llu\nsnap_name %s\n"
5370			    "overlap %llu\n",
5371			    !count ? "" : "\n", /* first? */
5372			    spec->pool_id, spec->pool_name,
5373			    spec->pool_ns ?: "",
5374			    spec->image_id, spec->image_name ?: "(unknown)",
5375			    spec->snap_id, spec->snap_name,
5376			    rbd_dev->parent_overlap);
5377	}
5378
5379	return count;
5380}
5381
5382static ssize_t rbd_image_refresh(struct device *dev,
5383				 struct device_attribute *attr,
5384				 const char *buf,
5385				 size_t size)
5386{
5387	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5388	int ret;
5389
5390	ret = rbd_dev_refresh(rbd_dev);
5391	if (ret)
5392		return ret;
5393
5394	return size;
5395}
5396
5397static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5398static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5399static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5400static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5401static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5402static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5403static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5404static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5405static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5406static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
5407static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
5408static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5409static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5410static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5411static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5412static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5413static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5414
5415static struct attribute *rbd_attrs[] = {
5416	&dev_attr_size.attr,
5417	&dev_attr_features.attr,
5418	&dev_attr_major.attr,
5419	&dev_attr_minor.attr,
5420	&dev_attr_client_addr.attr,
5421	&dev_attr_client_id.attr,
5422	&dev_attr_cluster_fsid.attr,
5423	&dev_attr_config_info.attr,
5424	&dev_attr_pool.attr,
5425	&dev_attr_pool_id.attr,
5426	&dev_attr_pool_ns.attr,
5427	&dev_attr_name.attr,
5428	&dev_attr_image_id.attr,
5429	&dev_attr_current_snap.attr,
5430	&dev_attr_snap_id.attr,
5431	&dev_attr_parent.attr,
5432	&dev_attr_refresh.attr,
5433	NULL
5434};
5435
5436static struct attribute_group rbd_attr_group = {
5437	.attrs = rbd_attrs,
5438};
5439
5440static const struct attribute_group *rbd_attr_groups[] = {
5441	&rbd_attr_group,
5442	NULL
5443};
5444
5445static void rbd_dev_release(struct device *dev);
5446
5447static const struct device_type rbd_device_type = {
5448	.name		= "rbd",
5449	.groups		= rbd_attr_groups,
5450	.release	= rbd_dev_release,
5451};
5452
5453static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5454{
5455	kref_get(&spec->kref);
5456
5457	return spec;
5458}
5459
5460static void rbd_spec_free(struct kref *kref);
5461static void rbd_spec_put(struct rbd_spec *spec)
5462{
5463	if (spec)
5464		kref_put(&spec->kref, rbd_spec_free);
5465}
5466
5467static struct rbd_spec *rbd_spec_alloc(void)
5468{
5469	struct rbd_spec *spec;
5470
5471	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5472	if (!spec)
5473		return NULL;
5474
5475	spec->pool_id = CEPH_NOPOOL;
5476	spec->snap_id = CEPH_NOSNAP;
5477	kref_init(&spec->kref);
5478
5479	return spec;
5480}
5481
5482static void rbd_spec_free(struct kref *kref)
5483{
5484	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5485
5486	kfree(spec->pool_name);
5487	kfree(spec->pool_ns);
5488	kfree(spec->image_id);
5489	kfree(spec->image_name);
5490	kfree(spec->snap_name);
5491	kfree(spec);
5492}
5493
5494static void rbd_dev_free(struct rbd_device *rbd_dev)
5495{
5496	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
5497	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
5498
5499	ceph_oid_destroy(&rbd_dev->header_oid);
5500	ceph_oloc_destroy(&rbd_dev->header_oloc);
5501	kfree(rbd_dev->config_info);
5502
5503	rbd_put_client(rbd_dev->rbd_client);
5504	rbd_spec_put(rbd_dev->spec);
5505	kfree(rbd_dev->opts);
5506	kfree(rbd_dev);
5507}
5508
5509static void rbd_dev_release(struct device *dev)
5510{
5511	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5512	bool need_put = !!rbd_dev->opts;
5513
5514	if (need_put) {
5515		destroy_workqueue(rbd_dev->task_wq);
5516		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5517	}
5518
5519	rbd_dev_free(rbd_dev);
5520
5521	/*
5522	 * This is racy, but way better than putting module outside of
5523	 * the release callback.  The race window is pretty small, so
5524	 * doing something similar to dm (dm-builtin.c) is overkill.
5525	 */
5526	if (need_put)
5527		module_put(THIS_MODULE);
5528}
5529
5530static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
5531					   struct rbd_spec *spec)
5532{
5533	struct rbd_device *rbd_dev;
5534
5535	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
5536	if (!rbd_dev)
5537		return NULL;
5538
5539	spin_lock_init(&rbd_dev->lock);
5540	INIT_LIST_HEAD(&rbd_dev->node);
5541	init_rwsem(&rbd_dev->header_rwsem);
5542
5543	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
5544	ceph_oid_init(&rbd_dev->header_oid);
5545	rbd_dev->header_oloc.pool = spec->pool_id;
5546	if (spec->pool_ns) {
5547		WARN_ON(!*spec->pool_ns);
5548		rbd_dev->header_oloc.pool_ns =
5549		    ceph_find_or_create_string(spec->pool_ns,
5550					       strlen(spec->pool_ns));
5551	}
5552
5553	mutex_init(&rbd_dev->watch_mutex);
5554	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5555	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5556
5557	init_rwsem(&rbd_dev->lock_rwsem);
5558	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5559	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5560	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5561	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5562	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
5563	spin_lock_init(&rbd_dev->lock_lists_lock);
5564	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
5565	INIT_LIST_HEAD(&rbd_dev->running_list);
5566	init_completion(&rbd_dev->acquire_wait);
5567	init_completion(&rbd_dev->releasing_wait);
5568
5569	spin_lock_init(&rbd_dev->object_map_lock);
5570
5571	rbd_dev->dev.bus = &rbd_bus_type;
5572	rbd_dev->dev.type = &rbd_device_type;
5573	rbd_dev->dev.parent = &rbd_root_dev;
5574	device_initialize(&rbd_dev->dev);
5575
5576	rbd_dev->rbd_client = rbdc;
5577	rbd_dev->spec = spec;
5578
5579	return rbd_dev;
5580}
5581
5582/*
5583 * Create a mapping rbd_dev.
5584 */
5585static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5586					 struct rbd_spec *spec,
5587					 struct rbd_options *opts)
5588{
5589	struct rbd_device *rbd_dev;
5590
5591	rbd_dev = __rbd_dev_create(rbdc, spec);
5592	if (!rbd_dev)
5593		return NULL;
5594
5595	rbd_dev->opts = opts;
5596
5597	/* get an id and fill in device name */
5598	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5599					 minor_to_rbd_dev_id(1 << MINORBITS),
5600					 GFP_KERNEL);
5601	if (rbd_dev->dev_id < 0)
5602		goto fail_rbd_dev;
5603
5604	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5605	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5606						   rbd_dev->name);
5607	if (!rbd_dev->task_wq)
5608		goto fail_dev_id;
5609
5610	/* we have a ref from do_rbd_add() */
5611	__module_get(THIS_MODULE);
5612
5613	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5614	return rbd_dev;
5615
5616fail_dev_id:
5617	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5618fail_rbd_dev:
5619	rbd_dev_free(rbd_dev);
5620	return NULL;
5621}
5622
5623static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5624{
5625	if (rbd_dev)
5626		put_device(&rbd_dev->dev);
5627}
5628
5629/*
5630 * Get the size and object order for an image snapshot, or if
5631 * snap_id is CEPH_NOSNAP, get this information for the base
5632 * image.
5633 */
5634static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5635				u8 *order, u64 *snap_size)
5636{
5637	__le64 snapid = cpu_to_le64(snap_id);
5638	int ret;
5639	struct {
5640		u8 order;
5641		__le64 size;
5642	} __attribute__ ((packed)) size_buf = { 0 };
5643
5644	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5645				  &rbd_dev->header_oloc, "get_size",
5646				  &snapid, sizeof(snapid),
5647				  &size_buf, sizeof(size_buf));
5648	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5649	if (ret < 0)
5650		return ret;
5651	if (ret < sizeof (size_buf))
5652		return -ERANGE;
5653
5654	if (order) {
5655		*order = size_buf.order;
5656		dout("  order %u", (unsigned int)*order);
5657	}
5658	*snap_size = le64_to_cpu(size_buf.size);
5659
5660	dout("  snap_id 0x%016llx snap_size = %llu\n",
5661		(unsigned long long)snap_id,
5662		(unsigned long long)*snap_size);
5663
5664	return 0;
5665}
5666
5667static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5668{
5669	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5670					&rbd_dev->header.obj_order,
5671					&rbd_dev->header.image_size);
5672}
5673
5674static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5675{
5676	size_t size;
5677	void *reply_buf;
5678	int ret;
5679	void *p;
5680
5681	/* Response will be an encoded string, which includes a length */
5682	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5683	reply_buf = kzalloc(size, GFP_KERNEL);
5684	if (!reply_buf)
5685		return -ENOMEM;
5686
5687	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5688				  &rbd_dev->header_oloc, "get_object_prefix",
5689				  NULL, 0, reply_buf, size);
5690	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5691	if (ret < 0)
5692		goto out;
5693
5694	p = reply_buf;
5695	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
5696						p + ret, NULL, GFP_NOIO);
5697	ret = 0;
5698
5699	if (IS_ERR(rbd_dev->header.object_prefix)) {
5700		ret = PTR_ERR(rbd_dev->header.object_prefix);
5701		rbd_dev->header.object_prefix = NULL;
5702	} else {
5703		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
5704	}
5705out:
5706	kfree(reply_buf);
5707
5708	return ret;
5709}
5710
5711static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5712		u64 *snap_features)
5713{
5714	__le64 snapid = cpu_to_le64(snap_id);
5715	struct {
5716		__le64 features;
5717		__le64 incompat;
5718	} __attribute__ ((packed)) features_buf = { 0 };
5719	u64 unsup;
5720	int ret;
5721
5722	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5723				  &rbd_dev->header_oloc, "get_features",
5724				  &snapid, sizeof(snapid),
5725				  &features_buf, sizeof(features_buf));
5726	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5727	if (ret < 0)
5728		return ret;
5729	if (ret < sizeof (features_buf))
5730		return -ERANGE;
5731
5732	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5733	if (unsup) {
5734		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5735			 unsup);
5736		return -ENXIO;
5737	}
5738
5739	*snap_features = le64_to_cpu(features_buf.features);
5740
5741	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5742		(unsigned long long)snap_id,
5743		(unsigned long long)*snap_features,
5744		(unsigned long long)le64_to_cpu(features_buf.incompat));
5745
5746	return 0;
5747}
5748
5749static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5750{
5751	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5752						&rbd_dev->header.features);
5753}
5754
5755/*
5756 * These are generic image flags, but since they are used only for
5757 * object map, store them in rbd_dev->object_map_flags.
5758 *
5759 * For the same reason, this function is called only on object map
5760 * (re)load and not on header refresh.
5761 */
5762static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5763{
5764	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5765	__le64 flags;
5766	int ret;
5767
5768	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5769				  &rbd_dev->header_oloc, "get_flags",
5770				  &snapid, sizeof(snapid),
5771				  &flags, sizeof(flags));
5772	if (ret < 0)
5773		return ret;
5774	if (ret < sizeof(flags))
5775		return -EBADMSG;
5776
5777	rbd_dev->object_map_flags = le64_to_cpu(flags);
5778	return 0;
5779}
5780
5781struct parent_image_info {
5782	u64		pool_id;
5783	const char	*pool_ns;
5784	const char	*image_id;
5785	u64		snap_id;
5786
5787	bool		has_overlap;
5788	u64		overlap;
5789};
5790
5791/*
5792 * The caller is responsible for @pii.
5793 */
5794static int decode_parent_image_spec(void **p, void *end,
5795				    struct parent_image_info *pii)
5796{
5797	u8 struct_v;
5798	u32 struct_len;
5799	int ret;
5800
5801	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5802				  &struct_v, &struct_len);
5803	if (ret)
5804		return ret;
5805
5806	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5807	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5808	if (IS_ERR(pii->pool_ns)) {
5809		ret = PTR_ERR(pii->pool_ns);
5810		pii->pool_ns = NULL;
5811		return ret;
5812	}
5813	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5814	if (IS_ERR(pii->image_id)) {
5815		ret = PTR_ERR(pii->image_id);
5816		pii->image_id = NULL;
5817		return ret;
5818	}
5819	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5820	return 0;
5821
5822e_inval:
5823	return -EINVAL;
5824}
5825
5826static int __get_parent_info(struct rbd_device *rbd_dev,
5827			     struct page *req_page,
5828			     struct page *reply_page,
5829			     struct parent_image_info *pii)
5830{
5831	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5832	size_t reply_len = PAGE_SIZE;
5833	void *p, *end;
5834	int ret;
5835
5836	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5837			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
5838			     req_page, sizeof(u64), &reply_page, &reply_len);
5839	if (ret)
5840		return ret == -EOPNOTSUPP ? 1 : ret;
5841
5842	p = page_address(reply_page);
5843	end = p + reply_len;
5844	ret = decode_parent_image_spec(&p, end, pii);
5845	if (ret)
5846		return ret;
5847
5848	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5849			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
5850			     req_page, sizeof(u64), &reply_page, &reply_len);
5851	if (ret)
5852		return ret;
5853
5854	p = page_address(reply_page);
5855	end = p + reply_len;
5856	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5857	if (pii->has_overlap)
5858		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5859
5860	return 0;
5861
5862e_inval:
5863	return -EINVAL;
5864}
5865
5866/*
5867 * The caller is responsible for @pii.
5868 */
5869static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5870				    struct page *req_page,
5871				    struct page *reply_page,
5872				    struct parent_image_info *pii)
5873{
5874	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5875	size_t reply_len = PAGE_SIZE;
5876	void *p, *end;
5877	int ret;
5878
5879	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5880			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
5881			     req_page, sizeof(u64), &reply_page, &reply_len);
5882	if (ret)
5883		return ret;
5884
5885	p = page_address(reply_page);
5886	end = p + reply_len;
5887	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5888	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5889	if (IS_ERR(pii->image_id)) {
5890		ret = PTR_ERR(pii->image_id);
5891		pii->image_id = NULL;
5892		return ret;
5893	}
5894	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5895	pii->has_overlap = true;
5896	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5897
5898	return 0;
5899
5900e_inval:
5901	return -EINVAL;
5902}
5903
5904static int get_parent_info(struct rbd_device *rbd_dev,
5905			   struct parent_image_info *pii)
5906{
5907	struct page *req_page, *reply_page;
5908	void *p;
5909	int ret;
5910
5911	req_page = alloc_page(GFP_KERNEL);
5912	if (!req_page)
5913		return -ENOMEM;
5914
5915	reply_page = alloc_page(GFP_KERNEL);
5916	if (!reply_page) {
5917		__free_page(req_page);
5918		return -ENOMEM;
5919	}
5920
5921	p = page_address(req_page);
5922	ceph_encode_64(&p, rbd_dev->spec->snap_id);
5923	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5924	if (ret > 0)
5925		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5926					       pii);
5927
5928	__free_page(req_page);
5929	__free_page(reply_page);
5930	return ret;
5931}
5932
5933static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5934{
5935	struct rbd_spec *parent_spec;
5936	struct parent_image_info pii = { 0 };
5937	int ret;
5938
5939	parent_spec = rbd_spec_alloc();
5940	if (!parent_spec)
5941		return -ENOMEM;
5942
5943	ret = get_parent_info(rbd_dev, &pii);
5944	if (ret)
5945		goto out_err;
5946
5947	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5948	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5949	     pii.has_overlap, pii.overlap);
5950
5951	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5952		/*
5953		 * Either the parent never existed, or we have
5954		 * record of it but the image got flattened so it no
5955		 * longer has a parent.  When the parent of a
5956		 * layered image disappears we immediately set the
5957		 * overlap to 0.  The effect of this is that all new
5958		 * requests will be treated as if the image had no
5959		 * parent.
5960		 *
5961		 * If !pii.has_overlap, the parent image spec is not
5962		 * applicable.  It's there to avoid duplication in each
5963		 * snapshot record.
5964		 */
5965		if (rbd_dev->parent_overlap) {
5966			rbd_dev->parent_overlap = 0;
5967			rbd_dev_parent_put(rbd_dev);
5968			pr_info("%s: clone image has been flattened\n",
5969				rbd_dev->disk->disk_name);
5970		}
5971
5972		goto out;	/* No parent?  No problem. */
5973	}
5974
5975	/* The ceph file layout needs to fit pool id in 32 bits */
5976
5977	ret = -EIO;
5978	if (pii.pool_id > (u64)U32_MAX) {
5979		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5980			(unsigned long long)pii.pool_id, U32_MAX);
5981		goto out_err;
5982	}
5983
5984	/*
5985	 * The parent won't change (except when the clone is
5986	 * flattened, which is already handled above).  So we only need
5987	 * to record the parent spec if we have not already done so.
5988	 */
5989	if (!rbd_dev->parent_spec) {
5990		parent_spec->pool_id = pii.pool_id;
5991		if (pii.pool_ns && *pii.pool_ns) {
5992			parent_spec->pool_ns = pii.pool_ns;
5993			pii.pool_ns = NULL;
5994		}
5995		parent_spec->image_id = pii.image_id;
5996		pii.image_id = NULL;
5997		parent_spec->snap_id = pii.snap_id;
5998
5999		rbd_dev->parent_spec = parent_spec;
6000		parent_spec = NULL;	/* rbd_dev now owns this */
6001	}
6002
6003	/*
6004	 * We always update the parent overlap.  If it's zero we issue
6005	 * a warning, as we will proceed as if there was no parent.
6006	 */
6007	if (!pii.overlap) {
6008		if (parent_spec) {
6009			/* refresh, careful to warn just once */
6010			if (rbd_dev->parent_overlap)
6011				rbd_warn(rbd_dev,
6012				    "clone now standalone (overlap became 0)");
6013		} else {
6014			/* initial probe */
6015			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
6016		}
6017	}
6018	rbd_dev->parent_overlap = pii.overlap;
6019
6020out:
6021	ret = 0;
6022out_err:
6023	kfree(pii.pool_ns);
6024	kfree(pii.image_id);
6025	rbd_spec_put(parent_spec);
6026	return ret;
6027}
6028
6029static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
6030{
6031	struct {
6032		__le64 stripe_unit;
6033		__le64 stripe_count;
6034	} __attribute__ ((packed)) striping_info_buf = { 0 };
6035	size_t size = sizeof (striping_info_buf);
6036	void *p;
6037	int ret;
6038
6039	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6040				&rbd_dev->header_oloc, "get_stripe_unit_count",
6041				NULL, 0, &striping_info_buf, size);
6042	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6043	if (ret < 0)
6044		return ret;
6045	if (ret < size)
6046		return -ERANGE;
6047
6048	p = &striping_info_buf;
6049	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
6050	rbd_dev->header.stripe_count = ceph_decode_64(&p);
6051	return 0;
6052}
6053
6054static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
6055{
6056	__le64 data_pool_id;
6057	int ret;
6058
6059	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6060				  &rbd_dev->header_oloc, "get_data_pool",
6061				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
6062	if (ret < 0)
6063		return ret;
6064	if (ret < sizeof(data_pool_id))
6065		return -EBADMSG;
6066
6067	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
6068	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
6069	return 0;
6070}
6071
6072static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
6073{
6074	CEPH_DEFINE_OID_ONSTACK(oid);
6075	size_t image_id_size;
6076	char *image_id;
6077	void *p;
6078	void *end;
6079	size_t size;
6080	void *reply_buf = NULL;
6081	size_t len = 0;
6082	char *image_name = NULL;
6083	int ret;
6084
6085	rbd_assert(!rbd_dev->spec->image_name);
6086
6087	len = strlen(rbd_dev->spec->image_id);
6088	image_id_size = sizeof (__le32) + len;
6089	image_id = kmalloc(image_id_size, GFP_KERNEL);
6090	if (!image_id)
6091		return NULL;
6092
6093	p = image_id;
6094	end = image_id + image_id_size;
6095	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
6096
6097	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
6098	reply_buf = kmalloc(size, GFP_KERNEL);
6099	if (!reply_buf)
6100		goto out;
6101
6102	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
6103	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6104				  "dir_get_name", image_id, image_id_size,
6105				  reply_buf, size);
6106	if (ret < 0)
6107		goto out;
6108	p = reply_buf;
6109	end = reply_buf + ret;
6110
6111	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
6112	if (IS_ERR(image_name))
6113		image_name = NULL;
6114	else
6115		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
6116out:
6117	kfree(reply_buf);
6118	kfree(image_id);
6119
6120	return image_name;
6121}
6122
6123static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6124{
6125	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6126	const char *snap_name;
6127	u32 which = 0;
6128
6129	/* Skip over names until we find the one we are looking for */
6130
6131	snap_name = rbd_dev->header.snap_names;
6132	while (which < snapc->num_snaps) {
6133		if (!strcmp(name, snap_name))
6134			return snapc->snaps[which];
6135		snap_name += strlen(snap_name) + 1;
6136		which++;
6137	}
6138	return CEPH_NOSNAP;
6139}
6140
6141static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6142{
6143	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6144	u32 which;
6145	bool found = false;
6146	u64 snap_id;
6147
6148	for (which = 0; !found && which < snapc->num_snaps; which++) {
6149		const char *snap_name;
6150
6151		snap_id = snapc->snaps[which];
6152		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
6153		if (IS_ERR(snap_name)) {
6154			/* ignore no-longer existing snapshots */
6155			if (PTR_ERR(snap_name) == -ENOENT)
6156				continue;
6157			else
6158				break;
6159		}
6160		found = !strcmp(name, snap_name);
6161		kfree(snap_name);
6162	}
6163	return found ? snap_id : CEPH_NOSNAP;
6164}
6165
6166/*
6167 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6168 * no snapshot by that name is found, or if an error occurs.
6169 */
6170static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6171{
6172	if (rbd_dev->image_format == 1)
6173		return rbd_v1_snap_id_by_name(rbd_dev, name);
6174
6175	return rbd_v2_snap_id_by_name(rbd_dev, name);
6176}
6177
6178/*
6179 * An image being mapped will have everything but the snap id.
6180 */
6181static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6182{
6183	struct rbd_spec *spec = rbd_dev->spec;
6184
6185	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6186	rbd_assert(spec->image_id && spec->image_name);
6187	rbd_assert(spec->snap_name);
6188
6189	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6190		u64 snap_id;
6191
6192		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6193		if (snap_id == CEPH_NOSNAP)
6194			return -ENOENT;
6195
6196		spec->snap_id = snap_id;
6197	} else {
6198		spec->snap_id = CEPH_NOSNAP;
6199	}
6200
6201	return 0;
6202}
6203
6204/*
6205 * A parent image will have all ids but none of the names.
6206 *
6207 * All names in an rbd spec are dynamically allocated.  It's OK if we
6208 * can't figure out the name for an image id.
6209 */
6210static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
6211{
6212	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6213	struct rbd_spec *spec = rbd_dev->spec;
6214	const char *pool_name;
6215	const char *image_name;
6216	const char *snap_name;
6217	int ret;
6218
6219	rbd_assert(spec->pool_id != CEPH_NOPOOL);
6220	rbd_assert(spec->image_id);
6221	rbd_assert(spec->snap_id != CEPH_NOSNAP);
6222
6223	/* Get the pool name; we have to make our own copy of this */
6224
6225	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6226	if (!pool_name) {
6227		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6228		return -EIO;
6229	}
6230	pool_name = kstrdup(pool_name, GFP_KERNEL);
6231	if (!pool_name)
6232		return -ENOMEM;
6233
6234	/* Fetch the image name; tolerate failure here */
6235
6236	image_name = rbd_dev_image_name(rbd_dev);
6237	if (!image_name)
6238		rbd_warn(rbd_dev, "unable to get image name");
6239
6240	/* Fetch the snapshot name */
6241
6242	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6243	if (IS_ERR(snap_name)) {
6244		ret = PTR_ERR(snap_name);
6245		goto out_err;
6246	}
6247
6248	spec->pool_name = pool_name;
6249	spec->image_name = image_name;
6250	spec->snap_name = snap_name;
6251
6252	return 0;
6253
6254out_err:
6255	kfree(image_name);
6256	kfree(pool_name);
6257	return ret;
6258}
6259
6260static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
6261{
6262	size_t size;
6263	int ret;
6264	void *reply_buf;
6265	void *p;
6266	void *end;
6267	u64 seq;
6268	u32 snap_count;
6269	struct ceph_snap_context *snapc;
6270	u32 i;
6271
6272	/*
6273	 * We'll need room for the seq value (maximum snapshot id),
6274	 * snapshot count, and array of that many snapshot ids.
6275	 * For now we have a fixed upper limit on the number we're
6276	 * prepared to receive.
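	 *
	 * With the current RBD_MAX_SNAP_COUNT of 510 this works out to
	 * 8 + 4 + 510 * 8 = 4092 bytes, so the reply buffer still fits
	 * in a single 4 KiB allocation.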
6277	 */
6278	size = sizeof (__le64) + sizeof (__le32) +
6279			RBD_MAX_SNAP_COUNT * sizeof (__le64);
6280	reply_buf = kzalloc(size, GFP_KERNEL);
6281	if (!reply_buf)
6282		return -ENOMEM;
6283
6284	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6285				  &rbd_dev->header_oloc, "get_snapcontext",
6286				  NULL, 0, reply_buf, size);
6287	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6288	if (ret < 0)
6289		goto out;
6290
6291	p = reply_buf;
6292	end = reply_buf + ret;
6293	ret = -ERANGE;
6294	ceph_decode_64_safe(&p, end, seq, out);
6295	ceph_decode_32_safe(&p, end, snap_count, out);
6296
6297	/*
6298	 * Make sure the reported number of snapshot ids wouldn't go
6299	 * beyond the end of our buffer.  But before checking that,
6300	 * make sure the computed size of the snapshot context we
6301	 * allocate is representable in a size_t.
6302	 */
6303	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6304				 / sizeof (u64)) {
6305		ret = -EINVAL;
6306		goto out;
6307	}
6308	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6309		goto out;
6310	ret = 0;
6311
6312	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
6313	if (!snapc) {
6314		ret = -ENOMEM;
6315		goto out;
6316	}
6317	snapc->seq = seq;
6318	for (i = 0; i < snap_count; i++)
6319		snapc->snaps[i] = ceph_decode_64(&p);
6320
6321	ceph_put_snap_context(rbd_dev->header.snapc);
6322	rbd_dev->header.snapc = snapc;
6323
6324	dout("  snap context seq = %llu, snap_count = %u\n",
6325		(unsigned long long)seq, (unsigned int)snap_count);
6326out:
6327	kfree(reply_buf);
6328
6329	return ret;
6330}
6331
6332static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6333					u64 snap_id)
6334{
6335	size_t size;
6336	void *reply_buf;
6337	__le64 snapid;
6338	int ret;
6339	void *p;
6340	void *end;
6341	char *snap_name;
6342
6343	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6344	reply_buf = kmalloc(size, GFP_KERNEL);
6345	if (!reply_buf)
6346		return ERR_PTR(-ENOMEM);
6347
6348	snapid = cpu_to_le64(snap_id);
6349	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6350				  &rbd_dev->header_oloc, "get_snapshot_name",
6351				  &snapid, sizeof(snapid), reply_buf, size);
6352	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6353	if (ret < 0) {
6354		snap_name = ERR_PTR(ret);
6355		goto out;
6356	}
6357
6358	p = reply_buf;
6359	end = reply_buf + ret;
6360	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
6361	if (IS_ERR(snap_name))
6362		goto out;
6363
6364	dout("  snap_id 0x%016llx snap_name = %s\n",
6365		(unsigned long long)snap_id, snap_name);
6366out:
6367	kfree(reply_buf);
6368
6369	return snap_name;
6370}
6371
6372static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
6373{
6374	bool first_time = rbd_dev->header.object_prefix == NULL;
6375	int ret;
6376
6377	ret = rbd_dev_v2_image_size(rbd_dev);
6378	if (ret)
6379		return ret;
6380
6381	if (first_time) {
6382		ret = rbd_dev_v2_header_onetime(rbd_dev);
6383		if (ret)
6384			return ret;
6385	}
6386
6387	ret = rbd_dev_v2_snap_context(rbd_dev);
6388	if (ret && first_time) {
6389		kfree(rbd_dev->header.object_prefix);
6390		rbd_dev->header.object_prefix = NULL;
6391	}
6392
6393	return ret;
6394}
6395
6396static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6397{
6398	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6399
6400	if (rbd_dev->image_format == 1)
6401		return rbd_dev_v1_header_info(rbd_dev);
6402
6403	return rbd_dev_v2_header_info(rbd_dev);
6404}
6405
6406/*
6407 * Skips over white space at *buf, and updates *buf to point to the
6408 * first found non-space character (if any). Returns the length of
6409 * the token (string of non-white space characters) found.  Note
6410 * that *buf must be terminated with '\0'.
6411 */
6412static inline size_t next_token(const char **buf)
6413{
6414	/*
6415	 * These are the characters that produce nonzero for
6416	 * isspace() in the "C" and "POSIX" locales.
6417	 */
6418	const char *spaces = " \f\n\r\t\v";
6419
6420	*buf += strspn(*buf, spaces);	/* Find start of token */
6421
6422	return strcspn(*buf, spaces);	/* Return token length */
6423}
6424
6425/*
6426 * Finds the next token in *buf, dynamically allocates a buffer big
6427 * enough to hold a copy of it, and copies the token into the new
6428 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
6429 * that a duplicate buffer is created even for a zero-length token.
6430 *
6431 * Returns a pointer to the newly-allocated duplicate, or a null
6432 * pointer if memory for the duplicate was not available.  If
6433 * the lenp argument is a non-null pointer, the length of the token
6434 * (not including the '\0') is returned in *lenp.
6435 *
6436 * If successful, the *buf pointer will be updated to point beyond
6437 * the end of the found token.
6438 *
6439 * Note: uses GFP_KERNEL for allocation.
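 *
 * Illustrative example: with *buf pointing at "  rbd foo", next_token()
 * skips the two leading spaces and returns 3, and dup_token() returns a
 * newly allocated "rbd" while advancing *buf to the space before "foo".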
6440 */
6441static inline char *dup_token(const char **buf, size_t *lenp)
6442{
6443	char *dup;
6444	size_t len;
6445
6446	len = next_token(buf);
6447	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6448	if (!dup)
6449		return NULL;
6450	*(dup + len) = '\0';
6451	*buf += len;
6452
6453	if (lenp)
6454		*lenp = len;
6455
6456	return dup;
6457}
6458
6459/*
6460 * Parse the options provided for an "rbd add" (i.e., rbd image
6461 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
6462 * and the data written is passed here via a NUL-terminated buffer.
6463 * Returns 0 if successful or an error code otherwise.
6464 *
6465 * The information extracted from these options is recorded in
6466 * the other parameters which return dynamically-allocated
6467 * structures:
6468 *  ceph_opts
6469 *      The address of a pointer that will refer to a ceph options
6470 *      structure.  Caller must release the returned pointer using
6471 *      ceph_destroy_options() when it is no longer needed.
6472 *  rbd_opts
6473 *	Address of an rbd options pointer.  Fully initialized by
6474 *	this function; caller must release with kfree().
6475 *  spec
6476 *	Address of an rbd image specification pointer.  Fully
6477 *	initialized by this function based on parsed options.
6478 *	Caller must release with rbd_spec_put().
6479 *
6480 * The options passed take this form:
6481 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
6482 * where:
6483 *  <mon_addrs>
6484 *      A comma-separated list of one or more monitor addresses.
6485 *      A monitor address is an ip address, optionally followed
6486 *      by a port number (separated by a colon).
6487 *        I.e.:  ip1[:port1][,ip2[:port2]...]
6488 *  <options>
6489 *      A comma-separated list of ceph and/or rbd options.
6490 *  <pool_name>
6491 *      The name of the rados pool containing the rbd image.
6492 *  <image_name>
6493 *      The name of the image in that pool to map.
6494 *  <snap_name>
6495 *      An optional snapshot name.  If provided, the mapping will
6496 *      present data from the image at the time that snapshot was
6497 *      created.  The image head is used if no snapshot name is
6498 *      provided.  Snapshot mappings are always read-only.
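 *
 *  As a purely illustrative example (monitor address, client name, pool,
 *  image and snapshot are made up), a complete add string could be:
 *
 *    1.2.3.4:6789 name=admin rbd myimage mysnap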
6499 */
6500static int rbd_add_parse_args(const char *buf,
6501				struct ceph_options **ceph_opts,
6502				struct rbd_options **opts,
6503				struct rbd_spec **rbd_spec)
6504{
6505	size_t len;
6506	char *options;
6507	const char *mon_addrs;
6508	char *snap_name;
6509	size_t mon_addrs_size;
6510	struct parse_rbd_opts_ctx pctx = { 0 };
6511	struct ceph_options *copts;
6512	int ret;
6513
6514	/* The first four tokens are required */
6515
6516	len = next_token(&buf);
6517	if (!len) {
6518		rbd_warn(NULL, "no monitor address(es) provided");
6519		return -EINVAL;
6520	}
6521	mon_addrs = buf;
6522	mon_addrs_size = len + 1;
6523	buf += len;
6524
6525	ret = -EINVAL;
6526	options = dup_token(&buf, NULL);
6527	if (!options)
6528		return -ENOMEM;
6529	if (!*options) {
6530		rbd_warn(NULL, "no options provided");
6531		goto out_err;
6532	}
6533
6534	pctx.spec = rbd_spec_alloc();
6535	if (!pctx.spec)
6536		goto out_mem;
6537
6538	pctx.spec->pool_name = dup_token(&buf, NULL);
6539	if (!pctx.spec->pool_name)
6540		goto out_mem;
6541	if (!*pctx.spec->pool_name) {
6542		rbd_warn(NULL, "no pool name provided");
6543		goto out_err;
6544	}
6545
6546	pctx.spec->image_name = dup_token(&buf, NULL);
6547	if (!pctx.spec->image_name)
6548		goto out_mem;
6549	if (!*pctx.spec->image_name) {
6550		rbd_warn(NULL, "no image name provided");
6551		goto out_err;
6552	}
6553
6554	/*
6555	 * Snapshot name is optional; default is to use "-"
6556	 * (indicating the head/no snapshot).
6557	 */
6558	len = next_token(&buf);
6559	if (!len) {
6560		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6561		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6562	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
6563		ret = -ENAMETOOLONG;
6564		goto out_err;
6565	}
6566	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6567	if (!snap_name)
6568		goto out_mem;
6569	*(snap_name + len) = '\0';
6570	pctx.spec->snap_name = snap_name;
6571
6572	/* Initialize all rbd options to the defaults */
6573
6574	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6575	if (!pctx.opts)
6576		goto out_mem;
6577
6578	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6579	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
6580	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6581	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6582	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6583	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6584	pctx.opts->trim = RBD_TRIM_DEFAULT;
6585
6586	copts = ceph_parse_options(options, mon_addrs,
6587				   mon_addrs + mon_addrs_size - 1,
6588				   parse_rbd_opts_token, &pctx);
6589	if (IS_ERR(copts)) {
6590		ret = PTR_ERR(copts);
6591		goto out_err;
6592	}
6593	kfree(options);
6594
6595	*ceph_opts = copts;
6596	*opts = pctx.opts;
6597	*rbd_spec = pctx.spec;
6598
6599	return 0;
6600out_mem:
6601	ret = -ENOMEM;
6602out_err:
6603	kfree(pctx.opts);
6604	rbd_spec_put(pctx.spec);
6605	kfree(options);
6606
6607	return ret;
6608}
6609
6610static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6611{
6612	down_write(&rbd_dev->lock_rwsem);
6613	if (__rbd_is_lock_owner(rbd_dev))
6614		__rbd_release_lock(rbd_dev);
6615	up_write(&rbd_dev->lock_rwsem);
6616}
6617
6618/*
6619 * If the wait is interrupted, an error is returned even if the lock
6620 * was successfully acquired.  rbd_dev_image_unlock() will release it
6621 * if needed.
6622 */
6623static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6624{
6625	long ret;
6626
6627	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6628		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6629			return 0;
6630
6631		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6632		return -EINVAL;
6633	}
6634
6635	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
6636		return 0;
6637
6638	rbd_assert(!rbd_is_lock_owner(rbd_dev));
6639	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6640	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6641			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
6642	if (ret > 0) {
6643		ret = rbd_dev->acquire_err;
6644	} else {
6645		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6646		if (!ret)
6647			ret = -ETIMEDOUT;
6648	}
6649
6650	if (ret) {
6651		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6652		return ret;
6653	}
6654
6655	/*
6656	 * The lock may have been released by now, unless automatic lock
6657	 * transitions are disabled.
6658	 */
6659	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6660	return 0;
6661}
6662
6663/*
6664 * An rbd format 2 image has a unique identifier, distinct from the
6665 * name given to it by the user.  Internally, that identifier is
6666 * what's used to specify the names of objects related to the image.
6667 *
6668 * A special "rbd id" object is used to map an rbd image name to its
6669 * id.  If that object doesn't exist, then there is no v2 rbd image
6670 * with the supplied name.
6671 *
6672 * This function will record the given rbd_dev's image_id field if
6673 * it can be determined, and in that case will return 0.  If any
6674 * errors occur a negative errno will be returned and the rbd_dev's
6675 * image_id field will be unchanged (and should be NULL).
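 *
 * For example, a format 2 image named "myimage" is looked up through an
 * id object named "rbd_id.myimage" (assuming the usual RBD_ID_PREFIX of
 * "rbd_id." from rbd_types.h).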
6676 */
6677static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6678{
6679	int ret;
6680	size_t size;
6681	CEPH_DEFINE_OID_ONSTACK(oid);
6682	void *response;
6683	char *image_id;
6684
6685	/*
6686	 * When probing a parent image, the image id is already
6687	 * known (and the image name likely is not).  There's no
6688	 * need to fetch the image id again in this case.  We
6689	 * do still need to set the image format though.
6690	 */
6691	if (rbd_dev->spec->image_id) {
6692		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6693
6694		return 0;
6695	}
6696
6697	/*
6698	 * First, see if the format 2 image id file exists, and if
6699	 * so, get the image's persistent id from it.
6700	 */
6701	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6702			       rbd_dev->spec->image_name);
6703	if (ret)
6704		return ret;
6705
6706	dout("rbd id object name is %s\n", oid.name);
6707
6708	/* Response will be an encoded string, which includes a length */
6709	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6710	response = kzalloc(size, GFP_NOIO);
6711	if (!response) {
6712		ret = -ENOMEM;
6713		goto out;
6714	}
6715
6716	/* If it doesn't exist we'll assume it's a format 1 image */
6717
6718	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6719				  "get_id", NULL, 0,
6720				  response, size);
6721	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6722	if (ret == -ENOENT) {
6723		image_id = kstrdup("", GFP_KERNEL);
6724		ret = image_id ? 0 : -ENOMEM;
6725		if (!ret)
6726			rbd_dev->image_format = 1;
6727	} else if (ret >= 0) {
6728		void *p = response;
6729
6730		image_id = ceph_extract_encoded_string(&p, p + ret,
6731						NULL, GFP_NOIO);
6732		ret = PTR_ERR_OR_ZERO(image_id);
6733		if (!ret)
6734			rbd_dev->image_format = 2;
6735	}
6736
6737	if (!ret) {
6738		rbd_dev->spec->image_id = image_id;
6739		dout("image_id is %s\n", image_id);
6740	}
6741out:
6742	kfree(response);
6743	ceph_oid_destroy(&oid);
6744	return ret;
6745}
6746
6747/*
6748 * Undo whatever state changes were made by the v1 or v2 header info
6749 * call.
6750 */
6751static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6752{
6753	struct rbd_image_header	*header;
6754
6755	rbd_dev_parent_put(rbd_dev);
6756	rbd_object_map_free(rbd_dev);
6757	rbd_dev_mapping_clear(rbd_dev);
6758
6759	/* Free dynamic fields from the header, then zero it out */
6760
6761	header = &rbd_dev->header;
6762	ceph_put_snap_context(header->snapc);
6763	kfree(header->snap_sizes);
6764	kfree(header->snap_names);
6765	kfree(header->object_prefix);
6766	memset(header, 0, sizeof (*header));
6767}
6768
6769static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6770{
6771	int ret;
6772
6773	ret = rbd_dev_v2_object_prefix(rbd_dev);
6774	if (ret)
6775		goto out_err;
6776
6777	/*
6778	 * Get and check the features for the image.  Currently the
6779	 * features are assumed to never change.
6780	 */
6781	ret = rbd_dev_v2_features(rbd_dev);
6782	if (ret)
6783		goto out_err;
6784
6785	/* If the image supports fancy striping, get its parameters */
6786
6787	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6788		ret = rbd_dev_v2_striping_info(rbd_dev);
6789		if (ret < 0)
6790			goto out_err;
6791	}
6792
6793	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6794		ret = rbd_dev_v2_data_pool(rbd_dev);
6795		if (ret)
6796			goto out_err;
6797	}
6798
6799	rbd_init_layout(rbd_dev);
6800	return 0;
6801
6802out_err:
6803	rbd_dev->header.features = 0;
6804	kfree(rbd_dev->header.object_prefix);
6805	rbd_dev->header.object_prefix = NULL;
6806	return ret;
6807}
6808
6809/*
6810 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6811 * rbd_dev_image_probe() recursion depth, which means it's also the
6812 * length of the already discovered part of the parent chain.
6813 */
6814static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
6815{
6816	struct rbd_device *parent = NULL;
6817	int ret;
6818
6819	if (!rbd_dev->parent_spec)
6820		return 0;
6821
6822	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6823		pr_info("parent chain is too long (%d)\n", depth);
6824		ret = -EINVAL;
6825		goto out_err;
6826	}
6827
6828	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
6829	if (!parent) {
6830		ret = -ENOMEM;
6831		goto out_err;
6832	}
6833
6834	/*
6835	 * Images related by parent/child relationships always share
6836	 * rbd_client and spec/parent_spec, so bump their refcounts.
6837	 */
6838	__rbd_get_client(rbd_dev->rbd_client);
6839	rbd_spec_get(rbd_dev->parent_spec);
6840
6841	ret = rbd_dev_image_probe(parent, depth);
6842	if (ret < 0)
6843		goto out_err;
6844
6845	rbd_dev->parent = parent;
6846	atomic_set(&rbd_dev->parent_ref, 1);
6847	return 0;
6848
6849out_err:
6850	rbd_dev_unparent(rbd_dev);
6851	rbd_dev_destroy(parent);
6852	return ret;
6853}
6854
6855static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6856{
6857	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6858	rbd_free_disk(rbd_dev);
6859	if (!single_major)
6860		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6861}
6862
6863/*
6864 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6865 * upon return.
6866 */
6867static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6868{
6869	int ret;
6870
6871	/* Record our major and minor device numbers. */
6872
6873	if (!single_major) {
6874		ret = register_blkdev(0, rbd_dev->name);
6875		if (ret < 0)
6876			goto err_out_unlock;
6877
6878		rbd_dev->major = ret;
6879		rbd_dev->minor = 0;
6880	} else {
6881		rbd_dev->major = rbd_major;
6882		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6883	}
6884
6885	/* Set up the blkdev mapping. */
6886
6887	ret = rbd_init_disk(rbd_dev);
6888	if (ret)
6889		goto err_out_blkdev;
6890
6891	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
6892	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
6893
6894	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6895	if (ret)
6896		goto err_out_disk;
6897
6898	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6899	up_write(&rbd_dev->header_rwsem);
6900	return 0;
6901
6902err_out_disk:
6903	rbd_free_disk(rbd_dev);
6904err_out_blkdev:
6905	if (!single_major)
6906		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6907err_out_unlock:
6908	up_write(&rbd_dev->header_rwsem);
6909	return ret;
6910}
6911
6912static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6913{
6914	struct rbd_spec *spec = rbd_dev->spec;
6915	int ret;
6916
6917	/* Record the header object name for this rbd image. */
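	/*
	 * For instance, assuming the usual RBD_SUFFIX (".rbd") and
	 * RBD_HEADER_PREFIX ("rbd_header.") from rbd_types.h, an image
	 * named "myimage" gets a "myimage.rbd" header object for format 1,
	 * or "rbd_header.<image id>" for format 2.
	 */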
6918
6919	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6920	if (rbd_dev->image_format == 1)
6921		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6922				       spec->image_name, RBD_SUFFIX);
6923	else
6924		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6925				       RBD_HEADER_PREFIX, spec->image_id);
6926
6927	return ret;
6928}
6929
6930static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6931{
6932	rbd_dev_unprobe(rbd_dev);
6933	if (rbd_dev->opts)
6934		rbd_unregister_watch(rbd_dev);
6935	rbd_dev->image_format = 0;
6936	kfree(rbd_dev->spec->image_id);
6937	rbd_dev->spec->image_id = NULL;
6938}
6939
6940/*
6941 * Probe for the existence of the header object for the given rbd
6942 * device.  If this image is the one being mapped (i.e., not a
6943 * parent), initiate a watch on its header object before using that
6944 * object to get detailed information about the rbd image.
6945 */
6946static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6947{
6948	int ret;
6949
6950	/*
6951	 * Get the id from the image id object.  Unless there's an
6952	 * error, rbd_dev->spec->image_id will be filled in with
6953	 * a dynamically-allocated string, and rbd_dev->image_format
6954	 * will be set to either 1 or 2.
6955	 */
6956	ret = rbd_dev_image_id(rbd_dev);
6957	if (ret)
6958		return ret;
6959
6960	ret = rbd_dev_header_name(rbd_dev);
6961	if (ret)
6962		goto err_out_format;
6963
6964	if (!depth) {
6965		ret = rbd_register_watch(rbd_dev);
6966		if (ret) {
6967			if (ret == -ENOENT)
6968				pr_info("image %s/%s%s%s does not exist\n",
6969					rbd_dev->spec->pool_name,
6970					rbd_dev->spec->pool_ns ?: "",
6971					rbd_dev->spec->pool_ns ? "/" : "",
6972					rbd_dev->spec->image_name);
6973			goto err_out_format;
6974		}
6975	}
6976
6977	ret = rbd_dev_header_info(rbd_dev);
6978	if (ret)
6979		goto err_out_watch;
6980
6981	/*
6982	 * If this image is the one being mapped, we have pool name and
6983	 * id, image name and id, and snap name - need to fill snap id.
6984	 * Otherwise this is a parent image, identified by pool, image
6985	 * and snap ids - need to fill in names for those ids.
6986	 */
6987	if (!depth)
6988		ret = rbd_spec_fill_snap_id(rbd_dev);
6989	else
6990		ret = rbd_spec_fill_names(rbd_dev);
6991	if (ret) {
6992		if (ret == -ENOENT)
6993			pr_info("snap %s/%s%s%s@%s does not exist\n",
6994				rbd_dev->spec->pool_name,
6995				rbd_dev->spec->pool_ns ?: "",
6996				rbd_dev->spec->pool_ns ? "/" : "",
6997				rbd_dev->spec->image_name,
6998				rbd_dev->spec->snap_name);
6999		goto err_out_probe;
7000	}
7001
7002	ret = rbd_dev_mapping_set(rbd_dev);
7003	if (ret)
7004		goto err_out_probe;
7005
7006	if (rbd_dev->spec->snap_id != CEPH_NOSNAP &&
7007	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
7008		ret = rbd_object_map_load(rbd_dev);
7009		if (ret)
7010			goto err_out_probe;
7011	}
7012
7013	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
7014		ret = rbd_dev_v2_parent_info(rbd_dev);
7015		if (ret)
7016			goto err_out_probe;
7017	}
7018
7019	ret = rbd_dev_probe_parent(rbd_dev, depth);
7020	if (ret)
7021		goto err_out_probe;
7022
7023	dout("discovered format %u image, header name is %s\n",
7024		rbd_dev->image_format, rbd_dev->header_oid.name);
7025	return 0;
7026
7027err_out_probe:
7028	rbd_dev_unprobe(rbd_dev);
7029err_out_watch:
7030	if (!depth)
7031		rbd_unregister_watch(rbd_dev);
7032err_out_format:
7033	rbd_dev->image_format = 0;
7034	kfree(rbd_dev->spec->image_id);
7035	rbd_dev->spec->image_id = NULL;
7036	return ret;
7037}
7038
7039static ssize_t do_rbd_add(struct bus_type *bus,
7040			  const char *buf,
7041			  size_t count)
7042{
7043	struct rbd_device *rbd_dev = NULL;
7044	struct ceph_options *ceph_opts = NULL;
7045	struct rbd_options *rbd_opts = NULL;
7046	struct rbd_spec *spec = NULL;
7047	struct rbd_client *rbdc;
7048	int rc;
7049
7050	if (!try_module_get(THIS_MODULE))
7051		return -ENODEV;
7052
7053	/* parse add command */
7054	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
7055	if (rc < 0)
7056		goto out;
7057
7058	rbdc = rbd_get_client(ceph_opts);
7059	if (IS_ERR(rbdc)) {
7060		rc = PTR_ERR(rbdc);
7061		goto err_out_args;
7062	}
7063
7064	/* pick the pool */
7065	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
7066	if (rc < 0) {
7067		if (rc == -ENOENT)
7068			pr_info("pool %s does not exist\n", spec->pool_name);
7069		goto err_out_client;
7070	}
7071	spec->pool_id = (u64)rc;
7072
7073	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7074	if (!rbd_dev) {
7075		rc = -ENOMEM;
7076		goto err_out_client;
7077	}
7078	rbdc = NULL;		/* rbd_dev now owns this */
7079	spec = NULL;		/* rbd_dev now owns this */
7080	rbd_opts = NULL;	/* rbd_dev now owns this */
7081
7082	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
7083	if (!rbd_dev->config_info) {
7084		rc = -ENOMEM;
7085		goto err_out_rbd_dev;
7086	}
7087
7088	down_write(&rbd_dev->header_rwsem);
7089	rc = rbd_dev_image_probe(rbd_dev, 0);
7090	if (rc < 0) {
7091		up_write(&rbd_dev->header_rwsem);
7092		goto err_out_rbd_dev;
7093	}
7094
7095	/* If we are mapping a snapshot it must be marked read-only */
7096	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
7097		rbd_dev->opts->read_only = true;
7098
7099	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
7100		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
7101			 rbd_dev->layout.object_size);
7102		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
7103	}
7104
7105	rc = rbd_dev_device_setup(rbd_dev);
7106	if (rc)
7107		goto err_out_image_probe;
7108
7109	rc = rbd_add_acquire_lock(rbd_dev);
7110	if (rc)
7111		goto err_out_image_lock;
7112
7113	/* Everything's ready.  Announce the disk to the world. */
7114
7115	rc = device_add(&rbd_dev->dev);
7116	if (rc)
7117		goto err_out_image_lock;
7118
7119	add_disk(rbd_dev->disk);
7120	/* see rbd_init_disk() */
7121	blk_put_queue(rbd_dev->disk->queue);
7122
7123	spin_lock(&rbd_dev_list_lock);
7124	list_add_tail(&rbd_dev->node, &rbd_dev_list);
7125	spin_unlock(&rbd_dev_list_lock);
7126
7127	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
7128		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
7129		rbd_dev->header.features);
7130	rc = count;
7131out:
7132	module_put(THIS_MODULE);
7133	return rc;
7134
7135err_out_image_lock:
7136	rbd_dev_image_unlock(rbd_dev);
7137	rbd_dev_device_release(rbd_dev);
7138err_out_image_probe:
7139	rbd_dev_image_release(rbd_dev);
7140err_out_rbd_dev:
7141	rbd_dev_destroy(rbd_dev);
7142err_out_client:
7143	rbd_put_client(rbdc);
7144err_out_args:
7145	rbd_spec_put(spec);
7146	kfree(rbd_opts);
7147	goto out;
7148}
7149
7150static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
7151{
7152	if (single_major)
7153		return -EINVAL;
7154
7155	return do_rbd_add(bus, buf, count);
7156}
7157
7158static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
7159				      size_t count)
7160{
7161	return do_rbd_add(bus, buf, count);
7162}
7163
7164static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
7165{
7166	while (rbd_dev->parent) {
7167		struct rbd_device *first = rbd_dev;
7168		struct rbd_device *second = first->parent;
7169		struct rbd_device *third;
7170
7171		/*
7172		 * Follow to the parent with no grandparent and
7173		 * remove it.
7174		 */
7175		while (second && (third = second->parent)) {
7176			first = second;
7177			second = third;
7178		}
7179		rbd_assert(second);
7180		rbd_dev_image_release(second);
7181		rbd_dev_destroy(second);
7182		first->parent = NULL;
7183		first->parent_overlap = 0;
7184
7185		rbd_assert(first->parent_spec);
7186		rbd_spec_put(first->parent_spec);
7187		first->parent_spec = NULL;
7188	}
7189}
7190
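/*
 * Handle a write to /sys/bus/rbd/remove (or remove_single_major): the
 * buffer carries the device id, optionally followed by "force", e.g.
 * (with a made-up device id):
 *
 *   echo "0 force" > /sys/bus/rbd/remove
 */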
7191static ssize_t do_rbd_remove(struct bus_type *bus,
7192			     const char *buf,
7193			     size_t count)
7194{
7195	struct rbd_device *rbd_dev = NULL;
7196	struct list_head *tmp;
7197	int dev_id;
7198	char opt_buf[6];
7199	bool force = false;
7200	int ret;
7201
7202	dev_id = -1;
7203	opt_buf[0] = '\0';
7204	sscanf(buf, "%d %5s", &dev_id, opt_buf);
7205	if (dev_id < 0) {
7206		pr_err("dev_id out of range\n");
7207		return -EINVAL;
7208	}
7209	if (opt_buf[0] != '\0') {
7210		if (!strcmp(opt_buf, "force")) {
7211			force = true;
7212		} else {
7213			pr_err("bad remove option at '%s'\n", opt_buf);
7214			return -EINVAL;
7215		}
7216	}
7217
7218	ret = -ENOENT;
7219	spin_lock(&rbd_dev_list_lock);
7220	list_for_each(tmp, &rbd_dev_list) {
7221		rbd_dev = list_entry(tmp, struct rbd_device, node);
7222		if (rbd_dev->dev_id == dev_id) {
7223			ret = 0;
7224			break;
7225		}
7226	}
7227	if (!ret) {
7228		spin_lock_irq(&rbd_dev->lock);
7229		if (rbd_dev->open_count && !force)
7230			ret = -EBUSY;
7231		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
7232					  &rbd_dev->flags))
7233			ret = -EINPROGRESS;
7234		spin_unlock_irq(&rbd_dev->lock);
7235	}
7236	spin_unlock(&rbd_dev_list_lock);
7237	if (ret)
7238		return ret;
7239
7240	if (force) {
7241		/*
7242		 * Prevent new IO from being queued and wait for existing
7243		 * IO to complete/fail.
7244		 */
7245		blk_mq_freeze_queue(rbd_dev->disk->queue);
7246		blk_set_queue_dying(rbd_dev->disk->queue);
7247	}
7248
7249	del_gendisk(rbd_dev->disk);
7250	spin_lock(&rbd_dev_list_lock);
7251	list_del_init(&rbd_dev->node);
7252	spin_unlock(&rbd_dev_list_lock);
7253	device_del(&rbd_dev->dev);
7254
7255	rbd_dev_image_unlock(rbd_dev);
7256	rbd_dev_device_release(rbd_dev);
7257	rbd_dev_image_release(rbd_dev);
7258	rbd_dev_destroy(rbd_dev);
7259	return count;
7260}
7261
7262static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
7263{
7264	if (single_major)
7265		return -EINVAL;
7266
7267	return do_rbd_remove(bus, buf, count);
7268}
7269
7270static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
7271					 size_t count)
7272{
7273	return do_rbd_remove(bus, buf, count);
7274}
7275
7276/*
7277 * create control files in sysfs
7278 * /sys/bus/rbd/...
7279 */
7280static int __init rbd_sysfs_init(void)
7281{
7282	int ret;
7283
7284	ret = device_register(&rbd_root_dev);
7285	if (ret < 0)
7286		return ret;
7287
7288	ret = bus_register(&rbd_bus_type);
7289	if (ret < 0)
7290		device_unregister(&rbd_root_dev);
7291
7292	return ret;
7293}
7294
7295static void __exit rbd_sysfs_cleanup(void)
7296{
7297	bus_unregister(&rbd_bus_type);
7298	device_unregister(&rbd_root_dev);
7299}
7300
7301static int __init rbd_slab_init(void)
7302{
7303	rbd_assert(!rbd_img_request_cache);
7304	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7305	if (!rbd_img_request_cache)
7306		return -ENOMEM;
7307
7308	rbd_assert(!rbd_obj_request_cache);
7309	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
7310	if (!rbd_obj_request_cache)
7311		goto out_err;
7312
7313	return 0;
7314
7315out_err:
7316	kmem_cache_destroy(rbd_img_request_cache);
7317	rbd_img_request_cache = NULL;
7318	return -ENOMEM;
7319}
7320
7321static void rbd_slab_exit(void)
7322{
7323	rbd_assert(rbd_obj_request_cache);
7324	kmem_cache_destroy(rbd_obj_request_cache);
7325	rbd_obj_request_cache = NULL;
7326
7327	rbd_assert(rbd_img_request_cache);
7328	kmem_cache_destroy(rbd_img_request_cache);
7329	rbd_img_request_cache = NULL;
7330}
7331
7332static int __init rbd_init(void)
7333{
7334	int rc;
7335
7336	if (!libceph_compatible(NULL)) {
7337		rbd_warn(NULL, "libceph incompatibility (quitting)");
7338		return -EINVAL;
7339	}
7340
7341	rc = rbd_slab_init();
7342	if (rc)
7343		return rc;
7344
7345	/*
7346	 * The number of active work items is limited by the number of
7347	 * rbd devices * queue depth, so leave @max_active at default.
7348	 */
7349	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7350	if (!rbd_wq) {
7351		rc = -ENOMEM;
7352		goto err_out_slab;
7353	}
7354
7355	if (single_major) {
7356		rbd_major = register_blkdev(0, RBD_DRV_NAME);
7357		if (rbd_major < 0) {
7358			rc = rbd_major;
7359			goto err_out_wq;
7360		}
7361	}
7362
7363	rc = rbd_sysfs_init();
7364	if (rc)
7365		goto err_out_blkdev;
7366
7367	if (single_major)
7368		pr_info("loaded (major %d)\n", rbd_major);
7369	else
7370		pr_info("loaded\n");
7371
7372	return 0;
7373
7374err_out_blkdev:
7375	if (single_major)
7376		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7377err_out_wq:
7378	destroy_workqueue(rbd_wq);
7379err_out_slab:
7380	rbd_slab_exit();
7381	return rc;
7382}
7383
7384static void __exit rbd_exit(void)
7385{
7386	ida_destroy(&rbd_dev_id_ida);
7387	rbd_sysfs_cleanup();
7388	if (single_major)
7389		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7390	destroy_workqueue(rbd_wq);
7391	rbd_slab_exit();
7392}
7393
7394module_init(rbd_init);
7395module_exit(rbd_exit);
7396
7397MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7398MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7399MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7400/* following authorship retained from original osdblk.c */
7401MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7402
7403MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7404MODULE_LICENSE("GPL");