
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter unless it is 0, and return the value it
 * held beforehand (__atomic_add_unless() returns the old value).  If
 * the counter is already 0 it is not incremented and 0 is returned.
 * If the counter is already past INT_MAX, the increment is undone and
 * -EINVAL is returned.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
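
/*
 * Informal sanity check on the 510 limit: each snapshot contributes
 * one __le64 id, so 510 ids take 510 * 8 = 4080 bytes, leaving room
 * for the snapshot count and sequence fields of the context within a
 * single 4KB page.
 */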

#define RBD_SNAP_HEAD_NAME	"-"

#define BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)
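
/*
 * For reference, RBD_FEATURES_ALL sets bits 0, 1, 2, 7 and 8:
 * 0x1 + 0x2 + 0x4 + 0x80 + 0x100 = 0x187, the value reported by the
 * supported_features bus attribute defined below.
 */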

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};

struct rbd_obj_request {
	struct ceph_object_extent ex;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	struct list_head	object_extents;	/* obj_req.ex structs */
	u32			obj_request_count;
	u32			pending_count;

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,		/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,		/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED,	/* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
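
/*
 * Worked example: RBD_SINGLE_MAJOR_PART_SHIFT is 4, so each device
 * owns 16 minors.  dev_id 3 maps to minor 3 << 4 = 48, minors 49-63
 * cover partitions of that device, and minor_to_rbd_dev_id() recovers
 * 3 from any minor in 48-63 (48 >> 4 == 63 >> 4 == 3).
 */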

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};
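
/*
 * These attributes live under /sys/bus/rbd/.  A rough usage sketch,
 * with a made-up monitor address and names (see
 * Documentation/ABI/testing/sysfs-bus-rbd for the authoritative
 * format):
 *
 *   # map: "<mon_addrs> <options> <pool_name> <image_name> [<snap_name>]"
 *   echo "1.2.3.4:6789 name=admin mypool myimage" > /sys/bus/rbd/add
 *
 *   # unmap the device with id 0
 *   echo 0 > /sys/bus/rbd/remove
 */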

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}
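
/*
 * Informal userspace sketch of what ends up here (device path made up):
 *
 *   int ro = 1;
 *   int fd = open("/dev/rbd0", O_RDWR);
 *   ioctl(fd, BLKROSET, &ro);   (this is what "blockdev --setro" issues)
 *
 * Returning -ENOTTY after the snapshot check lets the generic
 * blkdev_roset() code actually apply the change.
 */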

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_lock_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	{Opt_lock_timeout, "lock_timeout=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_notrim, "notrim"},
	{Opt_err, NULL}
};

struct rbd_options {
	int		queue_depth;
	unsigned long	lock_timeout;
	bool		read_only;
	bool		lock_on_read;
	bool		exclusive;
	bool		trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true
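
/*
 * Illustrative (made-up) option strings the table above accepts:
 *
 *   "queue_depth=128,lock_on_read"
 *   "lock_timeout=30,ro,notrim"
 *
 * These arrive as the <options> field of the string written to
 * /sys/bus/rbd/add; tokens libceph does not recognize are handed to
 * parse_rbd_opts_token() below, one at a time.
 */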

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		rbd_opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	case Opt_notrim:
		rbd_opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself, so the caller must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static int wait_for_latest_osdmap(struct ceph_client *client)
{
	u64 newest_epoch;
	int ret;

	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
	if (ret)
		return ret;

	if (client->osdc.osdmap->epoch >= newest_epoch)
		return 0;

	ceph_osdc_maybe_request_map(&client->osdc);
	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
				     client->options->mount_timeout);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = wait_for_latest_osdmap(rbdc->client);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
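
/*
 * Worked example for the order checks above: SECTOR_SHIFT is 9 and
 * 8 * sizeof (int) - 1 is 31, so a valid object order lies in
 * [9, 31], i.e. object sizes from 512 bytes up to 2GB.  The common
 * default order of 22 (4MB objects) passes comfortably.
 */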

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
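
/*
 * Example (assuming the common default order of 22 and no STRIPINGV2
 * parameters in the header): object_size and stripe_unit both become
 * 4MB and stripe_count becomes 1, the "non-fancy" layout that
 * rbd_layout_is_fancy() below checks for.
 */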

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
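
/*
 * Worked example: with snaps[] = { 9, 7, 4 } (descending, as the osd
 * keeps it), comparing key 5 against element 7 returns 1, steering
 * bsearch() toward the tail of the array; looking up 4 yields index 2,
 * while looking up 5 yields BAD_SNAP_INDEX.
 */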

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
	void *buf;
	unsigned long flags;

	buf = bvec_kmap_irq(bv, &flags);
	memset(buf, 0, bv->bv_len);
	flush_dcache_page(bv->bv_page);
	bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		rbd_assert(0);
	}
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
	     kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     kref_read(&img_request->kref));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     kref_read(&img_request->kref));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	img_request->obj_request_count++;
	img_request->pending_count++;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}

static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
	     obj_request->ex.oe_len, osd_req);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never change thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
					rbd_dev->layout.object_size;
}
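
/*
 * Example with 4MB objects: a request covering [0, 4M) is "entire",
 * one covering [1M, 4M) is a "tail" (it ends at the object boundary),
 * and one covering [1M, 2M) is neither.
 */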

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
		return true;
	default:
		BUG();
	}
}

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);
	rbd_assert(osd_req == obj_req->osd_req);

	obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
	if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
		obj_req->xferred = osd_req->r_result;
	else
		/*
		 * Writes aren't allowed to return a data payload.  In some
		 * guarded write cases (e.g. stat + zero on an empty object)
		 * a stat response makes it through, but we don't care.
		 */
		obj_req->xferred = 0;

	rbd_obj_handle_request(obj_req);
}

static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	osd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;

	req = ceph_osdc_alloc_request(osdc,
			(rbd_img_is_write(img_req) ? img_req->snapc : NULL),
			num_ops, false, GFP_NOIO);
	if (!req)
		return NULL;

	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
		goto err_req;

	if (ceph_osdc_alloc_messages(req, GFP_NOIO))
		goto err_req;

	return req;

err_req:
	ceph_osdc_put_request(req);
	return NULL;
}

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		rbd_assert(0);
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	down_read(&rbd_dev->header_rwsem);
	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	up_read(&rbd_dev->header_rwsem);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
{
	struct rbd_img_request *img_request;

	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;
	if (!rbd_img_is_write(img_request))
		img_request->snap_id = rbd_dev->spec->snap_id;
	else
		img_request->snapc = snapc;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);

	spin_lock_init(&img_request->completion_lock);
	INIT_LIST_HEAD(&img_request->object_extents);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
	     obj_op_name(op_type), img_request);
	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request_layered_test(img_request)) {
		img_request_layered_clear(img_request);
		rbd_dev_parent_put(img_request->rbd_dev);
	}

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	kmem_cache_free(rbd_img_request_cache, img_request);
}

static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	u32 cnt = *num_img_extents;

	/* drop extents completely beyond the overlap */
	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
		cnt--;

	if (cnt) {
		struct ceph_file_extent *ex = &img_extents[cnt - 1];

		/* trim final overlapping extent */
		if (ex->fe_off + ex->fe_len > overlap)
			ex->fe_len = overlap - ex->fe_off;
	}

	*num_img_extents = cnt;
}
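
/*
 * Worked example: with overlap = 4096, an extent { fe_off = 3072,
 * fe_len = 2048 } is trimmed to fe_len = 4096 - 3072 = 1024, and any
 * extent starting at or beyond offset 4096 is dropped outright.
 */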

/*
 * Determine the byte range(s) covered by either just the object extent
 * or the entire object in the parent image.
 */
static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
				    bool entire)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	if (!rbd_dev->parent_overlap)
		return 0;

	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
				  entire ? 0 : obj_req->ex.oe_off,
				  entire ? rbd_dev->layout.object_size :
							obj_req->ex.oe_len,
				  &obj_req->img_extents,
				  &obj_req->num_img_extents);
	if (ret)
		return ret;

	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	return 0;
}

static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
{
	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
					       &obj_req->bio_pos,
					       obj_req->ex.oe_len);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
							obj_req->ex.oe_len);
		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
		osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
						    &obj_req->bvec_pos);
		break;
	default:
		rbd_assert(0);
	}
}

static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
{
	obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
	if (!obj_req->osd_req)
		return -ENOMEM;

	osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_req_setup_data(obj_req, 0);

	rbd_osd_req_format_read(obj_req);
	return 0;
}

static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
				unsigned int which)
{
	struct page **pages;

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
	osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
				     8 + sizeof(struct ceph_timespec),
				     0, false, true);
	return 0;
}

static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
				  unsigned int which)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u16 opcode;

	osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
				   rbd_dev->layout.object_size,
				   rbd_dev->layout.object_size);

	if (rbd_obj_is_entire(obj_req))
		opcode = CEPH_OSD_OP_WRITEFULL;
	else
		opcode = CEPH_OSD_OP_WRITE;

	osd_req_op_extent_init(obj_req->osd_req, which, opcode,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_req_setup_data(obj_req, which++);

	rbd_assert(which == obj_req->osd_req->r_num_ops);
	rbd_osd_req_format_write(obj_req);
}

static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
{
	unsigned int num_osd_ops, which = 0;
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (obj_req->num_img_extents) {
		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
		num_osd_ops = 3; /* stat + setallochint + write/writefull */
	} else {
		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
		num_osd_ops = 2; /* setallochint + write/writefull */
	}

	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
	if (!obj_req->osd_req)
		return -ENOMEM;

	if (obj_req->num_img_extents) {
		ret = __rbd_obj_setup_stat(obj_req, which++);
		if (ret)
			return ret;
	}

	__rbd_obj_setup_write(obj_req, which);
	return 0;
}

static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
				    unsigned int which)
{
	u16 opcode;

	if (rbd_obj_is_entire(obj_req)) {
		if (obj_req->num_img_extents) {
			osd_req_op_init(obj_req->osd_req, which++,
					CEPH_OSD_OP_CREATE, 0);
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			osd_req_op_init(obj_req->osd_req, which++,
					CEPH_OSD_OP_DELETE, 0);
			opcode = 0;
		}
	} else if (rbd_obj_is_tail(obj_req)) {
		opcode = CEPH_OSD_OP_TRUNCATE;
	} else {
		opcode = CEPH_OSD_OP_ZERO;
	}

	if (opcode)
		osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);

	rbd_assert(which == obj_req->osd_req->r_num_ops);
	rbd_osd_req_format_write(obj_req);
}

static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
{
	unsigned int num_osd_ops, which = 0;
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_is_entire(obj_req)) {
		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
		if (obj_req->num_img_extents)
			num_osd_ops = 2; /* create + truncate */
		else
			num_osd_ops = 1; /* delete */
	} else {
		if (obj_req->num_img_extents) {
			obj_req->write_state = RBD_OBJ_WRITE_GUARD;
			num_osd_ops = 2; /* stat + truncate/zero */
		} else {
			obj_req->write_state = RBD_OBJ_WRITE_FLAT;
			num_osd_ops = 1; /* truncate/zero */
		}
	}

	obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
	if (!obj_req->osd_req)
		return -ENOMEM;

	if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
		ret = __rbd_obj_setup_stat(obj_req, which++);
		if (ret)
			return ret;
	}

	__rbd_obj_setup_discard(obj_req, which);
	return 0;
}

/*
 * For each object request in @img_req, allocate an OSD request, add
 * individual OSD ops and prepare them for submission.  The number of
 * OSD ops depends on op_type and the overlap point (if any).
 */
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req;
	int ret;

	for_each_obj_request(img_req, obj_req) {
		switch (img_req->op_type) {
		case OBJ_OP_READ:
			ret = rbd_obj_setup_read(obj_req);
			break;
		case OBJ_OP_WRITE:
			ret = rbd_obj_setup_write(obj_req);
			break;
		case OBJ_OP_DISCARD:
			ret = rbd_obj_setup_discard(obj_req);
			break;
		default:
			rbd_assert(0);
		}
		if (ret)
			return ret;
	}

	return 0;
}

union rbd_img_fill_iter {
	struct ceph_bio_iter	bio_iter;
	struct ceph_bvec_iter	bvec_iter;
};

struct rbd_img_fill_ctx {
	enum obj_request_type	pos_type;
	union rbd_img_fill_iter	*pos;
	union rbd_img_fill_iter	iter;
	ceph_object_extent_fn_t	set_pos_fn;
	ceph_object_extent_fn_t	count_fn;
	ceph_object_extent_fn_t	copy_fn;
};

static struct ceph_object_extent *alloc_object_extent(void *arg)
{
	struct rbd_img_request *img_req = arg;
	struct rbd_obj_request *obj_req;

	obj_req = rbd_obj_request_create();
	if (!obj_req)
		return NULL;

	rbd_img_obj_request_add(img_req, obj_req);
	return &obj_req->ex;
}

/*
 * While su != os && sc == 1 is technically not fancy (it's the same
 * layout as su == os && sc == 1), we can't use the nocopy path for it
 * because ->set_pos_fn() should be called only once per object.
 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
 * treat su != os && sc == 1 as fancy.
 */
static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
{
	return l->stripe_unit != l->object_size;
}
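
/*
 * Example (values illustrative): su = 1M, os = 4M, sc = 1 places data
 * exactly as su = os = 4M would, yet it is treated as fancy here
 * purely so that ->set_pos_fn() is never invoked more than once per
 * object.
 */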
1971
1972static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
1973 struct ceph_file_extent *img_extents,
1974 u32 num_img_extents,
1975 struct rbd_img_fill_ctx *fctx)
1976{
1977 u32 i;
1978 int ret;
1979
1980 img_req->data_type = fctx->pos_type;
1981
1982 /*
1983 * Create object requests and set each object request's starting
1984 * position in the provided bio (list) or bio_vec array.
1985 */
1986 fctx->iter = *fctx->pos;
1987 for (i = 0; i < num_img_extents; i++) {
1988 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
1989 img_extents[i].fe_off,
1990 img_extents[i].fe_len,
1991 &img_req->object_extents,
1992 alloc_object_extent, img_req,
1993 fctx->set_pos_fn, &fctx->iter);
1994 if (ret)
1995 return ret;
1996 }
1997
1998 return __rbd_img_fill_request(img_req);
1999}
2000
2001/*
2002 * Map a list of image extents to a list of object extents, create the
2003 * corresponding object requests (normally each to a different object,
2004 * but not always) and add them to @img_req. For each object request,
2005 * set up its data descriptor to point to the corresponding chunk(s) of
2006 * @fctx->pos data buffer.
2007 *
2008 * Because ceph_file_to_extents() will merge adjacent object extents
2009 * together, each object request's data descriptor may point to multiple
2010 * different chunks of @fctx->pos data buffer.
2011 *
2012 * @fctx->pos data buffer is assumed to be large enough.
2013 */
2014static int rbd_img_fill_request(struct rbd_img_request *img_req,
2015 struct ceph_file_extent *img_extents,
2016 u32 num_img_extents,
2017 struct rbd_img_fill_ctx *fctx)
2018{
2019 struct rbd_device *rbd_dev = img_req->rbd_dev;
2020 struct rbd_obj_request *obj_req;
2021 u32 i;
2022 int ret;
2023
2024 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2025 !rbd_layout_is_fancy(&rbd_dev->layout))
2026 return rbd_img_fill_request_nocopy(img_req, img_extents,
2027 num_img_extents, fctx);
2028
2029 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2030
	/*
	 * Create object requests and determine ->bvec_count for each object
	 * request.  Note that the sum of ->bvec_count over all object
	 * requests may be greater than the number of bio_vecs in the
	 * provided bio (list) or bio_vec array because, when mapped, those
	 * bio_vecs can straddle stripe unit boundaries.
	 */
2038 fctx->iter = *fctx->pos;
2039 for (i = 0; i < num_img_extents; i++) {
2040 ret = ceph_file_to_extents(&rbd_dev->layout,
2041 img_extents[i].fe_off,
2042 img_extents[i].fe_len,
2043 &img_req->object_extents,
2044 alloc_object_extent, img_req,
2045 fctx->count_fn, &fctx->iter);
2046 if (ret)
2047 return ret;
2048 }
2049
2050 for_each_obj_request(img_req, obj_req) {
2051 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2052 sizeof(*obj_req->bvec_pos.bvecs),
2053 GFP_NOIO);
2054 if (!obj_req->bvec_pos.bvecs)
2055 return -ENOMEM;
2056 }
2057
2058 /*
2059 * Fill in each object request's private bio_vec array, splitting and
2060 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2061 */
2062 fctx->iter = *fctx->pos;
2063 for (i = 0; i < num_img_extents; i++) {
2064 ret = ceph_iterate_extents(&rbd_dev->layout,
2065 img_extents[i].fe_off,
2066 img_extents[i].fe_len,
2067 &img_req->object_extents,
2068 fctx->copy_fn, &fctx->iter);
2069 if (ret)
2070 return ret;
2071 }
2072
2073 return __rbd_img_fill_request(img_req);
2074}
2075
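/*
 * Set up an image request that carries no data payload (currently
 * only discards), covering a single image extent.
 */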
2076static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2077 u64 off, u64 len)
2078{
2079 struct ceph_file_extent ex = { off, len };
2080 union rbd_img_fill_iter dummy;
2081 struct rbd_img_fill_ctx fctx = {
2082 .pos_type = OBJ_REQUEST_NODATA,
2083 .pos = &dummy,
2084 };
2085
2086 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2087}
2088
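/*
 * Striper callbacks for bio-based data: set_bio_pos() records each
 * object request's starting position (nocopy path), while
 * count_bio_bvecs() and copy_bio_bvecs() size and fill the private
 * bio_vec array (copy path for fancy layouts).
 */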
2089static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2090{
2091 struct rbd_obj_request *obj_req =
2092 container_of(ex, struct rbd_obj_request, ex);
2093 struct ceph_bio_iter *it = arg;
2094
2095 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2096 obj_req->bio_pos = *it;
2097 ceph_bio_iter_advance(it, bytes);
2098}
2099
2100static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2101{
2102 struct rbd_obj_request *obj_req =
2103 container_of(ex, struct rbd_obj_request, ex);
2104 struct ceph_bio_iter *it = arg;
2105
2106 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2107 ceph_bio_iter_advance_step(it, bytes, ({
2108 obj_req->bvec_count++;
	}));
}
2112
2113static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2114{
2115 struct rbd_obj_request *obj_req =
2116 container_of(ex, struct rbd_obj_request, ex);
2117 struct ceph_bio_iter *it = arg;
2118
2119 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2120 ceph_bio_iter_advance_step(it, bytes, ({
2121 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2122 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2123 }));
2124}
2125
2126static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2127 struct ceph_file_extent *img_extents,
2128 u32 num_img_extents,
2129 struct ceph_bio_iter *bio_pos)
2130{
2131 struct rbd_img_fill_ctx fctx = {
2132 .pos_type = OBJ_REQUEST_BIO,
2133 .pos = (union rbd_img_fill_iter *)bio_pos,
2134 .set_pos_fn = set_bio_pos,
2135 .count_fn = count_bio_bvecs,
2136 .copy_fn = copy_bio_bvecs,
2137 };
2138
2139 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2140 &fctx);
2141}
2142
2143static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2144 u64 off, u64 len, struct bio *bio)
2145{
2146 struct ceph_file_extent ex = { off, len };
2147 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2148
2149 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2150}
2151
2152static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2153{
2154 struct rbd_obj_request *obj_req =
2155 container_of(ex, struct rbd_obj_request, ex);
2156 struct ceph_bvec_iter *it = arg;
2157
2158 obj_req->bvec_pos = *it;
2159 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2160 ceph_bvec_iter_advance(it, bytes);
2161}
2162
2163static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2164{
2165 struct rbd_obj_request *obj_req =
2166 container_of(ex, struct rbd_obj_request, ex);
2167 struct ceph_bvec_iter *it = arg;
2168
2169 ceph_bvec_iter_advance_step(it, bytes, ({
2170 obj_req->bvec_count++;
2171 }));
2172}
2173
2174static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2175{
2176 struct rbd_obj_request *obj_req =
2177 container_of(ex, struct rbd_obj_request, ex);
2178 struct ceph_bvec_iter *it = arg;
2179
2180 ceph_bvec_iter_advance_step(it, bytes, ({
2181 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2182 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2183 }));
2184}
2185
2186static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2187 struct ceph_file_extent *img_extents,
2188 u32 num_img_extents,
2189 struct ceph_bvec_iter *bvec_pos)
2190{
2191 struct rbd_img_fill_ctx fctx = {
2192 .pos_type = OBJ_REQUEST_BVECS,
2193 .pos = (union rbd_img_fill_iter *)bvec_pos,
2194 .set_pos_fn = set_bvec_pos,
2195 .count_fn = count_bvecs,
2196 .copy_fn = copy_bvecs,
2197 };
2198
2199 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2200 &fctx);
2201}
2202
2203static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2204 struct ceph_file_extent *img_extents,
2205 u32 num_img_extents,
2206 struct bio_vec *bvecs)
2207{
2208 struct ceph_bvec_iter it = {
2209 .bvecs = bvecs,
2210 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2211 num_img_extents) },
2212 };
2213
2214 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2215 &it);
2216}
2217
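/*
 * Submit all object requests of @img_request.  The extra reference
 * keeps the image request alive while we iterate -- completion of the
 * last object request could otherwise free it under us.
 */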
2218static void rbd_img_request_submit(struct rbd_img_request *img_request)
2219{
2220 struct rbd_obj_request *obj_request;
2221
2222 dout("%s: img %p\n", __func__, img_request);
2223
2224 rbd_img_request_get(img_request);
2225 for_each_obj_request(img_request, obj_request)
2226 rbd_obj_request_submit(obj_request);
2227
2228 rbd_img_request_put(img_request);
2229}
2230
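/*
 * Read the parent data covering @obj_req's image extents by issuing a
 * child image request against the parent image.  For writes/discards
 * the data is read into copyup_bvecs, for reads it lands directly in
 * the original data buffer.
 */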
2231static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2232{
2233 struct rbd_img_request *img_req = obj_req->img_request;
2234 struct rbd_img_request *child_img_req;
2235 int ret;
2236
2237 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2238 OBJ_OP_READ, NULL);
2239 if (!child_img_req)
2240 return -ENOMEM;
2241
2242 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2243 child_img_req->obj_request = obj_req;
2244
2245 if (!rbd_img_is_write(img_req)) {
2246 switch (img_req->data_type) {
2247 case OBJ_REQUEST_BIO:
2248 ret = __rbd_img_fill_from_bio(child_img_req,
2249 obj_req->img_extents,
2250 obj_req->num_img_extents,
2251 &obj_req->bio_pos);
2252 break;
2253 case OBJ_REQUEST_BVECS:
2254 case OBJ_REQUEST_OWN_BVECS:
2255 ret = __rbd_img_fill_from_bvecs(child_img_req,
2256 obj_req->img_extents,
2257 obj_req->num_img_extents,
2258 &obj_req->bvec_pos);
2259 break;
2260 default:
2261 rbd_assert(0);
2262 }
2263 } else {
2264 ret = rbd_img_fill_from_bvecs(child_img_req,
2265 obj_req->img_extents,
2266 obj_req->num_img_extents,
2267 obj_req->copyup_bvecs);
2268 }
2269 if (ret) {
2270 rbd_img_request_put(child_img_req);
2271 return ret;
2272 }
2273
2274 rbd_img_request_submit(child_img_req);
2275 return 0;
2276}
2277
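/*
 * Handle read completion.  Returns true if @obj_req is done, or false
 * if it was resubmitted to the parent image (layered read).
 */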
2278static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2279{
2280 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2281 int ret;
2282
2283 if (obj_req->result == -ENOENT &&
2284 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2285 /* reverse map this object extent onto the parent */
2286 ret = rbd_obj_calc_img_extents(obj_req, false);
2287 if (ret) {
2288 obj_req->result = ret;
2289 return true;
2290 }
2291
2292 if (obj_req->num_img_extents) {
2293 obj_req->tried_parent = true;
2294 ret = rbd_obj_read_from_parent(obj_req);
2295 if (ret) {
2296 obj_req->result = ret;
2297 return true;
2298 }
2299 return false;
2300 }
2301 }
2302
2303 /*
2304 * -ENOENT means a hole in the image -- zero-fill the entire
2305 * length of the request. A short read also implies zero-fill
2306 * to the end of the request. In both cases we update xferred
2307 * count to indicate the whole request was satisfied.
2308 */
2309 if (obj_req->result == -ENOENT ||
2310 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
2311 rbd_assert(!obj_req->xferred || !obj_req->result);
2312 rbd_obj_zero_range(obj_req, obj_req->xferred,
2313 obj_req->ex.oe_len - obj_req->xferred);
2314 obj_req->result = 0;
2315 obj_req->xferred = obj_req->ex.oe_len;
2316 }
2317
2318 return true;
2319}
2320
2321/*
2322 * copyup_bvecs pages are never highmem pages
2323 */
2324static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2325{
2326 struct ceph_bvec_iter it = {
2327 .bvecs = bvecs,
2328 .iter = { .bi_size = bytes },
2329 };
2330
2331 ceph_bvec_iter_advance_step(&it, bytes, ({
2332 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2333 bv.bv_len))
2334 return false;
2335 }));
2336 return true;
2337}
2338
2339static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2340{
2341 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
2342
2343 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2344 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2345 rbd_osd_req_destroy(obj_req->osd_req);
2346
2347 /*
2348 * Create a copyup request with the same number of OSD ops as
2349 * the original request. The original request was stat + op(s),
2350 * the new copyup request will be copyup + the same op(s).
2351 */
2352 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
2353 if (!obj_req->osd_req)
2354 return -ENOMEM;
2355
2356 /*
2357 * Only send non-zero copyup data to save some I/O and network
2358 * bandwidth -- zero copyup data is equivalent to the object not
2359 * existing.
2360 */
2361 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2362 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2363 bytes = 0;
2364 }
2365
2366 osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2367 "copyup");
2368 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
2369 obj_req->copyup_bvecs,
2370 obj_req->copyup_bvec_count,
2371 bytes);
2372
2373 switch (obj_req->img_request->op_type) {
2374 case OBJ_OP_WRITE:
2375 __rbd_obj_setup_write(obj_req, 1);
2376 break;
2377 case OBJ_OP_DISCARD:
2378 rbd_assert(!rbd_obj_is_entire(obj_req));
2379 __rbd_obj_setup_discard(obj_req, 1);
2380 break;
2381 default:
2382 rbd_assert(0);
2383 }
2384
2385 rbd_obj_request_submit(obj_req);
2386 return 0;
2387}
2388
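/*
 * Allocate a page vector large enough to hold @obj_overlap bytes of
 * parent data for copyup.
 */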
2389static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2390{
2391 u32 i;
2392
2393 rbd_assert(!obj_req->copyup_bvecs);
2394 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2395 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2396 sizeof(*obj_req->copyup_bvecs),
2397 GFP_NOIO);
2398 if (!obj_req->copyup_bvecs)
2399 return -ENOMEM;
2400
2401 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2402 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2403
2404 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2405 if (!obj_req->copyup_bvecs[i].bv_page)
2406 return -ENOMEM;
2407
2408 obj_req->copyup_bvecs[i].bv_offset = 0;
2409 obj_req->copyup_bvecs[i].bv_len = len;
2410 obj_overlap -= len;
2411 }
2412
2413 rbd_assert(!obj_overlap);
2414 return 0;
2415}
2416
2417static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2418{
2419 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2420 int ret;
2421
2422 rbd_assert(obj_req->num_img_extents);
2423 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2424 rbd_dev->parent_overlap);
2425 if (!obj_req->num_img_extents) {
2426 /*
2427 * The overlap has become 0 (most likely because the
2428 * image has been flattened). Use rbd_obj_issue_copyup()
2429 * to re-submit the original write request -- the copyup
2430 * operation itself will be a no-op, since someone must
2431 * have populated the child object while we weren't
2432 * looking. Move to WRITE_FLAT state as we'll be done
2433 * with the operation once the null copyup completes.
2434 */
2435 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2436 return rbd_obj_issue_copyup(obj_req, 0);
2437 }
2438
2439 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
2440 if (ret)
2441 return ret;
2442
2443 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
2444 return rbd_obj_read_from_parent(obj_req);
2445}
2446
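/*
 * Advance the write state machine.  Returns true if @obj_req is done,
 * or false if more I/O (a copyup read or the copyup request itself)
 * was submitted.
 */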
2447static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2448{
2449 int ret;
2450
2451again:
2452 switch (obj_req->write_state) {
2453 case RBD_OBJ_WRITE_GUARD:
2454 rbd_assert(!obj_req->xferred);
2455 if (obj_req->result == -ENOENT) {
2456 /*
2457 * The target object doesn't exist. Read the data for
2458 * the entire target object up to the overlap point (if
2459 * any) from the parent, so we can use it for a copyup.
2460 */
2461 ret = rbd_obj_handle_write_guard(obj_req);
2462 if (ret) {
2463 obj_req->result = ret;
2464 return true;
2465 }
2466 return false;
2467 }
2468 /* fall through */
2469 case RBD_OBJ_WRITE_FLAT:
2470 if (!obj_req->result)
2471 /*
2472 * There is no such thing as a successful short
2473 * write -- indicate the whole request was satisfied.
2474 */
2475 obj_req->xferred = obj_req->ex.oe_len;
2476 return true;
2477 case RBD_OBJ_WRITE_COPYUP:
2478 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2479 if (obj_req->result)
2480 goto again;
2481
2482 rbd_assert(obj_req->xferred);
2483 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2484 if (ret) {
2485 obj_req->result = ret;
2486 return true;
2487 }
2488 return false;
2489 default:
2490 BUG();
2491 }
2492}
2493
2494/*
2495 * Returns true if @obj_req is completed, or false otherwise.
2496 */
2497static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2498{
2499 switch (obj_req->img_request->op_type) {
2500 case OBJ_OP_READ:
2501 return rbd_obj_handle_read(obj_req);
2502 case OBJ_OP_WRITE:
2503 return rbd_obj_handle_write(obj_req);
2504 case OBJ_OP_DISCARD:
2505 if (rbd_obj_handle_write(obj_req)) {
2506 /*
2507 * Hide -ENOENT from delete/truncate/zero -- discarding
2508 * a non-existent object is not a problem.
2509 */
2510 if (obj_req->result == -ENOENT) {
2511 obj_req->result = 0;
2512 obj_req->xferred = obj_req->ex.oe_len;
2513 }
2514 return true;
2515 }
2516 return false;
2517 default:
2518 BUG();
2519 }
2520}
2521
2522static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2523{
2524 struct rbd_img_request *img_req = obj_req->img_request;
2525
2526 rbd_assert((!obj_req->result &&
2527 obj_req->xferred == obj_req->ex.oe_len) ||
2528 (obj_req->result < 0 && !obj_req->xferred));
2529 if (!obj_req->result) {
2530 img_req->xferred += obj_req->xferred;
2531 return;
2532 }
2533
2534 rbd_warn(img_req->rbd_dev,
2535 "%s at objno %llu %llu~%llu result %d xferred %llu",
2536 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2537 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
2538 obj_req->xferred);
2539 if (!img_req->result) {
2540 img_req->result = obj_req->result;
2541 img_req->xferred = 0;
2542 }
2543}
2544
2545static void rbd_img_end_child_request(struct rbd_img_request *img_req)
2546{
2547 struct rbd_obj_request *obj_req = img_req->obj_request;
2548
2549 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
2550 rbd_assert((!img_req->result &&
2551 img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
2552 (img_req->result < 0 && !img_req->xferred));
2553
2554 obj_req->result = img_req->result;
2555 obj_req->xferred = img_req->xferred;
2556 rbd_img_request_put(img_req);
2557}
2558
2559static void rbd_img_end_request(struct rbd_img_request *img_req)
2560{
2561 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2562 rbd_assert((!img_req->result &&
2563 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2564 (img_req->result < 0 && !img_req->xferred));
2565
2566 blk_mq_end_request(img_req->rq,
2567 errno_to_blk_status(img_req->result));
2568 rbd_img_request_put(img_req);
2569}
2570
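/*
 * Called on OSD request completion: finish @obj_req and, if it was the
 * last pending object request, complete the owning image request,
 * unwinding through the parent chain for child requests.
 */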
2571static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2572{
2573 struct rbd_img_request *img_req;
2574
2575again:
2576 if (!__rbd_obj_handle_request(obj_req))
2577 return;
2578
2579 img_req = obj_req->img_request;
2580 spin_lock(&img_req->completion_lock);
2581 rbd_obj_end_request(obj_req);
2582 rbd_assert(img_req->pending_count);
2583 if (--img_req->pending_count) {
2584 spin_unlock(&img_req->completion_lock);
2585 return;
2586 }
2587
2588 spin_unlock(&img_req->completion_lock);
2589 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2590 obj_req = img_req->obj_request;
2591 rbd_img_end_child_request(img_req);
2592 goto again;
2593 }
2594 rbd_img_end_request(img_req);
2595}
2596
2597static const struct rbd_client_id rbd_empty_cid;
2598
2599static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2600 const struct rbd_client_id *rhs)
2601{
2602 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2603}
2604
2605static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2606{
2607 struct rbd_client_id cid;
2608
2609 mutex_lock(&rbd_dev->watch_mutex);
2610 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2611 cid.handle = rbd_dev->watch_cookie;
2612 mutex_unlock(&rbd_dev->watch_mutex);
2613 return cid;
2614}
2615
2616/*
2617 * lock_rwsem must be held for write
2618 */
2619static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2620 const struct rbd_client_id *cid)
2621{
2622 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2623 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2624 cid->gid, cid->handle);
2625 rbd_dev->owner_cid = *cid; /* struct */
2626}
2627
2628static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2629{
2630 mutex_lock(&rbd_dev->watch_mutex);
2631 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2632 mutex_unlock(&rbd_dev->watch_mutex);
2633}
2634
2635static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2636{
2637 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2638
2639 strcpy(rbd_dev->lock_cookie, cookie);
2640 rbd_set_owner_cid(rbd_dev, &cid);
2641 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2642}
2643
2644/*
2645 * lock_rwsem must be held for write
2646 */
2647static int rbd_lock(struct rbd_device *rbd_dev)
2648{
2649 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2650 char cookie[32];
2651 int ret;
2652
2653 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2654 rbd_dev->lock_cookie[0] != '\0');
2655
2656 format_lock_cookie(rbd_dev, cookie);
2657 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2658 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2659 RBD_LOCK_TAG, "", 0);
2660 if (ret)
2661 return ret;
2662
2663 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
2664 __rbd_lock(rbd_dev, cookie);
2665 return 0;
2666}
2667
2668/*
2669 * lock_rwsem must be held for write
2670 */
2671static void rbd_unlock(struct rbd_device *rbd_dev)
2672{
2673 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2674 int ret;
2675
2676 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2677 rbd_dev->lock_cookie[0] == '\0');
2678
2679 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2680 RBD_LOCK_NAME, rbd_dev->lock_cookie);
2681 if (ret && ret != -ENOENT)
2682 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
2683
2684 /* treat errors as the image is unlocked */
2685 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
2686 rbd_dev->lock_cookie[0] = '\0';
2687 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2688 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
2689}
2690
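/*
 * Send a notify for @notify_op on the header object and wait for the
 * acks.  The raw reply pages are returned to the caller via
 * @preply_pages.
 */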
2691static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2692 enum rbd_notify_op notify_op,
2693 struct page ***preply_pages,
2694 size_t *preply_len)
2695{
2696 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2697 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2698 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
2699 int buf_size = sizeof(buf);
2700 void *p = buf;
2701
2702 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2703
2704 /* encode *LockPayload NotifyMessage (op + ClientId) */
2705 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2706 ceph_encode_32(&p, notify_op);
2707 ceph_encode_64(&p, cid.gid);
2708 ceph_encode_64(&p, cid.handle);
2709
2710 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2711 &rbd_dev->header_oloc, buf, buf_size,
2712 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2713}
2714
2715static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2716 enum rbd_notify_op notify_op)
2717{
2718 struct page **reply_pages;
2719 size_t reply_len;
2720
2721 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2722 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2723}
2724
2725static void rbd_notify_acquired_lock(struct work_struct *work)
2726{
2727 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2728 acquired_lock_work);
2729
2730 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2731}
2732
2733static void rbd_notify_released_lock(struct work_struct *work)
2734{
2735 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2736 released_lock_work);
2737
2738 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2739}
2740
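/*
 * Ask the current lock owner to release the lock.  Returns the owner's
 * ResponseMessage result (<= 0), -ETIMEDOUT if no owner responded, or
 * another negative error code.
 */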
2741static int rbd_request_lock(struct rbd_device *rbd_dev)
2742{
2743 struct page **reply_pages;
2744 size_t reply_len;
2745 bool lock_owner_responded = false;
2746 int ret;
2747
2748 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2749
2750 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2751 &reply_pages, &reply_len);
2752 if (ret && ret != -ETIMEDOUT) {
2753 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2754 goto out;
2755 }
2756
2757 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2758 void *p = page_address(reply_pages[0]);
2759 void *const end = p + reply_len;
2760 u32 n;
2761
2762 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2763 while (n--) {
2764 u8 struct_v;
2765 u32 len;
2766
2767 ceph_decode_need(&p, end, 8 + 8, e_inval);
2768 p += 8 + 8; /* skip gid and cookie */
2769
2770 ceph_decode_32_safe(&p, end, len, e_inval);
2771 if (!len)
2772 continue;
2773
2774 if (lock_owner_responded) {
2775 rbd_warn(rbd_dev,
2776 "duplicate lock owners detected");
2777 ret = -EIO;
2778 goto out;
2779 }
2780
2781 lock_owner_responded = true;
2782 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2783 &struct_v, &len);
2784 if (ret) {
2785 rbd_warn(rbd_dev,
2786 "failed to decode ResponseMessage: %d",
2787 ret);
2788 goto e_inval;
2789 }
2790
2791 ret = ceph_decode_32(&p);
2792 }
2793 }
2794
2795 if (!lock_owner_responded) {
2796 rbd_warn(rbd_dev, "no lock owners detected");
2797 ret = -ETIMEDOUT;
2798 }
2799
2800out:
2801 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2802 return ret;
2803
2804e_inval:
2805 ret = -EINVAL;
2806 goto out;
2807}
2808
2809static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
2810{
2811 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
2812
2813 cancel_delayed_work(&rbd_dev->lock_dwork);
2814 if (wake_all)
2815 wake_up_all(&rbd_dev->lock_waitq);
2816 else
2817 wake_up(&rbd_dev->lock_waitq);
2818}
2819
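/*
 * Fetch the locker list for the header object.  -EBUSY indicates a
 * lock we can't break: an external tag or cookie, or a shared lock.
 */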
2820static int get_lock_owner_info(struct rbd_device *rbd_dev,
2821 struct ceph_locker **lockers, u32 *num_lockers)
2822{
2823 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2824 u8 lock_type;
2825 char *lock_tag;
2826 int ret;
2827
2828 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2829
2830 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
2831 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2832 &lock_type, &lock_tag, lockers, num_lockers);
2833 if (ret)
2834 return ret;
2835
2836 if (*num_lockers == 0) {
2837 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
2838 goto out;
2839 }
2840
2841 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
2842 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
2843 lock_tag);
2844 ret = -EBUSY;
2845 goto out;
2846 }
2847
2848 if (lock_type == CEPH_CLS_LOCK_SHARED) {
2849 rbd_warn(rbd_dev, "shared lock type detected");
2850 ret = -EBUSY;
2851 goto out;
2852 }
2853
2854 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
2855 strlen(RBD_LOCK_COOKIE_PREFIX))) {
2856 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
2857 (*lockers)[0].id.cookie);
2858 ret = -EBUSY;
2859 goto out;
2860 }
2861
2862out:
2863 kfree(lock_tag);
2864 return ret;
2865}
2866
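/*
 * Determine whether @locker is still alive by looking for a matching
 * watch on the header object.  Returns 1 (recording the owner cid) if
 * found, 0 if not, or a negative error code.
 */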
2867static int find_watcher(struct rbd_device *rbd_dev,
2868 const struct ceph_locker *locker)
2869{
2870 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2871 struct ceph_watch_item *watchers;
2872 u32 num_watchers;
2873 u64 cookie;
2874 int i;
2875 int ret;
2876
2877 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
2878 &rbd_dev->header_oloc, &watchers,
2879 &num_watchers);
2880 if (ret)
2881 return ret;
2882
2883 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
2884 for (i = 0; i < num_watchers; i++) {
2885 if (!memcmp(&watchers[i].addr, &locker->info.addr,
2886 sizeof(locker->info.addr)) &&
2887 watchers[i].cookie == cookie) {
2888 struct rbd_client_id cid = {
2889 .gid = le64_to_cpu(watchers[i].name.num),
2890 .handle = cookie,
2891 };
2892
2893 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
2894 rbd_dev, cid.gid, cid.handle);
2895 rbd_set_owner_cid(rbd_dev, &cid);
2896 ret = 1;
2897 goto out;
2898 }
2899 }
2900
2901 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
2902 ret = 0;
2903out:
2904 kfree(watchers);
2905 return ret;
2906}
2907
2908/*
2909 * lock_rwsem must be held for write
2910 */
2911static int rbd_try_lock(struct rbd_device *rbd_dev)
2912{
2913 struct ceph_client *client = rbd_dev->rbd_client->client;
2914 struct ceph_locker *lockers;
2915 u32 num_lockers;
2916 int ret;
2917
2918 for (;;) {
2919 ret = rbd_lock(rbd_dev);
2920 if (ret != -EBUSY)
2921 return ret;
2922
2923 /* determine if the current lock holder is still alive */
2924 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
2925 if (ret)
2926 return ret;
2927
2928 if (num_lockers == 0)
2929 goto again;
2930
2931 ret = find_watcher(rbd_dev, lockers);
2932 if (ret) {
2933 if (ret > 0)
2934 ret = 0; /* have to request lock */
2935 goto out;
2936 }
2937
2938 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
2939 ENTITY_NAME(lockers[0].id.name));
2940
2941 ret = ceph_monc_blacklist_add(&client->monc,
2942 &lockers[0].info.addr);
2943 if (ret) {
2944 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
2945 ENTITY_NAME(lockers[0].id.name), ret);
2946 goto out;
2947 }
2948
2949 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
2950 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2951 lockers[0].id.cookie,
2952 &lockers[0].id.name);
2953 if (ret && ret != -ENOENT)
2954 goto out;
2955
2956again:
2957 ceph_free_lockers(lockers, num_lockers);
2958 }
2959
2960out:
2961 ceph_free_lockers(lockers, num_lockers);
2962 return ret;
2963}
2964
2965/*
2966 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
2967 */
2968static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
2969 int *pret)
2970{
2971 enum rbd_lock_state lock_state;
2972
2973 down_read(&rbd_dev->lock_rwsem);
2974 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
2975 rbd_dev->lock_state);
2976 if (__rbd_is_lock_owner(rbd_dev)) {
2977 lock_state = rbd_dev->lock_state;
2978 up_read(&rbd_dev->lock_rwsem);
2979 return lock_state;
2980 }
2981
2982 up_read(&rbd_dev->lock_rwsem);
2983 down_write(&rbd_dev->lock_rwsem);
2984 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
2985 rbd_dev->lock_state);
2986 if (!__rbd_is_lock_owner(rbd_dev)) {
2987 *pret = rbd_try_lock(rbd_dev);
2988 if (*pret)
2989 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
2990 }
2991
2992 lock_state = rbd_dev->lock_state;
2993 up_write(&rbd_dev->lock_rwsem);
2994 return lock_state;
2995}
2996
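/*
 * Delayed work to acquire the exclusive lock, asking the current owner
 * to release it and rescheduling itself as needed.
 */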
2997static void rbd_acquire_lock(struct work_struct *work)
2998{
2999 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3000 struct rbd_device, lock_dwork);
3001 enum rbd_lock_state lock_state;
3002 int ret = 0;
3003
3004 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3005again:
3006 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3007 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3008 if (lock_state == RBD_LOCK_STATE_LOCKED)
3009 wake_requests(rbd_dev, true);
3010 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3011 rbd_dev, lock_state, ret);
3012 return;
3013 }
3014
3015 ret = rbd_request_lock(rbd_dev);
3016 if (ret == -ETIMEDOUT) {
3017 goto again; /* treat this as a dead client */
3018 } else if (ret == -EROFS) {
3019 rbd_warn(rbd_dev, "peer will not release lock");
3020 /*
3021 * If this is rbd_add_acquire_lock(), we want to fail
3022 * immediately -- reuse BLACKLISTED flag. Otherwise we
3023 * want to block.
3024 */
3025 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3026 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3027 /* wake "rbd map --exclusive" process */
3028 wake_requests(rbd_dev, false);
3029 }
3030 } else if (ret < 0) {
3031 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3032 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3033 RBD_RETRY_DELAY);
3034 } else {
3035 /*
3036 * lock owner acked, but resend if we don't see them
3037 * release the lock
3038 */
3039 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3040 rbd_dev);
3041 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3042 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3043 }
3044}
3045
3046/*
3047 * lock_rwsem must be held for write
3048 */
3049static bool rbd_release_lock(struct rbd_device *rbd_dev)
3050{
3051 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3052 rbd_dev->lock_state);
3053 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3054 return false;
3055
3056 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3057 downgrade_write(&rbd_dev->lock_rwsem);
3058 /*
3059 * Ensure that all in-flight IO is flushed.
3060 *
3061 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3062 * may be shared with other devices.
3063 */
3064 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3065 up_read(&rbd_dev->lock_rwsem);
3066
3067 down_write(&rbd_dev->lock_rwsem);
3068 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3069 rbd_dev->lock_state);
3070 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3071 return false;
3072
3073 rbd_unlock(rbd_dev);
3074 /*
3075 * Give others a chance to grab the lock - we would re-acquire
3076 * almost immediately if we got new IO during ceph_osdc_sync()
3077 * otherwise. We need to ack our own notifications, so this
3078 * lock_dwork will be requeued from rbd_wait_state_locked()
3079 * after wake_requests() in rbd_handle_released_lock().
3080 */
3081 cancel_delayed_work(&rbd_dev->lock_dwork);
3082 return true;
3083}
3084
3085static void rbd_release_lock_work(struct work_struct *work)
3086{
3087 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3088 unlock_work);
3089
3090 down_write(&rbd_dev->lock_rwsem);
3091 rbd_release_lock(rbd_dev);
3092 up_write(&rbd_dev->lock_rwsem);
3093}
3094
3095static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3096 void **p)
3097{
3098 struct rbd_client_id cid = { 0 };
3099
3100 if (struct_v >= 2) {
3101 cid.gid = ceph_decode_64(p);
3102 cid.handle = ceph_decode_64(p);
3103 }
3104
3105 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3106 cid.handle);
3107 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3108 down_write(&rbd_dev->lock_rwsem);
3109 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3110 /*
3111 * we already know that the remote client is
3112 * the owner
3113 */
3114 up_write(&rbd_dev->lock_rwsem);
3115 return;
3116 }
3117
3118 rbd_set_owner_cid(rbd_dev, &cid);
3119 downgrade_write(&rbd_dev->lock_rwsem);
3120 } else {
3121 down_read(&rbd_dev->lock_rwsem);
3122 }
3123
3124 if (!__rbd_is_lock_owner(rbd_dev))
3125 wake_requests(rbd_dev, false);
3126 up_read(&rbd_dev->lock_rwsem);
3127}
3128
3129static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3130 void **p)
3131{
3132 struct rbd_client_id cid = { 0 };
3133
3134 if (struct_v >= 2) {
3135 cid.gid = ceph_decode_64(p);
3136 cid.handle = ceph_decode_64(p);
3137 }
3138
3139 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3140 cid.handle);
3141 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3142 down_write(&rbd_dev->lock_rwsem);
3143 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3144 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3145 __func__, rbd_dev, cid.gid, cid.handle,
3146 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3147 up_write(&rbd_dev->lock_rwsem);
3148 return;
3149 }
3150
3151 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3152 downgrade_write(&rbd_dev->lock_rwsem);
3153 } else {
3154 down_read(&rbd_dev->lock_rwsem);
3155 }
3156
3157 if (!__rbd_is_lock_owner(rbd_dev))
3158 wake_requests(rbd_dev, false);
3159 up_read(&rbd_dev->lock_rwsem);
3160}
3161
3162/*
3163 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3164 * ResponseMessage is needed.
3165 */
3166static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3167 void **p)
3168{
3169 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3170 struct rbd_client_id cid = { 0 };
3171 int result = 1;
3172
3173 if (struct_v >= 2) {
3174 cid.gid = ceph_decode_64(p);
3175 cid.handle = ceph_decode_64(p);
3176 }
3177
3178 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3179 cid.handle);
3180 if (rbd_cid_equal(&cid, &my_cid))
3181 return result;
3182
3183 down_read(&rbd_dev->lock_rwsem);
3184 if (__rbd_is_lock_owner(rbd_dev)) {
3185 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3186 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3187 goto out_unlock;
3188
3189 /*
3190 * encode ResponseMessage(0) so the peer can detect
3191 * a missing owner
3192 */
3193 result = 0;
3194
3195 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3196 if (!rbd_dev->opts->exclusive) {
3197 dout("%s rbd_dev %p queueing unlock_work\n",
3198 __func__, rbd_dev);
3199 queue_work(rbd_dev->task_wq,
3200 &rbd_dev->unlock_work);
3201 } else {
3202 /* refuse to release the lock */
3203 result = -EROFS;
3204 }
3205 }
3206 }
3207
3208out_unlock:
3209 up_read(&rbd_dev->lock_rwsem);
3210 return result;
3211}
3212
3213static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3214 u64 notify_id, u64 cookie, s32 *result)
3215{
3216 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3217 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
3218 int buf_size = sizeof(buf);
3219 int ret;
3220
3221 if (result) {
3222 void *p = buf;
3223
3224 /* encode ResponseMessage */
3225 ceph_start_encoding(&p, 1, 1,
3226 buf_size - CEPH_ENCODING_START_BLK_LEN);
3227 ceph_encode_32(&p, *result);
3228 } else {
3229 buf_size = 0;
3230 }
3231
3232 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3233 &rbd_dev->header_oloc, notify_id, cookie,
3234 buf, buf_size);
3235 if (ret)
3236 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3237}
3238
3239static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3240 u64 cookie)
3241{
3242 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3243 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3244}
3245
3246static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3247 u64 notify_id, u64 cookie, s32 result)
3248{
3249 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3250 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3251}
3252
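/*
 * Watch callback: decode the NotifyMessage and dispatch on notify_op.
 * A zero-length notification is a legacy header update.
 */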
3253static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3254 u64 notifier_id, void *data, size_t data_len)
3255{
3256 struct rbd_device *rbd_dev = arg;
3257 void *p = data;
3258 void *const end = p + data_len;
3259 u8 struct_v = 0;
3260 u32 len;
3261 u32 notify_op;
3262 int ret;
3263
3264 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3265 __func__, rbd_dev, cookie, notify_id, data_len);
3266 if (data_len) {
3267 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3268 &struct_v, &len);
3269 if (ret) {
3270 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3271 ret);
3272 return;
3273 }
3274
3275 notify_op = ceph_decode_32(&p);
3276 } else {
3277 /* legacy notification for header updates */
3278 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3279 len = 0;
3280 }
3281
3282 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3283 switch (notify_op) {
3284 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3285 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3286 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3287 break;
3288 case RBD_NOTIFY_OP_RELEASED_LOCK:
3289 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3290 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3291 break;
3292 case RBD_NOTIFY_OP_REQUEST_LOCK:
3293 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3294 if (ret <= 0)
3295 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3296 cookie, ret);
3297 else
3298 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3299 break;
3300 case RBD_NOTIFY_OP_HEADER_UPDATE:
3301 ret = rbd_dev_refresh(rbd_dev);
3302 if (ret)
3303 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3304
3305 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3306 break;
3307 default:
3308 if (rbd_is_lock_owner(rbd_dev))
3309 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3310 cookie, -EOPNOTSUPP);
3311 else
3312 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3313 break;
3314 }
3315}
3316
3317static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3318
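/*
 * Watch error callback: the watch is gone, so the owner cid is stale.
 * Flag the error and schedule the watch to be reregistered.
 */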
3319static void rbd_watch_errcb(void *arg, u64 cookie, int err)
3320{
3321 struct rbd_device *rbd_dev = arg;
3322
3323 rbd_warn(rbd_dev, "encountered watch error: %d", err);
3324
3325 down_write(&rbd_dev->lock_rwsem);
3326 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3327 up_write(&rbd_dev->lock_rwsem);
3328
3329 mutex_lock(&rbd_dev->watch_mutex);
3330 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3331 __rbd_unregister_watch(rbd_dev);
3332 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
3333
3334 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
3335 }
3336 mutex_unlock(&rbd_dev->watch_mutex);
3337}
3338
3339/*
3340 * watch_mutex must be locked
3341 */
3342static int __rbd_register_watch(struct rbd_device *rbd_dev)
3343{
3344 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3345 struct ceph_osd_linger_request *handle;
3346
3347 rbd_assert(!rbd_dev->watch_handle);
3348 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3349
3350 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3351 &rbd_dev->header_oloc, rbd_watch_cb,
3352 rbd_watch_errcb, rbd_dev);
3353 if (IS_ERR(handle))
3354 return PTR_ERR(handle);
3355
3356 rbd_dev->watch_handle = handle;
3357 return 0;
3358}
3359
3360/*
3361 * watch_mutex must be locked
3362 */
3363static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
3364{
3365 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3366 int ret;
3367
3368 rbd_assert(rbd_dev->watch_handle);
3369 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3370
3371 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3372 if (ret)
3373 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
3374
3375 rbd_dev->watch_handle = NULL;
3376}
3377
3378static int rbd_register_watch(struct rbd_device *rbd_dev)
3379{
3380 int ret;
3381
3382 mutex_lock(&rbd_dev->watch_mutex);
3383 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3384 ret = __rbd_register_watch(rbd_dev);
3385 if (ret)
3386 goto out;
3387
3388 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3389 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3390
3391out:
3392 mutex_unlock(&rbd_dev->watch_mutex);
3393 return ret;
3394}
3395
3396static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3397{
3398 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3399
3400 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
3401 cancel_work_sync(&rbd_dev->acquired_lock_work);
3402 cancel_work_sync(&rbd_dev->released_lock_work);
3403 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3404 cancel_work_sync(&rbd_dev->unlock_work);
3405}
3406
3407static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3408{
3409 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
3410 cancel_tasks_sync(rbd_dev);
3411
3412 mutex_lock(&rbd_dev->watch_mutex);
3413 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3414 __rbd_unregister_watch(rbd_dev);
3415 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3416 mutex_unlock(&rbd_dev->watch_mutex);
3417
3418 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
3419}
3420
3421/*
3422 * lock_rwsem must be held for write
3423 */
3424static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3425{
3426 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3427 char cookie[32];
3428 int ret;
3429
3430 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3431
3432 format_lock_cookie(rbd_dev, cookie);
3433 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3434 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3435 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3436 RBD_LOCK_TAG, cookie);
3437 if (ret) {
3438 if (ret != -EOPNOTSUPP)
3439 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3440 ret);
3441
3442 /*
3443 * Lock cookie cannot be updated on older OSDs, so do
3444 * a manual release and queue an acquire.
3445 */
3446 if (rbd_release_lock(rbd_dev))
3447 queue_delayed_work(rbd_dev->task_wq,
3448 &rbd_dev->lock_dwork, 0);
3449 } else {
3450 __rbd_lock(rbd_dev, cookie);
3451 }
3452}
3453
3454static void rbd_reregister_watch(struct work_struct *work)
3455{
3456 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3457 struct rbd_device, watch_dwork);
3458 int ret;
3459
3460 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3461
3462 mutex_lock(&rbd_dev->watch_mutex);
3463 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3464 mutex_unlock(&rbd_dev->watch_mutex);
3465 return;
3466 }
3467
3468 ret = __rbd_register_watch(rbd_dev);
3469 if (ret) {
3470 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
3471 if (ret == -EBLACKLISTED || ret == -ENOENT) {
3472 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3473 wake_requests(rbd_dev, true);
3474 } else {
3475 queue_delayed_work(rbd_dev->task_wq,
3476 &rbd_dev->watch_dwork,
3477 RBD_RETRY_DELAY);
3478 }
3479 mutex_unlock(&rbd_dev->watch_mutex);
3480 return;
3481 }
3482
3483 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3484 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3485 mutex_unlock(&rbd_dev->watch_mutex);
3486
3487 down_write(&rbd_dev->lock_rwsem);
3488 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3489 rbd_reacquire_lock(rbd_dev);
3490 up_write(&rbd_dev->lock_rwsem);
3491
3492 ret = rbd_dev_refresh(rbd_dev);
3493 if (ret)
3494 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
3495}
3496
/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
3501static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3502 struct ceph_object_id *oid,
3503 struct ceph_object_locator *oloc,
3504 const char *method_name,
3505 const void *outbound,
3506 size_t outbound_size,
3507 void *inbound,
3508 size_t inbound_size)
3509{
3510 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3511 struct page *req_page = NULL;
3512 struct page *reply_page;
3513 int ret;
3514
	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * may also supply outbound data -- parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
3522 if (outbound) {
3523 if (outbound_size > PAGE_SIZE)
3524 return -E2BIG;
3525
3526 req_page = alloc_page(GFP_KERNEL);
3527 if (!req_page)
3528 return -ENOMEM;
3529
3530 memcpy(page_address(req_page), outbound, outbound_size);
3531 }
3532
3533 reply_page = alloc_page(GFP_KERNEL);
3534 if (!reply_page) {
3535 if (req_page)
3536 __free_page(req_page);
3537 return -ENOMEM;
3538 }
3539
3540 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3541 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3542 reply_page, &inbound_size);
3543 if (!ret) {
3544 memcpy(inbound, page_address(reply_page), inbound_size);
3545 ret = inbound_size;
3546 }
3547
3548 if (req_page)
3549 __free_page(req_page);
3550 __free_page(reply_page);
3551 return ret;
3552}
3553
3554/*
3555 * lock_rwsem must be held for read
3556 */
3557static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
3558{
3559 DEFINE_WAIT(wait);
3560 unsigned long timeout;
3561 int ret = 0;
3562
3563 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
3564 return -EBLACKLISTED;
3565
3566 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3567 return 0;
3568
3569 if (!may_acquire) {
3570 rbd_warn(rbd_dev, "exclusive lock required");
3571 return -EROFS;
3572 }
3573
3574 do {
3575 /*
3576 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3577 * and cancel_delayed_work() in wake_requests().
3578 */
3579 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3580 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3581 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3582 TASK_UNINTERRUPTIBLE);
3583 up_read(&rbd_dev->lock_rwsem);
3584 timeout = schedule_timeout(ceph_timeout_jiffies(
3585 rbd_dev->opts->lock_timeout));
3586 down_read(&rbd_dev->lock_rwsem);
3587 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3588 ret = -EBLACKLISTED;
3589 break;
3590 }
3591 if (!timeout) {
3592 rbd_warn(rbd_dev, "timed out waiting for lock");
3593 ret = -ETIMEDOUT;
3594 break;
3595 }
3596 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3597
3598 finish_wait(&rbd_dev->lock_waitq, &wait);
3599 return ret;
3600}
3601
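/*
 * Main I/O entry point: turn a block layer request into an image
 * request and submit it, taking the exclusive lock first if the
 * mapping requires it.
 */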
3602static void rbd_queue_workfn(struct work_struct *work)
3603{
3604 struct request *rq = blk_mq_rq_from_pdu(work);
3605 struct rbd_device *rbd_dev = rq->q->queuedata;
3606 struct rbd_img_request *img_request;
3607 struct ceph_snap_context *snapc = NULL;
3608 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3609 u64 length = blk_rq_bytes(rq);
3610 enum obj_operation_type op_type;
3611 u64 mapping_size;
3612 bool must_be_locked;
3613 int result;
3614
3615 switch (req_op(rq)) {
3616 case REQ_OP_DISCARD:
3617 case REQ_OP_WRITE_ZEROES:
3618 op_type = OBJ_OP_DISCARD;
3619 break;
3620 case REQ_OP_WRITE:
3621 op_type = OBJ_OP_WRITE;
3622 break;
3623 case REQ_OP_READ:
3624 op_type = OBJ_OP_READ;
3625 break;
3626 default:
3627 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
3628 result = -EIO;
3629 goto err;
3630 }
3631
3632 /* Ignore/skip any zero-length requests */
3633
3634 if (!length) {
3635 dout("%s: zero-length request\n", __func__);
3636 result = 0;
3637 goto err_rq;
3638 }
3639
3640 rbd_assert(op_type == OBJ_OP_READ ||
3641 rbd_dev->spec->snap_id == CEPH_NOSNAP);
3642
3643 /*
3644 * Quit early if the mapped snapshot no longer exists. It's
3645 * still possible the snapshot will have disappeared by the
3646 * time our request arrives at the osd, but there's no sense in
3647 * sending it if we already know.
3648 */
3649 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3650 dout("request for non-existent snapshot");
		dout("request for non-existent snapshot\n");
3652 result = -ENXIO;
3653 goto err_rq;
3654 }
3655
3656 if (offset && length > U64_MAX - offset + 1) {
3657 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3658 length);
3659 result = -EINVAL;
3660 goto err_rq; /* Shouldn't happen */
3661 }
3662
3663 blk_mq_start_request(rq);
3664
3665 down_read(&rbd_dev->header_rwsem);
3666 mapping_size = rbd_dev->mapping.size;
3667 if (op_type != OBJ_OP_READ) {
3668 snapc = rbd_dev->header.snapc;
3669 ceph_get_snap_context(snapc);
3670 }
3671 up_read(&rbd_dev->header_rwsem);
3672
3673 if (offset + length > mapping_size) {
3674 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
3675 length, mapping_size);
3676 result = -EIO;
3677 goto err_rq;
3678 }
3679
3680 must_be_locked =
3681 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3682 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
3683 if (must_be_locked) {
3684 down_read(&rbd_dev->lock_rwsem);
3685 result = rbd_wait_state_locked(rbd_dev,
3686 !rbd_dev->opts->exclusive);
3687 if (result)
3688 goto err_unlock;
3689 }
3690
3691 img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
3692 if (!img_request) {
3693 result = -ENOMEM;
3694 goto err_unlock;
3695 }
3696 img_request->rq = rq;
3697 snapc = NULL; /* img_request consumes a ref */
3698
3699 if (op_type == OBJ_OP_DISCARD)
3700 result = rbd_img_fill_nodata(img_request, offset, length);
3701 else
3702 result = rbd_img_fill_from_bio(img_request, offset, length,
3703 rq->bio);
3704 if (result)
3705 goto err_img_request;
3706
3707 rbd_img_request_submit(img_request);
3708 if (must_be_locked)
3709 up_read(&rbd_dev->lock_rwsem);
3710 return;
3711
3712err_img_request:
3713 rbd_img_request_put(img_request);
3714err_unlock:
3715 if (must_be_locked)
3716 up_read(&rbd_dev->lock_rwsem);
3717err_rq:
3718 if (result)
3719 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
3720 obj_op_name(op_type), length, offset, result);
3721 ceph_put_snap_context(snapc);
3722err:
3723 blk_mq_end_request(rq, errno_to_blk_status(result));
3724}
3725
3726static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
3727 const struct blk_mq_queue_data *bd)
3728{
3729 struct request *rq = bd->rq;
3730 struct work_struct *work = blk_mq_rq_to_pdu(rq);
3731
3732 queue_work(rbd_wq, work);
3733 return BLK_STS_OK;
3734}
3735
3736static void rbd_free_disk(struct rbd_device *rbd_dev)
3737{
3738 blk_cleanup_queue(rbd_dev->disk->queue);
3739 blk_mq_free_tag_set(&rbd_dev->tag_set);
3740 put_disk(rbd_dev->disk);
3741 rbd_dev->disk = NULL;
3742}
3743
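/*
 * Synchronously read up to @buf_len bytes from the start of the given
 * object into @buf.  Returns the number of bytes read or a negative
 * error code.
 */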
3744static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3745 struct ceph_object_id *oid,
3746 struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
3750 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3751 struct ceph_osd_request *req;
3752 struct page **pages;
3753 int num_pages = calc_pages_for(0, buf_len);
3754 int ret;
3755
3756 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3757 if (!req)
3758 return -ENOMEM;
3759
3760 ceph_oid_copy(&req->r_base_oid, oid);
3761 ceph_oloc_copy(&req->r_base_oloc, oloc);
3762 req->r_flags = CEPH_OSD_FLAG_READ;
3763
3764 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
3765 if (ret)
3766 goto out_req;
3767
3768 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3769 if (IS_ERR(pages)) {
3770 ret = PTR_ERR(pages);
3771 goto out_req;
3772 }
3773
3774 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3775 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3776 true);
3777
3778 ceph_osdc_start_request(osdc, req, false);
3779 ret = ceph_osdc_wait_request(osdc, req);
3780 if (ret >= 0)
3781 ceph_copy_from_page_vector(pages, buf, 0, ret);
3782
3783out_req:
3784 ceph_osdc_put_request(req);
3785 return ret;
3786}
3787
3788/*
3789 * Read the complete header for the given rbd device. On successful
3790 * return, the rbd_dev->header field will contain up-to-date
3791 * information about the image.
3792 */
3793static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
3794{
3795 struct rbd_image_header_ondisk *ondisk = NULL;
3796 u32 snap_count = 0;
3797 u64 names_size = 0;
3798 u32 want_count;
3799 int ret;
3800
3801 /*
3802 * The complete header will include an array of its 64-bit
3803 * snapshot ids, followed by the names of those snapshots as
3804 * a contiguous block of NUL-terminated strings. Note that
3805 * the number of snapshots could change by the time we read
3806 * it in, in which case we re-read it.
3807 */
3808 do {
3809 size_t size;
3810
3811 kfree(ondisk);
3812
3813 size = sizeof (*ondisk);
3814 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3815 size += names_size;
3816 ondisk = kmalloc(size, GFP_KERNEL);
3817 if (!ondisk)
3818 return -ENOMEM;
3819
3820 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
3821 &rbd_dev->header_oloc, ondisk, size);
3822 if (ret < 0)
3823 goto out;
3824 if ((size_t)ret < size) {
3825 ret = -ENXIO;
3826 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
			rbd_warn(rbd_dev, "short header read (want %zu got %d)",
3828 goto out;
3829 }
3830 if (!rbd_dev_ondisk_valid(ondisk)) {
3831 ret = -ENXIO;
3832 rbd_warn(rbd_dev, "invalid header");
3833 goto out;
3834 }
3835
3836 names_size = le64_to_cpu(ondisk->snap_names_len);
3837 want_count = snap_count;
3838 snap_count = le32_to_cpu(ondisk->snap_count);
3839 } while (snap_count != want_count);
3840
3841 ret = rbd_header_from_disk(rbd_dev, ondisk);
3842out:
3843 kfree(ondisk);
3844
3845 return ret;
3846}
3847
3848/*
3849 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3850 * has disappeared from the (just updated) snapshot context.
3851 */
3852static void rbd_exists_validate(struct rbd_device *rbd_dev)
3853{
3854 u64 snap_id;
3855
3856 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3857 return;
3858
3859 snap_id = rbd_dev->spec->snap_id;
3860 if (snap_id == CEPH_NOSNAP)
3861 return;
3862
3863 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3864 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3865}
3866
3867static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3868{
3869 sector_t size;
3870
3871 /*
3872 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3873 * try to update its size. If REMOVING is set, updating size
3874 * is just useless work since the device can't be opened.
3875 */
3876 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3877 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
3878 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3879 dout("setting size to %llu sectors", (unsigned long long)size);
3880 set_capacity(rbd_dev->disk, size);
3881 revalidate_disk(rbd_dev->disk);
3882 }
3883}
3884
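/*
 * Re-read the image header and propagate any resulting size change to
 * the block device.
 */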
3885static int rbd_dev_refresh(struct rbd_device *rbd_dev)
3886{
3887 u64 mapping_size;
3888 int ret;
3889
3890 down_write(&rbd_dev->header_rwsem);
3891 mapping_size = rbd_dev->mapping.size;
3892
3893 ret = rbd_dev_header_info(rbd_dev);
3894 if (ret)
3895 goto out;
3896
3897 /*
3898 * If there is a parent, see if it has disappeared due to the
3899 * mapped image getting flattened.
3900 */
3901 if (rbd_dev->parent) {
3902 ret = rbd_dev_v2_parent_info(rbd_dev);
3903 if (ret)
3904 goto out;
3905 }
3906
3907 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
3908 rbd_dev->mapping.size = rbd_dev->header.image_size;
3909 } else {
3910 /* validate mapped snapshot's EXISTS flag */
3911 rbd_exists_validate(rbd_dev);
3912 }
3913
3914out:
3915 up_write(&rbd_dev->header_rwsem);
3916 if (!ret && mapping_size != rbd_dev->mapping.size)
3917 rbd_dev_update_size(rbd_dev);
3918
3919 return ret;
3920}
3921
3922static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
3923 unsigned int hctx_idx, unsigned int numa_node)
3924{
3925 struct work_struct *work = blk_mq_rq_to_pdu(rq);
3926
3927 INIT_WORK(work, rbd_queue_workfn);
3928 return 0;
3929}
3930
3931static const struct blk_mq_ops rbd_mq_ops = {
3932 .queue_rq = rbd_queue_rq,
3933 .init_request = rbd_init_request,
3934};
3935
3936static int rbd_init_disk(struct rbd_device *rbd_dev)
3937{
3938 struct gendisk *disk;
3939 struct request_queue *q;
3940 unsigned int objset_bytes =
3941 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
3942 int err;
3943
3944 /* create gendisk info */
3945 disk = alloc_disk(single_major ?
3946 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3947 RBD_MINORS_PER_MAJOR);
3948 if (!disk)
3949 return -ENOMEM;
3950
3951 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3952 rbd_dev->dev_id);
3953 disk->major = rbd_dev->major;
3954 disk->first_minor = rbd_dev->minor;
3955 if (single_major)
3956 disk->flags |= GENHD_FL_EXT_DEVT;
3957 disk->fops = &rbd_bd_ops;
3958 disk->private_data = rbd_dev;
3959
3960 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
3961 rbd_dev->tag_set.ops = &rbd_mq_ops;
3962 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
3963 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
3964 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
3965 rbd_dev->tag_set.nr_hw_queues = 1;
3966 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
3967
3968 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
3969 if (err)
3970 goto out_disk;
3971
3972 q = blk_mq_init_queue(&rbd_dev->tag_set);
3973 if (IS_ERR(q)) {
3974 err = PTR_ERR(q);
3975 goto out_tag_set;
3976 }
3977
3978 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
3979 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
3980
3981 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
3982 q->limits.max_sectors = queue_max_hw_sectors(q);
3983 blk_queue_max_segments(q, USHRT_MAX);
3984 blk_queue_max_segment_size(q, UINT_MAX);
3985 blk_queue_io_min(q, objset_bytes);
3986 blk_queue_io_opt(q, objset_bytes);
3987
3988 if (rbd_dev->opts->trim) {
3989 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
3990 q->limits.discard_granularity = objset_bytes;
3991 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
3992 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
3993 }
3994
3995 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
3996 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
3997
3998 /*
3999 * disk_release() expects a queue ref from add_disk() and will
4000 * put it. Hold an extra ref until add_disk() is called.
4001 */
4002 WARN_ON(!blk_get_queue(q));
4003 disk->queue = q;
4004 q->queuedata = rbd_dev;
4005
4006 rbd_dev->disk = disk;
4007
4008 return 0;
4009out_tag_set:
4010 blk_mq_free_tag_set(&rbd_dev->tag_set);
4011out_disk:
4012 put_disk(disk);
4013 return err;
4014}
4015
4016/*
4017 sysfs
4018*/
4019
4020static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4021{
4022 return container_of(dev, struct rbd_device, dev);
4023}
4024
4025static ssize_t rbd_size_show(struct device *dev,
4026 struct device_attribute *attr, char *buf)
4027{
4028 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4029
4030 return sprintf(buf, "%llu\n",
4031 (unsigned long long)rbd_dev->mapping.size);
4032}
4033
4034/*
4035 * Note this shows the features for whatever's mapped, which is not
4036 * necessarily the base image.
4037 */
4038static ssize_t rbd_features_show(struct device *dev,
4039 struct device_attribute *attr, char *buf)
4040{
4041 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4042
4043 return sprintf(buf, "0x%016llx\n",
4044 (unsigned long long)rbd_dev->mapping.features);
4045}
4046
4047static ssize_t rbd_major_show(struct device *dev,
4048 struct device_attribute *attr, char *buf)
4049{
4050 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4051
4052 if (rbd_dev->major)
4053 return sprintf(buf, "%d\n", rbd_dev->major);
4054
4055 return sprintf(buf, "(none)\n");
4056}
4057
4058static ssize_t rbd_minor_show(struct device *dev,
4059 struct device_attribute *attr, char *buf)
4060{
4061 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4062
4063 return sprintf(buf, "%d\n", rbd_dev->minor);
4064}
4065
4066static ssize_t rbd_client_addr_show(struct device *dev,
4067 struct device_attribute *attr, char *buf)
4068{
4069 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4070 struct ceph_entity_addr *client_addr =
4071 ceph_client_addr(rbd_dev->rbd_client->client);
4072
4073 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4074 le32_to_cpu(client_addr->nonce));
4075}
4076
4077static ssize_t rbd_client_id_show(struct device *dev,
4078 struct device_attribute *attr, char *buf)
4079{
4080 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4081
4082 return sprintf(buf, "client%lld\n",
4083 ceph_client_gid(rbd_dev->rbd_client->client));
4084}
4085
4086static ssize_t rbd_cluster_fsid_show(struct device *dev,
4087 struct device_attribute *attr, char *buf)
4088{
4089 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4090
4091 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4092}
4093
4094static ssize_t rbd_config_info_show(struct device *dev,
4095 struct device_attribute *attr, char *buf)
4096{
4097 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4098
4099 return sprintf(buf, "%s\n", rbd_dev->config_info);
4100}
4101
4102static ssize_t rbd_pool_show(struct device *dev,
4103 struct device_attribute *attr, char *buf)
4104{
4105 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4106
4107 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
4108}
4109
4110static ssize_t rbd_pool_id_show(struct device *dev,
4111 struct device_attribute *attr, char *buf)
4112{
4113 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4114
4115 return sprintf(buf, "%llu\n",
4116 (unsigned long long) rbd_dev->spec->pool_id);
4117}
4118
4119static ssize_t rbd_name_show(struct device *dev,
4120 struct device_attribute *attr, char *buf)
4121{
4122 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4123
4124 if (rbd_dev->spec->image_name)
4125 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4126
4127 return sprintf(buf, "(unknown)\n");
4128}
4129
4130static ssize_t rbd_image_id_show(struct device *dev,
4131 struct device_attribute *attr, char *buf)
4132{
4133 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4134
4135 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
4136}
4137
4138/*
4139 * Shows the name of the currently-mapped snapshot (or
4140 * RBD_SNAP_HEAD_NAME for the base image).
4141 */
4142static ssize_t rbd_snap_show(struct device *dev,
4143 struct device_attribute *attr,
4144 char *buf)
4145{
4146 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4147
4148 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
4149}
4150
4151static ssize_t rbd_snap_id_show(struct device *dev,
4152 struct device_attribute *attr, char *buf)
4153{
4154 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4155
4156 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4157}
4158
4159/*
4160 * For a v2 image, shows the chain of parent images, separated by empty
4161 * lines. For v1 images or if there is no parent, shows "(no parent
4162 * image)".
4163 */
4164static ssize_t rbd_parent_show(struct device *dev,
4165 struct device_attribute *attr,
4166 char *buf)
4167{
4168 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4169 ssize_t count = 0;
4170
4171 if (!rbd_dev->parent)
4172 return sprintf(buf, "(no parent image)\n");
4173
4174 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4175 struct rbd_spec *spec = rbd_dev->parent_spec;
4176
4177 count += sprintf(&buf[count], "%s"
4178 "pool_id %llu\npool_name %s\n"
4179 "image_id %s\nimage_name %s\n"
4180 "snap_id %llu\nsnap_name %s\n"
4181 "overlap %llu\n",
4182 !count ? "" : "\n", /* first? */
4183 spec->pool_id, spec->pool_name,
4184 spec->image_id, spec->image_name ?: "(unknown)",
4185 spec->snap_id, spec->snap_name,
4186 rbd_dev->parent_overlap);
4187 }
4188
4189 return count;
4190}
4191
4192static ssize_t rbd_image_refresh(struct device *dev,
4193 struct device_attribute *attr,
4194 const char *buf,
4195 size_t size)
4196{
4197 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4198 int ret;
4199
4200 ret = rbd_dev_refresh(rbd_dev);
4201 if (ret)
4202 return ret;
4203
4204 return size;
4205}
4206
4207static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
4208static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
4209static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
4210static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
4211static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
4212static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
4213static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
4214static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
4215static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
4216static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
4217static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
4218static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
4219static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4220static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
4221static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
4222static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
4223
4224static struct attribute *rbd_attrs[] = {
4225 &dev_attr_size.attr,
4226 &dev_attr_features.attr,
4227 &dev_attr_major.attr,
4228 &dev_attr_minor.attr,
4229 &dev_attr_client_addr.attr,
4230 &dev_attr_client_id.attr,
4231 &dev_attr_cluster_fsid.attr,
4232 &dev_attr_config_info.attr,
4233 &dev_attr_pool.attr,
4234 &dev_attr_pool_id.attr,
4235 &dev_attr_name.attr,
4236 &dev_attr_image_id.attr,
4237 &dev_attr_current_snap.attr,
4238 &dev_attr_snap_id.attr,
4239 &dev_attr_parent.attr,
4240 &dev_attr_refresh.attr,
4241 NULL
4242};
4243
4244static struct attribute_group rbd_attr_group = {
4245 .attrs = rbd_attrs,
4246};
4247
4248static const struct attribute_group *rbd_attr_groups[] = {
4249 &rbd_attr_group,
4250 NULL
4251};
4252
4253static void rbd_dev_release(struct device *dev);
4254
4255static const struct device_type rbd_device_type = {
4256 .name = "rbd",
4257 .groups = rbd_attr_groups,
4258 .release = rbd_dev_release,
4259};
4260
4261static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4262{
4263 kref_get(&spec->kref);
4264
4265 return spec;
4266}
4267
4268static void rbd_spec_free(struct kref *kref);
4269static void rbd_spec_put(struct rbd_spec *spec)
4270{
4271 if (spec)
4272 kref_put(&spec->kref, rbd_spec_free);
4273}
4274
4275static struct rbd_spec *rbd_spec_alloc(void)
4276{
4277 struct rbd_spec *spec;
4278
4279 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4280 if (!spec)
4281 return NULL;
4282
4283 spec->pool_id = CEPH_NOPOOL;
4284 spec->snap_id = CEPH_NOSNAP;
4285 kref_init(&spec->kref);
4286
4287 return spec;
4288}
4289
4290static void rbd_spec_free(struct kref *kref)
4291{
4292 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4293
4294 kfree(spec->pool_name);
4295 kfree(spec->image_id);
4296 kfree(spec->image_name);
4297 kfree(spec->snap_name);
4298 kfree(spec);
4299}
4300
4301static void rbd_dev_free(struct rbd_device *rbd_dev)
4302{
4303 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
4304 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
4305
4306 ceph_oid_destroy(&rbd_dev->header_oid);
4307 ceph_oloc_destroy(&rbd_dev->header_oloc);
4308 kfree(rbd_dev->config_info);
4309
4310 rbd_put_client(rbd_dev->rbd_client);
4311 rbd_spec_put(rbd_dev->spec);
4312 kfree(rbd_dev->opts);
4313 kfree(rbd_dev);
4314}
4315
4316static void rbd_dev_release(struct device *dev)
4317{
4318 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4319 bool need_put = !!rbd_dev->opts;
4320
4321 if (need_put) {
4322 destroy_workqueue(rbd_dev->task_wq);
4323 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4324 }
4325
4326 rbd_dev_free(rbd_dev);
4327
4328 /*
4329 * This is racy, but way better than putting module outside of
4330 * the release callback. The race window is pretty small, so
4331 * doing something similar to dm (dm-builtin.c) is overkill.
4332 */
4333 if (need_put)
4334 module_put(THIS_MODULE);
4335}
4336
4337static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4338 struct rbd_spec *spec)
4339{
4340 struct rbd_device *rbd_dev;
4341
4342 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
4343 if (!rbd_dev)
4344 return NULL;
4345
4346 spin_lock_init(&rbd_dev->lock);
4347 INIT_LIST_HEAD(&rbd_dev->node);
4348 init_rwsem(&rbd_dev->header_rwsem);
4349
4350 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
4351 ceph_oid_init(&rbd_dev->header_oid);
4352 rbd_dev->header_oloc.pool = spec->pool_id;
4353
4354 mutex_init(&rbd_dev->watch_mutex);
4355 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4356 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4357
4358 init_rwsem(&rbd_dev->lock_rwsem);
4359 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4360 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4361 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4362 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4363 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4364 init_waitqueue_head(&rbd_dev->lock_waitq);
4365
4366 rbd_dev->dev.bus = &rbd_bus_type;
4367 rbd_dev->dev.type = &rbd_device_type;
4368 rbd_dev->dev.parent = &rbd_root_dev;
4369 device_initialize(&rbd_dev->dev);
4370
4371 rbd_dev->rbd_client = rbdc;
4372 rbd_dev->spec = spec;
4373
4374 return rbd_dev;
4375}
4376
4377/*
4378 * Create a mapping rbd_dev.
4379 */
4380static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4381 struct rbd_spec *spec,
4382 struct rbd_options *opts)
4383{
4384 struct rbd_device *rbd_dev;
4385
4386 rbd_dev = __rbd_dev_create(rbdc, spec);
4387 if (!rbd_dev)
4388 return NULL;
4389
4390 rbd_dev->opts = opts;
4391
4392 /* get an id and fill in device name */
4393 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4394 minor_to_rbd_dev_id(1 << MINORBITS),
4395 GFP_KERNEL);
4396 if (rbd_dev->dev_id < 0)
4397 goto fail_rbd_dev;
4398
4399 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4400 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4401 rbd_dev->name);
4402 if (!rbd_dev->task_wq)
4403 goto fail_dev_id;
4404
4405 /* we have a ref from do_rbd_add() */
4406 __module_get(THIS_MODULE);
4407
4408 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4409 return rbd_dev;
4410
4411fail_dev_id:
4412 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4413fail_rbd_dev:
4414 rbd_dev_free(rbd_dev);
4415 return NULL;
4416}
4417
4418static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4419{
4420 if (rbd_dev)
4421 put_device(&rbd_dev->dev);
4422}
4423
4424/*
4425 * Get the size and object order for an image snapshot, or if
4426 * snap_id is CEPH_NOSNAP, gets this information for the base
4427 * image.
4428 */
4429static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4430 u8 *order, u64 *snap_size)
4431{
4432 __le64 snapid = cpu_to_le64(snap_id);
4433 int ret;
4434 struct {
4435 u8 order;
4436 __le64 size;
4437 } __attribute__ ((packed)) size_buf = { 0 };
4438
4439 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4440 &rbd_dev->header_oloc, "get_size",
4441 &snapid, sizeof(snapid),
4442 &size_buf, sizeof(size_buf));
4443 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4444 if (ret < 0)
4445 return ret;
4446 if (ret < sizeof (size_buf))
4447 return -ERANGE;
4448
4449 if (order) {
4450 *order = size_buf.order;
4451 dout(" order %u", (unsigned int)*order);
4452 }
4453 *snap_size = le64_to_cpu(size_buf.size);
4454
4455 dout(" snap_id 0x%016llx snap_size = %llu\n",
4456 (unsigned long long)snap_id,
4457 (unsigned long long)*snap_size);
4458
4459 return 0;
4460}
4461
4462static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4463{
4464 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4465 &rbd_dev->header.obj_order,
4466 &rbd_dev->header.image_size);
4467}
4468
4469static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4470{
4471 void *reply_buf;
4472 int ret;
4473 void *p;
4474
4475 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4476 if (!reply_buf)
4477 return -ENOMEM;
4478
4479 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4480 &rbd_dev->header_oloc, "get_object_prefix",
4481 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
4482 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4483 if (ret < 0)
4484 goto out;
4485
4486 p = reply_buf;
4487 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
4488 p + ret, NULL, GFP_NOIO);
4489 ret = 0;
4490
4491 if (IS_ERR(rbd_dev->header.object_prefix)) {
4492 ret = PTR_ERR(rbd_dev->header.object_prefix);
4493 rbd_dev->header.object_prefix = NULL;
4494 } else {
4495 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4496 }
4497out:
4498 kfree(reply_buf);
4499
4500 return ret;
4501}
4502
4503static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4504 u64 *snap_features)
4505{
4506 __le64 snapid = cpu_to_le64(snap_id);
4507 struct {
4508 __le64 features;
4509 __le64 incompat;
4510 } __attribute__ ((packed)) features_buf = { 0 };
4511 u64 unsup;
4512 int ret;
4513
4514 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4515 &rbd_dev->header_oloc, "get_features",
4516 &snapid, sizeof(snapid),
4517 &features_buf, sizeof(features_buf));
4518 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4519 if (ret < 0)
4520 return ret;
4521 if (ret < sizeof (features_buf))
4522 return -ERANGE;
4523
4524 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4525 if (unsup) {
4526 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4527 unsup);
4528 return -ENXIO;
4529 }
4530
4531 *snap_features = le64_to_cpu(features_buf.features);
4532
4533 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
4534 (unsigned long long)snap_id,
4535 (unsigned long long)*snap_features,
4536 (unsigned long long)le64_to_cpu(features_buf.incompat));
4537
4538 return 0;
4539}
4540
4541static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4542{
4543 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4544 &rbd_dev->header.features);
4545}
4546
4547static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4548{
4549 struct rbd_spec *parent_spec;
4550 size_t size;
4551 void *reply_buf = NULL;
4552 __le64 snapid;
4553 void *p;
4554 void *end;
4555 u64 pool_id;
4556 char *image_id;
4557 u64 snap_id;
4558 u64 overlap;
4559 int ret;
4560
4561 parent_spec = rbd_spec_alloc();
4562 if (!parent_spec)
4563 return -ENOMEM;
4564
4565 size = sizeof (__le64) + /* pool_id */
4566 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
4567 sizeof (__le64) + /* snap_id */
4568 sizeof (__le64); /* overlap */
4569 reply_buf = kmalloc(size, GFP_KERNEL);
4570 if (!reply_buf) {
4571 ret = -ENOMEM;
4572 goto out_err;
4573 }
4574
4575 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
4576 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4577 &rbd_dev->header_oloc, "get_parent",
4578 &snapid, sizeof(snapid), reply_buf, size);
4579 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4580 if (ret < 0)
4581 goto out_err;
4582
4583 p = reply_buf;
4584 end = reply_buf + ret;
4585 ret = -ERANGE;
4586 ceph_decode_64_safe(&p, end, pool_id, out_err);
4587 if (pool_id == CEPH_NOPOOL) {
4588 /*
4589 * Either the parent never existed, or we have
4590 * record of it but the image got flattened so it no
4591 * longer has a parent. When the parent of a
4592 * layered image disappears we immediately set the
4593 * overlap to 0. The effect of this is that all new
4594 * requests will be treated as if the image had no
4595 * parent.
4596 */
4597 if (rbd_dev->parent_overlap) {
4598 rbd_dev->parent_overlap = 0;
4599 rbd_dev_parent_put(rbd_dev);
4600 pr_info("%s: clone image has been flattened\n",
4601 rbd_dev->disk->disk_name);
4602 }
4603
4604 goto out; /* No parent? No problem. */
4605 }
4606
4607 /* The ceph file layout needs to fit pool id in 32 bits */
4608
4609 ret = -EIO;
4610 if (pool_id > (u64)U32_MAX) {
4611 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
4612 (unsigned long long)pool_id, U32_MAX);
4613 goto out_err;
4614 }
4615
4616 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4617 if (IS_ERR(image_id)) {
4618 ret = PTR_ERR(image_id);
4619 goto out_err;
4620 }
4621 ceph_decode_64_safe(&p, end, snap_id, out_err);
4622 ceph_decode_64_safe(&p, end, overlap, out_err);
4623
4624 /*
4625 * The parent won't change (except when the clone is
4626 * flattened, already handled that). So we only need to
4627 * record the parent spec we have not already done so.
4628 */
4629 if (!rbd_dev->parent_spec) {
4630 parent_spec->pool_id = pool_id;
4631 parent_spec->image_id = image_id;
4632 parent_spec->snap_id = snap_id;
4633 rbd_dev->parent_spec = parent_spec;
4634 parent_spec = NULL; /* rbd_dev now owns this */
4635 } else {
4636 kfree(image_id);
4637 }
4638
4639 /*
4640 * We always update the parent overlap. If it's zero we issue
4641 * a warning, as we will proceed as if there was no parent.
4642 */
4643 if (!overlap) {
4644 if (parent_spec) {
4645 /* refresh, careful to warn just once */
4646 if (rbd_dev->parent_overlap)
4647 rbd_warn(rbd_dev,
4648 "clone now standalone (overlap became 0)");
4649 } else {
4650 /* initial probe */
4651 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
4652 }
4653 }
4654 rbd_dev->parent_overlap = overlap;
4655
4656out:
4657 ret = 0;
4658out_err:
4659 kfree(reply_buf);
4660 rbd_spec_put(parent_spec);
4661
4662 return ret;
4663}
4664
4665static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4666{
4667 struct {
4668 __le64 stripe_unit;
4669 __le64 stripe_count;
4670 } __attribute__ ((packed)) striping_info_buf = { 0 };
4671 size_t size = sizeof (striping_info_buf);
4672 void *p;
4673 int ret;
4674
4675 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4676 &rbd_dev->header_oloc, "get_stripe_unit_count",
4677 NULL, 0, &striping_info_buf, size);
4678 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4679 if (ret < 0)
4680 return ret;
4681 if (ret < size)
4682 return -ERANGE;
4683
4684 p = &striping_info_buf;
4685 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
4686 rbd_dev->header.stripe_count = ceph_decode_64(&p);
4687 return 0;
4688}
4689
4690static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
4691{
4692 __le64 data_pool_id;
4693 int ret;
4694
4695 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4696 &rbd_dev->header_oloc, "get_data_pool",
4697 NULL, 0, &data_pool_id, sizeof(data_pool_id));
4698 if (ret < 0)
4699 return ret;
4700 if (ret < sizeof(data_pool_id))
4701 return -EBADMSG;
4702
4703 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
4704 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
4705 return 0;
4706}
4707
4708static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4709{
4710 CEPH_DEFINE_OID_ONSTACK(oid);
4711 size_t image_id_size;
4712 char *image_id;
4713 void *p;
4714 void *end;
4715 size_t size;
4716 void *reply_buf = NULL;
4717 size_t len = 0;
4718 char *image_name = NULL;
4719 int ret;
4720
4721 rbd_assert(!rbd_dev->spec->image_name);
4722
4723 len = strlen(rbd_dev->spec->image_id);
4724 image_id_size = sizeof (__le32) + len;
4725 image_id = kmalloc(image_id_size, GFP_KERNEL);
4726 if (!image_id)
4727 return NULL;
4728
4729 p = image_id;
4730 end = image_id + image_id_size;
4731 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
4732
4733 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4734 reply_buf = kmalloc(size, GFP_KERNEL);
4735 if (!reply_buf)
4736 goto out;
4737
4738 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
4739 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
4740 "dir_get_name", image_id, image_id_size,
4741 reply_buf, size);
4742 if (ret < 0)
4743 goto out;
4744 p = reply_buf;
4745 end = reply_buf + ret;
4746
4747 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4748 if (IS_ERR(image_name))
4749 image_name = NULL;
4750 else
4751 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4752out:
4753 kfree(reply_buf);
4754 kfree(image_id);
4755
4756 return image_name;
4757}
4758
4759static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4760{
4761 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4762 const char *snap_name;
4763 u32 which = 0;
4764
4765 /* Skip over names until we find the one we are looking for */
4766
4767 snap_name = rbd_dev->header.snap_names;
4768 while (which < snapc->num_snaps) {
4769 if (!strcmp(name, snap_name))
4770 return snapc->snaps[which];
4771 snap_name += strlen(snap_name) + 1;
4772 which++;
4773 }
4774 return CEPH_NOSNAP;
4775}
4776
4777static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4778{
4779 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4780 u32 which;
4781 bool found = false;
4782 u64 snap_id;
4783
4784 for (which = 0; !found && which < snapc->num_snaps; which++) {
4785 const char *snap_name;
4786
4787 snap_id = snapc->snaps[which];
4788 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4789 if (IS_ERR(snap_name)) {
4790 /* ignore no-longer existing snapshots */
4791 if (PTR_ERR(snap_name) == -ENOENT)
4792 continue;
4793 else
4794 break;
4795 }
4796 found = !strcmp(name, snap_name);
4797 kfree(snap_name);
4798 }
4799 return found ? snap_id : CEPH_NOSNAP;
4800}
4801
4802/*
4803 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4804 * no snapshot by that name is found, or if an error occurs.
4805 */
4806static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4807{
4808 if (rbd_dev->image_format == 1)
4809 return rbd_v1_snap_id_by_name(rbd_dev, name);
4810
4811 return rbd_v2_snap_id_by_name(rbd_dev, name);
4812}
4813
4814/*
4815 * An image being mapped will have everything but the snap id.
4816 */
4817static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
4818{
4819 struct rbd_spec *spec = rbd_dev->spec;
4820
4821 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
4822 rbd_assert(spec->image_id && spec->image_name);
4823 rbd_assert(spec->snap_name);
4824
4825 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4826 u64 snap_id;
4827
4828 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4829 if (snap_id == CEPH_NOSNAP)
4830 return -ENOENT;
4831
4832 spec->snap_id = snap_id;
4833 } else {
4834 spec->snap_id = CEPH_NOSNAP;
4835 }
4836
4837 return 0;
4838}
4839
4840/*
4841 * A parent image will have all ids but none of the names.
4842 *
4843 * All names in an rbd spec are dynamically allocated. It's OK if we
4844 * can't figure out the name for an image id.
4845 */
4846static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
4847{
4848 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4849 struct rbd_spec *spec = rbd_dev->spec;
4850 const char *pool_name;
4851 const char *image_name;
4852 const char *snap_name;
4853 int ret;
4854
4855 rbd_assert(spec->pool_id != CEPH_NOPOOL);
4856 rbd_assert(spec->image_id);
4857 rbd_assert(spec->snap_id != CEPH_NOSNAP);
4858
4859 /* Get the pool name; we have to make our own copy of this */
4860
4861 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4862 if (!pool_name) {
4863 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4864 return -EIO;
4865 }
4866 pool_name = kstrdup(pool_name, GFP_KERNEL);
4867 if (!pool_name)
4868 return -ENOMEM;
4869
4870 /* Fetch the image name; tolerate failure here */
4871
4872 image_name = rbd_dev_image_name(rbd_dev);
4873 if (!image_name)
4874 rbd_warn(rbd_dev, "unable to get image name");
4875
4876 /* Fetch the snapshot name */
4877
4878 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4879 if (IS_ERR(snap_name)) {
4880 ret = PTR_ERR(snap_name);
4881 goto out_err;
4882 }
4883
4884 spec->pool_name = pool_name;
4885 spec->image_name = image_name;
4886 spec->snap_name = snap_name;
4887
4888 return 0;
4889
4890out_err:
4891 kfree(image_name);
4892 kfree(pool_name);
4893 return ret;
4894}
4895
4896static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
4897{
4898 size_t size;
4899 int ret;
4900 void *reply_buf;
4901 void *p;
4902 void *end;
4903 u64 seq;
4904 u32 snap_count;
4905 struct ceph_snap_context *snapc;
4906 u32 i;
4907
4908 /*
4909 * We'll need room for the seq value (maximum snapshot id),
4910 * snapshot count, and array of that many snapshot ids.
4911 * For now we have a fixed upper limit on the number we're
4912 * prepared to receive.
4913 */
4914 size = sizeof (__le64) + sizeof (__le32) +
4915 RBD_MAX_SNAP_COUNT * sizeof (__le64);
4916 reply_buf = kzalloc(size, GFP_KERNEL);
4917 if (!reply_buf)
4918 return -ENOMEM;
4919
4920 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4921 &rbd_dev->header_oloc, "get_snapcontext",
4922 NULL, 0, reply_buf, size);
4923 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4924 if (ret < 0)
4925 goto out;
4926
4927 p = reply_buf;
4928 end = reply_buf + ret;
4929 ret = -ERANGE;
4930 ceph_decode_64_safe(&p, end, seq, out);
4931 ceph_decode_32_safe(&p, end, snap_count, out);
4932
4933 /*
4934 * Make sure the reported number of snapshot ids wouldn't go
4935 * beyond the end of our buffer. But before checking that,
4936 * make sure the computed size of the snapshot context we
4937 * allocate is representable in a size_t.
4938 */
4939 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4940 / sizeof (u64)) {
4941 ret = -EINVAL;
4942 goto out;
4943 }
4944 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4945 goto out;
4946 ret = 0;
4947
4948 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
4949 if (!snapc) {
4950 ret = -ENOMEM;
4951 goto out;
4952 }
4953 snapc->seq = seq;
4954 for (i = 0; i < snap_count; i++)
4955 snapc->snaps[i] = ceph_decode_64(&p);
4956
4957 ceph_put_snap_context(rbd_dev->header.snapc);
4958 rbd_dev->header.snapc = snapc;
4959
4960 dout(" snap context seq = %llu, snap_count = %u\n",
4961 (unsigned long long)seq, (unsigned int)snap_count);
4962out:
4963 kfree(reply_buf);
4964
4965 return ret;
4966}
4967
4968static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4969 u64 snap_id)
4970{
4971 size_t size;
4972 void *reply_buf;
4973 __le64 snapid;
4974 int ret;
4975 void *p;
4976 void *end;
4977 char *snap_name;
4978
4979 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4980 reply_buf = kmalloc(size, GFP_KERNEL);
4981 if (!reply_buf)
4982 return ERR_PTR(-ENOMEM);
4983
4984 snapid = cpu_to_le64(snap_id);
4985 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4986 &rbd_dev->header_oloc, "get_snapshot_name",
4987 &snapid, sizeof(snapid), reply_buf, size);
4988 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4989 if (ret < 0) {
4990 snap_name = ERR_PTR(ret);
4991 goto out;
4992 }
4993
4994 p = reply_buf;
4995 end = reply_buf + ret;
4996 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4997 if (IS_ERR(snap_name))
4998 goto out;
4999
5000 dout(" snap_id 0x%016llx snap_name = %s\n",
5001 (unsigned long long)snap_id, snap_name);
5002out:
5003 kfree(reply_buf);
5004
5005 return snap_name;
5006}
5007
5008static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
5009{
5010 bool first_time = rbd_dev->header.object_prefix == NULL;
5011 int ret;
5012
5013 ret = rbd_dev_v2_image_size(rbd_dev);
5014 if (ret)
5015 return ret;
5016
5017 if (first_time) {
5018 ret = rbd_dev_v2_header_onetime(rbd_dev);
5019 if (ret)
5020 return ret;
5021 }
5022
5023 ret = rbd_dev_v2_snap_context(rbd_dev);
5024 if (ret && first_time) {
5025 kfree(rbd_dev->header.object_prefix);
5026 rbd_dev->header.object_prefix = NULL;
5027 }
5028
5029 return ret;
5030}
5031
5032static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5033{
5034 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5035
5036 if (rbd_dev->image_format == 1)
5037 return rbd_dev_v1_header_info(rbd_dev);
5038
5039 return rbd_dev_v2_header_info(rbd_dev);
5040}
5041
5042/*
5043 * Skips over white space at *buf, and updates *buf to point to the
5044 * first found non-space character (if any). Returns the length of
5045 * the token (string of non-white space characters) found. Note
5046 * that *buf must be terminated with '\0'.
5047 */
5048static inline size_t next_token(const char **buf)
5049{
5050 /*
5051 * These are the characters that produce nonzero for
5052 * isspace() in the "C" and "POSIX" locales.
5053 */
5054 const char *spaces = " \f\n\r\t\v";
5055
5056 *buf += strspn(*buf, spaces); /* Find start of token */
5057
5058 return strcspn(*buf, spaces); /* Return token length */
5059}
5060
5061/*
5062 * Finds the next token in *buf, dynamically allocates a buffer big
5063 * enough to hold a copy of it, and copies the token into the new
5064 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5065 * that a duplicate buffer is created even for a zero-length token.
5066 *
5067 * Returns a pointer to the newly-allocated duplicate, or a null
5068 * pointer if memory for the duplicate was not available. If
5069 * the lenp argument is a non-null pointer, the length of the token
5070 * (not including the '\0') is returned in *lenp.
5071 *
5072 * If successful, the *buf pointer will be updated to point beyond
5073 * the end of the found token.
5074 *
5075 * Note: uses GFP_KERNEL for allocation.
5076 */
5077static inline char *dup_token(const char **buf, size_t *lenp)
5078{
5079 char *dup;
5080 size_t len;
5081
5082 len = next_token(buf);
5083 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
5084 if (!dup)
5085 return NULL;
5086 *(dup + len) = '\0';
5087 *buf += len;
5088
5089 if (lenp)
5090 *lenp = len;
5091
5092 return dup;
5093}
5094
5095/*
5096 * Parse the options provided for an "rbd add" (i.e., rbd image
5097 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5098 * and the data written is passed here via a NUL-terminated buffer.
5099 * Returns 0 if successful or an error code otherwise.
5100 *
5101 * The information extracted from these options is recorded in
5102 * the other parameters which return dynamically-allocated
5103 * structures:
5104 * ceph_opts
5105 * The address of a pointer that will refer to a ceph options
5106 * structure. Caller must release the returned pointer using
5107 * ceph_destroy_options() when it is no longer needed.
5108 * rbd_opts
5109 * Address of an rbd options pointer. Fully initialized by
5110 * this function; caller must release with kfree().
5111 * spec
5112 * Address of an rbd image specification pointer. Fully
5113 * initialized by this function based on parsed options.
5114 * Caller must release with rbd_spec_put().
5115 *
5116 * The options passed take this form:
5117 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5118 * where:
5119 * <mon_addrs>
5120 * A comma-separated list of one or more monitor addresses.
5121 * A monitor address is an ip address, optionally followed
5122 * by a port number (separated by a colon).
5123 * I.e.: ip1[:port1][,ip2[:port2]...]
5124 * <options>
5125 * A comma-separated list of ceph and/or rbd options.
5126 * <pool_name>
5127 * The name of the rados pool containing the rbd image.
5128 * <image_name>
5129 * The name of the image in that pool to map.
5130 * <snap_id>
5131 * An optional snapshot id. If provided, the mapping will
5132 * present data from the image at the time that snapshot was
5133 * created. The image head is used if no snapshot id is
5134 * provided. Snapshot mappings are always read-only.
5135 */
5136static int rbd_add_parse_args(const char *buf,
5137 struct ceph_options **ceph_opts,
5138 struct rbd_options **opts,
5139 struct rbd_spec **rbd_spec)
5140{
5141 size_t len;
5142 char *options;
5143 const char *mon_addrs;
5144 char *snap_name;
5145 size_t mon_addrs_size;
5146 struct rbd_spec *spec = NULL;
5147 struct rbd_options *rbd_opts = NULL;
5148 struct ceph_options *copts;
5149 int ret;
5150
5151 /* The first four tokens are required */
5152
5153 len = next_token(&buf);
5154 if (!len) {
5155 rbd_warn(NULL, "no monitor address(es) provided");
5156 return -EINVAL;
5157 }
5158 mon_addrs = buf;
5159 mon_addrs_size = len + 1;
5160 buf += len;
5161
5162 ret = -EINVAL;
5163 options = dup_token(&buf, NULL);
5164 if (!options)
5165 return -ENOMEM;
5166 if (!*options) {
5167 rbd_warn(NULL, "no options provided");
5168 goto out_err;
5169 }
5170
5171 spec = rbd_spec_alloc();
5172 if (!spec)
5173 goto out_mem;
5174
5175 spec->pool_name = dup_token(&buf, NULL);
5176 if (!spec->pool_name)
5177 goto out_mem;
5178 if (!*spec->pool_name) {
5179 rbd_warn(NULL, "no pool name provided");
5180 goto out_err;
5181 }
5182
5183 spec->image_name = dup_token(&buf, NULL);
5184 if (!spec->image_name)
5185 goto out_mem;
5186 if (!*spec->image_name) {
5187 rbd_warn(NULL, "no image name provided");
5188 goto out_err;
5189 }
5190
5191 /*
5192 * Snapshot name is optional; default is to use "-"
5193 * (indicating the head/no snapshot).
5194 */
5195 len = next_token(&buf);
5196 if (!len) {
5197 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5198 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
5199 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
5200 ret = -ENAMETOOLONG;
5201 goto out_err;
5202 }
5203 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5204 if (!snap_name)
5205 goto out_mem;
5206 *(snap_name + len) = '\0';
5207 spec->snap_name = snap_name;
5208
5209 /* Initialize all rbd options to the defaults */
5210
5211 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
5212 if (!rbd_opts)
5213 goto out_mem;
5214
5215 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
5216 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
5217 rbd_opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5218 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5219 rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5220 rbd_opts->trim = RBD_TRIM_DEFAULT;
5221
5222 copts = ceph_parse_options(options, mon_addrs,
5223 mon_addrs + mon_addrs_size - 1,
5224 parse_rbd_opts_token, rbd_opts);
5225 if (IS_ERR(copts)) {
5226 ret = PTR_ERR(copts);
5227 goto out_err;
5228 }
5229 kfree(options);
5230
5231 *ceph_opts = copts;
5232 *opts = rbd_opts;
5233 *rbd_spec = spec;
5234
5235 return 0;
5236out_mem:
5237 ret = -ENOMEM;
5238out_err:
5239 kfree(rbd_opts);
5240 rbd_spec_put(spec);
5241 kfree(options);
5242
5243 return ret;
5244}
5245
5246static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5247{
5248 down_write(&rbd_dev->lock_rwsem);
5249 if (__rbd_is_lock_owner(rbd_dev))
5250 rbd_unlock(rbd_dev);
5251 up_write(&rbd_dev->lock_rwsem);
5252}
5253
5254static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5255{
5256 int ret;
5257
5258 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5259 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5260 return -EINVAL;
5261 }
5262
5263 /* FIXME: "rbd map --exclusive" should be in interruptible */
5264 down_read(&rbd_dev->lock_rwsem);
5265 ret = rbd_wait_state_locked(rbd_dev, true);
5266 up_read(&rbd_dev->lock_rwsem);
5267 if (ret) {
5268 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5269 return -EROFS;
5270 }
5271
5272 return 0;
5273}
5274
5275/*
5276 * An rbd format 2 image has a unique identifier, distinct from the
5277 * name given to it by the user. Internally, that identifier is
5278 * what's used to specify the names of objects related to the image.
5279 *
5280 * A special "rbd id" object is used to map an rbd image name to its
5281 * id. If that object doesn't exist, then there is no v2 rbd image
5282 * with the supplied name.
5283 *
5284 * This function will record the given rbd_dev's image_id field if
5285 * it can be determined, and in that case will return 0. If any
5286 * errors occur a negative errno will be returned and the rbd_dev's
5287 * image_id field will be unchanged (and should be NULL).
5288 */
5289static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5290{
5291 int ret;
5292 size_t size;
5293 CEPH_DEFINE_OID_ONSTACK(oid);
5294 void *response;
5295 char *image_id;
5296
5297 /*
5298 * When probing a parent image, the image id is already
5299 * known (and the image name likely is not). There's no
5300 * need to fetch the image id again in this case. We
5301 * do still need to set the image format though.
5302 */
5303 if (rbd_dev->spec->image_id) {
5304 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5305
5306 return 0;
5307 }
5308
5309 /*
5310 * First, see if the format 2 image id file exists, and if
5311 * so, get the image's persistent id from it.
5312 */
5313 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5314 rbd_dev->spec->image_name);
5315 if (ret)
5316 return ret;
5317
5318 dout("rbd id object name is %s\n", oid.name);
5319
5320 /* Response will be an encoded string, which includes a length */
5321
5322 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5323 response = kzalloc(size, GFP_NOIO);
5324 if (!response) {
5325 ret = -ENOMEM;
5326 goto out;
5327 }
5328
5329 /* If it doesn't exist we'll assume it's a format 1 image */
5330
5331 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5332 "get_id", NULL, 0,
5333 response, RBD_IMAGE_ID_LEN_MAX);
5334 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5335 if (ret == -ENOENT) {
5336 image_id = kstrdup("", GFP_KERNEL);
5337 ret = image_id ? 0 : -ENOMEM;
5338 if (!ret)
5339 rbd_dev->image_format = 1;
5340 } else if (ret >= 0) {
5341 void *p = response;
5342
5343 image_id = ceph_extract_encoded_string(&p, p + ret,
5344 NULL, GFP_NOIO);
5345 ret = PTR_ERR_OR_ZERO(image_id);
5346 if (!ret)
5347 rbd_dev->image_format = 2;
5348 }
5349
5350 if (!ret) {
5351 rbd_dev->spec->image_id = image_id;
5352 dout("image_id is %s\n", image_id);
5353 }
5354out:
5355 kfree(response);
5356 ceph_oid_destroy(&oid);
5357 return ret;
5358}
5359
5360/*
5361 * Undo whatever state changes are made by v1 or v2 header info
5362 * call.
5363 */
5364static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5365{
5366 struct rbd_image_header *header;
5367
5368 rbd_dev_parent_put(rbd_dev);
5369
5370 /* Free dynamic fields from the header, then zero it out */
5371
5372 header = &rbd_dev->header;
5373 ceph_put_snap_context(header->snapc);
5374 kfree(header->snap_sizes);
5375 kfree(header->snap_names);
5376 kfree(header->object_prefix);
5377 memset(header, 0, sizeof (*header));
5378}
5379
5380static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
5381{
5382 int ret;
5383
5384 ret = rbd_dev_v2_object_prefix(rbd_dev);
5385 if (ret)
5386 goto out_err;
5387
5388 /*
5389 * Get the and check features for the image. Currently the
5390 * features are assumed to never change.
5391 */
5392 ret = rbd_dev_v2_features(rbd_dev);
5393 if (ret)
5394 goto out_err;
5395
5396 /* If the image supports fancy striping, get its parameters */
5397
5398 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5399 ret = rbd_dev_v2_striping_info(rbd_dev);
5400 if (ret < 0)
5401 goto out_err;
5402 }
5403
5404 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5405 ret = rbd_dev_v2_data_pool(rbd_dev);
5406 if (ret)
5407 goto out_err;
5408 }
5409
5410 rbd_init_layout(rbd_dev);
5411 return 0;
5412
5413out_err:
5414 rbd_dev->header.features = 0;
5415 kfree(rbd_dev->header.object_prefix);
5416 rbd_dev->header.object_prefix = NULL;
5417 return ret;
5418}
5419
5420/*
5421 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5422 * rbd_dev_image_probe() recursion depth, which means it's also the
5423 * length of the already discovered part of the parent chain.
5424 */
5425static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
5426{
5427 struct rbd_device *parent = NULL;
5428 int ret;
5429
5430 if (!rbd_dev->parent_spec)
5431 return 0;
5432
5433 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5434 pr_info("parent chain is too long (%d)\n", depth);
5435 ret = -EINVAL;
5436 goto out_err;
5437 }
5438
5439 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
5440 if (!parent) {
5441 ret = -ENOMEM;
5442 goto out_err;
5443 }
5444
5445 /*
5446 * Images related by parent/child relationships always share
5447 * rbd_client and spec/parent_spec, so bump their refcounts.
5448 */
5449 __rbd_get_client(rbd_dev->rbd_client);
5450 rbd_spec_get(rbd_dev->parent_spec);
5451
5452 ret = rbd_dev_image_probe(parent, depth);
5453 if (ret < 0)
5454 goto out_err;
5455
5456 rbd_dev->parent = parent;
5457 atomic_set(&rbd_dev->parent_ref, 1);
5458 return 0;
5459
5460out_err:
5461 rbd_dev_unparent(rbd_dev);
5462 rbd_dev_destroy(parent);
5463 return ret;
5464}
5465
5466static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5467{
5468 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5469 rbd_dev_mapping_clear(rbd_dev);
5470 rbd_free_disk(rbd_dev);
5471 if (!single_major)
5472 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5473}
5474
5475/*
5476 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5477 * upon return.
5478 */
5479static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5480{
5481 int ret;
5482
5483 /* Record our major and minor device numbers. */
5484
5485 if (!single_major) {
5486 ret = register_blkdev(0, rbd_dev->name);
5487 if (ret < 0)
5488 goto err_out_unlock;
5489
5490 rbd_dev->major = ret;
5491 rbd_dev->minor = 0;
5492 } else {
5493 rbd_dev->major = rbd_major;
5494 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5495 }
5496
5497 /* Set up the blkdev mapping. */
5498
5499 ret = rbd_init_disk(rbd_dev);
5500 if (ret)
5501 goto err_out_blkdev;
5502
5503 ret = rbd_dev_mapping_set(rbd_dev);
5504 if (ret)
5505 goto err_out_disk;
5506
5507 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
5508 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
5509
5510 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5511 if (ret)
5512 goto err_out_mapping;
5513
5514 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5515 up_write(&rbd_dev->header_rwsem);
5516 return 0;
5517
5518err_out_mapping:
5519 rbd_dev_mapping_clear(rbd_dev);
5520err_out_disk:
5521 rbd_free_disk(rbd_dev);
5522err_out_blkdev:
5523 if (!single_major)
5524 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5525err_out_unlock:
5526 up_write(&rbd_dev->header_rwsem);
5527 return ret;
5528}
5529
5530static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5531{
5532 struct rbd_spec *spec = rbd_dev->spec;
5533 int ret;
5534
5535 /* Record the header object name for this rbd image. */
5536
5537 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5538 if (rbd_dev->image_format == 1)
5539 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5540 spec->image_name, RBD_SUFFIX);
5541 else
5542 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5543 RBD_HEADER_PREFIX, spec->image_id);
5544
5545 return ret;
5546}
5547
5548static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5549{
5550 rbd_dev_unprobe(rbd_dev);
5551 if (rbd_dev->opts)
5552 rbd_unregister_watch(rbd_dev);
5553 rbd_dev->image_format = 0;
5554 kfree(rbd_dev->spec->image_id);
5555 rbd_dev->spec->image_id = NULL;
5556}
5557
5558/*
5559 * Probe for the existence of the header object for the given rbd
5560 * device. If this image is the one being mapped (i.e., not a
5561 * parent), initiate a watch on its header object before using that
5562 * object to get detailed information about the rbd image.
5563 */
5564static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
5565{
5566 int ret;
5567
5568 /*
5569 * Get the id from the image id object. Unless there's an
5570 * error, rbd_dev->spec->image_id will be filled in with
5571 * a dynamically-allocated string, and rbd_dev->image_format
5572 * will be set to either 1 or 2.
5573 */
5574 ret = rbd_dev_image_id(rbd_dev);
5575 if (ret)
5576 return ret;
5577
5578 ret = rbd_dev_header_name(rbd_dev);
5579 if (ret)
5580 goto err_out_format;
5581
5582 if (!depth) {
5583 ret = rbd_register_watch(rbd_dev);
5584 if (ret) {
5585 if (ret == -ENOENT)
5586 pr_info("image %s/%s does not exist\n",
5587 rbd_dev->spec->pool_name,
5588 rbd_dev->spec->image_name);
5589 goto err_out_format;
5590 }
5591 }
5592
5593 ret = rbd_dev_header_info(rbd_dev);
5594 if (ret)
5595 goto err_out_watch;
5596
5597 /*
5598 * If this image is the one being mapped, we have pool name and
5599 * id, image name and id, and snap name - need to fill snap id.
5600 * Otherwise this is a parent image, identified by pool, image
5601 * and snap ids - need to fill in names for those ids.
5602 */
5603 if (!depth)
5604 ret = rbd_spec_fill_snap_id(rbd_dev);
5605 else
5606 ret = rbd_spec_fill_names(rbd_dev);
5607 if (ret) {
5608 if (ret == -ENOENT)
5609 pr_info("snap %s/%s@%s does not exist\n",
5610 rbd_dev->spec->pool_name,
5611 rbd_dev->spec->image_name,
5612 rbd_dev->spec->snap_name);
5613 goto err_out_probe;
5614 }
5615
5616 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5617 ret = rbd_dev_v2_parent_info(rbd_dev);
5618 if (ret)
5619 goto err_out_probe;
5620
5621 /*
5622 * Need to warn users if this image is the one being
5623 * mapped and has a parent.
5624 */
5625 if (!depth && rbd_dev->parent_spec)
5626 rbd_warn(rbd_dev,
5627 "WARNING: kernel layering is EXPERIMENTAL!");
5628 }
5629
5630 ret = rbd_dev_probe_parent(rbd_dev, depth);
5631 if (ret)
5632 goto err_out_probe;
5633
5634 dout("discovered format %u image, header name is %s\n",
5635 rbd_dev->image_format, rbd_dev->header_oid.name);
5636 return 0;
5637
5638err_out_probe:
5639 rbd_dev_unprobe(rbd_dev);
5640err_out_watch:
5641 if (!depth)
5642 rbd_unregister_watch(rbd_dev);
5643err_out_format:
5644 rbd_dev->image_format = 0;
5645 kfree(rbd_dev->spec->image_id);
5646 rbd_dev->spec->image_id = NULL;
5647 return ret;
5648}
5649
5650static ssize_t do_rbd_add(struct bus_type *bus,
5651 const char *buf,
5652 size_t count)
5653{
5654 struct rbd_device *rbd_dev = NULL;
5655 struct ceph_options *ceph_opts = NULL;
5656 struct rbd_options *rbd_opts = NULL;
5657 struct rbd_spec *spec = NULL;
5658 struct rbd_client *rbdc;
5659 int rc;
5660
5661 if (!try_module_get(THIS_MODULE))
5662 return -ENODEV;
5663
5664 /* parse add command */
5665 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5666 if (rc < 0)
5667 goto out;
5668
5669 rbdc = rbd_get_client(ceph_opts);
5670 if (IS_ERR(rbdc)) {
5671 rc = PTR_ERR(rbdc);
5672 goto err_out_args;
5673 }
5674
5675 /* pick the pool */
5676 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
5677 if (rc < 0) {
5678 if (rc == -ENOENT)
5679 pr_info("pool %s does not exist\n", spec->pool_name);
5680 goto err_out_client;
5681 }
5682 spec->pool_id = (u64)rc;
5683
5684 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
5685 if (!rbd_dev) {
5686 rc = -ENOMEM;
5687 goto err_out_client;
5688 }
5689 rbdc = NULL; /* rbd_dev now owns this */
5690 spec = NULL; /* rbd_dev now owns this */
5691 rbd_opts = NULL; /* rbd_dev now owns this */
5692
5693 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
5694 if (!rbd_dev->config_info) {
5695 rc = -ENOMEM;
5696 goto err_out_rbd_dev;
5697 }
5698
5699 down_write(&rbd_dev->header_rwsem);
5700 rc = rbd_dev_image_probe(rbd_dev, 0);
5701 if (rc < 0) {
5702 up_write(&rbd_dev->header_rwsem);
5703 goto err_out_rbd_dev;
5704 }
5705
5706 /* If we are mapping a snapshot it must be marked read-only */
5707 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
5708 rbd_dev->opts->read_only = true;
5709
5710 rc = rbd_dev_device_setup(rbd_dev);
5711 if (rc)
5712 goto err_out_image_probe;
5713
5714 if (rbd_dev->opts->exclusive) {
5715 rc = rbd_add_acquire_lock(rbd_dev);
5716 if (rc)
5717 goto err_out_device_setup;
5718 }
5719
5720 /* Everything's ready. Announce the disk to the world. */
5721
5722 rc = device_add(&rbd_dev->dev);
5723 if (rc)
5724 goto err_out_image_lock;
5725
5726 add_disk(rbd_dev->disk);
5727 /* see rbd_init_disk() */
5728 blk_put_queue(rbd_dev->disk->queue);
5729
5730 spin_lock(&rbd_dev_list_lock);
5731 list_add_tail(&rbd_dev->node, &rbd_dev_list);
5732 spin_unlock(&rbd_dev_list_lock);
5733
5734 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
5735 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
5736 rbd_dev->header.features);
5737 rc = count;
5738out:
5739 module_put(THIS_MODULE);
5740 return rc;
5741
5742err_out_image_lock:
5743 rbd_dev_image_unlock(rbd_dev);
5744err_out_device_setup:
5745 rbd_dev_device_release(rbd_dev);
5746err_out_image_probe:
5747 rbd_dev_image_release(rbd_dev);
5748err_out_rbd_dev:
5749 rbd_dev_destroy(rbd_dev);
5750err_out_client:
5751 rbd_put_client(rbdc);
5752err_out_args:
5753 rbd_spec_put(spec);
5754 kfree(rbd_opts);
5755 goto out;
5756}
5757
5758static ssize_t rbd_add(struct bus_type *bus,
5759 const char *buf,
5760 size_t count)
5761{
5762 if (single_major)
5763 return -EINVAL;
5764
5765 return do_rbd_add(bus, buf, count);
5766}
5767
5768static ssize_t rbd_add_single_major(struct bus_type *bus,
5769 const char *buf,
5770 size_t count)
5771{
5772 return do_rbd_add(bus, buf, count);
5773}
5774
5775static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5776{
5777 while (rbd_dev->parent) {
5778 struct rbd_device *first = rbd_dev;
5779 struct rbd_device *second = first->parent;
5780 struct rbd_device *third;
5781
5782 /*
5783 * Follow to the parent with no grandparent and
5784 * remove it.
5785 */
5786 while (second && (third = second->parent)) {
5787 first = second;
5788 second = third;
5789 }
5790 rbd_assert(second);
5791 rbd_dev_image_release(second);
5792 rbd_dev_destroy(second);
5793 first->parent = NULL;
5794 first->parent_overlap = 0;
5795
5796 rbd_assert(first->parent_spec);
5797 rbd_spec_put(first->parent_spec);
5798 first->parent_spec = NULL;
5799 }
5800}
5801
5802static ssize_t do_rbd_remove(struct bus_type *bus,
5803 const char *buf,
5804 size_t count)
5805{
5806 struct rbd_device *rbd_dev = NULL;
5807 struct list_head *tmp;
5808 int dev_id;
5809 char opt_buf[6];
5810 bool already = false;
5811 bool force = false;
5812 int ret;
5813
5814 dev_id = -1;
5815 opt_buf[0] = '\0';
5816 sscanf(buf, "%d %5s", &dev_id, opt_buf);
5817 if (dev_id < 0) {
5818 pr_err("dev_id out of range\n");
5819 return -EINVAL;
5820 }
5821 if (opt_buf[0] != '\0') {
5822 if (!strcmp(opt_buf, "force")) {
5823 force = true;
5824 } else {
5825 pr_err("bad remove option at '%s'\n", opt_buf);
5826 return -EINVAL;
5827 }
5828 }
5829
5830 ret = -ENOENT;
5831 spin_lock(&rbd_dev_list_lock);
5832 list_for_each(tmp, &rbd_dev_list) {
5833 rbd_dev = list_entry(tmp, struct rbd_device, node);
5834 if (rbd_dev->dev_id == dev_id) {
5835 ret = 0;
5836 break;
5837 }
5838 }
5839 if (!ret) {
5840 spin_lock_irq(&rbd_dev->lock);
5841 if (rbd_dev->open_count && !force)
5842 ret = -EBUSY;
5843 else
5844 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5845 &rbd_dev->flags);
5846 spin_unlock_irq(&rbd_dev->lock);
5847 }
5848 spin_unlock(&rbd_dev_list_lock);
5849 if (ret < 0 || already)
5850 return ret;
5851
5852 if (force) {
5853 /*
5854 * Prevent new IO from being queued and wait for existing
5855 * IO to complete/fail.
5856 */
5857 blk_mq_freeze_queue(rbd_dev->disk->queue);
5858 blk_set_queue_dying(rbd_dev->disk->queue);
5859 }
5860
5861 del_gendisk(rbd_dev->disk);
5862 spin_lock(&rbd_dev_list_lock);
5863 list_del_init(&rbd_dev->node);
5864 spin_unlock(&rbd_dev_list_lock);
5865 device_del(&rbd_dev->dev);
5866
5867 rbd_dev_image_unlock(rbd_dev);
5868 rbd_dev_device_release(rbd_dev);
5869 rbd_dev_image_release(rbd_dev);
5870 rbd_dev_destroy(rbd_dev);
5871 return count;
5872}
5873
5874static ssize_t rbd_remove(struct bus_type *bus,
5875 const char *buf,
5876 size_t count)
5877{
5878 if (single_major)
5879 return -EINVAL;
5880
5881 return do_rbd_remove(bus, buf, count);
5882}
5883
5884static ssize_t rbd_remove_single_major(struct bus_type *bus,
5885 const char *buf,
5886 size_t count)
5887{
5888 return do_rbd_remove(bus, buf, count);
5889}
5890
5891/*
5892 * create control files in sysfs
5893 * /sys/bus/rbd/...
5894 */
5895static int rbd_sysfs_init(void)
5896{
5897 int ret;
5898
5899 ret = device_register(&rbd_root_dev);
5900 if (ret < 0)
5901 return ret;
5902
5903 ret = bus_register(&rbd_bus_type);
5904 if (ret < 0)
5905 device_unregister(&rbd_root_dev);
5906
5907 return ret;
5908}
5909
5910static void rbd_sysfs_cleanup(void)
5911{
5912 bus_unregister(&rbd_bus_type);
5913 device_unregister(&rbd_root_dev);
5914}
5915
static int rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

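/*
 * Module initialization.  A hypothetical mapping sequence once the
 * module is loaded (the monitor address, credentials, pool and image
 * names below are placeholders):
 *
 *	$ sudo modprobe rbd
 *	$ echo "1.2.3.4:6789 name=admin,secret=<key> rbd myimage" \
 *		> /sys/bus/rbd/add
 *
 * which, if it succeeds, maps "myimage" from pool "rbd" as a block
 * device (e.g. /dev/rbd0).  The exact add string format is described
 * in Documentation/ABI/testing/sysfs-bus-rbd.
 */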
static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

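/*
 * Module teardown.  A hypothetical unload sequence (all images must
 * be unmapped first, since open devices pin the module):
 *
 *	$ echo 0 > /sys/bus/rbd/remove
 *	$ sudo rmmod rbd
 */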
static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");
1
2/*
3 rbd.c -- Export ceph rados objects as a Linux block device
4
5
6 based on drivers/block/osdblk.c:
7
8 Copyright 2009 Red Hat, Inc.
9
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; see the file COPYING. If not, write to
21 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22
23
24
25 For usage instructions, please refer to:
26
27 Documentation/ABI/testing/sysfs-bus-rbd
28
29 */
30
31#include <linux/ceph/libceph.h>
32#include <linux/ceph/osd_client.h>
33#include <linux/ceph/mon_client.h>
34#include <linux/ceph/decode.h>
35#include <linux/parser.h>
36#include <linux/bsearch.h>
37
38#include <linux/kernel.h>
39#include <linux/device.h>
40#include <linux/module.h>
41#include <linux/fs.h>
42#include <linux/blkdev.h>
43#include <linux/slab.h>
44#include <linux/idr.h>
45
46#include "rbd_types.h"
47
48#define RBD_DEBUG /* Activate rbd_assert() calls */
49
50/*
51 * The basic unit of block I/O is a sector. It is interpreted in a
52 * number of contexts in Linux (blk, bio, genhd), but the default is
53 * universally 512 bytes. These symbols are just slightly more
54 * meaningful than the bare numbers they represent.
55 */
56#define SECTOR_SHIFT 9
57#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
58
59/*
60 * Increment the given counter and return its updated value.
61 * If the counter is already 0 it will not be incremented.
62 * If the counter is already at its maximum value returns
63 * -EINVAL without updating it.
64 */
65static int atomic_inc_return_safe(atomic_t *v)
66{
67 unsigned int counter;
68
69 counter = (unsigned int)__atomic_add_unless(v, 1, 0);
70 if (counter <= (unsigned int)INT_MAX)
71 return (int)counter;
72
73 atomic_dec(v);
74
75 return -EINVAL;
76}
77
78/* Decrement the counter. Return the resulting value, or -EINVAL */
79static int atomic_dec_return_safe(atomic_t *v)
80{
81 int counter;
82
83 counter = atomic_dec_return(v);
84 if (counter >= 0)
85 return counter;
86
87 atomic_inc(v);
88
89 return -EINVAL;
90}
91
92#define RBD_DRV_NAME "rbd"
93
94#define RBD_MINORS_PER_MAJOR 256
95#define RBD_SINGLE_MAJOR_PART_SHIFT 4
96
97#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
98#define RBD_MAX_SNAP_NAME_LEN \
99 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
100
101#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
102
103#define RBD_SNAP_HEAD_NAME "-"
104
105#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */
106
107/* This allows a single page to hold an image name sent by OSD */
108#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
109#define RBD_IMAGE_ID_LEN_MAX 64
110
111#define RBD_OBJ_PREFIX_LEN_MAX 64
112
113/* Feature bits */
114
115#define RBD_FEATURE_LAYERING (1<<0)
116#define RBD_FEATURE_STRIPINGV2 (1<<1)
117#define RBD_FEATURES_ALL \
118 (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
119
120/* Features supported by this (client software) implementation. */
121
122#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL)
123
124/*
125 * An RBD device name will be "rbd#", where the "rbd" comes from
126 * RBD_DRV_NAME above, and # is a unique integer identifier.
127 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
128 * enough to hold all possible device names.
129 */
130#define DEV_NAME_LEN 32
131#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
132
133/*
134 * block device image metadata (in-memory version)
135 */
136struct rbd_image_header {
137 /* These six fields never change for a given rbd image */
138 char *object_prefix;
139 __u8 obj_order;
140 __u8 crypt_type;
141 __u8 comp_type;
142 u64 stripe_unit;
143 u64 stripe_count;
144 u64 features; /* Might be changeable someday? */
145
146 /* The remaining fields need to be updated occasionally */
147 u64 image_size;
148 struct ceph_snap_context *snapc;
149 char *snap_names; /* format 1 only */
150 u64 *snap_sizes; /* format 1 only */
151};
152
153/*
154 * An rbd image specification.
155 *
156 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
157 * identify an image. Each rbd_dev structure includes a pointer to
158 * an rbd_spec structure that encapsulates this identity.
159 *
160 * Each of the id's in an rbd_spec has an associated name. For a
161 * user-mapped image, the names are supplied and the id's associated
162 * with them are looked up. For a layered image, a parent image is
163 * defined by the tuple, and the names are looked up.
164 *
165 * An rbd_dev structure contains a parent_spec pointer which is
166 * non-null if the image it represents is a child in a layered
167 * image. This pointer will refer to the rbd_spec structure used
168 * by the parent rbd_dev for its own identity (i.e., the structure
169 * is shared between the parent and child).
170 *
171 * Since these structures are populated once, during the discovery
172 * phase of image construction, they are effectively immutable so
173 * we make no effort to synchronize access to them.
174 *
175 * Note that code herein does not assume the image name is known (it
176 * could be a null pointer).
177 */
178struct rbd_spec {
179 u64 pool_id;
180 const char *pool_name;
181
182 const char *image_id;
183 const char *image_name;
184
185 u64 snap_id;
186 const char *snap_name;
187
188 struct kref kref;
189};
190
191/*
192 * an instance of the client. multiple devices may share an rbd client.
193 */
194struct rbd_client {
195 struct ceph_client *client;
196 struct kref kref;
197 struct list_head node;
198};
199
200struct rbd_img_request;
201typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
202
203#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
204
205struct rbd_obj_request;
206typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
207
208enum obj_request_type {
209 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
210};
211
212enum obj_req_flags {
213 OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */
214 OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */
215 OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */
216 OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */
217};
218
219struct rbd_obj_request {
220 const char *object_name;
221 u64 offset; /* object start byte */
222 u64 length; /* bytes from offset */
223 unsigned long flags;
224
225 /*
226 * An object request associated with an image will have its
227 * img_data flag set; a standalone object request will not.
228 *
229 * A standalone object request will have which == BAD_WHICH
230 * and a null obj_request pointer.
231 *
232 * An object request initiated in support of a layered image
233 * object (to check for its existence before a write) will
234 * have which == BAD_WHICH and a non-null obj_request pointer.
235 *
236 * Finally, an object request for rbd image data will have
237 * which != BAD_WHICH, and will have a non-null img_request
238 * pointer. The value of which will be in the range
239 * 0..(img_request->obj_request_count-1).
240 */
241 union {
242 struct rbd_obj_request *obj_request; /* STAT op */
243 struct {
244 struct rbd_img_request *img_request;
245 u64 img_offset;
246 /* links for img_request->obj_requests list */
247 struct list_head links;
248 };
249 };
250 u32 which; /* posn image request list */
251
252 enum obj_request_type type;
253 union {
254 struct bio *bio_list;
255 struct {
256 struct page **pages;
257 u32 page_count;
258 };
259 };
260 struct page **copyup_pages;
261 u32 copyup_page_count;
262
263 struct ceph_osd_request *osd_req;
264
265 u64 xferred; /* bytes transferred */
266 int result;
267
268 rbd_obj_callback_t callback;
269 struct completion completion;
270
271 struct kref kref;
272};
273
274enum img_req_flags {
275 IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
276 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
277 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
278};
279
280struct rbd_img_request {
281 struct rbd_device *rbd_dev;
282 u64 offset; /* starting image byte offset */
283 u64 length; /* byte count from offset */
284 unsigned long flags;
285 union {
286 u64 snap_id; /* for reads */
287 struct ceph_snap_context *snapc; /* for writes */
288 };
289 union {
290 struct request *rq; /* block request */
291 struct rbd_obj_request *obj_request; /* obj req initiator */
292 };
293 struct page **copyup_pages;
294 u32 copyup_page_count;
295 spinlock_t completion_lock;/* protects next_completion */
296 u32 next_completion;
297 rbd_img_callback_t callback;
298 u64 xferred;/* aggregate bytes transferred */
299 int result; /* first nonzero obj_request result */
300
301 u32 obj_request_count;
302 struct list_head obj_requests; /* rbd_obj_request structs */
303
304 struct kref kref;
305};
306
307#define for_each_obj_request(ireq, oreq) \
308 list_for_each_entry(oreq, &(ireq)->obj_requests, links)
309#define for_each_obj_request_from(ireq, oreq) \
310 list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
311#define for_each_obj_request_safe(ireq, oreq, n) \
312 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
313
314struct rbd_mapping {
315 u64 size;
316 u64 features;
317 bool read_only;
318};
319
320/*
321 * a single device
322 */
323struct rbd_device {
324 int dev_id; /* blkdev unique id */
325
326 int major; /* blkdev assigned major */
327 int minor;
328 struct gendisk *disk; /* blkdev's gendisk and rq */
329
330 u32 image_format; /* Either 1 or 2 */
331 struct rbd_client *rbd_client;
332
333 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
334
335 spinlock_t lock; /* queue, flags, open_count */
336
337 struct rbd_image_header header;
338 unsigned long flags; /* possibly lock protected */
339 struct rbd_spec *spec;
340
341 char *header_name;
342
343 struct ceph_file_layout layout;
344
345 struct ceph_osd_event *watch_event;
346 struct rbd_obj_request *watch_request;
347
348 struct rbd_spec *parent_spec;
349 u64 parent_overlap;
350 atomic_t parent_ref;
351 struct rbd_device *parent;
352
353 /* protects updating the header */
354 struct rw_semaphore header_rwsem;
355
356 struct rbd_mapping mapping;
357
358 struct list_head node;
359
360 /* sysfs related */
361 struct device dev;
362 unsigned long open_count; /* protected by lock */
363};
364
365/*
366 * Flag bits for rbd_dev->flags. If atomicity is required,
367 * rbd_dev->lock is used to protect access.
368 *
369 * Currently, only the "removing" flag (which is coupled with the
370 * "open_count" field) requires atomic access.
371 */
372enum rbd_dev_flags {
373 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
374 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
375};
376
377static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
378
379static LIST_HEAD(rbd_dev_list); /* devices */
380static DEFINE_SPINLOCK(rbd_dev_list_lock);
381
382static LIST_HEAD(rbd_client_list); /* clients */
383static DEFINE_SPINLOCK(rbd_client_list_lock);
384
385/* Slab caches for frequently-allocated structures */
386
387static struct kmem_cache *rbd_img_request_cache;
388static struct kmem_cache *rbd_obj_request_cache;
389static struct kmem_cache *rbd_segment_name_cache;
390
391static int rbd_major;
392static DEFINE_IDA(rbd_dev_id_ida);
393
394/*
395 * Default to false for now, as single-major requires >= 0.75 version of
396 * userspace rbd utility.
397 */
398static bool single_major = false;
399module_param(single_major, bool, S_IRUGO);
400MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
401
402static int rbd_img_request_submit(struct rbd_img_request *img_request);
403
404static void rbd_dev_device_release(struct device *dev);
405
406static ssize_t rbd_add(struct bus_type *bus, const char *buf,
407 size_t count);
408static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
409 size_t count);
410static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
411 size_t count);
412static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
413 size_t count);
414static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
415static void rbd_spec_put(struct rbd_spec *spec);
416
417static int rbd_dev_id_to_minor(int dev_id)
418{
419 return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
420}
421
422static int minor_to_rbd_dev_id(int minor)
423{
424 return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
425}
426
427static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
428static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
429static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
430static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
431
432static struct attribute *rbd_bus_attrs[] = {
433 &bus_attr_add.attr,
434 &bus_attr_remove.attr,
435 &bus_attr_add_single_major.attr,
436 &bus_attr_remove_single_major.attr,
437 NULL,
438};
439
440static umode_t rbd_bus_is_visible(struct kobject *kobj,
441 struct attribute *attr, int index)
442{
443 if (!single_major &&
444 (attr == &bus_attr_add_single_major.attr ||
445 attr == &bus_attr_remove_single_major.attr))
446 return 0;
447
448 return attr->mode;
449}
450
451static const struct attribute_group rbd_bus_group = {
452 .attrs = rbd_bus_attrs,
453 .is_visible = rbd_bus_is_visible,
454};
455__ATTRIBUTE_GROUPS(rbd_bus);
456
457static struct bus_type rbd_bus_type = {
458 .name = "rbd",
459 .bus_groups = rbd_bus_groups,
460};
461
462static void rbd_root_dev_release(struct device *dev)
463{
464}
465
466static struct device rbd_root_dev = {
467 .init_name = "rbd",
468 .release = rbd_root_dev_release,
469};
470
471static __printf(2, 3)
472void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
473{
474 struct va_format vaf;
475 va_list args;
476
477 va_start(args, fmt);
478 vaf.fmt = fmt;
479 vaf.va = &args;
480
481 if (!rbd_dev)
482 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
483 else if (rbd_dev->disk)
484 printk(KERN_WARNING "%s: %s: %pV\n",
485 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
486 else if (rbd_dev->spec && rbd_dev->spec->image_name)
487 printk(KERN_WARNING "%s: image %s: %pV\n",
488 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
489 else if (rbd_dev->spec && rbd_dev->spec->image_id)
490 printk(KERN_WARNING "%s: id %s: %pV\n",
491 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
492 else /* punt */
493 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
494 RBD_DRV_NAME, rbd_dev, &vaf);
495 va_end(args);
496}
497
498#ifdef RBD_DEBUG
499#define rbd_assert(expr) \
500 if (unlikely(!(expr))) { \
501 printk(KERN_ERR "\nAssertion failure in %s() " \
502 "at line %d:\n\n" \
503 "\trbd_assert(%s);\n\n", \
504 __func__, __LINE__, #expr); \
505 BUG(); \
506 }
507#else /* !RBD_DEBUG */
508# define rbd_assert(expr) ((void) 0)
509#endif /* !RBD_DEBUG */
510
511static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
512static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
513static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
514
515static int rbd_dev_refresh(struct rbd_device *rbd_dev);
516static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
517static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
518static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
519 u64 snap_id);
520static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
521 u8 *order, u64 *snap_size);
522static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
523 u64 *snap_features);
524static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
525
526static int rbd_open(struct block_device *bdev, fmode_t mode)
527{
528 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
529 bool removing = false;
530
531 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
532 return -EROFS;
533
534 spin_lock_irq(&rbd_dev->lock);
535 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
536 removing = true;
537 else
538 rbd_dev->open_count++;
539 spin_unlock_irq(&rbd_dev->lock);
540 if (removing)
541 return -ENOENT;
542
543 (void) get_device(&rbd_dev->dev);
544 set_device_ro(bdev, rbd_dev->mapping.read_only);
545
546 return 0;
547}
548
549static void rbd_release(struct gendisk *disk, fmode_t mode)
550{
551 struct rbd_device *rbd_dev = disk->private_data;
552 unsigned long open_count_before;
553
554 spin_lock_irq(&rbd_dev->lock);
555 open_count_before = rbd_dev->open_count--;
556 spin_unlock_irq(&rbd_dev->lock);
557 rbd_assert(open_count_before > 0);
558
559 put_device(&rbd_dev->dev);
560}
561
562static const struct block_device_operations rbd_bd_ops = {
563 .owner = THIS_MODULE,
564 .open = rbd_open,
565 .release = rbd_release,
566};
567
568/*
569 * Initialize an rbd client instance. Success or not, this function
570 * consumes ceph_opts. Caller holds client_mutex.
571 */
572static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
573{
574 struct rbd_client *rbdc;
575 int ret = -ENOMEM;
576
577 dout("%s:\n", __func__);
578 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
579 if (!rbdc)
580 goto out_opt;
581
582 kref_init(&rbdc->kref);
583 INIT_LIST_HEAD(&rbdc->node);
584
585 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
586 if (IS_ERR(rbdc->client))
587 goto out_rbdc;
588 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
589
590 ret = ceph_open_session(rbdc->client);
591 if (ret < 0)
592 goto out_client;
593
594 spin_lock(&rbd_client_list_lock);
595 list_add_tail(&rbdc->node, &rbd_client_list);
596 spin_unlock(&rbd_client_list_lock);
597
598 dout("%s: rbdc %p\n", __func__, rbdc);
599
600 return rbdc;
601out_client:
602 ceph_destroy_client(rbdc->client);
603out_rbdc:
604 kfree(rbdc);
605out_opt:
606 if (ceph_opts)
607 ceph_destroy_options(ceph_opts);
608 dout("%s: error %d\n", __func__, ret);
609
610 return ERR_PTR(ret);
611}
612
613static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
614{
615 kref_get(&rbdc->kref);
616
617 return rbdc;
618}
619
620/*
621 * Find a ceph client with specific addr and configuration. If
622 * found, bump its reference count.
623 */
624static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
625{
626 struct rbd_client *client_node;
627 bool found = false;
628
629 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
630 return NULL;
631
632 spin_lock(&rbd_client_list_lock);
633 list_for_each_entry(client_node, &rbd_client_list, node) {
634 if (!ceph_compare_options(ceph_opts, client_node->client)) {
635 __rbd_get_client(client_node);
636
637 found = true;
638 break;
639 }
640 }
641 spin_unlock(&rbd_client_list_lock);
642
643 return found ? client_node : NULL;
644}
645
646/*
647 * mount options
648 */
649enum {
650 Opt_last_int,
651 /* int args above */
652 Opt_last_string,
653 /* string args above */
654 Opt_read_only,
655 Opt_read_write,
656 /* Boolean args above */
657 Opt_last_bool,
658};
659
660static match_table_t rbd_opts_tokens = {
661 /* int args above */
662 /* string args above */
663 {Opt_read_only, "read_only"},
664 {Opt_read_only, "ro"}, /* Alternate spelling */
665 {Opt_read_write, "read_write"},
666 {Opt_read_write, "rw"}, /* Alternate spelling */
667 /* Boolean args above */
668 {-1, NULL}
669};
670
671struct rbd_options {
672 bool read_only;
673};
674
675#define RBD_READ_ONLY_DEFAULT false
676
677static int parse_rbd_opts_token(char *c, void *private)
678{
679 struct rbd_options *rbd_opts = private;
680 substring_t argstr[MAX_OPT_ARGS];
681 int token, intval, ret;
682
683 token = match_token(c, rbd_opts_tokens, argstr);
684 if (token < 0)
685 return -EINVAL;
686
687 if (token < Opt_last_int) {
688 ret = match_int(&argstr[0], &intval);
689 if (ret < 0) {
690 pr_err("bad mount option arg (not int) "
691 "at '%s'\n", c);
692 return ret;
693 }
694 dout("got int token %d val %d\n", token, intval);
695 } else if (token > Opt_last_int && token < Opt_last_string) {
696 dout("got string token %d val %s\n", token,
697 argstr[0].from);
698 } else if (token > Opt_last_string && token < Opt_last_bool) {
699 dout("got Boolean token %d\n", token);
700 } else {
701 dout("got token %d\n", token);
702 }
703
704 switch (token) {
705 case Opt_read_only:
706 rbd_opts->read_only = true;
707 break;
708 case Opt_read_write:
709 rbd_opts->read_only = false;
710 break;
711 default:
712 rbd_assert(false);
713 break;
714 }
715 return 0;
716}
717
718/*
719 * Get a ceph client with specific addr and configuration, if one does
720 * not exist create it. Either way, ceph_opts is consumed by this
721 * function.
722 */
723static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
724{
725 struct rbd_client *rbdc;
726
727 mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
728 rbdc = rbd_client_find(ceph_opts);
729 if (rbdc) /* using an existing client */
730 ceph_destroy_options(ceph_opts);
731 else
732 rbdc = rbd_client_create(ceph_opts);
733 mutex_unlock(&client_mutex);
734
735 return rbdc;
736}
737
738/*
739 * Destroy ceph client
740 *
741 * Caller must hold rbd_client_list_lock.
742 */
743static void rbd_client_release(struct kref *kref)
744{
745 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
746
747 dout("%s: rbdc %p\n", __func__, rbdc);
748 spin_lock(&rbd_client_list_lock);
749 list_del(&rbdc->node);
750 spin_unlock(&rbd_client_list_lock);
751
752 ceph_destroy_client(rbdc->client);
753 kfree(rbdc);
754}
755
756/*
757 * Drop reference to ceph client node. If it's not referenced anymore, release
758 * it.
759 */
760static void rbd_put_client(struct rbd_client *rbdc)
761{
762 if (rbdc)
763 kref_put(&rbdc->kref, rbd_client_release);
764}
765
766static bool rbd_image_format_valid(u32 image_format)
767{
768 return image_format == 1 || image_format == 2;
769}
770
771static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
772{
773 size_t size;
774 u32 snap_count;
775
776 /* The header has to start with the magic rbd header text */
777 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
778 return false;
779
780 /* The bio layer requires at least sector-sized I/O */
781
782 if (ondisk->options.order < SECTOR_SHIFT)
783 return false;
784
785 /* If we use u64 in a few spots we may be able to loosen this */
786
787 if (ondisk->options.order > 8 * sizeof (int) - 1)
788 return false;
789
790 /*
791 * The size of a snapshot header has to fit in a size_t, and
792 * that limits the number of snapshots.
793 */
794 snap_count = le32_to_cpu(ondisk->snap_count);
795 size = SIZE_MAX - sizeof (struct ceph_snap_context);
796 if (snap_count > size / sizeof (__le64))
797 return false;
798
799 /*
800 * Not only that, but the size of the entire the snapshot
801 * header must also be representable in a size_t.
802 */
803 size -= snap_count * sizeof (__le64);
804 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
805 return false;
806
807 return true;
808}
809
810/*
811 * Fill an rbd image header with information from the given format 1
812 * on-disk header.
813 */
814static int rbd_header_from_disk(struct rbd_device *rbd_dev,
815 struct rbd_image_header_ondisk *ondisk)
816{
817 struct rbd_image_header *header = &rbd_dev->header;
818 bool first_time = header->object_prefix == NULL;
819 struct ceph_snap_context *snapc;
820 char *object_prefix = NULL;
821 char *snap_names = NULL;
822 u64 *snap_sizes = NULL;
823 u32 snap_count;
824 size_t size;
825 int ret = -ENOMEM;
826 u32 i;
827
828 /* Allocate this now to avoid having to handle failure below */
829
830 if (first_time) {
831 size_t len;
832
833 len = strnlen(ondisk->object_prefix,
834 sizeof (ondisk->object_prefix));
835 object_prefix = kmalloc(len + 1, GFP_KERNEL);
836 if (!object_prefix)
837 return -ENOMEM;
838 memcpy(object_prefix, ondisk->object_prefix, len);
839 object_prefix[len] = '\0';
840 }
841
842 /* Allocate the snapshot context and fill it in */
843
844 snap_count = le32_to_cpu(ondisk->snap_count);
845 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
846 if (!snapc)
847 goto out_err;
848 snapc->seq = le64_to_cpu(ondisk->snap_seq);
849 if (snap_count) {
850 struct rbd_image_snap_ondisk *snaps;
851 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
852
853 /* We'll keep a copy of the snapshot names... */
854
855 if (snap_names_len > (u64)SIZE_MAX)
856 goto out_2big;
857 snap_names = kmalloc(snap_names_len, GFP_KERNEL);
858 if (!snap_names)
859 goto out_err;
860
861 /* ...as well as the array of their sizes. */
862
863 size = snap_count * sizeof (*header->snap_sizes);
864 snap_sizes = kmalloc(size, GFP_KERNEL);
865 if (!snap_sizes)
866 goto out_err;
867
868 /*
869 * Copy the names, and fill in each snapshot's id
870 * and size.
871 *
872 * Note that rbd_dev_v1_header_info() guarantees the
873 * ondisk buffer we're working with has
874 * snap_names_len bytes beyond the end of the
875 * snapshot id array, this memcpy() is safe.
876 */
877 memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
878 snaps = ondisk->snaps;
879 for (i = 0; i < snap_count; i++) {
880 snapc->snaps[i] = le64_to_cpu(snaps[i].id);
881 snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
882 }
883 }
884
885 /* We won't fail any more, fill in the header */
886
887 if (first_time) {
888 header->object_prefix = object_prefix;
889 header->obj_order = ondisk->options.order;
890 header->crypt_type = ondisk->options.crypt_type;
891 header->comp_type = ondisk->options.comp_type;
892 /* The rest aren't used for format 1 images */
893 header->stripe_unit = 0;
894 header->stripe_count = 0;
895 header->features = 0;
896 } else {
897 ceph_put_snap_context(header->snapc);
898 kfree(header->snap_names);
899 kfree(header->snap_sizes);
900 }
901
902 /* The remaining fields always get updated (when we refresh) */
903
904 header->image_size = le64_to_cpu(ondisk->image_size);
905 header->snapc = snapc;
906 header->snap_names = snap_names;
907 header->snap_sizes = snap_sizes;
908
909 /* Make sure mapping size is consistent with header info */
910
911 if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
912 if (rbd_dev->mapping.size != header->image_size)
913 rbd_dev->mapping.size = header->image_size;
914
915 return 0;
916out_2big:
917 ret = -EIO;
918out_err:
919 kfree(snap_sizes);
920 kfree(snap_names);
921 ceph_put_snap_context(snapc);
922 kfree(object_prefix);
923
924 return ret;
925}
926
927static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
928{
929 const char *snap_name;
930
931 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
932
933 /* Skip over names until we find the one we are looking for */
934
935 snap_name = rbd_dev->header.snap_names;
936 while (which--)
937 snap_name += strlen(snap_name) + 1;
938
939 return kstrdup(snap_name, GFP_KERNEL);
940}
941
942/*
943 * Snapshot id comparison function for use with qsort()/bsearch().
944 * Note that result is for snapshots in *descending* order.
945 */
946static int snapid_compare_reverse(const void *s1, const void *s2)
947{
948 u64 snap_id1 = *(u64 *)s1;
949 u64 snap_id2 = *(u64 *)s2;
950
951 if (snap_id1 < snap_id2)
952 return 1;
953 return snap_id1 == snap_id2 ? 0 : -1;
954}
955
956/*
957 * Search a snapshot context to see if the given snapshot id is
958 * present.
959 *
960 * Returns the position of the snapshot id in the array if it's found,
961 * or BAD_SNAP_INDEX otherwise.
962 *
963 * Note: The snapshot array is in kept sorted (by the osd) in
964 * reverse order, highest snapshot id first.
965 */
966static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
967{
968 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
969 u64 *found;
970
971 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
972 sizeof (snap_id), snapid_compare_reverse);
973
974 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
975}
976
977static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
978 u64 snap_id)
979{
980 u32 which;
981 const char *snap_name;
982
983 which = rbd_dev_snap_index(rbd_dev, snap_id);
984 if (which == BAD_SNAP_INDEX)
985 return ERR_PTR(-ENOENT);
986
987 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
988 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
989}
990
991static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
992{
993 if (snap_id == CEPH_NOSNAP)
994 return RBD_SNAP_HEAD_NAME;
995
996 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
997 if (rbd_dev->image_format == 1)
998 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
999
1000 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1001}
1002
1003static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1004 u64 *snap_size)
1005{
1006 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1007 if (snap_id == CEPH_NOSNAP) {
1008 *snap_size = rbd_dev->header.image_size;
1009 } else if (rbd_dev->image_format == 1) {
1010 u32 which;
1011
1012 which = rbd_dev_snap_index(rbd_dev, snap_id);
1013 if (which == BAD_SNAP_INDEX)
1014 return -ENOENT;
1015
1016 *snap_size = rbd_dev->header.snap_sizes[which];
1017 } else {
1018 u64 size = 0;
1019 int ret;
1020
1021 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1022 if (ret)
1023 return ret;
1024
1025 *snap_size = size;
1026 }
1027 return 0;
1028}
1029
1030static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1031 u64 *snap_features)
1032{
1033 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1034 if (snap_id == CEPH_NOSNAP) {
1035 *snap_features = rbd_dev->header.features;
1036 } else if (rbd_dev->image_format == 1) {
1037 *snap_features = 0; /* No features for format 1 */
1038 } else {
1039 u64 features = 0;
1040 int ret;
1041
1042 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1043 if (ret)
1044 return ret;
1045
1046 *snap_features = features;
1047 }
1048 return 0;
1049}
1050
1051static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1052{
1053 u64 snap_id = rbd_dev->spec->snap_id;
1054 u64 size = 0;
1055 u64 features = 0;
1056 int ret;
1057
1058 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1059 if (ret)
1060 return ret;
1061 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1062 if (ret)
1063 return ret;
1064
1065 rbd_dev->mapping.size = size;
1066 rbd_dev->mapping.features = features;
1067
1068 return 0;
1069}
1070
1071static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1072{
1073 rbd_dev->mapping.size = 0;
1074 rbd_dev->mapping.features = 0;
1075}
1076
1077static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1078{
1079 char *name;
1080 u64 segment;
1081 int ret;
1082 char *name_format;
1083
1084 name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
1085 if (!name)
1086 return NULL;
1087 segment = offset >> rbd_dev->header.obj_order;
1088 name_format = "%s.%012llx";
1089 if (rbd_dev->image_format == 2)
1090 name_format = "%s.%016llx";
1091 ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
1092 rbd_dev->header.object_prefix, segment);
1093 if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
1094 pr_err("error formatting segment name for #%llu (%d)\n",
1095 segment, ret);
1096 kfree(name);
1097 name = NULL;
1098 }
1099
1100 return name;
1101}
1102
1103static void rbd_segment_name_free(const char *name)
1104{
1105 /* The explicit cast here is needed to drop the const qualifier */
1106
1107 kmem_cache_free(rbd_segment_name_cache, (void *)name);
1108}
1109
1110static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1111{
1112 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1113
1114 return offset & (segment_size - 1);
1115}
1116
1117static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1118 u64 offset, u64 length)
1119{
1120 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1121
1122 offset &= segment_size - 1;
1123
1124 rbd_assert(length <= U64_MAX - offset);
1125 if (offset + length > segment_size)
1126 length = segment_size - offset;
1127
1128 return length;
1129}
1130
1131/*
1132 * returns the size of an object in the image
1133 */
1134static u64 rbd_obj_bytes(struct rbd_image_header *header)
1135{
1136 return 1 << header->obj_order;
1137}
1138
1139/*
1140 * bio helpers
1141 */
1142
1143static void bio_chain_put(struct bio *chain)
1144{
1145 struct bio *tmp;
1146
1147 while (chain) {
1148 tmp = chain;
1149 chain = chain->bi_next;
1150 bio_put(tmp);
1151 }
1152}
1153
1154/*
1155 * zeros a bio chain, starting at specific offset
1156 */
1157static void zero_bio_chain(struct bio *chain, int start_ofs)
1158{
1159 struct bio_vec bv;
1160 struct bvec_iter iter;
1161 unsigned long flags;
1162 void *buf;
1163 int pos = 0;
1164
1165 while (chain) {
1166 bio_for_each_segment(bv, chain, iter) {
1167 if (pos + bv.bv_len > start_ofs) {
1168 int remainder = max(start_ofs - pos, 0);
1169 buf = bvec_kmap_irq(&bv, &flags);
1170 memset(buf + remainder, 0,
1171 bv.bv_len - remainder);
1172 flush_dcache_page(bv.bv_page);
1173 bvec_kunmap_irq(buf, &flags);
1174 }
1175 pos += bv.bv_len;
1176 }
1177
1178 chain = chain->bi_next;
1179 }
1180}
1181
1182/*
1183 * similar to zero_bio_chain(), zeros data defined by a page array,
1184 * starting at the given byte offset from the start of the array and
1185 * continuing up to the given end offset. The pages array is
1186 * assumed to be big enough to hold all bytes up to the end.
1187 */
1188static void zero_pages(struct page **pages, u64 offset, u64 end)
1189{
1190 struct page **page = &pages[offset >> PAGE_SHIFT];
1191
1192 rbd_assert(end > offset);
1193 rbd_assert(end - offset <= (u64)SIZE_MAX);
1194 while (offset < end) {
1195 size_t page_offset;
1196 size_t length;
1197 unsigned long flags;
1198 void *kaddr;
1199
1200 page_offset = offset & ~PAGE_MASK;
1201 length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1202 local_irq_save(flags);
1203 kaddr = kmap_atomic(*page);
1204 memset(kaddr + page_offset, 0, length);
1205 flush_dcache_page(*page);
1206 kunmap_atomic(kaddr);
1207 local_irq_restore(flags);
1208
1209 offset += length;
1210 page++;
1211 }
1212}
1213
1214/*
1215 * Clone a portion of a bio, starting at the given byte offset
1216 * and continuing for the number of bytes indicated.
1217 */
1218static struct bio *bio_clone_range(struct bio *bio_src,
1219 unsigned int offset,
1220 unsigned int len,
1221 gfp_t gfpmask)
1222{
1223 struct bio *bio;
1224
1225 bio = bio_clone(bio_src, gfpmask);
1226 if (!bio)
1227 return NULL; /* ENOMEM */
1228
1229 bio_advance(bio, offset);
1230 bio->bi_iter.bi_size = len;
1231
1232 return bio;
1233}
1234
1235/*
1236 * Clone a portion of a bio chain, starting at the given byte offset
1237 * into the first bio in the source chain and continuing for the
1238 * number of bytes indicated. The result is another bio chain of
1239 * exactly the given length, or a null pointer on error.
1240 *
1241 * The bio_src and offset parameters are both in-out. On entry they
1242 * refer to the first source bio and the offset into that bio where
1243 * the start of data to be cloned is located.
1244 *
1245 * On return, bio_src is updated to refer to the bio in the source
1246 * chain that contains first un-cloned byte, and *offset will
1247 * contain the offset of that byte within that bio.
1248 */
1249static struct bio *bio_chain_clone_range(struct bio **bio_src,
1250 unsigned int *offset,
1251 unsigned int len,
1252 gfp_t gfpmask)
1253{
1254 struct bio *bi = *bio_src;
1255 unsigned int off = *offset;
1256 struct bio *chain = NULL;
1257 struct bio **end;
1258
1259 /* Build up a chain of clone bios up to the limit */
1260
1261 if (!bi || off >= bi->bi_iter.bi_size || !len)
1262 return NULL; /* Nothing to clone */
1263
1264 end = &chain;
1265 while (len) {
1266 unsigned int bi_size;
1267 struct bio *bio;
1268
1269 if (!bi) {
1270 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1271 goto out_err; /* EINVAL; ran out of bio's */
1272 }
1273 bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1274 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1275 if (!bio)
1276 goto out_err; /* ENOMEM */
1277
1278 *end = bio;
1279 end = &bio->bi_next;
1280
1281 off += bi_size;
1282 if (off == bi->bi_iter.bi_size) {
1283 bi = bi->bi_next;
1284 off = 0;
1285 }
1286 len -= bi_size;
1287 }
1288 *bio_src = bi;
1289 *offset = off;
1290
1291 return chain;
1292out_err:
1293 bio_chain_put(chain);
1294
1295 return NULL;
1296}
1297
1298/*
1299 * The default/initial value for all object request flags is 0. For
1300 * each flag, once its value is set to 1 it is never reset to 0
1301 * again.
1302 */
1303static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1304{
1305 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
1306 struct rbd_device *rbd_dev;
1307
1308 rbd_dev = obj_request->img_request->rbd_dev;
1309 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1310 obj_request);
1311 }
1312}
1313
1314static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1315{
1316 smp_mb();
1317 return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1318}
1319
1320static void obj_request_done_set(struct rbd_obj_request *obj_request)
1321{
1322 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1323 struct rbd_device *rbd_dev = NULL;
1324
1325 if (obj_request_img_data_test(obj_request))
1326 rbd_dev = obj_request->img_request->rbd_dev;
1327 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1328 obj_request);
1329 }
1330}
1331
1332static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1333{
1334 smp_mb();
1335 return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1336}
1337
1338/*
1339 * This sets the KNOWN flag after (possibly) setting the EXISTS
1340 * flag. The latter is set based on the "exists" value provided.
1341 *
1342 * Note that for our purposes once an object exists it never goes
1343 * away again. It's possible that the response from two existence
1344 * checks are separated by the creation of the target object, and
1345 * the first ("doesn't exist") response arrives *after* the second
1346 * ("does exist"). In that case we ignore the second one.
1347 */
1348static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1349 bool exists)
1350{
1351 if (exists)
1352 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1353 set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1354 smp_mb();
1355}
1356
1357static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1358{
1359 smp_mb();
1360 return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1361}
1362
1363static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1364{
1365 smp_mb();
1366 return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1367}
1368
1369static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1370{
1371 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1372 atomic_read(&obj_request->kref.refcount));
1373 kref_get(&obj_request->kref);
1374}
1375
1376static void rbd_obj_request_destroy(struct kref *kref);
1377static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1378{
1379 rbd_assert(obj_request != NULL);
1380 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1381 atomic_read(&obj_request->kref.refcount));
1382 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1383}
1384
1385static bool img_request_child_test(struct rbd_img_request *img_request);
1386static void rbd_parent_request_destroy(struct kref *kref);
1387static void rbd_img_request_destroy(struct kref *kref);
1388static void rbd_img_request_put(struct rbd_img_request *img_request)
1389{
1390 rbd_assert(img_request != NULL);
1391 dout("%s: img %p (was %d)\n", __func__, img_request,
1392 atomic_read(&img_request->kref.refcount));
1393 if (img_request_child_test(img_request))
1394 kref_put(&img_request->kref, rbd_parent_request_destroy);
1395 else
1396 kref_put(&img_request->kref, rbd_img_request_destroy);
1397}
1398
1399static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1400 struct rbd_obj_request *obj_request)
1401{
1402 rbd_assert(obj_request->img_request == NULL);
1403
1404 /* Image request now owns object's original reference */
1405 obj_request->img_request = img_request;
1406 obj_request->which = img_request->obj_request_count;
1407 rbd_assert(!obj_request_img_data_test(obj_request));
1408 obj_request_img_data_set(obj_request);
1409 rbd_assert(obj_request->which != BAD_WHICH);
1410 img_request->obj_request_count++;
1411 list_add_tail(&obj_request->links, &img_request->obj_requests);
1412 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1413 obj_request->which);
1414}
1415
1416static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1417 struct rbd_obj_request *obj_request)
1418{
1419 rbd_assert(obj_request->which != BAD_WHICH);
1420
1421 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1422 obj_request->which);
1423 list_del(&obj_request->links);
1424 rbd_assert(img_request->obj_request_count > 0);
1425 img_request->obj_request_count--;
1426 rbd_assert(obj_request->which == img_request->obj_request_count);
1427 obj_request->which = BAD_WHICH;
1428 rbd_assert(obj_request_img_data_test(obj_request));
1429 rbd_assert(obj_request->img_request == img_request);
1430 obj_request->img_request = NULL;
1431 obj_request->callback = NULL;
1432 rbd_obj_request_put(obj_request);
1433}
1434
1435static bool obj_request_type_valid(enum obj_request_type type)
1436{
1437 switch (type) {
1438 case OBJ_REQUEST_NODATA:
1439 case OBJ_REQUEST_BIO:
1440 case OBJ_REQUEST_PAGES:
1441 return true;
1442 default:
1443 return false;
1444 }
1445}
1446
1447static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1448 struct rbd_obj_request *obj_request)
1449{
1450 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1451
1452 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1453}
1454
1455static void rbd_img_request_complete(struct rbd_img_request *img_request)
1456{
1457
1458 dout("%s: img %p\n", __func__, img_request);
1459
1460 /*
1461 * If no error occurred, compute the aggregate transfer
1462 * count for the image request. We could instead use
1463 * atomic64_cmpxchg() to update it as each object request
1464 * completes; not clear which way is better off hand.
1465 */
1466 if (!img_request->result) {
1467 struct rbd_obj_request *obj_request;
1468 u64 xferred = 0;
1469
1470 for_each_obj_request(img_request, obj_request)
1471 xferred += obj_request->xferred;
1472 img_request->xferred = xferred;
1473 }
1474
1475 if (img_request->callback)
1476 img_request->callback(img_request);
1477 else
1478 rbd_img_request_put(img_request);
1479}
1480
1481/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1482
1483static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1484{
1485 dout("%s: obj %p\n", __func__, obj_request);
1486
1487 return wait_for_completion_interruptible(&obj_request->completion);
1488}
1489
1490/*
1491 * The default/initial value for all image request flags is 0. Each
1492 * is conditionally set to 1 at image request initialization time
1493 * and currently never change thereafter.
1494 */
1495static void img_request_write_set(struct rbd_img_request *img_request)
1496{
1497 set_bit(IMG_REQ_WRITE, &img_request->flags);
1498 smp_mb();
1499}
1500
1501static bool img_request_write_test(struct rbd_img_request *img_request)
1502{
1503 smp_mb();
1504 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1505}
1506
1507static void img_request_child_set(struct rbd_img_request *img_request)
1508{
1509 set_bit(IMG_REQ_CHILD, &img_request->flags);
1510 smp_mb();
1511}
1512
1513static void img_request_child_clear(struct rbd_img_request *img_request)
1514{
1515 clear_bit(IMG_REQ_CHILD, &img_request->flags);
1516 smp_mb();
1517}
1518
1519static bool img_request_child_test(struct rbd_img_request *img_request)
1520{
1521 smp_mb();
1522 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1523}
1524
1525static void img_request_layered_set(struct rbd_img_request *img_request)
1526{
1527 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1528 smp_mb();
1529}
1530
1531static void img_request_layered_clear(struct rbd_img_request *img_request)
1532{
1533 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1534 smp_mb();
1535}
1536
1537static bool img_request_layered_test(struct rbd_img_request *img_request)
1538{
1539 smp_mb();
1540 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1541}
1542
1543static void
1544rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1545{
1546 u64 xferred = obj_request->xferred;
1547 u64 length = obj_request->length;
1548
1549 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1550 obj_request, obj_request->img_request, obj_request->result,
1551 xferred, length);
1552 /*
1553 * ENOENT means a hole in the image. We zero-fill the entire
1554 * length of the request. A short read also implies zero-fill
1555 * to the end of the request. An error requires the whole
1556 * length of the request to be reported finished with an error
1557 * to the block layer. In each case we update the xferred
1558 * count to indicate the whole request was satisfied.
1559 */
1560 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
1561 if (obj_request->result == -ENOENT) {
1562 if (obj_request->type == OBJ_REQUEST_BIO)
1563 zero_bio_chain(obj_request->bio_list, 0);
1564 else
1565 zero_pages(obj_request->pages, 0, length);
1566 obj_request->result = 0;
1567 } else if (xferred < length && !obj_request->result) {
1568 if (obj_request->type == OBJ_REQUEST_BIO)
1569 zero_bio_chain(obj_request->bio_list, xferred);
1570 else
1571 zero_pages(obj_request->pages, xferred, length);
1572 }
1573 obj_request->xferred = length;
1574 obj_request_done_set(obj_request);
1575}
1576
1577static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1578{
1579 dout("%s: obj %p cb %p\n", __func__, obj_request,
1580 obj_request->callback);
1581 if (obj_request->callback)
1582 obj_request->callback(obj_request);
1583 else
1584 complete_all(&obj_request->completion);
1585}
1586
1587static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
1588{
1589 dout("%s: obj %p\n", __func__, obj_request);
1590 obj_request_done_set(obj_request);
1591}
1592
1593static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1594{
1595 struct rbd_img_request *img_request = NULL;
1596 struct rbd_device *rbd_dev = NULL;
1597 bool layered = false;
1598
1599 if (obj_request_img_data_test(obj_request)) {
1600 img_request = obj_request->img_request;
1601 layered = img_request && img_request_layered_test(img_request);
1602 rbd_dev = img_request->rbd_dev;
1603 }
1604
1605 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1606 obj_request, img_request, obj_request->result,
1607 obj_request->xferred, obj_request->length);
1608 if (layered && obj_request->result == -ENOENT &&
1609 obj_request->img_offset < rbd_dev->parent_overlap)
1610 rbd_img_parent_read(obj_request);
1611 else if (img_request)
1612 rbd_img_obj_request_read_callback(obj_request);
1613 else
1614 obj_request_done_set(obj_request);
1615}
1616
1617static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1618{
1619 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1620 obj_request->result, obj_request->length);
1621 /*
1622 * There is no such thing as a successful short write. Set
1623 * it to our originally-requested length.
1624 */
1625 obj_request->xferred = obj_request->length;
1626 obj_request_done_set(obj_request);
1627}
1628
1629/*
1630 * For a simple stat call there's nothing to do. We'll do more if
1631 * this is part of a write sequence for a layered image.
1632 */
1633static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
1634{
1635 dout("%s: obj %p\n", __func__, obj_request);
1636 obj_request_done_set(obj_request);
1637}
1638
1639static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1640 struct ceph_msg *msg)
1641{
1642 struct rbd_obj_request *obj_request = osd_req->r_priv;
1643 u16 opcode;
1644
1645 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1646 rbd_assert(osd_req == obj_request->osd_req);
1647 if (obj_request_img_data_test(obj_request)) {
1648 rbd_assert(obj_request->img_request);
1649 rbd_assert(obj_request->which != BAD_WHICH);
1650 } else {
1651 rbd_assert(obj_request->which == BAD_WHICH);
1652 }
1653
1654 if (osd_req->r_result < 0)
1655 obj_request->result = osd_req->r_result;
1656
1657 rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1658
1659 /*
1660 * We support a 64-bit length, but ultimately it has to be
1661 * passed to blk_end_request(), which takes an unsigned int.
1662 */
1663 obj_request->xferred = osd_req->r_reply_op_len[0];
1664 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
1665
1666 opcode = osd_req->r_ops[0].op;
1667 switch (opcode) {
1668 case CEPH_OSD_OP_READ:
1669 rbd_osd_read_callback(obj_request);
1670 break;
1671 case CEPH_OSD_OP_SETALLOCHINT:
1672 rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
1673 /* fall through */
1674 case CEPH_OSD_OP_WRITE:
1675 rbd_osd_write_callback(obj_request);
1676 break;
1677 case CEPH_OSD_OP_STAT:
1678 rbd_osd_stat_callback(obj_request);
1679 break;
1680 case CEPH_OSD_OP_CALL:
1681 case CEPH_OSD_OP_NOTIFY_ACK:
1682 case CEPH_OSD_OP_WATCH:
1683 rbd_osd_trivial_callback(obj_request);
1684 break;
1685 default:
1686 rbd_warn(NULL, "%s: unsupported op %hu\n",
1687 obj_request->object_name, (unsigned short) opcode);
1688 break;
1689 }
1690
1691 if (obj_request_done_test(obj_request))
1692 rbd_obj_request_complete(obj_request);
1693}
1694
1695static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1696{
1697 struct rbd_img_request *img_request = obj_request->img_request;
1698 struct ceph_osd_request *osd_req = obj_request->osd_req;
1699 u64 snap_id;
1700
1701 rbd_assert(osd_req != NULL);
1702
1703 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
1704 ceph_osdc_build_request(osd_req, obj_request->offset,
1705 NULL, snap_id, NULL);
1706}
1707
1708static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1709{
1710 struct rbd_img_request *img_request = obj_request->img_request;
1711 struct ceph_osd_request *osd_req = obj_request->osd_req;
1712 struct ceph_snap_context *snapc;
1713 struct timespec mtime = CURRENT_TIME;
1714
1715 rbd_assert(osd_req != NULL);
1716
1717 snapc = img_request ? img_request->snapc : NULL;
1718 ceph_osdc_build_request(osd_req, obj_request->offset,
1719 snapc, CEPH_NOSNAP, &mtime);
1720}
1721
1722/*
1723 * Create an osd request. A read request has one osd op (read).
1724 * A write request has either one (watch) or two (hint+write) osd ops.
1725 * (All rbd data writes are prefixed with an allocation hint op, but
1726 * technically osd watch is a write request, hence this distinction.)
1727 */
1728static struct ceph_osd_request *rbd_osd_req_create(
1729 struct rbd_device *rbd_dev,
1730 bool write_request,
1731 unsigned int num_ops,
1732 struct rbd_obj_request *obj_request)
1733{
1734 struct ceph_snap_context *snapc = NULL;
1735 struct ceph_osd_client *osdc;
1736 struct ceph_osd_request *osd_req;
1737
1738 if (obj_request_img_data_test(obj_request)) {
1739 struct rbd_img_request *img_request = obj_request->img_request;
1740
1741 rbd_assert(write_request ==
1742 img_request_write_test(img_request));
1743 if (write_request)
1744 snapc = img_request->snapc;
1745 }
1746
1747 rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
1748
1749 /* Allocate and initialize the request, for the num_ops ops */
1750
1751 osdc = &rbd_dev->rbd_client->client->osdc;
1752 osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1753 GFP_ATOMIC);
1754 if (!osd_req)
1755 return NULL; /* ENOMEM */
1756
1757 if (write_request)
1758 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1759 else
1760 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1761
1762 osd_req->r_callback = rbd_osd_req_callback;
1763 osd_req->r_priv = obj_request;
1764
1765 osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
1766 ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1767
1768 return osd_req;
1769}
1770
1771/*
1772 * Create a copyup osd request based on the information in the
1773 * object request supplied. A copyup request has three osd ops,
1774 * a copyup method call, a hint op, and a write op.
1775 */
1776static struct ceph_osd_request *
1777rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1778{
1779 struct rbd_img_request *img_request;
1780 struct ceph_snap_context *snapc;
1781 struct rbd_device *rbd_dev;
1782 struct ceph_osd_client *osdc;
1783 struct ceph_osd_request *osd_req;
1784
1785 rbd_assert(obj_request_img_data_test(obj_request));
1786 img_request = obj_request->img_request;
1787 rbd_assert(img_request);
1788 rbd_assert(img_request_write_test(img_request));
1789
1790 /* Allocate and initialize the request, for the three ops */
1791
1792 snapc = img_request->snapc;
1793 rbd_dev = img_request->rbd_dev;
1794 osdc = &rbd_dev->rbd_client->client->osdc;
1795 osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
1796 if (!osd_req)
1797 return NULL; /* ENOMEM */
1798
1799 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1800 osd_req->r_callback = rbd_osd_req_callback;
1801 osd_req->r_priv = obj_request;
1802
1803 osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
1804 ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1805
1806 return osd_req;
1807}
1808
1809
1810static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1811{
1812 ceph_osdc_put_request(osd_req);
1813}
1814
1815/* object_name is assumed to be a non-null pointer and NUL-terminated */
1816
1817static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1818 u64 offset, u64 length,
1819 enum obj_request_type type)
1820{
1821 struct rbd_obj_request *obj_request;
1822 size_t size;
1823 char *name;
1824
1825 rbd_assert(obj_request_type_valid(type));
1826
1827 size = strlen(object_name) + 1;
1828 name = kmalloc(size, GFP_KERNEL);
1829 if (!name)
1830 return NULL;
1831
1832 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1833 if (!obj_request) {
1834 kfree(name);
1835 return NULL;
1836 }
1837
1838 obj_request->object_name = memcpy(name, object_name, size);
1839 obj_request->offset = offset;
1840 obj_request->length = length;
1841 obj_request->flags = 0;
1842 obj_request->which = BAD_WHICH;
1843 obj_request->type = type;
1844 INIT_LIST_HEAD(&obj_request->links);
1845 init_completion(&obj_request->completion);
1846 kref_init(&obj_request->kref);
1847
1848 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1849 offset, length, (int)type, obj_request);
1850
1851 return obj_request;
1852}
1853
1854static void rbd_obj_request_destroy(struct kref *kref)
1855{
1856 struct rbd_obj_request *obj_request;
1857
1858 obj_request = container_of(kref, struct rbd_obj_request, kref);
1859
1860 dout("%s: obj %p\n", __func__, obj_request);
1861
1862 rbd_assert(obj_request->img_request == NULL);
1863 rbd_assert(obj_request->which == BAD_WHICH);
1864
1865 if (obj_request->osd_req)
1866 rbd_osd_req_destroy(obj_request->osd_req);
1867
1868 rbd_assert(obj_request_type_valid(obj_request->type));
1869 switch (obj_request->type) {
1870 case OBJ_REQUEST_NODATA:
1871 break; /* Nothing to do */
1872 case OBJ_REQUEST_BIO:
1873 if (obj_request->bio_list)
1874 bio_chain_put(obj_request->bio_list);
1875 break;
1876 case OBJ_REQUEST_PAGES:
1877 if (obj_request->pages)
1878 ceph_release_page_vector(obj_request->pages,
1879 obj_request->page_count);
1880 break;
1881 }
1882
1883 kfree(obj_request->object_name);
1884 obj_request->object_name = NULL;
1885 kmem_cache_free(rbd_obj_request_cache, obj_request);
1886}
1887
1888/* It's OK to call this for a device with no parent */
1889
1890static void rbd_spec_put(struct rbd_spec *spec);
1891static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1892{
1893 rbd_dev_remove_parent(rbd_dev);
1894 rbd_spec_put(rbd_dev->parent_spec);
1895 rbd_dev->parent_spec = NULL;
1896 rbd_dev->parent_overlap = 0;
1897}
1898
1899/*
1900 * Parent image reference counting is used to determine when an
1901 * image's parent fields can be safely torn down--after there are no
1902 * more in-flight requests to the parent image. When the last
1903 * reference is dropped, cleaning them up is safe.
1904 */
1905static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1906{
1907 int counter;
1908
1909 if (!rbd_dev->parent_spec)
1910 return;
1911
1912 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1913 if (counter > 0)
1914 return;
1915
1916 /* Last reference; clean up parent data structures */
1917
1918 if (!counter)
1919 rbd_dev_unparent(rbd_dev);
1920 else
1921 rbd_warn(rbd_dev, "parent reference underflow\n");
1922}
1923
1924/*
1925 * If an image has a non-zero parent overlap, get a reference to its
1926 * parent.
1927 *
1928 * We must get the reference before checking for the overlap to
1929 * coordinate properly with zeroing the parent overlap in
1930 * rbd_dev_v2_parent_info() when an image gets flattened. We
1931 * drop it again if there is no overlap.
1932 *
1933 * Returns true if the rbd device has a parent with a non-zero
1934 * overlap and a reference for it was successfully taken, or
1935 * false otherwise.
1936 */
1937static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1938{
1939 int counter;
1940
1941 if (!rbd_dev->parent_spec)
1942 return false;
1943
1944 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1945 if (counter > 0 && rbd_dev->parent_overlap)
1946 return true;
1947
1948 /* Image was flattened, but parent is not yet torn down */
1949
1950 if (counter < 0)
1951 rbd_warn(rbd_dev, "parent reference overflow\n");
1952
1953 return false;
1954}
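/*
 * The get/put pair follows the image request life cycle:
 * rbd_img_request_create() takes the parent reference when it marks
 * a request layered, and rbd_img_request_destroy() drops it.
 */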
1955
1956/*
1957 * Caller is responsible for filling in the list of object requests
1958 * that comprises the image request, and the Linux request pointer
1959 * (if there is one).
1960 */
1961static struct rbd_img_request *rbd_img_request_create(
1962 struct rbd_device *rbd_dev,
1963 u64 offset, u64 length,
1964 bool write_request)
1965{
1966 struct rbd_img_request *img_request;
1967
1968 img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1969 if (!img_request)
1970 return NULL;
1971
1972 if (write_request) {
1973 down_read(&rbd_dev->header_rwsem);
1974 ceph_get_snap_context(rbd_dev->header.snapc);
1975 up_read(&rbd_dev->header_rwsem);
1976 }
1977
1978 img_request->rq = NULL;
1979 img_request->rbd_dev = rbd_dev;
1980 img_request->offset = offset;
1981 img_request->length = length;
1982 img_request->flags = 0;
1983 if (write_request) {
1984 img_request_write_set(img_request);
1985 img_request->snapc = rbd_dev->header.snapc;
1986 } else {
1987 img_request->snap_id = rbd_dev->spec->snap_id;
1988 }
1989 if (rbd_dev_parent_get(rbd_dev))
1990 img_request_layered_set(img_request);
1991 spin_lock_init(&img_request->completion_lock);
1992 img_request->next_completion = 0;
1993 img_request->callback = NULL;
1994 img_request->result = 0;
1995 img_request->obj_request_count = 0;
1996 INIT_LIST_HEAD(&img_request->obj_requests);
1997 kref_init(&img_request->kref);
1998
1999 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
2000 write_request ? "write" : "read", offset, length,
2001 img_request);
2002
2003 return img_request;
2004}
2005
2006static void rbd_img_request_destroy(struct kref *kref)
2007{
2008 struct rbd_img_request *img_request;
2009 struct rbd_obj_request *obj_request;
2010 struct rbd_obj_request *next_obj_request;
2011
2012 img_request = container_of(kref, struct rbd_img_request, kref);
2013
2014 dout("%s: img %p\n", __func__, img_request);
2015
2016 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2017 rbd_img_obj_request_del(img_request, obj_request);
2018 rbd_assert(img_request->obj_request_count == 0);
2019
2020 if (img_request_layered_test(img_request)) {
2021 img_request_layered_clear(img_request);
2022 rbd_dev_parent_put(img_request->rbd_dev);
2023 }
2024
2025 if (img_request_write_test(img_request))
2026 ceph_put_snap_context(img_request->snapc);
2027
2028 kmem_cache_free(rbd_img_request_cache, img_request);
2029}
2030
2031static struct rbd_img_request *rbd_parent_request_create(
2032 struct rbd_obj_request *obj_request,
2033 u64 img_offset, u64 length)
2034{
2035 struct rbd_img_request *parent_request;
2036 struct rbd_device *rbd_dev;
2037
2038 rbd_assert(obj_request->img_request);
2039 rbd_dev = obj_request->img_request->rbd_dev;
2040
2041 parent_request = rbd_img_request_create(rbd_dev->parent,
2042 img_offset, length, false);
2043 if (!parent_request)
2044 return NULL;
2045
2046 img_request_child_set(parent_request);
2047 rbd_obj_request_get(obj_request);
2048 parent_request->obj_request = obj_request;
2049
2050 return parent_request;
2051}
2052
2053static void rbd_parent_request_destroy(struct kref *kref)
2054{
2055 struct rbd_img_request *parent_request;
2056 struct rbd_obj_request *orig_request;
2057
2058 parent_request = container_of(kref, struct rbd_img_request, kref);
2059 orig_request = parent_request->obj_request;
2060
2061 parent_request->obj_request = NULL;
2062 rbd_obj_request_put(orig_request);
2063 img_request_child_clear(parent_request);
2064
2065 rbd_img_request_destroy(kref);
2066}
2067
2068static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2069{
2070 struct rbd_img_request *img_request;
2071 unsigned int xferred;
2072 int result;
2073 bool more;
2074
2075 rbd_assert(obj_request_img_data_test(obj_request));
2076 img_request = obj_request->img_request;
2077
2078 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
2079 xferred = (unsigned int)obj_request->xferred;
2080 result = obj_request->result;
2081 if (result) {
2082 struct rbd_device *rbd_dev = img_request->rbd_dev;
2083
2084 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
2085 img_request_write_test(img_request) ? "write" : "read",
2086 obj_request->length, obj_request->img_offset,
2087 obj_request->offset);
2088 rbd_warn(rbd_dev, " result %d xferred %x\n",
2089 result, xferred);
2090 if (!img_request->result)
2091 img_request->result = result;
2092 }
2093
2094 /* Image object requests don't own their page array */
2095
2096 if (obj_request->type == OBJ_REQUEST_PAGES) {
2097 obj_request->pages = NULL;
2098 obj_request->page_count = 0;
2099 }
2100
2101 if (img_request_child_test(img_request)) {
2102 rbd_assert(img_request->obj_request != NULL);
2103 more = obj_request->which < img_request->obj_request_count - 1;
2104 } else {
2105 rbd_assert(img_request->rq != NULL);
2106 more = blk_end_request(img_request->rq, result, xferred);
2107 }
2108
2109 return more;
2110}
2111
2112static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
2113{
2114 struct rbd_img_request *img_request;
2115 u32 which = obj_request->which;
2116 bool more = true;
2117
2118 rbd_assert(obj_request_img_data_test(obj_request));
2119 img_request = obj_request->img_request;
2120
2121 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
2122 rbd_assert(img_request != NULL);
2123 rbd_assert(img_request->obj_request_count > 0);
2124 rbd_assert(which != BAD_WHICH);
2125 rbd_assert(which < img_request->obj_request_count);
2126
2127 spin_lock_irq(&img_request->completion_lock);
2128 if (which != img_request->next_completion)
2129 goto out;
2130
2131 for_each_obj_request_from(img_request, obj_request) {
2132 rbd_assert(more);
2133 rbd_assert(which < img_request->obj_request_count);
2134
2135 if (!obj_request_done_test(obj_request))
2136 break;
2137 more = rbd_img_obj_end_request(obj_request);
2138 which++;
2139 }
2140
2141 rbd_assert(more ^ (which == img_request->obj_request_count));
2142 img_request->next_completion = which;
2143out:
2144 spin_unlock_irq(&img_request->completion_lock);
2145
2146 if (!more)
2147 rbd_img_request_complete(img_request);
2148}
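/*
 * Example of the in-order completion above: with three object
 * requests, if request 1 finishes before request 0, its callback
 * sees which != next_completion and simply returns; when request 0
 * later completes, the loop above ends both 0 and 1, in order.
 */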
2149
2150/*
2151 * Split up an image request into one or more object requests, each
2152 * to a different object. The "type" parameter indicates whether
2153 * "data_desc" is the pointer to the head of a list of bio
2154 * structures, or the base of a page array. In either case this
2155 * function assumes data_desc describes memory sufficient to hold
2156 * all data described by the image request.
2157 */
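/*
 * For example, with 4 MiB (order 22) objects -- the common default --
 * a 6 MiB request starting at image offset 7 MiB becomes three object
 * requests: 1 MiB at offset 3 MiB of one object, all 4 MiB of the
 * next, and the first 1 MiB of the one after that.
 */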
2158static int rbd_img_request_fill(struct rbd_img_request *img_request,
2159 enum obj_request_type type,
2160 void *data_desc)
2161{
2162 struct rbd_device *rbd_dev = img_request->rbd_dev;
2163 struct rbd_obj_request *obj_request = NULL;
2164 struct rbd_obj_request *next_obj_request;
2165 bool write_request = img_request_write_test(img_request);
2166 struct bio *bio_list = NULL;
2167 unsigned int bio_offset = 0;
2168 struct page **pages = NULL;
2169 u64 img_offset;
2170 u64 resid;
2171 u16 opcode;
2172
2173 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2174 (int)type, data_desc);
2175
2176 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
2177 img_offset = img_request->offset;
2178 resid = img_request->length;
2179 rbd_assert(resid > 0);
2180
2181 if (type == OBJ_REQUEST_BIO) {
2182 bio_list = data_desc;
2183 rbd_assert(img_offset ==
2184 bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
2185 } else {
2186 rbd_assert(type == OBJ_REQUEST_PAGES);
2187 pages = data_desc;
2188 }
2189
2190 while (resid) {
2191 struct ceph_osd_request *osd_req;
2192 const char *object_name;
2193 u64 offset;
2194 u64 length;
2195 unsigned int which = 0;
2196
2197 object_name = rbd_segment_name(rbd_dev, img_offset);
2198 if (!object_name)
2199 goto out_unwind;
2200 offset = rbd_segment_offset(rbd_dev, img_offset);
2201 length = rbd_segment_length(rbd_dev, img_offset, resid);
2202 obj_request = rbd_obj_request_create(object_name,
2203 offset, length, type);
2204 /* object request has its own copy of the object name */
2205 rbd_segment_name_free(object_name);
2206 if (!obj_request)
2207 goto out_unwind;
2208
2209 /*
2210 * set obj_request->img_request before creating the
2211 * osd_request so that it gets the right snapc
2212 */
2213 rbd_img_obj_request_add(img_request, obj_request);
2214
2215 if (type == OBJ_REQUEST_BIO) {
2216 unsigned int clone_size;
2217
2218 rbd_assert(length <= (u64)UINT_MAX);
2219 clone_size = (unsigned int)length;
2220 obj_request->bio_list =
2221 bio_chain_clone_range(&bio_list,
2222 &bio_offset,
2223 clone_size,
2224 GFP_ATOMIC);
2225 if (!obj_request->bio_list)
2226 goto out_unwind;
2227 } else {
2228 unsigned int page_count;
2229
2230 obj_request->pages = pages;
2231 page_count = (u32)calc_pages_for(offset, length);
2232 obj_request->page_count = page_count;
2233 if ((offset + length) & ~PAGE_MASK)
2234 page_count--; /* more on last page */
2235 pages += page_count;
2236 }
2237
2238 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2239 (write_request ? 2 : 1),
2240 obj_request);
2241 if (!osd_req)
2242 goto out_unwind;
2243 obj_request->osd_req = osd_req;
2244 obj_request->callback = rbd_img_obj_callback;
2245
2246 if (write_request) {
2247 osd_req_op_alloc_hint_init(osd_req, which,
2248 rbd_obj_bytes(&rbd_dev->header),
2249 rbd_obj_bytes(&rbd_dev->header));
2250 which++;
2251 }
2252
2253 osd_req_op_extent_init(osd_req, which, opcode, offset, length,
2254 0, 0);
2255 if (type == OBJ_REQUEST_BIO)
2256 osd_req_op_extent_osd_data_bio(osd_req, which,
2257 obj_request->bio_list, length);
2258 else
2259 osd_req_op_extent_osd_data_pages(osd_req, which,
2260 obj_request->pages, length,
2261 offset & ~PAGE_MASK, false, false);
2262
2263 if (write_request)
2264 rbd_osd_req_format_write(obj_request);
2265 else
2266 rbd_osd_req_format_read(obj_request);
2267
2268 obj_request->img_offset = img_offset;
2269
2270 img_offset += length;
2271 resid -= length;
2272 }
2273
2274 return 0;
2275
2276out_unwind:
2277 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2278 rbd_img_obj_request_del(img_request, obj_request);
2279
2280 return -ENOMEM;
2281}
2282
2283static void
2284rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2285{
2286 struct rbd_img_request *img_request;
2287 struct rbd_device *rbd_dev;
2288 struct page **pages;
2289 u32 page_count;
2290
2291 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2292 rbd_assert(obj_request_img_data_test(obj_request));
2293 img_request = obj_request->img_request;
2294 rbd_assert(img_request);
2295
2296 rbd_dev = img_request->rbd_dev;
2297 rbd_assert(rbd_dev);
2298
2299 pages = obj_request->copyup_pages;
2300 rbd_assert(pages != NULL);
2301 obj_request->copyup_pages = NULL;
2302 page_count = obj_request->copyup_page_count;
2303 rbd_assert(page_count);
2304 obj_request->copyup_page_count = 0;
2305 ceph_release_page_vector(pages, page_count);
2306
2307 /*
2308 * We want the transfer count to reflect the size of the
2309 * original write request. There is no such thing as a
2310 * successful short write, so if the request was successful
2311 * we can just set it to the originally-requested length.
2312 */
2313 if (!obj_request->result)
2314 obj_request->xferred = obj_request->length;
2315
2316 /* Finish up with the normal image object callback */
2317
2318 rbd_img_obj_callback(obj_request);
2319}
2320
2321static void
2322rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2323{
2324 struct rbd_obj_request *orig_request;
2325 struct ceph_osd_request *osd_req;
2326 struct ceph_osd_client *osdc;
2327 struct rbd_device *rbd_dev;
2328 struct page **pages;
2329 u32 page_count;
2330 int img_result;
2331 u64 parent_length;
2332 u64 offset;
2333 u64 length;
2334
2335 rbd_assert(img_request_child_test(img_request));
2336
2337 /* First get what we need from the image request */
2338
2339 pages = img_request->copyup_pages;
2340 rbd_assert(pages != NULL);
2341 img_request->copyup_pages = NULL;
2342 page_count = img_request->copyup_page_count;
2343 rbd_assert(page_count);
2344 img_request->copyup_page_count = 0;
2345
2346 orig_request = img_request->obj_request;
2347 rbd_assert(orig_request != NULL);
2348 rbd_assert(obj_request_type_valid(orig_request->type));
2349 img_result = img_request->result;
2350 parent_length = img_request->length;
2351 rbd_assert(parent_length == img_request->xferred);
2352 rbd_img_request_put(img_request);
2353
2354 rbd_assert(orig_request->img_request);
2355 rbd_dev = orig_request->img_request->rbd_dev;
2356 rbd_assert(rbd_dev);
2357
2358 /*
2359 * If the overlap has become 0 (most likely because the
2360 * image has been flattened) we need to free the pages
2361 * and re-submit the original write request.
2362 */
2363 if (!rbd_dev->parent_overlap) {
2364 struct ceph_osd_client *osdc;
2365
2366 ceph_release_page_vector(pages, page_count);
2367 osdc = &rbd_dev->rbd_client->client->osdc;
2368 img_result = rbd_obj_request_submit(osdc, orig_request);
2369 if (!img_result)
2370 return;
2371 }
2372
2373 if (img_result)
2374 goto out_err;
2375
2376 /*
2377 * The original osd request is of no use to us any more.
2378 * We need a new one that can hold the three ops in a copyup
2379 * request. Allocate the new copyup osd request for the
2380 * original request, and release the old one.
2381 */
2382 img_result = -ENOMEM;
2383 osd_req = rbd_osd_req_create_copyup(orig_request);
2384 if (!osd_req)
2385 goto out_err;
2386 rbd_osd_req_destroy(orig_request->osd_req);
2387 orig_request->osd_req = osd_req;
2388 orig_request->copyup_pages = pages;
2389 orig_request->copyup_page_count = page_count;
2390
2391 /* Initialize the copyup op */
2392
2393 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2394 osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
2395 false, false);
2396
2397 /* Then the hint op */
2398
2399 osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
2400 rbd_obj_bytes(&rbd_dev->header));
2401
2402 /* And the original write request op */
2403
2404 offset = orig_request->offset;
2405 length = orig_request->length;
2406 osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
2407 offset, length, 0, 0);
2408 if (orig_request->type == OBJ_REQUEST_BIO)
2409 osd_req_op_extent_osd_data_bio(osd_req, 2,
2410 orig_request->bio_list, length);
2411 else
2412 osd_req_op_extent_osd_data_pages(osd_req, 2,
2413 orig_request->pages, length,
2414 offset & ~PAGE_MASK, false, false);
2415
2416 rbd_osd_req_format_write(orig_request);
2417
2418 /* All set, send it off. */
2419
2420 orig_request->callback = rbd_img_obj_copyup_callback;
2421 osdc = &rbd_dev->rbd_client->client->osdc;
2422 img_result = rbd_obj_request_submit(osdc, orig_request);
2423 if (!img_result)
2424 return;
2425out_err:
2426 /* Record the error code and complete the request */
2427
2428 orig_request->result = img_result;
2429 orig_request->xferred = 0;
2430 obj_request_done_set(orig_request);
2431 rbd_obj_request_complete(orig_request);
2432}
2433
2434/*
2435 * Read from the parent image the range of data that covers the
2436 * entire target of the given object request. This is used for
2437 * satisfying a layered image write request when the target of an
2438 * object request from the image request does not exist.
2439 *
2440 * A page array big enough to hold the returned data is allocated
2441 * and supplied to rbd_img_request_fill() as the "data descriptor."
2442 * When the read completes, this page array will be transferred to
2443 * the original object request for the copyup operation.
2444 *
2445 * If an error occurs, record it as the result of the original
2446 * object request and mark it done so it gets completed.
2447 */
2448static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2449{
2450 struct rbd_img_request *img_request = NULL;
2451 struct rbd_img_request *parent_request = NULL;
2452 struct rbd_device *rbd_dev;
2453 u64 img_offset;
2454 u64 length;
2455 struct page **pages = NULL;
2456 u32 page_count;
2457 int result;
2458
2459 rbd_assert(obj_request_img_data_test(obj_request));
2460 rbd_assert(obj_request_type_valid(obj_request->type));
2461
2462 img_request = obj_request->img_request;
2463 rbd_assert(img_request != NULL);
2464 rbd_dev = img_request->rbd_dev;
2465 rbd_assert(rbd_dev->parent != NULL);
2466
2467 /*
2468 * Determine the byte range covered by the object in the
2469 * child image to which the original request was to be sent.
2470 */
2471 img_offset = obj_request->img_offset - obj_request->offset;
2472 length = (u64)1 << rbd_dev->header.obj_order;
2473
2474 /*
2475 * There is no defined parent data beyond the parent
2476 * overlap, so limit what we read at that boundary if
2477 * necessary.
2478 */
2479 if (img_offset + length > rbd_dev->parent_overlap) {
2480 rbd_assert(img_offset < rbd_dev->parent_overlap);
2481 length = rbd_dev->parent_overlap - img_offset;
2482 }
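	/*
	 * For example: with 4 MiB objects, a target object covering
	 * [8 MiB, 12 MiB) of the child image and a parent overlap of
	 * 10 MiB means only the first 2 MiB are read from the parent.
	 */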
2483
2484 /*
2485 * Allocate a page array big enough to receive the data read
2486 * from the parent.
2487 */
2488 page_count = (u32)calc_pages_for(0, length);
2489 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2490 if (IS_ERR(pages)) {
2491 result = PTR_ERR(pages);
2492 pages = NULL;
2493 goto out_err;
2494 }
2495
2496 result = -ENOMEM;
2497 parent_request = rbd_parent_request_create(obj_request,
2498 img_offset, length);
2499 if (!parent_request)
2500 goto out_err;
2501
2502 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2503 if (result)
2504 goto out_err;
2505 parent_request->copyup_pages = pages;
2506 parent_request->copyup_page_count = page_count;
2507
2508 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2509 result = rbd_img_request_submit(parent_request);
2510 if (!result)
2511 return 0;
2512
2513 parent_request->copyup_pages = NULL;
2514 parent_request->copyup_page_count = 0;
2515 parent_request->obj_request = NULL;
2516 rbd_obj_request_put(obj_request);
2517out_err:
2518 if (pages)
2519 ceph_release_page_vector(pages, page_count);
2520 if (parent_request)
2521 rbd_img_request_put(parent_request);
2522 obj_request->result = result;
2523 obj_request->xferred = 0;
2524 obj_request_done_set(obj_request);
2525
2526 return result;
2527}
2528
2529static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2530{
2531 struct rbd_obj_request *orig_request;
2532 struct rbd_device *rbd_dev;
2533 int result;
2534
2535 rbd_assert(!obj_request_img_data_test(obj_request));
2536
2537 /*
2538 * All we need from the object request is the original
2539 * request and the result of the STAT op. Grab those, then
2540 * we're done with the request.
2541 */
2542 orig_request = obj_request->obj_request;
2543 obj_request->obj_request = NULL;
2544 rbd_obj_request_put(orig_request);
2545 rbd_assert(orig_request);
2546 rbd_assert(orig_request->img_request);
2547
2548 result = obj_request->result;
2549 obj_request->result = 0;
2550
2551 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2552 obj_request, orig_request, result,
2553 obj_request->xferred, obj_request->length);
2554 rbd_obj_request_put(obj_request);
2555
2556 /*
2557 * If the overlap has become 0 (most likely because the
2558 * image has been flattened) we need to re-submit the
2559 * original write request.
2560 */
2561 rbd_dev = orig_request->img_request->rbd_dev;
2562 if (!rbd_dev->parent_overlap) {
2563 struct ceph_osd_client *osdc;
2564
2565 osdc = &rbd_dev->rbd_client->client->osdc;
2566 result = rbd_obj_request_submit(osdc, orig_request);
2567 if (!result)
2568 return;
2569 }
2570
2571 /*
2572 * Our only purpose here is to determine whether the object
2573 * exists, and we don't want to treat the non-existence as
2574 * an error. If something else comes back, transfer the
2575 * error to the original request and complete it now.
2576 */
2577 if (!result) {
2578 obj_request_existence_set(orig_request, true);
2579 } else if (result == -ENOENT) {
2580 obj_request_existence_set(orig_request, false);
2581 } else if (result) {
2582 orig_request->result = result;
2583 goto out;
2584 }
2585
2586 /*
2587 * Resubmit the original request now that we have recorded
2588 * whether the target object exists.
2589 */
2590 orig_request->result = rbd_img_obj_request_submit(orig_request);
2591out:
2592 if (orig_request->result)
2593 rbd_obj_request_complete(orig_request);
2594}
2595
2596static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2597{
2598 struct rbd_obj_request *stat_request;
2599 struct rbd_device *rbd_dev;
2600 struct ceph_osd_client *osdc;
2601 struct page **pages = NULL;
2602 u32 page_count;
2603 size_t size;
2604 int ret;
2605
2606 /*
2607 * The response data for a STAT call consists of:
2608 * le64 length;
2609 * struct {
2610 * le32 tv_sec;
2611 * le32 tv_nsec;
2612 * } mtime;
2613 */
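	/* 8 + 4 + 4 = 16 bytes, so a single page is more than enough */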
2614 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2615 page_count = (u32)calc_pages_for(0, size);
2616 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2617 if (IS_ERR(pages))
2618 return PTR_ERR(pages);
2619
2620 ret = -ENOMEM;
2621 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2622 OBJ_REQUEST_PAGES);
2623 if (!stat_request)
2624 goto out;
2625
2626 rbd_obj_request_get(obj_request);
2627 stat_request->obj_request = obj_request;
2628 stat_request->pages = pages;
2629 stat_request->page_count = page_count;
2630
2631 rbd_assert(obj_request->img_request);
2632 rbd_dev = obj_request->img_request->rbd_dev;
2633 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2634 stat_request);
2635 if (!stat_request->osd_req)
2636 goto out;
2637 stat_request->callback = rbd_img_obj_exists_callback;
2638
2639 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2640 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2641 false, false);
2642 rbd_osd_req_format_read(stat_request);
2643
2644 osdc = &rbd_dev->rbd_client->client->osdc;
2645 ret = rbd_obj_request_submit(osdc, stat_request);
2646out:
2647 if (ret)
2648 rbd_obj_request_put(obj_request);
2649
2650 return ret;
2651}
2652
2653static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2654{
2655 struct rbd_img_request *img_request;
2656 struct rbd_device *rbd_dev;
2657 bool known;
2658
2659 rbd_assert(obj_request_img_data_test(obj_request));
2660
2661 img_request = obj_request->img_request;
2662 rbd_assert(img_request);
2663 rbd_dev = img_request->rbd_dev;
2664
2665 /*
2666 * Only writes to layered images need special handling.
2667 * Reads and non-layered writes are simple object requests.
2668 * Layered writes that start beyond the end of the overlap
2669 * with the parent have no parent data, so they too are
2670 * simple object requests. Finally, if the target object is
2671 * known to already exist, its parent data has already been
2672 * copied, so a write to the object can also be handled as a
2673 * simple object request.
2674 */
2675 if (!img_request_write_test(img_request) ||
2676 !img_request_layered_test(img_request) ||
2677 rbd_dev->parent_overlap <= obj_request->img_offset ||
2678 ((known = obj_request_known_test(obj_request)) &&
2679 obj_request_exists_test(obj_request))) {
2680
2681 struct rbd_device *rbd_dev;
2682 struct ceph_osd_client *osdc;
2683
2684 rbd_dev = obj_request->img_request->rbd_dev;
2685 osdc = &rbd_dev->rbd_client->client->osdc;
2686
2687 return rbd_obj_request_submit(osdc, obj_request);
2688 }
2689
2690 /*
2691 * It's a layered write. The target object might exist but
2692 * we may not know that yet. If we know it doesn't exist,
2693 * start by reading the data for the full target object from
2694 * the parent so we can use it for a copyup to the target.
2695 */
2696 if (known)
2697 return rbd_img_obj_parent_read_full(obj_request);
2698
2699 /* We don't know whether the target exists. Go find out. */
2700
2701 return rbd_img_obj_exists_submit(obj_request);
2702}
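/*
 * To summarize the layered write path: unknown existence -> STAT via
 * rbd_img_obj_exists_submit(), whose callback resubmits here; known
 * to exist -> plain object write; known not to exist -> parent read
 * plus copyup via rbd_img_obj_parent_read_full().
 */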
2703
2704static int rbd_img_request_submit(struct rbd_img_request *img_request)
2705{
2706 struct rbd_obj_request *obj_request;
2707 struct rbd_obj_request *next_obj_request;
2708
2709 dout("%s: img %p\n", __func__, img_request);
2710 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2711 int ret;
2712
2713 ret = rbd_img_obj_request_submit(obj_request);
2714 if (ret)
2715 return ret;
2716 }
2717
2718 return 0;
2719}
2720
2721static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2722{
2723 struct rbd_obj_request *obj_request;
2724 struct rbd_device *rbd_dev;
2725 u64 obj_end;
2726 u64 img_xferred;
2727 int img_result;
2728
2729 rbd_assert(img_request_child_test(img_request));
2730
2731 /* First get what we need from the image request and release it */
2732
2733 obj_request = img_request->obj_request;
2734 img_xferred = img_request->xferred;
2735 img_result = img_request->result;
2736 rbd_img_request_put(img_request);
2737
2738 /*
2739 * If the overlap has become 0 (most likely because the
2740 * image has been flattened) we need to re-submit the
2741 * original request.
2742 */
2743 rbd_assert(obj_request);
2744 rbd_assert(obj_request->img_request);
2745 rbd_dev = obj_request->img_request->rbd_dev;
2746 if (!rbd_dev->parent_overlap) {
2747 struct ceph_osd_client *osdc;
2748
2749 osdc = &rbd_dev->rbd_client->client->osdc;
2750 img_result = rbd_obj_request_submit(osdc, obj_request);
2751 if (!img_result)
2752 return;
2753 }
2754
2755 obj_request->result = img_result;
2756 if (obj_request->result)
2757 goto out;
2758
2759 /*
2760 * We need to zero anything beyond the parent overlap
2761 * boundary. Since rbd_img_obj_request_read_callback()
2762 * will zero anything beyond the end of a short read, an
2763 * easy way to do this is to pretend the data from the
2764 * parent came up short--ending at the overlap boundary.
2765 */
2766 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2767 obj_end = obj_request->img_offset + obj_request->length;
2768 if (obj_end > rbd_dev->parent_overlap) {
2769 u64 xferred = 0;
2770
2771 if (obj_request->img_offset < rbd_dev->parent_overlap)
2772 xferred = rbd_dev->parent_overlap -
2773 obj_request->img_offset;
2774
2775 obj_request->xferred = min(img_xferred, xferred);
2776 } else {
2777 obj_request->xferred = img_xferred;
2778 }
2779out:
2780 rbd_img_obj_request_read_callback(obj_request);
2781 rbd_obj_request_complete(obj_request);
2782}
2783
2784static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2785{
2786 struct rbd_img_request *img_request;
2787 int result;
2788
2789 rbd_assert(obj_request_img_data_test(obj_request));
2790 rbd_assert(obj_request->img_request != NULL);
2791 rbd_assert(obj_request->result == (s32) -ENOENT);
2792 rbd_assert(obj_request_type_valid(obj_request->type));
2793
2794 /* rbd_read_finish(obj_request, obj_request->length); */
2795 img_request = rbd_parent_request_create(obj_request,
2796 obj_request->img_offset,
2797 obj_request->length);
2798 result = -ENOMEM;
2799 if (!img_request)
2800 goto out_err;
2801
2802 if (obj_request->type == OBJ_REQUEST_BIO)
2803 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2804 obj_request->bio_list);
2805 else
2806 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
2807 obj_request->pages);
2808 if (result)
2809 goto out_err;
2810
2811 img_request->callback = rbd_img_parent_read_callback;
2812 result = rbd_img_request_submit(img_request);
2813 if (result)
2814 goto out_err;
2815
2816 return;
2817out_err:
2818 if (img_request)
2819 rbd_img_request_put(img_request);
2820 obj_request->result = result;
2821 obj_request->xferred = 0;
2822 obj_request_done_set(obj_request);
2823}
2824
2825static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2826{
2827 struct rbd_obj_request *obj_request;
2828 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2829 int ret;
2830
2831 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2832 OBJ_REQUEST_NODATA);
2833 if (!obj_request)
2834 return -ENOMEM;
2835
2836 ret = -ENOMEM;
2837 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2838 obj_request);
2839 if (!obj_request->osd_req)
2840 goto out;
2841
2842 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2843 notify_id, 0, 0);
2844 rbd_osd_req_format_read(obj_request);
2845
2846 ret = rbd_obj_request_submit(osdc, obj_request);
2847 if (ret)
2848 goto out;
2849 ret = rbd_obj_request_wait(obj_request);
2850out:
2851 rbd_obj_request_put(obj_request);
2852
2853 return ret;
2854}
2855
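/*
 * Handler for notifies on the image header object. Another client
 * changing the image (resize, snapshot, flatten) triggers a notify;
 * we refresh our view of the header, then ack so the notifier's
 * notify call can complete.
 */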
2856static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2857{
2858 struct rbd_device *rbd_dev = (struct rbd_device *)data;
2859 int ret;
2860
2861 if (!rbd_dev)
2862 return;
2863
2864 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2865 rbd_dev->header_name, (unsigned long long)notify_id,
2866 (unsigned int)opcode);
2867 ret = rbd_dev_refresh(rbd_dev);
2868 if (ret)
2869 rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2870
2871 rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2872}
2873
2874/*
2875 * Request sync osd watch/unwatch. The value of "start" determines
2876 * whether a watch request is being initiated or torn down.
2877 */
2878static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start)
2879{
2880 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2881 struct rbd_obj_request *obj_request;
2882 int ret;
2883
2884 rbd_assert(start ^ !!rbd_dev->watch_event);
2885 rbd_assert(start ^ !!rbd_dev->watch_request);
2886
2887 if (start) {
2888 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
2889 &rbd_dev->watch_event);
2890 if (ret < 0)
2891 return ret;
2892 rbd_assert(rbd_dev->watch_event != NULL);
2893 }
2894
2895 ret = -ENOMEM;
2896 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2897 OBJ_REQUEST_NODATA);
2898 if (!obj_request)
2899 goto out_cancel;
2900
2901 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
2902 obj_request);
2903 if (!obj_request->osd_req)
2904 goto out_cancel;
2905
2906 if (start)
2907 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
2908 else
2909 ceph_osdc_unregister_linger_request(osdc,
2910 rbd_dev->watch_request->osd_req);
2911
2912 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2913 rbd_dev->watch_event->cookie, 0, start ? 1 : 0);
2914 rbd_osd_req_format_write(obj_request);
2915
2916 ret = rbd_obj_request_submit(osdc, obj_request);
2917 if (ret)
2918 goto out_cancel;
2919 ret = rbd_obj_request_wait(obj_request);
2920 if (ret)
2921 goto out_cancel;
2922 ret = obj_request->result;
2923 if (ret)
2924 goto out_cancel;
2925
2926 /*
2927 * A watch request is set to linger, so the underlying osd
2928 * request won't go away until we unregister it. We retain
2929 * a pointer to the object request during that time (in
2930 * rbd_dev->watch_request), so we'll keep a reference to
2931 * it. We'll drop that reference (below) after we've
2932 * unregistered it.
2933 */
2934 if (start) {
2935 rbd_dev->watch_request = obj_request;
2936
2937 return 0;
2938 }
2939
2940 /* We have successfully torn down the watch request */
2941
2942 rbd_obj_request_put(rbd_dev->watch_request);
2943 rbd_dev->watch_request = NULL;
2944out_cancel:
2945 /* Cancel the event if we're tearing down, or on error */
2946 ceph_osdc_cancel_event(rbd_dev->watch_event);
2947 rbd_dev->watch_event = NULL;
2948 if (obj_request)
2949 rbd_obj_request_put(obj_request);
2950
2951 return ret;
2952}
2953
2954static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
2955{
2956 return __rbd_dev_header_watch_sync(rbd_dev, true);
2957}
2958
2959static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
2960{
2961 int ret;
2962
2963 ret = __rbd_dev_header_watch_sync(rbd_dev, false);
2964 if (ret) {
2965 rbd_warn(rbd_dev, "unable to tear down watch request: %d\n",
2966 ret);
2967 }
2968}
2969
2970/*
2971 * Synchronous osd object method call. Returns the number of bytes
2972 * returned in the inbound (response) buffer, or a negative error code.
2973 */
2974static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2975 const char *object_name,
2976 const char *class_name,
2977 const char *method_name,
2978 const void *outbound,
2979 size_t outbound_size,
2980 void *inbound,
2981 size_t inbound_size)
2982{
2983 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2984 struct rbd_obj_request *obj_request;
2985 struct page **pages;
2986 u32 page_count;
2987 int ret;
2988
2989 /*
2990 * Method calls are ultimately read operations. The result
2991 * should be placed into the inbound buffer provided. They
2992 * also supply outbound data--parameters for the object
2993 * method. Currently if this is present it will be a
2994 * snapshot id.
2995 */
2996 page_count = (u32)calc_pages_for(0, inbound_size);
2997 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2998 if (IS_ERR(pages))
2999 return PTR_ERR(pages);
3000
3001 ret = -ENOMEM;
3002 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
3003 OBJ_REQUEST_PAGES);
3004 if (!obj_request)
3005 goto out;
3006
3007 obj_request->pages = pages;
3008 obj_request->page_count = page_count;
3009
3010 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3011 obj_request);
3012 if (!obj_request->osd_req)
3013 goto out;
3014
3015 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
3016 class_name, method_name);
3017 if (outbound_size) {
3018 struct ceph_pagelist *pagelist;
3019
3020 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
3021 if (!pagelist)
3022 goto out;
3023
3024 ceph_pagelist_init(pagelist);
3025 ceph_pagelist_append(pagelist, outbound, outbound_size);
3026 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
3027 pagelist);
3028 }
3029 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3030 obj_request->pages, inbound_size,
3031 0, false, false);
3032 rbd_osd_req_format_read(obj_request);
3033
3034 ret = rbd_obj_request_submit(osdc, obj_request);
3035 if (ret)
3036 goto out;
3037 ret = rbd_obj_request_wait(obj_request);
3038 if (ret)
3039 goto out;
3040
3041 ret = obj_request->result;
3042 if (ret < 0)
3043 goto out;
3044
3045 rbd_assert(obj_request->xferred < (u64)INT_MAX);
3046 ret = (int)obj_request->xferred;
3047 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
3048out:
3049 if (obj_request)
3050 rbd_obj_request_put(obj_request);
3051 else
3052 ceph_release_page_vector(pages, page_count);
3053
3054 return ret;
3055}
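/*
 * For example, _rbd_dev_v2_snap_size() uses this to invoke the "rbd"
 * class method "get_size", passing an 8-byte snapshot id as outbound
 * data and decoding the order/size pair from the inbound buffer.
 */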
3056
3057static void rbd_request_fn(struct request_queue *q)
3058 __releases(q->queue_lock) __acquires(q->queue_lock)
3059{
3060 struct rbd_device *rbd_dev = q->queuedata;
3061 bool read_only = rbd_dev->mapping.read_only;
3062 struct request *rq;
3063 int result;
3064
3065 while ((rq = blk_fetch_request(q))) {
3066 bool write_request = rq_data_dir(rq) == WRITE;
3067 struct rbd_img_request *img_request;
3068 u64 offset;
3069 u64 length;
3070
3071 /* Ignore any non-FS requests that filter through. */
3072
3073 if (rq->cmd_type != REQ_TYPE_FS) {
3074 dout("%s: non-fs request type %d\n", __func__,
3075 (int) rq->cmd_type);
3076 __blk_end_request_all(rq, 0);
3077 continue;
3078 }
3079
3080 /* Ignore/skip any zero-length requests */
3081
3082 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
3083 length = (u64) blk_rq_bytes(rq);
3084
3085 if (!length) {
3086 dout("%s: zero-length request\n", __func__);
3087 __blk_end_request_all(rq, 0);
3088 continue;
3089 }
3090
3091 spin_unlock_irq(q->queue_lock);
3092
3093 /* Disallow writes to a read-only device */
3094
3095 if (write_request) {
3096 result = -EROFS;
3097 if (read_only)
3098 goto end_request;
3099 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3100 }
3101
3102 /*
3103 * Quit early if the mapped snapshot no longer
3104 * exists. It's still possible the snapshot will
3105 * have disappeared by the time our request arrives
3106 * at the osd, but there's no sense in sending it if
3107 * we already know.
3108 */
3109 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3110 dout("request for non-existent snapshot");
3111 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3112 result = -ENXIO;
3113 goto end_request;
3114 }
3115
3116 result = -EINVAL;
3117 if (offset && length > U64_MAX - offset + 1) {
3118 rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3119 offset, length);
3120 goto end_request; /* Shouldn't happen */
3121 }
3122
3123 result = -EIO;
3124 if (offset + length > rbd_dev->mapping.size) {
3125 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
3126 offset, length, rbd_dev->mapping.size);
3127 goto end_request;
3128 }
3129
3130 result = -ENOMEM;
3131 img_request = rbd_img_request_create(rbd_dev, offset, length,
3132 write_request);
3133 if (!img_request)
3134 goto end_request;
3135
3136 img_request->rq = rq;
3137
3138 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3139 rq->bio);
3140 if (!result)
3141 result = rbd_img_request_submit(img_request);
3142 if (result)
3143 rbd_img_request_put(img_request);
3144end_request:
3145 spin_lock_irq(q->queue_lock);
3146 if (result < 0) {
3147 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
3148 write_request ? "write" : "read",
3149 length, offset, result);
3150
3151 __blk_end_request_all(rq, result);
3152 }
3153 }
3154}
3155
3156/*
3157 * A queue callback. Makes sure that we don't create a bio that spans across
3158 * multiple osd objects. One exception is single-page bios,
3159 * which we handle later in bio_chain_clone_range()
3160 */
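/*
 * For example, with 4 MiB (order 22) objects there are 8192 sectors
 * per object; a non-empty bio starting at whole-device sector 8191
 * is allowed at most the 512 bytes remaining before the object
 * boundary.
 */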
3161static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3162 struct bio_vec *bvec)
3163{
3164 struct rbd_device *rbd_dev = q->queuedata;
3165 sector_t sector_offset;
3166 sector_t sectors_per_obj;
3167 sector_t obj_sector_offset;
3168 int ret;
3169
3170 /*
3171 * Find how far into its rbd object the partition-relative
3172 * bio start sector is to offset relative to the enclosing
3173 * device.
3174 */
3175 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3176 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3177 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3178
3179 /*
3180 * Compute the number of bytes from that offset to the end
3181 * of the object. Account for what's already used by the bio.
3182 */
3183 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3184 if (ret > bmd->bi_size)
3185 ret -= bmd->bi_size;
3186 else
3187 ret = 0;
3188
3189 /*
3190 * Don't send back more than was asked for. And if the bio
3191 * was empty, let the whole thing through because: "Note
3192 * that a block device *must* allow a single page to be
3193 * added to an empty bio."
3194 */
3195 rbd_assert(bvec->bv_len <= PAGE_SIZE);
3196 if (ret > (int) bvec->bv_len || !bmd->bi_size)
3197 ret = (int) bvec->bv_len;
3198
3199 return ret;
3200}
3201
3202static void rbd_free_disk(struct rbd_device *rbd_dev)
3203{
3204 struct gendisk *disk = rbd_dev->disk;
3205
3206 if (!disk)
3207 return;
3208
3209 rbd_dev->disk = NULL;
3210 if (disk->flags & GENHD_FL_UP) {
3211 del_gendisk(disk);
3212 if (disk->queue)
3213 blk_cleanup_queue(disk->queue);
3214 }
3215 put_disk(disk);
3216}
3217
3218static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3219 const char *object_name,
3220 u64 offset, u64 length, void *buf)
3222{
3223 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3224 struct rbd_obj_request *obj_request;
3225 struct page **pages = NULL;
3226 u32 page_count;
3227 size_t size;
3228 int ret;
3229
3230 page_count = (u32) calc_pages_for(offset, length);
3231 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3232 if (IS_ERR(pages))
3233 return PTR_ERR(pages); /* don't fall through with an ERR_PTR() */
3234
3235 ret = -ENOMEM;
3236 obj_request = rbd_obj_request_create(object_name, offset, length,
3237 OBJ_REQUEST_PAGES);
3238 if (!obj_request)
3239 goto out;
3240
3241 obj_request->pages = pages;
3242 obj_request->page_count = page_count;
3243
3244 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3245 obj_request);
3246 if (!obj_request->osd_req)
3247 goto out;
3248
3249 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3250 offset, length, 0, 0);
3251 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3252 obj_request->pages,
3253 obj_request->length,
3254 obj_request->offset & ~PAGE_MASK,
3255 false, false);
3256 rbd_osd_req_format_read(obj_request);
3257
3258 ret = rbd_obj_request_submit(osdc, obj_request);
3259 if (ret)
3260 goto out;
3261 ret = rbd_obj_request_wait(obj_request);
3262 if (ret)
3263 goto out;
3264
3265 ret = obj_request->result;
3266 if (ret < 0)
3267 goto out;
3268
3269 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
3270 size = (size_t) obj_request->xferred;
3271 ceph_copy_from_page_vector(pages, buf, 0, size);
3272 rbd_assert(size <= (size_t)INT_MAX);
3273 ret = (int)size;
3274out:
3275 if (obj_request)
3276 rbd_obj_request_put(obj_request);
3277 else
3278 ceph_release_page_vector(pages, page_count);
3279
3280 return ret;
3281}
3282
3283/*
3284 * Read the complete header for the given rbd device. On successful
3285 * return, the rbd_dev->header field will contain up-to-date
3286 * information about the image.
3287 */
3288static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
3289{
3290 struct rbd_image_header_ondisk *ondisk = NULL;
3291 u32 snap_count = 0;
3292 u64 names_size = 0;
3293 u32 want_count;
3294 int ret;
3295
3296 /*
3297 * The complete header will include an array of its 64-bit
3298 * snapshot ids, followed by the names of those snapshots as
3299 * a contiguous block of NUL-terminated strings. Note that
3300 * the number of snapshots could change by the time we read
3301 * it in, in which case we re-read it.
3302 */
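/*
 * The first pass reads just sizeof (*ondisk) bytes (snap_count and
 * names_size start out zero); that tells us how big the snapshot id
 * and name arrays are, and the second pass reads the whole thing.
 * The loop only repeats if a snapshot was created or deleted between
 * the two reads.
 */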
3303 do {
3304 size_t size;
3305
3306 kfree(ondisk);
3307
3308 size = sizeof (*ondisk);
3309 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3310 size += names_size;
3311 ondisk = kmalloc(size, GFP_KERNEL);
3312 if (!ondisk)
3313 return -ENOMEM;
3314
3315 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
3316 0, size, ondisk);
3317 if (ret < 0)
3318 goto out;
3319 if ((size_t)ret < size) {
3320 ret = -ENXIO;
3321 rbd_warn(rbd_dev, "short header read (want %zu got %d)",
3322 size, ret);
3323 goto out;
3324 }
3325 if (!rbd_dev_ondisk_valid(ondisk)) {
3326 ret = -ENXIO;
3327 rbd_warn(rbd_dev, "invalid header");
3328 goto out;
3329 }
3330
3331 names_size = le64_to_cpu(ondisk->snap_names_len);
3332 want_count = snap_count;
3333 snap_count = le32_to_cpu(ondisk->snap_count);
3334 } while (snap_count != want_count);
3335
3336 ret = rbd_header_from_disk(rbd_dev, ondisk);
3337out:
3338 kfree(ondisk);
3339
3340 return ret;
3341}
3342
3343/*
3344 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3345 * has disappeared from the (just updated) snapshot context.
3346 */
3347static void rbd_exists_validate(struct rbd_device *rbd_dev)
3348{
3349 u64 snap_id;
3350
3351 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3352 return;
3353
3354 snap_id = rbd_dev->spec->snap_id;
3355 if (snap_id == CEPH_NOSNAP)
3356 return;
3357
3358 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3359 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3360}
3361
3362static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3363{
3364 sector_t size;
3365 bool removing;
3366
3367 /*
3368 * Don't hold the lock while doing disk operations,
3369 * or lock ordering will conflict with the bdev mutex via:
3370 * rbd_add() -> blkdev_get() -> rbd_open()
3371 */
3372 spin_lock_irq(&rbd_dev->lock);
3373 removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
3374 spin_unlock_irq(&rbd_dev->lock);
3375 /*
3376 * If the device is being removed, rbd_dev->disk has
3377 * been destroyed, so don't try to update its size
3378 */
3379 if (!removing) {
3380 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3381 dout("setting size to %llu sectors", (unsigned long long)size);
3382 set_capacity(rbd_dev->disk, size);
3383 revalidate_disk(rbd_dev->disk);
3384 }
3385}
3386
3387static int rbd_dev_refresh(struct rbd_device *rbd_dev)
3388{
3389 u64 mapping_size;
3390 int ret;
3391
3392 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3393 down_write(&rbd_dev->header_rwsem);
3394 mapping_size = rbd_dev->mapping.size;
3395 if (rbd_dev->image_format == 1)
3396 ret = rbd_dev_v1_header_info(rbd_dev);
3397 else
3398 ret = rbd_dev_v2_header_info(rbd_dev);
3399
3400 /* If it's a mapped snapshot, validate its EXISTS flag */
3401
3402 rbd_exists_validate(rbd_dev);
3403 up_write(&rbd_dev->header_rwsem);
3404
3405 if (mapping_size != rbd_dev->mapping.size)
3406 rbd_dev_update_size(rbd_dev);
3408
3409 return ret;
3410}
3411
3412static int rbd_init_disk(struct rbd_device *rbd_dev)
3413{
3414 struct gendisk *disk;
3415 struct request_queue *q;
3416 u64 segment_size;
3417
3418 /* create gendisk info */
3419 disk = alloc_disk(single_major ?
3420 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3421 RBD_MINORS_PER_MAJOR);
3422 if (!disk)
3423 return -ENOMEM;
3424
3425 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3426 rbd_dev->dev_id);
3427 disk->major = rbd_dev->major;
3428 disk->first_minor = rbd_dev->minor;
3429 if (single_major)
3430 disk->flags |= GENHD_FL_EXT_DEVT;
3431 disk->fops = &rbd_bd_ops;
3432 disk->private_data = rbd_dev;
3433
3434 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3435 if (!q)
3436 goto out_disk;
3437
3438 /* We use the default size, but let's be explicit about it. */
3439 blk_queue_physical_block_size(q, SECTOR_SIZE);
3440
3441 /* set io sizes to object size */
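	/* e.g. max_hw_sectors ends up 8192 for common 4 MiB (order 22) objects */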
3442 segment_size = rbd_obj_bytes(&rbd_dev->header);
3443 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3444 blk_queue_max_segment_size(q, segment_size);
3445 blk_queue_io_min(q, segment_size);
3446 blk_queue_io_opt(q, segment_size);
3447
3448 blk_queue_merge_bvec(q, rbd_merge_bvec);
3449 disk->queue = q;
3450
3451 q->queuedata = rbd_dev;
3452
3453 rbd_dev->disk = disk;
3454
3455 return 0;
3456out_disk:
3457 put_disk(disk);
3458
3459 return -ENOMEM;
3460}
3461
3462/*
3463 sysfs
3464*/
3465
3466static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3467{
3468 return container_of(dev, struct rbd_device, dev);
3469}
3470
3471static ssize_t rbd_size_show(struct device *dev,
3472 struct device_attribute *attr, char *buf)
3473{
3474 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3475
3476 return sprintf(buf, "%llu\n",
3477 (unsigned long long)rbd_dev->mapping.size);
3478}
3479
3480/*
3481 * Note this shows the features for whatever's mapped, which is not
3482 * necessarily the base image.
3483 */
3484static ssize_t rbd_features_show(struct device *dev,
3485 struct device_attribute *attr, char *buf)
3486{
3487 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3488
3489 return sprintf(buf, "0x%016llx\n",
3490 (unsigned long long)rbd_dev->mapping.features);
3491}
3492
3493static ssize_t rbd_major_show(struct device *dev,
3494 struct device_attribute *attr, char *buf)
3495{
3496 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3497
3498 if (rbd_dev->major)
3499 return sprintf(buf, "%d\n", rbd_dev->major);
3500
3501 return sprintf(buf, "(none)\n");
3502}
3503
3504static ssize_t rbd_minor_show(struct device *dev,
3505 struct device_attribute *attr, char *buf)
3506{
3507 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3508
3509 return sprintf(buf, "%d\n", rbd_dev->minor);
3510}
3511
3512static ssize_t rbd_client_id_show(struct device *dev,
3513 struct device_attribute *attr, char *buf)
3514{
3515 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3516
3517 return sprintf(buf, "client%lld\n",
3518 ceph_client_id(rbd_dev->rbd_client->client));
3519}
3520
3521static ssize_t rbd_pool_show(struct device *dev,
3522 struct device_attribute *attr, char *buf)
3523{
3524 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3525
3526 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3527}
3528
3529static ssize_t rbd_pool_id_show(struct device *dev,
3530 struct device_attribute *attr, char *buf)
3531{
3532 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3533
3534 return sprintf(buf, "%llu\n",
3535 (unsigned long long) rbd_dev->spec->pool_id);
3536}
3537
3538static ssize_t rbd_name_show(struct device *dev,
3539 struct device_attribute *attr, char *buf)
3540{
3541 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3542
3543 if (rbd_dev->spec->image_name)
3544 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3545
3546 return sprintf(buf, "(unknown)\n");
3547}
3548
3549static ssize_t rbd_image_id_show(struct device *dev,
3550 struct device_attribute *attr, char *buf)
3551{
3552 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3553
3554 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3555}
3556
3557/*
3558 * Shows the name of the currently-mapped snapshot (or
3559 * RBD_SNAP_HEAD_NAME for the base image).
3560 */
3561static ssize_t rbd_snap_show(struct device *dev,
3562 struct device_attribute *attr,
3563 char *buf)
3564{
3565 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3566
3567 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3568}
3569
3570/*
3571 * For an rbd v2 image, shows the pool id, image id, and snapshot id
3572 * for the parent image. If there is no parent, simply shows
3573 * "(no parent image)".
3574 */
3575static ssize_t rbd_parent_show(struct device *dev,
3576 struct device_attribute *attr,
3577 char *buf)
3578{
3579 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3580 struct rbd_spec *spec = rbd_dev->parent_spec;
3581 int count;
3582 char *bufp = buf;
3583
3584 if (!spec)
3585 return sprintf(buf, "(no parent image)\n");
3586
3587 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3588 (unsigned long long) spec->pool_id, spec->pool_name);
3589 if (count < 0)
3590 return count;
3591 bufp += count;
3592
3593 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3594 spec->image_name ? spec->image_name : "(unknown)");
3595 if (count < 0)
3596 return count;
3597 bufp += count;
3598
3599 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3600 (unsigned long long) spec->snap_id, spec->snap_name);
3601 if (count < 0)
3602 return count;
3603 bufp += count;
3604
3605 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3606 if (count < 0)
3607 return count;
3608 bufp += count;
3609
3610 return (ssize_t) (bufp - buf);
3611}
3612
3613static ssize_t rbd_image_refresh(struct device *dev,
3614 struct device_attribute *attr,
3615 const char *buf,
3616 size_t size)
3617{
3618 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3619 int ret;
3620
3621 ret = rbd_dev_refresh(rbd_dev);
3622 if (ret)
3623 rbd_warn(rbd_dev, "manual header refresh error (%d)\n", ret);
3624
3625 return ret < 0 ? ret : size;
3626}
3627
3628static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
3629static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3630static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3631static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
3632static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3633static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
3634static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3635static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3636static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3637static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3638static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
3639static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3640
3641static struct attribute *rbd_attrs[] = {
3642 &dev_attr_size.attr,
3643 &dev_attr_features.attr,
3644 &dev_attr_major.attr,
3645 &dev_attr_minor.attr,
3646 &dev_attr_client_id.attr,
3647 &dev_attr_pool.attr,
3648 &dev_attr_pool_id.attr,
3649 &dev_attr_name.attr,
3650 &dev_attr_image_id.attr,
3651 &dev_attr_current_snap.attr,
3652 &dev_attr_parent.attr,
3653 &dev_attr_refresh.attr,
3654 NULL
3655};
3656
3657static struct attribute_group rbd_attr_group = {
3658 .attrs = rbd_attrs,
3659};
3660
3661static const struct attribute_group *rbd_attr_groups[] = {
3662 &rbd_attr_group,
3663 NULL
3664};
3665
3666static void rbd_sysfs_dev_release(struct device *dev)
3667{
3668}
3669
3670static struct device_type rbd_device_type = {
3671 .name = "rbd",
3672 .groups = rbd_attr_groups,
3673 .release = rbd_sysfs_dev_release,
3674};
3675
3676static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3677{
3678 kref_get(&spec->kref);
3679
3680 return spec;
3681}
3682
3683static void rbd_spec_free(struct kref *kref);
3684static void rbd_spec_put(struct rbd_spec *spec)
3685{
3686 if (spec)
3687 kref_put(&spec->kref, rbd_spec_free);
3688}
3689
3690static struct rbd_spec *rbd_spec_alloc(void)
3691{
3692 struct rbd_spec *spec;
3693
3694 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3695 if (!spec)
3696 return NULL;
3697 kref_init(&spec->kref);
3698
3699 return spec;
3700}
3701
3702static void rbd_spec_free(struct kref *kref)
3703{
3704 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3705
3706 kfree(spec->pool_name);
3707 kfree(spec->image_id);
3708 kfree(spec->image_name);
3709 kfree(spec->snap_name);
3710 kfree(spec);
3711}
3712
3713static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3714 struct rbd_spec *spec)
3715{
3716 struct rbd_device *rbd_dev;
3717
3718 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3719 if (!rbd_dev)
3720 return NULL;
3721
3722 spin_lock_init(&rbd_dev->lock);
3723 rbd_dev->flags = 0;
3724 atomic_set(&rbd_dev->parent_ref, 0);
3725 INIT_LIST_HEAD(&rbd_dev->node);
3726 init_rwsem(&rbd_dev->header_rwsem);
3727
3728 rbd_dev->spec = spec;
3729 rbd_dev->rbd_client = rbdc;
3730
3731 /* Initialize the layout used for all rbd requests */
3732
3733 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3734 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3735 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3736 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3737
3738 return rbd_dev;
3739}
3740
3741static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3742{
3743 rbd_put_client(rbd_dev->rbd_client);
3744 rbd_spec_put(rbd_dev->spec);
3745 kfree(rbd_dev);
3746}
3747
3748/*
3749 * Get the size and object order for an image snapshot, or if
3750 * snap_id is CEPH_NOSNAP, get this information for the base
3751 * image.
3752 */
3753static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3754 u8 *order, u64 *snap_size)
3755{
3756 __le64 snapid = cpu_to_le64(snap_id);
3757 int ret;
3758 struct {
3759 u8 order;
3760 __le64 size;
3761 } __attribute__ ((packed)) size_buf = { 0 };
3762
3763 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3764 "rbd", "get_size",
3765 &snapid, sizeof (snapid),
3766 &size_buf, sizeof (size_buf));
3767 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3768 if (ret < 0)
3769 return ret;
3770 if (ret < sizeof (size_buf))
3771 return -ERANGE;
3772
3773 if (order) {
3774 *order = size_buf.order;
3775 dout(" order %u", (unsigned int)*order);
3776 }
3777 *snap_size = le64_to_cpu(size_buf.size);
3778
3779 dout(" snap_id 0x%016llx snap_size = %llu\n",
3780 (unsigned long long)snap_id,
3781 (unsigned long long)*snap_size);
3782
3783 return 0;
3784}
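
/*
 * Example (sketch): querying one snapshot's size, given a snap_id
 * looked up elsewhere (e.g. via rbd_snap_id_by_name()):
 *
 *	u8 order;
 *	u64 snap_size;
 *
 *	if (!_rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, &snap_size))
 *		dout("snap is %llu bytes in 2^%u-byte objects\n",
 *		     snap_size, order);
 */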
3785
3786static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3787{
3788 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3789 &rbd_dev->header.obj_order,
3790 &rbd_dev->header.image_size);
3791}
3792
3793static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3794{
3795 void *reply_buf;
3796 int ret;
3797 void *p;
3798
3799 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3800 if (!reply_buf)
3801 return -ENOMEM;
3802
3803 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3804 "rbd", "get_object_prefix", NULL, 0,
3805 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
3806 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3807 if (ret < 0)
3808 goto out;
3809
3810 p = reply_buf;
3811 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
3812 p + ret, NULL, GFP_NOIO);
3813 ret = 0;
3814
3815 if (IS_ERR(rbd_dev->header.object_prefix)) {
3816 ret = PTR_ERR(rbd_dev->header.object_prefix);
3817 rbd_dev->header.object_prefix = NULL;
3818 } else {
3819 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
3820 }
3821out:
3822 kfree(reply_buf);
3823
3824 return ret;
3825}
3826
3827static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3828 u64 *snap_features)
3829{
3830 __le64 snapid = cpu_to_le64(snap_id);
3831 struct {
3832 __le64 features;
3833 __le64 incompat;
3834 } __attribute__ ((packed)) features_buf = { 0 };
3835 u64 incompat;
3836 int ret;
3837
3838 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3839 "rbd", "get_features",
3840 &snapid, sizeof (snapid),
3841 &features_buf, sizeof (features_buf));
3842 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3843 if (ret < 0)
3844 return ret;
3845 if (ret < sizeof (features_buf))
3846 return -ERANGE;
3847
3848 incompat = le64_to_cpu(features_buf.incompat);
3849 if (incompat & ~RBD_FEATURES_SUPPORTED)
3850 return -ENXIO;
3851
3852 *snap_features = le64_to_cpu(features_buf.features);
3853
3854 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3855 (unsigned long long)snap_id,
3856 (unsigned long long)*snap_features,
3857 (unsigned long long)le64_to_cpu(features_buf.incompat));
3858
3859 return 0;
3860}
3861
3862static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3863{
3864 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3865 &rbd_dev->header.features);
3866}
3867
3868static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3869{
3870 struct rbd_spec *parent_spec;
3871 size_t size;
3872 void *reply_buf = NULL;
3873 __le64 snapid;
3874 void *p;
3875 void *end;
3876 u64 pool_id;
3877 char *image_id;
3878 u64 snap_id;
3879 u64 overlap;
3880 int ret;
3881
3882 parent_spec = rbd_spec_alloc();
3883 if (!parent_spec)
3884 return -ENOMEM;
3885
3886 size = sizeof (__le64) + /* pool_id */
3887 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
3888 sizeof (__le64) + /* snap_id */
3889 sizeof (__le64); /* overlap */
3890 reply_buf = kmalloc(size, GFP_KERNEL);
3891 if (!reply_buf) {
3892 ret = -ENOMEM;
3893 goto out_err;
3894 }
3895
3896 snapid = cpu_to_le64(CEPH_NOSNAP);
3897 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3898 "rbd", "get_parent",
3899 &snapid, sizeof (snapid),
3900 reply_buf, size);
3901 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3902 if (ret < 0)
3903 goto out_err;
3904
3905 p = reply_buf;
3906 end = reply_buf + ret;
3907 ret = -ERANGE;
3908 ceph_decode_64_safe(&p, end, pool_id, out_err);
3909 if (pool_id == CEPH_NOPOOL) {
3910 /*
3911 * Either the parent never existed, or we have a
3912 * record of it but the image got flattened so it no
3913 * longer has a parent. When the parent of a
3914 * layered image disappears we immediately set the
3915 * overlap to 0. The effect of this is that all new
3916 * requests will be treated as if the image had no
3917 * parent.
3918 */
3919 if (rbd_dev->parent_overlap) {
3920 rbd_dev->parent_overlap = 0;
3921 smp_mb();
3922 rbd_dev_parent_put(rbd_dev);
3923 pr_info("%s: clone image has been flattened\n",
3924 rbd_dev->disk->disk_name);
3925 }
3926
3927 goto out; /* No parent? No problem. */
3928 }
3929
3930 /* The ceph file layout needs to fit pool id in 32 bits */
3931
3932 ret = -EIO;
3933 if (pool_id > (u64)U32_MAX) {
3934 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
3935 (unsigned long long)pool_id, U32_MAX);
3936 goto out_err;
3937 }
3938
3939 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3940 if (IS_ERR(image_id)) {
3941 ret = PTR_ERR(image_id);
3942 goto out_err;
3943 }
3944 ceph_decode_64_safe(&p, end, snap_id, out_err);
3945 ceph_decode_64_safe(&p, end, overlap, out_err);
3946
3947 /*
3948 * The parent won't change (except when the clone is
3949 * flattened, already handled that). So we only need to
3950 * record the parent spec if we have not already done so.
3951 */
3952 if (!rbd_dev->parent_spec) {
3953 parent_spec->pool_id = pool_id;
3954 parent_spec->image_id = image_id;
3955 parent_spec->snap_id = snap_id;
3956 rbd_dev->parent_spec = parent_spec;
3957 parent_spec = NULL; /* rbd_dev now owns this */
3958 }
3959
3960 /*
3961 * We always update the parent overlap. If it's zero we
3962 * treat it specially.
3963 */
3964 rbd_dev->parent_overlap = overlap;
3965 smp_mb();
3966 if (!overlap) {
3967
3968 /* A null parent_spec indicates it's the initial probe */
3969
3970 if (parent_spec) {
3971 /*
3972 * The overlap has become zero, so the clone
3973 * must have been resized down to 0 at some
3974 * point. Treat this the same as a flatten.
3975 */
3976 rbd_dev_parent_put(rbd_dev);
3977 pr_info("%s: clone image now standalone\n",
3978 rbd_dev->disk->disk_name);
3979 } else {
3980 /*
3981 * For the initial probe, if we find the
3982 * overlap is zero we just pretend there was
3983 * no parent image.
3984 */
3985 rbd_warn(rbd_dev, "ignoring parent of clone with overlap 0");
3987 }
3988 }
3989out:
3990 ret = 0;
3991out_err:
3992 kfree(reply_buf);
3993 rbd_spec_put(parent_spec);
3994
3995 return ret;
3996}
3997
3998static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3999{
4000 struct {
4001 __le64 stripe_unit;
4002 __le64 stripe_count;
4003 } __attribute__ ((packed)) striping_info_buf = { 0 };
4004 size_t size = sizeof (striping_info_buf);
4005 void *p;
4006 u64 obj_size;
4007 u64 stripe_unit;
4008 u64 stripe_count;
4009 int ret;
4010
4011 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4012 "rbd", "get_stripe_unit_count", NULL, 0,
4013 (char *)&striping_info_buf, size);
4014 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4015 if (ret < 0)
4016 return ret;
4017 if (ret < size)
4018 return -ERANGE;
4019
4020 /*
4021 * We don't actually support the "fancy striping" feature
4022 * (STRIPINGV2) yet, but if the striping sizes are the
4023 * defaults the behavior is the same as before. So find
4024 * out, and only fail if the image has non-default values.
4025 */
4027 obj_size = (u64)1 << rbd_dev->header.obj_order;
4028 p = &striping_info_buf;
4029 stripe_unit = ceph_decode_64(&p);
4030 if (stripe_unit != obj_size) {
4031 rbd_warn(rbd_dev, "unsupported stripe unit (got %llu want %llu)",
4032 stripe_unit, obj_size);
4034 return -EINVAL;
4035 }
4036 stripe_count = ceph_decode_64(&p);
4037 if (stripe_count != 1) {
4038 rbd_warn(rbd_dev, "unsupported stripe count (got %llu want 1)",
4039 stripe_count);
4040 return -EINVAL;
4041 }
4042 rbd_dev->header.stripe_unit = stripe_unit;
4043 rbd_dev->header.stripe_count = stripe_count;
4044
4045 return 0;
4046}
4047
4048static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4049{
4050 size_t image_id_size;
4051 char *image_id;
4052 void *p;
4053 void *end;
4054 size_t size;
4055 void *reply_buf = NULL;
4056 size_t len = 0;
4057 char *image_name = NULL;
4058 int ret;
4059
4060 rbd_assert(!rbd_dev->spec->image_name);
4061
4062 len = strlen(rbd_dev->spec->image_id);
4063 image_id_size = sizeof (__le32) + len;
4064 image_id = kmalloc(image_id_size, GFP_KERNEL);
4065 if (!image_id)
4066 return NULL;
4067
4068 p = image_id;
4069 end = image_id + image_id_size;
4070 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
4071
4072 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4073 reply_buf = kmalloc(size, GFP_KERNEL);
4074 if (!reply_buf)
4075 goto out;
4076
4077 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
4078 "rbd", "dir_get_name",
4079 image_id, image_id_size,
4080 reply_buf, size);
4081 if (ret < 0)
4082 goto out;
4083 p = reply_buf;
4084 end = reply_buf + ret;
4085
4086 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4087 if (IS_ERR(image_name))
4088 image_name = NULL;
4089 else
4090 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4091out:
4092 kfree(reply_buf);
4093 kfree(image_id);
4094
4095 return image_name;
4096}
4097
4098static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4099{
4100 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4101 const char *snap_name;
4102 u32 which = 0;
4103
4104 /* Skip over names until we find the one we are looking for */
4105
4106 snap_name = rbd_dev->header.snap_names;
4107 while (which < snapc->num_snaps) {
4108 if (!strcmp(name, snap_name))
4109 return snapc->snaps[which];
4110 snap_name += strlen(snap_name) + 1;
4111 which++;
4112 }
4113 return CEPH_NOSNAP;
4114}
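
/*
 * The format 1 name buffer walked above is a packed sequence of
 * NUL-terminated strings, one per entry of snapc->snaps[], e.g.:
 *
 *	"alpha\0beta\0gamma\0"	names for snapc->snaps[0..2]
 */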
4115
4116static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4117{
4118 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4119 u32 which;
4120 bool found = false;
4121 u64 snap_id;
4122
4123 for (which = 0; !found && which < snapc->num_snaps; which++) {
4124 const char *snap_name;
4125
4126 snap_id = snapc->snaps[which];
4127 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4128 if (IS_ERR(snap_name)) {
4129 /* ignore no-longer existing snapshots */
4130 if (PTR_ERR(snap_name) == -ENOENT)
4131 continue;
4132 else
4133 break;
4134 }
4135 found = !strcmp(name, snap_name);
4136 kfree(snap_name);
4137 }
4138 return found ? snap_id : CEPH_NOSNAP;
4139}
4140
4141/*
4142 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4143 * no snapshot by that name is found, or if an error occurs.
4144 */
4145static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4146{
4147 if (rbd_dev->image_format == 1)
4148 return rbd_v1_snap_id_by_name(rbd_dev, name);
4149
4150 return rbd_v2_snap_id_by_name(rbd_dev, name);
4151}
4152
4153/*
4154 * When an rbd image has a parent image, it is identified by the
4155 * pool, image, and snapshot ids (not names). This function fills
4156 * in the names for those ids. (It's OK if we can't figure out the
4157 * name for an image id, but the pool and snapshot ids should always
4158 * exist and have names.) All names in an rbd spec are dynamically
4159 * allocated.
4160 *
4161 * When an image being mapped (not a parent) is probed, we have the
4162 * pool name and pool id, image name and image id, and the snapshot
4163 * name. The only thing we're missing is the snapshot id.
4164 */
4165static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
4166{
4167 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4168 struct rbd_spec *spec = rbd_dev->spec;
4169 const char *pool_name;
4170 const char *image_name;
4171 const char *snap_name;
4172 int ret;
4173
4174 /*
4175 * An image being mapped will have the pool name (etc.), but
4176 * we need to look up the snapshot id.
4177 */
4178 if (spec->pool_name) {
4179 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4180 u64 snap_id;
4181
4182 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4183 if (snap_id == CEPH_NOSNAP)
4184 return -ENOENT;
4185 spec->snap_id = snap_id;
4186 } else {
4187 spec->snap_id = CEPH_NOSNAP;
4188 }
4189
4190 return 0;
4191 }
4192
4193 /* Get the pool name; we have to make our own copy of this */
4194
4195 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4196 if (!pool_name) {
4197 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4198 return -EIO;
4199 }
4200 pool_name = kstrdup(pool_name, GFP_KERNEL);
4201 if (!pool_name)
4202 return -ENOMEM;
4203
4204 /* Fetch the image name; tolerate failure here */
4205
4206 image_name = rbd_dev_image_name(rbd_dev);
4207 if (!image_name)
4208 rbd_warn(rbd_dev, "unable to get image name");
4209
4210 /* Look up the snapshot name, and make a copy */
4211
4212 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4213 if (IS_ERR(snap_name)) {
4214 ret = PTR_ERR(snap_name);
4215 goto out_err;
4216 }
4217
4218 spec->pool_name = pool_name;
4219 spec->image_name = image_name;
4220 spec->snap_name = snap_name;
4221
4222 return 0;
4223out_err:
4224 kfree(image_name);
4225 kfree(pool_name);
4226
4227 return ret;
4228}
4229
4230static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
4231{
4232 size_t size;
4233 int ret;
4234 void *reply_buf;
4235 void *p;
4236 void *end;
4237 u64 seq;
4238 u32 snap_count;
4239 struct ceph_snap_context *snapc;
4240 u32 i;
4241
4242 /*
4243 * We'll need room for the seq value (maximum snapshot id),
4244 * snapshot count, and an array of that many snapshot ids.
4245 * For now we have a fixed upper limit on the number we're
4246 * prepared to receive.
4247 */
4248 size = sizeof (__le64) + sizeof (__le32) +
4249 RBD_MAX_SNAP_COUNT * sizeof (__le64);
4250 reply_buf = kzalloc(size, GFP_KERNEL);
4251 if (!reply_buf)
4252 return -ENOMEM;
4253
4254 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4255 "rbd", "get_snapcontext", NULL, 0,
4256 reply_buf, size);
4257 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4258 if (ret < 0)
4259 goto out;
4260
4261 p = reply_buf;
4262 end = reply_buf + ret;
4263 ret = -ERANGE;
4264 ceph_decode_64_safe(&p, end, seq, out);
4265 ceph_decode_32_safe(&p, end, snap_count, out);
4266
4267 /*
4268 * Make sure the reported number of snapshot ids wouldn't go
4269 * beyond the end of our buffer. But before checking that,
4270 * make sure the computed size of the snapshot context we
4271 * allocate is representable in a size_t.
4272 */
4273 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4274 / sizeof (u64)) {
4275 ret = -EINVAL;
4276 goto out;
4277 }
4278 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4279 goto out;
4280 ret = 0;
4281
4282 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
4283 if (!snapc) {
4284 ret = -ENOMEM;
4285 goto out;
4286 }
4287 snapc->seq = seq;
4288 for (i = 0; i < snap_count; i++)
4289 snapc->snaps[i] = ceph_decode_64(&p);
4290
4291 ceph_put_snap_context(rbd_dev->header.snapc);
4292 rbd_dev->header.snapc = snapc;
4293
4294 dout(" snap context seq = %llu, snap_count = %u\n",
4295 (unsigned long long)seq, (unsigned int)snap_count);
4296out:
4297 kfree(reply_buf);
4298
4299 return ret;
4300}
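
/*
 * For reference, the get_snapcontext reply decoded above is laid
 * out as:
 *
 *	__le64 seq			highest snapshot id
 *	__le32 snap_count		number of ids that follow
 *	__le64 snaps[snap_count]	the snapshot ids
 */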
4301
4302static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4303 u64 snap_id)
4304{
4305 size_t size;
4306 void *reply_buf;
4307 __le64 snapid;
4308 int ret;
4309 void *p;
4310 void *end;
4311 char *snap_name;
4312
4313 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4314 reply_buf = kmalloc(size, GFP_KERNEL);
4315 if (!reply_buf)
4316 return ERR_PTR(-ENOMEM);
4317
4318 snapid = cpu_to_le64(snap_id);
4319 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4320 "rbd", "get_snapshot_name",
4321 &snapid, sizeof (snapid),
4322 reply_buf, size);
4323 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4324 if (ret < 0) {
4325 snap_name = ERR_PTR(ret);
4326 goto out;
4327 }
4328
4329 p = reply_buf;
4330 end = reply_buf + ret;
4331 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4332 if (IS_ERR(snap_name))
4333 goto out;
4334
4335 dout(" snap_id 0x%016llx snap_name = %s\n",
4336 (unsigned long long)snap_id, snap_name);
4337out:
4338 kfree(reply_buf);
4339
4340 return snap_name;
4341}
4342
4343static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4344{
4345 bool first_time = rbd_dev->header.object_prefix == NULL;
4346 int ret;
4347
4348 ret = rbd_dev_v2_image_size(rbd_dev);
4349 if (ret)
4350 return ret;
4351
4352 if (first_time) {
4353 ret = rbd_dev_v2_header_onetime(rbd_dev);
4354 if (ret)
4355 return ret;
4356 }
4357
4358 /*
4359 * If the image supports layering, get the parent info. We
4360 * need to probe the first time regardless. Thereafter we
4361 * only need to do so if there's a parent, to see if it has
4362 * disappeared due to the mapped image getting flattened.
4363 */
4364 if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4365 (first_time || rbd_dev->parent_spec)) {
4366 bool warn;
4367
4368 ret = rbd_dev_v2_parent_info(rbd_dev);
4369 if (ret)
4370 return ret;
4371
4372 /*
4373 * Print a warning if this is the initial probe and
4374 * the image has a parent. Don't print it if the
4375 * image now being probed is itself a parent. We
4376 * can tell at this point because we won't know its
4377 * pool name yet (just its pool id).
4378 */
4379 warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4380 if (first_time && warn)
4381 rbd_warn(rbd_dev, "WARNING: kernel layering is EXPERIMENTAL!");
4383 }
4384
4385 if (rbd_dev->spec->snap_id == CEPH_NOSNAP &&
4386 rbd_dev->mapping.size != rbd_dev->header.image_size)
4387 rbd_dev->mapping.size = rbd_dev->header.image_size;
4388
4389 ret = rbd_dev_v2_snap_context(rbd_dev);
4390 dout("rbd_dev_v2_snap_context returned %d\n", ret);
4391
4392 return ret;
4393}
4394
4395static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4396{
4397 struct device *dev;
4398 int ret;
4399
4400 dev = &rbd_dev->dev;
4401 dev->bus = &rbd_bus_type;
4402 dev->type = &rbd_device_type;
4403 dev->parent = &rbd_root_dev;
4404 dev->release = rbd_dev_device_release;
4405 dev_set_name(dev, "%d", rbd_dev->dev_id);
4406 ret = device_register(dev);
4407
4408 return ret;
4409}
4410
4411static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4412{
4413 device_unregister(&rbd_dev->dev);
4414}
4415
4416/*
4417 * Get a unique rbd identifier for the given new rbd_dev, and add
4418 * the rbd_dev to the global list.
4419 */
4420static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4421{
4422 int new_dev_id;
4423
4424 new_dev_id = ida_simple_get(&rbd_dev_id_ida,
4425 0, minor_to_rbd_dev_id(1 << MINORBITS),
4426 GFP_KERNEL);
4427 if (new_dev_id < 0)
4428 return new_dev_id;
4429
4430 rbd_dev->dev_id = new_dev_id;
4431
4432 spin_lock(&rbd_dev_list_lock);
4433 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4434 spin_unlock(&rbd_dev_list_lock);
4435
4436 dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4437
4438 return 0;
4439}
4440
4441/*
4442 * Remove an rbd_dev from the global list, and record that its
4443 * identifier is no longer in use.
4444 */
4445static void rbd_dev_id_put(struct rbd_device *rbd_dev)
4446{
4447 spin_lock(&rbd_dev_list_lock);
4448 list_del_init(&rbd_dev->node);
4449 spin_unlock(&rbd_dev_list_lock);
4450
4451 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4452
4453 dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4454}
4455
4456/*
4457 * Skips over white space at *buf, and updates *buf to point to the
4458 * first found non-space character (if any). Returns the length of
4459 * the token (string of non-white space characters) found. Note
4460 * that *buf must be terminated with '\0'.
4461 */
4462static inline size_t next_token(const char **buf)
4463{
4464 /*
4465 * These are the characters that produce nonzero for
4466 * isspace() in the "C" and "POSIX" locales.
4467 */
4468 const char *spaces = " \f\n\r\t\v";
4469
4470 *buf += strspn(*buf, spaces); /* Find start of token */
4471
4472 return strcspn(*buf, spaces); /* Return token length */
4473}
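
/*
 * Example (sketch):
 *
 *	const char *buf = "  pool image";
 *	size_t len = next_token(&buf);
 *
 * leaves buf pointing at "pool image" and returns len == 4.
 */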
4474
4475/*
4476 * Finds the next token in *buf, and if the provided token buffer is
4477 * big enough, copies the found token into it. The result, if
4478 * copied, is guaranteed to be terminated with '\0'. Note that *buf
4479 * must be terminated with '\0' on entry.
4480 *
4481 * Returns the length of the token found (not including the '\0').
4482 * Return value will be 0 if no token is found, and it will be >=
4483 * token_size if the token would not fit.
4484 *
4485 * The *buf pointer will be updated to point beyond the end of the
4486 * found token. Note that this occurs even if the token buffer is
4487 * too small to hold it.
4488 */
4489static inline size_t copy_token(const char **buf,
4490 char *token,
4491 size_t token_size)
4492{
4493 size_t len;
4494
4495 len = next_token(buf);
4496 if (len < token_size) {
4497 memcpy(token, *buf, len);
4498 *(token + len) = '\0';
4499 }
4500 *buf += len;
4501
4502 return len;
4503}
4504
4505/*
4506 * Finds the next token in *buf, dynamically allocates a buffer big
4507 * enough to hold a copy of it, and copies the token into the new
4508 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4509 * that a duplicate buffer is created even for a zero-length token.
4510 *
4511 * Returns a pointer to the newly-allocated duplicate, or a null
4512 * pointer if memory for the duplicate was not available. If
4513 * the lenp argument is a non-null pointer, the length of the token
4514 * (not including the '\0') is returned in *lenp.
4515 *
4516 * If successful, the *buf pointer will be updated to point beyond
4517 * the end of the found token.
4518 *
4519 * Note: uses GFP_KERNEL for allocation.
4520 */
4521static inline char *dup_token(const char **buf, size_t *lenp)
4522{
4523 char *dup;
4524 size_t len;
4525
4526 len = next_token(buf);
4527 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4528 if (!dup)
4529 return NULL;
4530 *(dup + len) = '\0';
4531 *buf += len;
4532
4533 if (lenp)
4534 *lenp = len;
4535
4536 return dup;
4537}
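
/*
 * Example (sketch): consuming whitespace-separated tokens in order:
 *
 *	const char *buf = "rbd foo";
 *	char *pool = dup_token(&buf, NULL);	returns "rbd"
 *	char *image = dup_token(&buf, NULL);	returns "foo"
 *	...
 *	kfree(image);
 *	kfree(pool);
 */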
4538
4539/*
4540 * Parse the options provided for an "rbd add" (i.e., rbd image
4541 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
4542 * and the data written is passed here via a NUL-terminated buffer.
4543 * Returns 0 if successful or an error code otherwise.
4544 *
4545 * The information extracted from these options is recorded in
4546 * the other parameters which return dynamically-allocated
4547 * structures:
4548 * ceph_opts
4549 * The address of a pointer that will refer to a ceph options
4550 * structure. Caller must release the returned pointer using
4551 * ceph_destroy_options() when it is no longer needed.
4552 * rbd_opts
4553 * Address of an rbd options pointer. Fully initialized by
4554 * this function; caller must release with kfree().
4555 * spec
4556 * Address of an rbd image specification pointer. Fully
4557 * initialized by this function based on parsed options.
4558 * Caller must release with rbd_spec_put().
4559 *
4560 * The options passed take this form:
4561 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
4562 * where:
4563 * <mon_addrs>
4564 * A comma-separated list of one or more monitor addresses.
4565 * A monitor address is an ip address, optionally followed
4566 * by a port number (separated by a colon).
4567 * I.e.: ip1[:port1][,ip2[:port2]...]
4568 * <options>
4569 * A comma-separated list of ceph and/or rbd options.
4570 * <pool_name>
4571 * The name of the rados pool containing the rbd image.
4572 * <image_name>
4573 * The name of the image in that pool to map.
4574 * <snap_name>
4575 * An optional snapshot name. If provided, the mapping will
4576 * present data from the image at the time that snapshot was
4577 * created. The image head is used if no snapshot name is
4578 * provided. Snapshot mappings are always read-only.
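 *
 * For example (monitor address and client name are hypothetical):
 *
 *	$ echo "1.2.3.4:6789 name=admin rbd foo" > /sys/bus/rbd/add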
4579 */
4580static int rbd_add_parse_args(const char *buf,
4581 struct ceph_options **ceph_opts,
4582 struct rbd_options **opts,
4583 struct rbd_spec **rbd_spec)
4584{
4585 size_t len;
4586 char *options;
4587 const char *mon_addrs;
4588 char *snap_name;
4589 size_t mon_addrs_size;
4590 struct rbd_spec *spec = NULL;
4591 struct rbd_options *rbd_opts = NULL;
4592 struct ceph_options *copts;
4593 int ret;
4594
4595 /* The first four tokens are required */
4596
4597 len = next_token(&buf);
4598 if (!len) {
4599 rbd_warn(NULL, "no monitor address(es) provided");
4600 return -EINVAL;
4601 }
4602 mon_addrs = buf;
4603 mon_addrs_size = len + 1;
4604 buf += len;
4605
4606 ret = -EINVAL;
4607 options = dup_token(&buf, NULL);
4608 if (!options)
4609 return -ENOMEM;
4610 if (!*options) {
4611 rbd_warn(NULL, "no options provided");
4612 goto out_err;
4613 }
4614
4615 spec = rbd_spec_alloc();
4616 if (!spec)
4617 goto out_mem;
4618
4619 spec->pool_name = dup_token(&buf, NULL);
4620 if (!spec->pool_name)
4621 goto out_mem;
4622 if (!*spec->pool_name) {
4623 rbd_warn(NULL, "no pool name provided");
4624 goto out_err;
4625 }
4626
4627 spec->image_name = dup_token(&buf, NULL);
4628 if (!spec->image_name)
4629 goto out_mem;
4630 if (!*spec->image_name) {
4631 rbd_warn(NULL, "no image name provided");
4632 goto out_err;
4633 }
4634
4635 /*
4636 * Snapshot name is optional; default is to use "-"
4637 * (indicating the head/no snapshot).
4638 */
4639 len = next_token(&buf);
4640 if (!len) {
4641 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4642 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4643 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
4644 ret = -ENAMETOOLONG;
4645 goto out_err;
4646 }
4647 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4648 if (!snap_name)
4649 goto out_mem;
4650 *(snap_name + len) = '\0';
4651 spec->snap_name = snap_name;
4652
4653 /* Initialize all rbd options to the defaults */
4654
4655 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4656 if (!rbd_opts)
4657 goto out_mem;
4658
4659 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4660
4661 copts = ceph_parse_options(options, mon_addrs,
4662 mon_addrs + mon_addrs_size - 1,
4663 parse_rbd_opts_token, rbd_opts);
4664 if (IS_ERR(copts)) {
4665 ret = PTR_ERR(copts);
4666 goto out_err;
4667 }
4668 kfree(options);
4669
4670 *ceph_opts = copts;
4671 *opts = rbd_opts;
4672 *rbd_spec = spec;
4673
4674 return 0;
4675out_mem:
4676 ret = -ENOMEM;
4677out_err:
4678 kfree(rbd_opts);
4679 rbd_spec_put(spec);
4680 kfree(options);
4681
4682 return ret;
4683}
4684
4685/*
4686 * An rbd format 2 image has a unique identifier, distinct from the
4687 * name given to it by the user. Internally, that identifier is
4688 * what's used to specify the names of objects related to the image.
4689 *
4690 * A special "rbd id" object is used to map an rbd image name to its
4691 * id. If that object doesn't exist, then there is no v2 rbd image
4692 * with the supplied name.
4693 *
4694 * This function will fill in the given rbd_dev's image_id field if
4695 * it can be determined, and in that case will return 0. If any
4696 * errors occur a negative errno will be returned and the rbd_dev's
4697 * image_id field will be unchanged (and should be NULL).
4698 */
4699static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4700{
4701 int ret;
4702 size_t size;
4703 char *object_name;
4704 void *response;
4705 char *image_id;
4706
4707 /*
4708 * When probing a parent image, the image id is already
4709 * known (and the image name likely is not). There's no
4710 * need to fetch the image id again in this case. We
4711 * do still need to set the image format though.
4712 */
4713 if (rbd_dev->spec->image_id) {
4714 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4715
4716 return 0;
4717 }
4718
4719 /*
4720 * First, see if the format 2 image id file exists, and if
4721 * so, get the image's persistent id from it.
4722 */
4723 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4724 object_name = kmalloc(size, GFP_NOIO);
4725 if (!object_name)
4726 return -ENOMEM;
4727 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4728 dout("rbd id object name is %s\n", object_name);
4729
4730 /* Response will be an encoded string, which includes a length */
4731
4732 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4733 response = kzalloc(size, GFP_NOIO);
4734 if (!response) {
4735 ret = -ENOMEM;
4736 goto out;
4737 }
4738
4739 /* If it doesn't exist we'll assume it's a format 1 image */
4740
4741 ret = rbd_obj_method_sync(rbd_dev, object_name,
4742 "rbd", "get_id", NULL, 0,
4743 response, RBD_IMAGE_ID_LEN_MAX);
4744 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4745 if (ret == -ENOENT) {
4746 image_id = kstrdup("", GFP_KERNEL);
4747 ret = image_id ? 0 : -ENOMEM;
4748 if (!ret)
4749 rbd_dev->image_format = 1;
4750 } else if (ret > sizeof (__le32)) {
4751 void *p = response;
4752
4753 image_id = ceph_extract_encoded_string(&p, p + ret,
4754 NULL, GFP_NOIO);
4755 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4756 if (!ret)
4757 rbd_dev->image_format = 2;
4758 } else {
4759 ret = -EINVAL;
4760 }
4761
4762 if (!ret) {
4763 rbd_dev->spec->image_id = image_id;
4764 dout("image_id is %s\n", image_id);
4765 }
4766out:
4767 kfree(response);
4768 kfree(object_name);
4769
4770 return ret;
4771}
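
/*
 * Example (sketch): for a format 2 image named "foo", the id above
 * is read from the object named RBD_ID_PREFIX "foo" (that is,
 * "rbd_id.foo"); if that object does not exist the image is assumed
 * to be format 1 and its image id is recorded as the empty string.
 */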
4772
4773/*
4774 * Undo whatever state changes were made by a v1 or v2 header
4775 * info call.
4776 */
4777static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4778{
4779 struct rbd_image_header *header;
4780
4781 /* Drop parent reference unless it's already been done (or none) */
4782
4783 if (rbd_dev->parent_overlap)
4784 rbd_dev_parent_put(rbd_dev);
4785
4786 /* Free dynamic fields from the header, then zero it out */
4787
4788 header = &rbd_dev->header;
4789 ceph_put_snap_context(header->snapc);
4790 kfree(header->snap_sizes);
4791 kfree(header->snap_names);
4792 kfree(header->object_prefix);
4793 memset(header, 0, sizeof (*header));
4794}
4795
4796static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4797{
4798 int ret;
4799
4800 ret = rbd_dev_v2_object_prefix(rbd_dev);
4801 if (ret)
4802 goto out_err;
4803
4804 /*
4805 * Get and check the features for the image. Currently the
4806 * features are assumed to never change.
4807 */
4808 ret = rbd_dev_v2_features(rbd_dev);
4809 if (ret)
4810 goto out_err;
4811
4812 /* If the image supports fancy striping, get its parameters */
4813
4814 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4815 ret = rbd_dev_v2_striping_info(rbd_dev);
4816 if (ret < 0)
4817 goto out_err;
4818 }
4819 /* No support for crypto and compression type format 2 images */
4820
4821 return 0;
4822out_err:
4823 rbd_dev->header.features = 0;
4824 kfree(rbd_dev->header.object_prefix);
4825 rbd_dev->header.object_prefix = NULL;
4826
4827 return ret;
4828}
4829
4830static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
4831{
4832 struct rbd_device *parent = NULL;
4833 struct rbd_spec *parent_spec;
4834 struct rbd_client *rbdc;
4835 int ret;
4836
4837 if (!rbd_dev->parent_spec)
4838 return 0;
4839 /*
4840 * We need to pass a reference to the client and the parent
4841 * spec when creating the parent rbd_dev. Images related by
4842 * parent/child relationships always share both.
4843 */
4844 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4845 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4846
4847 ret = -ENOMEM;
4848 parent = rbd_dev_create(rbdc, parent_spec);
4849 if (!parent)
4850 goto out_err;
4851
4852 ret = rbd_dev_image_probe(parent, false);
4853 if (ret < 0)
4854 goto out_err;
4855 rbd_dev->parent = parent;
4856 atomic_set(&rbd_dev->parent_ref, 1);
4857
4858 return 0;
4859out_err:
4860 if (parent) {
4861 rbd_dev_unparent(rbd_dev);
4863 rbd_dev_destroy(parent);
4864 } else {
4865 rbd_put_client(rbdc);
4866 rbd_spec_put(parent_spec);
4867 }
4868
4869 return ret;
4870}
4871
4872static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4873{
4874 int ret;
4875
4876 /* Get an id and fill in device name. */
4877
4878 ret = rbd_dev_id_get(rbd_dev);
4879 if (ret)
4880 return ret;
4881
4882 BUILD_BUG_ON(DEV_NAME_LEN
4883 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4884 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4885
4886 /* Record our major and minor device numbers. */
4887
4888 if (!single_major) {
4889 ret = register_blkdev(0, rbd_dev->name);
4890 if (ret < 0)
4891 goto err_out_id;
4892
4893 rbd_dev->major = ret;
4894 rbd_dev->minor = 0;
4895 } else {
4896 rbd_dev->major = rbd_major;
4897 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
4898 }
4899
4900 /* Set up the blkdev mapping. */
4901
4902 ret = rbd_init_disk(rbd_dev);
4903 if (ret)
4904 goto err_out_blkdev;
4905
4906 ret = rbd_dev_mapping_set(rbd_dev);
4907 if (ret)
4908 goto err_out_disk;
4909 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4910
4911 ret = rbd_bus_add_dev(rbd_dev);
4912 if (ret)
4913 goto err_out_mapping;
4914
4915 /* Everything's ready. Announce the disk to the world. */
4916
4917 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4918 add_disk(rbd_dev->disk);
4919
4920 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4921 (unsigned long long) rbd_dev->mapping.size);
4922
4923 return ret;
4924
4925err_out_mapping:
4926 rbd_dev_mapping_clear(rbd_dev);
4927err_out_disk:
4928 rbd_free_disk(rbd_dev);
4929err_out_blkdev:
4930 if (!single_major)
4931 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4932err_out_id:
4933 rbd_dev_id_put(rbd_dev);
4935
4936 return ret;
4937}
4938
4939static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4940{
4941 struct rbd_spec *spec = rbd_dev->spec;
4942 size_t size;
4943
4944 /* Record the header object name for this rbd image. */
4945
4946 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4947
4948 if (rbd_dev->image_format == 1)
4949 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4950 else
4951 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4952
4953 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4954 if (!rbd_dev->header_name)
4955 return -ENOMEM;
4956
4957 if (rbd_dev->image_format == 1)
4958 sprintf(rbd_dev->header_name, "%s%s",
4959 spec->image_name, RBD_SUFFIX);
4960 else
4961 sprintf(rbd_dev->header_name, "%s%s",
4962 RBD_HEADER_PREFIX, spec->image_id);
4963 return 0;
4964}
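
/*
 * Example (sketch): with RBD_SUFFIX ".rbd" and RBD_HEADER_PREFIX
 * "rbd_header." (from rbd_types.h), an image named "foo" with image
 * id "abc123" gets header object "foo.rbd" for format 1, or
 * "rbd_header.abc123" for format 2.
 */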
4965
4966static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4967{
4968 rbd_dev_unprobe(rbd_dev);
4969 kfree(rbd_dev->header_name);
4970 rbd_dev->header_name = NULL;
4971 rbd_dev->image_format = 0;
4972 kfree(rbd_dev->spec->image_id);
4973 rbd_dev->spec->image_id = NULL;
4974
4975 rbd_dev_destroy(rbd_dev);
4976}
4977
4978/*
4979 * Probe for the existence of the header object for the given rbd
4980 * device. If this image is the one being mapped (i.e., not a
4981 * parent), initiate a watch on its header object before using that
4982 * object to get detailed information about the rbd image.
4983 */
4984static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
4985{
4986 int ret;
4987
4988 /*
4989 * Get the id from the image id object. Unless there's an
4990 * error, rbd_dev->spec->image_id will be filled in with
4991 * a dynamically-allocated string, and rbd_dev->image_format
4992 * will be set to either 1 or 2.
4993 */
4994 ret = rbd_dev_image_id(rbd_dev);
4995 if (ret)
4996 return ret;
4997 rbd_assert(rbd_dev->spec->image_id);
4998 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4999
5000 ret = rbd_dev_header_name(rbd_dev);
5001 if (ret)
5002 goto err_out_format;
5003
5004 if (mapping) {
5005 ret = rbd_dev_header_watch_sync(rbd_dev);
5006 if (ret)
5007 goto out_header_name;
5008 }
5009
5010 if (rbd_dev->image_format == 1)
5011 ret = rbd_dev_v1_header_info(rbd_dev);
5012 else
5013 ret = rbd_dev_v2_header_info(rbd_dev);
5014 if (ret)
5015 goto err_out_watch;
5016
5017 ret = rbd_dev_spec_update(rbd_dev);
5018 if (ret)
5019 goto err_out_probe;
5020
5021 ret = rbd_dev_probe_parent(rbd_dev);
5022 if (ret)
5023 goto err_out_probe;
5024
5025 dout("discovered format %u image, header name is %s\n",
5026 rbd_dev->image_format, rbd_dev->header_name);
5027
5028 return 0;
5029err_out_probe:
5030 rbd_dev_unprobe(rbd_dev);
5031err_out_watch:
5032 if (mapping)
5033 rbd_dev_header_unwatch_sync(rbd_dev);
5034out_header_name:
5035 kfree(rbd_dev->header_name);
5036 rbd_dev->header_name = NULL;
5037err_out_format:
5038 rbd_dev->image_format = 0;
5039 kfree(rbd_dev->spec->image_id);
5040 rbd_dev->spec->image_id = NULL;
5041
5042 dout("probe failed, returning %d\n", ret);
5043
5044 return ret;
5045}
5046
5047static ssize_t do_rbd_add(struct bus_type *bus,
5048 const char *buf,
5049 size_t count)
5050{
5051 struct rbd_device *rbd_dev = NULL;
5052 struct ceph_options *ceph_opts = NULL;
5053 struct rbd_options *rbd_opts = NULL;
5054 struct rbd_spec *spec = NULL;
5055 struct rbd_client *rbdc;
5056 struct ceph_osd_client *osdc;
5057 bool read_only;
5058 int rc = -ENOMEM;
5059
5060 if (!try_module_get(THIS_MODULE))
5061 return -ENODEV;
5062
5063 /* parse add command */
5064 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5065 if (rc < 0)
5066 goto err_out_module;
5067 read_only = rbd_opts->read_only;
5068 kfree(rbd_opts);
5069 rbd_opts = NULL; /* done with this */
5070
5071 rbdc = rbd_get_client(ceph_opts);
5072 if (IS_ERR(rbdc)) {
5073 rc = PTR_ERR(rbdc);
5074 goto err_out_args;
5075 }
5076
5077 /* pick the pool */
5078 osdc = &rbdc->client->osdc;
5079 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
5080 if (rc < 0)
5081 goto err_out_client;
5082 spec->pool_id = (u64)rc;
5083
5084 /* The ceph file layout needs to fit pool id in 32 bits */
5085
5086 if (spec->pool_id > (u64)U32_MAX) {
5087 rbd_warn(NULL, "pool id too large (%llu > %u)",
5088 (unsigned long long)spec->pool_id, U32_MAX);
5089 rc = -EIO;
5090 goto err_out_client;
5091 }
5092
5093 rbd_dev = rbd_dev_create(rbdc, spec);
5094 if (!rbd_dev)
5095 goto err_out_client;
5096 rbdc = NULL; /* rbd_dev now owns this */
5097 spec = NULL; /* rbd_dev now owns this */
5098
5099 rc = rbd_dev_image_probe(rbd_dev, true);
5100 if (rc < 0)
5101 goto err_out_rbd_dev;
5102
5103 /* If we are mapping a snapshot it must be marked read-only */
5104
5105 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
5106 read_only = true;
5107 rbd_dev->mapping.read_only = read_only;
5108
5109 rc = rbd_dev_device_setup(rbd_dev);
5110 if (rc) {
5111 /*
5112 * rbd_dev_header_unwatch_sync() can't be moved into
5113 * rbd_dev_image_release() without refactoring, see
5114 * commit 1f3ef78861ac.
5115 */
5116 rbd_dev_header_unwatch_sync(rbd_dev);
5117 rbd_dev_image_release(rbd_dev);
5118 goto err_out_module;
5119 }
5120
5121 return count;
5122
5123err_out_rbd_dev:
5124 rbd_dev_destroy(rbd_dev);
5125err_out_client:
5126 rbd_put_client(rbdc);
5127err_out_args:
5128 rbd_spec_put(spec);
5129err_out_module:
5130 module_put(THIS_MODULE);
5131
5132 dout("Error adding device %s\n", buf);
5133
5134 return (ssize_t)rc;
5135}
5136
5137static ssize_t rbd_add(struct bus_type *bus,
5138 const char *buf,
5139 size_t count)
5140{
5141 if (single_major)
5142 return -EINVAL;
5143
5144 return do_rbd_add(bus, buf, count);
5145}
5146
5147static ssize_t rbd_add_single_major(struct bus_type *bus,
5148 const char *buf,
5149 size_t count)
5150{
5151 return do_rbd_add(bus, buf, count);
5152}
5153
5154static void rbd_dev_device_release(struct device *dev)
5155{
5156 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5157
5158 rbd_free_disk(rbd_dev);
5159 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5160 rbd_dev_mapping_clear(rbd_dev);
5161 if (!single_major)
5162 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5163 rbd_dev_id_put(rbd_dev);
5165}
5166
5167static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5168{
5169 while (rbd_dev->parent) {
5170 struct rbd_device *first = rbd_dev;
5171 struct rbd_device *second = first->parent;
5172 struct rbd_device *third;
5173
5174 /*
5175 * Follow to the parent with no grandparent and
5176 * remove it.
5177 */
5178 while (second && (third = second->parent)) {
5179 first = second;
5180 second = third;
5181 }
5182 rbd_assert(second);
5183 rbd_dev_image_release(second);
5184 first->parent = NULL;
5185 first->parent_overlap = 0;
5186
5187 rbd_assert(first->parent_spec);
5188 rbd_spec_put(first->parent_spec);
5189 first->parent_spec = NULL;
5190 }
5191}
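
/*
 * Example (sketch): for a chain base <- mid <- mapped, the loop above
 * releases "base" on its first pass and "mid" on its second, always
 * tearing down the most remote ancestor still attached.
 */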
5192
5193static ssize_t do_rbd_remove(struct bus_type *bus,
5194 const char *buf,
5195 size_t count)
5196{
5197 struct rbd_device *rbd_dev = NULL;
5198 struct list_head *tmp;
5199 int dev_id;
5200 unsigned long ul;
5201 bool already = false;
5202 int ret;
5203
5204 ret = kstrtoul(buf, 10, &ul);
5205 if (ret)
5206 return ret;
5207
5208 /* convert to int; abort if we lost anything in the conversion */
5209 dev_id = (int)ul;
5210 if (dev_id != ul)
5211 return -EINVAL;
5212
5213 ret = -ENOENT;
5214 spin_lock(&rbd_dev_list_lock);
5215 list_for_each(tmp, &rbd_dev_list) {
5216 rbd_dev = list_entry(tmp, struct rbd_device, node);
5217 if (rbd_dev->dev_id == dev_id) {
5218 ret = 0;
5219 break;
5220 }
5221 }
5222 if (!ret) {
5223 spin_lock_irq(&rbd_dev->lock);
5224 if (rbd_dev->open_count)
5225 ret = -EBUSY;
5226 else
5227 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5228 &rbd_dev->flags);
5229 spin_unlock_irq(&rbd_dev->lock);
5230 }
5231 spin_unlock(&rbd_dev_list_lock);
5232 if (ret < 0 || already)
5233 return ret;
5234
5235 rbd_dev_header_unwatch_sync(rbd_dev);
5236 /*
5237 * flush remaining watch callbacks - these must be complete
5238 * before the osd_client is shut down
5239 */
5240 dout("%s: flushing notifies\n", __func__);
5241 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5242
5243 /*
5244 * Don't free anything from rbd_dev->disk until after all
5245 * notifies are completely processed. Otherwise
5246 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
5247 * in a potential use after free of rbd_dev->disk or rbd_dev.
5248 */
5249 rbd_bus_del_dev(rbd_dev);
5250 rbd_dev_image_release(rbd_dev);
5251 module_put(THIS_MODULE);
5252
5253 return count;
5254}
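
/*
 * Example (sketch): unmapping the device with id 2 (fails with
 * -EBUSY while the block device is still open):
 *
 *	$ echo 2 > /sys/bus/rbd/remove
 */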
5255
5256static ssize_t rbd_remove(struct bus_type *bus,
5257 const char *buf,
5258 size_t count)
5259{
5260 if (single_major)
5261 return -EINVAL;
5262
5263 return do_rbd_remove(bus, buf, count);
5264}
5265
5266static ssize_t rbd_remove_single_major(struct bus_type *bus,
5267 const char *buf,
5268 size_t count)
5269{
5270 return do_rbd_remove(bus, buf, count);
5271}
5272
5273/*
5274 * create control files in sysfs
5275 * /sys/bus/rbd/...
5276 */
5277static int rbd_sysfs_init(void)
5278{
5279 int ret;
5280
5281 ret = device_register(&rbd_root_dev);
5282 if (ret < 0)
5283 return ret;
5284
5285 ret = bus_register(&rbd_bus_type);
5286 if (ret < 0)
5287 device_unregister(&rbd_root_dev);
5288
5289 return ret;
5290}
5291
5292static void rbd_sysfs_cleanup(void)
5293{
5294 bus_unregister(&rbd_bus_type);
5295 device_unregister(&rbd_root_dev);
5296}
5297
5298static int rbd_slab_init(void)
5299{
5300 rbd_assert(!rbd_img_request_cache);
5301 rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5302 sizeof (struct rbd_img_request),
5303 __alignof__(struct rbd_img_request),
5304 0, NULL);
5305 if (!rbd_img_request_cache)
5306 return -ENOMEM;
5307
5308 rbd_assert(!rbd_obj_request_cache);
5309 rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5310 sizeof (struct rbd_obj_request),
5311 __alignof__(struct rbd_obj_request),
5312 0, NULL);
5313 if (!rbd_obj_request_cache)
5314 goto out_err;
5315
5316 rbd_assert(!rbd_segment_name_cache);
5317 rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
5318 CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
5319 if (rbd_segment_name_cache)
5320 return 0;
5321out_err:
5322 if (rbd_obj_request_cache) {
5323 kmem_cache_destroy(rbd_obj_request_cache);
5324 rbd_obj_request_cache = NULL;
5325 }
5326
5327 kmem_cache_destroy(rbd_img_request_cache);
5328 rbd_img_request_cache = NULL;
5329
5330 return -ENOMEM;
5331}
5332
5333static void rbd_slab_exit(void)
5334{
5335 rbd_assert(rbd_segment_name_cache);
5336 kmem_cache_destroy(rbd_segment_name_cache);
5337 rbd_segment_name_cache = NULL;
5338
5339 rbd_assert(rbd_obj_request_cache);
5340 kmem_cache_destroy(rbd_obj_request_cache);
5341 rbd_obj_request_cache = NULL;
5342
5343 rbd_assert(rbd_img_request_cache);
5344 kmem_cache_destroy(rbd_img_request_cache);
5345 rbd_img_request_cache = NULL;
5346}
5347
5348static int __init rbd_init(void)
5349{
5350 int rc;
5351
5352 if (!libceph_compatible(NULL)) {
5353 rbd_warn(NULL, "libceph incompatibility (quitting)");
5354 return -EINVAL;
5355 }
5356
5357 rc = rbd_slab_init();
5358 if (rc)
5359 return rc;
5360
5361 if (single_major) {
5362 rbd_major = register_blkdev(0, RBD_DRV_NAME);
5363 if (rbd_major < 0) {
5364 rc = rbd_major;
5365 goto err_out_slab;
5366 }
5367 }
5368
5369 rc = rbd_sysfs_init();
5370 if (rc)
5371 goto err_out_blkdev;
5372
5373 if (single_major)
5374 pr_info("loaded (major %d)\n", rbd_major);
5375 else
5376 pr_info("loaded\n");
5377
5378 return 0;
5379
5380err_out_blkdev:
5381 if (single_major)
5382 unregister_blkdev(rbd_major, RBD_DRV_NAME);
5383err_out_slab:
5384 rbd_slab_exit();
5385 return rc;
5386}
5387
5388static void __exit rbd_exit(void)
5389{
5390 rbd_sysfs_cleanup();
5391 if (single_major)
5392 unregister_blkdev(rbd_major, RBD_DRV_NAME);
5393 rbd_slab_exit();
5394}
5395
5396module_init(rbd_init);
5397module_exit(rbd_exit);
5398
5399MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5400MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5401MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5402/* following authorship retained from original osdblk.c */
5403MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5404
5405MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
5406MODULE_LICENSE("GPL");