/*
   drbd.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
   from Logicworks, Inc. for making SDP replication support possible.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING. If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */
28
29#include <linux/module.h>
30#include <linux/drbd.h>
31#include <asm/uaccess.h>
32#include <asm/types.h>
33#include <net/sock.h>
34#include <linux/ctype.h>
35#include <linux/mutex.h>
36#include <linux/fs.h>
37#include <linux/file.h>
38#include <linux/proc_fs.h>
39#include <linux/init.h>
40#include <linux/mm.h>
41#include <linux/memcontrol.h>
42#include <linux/mm_inline.h>
43#include <linux/slab.h>
44#include <linux/random.h>
45#include <linux/reboot.h>
46#include <linux/notifier.h>
47#include <linux/kthread.h>
48
49#define __KERNEL_SYSCALLS__
50#include <linux/unistd.h>
51#include <linux/vmalloc.h>
52
53#include <linux/drbd_limits.h>
54#include "drbd_int.h"
55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
56
57#include "drbd_vli.h"
58
59struct after_state_chg_work {
60 struct drbd_work w;
61 union drbd_state os;
62 union drbd_state ns;
63 enum chg_state_flags flags;
64 struct completion *done;
65};
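/* Deferred work item: __drbd_set_state() below allocates one of these with
 * GFP_ATOMIC while holding req_lock and queues w_after_state_ch(), so that
 * after_state_ch() -- which may sleep -- runs later in worker context. */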

static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);

int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags);
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);

MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
	      "Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);

#include <linux/moduleparam.h>
/* allow_open_on_secondary */
MODULE_PARM_DESC(allow_oos, "DONT USE!");
/* thanks to these macros, if compiled into the kernel (not-module),
 * this becomes the boot parameter drbd.minor_count */
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);

#ifdef CONFIG_DRBD_FAULT_INJECTION
int enable_faults;
int fault_rate;
static int fault_count;
int fault_devs;
/* bitmap of enabled faults */
module_param(enable_faults, int, 0664);
/* fault rate % value - applies to all enabled faults */
module_param(fault_rate, int, 0664);
/* count of faults inserted */
module_param(fault_count, int, 0664);
/* bitmap of devices to insert faults on */
module_param(fault_devs, int, 0644);
#endif

/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
int disable_sendpage;
int allow_oos;
unsigned int cn_idx = CN_IDX_DRBD;
int proc_details;	/* Detail level in proc drbd */

/* Module parameter for setting the user mode helper program
 * to run. Default is /sbin/drbdadm */
char usermode_helper[80] = "/sbin/drbdadm";

module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);

/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 * as member "struct gendisk *vdisk;"
 */
struct drbd_conf **minor_table;

struct kmem_cache *drbd_request_cache;
struct kmem_cache *drbd_ee_cache;	/* epoch entries */
struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
mempool_t *drbd_request_mempool;
mempool_t *drbd_ee_mempool;

/* I do not use a standard mempool, because:
   1) I want to hand out the pre-allocated objects first.
   2) I want to be able to interrupt sleeping allocation with a signal.
   Note: This is a singly linked list, the next pointer is the private
	 member of struct page.
 */
struct page *drbd_pp_pool;
spinlock_t drbd_pp_lock;
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;

DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);

static const struct block_device_operations drbd_ops = {
	.owner = THIS_MODULE,
	.open = drbd_open,
	.release = drbd_release,
};

#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))

#ifdef __CHECKER__
/* When checking with sparse, and this is an inline function, sparse will
   give tons of false positives. When this is a real function, sparse works.
 */
int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
	int io_allowed;

	atomic_inc(&mdev->local_cnt);
	io_allowed = (mdev->state.disk >= mins);
	if (!io_allowed) {
		if (atomic_dec_and_test(&mdev->local_cnt))
			wake_up(&mdev->misc_wait);
	}
	return io_allowed;
}

#endif

/**
 * DOC: The transfer log
 *
 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 * of the list. There is always at least one &struct drbd_tl_epoch object.
 *
 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
 * attached.
 */
static int tl_init(struct drbd_conf *mdev)
{
	struct drbd_tl_epoch *b;

	/* during device minor initialization, we may well use GFP_KERNEL */
	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
	if (!b)
		return 0;
	INIT_LIST_HEAD(&b->requests);
	INIT_LIST_HEAD(&b->w.list);
	b->next = NULL;
	b->br_number = 4711;
	b->n_writes = 0;
	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */

	mdev->oldest_tle = b;
	mdev->newest_tle = b;
	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);

	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;

	return 1;
}

static void tl_cleanup(struct drbd_conf *mdev)
{
	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
	kfree(mdev->oldest_tle);
	mdev->oldest_tle = NULL;
	kfree(mdev->unused_spare_tle);
	mdev->unused_spare_tle = NULL;
	kfree(mdev->tl_hash);
	mdev->tl_hash = NULL;
	mdev->tl_hash_s = 0;
}

/**
 * _tl_add_barrier() - Adds a barrier to the transfer log
 * @mdev:	DRBD device.
 * @new:	Barrier to be added before the current head of the TL.
 *
 * The caller must hold the req_lock.
 */
void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
{
	struct drbd_tl_epoch *newest_before;

	INIT_LIST_HEAD(&new->requests);
	INIT_LIST_HEAD(&new->w.list);
	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
	new->next = NULL;
	new->n_writes = 0;

	newest_before = mdev->newest_tle;
	/* never send a barrier number == 0, because that is special-cased
	 * when using TCQ for our write ordering code */
	new->br_number = (newest_before->br_number+1) ?: 1;
	if (mdev->newest_tle != new) {
		mdev->newest_tle->next = new;
		mdev->newest_tle = new;
	}
}
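/* Typical use, as in tl_release() below: while still holding req_lock, the
 * just-acked epoch object may be recycled as the new barrier via
 * _tl_add_barrier(mdev, b) instead of being freed. */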

/**
 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 * @mdev:	DRBD device.
 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 * @set_size:	Expected number of requests before that barrier.
 *
 * In case the passed barrier_nr or set_size does not match the oldest
 * &struct drbd_tl_epoch objects this function will cause a termination
 * of the connection.
 */
void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
		unsigned int set_size)
{
	struct drbd_tl_epoch *b, *nob; /* next old barrier */
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	b = mdev->oldest_tle;

	/* first some paranoia code */
	if (b == NULL) {
		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
			barrier_nr);
		goto bail;
	}
	if (b->br_number != barrier_nr) {
		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
			barrier_nr, b->br_number);
		goto bail;
	}
	if (b->n_writes != set_size) {
		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
			barrier_nr, set_size, b->n_writes);
		goto bail;
	}

	/* Clean up list of requests processed during current epoch */
	list_for_each_safe(le, tle, &b->requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		_req_mod(r, barrier_acked);
	}
	/* There could be requests on the list waiting for completion
	   of the write to the local disk. To avoid corruptions of
	   slab's data structures we have to remove the list's head.

	   Also there could have been a barrier ack out of sequence, overtaking
	   the write acks - which would be a bug and violating write ordering.
	   To not deadlock in case we lose connection while such requests are
	   still pending, we need some way to find them for the
	   _req_mod(connection_lost_while_pending).

	   These have been list_move'd to the out_of_sequence_requests list in
	   _req_mod(, barrier_acked) above.
	   */
	list_del_init(&b->requests);

	nob = b->next;
	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
		_tl_add_barrier(mdev, b);
		if (nob)
			mdev->oldest_tle = nob;
		/* if nob == NULL b was the only barrier, and becomes the new
		   barrier. Therefore mdev->oldest_tle already points to b */
	} else {
		D_ASSERT(nob != NULL);
		mdev->oldest_tle = nob;
		kfree(b);
	}

	spin_unlock_irq(&mdev->req_lock);
	dec_ap_pending(mdev);

	return;

bail:
	spin_unlock_irq(&mdev->req_lock);
	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}


/**
 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 * @mdev:	DRBD device.
 * @what:	The action/event to perform with all request objects
 *
 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 * restart_frozen_disk_io.
 */
static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	struct drbd_tl_epoch *b, *tmp, **pn;
	struct list_head *le, *tle, carry_reads;
	struct drbd_request *req;
	int rv, n_writes, n_reads;

	b = mdev->oldest_tle;
	pn = &mdev->oldest_tle;
	while (b) {
		n_writes = 0;
		n_reads = 0;
		INIT_LIST_HEAD(&carry_reads);
		list_for_each_safe(le, tle, &b->requests) {
			req = list_entry(le, struct drbd_request, tl_requests);
			rv = _req_mod(req, what);

			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
		}
		tmp = b->next;

		if (n_writes) {
			if (what == resend) {
				b->n_writes = n_writes;
				if (b->w.cb == NULL) {
					b->w.cb = w_send_barrier;
					inc_ap_pending(mdev);
					set_bit(CREATE_BARRIER, &mdev->flags);
				}

				drbd_queue_work(&mdev->data.work, &b->w);
			}
			pn = &b->next;
		} else {
			if (n_reads)
				list_add(&carry_reads, &b->requests);
			/* there could still be requests on that ring list,
			 * in case local io is still pending */
			list_del(&b->requests);

			/* dec_ap_pending corresponding to queue_barrier.
			 * the newest barrier may not have been queued yet,
			 * in which case w.cb is still NULL. */
			if (b->w.cb != NULL)
				dec_ap_pending(mdev);

			if (b == mdev->newest_tle) {
				/* recycle, but reinit! */
				D_ASSERT(tmp == NULL);
				INIT_LIST_HEAD(&b->requests);
				list_splice(&carry_reads, &b->requests);
				INIT_LIST_HEAD(&b->w.list);
				b->w.cb = NULL;
				b->br_number = net_random();
				b->n_writes = 0;

				*pn = b;
				break;
			}
			*pn = tmp;
			kfree(b);
		}
		b = tmp;
		list_splice(&carry_reads, &b->requests);
	}
}


/**
 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 * @mdev:	DRBD device.
 *
 * This is called after the connection to the peer was lost. The storage covered
 * by the requests on the transfer log gets marked as out of sync. Called from the
 * receiver thread and the worker thread.
 */
void tl_clear(struct drbd_conf *mdev)
{
	struct list_head *le, *tle;
	struct drbd_request *r;

	spin_lock_irq(&mdev->req_lock);

	_tl_restart(mdev, connection_lost_while_pending);

	/* we expect this list to be empty. */
	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));

	/* but just in case, clean it up anyways! */
	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
		r = list_entry(le, struct drbd_request, tl_requests);
		/* It would be nice to complete outside of spinlock.
		 * But this is easier for now. */
		_req_mod(r, connection_lost_while_pending);
	}

	/* ensure bit indicating barrier is required is clear */
	clear_bit(CREATE_BARRIER, &mdev->flags);

	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));

	spin_unlock_irq(&mdev->req_lock);
}

void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
{
	spin_lock_irq(&mdev->req_lock);
	_tl_restart(mdev, what);
	spin_unlock_irq(&mdev->req_lock);
}

/**
 * cl_wide_st_chg() - true if the state change is a cluster wide one
 * @mdev:	DRBD device.
 * @os:		old (current) state.
 * @ns:		new (wanted) state.
 */
static int cl_wide_st_chg(struct drbd_conf *mdev,
			  union drbd_state os, union drbd_state ns)
{
	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
}
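/* Example: promoting to R_PRIMARY or entering one of the StartingSync states
 * while connected is a cluster-wide change; drbd_req_state() below then asks
 * the peer via drbd_send_state_req() before committing the change locally. */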

enum drbd_state_rv
drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
		  union drbd_state mask, union drbd_state val)
{
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	rv = _drbd_set_state(mdev, ns, f, NULL);
	ns = mdev->state;
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}

/**
 * drbd_force_state() - Impose a change which happens outside our control on our state
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 */
void drbd_force_state(struct drbd_conf *mdev,
	union drbd_state mask, union drbd_state val)
{
	drbd_change_state(mdev, CS_HARD, mask, val);
}
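/* Example (see tl_release() above): on a broken barrier ack we force
 *	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 * CS_HARD makes __drbd_set_state() skip the validity checks that a regular
 * drbd_request_state() would apply. */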

static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
						    union drbd_state,
						    union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
			union drbd_state, union drbd_state);

static enum drbd_state_rv
_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
	     union drbd_state val)
{
	union drbd_state os, ns;
	unsigned long flags;
	enum drbd_state_rv rv;

	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
		return SS_CW_SUCCESS;

	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
		return SS_CW_FAILED_BY_PEER;

	rv = 0;
	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (!cl_wide_st_chg(mdev, os, ns))
		rv = SS_CW_NO_NEED;
	if (!rv) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS) {
			rv = is_valid_state_transition(mdev, ns, os);
			if (rv == SS_SUCCESS)
				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
		}
	}
	spin_unlock_irqrestore(&mdev->req_lock, flags);

	return rv;
}
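/* _req_st_cond() is the wait_event() condition used by drbd_req_state()
 * below: returning SS_UNKNOWN_ERROR (the "keep waiting" value, zero) lets the
 * caller keep sleeping until the peer replies; any other value ends the wait. */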

/**
 * drbd_req_state() - Perform a possibly cluster-wide state change
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Should not be called directly, use drbd_request_state() or
 * _drbd_request_state().
 */
static enum drbd_state_rv
drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
	       union drbd_state val, enum chg_state_flags f)
{
	struct completion done;
	unsigned long flags;
	union drbd_state os, ns;
	enum drbd_state_rv rv;

	init_completion(&done);

	if (f & CS_SERIALIZE)
		mutex_lock(&mdev->state_mutex);

	spin_lock_irqsave(&mdev->req_lock, flags);
	os = mdev->state;
	ns.i = (os.i & ~mask.i) | val.i;
	ns = sanitize_state(mdev, os, ns, NULL);

	if (cl_wide_st_chg(mdev, os, ns)) {
		rv = is_valid_state(mdev, ns);
		if (rv == SS_SUCCESS)
			rv = is_valid_state_transition(mdev, ns, os);
		spin_unlock_irqrestore(&mdev->req_lock, flags);

		if (rv < SS_SUCCESS) {
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		drbd_state_lock(mdev);
		if (!drbd_send_state_req(mdev, mask, val)) {
			drbd_state_unlock(mdev);
			rv = SS_CW_FAILED_BY_PEER;
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}

		wait_event(mdev->state_wait,
			(rv = _req_st_cond(mdev, mask, val)));

		if (rv < SS_SUCCESS) {
			drbd_state_unlock(mdev);
			if (f & CS_VERBOSE)
				print_st_err(mdev, os, ns, rv);
			goto abort;
		}
		spin_lock_irqsave(&mdev->req_lock, flags);
		os = mdev->state;
		ns.i = (os.i & ~mask.i) | val.i;
		rv = _drbd_set_state(mdev, ns, f, &done);
		drbd_state_unlock(mdev);
	} else {
		rv = _drbd_set_state(mdev, ns, f, &done);
	}

	spin_unlock_irqrestore(&mdev->req_lock, flags);

	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
		D_ASSERT(current != mdev->worker.task);
		wait_for_completion(&done);
	}

abort:
	if (f & CS_SERIALIZE)
		mutex_unlock(&mdev->state_mutex);

	return rv;
}

/**
 * _drbd_request_state() - Request a state change (with flags)
 * @mdev:	DRBD device.
 * @mask:	mask of state bits to change.
 * @val:	value of new state bits.
 * @f:		flags
 *
 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 * flag, or when logging of failed state change requests is not desired.
 */
enum drbd_state_rv
_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
		    union drbd_state val, enum chg_state_flags f)
{
	enum drbd_state_rv rv;

	wait_event(mdev->state_wait,
		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);

	return rv;
}

static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
{
	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
	    name,
	    drbd_conn_str(ns.conn),
	    drbd_role_str(ns.role),
	    drbd_role_str(ns.peer),
	    drbd_disk_str(ns.disk),
	    drbd_disk_str(ns.pdsk),
	    is_susp(ns) ? 's' : 'r',
	    ns.aftr_isp ? 'a' : '-',
	    ns.peer_isp ? 'p' : '-',
	    ns.user_isp ? 'u' : '-'
	    );
}

void print_st_err(struct drbd_conf *mdev, union drbd_state os,
		  union drbd_state ns, enum drbd_state_rv err)
{
	if (err == SS_IN_TRANSIENT_STATE)
		return;
	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
	print_st(mdev, " state", os);
	print_st(mdev, "wanted", ns);
}


/**
 * is_valid_state() - Returns an SS_ error code if ns is not valid
 * @mdev:	DRBD device.
 * @ns:		State to consider.
 */
static enum drbd_state_rv
is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
{
	/* See drbd_state_sw_errors in drbd_strings.c */

	enum drbd_fencing_p fp;
	enum drbd_state_rv rv = SS_SUCCESS;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	if (get_net_conf(mdev)) {
		if (!mdev->net_conf->two_primaries &&
		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
			rv = SS_TWO_PRIMARIES;
		put_net_conf(mdev);
	}

	if (rv <= 0)
		/* already found a reason to abort */;
	else if (ns.role == R_SECONDARY && mdev->open_cnt)
		rv = SS_DEVICE_IN_USE;

	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (fp >= FP_RESOURCE &&
		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
		rv = SS_PRIMARY_NOP;

	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
		rv = SS_NO_LOCAL_DISK;

	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
		rv = SS_NO_REMOTE_DISK;

	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
		rv = SS_NO_UP_TO_DATE_DISK;

	else if ((ns.conn == C_CONNECTED ||
		  ns.conn == C_WF_BITMAP_S ||
		  ns.conn == C_SYNC_SOURCE ||
		  ns.conn == C_PAUSED_SYNC_S) &&
		  ns.disk == D_OUTDATED)
		rv = SS_CONNECTED_OUTDATES;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		 (mdev->sync_conf.verify_alg[0] == 0))
		rv = SS_NO_VERIFY_ALG;

	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
		  mdev->agreed_pro_version < 88)
		rv = SS_NOT_SUPPORTED;

	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
		rv = SS_CONNECTED_OUTDATES;

	return rv;
}

/**
 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @os:		old state.
 */
static enum drbd_state_rv
is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
			  union drbd_state os)
{
	enum drbd_state_rv rv = SS_SUCCESS;

	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
	    os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
		rv = SS_ALREADY_STANDALONE;

	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
		rv = SS_IS_DISKLESS;

	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
		rv = SS_NO_NET_CONFIG;

	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
		rv = SS_LOWER_THAN_OUTDATED;

	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
		rv = SS_IN_TRANSIENT_STATE;

	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
		rv = SS_IN_TRANSIENT_STATE;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
	    ns.conn != os.conn && os.conn > C_CONNECTED)
		rv = SS_RESYNC_RUNNING;

	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
	    os.conn < C_CONNECTED)
		rv = SS_NEED_CONNECTION;

	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
	    && os.conn < C_WF_REPORT_PARAMS)
		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */

	return rv;
}

/**
 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @warn_sync_abort:
 *
 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 */
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
				       union drbd_state ns, const char **warn_sync_abort)
{
	enum drbd_fencing_p fp;
	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Disallow Network errors to configure a device's network part */
	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
	    os.conn <= C_DISCONNECTING)
		ns.conn = os.conn;

	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
		ns.conn = os.conn;

	/* we cannot fail (again) if we already detached */
	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
		ns.disk = D_DISKLESS;

	/* if we are only D_ATTACHING yet,
	 * we can (and should) go directly to D_DISKLESS. */
	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
		ns.disk = D_DISKLESS;

	/* After C_DISCONNECTING only C_STANDALONE may follow */
	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
		ns.conn = os.conn;

	if (ns.conn < C_CONNECTED) {
		ns.peer_isp = 0;
		ns.peer = R_UNKNOWN;
		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
			ns.pdsk = D_UNKNOWN;
	}

	/* Clear the aftr_isp when becoming unconfigured */
	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
		ns.aftr_isp = 0;

	/* Abort resync if a disk fails/detaches */
	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
		if (warn_sync_abort)
			*warn_sync_abort =
				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
				"Online-verify" : "Resync";
		ns.conn = C_CONNECTED;
	}

	/* Connection breaks down before we finished "Negotiating" */
	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
			ns.disk = mdev->new_state_tmp.disk;
			ns.pdsk = mdev->new_state_tmp.pdsk;
		} else {
			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
			ns.disk = D_DISKLESS;
			ns.pdsk = D_UNKNOWN;
		}
		put_ldev(mdev);
	}

	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
			ns.disk = D_UP_TO_DATE;
		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
			ns.pdsk = D_UP_TO_DATE;
	}

	/* Implications of the connection state on the disk states */
	disk_min = D_DISKLESS;
	disk_max = D_UP_TO_DATE;
	pdsk_min = D_INCONSISTENT;
	pdsk_max = D_UNKNOWN;
	switch ((enum drbd_conns)ns.conn) {
	case C_WF_BITMAP_T:
	case C_PAUSED_SYNC_T:
	case C_STARTING_SYNC_T:
	case C_WF_SYNC_UUID:
	case C_BEHIND:
		disk_min = D_INCONSISTENT;
		disk_max = D_OUTDATED;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_VERIFY_S:
	case C_VERIFY_T:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_CONNECTED:
		disk_min = D_DISKLESS;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_DISKLESS;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_WF_BITMAP_S:
	case C_PAUSED_SYNC_S:
	case C_STARTING_SYNC_S:
	case C_AHEAD:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
		break;
	case C_SYNC_TARGET:
		disk_min = D_INCONSISTENT;
		disk_max = D_INCONSISTENT;
		pdsk_min = D_UP_TO_DATE;
		pdsk_max = D_UP_TO_DATE;
		break;
	case C_SYNC_SOURCE:
		disk_min = D_UP_TO_DATE;
		disk_max = D_UP_TO_DATE;
		pdsk_min = D_INCONSISTENT;
		pdsk_max = D_INCONSISTENT;
		break;
	case C_STANDALONE:
	case C_DISCONNECTING:
	case C_UNCONNECTED:
	case C_TIMEOUT:
	case C_BROKEN_PIPE:
	case C_NETWORK_FAILURE:
	case C_PROTOCOL_ERROR:
	case C_TEAR_DOWN:
	case C_WF_CONNECTION:
	case C_WF_REPORT_PARAMS:
	case C_MASK:
		break;
	}
	if (ns.disk > disk_max)
		ns.disk = disk_max;

	if (ns.disk < disk_min) {
		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
		ns.disk = disk_min;
	}
	if (ns.pdsk > pdsk_max)
		ns.pdsk = pdsk_max;

	if (ns.pdsk < pdsk_min) {
		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
		ns.pdsk = pdsk_min;
	}

	if (fp == FP_STONITH &&
	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */

	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
		ns.susp_nod = 1; /* Suspend IO while no accessible data is available */

	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
		if (ns.conn == C_SYNC_SOURCE)
			ns.conn = C_PAUSED_SYNC_S;
		if (ns.conn == C_SYNC_TARGET)
			ns.conn = C_PAUSED_SYNC_T;
	} else {
		if (ns.conn == C_PAUSED_SYNC_S)
			ns.conn = C_SYNC_SOURCE;
		if (ns.conn == C_PAUSED_SYNC_T)
			ns.conn = C_SYNC_TARGET;
	}

	return ns;
}
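/* Example of the rules above: losing the connection (ns.conn < C_CONNECTED)
 * forces peer = R_UNKNOWN and pdsk = D_UNKNOWN, while a disk failing during a
 * resync (ns.disk <= D_FAILED) pulls the connection back to C_CONNECTED and
 * sets *warn_sync_abort so the caller can log the aborted resync/verify. */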

/* helper for __drbd_set_state */
static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
{
	if (mdev->agreed_pro_version < 90)
		mdev->ov_start_sector = 0;
	mdev->rs_total = drbd_bm_bits(mdev);
	mdev->ov_position = 0;
	if (cs == C_VERIFY_T) {
		/* starting online verify from an arbitrary position
		 * does not fit well into the existing protocol.
		 * on C_VERIFY_T, we initialize ov_left and friends
		 * implicitly in receive_DataRequest once the
		 * first P_OV_REQUEST is received */
		mdev->ov_start_sector = ~(sector_t)0;
	} else {
		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
		if (bit >= mdev->rs_total) {
			mdev->ov_start_sector =
				BM_BIT_TO_SECT(mdev->rs_total - 1);
			mdev->rs_total = 1;
		} else
			mdev->rs_total -= bit;
		mdev->ov_position = mdev->ov_start_sector;
	}
	mdev->ov_left = mdev->rs_total;
}

static void drbd_resume_al(struct drbd_conf *mdev)
{
	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
		dev_info(DEV, "Resumed AL updates\n");
}

/**
 * __drbd_set_state() - Set a new DRBD state
 * @mdev:	DRBD device.
 * @ns:		new state.
 * @flags:	Flags
 * @done:	Optional completion, that will get completed after the after_state_ch() finished
 *
 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
 */
enum drbd_state_rv
__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
		 enum chg_state_flags flags, struct completion *done)
{
	union drbd_state os;
	enum drbd_state_rv rv = SS_SUCCESS;
	const char *warn_sync_abort = NULL;
	struct after_state_chg_work *ascw;

	os = mdev->state;

	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);

	if (ns.i == os.i)
		return SS_NOTHING_TO_DO;

	if (!(flags & CS_HARD)) {
		/* pre-state-change checks ; only look at ns */
		/* See drbd_state_sw_errors in drbd_strings.c */

		rv = is_valid_state(mdev, ns);
		if (rv < SS_SUCCESS) {
			/* If the old state was illegal as well, then let
			   this happen...*/

			if (is_valid_state(mdev, os) == rv)
				rv = is_valid_state_transition(mdev, ns, os);
		} else
			rv = is_valid_state_transition(mdev, ns, os);
	}

	if (rv < SS_SUCCESS) {
		if (flags & CS_VERBOSE)
			print_st_err(mdev, os, ns, rv);
		return rv;
	}

	if (warn_sync_abort)
		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);

	{
	char *pbp, pb[300];
	pbp = pb;
	*pbp = 0;
	if (ns.role != os.role)
		pbp += sprintf(pbp, "role( %s -> %s ) ",
			       drbd_role_str(os.role),
			       drbd_role_str(ns.role));
	if (ns.peer != os.peer)
		pbp += sprintf(pbp, "peer( %s -> %s ) ",
			       drbd_role_str(os.peer),
			       drbd_role_str(ns.peer));
	if (ns.conn != os.conn)
		pbp += sprintf(pbp, "conn( %s -> %s ) ",
			       drbd_conn_str(os.conn),
			       drbd_conn_str(ns.conn));
	if (ns.disk != os.disk)
		pbp += sprintf(pbp, "disk( %s -> %s ) ",
			       drbd_disk_str(os.disk),
			       drbd_disk_str(ns.disk));
	if (ns.pdsk != os.pdsk)
		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
			       drbd_disk_str(os.pdsk),
			       drbd_disk_str(ns.pdsk));
	if (is_susp(ns) != is_susp(os))
		pbp += sprintf(pbp, "susp( %d -> %d ) ",
			       is_susp(os),
			       is_susp(ns));
	if (ns.aftr_isp != os.aftr_isp)
		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
			       os.aftr_isp,
			       ns.aftr_isp);
	if (ns.peer_isp != os.peer_isp)
		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
			       os.peer_isp,
			       ns.peer_isp);
	if (ns.user_isp != os.user_isp)
		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
			       os.user_isp,
			       ns.user_isp);
	dev_info(DEV, "%s\n", pb);
	}

	/* solve the race between becoming unconfigured,
	 * worker doing the cleanup, and
	 * admin reconfiguring us:
	 * on (re)configure, first set CONFIG_PENDING,
	 * then wait for a potentially exiting worker,
	 * start the worker, and schedule one no_op.
	 * then proceed with configuration.
	 */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY &&
	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
		set_bit(DEVICE_DYING, &mdev->flags);

	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
	 * drbd_ldev_destroy() won't happen before our corresponding
	 * after_state_ch works run, where we put_ldev again. */
	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
		atomic_inc(&mdev->local_cnt);

	mdev->state = ns;

	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
		drbd_print_uuids(mdev, "attached to UUIDs");

	wake_up(&mdev->misc_wait);
	wake_up(&mdev->state_wait);

	/* aborted verify run. log the last position */
	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
	    ns.conn < C_CONNECTED) {
		mdev->ov_start_sector =
			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
		dev_info(DEV, "Online Verify reached sector %llu\n",
			(unsigned long long)mdev->ov_start_sector);
	}

	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
		dev_info(DEV, "Syncer continues.\n");
		mdev->rs_paused += (long)jiffies
				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);
	}

	if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
		dev_info(DEV, "Resync suspended\n");
		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
	}

	if (os.conn == C_CONNECTED &&
	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
		unsigned long now = jiffies;
		int i;

		set_ov_position(mdev, ns.conn);
		mdev->rs_start = now;
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
		mdev->ov_last_oos_size = 0;
		mdev->ov_last_oos_start = 0;

		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = mdev->ov_left;
			mdev->rs_mark_time[i] = now;
		}

		drbd_rs_controller_reset(mdev);

		if (ns.conn == C_VERIFY_S) {
			dev_info(DEV, "Starting Online Verify from sector %llu\n",
					(unsigned long long)mdev->ov_position);
			mod_timer(&mdev->resync_timer, jiffies);
		}
	}

	if (get_ldev(mdev)) {
		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);

		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
			mdf |= MDF_CRASHED_PRIMARY;
		if (mdev->state.role == R_PRIMARY ||
		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
			mdf |= MDF_PRIMARY_IND;
		if (mdev->state.conn > C_WF_REPORT_PARAMS)
			mdf |= MDF_CONNECTED_IND;
		if (mdev->state.disk > D_INCONSISTENT)
			mdf |= MDF_CONSISTENT;
		if (mdev->state.disk > D_OUTDATED)
			mdf |= MDF_WAS_UP_TO_DATE;
		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
			mdf |= MDF_PEER_OUT_DATED;
		if (mdf != mdev->ldev->md.flags) {
			mdev->ldev->md.flags = mdf;
			drbd_md_mark_dirty(mdev);
		}
		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
		put_ldev(mdev);
	}

	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
		set_bit(CONSIDER_RESYNC, &mdev->flags);

	/* Receiver should clean up itself */
	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Now the receiver finished cleaning up itself, it should die */
	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
		drbd_thread_stop_nowait(&mdev->receiver);

	/* Upon network failure, we need to restart the receiver. */
	if (os.conn > C_TEAR_DOWN &&
	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
		drbd_thread_restart_nowait(&mdev->receiver);

	/* Resume AL writing if we get a connection */
	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
		drbd_resume_al(mdev);

	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
	if (ascw) {
		ascw->os = os;
		ascw->ns = ns;
		ascw->flags = flags;
		ascw->w.cb = w_after_state_ch;
		ascw->done = done;
		drbd_queue_work(&mdev->data.work, &ascw->w);
	} else {
		dev_warn(DEV, "Could not kmalloc an ascw\n");
	}

	return rv;
}

static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct after_state_chg_work *ascw =
		container_of(w, struct after_state_chg_work, w);
	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
	if (ascw->flags & CS_WAIT_COMPLETE) {
		D_ASSERT(ascw->done != NULL);
		complete(ascw->done);
	}
	kfree(ascw);

	return 1;
}

static void abw_start_sync(struct drbd_conf *mdev, int rv)
{
	if (rv) {
		dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
		return;
	}

	switch (mdev->state.conn) {
	case C_STARTING_SYNC_T:
		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
		break;
	case C_STARTING_SYNC_S:
		drbd_start_resync(mdev, C_SYNC_SOURCE);
		break;
	}
}

int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
			       int (*io_fn)(struct drbd_conf *),
			       char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current == mdev->worker.task);

	/* open coded non-blocking drbd_suspend_io(mdev); */
	set_bit(SUSPEND_IO, &mdev->flags);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	drbd_resume_io(mdev);

	return rv;
}
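/* In contrast to drbd_queue_bitmap_io(), this variant runs io_fn()
 * synchronously; the D_ASSERT above documents that it must only be called
 * from the worker, which is why after_state_ch() below uses it for the
 * bitmap writeout on demote. */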

/**
 * after_state_ch() - Perform after state change actions that may sleep
 * @mdev:	DRBD device.
 * @os:		old state.
 * @ns:		new state.
 * @flags:	Flags
 */
static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
			   union drbd_state ns, enum chg_state_flags flags)
{
	enum drbd_fencing_p fp;
	enum drbd_req_event what = nothing;
	union drbd_state nsm = (union drbd_state){ .i = -1 };

	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
		clear_bit(CRASHED_PRIMARY, &mdev->flags);
		if (mdev->p_uuid)
			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
	}

	fp = FP_DONT_CARE;
	if (get_ldev(mdev)) {
		fp = mdev->ldev->dc.fencing;
		put_ldev(mdev);
	}

	/* Inform userspace about the change... */
	drbd_bcast_state(mdev, ns);

	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
		drbd_khelper(mdev, "pri-on-incon-degr");

	/* Here we have the actions that are performed after a
	   state change. This function might sleep */

	nsm.i = -1;
	if (ns.susp_nod) {
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
			what = resend;

		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
			what = restart_frozen_disk_io;

		if (what != nothing)
			nsm.susp_nod = 0;
	}

	if (ns.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
			tl_clear(mdev);
			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
				drbd_uuid_new_current(mdev);
				clear_bit(NEW_CUR_UUID, &mdev->flags);
			}
			spin_lock_irq(&mdev->req_lock);
			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
			spin_unlock_irq(&mdev->req_lock);
		}
		/* case2: The connection was established again: */
		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
			clear_bit(NEW_CUR_UUID, &mdev->flags);
			what = resend;
			nsm.susp_fen = 0;
		}
	}

	if (what != nothing) {
		spin_lock_irq(&mdev->req_lock);
		_tl_restart(mdev, what);
		nsm.i &= mdev->state.i;
		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
		spin_unlock_irq(&mdev->req_lock);
	}

	/* Became sync source. With protocol >= 96, we still need to send out
	 * the sync uuid now. Need to do that before any drbd_send_state, or
	 * the other side may go "paused sync" before receiving the sync uuids,
	 * which is unexpected. */
	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
		drbd_gen_and_send_sync_uuid(mdev);
		put_ldev(mdev);
	}

	/* Do not change the order of the if above and the two below... */
	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}
	/* No point in queuing send_bitmap if we don't have a connection
	 * anymore, so check also the _current_ state, not only the new state
	 * at the time this work was queued. */
	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
	    mdev->state.conn == C_WF_BITMAP_S)
		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
				"send_bitmap (WFBitMapS)",
				BM_LOCKED_TEST_ALLOWED);

	/* Lost contact to peer's copy of the data */
	if ((os.pdsk >= D_INCONSISTENT &&
	     os.pdsk != D_UNKNOWN &&
	     os.pdsk != D_OUTDATED)
	&&  (ns.pdsk < D_INCONSISTENT ||
	     ns.pdsk == D_UNKNOWN ||
	     ns.pdsk == D_OUTDATED)) {
		if (get_ldev(mdev)) {
			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
				if (is_susp(mdev->state)) {
					set_bit(NEW_CUR_UUID, &mdev->flags);
				} else {
					drbd_uuid_new_current(mdev);
					drbd_send_uuids(mdev);
				}
			}
			put_ldev(mdev);
		}
	}

	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
			drbd_uuid_new_current(mdev);
			drbd_send_uuids(mdev);
		}

		/* D_DISKLESS Peer becomes secondary */
		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
			/* We may still be Primary ourselves.
			 * No harm done if the bitmap still changes,
			 * redirtied pages will follow later. */
			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
		put_ldev(mdev);
	}

	/* Write out all changed bits on demote.
	 * Though, no need to do that just yet
	 * if there is a resync going on still */
	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
	    mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
		/* No changes to the bitmap expected this time, so assert that,
		 * even though no harm was done if it did change. */
		drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
				"demote", BM_LOCKED_TEST_ALLOWED);
		put_ldev(mdev);
	}

	/* Last part of the attaching process ... */
	if (ns.conn >= C_CONNECTED &&
	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
		drbd_send_sizes(mdev, 0, 0); /* to start sync... */
		drbd_send_uuids(mdev);
		drbd_send_state(mdev);
	}

	/* We want to pause/continue resync, tell peer. */
	if (ns.conn >= C_CONNECTED &&
	     ((os.aftr_isp != ns.aftr_isp) ||
	      (os.user_isp != ns.user_isp)))
		drbd_send_state(mdev);

	/* In case one of the isp bits got set, suspend other devices. */
	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
		suspend_other_sg(mdev);

	/* Make sure the peer gets informed about possible state
	   changes (ISP bits) while we were in WFReportParams. */
	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
		drbd_send_state(mdev);

	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
		drbd_send_state(mdev);

	/* We are in the process of starting a full sync... */
	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
		/* no other bitmap changes expected during this phase */
		drbd_queue_bitmap_io(mdev,
			&drbd_bmio_set_n_write, &abw_start_sync,
			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);

	/* We are invalidating ourselves... */
	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
		/* other bitmap operation expected during this phase */
		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
			"set_n_write from invalidate", BM_LOCKED_MASK);

	/* first half of local IO error, failure to attach,
	 * or administrative detach */
	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
		enum drbd_io_error_p eh;
		int was_io_error;
		/* corresponding get_ldev was in __drbd_set_state, to serialize
		 * our cleanup here with the transition to D_DISKLESS,
		 * so it is safe to dereference ldev here. */
		eh = mdev->ldev->dc.on_io_error;
		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);

		/* current state still has to be D_FAILED,
		 * there is only one way out: to D_DISKLESS,
		 * and that may only happen after our put_ldev below. */
		if (mdev->state.disk != D_FAILED)
			dev_err(DEV,
				"ASSERT FAILED: disk is %s during detach\n",
				drbd_disk_str(mdev->state.disk));

		if (drbd_send_state(mdev))
			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
		else
			dev_err(DEV, "Sending state for detaching disk failed\n");

		drbd_rs_cancel_all(mdev);

		/* In case we want to get something to stable storage still,
		 * this may be the last chance.
		 * Following put_ldev may transition to D_DISKLESS. */
		drbd_md_sync(mdev);
		put_ldev(mdev);

		if (was_io_error && eh == EP_CALL_HELPER)
			drbd_khelper(mdev, "local-io-error");
	}

	/* second half of local IO error, failure to attach,
	 * or administrative detach,
	 * after local_cnt references have reached zero again */
	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
		/* We must still be diskless,
		 * re-attach has to be serialized with this! */
		if (mdev->state.disk != D_DISKLESS)
			dev_err(DEV,
				"ASSERT FAILED: disk is %s while going diskless\n",
				drbd_disk_str(mdev->state.disk));

		mdev->rs_total = 0;
		mdev->rs_failed = 0;
		atomic_set(&mdev->rs_pending_cnt, 0);

		if (drbd_send_state(mdev))
			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
		/* corresponding get_ldev in __drbd_set_state
		 * this may finally trigger drbd_ldev_destroy. */
		put_ldev(mdev);
	}

	/* Notify peer that I had a local IO error and did not detach. */
	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
		drbd_send_state(mdev);

	/* Disks got bigger while they were detached */
	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
		if (ns.conn == C_CONNECTED)
			resync_after_online_grow(mdev);
	}

	/* A resync finished or aborted, wake paused devices... */
	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
	    (os.peer_isp && !ns.peer_isp) ||
	    (os.user_isp && !ns.user_isp))
		resume_next_sg(mdev);

	/* sync target done with resync. Explicitly notify peer, even though
	 * it should (at least for non-empty resyncs) already know itself. */
	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
		drbd_send_state(mdev);

	/* This triggers bitmap writeout of potentially still unwritten pages
	 * if the resync finished cleanly, or aborted because of peer disk
	 * failure, or because of connection loss.
	 * For resync aborted because of local disk failure, we cannot do
	 * any bitmap writeout anymore.
	 * No harm done if some bits change during this phase.
	 */
	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
			"write from resync_finished", BM_LOCKED_SET_ALLOWED);
		put_ldev(mdev);
	}

	/* free tl_hash if we got thawed and are C_STANDALONE */
	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
		drbd_free_tl_hash(mdev);

	/* Upon network connection, we need to start the receiver */
	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
		drbd_thread_start(&mdev->receiver);

	/* Terminate worker thread if we are unconfigured - it will be
	   restarted as needed... */
	if (ns.disk == D_DISKLESS &&
	    ns.conn == C_STANDALONE &&
	    ns.role == R_SECONDARY) {
		if (os.aftr_isp != ns.aftr_isp)
			resume_next_sg(mdev);
		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
		if (test_bit(DEVICE_DYING, &mdev->flags))
			drbd_thread_stop_nowait(&mdev->worker);
	}

	drbd_md_sync(mdev);
}


static int drbd_thread_setup(void *arg)
{
	struct drbd_thread *thi = (struct drbd_thread *) arg;
	struct drbd_conf *mdev = thi->mdev;
	unsigned long flags;
	int retval;

restart:
	retval = thi->function(thi);

	spin_lock_irqsave(&thi->t_lock, flags);

	/* if the receiver has been "Exiting", the last thing it did
	 * was set the conn state to "StandAlone",
	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
	 * and receiver thread will be "started".
	 * drbd_thread_start needs to set "Restarting" in that case.
	 * t_state check and assignment needs to be within the same spinlock,
	 * so either thread_start sees Exiting, and can remap to Restarting,
	 * or thread_start sees None, and can proceed as normal.
	 */

	if (thi->t_state == Restarting) {
		dev_info(DEV, "Restarting %s\n", current->comm);
		thi->t_state = Running;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		goto restart;
	}

	thi->task = NULL;
	thi->t_state = None;
	smp_mb();
	complete(&thi->stop);
	spin_unlock_irqrestore(&thi->t_lock, flags);

	dev_info(DEV, "Terminating %s\n", current->comm);

	/* Release mod reference taken when thread was started */
	module_put(THIS_MODULE);
	return retval;
}

static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
		      int (*func) (struct drbd_thread *))
{
	spin_lock_init(&thi->t_lock);
	thi->task = NULL;
	thi->t_state = None;
	thi->function = func;
	thi->mdev = mdev;
}

int drbd_thread_start(struct drbd_thread *thi)
{
	struct drbd_conf *mdev = thi->mdev;
	struct task_struct *nt;
	unsigned long flags;

	const char *me =
		thi == &mdev->receiver ? "receiver" :
		thi == &mdev->asender  ? "asender"  :
		thi == &mdev->worker   ? "worker"   : "NONSENSE";

	/* is used from state engine doing drbd_thread_stop_nowait,
	 * while holding the req lock irqsave */
	spin_lock_irqsave(&thi->t_lock, flags);

	switch (thi->t_state) {
	case None:
		dev_info(DEV, "Starting %s thread (from %s [%d])\n",
				me, current->comm, current->pid);

		/* Get ref on module for thread - this is released when thread exits */
		if (!try_module_get(THIS_MODULE)) {
			dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
			spin_unlock_irqrestore(&thi->t_lock, flags);
			return false;
		}

		init_completion(&thi->stop);
		D_ASSERT(thi->task == NULL);
		thi->reset_cpu_mask = 1;
		thi->t_state = Running;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */

		nt = kthread_create(drbd_thread_setup, (void *) thi,
				    "drbd%d_%s", mdev_to_minor(mdev), me);

		if (IS_ERR(nt)) {
			dev_err(DEV, "Couldn't start thread\n");

			module_put(THIS_MODULE);
			return false;
		}
		spin_lock_irqsave(&thi->t_lock, flags);
		thi->task = nt;
		thi->t_state = Running;
		spin_unlock_irqrestore(&thi->t_lock, flags);
		wake_up_process(nt);
		break;
	case Exiting:
		thi->t_state = Restarting;
		dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
				me, current->comm, current->pid);
		/* fall through */
	case Running:
	case Restarting:
	default:
		spin_unlock_irqrestore(&thi->t_lock, flags);
		break;
	}

	return true;
}
1745
1746
1747void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1748{
1749 unsigned long flags;
1750
1751 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1752
1753 /* may be called from state engine, holding the req lock irqsave */
1754 spin_lock_irqsave(&thi->t_lock, flags);
1755
1756 if (thi->t_state == None) {
1757 spin_unlock_irqrestore(&thi->t_lock, flags);
1758 if (restart)
1759 drbd_thread_start(thi);
1760 return;
1761 }
1762
1763 if (thi->t_state != ns) {
1764 if (thi->task == NULL) {
1765 spin_unlock_irqrestore(&thi->t_lock, flags);
1766 return;
1767 }
1768
1769 thi->t_state = ns;
1770 smp_mb();
1771 init_completion(&thi->stop);
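		/* interrupt blocking network or disk I/O so the thread notices
		 * the requested state change; never signal ourselves */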
1772 if (thi->task != current)
1773 force_sig(DRBD_SIGKILL, thi->task);
1774
1775 }
1776
1777 spin_unlock_irqrestore(&thi->t_lock, flags);
1778
1779 if (wait)
1780 wait_for_completion(&thi->stop);
1781}
1782
1783#ifdef CONFIG_SMP
1784/**
1785 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1786 * @mdev: DRBD device.
1787 *
1788 * Forces all threads of a device onto the same CPU. This is beneficial for
1789 * DRBD's performance. May be overridden by the user's configuration.
1790 */
1791void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1792{
1793 int ord, cpu;
1794
1795 /* user override. */
1796 if (cpumask_weight(mdev->cpu_mask))
1797 return;
1798
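	/* spread devices round robin over the online CPUs:
	 * minor N gets pinned to the (N % nr_online_cpus)-th online CPU */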
1799 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1800 for_each_online_cpu(cpu) {
1801 if (ord-- == 0) {
1802 cpumask_set_cpu(cpu, mdev->cpu_mask);
1803 return;
1804 }
1805 }
1806 /* should not be reached */
1807 cpumask_setall(mdev->cpu_mask);
1808}
1809
1810/**
1811 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1812 * @mdev: DRBD device.
1813 *
1814 * Call this in the "main loop" of _all_ threads; no need for any mutex, current won't die
1815 * prematurely.
1816 */
1817void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1818{
1819 struct task_struct *p = current;
1820 struct drbd_thread *thi =
1821 p == mdev->asender.task ? &mdev->asender :
1822 p == mdev->receiver.task ? &mdev->receiver :
1823 p == mdev->worker.task ? &mdev->worker :
1824 NULL;
1825 ERR_IF(thi == NULL)
1826 return;
1827 if (!thi->reset_cpu_mask)
1828 return;
1829 thi->reset_cpu_mask = 0;
1830 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1831}
1832#endif
1833
1834/* the appropriate socket mutex must be held already */
1835int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1836 enum drbd_packets cmd, struct p_header80 *h,
1837 size_t size, unsigned msg_flags)
1838{
1839 int sent, ok;
1840
1841 ERR_IF(!h) return false;
1842 ERR_IF(!size) return false;
1843
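	/* on-wire layout of struct p_header80 (all fields big endian):
	 *   bytes 0..3  magic
	 *   bytes 4..5  command
	 *   bytes 6..7  length of the payload that follows this header */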
1844 h->magic = BE_DRBD_MAGIC;
1845 h->command = cpu_to_be16(cmd);
1846 h->length = cpu_to_be16(size-sizeof(struct p_header80));
1847
1848 sent = drbd_send(mdev, sock, h, size, msg_flags);
1849
1850 ok = (sent == size);
1851 if (!ok && !signal_pending(current))
1852 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1853 cmdname(cmd), (int)size, sent);
1854 return ok;
1855}
1856
1857/* don't pass the socket. we may only look at it
1858 * when we hold the appropriate socket mutex.
1859 */
1860int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1861 enum drbd_packets cmd, struct p_header80 *h, size_t size)
1862{
1863 int ok = 0;
1864 struct socket *sock;
1865
1866 if (use_data_socket) {
1867 mutex_lock(&mdev->data.mutex);
1868 sock = mdev->data.socket;
1869 } else {
1870 mutex_lock(&mdev->meta.mutex);
1871 sock = mdev->meta.socket;
1872 }
1873
1874 /* drbd_disconnect() could have called drbd_free_sock()
1875	 * while we were waiting for the mutex... */
1876 if (likely(sock != NULL))
1877 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1878
1879 if (use_data_socket)
1880 mutex_unlock(&mdev->data.mutex);
1881 else
1882 mutex_unlock(&mdev->meta.mutex);
1883 return ok;
1884}
1885
1886int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1887 size_t size)
1888{
1889 struct p_header80 h;
1890 int ok;
1891
1892 h.magic = BE_DRBD_MAGIC;
1893 h.command = cpu_to_be16(cmd);
1894 h.length = cpu_to_be16(size);
1895
1896 if (!drbd_get_data_sock(mdev))
1897 return 0;
1898
1899 ok = (sizeof(h) ==
1900 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1901 ok = ok && (size ==
1902 drbd_send(mdev, mdev->data.socket, data, size, 0));
1903
1904 drbd_put_data_sock(mdev);
1905
1906 return ok;
1907}
1908
1909int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1910{
1911 struct p_rs_param_95 *p;
1912 struct socket *sock;
1913 int size, rv;
1914 const int apv = mdev->agreed_pro_version;
1915
1916 size = apv <= 87 ? sizeof(struct p_rs_param)
1917 : apv == 88 ? sizeof(struct p_rs_param)
1918 + strlen(mdev->sync_conf.verify_alg) + 1
1919 : apv <= 94 ? sizeof(struct p_rs_param_89)
1920 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
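	/* the parameter packet grew over time; p_rs_param, p_rs_param_89 and
	 * p_rs_param_95 are prefix-compatible, so sending only the first
	 * 'size' bytes gives older peers exactly the layout they expect */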
1921
1922 /* used from admin command context and receiver/worker context.
1923 * to avoid kmalloc, grab the socket right here,
1924 * then use the pre-allocated sbuf there */
1925 mutex_lock(&mdev->data.mutex);
1926 sock = mdev->data.socket;
1927
1928 if (likely(sock != NULL)) {
1929 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1930
1931 p = &mdev->data.sbuf.rs_param_95;
1932
1933 /* initialize verify_alg and csums_alg */
1934 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1935
1936 p->rate = cpu_to_be32(sc->rate);
1937 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1938 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1939 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1940 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1941
1942 if (apv >= 88)
1943 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1944 if (apv >= 89)
1945 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1946
1947 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1948 } else
1949 rv = 0; /* not ok */
1950
1951 mutex_unlock(&mdev->data.mutex);
1952
1953 return rv;
1954}
1955
1956int drbd_send_protocol(struct drbd_conf *mdev)
1957{
1958 struct p_protocol *p;
1959 int size, cf, rv;
1960
1961 size = sizeof(struct p_protocol);
1962
1963 if (mdev->agreed_pro_version >= 87)
1964 size += strlen(mdev->net_conf->integrity_alg) + 1;
1965
1966 /* we must not recurse into our own queue,
1967 * as that is blocked during handshake */
1968 p = kmalloc(size, GFP_NOIO);
1969 if (p == NULL)
1970 return 0;
1971
1972 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
1973 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
1974 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
1975 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
1976 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1977
1978 cf = 0;
1979 if (mdev->net_conf->want_lose)
1980 cf |= CF_WANT_LOSE;
1981 if (mdev->net_conf->dry_run) {
1982 if (mdev->agreed_pro_version >= 92)
1983 cf |= CF_DRY_RUN;
1984 else {
1985 dev_err(DEV, "--dry-run is not supported by peer");
1986 kfree(p);
1987 return -1;
1988 }
1989 }
1990 p->conn_flags = cpu_to_be32(cf);
1991
1992 if (mdev->agreed_pro_version >= 87)
1993 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
1994
1995 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1996 (struct p_header80 *)p, size);
1997 kfree(p);
1998 return rv;
1999}
2000
2001int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2002{
2003 struct p_uuids p;
2004 int i;
2005
2006 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2007 return 1;
2008
2009 for (i = UI_CURRENT; i < UI_SIZE; i++)
2010 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2011
2012 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2013 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
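	/* UI_FLAGS carries a small bit field:
	 *   1 = discard-my-data (want_lose), 2 = crashed primary,
	 *   4 = disk is D_INCONSISTENT; callers may pass 8 to request
	 *       skipping the initial sync */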
2014 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2015 uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2016 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2017 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2018
2019 put_ldev(mdev);
2020
2021 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
2022 (struct p_header80 *)&p, sizeof(p));
2023}
2024
2025int drbd_send_uuids(struct drbd_conf *mdev)
2026{
2027 return _drbd_send_uuids(mdev, 0);
2028}
2029
2030int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2031{
2032 return _drbd_send_uuids(mdev, 8);
2033}
2034
2035void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2036{
2037 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2038 u64 *uuid = mdev->ldev->md.uuid;
2039 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2040 text,
2041 (unsigned long long)uuid[UI_CURRENT],
2042 (unsigned long long)uuid[UI_BITMAP],
2043 (unsigned long long)uuid[UI_HISTORY_START],
2044 (unsigned long long)uuid[UI_HISTORY_END]);
2045 put_ldev(mdev);
2046 } else {
2047 dev_info(DEV, "%s effective data uuid: %016llX\n",
2048 text,
2049 (unsigned long long)mdev->ed_uuid);
2050 }
2051}
2052
2053int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2054{
2055 struct p_rs_uuid p;
2056 u64 uuid;
2057
2058 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2059
2060 uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
2061 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2062 drbd_print_uuids(mdev, "updated sync UUID");
2063 drbd_md_sync(mdev);
2064 p.uuid = cpu_to_be64(uuid);
2065
2066 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
2067 (struct p_header80 *)&p, sizeof(p));
2068}
2069
2070int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
2071{
2072 struct p_sizes p;
2073 sector_t d_size, u_size;
2074 int q_order_type, max_bio_size;
2075 int ok;
2076
2077 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2078 D_ASSERT(mdev->ldev->backing_bdev);
2079 d_size = drbd_get_max_capacity(mdev->ldev);
2080 u_size = mdev->ldev->dc.disk_size;
2081 q_order_type = drbd_queue_order_type(mdev);
2082 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2083 max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
2084 put_ldev(mdev);
2085 } else {
2086 d_size = 0;
2087 u_size = 0;
2088 q_order_type = QUEUE_ORDERED_NONE;
2089 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2090 }
2091
2092 p.d_size = cpu_to_be64(d_size);
2093 p.u_size = cpu_to_be64(u_size);
2094 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2095 p.max_bio_size = cpu_to_be32(max_bio_size);
2096 p.queue_order_type = cpu_to_be16(q_order_type);
2097 p.dds_flags = cpu_to_be16(flags);
2098
2099 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
2100 (struct p_header80 *)&p, sizeof(p));
2101 return ok;
2102}
2103
2104/**
2105 * drbd_send_state() - Sends the drbd state to the peer
2106 * @mdev: DRBD device.
2107 */
2108int drbd_send_state(struct drbd_conf *mdev)
2109{
2110 struct socket *sock;
2111 struct p_state p;
2112 int ok = 0;
2113
2114	/* Grab the state lock so we won't send state while we're in the middle
2115	 * of a cluster-wide state change on another thread */
2116 drbd_state_lock(mdev);
2117
2118 mutex_lock(&mdev->data.mutex);
2119
2120 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2121 sock = mdev->data.socket;
2122
2123 if (likely(sock != NULL)) {
2124 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2125 (struct p_header80 *)&p, sizeof(p), 0);
2126 }
2127
2128 mutex_unlock(&mdev->data.mutex);
2129
2130 drbd_state_unlock(mdev);
2131 return ok;
2132}
2133
2134int drbd_send_state_req(struct drbd_conf *mdev,
2135 union drbd_state mask, union drbd_state val)
2136{
2137 struct p_req_state p;
2138
2139 p.mask = cpu_to_be32(mask.i);
2140 p.val = cpu_to_be32(val.i);
2141
2142 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2143 (struct p_header80 *)&p, sizeof(p));
2144}
2145
2146int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2147{
2148 struct p_req_state_reply p;
2149
2150 p.retcode = cpu_to_be32(retcode);
2151
2152 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2153 (struct p_header80 *)&p, sizeof(p));
2154}
2155
2156int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2157 struct p_compressed_bm *p,
2158 struct bm_xfer_ctx *c)
2159{
2160 struct bitstream bs;
2161 unsigned long plain_bits;
2162 unsigned long tmp;
2163 unsigned long rl;
2164 unsigned len;
2165 unsigned toggle;
2166 int bits;
2167
2168 /* may we use this feature? */
2169 if ((mdev->sync_conf.use_rle == 0) ||
2170 (mdev->agreed_pro_version < 90))
2171 return 0;
2172
2173 if (c->bit_offset >= c->bm_bits)
2174 return 0; /* nothing to do. */
2175
2176	/* use at most this many bytes */
2177 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2178 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2179 /* plain bits covered in this code string */
2180 plain_bits = 0;
2181
2182 /* p->encoding & 0x80 stores whether the first run length is set.
2183 * bit offset is implicit.
2184	 * start with toggle == 2 so we can recognize the first iteration */
2185 toggle = 2;
2186
2187	/* see how many plain bits we can stuff into one packet
2188 * using RLE and VLI. */
2189 do {
2190 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2191 : _drbd_bm_find_next(mdev, c->bit_offset);
2192 if (tmp == -1UL)
2193 tmp = c->bm_bits;
2194 rl = tmp - c->bit_offset;
2195
2196 if (toggle == 2) { /* first iteration */
2197 if (rl == 0) {
2198 /* the first checked bit was set,
2199 * store start value, */
2200 DCBP_set_start(p, 1);
2201 /* but skip encoding of zero run length */
2202 toggle = !toggle;
2203 continue;
2204 }
2205 DCBP_set_start(p, 0);
2206 }
2207
2208 /* paranoia: catch zero runlength.
2209 * can only happen if bitmap is modified while we scan it. */
2210 if (rl == 0) {
2211 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2212 "t:%u bo:%lu\n", toggle, c->bit_offset);
2213 return -1;
2214 }
2215
2216 bits = vli_encode_bits(&bs, rl);
2217 if (bits == -ENOBUFS) /* buffer full */
2218 break;
2219 if (bits <= 0) {
2220 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2221 return 0;
2222 }
2223
2224 toggle = !toggle;
2225 plain_bits += rl;
2226 c->bit_offset = tmp;
2227 } while (c->bit_offset < c->bm_bits);
2228
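	/* number of code bytes actually used; a partially filled
	 * last byte still counts as one */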
2229 len = bs.cur.b - p->code + !!bs.cur.bit;
2230
2231 if (plain_bits < (len << 3)) {
2232 /* incompressible with this method.
2233 * we need to rewind both word and bit position. */
2234 c->bit_offset -= plain_bits;
2235 bm_xfer_ctx_bit_to_word_offset(c);
2236 c->bit_offset = c->word_offset * BITS_PER_LONG;
2237 return 0;
2238 }
2239
2240 /* RLE + VLI was able to compress it just fine.
2241 * update c->word_offset. */
2242 bm_xfer_ctx_bit_to_word_offset(c);
2243
2244 /* store pad_bits */
2245 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2246
2247 return len;
2248}
2249
2250/**
2251 * send_bitmap_rle_or_plain() - Send one bitmap packet, RLE-compressed when that pays off
2252 *
2253 * Return 0 when done, 1 when another iteration is needed, and a negative error
2254 * code upon failure.
2255 */
2256static int
2257send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2258 struct p_header80 *h, struct bm_xfer_ctx *c)
2259{
2260 struct p_compressed_bm *p = (void*)h;
2261 unsigned long num_words;
2262 int len;
2263 int ok;
2264
2265 len = fill_bitmap_rle_bits(mdev, p, c);
2266
2267 if (len < 0)
2268 return -EIO;
2269
2270 if (len) {
2271 DCBP_set_code(p, RLE_VLI_Bits);
2272 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2273 sizeof(*p) + len, 0);
2274
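		/* index 0 counts compressed packets, index 1 plain bitmap
		 * packets; both feed the transfer statistics */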
2275 c->packets[0]++;
2276 c->bytes[0] += sizeof(*p) + len;
2277
2278 if (c->bit_offset >= c->bm_bits)
2279 len = 0; /* DONE */
2280 } else {
2281 /* was not compressible.
2282 * send a buffer full of plain text bits instead. */
2283 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2284 len = num_words * sizeof(long);
2285 if (len)
2286 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2287 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2288 h, sizeof(struct p_header80) + len, 0);
2289 c->word_offset += num_words;
2290 c->bit_offset = c->word_offset * BITS_PER_LONG;
2291
2292 c->packets[1]++;
2293 c->bytes[1] += sizeof(struct p_header80) + len;
2294
2295 if (c->bit_offset > c->bm_bits)
2296 c->bit_offset = c->bm_bits;
2297 }
2298 if (ok) {
2299 if (len == 0) {
2300 INFO_bm_xfer_stats(mdev, "send", c);
2301 return 0;
2302 } else
2303 return 1;
2304 }
2305 return -EIO;
2306}
2307
2308/* See the comment at receive_bitmap() */
2309int _drbd_send_bitmap(struct drbd_conf *mdev)
2310{
2311 struct bm_xfer_ctx c;
2312 struct p_header80 *p;
2313 int err;
2314
2315 ERR_IF(!mdev->bitmap) return false;
2316
2317 /* maybe we should use some per thread scratch page,
2318 * and allocate that during initial device creation? */
2319 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2320 if (!p) {
2321 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2322 return false;
2323 }
2324
2325 if (get_ldev(mdev)) {
2326 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2327 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2328 drbd_bm_set_all(mdev);
2329 if (drbd_bm_write(mdev)) {
2330 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2331 * but otherwise process as per normal - need to tell other
2332 * side that a full resync is required! */
2333 dev_err(DEV, "Failed to write bitmap to disk!\n");
2334 } else {
2335 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2336 drbd_md_sync(mdev);
2337 }
2338 }
2339 put_ldev(mdev);
2340 }
2341
2342 c = (struct bm_xfer_ctx) {
2343 .bm_bits = drbd_bm_bits(mdev),
2344 .bm_words = drbd_bm_words(mdev),
2345 };
2346
2347 do {
2348 err = send_bitmap_rle_or_plain(mdev, p, &c);
2349 } while (err > 0);
2350
2351 free_page((unsigned long) p);
2352 return err == 0;
2353}
2354
2355int drbd_send_bitmap(struct drbd_conf *mdev)
2356{
2357 int err;
2358
2359 if (!drbd_get_data_sock(mdev))
2360 return -1;
2361 err = !_drbd_send_bitmap(mdev);
2362 drbd_put_data_sock(mdev);
2363 return err;
2364}
2365
2366int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2367{
2368 int ok;
2369 struct p_barrier_ack p;
2370
2371 p.barrier = barrier_nr;
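	/* barrier_nr is echoed back exactly as the peer sent it and is
	 * therefore already in network byte order; set_size is ours and
	 * still needs conversion */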
2372 p.set_size = cpu_to_be32(set_size);
2373
2374 if (mdev->state.conn < C_CONNECTED)
2375 return false;
2376 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2377 (struct p_header80 *)&p, sizeof(p));
2378 return ok;
2379}
2380
2381/**
2382 * _drbd_send_ack() - Sends an ack packet
2383 * @mdev: DRBD device.
2384 * @cmd: Packet command code.
2385 * @sector: sector, needs to be in big endian byte order
2386 * @blksize: size in byte, needs to be in big endian byte order
2387 * @block_id: Id, big endian byte order
2388 */
2389static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2390 u64 sector,
2391 u32 blksize,
2392 u64 block_id)
2393{
2394 int ok;
2395 struct p_block_ack p;
2396
2397 p.sector = sector;
2398 p.block_id = block_id;
2399 p.blksize = blksize;
2400 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2401
2402 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2403 return false;
2404 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2405 (struct p_header80 *)&p, sizeof(p));
2406 return ok;
2407}
2408
2409 /* dp->sector and dp->block_id are already/still in network byte order;
2410 * data_size is payload size according to dp->head,
2411 * and may need to be corrected for digest size. */
2412int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2413 struct p_data *dp, int data_size)
2414{
2415 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2416 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2417 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2418 dp->block_id);
2419}
2420
2421int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2422 struct p_block_req *rp)
2423{
2424 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2425}
2426
2427/**
2428 * drbd_send_ack() - Sends an ack packet
2429 * @mdev: DRBD device.
2430 * @cmd: Packet command code.
2431 * @e: Epoch entry.
2432 */
2433int drbd_send_ack(struct drbd_conf *mdev,
2434 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2435{
2436 return _drbd_send_ack(mdev, cmd,
2437 cpu_to_be64(e->sector),
2438 cpu_to_be32(e->size),
2439 e->block_id);
2440}
2441
2442/* This function misuses the block_id field to signal if the blocks
2443 * are in sync or not. */
2444int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2445 sector_t sector, int blksize, u64 block_id)
2446{
2447 return _drbd_send_ack(mdev, cmd,
2448 cpu_to_be64(sector),
2449 cpu_to_be32(blksize),
2450 cpu_to_be64(block_id));
2451}
2452
2453int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2454 sector_t sector, int size, u64 block_id)
2455{
2456 int ok;
2457 struct p_block_req p;
2458
2459 p.sector = cpu_to_be64(sector);
2460 p.block_id = block_id;
2461 p.blksize = cpu_to_be32(size);
2462
2463 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2464 (struct p_header80 *)&p, sizeof(p));
2465 return ok;
2466}
2467
2468int drbd_send_drequest_csum(struct drbd_conf *mdev,
2469 sector_t sector, int size,
2470 void *digest, int digest_size,
2471 enum drbd_packets cmd)
2472{
2473 int ok;
2474 struct p_block_req p;
2475
2476 p.sector = cpu_to_be64(sector);
2477 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2478 p.blksize = cpu_to_be32(size);
2479
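	/* build the header by hand: the digest is sent as a second chunk
	 * right after the fixed-size request, so 'length' must include it */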
2480 p.head.magic = BE_DRBD_MAGIC;
2481 p.head.command = cpu_to_be16(cmd);
2482 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2483
2484 mutex_lock(&mdev->data.mutex);
2485
2486 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2487 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2488
2489 mutex_unlock(&mdev->data.mutex);
2490
2491 return ok;
2492}
2493
2494int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2495{
2496 int ok;
2497 struct p_block_req p;
2498
2499 p.sector = cpu_to_be64(sector);
2500 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2501 p.blksize = cpu_to_be32(size);
2502
2503 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2504 (struct p_header80 *)&p, sizeof(p));
2505 return ok;
2506}
2507
2508/* called on sndtimeo
2509 * returns false if we should retry,
2510 * true if we think the connection is dead
2511 */
2512static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2513{
2514 int drop_it;
2515 /* long elapsed = (long)(jiffies - mdev->last_received); */
2516
2517 drop_it = mdev->meta.socket == sock
2518 || !mdev->asender.task
2519 || get_t_state(&mdev->asender) != Running
2520 || mdev->state.conn < C_CONNECTED;
2521
2522 if (drop_it)
2523 return true;
2524
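	/* each send timeout on the data socket decrements ko_count;
	 * once it hits zero we consider the peer dead */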
2525 drop_it = !--mdev->ko_count;
2526 if (!drop_it) {
2527 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2528 current->comm, current->pid, mdev->ko_count);
2529 request_ping(mdev);
2530 }
2531
2532	return drop_it; /* && (mdev->state == R_PRIMARY) */
2533}
2534
2535/* The idea of sendpage seems to be to put some kind of reference
2536 * to the page into the skb, and to hand it over to the NIC. In
2537 * this process get_page() gets called.
2538 *
2539 * As soon as the page was really sent over the network put_page()
2540 * gets called by some part of the network layer. [ NIC driver? ]
2541 *
2542 * [ get_page() / put_page() increment/decrement the count. If count
2543 * reaches 0 the page will be freed. ]
2544 *
2545 * This works nicely with pages from FSs.
2546 * But this means that in protocol A we might signal IO completion too early!
2547 *
2548 * In order not to corrupt data during a resync we must make sure
2549 * that we do not reuse our own buffer pages (EEs) too early; therefore
2550 * we have the net_ee list.
2551 *
2552 * XFS still seems to have problems with this: it submits pages with page_count == 0!
2553 * As a workaround, we disable sendpage on pages
2554 * with page_count == 0 or PageSlab.
2555 */
2556static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2557 int offset, size_t size, unsigned msg_flags)
2558{
2559 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2560 kunmap(page);
2561 if (sent == size)
2562 mdev->send_cnt += size>>9;
2563 return sent == size;
2564}
2565
2566static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2567 int offset, size_t size, unsigned msg_flags)
2568{
2569 mm_segment_t oldfs = get_fs();
2570 int sent, ok;
2571 int len = size;
2572
2573 /* e.g. XFS meta- & log-data is in slab pages, which have a
2574 * page_count of 0 and/or have PageSlab() set.
2575 * we cannot use send_page for those, as that does get_page();
2576 * put_page(); and would cause either a VM_BUG directly, or
2577 * __page_cache_release a page that would actually still be referenced
2578 * by someone, leading to some obscure delayed Oops somewhere else. */
2579 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2580 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2581
2582 msg_flags |= MSG_NOSIGNAL;
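	/* may set NET_CONGESTED, depending on how full the socket's send
	 * buffer is; the bit is cleared again once we are done sending */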
2583 drbd_update_congested(mdev);
2584 set_fs(KERNEL_DS);
2585 do {
2586 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2587 offset, len,
2588 msg_flags);
2589 if (sent == -EAGAIN) {
2590 if (we_should_drop_the_connection(mdev,
2591 mdev->data.socket))
2592 break;
2593 else
2594 continue;
2595 }
2596 if (sent <= 0) {
2597 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2598 __func__, (int)size, len, sent);
2599 break;
2600 }
2601 len -= sent;
2602 offset += sent;
2603 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2604 set_fs(oldfs);
2605 clear_bit(NET_CONGESTED, &mdev->flags);
2606
2607 ok = (len == 0);
2608 if (likely(ok))
2609 mdev->send_cnt += size>>9;
2610 return ok;
2611}
2612
2613static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2614{
2615 struct bio_vec *bvec;
2616 int i;
2617 /* hint all but last page with MSG_MORE */
2618 __bio_for_each_segment(bvec, bio, i, 0) {
2619 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2620 bvec->bv_offset, bvec->bv_len,
2621 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2622 return 0;
2623 }
2624 return 1;
2625}
2626
2627static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2628{
2629 struct bio_vec *bvec;
2630 int i;
2631 /* hint all but last page with MSG_MORE */
2632 __bio_for_each_segment(bvec, bio, i, 0) {
2633 if (!_drbd_send_page(mdev, bvec->bv_page,
2634 bvec->bv_offset, bvec->bv_len,
2635 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2636 return 0;
2637 }
2638 return 1;
2639}
2640
2641static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2642{
2643 struct page *page = e->pages;
2644 unsigned len = e->size;
2645 /* hint all but last page with MSG_MORE */
2646 page_chain_for_each(page) {
2647 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2648 if (!_drbd_send_page(mdev, page, 0, l,
2649 page_chain_next(page) ? MSG_MORE : 0))
2650 return 0;
2651 len -= l;
2652 }
2653 return 1;
2654}
2655
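/* translate bio flags into on-wire DP_* flags;
 * peers speaking a protocol older than 95 only understand the SYNC hint */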
2656static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2657{
2658 if (mdev->agreed_pro_version >= 95)
2659 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2660 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2661 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2662 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2663 else
2664 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2665}
2666
2667/* Used to send write requests
2668 * R_PRIMARY -> Peer (P_DATA)
2669 */
2670int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2671{
2672 int ok = 1;
2673 struct p_data p;
2674 unsigned int dp_flags = 0;
2675 void *dgb;
2676 int dgs;
2677
2678 if (!drbd_get_data_sock(mdev))
2679 return 0;
2680
2681 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2682 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2683
2684 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2685 p.head.h80.magic = BE_DRBD_MAGIC;
2686 p.head.h80.command = cpu_to_be16(P_DATA);
2687 p.head.h80.length =
2688 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2689 } else {
2690 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2691 p.head.h95.command = cpu_to_be16(P_DATA);
2692 p.head.h95.length =
2693 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2694 }
2695
2696 p.sector = cpu_to_be64(req->sector);
2697 p.block_id = (unsigned long)req;
2698 p.seq_num = cpu_to_be32(req->seq_num =
2699 atomic_add_return(1, &mdev->packet_seq));
2700
2701 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2702
2703 if (mdev->state.conn >= C_SYNC_SOURCE &&
2704 mdev->state.conn <= C_PAUSED_SYNC_T)
2705 dp_flags |= DP_MAY_SET_IN_SYNC;
2706
2707 p.dp_flags = cpu_to_be32(dp_flags);
2708 set_bit(UNPLUG_REMOTE, &mdev->flags);
2709 ok = (sizeof(p) ==
2710 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2711 if (ok && dgs) {
2712 dgb = mdev->int_dig_out;
2713 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2714 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2715 }
2716 if (ok) {
2717 /* For protocol A, we have to memcpy the payload into
2718		 * socket buffers, as we may complete the request as soon as we have
2719		 * handed it over to tcp, at which point the data
2720 * pages may become invalid.
2721 *
2722 * For data-integrity enabled, we copy it as well, so we can be
2723 * sure that even if the bio pages may still be modified, it
2724 * won't change the data on the wire, thus if the digest checks
2725 * out ok after sending on this side, but does not fit on the
2726 * receiving side, we sure have detected corruption elsewhere.
2727 */
2728 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2729 ok = _drbd_send_bio(mdev, req->master_bio);
2730 else
2731 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2732
2733 /* double check digest, sometimes buffers have been modified in flight. */
2734 if (dgs > 0 && dgs <= 64) {
2735 /* 64 byte, 512 bit, is the largest digest size
2736 * currently supported in kernel crypto. */
2737 unsigned char digest[64];
2738 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2739 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2740 dev_warn(DEV,
2741 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2742 (unsigned long long)req->sector, req->size);
2743 }
2744 } /* else if (dgs > 64) {
2745 ... Be noisy about digest too large ...
2746 } */
2747 }
2748
2749 drbd_put_data_sock(mdev);
2750
2751 return ok;
2752}
2753
2754/* answer packet, used to send data back for read requests:
2755 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2756 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2757 */
2758int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2759 struct drbd_epoch_entry *e)
2760{
2761 int ok;
2762 struct p_data p;
2763 void *dgb;
2764 int dgs;
2765
2766 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2767 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2768
2769 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2770 p.head.h80.magic = BE_DRBD_MAGIC;
2771 p.head.h80.command = cpu_to_be16(cmd);
2772 p.head.h80.length =
2773 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2774 } else {
2775 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2776 p.head.h95.command = cpu_to_be16(cmd);
2777 p.head.h95.length =
2778 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2779 }
2780
2781 p.sector = cpu_to_be64(e->sector);
2782 p.block_id = e->block_id;
2783 /* p.seq_num = 0; No sequence numbers here.. */
2784
2785 /* Only called by our kernel thread.
2786 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2787 * in response to admin command or module unload.
2788 */
2789 if (!drbd_get_data_sock(mdev))
2790 return 0;
2791
2792 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2793 if (ok && dgs) {
2794 dgb = mdev->int_dig_out;
2795 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2796 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2797 }
2798 if (ok)
2799 ok = _drbd_send_zc_ee(mdev, e);
2800
2801 drbd_put_data_sock(mdev);
2802
2803 return ok;
2804}
2805
2806int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2807{
2808 struct p_block_desc p;
2809
2810 p.sector = cpu_to_be64(req->sector);
2811 p.blksize = cpu_to_be32(req->size);
2812
2813 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2814}
2815
2816/*
2817 drbd_send distinguishes two cases:
2818
2819 Packets sent via the data socket "sock"
2820 and packets sent via the meta data socket "msock"
2821
2822 sock msock
2823 -----------------+-------------------------+------------------------------
2824 timeout conf.timeout / 2 conf.timeout / 2
2825 timeout action send a ping via msock Abort communication
2826 and close all sockets
2827*/
2828
2829/*
2830 * you must have locked the appropriate [m]sock mutex elsewhere!
2831 */
2832int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2833 void *buf, size_t size, unsigned msg_flags)
2834{
2835 struct kvec iov;
2836 struct msghdr msg;
2837 int rv, sent = 0;
2838
2839 if (!sock)
2840 return -1000;
2841
2842 /* THINK if (signal_pending) return ... ? */
2843
2844 iov.iov_base = buf;
2845 iov.iov_len = size;
2846
2847 msg.msg_name = NULL;
2848 msg.msg_namelen = 0;
2849 msg.msg_control = NULL;
2850 msg.msg_controllen = 0;
2851 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
2852
2853 if (sock == mdev->data.socket) {
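		/* re-arm the give-up counter consumed by
		 * we_should_drop_the_connection() on send timeouts */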
2854 mdev->ko_count = mdev->net_conf->ko_count;
2855 drbd_update_congested(mdev);
2856 }
2857 do {
2858 /* STRANGE
2859 * tcp_sendmsg does _not_ use its size parameter at all ?
2860 *
2861 * -EAGAIN on timeout, -EINTR on signal.
2862 */
2863/* THINK
2864 * do we need to block DRBD_SIG if sock == &meta.socket ??
2865 * otherwise wake_asender() might interrupt some send_*Ack !
2866 */
2867 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2868 if (rv == -EAGAIN) {
2869 if (we_should_drop_the_connection(mdev, sock))
2870 break;
2871 else
2872 continue;
2873 }
2874 D_ASSERT(rv != 0);
2875 if (rv == -EINTR) {
2876 flush_signals(current);
2877 rv = 0;
2878 }
2879 if (rv < 0)
2880 break;
2881 sent += rv;
2882 iov.iov_base += rv;
2883 iov.iov_len -= rv;
2884 } while (sent < size);
2885
2886 if (sock == mdev->data.socket)
2887 clear_bit(NET_CONGESTED, &mdev->flags);
2888
2889 if (rv <= 0) {
2890 if (rv != -EAGAIN) {
2891 dev_err(DEV, "%s_sendmsg returned %d\n",
2892 sock == mdev->meta.socket ? "msock" : "sock",
2893 rv);
2894 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2895 } else
2896 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2897 }
2898
2899 return sent;
2900}
2901
2902static int drbd_open(struct block_device *bdev, fmode_t mode)
2903{
2904 struct drbd_conf *mdev = bdev->bd_disk->private_data;
2905 unsigned long flags;
2906 int rv = 0;
2907
2908 mutex_lock(&drbd_main_mutex);
2909 spin_lock_irqsave(&mdev->req_lock, flags);
2910 /* to have a stable mdev->state.role
2911 * and no race with updating open_cnt */
2912
2913 if (mdev->state.role != R_PRIMARY) {
2914 if (mode & FMODE_WRITE)
2915 rv = -EROFS;
2916 else if (!allow_oos)
2917 rv = -EMEDIUMTYPE;
2918 }
2919
2920 if (!rv)
2921 mdev->open_cnt++;
2922 spin_unlock_irqrestore(&mdev->req_lock, flags);
2923 mutex_unlock(&drbd_main_mutex);
2924
2925 return rv;
2926}
2927
2928static int drbd_release(struct gendisk *gd, fmode_t mode)
2929{
2930 struct drbd_conf *mdev = gd->private_data;
2931 mutex_lock(&drbd_main_mutex);
2932 mdev->open_cnt--;
2933 mutex_unlock(&drbd_main_mutex);
2934 return 0;
2935}
2936
2937static void drbd_set_defaults(struct drbd_conf *mdev)
2938{
2939 /* This way we get a compile error when sync_conf grows,
2940	   and we forget to initialize it here */
2941 mdev->sync_conf = (struct syncer_conf) {
2942 /* .rate = */ DRBD_RATE_DEF,
2943 /* .after = */ DRBD_AFTER_DEF,
2944 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
2945 /* .verify_alg = */ {}, 0,
2946 /* .cpu_mask = */ {}, 0,
2947 /* .csums_alg = */ {}, 0,
2948 /* .use_rle = */ 0,
2949 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
2950 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
2951 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
2952 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
2953 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
2954 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
2955 };
2956
2957	/* Have to do it this way, because the layout differs between
2958 big endian and little endian */
2959 mdev->state = (union drbd_state) {
2960 { .role = R_SECONDARY,
2961 .peer = R_UNKNOWN,
2962 .conn = C_STANDALONE,
2963 .disk = D_DISKLESS,
2964 .pdsk = D_UNKNOWN,
2965 .susp = 0,
2966 .susp_nod = 0,
2967 .susp_fen = 0
2968 } };
2969}
2970
2971void drbd_init_set_defaults(struct drbd_conf *mdev)
2972{
2973 /* the memset(,0,) did most of this.
2974 * note: only assignments, no allocation in here */
2975
2976 drbd_set_defaults(mdev);
2977
2978 atomic_set(&mdev->ap_bio_cnt, 0);
2979 atomic_set(&mdev->ap_pending_cnt, 0);
2980 atomic_set(&mdev->rs_pending_cnt, 0);
2981 atomic_set(&mdev->unacked_cnt, 0);
2982 atomic_set(&mdev->local_cnt, 0);
2983 atomic_set(&mdev->net_cnt, 0);
2984 atomic_set(&mdev->packet_seq, 0);
2985 atomic_set(&mdev->pp_in_use, 0);
2986 atomic_set(&mdev->pp_in_use_by_net, 0);
2987 atomic_set(&mdev->rs_sect_in, 0);
2988 atomic_set(&mdev->rs_sect_ev, 0);
2989 atomic_set(&mdev->ap_in_flight, 0);
2990
2991 mutex_init(&mdev->md_io_mutex);
2992 mutex_init(&mdev->data.mutex);
2993 mutex_init(&mdev->meta.mutex);
2994 sema_init(&mdev->data.work.s, 0);
2995 sema_init(&mdev->meta.work.s, 0);
2996 mutex_init(&mdev->state_mutex);
2997
2998 spin_lock_init(&mdev->data.work.q_lock);
2999 spin_lock_init(&mdev->meta.work.q_lock);
3000
3001 spin_lock_init(&mdev->al_lock);
3002 spin_lock_init(&mdev->req_lock);
3003 spin_lock_init(&mdev->peer_seq_lock);
3004 spin_lock_init(&mdev->epoch_lock);
3005
3006 INIT_LIST_HEAD(&mdev->active_ee);
3007 INIT_LIST_HEAD(&mdev->sync_ee);
3008 INIT_LIST_HEAD(&mdev->done_ee);
3009 INIT_LIST_HEAD(&mdev->read_ee);
3010 INIT_LIST_HEAD(&mdev->net_ee);
3011 INIT_LIST_HEAD(&mdev->resync_reads);
3012 INIT_LIST_HEAD(&mdev->data.work.q);
3013 INIT_LIST_HEAD(&mdev->meta.work.q);
3014 INIT_LIST_HEAD(&mdev->resync_work.list);
3015 INIT_LIST_HEAD(&mdev->unplug_work.list);
3016 INIT_LIST_HEAD(&mdev->go_diskless.list);
3017 INIT_LIST_HEAD(&mdev->md_sync_work.list);
3018 INIT_LIST_HEAD(&mdev->start_resync_work.list);
3019 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
3020
3021 mdev->resync_work.cb = w_resync_timer;
3022 mdev->unplug_work.cb = w_send_write_hint;
3023 mdev->go_diskless.cb = w_go_diskless;
3024 mdev->md_sync_work.cb = w_md_sync;
3025 mdev->bm_io_work.w.cb = w_bitmap_io;
3026 mdev->start_resync_work.cb = w_start_resync;
3027 init_timer(&mdev->resync_timer);
3028 init_timer(&mdev->md_sync_timer);
3029 init_timer(&mdev->start_resync_timer);
3030 init_timer(&mdev->request_timer);
3031 mdev->resync_timer.function = resync_timer_fn;
3032 mdev->resync_timer.data = (unsigned long) mdev;
3033 mdev->md_sync_timer.function = md_sync_timer_fn;
3034 mdev->md_sync_timer.data = (unsigned long) mdev;
3035 mdev->start_resync_timer.function = start_resync_timer_fn;
3036 mdev->start_resync_timer.data = (unsigned long) mdev;
3037 mdev->request_timer.function = request_timer_fn;
3038 mdev->request_timer.data = (unsigned long) mdev;
3039
3040 init_waitqueue_head(&mdev->misc_wait);
3041 init_waitqueue_head(&mdev->state_wait);
3042 init_waitqueue_head(&mdev->net_cnt_wait);
3043 init_waitqueue_head(&mdev->ee_wait);
3044 init_waitqueue_head(&mdev->al_wait);
3045 init_waitqueue_head(&mdev->seq_wait);
3046
3047 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3048 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3049 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3050
3051 mdev->agreed_pro_version = PRO_VERSION_MAX;
3052 mdev->write_ordering = WO_bdev_flush;
3053 mdev->resync_wenr = LC_FREE;
3054 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3055 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3056}
3057
3058void drbd_mdev_cleanup(struct drbd_conf *mdev)
3059{
3060 int i;
3061 if (mdev->receiver.t_state != None)
3062 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3063 mdev->receiver.t_state);
3064
3065 /* no need to lock it, I'm the only thread alive */
3066 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3067 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3068 mdev->al_writ_cnt =
3069 mdev->bm_writ_cnt =
3070 mdev->read_cnt =
3071 mdev->recv_cnt =
3072 mdev->send_cnt =
3073 mdev->writ_cnt =
3074 mdev->p_size =
3075 mdev->rs_start =
3076 mdev->rs_total =
3077 mdev->rs_failed = 0;
3078 mdev->rs_last_events = 0;
3079 mdev->rs_last_sect_ev = 0;
3080 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3081 mdev->rs_mark_left[i] = 0;
3082 mdev->rs_mark_time[i] = 0;
3083 }
3084 D_ASSERT(mdev->net_conf == NULL);
3085
3086 drbd_set_my_capacity(mdev, 0);
3087 if (mdev->bitmap) {
3088 /* maybe never allocated. */
3089 drbd_bm_resize(mdev, 0, 1);
3090 drbd_bm_cleanup(mdev);
3091 }
3092
3093 drbd_free_resources(mdev);
3094 clear_bit(AL_SUSPENDED, &mdev->flags);
3095
3096 /*
3097	 * currently we call drbd_init_ee only on module load, so
3098	 * we may call drbd_release_ee only on module unload!
3099 */
3100 D_ASSERT(list_empty(&mdev->active_ee));
3101 D_ASSERT(list_empty(&mdev->sync_ee));
3102 D_ASSERT(list_empty(&mdev->done_ee));
3103 D_ASSERT(list_empty(&mdev->read_ee));
3104 D_ASSERT(list_empty(&mdev->net_ee));
3105 D_ASSERT(list_empty(&mdev->resync_reads));
3106 D_ASSERT(list_empty(&mdev->data.work.q));
3107 D_ASSERT(list_empty(&mdev->meta.work.q));
3108 D_ASSERT(list_empty(&mdev->resync_work.list));
3109 D_ASSERT(list_empty(&mdev->unplug_work.list));
3110 D_ASSERT(list_empty(&mdev->go_diskless.list));
3111
3112 drbd_set_defaults(mdev);
3113}
3114
3115
3116static void drbd_destroy_mempools(void)
3117{
3118 struct page *page;
3119
3120 while (drbd_pp_pool) {
3121 page = drbd_pp_pool;
3122 drbd_pp_pool = (struct page *)page_private(page);
3123 __free_page(page);
3124 drbd_pp_vacant--;
3125 }
3126
3127 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3128
3129 if (drbd_ee_mempool)
3130 mempool_destroy(drbd_ee_mempool);
3131 if (drbd_request_mempool)
3132 mempool_destroy(drbd_request_mempool);
3133 if (drbd_ee_cache)
3134 kmem_cache_destroy(drbd_ee_cache);
3135 if (drbd_request_cache)
3136 kmem_cache_destroy(drbd_request_cache);
3137 if (drbd_bm_ext_cache)
3138 kmem_cache_destroy(drbd_bm_ext_cache);
3139 if (drbd_al_ext_cache)
3140 kmem_cache_destroy(drbd_al_ext_cache);
3141
3142 drbd_ee_mempool = NULL;
3143 drbd_request_mempool = NULL;
3144 drbd_ee_cache = NULL;
3145 drbd_request_cache = NULL;
3146 drbd_bm_ext_cache = NULL;
3147 drbd_al_ext_cache = NULL;
3148
3149 return;
3150}
3151
3152static int drbd_create_mempools(void)
3153{
3154 struct page *page;
3155 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3156 int i;
3157
3158 /* prepare our caches and mempools */
3159 drbd_request_mempool = NULL;
3160 drbd_ee_cache = NULL;
3161 drbd_request_cache = NULL;
3162 drbd_bm_ext_cache = NULL;
3163 drbd_al_ext_cache = NULL;
3164 drbd_pp_pool = NULL;
3165
3166 /* caches */
3167 drbd_request_cache = kmem_cache_create(
3168 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3169 if (drbd_request_cache == NULL)
3170 goto Enomem;
3171
3172 drbd_ee_cache = kmem_cache_create(
3173 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3174 if (drbd_ee_cache == NULL)
3175 goto Enomem;
3176
3177 drbd_bm_ext_cache = kmem_cache_create(
3178 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3179 if (drbd_bm_ext_cache == NULL)
3180 goto Enomem;
3181
3182 drbd_al_ext_cache = kmem_cache_create(
3183 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3184 if (drbd_al_ext_cache == NULL)
3185 goto Enomem;
3186
3187 /* mempools */
3188 drbd_request_mempool = mempool_create(number,
3189 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3190 if (drbd_request_mempool == NULL)
3191 goto Enomem;
3192
3193 drbd_ee_mempool = mempool_create(number,
3194 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3195 if (drbd_ee_mempool == NULL)
3196 goto Enomem;
3197
3198 /* drbd's page pool */
3199 spin_lock_init(&drbd_pp_lock);
3200
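	/* preallocate the page pool as a singly linked list,
	 * chained through each page's page_private() field */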
3201 for (i = 0; i < number; i++) {
3202 page = alloc_page(GFP_HIGHUSER);
3203 if (!page)
3204 goto Enomem;
3205 set_page_private(page, (unsigned long)drbd_pp_pool);
3206 drbd_pp_pool = page;
3207 }
3208 drbd_pp_vacant = number;
3209
3210 return 0;
3211
3212Enomem:
3213 drbd_destroy_mempools(); /* in case we allocated some */
3214 return -ENOMEM;
3215}
3216
3217static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3218 void *unused)
3219{
3220 /* just so we have it. you never know what interesting things we
3221 * might want to do here some day...
3222 */
3223
3224 return NOTIFY_DONE;
3225}
3226
3227static struct notifier_block drbd_notifier = {
3228 .notifier_call = drbd_notify_sys,
3229};
3230
3231static void drbd_release_ee_lists(struct drbd_conf *mdev)
3232{
3233 int rr;
3234
3235 rr = drbd_release_ee(mdev, &mdev->active_ee);
3236 if (rr)
3237 dev_err(DEV, "%d EEs in active list found!\n", rr);
3238
3239 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3240 if (rr)
3241 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3242
3243 rr = drbd_release_ee(mdev, &mdev->read_ee);
3244 if (rr)
3245 dev_err(DEV, "%d EEs in read list found!\n", rr);
3246
3247 rr = drbd_release_ee(mdev, &mdev->done_ee);
3248 if (rr)
3249 dev_err(DEV, "%d EEs in done list found!\n", rr);
3250
3251 rr = drbd_release_ee(mdev, &mdev->net_ee);
3252 if (rr)
3253 dev_err(DEV, "%d EEs in net list found!\n", rr);
3254}
3255
3256/* caution. no locking.
3257 * currently only used from module cleanup code. */
3258static void drbd_delete_device(unsigned int minor)
3259{
3260 struct drbd_conf *mdev = minor_to_mdev(minor);
3261
3262 if (!mdev)
3263 return;
3264
3265 /* paranoia asserts */
3266 if (mdev->open_cnt != 0)
3267 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3268 __FILE__ , __LINE__);
3269
3270 ERR_IF (!list_empty(&mdev->data.work.q)) {
3271 struct list_head *lp;
3272 list_for_each(lp, &mdev->data.work.q) {
3273 dev_err(DEV, "lp = %p\n", lp);
3274 }
3275 };
3276 /* end paranoia asserts */
3277
3278 del_gendisk(mdev->vdisk);
3279
3280 /* cleanup stuff that may have been allocated during
3281 * device (re-)configuration or state changes */
3282
3283 if (mdev->this_bdev)
3284 bdput(mdev->this_bdev);
3285
3286 drbd_free_resources(mdev);
3287
3288 drbd_release_ee_lists(mdev);
3289
3290 /* should be freed on disconnect? */
3291 kfree(mdev->ee_hash);
3292 /*
3293 mdev->ee_hash_s = 0;
3294 mdev->ee_hash = NULL;
3295 */
3296
3297 lc_destroy(mdev->act_log);
3298 lc_destroy(mdev->resync);
3299
3300 kfree(mdev->p_uuid);
3301 /* mdev->p_uuid = NULL; */
3302
3303 kfree(mdev->int_dig_out);
3304 kfree(mdev->int_dig_in);
3305 kfree(mdev->int_dig_vv);
3306
3307 /* cleanup the rest that has been
3308 * allocated from drbd_new_device
3309 * and actually free the mdev itself */
3310 drbd_free_mdev(mdev);
3311}
3312
3313static void drbd_cleanup(void)
3314{
3315 unsigned int i;
3316
3317 unregister_reboot_notifier(&drbd_notifier);
3318
3319 /* first remove proc,
3320	 * drbdsetup uses its presence to detect
3321	 * whether DRBD is loaded.
3322	 * If we were to get stuck in proc removal,
3323 * but have netlink already deregistered,
3324 * some drbdsetup commands may wait forever
3325 * for an answer.
3326 */
3327 if (drbd_proc)
3328 remove_proc_entry("drbd", NULL);
3329
3330 drbd_nl_cleanup();
3331
3332 if (minor_table) {
3333 i = minor_count;
3334 while (i--)
3335 drbd_delete_device(i);
3336 drbd_destroy_mempools();
3337 }
3338
3339 kfree(minor_table);
3340
3341 unregister_blkdev(DRBD_MAJOR, "drbd");
3342
3343 printk(KERN_INFO "drbd: module cleanup done.\n");
3344}
3345
3346/**
3347 * drbd_congested() - Callback for pdflush
3348 * @congested_data: User data
3349 * @bdi_bits: Bits pdflush is currently interested in
3350 *
3351 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3352 */
3353static int drbd_congested(void *congested_data, int bdi_bits)
3354{
3355 struct drbd_conf *mdev = congested_data;
3356 struct request_queue *q;
3357 char reason = '-';
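	/* one-letter diagnostic: 'd' = drbd froze IO, 'b' = backing device,
	 * 'n' = network, 'a' = backing device and network */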
3358 int r = 0;
3359
3360 if (!may_inc_ap_bio(mdev)) {
3361 /* DRBD has frozen IO */
3362 r = bdi_bits;
3363 reason = 'd';
3364 goto out;
3365 }
3366
3367 if (get_ldev(mdev)) {
3368 q = bdev_get_queue(mdev->ldev->backing_bdev);
3369 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3370 put_ldev(mdev);
3371 if (r)
3372 reason = 'b';
3373 }
3374
3375 if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3376 r |= (1 << BDI_async_congested);
3377 reason = reason == 'b' ? 'a' : 'n';
3378 }
3379
3380out:
3381 mdev->congestion_reason = reason;
3382 return r;
3383}
3384
3385struct drbd_conf *drbd_new_device(unsigned int minor)
3386{
3387 struct drbd_conf *mdev;
3388 struct gendisk *disk;
3389 struct request_queue *q;
3390
3391 /* GFP_KERNEL, we are outside of all write-out paths */
3392 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3393 if (!mdev)
3394 return NULL;
3395 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3396 goto out_no_cpumask;
3397
3398 mdev->minor = minor;
3399
3400 drbd_init_set_defaults(mdev);
3401
3402 q = blk_alloc_queue(GFP_KERNEL);
3403 if (!q)
3404 goto out_no_q;
3405 mdev->rq_queue = q;
3406 q->queuedata = mdev;
3407
3408 disk = alloc_disk(1);
3409 if (!disk)
3410 goto out_no_disk;
3411 mdev->vdisk = disk;
3412
3413 set_disk_ro(disk, true);
3414
3415 disk->queue = q;
3416 disk->major = DRBD_MAJOR;
3417 disk->first_minor = minor;
3418 disk->fops = &drbd_ops;
3419 sprintf(disk->disk_name, "drbd%d", minor);
3420 disk->private_data = mdev;
3421
3422 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3423 /* we have no partitions. we contain only ourselves. */
3424 mdev->this_bdev->bd_contains = mdev->this_bdev;
3425
3426 q->backing_dev_info.congested_fn = drbd_congested;
3427 q->backing_dev_info.congested_data = mdev;
3428
3429 blk_queue_make_request(q, drbd_make_request);
3430	/* Setting max_hw_sectors to an odd value of 8 KiB here;
3431	   this triggers a max_bio_size message upon first attach or connect */
3432 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3433 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3434 blk_queue_merge_bvec(q, drbd_merge_bvec);
3435 q->queue_lock = &mdev->req_lock;
3436
3437 mdev->md_io_page = alloc_page(GFP_KERNEL);
3438 if (!mdev->md_io_page)
3439 goto out_no_io_page;
3440
3441 if (drbd_bm_init(mdev))
3442 goto out_no_bitmap;
3443 /* no need to lock access, we are still initializing this minor device. */
3444 if (!tl_init(mdev))
3445 goto out_no_tl;
3446
3447 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3448 if (!mdev->app_reads_hash)
3449 goto out_no_app_reads;
3450
3451 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3452 if (!mdev->current_epoch)
3453 goto out_no_epoch;
3454
3455 INIT_LIST_HEAD(&mdev->current_epoch->list);
3456 mdev->epochs = 1;
3457
3458 return mdev;
3459
3460/* out_whatever_else:
3461 kfree(mdev->current_epoch); */
3462out_no_epoch:
3463 kfree(mdev->app_reads_hash);
3464out_no_app_reads:
3465 tl_cleanup(mdev);
3466out_no_tl:
3467 drbd_bm_cleanup(mdev);
3468out_no_bitmap:
3469 __free_page(mdev->md_io_page);
3470out_no_io_page:
3471 put_disk(disk);
3472out_no_disk:
3473 blk_cleanup_queue(q);
3474out_no_q:
3475 free_cpumask_var(mdev->cpu_mask);
3476out_no_cpumask:
3477 kfree(mdev);
3478 return NULL;
3479}
3480
3481/* counterpart of drbd_new_device.
3482 * last part of drbd_delete_device. */
3483void drbd_free_mdev(struct drbd_conf *mdev)
3484{
3485 kfree(mdev->current_epoch);
3486 kfree(mdev->app_reads_hash);
3487 tl_cleanup(mdev);
3488 if (mdev->bitmap) /* should no longer be there. */
3489 drbd_bm_cleanup(mdev);
3490 __free_page(mdev->md_io_page);
3491 put_disk(mdev->vdisk);
3492 blk_cleanup_queue(mdev->rq_queue);
3493 free_cpumask_var(mdev->cpu_mask);
3494 drbd_free_tl_hash(mdev);
3495 kfree(mdev);
3496}
3497
3498
3499int __init drbd_init(void)
3500{
3501 int err;
3502
3503 if (sizeof(struct p_handshake) != 80) {
3504 printk(KERN_ERR
3505 "drbd: never change the size or layout "
3506 "of the HandShake packet.\n");
3507 return -EINVAL;
3508 }
3509
3510 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3511 printk(KERN_ERR
3512 "drbd: invalid minor_count (%d)\n", minor_count);
3513#ifdef MODULE
3514 return -EINVAL;
3515#else
3516 minor_count = 8;
3517#endif
3518 }
3519
3520 err = drbd_nl_init();
3521 if (err)
3522 return err;
3523
3524 err = register_blkdev(DRBD_MAJOR, "drbd");
3525 if (err) {
3526 printk(KERN_ERR
3527 "drbd: unable to register block device major %d\n",
3528 DRBD_MAJOR);
3529 return err;
3530 }
3531
3532 register_reboot_notifier(&drbd_notifier);
3533
3534 /*
3535 * allocate all necessary structs
3536 */
3537 err = -ENOMEM;
3538
3539 init_waitqueue_head(&drbd_pp_wait);
3540
3541 drbd_proc = NULL; /* play safe for drbd_cleanup */
3542 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3543 GFP_KERNEL);
3544 if (!minor_table)
3545 goto Enomem;
3546
3547 err = drbd_create_mempools();
3548 if (err)
3549 goto Enomem;
3550
3551 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3552 if (!drbd_proc) {
3553 printk(KERN_ERR "drbd: unable to register proc file\n");
3554 goto Enomem;
3555 }
3556
3557 rwlock_init(&global_state_lock);
3558
3559 printk(KERN_INFO "drbd: initialized. "
3560 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3561 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3562 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3563 printk(KERN_INFO "drbd: registered as block device major %d\n",
3564 DRBD_MAJOR);
3565 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3566
3567 return 0; /* Success! */
3568
3569Enomem:
3570 drbd_cleanup();
3571 if (err == -ENOMEM)
3572 /* currently always the case */
3573 printk(KERN_ERR "drbd: ran out of memory\n");
3574 else
3575 printk(KERN_ERR "drbd: initialization failure\n");
3576 return err;
3577}
3578
3579void drbd_free_bc(struct drbd_backing_dev *ldev)
3580{
3581 if (ldev == NULL)
3582 return;
3583
3584 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3585 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3586
3587 kfree(ldev);
3588}
3589
3590void drbd_free_sock(struct drbd_conf *mdev)
3591{
3592 if (mdev->data.socket) {
3593 mutex_lock(&mdev->data.mutex);
3594 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3595 sock_release(mdev->data.socket);
3596 mdev->data.socket = NULL;
3597 mutex_unlock(&mdev->data.mutex);
3598 }
3599 if (mdev->meta.socket) {
3600 mutex_lock(&mdev->meta.mutex);
3601 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3602 sock_release(mdev->meta.socket);
3603 mdev->meta.socket = NULL;
3604 mutex_unlock(&mdev->meta.mutex);
3605 }
3606}
3607
3608
3609void drbd_free_resources(struct drbd_conf *mdev)
3610{
3611 crypto_free_hash(mdev->csums_tfm);
3612 mdev->csums_tfm = NULL;
3613 crypto_free_hash(mdev->verify_tfm);
3614 mdev->verify_tfm = NULL;
3615 crypto_free_hash(mdev->cram_hmac_tfm);
3616 mdev->cram_hmac_tfm = NULL;
3617 crypto_free_hash(mdev->integrity_w_tfm);
3618 mdev->integrity_w_tfm = NULL;
3619 crypto_free_hash(mdev->integrity_r_tfm);
3620 mdev->integrity_r_tfm = NULL;
3621
3622 drbd_free_sock(mdev);
3623
3624 __no_warn(local,
3625 drbd_free_bc(mdev->ldev);
3626 mdev->ldev = NULL;);
3627}
3628
3629/* meta data management */
3630
3631struct meta_data_on_disk {
3632 u64 la_size; /* last agreed size. */
3633 u64 uuid[UI_SIZE]; /* UUIDs. */
3634 u64 device_uuid;
3635 u64 reserved_u64_1;
3636 u32 flags; /* MDF */
3637 u32 magic;
3638 u32 md_size_sect;
3639 u32 al_offset; /* offset to this block */
3640 u32 al_nr_extents; /* important for restoring the AL */
3641 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3642 u32 bm_offset; /* offset to the bitmap, from here */
3643 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3644 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3645 u32 reserved_u32[3];
3646
3647} __packed;
3648
3649/**
3650 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3651 * @mdev: DRBD device.
3652 */
3653void drbd_md_sync(struct drbd_conf *mdev)
3654{
3655 struct meta_data_on_disk *buffer;
3656 sector_t sector;
3657 int i;
3658
3659 del_timer(&mdev->md_sync_timer);
3660 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3661 if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3662 return;
3663
3664	/* We use D_FAILED here, and not D_ATTACHING, because we try to write
3665 * metadata even if we detach due to a disk failure! */
3666 if (!get_ldev_if_state(mdev, D_FAILED))
3667 return;
3668
3669 mutex_lock(&mdev->md_io_mutex);
3670 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
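	/* the superblock occupies a single 512 byte sector;
	 * zero it so that reserved fields end up as 0 on disk */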
3671 memset(buffer, 0, 512);
3672
3673 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3674 for (i = UI_CURRENT; i < UI_SIZE; i++)
3675 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3676 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3677 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3678
3679 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3680 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3681 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3682 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3683 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3684
3685 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3686 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3687
3688 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3689 sector = mdev->ldev->md.md_offset;
3690
3691 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3692		/* this was only a best-effort attempt anyway ... */
3693 dev_err(DEV, "meta data update failed!\n");
3694 drbd_chk_io_error(mdev, 1, true);
3695 }
3696
3697	/* Update mdev->ldev->md.la_size_sect,
3698	 * since we just wrote it to the meta data. */
3699 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3700
3701 mutex_unlock(&mdev->md_io_mutex);
3702 put_ldev(mdev);
3703}
3704
3705/**
3706 * drbd_md_read() - Reads in the meta data super block
3707 * @mdev: DRBD device.
3708 * @bdev: Device from which the meta data should be read in.
3709 *
3710 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3711 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3712 */
3713int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3714{
3715 struct meta_data_on_disk *buffer;
3716 int i, rv = NO_ERROR;
3717
3718 if (!get_ldev_if_state(mdev, D_ATTACHING))
3719 return ERR_IO_MD_DISK;
3720
3721 mutex_lock(&mdev->md_io_mutex);
3722 buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3723
3724 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3725 /* NOTE: can't do normal error processing here as this is
3726 called BEFORE disk is attached */
3727 dev_err(DEV, "Error while reading metadata.\n");
3728 rv = ERR_IO_MD_DISK;
3729 goto err;
3730 }
3731
3732 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3733 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3734 rv = ERR_MD_INVALID;
3735 goto err;
3736 }
3737 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3738 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3739 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3740 rv = ERR_MD_INVALID;
3741 goto err;
3742 }
3743 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3744 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3745 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3746 rv = ERR_MD_INVALID;
3747 goto err;
3748 }
3749 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3750 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3751 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3752 rv = ERR_MD_INVALID;
3753 goto err;
3754 }
3755
3756 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3757 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3758 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3759 rv = ERR_MD_INVALID;
3760 goto err;
3761 }
3762
3763 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3764 for (i = UI_CURRENT; i < UI_SIZE; i++)
3765 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3766 bdev->md.flags = be32_to_cpu(buffer->flags);
3767 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3768 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3769
3770 spin_lock_irq(&mdev->req_lock);
3771 if (mdev->state.conn < C_CONNECTED) {
3772 int peer;
3773 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3774 peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3775 mdev->peer_max_bio_size = peer;
3776 }
3777 spin_unlock_irq(&mdev->req_lock);
3778
3779 if (mdev->sync_conf.al_extents < 7)
3780 mdev->sync_conf.al_extents = 127;
3781
3782 err:
3783 mutex_unlock(&mdev->md_io_mutex);
3784 put_ldev(mdev);
3785
3786 return rv;
3787}
3788
3789/**
3790 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3791 * @mdev: DRBD device.
3792 *
3793 * Call this function if you change anything that should be written to
3794 * the meta-data super block. This function sets MD_DIRTY and starts a
3795 * timer that ensures drbd_md_sync() is called within five seconds.
3796 */
3797#ifdef DEBUG
3798void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3799{
3800 if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3801 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3802 mdev->last_md_mark_dirty.line = line;
3803 mdev->last_md_mark_dirty.func = func;
3804 }
3805}
3806#else
3807void drbd_md_mark_dirty(struct drbd_conf *mdev)
3808{
3809 if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3810 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3811}
3812#endif
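
/* Illustrative call sequence (a sketch, not a verbatim call site): code that
 * changes anything covered by the on-disk super block marks it dirty first
 * and relies on drbd_md_sync() to flush it, e.g.
 *
 *	drbd_md_set_flag(mdev, MDF_FULL_SYNC);	<-- marks MD_DIRTY internally
 *	...
 *	drbd_md_sync(mdev);			<-- writes the super block now
 *
 * If drbd_md_sync() is not called explicitly, the md_sync_timer armed above
 * makes the worker flush the super block via w_md_sync().
 */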
3813
3814static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3815{
3816 int i;
3817
3818 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3819 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3820}
3821
3822void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3823{
3824 if (idx == UI_CURRENT) {
3825 if (mdev->state.role == R_PRIMARY)
3826 val |= 1;
3827 else
3828 val &= ~((u64)1);
3829
3830 drbd_set_ed_uuid(mdev, val);
3831 }
3832
3833 mdev->ldev->md.uuid[idx] = val;
3834 drbd_md_mark_dirty(mdev);
3835}
3836
3837
3838void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3839{
3840 if (mdev->ldev->md.uuid[idx]) {
3841 drbd_uuid_move_history(mdev);
3842 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
3843 }
3844 _drbd_uuid_set(mdev, idx, val);
3845}
3846
3847/**
3848 * drbd_uuid_new_current() - Creates a new current UUID
3849 * @mdev: DRBD device.
3850 *
3851 * Creates a new current UUID, and rotates the old current UUID into
3852 * the bitmap slot. Causes an incremental resync upon next connect.
3853 */
3854void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3855{
3856 u64 val;
3857 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3858
3859 if (bm_uuid)
3860 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3861
3862 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
3863
3864 get_random_bytes(&val, sizeof(u64));
3865 _drbd_uuid_set(mdev, UI_CURRENT, val);
3866 drbd_print_uuids(mdev, "new current UUID");
3867 /* get it to stable storage _now_ */
3868 drbd_md_sync(mdev);
3869}
3870
3871void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3872{
3873 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3874 return;
3875
3876 if (val == 0) {
3877 drbd_uuid_move_history(mdev);
3878 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3879 mdev->ldev->md.uuid[UI_BITMAP] = 0;
3880 } else {
3881 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3882 if (bm_uuid)
3883 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3884
3885 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
3886 }
3887 drbd_md_mark_dirty(mdev);
3888}
3889
3890/**
3891 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3892 * @mdev: DRBD device.
3893 *
3894 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3895 */
3896int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3897{
3898 int rv = -EIO;
3899
3900 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3901 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3902 drbd_md_sync(mdev);
3903 drbd_bm_set_all(mdev);
3904
3905 rv = drbd_bm_write(mdev);
3906
3907 if (!rv) {
3908 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3909 drbd_md_sync(mdev);
3910 }
3911
3912 put_ldev(mdev);
3913 }
3914
3915 return rv;
3916}
3917
3918/**
3919 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3920 * @mdev: DRBD device.
3921 *
3922 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3923 */
3924int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3925{
3926 int rv = -EIO;
3927
3928 drbd_resume_al(mdev);
3929 if (get_ldev_if_state(mdev, D_ATTACHING)) {
3930 drbd_bm_clear_all(mdev);
3931 rv = drbd_bm_write(mdev);
3932 put_ldev(mdev);
3933 }
3934
3935 return rv;
3936}
3937
3938static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3939{
3940 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
3941 int rv = -EIO;
3942
3943 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3944
3945 if (get_ldev(mdev)) {
3946 drbd_bm_lock(mdev, work->why, work->flags);
3947 rv = work->io_fn(mdev);
3948 drbd_bm_unlock(mdev);
3949 put_ldev(mdev);
3950 }
3951
3952 clear_bit(BITMAP_IO, &mdev->flags);
3953 smp_mb__after_clear_bit();
3954 wake_up(&mdev->misc_wait);
3955
3956 if (work->done)
3957 work->done(mdev, rv);
3958
3959 clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3960 work->why = NULL;
3961 work->flags = 0;
3962
3963 return 1;
3964}
3965
3966void drbd_ldev_destroy(struct drbd_conf *mdev)
3967{
3968 lc_destroy(mdev->resync);
3969 mdev->resync = NULL;
3970 lc_destroy(mdev->act_log);
3971 mdev->act_log = NULL;
3972 __no_warn(local,
3973 drbd_free_bc(mdev->ldev);
3974 mdev->ldev = NULL;);
3975
3976 if (mdev->md_io_tmpp) {
3977 __free_page(mdev->md_io_tmpp);
3978 mdev->md_io_tmpp = NULL;
3979 }
3980 clear_bit(GO_DISKLESS, &mdev->flags);
3981}
3982
3983static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3984{
3985 D_ASSERT(mdev->state.disk == D_FAILED);
3986 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3987 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3988 * the protected members anymore, though, so once put_ldev reaches zero
3989 * again, it will be safe to free them. */
3990 drbd_force_state(mdev, NS(disk, D_DISKLESS));
3991 return 1;
3992}
3993
3994void drbd_go_diskless(struct drbd_conf *mdev)
3995{
3996 D_ASSERT(mdev->state.disk == D_FAILED);
3997 if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3998 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3999}
4000
4001/**
4002 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4003 * @mdev: DRBD device.
4004 * @io_fn: IO callback to be called when bitmap IO is possible
4005 * @done: callback to be called after the bitmap IO was performed
4006 * @why: Descriptive text of the reason for doing the IO
4007 *
4008 * While IO on the bitmap is in progress, application IO is frozen; this ensures
4009 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
4010 * called from worker context. It MUST NOT be used while a previous such
4011 * work is still pending!
4012 */
4013void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4014 int (*io_fn)(struct drbd_conf *),
4015 void (*done)(struct drbd_conf *, int),
4016 char *why, enum bm_flag flags)
4017{
4018 D_ASSERT(current == mdev->worker.task);
4019
4020 D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4021 D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4022 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4023 if (mdev->bm_io_work.why)
4024 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4025 why, mdev->bm_io_work.why);
4026
4027 mdev->bm_io_work.io_fn = io_fn;
4028 mdev->bm_io_work.done = done;
4029 mdev->bm_io_work.why = why;
4030 mdev->bm_io_work.flags = flags;
4031
4032 spin_lock_irq(&mdev->req_lock);
4033 set_bit(BITMAP_IO, &mdev->flags);
4034 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
4035 if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
4036 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
4037 }
4038 spin_unlock_irq(&mdev->req_lock);
4039}
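
/* Illustrative usage (sketch only; the flag choice is an assumption): from
 * worker context a full "set all bits and write out" pass could be queued as
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
 *			     "example full sync", BM_LOCKED_SET_ALLOWED);
 *
 * The io_fn then runs from w_bitmap_io() once application IO has drained;
 * the done callback may be NULL, as checked in w_bitmap_io().
 */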
4040
4041/**
4042 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
4043 * @mdev: DRBD device.
4044 * @io_fn: IO callback to be called when bitmap IO is possible
4045 * @why: Descriptive text of the reason for doing the IO
4046 *
4047 * Freezes application IO while the actual IO operation runs. This
4048 * function MAY NOT be called from worker context.
4049 */
4050int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4051 char *why, enum bm_flag flags)
4052{
4053 int rv;
4054
4055 D_ASSERT(current != mdev->worker.task);
4056
4057 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4058 drbd_suspend_io(mdev);
4059
4060 drbd_bm_lock(mdev, why, flags);
4061 rv = io_fn(mdev);
4062 drbd_bm_unlock(mdev);
4063
4064 if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4065 drbd_resume_io(mdev);
4066
4067 return rv;
4068}
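
/* Illustrative usage (sketch only; the flag value here is only for
 * illustration): from a context other than the worker, a synchronous bitmap
 * operation could look like
 *
 *	rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *			    "example clear", BM_LOCKED_SET_ALLOWED);
 *
 * Passing BM_LOCKED_SET_ALLOWED skips the suspend/resume of application IO,
 * see the checks above.
 */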
4069
4070void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4071{
4072 if ((mdev->ldev->md.flags & flag) != flag) {
4073 drbd_md_mark_dirty(mdev);
4074 mdev->ldev->md.flags |= flag;
4075 }
4076}
4077
4078void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4079{
4080 if ((mdev->ldev->md.flags & flag) != 0) {
4081 drbd_md_mark_dirty(mdev);
4082 mdev->ldev->md.flags &= ~flag;
4083 }
4084}
4085int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4086{
4087 return (bdev->md.flags & flag) != 0;
4088}
4089
4090static void md_sync_timer_fn(unsigned long data)
4091{
4092 struct drbd_conf *mdev = (struct drbd_conf *) data;
4093
4094 drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
4095}
4096
4097static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4098{
4099 dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
4100#ifdef DEBUG
4101 dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4102 mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4103#endif
4104 drbd_md_sync(mdev);
4105 return 1;
4106}
4107
4108#ifdef CONFIG_DRBD_FAULT_INJECTION
4109/* Fault insertion support including random number generator shamelessly
4110 * stolen from kernel/rcutorture.c */
4111struct fault_random_state {
4112 unsigned long state;
4113 unsigned long count;
4114};
4115
4116#define FAULT_RANDOM_MULT 39916801 /* prime */
4117#define FAULT_RANDOM_ADD 479001701 /* prime */
4118#define FAULT_RANDOM_REFRESH 10000
4119
4120/*
4121 * Crude but fast random-number generator. Uses a linear congruential
4122 * generator, with occasional help from get_random_bytes().
4123 */
4124static unsigned long
4125_drbd_fault_random(struct fault_random_state *rsp)
4126{
4127 long refresh;
4128
4129 if (!rsp->count--) {
4130 get_random_bytes(&refresh, sizeof(refresh));
4131 rsp->state += refresh;
4132 rsp->count = FAULT_RANDOM_REFRESH;
4133 }
4134 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4135 return swahw32(rsp->state);
4136}
4137
4138static char *
4139_drbd_fault_str(unsigned int type) {
4140 static char *_faults[] = {
4141 [DRBD_FAULT_MD_WR] = "Meta-data write",
4142 [DRBD_FAULT_MD_RD] = "Meta-data read",
4143 [DRBD_FAULT_RS_WR] = "Resync write",
4144 [DRBD_FAULT_RS_RD] = "Resync read",
4145 [DRBD_FAULT_DT_WR] = "Data write",
4146 [DRBD_FAULT_DT_RD] = "Data read",
4147 [DRBD_FAULT_DT_RA] = "Data read ahead",
4148 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
4149 [DRBD_FAULT_AL_EE] = "EE allocation",
4150 [DRBD_FAULT_RECEIVE] = "receive data corruption",
4151 };
4152
4153 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4154}
4155
4156unsigned int
4157_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4158{
4159 static struct fault_random_state rrs = {0, 0};
4160
4161 unsigned int ret = (
4162 (fault_devs == 0 ||
4163 ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4164 (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4165
4166 if (ret) {
4167 fault_count++;
4168
4169 if (__ratelimit(&drbd_ratelimit_state))
4170 dev_warn(DEV, "***Simulating %s failure\n",
4171 _drbd_fault_str(type));
4172 }
4173
4174 return ret;
4175}
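
/* Worked example (derived from the expression above, not from external
 * documentation): with fault_rate=10, roughly every 10th eligible request is
 * failed, because
 *
 *	(_drbd_fault_random(&rrs) % 100) + 1 <= fault_rate
 *
 * compares a (crude) uniform draw from 1..100 against a percentage.
 * fault_devs is a bitmask of minor numbers; 0 means "all devices".
 * fault_count simply counts the faults that were actually inserted.
 */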
4176#endif
4177
4178const char *drbd_buildtag(void)
4179{
4180	/* When DRBD is built from external sources, this holds a reference
4181	   to the git hash of the source code. */
4182
4183 static char buildtag[38] = "\0uilt-in";
4184
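	/* The string deliberately starts with a NUL byte: if we are built into
	 * the kernel (no THIS_MODULE, no srcversion), the leading byte is set
	 * to 'b' below, turning the buffer into the literal "built-in". */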
4185 if (buildtag[0] == 0) {
4186#ifdef CONFIG_MODULES
4187 if (THIS_MODULE != NULL)
4188 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4189 else
4190#endif
4191 buildtag[0] = 'b';
4192 }
4193
4194 return buildtag;
4195}
4196
4197module_init(drbd_init)
4198module_exit(drbd_cleanup)
4199
4200EXPORT_SYMBOL(drbd_conn_str);
4201EXPORT_SYMBOL(drbd_role_str);
4202EXPORT_SYMBOL(drbd_disk_str);
4203EXPORT_SYMBOL(drbd_set_st_err_str);
28
29#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30
31#include <linux/module.h>
32#include <linux/jiffies.h>
33#include <linux/drbd.h>
34#include <linux/uaccess.h>
35#include <asm/types.h>
36#include <net/sock.h>
37#include <linux/ctype.h>
38#include <linux/mutex.h>
39#include <linux/fs.h>
40#include <linux/file.h>
41#include <linux/proc_fs.h>
42#include <linux/init.h>
43#include <linux/mm.h>
44#include <linux/memcontrol.h>
45#include <linux/mm_inline.h>
46#include <linux/slab.h>
47#include <linux/random.h>
48#include <linux/reboot.h>
49#include <linux/notifier.h>
50#include <linux/kthread.h>
51#include <linux/workqueue.h>
52#define __KERNEL_SYSCALLS__
53#include <linux/unistd.h>
54#include <linux/vmalloc.h>
55#include <linux/sched/signal.h>
56
57#include <linux/drbd_limits.h>
58#include "drbd_int.h"
59#include "drbd_protocol.h"
60#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
61#include "drbd_vli.h"
62#include "drbd_debugfs.h"
63
64static DEFINE_MUTEX(drbd_main_mutex);
65static int drbd_open(struct block_device *bdev, fmode_t mode);
66static void drbd_release(struct gendisk *gd, fmode_t mode);
67static void md_sync_timer_fn(struct timer_list *t);
68static int w_bitmap_io(struct drbd_work *w, int unused);
69
70MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
71 "Lars Ellenberg <lars@linbit.com>");
72MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
73MODULE_VERSION(REL_VERSION);
74MODULE_LICENSE("GPL");
75MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
76 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
77MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
78
79#include <linux/moduleparam.h>
80/* thanks to these macros, if compiled into the kernel (not-module),
81 * these become boot parameters (e.g., drbd.minor_count) */
82
83#ifdef CONFIG_DRBD_FAULT_INJECTION
84int drbd_enable_faults;
85int drbd_fault_rate;
86static int drbd_fault_count;
87static int drbd_fault_devs;
88/* bitmap of enabled faults */
89module_param_named(enable_faults, drbd_enable_faults, int, 0664);
90/* fault rate % value - applies to all enabled faults */
91module_param_named(fault_rate, drbd_fault_rate, int, 0664);
92/* count of faults inserted */
93module_param_named(fault_count, drbd_fault_count, int, 0664);
94/* bitmap of devices to insert faults on */
95module_param_named(fault_devs, drbd_fault_devs, int, 0644);
96#endif
97
98/* module parameters we can keep static */
99static bool drbd_allow_oos; /* allow_open_on_secondary */
100static bool drbd_disable_sendpage;
101MODULE_PARM_DESC(allow_oos, "DONT USE!");
102module_param_named(allow_oos, drbd_allow_oos, bool, 0);
103module_param_named(disable_sendpage, drbd_disable_sendpage, bool, 0644);
104
105/* module parameters we share */
106int drbd_proc_details; /* Detail level in proc drbd*/
107module_param_named(proc_details, drbd_proc_details, int, 0644);
108/* module parameters shared with defaults */
109unsigned int drbd_minor_count = DRBD_MINOR_COUNT_DEF;
110/* Module parameter for setting the user mode helper program
111 * to run. Default is /sbin/drbdadm */
112char drbd_usermode_helper[80] = "/sbin/drbdadm";
113module_param_named(minor_count, drbd_minor_count, uint, 0444);
114module_param_string(usermode_helper, drbd_usermode_helper, sizeof(drbd_usermode_helper), 0644);
115
116/* in 2.6.x, our device mapping and config info contains our virtual gendisks
117 * as member "struct gendisk *vdisk;"
118 */
119struct idr drbd_devices;
120struct list_head drbd_resources;
121struct mutex resources_mutex;
122
123struct kmem_cache *drbd_request_cache;
124struct kmem_cache *drbd_ee_cache; /* peer requests */
125struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
126struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
127mempool_t *drbd_request_mempool;
128mempool_t *drbd_ee_mempool;
129mempool_t *drbd_md_io_page_pool;
130struct bio_set *drbd_md_io_bio_set;
131struct bio_set *drbd_io_bio_set;
132
133/* I do not use a standard mempool, because:
134 1) I want to hand out the pre-allocated objects first.
135 2) I want to be able to interrupt sleeping allocation with a signal.
136   Note: This is a singly-linked list; the next pointer is the private
137 member of struct page.
138 */
139struct page *drbd_pp_pool;
140spinlock_t drbd_pp_lock;
141int drbd_pp_vacant;
142wait_queue_head_t drbd_pp_wait;
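
/* Illustrative sketch (assumed usage, not the driver's actual helpers) of how
 * a page list chained through the page private member is manipulated:
 *
 *	push:	set_page_private(page, (unsigned long)drbd_pp_pool);
 *		drbd_pp_pool = page;
 *	pop:	page = drbd_pp_pool;
 *		drbd_pp_pool = (struct page *)page_private(page);
 *
 * Both operations would have to run under drbd_pp_lock.
 */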
143
144DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
145
146static const struct block_device_operations drbd_ops = {
147 .owner = THIS_MODULE,
148 .open = drbd_open,
149 .release = drbd_release,
150};
151
152struct bio *bio_alloc_drbd(gfp_t gfp_mask)
153{
154 struct bio *bio;
155
156 if (!drbd_md_io_bio_set)
157 return bio_alloc(gfp_mask, 1);
158
159 bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
160 if (!bio)
161 return NULL;
162 return bio;
163}
164
165#ifdef __CHECKER__
166/* When checking with sparse, and this is an inline function, sparse will
167   give tons of false positives. When this is a real function, sparse works.
168 */
169int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
170{
171 int io_allowed;
172
173 atomic_inc(&device->local_cnt);
174 io_allowed = (device->state.disk >= mins);
175 if (!io_allowed) {
176 if (atomic_dec_and_test(&device->local_cnt))
177 wake_up(&device->misc_wait);
178 }
179 return io_allowed;
180}
181
182#endif
183
184/**
185 * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
186 * @connection: DRBD connection.
187 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
188 * @set_size: Expected number of requests before that barrier.
189 *
190 * In case the passed barrier_nr or set_size does not match the oldest
191 * epoch of not yet barrier-acked requests, this function will cause a
192 * termination of the connection.
193 */
194void tl_release(struct drbd_connection *connection, unsigned int barrier_nr,
195 unsigned int set_size)
196{
197 struct drbd_request *r;
198 struct drbd_request *req = NULL;
199 int expect_epoch = 0;
200 int expect_size = 0;
201
202 spin_lock_irq(&connection->resource->req_lock);
203
204 /* find oldest not yet barrier-acked write request,
205 * count writes in its epoch. */
206 list_for_each_entry(r, &connection->transfer_log, tl_requests) {
207 const unsigned s = r->rq_state;
208 if (!req) {
209 if (!(s & RQ_WRITE))
210 continue;
211 if (!(s & RQ_NET_MASK))
212 continue;
213 if (s & RQ_NET_DONE)
214 continue;
215 req = r;
216 expect_epoch = req->epoch;
217			expect_size++;
218 } else {
219 if (r->epoch != expect_epoch)
220 break;
221 if (!(s & RQ_WRITE))
222 continue;
223 /* if (s & RQ_DONE): not expected */
224 /* if (!(s & RQ_NET_MASK)): not expected */
225 expect_size++;
226 }
227 }
228
229 /* first some paranoia code */
230 if (req == NULL) {
231 drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
232 barrier_nr);
233 goto bail;
234 }
235 if (expect_epoch != barrier_nr) {
236 drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n",
237 barrier_nr, expect_epoch);
238 goto bail;
239 }
240
241 if (expect_size != set_size) {
242 drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
243 barrier_nr, set_size, expect_size);
244 goto bail;
245 }
246
247 /* Clean up list of requests processed during current epoch. */
248 /* this extra list walk restart is paranoia,
249 * to catch requests being barrier-acked "unexpectedly".
250 * It usually should find the same req again, or some READ preceding it. */
251 list_for_each_entry(req, &connection->transfer_log, tl_requests)
252 if (req->epoch == expect_epoch)
253 break;
254 list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
255 if (req->epoch != expect_epoch)
256 break;
257 _req_mod(req, BARRIER_ACKED);
258 }
259 spin_unlock_irq(&connection->resource->req_lock);
260
261 return;
262
263bail:
264 spin_unlock_irq(&connection->resource->req_lock);
265 conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
266}
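
/* Worked example (derived from the checks above): if the oldest not yet
 * barrier-acked epoch is #7 and contains 3 write requests, only a BarrierAck
 * with barrier_nr == 7 and set_size == 3 is accepted; those requests are then
 * marked BARRIER_ACKED.  Any mismatch is treated as a protocol error and the
 * connection is forced to C_PROTOCOL_ERROR.
 */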
267
268
269/**
270 * _tl_restart() - Walks the transfer log, and applies an action to all requests
271 * @connection: DRBD connection to operate on.
272 * @what: The action/event to perform with all request objects
273 *
274 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
275 * RESTART_FROZEN_DISK_IO.
276 */
277/* must hold resource->req_lock */
278void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
279{
280 struct drbd_request *req, *r;
281
282 list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests)
283 _req_mod(req, what);
284}
285
286void tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
287{
288 spin_lock_irq(&connection->resource->req_lock);
289 _tl_restart(connection, what);
290 spin_unlock_irq(&connection->resource->req_lock);
291}
292
293/**
294 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
295 * @connection: DRBD connection.
296 *
297 * This is called after the connection to the peer was lost. The storage covered
298 * by the requests on the transfer log gets marked as out of sync. Called from the
299 * receiver thread and the worker thread.
300 */
301void tl_clear(struct drbd_connection *connection)
302{
303 tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
304}
305
306/**
307 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL
308 * @device: DRBD device.
309 */
310void tl_abort_disk_io(struct drbd_device *device)
311{
312 struct drbd_connection *connection = first_peer_device(device)->connection;
313 struct drbd_request *req, *r;
314
315 spin_lock_irq(&connection->resource->req_lock);
316 list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
317 if (!(req->rq_state & RQ_LOCAL_PENDING))
318 continue;
319 if (req->device != device)
320 continue;
321 _req_mod(req, ABORT_DISK_IO);
322 }
323 spin_unlock_irq(&connection->resource->req_lock);
324}
325
326static int drbd_thread_setup(void *arg)
327{
328 struct drbd_thread *thi = (struct drbd_thread *) arg;
329 struct drbd_resource *resource = thi->resource;
330 unsigned long flags;
331 int retval;
332
333 snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
334 thi->name[0],
335 resource->name);
336
337restart:
338 retval = thi->function(thi);
339
340 spin_lock_irqsave(&thi->t_lock, flags);
341
342 /* if the receiver has been "EXITING", the last thing it did
343 * was set the conn state to "StandAlone",
344 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
345 * and receiver thread will be "started".
346 * drbd_thread_start needs to set "RESTARTING" in that case.
347 * t_state check and assignment needs to be within the same spinlock,
348 * so either thread_start sees EXITING, and can remap to RESTARTING,
349 * or thread_start sees NONE, and can proceed as normal.
350 */
351
352 if (thi->t_state == RESTARTING) {
353 drbd_info(resource, "Restarting %s thread\n", thi->name);
354 thi->t_state = RUNNING;
355 spin_unlock_irqrestore(&thi->t_lock, flags);
356 goto restart;
357 }
358
359 thi->task = NULL;
360 thi->t_state = NONE;
361 smp_mb();
362 complete_all(&thi->stop);
363 spin_unlock_irqrestore(&thi->t_lock, flags);
364
365 drbd_info(resource, "Terminating %s\n", current->comm);
366
367 /* Release mod reference taken when thread was started */
368
369 if (thi->connection)
370 kref_put(&thi->connection->kref, drbd_destroy_connection);
371 kref_put(&resource->kref, drbd_destroy_resource);
372 module_put(THIS_MODULE);
373 return retval;
374}
375
376static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi,
377 int (*func) (struct drbd_thread *), const char *name)
378{
379 spin_lock_init(&thi->t_lock);
380 thi->task = NULL;
381 thi->t_state = NONE;
382 thi->function = func;
383 thi->resource = resource;
384 thi->connection = NULL;
385 thi->name = name;
386}
387
388int drbd_thread_start(struct drbd_thread *thi)
389{
390 struct drbd_resource *resource = thi->resource;
391 struct task_struct *nt;
392 unsigned long flags;
393
394 /* is used from state engine doing drbd_thread_stop_nowait,
395 * while holding the req lock irqsave */
396 spin_lock_irqsave(&thi->t_lock, flags);
397
398 switch (thi->t_state) {
399 case NONE:
400 drbd_info(resource, "Starting %s thread (from %s [%d])\n",
401 thi->name, current->comm, current->pid);
402
403 /* Get ref on module for thread - this is released when thread exits */
404 if (!try_module_get(THIS_MODULE)) {
405 drbd_err(resource, "Failed to get module reference in drbd_thread_start\n");
406 spin_unlock_irqrestore(&thi->t_lock, flags);
407 return false;
408 }
409
410 kref_get(&resource->kref);
411 if (thi->connection)
412 kref_get(&thi->connection->kref);
413
414 init_completion(&thi->stop);
415 thi->reset_cpu_mask = 1;
416 thi->t_state = RUNNING;
417 spin_unlock_irqrestore(&thi->t_lock, flags);
418 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
419
420 nt = kthread_create(drbd_thread_setup, (void *) thi,
421 "drbd_%c_%s", thi->name[0], thi->resource->name);
422
423 if (IS_ERR(nt)) {
424 drbd_err(resource, "Couldn't start thread\n");
425
426 if (thi->connection)
427 kref_put(&thi->connection->kref, drbd_destroy_connection);
428 kref_put(&resource->kref, drbd_destroy_resource);
429 module_put(THIS_MODULE);
430 return false;
431 }
432 spin_lock_irqsave(&thi->t_lock, flags);
433 thi->task = nt;
434 thi->t_state = RUNNING;
435 spin_unlock_irqrestore(&thi->t_lock, flags);
436 wake_up_process(nt);
437 break;
438 case EXITING:
439 thi->t_state = RESTARTING;
440 drbd_info(resource, "Restarting %s thread (from %s [%d])\n",
441 thi->name, current->comm, current->pid);
442 /* fall through */
443 case RUNNING:
444 case RESTARTING:
445 default:
446 spin_unlock_irqrestore(&thi->t_lock, flags);
447 break;
448 }
449
450 return true;
451}
452
453
454void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
455{
456 unsigned long flags;
457
458 enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
459
460 /* may be called from state engine, holding the req lock irqsave */
461 spin_lock_irqsave(&thi->t_lock, flags);
462
463 if (thi->t_state == NONE) {
464 spin_unlock_irqrestore(&thi->t_lock, flags);
465 if (restart)
466 drbd_thread_start(thi);
467 return;
468 }
469
470 if (thi->t_state != ns) {
471 if (thi->task == NULL) {
472 spin_unlock_irqrestore(&thi->t_lock, flags);
473 return;
474 }
475
476 thi->t_state = ns;
477 smp_mb();
478 init_completion(&thi->stop);
479 if (thi->task != current)
480 force_sig(DRBD_SIGKILL, thi->task);
481 }
482
483 spin_unlock_irqrestore(&thi->t_lock, flags);
484
485 if (wait)
486 wait_for_completion(&thi->stop);
487}
488
489int conn_lowest_minor(struct drbd_connection *connection)
490{
491 struct drbd_peer_device *peer_device;
492 int vnr = 0, minor = -1;
493
494 rcu_read_lock();
495 peer_device = idr_get_next(&connection->peer_devices, &vnr);
496 if (peer_device)
497 minor = device_to_minor(peer_device->device);
498 rcu_read_unlock();
499
500 return minor;
501}
502
503#ifdef CONFIG_SMP
504/**
505 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
506 *
507 * Forces all threads of a resource onto the same CPU. This is beneficial for
508 * DRBD's performance. May be overridden by the user's configuration.
509 */
510static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask)
511{
512 unsigned int *resources_per_cpu, min_index = ~0;
513
514 resources_per_cpu = kzalloc(nr_cpu_ids * sizeof(*resources_per_cpu), GFP_KERNEL);
515 if (resources_per_cpu) {
516 struct drbd_resource *resource;
517 unsigned int cpu, min = ~0;
518
519 rcu_read_lock();
520 for_each_resource_rcu(resource, &drbd_resources) {
521 for_each_cpu(cpu, resource->cpu_mask)
522 resources_per_cpu[cpu]++;
523 }
524 rcu_read_unlock();
525 for_each_online_cpu(cpu) {
526 if (resources_per_cpu[cpu] < min) {
527 min = resources_per_cpu[cpu];
528 min_index = cpu;
529 }
530 }
531 kfree(resources_per_cpu);
532 }
533 if (min_index == ~0) {
534 cpumask_setall(*cpu_mask);
535 return;
536 }
537 cpumask_set_cpu(min_index, *cpu_mask);
538}
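
/* Worked example (sketch): with four online CPUs and two existing resources
 * already pinned to CPU 0 and CPU 1, the loop above picks CPU 2 (the least
 * loaded, lowest-numbered one) for the new resource's threads.  If the
 * counting array cannot be allocated, the mask falls back to "all CPUs".
 */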
539
540/**
541 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
542 * @thi: drbd_thread object
544 *
545 * Call this in the "main loop" of _all_ threads; no mutex is needed, current
546 * won't die prematurely.
547 */
548void drbd_thread_current_set_cpu(struct drbd_thread *thi)
549{
550 struct drbd_resource *resource = thi->resource;
551 struct task_struct *p = current;
552
553 if (!thi->reset_cpu_mask)
554 return;
555 thi->reset_cpu_mask = 0;
556 set_cpus_allowed_ptr(p, resource->cpu_mask);
557}
558#else
559#define drbd_calc_cpu_mask(A) ({})
560#endif
561
562/**
563 * drbd_header_size - size of a packet header
564 *
565 * The header size is a multiple of 8, so any payload following the header is
566 * word aligned on 64-bit architectures. (The bitmap send and receive code
567 * relies on this.)
568 */
569unsigned int drbd_header_size(struct drbd_connection *connection)
570{
571 if (connection->agreed_pro_version >= 100) {
572 BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
573 return sizeof(struct p_header100);
574 } else {
575 BUILD_BUG_ON(sizeof(struct p_header80) !=
576 sizeof(struct p_header95));
577 BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
578 return sizeof(struct p_header80);
579 }
580}
581
582static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
583{
584 h->magic = cpu_to_be32(DRBD_MAGIC);
585 h->command = cpu_to_be16(cmd);
586 h->length = cpu_to_be16(size);
587 return sizeof(struct p_header80);
588}
589
590static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
591{
592 h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
593 h->command = cpu_to_be16(cmd);
594 h->length = cpu_to_be32(size);
595 return sizeof(struct p_header95);
596}
597
598static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
599 int size, int vnr)
600{
601 h->magic = cpu_to_be32(DRBD_MAGIC_100);
602 h->volume = cpu_to_be16(vnr);
603 h->command = cpu_to_be16(cmd);
604 h->length = cpu_to_be32(size);
605 h->pad = 0;
606 return sizeof(struct p_header100);
607}
608
609static unsigned int prepare_header(struct drbd_connection *connection, int vnr,
610 void *buffer, enum drbd_packet cmd, int size)
611{
612 if (connection->agreed_pro_version >= 100)
613 return prepare_header100(buffer, cmd, size, vnr);
614 else if (connection->agreed_pro_version >= 95 &&
615 size > DRBD_MAX_SIZE_H80_PACKET)
616 return prepare_header95(buffer, cmd, size);
617 else
618 return prepare_header80(buffer, cmd, size);
619}
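
/* Illustrative examples of the dispatch above: with agreed_pro_version >= 100
 * every packet uses the p_header100 layout; with version 95..99 a packet whose
 * payload exceeds DRBD_MAX_SIZE_H80_PACKET uses p_header95 and smaller packets
 * use p_header80; older peers always get p_header80.
 */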
620
621static void *__conn_prepare_command(struct drbd_connection *connection,
622 struct drbd_socket *sock)
623{
624 if (!sock->socket)
625 return NULL;
626 return sock->sbuf + drbd_header_size(connection);
627}
628
629void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock)
630{
631 void *p;
632
633 mutex_lock(&sock->mutex);
634 p = __conn_prepare_command(connection, sock);
635 if (!p)
636 mutex_unlock(&sock->mutex);
637
638 return p;
639}
640
641void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock)
642{
643 return conn_prepare_command(peer_device->connection, sock);
644}
645
646static int __send_command(struct drbd_connection *connection, int vnr,
647 struct drbd_socket *sock, enum drbd_packet cmd,
648 unsigned int header_size, void *data,
649 unsigned int size)
650{
651 int msg_flags;
652 int err;
653
654 /*
655 * Called with @data == NULL and the size of the data blocks in @size
656 * for commands that send data blocks. For those commands, omit the
657 * MSG_MORE flag: this will increase the likelihood that data blocks
658 * which are page aligned on the sender will end up page aligned on the
659 * receiver.
660 */
661 msg_flags = data ? MSG_MORE : 0;
662
663 header_size += prepare_header(connection, vnr, sock->sbuf, cmd,
664 header_size + size);
665 err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size,
666 msg_flags);
667 if (data && !err)
668 err = drbd_send_all(connection, sock->socket, data, size, 0);
669 /* DRBD protocol "pings" are latency critical.
670 * This is supposed to trigger tcp_push_pending_frames() */
671 if (!err && (cmd == P_PING || cmd == P_PING_ACK))
672 drbd_tcp_nodelay(sock->socket);
673
674 return err;
675}
676
677static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
678 enum drbd_packet cmd, unsigned int header_size,
679 void *data, unsigned int size)
680{
681 return __send_command(connection, 0, sock, cmd, header_size, data, size);
682}
683
684int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
685 enum drbd_packet cmd, unsigned int header_size,
686 void *data, unsigned int size)
687{
688 int err;
689
690 err = __conn_send_command(connection, sock, cmd, header_size, data, size);
691 mutex_unlock(&sock->mutex);
692 return err;
693}
694
695int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock,
696 enum drbd_packet cmd, unsigned int header_size,
697 void *data, unsigned int size)
698{
699 int err;
700
701 err = __send_command(peer_device->connection, peer_device->device->vnr,
702 sock, cmd, header_size, data, size);
703 mutex_unlock(&sock->mutex);
704 return err;
705}
706
707int drbd_send_ping(struct drbd_connection *connection)
708{
709 struct drbd_socket *sock;
710
711 sock = &connection->meta;
712 if (!conn_prepare_command(connection, sock))
713 return -EIO;
714 return conn_send_command(connection, sock, P_PING, 0, NULL, 0);
715}
716
717int drbd_send_ping_ack(struct drbd_connection *connection)
718{
719 struct drbd_socket *sock;
720
721 sock = &connection->meta;
722 if (!conn_prepare_command(connection, sock))
723 return -EIO;
724 return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0);
725}
726
727int drbd_send_sync_param(struct drbd_peer_device *peer_device)
728{
729 struct drbd_socket *sock;
730 struct p_rs_param_95 *p;
731 int size;
732 const int apv = peer_device->connection->agreed_pro_version;
733 enum drbd_packet cmd;
734 struct net_conf *nc;
735 struct disk_conf *dc;
736
737 sock = &peer_device->connection->data;
738 p = drbd_prepare_command(peer_device, sock);
739 if (!p)
740 return -EIO;
741
742 rcu_read_lock();
743 nc = rcu_dereference(peer_device->connection->net_conf);
744
745 size = apv <= 87 ? sizeof(struct p_rs_param)
746 : apv == 88 ? sizeof(struct p_rs_param)
747 + strlen(nc->verify_alg) + 1
748 : apv <= 94 ? sizeof(struct p_rs_param_89)
749 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
750
751 cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
752
753 /* initialize verify_alg and csums_alg */
754 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
755
756 if (get_ldev(peer_device->device)) {
757 dc = rcu_dereference(peer_device->device->ldev->disk_conf);
758 p->resync_rate = cpu_to_be32(dc->resync_rate);
759 p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
760 p->c_delay_target = cpu_to_be32(dc->c_delay_target);
761 p->c_fill_target = cpu_to_be32(dc->c_fill_target);
762 p->c_max_rate = cpu_to_be32(dc->c_max_rate);
763 put_ldev(peer_device->device);
764 } else {
765 p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
766 p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
767 p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
768 p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
769 p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
770 }
771
772 if (apv >= 88)
773 strcpy(p->verify_alg, nc->verify_alg);
774 if (apv >= 89)
775 strcpy(p->csums_alg, nc->csums_alg);
776 rcu_read_unlock();
777
778 return drbd_send_command(peer_device, sock, cmd, size, NULL, 0);
779}
780
781int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd)
782{
783 struct drbd_socket *sock;
784 struct p_protocol *p;
785 struct net_conf *nc;
786 int size, cf;
787
788 sock = &connection->data;
789 p = __conn_prepare_command(connection, sock);
790 if (!p)
791 return -EIO;
792
793 rcu_read_lock();
794 nc = rcu_dereference(connection->net_conf);
795
796 if (nc->tentative && connection->agreed_pro_version < 92) {
797 rcu_read_unlock();
798 mutex_unlock(&sock->mutex);
799 drbd_err(connection, "--dry-run is not supported by peer");
800 return -EOPNOTSUPP;
801 }
802
803 size = sizeof(*p);
804 if (connection->agreed_pro_version >= 87)
805 size += strlen(nc->integrity_alg) + 1;
806
807 p->protocol = cpu_to_be32(nc->wire_protocol);
808 p->after_sb_0p = cpu_to_be32(nc->after_sb_0p);
809 p->after_sb_1p = cpu_to_be32(nc->after_sb_1p);
810 p->after_sb_2p = cpu_to_be32(nc->after_sb_2p);
811 p->two_primaries = cpu_to_be32(nc->two_primaries);
812 cf = 0;
813 if (nc->discard_my_data)
814 cf |= CF_DISCARD_MY_DATA;
815 if (nc->tentative)
816 cf |= CF_DRY_RUN;
817 p->conn_flags = cpu_to_be32(cf);
818
819 if (connection->agreed_pro_version >= 87)
820 strcpy(p->integrity_alg, nc->integrity_alg);
821 rcu_read_unlock();
822
823 return __conn_send_command(connection, sock, cmd, size, NULL, 0);
824}
825
826int drbd_send_protocol(struct drbd_connection *connection)
827{
828 int err;
829
830 mutex_lock(&connection->data.mutex);
831 err = __drbd_send_protocol(connection, P_PROTOCOL);
832 mutex_unlock(&connection->data.mutex);
833
834 return err;
835}
836
837static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags)
838{
839 struct drbd_device *device = peer_device->device;
840 struct drbd_socket *sock;
841 struct p_uuids *p;
842 int i;
843
844 if (!get_ldev_if_state(device, D_NEGOTIATING))
845 return 0;
846
847 sock = &peer_device->connection->data;
848 p = drbd_prepare_command(peer_device, sock);
849 if (!p) {
850 put_ldev(device);
851 return -EIO;
852 }
853 spin_lock_irq(&device->ldev->md.uuid_lock);
854 for (i = UI_CURRENT; i < UI_SIZE; i++)
855 p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
856 spin_unlock_irq(&device->ldev->md.uuid_lock);
857
858 device->comm_bm_set = drbd_bm_total_weight(device);
859 p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set);
860 rcu_read_lock();
861 uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0;
862 rcu_read_unlock();
863 uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0;
864 uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
865 p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
866
867 put_ldev(device);
868 return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0);
869}
870
871int drbd_send_uuids(struct drbd_peer_device *peer_device)
872{
873 return _drbd_send_uuids(peer_device, 0);
874}
875
876int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device)
877{
878 return _drbd_send_uuids(peer_device, 8);
879}
880
881void drbd_print_uuids(struct drbd_device *device, const char *text)
882{
883 if (get_ldev_if_state(device, D_NEGOTIATING)) {
884 u64 *uuid = device->ldev->md.uuid;
885 drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n",
886 text,
887 (unsigned long long)uuid[UI_CURRENT],
888 (unsigned long long)uuid[UI_BITMAP],
889 (unsigned long long)uuid[UI_HISTORY_START],
890 (unsigned long long)uuid[UI_HISTORY_END]);
891 put_ldev(device);
892 } else {
893 drbd_info(device, "%s effective data uuid: %016llX\n",
894 text,
895 (unsigned long long)device->ed_uuid);
896 }
897}
898
899void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
900{
901 struct drbd_device *device = peer_device->device;
902 struct drbd_socket *sock;
903 struct p_rs_uuid *p;
904 u64 uuid;
905
906 D_ASSERT(device, device->state.disk == D_UP_TO_DATE);
907
908 uuid = device->ldev->md.uuid[UI_BITMAP];
909 if (uuid && uuid != UUID_JUST_CREATED)
910 uuid = uuid + UUID_NEW_BM_OFFSET;
911 else
912 get_random_bytes(&uuid, sizeof(u64));
913 drbd_uuid_set(device, UI_BITMAP, uuid);
914 drbd_print_uuids(device, "updated sync UUID");
915 drbd_md_sync(device);
916
917 sock = &peer_device->connection->data;
918 p = drbd_prepare_command(peer_device, sock);
919 if (p) {
920 p->uuid = cpu_to_be64(uuid);
921 drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
922 }
923}
924
925/* communicated if (agreed_features & DRBD_FF_WSAME) */
926static void
927assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p,
928 struct request_queue *q)
929{
930 if (q) {
931 p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
932 p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
933 p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
934 p->qlim->io_min = cpu_to_be32(queue_io_min(q));
935 p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
936 p->qlim->discard_enabled = blk_queue_discard(q);
937 p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
938 } else {
939 q = device->rq_queue;
940 p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
941 p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
942 p->qlim->alignment_offset = 0;
943 p->qlim->io_min = cpu_to_be32(queue_io_min(q));
944 p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
945 p->qlim->discard_enabled = 0;
946 p->qlim->write_same_capable = 0;
947 }
948}
949
950int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
951{
952 struct drbd_device *device = peer_device->device;
953 struct drbd_socket *sock;
954 struct p_sizes *p;
955 sector_t d_size, u_size;
956 int q_order_type;
957 unsigned int max_bio_size;
958 unsigned int packet_size;
959
960 sock = &peer_device->connection->data;
961 p = drbd_prepare_command(peer_device, sock);
962 if (!p)
963 return -EIO;
964
965 packet_size = sizeof(*p);
966 if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
967 packet_size += sizeof(p->qlim[0]);
968
969 memset(p, 0, packet_size);
970 if (get_ldev_if_state(device, D_NEGOTIATING)) {
971 struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
972 d_size = drbd_get_max_capacity(device->ldev);
973 rcu_read_lock();
974 u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
975 rcu_read_unlock();
976 q_order_type = drbd_queue_order_type(device);
977 max_bio_size = queue_max_hw_sectors(q) << 9;
978 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
979 assign_p_sizes_qlim(device, p, q);
980 put_ldev(device);
981 } else {
982 d_size = 0;
983 u_size = 0;
984 q_order_type = QUEUE_ORDERED_NONE;
985 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
986 assign_p_sizes_qlim(device, p, NULL);
987 }
988
989 if (peer_device->connection->agreed_pro_version <= 94)
990 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
991 else if (peer_device->connection->agreed_pro_version < 100)
992 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
993
994 p->d_size = cpu_to_be64(d_size);
995 p->u_size = cpu_to_be64(u_size);
996 p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(device->this_bdev));
997 p->max_bio_size = cpu_to_be32(max_bio_size);
998 p->queue_order_type = cpu_to_be16(q_order_type);
999 p->dds_flags = cpu_to_be16(flags);
1000
1001 return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
1002}
1003
1004/**
1005 * drbd_send_current_state() - Sends the drbd state to the peer
1006 * @peer_device: DRBD peer device.
1007 */
1008int drbd_send_current_state(struct drbd_peer_device *peer_device)
1009{
1010 struct drbd_socket *sock;
1011 struct p_state *p;
1012
1013 sock = &peer_device->connection->data;
1014 p = drbd_prepare_command(peer_device, sock);
1015 if (!p)
1016 return -EIO;
1017 p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */
1018 return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
1019}
1020
1021/**
1022 * drbd_send_state() - After a state change, sends the new state to the peer
1023 * @peer_device: DRBD peer device.
1024 * @state: the state to send, not necessarily the current state.
1025 *
1026 * Each state change queues an "after_state_ch" work, which will eventually
1027 * send the resulting new state to the peer. If more state changes happen
1028 * between queuing and processing of the after_state_ch work, we still
1029 * want to send each intermediary state in the order it occurred.
1030 */
1031int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state)
1032{
1033 struct drbd_socket *sock;
1034 struct p_state *p;
1035
1036 sock = &peer_device->connection->data;
1037 p = drbd_prepare_command(peer_device, sock);
1038 if (!p)
1039 return -EIO;
1040 p->state = cpu_to_be32(state.i); /* Within the send mutex */
1041 return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
1042}
1043
1044int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val)
1045{
1046 struct drbd_socket *sock;
1047 struct p_req_state *p;
1048
1049 sock = &peer_device->connection->data;
1050 p = drbd_prepare_command(peer_device, sock);
1051 if (!p)
1052 return -EIO;
1053 p->mask = cpu_to_be32(mask.i);
1054 p->val = cpu_to_be32(val.i);
1055 return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
1056}
1057
1058int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
1059{
1060 enum drbd_packet cmd;
1061 struct drbd_socket *sock;
1062 struct p_req_state *p;
1063
1064 cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
1065 sock = &connection->data;
1066 p = conn_prepare_command(connection, sock);
1067 if (!p)
1068 return -EIO;
1069 p->mask = cpu_to_be32(mask.i);
1070 p->val = cpu_to_be32(val.i);
1071 return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
1072}
1073
1074void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode)
1075{
1076 struct drbd_socket *sock;
1077 struct p_req_state_reply *p;
1078
1079 sock = &peer_device->connection->meta;
1080 p = drbd_prepare_command(peer_device, sock);
1081 if (p) {
1082 p->retcode = cpu_to_be32(retcode);
1083 drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
1084 }
1085}
1086
1087void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode)
1088{
1089 struct drbd_socket *sock;
1090 struct p_req_state_reply *p;
1091 enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
1092
1093 sock = &connection->meta;
1094 p = conn_prepare_command(connection, sock);
1095 if (p) {
1096 p->retcode = cpu_to_be32(retcode);
1097 conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
1098 }
1099}
1100
1101static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
1102{
1103 BUG_ON(code & ~0xf);
1104 p->encoding = (p->encoding & ~0xf) | code;
1105}
1106
1107static void dcbp_set_start(struct p_compressed_bm *p, int set)
1108{
1109 p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
1110}
1111
1112static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
1113{
1114 BUG_ON(n & ~0x7);
1115 p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
1116}
1117
1118static int fill_bitmap_rle_bits(struct drbd_device *device,
1119 struct p_compressed_bm *p,
1120 unsigned int size,
1121 struct bm_xfer_ctx *c)
1122{
1123 struct bitstream bs;
1124 unsigned long plain_bits;
1125 unsigned long tmp;
1126 unsigned long rl;
1127 unsigned len;
1128 unsigned toggle;
1129 int bits, use_rle;
1130
1131 /* may we use this feature? */
1132 rcu_read_lock();
1133 use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle;
1134 rcu_read_unlock();
1135 if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90)
1136 return 0;
1137
1138 if (c->bit_offset >= c->bm_bits)
1139 return 0; /* nothing to do. */
1140
1141	/* use at most this many bytes */
1142 bitstream_init(&bs, p->code, size, 0);
1143 memset(p->code, 0, size);
1144 /* plain bits covered in this code string */
1145 plain_bits = 0;
1146
1147 /* p->encoding & 0x80 stores whether the first run length is set.
1148 * bit offset is implicit.
1149 * start with toggle == 2 to be able to tell the first iteration */
1150 toggle = 2;
1151
1152	/* see how many plain bits we can stuff into one packet
1153 * using RLE and VLI. */
1154 do {
1155 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset)
1156 : _drbd_bm_find_next(device, c->bit_offset);
1157 if (tmp == -1UL)
1158 tmp = c->bm_bits;
1159 rl = tmp - c->bit_offset;
1160
1161 if (toggle == 2) { /* first iteration */
1162 if (rl == 0) {
1163 /* the first checked bit was set,
1164 * store start value, */
1165 dcbp_set_start(p, 1);
1166 /* but skip encoding of zero run length */
1167 toggle = !toggle;
1168 continue;
1169 }
1170 dcbp_set_start(p, 0);
1171 }
1172
1173 /* paranoia: catch zero runlength.
1174 * can only happen if bitmap is modified while we scan it. */
1175 if (rl == 0) {
1176 drbd_err(device, "unexpected zero runlength while encoding bitmap "
1177 "t:%u bo:%lu\n", toggle, c->bit_offset);
1178 return -1;
1179 }
1180
1181 bits = vli_encode_bits(&bs, rl);
1182 if (bits == -ENOBUFS) /* buffer full */
1183 break;
1184 if (bits <= 0) {
1185 drbd_err(device, "error while encoding bitmap: %d\n", bits);
1186 return 0;
1187 }
1188
1189 toggle = !toggle;
1190 plain_bits += rl;
1191 c->bit_offset = tmp;
1192 } while (c->bit_offset < c->bm_bits);
1193
1194 len = bs.cur.b - p->code + !!bs.cur.bit;
1195
1196 if (plain_bits < (len << 3)) {
1197 /* incompressible with this method.
1198 * we need to rewind both word and bit position. */
1199 c->bit_offset -= plain_bits;
1200 bm_xfer_ctx_bit_to_word_offset(c);
1201 c->bit_offset = c->word_offset * BITS_PER_LONG;
1202 return 0;
1203 }
1204
1205 /* RLE + VLI was able to compress it just fine.
1206 * update c->word_offset. */
1207 bm_xfer_ctx_bit_to_word_offset(c);
1208
1209 /* store pad_bits */
1210 dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1211
1212 return len;
1213}
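
/* Worked example (sketch of the encoding above): if the bitmap chunk at
 * c->bit_offset starts with 0000 1111111..., the run lengths 4 (clear bits),
 * 7 (set bits), ... are VLI-encoded into p->code, and dcbp_set_start()
 * records whether the very first run consists of set bits.  If the encoded
 * stream would not be smaller than the plain bits it covers, 0 is returned
 * and the caller falls back to sending plain bitmap words.
 */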
1214
1215/**
1216 * send_bitmap_rle_or_plain() - send one bitmap packet, RLE-compressed if that pays off
1217 *
1218 * Return 0 when done, 1 when another iteration is needed, and a negative error
1219 * code upon failure.
1220 */
1221static int
1222send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
1223{
1224 struct drbd_socket *sock = &first_peer_device(device)->connection->data;
1225 unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
1226 struct p_compressed_bm *p = sock->sbuf + header_size;
1227 int len, err;
1228
1229 len = fill_bitmap_rle_bits(device, p,
1230 DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
1231 if (len < 0)
1232 return -EIO;
1233
1234 if (len) {
1235 dcbp_set_code(p, RLE_VLI_Bits);
1236 err = __send_command(first_peer_device(device)->connection, device->vnr, sock,
1237 P_COMPRESSED_BITMAP, sizeof(*p) + len,
1238 NULL, 0);
1239 c->packets[0]++;
1240 c->bytes[0] += header_size + sizeof(*p) + len;
1241
1242 if (c->bit_offset >= c->bm_bits)
1243 len = 0; /* DONE */
1244 } else {
1245 /* was not compressible.
1246 * send a buffer full of plain text bits instead. */
1247 unsigned int data_size;
1248 unsigned long num_words;
1249 unsigned long *p = sock->sbuf + header_size;
1250
1251 data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
1252 num_words = min_t(size_t, data_size / sizeof(*p),
1253 c->bm_words - c->word_offset);
1254 len = num_words * sizeof(*p);
1255 if (len)
1256 drbd_bm_get_lel(device, c->word_offset, num_words, p);
1257 err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0);
1258 c->word_offset += num_words;
1259 c->bit_offset = c->word_offset * BITS_PER_LONG;
1260
1261 c->packets[1]++;
1262 c->bytes[1] += header_size + len;
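		/* packets[1]/bytes[1] count plain bitmap packets, packets[0]/bytes[0]
		 * the compressed ones; both show up in INFO_bm_xfer_stats() below */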
1263
1264 if (c->bit_offset > c->bm_bits)
1265 c->bit_offset = c->bm_bits;
1266 }
1267 if (!err) {
1268 if (len == 0) {
1269 INFO_bm_xfer_stats(device, "send", c);
1270 return 0;
1271 } else
1272 return 1;
1273 }
1274 return -EIO;
1275}
1276
1277/* See the comment at receive_bitmap() */
1278static int _drbd_send_bitmap(struct drbd_device *device)
1279{
1280 struct bm_xfer_ctx c;
1281 int err;
1282
1283 if (!expect(device->bitmap))
1284 return false;
1285
1286 if (get_ldev(device)) {
1287 if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
1288 drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
1289 drbd_bm_set_all(device);
1290 if (drbd_bm_write(device)) {
1291 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
1292 * but otherwise process as per normal - need to tell other
1293 * side that a full resync is required! */
1294 drbd_err(device, "Failed to write bitmap to disk!\n");
1295 } else {
1296 drbd_md_clear_flag(device, MDF_FULL_SYNC);
1297 drbd_md_sync(device);
1298 }
1299 }
1300 put_ldev(device);
1301 }
1302
1303 c = (struct bm_xfer_ctx) {
1304 .bm_bits = drbd_bm_bits(device),
1305 .bm_words = drbd_bm_words(device),
1306 };
1307
1308 do {
1309 err = send_bitmap_rle_or_plain(device, &c);
1310 } while (err > 0);
1311
1312 return err == 0;
1313}
1314
1315int drbd_send_bitmap(struct drbd_device *device)
1316{
1317 struct drbd_socket *sock = &first_peer_device(device)->connection->data;
1318 int err = -1;
1319
1320 mutex_lock(&sock->mutex);
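	/* _drbd_send_bitmap() returns true on success; invert to the usual
	 * 0-on-success convention.  err stays -1 if there is no socket. */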
1321 if (sock->socket)
1322 err = !_drbd_send_bitmap(device);
1323 mutex_unlock(&sock->mutex);
1324 return err;
1325}
1326
1327void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
1328{
1329 struct drbd_socket *sock;
1330 struct p_barrier_ack *p;
1331
1332 if (connection->cstate < C_WF_REPORT_PARAMS)
1333 return;
1334
1335 sock = &connection->meta;
1336 p = conn_prepare_command(connection, sock);
1337 if (!p)
1338 return;
1339 p->barrier = barrier_nr;
1340 p->set_size = cpu_to_be32(set_size);
1341 conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
1342}
1343
1344/**
1345 * _drbd_send_ack() - Sends an ack packet
1346 * @peer_device: DRBD peer device.
1347 * @cmd: Packet command code.
1348 * @sector: sector, needs to be in big endian byte order
1349 * @blksize: size in bytes, needs to be in big endian byte order
1350 * @block_id: Id, big endian byte order
1351 */
1352static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1353 u64 sector, u32 blksize, u64 block_id)
1354{
1355 struct drbd_socket *sock;
1356 struct p_block_ack *p;
1357
1358 if (peer_device->device->state.conn < C_CONNECTED)
1359 return -EIO;
1360
1361 sock = &peer_device->connection->meta;
1362 p = drbd_prepare_command(peer_device, sock);
1363 if (!p)
1364 return -EIO;
1365 p->sector = sector;
1366 p->block_id = block_id;
1367 p->blksize = blksize;
1368 p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq));
1369 return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
1370}
1371
1372/* dp->sector and dp->block_id already/still in network byte order,
1373 * data_size is payload size according to dp->head,
1374 * and may need to be corrected for digest size. */
1375void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1376 struct p_data *dp, int data_size)
1377{
1378 if (peer_device->connection->peer_integrity_tfm)
1379 data_size -= crypto_ahash_digestsize(peer_device->connection->peer_integrity_tfm);
1380 _drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size),
1381 dp->block_id);
1382}
1383
1384void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1385 struct p_block_req *rp)
1386{
1387 _drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id);
1388}
1389
1390/**
1391 * drbd_send_ack() - Sends an ack packet
1392 * @peer_device: DRBD peer device
1393 * @cmd: packet command code
1394 * @peer_req: peer request
1395 */
1396int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1397 struct drbd_peer_request *peer_req)
1398{
1399 return _drbd_send_ack(peer_device, cmd,
1400 cpu_to_be64(peer_req->i.sector),
1401 cpu_to_be32(peer_req->i.size),
1402 peer_req->block_id);
1403}
1404
1405/* This function misuses the block_id field to signal if the blocks
1406 * are in sync or not. */
1407int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1408 sector_t sector, int blksize, u64 block_id)
1409{
1410 return _drbd_send_ack(peer_device, cmd,
1411 cpu_to_be64(sector),
1412 cpu_to_be32(blksize),
1413 cpu_to_be64(block_id));
1414}
1415
1416int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
1417 struct drbd_peer_request *peer_req)
1418{
1419 struct drbd_socket *sock;
1420 struct p_block_desc *p;
1421
1422 sock = &peer_device->connection->data;
1423 p = drbd_prepare_command(peer_device, sock);
1424 if (!p)
1425 return -EIO;
1426 p->sector = cpu_to_be64(peer_req->i.sector);
1427 p->blksize = cpu_to_be32(peer_req->i.size);
1428 p->pad = 0;
1429 return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0);
1430}
1431
1432int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
1433 sector_t sector, int size, u64 block_id)
1434{
1435 struct drbd_socket *sock;
1436 struct p_block_req *p;
1437
1438 sock = &peer_device->connection->data;
1439 p = drbd_prepare_command(peer_device, sock);
1440 if (!p)
1441 return -EIO;
1442 p->sector = cpu_to_be64(sector);
1443 p->block_id = block_id;
1444 p->blksize = cpu_to_be32(size);
1445 return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
1446}
1447
1448int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size,
1449 void *digest, int digest_size, enum drbd_packet cmd)
1450{
1451 struct drbd_socket *sock;
1452 struct p_block_req *p;
1453
1454 /* FIXME: Put the digest into the preallocated socket buffer. */
1455
1456 sock = &peer_device->connection->data;
1457 p = drbd_prepare_command(peer_device, sock);
1458 if (!p)
1459 return -EIO;
1460 p->sector = cpu_to_be64(sector);
1461 p->block_id = ID_SYNCER /* unused */;
1462 p->blksize = cpu_to_be32(size);
1463 return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size);
1464}
1465
1466int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size)
1467{
1468 struct drbd_socket *sock;
1469 struct p_block_req *p;
1470
1471 sock = &peer_device->connection->data;
1472 p = drbd_prepare_command(peer_device, sock);
1473 if (!p)
1474 return -EIO;
1475 p->sector = cpu_to_be64(sector);
1476 p->block_id = ID_SYNCER /* unused */;
1477 p->blksize = cpu_to_be32(size);
1478 return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
1479}
1480
1481/* called on sndtimeo
1482 * returns false if we should retry,
1483 * true if we think connection is dead
1484 */
1485static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock)
1486{
1487 int drop_it;
1488 /* long elapsed = (long)(jiffies - device->last_received); */
1489
1490 drop_it = connection->meta.socket == sock
1491 || !connection->ack_receiver.task
1492 || get_t_state(&connection->ack_receiver) != RUNNING
1493 || connection->cstate < C_WF_REPORT_PARAMS;
1494
1495 if (drop_it)
1496 return true;
1497
1498 drop_it = !--connection->ko_count;
1499 if (!drop_it) {
1500 drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
1501 current->comm, current->pid, connection->ko_count);
1502 request_ping(connection);
1503 }
1504
1505 return drop_it; /* && (device->state == R_PRIMARY) */;
1506}
1507
1508static void drbd_update_congested(struct drbd_connection *connection)
1509{
1510 struct sock *sk = connection->data.socket->sk;
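	/* consider the data socket congested once the send queue holds more
	 * than 4/5 of the send buffer; NET_CONGESTED is reported through
	 * drbd_congested() and cleared again when the queue drains */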
1511 if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
1512 set_bit(NET_CONGESTED, &connection->flags);
1513}
1514
1515/* The idea of sendpage seems to be to put some kind of reference
1516 * to the page into the skb, and to hand it over to the NIC. In
1517 * this process get_page() gets called.
1518 *
1519 * As soon as the page was really sent over the network put_page()
1520 * gets called by some part of the network layer. [ NIC driver? ]
1521 *
1522 * [ get_page() / put_page() increment/decrement the count. If count
1523 * reaches 0 the page will be freed. ]
1524 *
1525 * This works nicely with pages from FSs.
1526 * But this means that in protocol A we might signal IO completion too early!
1527 *
1528 * In order not to corrupt data during a resync we must make sure
1529 * that we do not reuse our own buffer pages (EEs) too early, therefore
1530 * we have the net_ee list.
1531 *
1532 * XFS still seems to have problems: it submits pages with page_count == 0!
1533 * As a workaround, we disable sendpage on pages
1534 * with page_count == 0 or PageSlab.
1535 */
1536static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page,
1537 int offset, size_t size, unsigned msg_flags)
1538{
1539 struct socket *socket;
1540 void *addr;
1541 int err;
1542
1543 socket = peer_device->connection->data.socket;
1544 addr = kmap(page) + offset;
1545 err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags);
1546 kunmap(page);
1547 if (!err)
1548 peer_device->device->send_cnt += size >> 9;
1549 return err;
1550}
1551
1552static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page,
1553 int offset, size_t size, unsigned msg_flags)
1554{
1555 struct socket *socket = peer_device->connection->data.socket;
1556 int len = size;
1557 int err = -EIO;
1558
1559 /* e.g. XFS meta- & log-data is in slab pages, which have a
1560 * page_count of 0 and/or have PageSlab() set.
1561 * We cannot use sendpage for those, as it does get_page();
1562 * put_page(); and would either cause a VM_BUG directly, or
1563 * __page_cache_release() a page that would actually still be referenced
1564 * by someone, leading to some obscure delayed Oops somewhere else. */
1565 if (drbd_disable_sendpage || (page_count(page) < 1) || PageSlab(page))
1566 return _drbd_no_send_page(peer_device, page, offset, size, msg_flags);
1567
1568 msg_flags |= MSG_NOSIGNAL;
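	/* MSG_NOSIGNAL: a peer that went away must not kill us with SIGPIPE */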
1569 drbd_update_congested(peer_device->connection);
1570 do {
1571 int sent;
1572
1573 sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
1574 if (sent <= 0) {
1575 if (sent == -EAGAIN) {
1576 if (we_should_drop_the_connection(peer_device->connection, socket))
1577 break;
1578 continue;
1579 }
1580 drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n",
1581 __func__, (int)size, len, sent);
1582 if (sent < 0)
1583 err = sent;
1584 break;
1585 }
1586 len -= sent;
1587 offset += sent;
1588 } while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/);
1589 clear_bit(NET_CONGESTED, &peer_device->connection->flags);
1590
1591 if (len == 0) {
1592 err = 0;
1593 peer_device->device->send_cnt += size >> 9;
1594 }
1595 return err;
1596}
1597
1598static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
1599{
1600 struct bio_vec bvec;
1601 struct bvec_iter iter;
1602
1603 /* hint all but last page with MSG_MORE */
1604 bio_for_each_segment(bvec, bio, iter) {
1605 int err;
1606
1607 err = _drbd_no_send_page(peer_device, bvec.bv_page,
1608 bvec.bv_offset, bvec.bv_len,
1609 bio_iter_last(bvec, iter)
1610 ? 0 : MSG_MORE);
1611 if (err)
1612 return err;
1613 /* REQ_OP_WRITE_SAME has only one segment */
1614 if (bio_op(bio) == REQ_OP_WRITE_SAME)
1615 break;
1616 }
1617 return 0;
1618}
1619
1620static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio)
1621{
1622 struct bio_vec bvec;
1623 struct bvec_iter iter;
1624
1625 /* hint all but last page with MSG_MORE */
1626 bio_for_each_segment(bvec, bio, iter) {
1627 int err;
1628
1629 err = _drbd_send_page(peer_device, bvec.bv_page,
1630 bvec.bv_offset, bvec.bv_len,
1631 bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
1632 if (err)
1633 return err;
1634 /* REQ_OP_WRITE_SAME has only one segment */
1635 if (bio_op(bio) == REQ_OP_WRITE_SAME)
1636 break;
1637 }
1638 return 0;
1639}
1640
1641static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
1642 struct drbd_peer_request *peer_req)
1643{
1644 struct page *page = peer_req->pages;
1645 unsigned len = peer_req->i.size;
1646 int err;
1647
1648 /* hint all but last page with MSG_MORE */
1649 page_chain_for_each(page) {
1650 unsigned l = min_t(unsigned, len, PAGE_SIZE);
1651
1652 err = _drbd_send_page(peer_device, page, 0, l,
1653 page_chain_next(page) ? MSG_MORE : 0);
1654 if (err)
1655 return err;
1656 len -= l;
1657 }
1658 return 0;
1659}
1660
1661static u32 bio_flags_to_wire(struct drbd_connection *connection,
1662 struct bio *bio)
1663{
1664 if (connection->agreed_pro_version >= 95)
1665 return (bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0) |
1666 (bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
1667 (bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
1668 (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
1669 (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
1670 (bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0);
1671 else
1672 return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
1673}
1674
1675/* Used to send write or TRIM aka REQ_DISCARD requests
1676 * R_PRIMARY -> Peer (P_DATA, P_TRIM)
1677 */
1678int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
1679{
1680 struct drbd_device *device = peer_device->device;
1681 struct drbd_socket *sock;
1682 struct p_data *p;
1683 struct p_wsame *wsame = NULL;
1684 void *digest_out;
1685 unsigned int dp_flags = 0;
1686 int digest_size;
1687 int err;
1688
1689 sock = &peer_device->connection->data;
1690 p = drbd_prepare_command(peer_device, sock);
1691 digest_size = peer_device->connection->integrity_tfm ?
1692 crypto_ahash_digestsize(peer_device->connection->integrity_tfm) : 0;
1693
1694 if (!p)
1695 return -EIO;
1696 p->sector = cpu_to_be64(req->i.sector);
1697 p->block_id = (unsigned long)req;
1698 p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
1699 dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio);
1700 if (device->state.conn >= C_SYNC_SOURCE &&
1701 device->state.conn <= C_PAUSED_SYNC_T)
1702 dp_flags |= DP_MAY_SET_IN_SYNC;
1703 if (peer_device->connection->agreed_pro_version >= 100) {
1704 if (req->rq_state & RQ_EXP_RECEIVE_ACK)
1705 dp_flags |= DP_SEND_RECEIVE_ACK;
1706 /* During resync, request an explicit write ack,
1707 * even in protocol != C */
1708 if (req->rq_state & RQ_EXP_WRITE_ACK
1709 || (dp_flags & DP_MAY_SET_IN_SYNC))
1710 dp_flags |= DP_SEND_WRITE_ACK;
1711 }
1712 p->dp_flags = cpu_to_be32(dp_flags);
1713
1714 if (dp_flags & DP_DISCARD) {
1715 struct p_trim *t = (struct p_trim*)p;
1716 t->size = cpu_to_be32(req->i.size);
1717 err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
1718 goto out;
1719 }
1720 if (dp_flags & DP_WSAME) {
1721 /* this will only work if DRBD_FF_WSAME is set AND the
1722 * handshake agreed that all nodes and backend devices are
1723 * WRITE_SAME capable and agree on logical_block_size */
1724 wsame = (struct p_wsame*)p;
1725 digest_out = wsame + 1;
1726 wsame->size = cpu_to_be32(req->i.size);
1727 } else
1728 digest_out = p + 1;
1729
1730 /* our digest is still only over the payload.
1731 * TRIM does not carry any payload. */
1732 if (digest_size)
1733 drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
1734 if (wsame) {
1735 err =
1736 __send_command(peer_device->connection, device->vnr, sock, P_WSAME,
1737 sizeof(*wsame) + digest_size, NULL,
1738 bio_iovec(req->master_bio).bv_len);
1739 } else
1740 err =
1741 __send_command(peer_device->connection, device->vnr, sock, P_DATA,
1742 sizeof(*p) + digest_size, NULL, req->i.size);
1743 if (!err) {
1744 /* For protocol A, we have to memcpy the payload into
1745 * socket buffers, as we may complete right away
1746 * as soon as we handed it over to tcp, at which point the data
1747 * pages may become invalid.
1748 *
1749 * With data integrity enabled, we copy it as well, so we can be
1750 * sure that even if the bio pages may still be modified, it
1751 * won't change the data on the wire, thus if the digest checks
1752 * out ok after sending on this side, but does not match on the
1753 * receiving side, we know the corruption happened elsewhere.
1754 */
1755 if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size)
1756 err = _drbd_send_bio(peer_device, req->master_bio);
1757 else
1758 err = _drbd_send_zc_bio(peer_device, req->master_bio);
1759
1760 /* double check digest, sometimes buffers have been modified in flight. */
1761 if (digest_size > 0 && digest_size <= 64) {
1762 /* 64 byte, 512 bit, is the largest digest size
1763 * currently supported in kernel crypto. */
1764 unsigned char digest[64];
1765 drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest);
1766 if (memcmp(p + 1, digest, digest_size)) {
1767 drbd_warn(device,
1768 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
1769 (unsigned long long)req->i.sector, req->i.size);
1770 }
1771 } /* else if (digest_size > 64) {
1772 ... Be noisy about digest too large ...
1773 } */
1774 }
1775out:
1776 mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
1777
1778 return err;
1779}
1780
1781/* answer packet, used to send data back for read requests:
1782 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
1783 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
1784 */
1785int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1786 struct drbd_peer_request *peer_req)
1787{
1788 struct drbd_device *device = peer_device->device;
1789 struct drbd_socket *sock;
1790 struct p_data *p;
1791 int err;
1792 int digest_size;
1793
1794 sock = &peer_device->connection->data;
1795 p = drbd_prepare_command(peer_device, sock);
1796
1797 digest_size = peer_device->connection->integrity_tfm ?
1798 crypto_ahash_digestsize(peer_device->connection->integrity_tfm) : 0;
1799
1800 if (!p)
1801 return -EIO;
1802 p->sector = cpu_to_be64(peer_req->i.sector);
1803 p->block_id = peer_req->block_id;
1804 p->seq_num = 0; /* unused */
1805 p->dp_flags = 0;
1806 if (digest_size)
1807 drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1);
1808 err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size);
1809 if (!err)
1810 err = _drbd_send_zc_ee(peer_device, peer_req);
1811 mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
1812
1813 return err;
1814}
1815
1816int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req)
1817{
1818 struct drbd_socket *sock;
1819 struct p_block_desc *p;
1820
1821 sock = &peer_device->connection->data;
1822 p = drbd_prepare_command(peer_device, sock);
1823 if (!p)
1824 return -EIO;
1825 p->sector = cpu_to_be64(req->i.sector);
1826 p->blksize = cpu_to_be32(req->i.size);
1827 return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
1828}
1829
1830/*
1831 drbd_send distinguishes two cases:
1832
1833 Packets sent via the data socket "sock"
1834 and packets sent via the meta data socket "msock"
1835
1836                      sock                       msock
1837   -----------------+-------------------------+------------------------------
1838   timeout            conf.timeout / 2           conf.timeout / 2
1839   timeout action     send a ping via msock      Abort communication
1840                                                 and close all sockets
1841*/
1842
1843/*
1844 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
1845 */
1846int drbd_send(struct drbd_connection *connection, struct socket *sock,
1847 void *buf, size_t size, unsigned msg_flags)
1848{
1849 struct kvec iov = {.iov_base = buf, .iov_len = size};
1850 struct msghdr msg = {.msg_flags = msg_flags | MSG_NOSIGNAL};
1851 int rv, sent = 0;
1852
1853 if (!sock)
1854 return -EBADR;
1855
1856 /* THINK if (signal_pending) return ... ? */
1857
1858 iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, size);
1859
1860 if (sock == connection->data.socket) {
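		/* re-arm the knock-out counter consumed by
		 * we_should_drop_the_connection() on send timeouts */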
1861 rcu_read_lock();
1862 connection->ko_count = rcu_dereference(connection->net_conf)->ko_count;
1863 rcu_read_unlock();
1864 drbd_update_congested(connection);
1865 }
1866 do {
1867 rv = sock_sendmsg(sock, &msg);
1868 if (rv == -EAGAIN) {
1869 if (we_should_drop_the_connection(connection, sock))
1870 break;
1871 else
1872 continue;
1873 }
1874 if (rv == -EINTR) {
1875 flush_signals(current);
1876 rv = 0;
1877 }
1878 if (rv < 0)
1879 break;
1880 sent += rv;
1881 } while (sent < size);
1882
1883 if (sock == connection->data.socket)
1884 clear_bit(NET_CONGESTED, &connection->flags);
1885
1886 if (rv <= 0) {
1887 if (rv != -EAGAIN) {
1888 drbd_err(connection, "%s_sendmsg returned %d\n",
1889 sock == connection->meta.socket ? "msock" : "sock",
1890 rv);
1891 conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
1892 } else
1893 conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
1894 }
1895
1896 return sent;
1897}
1898
1899/**
1900 * drbd_send_all - Send an entire buffer
1901 *
1902 * Returns 0 upon success and a negative error value otherwise.
1903 */
1904int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer,
1905 size_t size, unsigned msg_flags)
1906{
1907 int err;
1908
1909 err = drbd_send(connection, sock, buffer, size, msg_flags);
1910 if (err < 0)
1911 return err;
1912 if (err != size)
1913 return -EIO;
1914 return 0;
1915}
1916
1917static int drbd_open(struct block_device *bdev, fmode_t mode)
1918{
1919 struct drbd_device *device = bdev->bd_disk->private_data;
1920 unsigned long flags;
1921 int rv = 0;
1922
1923 mutex_lock(&drbd_main_mutex);
1924 spin_lock_irqsave(&device->resource->req_lock, flags);
1925 /* to have a stable device->state.role
1926 * and no race with updating open_cnt */
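	/* Writable opens require the Primary role; read-only opens on a
	 * Secondary are only allowed with the allow_oos module parameter. */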
1927
1928 if (device->state.role != R_PRIMARY) {
1929 if (mode & FMODE_WRITE)
1930 rv = -EROFS;
1931 else if (!drbd_allow_oos)
1932 rv = -EMEDIUMTYPE;
1933 }
1934
1935 if (!rv)
1936 device->open_cnt++;
1937 spin_unlock_irqrestore(&device->resource->req_lock, flags);
1938 mutex_unlock(&drbd_main_mutex);
1939
1940 return rv;
1941}
1942
1943static void drbd_release(struct gendisk *gd, fmode_t mode)
1944{
1945 struct drbd_device *device = gd->private_data;
1946 mutex_lock(&drbd_main_mutex);
1947 device->open_cnt--;
1948 mutex_unlock(&drbd_main_mutex);
1949}
1950
1951/* need to hold resource->req_lock */
1952void drbd_queue_unplug(struct drbd_device *device)
1953{
1954 if (device->state.pdsk >= D_INCONSISTENT && device->state.conn >= C_CONNECTED) {
1955 D_ASSERT(device, device->state.role == R_PRIMARY);
1956 if (test_and_clear_bit(UNPLUG_REMOTE, &device->flags)) {
1957 drbd_queue_work_if_unqueued(
1958 &first_peer_device(device)->connection->sender_work,
1959 &device->unplug_work);
1960 }
1961 }
1962}
1963
1964static void drbd_set_defaults(struct drbd_device *device)
1965{
1966 /* Beware! The actual layout differs
1967 * between big endian and little endian */
1968 device->state = (union drbd_dev_state) {
1969 { .role = R_SECONDARY,
1970 .peer = R_UNKNOWN,
1971 .conn = C_STANDALONE,
1972 .disk = D_DISKLESS,
1973 .pdsk = D_UNKNOWN,
1974 } };
1975}
1976
1977void drbd_init_set_defaults(struct drbd_device *device)
1978{
1979 /* the memset(,0,) did most of this.
1980 * note: only assignments, no allocation in here */
1981
1982 drbd_set_defaults(device);
1983
1984 atomic_set(&device->ap_bio_cnt, 0);
1985 atomic_set(&device->ap_actlog_cnt, 0);
1986 atomic_set(&device->ap_pending_cnt, 0);
1987 atomic_set(&device->rs_pending_cnt, 0);
1988 atomic_set(&device->unacked_cnt, 0);
1989 atomic_set(&device->local_cnt, 0);
1990 atomic_set(&device->pp_in_use_by_net, 0);
1991 atomic_set(&device->rs_sect_in, 0);
1992 atomic_set(&device->rs_sect_ev, 0);
1993 atomic_set(&device->ap_in_flight, 0);
1994 atomic_set(&device->md_io.in_use, 0);
1995
1996 mutex_init(&device->own_state_mutex);
1997 device->state_mutex = &device->own_state_mutex;
1998
1999 spin_lock_init(&device->al_lock);
2000 spin_lock_init(&device->peer_seq_lock);
2001
2002 INIT_LIST_HEAD(&device->active_ee);
2003 INIT_LIST_HEAD(&device->sync_ee);
2004 INIT_LIST_HEAD(&device->done_ee);
2005 INIT_LIST_HEAD(&device->read_ee);
2006 INIT_LIST_HEAD(&device->net_ee);
2007 INIT_LIST_HEAD(&device->resync_reads);
2008 INIT_LIST_HEAD(&device->resync_work.list);
2009 INIT_LIST_HEAD(&device->unplug_work.list);
2010 INIT_LIST_HEAD(&device->bm_io_work.w.list);
2011 INIT_LIST_HEAD(&device->pending_master_completion[0]);
2012 INIT_LIST_HEAD(&device->pending_master_completion[1]);
2013 INIT_LIST_HEAD(&device->pending_completion[0]);
2014 INIT_LIST_HEAD(&device->pending_completion[1]);
2015
2016 device->resync_work.cb = w_resync_timer;
2017 device->unplug_work.cb = w_send_write_hint;
2018 device->bm_io_work.w.cb = w_bitmap_io;
2019
2020 timer_setup(&device->resync_timer, resync_timer_fn, 0);
2021 timer_setup(&device->md_sync_timer, md_sync_timer_fn, 0);
2022 timer_setup(&device->start_resync_timer, start_resync_timer_fn, 0);
2023 timer_setup(&device->request_timer, request_timer_fn, 0);
2024
2025 init_waitqueue_head(&device->misc_wait);
2026 init_waitqueue_head(&device->state_wait);
2027 init_waitqueue_head(&device->ee_wait);
2028 init_waitqueue_head(&device->al_wait);
2029 init_waitqueue_head(&device->seq_wait);
2030
2031 device->resync_wenr = LC_FREE;
2032 device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2033 device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2034}
2035
2036void drbd_device_cleanup(struct drbd_device *device)
2037{
2038 int i;
2039 if (first_peer_device(device)->connection->receiver.t_state != NONE)
2040 drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2041 first_peer_device(device)->connection->receiver.t_state);
2042
2043 device->al_writ_cnt =
2044 device->bm_writ_cnt =
2045 device->read_cnt =
2046 device->recv_cnt =
2047 device->send_cnt =
2048 device->writ_cnt =
2049 device->p_size =
2050 device->rs_start =
2051 device->rs_total =
2052 device->rs_failed = 0;
2053 device->rs_last_events = 0;
2054 device->rs_last_sect_ev = 0;
2055 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2056 device->rs_mark_left[i] = 0;
2057 device->rs_mark_time[i] = 0;
2058 }
2059 D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL);
2060
2061 drbd_set_my_capacity(device, 0);
2062 if (device->bitmap) {
2063 /* maybe never allocated. */
2064 drbd_bm_resize(device, 0, 1);
2065 drbd_bm_cleanup(device);
2066 }
2067
2068 drbd_backing_dev_free(device, device->ldev);
2069 device->ldev = NULL;
2070
2071 clear_bit(AL_SUSPENDED, &device->flags);
2072
2073 D_ASSERT(device, list_empty(&device->active_ee));
2074 D_ASSERT(device, list_empty(&device->sync_ee));
2075 D_ASSERT(device, list_empty(&device->done_ee));
2076 D_ASSERT(device, list_empty(&device->read_ee));
2077 D_ASSERT(device, list_empty(&device->net_ee));
2078 D_ASSERT(device, list_empty(&device->resync_reads));
2079 D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
2080 D_ASSERT(device, list_empty(&device->resync_work.list));
2081 D_ASSERT(device, list_empty(&device->unplug_work.list));
2082
2083 drbd_set_defaults(device);
2084}
2085
2086
2087static void drbd_destroy_mempools(void)
2088{
2089 struct page *page;
2090
2091 while (drbd_pp_pool) {
2092 page = drbd_pp_pool;
2093 drbd_pp_pool = (struct page *)page_private(page);
2094 __free_page(page);
2095 drbd_pp_vacant--;
2096 }
2097
2098 /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
2099
2100 if (drbd_io_bio_set)
2101 bioset_free(drbd_io_bio_set);
2102 if (drbd_md_io_bio_set)
2103 bioset_free(drbd_md_io_bio_set);
2104 if (drbd_md_io_page_pool)
2105 mempool_destroy(drbd_md_io_page_pool);
2106 if (drbd_ee_mempool)
2107 mempool_destroy(drbd_ee_mempool);
2108 if (drbd_request_mempool)
2109 mempool_destroy(drbd_request_mempool);
2110 if (drbd_ee_cache)
2111 kmem_cache_destroy(drbd_ee_cache);
2112 if (drbd_request_cache)
2113 kmem_cache_destroy(drbd_request_cache);
2114 if (drbd_bm_ext_cache)
2115 kmem_cache_destroy(drbd_bm_ext_cache);
2116 if (drbd_al_ext_cache)
2117 kmem_cache_destroy(drbd_al_ext_cache);
2118
2119 drbd_io_bio_set = NULL;
2120 drbd_md_io_bio_set = NULL;
2121 drbd_md_io_page_pool = NULL;
2122 drbd_ee_mempool = NULL;
2123 drbd_request_mempool = NULL;
2124 drbd_ee_cache = NULL;
2125 drbd_request_cache = NULL;
2126 drbd_bm_ext_cache = NULL;
2127 drbd_al_ext_cache = NULL;
2128
2129 return;
2130}
2131
2132static int drbd_create_mempools(void)
2133{
2134 struct page *page;
2135 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
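	/* "number": enough pool pages for one maximally sized BIO per configured minor */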
2136 int i;
2137
2138 /* prepare our caches and mempools */
2139 drbd_request_mempool = NULL;
2140 drbd_ee_cache = NULL;
2141 drbd_request_cache = NULL;
2142 drbd_bm_ext_cache = NULL;
2143 drbd_al_ext_cache = NULL;
2144 drbd_pp_pool = NULL;
2145 drbd_md_io_page_pool = NULL;
2146 drbd_md_io_bio_set = NULL;
2147 drbd_io_bio_set = NULL;
2148
2149 /* caches */
2150 drbd_request_cache = kmem_cache_create(
2151 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2152 if (drbd_request_cache == NULL)
2153 goto Enomem;
2154
2155 drbd_ee_cache = kmem_cache_create(
2156 "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
2157 if (drbd_ee_cache == NULL)
2158 goto Enomem;
2159
2160 drbd_bm_ext_cache = kmem_cache_create(
2161 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2162 if (drbd_bm_ext_cache == NULL)
2163 goto Enomem;
2164
2165 drbd_al_ext_cache = kmem_cache_create(
2166 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2167 if (drbd_al_ext_cache == NULL)
2168 goto Enomem;
2169
2170 /* mempools */
2171 drbd_io_bio_set = bioset_create(BIO_POOL_SIZE, 0, 0);
2172 if (drbd_io_bio_set == NULL)
2173 goto Enomem;
2174
2175 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0,
2176 BIOSET_NEED_BVECS);
2177 if (drbd_md_io_bio_set == NULL)
2178 goto Enomem;
2179
2180 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
2181 if (drbd_md_io_page_pool == NULL)
2182 goto Enomem;
2183
2184 drbd_request_mempool = mempool_create_slab_pool(number,
2185 drbd_request_cache);
2186 if (drbd_request_mempool == NULL)
2187 goto Enomem;
2188
2189 drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache);
2190 if (drbd_ee_mempool == NULL)
2191 goto Enomem;
2192
2193 /* drbd's page pool */
2194 spin_lock_init(&drbd_pp_lock);
2195
2196 for (i = 0; i < number; i++) {
2197 page = alloc_page(GFP_HIGHUSER);
2198 if (!page)
2199 goto Enomem;
2200 set_page_private(page, (unsigned long)drbd_pp_pool);
2201 drbd_pp_pool = page;
2202 }
2203 drbd_pp_vacant = number;
2204
2205 return 0;
2206
2207Enomem:
2208 drbd_destroy_mempools(); /* in case we allocated some */
2209 return -ENOMEM;
2210}
2211
2212static void drbd_release_all_peer_reqs(struct drbd_device *device)
2213{
2214 int rr;
2215
2216 rr = drbd_free_peer_reqs(device, &device->active_ee);
2217 if (rr)
2218 drbd_err(device, "%d EEs in active list found!\n", rr);
2219
2220 rr = drbd_free_peer_reqs(device, &device->sync_ee);
2221 if (rr)
2222 drbd_err(device, "%d EEs in sync list found!\n", rr);
2223
2224 rr = drbd_free_peer_reqs(device, &device->read_ee);
2225 if (rr)
2226 drbd_err(device, "%d EEs in read list found!\n", rr);
2227
2228 rr = drbd_free_peer_reqs(device, &device->done_ee);
2229 if (rr)
2230 drbd_err(device, "%d EEs in done list found!\n", rr);
2231
2232 rr = drbd_free_peer_reqs(device, &device->net_ee);
2233 if (rr)
2234 drbd_err(device, "%d EEs in net list found!\n", rr);
2235}
2236
2237/* caution. no locking. */
2238void drbd_destroy_device(struct kref *kref)
2239{
2240 struct drbd_device *device = container_of(kref, struct drbd_device, kref);
2241 struct drbd_resource *resource = device->resource;
2242 struct drbd_peer_device *peer_device, *tmp_peer_device;
2243
2244 del_timer_sync(&device->request_timer);
2245
2246 /* paranoia asserts */
2247 D_ASSERT(device, device->open_cnt == 0);
2248 /* end paranoia asserts */
2249
2250 /* cleanup stuff that may have been allocated during
2251 * device (re-)configuration or state changes */
2252
2253 if (device->this_bdev)
2254 bdput(device->this_bdev);
2255
2256 drbd_backing_dev_free(device, device->ldev);
2257 device->ldev = NULL;
2258
2259 drbd_release_all_peer_reqs(device);
2260
2261 lc_destroy(device->act_log);
2262 lc_destroy(device->resync);
2263
2264 kfree(device->p_uuid);
2265 /* device->p_uuid = NULL; */
2266
2267 if (device->bitmap) /* should no longer be there. */
2268 drbd_bm_cleanup(device);
2269 __free_page(device->md_io.page);
2270 put_disk(device->vdisk);
2271 blk_cleanup_queue(device->rq_queue);
2272 kfree(device->rs_plan_s);
2273
2274 /* not for_each_connection(connection, resource):
2275 * those may have been cleaned up and disassociated already.
2276 */
2277 for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
2278 kref_put(&peer_device->connection->kref, drbd_destroy_connection);
2279 kfree(peer_device);
2280 }
2281 memset(device, 0xfd, sizeof(*device));
2282 kfree(device);
2283 kref_put(&resource->kref, drbd_destroy_resource);
2284}
2285
2286 /* One global retry thread, used when we need to push back a bio and have it
2287 * reinserted through our make_request function.
2288 */
2289static struct retry_worker {
2290 struct workqueue_struct *wq;
2291 struct work_struct worker;
2292
2293 spinlock_t lock;
2294 struct list_head writes;
2295} retry;
2296
2297static void do_retry(struct work_struct *ws)
2298{
2299 struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
2300 LIST_HEAD(writes);
2301 struct drbd_request *req, *tmp;
2302
2303 spin_lock_irq(&retry->lock);
2304 list_splice_init(&retry->writes, &writes);
2305 spin_unlock_irq(&retry->lock);
2306
2307 list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
2308 struct drbd_device *device = req->device;
2309 struct bio *bio = req->master_bio;
2310 unsigned long start_jif = req->start_jif;
2311 bool expected;
2312
2313 expected =
2314 expect(atomic_read(&req->completion_ref) == 0) &&
2315 expect(req->rq_state & RQ_POSTPONED) &&
2316 expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
2317 (req->rq_state & RQ_LOCAL_ABORTED) != 0);
2318
2319 if (!expected)
2320 drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n",
2321 req, atomic_read(&req->completion_ref),
2322 req->rq_state);
2323
2324 /* We still need to put one kref associated with the
2325 * "completion_ref" going zero in the code path that queued it
2326 * here. The request object may still be referenced by a
2327 * frozen local req->private_bio, in case we force-detached.
2328 */
2329 kref_put(&req->kref, drbd_req_destroy);
2330
2331 /* A single suspended or otherwise blocking device may stall
2332 * all others as well. Fortunately, this code path is to
2333 * recover from a situation that "should not happen":
2334 * concurrent writes in multi-primary setup.
2335 * In a "normal" lifecycle, this workqueue is supposed to be
2336 * destroyed without ever doing anything.
2337 * If it turns out to be an issue anyways, we can do per
2338 * resource (replication group) or per device (minor) retry
2339 * workqueues instead.
2340 */
2341
2342 /* We are not just doing generic_make_request(),
2343 * as we want to keep the original start time (start_jif). */
2344 inc_ap_bio(device);
2345 __drbd_make_request(device, bio, start_jif);
2346 }
2347}
2348
2349/* called via drbd_req_put_completion_ref(),
2350 * holds resource->req_lock */
2351void drbd_restart_request(struct drbd_request *req)
2352{
2353 unsigned long flags;
2354 spin_lock_irqsave(&retry.lock, flags);
2355 list_move_tail(&req->tl_requests, &retry.writes);
2356 spin_unlock_irqrestore(&retry.lock, flags);
2357
2358 /* Drop the extra reference that would otherwise
2359 * have been dropped by complete_master_bio.
2360 * do_retry() needs to grab a new one. */
2361 dec_ap_bio(req->device);
2362
2363 queue_work(retry.wq, &retry.worker);
2364}
2365
2366void drbd_destroy_resource(struct kref *kref)
2367{
2368 struct drbd_resource *resource =
2369 container_of(kref, struct drbd_resource, kref);
2370
2371 idr_destroy(&resource->devices);
2372 free_cpumask_var(resource->cpu_mask);
2373 kfree(resource->name);
2374 memset(resource, 0xf2, sizeof(*resource));
2375 kfree(resource);
2376}
2377
2378void drbd_free_resource(struct drbd_resource *resource)
2379{
2380 struct drbd_connection *connection, *tmp;
2381
2382 for_each_connection_safe(connection, tmp, resource) {
2383 list_del(&connection->connections);
2384 drbd_debugfs_connection_cleanup(connection);
2385 kref_put(&connection->kref, drbd_destroy_connection);
2386 }
2387 drbd_debugfs_resource_cleanup(resource);
2388 kref_put(&resource->kref, drbd_destroy_resource);
2389}
2390
2391static void drbd_cleanup(void)
2392{
2393 unsigned int i;
2394 struct drbd_device *device;
2395 struct drbd_resource *resource, *tmp;
2396
2397 /* first remove proc,
2398 * drbdsetup uses its presence to detect
2399 * whether DRBD is loaded.
2400 * If we would get stuck in proc removal,
2401 * but have netlink already deregistered,
2402 * some drbdsetup commands may wait forever
2403 * for an answer.
2404 */
2405 if (drbd_proc)
2406 remove_proc_entry("drbd", NULL);
2407
2408 if (retry.wq)
2409 destroy_workqueue(retry.wq);
2410
2411 drbd_genl_unregister();
2412
2413 idr_for_each_entry(&drbd_devices, device, i)
2414 drbd_delete_device(device);
2415
2416 /* not _rcu, since there is no other updater anymore; genl is already unregistered */
2417 for_each_resource_safe(resource, tmp, &drbd_resources) {
2418 list_del(&resource->resources);
2419 drbd_free_resource(resource);
2420 }
2421
2422 drbd_debugfs_cleanup();
2423
2424 drbd_destroy_mempools();
2425 unregister_blkdev(DRBD_MAJOR, "drbd");
2426
2427 idr_destroy(&drbd_devices);
2428
2429 pr_info("module cleanup done.\n");
2430}
2431
2432/**
2433 * drbd_congested() - Callback for the flusher thread
2434 * @congested_data: User data
2435 * @bdi_bits: Bits the BDI flusher thread is currently interested in
2436 *
2437 * Returns 1<<WB_async_congested and/or 1<<WB_sync_congested if we are congested.
2438 */
2439static int drbd_congested(void *congested_data, int bdi_bits)
2440{
2441 struct drbd_device *device = congested_data;
2442 struct request_queue *q;
2443 char reason = '-';
2444 int r = 0;
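	/* reason codes left in device->congestion_reason:
	 * 'd' = IO frozen by DRBD, 'c' = usermode helper callback pending,
	 * 'b' = backing device congested, 'n' = network congested,
	 * 'a' = both backing device and network */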
2445
2446 if (!may_inc_ap_bio(device)) {
2447 /* DRBD has frozen IO */
2448 r = bdi_bits;
2449 reason = 'd';
2450 goto out;
2451 }
2452
2453 if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
2454 r |= (1 << WB_async_congested);
2455 /* Without good local data, we would need to read from remote,
2456 * and that would need the worker thread as well, which is
2457 * currently blocked waiting for that usermode helper to
2458 * finish.
2459 */
2460 if (!get_ldev_if_state(device, D_UP_TO_DATE))
2461 r |= (1 << WB_sync_congested);
2462 else
2463 put_ldev(device);
2464 r &= bdi_bits;
2465 reason = 'c';
2466 goto out;
2467 }
2468
2469 if (get_ldev(device)) {
2470 q = bdev_get_queue(device->ldev->backing_bdev);
2471 r = bdi_congested(q->backing_dev_info, bdi_bits);
2472 put_ldev(device);
2473 if (r)
2474 reason = 'b';
2475 }
2476
2477 if (bdi_bits & (1 << WB_async_congested) &&
2478 test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
2479 r |= (1 << WB_async_congested);
2480 reason = reason == 'b' ? 'a' : 'n';
2481 }
2482
2483out:
2484 device->congestion_reason = reason;
2485 return r;
2486}
2487
2488static void drbd_init_workqueue(struct drbd_work_queue* wq)
2489{
2490 spin_lock_init(&wq->q_lock);
2491 INIT_LIST_HEAD(&wq->q);
2492 init_waitqueue_head(&wq->q_wait);
2493}
2494
2495struct completion_work {
2496 struct drbd_work w;
2497 struct completion done;
2498};
2499
2500static int w_complete(struct drbd_work *w, int cancel)
2501{
2502 struct completion_work *completion_work =
2503 container_of(w, struct completion_work, w);
2504
2505 complete(&completion_work->done);
2506 return 0;
2507}
2508
2509void drbd_flush_workqueue(struct drbd_work_queue *work_queue)
2510{
2511 struct completion_work completion_work;
2512
2513 completion_work.w.cb = w_complete;
2514 init_completion(&completion_work.done);
2515 drbd_queue_work(work_queue, &completion_work.w);
2516 wait_for_completion(&completion_work.done);
2517}
2518
2519struct drbd_resource *drbd_find_resource(const char *name)
2520{
2521 struct drbd_resource *resource;
2522
2523 if (!name || !name[0])
2524 return NULL;
2525
2526 rcu_read_lock();
2527 for_each_resource_rcu(resource, &drbd_resources) {
2528 if (!strcmp(resource->name, name)) {
2529 kref_get(&resource->kref);
2530 goto found;
2531 }
2532 }
2533 resource = NULL;
2534found:
2535 rcu_read_unlock();
2536 return resource;
2537}
2538
2539struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
2540 void *peer_addr, int peer_addr_len)
2541{
2542 struct drbd_resource *resource;
2543 struct drbd_connection *connection;
2544
2545 rcu_read_lock();
2546 for_each_resource_rcu(resource, &drbd_resources) {
2547 for_each_connection_rcu(connection, resource) {
2548 if (connection->my_addr_len == my_addr_len &&
2549 connection->peer_addr_len == peer_addr_len &&
2550 !memcmp(&connection->my_addr, my_addr, my_addr_len) &&
2551 !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) {
2552 kref_get(&connection->kref);
2553 goto found;
2554 }
2555 }
2556 }
2557 connection = NULL;
2558found:
2559 rcu_read_unlock();
2560 return connection;
2561}
2562
2563static int drbd_alloc_socket(struct drbd_socket *socket)
2564{
2565 socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
2566 if (!socket->rbuf)
2567 return -ENOMEM;
2568 socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
2569 if (!socket->sbuf)
2570 return -ENOMEM;
2571 return 0;
2572}
2573
2574static void drbd_free_socket(struct drbd_socket *socket)
2575{
2576 free_page((unsigned long) socket->sbuf);
2577 free_page((unsigned long) socket->rbuf);
2578}
2579
2580void conn_free_crypto(struct drbd_connection *connection)
2581{
2582 drbd_free_sock(connection);
2583
2584 crypto_free_ahash(connection->csums_tfm);
2585 crypto_free_ahash(connection->verify_tfm);
2586 crypto_free_shash(connection->cram_hmac_tfm);
2587 crypto_free_ahash(connection->integrity_tfm);
2588 crypto_free_ahash(connection->peer_integrity_tfm);
2589 kfree(connection->int_dig_in);
2590 kfree(connection->int_dig_vv);
2591
2592 connection->csums_tfm = NULL;
2593 connection->verify_tfm = NULL;
2594 connection->cram_hmac_tfm = NULL;
2595 connection->integrity_tfm = NULL;
2596 connection->peer_integrity_tfm = NULL;
2597 connection->int_dig_in = NULL;
2598 connection->int_dig_vv = NULL;
2599}
2600
2601int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts)
2602{
2603 struct drbd_connection *connection;
2604 cpumask_var_t new_cpu_mask;
2605 int err;
2606
2607 if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
2608 return -ENOMEM;
2609
2610 /* silently ignore cpu mask on UP kernel */
2611 if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
2612 err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
2613 cpumask_bits(new_cpu_mask), nr_cpu_ids);
2614 if (err == -EOVERFLOW) {
2615 /* So what. mask it out. */
2616 cpumask_var_t tmp_cpu_mask;
2617 if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
2618 cpumask_setall(tmp_cpu_mask);
2619 cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
2620 drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
2621 res_opts->cpu_mask,
2622 strlen(res_opts->cpu_mask) > 12 ? "..." : "",
2623 nr_cpu_ids);
2624 free_cpumask_var(tmp_cpu_mask);
2625 err = 0;
2626 }
2627 }
2628 if (err) {
2629 drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
2630 /* retcode = ERR_CPU_MASK_PARSE; */
2631 goto fail;
2632 }
2633 }
2634 resource->res_opts = *res_opts;
2635 if (cpumask_empty(new_cpu_mask))
2636 drbd_calc_cpu_mask(&new_cpu_mask);
2637 if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) {
2638 cpumask_copy(resource->cpu_mask, new_cpu_mask);
2639 for_each_connection_rcu(connection, resource) {
2640 connection->receiver.reset_cpu_mask = 1;
2641 connection->ack_receiver.reset_cpu_mask = 1;
2642 connection->worker.reset_cpu_mask = 1;
2643 }
2644 }
2645 err = 0;
2646
2647fail:
2648 free_cpumask_var(new_cpu_mask);
2649 return err;
2650
2651}
2652
2653struct drbd_resource *drbd_create_resource(const char *name)
2654{
2655 struct drbd_resource *resource;
2656
2657 resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL);
2658 if (!resource)
2659 goto fail;
2660 resource->name = kstrdup(name, GFP_KERNEL);
2661 if (!resource->name)
2662 goto fail_free_resource;
2663 if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL))
2664 goto fail_free_name;
2665 kref_init(&resource->kref);
2666 idr_init(&resource->devices);
2667 INIT_LIST_HEAD(&resource->connections);
2668 resource->write_ordering = WO_BDEV_FLUSH;
2669 list_add_tail_rcu(&resource->resources, &drbd_resources);
2670 mutex_init(&resource->conf_update);
2671 mutex_init(&resource->adm_mutex);
2672 spin_lock_init(&resource->req_lock);
2673 drbd_debugfs_resource_add(resource);
2674 return resource;
2675
2676fail_free_name:
2677 kfree(resource->name);
2678fail_free_resource:
2679 kfree(resource);
2680fail:
2681 return NULL;
2682}
2683
2684/* caller must be under adm_mutex */
2685struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
2686{
2687 struct drbd_resource *resource;
2688 struct drbd_connection *connection;
2689
2690 connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL);
2691 if (!connection)
2692 return NULL;
2693
2694 if (drbd_alloc_socket(&connection->data))
2695 goto fail;
2696 if (drbd_alloc_socket(&connection->meta))
2697 goto fail;
2698
2699 connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
2700 if (!connection->current_epoch)
2701 goto fail;
2702
2703 INIT_LIST_HEAD(&connection->transfer_log);
2704
2705 INIT_LIST_HEAD(&connection->current_epoch->list);
2706 connection->epochs = 1;
2707 spin_lock_init(&connection->epoch_lock);
2708
2709 connection->send.seen_any_write_yet = false;
2710 connection->send.current_epoch_nr = 0;
2711 connection->send.current_epoch_writes = 0;
2712
2713 resource = drbd_create_resource(name);
2714 if (!resource)
2715 goto fail;
2716
2717 connection->cstate = C_STANDALONE;
2718 mutex_init(&connection->cstate_mutex);
2719 init_waitqueue_head(&connection->ping_wait);
2720 idr_init(&connection->peer_devices);
2721
2722 drbd_init_workqueue(&connection->sender_work);
2723 mutex_init(&connection->data.mutex);
2724 mutex_init(&connection->meta.mutex);
2725
2726 drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver");
2727 connection->receiver.connection = connection;
2728 drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
2729 connection->worker.connection = connection;
2730 drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv");
2731 connection->ack_receiver.connection = connection;
2732
2733 kref_init(&connection->kref);
2734
2735 connection->resource = resource;
2736
2737 if (set_resource_options(resource, res_opts))
2738 goto fail_resource;
2739
2740 kref_get(&resource->kref);
2741 list_add_tail_rcu(&connection->connections, &resource->connections);
2742 drbd_debugfs_connection_add(connection);
2743 return connection;
2744
2745fail_resource:
2746 list_del(&resource->resources);
2747 drbd_free_resource(resource);
2748fail:
2749 kfree(connection->current_epoch);
2750 drbd_free_socket(&connection->meta);
2751 drbd_free_socket(&connection->data);
2752 kfree(connection);
2753 return NULL;
2754}
2755
2756void drbd_destroy_connection(struct kref *kref)
2757{
2758 struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref);
2759 struct drbd_resource *resource = connection->resource;
2760
2761 if (atomic_read(&connection->current_epoch->epoch_size) != 0)
2762 drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size));
2763 kfree(connection->current_epoch);
2764
2765 idr_destroy(&connection->peer_devices);
2766
2767 drbd_free_socket(&connection->meta);
2768 drbd_free_socket(&connection->data);
2769 kfree(connection->int_dig_in);
2770 kfree(connection->int_dig_vv);
2771 memset(connection, 0xfc, sizeof(*connection));
2772 kfree(connection);
2773 kref_put(&resource->kref, drbd_destroy_resource);
2774}
2775
2776static int init_submitter(struct drbd_device *device)
2777{
2778 /* opencoded create_singlethread_workqueue(),
2779 * to be able to say "drbd%d", ..., minor */
2780 device->submit.wq =
2781 alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
2782 if (!device->submit.wq)
2783 return -ENOMEM;
2784
2785 INIT_WORK(&device->submit.worker, do_submit);
2786 INIT_LIST_HEAD(&device->submit.writes);
2787 return 0;
2788}
2789
2790enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
2791{
2792 struct drbd_resource *resource = adm_ctx->resource;
2793 struct drbd_connection *connection;
2794 struct drbd_device *device;
2795 struct drbd_peer_device *peer_device, *tmp_peer_device;
2796 struct gendisk *disk;
2797 struct request_queue *q;
2798 int id;
2799 int vnr = adm_ctx->volume;
2800 enum drbd_ret_code err = ERR_NOMEM;
2801
2802 device = minor_to_device(minor);
2803 if (device)
2804 return ERR_MINOR_OR_VOLUME_EXISTS;
2805
2806 /* GFP_KERNEL, we are outside of all write-out paths */
2807 device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL);
2808 if (!device)
2809 return ERR_NOMEM;
2810 kref_init(&device->kref);
2811
2812 kref_get(&resource->kref);
2813 device->resource = resource;
2814 device->minor = minor;
2815 device->vnr = vnr;
2816
2817 drbd_init_set_defaults(device);
2818
2819 q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock);
2820 if (!q)
2821 goto out_no_q;
2822 device->rq_queue = q;
2823 q->queuedata = device;
2824
2825 disk = alloc_disk(1);
2826 if (!disk)
2827 goto out_no_disk;
2828 device->vdisk = disk;
2829
2830 set_disk_ro(disk, true);
2831
2832 disk->queue = q;
2833 disk->major = DRBD_MAJOR;
2834 disk->first_minor = minor;
2835 disk->fops = &drbd_ops;
2836 sprintf(disk->disk_name, "drbd%d", minor);
2837 disk->private_data = device;
2838
2839 device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2840 /* we have no partitions. we contain only ourselves. */
2841 device->this_bdev->bd_contains = device->this_bdev;
2842
2843 q->backing_dev_info->congested_fn = drbd_congested;
2844 q->backing_dev_info->congested_data = device;
2845
2846 blk_queue_make_request(q, drbd_make_request);
2847 blk_queue_write_cache(q, true, true);
2848 /* Setting max_hw_sectors to an odd value of 8 KiB here;
2849 this triggers a max_bio_size message upon first attach or connect. */
2850 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
2851
2852 device->md_io.page = alloc_page(GFP_KERNEL);
2853 if (!device->md_io.page)
2854 goto out_no_io_page;
2855
2856 if (drbd_bm_init(device))
2857 goto out_no_bitmap;
2858 device->read_requests = RB_ROOT;
2859 device->write_requests = RB_ROOT;
2860
2861 id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL);
2862 if (id < 0) {
2863 if (id == -ENOSPC)
2864 err = ERR_MINOR_OR_VOLUME_EXISTS;
2865 goto out_no_minor_idr;
2866 }
2867 kref_get(&device->kref);
2868
2869 id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL);
2870 if (id < 0) {
2871 if (id == -ENOSPC)
2872 err = ERR_MINOR_OR_VOLUME_EXISTS;
2873 goto out_idr_remove_minor;
2874 }
2875 kref_get(&device->kref);
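	/* each idr slot that points to the device holds its own reference;
	 * the matching kref_put()s happen in drbd_delete_device() */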
2876
2877 INIT_LIST_HEAD(&device->peer_devices);
2878 INIT_LIST_HEAD(&device->pending_bitmap_io);
2879 for_each_connection(connection, resource) {
2880 peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
2881 if (!peer_device)
2882 goto out_idr_remove_from_resource;
2883 peer_device->connection = connection;
2884 peer_device->device = device;
2885
2886 list_add(&peer_device->peer_devices, &device->peer_devices);
2887 kref_get(&device->kref);
2888
2889 id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL);
2890 if (id < 0) {
2891 if (id == -ENOSPC)
2892 err = ERR_INVALID_REQUEST;
2893 goto out_idr_remove_from_resource;
2894 }
2895 kref_get(&connection->kref);
2896 INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
2897 }
2898
2899 if (init_submitter(device)) {
2900 err = ERR_NOMEM;
2901 goto out_idr_remove_vol;
2902 }
2903
2904 add_disk(disk);
2905
2906 /* inherit the connection state */
2907 device->state.conn = first_connection(resource)->cstate;
2908 if (device->state.conn == C_WF_REPORT_PARAMS) {
2909 for_each_peer_device(peer_device, device)
2910 drbd_connected(peer_device);
2911 }
2912 /* move to create_peer_device() */
2913 for_each_peer_device(peer_device, device)
2914 drbd_debugfs_peer_device_add(peer_device);
2915 drbd_debugfs_device_add(device);
2916 return NO_ERROR;
2917
2918out_idr_remove_vol:
2919 idr_remove(&connection->peer_devices, vnr);
2920out_idr_remove_from_resource:
2921 for_each_connection(connection, resource) {
2922 peer_device = idr_remove(&connection->peer_devices, vnr);
2923 if (peer_device)
2924 kref_put(&connection->kref, drbd_destroy_connection);
2925 }
2926 for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
2927 list_del(&peer_device->peer_devices);
2928 kfree(peer_device);
2929 }
2930 idr_remove(&resource->devices, vnr);
2931out_idr_remove_minor:
2932 idr_remove(&drbd_devices, minor);
2933 synchronize_rcu();
2934out_no_minor_idr:
2935 drbd_bm_cleanup(device);
2936out_no_bitmap:
2937 __free_page(device->md_io.page);
2938out_no_io_page:
2939 put_disk(disk);
2940out_no_disk:
2941 blk_cleanup_queue(q);
2942out_no_q:
2943 kref_put(&resource->kref, drbd_destroy_resource);
2944 kfree(device);
2945 return err;
2946}
2947
2948void drbd_delete_device(struct drbd_device *device)
2949{
2950 struct drbd_resource *resource = device->resource;
2951 struct drbd_connection *connection;
2952 struct drbd_peer_device *peer_device;
2953
2954 /* move to free_peer_device() */
2955 for_each_peer_device(peer_device, device)
2956 drbd_debugfs_peer_device_cleanup(peer_device);
2957 drbd_debugfs_device_cleanup(device);
2958 for_each_connection(connection, resource) {
2959 idr_remove(&connection->peer_devices, device->vnr);
2960 kref_put(&device->kref, drbd_destroy_device);
2961 }
2962 idr_remove(&resource->devices, device->vnr);
2963 kref_put(&device->kref, drbd_destroy_device);
2964 idr_remove(&drbd_devices, device_to_minor(device));
2965 kref_put(&device->kref, drbd_destroy_device);
2966 del_gendisk(device->vdisk);
2967 synchronize_rcu();
2968 kref_put(&device->kref, drbd_destroy_device);
2969}
2970
2971static int __init drbd_init(void)
2972{
2973 int err;
2974
2975 if (drbd_minor_count < DRBD_MINOR_COUNT_MIN || drbd_minor_count > DRBD_MINOR_COUNT_MAX) {
2976 pr_err("invalid minor_count (%d)\n", drbd_minor_count);
2977#ifdef MODULE
2978 return -EINVAL;
2979#else
2980 drbd_minor_count = DRBD_MINOR_COUNT_DEF;
2981#endif
2982 }
2983
2984 err = register_blkdev(DRBD_MAJOR, "drbd");
2985 if (err) {
2986 pr_err("unable to register block device major %d\n",
2987 DRBD_MAJOR);
2988 return err;
2989 }
2990
2991 /*
2992 * allocate all necessary structs
2993 */
2994 init_waitqueue_head(&drbd_pp_wait);
2995
2996 drbd_proc = NULL; /* play safe for drbd_cleanup */
2997 idr_init(&drbd_devices);
2998
2999 mutex_init(&resources_mutex);
3000 INIT_LIST_HEAD(&drbd_resources);
3001
3002 err = drbd_genl_register();
3003 if (err) {
3004 pr_err("unable to register generic netlink family\n");
3005 goto fail;
3006 }
3007
3008 err = drbd_create_mempools();
3009 if (err)
3010 goto fail;
3011
3012 err = -ENOMEM;
3013 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3014 if (!drbd_proc) {
3015 pr_err("unable to register proc file\n");
3016 goto fail;
3017 }
3018
3019 retry.wq = create_singlethread_workqueue("drbd-reissue");
3020 if (!retry.wq) {
3021 pr_err("unable to create retry workqueue\n");
3022 goto fail;
3023 }
3024 INIT_WORK(&retry.worker, do_retry);
3025 spin_lock_init(&retry.lock);
3026 INIT_LIST_HEAD(&retry.writes);
3027
3028 if (drbd_debugfs_init())
3029 pr_notice("failed to initialize debugfs -- will not be available\n");
3030
3031 pr_info("initialized. "
3032 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3033 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3034 pr_info("%s\n", drbd_buildtag());
3035 pr_info("registered as block device major %d\n", DRBD_MAJOR);
3036 return 0; /* Success! */
3037
3038fail:
3039 drbd_cleanup();
3040 if (err == -ENOMEM)
3041 pr_err("ran out of memory\n");
3042 else
3043 pr_err("initialization failure\n");
3044 return err;
3045}
3046
3047static void drbd_free_one_sock(struct drbd_socket *ds)
3048{
3049 struct socket *s;
3050 mutex_lock(&ds->mutex);
3051 s = ds->socket;
3052 ds->socket = NULL;
3053 mutex_unlock(&ds->mutex);
3054 if (s) {
3055 /* so debugfs does not need to mutex_lock() */
3056 synchronize_rcu();
3057 kernel_sock_shutdown(s, SHUT_RDWR);
3058 sock_release(s);
3059 }
3060}
3061
3062void drbd_free_sock(struct drbd_connection *connection)
3063{
3064 if (connection->data.socket)
3065 drbd_free_one_sock(&connection->data);
3066 if (connection->meta.socket)
3067 drbd_free_one_sock(&connection->meta);
3068}
3069
3070/* meta data management */
3071
3072void conn_md_sync(struct drbd_connection *connection)
3073{
3074 struct drbd_peer_device *peer_device;
3075 int vnr;
3076
3077 rcu_read_lock();
3078 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
3079 struct drbd_device *device = peer_device->device;
3080
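		/* Take a reference and drop the RCU read lock: drbd_md_sync()
		 * issues blocking meta data I/O, which must not happen inside
		 * an RCU read-side critical section. */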
3081 kref_get(&device->kref);
3082 rcu_read_unlock();
3083 drbd_md_sync(device);
3084 kref_put(&device->kref, drbd_destroy_device);
3085 rcu_read_lock();
3086 }
3087 rcu_read_unlock();
3088}
3089
3090/* aligned 4kByte */
3091struct meta_data_on_disk {
3092 u64 la_size_sect; /* last agreed size. */
3093 u64 uuid[UI_SIZE]; /* UUIDs. */
3094 u64 device_uuid;
3095 u64 reserved_u64_1;
3096 u32 flags; /* MDF */
3097 u32 magic;
3098 u32 md_size_sect;
3099 u32 al_offset; /* offset to this block */
3100 u32 al_nr_extents; /* important for restoring the AL (userspace) */
3101 /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
3102 u32 bm_offset; /* offset to the bitmap, from here */
3103 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3104 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3105
3106 /* see al_tr_number_to_on_disk_sector() */
3107 u32 al_stripes;
3108 u32 al_stripe_size_4k;
3109
3110 u8 reserved_u8[4096 - (7*8 + 10*4)];
3111} __packed;
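
/* Size check: 7 u64 members (56 bytes) plus 10 u32 members (40 bytes) make up
 * the 7*8 + 10*4 = 96 bytes above; reserved_u8[] pads the structure to exactly
 * 4096 bytes, which the BUILD_BUG_ON() in drbd_md_sync() asserts. */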
3112
3113
3114
3115void drbd_md_write(struct drbd_device *device, void *b)
3116{
3117 struct meta_data_on_disk *buffer = b;
3118 sector_t sector;
3119 int i;
3120
3121 memset(buffer, 0, sizeof(*buffer));
3122
3123 buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev));
3124 for (i = UI_CURRENT; i < UI_SIZE; i++)
3125 buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
3126 buffer->flags = cpu_to_be32(device->ldev->md.flags);
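	/* Written as the "unclean" magic; drbd_md_read() rejects meta data
	 * carrying it until userspace has run "drbdadm apply-al" (see the
	 * check further down in this file). */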
3127 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
3128
3129 buffer->md_size_sect = cpu_to_be32(device->ldev->md.md_size_sect);
3130 buffer->al_offset = cpu_to_be32(device->ldev->md.al_offset);
3131 buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements);
3132 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3133 buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid);
3134
3135 buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset);
3136 buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size);
3137
3138 buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes);
3139 buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k);
3140
3141 D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
3142 sector = device->ldev->md.md_offset;
3143
3144 if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) {
		/* this was a best-effort attempt anyway ... */
3146 drbd_err(device, "meta data update failed!\n");
3147 drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
3148 }
3149}
3150
3151/**
3152 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3153 * @device: DRBD device.
3154 */
3155void drbd_md_sync(struct drbd_device *device)
3156{
3157 struct meta_data_on_disk *buffer;
3158
3159 /* Don't accidentally change the DRBD meta data layout. */
3160 BUILD_BUG_ON(UI_SIZE != 4);
3161 BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
3162
3163 del_timer(&device->md_sync_timer);
3164 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3165 if (!test_and_clear_bit(MD_DIRTY, &device->flags))
3166 return;
3167
	/* Use D_FAILED here, not D_ATTACHING, because we still try to write
	 * the metadata even if we are detaching due to a disk failure! */
3170 if (!get_ldev_if_state(device, D_FAILED))
3171 return;
3172
3173 buffer = drbd_md_get_buffer(device, __func__);
3174 if (!buffer)
3175 goto out;
3176
3177 drbd_md_write(device, buffer);
3178
	/* Update device->ldev->md.la_size_sect,
	 * since we just wrote it to the on-disk metadata. */
3181 device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev);
3182
3183 drbd_md_put_buffer(device);
3184out:
3185 put_ldev(device);
3186}
3187
3188static int check_activity_log_stripe_size(struct drbd_device *device,
3189 struct meta_data_on_disk *on_disk,
3190 struct drbd_md *in_core)
3191{
3192 u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
3193 u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
3194 u64 al_size_4k;
3195
3196 /* both not set: default to old fixed size activity log */
3197 if (al_stripes == 0 && al_stripe_size_4k == 0) {
3198 al_stripes = 1;
3199 al_stripe_size_4k = MD_32kB_SECT/8;
3200 }
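	/* With these defaults: one stripe of MD_32kB_SECT/8 = 8 blocks of
	 * 4 KiB each, i.e. a 32 KiB activity log with 8 transaction slots,
	 * matching the lower limit enforced further down. */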
3201
3202 /* some paranoia plausibility checks */
3203
3204 /* we need both values to be set */
3205 if (al_stripes == 0 || al_stripe_size_4k == 0)
3206 goto err;
3207
3208 al_size_4k = (u64)al_stripes * al_stripe_size_4k;
3209
	/* Upper limit of the activity log area, to avoid potential overflow
	 * problems in al_tr_number_to_on_disk_sector(). Right now, more than
	 * 72 * 4k blocks in total only increases the amount of history kept,
	 * so limiting this arbitrarily to 16 GB (16*1024*1024/4 blocks of
	 * 4 KiB each) is not a real limitation ;-) */
3214 if (al_size_4k > (16 * 1024 * 1024/4))
3215 goto err;
3216
3217 /* Lower limit: we need at least 8 transaction slots (32kB)
3218 * to not break existing setups */
3219 if (al_size_4k < MD_32kB_SECT/8)
3220 goto err;
3221
3222 in_core->al_stripe_size_4k = al_stripe_size_4k;
3223 in_core->al_stripes = al_stripes;
3224 in_core->al_size_4k = al_size_4k;
3225
3226 return 0;
3227err:
3228 drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
3229 al_stripes, al_stripe_size_4k);
3230 return -EINVAL;
3231}
3232
3233static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev)
3234{
3235 sector_t capacity = drbd_get_capacity(bdev->md_bdev);
3236 struct drbd_md *in_core = &bdev->md;
3237 s32 on_disk_al_sect;
3238 s32 on_disk_bm_sect;
3239
	/* The on-disk size of the activity log, calculated from the offsets,
	 * and the size of the activity log calculated from the stripe
	 * settings should match.
	 * We could relax this a bit: it would be ok if the striped activity
	 * log merely fits into the available on-disk activity log space.
	 * Right now, though, that would break how resize is implemented.
	 * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
	 * of possible unused padding space in the on-disk layout. */
3248 if (in_core->al_offset < 0) {
3249 if (in_core->bm_offset > in_core->al_offset)
3250 goto err;
3251 on_disk_al_sect = -in_core->al_offset;
3252 on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
3253 } else {
3254 if (in_core->al_offset != MD_4kB_SECT)
3255 goto err;
3256 if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
3257 goto err;
3258
3259 on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
3260 on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
3261 }
3262
3263 /* old fixed size meta data is exactly that: fixed. */
3264 if (in_core->meta_dev_idx >= 0) {
3265 if (in_core->md_size_sect != MD_128MB_SECT
3266 || in_core->al_offset != MD_4kB_SECT
3267 || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
3268 || in_core->al_stripes != 1
3269 || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
3270 goto err;
3271 }
3272
3273 if (capacity < in_core->md_size_sect)
3274 goto err;
3275 if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
3276 goto err;
3277
3278 /* should be aligned, and at least 32k */
3279 if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
3280 goto err;
3281
3282 /* should fit (for now: exactly) into the available on-disk space;
3283 * overflow prevention is in check_activity_log_stripe_size() above. */
3284 if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
3285 goto err;
3286
3287 /* again, should be aligned */
3288 if (in_core->bm_offset & 7)
3289 goto err;
3290
3291 /* FIXME check for device grow with flex external meta data? */
3292
3293 /* can the available bitmap space cover the last agreed device size? */
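	/* One bitmap bit covers BM_BLOCK_SIZE (4 KiB), i.e. MD_4kB_SECT (8)
	 * data sectors: la_size_sect/8 gives bits, /8 gives bytes, /512 gives
	 * the on-disk bitmap sectors required for the last agreed size. */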
3294 if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
3295 goto err;
3296
3297 return 0;
3298
3299err:
3300 drbd_err(device, "meta data offsets don't make sense: idx=%d "
3301 "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
3302 "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
3303 in_core->meta_dev_idx,
3304 in_core->al_stripes, in_core->al_stripe_size_4k,
3305 in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
3306 (unsigned long long)in_core->la_size_sect,
3307 (unsigned long long)capacity);
3308
3309 return -EINVAL;
3310}
3311
3312
3313/**
3314 * drbd_md_read() - Reads in the meta data super block
3315 * @device: DRBD device.
3316 * @bdev: Device from which the meta data should be read in.
3317 *
 * Return NO_ERROR on success, or an enum drbd_ret_code error code in case
 * something goes wrong.
3320 *
3321 * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
3322 * even before @bdev is assigned to @device->ldev.
3323 */
3324int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
3325{
3326 struct meta_data_on_disk *buffer;
3327 u32 magic, flags;
3328 int i, rv = NO_ERROR;
3329
3330 if (device->state.disk != D_DISKLESS)
3331 return ERR_DISK_CONFIGURED;
3332
3333 buffer = drbd_md_get_buffer(device, __func__);
3334 if (!buffer)
3335 return ERR_NOMEM;
3336
3337 /* First, figure out where our meta data superblock is located,
3338 * and read it. */
3339 bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
3340 bdev->md.md_offset = drbd_md_ss(bdev);
3341 /* Even for (flexible or indexed) external meta data,
3342 * initially restrict us to the 4k superblock for now.
3343 * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
3344 bdev->md.md_size_sect = 8;
3345
3346 if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset,
3347 REQ_OP_READ)) {
3348 /* NOTE: can't do normal error processing here as this is
3349 called BEFORE disk is attached */
3350 drbd_err(device, "Error while reading metadata.\n");
3351 rv = ERR_IO_MD_DISK;
3352 goto err;
3353 }
3354
3355 magic = be32_to_cpu(buffer->magic);
3356 flags = be32_to_cpu(buffer->flags);
3357 if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
3358 (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
3359 /* btw: that's Activity Log clean, not "all" clean. */
3360 drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
3361 rv = ERR_MD_UNCLEAN;
3362 goto err;
3363 }
3364
3365 rv = ERR_MD_INVALID;
3366 if (magic != DRBD_MD_MAGIC_08) {
3367 if (magic == DRBD_MD_MAGIC_07)
3368 drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
3369 else
3370 drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
3371 goto err;
3372 }
3373
3374 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3375 drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3376 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3377 goto err;
3378 }
3379
3380
	/* convert from on-disk big-endian to in-core (CPU) byte order */
3382 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
3383 for (i = UI_CURRENT; i < UI_SIZE; i++)
3384 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3385 bdev->md.flags = be32_to_cpu(buffer->flags);
3386 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3387
3388 bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
3389 bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
3390 bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
3391
3392 if (check_activity_log_stripe_size(device, buffer, &bdev->md))
3393 goto err;
3394 if (check_offsets_and_sizes(device, bdev))
3395 goto err;
3396
3397 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3398 drbd_err(device, "unexpected bm_offset: %d (expected %d)\n",
3399 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3400 goto err;
3401 }
3402 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3403 drbd_err(device, "unexpected md_size: %u (expected %u)\n",
3404 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3405 goto err;
3406 }
3407
3408 rv = NO_ERROR;
3409
3410 spin_lock_irq(&device->resource->req_lock);
3411 if (device->state.conn < C_CONNECTED) {
3412 unsigned int peer;
3413 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3414 peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
3415 device->peer_max_bio_size = peer;
3416 }
3417 spin_unlock_irq(&device->resource->req_lock);
3418
3419 err:
3420 drbd_md_put_buffer(device);
3421
3422 return rv;
3423}
3424
3425/**
3426 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3427 * @device: DRBD device.
3428 *
 * Call this function if you change anything that should be written to
 * the meta-data super block. This function sets MD_DIRTY and starts a
 * timer that ensures drbd_md_sync() gets called within five seconds.
3432 */
3433#ifdef DEBUG
3434void drbd_md_mark_dirty_(struct drbd_device *device, unsigned int line, const char *func)
3435{
3436 if (!test_and_set_bit(MD_DIRTY, &device->flags)) {
3437 mod_timer(&device->md_sync_timer, jiffies + HZ);
3438 device->last_md_mark_dirty.line = line;
3439 device->last_md_mark_dirty.func = func;
3440 }
3441}
3442#else
3443void drbd_md_mark_dirty(struct drbd_device *device)
3444{
3445 if (!test_and_set_bit(MD_DIRTY, &device->flags))
3446 mod_timer(&device->md_sync_timer, jiffies + 5*HZ);
3447}
3448#endif
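
/*
 * Illustrative mark-dirty/sync pattern (a sketch, not a specific caller):
 * modify the in-core meta data, mark it dirty, and either rely on the
 * five-second timer or force the write out immediately when ordering
 * matters, e.g.
 *
 *	drbd_md_set_flag(device, MDF_FULL_SYNC);	// marks md dirty
 *	drbd_md_sync(device);				// write it out now
 *
 * drbd_bmio_set_n_write() below uses exactly this sequence.
 */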
3449
3450void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local)
3451{
3452 int i;
3453
3454 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3455 device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i];
3456}
3457
3458void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
3459{
3460 if (idx == UI_CURRENT) {
3461 if (device->state.role == R_PRIMARY)
3462 val |= 1;
3463 else
3464 val &= ~((u64)1);
3465
3466 drbd_set_ed_uuid(device, val);
3467 }
3468
3469 device->ldev->md.uuid[idx] = val;
3470 drbd_md_mark_dirty(device);
3471}
3472
3473void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
3474{
3475 unsigned long flags;
3476 spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3477 __drbd_uuid_set(device, idx, val);
3478 spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
3479}
3480
3481void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
3482{
3483 unsigned long flags;
3484 spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3485 if (device->ldev->md.uuid[idx]) {
3486 drbd_uuid_move_history(device);
3487 device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx];
3488 }
3489 __drbd_uuid_set(device, idx, val);
3490 spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
3491}
3492
3493/**
3494 * drbd_uuid_new_current() - Creates a new current UUID
3495 * @device: DRBD device.
3496 *
3497 * Creates a new current UUID, and rotates the old current UUID into
3498 * the bitmap slot. Causes an incremental resync upon next connect.
3499 */
3500void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
3501{
3502 u64 val;
3503 unsigned long long bm_uuid;
3504
3505 get_random_bytes(&val, sizeof(u64));
3506
3507 spin_lock_irq(&device->ldev->md.uuid_lock);
3508 bm_uuid = device->ldev->md.uuid[UI_BITMAP];
3509
3510 if (bm_uuid)
3511 drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
3512
3513 device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT];
3514 __drbd_uuid_set(device, UI_CURRENT, val);
3515 spin_unlock_irq(&device->ldev->md.uuid_lock);
3516
3517 drbd_print_uuids(device, "new current UUID");
3518 /* get it to stable storage _now_ */
3519 drbd_md_sync(device);
3520}
3521
3522void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
3523{
3524 unsigned long flags;
3525 if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3526 return;
3527
3528 spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3529 if (val == 0) {
3530 drbd_uuid_move_history(device);
3531 device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
3532 device->ldev->md.uuid[UI_BITMAP] = 0;
3533 } else {
3534 unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP];
3535 if (bm_uuid)
3536 drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
3537
3538 device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
3539 }
3540 spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
3541
3542 drbd_md_mark_dirty(device);
3543}
3544
3545/**
3546 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3547 * @device: DRBD device.
3548 *
3549 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3550 */
3551int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
3552{
3553 int rv = -EIO;
3554
3555 drbd_md_set_flag(device, MDF_FULL_SYNC);
3556 drbd_md_sync(device);
3557 drbd_bm_set_all(device);
3558
3559 rv = drbd_bm_write(device);
3560
3561 if (!rv) {
3562 drbd_md_clear_flag(device, MDF_FULL_SYNC);
3563 drbd_md_sync(device);
3564 }
3565
3566 return rv;
3567}
3568
3569/**
3570 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3571 * @device: DRBD device.
3572 *
3573 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3574 */
3575int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
3576{
3577 drbd_resume_al(device);
3578 drbd_bm_clear_all(device);
3579 return drbd_bm_write(device);
3580}
3581
3582static int w_bitmap_io(struct drbd_work *w, int unused)
3583{
3584 struct drbd_device *device =
3585 container_of(w, struct drbd_device, bm_io_work.w);
3586 struct bm_io_work *work = &device->bm_io_work;
3587 int rv = -EIO;
3588
3589 if (work->flags != BM_LOCKED_CHANGE_ALLOWED) {
3590 int cnt = atomic_read(&device->ap_bio_cnt);
3591 if (cnt)
3592 drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n",
3593 cnt, work->why);
3594 }
3595
3596 if (get_ldev(device)) {
3597 drbd_bm_lock(device, work->why, work->flags);
3598 rv = work->io_fn(device);
3599 drbd_bm_unlock(device);
3600 put_ldev(device);
3601 }
3602
3603 clear_bit_unlock(BITMAP_IO, &device->flags);
3604 wake_up(&device->misc_wait);
3605
3606 if (work->done)
3607 work->done(device, rv);
3608
3609 clear_bit(BITMAP_IO_QUEUED, &device->flags);
3610 work->why = NULL;
3611 work->flags = 0;
3612
3613 return 0;
3614}
3615
/**
 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
 * @device:	DRBD device.
 * @io_fn:	IO callback to be called when bitmap IO is possible
 * @done:	callback to be called after the bitmap IO was performed
 * @why:	Descriptive text of the reason for doing the IO
 * @flags:	Bitmap locking flags (enum bm_flag)
 *
 * While IO on the bitmap is in progress, we freeze application IO, thereby
 * ensuring that drbd_set_out_of_sync() cannot be called. This function MAY
 * ONLY be called from worker context. It MUST NOT be used while a previous
 * such work is still pending!
 *
 * Its worker function wraps the call to io_fn() in get_ldev() and
 * put_ldev().
 */
3631void drbd_queue_bitmap_io(struct drbd_device *device,
3632 int (*io_fn)(struct drbd_device *),
3633 void (*done)(struct drbd_device *, int),
3634 char *why, enum bm_flag flags)
3635{
3636 D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
3637
3638 D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
3639 D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
3640 D_ASSERT(device, list_empty(&device->bm_io_work.w.list));
3641 if (device->bm_io_work.why)
3642 drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
3643 why, device->bm_io_work.why);
3644
3645 device->bm_io_work.io_fn = io_fn;
3646 device->bm_io_work.done = done;
3647 device->bm_io_work.why = why;
3648 device->bm_io_work.flags = flags;
3649
3650 spin_lock_irq(&device->resource->req_lock);
3651 set_bit(BITMAP_IO, &device->flags);
	/* don't wait for pending application IO if the caller indicates that
	 * application IO does not conflict anyway. */
3654 if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
3655 if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
3656 drbd_queue_work(&first_peer_device(device)->connection->sender_work,
3657 &device->bm_io_work.w);
3658 }
3659 spin_unlock_irq(&device->resource->req_lock);
3660}
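
/*
 * Illustrative call from worker context (a sketch only; the descriptive
 * string and the locking flags depend on the caller):
 *
 *	drbd_queue_bitmap_io(device, &drbd_bmio_set_n_write, NULL,
 *			     "set_n_write from example", BM_LOCKED_MASK);
 */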
3661
/**
 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
 * @device:	DRBD device.
 * @io_fn:	IO callback to be called when bitmap IO is possible
 * @why:	Descriptive text of the reason for doing the IO
 * @flags:	Bitmap locking flags (enum bm_flag)
 *
 * Freezes application IO (if the flags require any operation to be locked
 * out) while the actual IO operation runs. This function MAY NOT be called
 * from worker context.
 */
3671int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
3672 char *why, enum bm_flag flags)
3673{
	/* Only suspend IO if some operation is supposed to be locked out */
3675 const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST);
3676 int rv;
3677
3678 D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
3679
3680 if (do_suspend_io)
3681 drbd_suspend_io(device);
3682
3683 drbd_bm_lock(device, why, flags);
3684 rv = io_fn(device);
3685 drbd_bm_unlock(device);
3686
3687 if (do_suspend_io)
3688 drbd_resume_io(device);
3689
3690 return rv;
3691}
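
/*
 * Illustrative synchronous use outside worker context (sketch only; string
 * and flags are placeholders):
 *
 *	rv = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
 *			    "clear_n_write from example", BM_LOCKED_MASK);
 */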
3692
3693void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local)
3694{
3695 if ((device->ldev->md.flags & flag) != flag) {
3696 drbd_md_mark_dirty(device);
3697 device->ldev->md.flags |= flag;
3698 }
3699}
3700
3701void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local)
3702{
3703 if ((device->ldev->md.flags & flag) != 0) {
3704 drbd_md_mark_dirty(device);
3705 device->ldev->md.flags &= ~flag;
3706 }
}

int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3709{
3710 return (bdev->md.flags & flag) != 0;
3711}
3712
3713static void md_sync_timer_fn(struct timer_list *t)
3714{
3715 struct drbd_device *device = from_timer(device, t, md_sync_timer);
3716 drbd_device_post_work(device, MD_SYNC);
3717}
3718
3719const char *cmdname(enum drbd_packet cmd)
3720{
3721 /* THINK may need to become several global tables
3722 * when we want to support more than
3723 * one PRO_VERSION */
3724 static const char *cmdnames[] = {
3725 [P_DATA] = "Data",
3726 [P_WSAME] = "WriteSame",
3727 [P_TRIM] = "Trim",
3728 [P_DATA_REPLY] = "DataReply",
3729 [P_RS_DATA_REPLY] = "RSDataReply",
3730 [P_BARRIER] = "Barrier",
3731 [P_BITMAP] = "ReportBitMap",
3732 [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
3733 [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
3734 [P_UNPLUG_REMOTE] = "UnplugRemote",
3735 [P_DATA_REQUEST] = "DataRequest",
3736 [P_RS_DATA_REQUEST] = "RSDataRequest",
3737 [P_SYNC_PARAM] = "SyncParam",
3738 [P_SYNC_PARAM89] = "SyncParam89",
3739 [P_PROTOCOL] = "ReportProtocol",
3740 [P_UUIDS] = "ReportUUIDs",
3741 [P_SIZES] = "ReportSizes",
3742 [P_STATE] = "ReportState",
3743 [P_SYNC_UUID] = "ReportSyncUUID",
3744 [P_AUTH_CHALLENGE] = "AuthChallenge",
3745 [P_AUTH_RESPONSE] = "AuthResponse",
3746 [P_PING] = "Ping",
3747 [P_PING_ACK] = "PingAck",
3748 [P_RECV_ACK] = "RecvAck",
3749 [P_WRITE_ACK] = "WriteAck",
3750 [P_RS_WRITE_ACK] = "RSWriteAck",
3751 [P_SUPERSEDED] = "Superseded",
3752 [P_NEG_ACK] = "NegAck",
3753 [P_NEG_DREPLY] = "NegDReply",
3754 [P_NEG_RS_DREPLY] = "NegRSDReply",
3755 [P_BARRIER_ACK] = "BarrierAck",
3756 [P_STATE_CHG_REQ] = "StateChgRequest",
3757 [P_STATE_CHG_REPLY] = "StateChgReply",
3758 [P_OV_REQUEST] = "OVRequest",
3759 [P_OV_REPLY] = "OVReply",
3760 [P_OV_RESULT] = "OVResult",
3761 [P_CSUM_RS_REQUEST] = "CsumRSRequest",
3762 [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
3763 [P_COMPRESSED_BITMAP] = "CBitmap",
3764 [P_DELAY_PROBE] = "DelayProbe",
3765 [P_OUT_OF_SYNC] = "OutOfSync",
3766 [P_RETRY_WRITE] = "RetryWrite",
3767 [P_RS_CANCEL] = "RSCancel",
3768 [P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
3769 [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
3771 [P_PROTOCOL_UPDATE] = "protocol_update",
3772 [P_RS_THIN_REQ] = "rs_thin_req",
3773 [P_RS_DEALLOCATED] = "rs_deallocated",
3774
3775 /* enum drbd_packet, but not commands - obsoleted flags:
3776 * P_MAY_IGNORE
3777 * P_MAX_OPT_CMD
3778 */
3779 };
3780
3781 /* too big for the array: 0xfffX */
3782 if (cmd == P_INITIAL_META)
3783 return "InitialMeta";
3784 if (cmd == P_INITIAL_DATA)
3785 return "InitialData";
3786 if (cmd == P_CONNECTION_FEATURES)
3787 return "ConnectionFeatures";
3788 if (cmd >= ARRAY_SIZE(cmdnames))
3789 return "Unknown";
3790 return cmdnames[cmd];
3791}
3792
3793/**
 * drbd_wait_misc() - wait for a request to make progress
3795 * @device: device associated with the request
3796 * @i: the struct drbd_interval embedded in struct drbd_request or
3797 * struct drbd_peer_request
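 *
 * Must be called with the resource's req_lock held; the lock is released
 * while waiting and re-acquired before returning.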
3798 */
3799int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
3800{
3801 struct net_conf *nc;
3802 DEFINE_WAIT(wait);
3803 long timeout;
3804
3805 rcu_read_lock();
3806 nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
3807 if (!nc) {
3808 rcu_read_unlock();
3809 return -ETIMEDOUT;
3810 }
3811 timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
3812 rcu_read_unlock();
3813
3814 /* Indicate to wake up device->misc_wait on progress. */
3815 i->waiting = true;
3816 prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE);
3817 spin_unlock_irq(&device->resource->req_lock);
3818 timeout = schedule_timeout(timeout);
3819 finish_wait(&device->misc_wait, &wait);
3820 spin_lock_irq(&device->resource->req_lock);
3821 if (!timeout || device->state.conn < C_CONNECTED)
3822 return -ETIMEDOUT;
3823 if (signal_pending(current))
3824 return -ERESTARTSYS;
3825 return 0;
3826}
3827
3828void lock_all_resources(void)
3829{
3830 struct drbd_resource *resource;
3831 int __maybe_unused i = 0;
3832
3833 mutex_lock(&resources_mutex);
3834 local_irq_disable();
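	/* increasing lockdep subclasses, so taking every resource's req_lock
	 * in one pass does not trigger a false lock-ordering warning */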
3835 for_each_resource(resource, &drbd_resources)
3836 spin_lock_nested(&resource->req_lock, i++);
3837}
3838
3839void unlock_all_resources(void)
3840{
3841 struct drbd_resource *resource;
3842
3843 for_each_resource(resource, &drbd_resources)
3844 spin_unlock(&resource->req_lock);
3845 local_irq_enable();
3846 mutex_unlock(&resources_mutex);
3847}
3848
3849#ifdef CONFIG_DRBD_FAULT_INJECTION
3850/* Fault insertion support including random number generator shamelessly
3851 * stolen from kernel/rcutorture.c */
3852struct fault_random_state {
3853 unsigned long state;
3854 unsigned long count;
3855};
3856
3857#define FAULT_RANDOM_MULT 39916801 /* prime */
3858#define FAULT_RANDOM_ADD 479001701 /* prime */
3859#define FAULT_RANDOM_REFRESH 10000
3860
3861/*
3862 * Crude but fast random-number generator. Uses a linear congruential
3863 * generator, with occasional help from get_random_bytes().
3864 */
3865static unsigned long
3866_drbd_fault_random(struct fault_random_state *rsp)
3867{
3868 long refresh;
3869
3870 if (!rsp->count--) {
3871 get_random_bytes(&refresh, sizeof(refresh));
3872 rsp->state += refresh;
3873 rsp->count = FAULT_RANDOM_REFRESH;
3874 }
3875 rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3876 return swahw32(rsp->state);
3877}
3878
static char *
_drbd_fault_str(unsigned int type)
{
3881 static char *_faults[] = {
3882 [DRBD_FAULT_MD_WR] = "Meta-data write",
3883 [DRBD_FAULT_MD_RD] = "Meta-data read",
3884 [DRBD_FAULT_RS_WR] = "Resync write",
3885 [DRBD_FAULT_RS_RD] = "Resync read",
3886 [DRBD_FAULT_DT_WR] = "Data write",
3887 [DRBD_FAULT_DT_RD] = "Data read",
3888 [DRBD_FAULT_DT_RA] = "Data read ahead",
3889 [DRBD_FAULT_BM_ALLOC] = "BM allocation",
3890 [DRBD_FAULT_AL_EE] = "EE allocation",
3891 [DRBD_FAULT_RECEIVE] = "receive data corruption",
3892 };
3893
3894 return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3895}
3896
3897unsigned int
3898_drbd_insert_fault(struct drbd_device *device, unsigned int type)
3899{
3900 static struct fault_random_state rrs = {0, 0};
3901
3902 unsigned int ret = (
3903 (drbd_fault_devs == 0 ||
3904 ((1 << device_to_minor(device)) & drbd_fault_devs) != 0) &&
3905 (((_drbd_fault_random(&rrs) % 100) + 1) <= drbd_fault_rate));
3906
3907 if (ret) {
3908 drbd_fault_count++;
3909
3910 if (__ratelimit(&drbd_ratelimit_state))
3911 drbd_warn(device, "***Simulating %s failure\n",
3912 _drbd_fault_str(type));
3913 }
3914
3915 return ret;
3916}
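
/*
 * Callers normally go through the drbd_insert_fault() wrapper in drbd_int.h,
 * roughly like this hypothetical sketch:
 *
 *	if (drbd_insert_fault(device, DRBD_FAULT_MD_WR))
 *		bio_io_error(bio);	// simulate a failed meta data write
 *	else
 *		submit_bio(bio);
 */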
3917#endif
3918
3919const char *drbd_buildtag(void)
3920{
3921 /* DRBD built from external sources has here a reference to the
3922 git hash of the source code. */
3923
3924 static char buildtag[38] = "\0uilt-in";
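	/* The leading NUL is replaced with 'b' below when built into the
	 * kernel, turning the string into "built-in"; as a module, the
	 * srcversion string is formatted into the buffer instead. */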
3925
3926 if (buildtag[0] == 0) {
3927#ifdef MODULE
3928 sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3929#else
3930 buildtag[0] = 'b';
3931#endif
3932 }
3933
3934 return buildtag;
3935}
3936
3937module_init(drbd_init)
3938module_exit(drbd_cleanup)
3939
3940EXPORT_SYMBOL(drbd_conn_str);
3941EXPORT_SYMBOL(drbd_role_str);
3942EXPORT_SYMBOL(drbd_disk_str);
3943EXPORT_SYMBOL(drbd_set_st_err_str);