   1/*
   2   drbd.c
   3
   4   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   7   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
  11   from Logicworks, Inc. for making SDP replication support possible.
  12
  13   drbd is free software; you can redistribute it and/or modify
  14   it under the terms of the GNU General Public License as published by
  15   the Free Software Foundation; either version 2, or (at your option)
  16   any later version.
  17
  18   drbd is distributed in the hope that it will be useful,
  19   but WITHOUT ANY WARRANTY; without even the implied warranty of
  20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21   GNU General Public License for more details.
  22
  23   You should have received a copy of the GNU General Public License
  24   along with drbd; see the file COPYING.  If not, write to
  25   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  26
  27 */
  28
  29#include <linux/module.h>
  30#include <linux/drbd.h>
  31#include <asm/uaccess.h>
  32#include <asm/types.h>
  33#include <net/sock.h>
  34#include <linux/ctype.h>
  35#include <linux/mutex.h>
  36#include <linux/fs.h>
  37#include <linux/file.h>
  38#include <linux/proc_fs.h>
  39#include <linux/init.h>
  40#include <linux/mm.h>
  41#include <linux/memcontrol.h>
  42#include <linux/mm_inline.h>
  43#include <linux/slab.h>
  44#include <linux/random.h>
  45#include <linux/reboot.h>
  46#include <linux/notifier.h>
  47#include <linux/kthread.h>
  48
  49#define __KERNEL_SYSCALLS__
  50#include <linux/unistd.h>
  51#include <linux/vmalloc.h>
  52
  53#include <linux/drbd_limits.h>
  54#include "drbd_int.h"
  55#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
  56
  57#include "drbd_vli.h"
  58
  59struct after_state_chg_work {
  60	struct drbd_work w;
  61	union drbd_state os;
  62	union drbd_state ns;
  63	enum chg_state_flags flags;
  64	struct completion *done;
  65};
  66
  67static DEFINE_MUTEX(drbd_main_mutex);
  68int drbdd_init(struct drbd_thread *);
  69int drbd_worker(struct drbd_thread *);
  70int drbd_asender(struct drbd_thread *);
  71
  72int drbd_init(void);
  73static int drbd_open(struct block_device *bdev, fmode_t mode);
  74static int drbd_release(struct gendisk *gd, fmode_t mode);
  75static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
  76static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
  77			   union drbd_state ns, enum chg_state_flags flags);
  78static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
  79static void md_sync_timer_fn(unsigned long data);
  80static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
  81static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
  82
  83MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
  84	      "Lars Ellenberg <lars@linbit.com>");
  85MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
  86MODULE_VERSION(REL_VERSION);
  87MODULE_LICENSE("GPL");
  88MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
  89		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
  90MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
  91
  92#include <linux/moduleparam.h>
  93/* allow_open_on_secondary */
  94MODULE_PARM_DESC(allow_oos, "DONT USE!");
  95/* thanks to these macros, if compiled into the kernel (not as a module),
  96 * this becomes the boot parameter drbd.minor_count */
  97module_param(minor_count, uint, 0444);
  98module_param(disable_sendpage, bool, 0644);
  99module_param(allow_oos, bool, 0);
 100module_param(cn_idx, uint, 0444);
 101module_param(proc_details, int, 0644);
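/* The variables these module_param() calls refer to (minor_count,
 * disable_sendpage, allow_oos, cn_idx, proc_details) are defined further
 * below in this file.  Parameters registered with mode 0644 can also be
 * changed at runtime via /sys/module/drbd/parameters/, the 0444 ones are
 * read-only there, and allow_oos (mode 0) is not exported to sysfs at all. */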
 102
 103#ifdef CONFIG_DRBD_FAULT_INJECTION
 104int enable_faults;
 105int fault_rate;
 106static int fault_count;
 107int fault_devs;
 108/* bitmap of enabled faults */
 109module_param(enable_faults, int, 0664);
 110/* fault rate % value - applies to all enabled faults */
 111module_param(fault_rate, int, 0664);
 112/* count of faults inserted */
 113module_param(fault_count, int, 0664);
 114/* bitmap of devices to insert faults on */
 115module_param(fault_devs, int, 0644);
 116#endif
 117
 118/* module parameter, defined */
 119unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
 120int disable_sendpage;
 121int allow_oos;
 122unsigned int cn_idx = CN_IDX_DRBD;
 123int proc_details;       /* Detail level in /proc/drbd */
 124
 125/* Module parameter for setting the user mode helper program
 126 * to run. Default is /sbin/drbdadm */
 127char usermode_helper[80] = "/sbin/drbdadm";
 128
 129module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
 130
 131/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 132 * as member "struct gendisk *vdisk;"
 133 */
 134struct drbd_conf **minor_table;
 135
 136struct kmem_cache *drbd_request_cache;
 137struct kmem_cache *drbd_ee_cache;	/* epoch entries */
 138struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 139struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
 140mempool_t *drbd_request_mempool;
 141mempool_t *drbd_ee_mempool;
 142
 143/* I do not use a standard mempool, because:
 144   1) I want to hand out the pre-allocated objects first.
 145   2) I want to be able to interrupt sleeping allocation with a signal.
 146   Note: This is a singly linked list; the next pointer is the private
 147	 member of struct page.
 148 */
 149struct page *drbd_pp_pool;
 150spinlock_t   drbd_pp_lock;
 151int          drbd_pp_vacant;
 152wait_queue_head_t drbd_pp_wait;
 153
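/* Shared rate-limit state: at most 5 messages per 5*HZ interval; presumably
 * consulted via __ratelimit() at the driver's noisier log sites. */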
 154DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
 155
 156static const struct block_device_operations drbd_ops = {
 157	.owner =   THIS_MODULE,
 158	.open =    drbd_open,
 159	.release = drbd_release,
 160};
 161
 162#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
 163
 164#ifdef __CHECKER__
 165/* When checking with sparse, and this is an inline function, sparse will
 166   give tons of false positives. When this is a real function, sparse works.
 167 */
 168int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
 169{
 170	int io_allowed;
 171
 172	atomic_inc(&mdev->local_cnt);
 173	io_allowed = (mdev->state.disk >= mins);
 174	if (!io_allowed) {
 175		if (atomic_dec_and_test(&mdev->local_cnt))
 176			wake_up(&mdev->misc_wait);
 177	}
 178	return io_allowed;
 179}
 180
 181#endif
 182
 183/**
 184 * DOC: The transfer log
 185 *
 186 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
 187 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
 188 * of the list. There is always at least one &struct drbd_tl_epoch object.
 189 *
 190 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
 191 * attached.
 192 */
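
/* Rough life cycle, as suggested by the functions below: tl_init() allocates
 * the first epoch during minor initialization, _tl_add_barrier()/tl_release()
 * add and retire epochs while connected, tl_clear() empties everything on
 * connection loss, and tl_cleanup() presumably frees the rest at teardown. */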
 193static int tl_init(struct drbd_conf *mdev)
 194{
 195	struct drbd_tl_epoch *b;
 196
 197	/* during device minor initialization, we may well use GFP_KERNEL */
 198	b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
 199	if (!b)
 200		return 0;
 201	INIT_LIST_HEAD(&b->requests);
 202	INIT_LIST_HEAD(&b->w.list);
 203	b->next = NULL;
 204	b->br_number = 4711;
 205	b->n_writes = 0;
 206	b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
 207
 208	mdev->oldest_tle = b;
 209	mdev->newest_tle = b;
 210	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
 211
 212	mdev->tl_hash = NULL;
 213	mdev->tl_hash_s = 0;
 214
 215	return 1;
 216}
 217
 218static void tl_cleanup(struct drbd_conf *mdev)
 219{
 220	D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
 221	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
 222	kfree(mdev->oldest_tle);
 223	mdev->oldest_tle = NULL;
 224	kfree(mdev->unused_spare_tle);
 225	mdev->unused_spare_tle = NULL;
 226	kfree(mdev->tl_hash);
 227	mdev->tl_hash = NULL;
 228	mdev->tl_hash_s = 0;
 229}
 230
 231/**
 232 * _tl_add_barrier() - Adds a barrier to the transfer log
 233 * @mdev:	DRBD device.
 234 * @new:	Barrier to be added before the current head of the TL.
 235 *
 236 * The caller must hold the req_lock.
 237 */
 238void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
 239{
 240	struct drbd_tl_epoch *newest_before;
 241
 242	INIT_LIST_HEAD(&new->requests);
 243	INIT_LIST_HEAD(&new->w.list);
 244	new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
 245	new->next = NULL;
 246	new->n_writes = 0;
 247
 248	newest_before = mdev->newest_tle;
 249	/* never send a barrier number == 0, because that is special-cased
 250	 * when using TCQ for our write ordering code */
 251	new->br_number = (newest_before->br_number+1) ?: 1;
 252	if (mdev->newest_tle != new) {
 253		mdev->newest_tle->next = new;
 254		mdev->newest_tle = new;
 255	}
 256}
 257
 258/**
 259 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
 260 * @mdev:	DRBD device.
 261 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 262 * @set_size:	Expected number of requests before that barrier.
 263 *
 264 * In case the passed barrier_nr or set_size does not match the oldest
 265 * &struct drbd_tl_epoch objects this function will cause a termination
 266 * of the connection.
 267 */
 268void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 269		       unsigned int set_size)
 270{
 271	struct drbd_tl_epoch *b, *nob; /* next old barrier */
 272	struct list_head *le, *tle;
 273	struct drbd_request *r;
 274
 275	spin_lock_irq(&mdev->req_lock);
 276
 277	b = mdev->oldest_tle;
 278
 279	/* first some paranoia code */
 280	if (b == NULL) {
 281		dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
 282			barrier_nr);
 283		goto bail;
 284	}
 285	if (b->br_number != barrier_nr) {
 286		dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
 287			barrier_nr, b->br_number);
 288		goto bail;
 289	}
 290	if (b->n_writes != set_size) {
 291		dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
 292			barrier_nr, set_size, b->n_writes);
 293		goto bail;
 294	}
 295
 296	/* Clean up list of requests processed during current epoch */
 297	list_for_each_safe(le, tle, &b->requests) {
 298		r = list_entry(le, struct drbd_request, tl_requests);
 299		_req_mod(r, barrier_acked);
 300	}
 301	/* There could be requests on the list waiting for completion
 302	   of the write to the local disk. To avoid corruption of
 303	   the slab's data structures we have to remove the list's head.
 304
 305	   Also there could have been a barrier ack out of sequence, overtaking
 306	   the write acks - which would be a bug and violating write ordering.
 307	   To not deadlock in case we lose connection while such requests are
 308	   still pending, we need some way to find them for the
 309	   _req_mod(connection_lost_while_pending).
 310
 311	   These have been list_move'd to the out_of_sequence_requests list in
 312	   _req_mod(, barrier_acked) above.
 313	   */
 314	list_del_init(&b->requests);
 315
 316	nob = b->next;
 317	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
 318		_tl_add_barrier(mdev, b);
 319		if (nob)
 320			mdev->oldest_tle = nob;
 321		/* if nob == NULL, b was the only barrier, and becomes the new
 322		   barrier. Therefore mdev->oldest_tle already points to b */
 323	} else {
 324		D_ASSERT(nob != NULL);
 325		mdev->oldest_tle = nob;
 326		kfree(b);
 327	}
 328
 329	spin_unlock_irq(&mdev->req_lock);
 330	dec_ap_pending(mdev);
 331
 332	return;
 333
 334bail:
 335	spin_unlock_irq(&mdev->req_lock);
 336	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 337}
 338
 339
 340/**
 341 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 342 * @mdev:	DRBD device.
 343 * @what:       The action/event to perform with all request objects
 344 *
 345 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
 346 * restart_frozen_disk_io.
 347 */
 348static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 349{
 350	struct drbd_tl_epoch *b, *tmp, **pn;
 351	struct list_head *le, *tle, carry_reads;
 352	struct drbd_request *req;
 353	int rv, n_writes, n_reads;
 354
 355	b = mdev->oldest_tle;
 356	pn = &mdev->oldest_tle;
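	/* Walk the epochs from oldest to newest.  Epochs that still contain
	 * writes are kept (and, for "resend", re-queued so that a new barrier
	 * is sent); epochs holding only reads are unlinked and freed, their
	 * reads carried over into the following epoch via carry_reads. */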
 357	while (b) {
 358		n_writes = 0;
 359		n_reads = 0;
 360		INIT_LIST_HEAD(&carry_reads);
 361		list_for_each_safe(le, tle, &b->requests) {
 362			req = list_entry(le, struct drbd_request, tl_requests);
 363			rv = _req_mod(req, what);
 364
 365			n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
 366			n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
 367		}
 368		tmp = b->next;
 369
 370		if (n_writes) {
 371			if (what == resend) {
 372				b->n_writes = n_writes;
 373				if (b->w.cb == NULL) {
 374					b->w.cb = w_send_barrier;
 375					inc_ap_pending(mdev);
 376					set_bit(CREATE_BARRIER, &mdev->flags);
 377				}
 378
 379				drbd_queue_work(&mdev->data.work, &b->w);
 380			}
 381			pn = &b->next;
 382		} else {
 383			if (n_reads)
 384				list_add(&carry_reads, &b->requests);
 385			/* there could still be requests on that ring list,
 386			 * in case local io is still pending */
 387			list_del(&b->requests);
 388
 389			/* dec_ap_pending corresponding to queue_barrier.
 390			 * the newest barrier may not have been queued yet,
 391			 * in which case w.cb is still NULL. */
 392			if (b->w.cb != NULL)
 393				dec_ap_pending(mdev);
 394
 395			if (b == mdev->newest_tle) {
 396				/* recycle, but reinit! */
 397				D_ASSERT(tmp == NULL);
 398				INIT_LIST_HEAD(&b->requests);
 399				list_splice(&carry_reads, &b->requests);
 400				INIT_LIST_HEAD(&b->w.list);
 401				b->w.cb = NULL;
 402				b->br_number = net_random();
 403				b->n_writes = 0;
 404
 405				*pn = b;
 406				break;
 407			}
 408			*pn = tmp;
 409			kfree(b);
 410		}
 411		b = tmp;
 412		list_splice(&carry_reads, &b->requests);
 413	}
 414}
 415
 416
 417/**
 418 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 419 * @mdev:	DRBD device.
 420 *
 421 * This is called after the connection to the peer was lost. The storage covered
 422 * by the requests on the transfer log gets marked as out of sync. Called from the
 423 * receiver thread and the worker thread.
 424 */
 425void tl_clear(struct drbd_conf *mdev)
 426{
 427	struct list_head *le, *tle;
 428	struct drbd_request *r;
 429
 430	spin_lock_irq(&mdev->req_lock);
 431
 432	_tl_restart(mdev, connection_lost_while_pending);
 433
 434	/* we expect this list to be empty. */
 435	D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
 436
 437	/* but just in case, clean it up anyway! */
 438	list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
 439		r = list_entry(le, struct drbd_request, tl_requests);
 440		/* It would be nice to complete outside of spinlock.
 441		 * But this is easier for now. */
 442		_req_mod(r, connection_lost_while_pending);
 443	}
 444
 445	/* ensure bit indicating barrier is required is clear */
 446	clear_bit(CREATE_BARRIER, &mdev->flags);
 447
 448	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 449
 450	spin_unlock_irq(&mdev->req_lock);
 451}
 452
 453void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 454{
 455	spin_lock_irq(&mdev->req_lock);
 456	_tl_restart(mdev, what);
 457	spin_unlock_irq(&mdev->req_lock);
 458}
 459
 460/**
 461 * cl_wide_st_chg() - true if the state change is a cluster wide one
 462 * @mdev:	DRBD device.
 463 * @os:		old (current) state.
 464 * @ns:		new (wanted) state.
 465 */
 466static int cl_wide_st_chg(struct drbd_conf *mdev,
 467			  union drbd_state os, union drbd_state ns)
 468{
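	/* "Cluster wide" means the peer has to agree first: promotion to
	 * Primary, starting a resync in either direction, detaching the local
	 * disk, disconnecting while connected, or starting an online verify. */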
 469	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
 470		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
 471		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
 472		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
 473		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
 474		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
 475		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
 476}
 477
 478enum drbd_state_rv
 479drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
 480		  union drbd_state mask, union drbd_state val)
 481{
 482	unsigned long flags;
 483	union drbd_state os, ns;
 484	enum drbd_state_rv rv;
 485
 486	spin_lock_irqsave(&mdev->req_lock, flags);
 487	os = mdev->state;
 488	ns.i = (os.i & ~mask.i) | val.i;
 489	rv = _drbd_set_state(mdev, ns, f, NULL);
 490	ns = mdev->state;
 491	spin_unlock_irqrestore(&mdev->req_lock, flags);
 492
 493	return rv;
 494}
 495
 496/**
 497 * drbd_force_state() - Impose a change which happens outside our control on our state
 498 * @mdev:	DRBD device.
 499 * @mask:	mask of state bits to change.
 500 * @val:	value of new state bits.
 501 */
 502void drbd_force_state(struct drbd_conf *mdev,
 503	union drbd_state mask, union drbd_state val)
 504{
 505	drbd_change_state(mdev, CS_HARD, mask, val);
 506}
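
/* Typical use, as in tl_release()'s error path above:
 *	drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
 * CS_HARD makes __drbd_set_state() skip its pre-state-change checks, so the
 * change is not refused by the validity checks. */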
 507
 508static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
 509static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
 510						    union drbd_state,
 511						    union drbd_state);
 512static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
 513				       union drbd_state ns, const char **warn_sync_abort);
 514int drbd_send_state_req(struct drbd_conf *,
 515			union drbd_state, union drbd_state);
 516
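/* Wait condition used by drbd_req_state() for a cluster-wide change: the
 * CL_ST_CHG_SUCCESS / CL_ST_CHG_FAIL bits are presumably set once the peer's
 * reply arrives; until then we re-check locally whether the request could
 * still succeed and, if so, return SS_UNKNOWN_ERROR ("cont waiting", see the
 * comment below) so the caller keeps sleeping. */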
 517static enum drbd_state_rv
 518_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
 519	     union drbd_state val)
 520{
 521	union drbd_state os, ns;
 522	unsigned long flags;
 523	enum drbd_state_rv rv;
 524
 525	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
 526		return SS_CW_SUCCESS;
 527
 528	if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
 529		return SS_CW_FAILED_BY_PEER;
 530
 531	rv = 0;
 532	spin_lock_irqsave(&mdev->req_lock, flags);
 533	os = mdev->state;
 534	ns.i = (os.i & ~mask.i) | val.i;
 535	ns = sanitize_state(mdev, os, ns, NULL);
 536
 537	if (!cl_wide_st_chg(mdev, os, ns))
 538		rv = SS_CW_NO_NEED;
 539	if (!rv) {
 540		rv = is_valid_state(mdev, ns);
 541		if (rv == SS_SUCCESS) {
 542			rv = is_valid_state_transition(mdev, ns, os);
 543			if (rv == SS_SUCCESS)
 544				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
 545		}
 546	}
 547	spin_unlock_irqrestore(&mdev->req_lock, flags);
 548
 549	return rv;
 550}
 551
 552/**
 553 * drbd_req_state() - Perform a possibly cluster-wide state change
 554 * @mdev:	DRBD device.
 555 * @mask:	mask of state bits to change.
 556 * @val:	value of new state bits.
 557 * @f:		flags
 558 *
 559 * Should not be called directly, use drbd_request_state() or
 560 * _drbd_request_state().
 561 */
 562static enum drbd_state_rv
 563drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
 564	       union drbd_state val, enum chg_state_flags f)
 565{
 566	struct completion done;
 567	unsigned long flags;
 568	union drbd_state os, ns;
 569	enum drbd_state_rv rv;
 570
 571	init_completion(&done);
 572
 573	if (f & CS_SERIALIZE)
 574		mutex_lock(&mdev->state_mutex);
 575
 576	spin_lock_irqsave(&mdev->req_lock, flags);
 577	os = mdev->state;
 578	ns.i = (os.i & ~mask.i) | val.i;
 579	ns = sanitize_state(mdev, os, ns, NULL);
 580
 581	if (cl_wide_st_chg(mdev, os, ns)) {
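		/* Cluster-wide path: validate locally, send the request to the
		 * peer, sleep in wait_event() until _req_st_cond() sees the
		 * peer's verdict, and only then apply the state change. */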
 582		rv = is_valid_state(mdev, ns);
 583		if (rv == SS_SUCCESS)
 584			rv = is_valid_state_transition(mdev, ns, os);
 585		spin_unlock_irqrestore(&mdev->req_lock, flags);
 586
 587		if (rv < SS_SUCCESS) {
 588			if (f & CS_VERBOSE)
 589				print_st_err(mdev, os, ns, rv);
 590			goto abort;
 591		}
 592
 593		drbd_state_lock(mdev);
 594		if (!drbd_send_state_req(mdev, mask, val)) {
 595			drbd_state_unlock(mdev);
 596			rv = SS_CW_FAILED_BY_PEER;
 597			if (f & CS_VERBOSE)
 598				print_st_err(mdev, os, ns, rv);
 599			goto abort;
 600		}
 601
 602		wait_event(mdev->state_wait,
 603			(rv = _req_st_cond(mdev, mask, val)));
 604
 605		if (rv < SS_SUCCESS) {
 606			drbd_state_unlock(mdev);
 607			if (f & CS_VERBOSE)
 608				print_st_err(mdev, os, ns, rv);
 609			goto abort;
 610		}
 611		spin_lock_irqsave(&mdev->req_lock, flags);
 612		os = mdev->state;
 613		ns.i = (os.i & ~mask.i) | val.i;
 614		rv = _drbd_set_state(mdev, ns, f, &done);
 615		drbd_state_unlock(mdev);
 616	} else {
 617		rv = _drbd_set_state(mdev, ns, f, &done);
 618	}
 619
 620	spin_unlock_irqrestore(&mdev->req_lock, flags);
 621
 622	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
 623		D_ASSERT(current != mdev->worker.task);
 624		wait_for_completion(&done);
 625	}
 626
 627abort:
 628	if (f & CS_SERIALIZE)
 629		mutex_unlock(&mdev->state_mutex);
 630
 631	return rv;
 632}
 633
 634/**
 635 * _drbd_request_state() - Request a state change (with flags)
 636 * @mdev:	DRBD device.
 637 * @mask:	mask of state bits to change.
 638 * @val:	value of new state bits.
 639 * @f:		flags
 640 *
 641 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
 642 * flag, or when logging of failed state change requests is not desired.
 643 */
 644enum drbd_state_rv
 645_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
 646		    union drbd_state val, enum chg_state_flags f)
 647{
 648	enum drbd_state_rv rv;
 649
 650	wait_event(mdev->state_wait,
 651		   (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
 652
 653	return rv;
 654}
 655
 656static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
 657{
 658	dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
 659	    name,
 660	    drbd_conn_str(ns.conn),
 661	    drbd_role_str(ns.role),
 662	    drbd_role_str(ns.peer),
 663	    drbd_disk_str(ns.disk),
 664	    drbd_disk_str(ns.pdsk),
 665	    is_susp(ns) ? 's' : 'r',
 666	    ns.aftr_isp ? 'a' : '-',
 667	    ns.peer_isp ? 'p' : '-',
 668	    ns.user_isp ? 'u' : '-'
 669	    );
 670}
 671
 672void print_st_err(struct drbd_conf *mdev, union drbd_state os,
 673	          union drbd_state ns, enum drbd_state_rv err)
 674{
 675	if (err == SS_IN_TRANSIENT_STATE)
 676		return;
 677	dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
 678	print_st(mdev, " state", os);
 679	print_st(mdev, "wanted", ns);
 680}
 681
 682
 683/**
 684 * is_valid_state() - Returns an SS_ error code if ns is not valid
 685 * @mdev:	DRBD device.
 686 * @ns:		State to consider.
 687 */
 688static enum drbd_state_rv
 689is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
 690{
 691	/* See drbd_state_sw_errors in drbd_strings.c */
 692
 693	enum drbd_fencing_p fp;
 694	enum drbd_state_rv rv = SS_SUCCESS;
 695
 696	fp = FP_DONT_CARE;
 697	if (get_ldev(mdev)) {
 698		fp = mdev->ldev->dc.fencing;
 699		put_ldev(mdev);
 700	}
 701
 702	if (get_net_conf(mdev)) {
 703		if (!mdev->net_conf->two_primaries &&
 704		    ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
 705			rv = SS_TWO_PRIMARIES;
 706		put_net_conf(mdev);
 707	}
 708
 709	if (rv <= 0)
 710		/* already found a reason to abort */;
 711	else if (ns.role == R_SECONDARY && mdev->open_cnt)
 712		rv = SS_DEVICE_IN_USE;
 713
 714	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
 715		rv = SS_NO_UP_TO_DATE_DISK;
 716
 717	else if (fp >= FP_RESOURCE &&
 718		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
 719		rv = SS_PRIMARY_NOP;
 720
 721	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
 722		rv = SS_NO_UP_TO_DATE_DISK;
 723
 724	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
 725		rv = SS_NO_LOCAL_DISK;
 726
 727	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
 728		rv = SS_NO_REMOTE_DISK;
 729
 730	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
 731		rv = SS_NO_UP_TO_DATE_DISK;
 732
 733	else if ((ns.conn == C_CONNECTED ||
 734		  ns.conn == C_WF_BITMAP_S ||
 735		  ns.conn == C_SYNC_SOURCE ||
 736		  ns.conn == C_PAUSED_SYNC_S) &&
 737		  ns.disk == D_OUTDATED)
 738		rv = SS_CONNECTED_OUTDATES;
 739
 740	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
 741		 (mdev->sync_conf.verify_alg[0] == 0))
 742		rv = SS_NO_VERIFY_ALG;
 743
 744	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
 745		  mdev->agreed_pro_version < 88)
 746		rv = SS_NOT_SUPPORTED;
 747
 748	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
 749		rv = SS_CONNECTED_OUTDATES;
 750
 751	return rv;
 752}
 753
 754/**
 755 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
 756 * @mdev:	DRBD device.
 757 * @ns:		new state.
 758 * @os:		old state.
 759 */
 760static enum drbd_state_rv
 761is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
 762			  union drbd_state os)
 763{
 764	enum drbd_state_rv rv = SS_SUCCESS;
 765
 766	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
 767	    os.conn > C_CONNECTED)
 768		rv = SS_RESYNC_RUNNING;
 769
 770	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
 771		rv = SS_ALREADY_STANDALONE;
 772
 773	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
 774		rv = SS_IS_DISKLESS;
 775
 776	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
 777		rv = SS_NO_NET_CONFIG;
 778
 779	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
 780		rv = SS_LOWER_THAN_OUTDATED;
 781
 782	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
 783		rv = SS_IN_TRANSIENT_STATE;
 784
 785	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
 786		rv = SS_IN_TRANSIENT_STATE;
 787
 788	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
 789		rv = SS_NEED_CONNECTION;
 790
 791	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
 792	    ns.conn != os.conn && os.conn > C_CONNECTED)
 793		rv = SS_RESYNC_RUNNING;
 794
 795	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
 796	    os.conn < C_CONNECTED)
 797		rv = SS_NEED_CONNECTION;
 798
 799	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
 800	    && os.conn < C_WF_REPORT_PARAMS)
 801		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
 802
 803	return rv;
 804}
 805
 806/**
 807 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
 808 * @mdev:	DRBD device.
 809 * @os:		old state.
 810 * @ns:		new state.
 811 * @warn_sync_abort:
 812 *
 813 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
 814 * to D_UNKNOWN. This rule and many more along those lines are in this function.
 815 */
 816static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
 817				       union drbd_state ns, const char **warn_sync_abort)
 818{
 819	enum drbd_fencing_p fp;
 820	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
 821
 822	fp = FP_DONT_CARE;
 823	if (get_ldev(mdev)) {
 824		fp = mdev->ldev->dc.fencing;
 825		put_ldev(mdev);
 826	}
 827
 828	/* Disallow Network errors to configure a device's network part */
 829	if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
 830	    os.conn <= C_DISCONNECTING)
 831		ns.conn = os.conn;
 832
 833	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
 834	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
 835	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
 836	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
 837		ns.conn = os.conn;
 838
 839	/* we cannot fail (again) if we already detached */
 840	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
 841		ns.disk = D_DISKLESS;
 842
 843	/* if we are only D_ATTACHING yet,
 844	 * we can (and should) go directly to D_DISKLESS. */
 845	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
 846		ns.disk = D_DISKLESS;
 847
 848	/* After C_DISCONNECTING only C_STANDALONE may follow */
 849	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
 850		ns.conn = os.conn;
 851
 852	if (ns.conn < C_CONNECTED) {
 853		ns.peer_isp = 0;
 854		ns.peer = R_UNKNOWN;
 855		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
 856			ns.pdsk = D_UNKNOWN;
 857	}
 858
 859	/* Clear the aftr_isp when becoming unconfigured */
 860	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
 861		ns.aftr_isp = 0;
 862
 863	/* Abort resync if a disk fails/detaches */
 864	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
 865	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
 866		if (warn_sync_abort)
 867			*warn_sync_abort =
 868				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
 869				"Online-verify" : "Resync";
 870		ns.conn = C_CONNECTED;
 871	}
 872
 873	/* Connection breaks down before we finished "Negotiating" */
 874	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
 875	    get_ldev_if_state(mdev, D_NEGOTIATING)) {
 876		if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
 877			ns.disk = mdev->new_state_tmp.disk;
 878			ns.pdsk = mdev->new_state_tmp.pdsk;
 879		} else {
 880			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
 881			ns.disk = D_DISKLESS;
 882			ns.pdsk = D_UNKNOWN;
 883		}
 884		put_ldev(mdev);
 885	}
 886
 887	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
 888	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
 889		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
 890			ns.disk = D_UP_TO_DATE;
 891		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
 892			ns.pdsk = D_UP_TO_DATE;
 893	}
 894
 895	/* Implications of the connection state on the disk states */
 896	disk_min = D_DISKLESS;
 897	disk_max = D_UP_TO_DATE;
 898	pdsk_min = D_INCONSISTENT;
 899	pdsk_max = D_UNKNOWN;
 900	switch ((enum drbd_conns)ns.conn) {
 901	case C_WF_BITMAP_T:
 902	case C_PAUSED_SYNC_T:
 903	case C_STARTING_SYNC_T:
 904	case C_WF_SYNC_UUID:
 905	case C_BEHIND:
 906		disk_min = D_INCONSISTENT;
 907		disk_max = D_OUTDATED;
 908		pdsk_min = D_UP_TO_DATE;
 909		pdsk_max = D_UP_TO_DATE;
 910		break;
 911	case C_VERIFY_S:
 912	case C_VERIFY_T:
 913		disk_min = D_UP_TO_DATE;
 914		disk_max = D_UP_TO_DATE;
 915		pdsk_min = D_UP_TO_DATE;
 916		pdsk_max = D_UP_TO_DATE;
 917		break;
 918	case C_CONNECTED:
 919		disk_min = D_DISKLESS;
 920		disk_max = D_UP_TO_DATE;
 921		pdsk_min = D_DISKLESS;
 922		pdsk_max = D_UP_TO_DATE;
 923		break;
 924	case C_WF_BITMAP_S:
 925	case C_PAUSED_SYNC_S:
 926	case C_STARTING_SYNC_S:
 927	case C_AHEAD:
 928		disk_min = D_UP_TO_DATE;
 929		disk_max = D_UP_TO_DATE;
 930		pdsk_min = D_INCONSISTENT;
 931		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
 932		break;
 933	case C_SYNC_TARGET:
 934		disk_min = D_INCONSISTENT;
 935		disk_max = D_INCONSISTENT;
 936		pdsk_min = D_UP_TO_DATE;
 937		pdsk_max = D_UP_TO_DATE;
 938		break;
 939	case C_SYNC_SOURCE:
 940		disk_min = D_UP_TO_DATE;
 941		disk_max = D_UP_TO_DATE;
 942		pdsk_min = D_INCONSISTENT;
 943		pdsk_max = D_INCONSISTENT;
 944		break;
 945	case C_STANDALONE:
 946	case C_DISCONNECTING:
 947	case C_UNCONNECTED:
 948	case C_TIMEOUT:
 949	case C_BROKEN_PIPE:
 950	case C_NETWORK_FAILURE:
 951	case C_PROTOCOL_ERROR:
 952	case C_TEAR_DOWN:
 953	case C_WF_CONNECTION:
 954	case C_WF_REPORT_PARAMS:
 955	case C_MASK:
 956		break;
 957	}
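	/* Clamp disk and pdsk into the range the connection state allows;
	 * dropping below the minimum is unexpected and therefore logged. */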
 958	if (ns.disk > disk_max)
 959		ns.disk = disk_max;
 960
 961	if (ns.disk < disk_min) {
 962		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
 963			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
 964		ns.disk = disk_min;
 965	}
 966	if (ns.pdsk > pdsk_max)
 967		ns.pdsk = pdsk_max;
 968
 969	if (ns.pdsk < pdsk_min) {
 970		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
 971			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
 972		ns.pdsk = pdsk_min;
 973	}
 974
 975	if (fp == FP_STONITH &&
 976	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
 977	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
 978		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
 979
 980	if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
 981	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
 982	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
 983		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
 984
 985	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
 986		if (ns.conn == C_SYNC_SOURCE)
 987			ns.conn = C_PAUSED_SYNC_S;
 988		if (ns.conn == C_SYNC_TARGET)
 989			ns.conn = C_PAUSED_SYNC_T;
 990	} else {
 991		if (ns.conn == C_PAUSED_SYNC_S)
 992			ns.conn = C_SYNC_SOURCE;
 993		if (ns.conn == C_PAUSED_SYNC_T)
 994			ns.conn = C_SYNC_TARGET;
 995	}
 996
 997	return ns;
 998}
 999
1000/* helper for __drbd_set_state */
1001static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1002{
1003	if (mdev->agreed_pro_version < 90)
1004		mdev->ov_start_sector = 0;
1005	mdev->rs_total = drbd_bm_bits(mdev);
1006	mdev->ov_position = 0;
1007	if (cs == C_VERIFY_T) {
1008		/* starting online verify from an arbitrary position
1009		 * does not fit well into the existing protocol.
1010		 * on C_VERIFY_T, we initialize ov_left and friends
1011		 * implicitly in receive_DataRequest once the
1012		 * first P_OV_REQUEST is received */
1013		mdev->ov_start_sector = ~(sector_t)0;
1014	} else {
1015		unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1016		if (bit >= mdev->rs_total) {
1017			mdev->ov_start_sector =
1018				BM_BIT_TO_SECT(mdev->rs_total - 1);
1019			mdev->rs_total = 1;
1020		} else
1021			mdev->rs_total -= bit;
1022		mdev->ov_position = mdev->ov_start_sector;
1023	}
1024	mdev->ov_left = mdev->rs_total;
1025}
1026
1027static void drbd_resume_al(struct drbd_conf *mdev)
1028{
1029	if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
1030		dev_info(DEV, "Resumed AL updates\n");
1031}
1032
1033/**
1034 * __drbd_set_state() - Set a new DRBD state
1035 * @mdev:	DRBD device.
1036 * @ns:		new state.
1037 * @flags:	Flags
1038 * @done:	Optional completion, that will get completed after the after_state_ch() finished
1039 *
1040 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1041 */
1042enum drbd_state_rv
1043__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1044	         enum chg_state_flags flags, struct completion *done)
1045{
1046	union drbd_state os;
1047	enum drbd_state_rv rv = SS_SUCCESS;
1048	const char *warn_sync_abort = NULL;
1049	struct after_state_chg_work *ascw;
1050
1051	os = mdev->state;
1052
1053	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
1054
1055	if (ns.i == os.i)
1056		return SS_NOTHING_TO_DO;
1057
1058	if (!(flags & CS_HARD)) {
1059		/*  pre-state-change checks ; only look at ns  */
1060		/* See drbd_state_sw_errors in drbd_strings.c */
1061
1062		rv = is_valid_state(mdev, ns);
1063		if (rv < SS_SUCCESS) {
1064			/* If the old state was illegal as well, then let
1065			   this happen...*/
1066
1067			if (is_valid_state(mdev, os) == rv)
1068				rv = is_valid_state_transition(mdev, ns, os);
1069		} else
1070			rv = is_valid_state_transition(mdev, ns, os);
1071	}
1072
1073	if (rv < SS_SUCCESS) {
1074		if (flags & CS_VERBOSE)
1075			print_st_err(mdev, os, ns, rv);
1076		return rv;
1077	}
1078
1079	if (warn_sync_abort)
1080		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
1081
1082	{
1083	char *pbp, pb[300];
1084	pbp = pb;
1085	*pbp = 0;
1086	if (ns.role != os.role)
1087		pbp += sprintf(pbp, "role( %s -> %s ) ",
1088			       drbd_role_str(os.role),
1089			       drbd_role_str(ns.role));
1090	if (ns.peer != os.peer)
1091		pbp += sprintf(pbp, "peer( %s -> %s ) ",
1092			       drbd_role_str(os.peer),
1093			       drbd_role_str(ns.peer));
1094	if (ns.conn != os.conn)
1095		pbp += sprintf(pbp, "conn( %s -> %s ) ",
1096			       drbd_conn_str(os.conn),
1097			       drbd_conn_str(ns.conn));
1098	if (ns.disk != os.disk)
1099		pbp += sprintf(pbp, "disk( %s -> %s ) ",
1100			       drbd_disk_str(os.disk),
1101			       drbd_disk_str(ns.disk));
1102	if (ns.pdsk != os.pdsk)
1103		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1104			       drbd_disk_str(os.pdsk),
1105			       drbd_disk_str(ns.pdsk));
1106	if (is_susp(ns) != is_susp(os))
1107		pbp += sprintf(pbp, "susp( %d -> %d ) ",
1108			       is_susp(os),
1109			       is_susp(ns));
1110	if (ns.aftr_isp != os.aftr_isp)
1111		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1112			       os.aftr_isp,
1113			       ns.aftr_isp);
1114	if (ns.peer_isp != os.peer_isp)
1115		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1116			       os.peer_isp,
1117			       ns.peer_isp);
1118	if (ns.user_isp != os.user_isp)
1119		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1120			       os.user_isp,
1121			       ns.user_isp);
1122	dev_info(DEV, "%s\n", pb);
1123	}
1124
1125	/* solve the race between becoming unconfigured,
1126	 * worker doing the cleanup, and
1127	 * admin reconfiguring us:
1128	 * on (re)configure, first set CONFIG_PENDING,
1129	 * then wait for a potentially exiting worker,
1130	 * start the worker, and schedule one no_op.
1131	 * then proceed with configuration.
1132	 */
1133	if (ns.disk == D_DISKLESS &&
1134	    ns.conn == C_STANDALONE &&
1135	    ns.role == R_SECONDARY &&
1136	    !test_and_set_bit(CONFIG_PENDING, &mdev->flags))
1137		set_bit(DEVICE_DYING, &mdev->flags);
1138
1139	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1140	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1141	 * drbd_ldev_destroy() won't happen before our corresponding
1142	 * after_state_ch works run, where we put_ldev again. */
1143	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1144	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1145		atomic_inc(&mdev->local_cnt);
1146
1147	mdev->state = ns;
1148
1149	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1150		drbd_print_uuids(mdev, "attached to UUIDs");
1151
1152	wake_up(&mdev->misc_wait);
1153	wake_up(&mdev->state_wait);
1154
1155	/* aborted verify run. log the last position */
1156	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1157	    ns.conn < C_CONNECTED) {
1158		mdev->ov_start_sector =
1159			BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1160		dev_info(DEV, "Online Verify reached sector %llu\n",
1161			(unsigned long long)mdev->ov_start_sector);
1162	}
1163
1164	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1165	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
1166		dev_info(DEV, "Syncer continues.\n");
1167		mdev->rs_paused += (long)jiffies
1168				  -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1169		if (ns.conn == C_SYNC_TARGET)
1170			mod_timer(&mdev->resync_timer, jiffies);
1171	}
1172
1173	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
1174	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1175		dev_info(DEV, "Resync suspended\n");
1176		mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1177	}
1178
1179	if (os.conn == C_CONNECTED &&
1180	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1181		unsigned long now = jiffies;
1182		int i;
1183
1184		set_ov_position(mdev, ns.conn);
1185		mdev->rs_start = now;
1186		mdev->rs_last_events = 0;
1187		mdev->rs_last_sect_ev = 0;
1188		mdev->ov_last_oos_size = 0;
1189		mdev->ov_last_oos_start = 0;
1190
1191		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1192			mdev->rs_mark_left[i] = mdev->ov_left;
1193			mdev->rs_mark_time[i] = now;
1194		}
1195
1196		drbd_rs_controller_reset(mdev);
1197
1198		if (ns.conn == C_VERIFY_S) {
1199			dev_info(DEV, "Starting Online Verify from sector %llu\n",
1200					(unsigned long long)mdev->ov_position);
1201			mod_timer(&mdev->resync_timer, jiffies);
1202		}
1203	}
1204
1205	if (get_ldev(mdev)) {
1206		u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1207						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1208						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1209
1210		if (test_bit(CRASHED_PRIMARY, &mdev->flags))
1211			mdf |= MDF_CRASHED_PRIMARY;
1212		if (mdev->state.role == R_PRIMARY ||
1213		    (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1214			mdf |= MDF_PRIMARY_IND;
1215		if (mdev->state.conn > C_WF_REPORT_PARAMS)
1216			mdf |= MDF_CONNECTED_IND;
1217		if (mdev->state.disk > D_INCONSISTENT)
1218			mdf |= MDF_CONSISTENT;
1219		if (mdev->state.disk > D_OUTDATED)
1220			mdf |= MDF_WAS_UP_TO_DATE;
1221		if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1222			mdf |= MDF_PEER_OUT_DATED;
1223		if (mdf != mdev->ldev->md.flags) {
1224			mdev->ldev->md.flags = mdf;
1225			drbd_md_mark_dirty(mdev);
1226		}
1227		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1228			drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1229		put_ldev(mdev);
1230	}
1231
1232	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
1233	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1234	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1235		set_bit(CONSIDER_RESYNC, &mdev->flags);
1236
1237	/* Receiver should clean up itself */
1238	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1239		drbd_thread_stop_nowait(&mdev->receiver);
1240
1241	/* Now the receiver finished cleaning up itself, it should die */
1242	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1243		drbd_thread_stop_nowait(&mdev->receiver);
1244
1245	/* Upon network failure, we need to restart the receiver. */
1246	if (os.conn > C_TEAR_DOWN &&
1247	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1248		drbd_thread_restart_nowait(&mdev->receiver);
1249
1250	/* Resume AL writing if we get a connection */
1251	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1252		drbd_resume_al(mdev);
1253
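	/* Actions that may sleep are deferred to the worker: queue an
	 * after_state_chg_work that will run after_state_ch() with the old
	 * state, the new state and the flags recorded here. */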
1254	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1255	if (ascw) {
1256		ascw->os = os;
1257		ascw->ns = ns;
1258		ascw->flags = flags;
1259		ascw->w.cb = w_after_state_ch;
1260		ascw->done = done;
1261		drbd_queue_work(&mdev->data.work, &ascw->w);
1262	} else {
1263		dev_warn(DEV, "Could not kmalloc an ascw\n");
1264	}
1265
1266	return rv;
1267}
1268
1269static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1270{
1271	struct after_state_chg_work *ascw =
1272		container_of(w, struct after_state_chg_work, w);
1273	after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1274	if (ascw->flags & CS_WAIT_COMPLETE) {
1275		D_ASSERT(ascw->done != NULL);
1276		complete(ascw->done);
1277	}
1278	kfree(ascw);
1279
1280	return 1;
1281}
1282
1283static void abw_start_sync(struct drbd_conf *mdev, int rv)
1284{
1285	if (rv) {
1286		dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
1287		_drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1288		return;
1289	}
1290
1291	switch (mdev->state.conn) {
1292	case C_STARTING_SYNC_T:
1293		_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1294		break;
1295	case C_STARTING_SYNC_S:
1296		drbd_start_resync(mdev, C_SYNC_SOURCE);
1297		break;
1298	}
1299}
1300
1301int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1302		int (*io_fn)(struct drbd_conf *),
1303		char *why, enum bm_flag flags)
1304{
1305	int rv;
1306
1307	D_ASSERT(current == mdev->worker.task);
1308
1309	/* open coded non-blocking drbd_suspend_io(mdev); */
1310	set_bit(SUSPEND_IO, &mdev->flags);
1311
1312	drbd_bm_lock(mdev, why, flags);
1313	rv = io_fn(mdev);
1314	drbd_bm_unlock(mdev);
1315
1316	drbd_resume_io(mdev);
1317
1318	return rv;
1319}
1320
1321/**
1322 * after_state_ch() - Perform after state change actions that may sleep
1323 * @mdev:	DRBD device.
1324 * @os:		old state.
1325 * @ns:		new state.
1326 * @flags:	Flags
1327 */
1328static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1329			   union drbd_state ns, enum chg_state_flags flags)
1330{
1331	enum drbd_fencing_p fp;
1332	enum drbd_req_event what = nothing;
1333	union drbd_state nsm = (union drbd_state){ .i = -1 };
1334
1335	if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1336		clear_bit(CRASHED_PRIMARY, &mdev->flags);
1337		if (mdev->p_uuid)
1338			mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1339	}
1340
1341	fp = FP_DONT_CARE;
1342	if (get_ldev(mdev)) {
1343		fp = mdev->ldev->dc.fencing;
1344		put_ldev(mdev);
1345	}
1346
1347	/* Inform userspace about the change... */
1348	drbd_bcast_state(mdev, ns);
1349
1350	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1351	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1352		drbd_khelper(mdev, "pri-on-incon-degr");
1353
1354	/* Here we have the actions that are performed after a
1355	   state change. This function might sleep */
1356
1357	nsm.i = -1;
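	/* "what" collects a transfer-log action (resend, restart_frozen_disk_io)
	 * and nsm the suspend bits to clear; if any of the conditions below set
	 * them, both are applied together under req_lock further down. */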
1358	if (ns.susp_nod) {
1359		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1360			what = resend;
1361
1362		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
1363			what = restart_frozen_disk_io;
1364
1365		if (what != nothing)
1366			nsm.susp_nod = 0;
1367	}
1368
1369	if (ns.susp_fen) {
1370		/* case1: The outdate peer handler is successful: */
1371		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
1372			tl_clear(mdev);
1373			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
1374				drbd_uuid_new_current(mdev);
1375				clear_bit(NEW_CUR_UUID, &mdev->flags);
1376			}
1377			spin_lock_irq(&mdev->req_lock);
1378			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1379			spin_unlock_irq(&mdev->req_lock);
1380		}
1381		/* case2: The connection was established again: */
1382		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1383			clear_bit(NEW_CUR_UUID, &mdev->flags);
1384			what = resend;
1385			nsm.susp_fen = 0;
1386		}
1387	}
1388
1389	if (what != nothing) {
1390		spin_lock_irq(&mdev->req_lock);
1391		_tl_restart(mdev, what);
1392		nsm.i &= mdev->state.i;
1393		_drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1394		spin_unlock_irq(&mdev->req_lock);
1395	}
1396
1397	/* Became sync source.  With protocol >= 96, we still need to send out
1398	 * the sync uuid now. Need to do that before any drbd_send_state, or
1399	 * the other side may go "paused sync" before receiving the sync uuids,
1400	 * which is unexpected. */
1401	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1402	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1403	    mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1404		drbd_gen_and_send_sync_uuid(mdev);
1405		put_ldev(mdev);
1406	}
1407
1408	/* Do not change the order of the if above and the two below... */
1409	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
1410		drbd_send_uuids(mdev);
1411		drbd_send_state(mdev);
1412	}
1413	/* No point in queuing send_bitmap if we don't have a connection
1414	 * anymore, so check also the _current_ state, not only the new state
1415	 * at the time this work was queued. */
1416	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1417	    mdev->state.conn == C_WF_BITMAP_S)
1418		drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1419				"send_bitmap (WFBitMapS)",
1420				BM_LOCKED_TEST_ALLOWED);
1421
1422	/* Lost contact to peer's copy of the data */
1423	if ((os.pdsk >= D_INCONSISTENT &&
1424	     os.pdsk != D_UNKNOWN &&
1425	     os.pdsk != D_OUTDATED)
1426	&&  (ns.pdsk < D_INCONSISTENT ||
1427	     ns.pdsk == D_UNKNOWN ||
1428	     ns.pdsk == D_OUTDATED)) {
1429		if (get_ldev(mdev)) {
1430			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1431			    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1432				if (is_susp(mdev->state)) {
1433					set_bit(NEW_CUR_UUID, &mdev->flags);
1434				} else {
1435					drbd_uuid_new_current(mdev);
1436					drbd_send_uuids(mdev);
1437				}
1438			}
1439			put_ldev(mdev);
1440		}
1441	}
1442
1443	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1444		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
1445			drbd_uuid_new_current(mdev);
1446			drbd_send_uuids(mdev);
1447		}
1448
1449		/* D_DISKLESS Peer becomes secondary */
1450		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1451			/* We may still be Primary ourselves.
1452			 * No harm done if the bitmap still changes,
1453			 * redirtied pages will follow later. */
1454			drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1455				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
1456		put_ldev(mdev);
1457	}
1458
1459	/* Write out all changed bits on demote.
1460	 * Though, no need to do that just yet
1461	 * if there is a resync going on still */
1462	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1463		mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1464		/* No changes to the bitmap expected this time, so assert that,
1465		 * even though no harm was done if it did change. */
1466		drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1467				"demote", BM_LOCKED_TEST_ALLOWED);
1468		put_ldev(mdev);
1469	}
1470
1471	/* Last part of the attaching process ... */
1472	if (ns.conn >= C_CONNECTED &&
1473	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1474		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
1475		drbd_send_uuids(mdev);
1476		drbd_send_state(mdev);
1477	}
1478
1479	/* We want to pause/continue resync, tell peer. */
1480	if (ns.conn >= C_CONNECTED &&
1481	     ((os.aftr_isp != ns.aftr_isp) ||
1482	      (os.user_isp != ns.user_isp)))
1483		drbd_send_state(mdev);
1484
1485	/* In case one of the isp bits got set, suspend other devices. */
1486	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1487	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1488		suspend_other_sg(mdev);
1489
1490	/* Make sure the peer gets informed about any state
1491	   changes (ISP bits) that happened while we were in WFReportParams. */
1492	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1493		drbd_send_state(mdev);
1494
1495	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1496		drbd_send_state(mdev);
1497
1498	/* We are in the process of starting a full sync... */
1499	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1500	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1501		/* no other bitmap changes expected during this phase */
1502		drbd_queue_bitmap_io(mdev,
1503			&drbd_bmio_set_n_write, &abw_start_sync,
1504			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1505
1506	/* We are invalidating ourselves... */
1507	if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1508	    os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1509		/* other bitmap operation expected during this phase */
1510		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1511			"set_n_write from invalidate", BM_LOCKED_MASK);
1512
1513	/* first half of local IO error, failure to attach,
1514	 * or administrative detach */
1515	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1516		enum drbd_io_error_p eh;
1517		int was_io_error;
1518		/* corresponding get_ldev was in __drbd_set_state, to serialize
1519		 * our cleanup here with the transition to D_DISKLESS,
1520	 * so it is safe to dereference ldev here. */
1521		eh = mdev->ldev->dc.on_io_error;
1522		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
1523
1524		/* current state still has to be D_FAILED,
1525		 * there is only one way out: to D_DISKLESS,
1526		 * and that may only happen after our put_ldev below. */
1527		if (mdev->state.disk != D_FAILED)
1528			dev_err(DEV,
1529				"ASSERT FAILED: disk is %s during detach\n",
1530				drbd_disk_str(mdev->state.disk));
1531
1532		if (drbd_send_state(mdev))
1533			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
1534		else
1535			dev_err(DEV, "Sending state for detaching disk failed\n");
1536
1537		drbd_rs_cancel_all(mdev);
1538
1539		/* In case we want to get something to stable storage still,
1540		 * this may be the last chance.
1541		 * Following put_ldev may transition to D_DISKLESS. */
1542		drbd_md_sync(mdev);
1543		put_ldev(mdev);
1544
1545		if (was_io_error && eh == EP_CALL_HELPER)
1546			drbd_khelper(mdev, "local-io-error");
1547	}
1548
1549	/* second half of local IO error, failure to attach,
1550	 * or administrative detach,
1551	 * after local_cnt references have reached zero again */
1552	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1553		/* We must still be diskless,
1554		 * re-attach has to be serialized with this! */
1555		if (mdev->state.disk != D_DISKLESS)
1556			dev_err(DEV,
1557				"ASSERT FAILED: disk is %s while going diskless\n",
1558				drbd_disk_str(mdev->state.disk));
1559
1560		mdev->rs_total = 0;
1561		mdev->rs_failed = 0;
1562		atomic_set(&mdev->rs_pending_cnt, 0);
1563
1564		if (drbd_send_state(mdev))
1565			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
1566		/* corresponding get_ldev in __drbd_set_state
1567		 * this may finally trigger drbd_ldev_destroy. */
1568		put_ldev(mdev);
1569	}
1570
1571	/* Notify peer that I had a local IO error and did not detach. */
1572	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
1573		drbd_send_state(mdev);
1574
1575	/* Disks got bigger while they were detached */
1576	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1577	    test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
1578		if (ns.conn == C_CONNECTED)
1579			resync_after_online_grow(mdev);
1580	}
1581
1582	/* A resync finished or aborted, wake paused devices... */
1583	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1584	    (os.peer_isp && !ns.peer_isp) ||
1585	    (os.user_isp && !ns.user_isp))
1586		resume_next_sg(mdev);
1587
1588	/* sync target done with resync.  Explicitly notify peer, even though
1589	 * it should (at least for non-empty resyncs) already know itself. */
1590	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1591		drbd_send_state(mdev);
1592
1593	/* This triggers bitmap writeout of potentially still unwritten pages
1594	 * if the resync finished cleanly, or aborted because of peer disk
1595	 * failure, or because of connection loss.
1596	 * For resync aborted because of local disk failure, we cannot do
1597	 * any bitmap writeout anymore.
1598	 * No harm done if some bits change during this phase.
1599	 */
1600	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1601		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
1602			"write from resync_finished", BM_LOCKED_SET_ALLOWED);
1603		put_ldev(mdev);
1604	}
1605
1606	/* free tl_hash if we got thawed and are C_STANDALONE */
1607	if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1608		drbd_free_tl_hash(mdev);
1609
1610	/* Upon network connection, we need to start the receiver */
1611	if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1612		drbd_thread_start(&mdev->receiver);
1613
1614	/* Terminate worker thread if we are unconfigured - it will be
1615	   restarted as needed... */
1616	if (ns.disk == D_DISKLESS &&
1617	    ns.conn == C_STANDALONE &&
1618	    ns.role == R_SECONDARY) {
1619		if (os.aftr_isp != ns.aftr_isp)
1620			resume_next_sg(mdev);
1621		/* set in __drbd_set_state, unless CONFIG_PENDING was set */
1622		if (test_bit(DEVICE_DYING, &mdev->flags))
1623			drbd_thread_stop_nowait(&mdev->worker);
1624	}
1625
1626	drbd_md_sync(mdev);
1627}
1628
1629
1630static int drbd_thread_setup(void *arg)
1631{
1632	struct drbd_thread *thi = (struct drbd_thread *) arg;
1633	struct drbd_conf *mdev = thi->mdev;
1634	unsigned long flags;
1635	int retval;
1636
1637restart:
1638	retval = thi->function(thi);
1639
1640	spin_lock_irqsave(&thi->t_lock, flags);
1641
1642	/* if the receiver has been "Exiting", the last thing it did
1643	 * was set the conn state to "StandAlone",
1644	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1645	 * and receiver thread will be "started".
1646	 * drbd_thread_start needs to set "Restarting" in that case.
1647	 * t_state check and assignment needs to be within the same spinlock,
1648	 * so either thread_start sees Exiting, and can remap to Restarting,
1649	 * or thread_start sees None, and can proceed as normal.
1650	 */
1651
1652	if (thi->t_state == Restarting) {
1653		dev_info(DEV, "Restarting %s\n", current->comm);
1654		thi->t_state = Running;
1655		spin_unlock_irqrestore(&thi->t_lock, flags);
1656		goto restart;
1657	}
1658
1659	thi->task = NULL;
1660	thi->t_state = None;
1661	smp_mb();
1662	complete(&thi->stop);
1663	spin_unlock_irqrestore(&thi->t_lock, flags);
1664
1665	dev_info(DEV, "Terminating %s\n", current->comm);
1666
1667	/* Release mod reference taken when thread was started */
1668	module_put(THIS_MODULE);
1669	return retval;
1670}
1671
1672static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1673		      int (*func) (struct drbd_thread *))
1674{
1675	spin_lock_init(&thi->t_lock);
1676	thi->task    = NULL;
1677	thi->t_state = None;
1678	thi->function = func;
1679	thi->mdev = mdev;
 
 
1680}
1681
1682int drbd_thread_start(struct drbd_thread *thi)
1683{
1684	struct drbd_conf *mdev = thi->mdev;
1685	struct task_struct *nt;
1686	unsigned long flags;
1687
1688	const char *me =
1689		thi == &mdev->receiver ? "receiver" :
1690		thi == &mdev->asender  ? "asender"  :
1691		thi == &mdev->worker   ? "worker"   : "NONSENSE";
1692
1693	/* is used from state engine doing drbd_thread_stop_nowait,
1694	 * while holding the req lock irqsave */
1695	spin_lock_irqsave(&thi->t_lock, flags);
1696
1697	switch (thi->t_state) {
1698	case None:
1699		dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1700				me, current->comm, current->pid);
1701
1702		/* Get ref on module for thread - this is released when thread exits */
1703		if (!try_module_get(THIS_MODULE)) {
1704			dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1705			spin_unlock_irqrestore(&thi->t_lock, flags);
1706			return false;
1707		}
1708
1709		init_completion(&thi->stop);
1710		D_ASSERT(thi->task == NULL);
1711		thi->reset_cpu_mask = 1;
1712		thi->t_state = Running;
1713		spin_unlock_irqrestore(&thi->t_lock, flags);
1714		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1715
1716		nt = kthread_create(drbd_thread_setup, (void *) thi,
1717				    "drbd%d_%s", mdev_to_minor(mdev), me);
1718
1719		if (IS_ERR(nt)) {
1720			dev_err(DEV, "Couldn't start thread\n");
1721
 
 
 
1722			module_put(THIS_MODULE);
1723			return false;
1724		}
1725		spin_lock_irqsave(&thi->t_lock, flags);
1726		thi->task = nt;
1727		thi->t_state = Running;
1728		spin_unlock_irqrestore(&thi->t_lock, flags);
1729		wake_up_process(nt);
1730		break;
1731	case Exiting:
1732		thi->t_state = Restarting;
1733		dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1734				me, current->comm, current->pid);
1735		/* fall through */
1736	case Running:
1737	case Restarting:
1738	default:
1739		spin_unlock_irqrestore(&thi->t_lock, flags);
1740		break;
1741	}
1742
1743	return true;
1744}
1745
1746
1747void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1748{
1749	unsigned long flags;
1750
1751	enum drbd_thread_state ns = restart ? Restarting : Exiting;
1752
1753	/* may be called from state engine, holding the req lock irqsave */
1754	spin_lock_irqsave(&thi->t_lock, flags);
1755
1756	if (thi->t_state == None) {
1757		spin_unlock_irqrestore(&thi->t_lock, flags);
1758		if (restart)
1759			drbd_thread_start(thi);
1760		return;
1761	}
1762
1763	if (thi->t_state != ns) {
1764		if (thi->task == NULL) {
1765			spin_unlock_irqrestore(&thi->t_lock, flags);
1766			return;
1767		}
1768
1769		thi->t_state = ns;
1770		smp_mb();
1771		init_completion(&thi->stop);
1772		if (thi->task != current)
1773			force_sig(DRBD_SIGKILL, thi->task);
1774
1775	}
1776
1777	spin_unlock_irqrestore(&thi->t_lock, flags);
1778
1779	if (wait)
1780		wait_for_completion(&thi->stop);
1781}
1782
1783#ifdef CONFIG_SMP
1784/**
1785 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1786 * @mdev:	DRBD device.
1787 *
1788 * Forces all threads of a device onto the same CPU. This is beneficial for
1789 * DRBD's performance. May be overridden by the user's configuration.
1790 */
1791void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1792{
1793	int ord, cpu;
1794
1795	/* user override. */
1796	if (cpumask_weight(mdev->cpu_mask))
1797		return;
1798
1799	ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
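	/* Illustrative sketch (assuming four online CPUs numbered 0..3):
	 * minors 0, 1, 2, 3, 4, 5 end up on CPUs 0, 1, 2, 3, 0, 1 - the minor
	 * number modulo the number of online CPUs selects the single CPU that
	 * all three threads of this device get pinned to below. */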
1800	for_each_online_cpu(cpu) {
1801		if (ord-- == 0) {
1802			cpumask_set_cpu(cpu, mdev->cpu_mask);
1803			return;
1804		}
 
1805	}
1806	/* should not be reached */
1807	cpumask_setall(mdev->cpu_mask);
 
 
 
1808}
1809
1810/**
1811 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1812 * @mdev:	DRBD device.
1813 *
1814 * Call this in the "main loop" of _all_ threads; no mutex is needed, since current
1815 * won't die prematurely.
1816 */
1817void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1818{
 
1819	struct task_struct *p = current;
1820	struct drbd_thread *thi =
1821		p == mdev->asender.task  ? &mdev->asender  :
1822		p == mdev->receiver.task ? &mdev->receiver :
1823		p == mdev->worker.task   ? &mdev->worker   :
1824		NULL;
1825	ERR_IF(thi == NULL)
1826		return;
1827	if (!thi->reset_cpu_mask)
1828		return;
1829	thi->reset_cpu_mask = 0;
1830	set_cpus_allowed_ptr(p, mdev->cpu_mask);
1831}
 
 
1832#endif
1833
1834/* the appropriate socket mutex must be held already */
1835int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1836			  enum drbd_packets cmd, struct p_header80 *h,
1837			  size_t size, unsigned msg_flags)
1838{
1839	int sent, ok;
1840
1841	ERR_IF(!h) return false;
1842	ERR_IF(!size) return false;
1843
1844	h->magic   = BE_DRBD_MAGIC;
1845	h->command = cpu_to_be16(cmd);
1846	h->length  = cpu_to_be16(size-sizeof(struct p_header80));
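	/* p_header80 on the wire: 32 bit magic, 16 bit command, 16 bit length;
	 * the length field covers only the payload, i.e. the total packet size
	 * minus the header itself, as computed above. */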
 
 
 
1847
1848	sent = drbd_send(mdev, sock, h, size, msg_flags);
1849
1850	ok = (sent == size);
1851	if (!ok && !signal_pending(current))
1852		dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1853		    cmdname(cmd), (int)size, sent);
1854	return ok;
 
1855}
1856
1857/* don't pass the socket. we may only look at it
1858 * when we hold the appropriate socket mutex.
1859 */
1860int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1861		  enum drbd_packets cmd, struct p_header80 *h, size_t size)
1862{
1863	int ok = 0;
1864	struct socket *sock;
1865
1866	if (use_data_socket) {
1867		mutex_lock(&mdev->data.mutex);
1868		sock = mdev->data.socket;
1869	} else {
1870		mutex_lock(&mdev->meta.mutex);
1871		sock = mdev->meta.socket;
1872	}
1873
1874	/* drbd_disconnect() could have called drbd_free_sock()
1875	 * while we were waiting in down()... */
1876	if (likely(sock != NULL))
1877		ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
1878
1879	if (use_data_socket)
1880		mutex_unlock(&mdev->data.mutex);
1881	else
1882		mutex_unlock(&mdev->meta.mutex);
1883	return ok;
1884}
1885
1886int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
1887		   size_t size)
 
 
1888{
1889	struct p_header80 h;
1890	int ok;
1891
1892	h.magic   = BE_DRBD_MAGIC;
1893	h.command = cpu_to_be16(cmd);
1894	h.length  = cpu_to_be16(size);
1895
1896	if (!drbd_get_data_sock(mdev))
1897		return 0;
1898
1899	ok = (sizeof(h) ==
1900		drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
1901	ok = ok && (size ==
1902		drbd_send(mdev, mdev->data.socket, data, size, 0));
1903
1904	drbd_put_data_sock(mdev);
1905
1906	return ok;
 
 
 
1907}
1908
1909int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
1910{
1911	struct p_rs_param_95 *p;
1912	struct socket *sock;
1913	int size, rv;
1914	const int apv = mdev->agreed_pro_version;
1915
1916	size = apv <= 87 ? sizeof(struct p_rs_param)
1917		: apv == 88 ? sizeof(struct p_rs_param)
1918			+ strlen(mdev->sync_conf.verify_alg) + 1
1919		: apv <= 94 ? sizeof(struct p_rs_param_89)
1920		: /* apv >= 95 */ sizeof(struct p_rs_param_95);
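	/* Rough guide to the sizes chosen above (a sketch, see the p_rs_param*
	 * definitions): up to apv 87 only the plain p_rs_param is sent, apv 88
	 * appends verify_alg as a trailing NUL-terminated string, apv 89..94
	 * use p_rs_param_89 with fixed verify_alg/csums_alg fields, and apv 95+
	 * use p_rs_param_95, which additionally carries the dynamic resync
	 * controller settings filled in below (c_plan_ahead, c_delay_target,
	 * c_fill_target, c_max_rate). */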
1921
1922	/* used from admin command context and receiver/worker context.
1923	 * to avoid kmalloc, grab the socket right here,
1924	 * then use the pre-allocated sbuf there */
1925	mutex_lock(&mdev->data.mutex);
1926	sock = mdev->data.socket;
1927
1928	if (likely(sock != NULL)) {
1929		enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
1930
1931		p = &mdev->data.sbuf.rs_param_95;
1932
1933		/* initialize verify_alg and csums_alg */
1934		memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
1935
1936		p->rate = cpu_to_be32(sc->rate);
1937		p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
1938		p->c_delay_target = cpu_to_be32(sc->c_delay_target);
1939		p->c_fill_target = cpu_to_be32(sc->c_fill_target);
1940		p->c_max_rate = cpu_to_be32(sc->c_max_rate);
1941
1942		if (apv >= 88)
1943			strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
1944		if (apv >= 89)
1945			strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
1946
1947		rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
1948	} else
1949		rv = 0; /* not ok */
1950
1951	mutex_unlock(&mdev->data.mutex);
1952
1953	return rv;
1954}
1955
1956int drbd_send_protocol(struct drbd_conf *mdev)
1957{
 
1958	struct p_protocol *p;
1959	int size, cf, rv;
1960
1961	size = sizeof(struct p_protocol);
1962
1963	if (mdev->agreed_pro_version >= 87)
1964		size += strlen(mdev->net_conf->integrity_alg) + 1;
1965
1966	/* we must not recurse into our own queue,
1967	 * as that is blocked during handshake */
1968	p = kmalloc(size, GFP_NOIO);
1969	if (p == NULL)
1970		return 0;
1971
1972	p->protocol      = cpu_to_be32(mdev->net_conf->wire_protocol);
1973	p->after_sb_0p   = cpu_to_be32(mdev->net_conf->after_sb_0p);
1974	p->after_sb_1p   = cpu_to_be32(mdev->net_conf->after_sb_1p);
1975	p->after_sb_2p   = cpu_to_be32(mdev->net_conf->after_sb_2p);
1976	p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
1977
1978	cf = 0;
1979	if (mdev->net_conf->want_lose)
1980		cf |= CF_WANT_LOSE;
1981	if (mdev->net_conf->dry_run) {
1982		if (mdev->agreed_pro_version >= 92)
1983			cf |= CF_DRY_RUN;
1984		else {
1985			dev_err(DEV, "--dry-run is not supported by peer");
1986			kfree(p);
1987			return -1;
1988		}
1989	}
1990	p->conn_flags    = cpu_to_be32(cf);
1991
1992	if (mdev->agreed_pro_version >= 87)
1993		strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
 
1994
1995	rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
1996			   (struct p_header80 *)p, size);
1997	kfree(p);
1998	return rv;
1999}
2000
2001int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2002{
2003	struct p_uuids p;
 
 
2004	int i;
2005
2006	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2007		return 1;
2008
2009	for (i = UI_CURRENT; i < UI_SIZE; i++)
2010		p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2011
2012	mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2013	p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
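	/* uuid_flags bits as used below and by our callers:
	 * 1 = want_lose ("discard my data"), 2 = crashed primary,
	 * 4 = disk currently D_INCONSISTENT,
	 * 8 = skip initial sync (see drbd_send_uuids_skip_initial_sync()). */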
2014	uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2015	uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
2016	uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2017	p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2018
2019	put_ldev(mdev);
2020
2021	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
2022			     (struct p_header80 *)&p, sizeof(p));
2023}
2024
2025int drbd_send_uuids(struct drbd_conf *mdev)
2026{
2027	return _drbd_send_uuids(mdev, 0);
2028}
2029
2030int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2031{
2032	return _drbd_send_uuids(mdev, 8);
2033}
2034
2035void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2036{
2037	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2038		u64 *uuid = mdev->ldev->md.uuid;
2039		dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2040		     text,
2041		     (unsigned long long)uuid[UI_CURRENT],
2042		     (unsigned long long)uuid[UI_BITMAP],
2043		     (unsigned long long)uuid[UI_HISTORY_START],
2044		     (unsigned long long)uuid[UI_HISTORY_END]);
2045		put_ldev(mdev);
2046	} else {
2047		dev_info(DEV, "%s effective data uuid: %016llX\n",
2048				text,
2049				(unsigned long long)mdev->ed_uuid);
2050	}
2051}
2052
2053int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2054{
2055	struct p_rs_uuid p;
 
 
2056	u64 uuid;
2057
2058	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2059
2060	uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
2061	drbd_uuid_set(mdev, UI_BITMAP, uuid);
2062	drbd_print_uuids(mdev, "updated sync UUID");
2063	drbd_md_sync(mdev);
2064	p.uuid = cpu_to_be64(uuid);
2065
2066	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
2067			     (struct p_header80 *)&p, sizeof(p));
2068}
2069
2070int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
2071{
2072	struct p_sizes p;
 
 
2073	sector_t d_size, u_size;
2074	int q_order_type, max_bio_size;
2075	int ok;
2076
2077	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2078		D_ASSERT(mdev->ldev->backing_bdev);
2079		d_size = drbd_get_max_capacity(mdev->ldev);
2080		u_size = mdev->ldev->dc.disk_size;
2081		q_order_type = drbd_queue_order_type(mdev);
2082		max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2083		max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
2084		put_ldev(mdev);
2085	} else {
2086		d_size = 0;
2087		u_size = 0;
2088		q_order_type = QUEUE_ORDERED_NONE;
2089		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
 
2090	}
2091
2092	p.d_size = cpu_to_be64(d_size);
2093	p.u_size = cpu_to_be64(u_size);
2094	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2095	p.max_bio_size = cpu_to_be32(max_bio_size);
2096	p.queue_order_type = cpu_to_be16(q_order_type);
2097	p.dds_flags = cpu_to_be16(flags);
2098
2099	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
2100			   (struct p_header80 *)&p, sizeof(p));
2101	return ok;
 
 
 
2102}
2103
2104/**
2105 * drbd_send_state() - Sends the drbd state to the peer
2106 * @mdev:	DRBD device.
2107 */
2108int drbd_send_state(struct drbd_conf *mdev)
2109{
2110	struct socket *sock;
2111	struct p_state p;
2112	int ok = 0;
2113
2114	/* Grab state lock so we won't send state if we're in the middle
2115	 * of a cluster wide state change on another thread */
2116	drbd_state_lock(mdev);
2117
2118	mutex_lock(&mdev->data.mutex);
2119
2120	p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2121	sock = mdev->data.socket;
 
 
2122
2123	if (likely(sock != NULL)) {
2124		ok = _drbd_send_cmd(mdev, sock, P_STATE,
2125				    (struct p_header80 *)&p, sizeof(p), 0);
2126	}
2127
2128	mutex_unlock(&mdev->data.mutex);
2129
2130	drbd_state_unlock(mdev);
2131	return ok;
2132}
2133
2134int drbd_send_state_req(struct drbd_conf *mdev,
2135	union drbd_state mask, union drbd_state val)
2136{
2137	struct p_req_state p;
2138
2139	p.mask    = cpu_to_be32(mask.i);
2140	p.val     = cpu_to_be32(val.i);
 
 
 
2141
2142	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2143			     (struct p_header80 *)&p, sizeof(p));
2144}
2145
2146int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2147{
2148	struct p_req_state_reply p;
 
 
2149
2150	p.retcode    = cpu_to_be32(retcode);
 
 
 
2151
2152	return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2153			     (struct p_header80 *)&p, sizeof(p));
 
 
2154}
2155
2156int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2157	struct p_compressed_bm *p,
2158	struct bm_xfer_ctx *c)
 
2159{
2160	struct bitstream bs;
2161	unsigned long plain_bits;
2162	unsigned long tmp;
2163	unsigned long rl;
2164	unsigned len;
2165	unsigned toggle;
2166	int bits;
2167
2168	/* may we use this feature? */
2169	if ((mdev->sync_conf.use_rle == 0) ||
2170		(mdev->agreed_pro_version < 90))
2171			return 0;
 
 
2172
2173	if (c->bit_offset >= c->bm_bits)
2174		return 0; /* nothing to do. */
2175
2176	/* use at most this many bytes */
2177	bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2178	memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2179	/* plain bits covered in this code string */
2180	plain_bits = 0;
2181
2182	/* p->encoding & 0x80 stores whether the first run length is set.
2183	 * bit offset is implicit.
2184	 * start with toggle == 2 to be able to tell the first iteration */
2185	toggle = 2;
2186
2187	/* see how many plain bits we can stuff into one packet
2188	 * using RLE and VLI. */
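	/* Worked example (illustrative): a bitmap starting 0,0,1,1,1,0,... is
	 * encoded as the run lengths 2, 3, ... with the DCBP start bit cleared,
	 * since the first run describes cleared bits; each run length is
	 * VLI-encoded into bs until the packet buffer is full. */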
2189	do {
2190		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2191				    : _drbd_bm_find_next(mdev, c->bit_offset);
2192		if (tmp == -1UL)
2193			tmp = c->bm_bits;
2194		rl = tmp - c->bit_offset;
2195
2196		if (toggle == 2) { /* first iteration */
2197			if (rl == 0) {
2198				/* the first checked bit was set,
2199				 * store start value, */
2200				DCBP_set_start(p, 1);
2201				/* but skip encoding of zero run length */
2202				toggle = !toggle;
2203				continue;
2204			}
2205			DCBP_set_start(p, 0);
2206		}
2207
2208		/* paranoia: catch zero runlength.
2209		 * can only happen if bitmap is modified while we scan it. */
2210		if (rl == 0) {
2211			dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2212			    "t:%u bo:%lu\n", toggle, c->bit_offset);
2213			return -1;
2214		}
2215
2216		bits = vli_encode_bits(&bs, rl);
2217		if (bits == -ENOBUFS) /* buffer full */
2218			break;
2219		if (bits <= 0) {
2220			dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2221			return 0;
2222		}
2223
2224		toggle = !toggle;
2225		plain_bits += rl;
2226		c->bit_offset = tmp;
2227	} while (c->bit_offset < c->bm_bits);
2228
2229	len = bs.cur.b - p->code + !!bs.cur.bit;
2230
2231	if (plain_bits < (len << 3)) {
2232		/* incompressible with this method.
2233		 * we need to rewind both word and bit position. */
2234		c->bit_offset -= plain_bits;
2235		bm_xfer_ctx_bit_to_word_offset(c);
2236		c->bit_offset = c->word_offset * BITS_PER_LONG;
2237		return 0;
2238	}
2239
2240	/* RLE + VLI was able to compress it just fine.
2241	 * update c->word_offset. */
2242	bm_xfer_ctx_bit_to_word_offset(c);
2243
2244	/* store pad_bits */
2245	DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
2246
2247	return len;
2248}
2249
2250/**
2251 * send_bitmap_rle_or_plain
2252 *
2253 * Return 0 when done, 1 when another iteration is needed, and a negative error
2254 * code upon failure.
2255 */
2256static int
2257send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2258			 struct p_header80 *h, struct bm_xfer_ctx *c)
2259{
2260	struct p_compressed_bm *p = (void*)h;
2261	unsigned long num_words;
2262	int len;
2263	int ok;
2264
2265	len = fill_bitmap_rle_bits(mdev, p, c);
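	/* len > 0: that many compressed code bytes, sent as P_COMPRESSED_BITMAP;
	 * len == 0: RLE not used for this chunk (disabled, not compressible,
	 * or nothing left), fall back to a plain P_BITMAP packet;
	 * len < 0: encoding error. */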
2266
 
 
2267	if (len < 0)
2268		return -EIO;
2269
2270	if (len) {
2271		DCBP_set_code(p, RLE_VLI_Bits);
2272		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2273			sizeof(*p) + len, 0);
2274
2275		c->packets[0]++;
2276		c->bytes[0] += sizeof(*p) + len;
2277
2278		if (c->bit_offset >= c->bm_bits)
2279			len = 0; /* DONE */
2280	} else {
2281		/* was not compressible.
2282		 * send a buffer full of plain text bits instead. */
2283		num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2284		len = num_words * sizeof(long);
2285		if (len)
2286			drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2287		ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2288				   h, sizeof(struct p_header80) + len, 0);
2289		c->word_offset += num_words;
2290		c->bit_offset = c->word_offset * BITS_PER_LONG;
2291
2292		c->packets[1]++;
2293		c->bytes[1] += sizeof(struct p_header80) + len;
2294
2295		if (c->bit_offset > c->bm_bits)
2296			c->bit_offset = c->bm_bits;
2297	}
2298	if (ok) {
2299		if (len == 0) {
2300			INFO_bm_xfer_stats(mdev, "send", c);
2301			return 0;
2302		} else
2303			return 1;
2304	}
2305	return -EIO;
2306}
2307
2308/* See the comment at receive_bitmap() */
2309int _drbd_send_bitmap(struct drbd_conf *mdev)
2310{
2311	struct bm_xfer_ctx c;
2312	struct p_header80 *p;
2313	int err;
2314
2315	ERR_IF(!mdev->bitmap) return false;
2316
2317	/* maybe we should use some per thread scratch page,
2318	 * and allocate that during initial device creation? */
2319	p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2320	if (!p) {
2321		dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2322		return false;
2323	}
2324
2325	if (get_ldev(mdev)) {
2326		if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2327			dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2328			drbd_bm_set_all(mdev);
2329			if (drbd_bm_write(mdev)) {
2330				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
2331				 * but otherwise process as per normal - need to tell other
2332				 * side that a full resync is required! */
2333				dev_err(DEV, "Failed to write bitmap to disk!\n");
2334			} else {
2335				drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2336				drbd_md_sync(mdev);
2337			}
2338		}
2339		put_ldev(mdev);
2340	}
2341
2342	c = (struct bm_xfer_ctx) {
2343		.bm_bits = drbd_bm_bits(mdev),
2344		.bm_words = drbd_bm_words(mdev),
2345	};
2346
2347	do {
2348		err = send_bitmap_rle_or_plain(mdev, p, &c);
2349	} while (err > 0);
2350
2351	free_page((unsigned long) p);
2352	return err == 0;
2353}
2354
2355int drbd_send_bitmap(struct drbd_conf *mdev)
2356{
2357	int err;
 
2358
2359	if (!drbd_get_data_sock(mdev))
2360		return -1;
2361	err = !_drbd_send_bitmap(mdev);
2362	drbd_put_data_sock(mdev);
2363	return err;
2364}
2365
2366int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2367{
2368	int ok;
2369	struct p_barrier_ack p;
2370
2371	p.barrier  = barrier_nr;
2372	p.set_size = cpu_to_be32(set_size);
2373
2374	if (mdev->state.conn < C_CONNECTED)
2375		return false;
2376	ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2377			(struct p_header80 *)&p, sizeof(p));
2378	return ok;
 
 
2379}
2380
2381/**
2382 * _drbd_send_ack() - Sends an ack packet
2383 * @mdev:	DRBD device.
2384 * @cmd:	Packet command code.
2385 * @sector:	sector, needs to be in big endian byte order
2386 * @blksize:	size in byte, needs to be in big endian byte order
2387 * @block_id:	Id, big endian byte order
2388 */
2389static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2390			  u64 sector,
2391			  u32 blksize,
2392			  u64 block_id)
2393{
2394	int ok;
2395	struct p_block_ack p;
2396
2397	p.sector   = sector;
2398	p.block_id = block_id;
2399	p.blksize  = blksize;
2400	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2401
2402	if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2403		return false;
2404	ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2405				(struct p_header80 *)&p, sizeof(p));
2406	return ok;
2407}
2408
2409/* dp->sector and dp->block_id already/still in network byte order,
2410 * data_size is payload size according to dp->head,
2411 * and may need to be corrected for digest size. */
2412int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2413		     struct p_data *dp, int data_size)
2414{
2415	data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2416		crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2417	return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2418			      dp->block_id);
2419}
2420
2421int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2422		     struct p_block_req *rp)
2423{
2424	return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2425}
2426
2427/**
2428 * drbd_send_ack() - Sends an ack packet
2429 * @mdev:	DRBD device.
2430 * @cmd:	Packet command code.
2431 * @e:		Epoch entry.
2432 */
2433int drbd_send_ack(struct drbd_conf *mdev,
2434	enum drbd_packets cmd, struct drbd_epoch_entry *e)
2435{
2436	return _drbd_send_ack(mdev, cmd,
2437			      cpu_to_be64(e->sector),
2438			      cpu_to_be32(e->size),
2439			      e->block_id);
2440}
2441
2442/* This function misuses the block_id field to signal if the blocks
2443 * are in sync or not. */
2444int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2445		     sector_t sector, int blksize, u64 block_id)
2446{
2447	return _drbd_send_ack(mdev, cmd,
2448			      cpu_to_be64(sector),
2449			      cpu_to_be32(blksize),
2450			      cpu_to_be64(block_id));
2451}
2452
2453int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2454		       sector_t sector, int size, u64 block_id)
2455{
2456	int ok;
2457	struct p_block_req p;
2458
2459	p.sector   = cpu_to_be64(sector);
2460	p.block_id = block_id;
2461	p.blksize  = cpu_to_be32(size);
2462
2463	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2464				(struct p_header80 *)&p, sizeof(p));
2465	return ok;
 
2466}
2467
2468int drbd_send_drequest_csum(struct drbd_conf *mdev,
2469			    sector_t sector, int size,
2470			    void *digest, int digest_size,
2471			    enum drbd_packets cmd)
2472{
2473	int ok;
2474	struct p_block_req p;
2475
2476	p.sector   = cpu_to_be64(sector);
2477	p.block_id = BE_DRBD_MAGIC + 0xbeef;
2478	p.blksize  = cpu_to_be32(size);
2479
2480	p.head.magic   = BE_DRBD_MAGIC;
2481	p.head.command = cpu_to_be16(cmd);
2482	p.head.length  = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2483
2484	mutex_lock(&mdev->data.mutex);
2485
2486	ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2487	ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
 
 
 
2488
2489	mutex_unlock(&mdev->data.mutex);
2490
2491	return ok;
2492}
2493
2494int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2495{
2496	int ok;
2497	struct p_block_req p;
2498
2499	p.sector   = cpu_to_be64(sector);
2500	p.block_id = BE_DRBD_MAGIC + 0xbabe;
2501	p.blksize  = cpu_to_be32(size);
2502
2503	ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2504			   (struct p_header80 *)&p, sizeof(p));
2505	return ok;
2506}
2507
2508/* called on sndtimeo
2509 * returns false if we should retry,
2510 * true if we think the connection is dead
2511 */
2512static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2513{
2514	int drop_it;
2515	/* long elapsed = (long)(jiffies - mdev->last_received); */
2516
2517	drop_it =   mdev->meta.socket == sock
2518		|| !mdev->asender.task
2519		|| get_t_state(&mdev->asender) != Running
2520		|| mdev->state.conn < C_CONNECTED;
2521
2522	if (drop_it)
2523		return true;
2524
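	/* ko_count is reloaded from net_conf->ko_count at the start of each
	 * drbd_send() on the data socket; only after it has been decremented
	 * to zero by consecutive send timeouts do we drop the connection. */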
2525	drop_it = !--mdev->ko_count;
2526	if (!drop_it) {
2527		dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2528		       current->comm, current->pid, mdev->ko_count);
2529		request_ping(mdev);
2530	}
2531
2532	return drop_it; /* && (mdev->state == R_PRIMARY) */;
2533}
2534
2535/* The idea of sendpage seems to be to put some kind of reference
2536 * to the page into the skb, and to hand it over to the NIC. In
2537 * this process get_page() gets called.
2538 *
2539 * As soon as the page was really sent over the network put_page()
2540 * gets called by some part of the network layer. [ NIC driver? ]
2541 *
2542 * [ get_page() / put_page() increment/decrement the count. If count
2543 *   reaches 0 the page will be freed. ]
2544 *
2545 * This works nicely with pages from FSs.
2546 * But this means that in protocol A we might signal IO completion too early!
2547 *
2548 * In order not to corrupt data during a resync we must make sure
2549 * that we do not reuse our own buffer pages (EEs) too early, therefore
2550 * we have the net_ee list.
2551 *
2552 * XFS seems to have problems, still, it submits pages with page_count == 0!
2553 * As a workaround, we disable sendpage on pages
2554 * with page_count == 0 or PageSlab.
2555 */
2556static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2557		   int offset, size_t size, unsigned msg_flags)
2558{
2559	int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2560	kunmap(page);
2561	if (sent == size)
2562		mdev->send_cnt += size>>9;
2563	return sent == size;
2564}
2565
2566static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2567		    int offset, size_t size, unsigned msg_flags)
2568{
2569	mm_segment_t oldfs = get_fs();
2570	int sent, ok;
2571	int len = size;
 
2572
2573	/* e.g. XFS meta- & log-data is in slab pages, which have a
2574	 * page_count of 0 and/or have PageSlab() set.
2575	 * we cannot use send_page for those, as that does get_page();
2576	 * put_page(); and would cause either a VM_BUG directly, or
2577	 * __page_cache_release a page that would actually still be referenced
2578	 * by someone, leading to some obscure delayed Oops somewhere else. */
2579	if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2580		return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2581
2582	msg_flags |= MSG_NOSIGNAL;
2583	drbd_update_congested(mdev);
2584	set_fs(KERNEL_DS);
2585	do {
2586		sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2587							offset, len,
2588							msg_flags);
2589		if (sent == -EAGAIN) {
2590			if (we_should_drop_the_connection(mdev,
2591							  mdev->data.socket))
2592				break;
2593			else
2594				continue;
2595		}
2596		if (sent <= 0) {
2597			dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2598			     __func__, (int)size, len, sent);
 
 
2599			break;
2600		}
2601		len    -= sent;
2602		offset += sent;
2603	} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2604	set_fs(oldfs);
2605	clear_bit(NET_CONGESTED, &mdev->flags);
2606
2607	ok = (len == 0);
2608	if (likely(ok))
2609		mdev->send_cnt += size>>9;
2610	return ok;
 
2611}
2612
2613static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2614{
2615	struct bio_vec *bvec;
2616	int i;
 
2617	/* hint all but last page with MSG_MORE */
2618	__bio_for_each_segment(bvec, bio, i, 0) {
2619		if (!_drbd_no_send_page(mdev, bvec->bv_page,
2620				     bvec->bv_offset, bvec->bv_len,
2621				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2622			return 0;
2623	}
2624	return 1;
2625}
2626
2627static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2628{
2629	struct bio_vec *bvec;
2630	int i;
 
2631	/* hint all but last page with MSG_MORE */
2632	__bio_for_each_segment(bvec, bio, i, 0) {
2633		if (!_drbd_send_page(mdev, bvec->bv_page,
2634				     bvec->bv_offset, bvec->bv_len,
2635				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2636			return 0;
2637	}
2638	return 1;
2639}
2640
2641static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
 
2642{
2643	struct page *page = e->pages;
2644	unsigned len = e->size;
 
 
2645	/* hint all but last page with MSG_MORE */
2646	page_chain_for_each(page) {
2647		unsigned l = min_t(unsigned, len, PAGE_SIZE);
2648		if (!_drbd_send_page(mdev, page, 0, l,
2649				page_chain_next(page) ? MSG_MORE : 0))
2650			return 0;
 
 
2651		len -= l;
2652	}
2653	return 1;
2654}
2655
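/* Example (illustrative): with agreed_pro_version >= 95, a bio submitted with
 * REQ_SYNC | REQ_FUA is announced to the peer as DP_RW_SYNC | DP_FUA, so the
 * peer can apply the same semantics on its backing device; older peers only
 * understand the DP_RW_SYNC hint. */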
2656static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
 
2657{
2658	if (mdev->agreed_pro_version >= 95)
2659		return  (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2660			(bi_rw & REQ_FUA ? DP_FUA : 0) |
2661			(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2662			(bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2663	else
2664		return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
2665}
2666
2667/* Used to send write requests
2668 * R_PRIMARY -> Peer	(P_DATA)
2669 */
2670int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2671{
2672	int ok = 1;
2673	struct p_data p;
 
 
 
2674	unsigned int dp_flags = 0;
2675	void *dgb;
2676	int dgs;
2677
2678	if (!drbd_get_data_sock(mdev))
2679		return 0;
2680
2681	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2682		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2683
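	/* p_header80 has only a 16 bit length field and cannot describe
	 * requests larger than DRBD_MAX_SIZE_H80_PACKET; bigger requests use
	 * the p_header95 variant with its 32 bit length instead. */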
2684	if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2685		p.head.h80.magic   = BE_DRBD_MAGIC;
2686		p.head.h80.command = cpu_to_be16(P_DATA);
2687		p.head.h80.length  =
2688			cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2689	} else {
2690		p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2691		p.head.h95.command = cpu_to_be16(P_DATA);
2692		p.head.h95.length  =
2693			cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2694	}
2695
2696	p.sector   = cpu_to_be64(req->sector);
2697	p.block_id = (unsigned long)req;
2698	p.seq_num  = cpu_to_be32(req->seq_num =
2699				 atomic_add_return(1, &mdev->packet_seq));
2700
2701	dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
 
 
 
2702
2703	if (mdev->state.conn >= C_SYNC_SOURCE &&
2704	    mdev->state.conn <= C_PAUSED_SYNC_T)
2705		dp_flags |= DP_MAY_SET_IN_SYNC;
2706
2707	p.dp_flags = cpu_to_be32(dp_flags);
2708	set_bit(UNPLUG_REMOTE, &mdev->flags);
2709	ok = (sizeof(p) ==
2710		drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2711	if (ok && dgs) {
2712		dgb = mdev->int_dig_out;
2713		drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2714		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2715	}
2716	if (ok) {
2717		/* For protocol A, we have to memcpy the payload into
2718		 * socket buffers, as we may complete right away
2719		 * as soon as we handed it over to tcp, at which point the data
2720		 * pages may become invalid.
2721		 *
2722		 * For data-integrity enabled, we copy it as well, so we can be
2723		 * sure that even if the bio pages may still be modified, it
2724		 * won't change the data on the wire, thus if the digest checks
2725		 * out ok after sending on this side, but does not fit on the
2726		 * receiving side, we sure have detected corruption elsewhere.
2727		 */
2728		if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2729			ok = _drbd_send_bio(mdev, req->master_bio);
2730		else
2731			ok = _drbd_send_zc_bio(mdev, req->master_bio);
2732
2733		/* double check digest, sometimes buffers have been modified in flight. */
2734		if (dgs > 0 && dgs <= 64) {
2735			/* 64 byte, 512 bit, is the largest digest size
2736			 * currently supported in kernel crypto. */
2737			unsigned char digest[64];
2738			drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2739			if (memcmp(mdev->int_dig_out, digest, dgs)) {
2740				dev_warn(DEV,
2741					"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2742					(unsigned long long)req->sector, req->size);
2743			}
2744		} /* else if (dgs > 64) {
2745		     ... Be noisy about digest too large ...
2746		} */
2747	}
 
 
2748
2749	drbd_put_data_sock(mdev);
2750
2751	return ok;
2752}
2753
2754/* answer packet, used to send data back for read requests:
2755 *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
2756 *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
2757 */
2758int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2759		    struct drbd_epoch_entry *e)
2760{
2761	int ok;
2762	struct p_data p;
2763	void *dgb;
2764	int dgs;
2765
2766	dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2767		crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2768
2769	if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2770		p.head.h80.magic   = BE_DRBD_MAGIC;
2771		p.head.h80.command = cpu_to_be16(cmd);
2772		p.head.h80.length  =
2773			cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2774	} else {
2775		p.head.h95.magic   = BE_DRBD_MAGIC_BIG;
2776		p.head.h95.command = cpu_to_be16(cmd);
2777		p.head.h95.length  =
2778			cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2779	}
2780
2781	p.sector   = cpu_to_be64(e->sector);
2782	p.block_id = e->block_id;
2783	/* p.seq_num  = 0;    No sequence numbers here.. */
2784
2785	/* Only called by our kernel thread.
2786	 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2787	 * in response to admin command or module unload.
2788	 */
2789	if (!drbd_get_data_sock(mdev))
2790		return 0;
2791
2792	ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2793	if (ok && dgs) {
2794		dgb = mdev->int_dig_out;
2795		drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2796		ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2797	}
2798	if (ok)
2799		ok = _drbd_send_zc_ee(mdev, e);
2800
2801	drbd_put_data_sock(mdev);
2802
2803	return ok;
2804}
2805
2806int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2807{
2808	struct p_block_desc p;
 
2809
2810	p.sector  = cpu_to_be64(req->sector);
2811	p.blksize = cpu_to_be32(req->size);
2812
2813	return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
 
 
 
2814}
2815
2816/*
2817  drbd_send distinguishes two cases:
2818
2819  Packets sent via the data socket "sock"
2820  and packets sent via the meta data socket "msock"
2821
2822		    sock                      msock
2823  -----------------+-------------------------+------------------------------
2824  timeout           conf.timeout / 2          conf.timeout / 2
2825  timeout action    send a ping via msock     Abort communication
2826					      and close all sockets
2827*/
2828
2829/*
2830 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
2831 */
2832int drbd_send(struct drbd_conf *mdev, struct socket *sock,
2833	      void *buf, size_t size, unsigned msg_flags)
2834{
2835	struct kvec iov;
2836	struct msghdr msg;
2837	int rv, sent = 0;
2838
2839	if (!sock)
2840		return -1000;
2841
2842	/* THINK  if (signal_pending) return ... ? */
2843
2844	iov.iov_base = buf;
2845	iov.iov_len  = size;
2846
2847	msg.msg_name       = NULL;
2848	msg.msg_namelen    = 0;
2849	msg.msg_control    = NULL;
2850	msg.msg_controllen = 0;
2851	msg.msg_flags      = msg_flags | MSG_NOSIGNAL;
2852
2853	if (sock == mdev->data.socket) {
2854		mdev->ko_count = mdev->net_conf->ko_count;
2855		drbd_update_congested(mdev);
2856	}
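	/* kernel_sendmsg() may transmit only part of the buffer; keep advancing
	 * the iovec and retrying until everything went out, a fatal error
	 * occurred, or we_should_drop_the_connection() tells us to give up. */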
2857	do {
2858		/* STRANGE
2859		 * tcp_sendmsg does _not_ use its size parameter at all ?
2860		 *
2861		 * -EAGAIN on timeout, -EINTR on signal.
2862		 */
2863/* THINK
2864 * do we need to block DRBD_SIG if sock == &meta.socket ??
2865 * otherwise wake_asender() might interrupt some send_*Ack !
2866 */
2867		rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
2868		if (rv == -EAGAIN) {
2869			if (we_should_drop_the_connection(mdev, sock))
2870				break;
2871			else
2872				continue;
2873		}
2874		D_ASSERT(rv != 0);
2875		if (rv == -EINTR) {
2876			flush_signals(current);
2877			rv = 0;
2878		}
2879		if (rv < 0)
2880			break;
2881		sent += rv;
2882		iov.iov_base += rv;
2883		iov.iov_len  -= rv;
2884	} while (sent < size);
2885
2886	if (sock == mdev->data.socket)
2887		clear_bit(NET_CONGESTED, &mdev->flags);
2888
2889	if (rv <= 0) {
2890		if (rv != -EAGAIN) {
2891			dev_err(DEV, "%s_sendmsg returned %d\n",
2892			    sock == mdev->meta.socket ? "msock" : "sock",
2893			    rv);
2894			drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
2895		} else
2896			drbd_force_state(mdev, NS(conn, C_TIMEOUT));
2897	}
2898
2899	return sent;
2900}
2901
2902static int drbd_open(struct block_device *bdev, fmode_t mode)
2903{
2904	struct drbd_conf *mdev = bdev->bd_disk->private_data;
2905	unsigned long flags;
2906	int rv = 0;
2907
2908	mutex_lock(&drbd_main_mutex);
2909	spin_lock_irqsave(&mdev->req_lock, flags);
2910	/* to have a stable mdev->state.role
2911	 * and no race with updating open_cnt */
2912
2913	if (mdev->state.role != R_PRIMARY) {
2914		if (mode & FMODE_WRITE)
2915			rv = -EROFS;
2916		else if (!allow_oos)
2917			rv = -EMEDIUMTYPE;
2918	}
2919
2920	if (!rv)
2921		mdev->open_cnt++;
2922	spin_unlock_irqrestore(&mdev->req_lock, flags);
2923	mutex_unlock(&drbd_main_mutex);
2924
2925	return rv;
2926}
2927
2928static int drbd_release(struct gendisk *gd, fmode_t mode)
2929{
2930	struct drbd_conf *mdev = gd->private_data;
2931	mutex_lock(&drbd_main_mutex);
2932	mdev->open_cnt--;
2933	mutex_unlock(&drbd_main_mutex);
2934	return 0;
2935}
2936
2937static void drbd_set_defaults(struct drbd_conf *mdev)
 
2938{
2939	/* This way we get a compile error when sync_conf grows,
2940	   and we forget to initialize it here */
2941	mdev->sync_conf = (struct syncer_conf) {
2942		/* .rate = */		DRBD_RATE_DEF,
2943		/* .after = */		DRBD_AFTER_DEF,
2944		/* .al_extents = */	DRBD_AL_EXTENTS_DEF,
2945		/* .verify_alg = */	{}, 0,
2946		/* .cpu_mask = */	{}, 0,
2947		/* .csums_alg = */	{}, 0,
2948		/* .use_rle = */	0,
2949		/* .on_no_data = */	DRBD_ON_NO_DATA_DEF,
2950		/* .c_plan_ahead = */	DRBD_C_PLAN_AHEAD_DEF,
2951		/* .c_delay_target = */	DRBD_C_DELAY_TARGET_DEF,
2952		/* .c_fill_target = */	DRBD_C_FILL_TARGET_DEF,
2953		/* .c_max_rate = */	DRBD_C_MAX_RATE_DEF,
2954		/* .c_min_rate = */	DRBD_C_MIN_RATE_DEF
2955	};
2956
2957	/* Have to do it this way, because the layout differs between
2958	   big endian and little endian */
2959	mdev->state = (union drbd_state) {
 
 
2960		{ .role = R_SECONDARY,
2961		  .peer = R_UNKNOWN,
2962		  .conn = C_STANDALONE,
2963		  .disk = D_DISKLESS,
2964		  .pdsk = D_UNKNOWN,
2965		  .susp = 0,
2966		  .susp_nod = 0,
2967		  .susp_fen = 0
2968		} };
2969}
2970
2971void drbd_init_set_defaults(struct drbd_conf *mdev)
2972{
2973	/* the memset(,0,) did most of this.
2974	 * note: only assignments, no allocation in here */
2975
2976	drbd_set_defaults(mdev);
2977
2978	atomic_set(&mdev->ap_bio_cnt, 0);
2979	atomic_set(&mdev->ap_pending_cnt, 0);
2980	atomic_set(&mdev->rs_pending_cnt, 0);
2981	atomic_set(&mdev->unacked_cnt, 0);
2982	atomic_set(&mdev->local_cnt, 0);
2983	atomic_set(&mdev->net_cnt, 0);
2984	atomic_set(&mdev->packet_seq, 0);
2985	atomic_set(&mdev->pp_in_use, 0);
2986	atomic_set(&mdev->pp_in_use_by_net, 0);
2987	atomic_set(&mdev->rs_sect_in, 0);
2988	atomic_set(&mdev->rs_sect_ev, 0);
2989	atomic_set(&mdev->ap_in_flight, 0);
2990
2991	mutex_init(&mdev->md_io_mutex);
2992	mutex_init(&mdev->data.mutex);
2993	mutex_init(&mdev->meta.mutex);
2994	sema_init(&mdev->data.work.s, 0);
2995	sema_init(&mdev->meta.work.s, 0);
2996	mutex_init(&mdev->state_mutex);
2997
2998	spin_lock_init(&mdev->data.work.q_lock);
2999	spin_lock_init(&mdev->meta.work.q_lock);
3000
3001	spin_lock_init(&mdev->al_lock);
3002	spin_lock_init(&mdev->req_lock);
3003	spin_lock_init(&mdev->peer_seq_lock);
3004	spin_lock_init(&mdev->epoch_lock);
3005
3006	INIT_LIST_HEAD(&mdev->active_ee);
3007	INIT_LIST_HEAD(&mdev->sync_ee);
3008	INIT_LIST_HEAD(&mdev->done_ee);
3009	INIT_LIST_HEAD(&mdev->read_ee);
3010	INIT_LIST_HEAD(&mdev->net_ee);
3011	INIT_LIST_HEAD(&mdev->resync_reads);
3012	INIT_LIST_HEAD(&mdev->data.work.q);
3013	INIT_LIST_HEAD(&mdev->meta.work.q);
3014	INIT_LIST_HEAD(&mdev->resync_work.list);
3015	INIT_LIST_HEAD(&mdev->unplug_work.list);
3016	INIT_LIST_HEAD(&mdev->go_diskless.list);
3017	INIT_LIST_HEAD(&mdev->md_sync_work.list);
3018	INIT_LIST_HEAD(&mdev->start_resync_work.list);
3019	INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
3020
3021	mdev->resync_work.cb  = w_resync_timer;
3022	mdev->unplug_work.cb  = w_send_write_hint;
3023	mdev->go_diskless.cb  = w_go_diskless;
3024	mdev->md_sync_work.cb = w_md_sync;
3025	mdev->bm_io_work.w.cb = w_bitmap_io;
3026	mdev->start_resync_work.cb = w_start_resync;
3027	init_timer(&mdev->resync_timer);
3028	init_timer(&mdev->md_sync_timer);
3029	init_timer(&mdev->start_resync_timer);
3030	init_timer(&mdev->request_timer);
3031	mdev->resync_timer.function = resync_timer_fn;
3032	mdev->resync_timer.data = (unsigned long) mdev;
3033	mdev->md_sync_timer.function = md_sync_timer_fn;
3034	mdev->md_sync_timer.data = (unsigned long) mdev;
3035	mdev->start_resync_timer.function = start_resync_timer_fn;
3036	mdev->start_resync_timer.data = (unsigned long) mdev;
3037	mdev->request_timer.function = request_timer_fn;
3038	mdev->request_timer.data = (unsigned long) mdev;
3039
3040	init_waitqueue_head(&mdev->misc_wait);
3041	init_waitqueue_head(&mdev->state_wait);
3042	init_waitqueue_head(&mdev->net_cnt_wait);
3043	init_waitqueue_head(&mdev->ee_wait);
3044	init_waitqueue_head(&mdev->al_wait);
3045	init_waitqueue_head(&mdev->seq_wait);
3046
3047	drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3048	drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3049	drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3050
3051	mdev->agreed_pro_version = PRO_VERSION_MAX;
3052	mdev->write_ordering = WO_bdev_flush;
3053	mdev->resync_wenr = LC_FREE;
3054	mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3055	mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3056}
3057
3058void drbd_mdev_cleanup(struct drbd_conf *mdev)
3059{
3060	int i;
3061	if (mdev->receiver.t_state != None)
3062		dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3063				mdev->receiver.t_state);
3064
3065	/* no need to lock it, I'm the only thread alive */
3066	if (atomic_read(&mdev->current_epoch->epoch_size) !=  0)
3067		dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3068	mdev->al_writ_cnt  =
3069	mdev->bm_writ_cnt  =
3070	mdev->read_cnt     =
3071	mdev->recv_cnt     =
3072	mdev->send_cnt     =
3073	mdev->writ_cnt     =
3074	mdev->p_size       =
3075	mdev->rs_start     =
3076	mdev->rs_total     =
3077	mdev->rs_failed    = 0;
3078	mdev->rs_last_events = 0;
3079	mdev->rs_last_sect_ev = 0;
3080	for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3081		mdev->rs_mark_left[i] = 0;
3082		mdev->rs_mark_time[i] = 0;
3083	}
3084	D_ASSERT(mdev->net_conf == NULL);
3085
3086	drbd_set_my_capacity(mdev, 0);
3087	if (mdev->bitmap) {
3088		/* maybe never allocated. */
3089		drbd_bm_resize(mdev, 0, 1);
3090		drbd_bm_cleanup(mdev);
3091	}
3092
3093	drbd_free_resources(mdev);
3094	clear_bit(AL_SUSPENDED, &mdev->flags);
3095
3096	/*
3097	 * currently we call drbd_init_ee only on module load, so
3098	 * we may call drbd_release_ee only on module unload!
3099	 */
3100	D_ASSERT(list_empty(&mdev->active_ee));
3101	D_ASSERT(list_empty(&mdev->sync_ee));
3102	D_ASSERT(list_empty(&mdev->done_ee));
3103	D_ASSERT(list_empty(&mdev->read_ee));
3104	D_ASSERT(list_empty(&mdev->net_ee));
3105	D_ASSERT(list_empty(&mdev->resync_reads));
3106	D_ASSERT(list_empty(&mdev->data.work.q));
3107	D_ASSERT(list_empty(&mdev->meta.work.q));
3108	D_ASSERT(list_empty(&mdev->resync_work.list));
3109	D_ASSERT(list_empty(&mdev->unplug_work.list));
3110	D_ASSERT(list_empty(&mdev->go_diskless.list));
3111
3112	drbd_set_defaults(mdev);
3113}
3114
3115
3116static void drbd_destroy_mempools(void)
3117{
3118	struct page *page;
3119
3120	while (drbd_pp_pool) {
3121		page = drbd_pp_pool;
3122		drbd_pp_pool = (struct page *)page_private(page);
3123		__free_page(page);
3124		drbd_pp_vacant--;
3125	}
3126
3127	/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3128
3129	if (drbd_ee_mempool)
3130		mempool_destroy(drbd_ee_mempool);
3131	if (drbd_request_mempool)
3132		mempool_destroy(drbd_request_mempool);
3133	if (drbd_ee_cache)
3134		kmem_cache_destroy(drbd_ee_cache);
3135	if (drbd_request_cache)
3136		kmem_cache_destroy(drbd_request_cache);
3137	if (drbd_bm_ext_cache)
3138		kmem_cache_destroy(drbd_bm_ext_cache);
3139	if (drbd_al_ext_cache)
3140		kmem_cache_destroy(drbd_al_ext_cache);
3141
3142	drbd_ee_mempool      = NULL;
3143	drbd_request_mempool = NULL;
3144	drbd_ee_cache        = NULL;
3145	drbd_request_cache   = NULL;
3146	drbd_bm_ext_cache    = NULL;
3147	drbd_al_ext_cache    = NULL;
3148
3149	return;
3150}
3151
3152static int drbd_create_mempools(void)
3153{
3154	struct page *page;
3155	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3156	int i;
3157
3158	/* prepare our caches and mempools */
3159	drbd_request_mempool = NULL;
3160	drbd_ee_cache        = NULL;
3161	drbd_request_cache   = NULL;
3162	drbd_bm_ext_cache    = NULL;
3163	drbd_al_ext_cache    = NULL;
3164	drbd_pp_pool         = NULL;
3165
3166	/* caches */
3167	drbd_request_cache = kmem_cache_create(
3168		"drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3169	if (drbd_request_cache == NULL)
3170		goto Enomem;
3171
3172	drbd_ee_cache = kmem_cache_create(
3173		"drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3174	if (drbd_ee_cache == NULL)
3175		goto Enomem;
3176
3177	drbd_bm_ext_cache = kmem_cache_create(
3178		"drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3179	if (drbd_bm_ext_cache == NULL)
3180		goto Enomem;
3181
3182	drbd_al_ext_cache = kmem_cache_create(
3183		"drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3184	if (drbd_al_ext_cache == NULL)
3185		goto Enomem;
3186
3187	/* mempools */
3188	drbd_request_mempool = mempool_create(number,
3189		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3190	if (drbd_request_mempool == NULL)
3191		goto Enomem;
3192
3193	drbd_ee_mempool = mempool_create(number,
3194		mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3195	if (drbd_ee_mempool == NULL)
3196		goto Enomem;
3197
3198	/* drbd's page pool */
3199	spin_lock_init(&drbd_pp_lock);
3200
3201	for (i = 0; i < number; i++) {
3202		page = alloc_page(GFP_HIGHUSER);
3203		if (!page)
3204			goto Enomem;
3205		set_page_private(page, (unsigned long)drbd_pp_pool);
3206		drbd_pp_pool = page;
3207	}
3208	drbd_pp_vacant = number;
3209
3210	return 0;
3211
3212Enomem:
3213	drbd_destroy_mempools(); /* in case we allocated some */
3214	return -ENOMEM;
3215}
3216
3217static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3218	void *unused)
3219{
3220	/* just so we have it.  you never know what interesting things we
3221	 * might want to do here some day...
3222	 */
3223
3224	return NOTIFY_DONE;
3225}
3226
3227static struct notifier_block drbd_notifier = {
3228	.notifier_call = drbd_notify_sys,
3229};
3230
3231static void drbd_release_ee_lists(struct drbd_conf *mdev)
3232{
3233	int rr;
3234
3235	rr = drbd_release_ee(mdev, &mdev->active_ee);
3236	if (rr)
3237		dev_err(DEV, "%d EEs in active list found!\n", rr);
3238
3239	rr = drbd_release_ee(mdev, &mdev->sync_ee);
3240	if (rr)
3241		dev_err(DEV, "%d EEs in sync list found!\n", rr);
3242
3243	rr = drbd_release_ee(mdev, &mdev->read_ee);
3244	if (rr)
3245		dev_err(DEV, "%d EEs in read list found!\n", rr);
3246
3247	rr = drbd_release_ee(mdev, &mdev->done_ee);
3248	if (rr)
3249		dev_err(DEV, "%d EEs in done list found!\n", rr);
3250
3251	rr = drbd_release_ee(mdev, &mdev->net_ee);
3252	if (rr)
3253		dev_err(DEV, "%d EEs in net list found!\n", rr);
3254}
3255
3256/* caution. no locking.
3257 * currently only used from module cleanup code. */
3258static void drbd_delete_device(unsigned int minor)
3259{
3260	struct drbd_conf *mdev = minor_to_mdev(minor);
 
 
3261
3262	if (!mdev)
3263		return;
3264
3265	/* paranoia asserts */
3266	if (mdev->open_cnt != 0)
3267		dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3268				__FILE__ , __LINE__);
3269
3270	ERR_IF (!list_empty(&mdev->data.work.q)) {
3271		struct list_head *lp;
3272		list_for_each(lp, &mdev->data.work.q) {
3273			dev_err(DEV, "lp = %p\n", lp);
3274		}
3275	};
3276	/* end paranoia asserts */
3277
3278	del_gendisk(mdev->vdisk);
3279
3280	/* cleanup stuff that may have been allocated during
3281	 * device (re-)configuration or state changes */
3282
3283	if (mdev->this_bdev)
3284		bdput(mdev->this_bdev);
3285
3286	drbd_free_resources(mdev);
 
3287
3288	drbd_release_ee_lists(mdev);
3289
3290	/* should be freed on disconnect? */
3291	kfree(mdev->ee_hash);
3292	/*
3293	mdev->ee_hash_s = 0;
3294	mdev->ee_hash = NULL;
3295	*/
3296
3297	lc_destroy(mdev->act_log);
3298	lc_destroy(mdev->resync);
3299
3300	kfree(mdev->p_uuid);
3301	/* mdev->p_uuid = NULL; */
3302
3303	kfree(mdev->int_dig_out);
3304	kfree(mdev->int_dig_in);
3305	kfree(mdev->int_dig_vv);
3306
3307	/* cleanup the rest that has been
3308	 * allocated from drbd_new_device
3309	 * and actually free the mdev itself */
3310	drbd_free_mdev(mdev);
3311}
3312
3313static void drbd_cleanup(void)
3314{
3315	unsigned int i;
3316
3317	unregister_reboot_notifier(&drbd_notifier);
3318
3319	/* first remove proc,
3320	 * drbdsetup uses its presence to detect
3321	 * whether DRBD is loaded.
3322	 * If we got stuck in proc removal,
3323	 * but had netlink already deregistered,
3324	 * some drbdsetup commands might wait forever
3325	 * for an answer.
3326	 */
3327	if (drbd_proc)
3328		remove_proc_entry("drbd", NULL);
3329
3330	drbd_nl_cleanup();
3331
3332	if (minor_table) {
3333		i = minor_count;
3334		while (i--)
3335			drbd_delete_device(i);
3336		drbd_destroy_mempools();
3337	}
3338
3339	kfree(minor_table);
3340
 
3341	unregister_blkdev(DRBD_MAJOR, "drbd");
3342
3343	printk(KERN_INFO "drbd: module cleanup done.\n");
 
 
3344}
3345
3346/**
3347 * drbd_congested() - Callback for pdflush
3348 * @congested_data:	User data
3349 * @bdi_bits:		Bits pdflush is currently interested in
3350 *
3351 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3352 */
3353static int drbd_congested(void *congested_data, int bdi_bits)
3354{
3355	struct drbd_conf *mdev = congested_data;
3356	struct request_queue *q;
3357	char reason = '-';
3358	int r = 0;
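	/* congestion_reason codes set below: 'd' = IO frozen by DRBD,
	 * 'b' = backing device congested, 'n' = network send buffer congested,
	 * 'a' = both backing device and network congested, '-' = not congested. */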
3359
3360	if (!may_inc_ap_bio(mdev)) {
3361		/* DRBD has frozen IO */
3362		r = bdi_bits;
3363		reason = 'd';
3364		goto out;
3365	}
3366
3367	if (get_ldev(mdev)) {
3368		q = bdev_get_queue(mdev->ldev->backing_bdev);
3369		r = bdi_congested(&q->backing_dev_info, bdi_bits);
3370		put_ldev(mdev);
3371		if (r)
3372			reason = 'b';
3373	}
3374
3375	if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) {
3376		r |= (1 << BDI_async_congested);
3377		reason = reason == 'b' ? 'a' : 'n';
3378	}
 
3379
3380out:
3381	mdev->congestion_reason = reason;
3382	return r;
3383}
3384
3385struct drbd_conf *drbd_new_device(unsigned int minor)
 
3386{
3387	struct drbd_conf *mdev;
3388	struct gendisk *disk;
3389	struct request_queue *q;
3390
3391	/* GFP_KERNEL, we are outside of all write-out paths */
3392	mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3393	if (!mdev)
3394		return NULL;
3395	if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3396		goto out_no_cpumask;
3397
3398	mdev->minor = minor;
 
 
3399
3400	drbd_init_set_defaults(mdev);
3401
3402	q = blk_alloc_queue(GFP_KERNEL);
3403	if (!q)
3404		goto out_no_q;
3405	mdev->rq_queue = q;
3406	q->queuedata   = mdev;
3407
3408	disk = alloc_disk(1);
3409	if (!disk)
3410		goto out_no_disk;
3411	mdev->vdisk = disk;
3412
3413	set_disk_ro(disk, true);
3414
3415	disk->queue = q;
3416	disk->major = DRBD_MAJOR;
3417	disk->first_minor = minor;
3418	disk->fops = &drbd_ops;
3419	sprintf(disk->disk_name, "drbd%d", minor);
3420	disk->private_data = mdev;
3421
3422	mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3423	/* we have no partitions. we contain only ourselves. */
3424	mdev->this_bdev->bd_contains = mdev->this_bdev;
3425
3426	q->backing_dev_info.congested_fn = drbd_congested;
3427	q->backing_dev_info.congested_data = mdev;
3428
3429	blk_queue_make_request(q, drbd_make_request);
3430	/* Setting the max_hw_sectors to an odd value of 8 KiB here;
3431	   this triggers a max_bio_size message upon first attach or connect */
3432	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3433	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3434	blk_queue_merge_bvec(q, drbd_merge_bvec);
3435	q->queue_lock = &mdev->req_lock;
3436
3437	mdev->md_io_page = alloc_page(GFP_KERNEL);
3438	if (!mdev->md_io_page)
3439		goto out_no_io_page;
3440
3441	if (drbd_bm_init(mdev))
3442		goto out_no_bitmap;
3443	/* no need to lock access, we are still initializing this minor device. */
3444	if (!tl_init(mdev))
3445		goto out_no_tl;
3446
3447	mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3448	if (!mdev->app_reads_hash)
3449		goto out_no_app_reads;
3450
3451	mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3452	if (!mdev->current_epoch)
3453		goto out_no_epoch;
3454
3455	INIT_LIST_HEAD(&mdev->current_epoch->list);
3456	mdev->epochs = 1;
3457
3458	return mdev;
3459
3460/* out_whatever_else:
3461	kfree(mdev->current_epoch); */
3462out_no_epoch:
3463	kfree(mdev->app_reads_hash);
3464out_no_app_reads:
3465	tl_cleanup(mdev);
3466out_no_tl:
3467	drbd_bm_cleanup(mdev);
3468out_no_bitmap:
3469	__free_page(mdev->md_io_page);
3470out_no_io_page:
3471	put_disk(disk);
3472out_no_disk:
3473	blk_cleanup_queue(q);
3474out_no_q:
3475	free_cpumask_var(mdev->cpu_mask);
3476out_no_cpumask:
3477	kfree(mdev);
3478	return NULL;
3479}
3480
3481/* counterpart of drbd_new_device.
3482 * last part of drbd_delete_device. */
3483void drbd_free_mdev(struct drbd_conf *mdev)
3484{
3485	kfree(mdev->current_epoch);
3486	kfree(mdev->app_reads_hash);
3487	tl_cleanup(mdev);
3488	if (mdev->bitmap) /* should no longer be there. */
3489		drbd_bm_cleanup(mdev);
3490	__free_page(mdev->md_io_page);
3491	put_disk(mdev->vdisk);
3492	blk_cleanup_queue(mdev->rq_queue);
3493	free_cpumask_var(mdev->cpu_mask);
3494	drbd_free_tl_hash(mdev);
3495	kfree(mdev);
3496}
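/* Illustrative sketch (not from the original source): drbd_new_device()
 * and drbd_free_mdev() are intended to be used as a pair; a hypothetical
 * setup/teardown path would look roughly like:
 *
 *	struct drbd_conf *mdev = drbd_new_device(minor);
 *	if (!mdev)
 *		return -ENOMEM;
 *	... register the gendisk, insert into minor_table, etc. ...
 *	drbd_free_mdev(mdev);	(on the final delete path)
 *
 * Note how each error label in drbd_new_device() unwinds exactly the
 * allocations made before it, in reverse order.
 */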
3497
3498
3499int __init drbd_init(void)
3500{
3501	int err;
3502
3503	if (sizeof(struct p_handshake) != 80) {
3504		printk(KERN_ERR
3505		       "drbd: never change the size or layout "
3506		       "of the HandShake packet.\n");
3507		return -EINVAL;
3508	}
3509
3510	if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3511		printk(KERN_ERR
3512			"drbd: invalid minor_count (%d)\n", minor_count);
3513#ifdef MODULE
3514		return -EINVAL;
3515#else
3516		minor_count = 8;
3517#endif
3518	}
3519
3520	err = drbd_nl_init();
3521	if (err)
3522		return err;
3523
3524	err = register_blkdev(DRBD_MAJOR, "drbd");
3525	if (err) {
3526		printk(KERN_ERR
3527		       "drbd: unable to register block device major %d\n",
3528		       DRBD_MAJOR);
3529		return err;
3530	}
3531
3532	register_reboot_notifier(&drbd_notifier);
3533
3534	/*
3535	 * allocate all necessary structs
3536	 */
3537	err = -ENOMEM;
3538
3539	init_waitqueue_head(&drbd_pp_wait);
3540
3541	drbd_proc = NULL; /* play safe for drbd_cleanup */
3542	minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3543				GFP_KERNEL);
3544	if (!minor_table)
3545		goto Enomem;
3546
3547	err = drbd_create_mempools();
3548	if (err)
3549		goto Enomem;
3550
3551	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
 
3552	if (!drbd_proc)	{
3553		printk(KERN_ERR "drbd: unable to register proc file\n");
3554		goto Enomem;
3555	}
 
 
 
3556
3557	rwlock_init(&global_state_lock);
3558
3559	printk(KERN_INFO "drbd: initialized. "
3560	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3561	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3562	printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3563	printk(KERN_INFO "drbd: registered as block device major %d\n",
3564		DRBD_MAJOR);
3565	printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3566
3567	return 0; /* Success! */
3568
3569Enomem:
3570	drbd_cleanup();
3571	if (err == -ENOMEM)
3572		/* currently always the case */
3573		printk(KERN_ERR "drbd: ran out of memory\n");
3574	else
3575		printk(KERN_ERR "drbd: initialization failure\n");
3576	return err;
3577}
3578
3579void drbd_free_bc(struct drbd_backing_dev *ldev)
3580{
3581	if (ldev == NULL)
3582		return;
3583
3584	blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3585	blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3586
3587	kfree(ldev);
 
 
 
 
3588}
3589
3590void drbd_free_sock(struct drbd_conf *mdev)
3591{
3592	if (mdev->data.socket) {
3593		mutex_lock(&mdev->data.mutex);
3594		kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3595		sock_release(mdev->data.socket);
3596		mdev->data.socket = NULL;
3597		mutex_unlock(&mdev->data.mutex);
3598	}
3599	if (mdev->meta.socket) {
3600		mutex_lock(&mdev->meta.mutex);
3601		kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3602		sock_release(mdev->meta.socket);
3603		mdev->meta.socket = NULL;
3604		mutex_unlock(&mdev->meta.mutex);
3605	}
3606}
3607
 
3608
3609void drbd_free_resources(struct drbd_conf *mdev)
3610{
3611	crypto_free_hash(mdev->csums_tfm);
3612	mdev->csums_tfm = NULL;
3613	crypto_free_hash(mdev->verify_tfm);
3614	mdev->verify_tfm = NULL;
3615	crypto_free_hash(mdev->cram_hmac_tfm);
3616	mdev->cram_hmac_tfm = NULL;
3617	crypto_free_hash(mdev->integrity_w_tfm);
3618	mdev->integrity_w_tfm = NULL;
3619	crypto_free_hash(mdev->integrity_r_tfm);
3620	mdev->integrity_r_tfm = NULL;
3621
3622	drbd_free_sock(mdev);
 
 
3623
3624	__no_warn(local,
3625		  drbd_free_bc(mdev->ldev);
3626		  mdev->ldev = NULL;);
 
 
 
 
3627}
3628
3629/* meta data management */
3630
3631struct meta_data_on_disk {
3632	u64 la_size;           /* last agreed size. */
3633	u64 uuid[UI_SIZE];   /* UUIDs. */
3634	u64 device_uuid;
3635	u64 reserved_u64_1;
3636	u32 flags;             /* MDF */
3637	u32 magic;
3638	u32 md_size_sect;
3639	u32 al_offset;         /* offset to this block */
3640	u32 al_nr_extents;     /* important for restoring the AL */
3641	      /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3642	u32 bm_offset;         /* offset to the bitmap, from here */
3643	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3644	u32 la_peer_max_bio_size;   /* last peer max_bio_size */
3645	u32 reserved_u32[3];
3646
 
 
 
 
 
3647} __packed;
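/* Illustrative note (not from the original source): the struct above is the
 * on-disk layout of the meta data super block.  All multi-byte fields are
 * stored big-endian (see the cpu_to_be*()/be*_to_cpu() conversions in
 * drbd_md_sync()/drbd_md_read() below), and the whole block is written as
 * one 512-byte sector.  A compile-time guard for that assumption could be:
 *
 *	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) > 512);
 */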
3648
3649/**
3650 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3651 * @mdev:	DRBD device.
3652 */
3653void drbd_md_sync(struct drbd_conf *mdev)
3654{
3655	struct meta_data_on_disk *buffer;
3656	sector_t sector;
3657	int i;
3658
3659	del_timer(&mdev->md_sync_timer);
 
 
 
 
3660	/* timer may be rearmed by drbd_md_mark_dirty() now. */
3661	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
3662		return;
3663
3664	/* We use here D_FAILED and not D_ATTACHING because we try to write
3665	 * metadata even if we detach due to a disk failure! */
3666	if (!get_ldev_if_state(mdev, D_FAILED))
3667		return;
3668
3669	mutex_lock(&mdev->md_io_mutex);
3670	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3671	memset(buffer, 0, 512);
3672
3673	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3674	for (i = UI_CURRENT; i < UI_SIZE; i++)
3675		buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3676	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3677	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3678
3679	buffer->md_size_sect  = cpu_to_be32(mdev->ldev->md.md_size_sect);
3680	buffer->al_offset     = cpu_to_be32(mdev->ldev->md.al_offset);
3681	buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3682	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3683	buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3684
3685	buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3686	buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
 
3687
3688	D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3689	sector = mdev->ldev->md.md_offset;
 
 
3690
3691	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3692		/* this was just an attempt anyway ... */
3693		dev_err(DEV, "meta data update failed!\n");
3694		drbd_chk_io_error(mdev, 1, true);
3695	}
3696
3697	/* Update mdev->ldev->md.la_size_sect,
3698	 * since we just updated it in the on-disk metadata. */
3699	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3700
3701	mutex_unlock(&mdev->md_io_mutex);
3702	put_ldev(mdev);
3703}
3704
 
3705/**
3706 * drbd_md_read() - Reads in the meta data super block
3707 * @mdev:	DRBD device.
3708 * @bdev:	Device from which the meta data should be read in.
3709 *
3710 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3711 * something goes wrong.  Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
 
 
 
3712 */
3713int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3714{
3715	struct meta_data_on_disk *buffer;
 
3716	int i, rv = NO_ERROR;
3717
3718	if (!get_ldev_if_state(mdev, D_ATTACHING))
3719		return ERR_IO_MD_DISK;
3720
3721	mutex_lock(&mdev->md_io_mutex);
3722	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
3723
3724	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
 
3725		/* NOTE: can't do normal error processing here as this is
3726		   called BEFORE disk is attached */
3727		dev_err(DEV, "Error while reading metadata.\n");
3728		rv = ERR_IO_MD_DISK;
3729		goto err;
3730	}
3731
3732	if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3733		dev_err(DEV, "Error while reading metadata, magic not found.\n");
3734		rv = ERR_MD_INVALID;
 
 
 
 
3735		goto err;
3736	}
3737	if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3738		dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3739		    be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3740		rv = ERR_MD_INVALID;
3741		goto err;
3742	}
3743	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3744		dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3745		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3746		rv = ERR_MD_INVALID;
3747		goto err;
3748	}
3749	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3750		dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3751		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3752		rv = ERR_MD_INVALID;
3753		goto err;
3754	}
3755
3756	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3757		dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3758		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3759		rv = ERR_MD_INVALID;
3760		goto err;
3761	}
3762
3763	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
 
 
3764	for (i = UI_CURRENT; i < UI_SIZE; i++)
3765		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3766	bdev->md.flags = be32_to_cpu(buffer->flags);
3767	mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3768	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3769
3770	spin_lock_irq(&mdev->req_lock);
3771	if (mdev->state.conn < C_CONNECTED) {
3772		int peer;
3773		peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3774		peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
3775		mdev->peer_max_bio_size = peer;
3776	}
3777	spin_unlock_irq(&mdev->req_lock);
3778
3779	if (mdev->sync_conf.al_extents < 7)
3780		mdev->sync_conf.al_extents = 127;
3781
3782 err:
3783	mutex_unlock(&mdev->md_io_mutex);
3784	put_ldev(mdev);
3785
3786	return rv;
3787}
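/* Illustrative usage sketch (not from the original source): drbd_md_read()
 * runs early in the attach path, while the disk is still D_ATTACHING,
 * roughly like (nbc and the error label are placeholder names):
 *
 *	retcode = drbd_md_read(mdev, nbc);
 *	if (retcode != NO_ERROR)
 *		goto force_diskless;
 *
 * On success the in-memory copy (UUIDs, flags, al_extents) reflects what
 * was found on disk; on ERR_MD_INVALID the backing device must not be used.
 */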
3788
3789/**
3790 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3791 * @mdev:	DRBD device.
3792 *
3793 * Call this function if you change anything that should be written to
3794 * the meta-data super block. This function sets MD_DIRTY, and starts a
3795 * timer that ensures drbd_md_sync() is called within five seconds.
3796 */
3797#ifdef DEBUG
3798void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
3799{
3800	if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
3801		mod_timer(&mdev->md_sync_timer, jiffies + HZ);
3802		mdev->last_md_mark_dirty.line = line;
3803		mdev->last_md_mark_dirty.func = func;
3804	}
3805}
3806#else
3807void drbd_md_mark_dirty(struct drbd_conf *mdev)
3808{
3809	if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
3810		mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
3811}
3812#endif
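/* Illustrative usage sketch (not from the original source): the usual
 * pattern is to change an in-memory md field and only mark it dirty; the
 * md_sync_timer then guarantees drbd_md_sync() runs within five seconds:
 *
 *	mdev->ldev->md.uuid[UI_HISTORY_START] = new_val;   (new_val: placeholder)
 *	drbd_md_mark_dirty(mdev);
 *
 * Callers that need the update on stable storage immediately call
 * drbd_md_sync(mdev) themselves, as drbd_uuid_new_current() does below.
 */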
3813
3814static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
3815{
3816	int i;
3817
3818	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3819		mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
3820}
3821
3822void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3823{
3824	if (idx == UI_CURRENT) {
3825		if (mdev->state.role == R_PRIMARY)
3826			val |= 1;
3827		else
3828			val &= ~((u64)1);
3829
3830		drbd_set_ed_uuid(mdev, val);
3831	}
3832
3833	mdev->ldev->md.uuid[idx] = val;
3834	drbd_md_mark_dirty(mdev);
3835}
3836
3837
3838void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
3839{
3840	if (mdev->ldev->md.uuid[idx]) {
3841		drbd_uuid_move_history(mdev);
3842		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
 
 
3843	}
3844	_drbd_uuid_set(mdev, idx, val);
 
3845}
3846
3847/**
3848 * drbd_uuid_new_current() - Creates a new current UUID
3849 * @mdev:	DRBD device.
3850 *
3851 * Creates a new current UUID, and rotates the old current UUID into
3852 * the bitmap slot. Causes an incremental resync upon next connect.
3853 */
3854void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
3855{
3856	u64 val;
3857	unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
 
 
 
 
 
3858
3859	if (bm_uuid)
3860		dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3861
3862	mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
 
 
3863
3864	get_random_bytes(&val, sizeof(u64));
3865	_drbd_uuid_set(mdev, UI_CURRENT, val);
3866	drbd_print_uuids(mdev, "new current UUID");
3867	/* get it to stable storage _now_ */
3868	drbd_md_sync(mdev);
3869}
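/* Illustrative sketch (not from the original source) of the rotation done
 * by drbd_uuid_new_current(), assuming the bitmap slot was empty:
 *
 *	before:  CURRENT = C,                BITMAP = 0
 *	after :  CURRENT = <random value>,   BITMAP = C
 *
 * A typical (hypothetical) call site is a promotion to Primary while
 * disconnected:
 *
 *	if (get_ldev(mdev)) {
 *		drbd_uuid_new_current(mdev);
 *		put_ldev(mdev);
 *	}
 */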
3870
3871void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
3872{
3873	if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
 
3874		return;
3875
 
3876	if (val == 0) {
3877		drbd_uuid_move_history(mdev);
3878		mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
3879		mdev->ldev->md.uuid[UI_BITMAP] = 0;
3880	} else {
3881		unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
3882		if (bm_uuid)
3883			dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
3884
3885		mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
3886	}
3887	drbd_md_mark_dirty(mdev);
 
 
3888}
3889
3890/**
3891 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3892 * @mdev:	DRBD device.
3893 *
3894 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3895 */
3896int drbd_bmio_set_n_write(struct drbd_conf *mdev)
3897{
3898	int rv = -EIO;
3899
3900	if (get_ldev_if_state(mdev, D_ATTACHING)) {
3901		drbd_md_set_flag(mdev, MDF_FULL_SYNC);
3902		drbd_md_sync(mdev);
3903		drbd_bm_set_all(mdev);
3904
3905		rv = drbd_bm_write(mdev);
3906
3907		if (!rv) {
3908			drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
3909			drbd_md_sync(mdev);
3910		}
3911
3912		put_ldev(mdev);
 
 
 
 
3913	}
3914
3915	return rv;
3916}
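/* Illustrative usage sketch (not from the original source): this io_fn is
 * not called directly but handed to the bitmap IO helpers further below,
 * along the lines of (string and flags value are examples only):
 *
 *	drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *		       "set_n_write from invalidate", BM_LOCKED_SET_ALLOWED);
 *
 * Setting MDF_FULL_SYNC before, and clearing it only after the bitmap hit
 * stable storage, keeps a crash in between recoverable: the full sync is
 * simply redone on the next attach.
 */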
3917
3918/**
3919 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3920 * @mdev:	DRBD device.
3921 *
3922 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3923 */
3924int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
3925{
3926	int rv = -EIO;
3927
3928	drbd_resume_al(mdev);
3929	if (get_ldev_if_state(mdev, D_ATTACHING)) {
3930		drbd_bm_clear_all(mdev);
3931		rv = drbd_bm_write(mdev);
3932		put_ldev(mdev);
3933	}
3934
3935	return rv;
3936}
3937
3938static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3939{
3940	struct bm_io_work *work = container_of(w, struct bm_io_work, w);
 
 
3941	int rv = -EIO;
3942
3943	D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
3944
3945	if (get_ldev(mdev)) {
3946		drbd_bm_lock(mdev, work->why, work->flags);
3947		rv = work->io_fn(mdev);
3948		drbd_bm_unlock(mdev);
3949		put_ldev(mdev);
 
 
 
 
 
3950	}
3951
3952	clear_bit(BITMAP_IO, &mdev->flags);
3953	smp_mb__after_clear_bit();
3954	wake_up(&mdev->misc_wait);
3955
3956	if (work->done)
3957		work->done(mdev, rv);
3958
3959	clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
3960	work->why = NULL;
3961	work->flags = 0;
3962
3963	return 1;
3964}
3965
3966void drbd_ldev_destroy(struct drbd_conf *mdev)
3967{
3968	lc_destroy(mdev->resync);
3969	mdev->resync = NULL;
3970	lc_destroy(mdev->act_log);
3971	mdev->act_log = NULL;
3972	__no_warn(local,
3973		drbd_free_bc(mdev->ldev);
3974		mdev->ldev = NULL;);
3975
3976	if (mdev->md_io_tmpp) {
3977		__free_page(mdev->md_io_tmpp);
3978		mdev->md_io_tmpp = NULL;
3979	}
3980	clear_bit(GO_DISKLESS, &mdev->flags);
3981}
3982
3983static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
3984{
3985	D_ASSERT(mdev->state.disk == D_FAILED);
3986	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
3987	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
3988	 * the protected members anymore, though, so once put_ldev reaches zero
3989	 * again, it will be safe to free them. */
3990	drbd_force_state(mdev, NS(disk, D_DISKLESS));
3991	return 1;
3992}
3993
3994void drbd_go_diskless(struct drbd_conf *mdev)
3995{
3996	D_ASSERT(mdev->state.disk == D_FAILED);
3997	if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
3998		drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
3999}
4000
4001/**
4002 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4003 * @mdev:	DRBD device.
4004 * @io_fn:	IO callback to be called when bitmap IO is possible
4005 * @done:	callback to be called after the bitmap IO was performed
4006 * @why:	Descriptive text of the reason for doing the IO
4007 *
4008 * While IO on the bitmap happens we freeze application IO, thus ensuring
4009 * that drbd_set_out_of_sync() cannot be called. This function MAY ONLY be
4010 * called from worker context. It MUST NOT be used while a previous such
4011 * work is still pending!
 
 
 
4012 */
4013void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4014			  int (*io_fn)(struct drbd_conf *),
4015			  void (*done)(struct drbd_conf *, int),
4016			  char *why, enum bm_flag flags)
4017{
4018	D_ASSERT(current == mdev->worker.task);
4019
4020	D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
4021	D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
4022	D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4023	if (mdev->bm_io_work.why)
4024		dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4025			why, mdev->bm_io_work.why);
4026
4027	mdev->bm_io_work.io_fn = io_fn;
4028	mdev->bm_io_work.done = done;
4029	mdev->bm_io_work.why = why;
4030	mdev->bm_io_work.flags = flags;
4031
4032	spin_lock_irq(&mdev->req_lock);
4033	set_bit(BITMAP_IO, &mdev->flags);
4034	if (atomic_read(&mdev->ap_bio_cnt) == 0) {
4035		if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
4036			drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
 
 
 
4037	}
4038	spin_unlock_irq(&mdev->req_lock);
4039}
4040
4041/**
4042 * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
4043 * @mdev:	DRBD device.
4044 * @io_fn:	IO callback to be called when bitmap IO is possible
4045 * @why:	Descriptive text of the reason for doing the IO
4046 *
4047 * Freezes application IO while the actual IO operation runs. This
4048 * function MAY NOT be called from worker context.
4049 */
4050int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
4051		char *why, enum bm_flag flags)
4052{
 
 
4053	int rv;
4054
4055	D_ASSERT(current != mdev->worker.task);
4056
4057	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4058		drbd_suspend_io(mdev);
4059
4060	drbd_bm_lock(mdev, why, flags);
4061	rv = io_fn(mdev);
4062	drbd_bm_unlock(mdev);
4063
4064	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
4065		drbd_resume_io(mdev);
4066
4067	return rv;
4068}
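/* Illustrative usage sketch (not from the original source), contrasting the
 * two entry points above (callback, strings and flags are placeholders):
 *
 *	from the worker thread: queue it, get notified via 'done':
 *		drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write,
 *				     &some_done_cb, "example", BM_LOCKED_SET_ALLOWED);
 *
 *	from any other thread: run it synchronously, app IO frozen meanwhile:
 *		rv = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write,
 *				    "example", BM_LOCKED_SET_ALLOWED);
 *
 * The worker vs. non-worker split is enforced by the D_ASSERT() checks on
 * mdev->worker.task in both functions.
 */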
4069
4070void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4071{
4072	if ((mdev->ldev->md.flags & flag) != flag) {
4073		drbd_md_mark_dirty(mdev);
4074		mdev->ldev->md.flags |= flag;
4075	}
4076}
4077
4078void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
4079{
4080	if ((mdev->ldev->md.flags & flag) != 0) {
4081		drbd_md_mark_dirty(mdev);
4082		mdev->ldev->md.flags &= ~flag;
4083	}
4084}
4085int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
4086{
4087	return (bdev->md.flags & flag) != 0;
4088}
4089
4090static void md_sync_timer_fn(unsigned long data)
4091{
4092	struct drbd_conf *mdev = (struct drbd_conf *) data;
 
4093
4094	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
 
 
 
4095}
4096
4097static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4098{
4099	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
4100#ifdef DEBUG
4101	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
4102		mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
4103#endif
4104	drbd_md_sync(mdev);
4105	return 1;
4106}
4107
4108#ifdef CONFIG_DRBD_FAULT_INJECTION
4109/* Fault insertion support including random number generator shamelessly
4110 * stolen from kernel/rcutorture.c */
4111struct fault_random_state {
4112	unsigned long state;
4113	unsigned long count;
4114};
4115
4116#define FAULT_RANDOM_MULT 39916801  /* prime */
4117#define FAULT_RANDOM_ADD	479001701 /* prime */
4118#define FAULT_RANDOM_REFRESH 10000
4119
4120/*
4121 * Crude but fast random-number generator.  Uses a linear congruential
4122 * generator, with occasional help from get_random_bytes().
4123 */
4124static unsigned long
4125_drbd_fault_random(struct fault_random_state *rsp)
4126{
4127	long refresh;
4128
4129	if (!rsp->count--) {
4130		get_random_bytes(&refresh, sizeof(refresh));
4131		rsp->state += refresh;
4132		rsp->count = FAULT_RANDOM_REFRESH;
4133	}
4134	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
4135	return swahw32(rsp->state);
4136}
4137
4138static char *
4139_drbd_fault_str(unsigned int type) {
4140	static char *_faults[] = {
4141		[DRBD_FAULT_MD_WR] = "Meta-data write",
4142		[DRBD_FAULT_MD_RD] = "Meta-data read",
4143		[DRBD_FAULT_RS_WR] = "Resync write",
4144		[DRBD_FAULT_RS_RD] = "Resync read",
4145		[DRBD_FAULT_DT_WR] = "Data write",
4146		[DRBD_FAULT_DT_RD] = "Data read",
4147		[DRBD_FAULT_DT_RA] = "Data read ahead",
4148		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
4149		[DRBD_FAULT_AL_EE] = "EE allocation",
4150		[DRBD_FAULT_RECEIVE] = "receive data corruption",
4151	};
4152
4153	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
4154}
4155
4156unsigned int
4157_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
4158{
4159	static struct fault_random_state rrs = {0, 0};
4160
4161	unsigned int ret = (
4162		(fault_devs == 0 ||
4163			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
4164		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
4165
4166	if (ret) {
4167		fault_count++;
4168
4169		if (__ratelimit(&drbd_ratelimit_state))
4170			dev_warn(DEV, "***Simulating %s failure\n",
4171				_drbd_fault_str(type));
4172	}
4173
4174	return ret;
4175}
4176#endif
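/* Illustrative note (not from the original source): with
 * CONFIG_DRBD_FAULT_INJECTION enabled, fault insertion is steered by the
 * module parameters, e.g. (values are examples only):
 *
 *	modprobe drbd fault_rate=5 enable_faults=0x3 fault_devs=0x1
 *
 * which would inject roughly 5% simulated failures of the enabled fault
 * types on minor 0 only; fault_count reports how many were inserted.
 */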
4177
4178const char *drbd_buildtag(void)
4179{
4180	/* DRBD built from external sources has here a reference to the
4181	   git hash of the source code. */
4182
4183	static char buildtag[38] = "\0uilt-in";
4184
4185	if (buildtag[0] == 0) {
4186#ifdef CONFIG_MODULES
4187		if (THIS_MODULE != NULL)
4188			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
4189		else
4190#endif
4191			buildtag[0] = 'b';
4192	}
4193
4194	return buildtag;
4195}
4196
4197module_init(drbd_init)
4198module_exit(drbd_cleanup)
4199
4200EXPORT_SYMBOL(drbd_conn_str);
4201EXPORT_SYMBOL(drbd_role_str);
4202EXPORT_SYMBOL(drbd_disk_str);
4203EXPORT_SYMBOL(drbd_set_st_err_str);
v5.9
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3   drbd.c
   4
   5   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   6
   7   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   8   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   9   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  10
  11   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
  12   from Logicworks, Inc. for making SDP replication support possible.
  13
  14
  15 */
  16
  17#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
  18
  19#include <linux/module.h>
  20#include <linux/jiffies.h>
  21#include <linux/drbd.h>
  22#include <linux/uaccess.h>
  23#include <asm/types.h>
  24#include <net/sock.h>
  25#include <linux/ctype.h>
  26#include <linux/mutex.h>
  27#include <linux/fs.h>
  28#include <linux/file.h>
  29#include <linux/proc_fs.h>
  30#include <linux/init.h>
  31#include <linux/mm.h>
  32#include <linux/memcontrol.h>
  33#include <linux/mm_inline.h>
  34#include <linux/slab.h>
  35#include <linux/random.h>
  36#include <linux/reboot.h>
  37#include <linux/notifier.h>
  38#include <linux/kthread.h>
  39#include <linux/workqueue.h>
  40#define __KERNEL_SYSCALLS__
  41#include <linux/unistd.h>
  42#include <linux/vmalloc.h>
  43#include <linux/sched/signal.h>
  44
  45#include <linux/drbd_limits.h>
  46#include "drbd_int.h"
  47#include "drbd_protocol.h"
  48#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
 
  49#include "drbd_vli.h"
  50#include "drbd_debugfs.h"
  51
  52static DEFINE_MUTEX(drbd_main_mutex);
 
 
 
 
 
  53static int drbd_open(struct block_device *bdev, fmode_t mode);
  54static void drbd_release(struct gendisk *gd, fmode_t mode);
  55static void md_sync_timer_fn(struct timer_list *t);
  56static int w_bitmap_io(struct drbd_work *w, int unused);
 
 
 
 
 
  57
  58MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
  59	      "Lars Ellenberg <lars@linbit.com>");
  60MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
  61MODULE_VERSION(REL_VERSION);
  62MODULE_LICENSE("GPL");
  63MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
  64		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
  65MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
  66
  67#include <linux/moduleparam.h>
 
 
  68/* thanks to these macros, if compiled into the kernel (not as a module),
  69 * these become boot parameters (e.g., drbd.minor_count) */
 
 
 
 
 
  70
  71#ifdef CONFIG_DRBD_FAULT_INJECTION
  72int drbd_enable_faults;
  73int drbd_fault_rate;
  74static int drbd_fault_count;
  75static int drbd_fault_devs;
  76/* bitmap of enabled faults */
  77module_param_named(enable_faults, drbd_enable_faults, int, 0664);
  78/* fault rate % value - applies to all enabled faults */
  79module_param_named(fault_rate, drbd_fault_rate, int, 0664);
  80/* count of faults inserted */
  81module_param_named(fault_count, drbd_fault_count, int, 0664);
  82/* bitmap of devices to insert faults on */
  83module_param_named(fault_devs, drbd_fault_devs, int, 0644);
  84#endif
  85
  86/* module parameters we can keep static */
  87static bool drbd_allow_oos; /* allow_open_on_secondary */
  88static bool drbd_disable_sendpage;
  89MODULE_PARM_DESC(allow_oos, "DONT USE!");
  90module_param_named(allow_oos, drbd_allow_oos, bool, 0);
  91module_param_named(disable_sendpage, drbd_disable_sendpage, bool, 0644);
  92
  93/* module parameters we share */
  94int drbd_proc_details; /* Detail level in proc drbd*/
  95module_param_named(proc_details, drbd_proc_details, int, 0644);
  96/* module parameters shared with defaults */
  97unsigned int drbd_minor_count = DRBD_MINOR_COUNT_DEF;
  98/* Module parameter for setting the user mode helper program
  99 * to run. Default is /sbin/drbdadm */
 100char drbd_usermode_helper[80] = "/sbin/drbdadm";
 101module_param_named(minor_count, drbd_minor_count, uint, 0444);
 102module_param_string(usermode_helper, drbd_usermode_helper, sizeof(drbd_usermode_helper), 0644);
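/* Illustrative note (not from the original source): built as a module these
 * are ordinary module parameters; built into the kernel they become boot
 * parameters with a "drbd." prefix, e.g. (values are examples only):
 *
 *	modprobe drbd minor_count=16 usermode_helper=/sbin/drbdadm
 *	or, built-in, on the kernel command line:
 *	drbd.minor_count=16 drbd.proc_details=1
 */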
 103
 104/* in 2.6.x, our device mapping and config info contains our virtual gendisks
 105 * as member "struct gendisk *vdisk;"
 106 */
 107struct idr drbd_devices;
 108struct list_head drbd_resources;
 109struct mutex resources_mutex;
 110
 111struct kmem_cache *drbd_request_cache;
 112struct kmem_cache *drbd_ee_cache;	/* peer requests */
 113struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 114struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
 115mempool_t drbd_request_mempool;
 116mempool_t drbd_ee_mempool;
 117mempool_t drbd_md_io_page_pool;
 118struct bio_set drbd_md_io_bio_set;
 119struct bio_set drbd_io_bio_set;
 120
 121/* I do not use a standard mempool, because:
 122   1) I want to hand out the pre-allocated objects first.
 123   2) I want to be able to interrupt sleeping allocation with a signal.
 124   Note: This is a singly linked list; the next pointer is the private
 125	 member of struct page.
 126 */
 127struct page *drbd_pp_pool;
 128spinlock_t   drbd_pp_lock;
 129int          drbd_pp_vacant;
 130wait_queue_head_t drbd_pp_wait;
 131
 132DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
 133
 134static const struct block_device_operations drbd_ops = {
 135	.owner		= THIS_MODULE,
 136	.submit_bio	= drbd_submit_bio,
 137	.open		= drbd_open,
 138	.release	= drbd_release,
 139};
 140
 141struct bio *bio_alloc_drbd(gfp_t gfp_mask)
 142{
 143	struct bio *bio;
 144
 145	if (!bioset_initialized(&drbd_md_io_bio_set))
 146		return bio_alloc(gfp_mask, 1);
 147
 148	bio = bio_alloc_bioset(gfp_mask, 1, &drbd_md_io_bio_set);
 149	if (!bio)
 150		return NULL;
 151	return bio;
 152}
 153
 154#ifdef __CHECKER__
 155/* When checking with sparse, and this is an inline function, sparse will
 156   give tons of false positives. When this is a real function, sparse works.
 157 */
 158int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
 159{
 160	int io_allowed;
 161
 162	atomic_inc(&device->local_cnt);
 163	io_allowed = (device->state.disk >= mins);
 164	if (!io_allowed) {
 165		if (atomic_dec_and_test(&device->local_cnt))
 166			wake_up(&device->misc_wait);
 167	}
 168	return io_allowed;
 169}
 170
 171#endif
 172
 173/**
 174 * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
 175 * @connection:	DRBD connection.
 176 * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
 177 * @set_size:	Expected number of requests before that barrier.
 178 *
 179 * In case the passed barrier_nr or set_size does not match the oldest
 180 * epoch of not yet barrier-acked requests, this function will cause a
 181 * termination of the connection.
 182 */
 183void tl_release(struct drbd_connection *connection, unsigned int barrier_nr,
 184		unsigned int set_size)
 185{
 
 
 186	struct drbd_request *r;
 187	struct drbd_request *req = NULL;
 188	int expect_epoch = 0;
 189	int expect_size = 0;
 190
 191	spin_lock_irq(&connection->resource->req_lock);
 192
 193	/* find oldest not yet barrier-acked write request,
 194	 * count writes in its epoch. */
 195	list_for_each_entry(r, &connection->transfer_log, tl_requests) {
 196		const unsigned s = r->rq_state;
 197		if (!req) {
 198			if (!(s & RQ_WRITE))
 199				continue;
 200			if (!(s & RQ_NET_MASK))
 201				continue;
 202			if (s & RQ_NET_DONE)
 203				continue;
 204			req = r;
 205			expect_epoch = req->epoch;
 206			expect_size++;
 207		} else {
 208			if (r->epoch != expect_epoch)
 209				break;
 210			if (!(s & RQ_WRITE))
 211				continue;
 212			/* if (s & RQ_DONE): not expected */
 213			/* if (!(s & RQ_NET_MASK)): not expected */
 214			expect_size++;
 215		}
 216	}
 217
 218	/* first some paranoia code */
 219	if (req == NULL) {
 220		drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
 221			 barrier_nr);
 222		goto bail;
 223	}
 224	if (expect_epoch != barrier_nr) {
 225		drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n",
 226			 barrier_nr, expect_epoch);
 227		goto bail;
 228	}
 229
 230	if (expect_size != set_size) {
 231		drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
 232			 barrier_nr, set_size, expect_size);
 233		goto bail;
 234	}
 235
 236	/* Clean up list of requests processed during current epoch. */
 237	/* this extra list walk restart is paranoia,
 238	 * to catch requests being barrier-acked "unexpectedly".
 239	 * It usually should find the same req again, or some READ preceding it. */
 240	list_for_each_entry(req, &connection->transfer_log, tl_requests)
 241		if (req->epoch == expect_epoch)
 242			break;
 243	list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
 244		if (req->epoch != expect_epoch)
 245			break;
 246		_req_mod(req, BARRIER_ACKED);
 247	}
 248	spin_unlock_irq(&connection->resource->req_lock);
 
 
 249
 250	return;
 251
 252bail:
 253	spin_unlock_irq(&connection->resource->req_lock);
 254	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
 255}
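/* Illustrative sketch (not from the original source): tl_release() is the
 * receiving end of the write-barrier protocol.  Once the peer has written
 * all requests of an epoch, it acknowledges the barrier packet and the ack
 * receiver then calls roughly:
 *
 *	tl_release(connection, barrier_nr, set_size);
 *
 * with both values taken from the ack.  Any mismatch against the oldest
 * not yet barrier-acked epoch in the transfer log is treated as a protocol
 * error and tears the connection down (C_PROTOCOL_ERROR above).
 */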
 256
 257
 258/**
 259 * _tl_restart() - Walks the transfer log, and applies an action to all requests
 260 * @connection:	DRBD connection to operate on.
 261 * @what:       The action/event to perform with all request objects
 262 *
 263 * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
 264 * RESTART_FROZEN_DISK_IO.
 265 */
 266/* must hold resource->req_lock */
 267void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
 268{
 269	struct drbd_request *req, *r;
 270
 271	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests)
 272		_req_mod(req, what);
 273}
 274
 275void tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
 276{
 277	spin_lock_irq(&connection->resource->req_lock);
 278	_tl_restart(connection, what);
 279	spin_unlock_irq(&connection->resource->req_lock);
 280}
 281
 282/**
 283 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
 284 * @connection:	DRBD connection.
 285 *
 286 * This is called after the connection to the peer was lost. The storage covered
 287 * by the requests on the transfer log gets marked as out of sync. Called from the
 288 * receiver thread and the worker thread.
 289 */
 290void tl_clear(struct drbd_connection *connection)
 291{
 292	tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
 293}
 294
 295/**
 296 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL
 297 * @device:	DRBD device.
 
 
 
 298 */
 299void tl_abort_disk_io(struct drbd_device *device)
 
 300{
 301	struct drbd_connection *connection = first_peer_device(device)->connection;
 302	struct drbd_request *req, *r;
 
 303
 304	spin_lock_irq(&connection->resource->req_lock);
 305	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
 306		if (!(req->rq_state & RQ_LOCAL_PENDING))
 307			continue;
 308		if (req->device != device)
 309			continue;
 310		_req_mod(req, ABORT_DISK_IO);
 311	}
 312	spin_unlock_irq(&connection->resource->req_lock);
 313}
 314
 
 315static int drbd_thread_setup(void *arg)
 316{
 317	struct drbd_thread *thi = (struct drbd_thread *) arg;
 318	struct drbd_resource *resource = thi->resource;
 319	unsigned long flags;
 320	int retval;
 321
 322	snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
 323		 thi->name[0],
 324		 resource->name);
 325
 326	allow_kernel_signal(DRBD_SIGKILL);
 327	allow_kernel_signal(SIGXCPU);
 328restart:
 329	retval = thi->function(thi);
 330
 331	spin_lock_irqsave(&thi->t_lock, flags);
 332
 333	/* if the receiver has been "EXITING", the last thing it did
 334	 * was set the conn state to "StandAlone",
 335	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
 336	 * and receiver thread will be "started".
 337	 * drbd_thread_start needs to set "RESTARTING" in that case.
 338	 * t_state check and assignment needs to be within the same spinlock,
 339	 * so either thread_start sees EXITING, and can remap to RESTARTING,
 340	 * or thread_start see NONE, and can proceed as normal.
 341	 */
 342
 343	if (thi->t_state == RESTARTING) {
 344		drbd_info(resource, "Restarting %s thread\n", thi->name);
 345		thi->t_state = RUNNING;
 346		spin_unlock_irqrestore(&thi->t_lock, flags);
 347		goto restart;
 348	}
 349
 350	thi->task = NULL;
 351	thi->t_state = NONE;
 352	smp_mb();
 353	complete_all(&thi->stop);
 354	spin_unlock_irqrestore(&thi->t_lock, flags);
 355
 356	drbd_info(resource, "Terminating %s\n", current->comm);
 357
 358	/* Release mod reference taken when thread was started */
 359
 360	if (thi->connection)
 361		kref_put(&thi->connection->kref, drbd_destroy_connection);
 362	kref_put(&resource->kref, drbd_destroy_resource);
 363	module_put(THIS_MODULE);
 364	return retval;
 365}
 366
 367static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi,
 368			     int (*func) (struct drbd_thread *), const char *name)
 369{
 370	spin_lock_init(&thi->t_lock);
 371	thi->task    = NULL;
 372	thi->t_state = NONE;
 373	thi->function = func;
 374	thi->resource = resource;
 375	thi->connection = NULL;
 376	thi->name = name;
 377}
 378
 379int drbd_thread_start(struct drbd_thread *thi)
 380{
 381	struct drbd_resource *resource = thi->resource;
 382	struct task_struct *nt;
 383	unsigned long flags;
 384
 
 
 
 
 
 385	/* is used from state engine doing drbd_thread_stop_nowait,
 386	 * while holding the req lock irqsave */
 387	spin_lock_irqsave(&thi->t_lock, flags);
 388
 389	switch (thi->t_state) {
 390	case NONE:
 391		drbd_info(resource, "Starting %s thread (from %s [%d])\n",
 392			 thi->name, current->comm, current->pid);
 393
 394		/* Get ref on module for thread - this is released when thread exits */
 395		if (!try_module_get(THIS_MODULE)) {
 396			drbd_err(resource, "Failed to get module reference in drbd_thread_start\n");
 397			spin_unlock_irqrestore(&thi->t_lock, flags);
 398			return false;
 399		}
 400
 401		kref_get(&resource->kref);
 402		if (thi->connection)
 403			kref_get(&thi->connection->kref);
 404
 405		init_completion(&thi->stop);
 
 406		thi->reset_cpu_mask = 1;
 407		thi->t_state = RUNNING;
 408		spin_unlock_irqrestore(&thi->t_lock, flags);
 409		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
 410
 411		nt = kthread_create(drbd_thread_setup, (void *) thi,
 412				    "drbd_%c_%s", thi->name[0], thi->resource->name);
 413
 414		if (IS_ERR(nt)) {
 415			drbd_err(resource, "Couldn't start thread\n");
 416
 417			if (thi->connection)
 418				kref_put(&thi->connection->kref, drbd_destroy_connection);
 419			kref_put(&resource->kref, drbd_destroy_resource);
 420			module_put(THIS_MODULE);
 421			return false;
 422		}
 423		spin_lock_irqsave(&thi->t_lock, flags);
 424		thi->task = nt;
 425		thi->t_state = RUNNING;
 426		spin_unlock_irqrestore(&thi->t_lock, flags);
 427		wake_up_process(nt);
 428		break;
 429	case EXITING:
 430		thi->t_state = RESTARTING;
 431		drbd_info(resource, "Restarting %s thread (from %s [%d])\n",
 432				thi->name, current->comm, current->pid);
 433		fallthrough;
 434	case RUNNING:
 435	case RESTARTING:
 436	default:
 437		spin_unlock_irqrestore(&thi->t_lock, flags);
 438		break;
 439	}
 440
 441	return true;
 442}
 443
 444
 445void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
 446{
 447	unsigned long flags;
 448
 449	enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
 450
 451	/* may be called from state engine, holding the req lock irqsave */
 452	spin_lock_irqsave(&thi->t_lock, flags);
 453
 454	if (thi->t_state == NONE) {
 455		spin_unlock_irqrestore(&thi->t_lock, flags);
 456		if (restart)
 457			drbd_thread_start(thi);
 458		return;
 459	}
 460
 461	if (thi->t_state != ns) {
 462		if (thi->task == NULL) {
 463			spin_unlock_irqrestore(&thi->t_lock, flags);
 464			return;
 465		}
 466
 467		thi->t_state = ns;
 468		smp_mb();
 469		init_completion(&thi->stop);
 470		if (thi->task != current)
 471			send_sig(DRBD_SIGKILL, thi->task, 1);
 
 472	}
 473
 474	spin_unlock_irqrestore(&thi->t_lock, flags);
 475
 476	if (wait)
 477		wait_for_completion(&thi->stop);
 478}
 479
 480int conn_lowest_minor(struct drbd_connection *connection)
 481{
 482	struct drbd_peer_device *peer_device;
 483	int vnr = 0, minor = -1;
 484
 485	rcu_read_lock();
 486	peer_device = idr_get_next(&connection->peer_devices, &vnr);
 487	if (peer_device)
 488		minor = device_to_minor(peer_device->device);
 489	rcu_read_unlock();
 490
 491	return minor;
 492}
 493
 494#ifdef CONFIG_SMP
 495/**
 496 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
 
 497 *
 498 * Forces all threads of a resource onto the same CPU. This is beneficial for
 499 * DRBD's performance. May be overridden by the user's configuration.
 500 */
 501static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask)
 502{
 503	unsigned int *resources_per_cpu, min_index = ~0;
 504
 505	resources_per_cpu = kcalloc(nr_cpu_ids, sizeof(*resources_per_cpu),
 506				    GFP_KERNEL);
 507	if (resources_per_cpu) {
 508		struct drbd_resource *resource;
 509		unsigned int cpu, min = ~0;
 510
 511		rcu_read_lock();
 512		for_each_resource_rcu(resource, &drbd_resources) {
 513			for_each_cpu(cpu, resource->cpu_mask)
 514				resources_per_cpu[cpu]++;
 515		}
 516		rcu_read_unlock();
 517		for_each_online_cpu(cpu) {
 518			if (resources_per_cpu[cpu] < min) {
 519				min = resources_per_cpu[cpu];
 520				min_index = cpu;
 521			}
 522		}
 523		kfree(resources_per_cpu);
 524	}
 525	if (min_index == ~0) {
 526		cpumask_setall(*cpu_mask);
 527		return;
 528	}
 529	cpumask_set_cpu(min_index, *cpu_mask);
 530}
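/* Illustrative sketch (not from the original source): the loop above simply
 * picks the online CPU that currently hosts the fewest resources, e.g. with
 * three existing resources each pinned to one of four CPUs:
 *
 *	resources_per_cpu[] = { 2, 1, 0, 0 }   ->   min_index = 2
 *	cpumask_set_cpu(2, *cpu_mask);
 *
 * so all threads of the new resource get pinned to CPU 2 until the user
 * overrides the mask via configuration.
 */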
 531
 532/**
 533 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
 534 * @device:	DRBD device.
 535 * @thi:	drbd_thread object
 536 *
 537 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
 538 * prematurely.
 539 */
 540void drbd_thread_current_set_cpu(struct drbd_thread *thi)
 541{
 542	struct drbd_resource *resource = thi->resource;
 543	struct task_struct *p = current;
 544
 545	if (!thi->reset_cpu_mask)
 546		return;
 547	thi->reset_cpu_mask = 0;
 548	set_cpus_allowed_ptr(p, resource->cpu_mask);
 549}
 550#else
 551#define drbd_calc_cpu_mask(A) ({})
 552#endif
 553
 554/**
 555 * drbd_header_size  -  size of a packet header
 556 *
 557 * The header size is a multiple of 8, so any payload following the header is
 558 * word aligned on 64-bit architectures.  (The bitmap send and receive code
 559 * relies on this.)
 560 */
 561unsigned int drbd_header_size(struct drbd_connection *connection)
 562{
 563	if (connection->agreed_pro_version >= 100) {
 564		BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
 565		return sizeof(struct p_header100);
 566	} else {
 567		BUILD_BUG_ON(sizeof(struct p_header80) !=
 568			     sizeof(struct p_header95));
 569		BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
 570		return sizeof(struct p_header80);
 571	}
 572}
 573
 574static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
 575{
 576	h->magic   = cpu_to_be32(DRBD_MAGIC);
 577	h->command = cpu_to_be16(cmd);
 578	h->length  = cpu_to_be16(size);
 579	return sizeof(struct p_header80);
 580}
 581
 582static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
 583{
 584	h->magic   = cpu_to_be16(DRBD_MAGIC_BIG);
 585	h->command = cpu_to_be16(cmd);
 586	h->length = cpu_to_be32(size);
 587	return sizeof(struct p_header95);
 588}
 589
 590static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
 591				      int size, int vnr)
 592{
 593	h->magic = cpu_to_be32(DRBD_MAGIC_100);
 594	h->volume = cpu_to_be16(vnr);
 595	h->command = cpu_to_be16(cmd);
 596	h->length = cpu_to_be32(size);
 597	h->pad = 0;
 598	return sizeof(struct p_header100);
 599}
 600
 601static unsigned int prepare_header(struct drbd_connection *connection, int vnr,
 602				   void *buffer, enum drbd_packet cmd, int size)
 603{
 604	if (connection->agreed_pro_version >= 100)
 605		return prepare_header100(buffer, cmd, size, vnr);
 606	else if (connection->agreed_pro_version >= 95 &&
 607		 size > DRBD_MAX_SIZE_H80_PACKET)
 608		return prepare_header95(buffer, cmd, size);
 609	else
 610		return prepare_header80(buffer, cmd, size);
 611}
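/* Illustrative summary (not from the original source) of the header choice
 * implemented above:
 *
 *	agreed_pro_version >= 100                     -> p_header100 (carries volume nr)
 *	>= 95 and size > DRBD_MAX_SIZE_H80_PACKET     -> p_header95  (32-bit length)
 *	otherwise                                     -> p_header80  (16-bit length)
 *
 * e.g. a large bitmap packet on protocol 96 goes out with a p_header95,
 * while a small P_PING on the same connection still fits a p_header80.
 */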
 612
 613static void *__conn_prepare_command(struct drbd_connection *connection,
 614				    struct drbd_socket *sock)
 615{
 616	if (!sock->socket)
 617		return NULL;
 618	return sock->sbuf + drbd_header_size(connection);
 619}
 620
 621void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock)
 
 
 
 
 622{
 623	void *p;
 
 624
 625	mutex_lock(&sock->mutex);
 626	p = __conn_prepare_command(connection, sock);
 627	if (!p)
 628		mutex_unlock(&sock->mutex);
 
 
 
 629
 630	return p;
 631}
 
 
 632
 633void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock)
 634{
 635	return conn_prepare_command(peer_device->connection, sock);
 
 
 636}
 637
 638static int __send_command(struct drbd_connection *connection, int vnr,
 639			  struct drbd_socket *sock, enum drbd_packet cmd,
 640			  unsigned int header_size, void *data,
 641			  unsigned int size)
 642{
 643	int msg_flags;
 644	int err;
 645
 646	/*
 647	 * Called with @data == NULL and the size of the data blocks in @size
 648	 * for commands that send data blocks.  For those commands, omit the
 649	 * MSG_MORE flag: this will increase the likelihood that data blocks
 650	 * which are page aligned on the sender will end up page aligned on the
 651	 * receiver.
 652	 */
 653	msg_flags = data ? MSG_MORE : 0;
 654
 655	header_size += prepare_header(connection, vnr, sock->sbuf, cmd,
 656				      header_size + size);
 657	err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size,
 658			    msg_flags);
 659	if (data && !err)
 660		err = drbd_send_all(connection, sock->socket, data, size, 0);
 661	/* DRBD protocol "pings" are latency critical.
 662	 * This is supposed to trigger tcp_push_pending_frames() */
 663	if (!err && (cmd == P_PING || cmd == P_PING_ACK))
 664		tcp_sock_set_nodelay(sock->socket->sk);
 665
 666	return err;
 667}
 
 
 668
 669static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
 670			       enum drbd_packet cmd, unsigned int header_size,
 671			       void *data, unsigned int size)
 672{
 673	return __send_command(connection, 0, sock, cmd, header_size, data, size);
 674}
 675
 676int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
 677		      enum drbd_packet cmd, unsigned int header_size,
 678		      void *data, unsigned int size)
 679{
 680	int err;
 681
 682	err = __conn_send_command(connection, sock, cmd, header_size, data, size);
 683	mutex_unlock(&sock->mutex);
 684	return err;
 685}
 686
 687int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock,
 688		      enum drbd_packet cmd, unsigned int header_size,
 689		      void *data, unsigned int size)
 690{
 691	int err;
 692
 693	err = __send_command(peer_device->connection, peer_device->device->vnr,
 694			     sock, cmd, header_size, data, size);
 695	mutex_unlock(&sock->mutex);
 696	return err;
 697}
 698
 699int drbd_send_ping(struct drbd_connection *connection)
 700{
 701	struct drbd_socket *sock;
 702
 703	sock = &connection->meta;
 704	if (!conn_prepare_command(connection, sock))
 705		return -EIO;
 706	return conn_send_command(connection, sock, P_PING, 0, NULL, 0);
 707}
 708
 709int drbd_send_ping_ack(struct drbd_connection *connection)
 710{
 711	struct drbd_socket *sock;
 712
 713	sock = &connection->meta;
 714	if (!conn_prepare_command(connection, sock))
 715		return -EIO;
 716	return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0);
 717}
 718
 719int drbd_send_sync_param(struct drbd_peer_device *peer_device)
 720{
 721	struct drbd_socket *sock;
 722	struct p_rs_param_95 *p;
 723	int size;
 724	const int apv = peer_device->connection->agreed_pro_version;
 725	enum drbd_packet cmd;
 726	struct net_conf *nc;
 727	struct disk_conf *dc;
 728
 729	sock = &peer_device->connection->data;
 730	p = drbd_prepare_command(peer_device, sock);
 731	if (!p)
 732		return -EIO;
 733
 734	rcu_read_lock();
 735	nc = rcu_dereference(peer_device->connection->net_conf);
 736
 737	size = apv <= 87 ? sizeof(struct p_rs_param)
 738		: apv == 88 ? sizeof(struct p_rs_param)
 739			+ strlen(nc->verify_alg) + 1
 740		: apv <= 94 ? sizeof(struct p_rs_param_89)
 741		: /* apv >= 95 */ sizeof(struct p_rs_param_95);
 742
 743	cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
 744
 745	/* initialize verify_alg and csums_alg */
 746	memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
 
 747
 748	if (get_ldev(peer_device->device)) {
 749		dc = rcu_dereference(peer_device->device->ldev->disk_conf);
 750		p->resync_rate = cpu_to_be32(dc->resync_rate);
 751		p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
 752		p->c_delay_target = cpu_to_be32(dc->c_delay_target);
 753		p->c_fill_target = cpu_to_be32(dc->c_fill_target);
 754		p->c_max_rate = cpu_to_be32(dc->c_max_rate);
 755		put_ldev(peer_device->device);
 756	} else {
 757		p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
 758		p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
 759		p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
 760		p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
 761		p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
 762	}
 763
 764	if (apv >= 88)
 765		strcpy(p->verify_alg, nc->verify_alg);
 766	if (apv >= 89)
 767		strcpy(p->csums_alg, nc->csums_alg);
 768	rcu_read_unlock();
 769
 770	return drbd_send_command(peer_device, sock, cmd, size, NULL, 0);
 771}
 772
 773int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd)
 774{
 775	struct drbd_socket *sock;
 776	struct p_protocol *p;
 777	struct net_conf *nc;
 778	int size, cf;
 
 779
 780	sock = &connection->data;
 781	p = __conn_prepare_command(connection, sock);
 782	if (!p)
 783		return -EIO;
 
 
 
 
 784
 785	rcu_read_lock();
 786	nc = rcu_dereference(connection->net_conf);
 
 
 
 787
 788	if (nc->tentative && connection->agreed_pro_version < 92) {
 789		rcu_read_unlock();
 790		drbd_err(connection, "--dry-run is not supported by peer");
 791		return -EOPNOTSUPP;
 792	}
 793
 794	size = sizeof(*p);
 795	if (connection->agreed_pro_version >= 87)
 796		size += strlen(nc->integrity_alg) + 1;
 797
 798	p->protocol      = cpu_to_be32(nc->wire_protocol);
 799	p->after_sb_0p   = cpu_to_be32(nc->after_sb_0p);
 800	p->after_sb_1p   = cpu_to_be32(nc->after_sb_1p);
 801	p->after_sb_2p   = cpu_to_be32(nc->after_sb_2p);
 802	p->two_primaries = cpu_to_be32(nc->two_primaries);
 803	cf = 0;
 804	if (nc->discard_my_data)
 805		cf |= CF_DISCARD_MY_DATA;
 806	if (nc->tentative)
 807		cf |= CF_DRY_RUN;
 808	p->conn_flags    = cpu_to_be32(cf);
 809
 810	if (connection->agreed_pro_version >= 87)
 811		strcpy(p->integrity_alg, nc->integrity_alg);
 812	rcu_read_unlock();
 813
 814	return __conn_send_command(connection, sock, cmd, size, NULL, 0);
 815}
 816
 817int drbd_send_protocol(struct drbd_connection *connection)
 818{
 819	int err;
 820
 821	mutex_lock(&connection->data.mutex);
 822	err = __drbd_send_protocol(connection, P_PROTOCOL);
 823	mutex_unlock(&connection->data.mutex);
 824
 825	return err;
 826}
 827
 828static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags)
 829{
 830	struct drbd_device *device = peer_device->device;
 831	struct drbd_socket *sock;
 832	struct p_uuids *p;
 833	int i;
 834
 835	if (!get_ldev_if_state(device, D_NEGOTIATING))
 836		return 0;
 837
 838	sock = &peer_device->connection->data;
 839	p = drbd_prepare_command(peer_device, sock);
 840	if (!p) {
 841		put_ldev(device);
 842		return -EIO;
 843	}
 844	spin_lock_irq(&device->ldev->md.uuid_lock);
 845	for (i = UI_CURRENT; i < UI_SIZE; i++)
 846		p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
 847	spin_unlock_irq(&device->ldev->md.uuid_lock);
 848
 849	device->comm_bm_set = drbd_bm_total_weight(device);
 850	p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set);
 851	rcu_read_lock();
 852	uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0;
 853	rcu_read_unlock();
 854	uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0;
 855	uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
 856	p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
 857
 858	put_ldev(device);
 859	return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0);
 860}
 861
 862int drbd_send_uuids(struct drbd_peer_device *peer_device)
 863{
 864	return _drbd_send_uuids(peer_device, 0);
 865}
 866
 867int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device)
 868{
 869	return _drbd_send_uuids(peer_device, 8);
 870}
 871
 872void drbd_print_uuids(struct drbd_device *device, const char *text)
 873{
 874	if (get_ldev_if_state(device, D_NEGOTIATING)) {
 875		u64 *uuid = device->ldev->md.uuid;
 876		drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n",
 877		     text,
 878		     (unsigned long long)uuid[UI_CURRENT],
 879		     (unsigned long long)uuid[UI_BITMAP],
 880		     (unsigned long long)uuid[UI_HISTORY_START],
 881		     (unsigned long long)uuid[UI_HISTORY_END]);
 882		put_ldev(device);
 883	} else {
 884		drbd_info(device, "%s effective data uuid: %016llX\n",
 885				text,
 886				(unsigned long long)device->ed_uuid);
 887	}
 888}
 889
 890void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
 891{
 892	struct drbd_device *device = peer_device->device;
 893	struct drbd_socket *sock;
 894	struct p_rs_uuid *p;
 895	u64 uuid;
 896
 897	D_ASSERT(device, device->state.disk == D_UP_TO_DATE);
 898
 899	uuid = device->ldev->md.uuid[UI_BITMAP];
 900	if (uuid && uuid != UUID_JUST_CREATED)
 901		uuid = uuid + UUID_NEW_BM_OFFSET;
 902	else
 903		get_random_bytes(&uuid, sizeof(u64));
 904	drbd_uuid_set(device, UI_BITMAP, uuid);
 905	drbd_print_uuids(device, "updated sync UUID");
 906	drbd_md_sync(device);
 907
 908	sock = &peer_device->connection->data;
 909	p = drbd_prepare_command(peer_device, sock);
 910	if (p) {
 911		p->uuid = cpu_to_be64(uuid);
 912		drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
 913	}
 914}
 915
 916/* communicated if (agreed_features & DRBD_FF_WSAME) */
 917static void
 918assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p,
 919					struct request_queue *q)
 920{
 921	if (q) {
 922		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
 923		p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
 924		p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
 925		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 926		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 927		p->qlim->discard_enabled = blk_queue_discard(q);
 928		p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
 929	} else {
 930		q = device->rq_queue;
 931		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
 932		p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
 933		p->qlim->alignment_offset = 0;
 934		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 935		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 936		p->qlim->discard_enabled = 0;
 937		p->qlim->write_same_capable = 0;
 938	}
 939}
 940
 941int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
 942{
 943	struct drbd_device *device = peer_device->device;
 944	struct drbd_socket *sock;
 945	struct p_sizes *p;
 946	sector_t d_size, u_size;
 947	int q_order_type;
 948	unsigned int max_bio_size;
 949	unsigned int packet_size;
 950
 951	sock = &peer_device->connection->data;
 952	p = drbd_prepare_command(peer_device, sock);
 953	if (!p)
 954		return -EIO;
 955
 956	packet_size = sizeof(*p);
 957	if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
 958		packet_size += sizeof(p->qlim[0]);
 959
 960	memset(p, 0, packet_size);
 961	if (get_ldev_if_state(device, D_NEGOTIATING)) {
 962		struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
 963		d_size = drbd_get_max_capacity(device->ldev);
 964		rcu_read_lock();
 965		u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
 966		rcu_read_unlock();
 967		q_order_type = drbd_queue_order_type(device);
 968		max_bio_size = queue_max_hw_sectors(q) << 9;
 969		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
 970		assign_p_sizes_qlim(device, p, q);
 971		put_ldev(device);
 972	} else {
 973		d_size = 0;
 974		u_size = 0;
 975		q_order_type = QUEUE_ORDERED_NONE;
 976		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
 977		assign_p_sizes_qlim(device, p, NULL);
 978	}
 979
 980	if (peer_device->connection->agreed_pro_version <= 94)
 981		max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
 982	else if (peer_device->connection->agreed_pro_version < 100)
 983		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
 984
 985	p->d_size = cpu_to_be64(d_size);
 986	p->u_size = cpu_to_be64(u_size);
 987	p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(device->this_bdev));
 988	p->max_bio_size = cpu_to_be32(max_bio_size);
 989	p->queue_order_type = cpu_to_be16(q_order_type);
 990	p->dds_flags = cpu_to_be16(flags);
 991
 992	return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
 993}
 994
 995/**
 996 * drbd_send_current_state() - Sends the drbd state to the peer
 997 * @peer_device:	DRBD peer device.
 998 */
 999int drbd_send_current_state(struct drbd_peer_device *peer_device)
1000{
1001	struct drbd_socket *sock;
1002	struct p_state *p;
1003
1004	sock = &peer_device->connection->data;
1005	p = drbd_prepare_command(peer_device, sock);
1006	if (!p)
1007		return -EIO;
1008	p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */
1009	return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
1010}
1011
1012/**
1013 * drbd_send_state() - After a state change, sends the new state to the peer
1014 * @peer_device:      DRBD peer device.
1015 * @state:     the state to send, not necessarily the current state.
1016 *
1017 * Each state change queues an "after_state_ch" work, which will eventually
1018 * send the resulting new state to the peer. If more state changes happen
1019 * between queuing and processing of the after_state_ch work, we still
 1020	 * want to send each intermediate state in the order it occurred.
1021 */
1022int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state)
1023{
1024	struct drbd_socket *sock;
1025	struct p_state *p;
1026
1027	sock = &peer_device->connection->data;
1028	p = drbd_prepare_command(peer_device, sock);
1029	if (!p)
1030		return -EIO;
1031	p->state = cpu_to_be32(state.i); /* Within the send mutex */
1032	return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
1033}
1034
1035int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val)
1036{
1037	struct drbd_socket *sock;
1038	struct p_req_state *p;
1039
1040	sock = &peer_device->connection->data;
1041	p = drbd_prepare_command(peer_device, sock);
1042	if (!p)
1043		return -EIO;
1044	p->mask = cpu_to_be32(mask.i);
1045	p->val = cpu_to_be32(val.i);
1046	return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
1047}
1048
1049int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
1050{
1051	enum drbd_packet cmd;
1052	struct drbd_socket *sock;
1053	struct p_req_state *p;
1054
1055	cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
1056	sock = &connection->data;
1057	p = conn_prepare_command(connection, sock);
1058	if (!p)
1059		return -EIO;
1060	p->mask = cpu_to_be32(mask.i);
1061	p->val = cpu_to_be32(val.i);
1062	return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
1063}
1064
1065void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode)
1066{
1067	struct drbd_socket *sock;
1068	struct p_req_state_reply *p;
1069
1070	sock = &peer_device->connection->meta;
1071	p = drbd_prepare_command(peer_device, sock);
1072	if (p) {
1073		p->retcode = cpu_to_be32(retcode);
1074		drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
1075	}
1076}
1077
1078void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode)
1079{
1080	struct drbd_socket *sock;
1081	struct p_req_state_reply *p;
1082	enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
1083
1084	sock = &connection->meta;
1085	p = conn_prepare_command(connection, sock);
1086	if (p) {
1087		p->retcode = cpu_to_be32(retcode);
1088		conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
1089	}
1090}
1091
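/*
 * Layout of p->encoding, as far as can be inferred from the masks used by
 * the dcbp_set_*() helpers below:
 *   bit  7     start value of the first run length (dcbp_set_start)
 *   bits 6..4  number of pad bits in the last code byte (dcbp_set_pad_bits)
 *   bits 3..0  bitmap encoding code, e.g. RLE_VLI_Bits (dcbp_set_code)
 */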
1092static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
1093{
1094	BUG_ON(code & ~0xf);
1095	p->encoding = (p->encoding & ~0xf) | code;
1096}
1097
1098static void dcbp_set_start(struct p_compressed_bm *p, int set)
1099{
1100	p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
1101}
1102
1103static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
1104{
1105	BUG_ON(n & ~0x7);
1106	p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
1107}
1108
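/*
 * fill_bitmap_rle_bits() encodes the bitmap as alternating run lengths of
 * clear and set bits, each run length VLI-encoded, until the packet buffer
 * is full or the end of the bitmap is reached.  The compressed form is kept
 * only if it is actually smaller than the plain bits it covers
 * (plain_bits > 8 * len); otherwise it returns 0 and the caller falls back
 * to a plain P_BITMAP packet.  Returns the number of code bytes, 0 for
 * "send plain", or -1 on an unexpected zero run length.
 */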
1109static int fill_bitmap_rle_bits(struct drbd_device *device,
1110			 struct p_compressed_bm *p,
1111			 unsigned int size,
1112			 struct bm_xfer_ctx *c)
1113{
1114	struct bitstream bs;
1115	unsigned long plain_bits;
1116	unsigned long tmp;
1117	unsigned long rl;
1118	unsigned len;
1119	unsigned toggle;
1120	int bits, use_rle;
1121
1122	/* may we use this feature? */
1123	rcu_read_lock();
1124	use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle;
1125	rcu_read_unlock();
1126	if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90)
1127		return 0;
1128
1129	if (c->bit_offset >= c->bm_bits)
1130		return 0; /* nothing to do. */
1131
 1132	/* use at most this many bytes */
1133	bitstream_init(&bs, p->code, size, 0);
1134	memset(p->code, 0, size);
1135	/* plain bits covered in this code string */
1136	plain_bits = 0;
1137
1138	/* p->encoding & 0x80 stores whether the first run length is set.
1139	 * bit offset is implicit.
 1140	 * start with toggle == 2 so we can tell the first iteration apart */
1141	toggle = 2;
1142
 1143	/* see how many plain bits we can stuff into one packet
1144	 * using RLE and VLI. */
1145	do {
1146		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset)
1147				    : _drbd_bm_find_next(device, c->bit_offset);
1148		if (tmp == -1UL)
1149			tmp = c->bm_bits;
1150		rl = tmp - c->bit_offset;
1151
1152		if (toggle == 2) { /* first iteration */
1153			if (rl == 0) {
1154				/* the first checked bit was set,
1155				 * store start value, */
1156				dcbp_set_start(p, 1);
1157				/* but skip encoding of zero run length */
1158				toggle = !toggle;
1159				continue;
1160			}
1161			dcbp_set_start(p, 0);
1162		}
1163
1164		/* paranoia: catch zero runlength.
1165		 * can only happen if bitmap is modified while we scan it. */
1166		if (rl == 0) {
1167			drbd_err(device, "unexpected zero runlength while encoding bitmap "
1168			    "t:%u bo:%lu\n", toggle, c->bit_offset);
1169			return -1;
1170		}
1171
1172		bits = vli_encode_bits(&bs, rl);
1173		if (bits == -ENOBUFS) /* buffer full */
1174			break;
1175		if (bits <= 0) {
1176			drbd_err(device, "error while encoding bitmap: %d\n", bits);
1177			return 0;
1178		}
1179
1180		toggle = !toggle;
1181		plain_bits += rl;
1182		c->bit_offset = tmp;
1183	} while (c->bit_offset < c->bm_bits);
1184
1185	len = bs.cur.b - p->code + !!bs.cur.bit;
1186
1187	if (plain_bits < (len << 3)) {
1188		/* incompressible with this method.
1189		 * we need to rewind both word and bit position. */
1190		c->bit_offset -= plain_bits;
1191		bm_xfer_ctx_bit_to_word_offset(c);
1192		c->bit_offset = c->word_offset * BITS_PER_LONG;
1193		return 0;
1194	}
1195
1196	/* RLE + VLI was able to compress it just fine.
1197	 * update c->word_offset. */
1198	bm_xfer_ctx_bit_to_word_offset(c);
1199
1200	/* store pad_bits */
1201	dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
1202
1203	return len;
1204}
1205
1206/**
 1207	 * send_bitmap_rle_or_plain() - send one bitmap packet, RLE-compressed when that pays off
1208 *
1209 * Return 0 when done, 1 when another iteration is needed, and a negative error
1210 * code upon failure.
1211 */
1212static int
1213send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
1214{
1215	struct drbd_socket *sock = &first_peer_device(device)->connection->data;
1216	unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
1217	struct p_compressed_bm *p = sock->sbuf + header_size;
1218	int len, err;
1219
1220	len = fill_bitmap_rle_bits(device, p,
1221			DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
1222	if (len < 0)
1223		return -EIO;
1224
1225	if (len) {
1226		dcbp_set_code(p, RLE_VLI_Bits);
1227		err = __send_command(first_peer_device(device)->connection, device->vnr, sock,
1228				     P_COMPRESSED_BITMAP, sizeof(*p) + len,
1229				     NULL, 0);
1230		c->packets[0]++;
1231		c->bytes[0] += header_size + sizeof(*p) + len;
1232
1233		if (c->bit_offset >= c->bm_bits)
1234			len = 0; /* DONE */
1235	} else {
1236		/* was not compressible.
1237		 * send a buffer full of plain text bits instead. */
1238		unsigned int data_size;
1239		unsigned long num_words;
1240		unsigned long *p = sock->sbuf + header_size;
1241
1242		data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
1243		num_words = min_t(size_t, data_size / sizeof(*p),
1244				  c->bm_words - c->word_offset);
1245		len = num_words * sizeof(*p);
1246		if (len)
1247			drbd_bm_get_lel(device, c->word_offset, num_words, p);
1248		err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0);
1249		c->word_offset += num_words;
1250		c->bit_offset = c->word_offset * BITS_PER_LONG;
1251
1252		c->packets[1]++;
1253		c->bytes[1] += header_size + len;
1254
1255		if (c->bit_offset > c->bm_bits)
1256			c->bit_offset = c->bm_bits;
1257	}
1258	if (!err) {
1259		if (len == 0) {
1260			INFO_bm_xfer_stats(device, "send", c);
1261			return 0;
1262		} else
1263			return 1;
1264	}
1265	return -EIO;
1266}
1267
1268/* See the comment at receive_bitmap() */
1269static int _drbd_send_bitmap(struct drbd_device *device)
1270{
1271	struct bm_xfer_ctx c;
1272	int err;
1273
1274	if (!expect(device->bitmap))
1275		return false;
1276
1277	if (get_ldev(device)) {
1278		if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
1279			drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
1280			drbd_bm_set_all(device);
1281			if (drbd_bm_write(device)) {
1282				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
1283				 * but otherwise process as per normal - need to tell other
1284				 * side that a full resync is required! */
1285				drbd_err(device, "Failed to write bitmap to disk!\n");
1286			} else {
1287				drbd_md_clear_flag(device, MDF_FULL_SYNC);
1288				drbd_md_sync(device);
1289			}
1290		}
1291		put_ldev(device);
1292	}
1293
1294	c = (struct bm_xfer_ctx) {
1295		.bm_bits = drbd_bm_bits(device),
1296		.bm_words = drbd_bm_words(device),
1297	};
1298
1299	do {
1300		err = send_bitmap_rle_or_plain(device, &c);
1301	} while (err > 0);
1302
1303	return err == 0;
1304}
1305
1306int drbd_send_bitmap(struct drbd_device *device)
1307{
1308	struct drbd_socket *sock = &first_peer_device(device)->connection->data;
1309	int err = -1;
1310
1311	mutex_lock(&sock->mutex);
1312	if (sock->socket)
1313		err = !_drbd_send_bitmap(device);
1314	mutex_unlock(&sock->mutex);
1315	return err;
1316}
1317
1318void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
1319{
1320	struct drbd_socket *sock;
1321	struct p_barrier_ack *p;
1322
1323	if (connection->cstate < C_WF_REPORT_PARAMS)
1324		return;
1325
1326	sock = &connection->meta;
1327	p = conn_prepare_command(connection, sock);
1328	if (!p)
1329		return;
1330	p->barrier = barrier_nr;
1331	p->set_size = cpu_to_be32(set_size);
1332	conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
1333}
1334
1335/**
1336 * _drbd_send_ack() - Sends an ack packet
 1337	 * @peer_device:	DRBD peer device.
 1338	 * @cmd:	Packet command code.
 1339	 * @sector:	sector, needs to be in big endian byte order
 1340	 * @blksize:	size in bytes, needs to be in big endian byte order
1341 * @block_id:	Id, big endian byte order
1342 */
1343static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1344			  u64 sector, u32 blksize, u64 block_id)
1345{
1346	struct drbd_socket *sock;
1347	struct p_block_ack *p;
1348
1349	if (peer_device->device->state.conn < C_CONNECTED)
1350		return -EIO;
1351
1352	sock = &peer_device->connection->meta;
1353	p = drbd_prepare_command(peer_device, sock);
1354	if (!p)
1355		return -EIO;
1356	p->sector = sector;
1357	p->block_id = block_id;
1358	p->blksize = blksize;
1359	p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq));
1360	return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
1361}
1362
1363/* dp->sector and dp->block_id already/still in network byte order,
1364 * data_size is payload size according to dp->head,
1365 * and may need to be corrected for digest size. */
1366void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1367		      struct p_data *dp, int data_size)
1368{
1369	if (peer_device->connection->peer_integrity_tfm)
1370		data_size -= crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
1371	_drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size),
1372		       dp->block_id);
1373}
1374
1375void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1376		      struct p_block_req *rp)
1377{
1378	_drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id);
1379}
1380
1381/**
1382 * drbd_send_ack() - Sends an ack packet
 1383	 * @peer_device:	DRBD peer device
1384 * @cmd:	packet command code
1385 * @peer_req:	peer request
1386 */
1387int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1388		  struct drbd_peer_request *peer_req)
1389{
1390	return _drbd_send_ack(peer_device, cmd,
1391			      cpu_to_be64(peer_req->i.sector),
1392			      cpu_to_be32(peer_req->i.size),
1393			      peer_req->block_id);
1394}
1395
1396/* This function misuses the block_id field to signal if the blocks
 1397	 * are in sync or not. */
1398int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1399		     sector_t sector, int blksize, u64 block_id)
1400{
1401	return _drbd_send_ack(peer_device, cmd,
1402			      cpu_to_be64(sector),
1403			      cpu_to_be32(blksize),
1404			      cpu_to_be64(block_id));
1405}
1406
1407int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
1408			     struct drbd_peer_request *peer_req)
1409{
1410	struct drbd_socket *sock;
1411	struct p_block_desc *p;
1412
1413	sock = &peer_device->connection->data;
1414	p = drbd_prepare_command(peer_device, sock);
1415	if (!p)
1416		return -EIO;
1417	p->sector = cpu_to_be64(peer_req->i.sector);
1418	p->blksize = cpu_to_be32(peer_req->i.size);
1419	p->pad = 0;
1420	return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0);
1421}
1422
1423int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
1424		       sector_t sector, int size, u64 block_id)
1425{
1426	struct drbd_socket *sock;
1427	struct p_block_req *p;
1428
1429	sock = &peer_device->connection->data;
1430	p = drbd_prepare_command(peer_device, sock);
1431	if (!p)
1432		return -EIO;
1433	p->sector = cpu_to_be64(sector);
1434	p->block_id = block_id;
1435	p->blksize = cpu_to_be32(size);
1436	return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
1437}
1438
1439int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size,
1440			    void *digest, int digest_size, enum drbd_packet cmd)
1441{
1442	struct drbd_socket *sock;
1443	struct p_block_req *p;
1444
1445	/* FIXME: Put the digest into the preallocated socket buffer.  */
1446
1447	sock = &peer_device->connection->data;
1448	p = drbd_prepare_command(peer_device, sock);
1449	if (!p)
1450		return -EIO;
1451	p->sector = cpu_to_be64(sector);
1452	p->block_id = ID_SYNCER /* unused */;
1453	p->blksize = cpu_to_be32(size);
1454	return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size);
1455}
1456
1457int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size)
1458{
1459	struct drbd_socket *sock;
1460	struct p_block_req *p;
1461
1462	sock = &peer_device->connection->data;
1463	p = drbd_prepare_command(peer_device, sock);
1464	if (!p)
1465		return -EIO;
1466	p->sector = cpu_to_be64(sector);
1467	p->block_id = ID_SYNCER /* unused */;
1468	p->blksize = cpu_to_be32(size);
1469	return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
1470}
1471
1472/* called on sndtimeo
1473 * returns false if we should retry,
1474 * true if we think connection is dead
1475 */
1476static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock)
1477{
1478	int drop_it;
1479	/* long elapsed = (long)(jiffies - device->last_received); */
1480
1481	drop_it =   connection->meta.socket == sock
1482		|| !connection->ack_receiver.task
1483		|| get_t_state(&connection->ack_receiver) != RUNNING
1484		|| connection->cstate < C_WF_REPORT_PARAMS;
1485
1486	if (drop_it)
1487		return true;
1488
1489	drop_it = !--connection->ko_count;
1490	if (!drop_it) {
1491		drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
1492			 current->comm, current->pid, connection->ko_count);
1493		request_ping(connection);
1494	}
1495
1496	return drop_it; /* && (device->state == R_PRIMARY) */;
1497}
1498
1499static void drbd_update_congested(struct drbd_connection *connection)
1500{
1501	struct sock *sk = connection->data.socket->sk;
1502	if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
1503		set_bit(NET_CONGESTED, &connection->flags);
1504}
1505
1506/* The idea of sendpage seems to be to put some kind of reference
1507 * to the page into the skb, and to hand it over to the NIC. In
1508 * this process get_page() gets called.
1509 *
1510 * As soon as the page was really sent over the network put_page()
1511 * gets called by some part of the network layer. [ NIC driver? ]
1512 *
1513 * [ get_page() / put_page() increment/decrement the count. If count
1514 *   reaches 0 the page will be freed. ]
1515 *
1516 * This works nicely with pages from FSs.
1517 * But this means that in protocol A we might signal IO completion too early!
1518 *
1519 * In order not to corrupt data during a resync we must make sure
 1520	 * that we do not reuse our own buffer pages (EEs) too early, therefore
1521 * we have the net_ee list.
1522 *
1523 * XFS seems to have problems, still, it submits pages with page_count == 0!
1524 * As a workaround, we disable sendpage on pages
1525 * with page_count == 0 or PageSlab.
1526 */
1527static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page,
1528			      int offset, size_t size, unsigned msg_flags)
1529{
1530	struct socket *socket;
1531	void *addr;
1532	int err;
1533
1534	socket = peer_device->connection->data.socket;
1535	addr = kmap(page) + offset;
1536	err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags);
1537	kunmap(page);
1538	if (!err)
1539		peer_device->device->send_cnt += size >> 9;
1540	return err;
1541}
1542
1543static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page,
1544		    int offset, size_t size, unsigned msg_flags)
1545{
1546	struct socket *socket = peer_device->connection->data.socket;
1547	int len = size;
1548	int err = -EIO;
1549
1550	/* e.g. XFS meta- & log-data is in slab pages, which have a
1551	 * page_count of 0 and/or have PageSlab() set.
1552	 * we cannot use send_page for those, as that does get_page();
1553	 * put_page(); and would cause either a VM_BUG directly, or
1554	 * __page_cache_release a page that would actually still be referenced
1555	 * by someone, leading to some obscure delayed Oops somewhere else. */
1556	if (drbd_disable_sendpage || !sendpage_ok(page))
1557		return _drbd_no_send_page(peer_device, page, offset, size, msg_flags);
1558
1559	msg_flags |= MSG_NOSIGNAL;
1560	drbd_update_congested(peer_device->connection);
1561	do {
1562		int sent;
1563
1564		sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
1565		if (sent <= 0) {
1566			if (sent == -EAGAIN) {
1567				if (we_should_drop_the_connection(peer_device->connection, socket))
1568					break;
1569				continue;
1570			}
1571			drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n",
1572			     __func__, (int)size, len, sent);
1573			if (sent < 0)
1574				err = sent;
1575			break;
1576		}
1577		len    -= sent;
1578		offset += sent;
1579	} while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/);
1580	clear_bit(NET_CONGESTED, &peer_device->connection->flags);
1581
1582	if (len == 0) {
1583		err = 0;
1584		peer_device->device->send_cnt += size >> 9;
1585	}
1586	return err;
1587}
1588
1589static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
1590{
1591	struct bio_vec bvec;
1592	struct bvec_iter iter;
1593
1594	/* hint all but last page with MSG_MORE */
1595	bio_for_each_segment(bvec, bio, iter) {
1596		int err;
1597
1598		err = _drbd_no_send_page(peer_device, bvec.bv_page,
1599					 bvec.bv_offset, bvec.bv_len,
1600					 bio_iter_last(bvec, iter)
1601					 ? 0 : MSG_MORE);
1602		if (err)
1603			return err;
1604		/* REQ_OP_WRITE_SAME has only one segment */
1605		if (bio_op(bio) == REQ_OP_WRITE_SAME)
1606			break;
1607	}
1608	return 0;
1609}
1610
1611static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio)
1612{
1613	struct bio_vec bvec;
1614	struct bvec_iter iter;
1615
1616	/* hint all but last page with MSG_MORE */
1617	bio_for_each_segment(bvec, bio, iter) {
1618		int err;
1619
1620		err = _drbd_send_page(peer_device, bvec.bv_page,
1621				      bvec.bv_offset, bvec.bv_len,
1622				      bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
1623		if (err)
1624			return err;
1625		/* REQ_OP_WRITE_SAME has only one segment */
1626		if (bio_op(bio) == REQ_OP_WRITE_SAME)
1627			break;
1628	}
1629	return 0;
1630}
1631
1632static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
1633			    struct drbd_peer_request *peer_req)
1634{
1635	struct page *page = peer_req->pages;
1636	unsigned len = peer_req->i.size;
1637	int err;
1638
1639	/* hint all but last page with MSG_MORE */
1640	page_chain_for_each(page) {
1641		unsigned l = min_t(unsigned, len, PAGE_SIZE);
1642
1643		err = _drbd_send_page(peer_device, page, 0, l,
1644				      page_chain_next(page) ? MSG_MORE : 0);
1645		if (err)
1646			return err;
1647		len -= l;
1648	}
1649	return 0;
1650}
1651
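/*
 * Hypothetical example of the mapping below: with agreed_pro_version >= 95,
 * a REQ_OP_WRITE_ZEROES bio without REQ_NOUNMAP becomes DP_ZEROES|DP_DISCARD
 * if DRBD_FF_WZEROES was negotiated, and plain DP_DISCARD otherwise.  For
 * peers older than protocol 95 only the DP_RW_SYNC hint survives.
 */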
1652static u32 bio_flags_to_wire(struct drbd_connection *connection,
1653			     struct bio *bio)
1654{
1655	if (connection->agreed_pro_version >= 95)
1656		return  (bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0) |
1657			(bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
1658			(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
1659			(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
1660			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
1661			(bio_op(bio) == REQ_OP_WRITE_ZEROES ?
1662			  ((connection->agreed_features & DRBD_FF_WZEROES) ?
1663			   (DP_ZEROES |(!(bio->bi_opf & REQ_NOUNMAP) ? DP_DISCARD : 0))
1664			   : DP_DISCARD)
1665			: 0);
1666	else
1667		return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
1668}
1669
 1670	/* Used to send write, TRIM/discard, zero-out and WRITE_SAME requests
 1671	 * R_PRIMARY -> Peer	(P_DATA, P_TRIM, P_ZEROES, P_WSAME)
1672 */
1673int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
1674{
1675	struct drbd_device *device = peer_device->device;
1676	struct drbd_socket *sock;
1677	struct p_data *p;
1678	struct p_wsame *wsame = NULL;
1679	void *digest_out;
1680	unsigned int dp_flags = 0;
1681	int digest_size;
1682	int err;
1683
1684	sock = &peer_device->connection->data;
1685	p = drbd_prepare_command(peer_device, sock);
1686	digest_size = peer_device->connection->integrity_tfm ?
1687		      crypto_shash_digestsize(peer_device->connection->integrity_tfm) : 0;
1688
1689	if (!p)
1690		return -EIO;
1691	p->sector = cpu_to_be64(req->i.sector);
1692	p->block_id = (unsigned long)req;
1693	p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
1694	dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio);
1695	if (device->state.conn >= C_SYNC_SOURCE &&
1696	    device->state.conn <= C_PAUSED_SYNC_T)
1697		dp_flags |= DP_MAY_SET_IN_SYNC;
1698	if (peer_device->connection->agreed_pro_version >= 100) {
1699		if (req->rq_state & RQ_EXP_RECEIVE_ACK)
1700			dp_flags |= DP_SEND_RECEIVE_ACK;
1701		/* During resync, request an explicit write ack,
1702		 * even in protocol != C */
1703		if (req->rq_state & RQ_EXP_WRITE_ACK
1704		|| (dp_flags & DP_MAY_SET_IN_SYNC))
1705			dp_flags |= DP_SEND_WRITE_ACK;
1706	}
1707	p->dp_flags = cpu_to_be32(dp_flags);
1708
1709	if (dp_flags & (DP_DISCARD|DP_ZEROES)) {
1710		enum drbd_packet cmd = (dp_flags & DP_ZEROES) ? P_ZEROES : P_TRIM;
1711		struct p_trim *t = (struct p_trim*)p;
1712		t->size = cpu_to_be32(req->i.size);
1713		err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*t), NULL, 0);
1714		goto out;
1715	}
1716	if (dp_flags & DP_WSAME) {
1717		/* this will only work if DRBD_FF_WSAME is set AND the
1718		 * handshake agreed that all nodes and backend devices are
1719		 * WRITE_SAME capable and agree on logical_block_size */
1720		wsame = (struct p_wsame*)p;
1721		digest_out = wsame + 1;
1722		wsame->size = cpu_to_be32(req->i.size);
1723	} else
1724		digest_out = p + 1;
1725
1726	/* our digest is still only over the payload.
1727	 * TRIM does not carry any payload. */
1728	if (digest_size)
1729		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
1730	if (wsame) {
1731		err =
1732		    __send_command(peer_device->connection, device->vnr, sock, P_WSAME,
1733				   sizeof(*wsame) + digest_size, NULL,
1734				   bio_iovec(req->master_bio).bv_len);
1735	} else
1736		err =
1737		    __send_command(peer_device->connection, device->vnr, sock, P_DATA,
1738				   sizeof(*p) + digest_size, NULL, req->i.size);
1739	if (!err) {
1740		/* For protocol A, we have to memcpy the payload into
1741		 * socket buffers, as we may complete right away
1742		 * as soon as we handed it over to tcp, at which point the data
1743		 * pages may become invalid.
1744		 *
1745		 * For data-integrity enabled, we copy it as well, so we can be
1746		 * sure that even if the bio pages may still be modified, it
1747		 * won't change the data on the wire, thus if the digest checks
1748		 * out ok after sending on this side, but does not fit on the
1749		 * receiving side, we sure have detected corruption elsewhere.
1750		 */
1751		if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size)
1752			err = _drbd_send_bio(peer_device, req->master_bio);
1753		else
1754			err = _drbd_send_zc_bio(peer_device, req->master_bio);
1755
1756		/* double check digest, sometimes buffers have been modified in flight. */
1757		if (digest_size > 0 && digest_size <= 64) {
1758			/* 64 byte, 512 bit, is the largest digest size
1759			 * currently supported in kernel crypto. */
1760			unsigned char digest[64];
1761			drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest);
1762			if (memcmp(p + 1, digest, digest_size)) {
1763				drbd_warn(device,
1764					"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
1765					(unsigned long long)req->i.sector, req->i.size);
1766			}
1767		} /* else if (digest_size > 64) {
1768		     ... Be noisy about digest too large ...
1769		} */
1770	}
1771out:
1772	mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
1773
1774	return err;
1775}
1776
1777/* answer packet, used to send data back for read requests:
1778 *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
1779 *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
1780 */
1781int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
1782		    struct drbd_peer_request *peer_req)
1783{
1784	struct drbd_device *device = peer_device->device;
1785	struct drbd_socket *sock;
1786	struct p_data *p;
1787	int err;
1788	int digest_size;
1789
1790	sock = &peer_device->connection->data;
1791	p = drbd_prepare_command(peer_device, sock);
1792
1793	digest_size = peer_device->connection->integrity_tfm ?
1794		      crypto_shash_digestsize(peer_device->connection->integrity_tfm) : 0;
1795
1796	if (!p)
1797		return -EIO;
1798	p->sector = cpu_to_be64(peer_req->i.sector);
1799	p->block_id = peer_req->block_id;
1800	p->seq_num = 0;  /* unused */
1801	p->dp_flags = 0;
1802	if (digest_size)
1803		drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1);
1804	err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size);
1805	if (!err)
1806		err = _drbd_send_zc_ee(peer_device, peer_req);
1807	mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
1808
1809	return err;
1810}
1811
1812int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req)
1813{
1814	struct drbd_socket *sock;
1815	struct p_block_desc *p;
1816
1817	sock = &peer_device->connection->data;
1818	p = drbd_prepare_command(peer_device, sock);
1819	if (!p)
1820		return -EIO;
1821	p->sector = cpu_to_be64(req->i.sector);
1822	p->blksize = cpu_to_be32(req->i.size);
1823	return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
1824}
1825
1826/*
1827  drbd_send distinguishes two cases:
1828
1829  Packets sent via the data socket "sock"
1830  and packets sent via the meta data socket "msock"
1831
1832		    sock                      msock
1833  -----------------+-------------------------+------------------------------
1834  timeout           conf.timeout / 2          conf.timeout / 2
1835  timeout action    send a ping via msock     Abort communication
1836					      and close all sockets
1837*/
1838
1839/*
1840 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
1841 */
1842int drbd_send(struct drbd_connection *connection, struct socket *sock,
1843	      void *buf, size_t size, unsigned msg_flags)
1844{
1845	struct kvec iov = {.iov_base = buf, .iov_len = size};
1846	struct msghdr msg = {.msg_flags = msg_flags | MSG_NOSIGNAL};
1847	int rv, sent = 0;
1848
1849	if (!sock)
1850		return -EBADR;
1851
1852	/* THINK  if (signal_pending) return ... ? */
1853
1854	iov_iter_kvec(&msg.msg_iter, WRITE, &iov, 1, size);
1855
1856	if (sock == connection->data.socket) {
1857		rcu_read_lock();
1858		connection->ko_count = rcu_dereference(connection->net_conf)->ko_count;
1859		rcu_read_unlock();
1860		drbd_update_congested(connection);
1861	}
1862	do {
1863		rv = sock_sendmsg(sock, &msg);
1864		if (rv == -EAGAIN) {
1865			if (we_should_drop_the_connection(connection, sock))
1866				break;
1867			else
1868				continue;
1869		}
1870		if (rv == -EINTR) {
1871			flush_signals(current);
1872			rv = 0;
1873		}
1874		if (rv < 0)
1875			break;
1876		sent += rv;
1877	} while (sent < size);
1878
1879	if (sock == connection->data.socket)
1880		clear_bit(NET_CONGESTED, &connection->flags);
1881
1882	if (rv <= 0) {
1883		if (rv != -EAGAIN) {
1884			drbd_err(connection, "%s_sendmsg returned %d\n",
1885				 sock == connection->meta.socket ? "msock" : "sock",
1886				 rv);
1887			conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
1888		} else
1889			conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
1890	}
1891
1892	return sent;
1893}
1894
1895/**
1896 * drbd_send_all  -  Send an entire buffer
1897 *
1898 * Returns 0 upon success and a negative error value otherwise.
1899 */
1900int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer,
1901		  size_t size, unsigned msg_flags)
1902{
1903	int err;
1904
1905	err = drbd_send(connection, sock, buffer, size, msg_flags);
1906	if (err < 0)
1907		return err;
1908	if (err != size)
1909		return -EIO;
1910	return 0;
1911}
1912
1913static int drbd_open(struct block_device *bdev, fmode_t mode)
1914{
1915	struct drbd_device *device = bdev->bd_disk->private_data;
1916	unsigned long flags;
1917	int rv = 0;
1918
1919	mutex_lock(&drbd_main_mutex);
1920	spin_lock_irqsave(&device->resource->req_lock, flags);
1921	/* to have a stable device->state.role
1922	 * and no race with updating open_cnt */
1923
1924	if (device->state.role != R_PRIMARY) {
1925		if (mode & FMODE_WRITE)
1926			rv = -EROFS;
1927		else if (!drbd_allow_oos)
1928			rv = -EMEDIUMTYPE;
1929	}
1930
1931	if (!rv)
1932		device->open_cnt++;
1933	spin_unlock_irqrestore(&device->resource->req_lock, flags);
1934	mutex_unlock(&drbd_main_mutex);
1935
1936	return rv;
1937}
1938
1939static void drbd_release(struct gendisk *gd, fmode_t mode)
1940{
1941	struct drbd_device *device = gd->private_data;
1942	mutex_lock(&drbd_main_mutex);
1943	device->open_cnt--;
1944	mutex_unlock(&drbd_main_mutex);
1945}
1946
1947/* need to hold resource->req_lock */
1948void drbd_queue_unplug(struct drbd_device *device)
1949{
1950	if (device->state.pdsk >= D_INCONSISTENT && device->state.conn >= C_CONNECTED) {
1951		D_ASSERT(device, device->state.role == R_PRIMARY);
1952		if (test_and_clear_bit(UNPLUG_REMOTE, &device->flags)) {
1953			drbd_queue_work_if_unqueued(
1954				&first_peer_device(device)->connection->sender_work,
1955				&device->unplug_work);
1956		}
1957	}
1958}
1959
1960static void drbd_set_defaults(struct drbd_device *device)
1961{
1962	/* Beware! The actual layout differs
1963	 * between big endian and little endian */
1964	device->state = (union drbd_dev_state) {
1965		{ .role = R_SECONDARY,
1966		  .peer = R_UNKNOWN,
1967		  .conn = C_STANDALONE,
1968		  .disk = D_DISKLESS,
1969		  .pdsk = D_UNKNOWN,
1970		} };
1971}
1972
1973void drbd_init_set_defaults(struct drbd_device *device)
1974{
1975	/* the memset(,0,) did most of this.
1976	 * note: only assignments, no allocation in here */
1977
1978	drbd_set_defaults(device);
1979
1980	atomic_set(&device->ap_bio_cnt, 0);
1981	atomic_set(&device->ap_actlog_cnt, 0);
1982	atomic_set(&device->ap_pending_cnt, 0);
1983	atomic_set(&device->rs_pending_cnt, 0);
1984	atomic_set(&device->unacked_cnt, 0);
1985	atomic_set(&device->local_cnt, 0);
1986	atomic_set(&device->pp_in_use_by_net, 0);
1987	atomic_set(&device->rs_sect_in, 0);
1988	atomic_set(&device->rs_sect_ev, 0);
1989	atomic_set(&device->ap_in_flight, 0);
1990	atomic_set(&device->md_io.in_use, 0);
1991
1992	mutex_init(&device->own_state_mutex);
1993	device->state_mutex = &device->own_state_mutex;
1994
1995	spin_lock_init(&device->al_lock);
1996	spin_lock_init(&device->peer_seq_lock);
1997
1998	INIT_LIST_HEAD(&device->active_ee);
1999	INIT_LIST_HEAD(&device->sync_ee);
2000	INIT_LIST_HEAD(&device->done_ee);
2001	INIT_LIST_HEAD(&device->read_ee);
2002	INIT_LIST_HEAD(&device->net_ee);
2003	INIT_LIST_HEAD(&device->resync_reads);
2004	INIT_LIST_HEAD(&device->resync_work.list);
2005	INIT_LIST_HEAD(&device->unplug_work.list);
2006	INIT_LIST_HEAD(&device->bm_io_work.w.list);
2007	INIT_LIST_HEAD(&device->pending_master_completion[0]);
2008	INIT_LIST_HEAD(&device->pending_master_completion[1]);
2009	INIT_LIST_HEAD(&device->pending_completion[0]);
2010	INIT_LIST_HEAD(&device->pending_completion[1]);
2011
2012	device->resync_work.cb  = w_resync_timer;
2013	device->unplug_work.cb  = w_send_write_hint;
2014	device->bm_io_work.w.cb = w_bitmap_io;
2015
2016	timer_setup(&device->resync_timer, resync_timer_fn, 0);
2017	timer_setup(&device->md_sync_timer, md_sync_timer_fn, 0);
2018	timer_setup(&device->start_resync_timer, start_resync_timer_fn, 0);
2019	timer_setup(&device->request_timer, request_timer_fn, 0);
2020
2021	init_waitqueue_head(&device->misc_wait);
2022	init_waitqueue_head(&device->state_wait);
2023	init_waitqueue_head(&device->ee_wait);
2024	init_waitqueue_head(&device->al_wait);
2025	init_waitqueue_head(&device->seq_wait);
2026
2027	device->resync_wenr = LC_FREE;
2028	device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2029	device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
2030}
2031
2032static void _drbd_set_my_capacity(struct drbd_device *device, sector_t size)
2033{
2034	/* set_capacity(device->this_bdev->bd_disk, size); */
2035	set_capacity(device->vdisk, size);
2036	device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
2037}
2038
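/* size is in units of 512-byte sectors throughout, hence the << 9 to bytes
 * in _drbd_set_my_capacity() above and the >> 1 to KB in the message below. */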
2039void drbd_set_my_capacity(struct drbd_device *device, sector_t size)
2040{
2041	char ppb[10];
2042	_drbd_set_my_capacity(device, size);
2043	drbd_info(device, "size = %s (%llu KB)\n",
2044		ppsize(ppb, size>>1), (unsigned long long)size>>1);
2045}
2046
2047void drbd_device_cleanup(struct drbd_device *device)
2048{
2049	int i;
2050	if (first_peer_device(device)->connection->receiver.t_state != NONE)
2051		drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
2052				first_peer_device(device)->connection->receiver.t_state);
2053
2054	device->al_writ_cnt  =
2055	device->bm_writ_cnt  =
2056	device->read_cnt     =
2057	device->recv_cnt     =
2058	device->send_cnt     =
2059	device->writ_cnt     =
2060	device->p_size       =
2061	device->rs_start     =
2062	device->rs_total     =
2063	device->rs_failed    = 0;
2064	device->rs_last_events = 0;
2065	device->rs_last_sect_ev = 0;
2066	for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2067		device->rs_mark_left[i] = 0;
2068		device->rs_mark_time[i] = 0;
2069	}
2070	D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL);
2071
2072	_drbd_set_my_capacity(device, 0);
2073	if (device->bitmap) {
2074		/* maybe never allocated. */
2075		drbd_bm_resize(device, 0, 1);
2076		drbd_bm_cleanup(device);
2077	}
2078
2079	drbd_backing_dev_free(device, device->ldev);
2080	device->ldev = NULL;
2081
2082	clear_bit(AL_SUSPENDED, &device->flags);
2083
2084	D_ASSERT(device, list_empty(&device->active_ee));
2085	D_ASSERT(device, list_empty(&device->sync_ee));
2086	D_ASSERT(device, list_empty(&device->done_ee));
2087	D_ASSERT(device, list_empty(&device->read_ee));
2088	D_ASSERT(device, list_empty(&device->net_ee));
2089	D_ASSERT(device, list_empty(&device->resync_reads));
2090	D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
2091	D_ASSERT(device, list_empty(&device->resync_work.list));
2092	D_ASSERT(device, list_empty(&device->unplug_work.list));
2093
2094	drbd_set_defaults(device);
2095}
2096
2097
2098static void drbd_destroy_mempools(void)
2099{
2100	struct page *page;
2101
2102	while (drbd_pp_pool) {
2103		page = drbd_pp_pool;
2104		drbd_pp_pool = (struct page *)page_private(page);
2105		__free_page(page);
2106		drbd_pp_vacant--;
2107	}
2108
2109	/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
2110
2111	bioset_exit(&drbd_io_bio_set);
2112	bioset_exit(&drbd_md_io_bio_set);
2113	mempool_exit(&drbd_md_io_page_pool);
2114	mempool_exit(&drbd_ee_mempool);
2115	mempool_exit(&drbd_request_mempool);
2116	kmem_cache_destroy(drbd_ee_cache);
2117	kmem_cache_destroy(drbd_request_cache);
2118	kmem_cache_destroy(drbd_bm_ext_cache);
2119	kmem_cache_destroy(drbd_al_ext_cache);
2120
2121	drbd_ee_cache        = NULL;
2122	drbd_request_cache   = NULL;
2123	drbd_bm_ext_cache    = NULL;
2124	drbd_al_ext_cache    = NULL;
2125
2126	return;
2127}
2128
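/*
 * The private page pool filled below holds DRBD_MAX_BIO_SIZE worth of pages
 * per allowed minor; assuming the usual 1 MiB DRBD_MAX_BIO_SIZE and 4 KiB
 * pages, that is 256 pages per minor.  The request and peer-request mempools
 * are sized with the same "number".
 */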
2129static int drbd_create_mempools(void)
2130{
2131	struct page *page;
2132	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
2133	int i, ret;
2134
2135	/* caches */
2136	drbd_request_cache = kmem_cache_create(
2137		"drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
2138	if (drbd_request_cache == NULL)
2139		goto Enomem;
2140
2141	drbd_ee_cache = kmem_cache_create(
2142		"drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
2143	if (drbd_ee_cache == NULL)
2144		goto Enomem;
2145
2146	drbd_bm_ext_cache = kmem_cache_create(
2147		"drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
2148	if (drbd_bm_ext_cache == NULL)
2149		goto Enomem;
2150
2151	drbd_al_ext_cache = kmem_cache_create(
2152		"drbd_al", sizeof(struct lc_element), 0, 0, NULL);
2153	if (drbd_al_ext_cache == NULL)
2154		goto Enomem;
2155
2156	/* mempools */
2157	ret = bioset_init(&drbd_io_bio_set, BIO_POOL_SIZE, 0, 0);
2158	if (ret)
2159		goto Enomem;
2160
2161	ret = bioset_init(&drbd_md_io_bio_set, DRBD_MIN_POOL_PAGES, 0,
2162			  BIOSET_NEED_BVECS);
2163	if (ret)
2164		goto Enomem;
2165
2166	ret = mempool_init_page_pool(&drbd_md_io_page_pool, DRBD_MIN_POOL_PAGES, 0);
2167	if (ret)
2168		goto Enomem;
2169
2170	ret = mempool_init_slab_pool(&drbd_request_mempool, number,
2171				     drbd_request_cache);
2172	if (ret)
2173		goto Enomem;
2174
2175	ret = mempool_init_slab_pool(&drbd_ee_mempool, number, drbd_ee_cache);
2176	if (ret)
2177		goto Enomem;
2178
2179	/* drbd's page pool */
2180	spin_lock_init(&drbd_pp_lock);
2181
2182	for (i = 0; i < number; i++) {
2183		page = alloc_page(GFP_HIGHUSER);
2184		if (!page)
2185			goto Enomem;
2186		set_page_private(page, (unsigned long)drbd_pp_pool);
2187		drbd_pp_pool = page;
2188	}
2189	drbd_pp_vacant = number;
2190
2191	return 0;
2192
2193Enomem:
2194	drbd_destroy_mempools(); /* in case we allocated some */
2195	return -ENOMEM;
2196}
2197
2198static void drbd_release_all_peer_reqs(struct drbd_device *device)
2199{
2200	int rr;
2201
2202	rr = drbd_free_peer_reqs(device, &device->active_ee);
2203	if (rr)
2204		drbd_err(device, "%d EEs in active list found!\n", rr);
2205
2206	rr = drbd_free_peer_reqs(device, &device->sync_ee);
2207	if (rr)
2208		drbd_err(device, "%d EEs in sync list found!\n", rr);
2209
2210	rr = drbd_free_peer_reqs(device, &device->read_ee);
2211	if (rr)
2212		drbd_err(device, "%d EEs in read list found!\n", rr);
2213
2214	rr = drbd_free_peer_reqs(device, &device->done_ee);
2215	if (rr)
2216		drbd_err(device, "%d EEs in done list found!\n", rr);
2217
2218	rr = drbd_free_peer_reqs(device, &device->net_ee);
2219	if (rr)
2220		drbd_err(device, "%d EEs in net list found!\n", rr);
2221}
2222
2223/* caution. no locking. */
2224void drbd_destroy_device(struct kref *kref)
2225{
2226	struct drbd_device *device = container_of(kref, struct drbd_device, kref);
2227	struct drbd_resource *resource = device->resource;
2228	struct drbd_peer_device *peer_device, *tmp_peer_device;
2229
2230	del_timer_sync(&device->request_timer);
2231
2232	/* paranoia asserts */
2233	D_ASSERT(device, device->open_cnt == 0);
2234	/* end paranoia asserts */
2235
2236	/* cleanup stuff that may have been allocated during
2237	 * device (re-)configuration or state changes */
2238
2239	if (device->this_bdev)
2240		bdput(device->this_bdev);
2241
2242	drbd_backing_dev_free(device, device->ldev);
2243	device->ldev = NULL;
2244
2245	drbd_release_all_peer_reqs(device);
2246
2247	lc_destroy(device->act_log);
2248	lc_destroy(device->resync);
2249
2250	kfree(device->p_uuid);
2251	/* device->p_uuid = NULL; */
2252
2253	if (device->bitmap) /* should no longer be there. */
2254		drbd_bm_cleanup(device);
2255	__free_page(device->md_io.page);
2256	put_disk(device->vdisk);
2257	blk_cleanup_queue(device->rq_queue);
2258	kfree(device->rs_plan_s);
2259
2260	/* not for_each_connection(connection, resource):
2261	 * those may have been cleaned up and disassociated already.
2262	 */
2263	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
2264		kref_put(&peer_device->connection->kref, drbd_destroy_connection);
2265		kfree(peer_device);
2266	}
2267	memset(device, 0xfd, sizeof(*device));
2268	kfree(device);
2269	kref_put(&resource->kref, drbd_destroy_resource);
2270}
2271
2272/* One global retry thread, if we need to push back some bio and have it
2273 * reinserted through our make request function.
2274 */
2275static struct retry_worker {
2276	struct workqueue_struct *wq;
2277	struct work_struct worker;
2278
2279	spinlock_t lock;
2280	struct list_head writes;
2281} retry;
2282
2283static void do_retry(struct work_struct *ws)
2284{
2285	struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
2286	LIST_HEAD(writes);
2287	struct drbd_request *req, *tmp;
2288
2289	spin_lock_irq(&retry->lock);
2290	list_splice_init(&retry->writes, &writes);
2291	spin_unlock_irq(&retry->lock);
2292
2293	list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
2294		struct drbd_device *device = req->device;
2295		struct bio *bio = req->master_bio;
2296		unsigned long start_jif = req->start_jif;
2297		bool expected;
2298
2299		expected =
2300			expect(atomic_read(&req->completion_ref) == 0) &&
2301			expect(req->rq_state & RQ_POSTPONED) &&
2302			expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
2303				(req->rq_state & RQ_LOCAL_ABORTED) != 0);
2304
2305		if (!expected)
2306			drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n",
2307				req, atomic_read(&req->completion_ref),
2308				req->rq_state);
2309
2310		/* We still need to put one kref associated with the
2311		 * "completion_ref" going zero in the code path that queued it
2312		 * here.  The request object may still be referenced by a
2313		 * frozen local req->private_bio, in case we force-detached.
2314		 */
2315		kref_put(&req->kref, drbd_req_destroy);
2316
2317		/* A single suspended or otherwise blocking device may stall
2318		 * all others as well.  Fortunately, this code path is to
2319		 * recover from a situation that "should not happen":
2320		 * concurrent writes in multi-primary setup.
2321		 * In a "normal" lifecycle, this workqueue is supposed to be
2322		 * destroyed without ever doing anything.
2323		 * If it turns out to be an issue anyways, we can do per
2324		 * resource (replication group) or per device (minor) retry
2325		 * workqueues instead.
2326		 */
2327
2328		/* We are not just doing submit_bio_noacct(),
2329		 * as we want to keep the start_time information. */
2330		inc_ap_bio(device);
2331		__drbd_make_request(device, bio, start_jif);
2332	}
2333}
2334
2335/* called via drbd_req_put_completion_ref(),
2336 * holds resource->req_lock */
2337void drbd_restart_request(struct drbd_request *req)
2338{
2339	unsigned long flags;
2340	spin_lock_irqsave(&retry.lock, flags);
2341	list_move_tail(&req->tl_requests, &retry.writes);
2342	spin_unlock_irqrestore(&retry.lock, flags);
2343
2344	/* Drop the extra reference that would otherwise
2345	 * have been dropped by complete_master_bio.
2346	 * do_retry() needs to grab a new one. */
2347	dec_ap_bio(req->device);
2348
2349	queue_work(retry.wq, &retry.worker);
2350}
2351
2352void drbd_destroy_resource(struct kref *kref)
2353{
2354	struct drbd_resource *resource =
2355		container_of(kref, struct drbd_resource, kref);
2356
2357	idr_destroy(&resource->devices);
2358	free_cpumask_var(resource->cpu_mask);
2359	kfree(resource->name);
2360	memset(resource, 0xf2, sizeof(*resource));
2361	kfree(resource);
2362}
2363
2364void drbd_free_resource(struct drbd_resource *resource)
2365{
2366	struct drbd_connection *connection, *tmp;
2367
2368	for_each_connection_safe(connection, tmp, resource) {
2369		list_del(&connection->connections);
2370		drbd_debugfs_connection_cleanup(connection);
2371		kref_put(&connection->kref, drbd_destroy_connection);
2372	}
2373	drbd_debugfs_resource_cleanup(resource);
2374	kref_put(&resource->kref, drbd_destroy_resource);
2375}
2376
2377static void drbd_cleanup(void)
2378{
2379	unsigned int i;
2380	struct drbd_device *device;
2381	struct drbd_resource *resource, *tmp;
2382
2383	/* first remove proc,
 2384	 * drbdsetup uses its presence to detect
 2385	 * whether DRBD is loaded.
 2386	 * If we were to get stuck in proc removal,
2387	 * but have netlink already deregistered,
2388	 * some drbdsetup commands may wait forever
2389	 * for an answer.
2390	 */
2391	if (drbd_proc)
2392		remove_proc_entry("drbd", NULL);
2393
2394	if (retry.wq)
2395		destroy_workqueue(retry.wq);
2396
2397	drbd_genl_unregister();
2398
2399	idr_for_each_entry(&drbd_devices, device, i)
2400		drbd_delete_device(device);
2401
 2402	/* not _rcu since there is no other updater anymore. Genl already unregistered */
2403	for_each_resource_safe(resource, tmp, &drbd_resources) {
2404		list_del(&resource->resources);
2405		drbd_free_resource(resource);
2406	}
2407
2408	drbd_debugfs_cleanup();
2409
2410	drbd_destroy_mempools();
2411	unregister_blkdev(DRBD_MAJOR, "drbd");
2412
2413	idr_destroy(&drbd_devices);
2414
2415	pr_info("module cleanup done.\n");
2416}
2417
2418static void drbd_init_workqueue(struct drbd_work_queue* wq)
2419{
2420	spin_lock_init(&wq->q_lock);
2421	INIT_LIST_HEAD(&wq->q);
2422	init_waitqueue_head(&wq->q_wait);
2423}
2424
2425struct completion_work {
2426	struct drbd_work w;
2427	struct completion done;
2428};
2429
2430static int w_complete(struct drbd_work *w, int cancel)
2431{
2432	struct completion_work *completion_work =
2433		container_of(w, struct completion_work, w);
2434
2435	complete(&completion_work->done);
2436	return 0;
2437}
2438
2439void drbd_flush_workqueue(struct drbd_work_queue *work_queue)
2440{
2441	struct completion_work completion_work;
2442
2443	completion_work.w.cb = w_complete;
2444	init_completion(&completion_work.done);
2445	drbd_queue_work(work_queue, &completion_work.w);
2446	wait_for_completion(&completion_work.done);
2447}
2448
2449struct drbd_resource *drbd_find_resource(const char *name)
2450{
2451	struct drbd_resource *resource;
2452
2453	if (!name || !name[0])
2454		return NULL;
2455
2456	rcu_read_lock();
2457	for_each_resource_rcu(resource, &drbd_resources) {
2458		if (!strcmp(resource->name, name)) {
2459			kref_get(&resource->kref);
2460			goto found;
2461		}
2462	}
2463	resource = NULL;
2464found:
2465	rcu_read_unlock();
2466	return resource;
2467}
2468
2469struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
2470				     void *peer_addr, int peer_addr_len)
2471{
2472	struct drbd_resource *resource;
2473	struct drbd_connection *connection;
2474
2475	rcu_read_lock();
2476	for_each_resource_rcu(resource, &drbd_resources) {
2477		for_each_connection_rcu(connection, resource) {
2478			if (connection->my_addr_len == my_addr_len &&
2479			    connection->peer_addr_len == peer_addr_len &&
2480			    !memcmp(&connection->my_addr, my_addr, my_addr_len) &&
2481			    !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) {
2482				kref_get(&connection->kref);
2483				goto found;
2484			}
2485		}
2486	}
2487	connection = NULL;
2488found:
2489	rcu_read_unlock();
2490	return connection;
2491}
2492
2493static int drbd_alloc_socket(struct drbd_socket *socket)
2494{
2495	socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
2496	if (!socket->rbuf)
2497		return -ENOMEM;
2498	socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
2499	if (!socket->sbuf)
2500		return -ENOMEM;
2501	return 0;
2502}
2503
2504static void drbd_free_socket(struct drbd_socket *socket)
2505{
2506	free_page((unsigned long) socket->sbuf);
2507	free_page((unsigned long) socket->rbuf);
2508}
2509
2510void conn_free_crypto(struct drbd_connection *connection)
2511{
2512	drbd_free_sock(connection);
2513
2514	crypto_free_shash(connection->csums_tfm);
2515	crypto_free_shash(connection->verify_tfm);
2516	crypto_free_shash(connection->cram_hmac_tfm);
2517	crypto_free_shash(connection->integrity_tfm);
2518	crypto_free_shash(connection->peer_integrity_tfm);
2519	kfree(connection->int_dig_in);
2520	kfree(connection->int_dig_vv);
2521
2522	connection->csums_tfm = NULL;
2523	connection->verify_tfm = NULL;
2524	connection->cram_hmac_tfm = NULL;
2525	connection->integrity_tfm = NULL;
2526	connection->peer_integrity_tfm = NULL;
2527	connection->int_dig_in = NULL;
2528	connection->int_dig_vv = NULL;
2529}
2530
2531int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts)
2532{
2533	struct drbd_connection *connection;
2534	cpumask_var_t new_cpu_mask;
2535	int err;
2536
2537	if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
2538		return -ENOMEM;
2539
2540	/* silently ignore cpu mask on UP kernel */
2541	if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
2542		err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
2543				   cpumask_bits(new_cpu_mask), nr_cpu_ids);
2544		if (err == -EOVERFLOW) {
2545			/* So what. mask it out. */
2546			cpumask_var_t tmp_cpu_mask;
2547			if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
2548				cpumask_setall(tmp_cpu_mask);
2549				cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
2550				drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
2551					res_opts->cpu_mask,
2552					strlen(res_opts->cpu_mask) > 12 ? "..." : "",
2553					nr_cpu_ids);
2554				free_cpumask_var(tmp_cpu_mask);
2555				err = 0;
2556			}
2557		}
2558		if (err) {
2559			drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
2560			/* retcode = ERR_CPU_MASK_PARSE; */
2561			goto fail;
2562		}
2563	}
2564	resource->res_opts = *res_opts;
2565	if (cpumask_empty(new_cpu_mask))
2566		drbd_calc_cpu_mask(&new_cpu_mask);
2567	if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) {
2568		cpumask_copy(resource->cpu_mask, new_cpu_mask);
2569		for_each_connection_rcu(connection, resource) {
2570			connection->receiver.reset_cpu_mask = 1;
2571			connection->ack_receiver.reset_cpu_mask = 1;
2572			connection->worker.reset_cpu_mask = 1;
2573		}
2574	}
2575	err = 0;
2576
2577fail:
2578	free_cpumask_var(new_cpu_mask);
2579	return err;
2580
2581}
2582
2583struct drbd_resource *drbd_create_resource(const char *name)
2584{
2585	struct drbd_resource *resource;
2586
2587	resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL);
2588	if (!resource)
2589		goto fail;
2590	resource->name = kstrdup(name, GFP_KERNEL);
2591	if (!resource->name)
2592		goto fail_free_resource;
2593	if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL))
2594		goto fail_free_name;
2595	kref_init(&resource->kref);
2596	idr_init(&resource->devices);
2597	INIT_LIST_HEAD(&resource->connections);
2598	resource->write_ordering = WO_BDEV_FLUSH;
2599	list_add_tail_rcu(&resource->resources, &drbd_resources);
2600	mutex_init(&resource->conf_update);
2601	mutex_init(&resource->adm_mutex);
2602	spin_lock_init(&resource->req_lock);
2603	drbd_debugfs_resource_add(resource);
2604	return resource;
2605
2606fail_free_name:
2607	kfree(resource->name);
2608fail_free_resource:
2609	kfree(resource);
2610fail:
2611	return NULL;
2612}
2613
2614/* caller must be under adm_mutex */
2615struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
2616{
2617	struct drbd_resource *resource;
2618	struct drbd_connection *connection;
2619
2620	connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL);
2621	if (!connection)
2622		return NULL;
2623
2624	if (drbd_alloc_socket(&connection->data))
2625		goto fail;
2626	if (drbd_alloc_socket(&connection->meta))
2627		goto fail;
2628
2629	connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
2630	if (!connection->current_epoch)
2631		goto fail;
2632
2633	INIT_LIST_HEAD(&connection->transfer_log);
2634
2635	INIT_LIST_HEAD(&connection->current_epoch->list);
2636	connection->epochs = 1;
2637	spin_lock_init(&connection->epoch_lock);
2638
2639	connection->send.seen_any_write_yet = false;
2640	connection->send.current_epoch_nr = 0;
2641	connection->send.current_epoch_writes = 0;
2642
2643	resource = drbd_create_resource(name);
2644	if (!resource)
2645		goto fail;
2646
2647	connection->cstate = C_STANDALONE;
2648	mutex_init(&connection->cstate_mutex);
2649	init_waitqueue_head(&connection->ping_wait);
2650	idr_init(&connection->peer_devices);
2651
2652	drbd_init_workqueue(&connection->sender_work);
2653	mutex_init(&connection->data.mutex);
2654	mutex_init(&connection->meta.mutex);
2655
2656	drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver");
2657	connection->receiver.connection = connection;
2658	drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
2659	connection->worker.connection = connection;
2660	drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv");
2661	connection->ack_receiver.connection = connection;
2662
2663	kref_init(&connection->kref);
2664
2665	connection->resource = resource;
2666
2667	if (set_resource_options(resource, res_opts))
2668		goto fail_resource;
2669
2670	kref_get(&resource->kref);
2671	list_add_tail_rcu(&connection->connections, &resource->connections);
2672	drbd_debugfs_connection_add(connection);
2673	return connection;
2674
2675fail_resource:
2676	list_del(&resource->resources);
2677	drbd_free_resource(resource);
2678fail:
2679	kfree(connection->current_epoch);
2680	drbd_free_socket(&connection->meta);
2681	drbd_free_socket(&connection->data);
2682	kfree(connection);
2683	return NULL;
2684}
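/*
 * On success the new connection owns a reference on its resource (the
 * kref_get() above) and is visible on resource->connections under RCU.
 * Its receiver, worker and ack_receiver threads are only initialized here,
 * not started; the error paths unwind in reverse order of allocation.
 */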
2685
2686void drbd_destroy_connection(struct kref *kref)
2687{
2688	struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref);
2689	struct drbd_resource *resource = connection->resource;
2690
2691	if (atomic_read(&connection->current_epoch->epoch_size) !=  0)
2692		drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size));
2693	kfree(connection->current_epoch);
2694
2695	idr_destroy(&connection->peer_devices);
2696
2697	drbd_free_socket(&connection->meta);
2698	drbd_free_socket(&connection->data);
2699	kfree(connection->int_dig_in);
2700	kfree(connection->int_dig_vv);
2701	memset(connection, 0xfc, sizeof(*connection));
2702	kfree(connection);
2703	kref_put(&resource->kref, drbd_destroy_resource);
2704}
2705
2706static int init_submitter(struct drbd_device *device)
2707{
2708	/* open-coded create_singlethread_workqueue(),
2709	 * so we can name the queue "drbd%u_submit" per minor */
2710	device->submit.wq =
2711		alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
2712	if (!device->submit.wq)
2713		return -ENOMEM;
2714
2715	INIT_WORK(&device->submit.worker, do_submit);
2716	INIT_LIST_HEAD(&device->submit.writes);
2717	return 0;
2718}
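/*
 * The per-device submit workqueue is strictly ordered (one work item at a
 * time) and carries WQ_MEM_RECLAIM, so it keeps a rescuer thread and can
 * make forward progress under memory pressure, which matters for a queue
 * that sits in the block I/O submission path.
 */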
2719
2720enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
2721{
2722	struct drbd_resource *resource = adm_ctx->resource;
2723	struct drbd_connection *connection;
2724	struct drbd_device *device;
2725	struct drbd_peer_device *peer_device, *tmp_peer_device;
2726	struct gendisk *disk;
2727	struct request_queue *q;
2728	int id;
2729	int vnr = adm_ctx->volume;
2730	enum drbd_ret_code err = ERR_NOMEM;
2731
2732	device = minor_to_device(minor);
2733	if (device)
2734		return ERR_MINOR_OR_VOLUME_EXISTS;
2735
2736	/* GFP_KERNEL, we are outside of all write-out paths */
2737	device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL);
2738	if (!device)
2739		return ERR_NOMEM;
2740	kref_init(&device->kref);
2741
2742	kref_get(&resource->kref);
2743	device->resource = resource;
2744	device->minor = minor;
2745	device->vnr = vnr;
2746
2747	drbd_init_set_defaults(device);
2748
2749	q = blk_alloc_queue(NUMA_NO_NODE);
2750	if (!q)
2751		goto out_no_q;
2752	device->rq_queue = q;
2753
2754	disk = alloc_disk(1);
2755	if (!disk)
2756		goto out_no_disk;
2757	device->vdisk = disk;
2758
2759	set_disk_ro(disk, true);
2760
2761	disk->queue = q;
2762	disk->major = DRBD_MAJOR;
2763	disk->first_minor = minor;
2764	disk->fops = &drbd_ops;
2765	sprintf(disk->disk_name, "drbd%d", minor);
2766	disk->private_data = device;
2767
2768	device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
2769	/* we have no partitions. we contain only ourselves. */
2770	device->this_bdev->bd_contains = device->this_bdev;
2771
2772	blk_queue_write_cache(q, true, true);
2773	/* Set max_hw_sectors to an odd value of 8 KiB here.
2774	   This triggers a max_bio_size message upon first attach or connect. */
2775	blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
2776
2777	device->md_io.page = alloc_page(GFP_KERNEL);
2778	if (!device->md_io.page)
2779		goto out_no_io_page;
2780
2781	if (drbd_bm_init(device))
2782		goto out_no_bitmap;
2783	device->read_requests = RB_ROOT;
2784	device->write_requests = RB_ROOT;
2785
2786	id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL);
2787	if (id < 0) {
2788		if (id == -ENOSPC)
2789			err = ERR_MINOR_OR_VOLUME_EXISTS;
2790		goto out_no_minor_idr;
2791	}
2792	kref_get(&device->kref);
2793
2794	id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL);
2795	if (id < 0) {
2796		if (id == -ENOSPC)
2797			err = ERR_MINOR_OR_VOLUME_EXISTS;
2798		goto out_idr_remove_minor;
2799	}
2800	kref_get(&device->kref);
2801
2802	INIT_LIST_HEAD(&device->peer_devices);
2803	INIT_LIST_HEAD(&device->pending_bitmap_io);
2804	for_each_connection(connection, resource) {
2805		peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
2806		if (!peer_device)
2807			goto out_idr_remove_from_resource;
2808		peer_device->connection = connection;
2809		peer_device->device = device;
2810
2811		list_add(&peer_device->peer_devices, &device->peer_devices);
2812		kref_get(&device->kref);
2813
2814		id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL);
2815		if (id < 0) {
2816			if (id == -ENOSPC)
2817				err = ERR_INVALID_REQUEST;
2818			goto out_idr_remove_from_resource;
2819		}
2820		kref_get(&connection->kref);
2821		INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
2822	}
2823
2824	if (init_submitter(device)) {
2825		err = ERR_NOMEM;
2826		goto out_idr_remove_vol;
2827	}
2828
2829	add_disk(disk);
2830
2831	/* inherit the connection state */
2832	device->state.conn = first_connection(resource)->cstate;
2833	if (device->state.conn == C_WF_REPORT_PARAMS) {
2834		for_each_peer_device(peer_device, device)
2835			drbd_connected(peer_device);
2836	}
2837	/* move to create_peer_device() */
2838	for_each_peer_device(peer_device, device)
2839		drbd_debugfs_peer_device_add(peer_device);
2840	drbd_debugfs_device_add(device);
2841	return NO_ERROR;
2842
2843out_idr_remove_vol:
2844	idr_remove(&connection->peer_devices, vnr);
2845out_idr_remove_from_resource:
2846	for_each_connection(connection, resource) {
2847		peer_device = idr_remove(&connection->peer_devices, vnr);
2848		if (peer_device)
2849			kref_put(&connection->kref, drbd_destroy_connection);
2850	}
2851	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
2852		list_del(&peer_device->peer_devices);
2853		kfree(peer_device);
2854	}
2855	idr_remove(&resource->devices, vnr);
2856out_idr_remove_minor:
2857	idr_remove(&drbd_devices, minor);
2858	synchronize_rcu();
2859out_no_minor_idr:
2860	drbd_bm_cleanup(device);
2861out_no_bitmap:
2862	__free_page(device->md_io.page);
2863out_no_io_page:
2864	put_disk(disk);
2865out_no_disk:
2866	blk_cleanup_queue(q);
2867out_no_q:
2868	kref_put(&resource->kref, drbd_destroy_resource);
2869	kfree(device);
2870	return err;
2871}
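/*
 * Reference counting in drbd_create_device(): the device starts with one
 * reference from kref_init() and gains one more for each successful
 * idr_alloc() into drbd_devices and resource->devices, and one for every
 * peer_device that links back to it.  drbd_delete_device() below drops
 * the references at the same granularity.
 */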
2872
2873void drbd_delete_device(struct drbd_device *device)
2874{
2875	struct drbd_resource *resource = device->resource;
2876	struct drbd_connection *connection;
2877	struct drbd_peer_device *peer_device;
2878
2879	/* move to free_peer_device() */
2880	for_each_peer_device(peer_device, device)
2881		drbd_debugfs_peer_device_cleanup(peer_device);
2882	drbd_debugfs_device_cleanup(device);
2883	for_each_connection(connection, resource) {
2884		idr_remove(&connection->peer_devices, device->vnr);
2885		kref_put(&device->kref, drbd_destroy_device);
2886	}
2887	idr_remove(&resource->devices, device->vnr);
2888	kref_put(&device->kref, drbd_destroy_device);
2889	idr_remove(&drbd_devices, device_to_minor(device));
2890	kref_put(&device->kref, drbd_destroy_device);
2891	del_gendisk(device->vdisk);
2892	synchronize_rcu();
2893	kref_put(&device->kref, drbd_destroy_device);
2894}
2895
2896static int __init drbd_init(void)
2897{
2898	int err;
2899
2900	if (drbd_minor_count < DRBD_MINOR_COUNT_MIN || drbd_minor_count > DRBD_MINOR_COUNT_MAX) {
2901		pr_err("invalid minor_count (%d)\n", drbd_minor_count);
2902#ifdef MODULE
2903		return -EINVAL;
2904#else
2905		drbd_minor_count = DRBD_MINOR_COUNT_DEF;
2906#endif
2907	}
2908
2909	err = register_blkdev(DRBD_MAJOR, "drbd");
2910	if (err) {
2911		pr_err("unable to register block device major %d\n",
2912		       DRBD_MAJOR);
2913		return err;
2914	}
2915
2916	/*
2917	 * allocate all necessary structs
2918	 */
2919	init_waitqueue_head(&drbd_pp_wait);
2920
2921	drbd_proc = NULL; /* play safe for drbd_cleanup */
2922	idr_init(&drbd_devices);
2923
2924	mutex_init(&resources_mutex);
2925	INIT_LIST_HEAD(&drbd_resources);
2926
2927	err = drbd_genl_register();
2928	if (err) {
2929		pr_err("unable to register generic netlink family\n");
2930		goto fail;
2931	}
2932
2933	err = drbd_create_mempools();
2934	if (err)
2935		goto fail;
2936
2937	err = -ENOMEM;
2938	drbd_proc = proc_create_single("drbd", S_IFREG | 0444 , NULL, drbd_seq_show);
2939	if (!drbd_proc)	{
2940		pr_err("unable to register proc file\n");
2941		goto fail;
2942	}
2943
2944	retry.wq = create_singlethread_workqueue("drbd-reissue");
2945	if (!retry.wq) {
2946		pr_err("unable to create retry workqueue\n");
2947		goto fail;
2948	}
2949	INIT_WORK(&retry.worker, do_retry);
2950	spin_lock_init(&retry.lock);
2951	INIT_LIST_HEAD(&retry.writes);
2952
2953	drbd_debugfs_init();
2954
2955	pr_info("initialized. "
2956	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
2957	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
2958	pr_info("%s\n", drbd_buildtag());
2959	pr_info("registered as block device major %d\n", DRBD_MAJOR);
2960	return 0; /* Success! */
2961
2962fail:
2963	drbd_cleanup();
2964	if (err == -ENOMEM)
2965		pr_err("ran out of memory\n");
2966	else
2967		pr_err("initialization failure\n");
2968	return err;
2969}
2970
2971static void drbd_free_one_sock(struct drbd_socket *ds)
2972{
2973	struct socket *s;
2974	mutex_lock(&ds->mutex);
2975	s = ds->socket;
2976	ds->socket = NULL;
2977	mutex_unlock(&ds->mutex);
2978	if (s) {
2979		/* so debugfs does not need to mutex_lock() */
2980		synchronize_rcu();
2981		kernel_sock_shutdown(s, SHUT_RDWR);
2982		sock_release(s);
2983	}
2984}
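/*
 * drbd_free_one_sock() clears ds->socket under ds->mutex and then waits
 * for an RCU grace period before shutting the socket down, so a lockless
 * reader (debugfs, per the comment above) that picked up the old pointer
 * under rcu_read_lock() is guaranteed to be done with it before
 * sock_release() runs.
 */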
2985
2986void drbd_free_sock(struct drbd_connection *connection)
2987{
2988	if (connection->data.socket)
2989		drbd_free_one_sock(&connection->data);
2990	if (connection->meta.socket)
2991		drbd_free_one_sock(&connection->meta);
2992}
2993
2994/* meta data management */
2995
2996void conn_md_sync(struct drbd_connection *connection)
2997{
2998	struct drbd_peer_device *peer_device;
2999	int vnr;
3000
3001	rcu_read_lock();
3002	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
3003		struct drbd_device *device = peer_device->device;
3004
3005		kref_get(&device->kref);
3006		rcu_read_unlock();
3007		drbd_md_sync(device);
3008		kref_put(&device->kref, drbd_destroy_device);
3009		rcu_read_lock();
3010	}
3011	rcu_read_unlock();
3012}
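/*
 * conn_md_sync() uses the usual "pin, drop RCU, sleep, re-lock" pattern:
 * drbd_md_sync() performs blocking metadata I/O, which must not happen
 * inside an RCU read-side critical section, so each device is pinned with
 * kref_get(), the read lock is released for the duration of the sync, and
 * re-acquired before advancing to the next idr entry.
 */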
3013
3014/* aligned 4kByte */
3015struct meta_data_on_disk {
3016	u64 la_size_sect;      /* last agreed size. */
3017	u64 uuid[UI_SIZE];   /* UUIDs. */
3018	u64 device_uuid;
3019	u64 reserved_u64_1;
3020	u32 flags;             /* MDF */
3021	u32 magic;
3022	u32 md_size_sect;
3023	u32 al_offset;         /* offset to this block */
3024	u32 al_nr_extents;     /* important for restoring the AL (userspace) */
3025	      /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
3026	u32 bm_offset;         /* offset to the bitmap, from here */
3027	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
3028	u32 la_peer_max_bio_size;   /* last peer max_bio_size */
3029
3030	/* see al_tr_number_to_on_disk_sector() */
3031	u32 al_stripes;
3032	u32 al_stripe_size_4k;
3033
3034	u8 reserved_u8[4096 - (7*8 + 10*4)];
3035} __packed;
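/*
 * Layout check for struct meta_data_on_disk: seven u64 fields (56 bytes)
 * plus ten u32 fields (40 bytes) account for the 7*8 + 10*4 in the
 * reserved_u8[] size, padding the structure to exactly 4096 bytes; the
 * BUILD_BUG_ON() in drbd_md_sync() enforces this.  All fields are stored
 * big-endian on disk (cpu_to_be* on write, be*_to_cpu on read).
 */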
3036
3037
3038
3039void drbd_md_write(struct drbd_device *device, void *b)
3040{
3041	struct meta_data_on_disk *buffer = b;
3042	sector_t sector;
3043	int i;
3044
3045	memset(buffer, 0, sizeof(*buffer));
3046
3047	buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev));
3048	for (i = UI_CURRENT; i < UI_SIZE; i++)
3049		buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
3050	buffer->flags = cpu_to_be32(device->ldev->md.flags);
3051	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
3052
3053	buffer->md_size_sect  = cpu_to_be32(device->ldev->md.md_size_sect);
3054	buffer->al_offset     = cpu_to_be32(device->ldev->md.al_offset);
3055	buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements);
3056	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3057	buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid);
3058
3059	buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset);
3060	buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size);
3061
3062	buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes);
3063	buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k);
3064
3065	D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
3066	sector = device->ldev->md.md_offset;
3067
3068	if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) {
3069		/* this was a try anyways ... */
3070		drbd_err(device, "meta data update failed!\n");
3071		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
3072	}
3073}
3074
3075/**
3076 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3077 * @device:	DRBD device.
3078 */
3079void drbd_md_sync(struct drbd_device *device)
3080{
3081	struct meta_data_on_disk *buffer;
3082
3083	/* Don't accidentally change the DRBD meta data layout. */
3084	BUILD_BUG_ON(UI_SIZE != 4);
3085	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
3086
3087	del_timer(&device->md_sync_timer);
3088	/* timer may be rearmed by drbd_md_mark_dirty() now. */
3089	if (!test_and_clear_bit(MD_DIRTY, &device->flags))
3090		return;
3091
3092	/* We use here D_FAILED and not D_ATTACHING because we try to write
3093	 * metadata even if we detach due to a disk failure! */
3094	if (!get_ldev_if_state(device, D_FAILED))
3095		return;
3096
3097	buffer = drbd_md_get_buffer(device, __func__);
3098	if (!buffer)
3099		goto out;
3100
3101	drbd_md_write(device, buffer);
3102
3103	/* Update device->ldev->md.la_size_sect,
3104	 * since we updated it on metadata. */
3105	device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev);
3106
3107	drbd_md_put_buffer(device);
3108out:
3109	put_ldev(device);
3110}
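/*
 * Minimal usage sketch for the dirty/sync pair (the same pattern
 * drbd_bmio_set_n_write() uses further down; assumes the caller holds a
 * local disk reference):
 *
 *	drbd_md_set_flag(device, MDF_FULL_SYNC);
 *	drbd_md_sync(device);
 *
 * Setting the flag only marks the superblock dirty (arming the five second
 * md_sync_timer); the explicit drbd_md_sync() writes it to stable storage
 * right away instead of waiting for the timer to post MD_SYNC work.
 */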
3111
3112static int check_activity_log_stripe_size(struct drbd_device *device,
3113		struct meta_data_on_disk *on_disk,
3114		struct drbd_md *in_core)
3115{
3116	u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
3117	u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
3118	u64 al_size_4k;
3119
3120	/* both not set: default to old fixed size activity log */
3121	if (al_stripes == 0 && al_stripe_size_4k == 0) {
3122		al_stripes = 1;
3123		al_stripe_size_4k = MD_32kB_SECT/8;
3124	}
3125
3126	/* some paranoia plausibility checks */
3127
3128	/* we need both values to be set */
3129	if (al_stripes == 0 || al_stripe_size_4k == 0)
3130		goto err;
3131
3132	al_size_4k = (u64)al_stripes * al_stripe_size_4k;
3133
3134	/* Upper limit of activity log area, to avoid potential overflow
3135	 * problems in al_tr_number_to_on_disk_sector(). As right now, more
3136	 * than 72 * 4k blocks total only increases the amount of history,
3137	 * limiting this arbitrarily to 16 GB is not a real limitation ;-)  */
3138	if (al_size_4k > (16 * 1024 * 1024/4))
3139		goto err;
3140
3141	/* Lower limit: we need at least 8 transaction slots (32kB)
3142	 * to not break existing setups */
3143	if (al_size_4k < MD_32kB_SECT/8)
3144		goto err;
3145
3146	in_core->al_stripe_size_4k = al_stripe_size_4k;
3147	in_core->al_stripes = al_stripes;
3148	in_core->al_size_4k = al_size_4k;
3149
3150	return 0;
3151err:
3152	drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
3153			al_stripes, al_stripe_size_4k);
3154	return -EINVAL;
3155}
3156
3157static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev)
3158{
3159	sector_t capacity = drbd_get_capacity(bdev->md_bdev);
3160	struct drbd_md *in_core = &bdev->md;
3161	s32 on_disk_al_sect;
3162	s32 on_disk_bm_sect;
3163
3164	/* The on-disk size of the activity log, calculated from offsets, and
3165	 * the size of the activity log calculated from the stripe settings,
3166	 * should match.
3167	 * Though we could relax this a bit: it is ok, if the striped activity log
3168	 * fits in the available on-disk activity log size.
3169	 * Right now, that would break how resize is implemented.
3170	 * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
3171	 * of possible unused padding space in the on disk layout. */
3172	if (in_core->al_offset < 0) {
3173		if (in_core->bm_offset > in_core->al_offset)
3174			goto err;
3175		on_disk_al_sect = -in_core->al_offset;
3176		on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
3177	} else {
3178		if (in_core->al_offset != MD_4kB_SECT)
3179			goto err;
3180		if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
3181			goto err;
3182
3183		on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
3184		on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
3185	}
3186
3187	/* old fixed size meta data is exactly that: fixed. */
3188	if (in_core->meta_dev_idx >= 0) {
3189		if (in_core->md_size_sect != MD_128MB_SECT
3190		||  in_core->al_offset != MD_4kB_SECT
3191		||  in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
3192		||  in_core->al_stripes != 1
3193		||  in_core->al_stripe_size_4k != MD_32kB_SECT/8)
3194			goto err;
3195	}
3196
3197	if (capacity < in_core->md_size_sect)
3198		goto err;
3199	if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
3200		goto err;
3201
3202	/* should be aligned, and at least 32k */
3203	if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
3204		goto err;
3205
3206	/* should fit (for now: exactly) into the available on-disk space;
3207	 * overflow prevention is in check_activity_log_stripe_size() above. */
3208	if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
3209		goto err;
3210
3211	/* again, should be aligned */
3212	if (in_core->bm_offset & 7)
3213		goto err;
3214
3215	/* FIXME check for device grow with flex external meta data? */
3216
3217	/* can the available bitmap space cover the last agreed device size? */
3218	if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
3219		goto err;
3220
3221	return 0;
3222
3223err:
3224	drbd_err(device, "meta data offsets don't make sense: idx=%d "
3225			"al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
3226			"md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
3227			in_core->meta_dev_idx,
3228			in_core->al_stripes, in_core->al_stripe_size_4k,
3229			in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
3230			(unsigned long long)in_core->la_size_sect,
3231			(unsigned long long)capacity);
3232
3233	return -EINVAL;
3234}
3235
3236
3237/**
3238 * drbd_md_read() - Reads in the meta data super block
3239 * @device:	DRBD device.
3240 * @bdev:	Device from which the meta data should be read in.
3241 *
3242 * Returns NO_ERROR on success, or an enum drbd_ret_code if something
3243 * goes wrong.
3244 *
3245 * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
3246 * even before @bdev is assigned to @device->ldev.
3247 */
3248int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
3249{
3250	struct meta_data_on_disk *buffer;
3251	u32 magic, flags;
3252	int i, rv = NO_ERROR;
3253
3254	if (device->state.disk != D_DISKLESS)
3255		return ERR_DISK_CONFIGURED;
3256
3257	buffer = drbd_md_get_buffer(device, __func__);
3258	if (!buffer)
3259		return ERR_NOMEM;
3260
3261	/* First, figure out where our meta data superblock is located,
3262	 * and read it. */
3263	bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
3264	bdev->md.md_offset = drbd_md_ss(bdev);
3265	/* Even for (flexible or indexed) external meta data,
3266	 * initially restrict us to the 4k superblock for now.
3267	 * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
3268	bdev->md.md_size_sect = 8;
3269
3270	if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset,
3271				 REQ_OP_READ)) {
3272		/* NOTE: can't do normal error processing here as this is
3273		   called BEFORE disk is attached */
3274		drbd_err(device, "Error while reading metadata.\n");
3275		rv = ERR_IO_MD_DISK;
3276		goto err;
3277	}
3278
3279	magic = be32_to_cpu(buffer->magic);
3280	flags = be32_to_cpu(buffer->flags);
3281	if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
3282	    (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
3283			/* btw: that's Activity Log clean, not "all" clean. */
3284		drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
3285		rv = ERR_MD_UNCLEAN;
3286		goto err;
3287	}
3288
3289	rv = ERR_MD_INVALID;
3290	if (magic != DRBD_MD_MAGIC_08) {
3291		if (magic == DRBD_MD_MAGIC_07)
3292			drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
3293		else
3294			drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
3295		goto err;
3296	}
3297
3298	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3299		drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3300		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3301		goto err;
3302	}
3303
3304
3305	/* convert to in_core endian */
3306	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
3307	for (i = UI_CURRENT; i < UI_SIZE; i++)
3308		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3309	bdev->md.flags = be32_to_cpu(buffer->flags);
3310	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3311
3312	bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
3313	bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
3314	bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
3315
3316	if (check_activity_log_stripe_size(device, buffer, &bdev->md))
3317		goto err;
3318	if (check_offsets_and_sizes(device, bdev))
3319		goto err;
3320
3321	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3322		drbd_err(device, "unexpected bm_offset: %d (expected %d)\n",
3323		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3324		goto err;
3325	}
3326	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3327		drbd_err(device, "unexpected md_size: %u (expected %u)\n",
3328		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3329		goto err;
3330	}
3331
3332	rv = NO_ERROR;
3333
3334	spin_lock_irq(&device->resource->req_lock);
3335	if (device->state.conn < C_CONNECTED) {
3336		unsigned int peer;
3337		peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3338		peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
3339		device->peer_max_bio_size = peer;
3340	}
3341	spin_unlock_irq(&device->resource->req_lock);
3342
3343 err:
3344	drbd_md_put_buffer(device);
3345
3346	return rv;
3347}
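/*
 * Reading order in drbd_md_read(): the superblock is located via
 * drbd_md_ss() and read with md_size_sect temporarily clamped to 8 sectors
 * (the 4k superblock itself), then validated step by step: magic and
 * MDF_AL_CLEAN, bm_bytes_per_bit, activity log striping, and finally the
 * offset/size sanity checks.  Only after all checks pass are the converted
 * (big-endian to host) values in bdev->md considered trustworthy.
 */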
3348
3349/**
3350 * drbd_md_mark_dirty() - Mark meta data super block as dirty
3351 * @device:	DRBD device.
3352 *
3353 * Call this function if you change anything that should be written to
3354 * the meta-data super block. This function sets MD_DIRTY, and starts a
3355 * timer that ensures that within five seconds you have to call drbd_md_sync().
3356 */
3357void drbd_md_mark_dirty(struct drbd_device *device)
3358{
3359	if (!test_and_set_bit(MD_DIRTY, &device->flags))
3360		mod_timer(&device->md_sync_timer, jiffies + 5*HZ);
3361}
3362
3363void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local)
3364{
3365	int i;
3366
3367	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
3368		device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i];
3369}
3370
3371void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
3372{
3373	if (idx == UI_CURRENT) {
3374		if (device->state.role == R_PRIMARY)
3375			val |= 1;
3376		else
3377			val &= ~((u64)1);
3378
3379		drbd_set_ed_uuid(device, val);
3380	}
3381
3382	device->ldev->md.uuid[idx] = val;
3383	drbd_md_mark_dirty(device);
3384}
3385
3386void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
3387{
3388	unsigned long flags;
3389	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3390	__drbd_uuid_set(device, idx, val);
3391	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
3392}
3393
3394void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
3395{
3396	unsigned long flags;
3397	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3398	if (device->ldev->md.uuid[idx]) {
3399		drbd_uuid_move_history(device);
3400		device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx];
3401	}
3402	__drbd_uuid_set(device, idx, val);
3403	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
3404}
3405
3406/**
3407 * drbd_uuid_new_current() - Creates a new current UUID
3408 * @device:	DRBD device.
3409 *
3410 * Creates a new current UUID, and rotates the old current UUID into
3411 * the bitmap slot. Causes an incremental resync upon next connect.
3412 */
3413void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
3414{
3415	u64 val;
3416	unsigned long long bm_uuid;
3417
3418	get_random_bytes(&val, sizeof(u64));
3419
3420	spin_lock_irq(&device->ldev->md.uuid_lock);
3421	bm_uuid = device->ldev->md.uuid[UI_BITMAP];
3422
3423	if (bm_uuid)
3424		drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
3425
3426	device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT];
3427	__drbd_uuid_set(device, UI_CURRENT, val);
3428	spin_unlock_irq(&device->ldev->md.uuid_lock);
3429
3430	drbd_print_uuids(device, "new current UUID");
3431	/* get it to stable storage _now_ */
3432	drbd_md_sync(device);
3433}
3434
3435void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
3436{
3437	unsigned long flags;
3438	if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
3439		return;
3440
3441	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
3442	if (val == 0) {
3443		drbd_uuid_move_history(device);
3444		device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
3445		device->ldev->md.uuid[UI_BITMAP] = 0;
3446	} else {
3447		unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP];
3448		if (bm_uuid)
3449			drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
3450
3451		device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
3452	}
3453	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
3454
3455	drbd_md_mark_dirty(device);
3456}
3457
3458/**
3459 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3460 * @device:	DRBD device.
3461 *
3462 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
3463 */
3464int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
3465{
3466	int rv = -EIO;
3467
3468	drbd_md_set_flag(device, MDF_FULL_SYNC);
3469	drbd_md_sync(device);
3470	drbd_bm_set_all(device);
3471
3472	rv = drbd_bm_write(device);
3473
3474	if (!rv) {
3475		drbd_md_clear_flag(device, MDF_FULL_SYNC);
3476		drbd_md_sync(device);
3477	}
3478
3479	return rv;
3480}
3481
3482/**
3483 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
3484 * @device:	DRBD device.
3485 *
3486 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
3487 */
3488int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
3489{
3490	drbd_resume_al(device);
3491	drbd_bm_clear_all(device);
3492	return drbd_bm_write(device);
3493}
3494
3495static int w_bitmap_io(struct drbd_work *w, int unused)
3496{
3497	struct drbd_device *device =
3498		container_of(w, struct drbd_device, bm_io_work.w);
3499	struct bm_io_work *work = &device->bm_io_work;
3500	int rv = -EIO;
3501
3502	if (work->flags != BM_LOCKED_CHANGE_ALLOWED) {
3503		int cnt = atomic_read(&device->ap_bio_cnt);
3504		if (cnt)
3505			drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n",
3506					cnt, work->why);
3507	}
3508
3509	if (get_ldev(device)) {
3510		drbd_bm_lock(device, work->why, work->flags);
3511		rv = work->io_fn(device);
3512		drbd_bm_unlock(device);
3513		put_ldev(device);
3514	}
3515
3516	clear_bit_unlock(BITMAP_IO, &device->flags);
3517	wake_up(&device->misc_wait);
3518
3519	if (work->done)
3520		work->done(device, rv);
3521
3522	clear_bit(BITMAP_IO_QUEUED, &device->flags);
3523	work->why = NULL;
3524	work->flags = 0;
3525
3526	return 0;
3527}
3528
3529/**
3530 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
3531 * @device:	DRBD device.
3532 * @io_fn:	IO callback to be called when bitmap IO is possible
3533 * @done:	callback to be called after the bitmap IO was performed
3534 * @why:	Descriptive text of the reason for doing the IO
3535 *
3536 * While IO on the bitmap is in progress, application IO is frozen; this
3537 * ensures that drbd_set_out_of_sync() cannot be called.  This function MAY
3538 * ONLY be called from worker context.  It MUST NOT be used while a previous
3539 * such work is still pending!
3540 *
3541 * Its worker function encloses the call of io_fn() by get_ldev() and
3542 * put_ldev().
3543 */
3544void drbd_queue_bitmap_io(struct drbd_device *device,
3545			  int (*io_fn)(struct drbd_device *),
3546			  void (*done)(struct drbd_device *, int),
3547			  char *why, enum bm_flag flags)
3548{
3549	D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
3550
3551	D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
3552	D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
3553	D_ASSERT(device, list_empty(&device->bm_io_work.w.list));
3554	if (device->bm_io_work.why)
3555		drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
3556			why, device->bm_io_work.why);
3557
3558	device->bm_io_work.io_fn = io_fn;
3559	device->bm_io_work.done = done;
3560	device->bm_io_work.why = why;
3561	device->bm_io_work.flags = flags;
3562
3563	spin_lock_irq(&device->resource->req_lock);
3564	set_bit(BITMAP_IO, &device->flags);
3565	/* don't wait for pending application IO if the caller indicates that
3566	 * application IO does not conflict anyways. */
3567	if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
3568		if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
3569			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
3570					&device->bm_io_work.w);
3571	}
3572	spin_unlock_irq(&device->resource->req_lock);
3573}
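/*
 * Illustrative invocation from worker context (a sketch only; the flags
 * and the completion callback depend on the caller, and NULL is a valid
 * "no completion" choice):
 *
 *	drbd_queue_bitmap_io(device, &drbd_bmio_set_n_write, NULL,
 *			     "set_n_write", BM_LOCKED_CHANGE_ALLOWED);
 *
 * The queued work runs as w_bitmap_io() on the connection's sender_work
 * queue, wraps io_fn() in get_ldev()/put_ldev() and drbd_bm_lock()/
 * drbd_bm_unlock(), and invokes the done callback (if any) with io_fn's
 * return value.
 */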
3574
3575/**
3576 * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
3577 * @device:	DRBD device.
3578 * @io_fn:	IO callback to be called when bitmap IO is possible
3579 * @why:	Descriptive text of the reason for doing the IO
3580 *
3581 * Freezes application IO while the actual IO operation runs.  This
3582 * function MAY NOT be called from worker context.
3583 */
3584int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
3585		char *why, enum bm_flag flags)
3586{
3587	/* Only suspend io, if some operation is supposed to be locked out */
3588	const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST);
3589	int rv;
3590
3591	D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
3592
3593	if (do_suspend_io)
3594		drbd_suspend_io(device);
3595
3596	drbd_bm_lock(device, why, flags);
3597	rv = io_fn(device);
3598	drbd_bm_unlock(device);
3599
3600	if (do_suspend_io)
3601		drbd_resume_io(device);
3602
3603	return rv;
3604}
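/*
 * Synchronous counterpart sketch (again illustrative; the flag combination
 * is chosen only to show the "lock everything out" case):
 *
 *	rv = drbd_bitmap_io(device, &drbd_bmio_clear_n_write, "clear_n_write",
 *			    BM_DONT_CLEAR | BM_DONT_SET | BM_DONT_TEST);
 *
 * Unlike drbd_queue_bitmap_io(), io_fn() runs directly in the caller's
 * context, and application IO is suspended only when at least one of the
 * BM_DONT_* flags is set.
 */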
3605
3606void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local)
3607{
3608	if ((device->ldev->md.flags & flag) != flag) {
3609		drbd_md_mark_dirty(device);
3610		device->ldev->md.flags |= flag;
3611	}
3612}
3613
3614void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local)
3615{
3616	if ((device->ldev->md.flags & flag) != 0) {
3617		drbd_md_mark_dirty(device);
3618		device->ldev->md.flags &= ~flag;
3619	}
3620}
3621int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
3622{
3623	return (bdev->md.flags & flag) != 0;
3624}
3625
3626static void md_sync_timer_fn(struct timer_list *t)
3627{
3628	struct drbd_device *device = from_timer(device, t, md_sync_timer);
3629	drbd_device_post_work(device, MD_SYNC);
3630}
3631
3632const char *cmdname(enum drbd_packet cmd)
3633{
3634	/* THINK may need to become several global tables
3635	 * when we want to support more than
3636	 * one PRO_VERSION */
3637	static const char *cmdnames[] = {
3638		[P_DATA]	        = "Data",
3639		[P_WSAME]	        = "WriteSame",
3640		[P_TRIM]	        = "Trim",
3641		[P_DATA_REPLY]	        = "DataReply",
3642		[P_RS_DATA_REPLY]	= "RSDataReply",
3643		[P_BARRIER]	        = "Barrier",
3644		[P_BITMAP]	        = "ReportBitMap",
3645		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
3646		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
3647		[P_UNPLUG_REMOTE]	= "UnplugRemote",
3648		[P_DATA_REQUEST]	= "DataRequest",
3649		[P_RS_DATA_REQUEST]     = "RSDataRequest",
3650		[P_SYNC_PARAM]	        = "SyncParam",
3651		[P_SYNC_PARAM89]	= "SyncParam89",
3652		[P_PROTOCOL]            = "ReportProtocol",
3653		[P_UUIDS]	        = "ReportUUIDs",
3654		[P_SIZES]	        = "ReportSizes",
3655		[P_STATE]	        = "ReportState",
3656		[P_SYNC_UUID]           = "ReportSyncUUID",
3657		[P_AUTH_CHALLENGE]      = "AuthChallenge",
3658		[P_AUTH_RESPONSE]	= "AuthResponse",
3659		[P_PING]		= "Ping",
3660		[P_PING_ACK]	        = "PingAck",
3661		[P_RECV_ACK]	        = "RecvAck",
3662		[P_WRITE_ACK]	        = "WriteAck",
3663		[P_RS_WRITE_ACK]	= "RSWriteAck",
3664		[P_SUPERSEDED]          = "Superseded",
3665		[P_NEG_ACK]	        = "NegAck",
3666		[P_NEG_DREPLY]	        = "NegDReply",
3667		[P_NEG_RS_DREPLY]	= "NegRSDReply",
3668		[P_BARRIER_ACK]	        = "BarrierAck",
3669		[P_STATE_CHG_REQ]       = "StateChgRequest",
3670		[P_STATE_CHG_REPLY]     = "StateChgReply",
3671		[P_OV_REQUEST]          = "OVRequest",
3672		[P_OV_REPLY]            = "OVReply",
3673		[P_OV_RESULT]           = "OVResult",
3674		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
3675		[P_RS_IS_IN_SYNC]	= "CsumRSIsInSync",
3676		[P_COMPRESSED_BITMAP]   = "CBitmap",
3677		[P_DELAY_PROBE]         = "DelayProbe",
3678		[P_OUT_OF_SYNC]		= "OutOfSync",
3679		[P_RETRY_WRITE]		= "RetryWrite",
3680		[P_RS_CANCEL]		= "RSCancel",
3681		[P_CONN_ST_CHG_REQ]	= "conn_st_chg_req",
3682		[P_CONN_ST_CHG_REPLY]	= "conn_st_chg_reply",
3683		[P_RETRY_WRITE]		= "retry_write",
3684		[P_PROTOCOL_UPDATE]	= "protocol_update",
3685		[P_RS_THIN_REQ]         = "rs_thin_req",
3686		[P_RS_DEALLOCATED]      = "rs_deallocated",
3687
3688		/* enum drbd_packet, but not commands - obsoleted flags:
3689		 *	P_MAY_IGNORE
3690		 *	P_MAX_OPT_CMD
3691		 */
3692	};
3693
3694	/* too big for the array: 0xfffX */
3695	if (cmd == P_INITIAL_META)
3696		return "InitialMeta";
3697	if (cmd == P_INITIAL_DATA)
3698		return "InitialData";
3699	if (cmd == P_CONNECTION_FEATURES)
3700		return "ConnectionFeatures";
3701	if (cmd >= ARRAY_SIZE(cmdnames))
3702		return "Unknown";
3703	return cmdnames[cmd];
3704}
3705
3706/**
3707 * drbd_wait_misc  -  wait for a request to make progress
3708 * @device:	device associated with the request
3709 * @i:		the struct drbd_interval embedded in struct drbd_request or
3710 *		struct drbd_peer_request
3711 */
3712int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
3713{
3714	struct net_conf *nc;
3715	DEFINE_WAIT(wait);
3716	long timeout;
3717
3718	rcu_read_lock();
3719	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
3720	if (!nc) {
3721		rcu_read_unlock();
3722		return -ETIMEDOUT;
3723	}
3724	timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
3725	rcu_read_unlock();
3726
3727	/* Indicate to wake up device->misc_wait on progress.  */
3728	i->waiting = true;
3729	prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE);
3730	spin_unlock_irq(&device->resource->req_lock);
3731	timeout = schedule_timeout(timeout);
3732	finish_wait(&device->misc_wait, &wait);
3733	spin_lock_irq(&device->resource->req_lock);
3734	if (!timeout || device->state.conn < C_CONNECTED)
3735		return -ETIMEDOUT;
3736	if (signal_pending(current))
3737		return -ERESTARTSYS;
3738	return 0;
3739}
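/*
 * drbd_wait_misc() expects to be entered with resource->req_lock held; it
 * drops the lock around schedule_timeout() and re-acquires it before
 * returning.  The timeout is ko-count * timeout (both from net_conf, the
 * latter in tenths of a second), or unlimited if ko-count is zero.
 * Returns 0 on progress, -ETIMEDOUT on timeout or loss of the connection,
 * and -ERESTARTSYS when interrupted by a signal.
 */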
3740
3741void lock_all_resources(void)
3742{
3743	struct drbd_resource *resource;
3744	int __maybe_unused i = 0;
3745
3746	mutex_lock(&resources_mutex);
3747	local_irq_disable();
3748	for_each_resource(resource, &drbd_resources)
3749		spin_lock_nested(&resource->req_lock, i++);
3750}
3751
3752void unlock_all_resources(void)
3753{
3754	struct drbd_resource *resource;
3755
3756	for_each_resource(resource, &drbd_resources)
3757		spin_unlock(&resource->req_lock);
3758	local_irq_enable();
3759	mutex_unlock(&resources_mutex);
3760}
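/*
 * lock_all_resources()/unlock_all_resources(): resources_mutex keeps the
 * drbd_resources list stable and serializes concurrent "lock all" callers;
 * interrupts are disabled once, and each req_lock is taken with
 * spin_lock_nested() using an increasing subclass so lockdep accepts the
 * nested acquisition of locks of the same class.
 */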
3761
3762#ifdef CONFIG_DRBD_FAULT_INJECTION
3763/* Fault insertion support including random number generator shamelessly
3764 * stolen from kernel/rcutorture.c */
3765struct fault_random_state {
3766	unsigned long state;
3767	unsigned long count;
3768};
3769
3770#define FAULT_RANDOM_MULT 39916801  /* prime */
3771#define FAULT_RANDOM_ADD	479001701 /* prime */
3772#define FAULT_RANDOM_REFRESH 10000
3773
3774/*
3775 * Crude but fast random-number generator.  Uses a linear congruential
3776 * generator, with occasional help from get_random_bytes().
3777 */
3778static unsigned long
3779_drbd_fault_random(struct fault_random_state *rsp)
3780{
3781	long refresh;
3782
3783	if (!rsp->count--) {
3784		get_random_bytes(&refresh, sizeof(refresh));
3785		rsp->state += refresh;
3786		rsp->count = FAULT_RANDOM_REFRESH;
3787	}
3788	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
3789	return swahw32(rsp->state);
3790}
3791
3792static char *
3793_drbd_fault_str(unsigned int type) {
3794	static char *_faults[] = {
3795		[DRBD_FAULT_MD_WR] = "Meta-data write",
3796		[DRBD_FAULT_MD_RD] = "Meta-data read",
3797		[DRBD_FAULT_RS_WR] = "Resync write",
3798		[DRBD_FAULT_RS_RD] = "Resync read",
3799		[DRBD_FAULT_DT_WR] = "Data write",
3800		[DRBD_FAULT_DT_RD] = "Data read",
3801		[DRBD_FAULT_DT_RA] = "Data read ahead",
3802		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
3803		[DRBD_FAULT_AL_EE] = "EE allocation",
3804		[DRBD_FAULT_RECEIVE] = "receive data corruption",
3805	};
3806
3807	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
3808}
3809
3810unsigned int
3811_drbd_insert_fault(struct drbd_device *device, unsigned int type)
3812{
3813	static struct fault_random_state rrs = {0, 0};
3814
3815	unsigned int ret = (
3816		(drbd_fault_devs == 0 ||
3817			((1 << device_to_minor(device)) & drbd_fault_devs) != 0) &&
3818		(((_drbd_fault_random(&rrs) % 100) + 1) <= drbd_fault_rate));
3819
3820	if (ret) {
3821		drbd_fault_count++;
3822
3823		if (__ratelimit(&drbd_ratelimit_state))
3824			drbd_warn(device, "***Simulating %s failure\n",
3825				_drbd_fault_str(type));
3826	}
3827
3828	return ret;
3829}
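/*
 * Fault injection knobs, as used above: drbd_fault_rate is a percentage
 * (a pseudo-random value in 1..100 must be <= the rate for a fault to
 * fire), drbd_fault_devs is a bitmask of minor numbers restricting which
 * devices are affected (0 means all), and drbd_fault_count tallies the
 * faults actually injected; the warning is ratelimited.
 */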
3830#endif
3831
3832const char *drbd_buildtag(void)
3833{
3834	/* DRBD built from external sources carries a reference to
3835	   the git hash of the source code here. */
3836
3837	static char buildtag[38] = "\0uilt-in";
3838
3839	if (buildtag[0] == 0) {
3840#ifdef MODULE
3841		sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
3842#else
3843		buildtag[0] = 'b';
3844#endif
3845	}
3846
3847	return buildtag;
3848}
3849
3850module_init(drbd_init)
3851module_exit(drbd_cleanup)
3852
3853EXPORT_SYMBOL(drbd_conn_str);
3854EXPORT_SYMBOL(drbd_role_str);
3855EXPORT_SYMBOL(drbd_disk_str);
3856EXPORT_SYMBOL(drbd_set_st_err_str);