v3.1
   1/*
   2   drbd_bitmap.c
   3
   4   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   5
   6   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
   7   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   8   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
   9
  10   drbd is free software; you can redistribute it and/or modify
  11   it under the terms of the GNU General Public License as published by
  12   the Free Software Foundation; either version 2, or (at your option)
  13   any later version.
  14
  15   drbd is distributed in the hope that it will be useful,
  16   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18   GNU General Public License for more details.
  19
  20   You should have received a copy of the GNU General Public License
  21   along with drbd; see the file COPYING.  If not, write to
  22   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  23 */
  24
  25#include <linux/bitops.h>
  26#include <linux/vmalloc.h>
  27#include <linux/string.h>
  28#include <linux/drbd.h>
  29#include <linux/slab.h>
  30#include <asm/kmap_types.h>
  31
  32#include "drbd_int.h"
  33
  34
  35/* OPAQUE outside this file!
  36 * interface defined in drbd_int.h
  37
  38 * convention:
  39 * function name drbd_bm_... => used elsewhere, "public".
  40 * function name      bm_... => internal to implementation, "private".
  41 */
  42
  43
  44/*
  45 * LIMITATIONS:
  46 * We want to support >= 1 PiB of backend storage, while for now still using
  47 * a granularity of one bit per 4KiB of storage.
  48 * 1 << 50		bytes backend storage (1 PiB)
  49 * 1 << (50 - 12)	bits needed
  50 *	38 --> we need u64 to index and count bits
  51 * 1 << (38 - 3)	bitmap bytes needed
  52 *	35 --> we still need u64 to index and count bytes
  53 *			(that's 32 GiB of bitmap for 1 PiB storage)
  54 * 1 << (35 - 2)	32bit longs needed
  55 *	33 --> we'd even need u64 to index and count 32bit long words.
  56 * 1 << (35 - 3)	64bit longs needed
  57 *	32 --> we could get away with a 32bit unsigned int to index and count
  58 *	64bit long words, but I rather stay with unsigned long for now.
  59 *	We probably should neither count nor point to bytes or long words
  60 *	directly, but either by bitnumber, or by page index and offset.
  61 * 1 << (35 - 12)
  62 *	23 --> we need that many 4KiB pages of bitmap.
  63 *	1 << (23 + 3) --> on a 64bit arch,
  64 *	we need 64 MiB to store the array of page pointers.
  65 *
  66 * Because I'm lazy, and because the resulting patch was too large, too ugly
  67 * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
  68 * (1 << 32) bits * 4k storage.
  69 *
  70
  71 * bitmap storage and IO:
  72 *	Bitmap is stored little endian on disk, and is kept little endian in
  73 *	core memory. Currently we still hold the full bitmap in core as long
  74 *	as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
  75 *	seems excessive.
  76 *
  77 *	We plan to reduce the amount of in-core bitmap pages by paging them in
  78 *	and out against their on-disk location as necessary, but need to make
  79 *	sure we don't cause too much meta data IO, and must not deadlock in
  80 *	tight memory situations. This needs some more work.
  81 */
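/*
 * Worked example (editor's illustration; the numbers follow from the
 * 4KiB-per-bit granularity above): a 1 TiB (1 << 40 byte) backing device
 * needs
 *	1 << (40 - 12) = 1 << 28 bits,
 *	1 << (28 - 3)  = 32 MiB of bitmap,
 *	1 << (25 - 12) = 8192 bitmap pages of 4KiB each.
 */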
  82
  83/*
  84 * NOTE
  85 *  Access to the *bm_pages is protected by bm_lock.
  86 *  It is safe to read the other members within the lock.
  87 *
  88 *  drbd_bm_set_bits is called from bio_endio callbacks,
  89 *  we may be called with irq already disabled,
  90 *  so we need spin_lock_irqsave().
  91 *  And we need the kmap_atomic.
  92 */
  93struct drbd_bitmap {
  94	struct page **bm_pages;
  95	spinlock_t bm_lock;
  96
  97	/* see LIMITATIONS: above */
  98
  99	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
 100	unsigned long bm_bits;
 101	size_t   bm_words;
 102	size_t   bm_number_of_pages;
 103	sector_t bm_dev_capacity;
 104	struct mutex bm_change; /* serializes resize operations */
 105
 106	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
 107
 108	enum bm_flag bm_flags;
 109
 110	/* debugging aid, in case we are still racy somewhere */
 111	char          *bm_why;
 112	struct task_struct *bm_task;
 113};
 114
 115#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
 116static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
 117{
 118	struct drbd_bitmap *b = mdev->bitmap;
 119	if (!__ratelimit(&drbd_ratelimit_state))
 120		return;
 121	dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
 122	    current == mdev->receiver.task ? "receiver" :
 123	    current == mdev->asender.task  ? "asender"  :
 124	    current == mdev->worker.task   ? "worker"   : current->comm,
 125	    func, b->bm_why ?: "?",
 126	    b->bm_task == mdev->receiver.task ? "receiver" :
 127	    b->bm_task == mdev->asender.task  ? "asender"  :
 128	    b->bm_task == mdev->worker.task   ? "worker"   : "?");
 129}
 130
 131void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
 132{
 133	struct drbd_bitmap *b = mdev->bitmap;
 134	int trylock_failed;
 135
 136	if (!b) {
 137		dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
 138		return;
 139	}
 140
 141	trylock_failed = !mutex_trylock(&b->bm_change);
 142
 143	if (trylock_failed) {
 144		dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
 145		    current == mdev->receiver.task ? "receiver" :
 146		    current == mdev->asender.task  ? "asender"  :
 147		    current == mdev->worker.task   ? "worker"   : current->comm,
 148		    why, b->bm_why ?: "?",
 149		    b->bm_task == mdev->receiver.task ? "receiver" :
 150		    b->bm_task == mdev->asender.task  ? "asender"  :
 151		    b->bm_task == mdev->worker.task   ? "worker"   : "?");
 152		mutex_lock(&b->bm_change);
 153	}
 154	if (BM_LOCKED_MASK & b->bm_flags)
 155		dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
 156	b->bm_flags |= flags & BM_LOCKED_MASK;
 157
 158	b->bm_why  = why;
 159	b->bm_task = current;
 160}
 161
 162void drbd_bm_unlock(struct drbd_conf *mdev)
 163{
 164	struct drbd_bitmap *b = mdev->bitmap;
 165	if (!b) {
 166		dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
 167		return;
 168	}
 169
 170	if (!(BM_LOCKED_MASK & mdev->bitmap->bm_flags))
 171		dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
 172
 173	b->bm_flags &= ~BM_LOCKED_MASK;
 174	b->bm_why  = NULL;
 175	b->bm_task = NULL;
 176	mutex_unlock(&b->bm_change);
 177}
 178
 179/* we store some "meta" info about our pages in page->private */
 180/* at a granularity of 4k storage per bitmap bit:
 181 * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
 182 *  1<<38 bits,
 183 *  1<<23 4k bitmap pages.
 184 * Use 24 bits as page index, covers 2 peta byte storage
 185 * at a granularity of 4k per bit.
 186 * Used to report the failed page idx on io error from the endio handlers.
 187 */
 188#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
 189/* this page is currently read in, or written back */
 190#define BM_PAGE_IO_LOCK		31
 191/* if there has been an IO error for this page */
 192#define BM_PAGE_IO_ERROR	30
 193/* this is to be able to intelligently skip disk IO,
 194 * set if bits have been set since last IO. */
 195#define BM_PAGE_NEED_WRITEOUT	29
 196/* to mark for lazy writeout once syncer cleared all clearable bits,
 197 * set if bits have been cleared since last IO. */
 198#define BM_PAGE_LAZY_WRITEOUT	28
 199
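/* Editor's sketch of the resulting page_private(page) layout, derived from
 * the definitions above:
 *
 *	bit 31		BM_PAGE_IO_LOCK
 *	bit 30		BM_PAGE_IO_ERROR
 *	bit 29		BM_PAGE_NEED_WRITEOUT
 *	bit 28		BM_PAGE_LAZY_WRITEOUT
 *	bits 0..23	page index (BM_PAGE_IDX_MASK)
 *
 * 1 << 24 pages, each holding 1 << 15 bits at 4KiB of storage per bit,
 * cover the 2 PiB (1 << 51 bytes) mentioned above.
 */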
 200/* store_page_idx uses non-atomic assignment. It is only used directly after
 201 * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
 202 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
 203 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
 204 * requires it all to be atomic as well. */
 205static void bm_store_page_idx(struct page *page, unsigned long idx)
 206{
 207	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
 208	page_private(page) |= idx;
 209}
 210
 211static unsigned long bm_page_to_idx(struct page *page)
 212{
 213	return page_private(page) & BM_PAGE_IDX_MASK;
 214}
 215
 216/* As it is very unlikely that the same page is under IO from more than one
 217 * context, we can get away with a bit per page and one wait queue per bitmap.
 218 */
 219static void bm_page_lock_io(struct drbd_conf *mdev, int page_nr)
 220{
 221	struct drbd_bitmap *b = mdev->bitmap;
 222	void *addr = &page_private(b->bm_pages[page_nr]);
 223	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
 224}
 225
 226static void bm_page_unlock_io(struct drbd_conf *mdev, int page_nr)
 227{
 228	struct drbd_bitmap *b = mdev->bitmap;
 229	void *addr = &page_private(b->bm_pages[page_nr]);
 230	clear_bit(BM_PAGE_IO_LOCK, addr);
 231	smp_mb__after_clear_bit();
 232	wake_up(&mdev->bitmap->bm_io_wait);
 233}
 234
 235/* set _before_ submit_io, so it may be reset due to being changed
 236 * while this page is in flight... will get submitted later again */
 237static void bm_set_page_unchanged(struct page *page)
 238{
 239	/* use cmpxchg? */
 240	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
 241	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
 242}
 243
 244static void bm_set_page_need_writeout(struct page *page)
 245{
 246	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
 247}
 248
 249static int bm_test_page_unchanged(struct page *page)
 250{
 251	volatile const unsigned long *addr = &page_private(page);
 252	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
 253}
 254
 255static void bm_set_page_io_err(struct page *page)
 256{
 257	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
 258}
 259
 260static void bm_clear_page_io_err(struct page *page)
 261{
 262	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
 263}
 264
 265static void bm_set_page_lazy_writeout(struct page *page)
 266{
 267	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
 268}
 269
 270static int bm_test_page_lazy_writeout(struct page *page)
 271{
 272	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
 273}
 274
 275/* on a 32bit box, this would allow for exactly (2<<38) bits. */
 276static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
 277{
 278	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
 279	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
 280	BUG_ON(page_nr >= b->bm_number_of_pages);
 281	return page_nr;
 282}
 283
 284static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
 285{
 286	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
 287	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
 288	BUG_ON(page_nr >= b->bm_number_of_pages);
 289	return page_nr;
 290}
 291
 292static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
 293{
 294	struct page *page = b->bm_pages[idx];
 295	return (unsigned long *) kmap_atomic(page, km);
 296}
 297
 298static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 299{
 300	return __bm_map_pidx(b, idx, KM_IRQ1);
 301}
 302
 303static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
 304{
 305	kunmap_atomic(p_addr, km);
 306}
 307
 308static void bm_unmap(unsigned long *p_addr)
 309{
 310	__bm_unmap(p_addr, KM_IRQ1);
 311}
 312
 313/* long word offset of _bitmap_ sector */
 314#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
 315/* word offset from start of bitmap to word number _in_page_,
 316 * modulo longs per page:
 317 *	#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
 318 * hm, well, Philipp thinks gcc might not optimize the % into & (... - 1),
 319 * so do it explicitly:
 320 */
 321#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
 322
 323/* Long words per page */
 324#define LWPP (PAGE_SIZE/sizeof(long))
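/* Example (editor's note): with 4KiB pages on a 64bit arch,
 * LWPP = 4096/8 = 512 and MLPP(X) == (X & 511), the word offset within
 * its page; on 32bit, LWPP = 1024 and MLPP(X) == (X & 1023). */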
 325
 326/*
 327 * actually most functions herein should take a struct drbd_bitmap*, not a
 328 * struct drbd_conf*, but for the debug macros I like to have the mdev around
 329 * to be able to report device-specific messages.
 330 */
 331
 332
 333static void bm_free_pages(struct page **pages, unsigned long number)
 334{
 335	unsigned long i;
 336	if (!pages)
 337		return;
 338
 339	for (i = 0; i < number; i++) {
 340		if (!pages[i]) {
 341			printk(KERN_ALERT "drbd: bm_free_pages tried to free "
 342					  "a NULL pointer; i=%lu n=%lu\n",
 343					  i, number);
 344			continue;
 345		}
 346		__free_page(pages[i]);
 347		pages[i] = NULL;
 348	}
 349}
 350
 351static void bm_vk_free(void *ptr, int v)
 352{
 353	if (v)
 354		vfree(ptr);
 355	else
 356		kfree(ptr);
 357}
 358
 359/*
 360 * "have" and "want" are NUMBER OF PAGES.
 361 */
 362static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 363{
 364	struct page **old_pages = b->bm_pages;
 365	struct page **new_pages, *page;
 366	unsigned int i, bytes, vmalloced = 0;
 367	unsigned long have = b->bm_number_of_pages;
 368
 369	BUG_ON(have == 0 && old_pages != NULL);
 370	BUG_ON(have != 0 && old_pages == NULL);
 371
 372	if (have == want)
 373		return old_pages;
 374
 375	/* Trying kmalloc first, falling back to vmalloc.
 376	 * GFP_KERNEL is ok, as this is done when a lower level disk is
 377	 * "attached" to the drbd.  Context is receiver thread or cqueue
 378	 * thread.  As we have no disk yet, we are not in the IO path,
 379	 * not even the IO path of the peer. */
 380	bytes = sizeof(struct page *)*want;
 381	new_pages = kmalloc(bytes, GFP_KERNEL);
 382	if (!new_pages) {
 383		new_pages = vmalloc(bytes);
 384		if (!new_pages)
 385			return NULL;
 386		vmalloced = 1;
 387	}
 388
 389	memset(new_pages, 0, bytes);
 390	if (want >= have) {
 391		for (i = 0; i < have; i++)
 392			new_pages[i] = old_pages[i];
 393		for (; i < want; i++) {
 394			page = alloc_page(GFP_HIGHUSER);
 395			if (!page) {
 396				bm_free_pages(new_pages + have, i - have);
 397				bm_vk_free(new_pages, vmalloced);
 398				return NULL;
 399			}
 400			/* we want to know which page it is
 401			 * from the endio handlers */
 402			bm_store_page_idx(page, i);
 403			new_pages[i] = page;
 404		}
 405	} else {
 406		for (i = 0; i < want; i++)
 407			new_pages[i] = old_pages[i];
 408		/* NOT HERE, we are outside the spinlock!
 409		bm_free_pages(old_pages + want, have - want);
 410		*/
 411	}
 412
 413	if (vmalloced)
 414		b->bm_flags |= BM_P_VMALLOCED;
 415	else
 416		b->bm_flags &= ~BM_P_VMALLOCED;
 417
 418	return new_pages;
 419}
 420
 421/*
 422 * called on driver init only. TODO call when a device is created.
 423 * allocates the drbd_bitmap, and stores it in mdev->bitmap.
 424 */
 425int drbd_bm_init(struct drbd_conf *mdev)
 426{
 427	struct drbd_bitmap *b = mdev->bitmap;
 428	WARN_ON(b != NULL);
 429	b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
 430	if (!b)
 431		return -ENOMEM;
 432	spin_lock_init(&b->bm_lock);
 433	mutex_init(&b->bm_change);
 434	init_waitqueue_head(&b->bm_io_wait);
 435
 436	mdev->bitmap = b;
 437
 438	return 0;
 439}
 440
 441sector_t drbd_bm_capacity(struct drbd_conf *mdev)
 442{
 443	ERR_IF(!mdev->bitmap) return 0;
 444	return mdev->bitmap->bm_dev_capacity;
 445}
 446
 447/* called on driver unload. TODO: call when a device is destroyed.
 448 */
 449void drbd_bm_cleanup(struct drbd_conf *mdev)
 450{
 451	ERR_IF (!mdev->bitmap) return;
 452	bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
 453	bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
 454	kfree(mdev->bitmap);
 455	mdev->bitmap = NULL;
 456}
 457
 458/*
 459 * since (b->bm_bits % BITS_PER_LONG) != 0,
 460 * this masks out the remaining bits.
 461 * Returns the number of bits cleared.
 462 */
 463#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
 464#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)
 465#define BITS_PER_LONG_MASK	(BITS_PER_LONG - 1)
 466static int bm_clear_surplus(struct drbd_bitmap *b)
 467{
 468	unsigned long mask;
 469	unsigned long *p_addr, *bm;
 470	int tmp;
 471	int cleared = 0;
 472
 473	/* number of bits modulo bits per page */
 474	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
 475	/* mask the used bits of the word containing the last bit */
 476	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
 477	/* bitmap is always stored little endian,
 478	 * on disk and in core memory alike */
 479	mask = cpu_to_lel(mask);
 480
 481	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
 482	bm = p_addr + (tmp/BITS_PER_LONG);
 483	if (mask) {
 484		/* If mask != 0, we are not exactly aligned, so bm now points
 485		 * to the long containing the last bit.
 486		 * If mask == 0, bm already points to the word immediately
 487		 * after the last (long word aligned) bit. */
 488		cleared = hweight_long(*bm & ~mask);
 489		*bm &= mask;
 490		bm++;
 491	}
 492
 493	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
 494		/* on a 32bit arch, we may need to zero out
 495		 * a padding long to align with a 64bit remote */
 496		cleared += hweight_long(*bm);
 497		*bm = 0;
 498	}
 499	bm_unmap(p_addr);
 500	return cleared;
 501}
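/* Worked example for bm_clear_surplus (editor's illustration, 64bit arch):
 * with bm_bits == 1000, tmp = 1000, tmp & 63 = 40, so mask keeps the low
 * 40 bits of the last used word (bm = p_addr + 1000/64 = p_addr + 15);
 * bits 40..63 of that word are the surplus that gets counted and cleared. */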
 502
 503static void bm_set_surplus(struct drbd_bitmap *b)
 504{
 505	unsigned long mask;
 506	unsigned long *p_addr, *bm;
 507	int tmp;
 508
 509	/* number of bits modulo bits per page */
 510	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
 511	/* mask the used bits of the word containing the last bit */
 512	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
 513	/* bitmap is always stored little endian,
 514	 * on disk and in core memory alike */
 515	mask = cpu_to_lel(mask);
 516
 517	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
 518	bm = p_addr + (tmp/BITS_PER_LONG);
 519	if (mask) {
 520		/* If mask != 0, we are not exactly aligned, so bm now points
 521		 * to the long containing the last bit.
 522		 * If mask == 0, bm already points to the word immediately
 523		 * after the last (long word aligned) bit. */
 524		*bm |= ~mask;
 525		bm++;
 526	}
 527
 528	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
 529		/* on a 32bit arch, we may need to zero out
 530		 * a padding long to align with a 64bit remote */
 531		*bm = ~0UL;
 532	}
 533	bm_unmap(p_addr);
 534}
 535
 536/* you'd better not modify the bitmap while this is running,
 537 * or its results will be stale */
 538static unsigned long bm_count_bits(struct drbd_bitmap *b)
 539{
 540	unsigned long *p_addr;
 541	unsigned long bits = 0;
 542	unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
 543	int idx, i, last_word;
 544
 545	/* all but last page */
 546	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
 547		p_addr = __bm_map_pidx(b, idx, KM_USER0);
 548		for (i = 0; i < LWPP; i++)
 549			bits += hweight_long(p_addr[i]);
 550		__bm_unmap(p_addr, KM_USER0);
 551		cond_resched();
 552	}
 553	/* last (or only) page */
 554	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
 555	p_addr = __bm_map_pidx(b, idx, KM_USER0);
 556	for (i = 0; i < last_word; i++)
 557		bits += hweight_long(p_addr[i]);
 558	p_addr[last_word] &= cpu_to_lel(mask);
 559	bits += hweight_long(p_addr[last_word]);
 560	/* 32bit arch, may have an unused padding long */
 561	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
 562		p_addr[last_word+1] = 0;
 563	__bm_unmap(p_addr, KM_USER0);
 564	return bits;
 565}
 566
 567/* offset and len in long words.*/
 568static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 569{
 570	unsigned long *p_addr, *bm;
 571	unsigned int idx;
 572	size_t do_now, end;
 573
 574	end = offset + len;
 575
 576	if (end > b->bm_words) {
 577		printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
 578		return;
 579	}
 580
 581	while (offset < end) {
 582		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
 583		idx = bm_word_to_page_idx(b, offset);
 584		p_addr = bm_map_pidx(b, idx);
 585		bm = p_addr + MLPP(offset);
 586		if (bm+do_now > p_addr + LWPP) {
 587			printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
 588			       p_addr, bm, (int)do_now);
 589		} else
 590			memset(bm, c, do_now * sizeof(long));
 591		bm_unmap(p_addr);
 592		bm_set_page_need_writeout(b->bm_pages[idx]);
 593		offset += do_now;
 594	}
 595}
 596
 597/*
 598 * make sure the bitmap has enough room for the attached storage,
 599 * if necessary, resize.
 600 * called whenever we may have changed the device size.
 601 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
 602 * In case this is actually a resize, we copy the old bitmap into the new one.
 603 * Otherwise, the bitmap is initialized to all bits set.
 604 */
 605int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
 606{
 607	struct drbd_bitmap *b = mdev->bitmap;
 608	unsigned long bits, words, owords, obits;
 609	unsigned long want, have, onpages; /* number of pages */
 610	struct page **npages, **opages = NULL;
 611	int err = 0, growing;
 612	int opages_vmalloced;
 613
 614	ERR_IF(!b) return -ENOMEM;
 615
 616	drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
 617
 618	dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
 619			(unsigned long long)capacity);
 620
 621	if (capacity == b->bm_dev_capacity)
 622		goto out;
 623
 624	opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
 625
 626	if (capacity == 0) {
 627		spin_lock_irq(&b->bm_lock);
 628		opages = b->bm_pages;
 629		onpages = b->bm_number_of_pages;
 630		owords = b->bm_words;
 631		b->bm_pages = NULL;
 632		b->bm_number_of_pages =
 633		b->bm_set   =
 634		b->bm_bits  =
 635		b->bm_words =
 636		b->bm_dev_capacity = 0;
 637		spin_unlock_irq(&b->bm_lock);
 638		bm_free_pages(opages, onpages);
 639		bm_vk_free(opages, opages_vmalloced);
 640		goto out;
 641	}
 642	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
 643
 644	/* if we would use
 645	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
 646	   a 32bit host could present the wrong number of words
 647	   to a 64bit host.
 648	*/
 649	words = ALIGN(bits, 64) >> LN2_BPL;
 650
 651	if (get_ldev(mdev)) {
 652		u64 bits_on_disk = ((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12;
 653		put_ldev(mdev);
 654		if (bits > bits_on_disk) {
 655			dev_info(DEV, "bits = %lu\n", bits);
 656			dev_info(DEV, "bits_on_disk = %llu\n", bits_on_disk);
 657			err = -ENOSPC;
 658			goto out;
 659		}
 660	}
 661
 662	want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
 663	have = b->bm_number_of_pages;
 664	if (want == have) {
 665		D_ASSERT(b->bm_pages != NULL);
 666		npages = b->bm_pages;
 667	} else {
 668		if (drbd_insert_fault(mdev, DRBD_FAULT_BM_ALLOC))
 669			npages = NULL;
 670		else
 671			npages = bm_realloc_pages(b, want);
 672	}
 673
 674	if (!npages) {
 675		err = -ENOMEM;
 676		goto out;
 677	}
 678
 679	spin_lock_irq(&b->bm_lock);
 680	opages = b->bm_pages;
 681	owords = b->bm_words;
 682	obits  = b->bm_bits;
 683
 684	growing = bits > obits;
 685	if (opages && growing && set_new_bits)
 686		bm_set_surplus(b);
 687
 688	b->bm_pages = npages;
 689	b->bm_number_of_pages = want;
 690	b->bm_bits  = bits;
 691	b->bm_words = words;
 692	b->bm_dev_capacity = capacity;
 693
 694	if (growing) {
 695		if (set_new_bits) {
 696			bm_memset(b, owords, 0xff, words-owords);
 697			b->bm_set += bits - obits;
 698		} else
 699			bm_memset(b, owords, 0x00, words-owords);
 700
 701	}
 702
 703	if (want < have) {
 704		/* implicit: (opages != NULL) && (opages != npages) */
 705		bm_free_pages(opages + want, have - want);
 706	}
 707
 708	(void)bm_clear_surplus(b);
 709
 710	spin_unlock_irq(&b->bm_lock);
 711	if (opages != npages)
 712		bm_vk_free(opages, opages_vmalloced);
 713	if (!growing)
 714		b->bm_set = bm_count_bits(b);
 715	dev_info(DEV, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
 716
 717 out:
 718	drbd_bm_unlock(mdev);
 719	return err;
 720}
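/* Sizing example for drbd_bm_resize (editor's illustration, 64bit arch,
 * assuming 512 byte sectors and the 4KiB-of-storage-per-bit granularity,
 * i.e. 8 sectors per bit): capacity = 2097152 sectors (1 GiB) gives
 *	bits  = 2097152 / 8 = 262144,
 *	words = ALIGN(262144, 64) >> 6 = 4096 long words,
 *	want  = ALIGN(4096 * 8, 4096) >> 12 = 8 bitmap pages. */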
 721
 722/* inherently racy:
 723 * if not protected by other means, return value may be out of date when
 724 * leaving this function...
 725 * we still need to lock it, since it is important that this returns
 726 * bm_set == 0 precisely.
 727 *
 728 * maybe bm_set should be atomic_t ?
 729 */
 730unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
 731{
 732	struct drbd_bitmap *b = mdev->bitmap;
 733	unsigned long s;
 734	unsigned long flags;
 735
 736	ERR_IF(!b) return 0;
 737	ERR_IF(!b->bm_pages) return 0;
 738
 739	spin_lock_irqsave(&b->bm_lock, flags);
 740	s = b->bm_set;
 741	spin_unlock_irqrestore(&b->bm_lock, flags);
 742
 743	return s;
 744}
 745
 746unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
 747{
 748	unsigned long s;
 749	/* if I don't have a disk, I don't know about out-of-sync status */
 750	if (!get_ldev_if_state(mdev, D_NEGOTIATING))
 751		return 0;
 752	s = _drbd_bm_total_weight(mdev);
 753	put_ldev(mdev);
 754	return s;
 755}
 756
 757size_t drbd_bm_words(struct drbd_conf *mdev)
 758{
 759	struct drbd_bitmap *b = mdev->bitmap;
 760	ERR_IF(!b) return 0;
 761	ERR_IF(!b->bm_pages) return 0;
 762
 763	return b->bm_words;
 764}
 765
 766unsigned long drbd_bm_bits(struct drbd_conf *mdev)
 767{
 768	struct drbd_bitmap *b = mdev->bitmap;
 769	ERR_IF(!b) return 0;
 770
 771	return b->bm_bits;
 772}
 773
 774/* merge 'number' words from 'buffer' into the bitmap starting at 'offset'.
 775 * buffer[i] is expected to be little endian unsigned long.
 776 * bitmap must be locked by drbd_bm_lock.
 777 * currently only used from receive_bitmap.
 778 */
 779void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 780			unsigned long *buffer)
 781{
 782	struct drbd_bitmap *b = mdev->bitmap;
 783	unsigned long *p_addr, *bm;
 784	unsigned long word, bits;
 785	unsigned int idx;
 786	size_t end, do_now;
 787
 788	end = offset + number;
 789
 790	ERR_IF(!b) return;
 791	ERR_IF(!b->bm_pages) return;
 792	if (number == 0)
 793		return;
 794	WARN_ON(offset >= b->bm_words);
 795	WARN_ON(end    >  b->bm_words);
 796
 797	spin_lock_irq(&b->bm_lock);
 798	while (offset < end) {
 799		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
 800		idx = bm_word_to_page_idx(b, offset);
 801		p_addr = bm_map_pidx(b, idx);
 802		bm = p_addr + MLPP(offset);
 803		offset += do_now;
 804		while (do_now--) {
 805			bits = hweight_long(*bm);
 806			word = *bm | *buffer++;
 807			*bm++ = word;
 808			b->bm_set += hweight_long(word) - bits;
 809		}
 810		bm_unmap(p_addr);
 811		bm_set_page_need_writeout(b->bm_pages[idx]);
 812	}
 813	/* with 32bit <-> 64bit cross-platform connect
 814	 * this is only correct for current usage,
 815	 * where we _know_ that we are 64 bit aligned,
 816	 * and know that this function is used in this way, too...
 817	 */
 818	if (end == b->bm_words)
 819		b->bm_set -= bm_clear_surplus(b);
 820	spin_unlock_irq(&b->bm_lock);
 821}
 822
 823/* copy 'number' words from the bitmap starting at 'offset' into 'buffer'.
 824 * buffer[i] will be little endian unsigned long.
 825 */
 826void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
 827		     unsigned long *buffer)
 828{
 829	struct drbd_bitmap *b = mdev->bitmap;
 830	unsigned long *p_addr, *bm;
 831	size_t end, do_now;
 832
 833	end = offset + number;
 834
 835	ERR_IF(!b) return;
 836	ERR_IF(!b->bm_pages) return;
 837
 838	spin_lock_irq(&b->bm_lock);
 839	if ((offset >= b->bm_words) ||
 840	    (end    >  b->bm_words) ||
 841	    (number <= 0))
 842		dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
 843			(unsigned long)	offset,
 844			(unsigned long)	number,
 845			(unsigned long) b->bm_words);
 846	else {
 847		while (offset < end) {
 848			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
 849			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
 850			bm = p_addr + MLPP(offset);
 851			offset += do_now;
 852			while (do_now--)
 853				*buffer++ = *bm++;
 854			bm_unmap(p_addr);
 855		}
 856	}
 857	spin_unlock_irq(&b->bm_lock);
 858}
 859
 860/* set all bits in the bitmap */
 861void drbd_bm_set_all(struct drbd_conf *mdev)
 862{
 863	struct drbd_bitmap *b = mdev->bitmap;
 864	ERR_IF(!b) return;
 865	ERR_IF(!b->bm_pages) return;
 866
 867	spin_lock_irq(&b->bm_lock);
 868	bm_memset(b, 0, 0xff, b->bm_words);
 869	(void)bm_clear_surplus(b);
 870	b->bm_set = b->bm_bits;
 871	spin_unlock_irq(&b->bm_lock);
 872}
 873
 874/* clear all bits in the bitmap */
 875void drbd_bm_clear_all(struct drbd_conf *mdev)
 876{
 877	struct drbd_bitmap *b = mdev->bitmap;
 878	ERR_IF(!b) return;
 879	ERR_IF(!b->bm_pages) return;
 880
 881	spin_lock_irq(&b->bm_lock);
 882	bm_memset(b, 0, 0, b->bm_words);
 883	b->bm_set = 0;
 884	spin_unlock_irq(&b->bm_lock);
 885}
 886
 887struct bm_aio_ctx {
 888	struct drbd_conf *mdev;
 889	atomic_t in_flight;
 890	struct completion done;
 891	unsigned flags;
 892#define BM_AIO_COPY_PAGES	1
 893	int error;
 894};
 895
 896/* bv_page may be a copy, or may be the original */
 897static void bm_async_io_complete(struct bio *bio, int error)
 898{
 899	struct bm_aio_ctx *ctx = bio->bi_private;
 900	struct drbd_conf *mdev = ctx->mdev;
 901	struct drbd_bitmap *b = mdev->bitmap;
 902	unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
 903	int uptodate = bio_flagged(bio, BIO_UPTODATE);
 904
 905
 906	/* strange behavior of some lower level drivers...
 907	 * fail the request by clearing the uptodate flag,
 908	 * but do not return any error?!
 909	 * do we want to WARN() on this? */
 910	if (!error && !uptodate)
 911		error = -EIO;
 912
 913	if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
 914	    !bm_test_page_unchanged(b->bm_pages[idx]))
 915		dev_warn(DEV, "bitmap page idx %u changed during IO!\n", idx);
 916
 917	if (error) {
 918		/* ctx->error will hold the non-zero error code of whichever
 919		 * request completed last, in case error codes differ. */
 920		ctx->error = error;
 921		bm_set_page_io_err(b->bm_pages[idx]);
 922		/* Not identical to on disk version of it.
 923		 * Is BM_PAGE_IO_ERROR enough? */
 924		if (__ratelimit(&drbd_ratelimit_state))
 925			dev_err(DEV, "IO ERROR %d on bitmap page idx %u\n",
 926					error, idx);
 927	} else {
 928		bm_clear_page_io_err(b->bm_pages[idx]);
 929		dynamic_dev_dbg(DEV, "bitmap page idx %u completed\n", idx);
 930	}
 931
 932	bm_page_unlock_io(mdev, idx);
 933
 934	/* FIXME give back to page pool */
 935	if (ctx->flags & BM_AIO_COPY_PAGES)
 936		put_page(bio->bi_io_vec[0].bv_page);
 937
 938	bio_put(bio);
 939
 940	if (atomic_dec_and_test(&ctx->in_flight))
 941		complete(&ctx->done);
 942}
 943
 944static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
 945{
 946	/* we are process context. we always get a bio */
 947	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
 948	struct drbd_conf *mdev = ctx->mdev;
 949	struct drbd_bitmap *b = mdev->bitmap;
 950	struct page *page;
 951	unsigned int len;
 952
 953	sector_t on_disk_sector =
 954		mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
 955	on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
 956
 957	/* this might happen with a very small
 958	 * flexible external meta data device,
 959	 * or with PAGE_SIZE > 4k */
 960	len = min_t(unsigned int, PAGE_SIZE,
 961		(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
 962
 963	/* serialize IO on this page */
 964	bm_page_lock_io(mdev, page_nr);
 965	/* before memcpy and submit,
 966	 * so it can be redirtied any time */
 967	bm_set_page_unchanged(b->bm_pages[page_nr]);
 968
 969	if (ctx->flags & BM_AIO_COPY_PAGES) {
 970		/* FIXME alloc_page is good enough for now, but actually needs
 971		 * to use pre-allocated page pool */
 972		void *src, *dest;
 973		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
 974		dest = kmap_atomic(page, KM_USER0);
 975		src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
 976		memcpy(dest, src, PAGE_SIZE);
 977		kunmap_atomic(src, KM_USER1);
 978		kunmap_atomic(dest, KM_USER0);
 979		bm_store_page_idx(page, page_nr);
 980	} else
 981		page = b->bm_pages[page_nr];
 982
 983	bio->bi_bdev = mdev->ldev->md_bdev;
 984	bio->bi_sector = on_disk_sector;
 985	bio_add_page(bio, page, len, 0);
 986	bio->bi_private = ctx;
 987	bio->bi_end_io = bm_async_io_complete;
 988
 989	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
 990		bio->bi_rw |= rw;
 991		bio_endio(bio, -EIO);
 992	} else {
 993		submit_bio(rw, bio);
 994		/* this should not count as user activity and cause the
 995		 * resync to throttle -- see drbd_rs_should_slow_down(). */
 996		atomic_add(len >> 9, &mdev->rs_sect_ev);
 997	}
 998}
 999
1000/*
1001 * bm_rw: read/write the whole bitmap from/to its on disk location.
1002 */
1003static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
1004{
1005	struct bm_aio_ctx ctx = {
1006		.mdev = mdev,
1007		.in_flight = ATOMIC_INIT(1),
1008		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
1009		.flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
1010	};
1011	struct drbd_bitmap *b = mdev->bitmap;
1012	int num_pages, i, count = 0;
1013	unsigned long now;
1014	char ppb[10];
1015	int err = 0;
1016
1017	/*
1018	 * We are protected against bitmap disappearing/resizing by holding an
1019	 * ldev reference (caller must have called get_ldev()).
1020	 * For read/write, we are protected against changes to the bitmap by
1021	 * the bitmap lock (see drbd_bitmap_io).
1022	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
1023	 * as we submit copies of pages anyways.
1024	 */
1025	if (!ctx.flags)
1026		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
1027
1028	num_pages = b->bm_number_of_pages;
1029
1030	now = jiffies;
1031
1032	/* let the layers below us try to merge these bios... */
1033	for (i = 0; i < num_pages; i++) {
1034		/* ignore completely unchanged pages */
1035		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
1036			break;
1037		if (rw & WRITE) {
1038			if (bm_test_page_unchanged(b->bm_pages[i])) {
1039				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
1040				continue;
1041			}
1042			/* during lazy writeout,
1043			 * ignore those pages not marked for lazy writeout. */
1044			if (lazy_writeout_upper_idx &&
1045			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
1046				dynamic_dev_dbg(DEV, "skipped bm lazy write for idx %u\n", i);
1047				continue;
1048			}
1049		}
1050		atomic_inc(&ctx.in_flight);
1051		bm_page_io_async(&ctx, i, rw);
1052		++count;
1053		cond_resched();
1054	}
1055
1056	/*
1057	 * We initialize ctx.in_flight to one to make sure bm_async_io_complete
1058	 * will not complete() early, and decrement / test it here.  If there
1059	 * are still some bios in flight, we need to wait for them here.
1060	 */
1061	if (!atomic_dec_and_test(&ctx.in_flight))
1062		wait_for_completion(&ctx.done);
1063	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
1064			rw == WRITE ? "WRITE" : "READ",
1065			count, jiffies - now);
1066
1067	if (ctx.error) {
1068		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
1069		drbd_chk_io_error(mdev, 1, true);
1070		err = -EIO; /* ctx.error ? */
1071	}
1072
1073	now = jiffies;
1074	if (rw == WRITE) {
1075		drbd_md_flush(mdev);
1076	} else /* rw == READ */ {
1077		b->bm_set = bm_count_bits(b);
1078		dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
1079		     jiffies - now);
1080	}
1081	now = b->bm_set;
1082
1083	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
1084	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
1085
1086	return err;
1087}
1088
1089/**
1090 * drbd_bm_read() - Read the whole bitmap from its on disk location.
1091 * @mdev:	DRBD device.
1092 */
1093int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
1094{
1095	return bm_rw(mdev, READ, 0);
1096}
1097
1098/**
1099 * drbd_bm_write() - Write the whole bitmap to its on disk location.
1100 * @mdev:	DRBD device.
1101 *
1102 * Will only write pages that have changed since last IO.
1103 */
1104int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
1105{
1106	return bm_rw(mdev, WRITE, 0);
1107}
1108
1109/**
1110 * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
1111 * @mdev:	DRBD device.
1112 * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
1113 */
1114int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
1115{
1116	return bm_rw(mdev, WRITE, upper_idx);
1117}
1118
1119
1120/**
1121 * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
1122 * @mdev:	DRBD device.
1123 * @idx:	bitmap page index
1124 *
1125 * We don't want to special case on logical_block_size of the backend device,
1126 * so we submit PAGE_SIZE aligned pieces.
1127 * Note that on "most" systems, PAGE_SIZE is 4k.
1128 *
1129 * In case this becomes an issue on systems with larger PAGE_SIZE,
1130 * we may want to change this again to write 4k aligned 4k pieces.
1131 */
1132int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
1133{
1134	struct bm_aio_ctx ctx = {
1135		.mdev = mdev,
1136		.in_flight = ATOMIC_INIT(1),
1137		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
1138		.flags = BM_AIO_COPY_PAGES,
1139	};
1140
1141	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
1142		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
1143		return 0;
1144	}
1145
1146	bm_page_io_async(&ctx, idx, WRITE_SYNC);
1147	wait_for_completion(&ctx.done);
1148
1149	if (ctx.error)
1150		drbd_chk_io_error(mdev, 1, true);
1151	/* that should force detach, so the in memory bitmap will be
1152	 * gone in a moment as well. */
1153
1154	mdev->bm_writ_cnt++;
1155	return ctx.error;
1156}
1157
1158/* NOTE
1159 * find_first_bit returns int, we return unsigned long.
1160 * For this to work on 32bit arch with bitnumbers > (1<<32),
1161 * we'd need to return u64, and get a whole lot of other places
1162 * fixed where we still use unsigned long.
1163 *
1164 * this returns a bit number, NOT a sector!
1165 */
1166static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
1167	const int find_zero_bit, const enum km_type km)
1168{
1169	struct drbd_bitmap *b = mdev->bitmap;
1170	unsigned long *p_addr;
1171	unsigned long bit_offset;
1172	unsigned i;
1173
1174
1175	if (bm_fo > b->bm_bits) {
1176		dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
1177		bm_fo = DRBD_END_OF_BITMAP;
1178	} else {
1179		while (bm_fo < b->bm_bits) {
1180			/* bit offset of the first bit in the page */
1181			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
1182			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
1183
1184			if (find_zero_bit)
1185				i = find_next_zero_bit_le(p_addr,
1186						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
1187			else
1188				i = find_next_bit_le(p_addr,
1189						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
1190
1191			__bm_unmap(p_addr, km);
1192			if (i < PAGE_SIZE*8) {
1193				bm_fo = bit_offset + i;
1194				if (bm_fo >= b->bm_bits)
1195					break;
1196				goto found;
1197			}
1198			bm_fo = bit_offset + PAGE_SIZE*8;
1199		}
1200		bm_fo = DRBD_END_OF_BITMAP;
1201	}
1202 found:
1203	return bm_fo;
1204}
1205
1206static unsigned long bm_find_next(struct drbd_conf *mdev,
1207	unsigned long bm_fo, const int find_zero_bit)
1208{
1209	struct drbd_bitmap *b = mdev->bitmap;
1210	unsigned long i = DRBD_END_OF_BITMAP;
1211
1212	ERR_IF(!b) return i;
1213	ERR_IF(!b->bm_pages) return i;
1214
1215	spin_lock_irq(&b->bm_lock);
1216	if (BM_DONT_TEST & b->bm_flags)
1217		bm_print_lock_info(mdev);
1218
1219	i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
1220
1221	spin_unlock_irq(&b->bm_lock);
1222	return i;
1223}
1224
1225unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1226{
1227	return bm_find_next(mdev, bm_fo, 0);
1228}
1229
1230#if 0
1231/* not yet needed for anything. */
1232unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1233{
1234	return bm_find_next(mdev, bm_fo, 1);
1235}
1236#endif
1237
1238/* does not spin_lock_irqsave.
1239 * you must take drbd_bm_lock() first */
1240unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
1241{
1242	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
1243	return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
1244}
1245
1246unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
1247{
1248	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
1249	return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
1250}
1251
1252/* returns number of bits actually changed.
1253 * for val != 0, we change 0 -> 1, return code positive
1254 * for val == 0, we change 1 -> 0, return code negative
1255 * wants bitnr, not sector.
1256 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1257 * Must hold bitmap lock already. */
1258static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1259	unsigned long e, int val)
1260{
1261	struct drbd_bitmap *b = mdev->bitmap;
1262	unsigned long *p_addr = NULL;
1263	unsigned long bitnr;
1264	unsigned int last_page_nr = -1U;
1265	int c = 0;
1266	int changed_total = 0;
1267
1268	if (e >= b->bm_bits) {
1269		dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
1270				s, e, b->bm_bits);
1271		e = b->bm_bits ? b->bm_bits -1 : 0;
1272	}
1273	for (bitnr = s; bitnr <= e; bitnr++) {
1274		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
1275		if (page_nr != last_page_nr) {
1276			if (p_addr)
1277				__bm_unmap(p_addr, KM_IRQ1);
1278			if (c < 0)
1279				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1280			else if (c > 0)
1281				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1282			changed_total += c;
1283			c = 0;
1284			p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
1285			last_page_nr = page_nr;
1286		}
1287		if (val)
1288			c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
1289		else
1290			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
1291	}
1292	if (p_addr)
1293		__bm_unmap(p_addr, KM_IRQ1);
1294	if (c < 0)
1295		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1296	else if (c > 0)
1297		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1298	changed_total += c;
1299	b->bm_set += changed_total;
1300	return changed_total;
1301}
1302
1303/* returns number of bits actually changed.
1304 * for val != 0, we change 0 -> 1, return code positive
1305 * for val == 0, we change 1 -> 0, return code negative
1306 * wants bitnr, not sector */
1307static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
1308	const unsigned long e, int val)
1309{
1310	unsigned long flags;
1311	struct drbd_bitmap *b = mdev->bitmap;
1312	int c = 0;
1313
1314	ERR_IF(!b) return 1;
1315	ERR_IF(!b->bm_pages) return 0;
1316
1317	spin_lock_irqsave(&b->bm_lock, flags);
1318	if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
1319		bm_print_lock_info(mdev);
1320
1321	c = __bm_change_bits_to(mdev, s, e, val);
1322
1323	spin_unlock_irqrestore(&b->bm_lock, flags);
1324	return c;
1325}
1326
1327/* returns number of bits changed 0 -> 1 */
1328int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1329{
1330	return bm_change_bits_to(mdev, s, e, 1);
1331}
1332
1333/* returns number of bits changed 1 -> 0 */
1334int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1335{
1336	return -bm_change_bits_to(mdev, s, e, 0);
1337}
1338
1339/* sets all bits in full words,
1340 * from first_word up to, but not including, last_word */
1341static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1342		int page_nr, int first_word, int last_word)
1343{
1344	int i;
1345	int bits;
1346	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
1347	for (i = first_word; i < last_word; i++) {
1348		bits = hweight_long(paddr[i]);
1349		paddr[i] = ~0UL;
1350		b->bm_set += BITS_PER_LONG - bits;
1351	}
1352	kunmap_atomic(paddr, KM_IRQ1);
1353}
1354
1355/* Same thing as drbd_bm_set_bits,
1356 * but more efficient for a large bit range.
1357 * You must first drbd_bm_lock().
1358 * Can be called to set the whole bitmap in one go.
1359 * Sets bits from s to e _inclusive_. */
1360void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1361{
1362	/* First set_bit from the first bit (s)
1363	 * up to the next long boundary (sl),
1364	 * then assign full words up to the last long boundary (el),
1365	 * then set_bit up to and including the last bit (e).
1366	 *
1367	 * Do not use memset, because we must account for changes,
1368	 * so we need to loop over the words with hweight() anyways.
1369	 */
1370	struct drbd_bitmap *b = mdev->bitmap;
1371	unsigned long sl = ALIGN(s,BITS_PER_LONG);
1372	unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
1373	int first_page;
1374	int last_page;
1375	int page_nr;
1376	int first_word;
1377	int last_word;
1378
1379	if (e - s <= 3*BITS_PER_LONG) {
1380		/* don't bother; el and sl may even be wrong. */
1381		spin_lock_irq(&b->bm_lock);
1382		__bm_change_bits_to(mdev, s, e, 1);
1383		spin_unlock_irq(&b->bm_lock);
1384		return;
1385	}
1386
1387	/* difference is large enough that we can trust sl and el */
1388
1389	spin_lock_irq(&b->bm_lock);
1390
1391	/* bits filling the current long */
1392	if (sl)
1393		__bm_change_bits_to(mdev, s, sl-1, 1);
1394
1395	first_page = sl >> (3 + PAGE_SHIFT);
1396	last_page = el >> (3 + PAGE_SHIFT);
1397
1398	/* MLPP: modulo longs per page */
1399	/* LWPP: long words per page */
1400	first_word = MLPP(sl >> LN2_BPL);
1401	last_word = LWPP;
1402
1403	/* first and full pages, unless first page == last page */
1404	for (page_nr = first_page; page_nr < last_page; page_nr++) {
1405		bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
1406		spin_unlock_irq(&b->bm_lock);
1407		cond_resched();
1408		first_word = 0;
1409		spin_lock_irq(&b->bm_lock);
1410	}
1411
1412	/* last page (respectively only page, for first page == last page) */
1413	last_word = MLPP(el >> LN2_BPL);
1414	bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
1415
1416	/* possibly trailing bits.
1417	 * example: (e & 63) == 63, el will be e+1.
1418	 * if that even was the very last bit,
1419	 * it would trigger an assert in __bm_change_bits_to()
1420	 */
1421	if (el <= e)
1422		__bm_change_bits_to(mdev, el, e, 1);
1423	spin_unlock_irq(&b->bm_lock);
1424}
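/* Worked example for _drbd_bm_set_bits (editor's illustration, 64bit arch):
 * s = 70, e = 300, so e - s > 3*BITS_PER_LONG and the fast path is taken.
 * sl = ALIGN(70, 64) = 128 and el = 301 & ~63 = 256: bits 70..127 are set
 * individually, bits 128..255 as two full words via
 * bm_set_full_words_within_one_page(), and the trailing bits 256..300
 * individually again. */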
1425
1426/* returns bit state
1427 * wants bitnr, NOT sector.
1428 * inherently racy... area needs to be locked by means of {al,rs}_lru
1429 *  1 ... bit set
1430 *  0 ... bit not set
1431 * -1 ... first out of bounds access, stop testing for bits!
1432 */
1433int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
1434{
1435	unsigned long flags;
1436	struct drbd_bitmap *b = mdev->bitmap;
1437	unsigned long *p_addr;
1438	int i;
1439
1440	ERR_IF(!b) return 0;
1441	ERR_IF(!b->bm_pages) return 0;
1442
1443	spin_lock_irqsave(&b->bm_lock, flags);
1444	if (BM_DONT_TEST & b->bm_flags)
1445		bm_print_lock_info(mdev);
1446	if (bitnr < b->bm_bits) {
1447		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
1448		i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
1449		bm_unmap(p_addr);
1450	} else if (bitnr == b->bm_bits) {
1451		i = -1;
1452	} else { /* (bitnr > b->bm_bits) */
1453		dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
1454		i = 0;
1455	}
1456
1457	spin_unlock_irqrestore(&b->bm_lock, flags);
1458	return i;
1459}
1460
1461/* returns number of bits set in the range [s, e] */
1462int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
1463{
1464	unsigned long flags;
1465	struct drbd_bitmap *b = mdev->bitmap;
1466	unsigned long *p_addr = NULL;
1467	unsigned long bitnr;
1468	unsigned int page_nr = -1U;
1469	int c = 0;
1470
1471	/* If this is called without a bitmap, that is a bug.  But just to be
1472	 * robust in case we screwed up elsewhere, in that case pretend there
1473	 * was one dirty bit in the requested area, so we won't try to do a
1474	 * local read there (no bitmap probably implies no disk) */
1475	ERR_IF(!b) return 1;
1476	ERR_IF(!b->bm_pages) return 1;
1477
1478	spin_lock_irqsave(&b->bm_lock, flags);
1479	if (BM_DONT_TEST & b->bm_flags)
1480		bm_print_lock_info(mdev);
1481	for (bitnr = s; bitnr <= e; bitnr++) {
1482		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
1483		if (page_nr != idx) {
1484			page_nr = idx;
1485			if (p_addr)
1486				bm_unmap(p_addr);
1487			p_addr = bm_map_pidx(b, idx);
1488		}
1489		ERR_IF (bitnr >= b->bm_bits) {
1490			dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1491		} else {
1492			c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1493		}
1494	}
1495	if (p_addr)
1496		bm_unmap(p_addr);
1497	spin_unlock_irqrestore(&b->bm_lock, flags);
1498	return c;
1499}
1500
1501
1502/* inherently racy...
1503 * return value may be already out-of-date when this function returns.
1504 * but the general usage is that this is only use during a cstate when bits are
1505 * only cleared, not set, and typically only care for the case when the return
1506 * value is zero, or we already "locked" this "bitmap extent" by other means.
1507 *
1508 * enr is bm-extent number, since we chose to name one sector (512 bytes)
1509 * worth of the bitmap a "bitmap extent".
1510 *
1511 * TODO
1512 * I think since we use it like a reference count, we should use the real
1513 * reference count of some bitmap extent element from some lru instead...
1514 *
1515 */
1516int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
1517{
1518	struct drbd_bitmap *b = mdev->bitmap;
1519	int count, s, e;
1520	unsigned long flags;
1521	unsigned long *p_addr, *bm;
1522
1523	ERR_IF(!b) return 0;
1524	ERR_IF(!b->bm_pages) return 0;
1525
1526	spin_lock_irqsave(&b->bm_lock, flags);
1527	if (BM_DONT_TEST & b->bm_flags)
1528		bm_print_lock_info(mdev);
1529
1530	s = S2W(enr);
1531	e = min((size_t)S2W(enr+1), b->bm_words);
1532	count = 0;
1533	if (s < b->bm_words) {
1534		int n = e-s;
1535		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1536		bm = p_addr + MLPP(s);
1537		while (n--)
1538			count += hweight_long(*bm++);
1539		bm_unmap(p_addr);
1540	} else {
1541		dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
1542	}
1543	spin_unlock_irqrestore(&b->bm_lock, flags);
1544	return count;
1545}
1546
1547/* Set all bits covered by the AL-extent al_enr.
1548 * Returns number of bits changed. */
1549unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
1550{
1551	struct drbd_bitmap *b = mdev->bitmap;
1552	unsigned long *p_addr, *bm;
1553	unsigned long weight;
1554	unsigned long s, e;
1555	int count, i, do_now;
1556	ERR_IF(!b) return 0;
1557	ERR_IF(!b->bm_pages) return 0;
1558
1559	spin_lock_irq(&b->bm_lock);
1560	if (BM_DONT_SET & b->bm_flags)
1561		bm_print_lock_info(mdev);
1562	weight = b->bm_set;
1563
1564	s = al_enr * BM_WORDS_PER_AL_EXT;
1565	e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
1566	/* assert that s and e are on the same page */
1567	D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
1568	      ==  s    >> (PAGE_SHIFT - LN2_BPL + 3));
1569	count = 0;
1570	if (s < b->bm_words) {
1571		i = do_now = e-s;
1572		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1573		bm = p_addr + MLPP(s);
1574		while (i--) {
1575			count += hweight_long(*bm);
1576			*bm = -1UL;
1577			bm++;
1578		}
1579		bm_unmap(p_addr);
1580		b->bm_set += do_now*BITS_PER_LONG - count;
1581		if (e == b->bm_words)
1582			b->bm_set -= bm_clear_surplus(b);
1583	} else {
1584		dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
1585	}
1586	weight = b->bm_set - weight;
1587	spin_unlock_irq(&b->bm_lock);
1588	return weight;
1589}
v6.9.4
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3   drbd_bitmap.c
   4
   5   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   6
   7   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
   8   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   9   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  10
  11 */
  12
  13#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
  14
  15#include <linux/bitmap.h>
  16#include <linux/vmalloc.h>
  17#include <linux/string.h>
  18#include <linux/drbd.h>
  19#include <linux/slab.h>
  20#include <linux/highmem.h>
  21
  22#include "drbd_int.h"
  23
  24
  25/* OPAQUE outside this file!
  26 * interface defined in drbd_int.h
  27
  28 * convention:
  29 * function name drbd_bm_... => used elsewhere, "public".
  30 * function name      bm_... => internal to implementation, "private".
  31 */
  32
  33
  34/*
  35 * LIMITATIONS:
  36 * We want to support >= 1 PiB of backend storage, while for now still using
  37 * a granularity of one bit per 4KiB of storage.
  38 * 1 << 50		bytes backend storage (1 PiB)
  39 * 1 << (50 - 12)	bits needed
  40 *	38 --> we need u64 to index and count bits
  41 * 1 << (38 - 3)	bitmap bytes needed
  42 *	35 --> we still need u64 to index and count bytes
  43 *			(that's 32 GiB of bitmap for 1 PiB storage)
  44 * 1 << (35 - 2)	32bit longs needed
  45 *	33 --> we'd even need u64 to index and count 32bit long words.
  46 * 1 << (35 - 3)	64bit longs needed
  47 *	32 --> we could get away with a 32bit unsigned int to index and count
  48 *	64bit long words, but I rather stay with unsigned long for now.
  49 *	We probably should neither count nor point to bytes or long words
  50 *	directly, but either by bitnumber, or by page index and offset.
  51 * 1 << (35 - 12)
  52 *	23 --> we need that many 4KiB pages of bitmap.
  53 *	1 << (23 + 3) --> on a 64bit arch,
  54 *	we need 64 MiB to store the array of page pointers.
  55 *
  56 * Because I'm lazy, and because the resulting patch was too large, too ugly
  57 * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
  58 * (1 << 32) bits * 4k storage.
  59 *
  60
  61 * bitmap storage and IO:
  62 *	Bitmap is stored little endian on disk, and is kept little endian in
  63 *	core memory. Currently we still hold the full bitmap in core as long
  64 *	as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
  65 *	seems excessive.
  66 *
  67 *	We plan to reduce the amount of in-core bitmap pages by paging them in
  68 *	and out against their on-disk location as necessary, but need to make
  69 *	sure we don't cause too much meta data IO, and must not deadlock in
  70 *	tight memory situations. This needs some more work.
  71 */
  72
  73/*
  74 * NOTE
  75 *  Access to the *bm_pages is protected by bm_lock.
  76 *  It is safe to read the other members within the lock.
  77 *
  78 *  drbd_bm_set_bits is called from bio_endio callbacks,
  79 *  we may be called with irq already disabled,
  80 *  so we need spin_lock_irqsave().
  81 *  And we need the kmap_atomic.
  82 */
  83struct drbd_bitmap {
  84	struct page **bm_pages;
  85	spinlock_t bm_lock;
  86
  87	/* exclusively to be used by __al_write_transaction(),
  88	 * drbd_bm_mark_for_writeout() and
  89	 * drbd_bm_write_hinted() -> bm_rw() called from there.
  90	 */
  91	unsigned int n_bitmap_hints;
  92	unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
  93
  94	/* see LIMITATIONS: above */
  95
  96	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
  97	unsigned long bm_bits;
  98	size_t   bm_words;
  99	size_t   bm_number_of_pages;
 100	sector_t bm_dev_capacity;
 101	struct mutex bm_change; /* serializes resize operations */
 102
 103	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
 104
 105	enum bm_flag bm_flags;
 106
 107	/* debugging aid, in case we are still racy somewhere */
 108	char          *bm_why;
 109	struct task_struct *bm_task;
 110};
 111
 112#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
 113static void __bm_print_lock_info(struct drbd_device *device, const char *func)
 114{
 115	struct drbd_bitmap *b = device->bitmap;
 116	if (!drbd_ratelimit())
 117		return;
 118	drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
 119		 current->comm, task_pid_nr(current),
 120		 func, b->bm_why ?: "?",
 121		 b->bm_task->comm, task_pid_nr(b->bm_task));
 122}
 123
 124void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
 125{
 126	struct drbd_bitmap *b = device->bitmap;
 127	int trylock_failed;
 128
 129	if (!b) {
 130		drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
 131		return;
 132	}
 133
 134	trylock_failed = !mutex_trylock(&b->bm_change);
 135
 136	if (trylock_failed) {
 137		drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
 138			  current->comm, task_pid_nr(current),
 139			  why, b->bm_why ?: "?",
 140			  b->bm_task->comm, task_pid_nr(b->bm_task));
 141		mutex_lock(&b->bm_change);
 142	}
 143	if (BM_LOCKED_MASK & b->bm_flags)
 144		drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
 145	b->bm_flags |= flags & BM_LOCKED_MASK;
 146
 147	b->bm_why  = why;
 148	b->bm_task = current;
 149}
 150
 151void drbd_bm_unlock(struct drbd_device *device)
 152{
 153	struct drbd_bitmap *b = device->bitmap;
 154	if (!b) {
 155		drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
 156		return;
 157	}
 158
 159	if (!(BM_LOCKED_MASK & device->bitmap->bm_flags))
 160		drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");
 161
 162	b->bm_flags &= ~BM_LOCKED_MASK;
 163	b->bm_why  = NULL;
 164	b->bm_task = NULL;
 165	mutex_unlock(&b->bm_change);
 166}
 167
 168/* we store some "meta" info about our pages in page->private */
 169/* at a granularity of 4k storage per bitmap bit:
  170 * one petabyte of storage: 1<<50 bytes, 1<<38 * 4k storage blocks
 171 *  1<<38 bits,
 172 *  1<<23 4k bitmap pages.
  173 * Use 24 bits as page index; that covers 2 PiB of storage
 174 * at a granularity of 4k per bit.
 175 * Used to report the failed page idx on io error from the endio handlers.
 176 */
 177#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
 178/* this page is currently read in, or written back */
 179#define BM_PAGE_IO_LOCK		31
 180/* if there has been an IO error for this page */
 181#define BM_PAGE_IO_ERROR	30
 182/* this is to be able to intelligently skip disk IO,
 183 * set if bits have been set since last IO. */
 184#define BM_PAGE_NEED_WRITEOUT	29
  185/* to mark for lazy writeout once syncer cleared all clearable bits,
  186 * set if bits have been cleared since last IO. */
 187#define BM_PAGE_LAZY_WRITEOUT	28
 188/* pages marked with this "HINT" will be considered for writeout
 189 * on activity log transactions */
 190#define BM_PAGE_HINT_WRITEOUT	27
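/* Illustrative sketch (not compiled; the function name is hypothetical):
 * how the pieces above share page->private.  The low 24 bits hold the
 * page index, the high bits the state flags, so both can be read from
 * the same word. */
#if 0
static void bm_page_meta_example(struct page *page)
{
	/* index stored once by bm_store_page_idx() after allocation */
	unsigned long idx = page_private(page) & BM_PAGE_IDX_MASK;
	/* flags are manipulated atomically while the page is live */
	bool need_writeout = test_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));

	pr_info("bitmap page %lu %s writeout\n",
		idx, need_writeout ? "needs" : "does not need");
}
#endif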
 191
 192/* store_page_idx uses non-atomic assignment. It is only used directly after
 193 * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
 194 * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
 195 * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
 196 * requires it all to be atomic as well. */
 197static void bm_store_page_idx(struct page *page, unsigned long idx)
 198{
 199	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
 200	set_page_private(page, idx);
 201}
 202
 203static unsigned long bm_page_to_idx(struct page *page)
 204{
 205	return page_private(page) & BM_PAGE_IDX_MASK;
 206}
 207
  208/* As it is very unlikely that the same page is under IO from more than one
 209 * context, we can get away with a bit per page and one wait queue per bitmap.
 210 */
 211static void bm_page_lock_io(struct drbd_device *device, int page_nr)
 212{
 213	struct drbd_bitmap *b = device->bitmap;
 214	void *addr = &page_private(b->bm_pages[page_nr]);
 215	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
 216}
 217
 218static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
 219{
 220	struct drbd_bitmap *b = device->bitmap;
 221	void *addr = &page_private(b->bm_pages[page_nr]);
 222	clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
 223	wake_up(&device->bitmap->bm_io_wait);
 224}
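/* Hypothetical caller, to illustrate the bit-lock pattern above:
 *	bm_page_lock_io(device, idx);
 *	... read or write bm_pages[idx] ...
 *	bm_page_unlock_io(device, idx);
 * Contenders sleep on the shared bm_io_wait queue until BM_PAGE_IO_LOCK
 * in page->private is released. */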
 225
 226/* set _before_ submit_io, so it may be reset due to being changed
 227 * while this page is in flight... will get submitted later again */
 228static void bm_set_page_unchanged(struct page *page)
 229{
 230	/* use cmpxchg? */
 231	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
 232	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
 233}
 234
 235static void bm_set_page_need_writeout(struct page *page)
 236{
 237	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
 238}
 239
 240void drbd_bm_reset_al_hints(struct drbd_device *device)
 241{
 242	device->bitmap->n_bitmap_hints = 0;
 243}
 244
 245/**
 246 * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
 247 * @device:	DRBD device.
 248 * @page_nr:	the bitmap page to mark with the "hint" flag
 249 *
 250 * From within an activity log transaction, we mark a few pages with these
 251 * hints, then call drbd_bm_write_hinted(), which will only write out changed
 252 * pages which are flagged with this mark.
 253 */
 254void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
 255{
 256	struct drbd_bitmap *b = device->bitmap;
 257	struct page *page;
  258	if (page_nr >= b->bm_number_of_pages) {
  259		drbd_warn(device, "BAD: page_nr: %d, number_of_pages: %d\n",
  260			 page_nr, (int)b->bm_number_of_pages);
  261		return;
  262	}
  263	page = b->bm_pages[page_nr];
 264	BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
 265	if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
 266		b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
 267}
 268
 269static int bm_test_page_unchanged(struct page *page)
 270{
 271	volatile const unsigned long *addr = &page_private(page);
 272	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
 273}
 274
 275static void bm_set_page_io_err(struct page *page)
 276{
 277	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
 278}
 279
 280static void bm_clear_page_io_err(struct page *page)
 281{
 282	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
 283}
 284
 285static void bm_set_page_lazy_writeout(struct page *page)
 286{
 287	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
 288}
 289
 290static int bm_test_page_lazy_writeout(struct page *page)
 291{
 292	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
 293}
 294
 295/* on a 32bit box, this would allow for exactly (2<<38) bits. */
 296static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
 297{
 298	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
 299	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
 300	BUG_ON(page_nr >= b->bm_number_of_pages);
 301	return page_nr;
 302}
 303
 304static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
 305{
 306	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
 307	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
 308	BUG_ON(page_nr >= b->bm_number_of_pages);
 309	return page_nr;
 310}
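/* Worked example for the two index helpers above, assuming 4KiB pages
 * (PAGE_SHIFT == 12) and 64bit longs (LN2_BPL == 6): word 512 starts at
 * byte offset 4096, so it maps to page 512 >> (12 - 6 + 3) == 1, and
 * bit 32768 likewise maps to page 32768 >> (12 + 3) == 1. */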
 311
 312static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 313{
 314	struct page *page = b->bm_pages[idx];
 315	return (unsigned long *) kmap_atomic(page);
 316}
 317
 318static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 319{
 320	return __bm_map_pidx(b, idx);
 321}
 322
 323static void __bm_unmap(unsigned long *p_addr)
 324{
 325	kunmap_atomic(p_addr);
  326}
 327
  328static void bm_unmap(unsigned long *p_addr)
  329{
  330	__bm_unmap(p_addr);
  331}
 332
 333/* long word offset of _bitmap_ sector */
 334#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
  335/* word offset from start of bitmap to word number _in_page_,
  336 * i.e. modulo longs per page:
  337 *	#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)))
  338 * hm, well, Philipp thinks gcc might not optimize the % into & (... - 1),
  339 * so do it explicitly:
  340 */
 341#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
 342
 343/* Long words per page */
 344#define LWPP (PAGE_SIZE/sizeof(long))
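/* Example, assuming 4KiB pages and 64bit longs: LWPP == 512, so word
 * 1000 of the bitmap lives in page 1 at word offset
 * MLPP(1000) == 1000 & 511 == 488. */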
 345
 346/*
 347 * actually most functions herein should take a struct drbd_bitmap*, not a
 348 * struct drbd_device*, but for the debug macros I like to have the device around
  349 * to be able to report device specific messages.
 350 */
 351
 352
 353static void bm_free_pages(struct page **pages, unsigned long number)
 354{
 355	unsigned long i;
 356	if (!pages)
 357		return;
 358
 359	for (i = 0; i < number; i++) {
 360		if (!pages[i]) {
 361			pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
 362				 i, number);
 363			continue;
 364		}
 365		__free_page(pages[i]);
 366		pages[i] = NULL;
 367	}
 368}
 369
 370static inline void bm_vk_free(void *ptr)
 371{
 372	kvfree(ptr);
 373}
 374
 375/*
 376 * "have" and "want" are NUMBER OF PAGES.
 377 */
 378static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 379{
 380	struct page **old_pages = b->bm_pages;
 381	struct page **new_pages, *page;
 382	unsigned int i, bytes;
 383	unsigned long have = b->bm_number_of_pages;
 384
 385	BUG_ON(have == 0 && old_pages != NULL);
 386	BUG_ON(have != 0 && old_pages == NULL);
 387
 388	if (have == want)
 389		return old_pages;
 390
 391	/* Trying kmalloc first, falling back to vmalloc.
 392	 * GFP_NOIO, as this is called while drbd IO is "suspended",
 393	 * and during resize or attach on diskless Primary,
 394	 * we must not block on IO to ourselves.
 395	 * Context is receiver thread or dmsetup. */
 396	bytes = sizeof(struct page *)*want;
 397	new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
 398	if (!new_pages) {
 399		new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO);
 400		if (!new_pages)
 401			return NULL;
 402	}
 403
 404	if (want >= have) {
 405		for (i = 0; i < have; i++)
 406			new_pages[i] = old_pages[i];
 407		for (; i < want; i++) {
 408			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
 409			if (!page) {
 410				bm_free_pages(new_pages + have, i - have);
 411				bm_vk_free(new_pages);
 412				return NULL;
 413			}
 414			/* we want to know which page it is
 415			 * from the endio handlers */
 416			bm_store_page_idx(page, i);
 417			new_pages[i] = page;
 418		}
 419	} else {
 420		for (i = 0; i < want; i++)
 421			new_pages[i] = old_pages[i];
 422		/* NOT HERE, we are outside the spinlock!
 423		bm_free_pages(old_pages + want, have - want);
 424		*/
 425	}
 426
 427	return new_pages;
 428}
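/* The kzalloc-then-__vmalloc fallback above is the same idea that
 * kvmalloc() implements generically; a minimal standalone sketch of the
 * pattern (not compiled; the function name is hypothetical):
 */
#if 0
static void *alloc_ptr_array_example(size_t bytes)
{
	/* try the cheap, physically contiguous allocation first ... */
	void *p = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);

	/* ... fall back to virtually contiguous memory */
	if (!p)
		p = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO);
	/* either way, the result may later be passed to kvfree() */
	return p;
}
#endif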
 429
 430/*
 431 * allocates the drbd_bitmap and stores it in device->bitmap.
 432 */
 433int drbd_bm_init(struct drbd_device *device)
 434{
 435	struct drbd_bitmap *b = device->bitmap;
 436	WARN_ON(b != NULL);
 437	b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
 438	if (!b)
 439		return -ENOMEM;
 440	spin_lock_init(&b->bm_lock);
 441	mutex_init(&b->bm_change);
 442	init_waitqueue_head(&b->bm_io_wait);
 443
 444	device->bitmap = b;
 445
 446	return 0;
 447}
 448
 449sector_t drbd_bm_capacity(struct drbd_device *device)
 450{
 451	if (!expect(device, device->bitmap))
 452		return 0;
 453	return device->bitmap->bm_dev_capacity;
 454}
 455
 456/* called on driver unload. TODO: call when a device is destroyed.
 457 */
 458void drbd_bm_cleanup(struct drbd_device *device)
 459{
 460	if (!expect(device, device->bitmap))
 461		return;
 462	bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
 463	bm_vk_free(device->bitmap->bm_pages);
 464	kfree(device->bitmap);
 465	device->bitmap = NULL;
 466}
 467
 468/*
  469 * since (b->bm_bits % BITS_PER_LONG) may be != 0,
 470 * this masks out the remaining bits.
 471 * Returns the number of bits cleared.
 472 */
 473#ifndef BITS_PER_PAGE
 474#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
 475#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)
 476#else
 477# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
 478#  error "ambiguous BITS_PER_PAGE"
 479# endif
 480#endif
 481#define BITS_PER_LONG_MASK	(BITS_PER_LONG - 1)
 482static int bm_clear_surplus(struct drbd_bitmap *b)
 483{
 484	unsigned long mask;
 485	unsigned long *p_addr, *bm;
 486	int tmp;
 487	int cleared = 0;
 488
 489	/* number of bits modulo bits per page */
 490	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
 491	/* mask the used bits of the word containing the last bit */
 492	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
 493	/* bitmap is always stored little endian,
 494	 * on disk and in core memory alike */
 495	mask = cpu_to_lel(mask);
 496
 497	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
 498	bm = p_addr + (tmp/BITS_PER_LONG);
 499	if (mask) {
 500		/* If mask != 0, we are not exactly aligned, so bm now points
 501		 * to the long containing the last bit.
 502		 * If mask == 0, bm already points to the word immediately
 503		 * after the last (long word aligned) bit. */
 504		cleared = hweight_long(*bm & ~mask);
 505		*bm &= mask;
 506		bm++;
 507	}
 508
 509	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
 510		/* on a 32bit arch, we may need to zero out
 511		 * a padding long to align with a 64bit remote */
 512		cleared += hweight_long(*bm);
 513		*bm = 0;
 514	}
 515	bm_unmap(p_addr);
 516	return cleared;
 517}
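/* Worked example for bm_clear_surplus() (64bit longs): with
 * bm_bits == 100, tmp == 100 and mask == cpu_to_lel((1UL << 36) - 1);
 * bm points at word 100/64 == 1 of the last page, bits 100..127 (the
 * surplus) are cleared, bits 64..99 stay untouched. */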
 518
 519static void bm_set_surplus(struct drbd_bitmap *b)
 520{
 521	unsigned long mask;
 522	unsigned long *p_addr, *bm;
 523	int tmp;
 524
 525	/* number of bits modulo bits per page */
 526	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
 527	/* mask the used bits of the word containing the last bit */
 528	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
 529	/* bitmap is always stored little endian,
 530	 * on disk and in core memory alike */
 531	mask = cpu_to_lel(mask);
 532
 533	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
 534	bm = p_addr + (tmp/BITS_PER_LONG);
 535	if (mask) {
 536		/* If mask != 0, we are not exactly aligned, so bm now points
 537		 * to the long containing the last bit.
 538		 * If mask == 0, bm already points to the word immediately
 539		 * after the last (long word aligned) bit. */
 540		*bm |= ~mask;
 541		bm++;
 542	}
 543
 544	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
  545		/* on a 32bit arch, we may need to set a padding
  546		 * long to all ones, to align with a 64bit remote */
 547		*bm = ~0UL;
 548	}
 549	bm_unmap(p_addr);
 550}
 551
 552/* you better not modify the bitmap while this is running,
 553 * or its results will be stale */
 554static unsigned long bm_count_bits(struct drbd_bitmap *b)
 555{
 556	unsigned long *p_addr;
 557	unsigned long bits = 0;
 558	unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
 559	int idx, last_word;
 560
 561	/* all but last page */
 562	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
 563		p_addr = __bm_map_pidx(b, idx);
 564		bits += bitmap_weight(p_addr, BITS_PER_PAGE);
 565		__bm_unmap(p_addr);
 566		cond_resched();
 567	}
 568	/* last (or only) page */
 569	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
 570	p_addr = __bm_map_pidx(b, idx);
 571	bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
 572	p_addr[last_word] &= cpu_to_lel(mask);
 573	bits += hweight_long(p_addr[last_word]);
 574	/* 32bit arch, may have an unused padding long */
 575	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
 576		p_addr[last_word+1] = 0;
 577	__bm_unmap(p_addr);
 578	return bits;
 579}
 580
 581/* offset and len in long words.*/
 582static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
 583{
 584	unsigned long *p_addr, *bm;
 585	unsigned int idx;
 586	size_t do_now, end;
 587
 588	end = offset + len;
 589
 590	if (end > b->bm_words) {
 591		pr_alert("bm_memset end > bm_words\n");
 592		return;
 593	}
 594
 595	while (offset < end) {
 596		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
 597		idx = bm_word_to_page_idx(b, offset);
 598		p_addr = bm_map_pidx(b, idx);
 599		bm = p_addr + MLPP(offset);
 600		if (bm+do_now > p_addr + LWPP) {
 601			pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
 602			       p_addr, bm, (int)do_now);
 603		} else
 604			memset(bm, c, do_now * sizeof(long));
 605		bm_unmap(p_addr);
 606		bm_set_page_need_writeout(b->bm_pages[idx]);
 607		offset += do_now;
 608	}
 609}
 610
 611/* For the layout, see comment above drbd_md_set_sector_offsets(). */
 612static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
 613{
 614	u64 bitmap_sectors;
 615	if (ldev->md.al_offset == 8)
 616		bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
 617	else
 618		bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
 619	return bitmap_sectors << (9 + 3);
 620}
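/* A sector holds 512 == 1<<9 bytes, i.e. 1<<(9+3) bits, hence the shift
 * above: e.g. 8 bitmap sectors can hold 8 * 4096 == 32768 bits, which
 * track 128 MiB of backend storage at 4KiB per bit. */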
 621
 622/*
 623 * make sure the bitmap has enough room for the attached storage,
 624 * if necessary, resize.
 625 * called whenever we may have changed the device size.
 626 * returns -ENOMEM if we could not allocate enough memory, 0 on success.
 627 * In case this is actually a resize, we copy the old bitmap into the new one.
 628 * Otherwise, the bitmap is initialized to all bits set.
 629 */
 630int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
 631{
 632	struct drbd_bitmap *b = device->bitmap;
 633	unsigned long bits, words, owords, obits;
 634	unsigned long want, have, onpages; /* number of pages */
 635	struct page **npages, **opages = NULL;
 636	int err = 0;
 637	bool growing;
 638
 639	if (!expect(device, b))
 640		return -ENOMEM;
 641
 642	drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
 643
 644	drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
 645			(unsigned long long)capacity);
 646
 647	if (capacity == b->bm_dev_capacity)
 648		goto out;
 649
 650	if (capacity == 0) {
 651		spin_lock_irq(&b->bm_lock);
 652		opages = b->bm_pages;
 653		onpages = b->bm_number_of_pages;
 654		owords = b->bm_words;
 655		b->bm_pages = NULL;
 656		b->bm_number_of_pages =
 657		b->bm_set   =
 658		b->bm_bits  =
 659		b->bm_words =
 660		b->bm_dev_capacity = 0;
 661		spin_unlock_irq(&b->bm_lock);
 662		bm_free_pages(opages, onpages);
 663		bm_vk_free(opages);
 664		goto out;
 665	}
 666	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
 667
 668	/* if we would use
 669	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
 670	   a 32bit host could present the wrong number of words
 671	   to a 64bit host.
 672	*/
 673	words = ALIGN(bits, 64) >> LN2_BPL;
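	/* e.g. bits == 70: ALIGN(70, 64) == 128 bits == 16 bytes on either
	 * arch; ALIGN(70, BITS_PER_LONG) on a 32bit host would have been
	 * 96 bits == 12 bytes, not a multiple of a 64bit peer's longs. */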
 674
 675	if (get_ldev(device)) {
 676		u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
 677		put_ldev(device);
 678		if (bits > bits_on_disk) {
 679			drbd_info(device, "bits = %lu\n", bits);
 680			drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
 681			err = -ENOSPC;
 682			goto out;
 683		}
 684	}
 685
 686	want = PFN_UP(words*sizeof(long));
 687	have = b->bm_number_of_pages;
 688	if (want == have) {
 689		D_ASSERT(device, b->bm_pages != NULL);
 690		npages = b->bm_pages;
 691	} else {
 692		if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
 693			npages = NULL;
 694		else
 695			npages = bm_realloc_pages(b, want);
 696	}
 697
 698	if (!npages) {
 699		err = -ENOMEM;
 700		goto out;
 701	}
 702
 703	spin_lock_irq(&b->bm_lock);
 704	opages = b->bm_pages;
 705	owords = b->bm_words;
 706	obits  = b->bm_bits;
 707
 708	growing = bits > obits;
 709	if (opages && growing && set_new_bits)
 710		bm_set_surplus(b);
 711
 712	b->bm_pages = npages;
 713	b->bm_number_of_pages = want;
 714	b->bm_bits  = bits;
 715	b->bm_words = words;
 716	b->bm_dev_capacity = capacity;
 717
 718	if (growing) {
 719		if (set_new_bits) {
 720			bm_memset(b, owords, 0xff, words-owords);
 721			b->bm_set += bits - obits;
 722		} else
 723			bm_memset(b, owords, 0x00, words-owords);
 724
 725	}
 726
 727	if (want < have) {
 728		/* implicit: (opages != NULL) && (opages != npages) */
 729		bm_free_pages(opages + want, have - want);
 730	}
 731
 732	(void)bm_clear_surplus(b);
 733
 734	spin_unlock_irq(&b->bm_lock);
 735	if (opages != npages)
 736		bm_vk_free(opages);
 737	if (!growing)
 738		b->bm_set = bm_count_bits(b);
 739	drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
 740
 741 out:
 742	drbd_bm_unlock(device);
 743	return err;
 744}
 745
 746/* inherently racy:
 747 * if not protected by other means, return value may be out of date when
 748 * leaving this function...
 749 * we still need to lock it, since it is important that this returns
 750 * bm_set == 0 precisely.
 751 *
 752 * maybe bm_set should be atomic_t ?
 753 */
 754unsigned long _drbd_bm_total_weight(struct drbd_device *device)
 755{
 756	struct drbd_bitmap *b = device->bitmap;
 757	unsigned long s;
 758	unsigned long flags;
 759
 760	if (!expect(device, b))
 761		return 0;
 762	if (!expect(device, b->bm_pages))
 763		return 0;
 764
 765	spin_lock_irqsave(&b->bm_lock, flags);
 766	s = b->bm_set;
 767	spin_unlock_irqrestore(&b->bm_lock, flags);
 768
 769	return s;
 770}
 771
 772unsigned long drbd_bm_total_weight(struct drbd_device *device)
 773{
 774	unsigned long s;
 775	/* if I don't have a disk, I don't know about out-of-sync status */
 776	if (!get_ldev_if_state(device, D_NEGOTIATING))
 777		return 0;
 778	s = _drbd_bm_total_weight(device);
 779	put_ldev(device);
 780	return s;
 781}
 782
 783size_t drbd_bm_words(struct drbd_device *device)
 784{
 785	struct drbd_bitmap *b = device->bitmap;
 786	if (!expect(device, b))
 787		return 0;
 788	if (!expect(device, b->bm_pages))
 789		return 0;
 790
 791	return b->bm_words;
 792}
 793
 794unsigned long drbd_bm_bits(struct drbd_device *device)
 795{
 796	struct drbd_bitmap *b = device->bitmap;
 797	if (!expect(device, b))
 798		return 0;
 799
 800	return b->bm_bits;
 801}
 802
 803/* merge number words from buffer into the bitmap starting at offset.
 804 * buffer[i] is expected to be little endian unsigned long.
 805 * bitmap must be locked by drbd_bm_lock.
 806 * currently only used from receive_bitmap.
 807 */
 808void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
 809			unsigned long *buffer)
 810{
 811	struct drbd_bitmap *b = device->bitmap;
 812	unsigned long *p_addr, *bm;
 813	unsigned long word, bits;
 814	unsigned int idx;
 815	size_t end, do_now;
 816
 817	end = offset + number;
 818
 819	if (!expect(device, b))
 820		return;
 821	if (!expect(device, b->bm_pages))
 822		return;
 823	if (number == 0)
 824		return;
 825	WARN_ON(offset >= b->bm_words);
 826	WARN_ON(end    >  b->bm_words);
 827
 828	spin_lock_irq(&b->bm_lock);
 829	while (offset < end) {
 830		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
 831		idx = bm_word_to_page_idx(b, offset);
 832		p_addr = bm_map_pidx(b, idx);
 833		bm = p_addr + MLPP(offset);
 834		offset += do_now;
 835		while (do_now--) {
 836			bits = hweight_long(*bm);
 837			word = *bm | *buffer++;
 838			*bm++ = word;
 839			b->bm_set += hweight_long(word) - bits;
 840		}
 841		bm_unmap(p_addr);
 842		bm_set_page_need_writeout(b->bm_pages[idx]);
 843	}
 844	/* with 32bit <-> 64bit cross-platform connect
 845	 * this is only correct for current usage,
 846	 * where we _know_ that we are 64 bit aligned,
 847	 * and know that this function is used in this way, too...
 848	 */
 849	if (end == b->bm_words)
 850		b->bm_set -= bm_clear_surplus(b);
 851	spin_unlock_irq(&b->bm_lock);
 852}
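/* Example of the bm_set accounting in the loop above: for *bm == 0x3 and
 * *buffer == 0x6, word == 0x7; hweight_long(word) - bits == 3 - 2 == 1,
 * so exactly the one newly set bit is added to bm_set. */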
 853
 854/* copy number words from the bitmap starting at offset into the buffer.
 855 * buffer[i] will be little endian unsigned long.
 856 */
 857void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
 858		     unsigned long *buffer)
 859{
 860	struct drbd_bitmap *b = device->bitmap;
 861	unsigned long *p_addr, *bm;
 862	size_t end, do_now;
 863
 864	end = offset + number;
 865
 866	if (!expect(device, b))
 867		return;
 868	if (!expect(device, b->bm_pages))
 869		return;
 870
 871	spin_lock_irq(&b->bm_lock);
 872	if ((offset >= b->bm_words) ||
 873	    (end    >  b->bm_words) ||
  874	    (number == 0))
 875		drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
 876			(unsigned long)	offset,
 877			(unsigned long)	number,
 878			(unsigned long) b->bm_words);
 879	else {
 880		while (offset < end) {
 881			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
 882			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
 883			bm = p_addr + MLPP(offset);
 884			offset += do_now;
 885			while (do_now--)
 886				*buffer++ = *bm++;
 887			bm_unmap(p_addr);
 888		}
 889	}
 890	spin_unlock_irq(&b->bm_lock);
 891}
 892
 893/* set all bits in the bitmap */
 894void drbd_bm_set_all(struct drbd_device *device)
 895{
 896	struct drbd_bitmap *b = device->bitmap;
 897	if (!expect(device, b))
 898		return;
 899	if (!expect(device, b->bm_pages))
 900		return;
 901
 902	spin_lock_irq(&b->bm_lock);
 903	bm_memset(b, 0, 0xff, b->bm_words);
 904	(void)bm_clear_surplus(b);
 905	b->bm_set = b->bm_bits;
 906	spin_unlock_irq(&b->bm_lock);
 907}
 908
 909/* clear all bits in the bitmap */
 910void drbd_bm_clear_all(struct drbd_device *device)
 911{
 912	struct drbd_bitmap *b = device->bitmap;
 913	if (!expect(device, b))
 914		return;
 915	if (!expect(device, b->bm_pages))
 916		return;
 917
 918	spin_lock_irq(&b->bm_lock);
 919	bm_memset(b, 0, 0, b->bm_words);
 920	b->bm_set = 0;
 921	spin_unlock_irq(&b->bm_lock);
 922}
 923
 924static void drbd_bm_aio_ctx_destroy(struct kref *kref)
 925{
 926	struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
 927	unsigned long flags;
 928
 929	spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
 930	list_del(&ctx->list);
 931	spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
 932	put_ldev(ctx->device);
 933	kfree(ctx);
 934}
 935
 936/* bv_page may be a copy, or may be the original */
 937static void drbd_bm_endio(struct bio *bio)
 938{
 939	struct drbd_bm_aio_ctx *ctx = bio->bi_private;
 940	struct drbd_device *device = ctx->device;
 941	struct drbd_bitmap *b = device->bitmap;
 942	unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
 943
 944	if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
 945	    !bm_test_page_unchanged(b->bm_pages[idx]))
 946		drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
 947
 948	if (bio->bi_status) {
  949		/* ctx->error will hold the error code of whichever failed
  950		 * bio completed last, in case error codes differ. */
 951		ctx->error = blk_status_to_errno(bio->bi_status);
 952		bm_set_page_io_err(b->bm_pages[idx]);
 953		/* Not identical to on disk version of it.
 954		 * Is BM_PAGE_IO_ERROR enough? */
 955		if (drbd_ratelimit())
 956			drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
 957					bio->bi_status, idx);
 958	} else {
 959		bm_clear_page_io_err(b->bm_pages[idx]);
 960		dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
 961	}
 962
 963	bm_page_unlock_io(device, idx);
 964
 965	if (ctx->flags & BM_AIO_COPY_PAGES)
 966		mempool_free(bio->bi_io_vec[0].bv_page, &drbd_md_io_page_pool);
 967
 968	bio_put(bio);
 969
 970	if (atomic_dec_and_test(&ctx->in_flight)) {
 971		ctx->done = 1;
 972		wake_up(&device->misc_wait);
 973		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
 974	}
 975}
 976
 977/* For the layout, see comment above drbd_md_set_sector_offsets(). */
 978static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev)
 979{
 980	switch (bdev->md.meta_dev_idx) {
 981	case DRBD_MD_INDEX_INTERNAL:
 982	case DRBD_MD_INDEX_FLEX_INT:
 983		return bdev->md.md_offset + bdev->md.al_offset -1;
 984	case DRBD_MD_INDEX_FLEX_EXT:
 985	default:
 986		return bdev->md.md_offset + bdev->md.md_size_sect -1;
 987	}
 988}
 989
 990static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
 991{
 992	struct drbd_device *device = ctx->device;
 993	enum req_op op = ctx->flags & BM_AIO_READ ? REQ_OP_READ : REQ_OP_WRITE;
 994	struct drbd_bitmap *b = device->bitmap;
 995	struct bio *bio;
 996	struct page *page;
 997	sector_t last_bm_sect;
 998	sector_t first_bm_sect;
 999	sector_t on_disk_sector;
1000	unsigned int len;
1001
1002	first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset;
1003	on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT));
1004
1005	/* this might happen with very small
1006	 * flexible external meta data device,
1007	 * or with PAGE_SIZE > 4k */
1008	last_bm_sect = drbd_md_last_bitmap_sector(device->ldev);
1009	if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) {
1010		sector_t len_sect = last_bm_sect - on_disk_sector + 1;
1011		if (len_sect < PAGE_SIZE/SECTOR_SIZE)
1012			len = (unsigned int)len_sect*SECTOR_SIZE;
1013		else
1014			len = PAGE_SIZE;
1015	} else {
1016		if (drbd_ratelimit()) {
1017			drbd_err(device, "Invalid offset during on-disk bitmap access: "
1018				 "page idx %u, sector %llu\n", page_nr, on_disk_sector);
1019		}
1020		ctx->error = -EIO;
1021		bm_set_page_io_err(b->bm_pages[page_nr]);
1022		if (atomic_dec_and_test(&ctx->in_flight)) {
1023			ctx->done = 1;
1024			wake_up(&device->misc_wait);
1025			kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
1026		}
1027		return;
1028	}
1029
1030	/* serialize IO on this page */
1031	bm_page_lock_io(device, page_nr);
1032	/* before memcpy and submit,
1033	 * so it can be redirtied any time */
1034	bm_set_page_unchanged(b->bm_pages[page_nr]);
1035
1036	if (ctx->flags & BM_AIO_COPY_PAGES) {
1037		page = mempool_alloc(&drbd_md_io_page_pool,
1038				GFP_NOIO | __GFP_HIGHMEM);
1039		copy_highpage(page, b->bm_pages[page_nr]);
1040		bm_store_page_idx(page, page_nr);
1041	} else
1042		page = b->bm_pages[page_nr];
1043	bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO,
1044			&drbd_md_io_bio_set);
1045	bio->bi_iter.bi_sector = on_disk_sector;
1046	__bio_add_page(bio, page, len, 0);
1047	bio->bi_private = ctx;
1048	bio->bi_end_io = drbd_bm_endio;
1049
1050	if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
1051		bio_io_error(bio);
1052	} else {
1053		submit_bio(bio);
1054		/* this should not count as user activity and cause the
1055		 * resync to throttle -- see drbd_rs_should_slow_down(). */
1056		atomic_add(len >> 9, &device->rs_sect_ev);
1057	}
1058}
1059
1060/*
1061 * bm_rw: read/write the whole bitmap from/to its on disk location.
1062 */
1063static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
1064{
1065	struct drbd_bm_aio_ctx *ctx;
1066	struct drbd_bitmap *b = device->bitmap;
1067	unsigned int num_pages, i, count = 0;
1068	unsigned long now;
1069	char ppb[10];
1070	int err = 0;
1071
1072	/*
1073	 * We are protected against bitmap disappearing/resizing by holding an
1074	 * ldev reference (caller must have called get_ldev()).
1075	 * For read/write, we are protected against changes to the bitmap by
1076	 * the bitmap lock (see drbd_bitmap_io).
1077	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
1078	 * as we submit copies of pages anyways.
1079	 */
1080
1081	ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
1082	if (!ctx)
1083		return -ENOMEM;
1084
1085	*ctx = (struct drbd_bm_aio_ctx) {
1086		.device = device,
1087		.start_jif = jiffies,
1088		.in_flight = ATOMIC_INIT(1),
1089		.done = 0,
1090		.flags = flags,
1091		.error = 0,
1092		.kref = KREF_INIT(2),
1093	};
1094
1095	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
1096		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
1097		kfree(ctx);
1098		return -ENODEV;
1099	}
1100	/* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
1101	   drbd_adm_attach(), after device->ldev was assigned. */
1102
1103	if (0 == (ctx->flags & ~BM_AIO_READ))
1104		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
1105
1106	spin_lock_irq(&device->resource->req_lock);
1107	list_add_tail(&ctx->list, &device->pending_bitmap_io);
1108	spin_unlock_irq(&device->resource->req_lock);
1109
1110	num_pages = b->bm_number_of_pages;
1111
1112	now = jiffies;
1113
1114	/* let the layers below us try to merge these bios... */
1115
1116	if (flags & BM_AIO_READ) {
1117		for (i = 0; i < num_pages; i++) {
1118			atomic_inc(&ctx->in_flight);
1119			bm_page_io_async(ctx, i);
1120			++count;
1121			cond_resched();
1122		}
1123	} else if (flags & BM_AIO_WRITE_HINTED) {
1124		/* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
1125		unsigned int hint;
1126		for (hint = 0; hint < b->n_bitmap_hints; hint++) {
1127			i = b->al_bitmap_hints[hint];
1128			if (i >= num_pages) /* == -1U: no hint here. */
1129				continue;
1130			/* Several AL-extents may point to the same page. */
1131			if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
1132			    &page_private(b->bm_pages[i])))
1133				continue;
1134			/* Has it even changed? */
1135			if (bm_test_page_unchanged(b->bm_pages[i]))
1136				continue;
1137			atomic_inc(&ctx->in_flight);
1138			bm_page_io_async(ctx, i);
1139			++count;
1140		}
1141	} else {
1142		for (i = 0; i < num_pages; i++) {
1143			/* ignore completely unchanged pages */
1144			if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
1145				break;
1146			if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
1147			    bm_test_page_unchanged(b->bm_pages[i])) {
1148				dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
1149				continue;
1150			}
1151			/* during lazy writeout,
1152			 * ignore those pages not marked for lazy writeout. */
1153			if (lazy_writeout_upper_idx &&
1154			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
1155				dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
1156				continue;
1157			}
1158			atomic_inc(&ctx->in_flight);
1159			bm_page_io_async(ctx, i);
1160			++count;
1161			cond_resched();
1162		}
1163	}
1164
1165	/*
1166	 * We initialize ctx->in_flight to one to make sure drbd_bm_endio
1167	 * will not set ctx->done early, and decrement / test it here.  If there
1168	 * are still some bios in flight, we need to wait for them here.
1169	 * If all IO is done already (or nothing had been submitted), there is
1170	 * no need to wait.  Still, we need to put the kref associated with the
1171	 * "in_flight reached zero, all done" event.
1172	 */
1173	if (!atomic_dec_and_test(&ctx->in_flight))
1174		wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
1175	else
1176		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
1177
1178	/* summary for global bitmap IO */
1179	if (flags == 0) {
1180		unsigned int ms = jiffies_to_msecs(jiffies - now);
1181		if (ms > 5) {
1182			drbd_info(device, "bitmap %s of %u pages took %u ms\n",
1183				 (flags & BM_AIO_READ) ? "READ" : "WRITE",
1184				 count, ms);
1185		}
1186	}
1187
1188	if (ctx->error) {
1189		drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
1190		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
1191		err = -EIO; /* ctx->error ? */
1192	}
1193
1194	if (atomic_read(&ctx->in_flight))
1195		err = -EIO; /* Disk timeout/force-detach during IO... */
1196
1197	now = jiffies;
1198	if (flags & BM_AIO_READ) {
1199		b->bm_set = bm_count_bits(b);
1200		drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
1201		     jiffies - now);
1202	}
1203	now = b->bm_set;
1204
1205	if ((flags & ~BM_AIO_READ) == 0)
1206		drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
1207		     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
1208
1209	kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
1210	return err;
1211}
1212
1213/**
1214 * drbd_bm_read() - Read the whole bitmap from its on disk location.
1215 * @device:	DRBD device.
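 * @peer_device: Peer device (currently unused by this wrapper)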
1216 */
1217int drbd_bm_read(struct drbd_device *device,
1218		 struct drbd_peer_device *peer_device) __must_hold(local)
1220{
1221	return bm_rw(device, BM_AIO_READ, 0);
1222}
1223
1224/**
1225 * drbd_bm_write() - Write the whole bitmap to its on disk location.
1226 * @device:	DRBD device.
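 * @peer_device: Peer device (currently unused by this wrapper)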
1227 *
1228 * Will only write pages that have changed since last IO.
1229 */
1230int drbd_bm_write(struct drbd_device *device,
1231		 struct drbd_peer_device *peer_device) __must_hold(local)
1232{
1233	return bm_rw(device, 0, 0);
1234}
1235
1236/**
1237 * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
1238 * @device:	DRBD device.
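 * @peer_device: Peer device (currently unused by this wrapper)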
1239 *
1240 * Will write all pages.
1241 */
1242int drbd_bm_write_all(struct drbd_device *device,
1243		struct drbd_peer_device *peer_device) __must_hold(local)
1244{
1245	return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
1246}
1247
1248/**
1249 * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
1250 * @device:	DRBD device.
1251 * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
1252 */
1253int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
1254{
1255	return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
1256}
1257
1258/**
1259 * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
1260 * @device:	DRBD device.
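 * @peer_device: Peer device (currently unused by this wrapper)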
1261 *
1262 * Will only write pages that have changed since last IO.
1263 * In contrast to drbd_bm_write(), this will copy the bitmap pages
1264 * to temporary writeout pages. It is intended to trigger a full write-out
1265 * while still allowing the bitmap to change, for example if a resync or online
1266 * verify is aborted due to a failed peer disk, while local IO continues, or
1267 * pending resync acks are still being processed.
1268 */
1269int drbd_bm_write_copy_pages(struct drbd_device *device,
1270		struct drbd_peer_device *peer_device) __must_hold(local)
1271{
1272	return bm_rw(device, BM_AIO_COPY_PAGES, 0);
1273}
1274
1275/**
1276 * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
1277 * @device:	DRBD device.
1278 */
1279int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
1280{
1281	return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
1282}
1283
1284/* NOTE
1285 * find_first_bit returns int, we return unsigned long.
1286 * For this to work on 32bit arch with bitnumbers > (1<<32),
1287 * we'd need to return u64, and get a whole lot of other places
1288 * fixed where we still use unsigned long.
1289 *
1290 * this returns a bit number, NOT a sector!
1291 */
1292static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
1293	const int find_zero_bit)
1294{
1295	struct drbd_bitmap *b = device->bitmap;
1296	unsigned long *p_addr;
1297	unsigned long bit_offset;
1298	unsigned i;
1299
1300
1301	if (bm_fo > b->bm_bits) {
1302		drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
1303		bm_fo = DRBD_END_OF_BITMAP;
1304	} else {
1305		while (bm_fo < b->bm_bits) {
1306			/* bit offset of the first bit in the page */
1307			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
1308			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
1309
1310			if (find_zero_bit)
1311				i = find_next_zero_bit_le(p_addr,
1312						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
1313			else
1314				i = find_next_bit_le(p_addr,
1315						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
1316
1317			__bm_unmap(p_addr);
1318			if (i < PAGE_SIZE*8) {
1319				bm_fo = bit_offset + i;
1320				if (bm_fo >= b->bm_bits)
1321					break;
1322				goto found;
1323			}
1324			bm_fo = bit_offset + PAGE_SIZE*8;
1325		}
1326		bm_fo = DRBD_END_OF_BITMAP;
1327	}
1328 found:
1329	return bm_fo;
1330}
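/* Worked example of the paging above, assuming 4KiB pages: for
 * bm_fo == 40000, bit_offset == 32768 (the first bit of page 1) and the
 * in-page search starts at 40000 & 32767 == 7232; a hit at in-page bit i
 * yields the global bit number bit_offset + i. */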
1331
1332static unsigned long bm_find_next(struct drbd_device *device,
1333	unsigned long bm_fo, const int find_zero_bit)
1334{
1335	struct drbd_bitmap *b = device->bitmap;
1336	unsigned long i = DRBD_END_OF_BITMAP;
1337
1338	if (!expect(device, b))
1339		return i;
1340	if (!expect(device, b->bm_pages))
1341		return i;
1342
1343	spin_lock_irq(&b->bm_lock);
1344	if (BM_DONT_TEST & b->bm_flags)
1345		bm_print_lock_info(device);
1346
1347	i = __bm_find_next(device, bm_fo, find_zero_bit);
1348
1349	spin_unlock_irq(&b->bm_lock);
1350	return i;
1351}
1352
1353unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
1354{
1355	return bm_find_next(device, bm_fo, 0);
1356}
1357
1358#if 0
1359/* not yet needed for anything. */
1360unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
1361{
1362	return bm_find_next(device, bm_fo, 1);
1363}
1364#endif
1365
1366/* does not spin_lock_irqsave.
1367 * you must take drbd_bm_lock() first */
1368unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
1369{
1370	/* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
1371	return __bm_find_next(device, bm_fo, 0);
1372}
1373
1374unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
1375{
1376	/* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
1377	return __bm_find_next(device, bm_fo, 1);
1378}
1379
1380/* returns number of bits actually changed.
1381 * for val != 0, we change 0 -> 1, return code positive
1382 * for val == 0, we change 1 -> 0, return code negative
1383 * wants bitnr, not sector.
1384 * expected to be called for only a few bits (e - s about BITS_PER_LONG).
1385 * Must hold bitmap lock already. */
1386static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
1387	unsigned long e, int val)
1388{
1389	struct drbd_bitmap *b = device->bitmap;
1390	unsigned long *p_addr = NULL;
1391	unsigned long bitnr;
1392	unsigned int last_page_nr = -1U;
1393	int c = 0;
1394	int changed_total = 0;
1395
1396	if (e >= b->bm_bits) {
1397		drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
1398				s, e, b->bm_bits);
1399		e = b->bm_bits ? b->bm_bits -1 : 0;
1400	}
1401	for (bitnr = s; bitnr <= e; bitnr++) {
1402		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
1403		if (page_nr != last_page_nr) {
1404			if (p_addr)
1405				__bm_unmap(p_addr);
1406			if (c < 0)
1407				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1408			else if (c > 0)
1409				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1410			changed_total += c;
1411			c = 0;
1412			p_addr = __bm_map_pidx(b, page_nr);
1413			last_page_nr = page_nr;
1414		}
1415		if (val)
1416			c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
1417		else
1418			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
1419	}
1420	if (p_addr)
1421		__bm_unmap(p_addr);
1422	if (c < 0)
1423		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
1424	else if (c > 0)
1425		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
1426	changed_total += c;
1427	b->bm_set += changed_total;
1428	return changed_total;
1429}
1430
1431/* returns number of bits actually changed.
1432 * for val != 0, we change 0 -> 1, return code positive
1433 * for val == 0, we change 1 -> 0, return code negative
1434 * wants bitnr, not sector */
1435static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
1436	const unsigned long e, int val)
1437{
1438	unsigned long flags;
1439	struct drbd_bitmap *b = device->bitmap;
1440	int c = 0;
1441
1442	if (!expect(device, b))
1443		return 1;
1444	if (!expect(device, b->bm_pages))
1445		return 0;
1446
1447	spin_lock_irqsave(&b->bm_lock, flags);
1448	if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
1449		bm_print_lock_info(device);
1450
1451	c = __bm_change_bits_to(device, s, e, val);
1452
1453	spin_unlock_irqrestore(&b->bm_lock, flags);
1454	return c;
1455}
1456
1457/* returns number of bits changed 0 -> 1 */
1458int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
1459{
1460	return bm_change_bits_to(device, s, e, 1);
1461}
1462
1463/* returns number of bits changed 1 -> 0 */
1464int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
1465{
1466	return -bm_change_bits_to(device, s, e, 0);
1467}
1468
1469/* sets all bits in full words,
1470 * from first_word up to, but not including, last_word */
1471static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
1472		int page_nr, int first_word, int last_word)
1473{
1474	int i;
1475	int bits;
1476	int changed = 0;
1477	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
1478
 1479	/* I think it is more cache line friendly to hweight_long each word and then
 1480	 * set it to ~0UL, than to first bitmap_weight() all words, then bitmap_fill() all words */
1481	for (i = first_word; i < last_word; i++) {
1482		bits = hweight_long(paddr[i]);
1483		paddr[i] = ~0UL;
1484		changed += BITS_PER_LONG - bits;
1485	}
1486	kunmap_atomic(paddr);
1487	if (changed) {
1488		/* We only need lazy writeout, the information is still in the
1489		 * remote bitmap as well, and is reconstructed during the next
1490		 * bitmap exchange, if lost locally due to a crash. */
1491		bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
1492		b->bm_set += changed;
1493	}
1494}
1495
1496/* Same thing as drbd_bm_set_bits,
1497 * but more efficient for a large bit range.
1498 * You must first drbd_bm_lock().
1499 * Can be called to set the whole bitmap in one go.
1500 * Sets bits from s to e _inclusive_. */
1501void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
1502{
1503	/* First set_bit from the first bit (s)
1504	 * up to the next long boundary (sl),
1505	 * then assign full words up to the last long boundary (el),
1506	 * then set_bit up to and including the last bit (e).
1507	 *
1508	 * Do not use memset, because we must account for changes,
1509	 * so we need to loop over the words with hweight() anyways.
1510	 */
1511	struct drbd_bitmap *b = device->bitmap;
1512	unsigned long sl = ALIGN(s,BITS_PER_LONG);
1513	unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
1514	int first_page;
1515	int last_page;
1516	int page_nr;
1517	int first_word;
1518	int last_word;
1519
1520	if (e - s <= 3*BITS_PER_LONG) {
1521		/* don't bother; el and sl may even be wrong. */
1522		spin_lock_irq(&b->bm_lock);
1523		__bm_change_bits_to(device, s, e, 1);
1524		spin_unlock_irq(&b->bm_lock);
1525		return;
1526	}
1527
1528	/* difference is large enough that we can trust sl and el */
1529
1530	spin_lock_irq(&b->bm_lock);
1531
1532	/* bits filling the current long */
1533	if (sl)
1534		__bm_change_bits_to(device, s, sl-1, 1);
1535
1536	first_page = sl >> (3 + PAGE_SHIFT);
1537	last_page = el >> (3 + PAGE_SHIFT);
1538
1539	/* MLPP: modulo longs per page */
1540	/* LWPP: long words per page */
1541	first_word = MLPP(sl >> LN2_BPL);
1542	last_word = LWPP;
1543
1544	/* first and full pages, unless first page == last page */
1545	for (page_nr = first_page; page_nr < last_page; page_nr++) {
1546		bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
1547		spin_unlock_irq(&b->bm_lock);
1548		cond_resched();
1549		first_word = 0;
1550		spin_lock_irq(&b->bm_lock);
1551	}
1552	/* last page (respectively only page, for first page == last page) */
1553	last_word = MLPP(el >> LN2_BPL);
1554
 1555	/* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1 (or multiples).
 1556	 * ==> e = 32767, el = 32768, last_page = 1 == bm_number_of_pages,
 1557	 * and now last_word = 0.
1558	 * We do not want to touch last_page in this case,
1559	 * as we did not allocate it, it is not present in bitmap->bm_pages.
1560	 */
1561	if (last_word)
1562		bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);
1563
1564	/* possibly trailing bits.
1565	 * example: (e & 63) == 63, el will be e+1.
 1566	 * if that was even the very last bit,
1567	 * it would trigger an assert in __bm_change_bits_to()
1568	 */
1569	if (el <= e)
1570		__bm_change_bits_to(device, el, e, 1);
1571	spin_unlock_irq(&b->bm_lock);
1572}
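/* Worked example of the split above (64bit longs): s == 10, e == 1000
 * gives sl == 64 and el == 960; bits 10..63 and 960..1000 go through
 * __bm_change_bits_to(), the full words covering bits 64..959 through
 * bm_set_full_words_within_one_page(). */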
1573
1574/* returns bit state
1575 * wants bitnr, NOT sector.
1576 * inherently racy... area needs to be locked by means of {al,rs}_lru
1577 *  1 ... bit set
1578 *  0 ... bit not set
1579 * -1 ... first out of bounds access, stop testing for bits!
1580 */
1581int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
1582{
1583	unsigned long flags;
1584	struct drbd_bitmap *b = device->bitmap;
1585	unsigned long *p_addr;
1586	int i;
1587
1588	if (!expect(device, b))
1589		return 0;
1590	if (!expect(device, b->bm_pages))
1591		return 0;
1592
1593	spin_lock_irqsave(&b->bm_lock, flags);
1594	if (BM_DONT_TEST & b->bm_flags)
1595		bm_print_lock_info(device);
1596	if (bitnr < b->bm_bits) {
1597		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
1598		i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
1599		bm_unmap(p_addr);
1600	} else if (bitnr == b->bm_bits) {
1601		i = -1;
1602	} else { /* (bitnr > b->bm_bits) */
1603		drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
1604		i = 0;
1605	}
1606
1607	spin_unlock_irqrestore(&b->bm_lock, flags);
1608	return i;
1609}
1610
1611/* returns number of bits set in the range [s, e] */
1612int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
1613{
1614	unsigned long flags;
1615	struct drbd_bitmap *b = device->bitmap;
1616	unsigned long *p_addr = NULL;
1617	unsigned long bitnr;
1618	unsigned int page_nr = -1U;
1619	int c = 0;
1620
1621	/* If this is called without a bitmap, that is a bug.  But just to be
1622	 * robust in case we screwed up elsewhere, in that case pretend there
1623	 * was one dirty bit in the requested area, so we won't try to do a
1624	 * local read there (no bitmap probably implies no disk) */
1625	if (!expect(device, b))
1626		return 1;
1627	if (!expect(device, b->bm_pages))
1628		return 1;
1629
1630	spin_lock_irqsave(&b->bm_lock, flags);
1631	if (BM_DONT_TEST & b->bm_flags)
1632		bm_print_lock_info(device);
1633	for (bitnr = s; bitnr <= e; bitnr++) {
1634		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
1635		if (page_nr != idx) {
1636			page_nr = idx;
1637			if (p_addr)
1638				bm_unmap(p_addr);
1639			p_addr = bm_map_pidx(b, idx);
1640		}
1641		if (expect(device, bitnr < b->bm_bits))
1642			c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
1643		else
1644			drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
1645	}
1646	if (p_addr)
1647		bm_unmap(p_addr);
1648	spin_unlock_irqrestore(&b->bm_lock, flags);
1649	return c;
1650}
1651
1652
1653/* inherently racy...
1654 * return value may be already out-of-date when this function returns.
 1655 * but the general usage is that this is only used during a cstate when bits are
 1656 * only cleared, not set, and we typically only care for the case when the return
1657 * value is zero, or we already "locked" this "bitmap extent" by other means.
1658 *
1659 * enr is bm-extent number, since we chose to name one sector (512 bytes)
1660 * worth of the bitmap a "bitmap extent".
1661 *
1662 * TODO
1663 * I think since we use it like a reference count, we should use the real
1664 * reference count of some bitmap extent element from some lru instead...
1665 *
1666 */
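/* Example, assuming BM_EXT_SHIFT == 24 (one 16 MiB bm-extent) and 64bit
 * longs (LN2_BPL == 6): S2W(enr) == enr << (24 - 12 - 6) == enr * 64,
 * the 64 longs making up one 512 byte bitmap sector. */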
1667int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
1668{
1669	struct drbd_bitmap *b = device->bitmap;
1670	int count, s, e;
1671	unsigned long flags;
1672	unsigned long *p_addr, *bm;
1673
1674	if (!expect(device, b))
1675		return 0;
1676	if (!expect(device, b->bm_pages))
1677		return 0;
1678
1679	spin_lock_irqsave(&b->bm_lock, flags);
1680	if (BM_DONT_TEST & b->bm_flags)
1681		bm_print_lock_info(device);
1682
1683	s = S2W(enr);
1684	e = min((size_t)S2W(enr+1), b->bm_words);
1685	count = 0;
1686	if (s < b->bm_words) {
1687		int n = e-s;
1688		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
1689		bm = p_addr + MLPP(s);
1690		count += bitmap_weight(bm, n * BITS_PER_LONG);
1691		bm_unmap(p_addr);
1692	} else {
1693		drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
1694	}
1695	spin_unlock_irqrestore(&b->bm_lock, flags);
1696	return count;
1697}