v4.6
   1/*
   2 * raid5.c : Multiple Devices driver for Linux
   3 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   4 *	   Copyright (C) 1999, 2000 Ingo Molnar
   5 *	   Copyright (C) 2002, 2003 H. Peter Anvin
   6 *
   7 * RAID-4/5/6 management functions.
   8 * Thanks to Penguin Computing for making the RAID-6 development possible
   9 * by donating a test server!
  10 *
  11 * This program is free software; you can redistribute it and/or modify
  12 * it under the terms of the GNU General Public License as published by
  13 * the Free Software Foundation; either version 2, or (at your option)
  14 * any later version.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * (for example /usr/src/linux/COPYING); if not, write to the Free
  18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19 */
  20
  21/*
  22 * BITMAP UNPLUGGING:
  23 *
  24 * The sequencing for updating the bitmap reliably is a little
  25 * subtle (and I got it wrong the first time) so it deserves some
  26 * explanation.
  27 *
  28 * We group bitmap updates into batches.  Each batch has a number.
  29 * We may write out several batches at once, but that isn't very important.
  30 * conf->seq_write is the number of the last batch successfully written.
  31 * conf->seq_flush is the number of the last batch that was closed to
  32 *    new additions.
  33 * When we discover that we will need to write to any block in a stripe
  34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
  35 * the number of the batch it will be in. This is seq_flush+1.
  36 * When we are ready to do a write, if that batch hasn't been written yet,
  37 *   we plug the array and queue the stripe for later.
  38 * When an unplug happens, we increment seq_flush, thus closing the current
  39 *   batch.
  40 * When we notice that seq_flush > seq_write, we write out all pending updates
  41 * to the bitmap, and advance seq_write to where seq_flush was.
  42 * This may occasionally write a bit out twice, but is sure never to
  43 * miss any bits.
  44 */
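/*
 * Illustrative sketch (editorial addition, not part of the driver): the
 * "has this stripe's bitmap batch been written out yet?" test used later
 * in do_release_stripe() is a plain difference comparison of the two
 * sequence numbers recorded above:
 *
 *	if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 *	    sh->bm_seq - conf->seq_write > 0)
 *		list_add_tail(&sh->lru, &conf->bitmap_list);  // batch still pending
 *
 * i.e. the stripe is parked on conf->bitmap_list until its batch has been
 * written, exactly as described in the note above.
 */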
  45
  46#include <linux/blkdev.h>
  47#include <linux/kthread.h>
  48#include <linux/raid/pq.h>
  49#include <linux/async_tx.h>
  50#include <linux/module.h>
  51#include <linux/async.h>
  52#include <linux/seq_file.h>
  53#include <linux/cpu.h>
  54#include <linux/slab.h>
  55#include <linux/ratelimit.h>
  56#include <linux/nodemask.h>
  57#include <linux/flex_array.h>
  58#include <trace/events/block.h>
  59
  60#include "md.h"
  61#include "raid5.h"
  62#include "raid0.h"
  63#include "bitmap.h"
  64
  65#define cpu_to_group(cpu) cpu_to_node(cpu)
  66#define ANY_GROUP NUMA_NO_NODE
  67
  68static bool devices_handle_discard_safely = false;
  69module_param(devices_handle_discard_safely, bool, 0644);
  70MODULE_PARM_DESC(devices_handle_discard_safely,
  71		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
  72static struct workqueue_struct *raid5_wq;
  73/*
  74 * Stripe cache
  75 */
  76
  77#define NR_STRIPES		256
  78#define STRIPE_SIZE		PAGE_SIZE
  79#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
  80#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
  81#define	IO_THRESHOLD		1
  82#define BYPASS_THRESHOLD	1
  83#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
  84#define HASH_MASK		(NR_HASH - 1)
  85#define MAX_STRIPE_BATCH	8
  86
  87static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  88{
  89	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
  90	return &conf->stripe_hashtbl[hash];
  91}
  92
  93static inline int stripe_hash_locks_hash(sector_t sect)
  94{
  95	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
  96}
  97
  98static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
  99{
 100	spin_lock_irq(conf->hash_locks + hash);
 101	spin_lock(&conf->device_lock);
 102}
 103
 104static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
 105{
 106	spin_unlock(&conf->device_lock);
 107	spin_unlock_irq(conf->hash_locks + hash);
 108}
 109
 110static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
 111{
 112	int i;
 113	local_irq_disable();
 114	spin_lock(conf->hash_locks);
 115	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 116		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 117	spin_lock(&conf->device_lock);
 118}
 119
 120static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 121{
 122	int i;
 123	spin_unlock(&conf->device_lock);
 124	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
 125		spin_unlock(conf->hash_locks + i - 1);
 126	local_irq_enable();
 127}
 128
 129/* bios attached to a stripe+device for I/O are linked together in bi_sector
 130 * order without overlap.  There may be several bios per stripe+device, and
 131 * a bio could span several devices.
 132 * When walking this list for a particular stripe+device, we must never proceed
 133 * beyond a bio that extends past this device, as the next bio might no longer
 134 * be valid.
 135 * This function is used to determine the 'next' bio in the list, given the sector
 136 * of the current stripe+device.
 137 */
 138static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 139{
 140	int sectors = bio_sectors(bio);
 141	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
 142		return bio->bi_next;
 143	else
 144		return NULL;
 145}
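/*
 * Illustrative sketch (editorial addition, not part of the driver): the
 * canonical walk over the bio list of one stripe+device, as used further
 * down in ops_run_biofill() and ops_complete_biofill():
 *
 *	while (rbi && rbi->bi_iter.bi_sector <
 *	       dev->sector + STRIPE_SECTORS) {
 *		... handle rbi ...
 *		rbi = r5_next_bio(rbi, dev->sector);
 *	}
 *
 * r5_next_bio() returns NULL as soon as a bio extends past this device,
 * which is what stops the walk early as the comment above requires.
 */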
 146
 147/*
 148 * We maintain a biased count of active stripes in the bottom 16 bits of
 149 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 150 */
 151static inline int raid5_bi_processed_stripes(struct bio *bio)
 152{
 153	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 154	return (atomic_read(segments) >> 16) & 0xffff;
 155}
 156
 157static inline int raid5_dec_bi_active_stripes(struct bio *bio)
 158{
 159	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 160	return atomic_sub_return(1, segments) & 0xffff;
 161}
 162
 163static inline void raid5_inc_bi_active_stripes(struct bio *bio)
 164{
 165	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 166	atomic_inc(segments);
 167}
 168
 169static inline void raid5_set_bi_processed_stripes(struct bio *bio,
 170	unsigned int cnt)
 171{
 172	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 173	int old, new;
 174
 175	do {
 176		old = atomic_read(segments);
 177		new = (old & 0xffff) | (cnt << 16);
 178	} while (atomic_cmpxchg(segments, old, new) != old);
 179}
 180
 181static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
 182{
 183	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
 184	atomic_set(segments, cnt);
 185}
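/*
 * Illustrative sketch (editorial addition, not part of the driver): how
 * the accessors above are typically paired.  The active-stripe count is
 * raised while a bio is attached to a stripe and dropped as each stripe
 * finishes with it; the bio may only be completed once the count reaches
 * zero, as in ops_complete_biofill() further down:
 *
 *	raid5_inc_bi_active_stripes(bi);	// while attaching the bio
 *	...
 *	if (!raid5_dec_bi_active_stripes(rbi))	// last stripe done with it
 *		bio_list_add(&return_bi, rbi);	// completed via return_io()
 */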
 186
 187/* Find first data disk in a raid6 stripe */
 188static inline int raid6_d0(struct stripe_head *sh)
 189{
 190	if (sh->ddf_layout)
 191		/* ddf always starts from the first device */
 192		return 0;
 193	/* md starts just after Q block */
 194	if (sh->qd_idx == sh->disks - 1)
 195		return 0;
 196	else
 197		return sh->qd_idx + 1;
 198}
 199static inline int raid6_next_disk(int disk, int raid_disks)
 200{
 201	disk++;
 202	return (disk < raid_disks) ? disk : 0;
 203}
 204
 205/* When walking through the disks in a raid6, starting at raid6_d0,
 206 * we need to map each disk to a 'slot', where the data disks are slot
 207 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 208 * is raid_disks-1.  This helper does that mapping.
 209 */
 210static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 211			     int *count, int syndrome_disks)
 212{
 213	int slot = *count;
 214
 215	if (sh->ddf_layout)
 216		(*count)++;
 217	if (idx == sh->pd_idx)
 218		return syndrome_disks;
 219	if (idx == sh->qd_idx)
 220		return syndrome_disks + 1;
 221	if (!sh->ddf_layout)
 222		(*count)++;
 223	return slot;
 224}
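/*
 * Worked example (editorial addition): for an md-layout (non-DDF) 6-device
 * RAID-6 stripe with pd_idx == 4 and qd_idx == 5, syndrome_disks is 4,
 * raid6_d0() returns 0, and walking the disks from there maps
 *
 *	idx:	0  1  2  3  4 (P)  5 (Q)
 *	slot:	0  1  2  3  4      5
 *
 * i.e. the data disks take slots 0..syndrome_disks-1 in walk order, while
 * P and Q always land in slots syndrome_disks and syndrome_disks+1.
 */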
 225
 226static void return_io(struct bio_list *return_bi)
 227{
 228	struct bio *bi;
 229	while ((bi = bio_list_pop(return_bi)) != NULL) {
 230		bi->bi_iter.bi_size = 0;
 231		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 232					 bi, 0);
 233		bio_endio(bi);
 234	}
 235}
 236
 237static void print_raid5_conf (struct r5conf *conf);
 238
 239static int stripe_operations_active(struct stripe_head *sh)
 240{
 241	return sh->check_state || sh->reconstruct_state ||
 242	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
 243	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 244}
 245
 246static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 247{
 248	struct r5conf *conf = sh->raid_conf;
 249	struct r5worker_group *group;
 250	int thread_cnt;
 251	int i, cpu = sh->cpu;
 252
 253	if (!cpu_online(cpu)) {
 254		cpu = cpumask_any(cpu_online_mask);
 255		sh->cpu = cpu;
 256	}
 257
 258	if (list_empty(&sh->lru)) {
 259		struct r5worker_group *group;
 260		group = conf->worker_groups + cpu_to_group(cpu);
 261		list_add_tail(&sh->lru, &group->handle_list);
 262		group->stripes_cnt++;
 263		sh->group = group;
 264	}
 265
 266	if (conf->worker_cnt_per_group == 0) {
 267		md_wakeup_thread(conf->mddev->thread);
 268		return;
 269	}
 270
 271	group = conf->worker_groups + cpu_to_group(sh->cpu);
 272
 273	group->workers[0].working = true;
 274	/* at least one worker should run to avoid race */
 275	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
 276
 277	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
 278	/* wakeup more workers */
 279	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
 280		if (group->workers[i].working == false) {
 281			group->workers[i].working = true;
 282			queue_work_on(sh->cpu, raid5_wq,
 283				      &group->workers[i].work);
 284			thread_cnt--;
 285		}
 286	}
 287}
 288
 289static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 290			      struct list_head *temp_inactive_list)
 291{
 292	BUG_ON(!list_empty(&sh->lru));
 293	BUG_ON(atomic_read(&conf->active_stripes)==0);
 294	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 295		if (test_bit(STRIPE_DELAYED, &sh->state) &&
 296		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 297			list_add_tail(&sh->lru, &conf->delayed_list);
 298		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 299			   sh->bm_seq - conf->seq_write > 0)
 300			list_add_tail(&sh->lru, &conf->bitmap_list);
 301		else {
 302			clear_bit(STRIPE_DELAYED, &sh->state);
 303			clear_bit(STRIPE_BIT_DELAY, &sh->state);
 304			if (conf->worker_cnt_per_group == 0) {
 305				list_add_tail(&sh->lru, &conf->handle_list);
 306			} else {
 307				raid5_wakeup_stripe_thread(sh);
 308				return;
 309			}
 310		}
 311		md_wakeup_thread(conf->mddev->thread);
 312	} else {
 313		BUG_ON(stripe_operations_active(sh));
 314		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 315			if (atomic_dec_return(&conf->preread_active_stripes)
 316			    < IO_THRESHOLD)
 317				md_wakeup_thread(conf->mddev->thread);
 318		atomic_dec(&conf->active_stripes);
 319		if (!test_bit(STRIPE_EXPANDING, &sh->state))
 320			list_add_tail(&sh->lru, temp_inactive_list);
 321	}
 322}
 323
 324static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
 325			     struct list_head *temp_inactive_list)
 326{
 327	if (atomic_dec_and_test(&sh->count))
 328		do_release_stripe(conf, sh, temp_inactive_list);
 329}
 330
 331/*
 332 * @hash may be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is
 333 *    an array with one inactive list per hash lock.
 334 * Be careful: only one task may add/delete stripes from temp_inactive_list
 335 * at a given time. Adding stripes only takes the device lock, while deleting
 336 * stripes only takes the hash lock.
 337 */
 338static void release_inactive_stripe_list(struct r5conf *conf,
 339					 struct list_head *temp_inactive_list,
 340					 int hash)
 341{
 342	int size;
 343	bool do_wakeup = false;
 344	unsigned long flags;
 345
 346	if (hash == NR_STRIPE_HASH_LOCKS) {
 347		size = NR_STRIPE_HASH_LOCKS;
 348		hash = NR_STRIPE_HASH_LOCKS - 1;
 349	} else
 350		size = 1;
 351	while (size) {
 352		struct list_head *list = &temp_inactive_list[size - 1];
 353
 354		/*
 355		 * We don't hold any lock here yet, so raid5_get_active_stripe()
 356		 * might remove stripes from the list.
 357		 */
 358		if (!list_empty_careful(list)) {
 359			spin_lock_irqsave(conf->hash_locks + hash, flags);
 360			if (list_empty(conf->inactive_list + hash) &&
 361			    !list_empty(list))
 362				atomic_dec(&conf->empty_inactive_list_nr);
 363			list_splice_tail_init(list, conf->inactive_list + hash);
 364			do_wakeup = true;
 365			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
 366		}
 367		size--;
 368		hash--;
 369	}
 370
 371	if (do_wakeup) {
 372		wake_up(&conf->wait_for_stripe);
 373		if (atomic_read(&conf->active_stripes) == 0)
 374			wake_up(&conf->wait_for_quiescent);
 375		if (conf->retry_read_aligned)
 376			md_wakeup_thread(conf->mddev->thread);
 377	}
 378}
 379
 380/* should hold conf->device_lock already */
 381static int release_stripe_list(struct r5conf *conf,
 382			       struct list_head *temp_inactive_list)
 383{
 384	struct stripe_head *sh;
 385	int count = 0;
 386	struct llist_node *head;
 387
 388	head = llist_del_all(&conf->released_stripes);
 389	head = llist_reverse_order(head);
 390	while (head) {
 391		int hash;
 392
 393		sh = llist_entry(head, struct stripe_head, release_list);
 394		head = llist_next(head);
 395		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
 396		smp_mb();
 397		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
 398		/*
 399		 * Don't worry if the bit gets set again here, because in that
 400		 * case the count is always > 1. The same is true for the
 401		 * STRIPE_ON_UNPLUG_LIST bit.
 402		 */
 403		hash = sh->hash_lock_index;
 404		__release_stripe(conf, sh, &temp_inactive_list[hash]);
 405		count++;
 406	}
 407
 408	return count;
 409}
 410
 411void raid5_release_stripe(struct stripe_head *sh)
 412{
 413	struct r5conf *conf = sh->raid_conf;
 414	unsigned long flags;
 415	struct list_head list;
 416	int hash;
 417	bool wakeup;
 418
 419	/* Avoid release_list until the last reference.
 420	 */
 421	if (atomic_add_unless(&sh->count, -1, 1))
 422		return;
 423
 424	if (unlikely(!conf->mddev->thread) ||
 425		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
 426		goto slow_path;
 427	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
 428	if (wakeup)
 429		md_wakeup_thread(conf->mddev->thread);
 430	return;
 431slow_path:
 432	local_irq_save(flags);
 433	/* we are OK here whether STRIPE_ON_RELEASE_LIST is set or not */
 434	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
 435		INIT_LIST_HEAD(&list);
 436		hash = sh->hash_lock_index;
 437		do_release_stripe(conf, sh, &list);
 438		spin_unlock(&conf->device_lock);
 439		release_inactive_stripe_list(conf, &list, hash);
 440	}
 441	local_irq_restore(flags);
 442}
 443
 444static inline void remove_hash(struct stripe_head *sh)
 445{
 446	pr_debug("remove_hash(), stripe %llu\n",
 447		(unsigned long long)sh->sector);
 448
 449	hlist_del_init(&sh->hash);
 450}
 451
 452static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 453{
 454	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 455
 456	pr_debug("insert_hash(), stripe %llu\n",
 457		(unsigned long long)sh->sector);
 458
 459	hlist_add_head(&sh->hash, hp);
 460}
 461
 462/* find an idle stripe, make sure it is unhashed, and return it. */
 463static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
 464{
 465	struct stripe_head *sh = NULL;
 466	struct list_head *first;
 467
 468	if (list_empty(conf->inactive_list + hash))
 469		goto out;
 470	first = (conf->inactive_list + hash)->next;
 471	sh = list_entry(first, struct stripe_head, lru);
 472	list_del_init(first);
 473	remove_hash(sh);
 474	atomic_inc(&conf->active_stripes);
 475	BUG_ON(hash != sh->hash_lock_index);
 476	if (list_empty(conf->inactive_list + hash))
 477		atomic_inc(&conf->empty_inactive_list_nr);
 478out:
 479	return sh;
 480}
 481
 482static void shrink_buffers(struct stripe_head *sh)
 483{
 484	struct page *p;
 485	int i;
 486	int num = sh->raid_conf->pool_size;
 487
 488	for (i = 0; i < num ; i++) {
 489		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 490		p = sh->dev[i].page;
 491		if (!p)
 492			continue;
 493		sh->dev[i].page = NULL;
 494		put_page(p);
 495	}
 496}
 497
 498static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 499{
 500	int i;
 501	int num = sh->raid_conf->pool_size;
 502
 503	for (i = 0; i < num; i++) {
 504		struct page *page;
 505
 506		if (!(page = alloc_page(gfp))) {
 507			return 1;
 508		}
 509		sh->dev[i].page = page;
 510		sh->dev[i].orig_page = page;
 511	}
 512	return 0;
 513}
 514
 515static void raid5_build_block(struct stripe_head *sh, int i, int previous);
 516static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 517			    struct stripe_head *sh);
 518
 519static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 520{
 521	struct r5conf *conf = sh->raid_conf;
 522	int i, seq;
 523
 524	BUG_ON(atomic_read(&sh->count) != 0);
 525	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 526	BUG_ON(stripe_operations_active(sh));
 527	BUG_ON(sh->batch_head);
 528
 529	pr_debug("init_stripe called, stripe %llu\n",
 530		(unsigned long long)sector);
 531retry:
 532	seq = read_seqcount_begin(&conf->gen_lock);
 533	sh->generation = conf->generation - previous;
 534	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 535	sh->sector = sector;
 536	stripe_set_idx(sector, conf, previous, sh);
 537	sh->state = 0;
 538
 539	for (i = sh->disks; i--; ) {
 540		struct r5dev *dev = &sh->dev[i];
 541
 542		if (dev->toread || dev->read || dev->towrite || dev->written ||
 543		    test_bit(R5_LOCKED, &dev->flags)) {
 544			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
 545			       (unsigned long long)sh->sector, i, dev->toread,
 546			       dev->read, dev->towrite, dev->written,
 547			       test_bit(R5_LOCKED, &dev->flags));
 548			WARN_ON(1);
 549		}
 550		dev->flags = 0;
 551		raid5_build_block(sh, i, previous);
 552	}
 553	if (read_seqcount_retry(&conf->gen_lock, seq))
 554		goto retry;
 555	sh->overwrite_disks = 0;
 556	insert_hash(conf, sh);
 557	sh->cpu = smp_processor_id();
 558	set_bit(STRIPE_BATCH_READY, &sh->state);
 559}
 560
 561static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 562					 short generation)
 563{
 564	struct stripe_head *sh;
 565
 566	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 567	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
 568		if (sh->sector == sector && sh->generation == generation)
 569			return sh;
 570	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 571	return NULL;
 572}
 573
 574/*
 575 * Need to check if array has failed when deciding whether to:
 576 *  - start an array
 577 *  - remove non-faulty devices
 578 *  - add a spare
 579 *  - allow a reshape
 580 * This determination is simple when no reshape is happening.
 581 * However, if there is a reshape, we need to carefully check
 582 * both the before and after sections.
 583 * This is because some failed devices may only affect one
 584 * of the two sections, and some non-in_sync devices may
 585 * be in_sync in the section most affected by failed devices.
 586 */
 587static int calc_degraded(struct r5conf *conf)
 588{
 589	int degraded, degraded2;
 590	int i;
 591
 592	rcu_read_lock();
 593	degraded = 0;
 594	for (i = 0; i < conf->previous_raid_disks; i++) {
 595		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 596		if (rdev && test_bit(Faulty, &rdev->flags))
 597			rdev = rcu_dereference(conf->disks[i].replacement);
 598		if (!rdev || test_bit(Faulty, &rdev->flags))
 599			degraded++;
 600		else if (test_bit(In_sync, &rdev->flags))
 601			;
 602		else
 603			/* not in-sync or faulty.
 604			 * If the reshape increases the number of devices,
 605			 * this is being recovered by the reshape, so
 606			 * this 'previous' section is not in_sync.
 607			 * If the number of devices is being reduced however,
 608			 * the device can only be part of the array if
 609			 * we are reverting a reshape, so this section will
 610			 * be in-sync.
 611			 */
 612			if (conf->raid_disks >= conf->previous_raid_disks)
 613				degraded++;
 614	}
 615	rcu_read_unlock();
 616	if (conf->raid_disks == conf->previous_raid_disks)
 617		return degraded;
 618	rcu_read_lock();
 619	degraded2 = 0;
 620	for (i = 0; i < conf->raid_disks; i++) {
 621		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 622		if (rdev && test_bit(Faulty, &rdev->flags))
 623			rdev = rcu_dereference(conf->disks[i].replacement);
 624		if (!rdev || test_bit(Faulty, &rdev->flags))
 625			degraded2++;
 626		else if (test_bit(In_sync, &rdev->flags))
 627			;
 628		else
 629			/* not in-sync or faulty.
 630			 * If reshape increases the number of devices, this
 631			 * section has already been recovered, else it
 632			 * almost certainly hasn't.
 633			 */
 634			if (conf->raid_disks <= conf->previous_raid_disks)
 635				degraded2++;
 636	}
 637	rcu_read_unlock();
 638	if (degraded2 > degraded)
 639		return degraded2;
 640	return degraded;
 641}
 642
 643static int has_failed(struct r5conf *conf)
 644{
 645	int degraded;
 646
 647	if (conf->mddev->reshape_position == MaxSector)
 648		return conf->mddev->degraded > conf->max_degraded;
 649
 650	degraded = calc_degraded(conf);
 651	if (degraded > conf->max_degraded)
 652		return 1;
 653	return 0;
 654}
 655
 656struct stripe_head *
 657raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 658			int previous, int noblock, int noquiesce)
 659{
 660	struct stripe_head *sh;
 661	int hash = stripe_hash_locks_hash(sector);
 662
 663	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 664
 665	spin_lock_irq(conf->hash_locks + hash);
 666
 667	do {
 668		wait_event_lock_irq(conf->wait_for_quiescent,
 669				    conf->quiesce == 0 || noquiesce,
 670				    *(conf->hash_locks + hash));
 671		sh = __find_stripe(conf, sector, conf->generation - previous);
 672		if (!sh) {
 673			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
 674				sh = get_free_stripe(conf, hash);
 675				if (!sh && !test_bit(R5_DID_ALLOC,
 676						     &conf->cache_state))
 677					set_bit(R5_ALLOC_MORE,
 678						&conf->cache_state);
 679			}
 680			if (noblock && sh == NULL)
 681				break;
 682			if (!sh) {
 683				set_bit(R5_INACTIVE_BLOCKED,
 684					&conf->cache_state);
 685				wait_event_lock_irq(
 686					conf->wait_for_stripe,
 687					!list_empty(conf->inactive_list + hash) &&
 688					(atomic_read(&conf->active_stripes)
 689					 < (conf->max_nr_stripes * 3 / 4)
 690					 || !test_bit(R5_INACTIVE_BLOCKED,
 691						      &conf->cache_state)),
 692					*(conf->hash_locks + hash));
 693				clear_bit(R5_INACTIVE_BLOCKED,
 694					  &conf->cache_state);
 695			} else {
 696				init_stripe(sh, sector, previous);
 697				atomic_inc(&sh->count);
 698			}
 699		} else if (!atomic_inc_not_zero(&sh->count)) {
 700			spin_lock(&conf->device_lock);
 701			if (!atomic_read(&sh->count)) {
 702				if (!test_bit(STRIPE_HANDLE, &sh->state))
 703					atomic_inc(&conf->active_stripes);
 704				BUG_ON(list_empty(&sh->lru) &&
 705				       !test_bit(STRIPE_EXPANDING, &sh->state));
 706				list_del_init(&sh->lru);
 707				if (sh->group) {
 708					sh->group->stripes_cnt--;
 709					sh->group = NULL;
 710				}
 711			}
 712			atomic_inc(&sh->count);
 713			spin_unlock(&conf->device_lock);
 714		}
 715	} while (sh == NULL);
 716
 717	spin_unlock_irq(conf->hash_locks + hash);
 718	return sh;
 719}
 720
 721static bool is_full_stripe_write(struct stripe_head *sh)
 722{
 723	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
 724	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
 725}
 726
 727static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 728{
 729	local_irq_disable();
 730	if (sh1 > sh2) {
 731		spin_lock(&sh2->stripe_lock);
 732		spin_lock_nested(&sh1->stripe_lock, 1);
 733	} else {
 734		spin_lock(&sh1->stripe_lock);
 735		spin_lock_nested(&sh2->stripe_lock, 1);
 736	}
 737}
 738
 739static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 740{
 741	spin_unlock(&sh1->stripe_lock);
 742	spin_unlock(&sh2->stripe_lock);
 743	local_irq_enable();
 744}
 745
 746/* Only a fresh full-stripe normal write stripe can be added to a batch list */
 747static bool stripe_can_batch(struct stripe_head *sh)
 748{
 749	struct r5conf *conf = sh->raid_conf;
 750
 751	if (conf->log)
 752		return false;
 753	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 754		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
 755		is_full_stripe_write(sh);
 756}
 757
 758/* we only search backwards (for the previous stripe in the same chunk) */
 759static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
 760{
 761	struct stripe_head *head;
 762	sector_t head_sector, tmp_sec;
 763	int hash;
 764	int dd_idx;
 765
 766	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
 767	tmp_sec = sh->sector;
 768	if (!sector_div(tmp_sec, conf->chunk_sectors))
 769		return;
 770	head_sector = sh->sector - STRIPE_SECTORS;
 771
 772	hash = stripe_hash_locks_hash(head_sector);
 773	spin_lock_irq(conf->hash_locks + hash);
 774	head = __find_stripe(conf, head_sector, conf->generation);
 775	if (head && !atomic_inc_not_zero(&head->count)) {
 776		spin_lock(&conf->device_lock);
 777		if (!atomic_read(&head->count)) {
 778			if (!test_bit(STRIPE_HANDLE, &head->state))
 779				atomic_inc(&conf->active_stripes);
 780			BUG_ON(list_empty(&head->lru) &&
 781			       !test_bit(STRIPE_EXPANDING, &head->state));
 782			list_del_init(&head->lru);
 783			if (head->group) {
 784				head->group->stripes_cnt--;
 785				head->group = NULL;
 786			}
 787		}
 788		atomic_inc(&head->count);
 789		spin_unlock(&conf->device_lock);
 790	}
 791	spin_unlock_irq(conf->hash_locks + hash);
 792
 793	if (!head)
 794		return;
 795	if (!stripe_can_batch(head))
 796		goto out;
 797
 798	lock_two_stripes(head, sh);
 799	/* clear_batch_ready clears the flag */
 800	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
 801		goto unlock_out;
 802
 803	if (sh->batch_head)
 804		goto unlock_out;
 805
 806	dd_idx = 0;
 807	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
 808		dd_idx++;
 809	if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
 810		goto unlock_out;
 811
 812	if (head->batch_head) {
 813		spin_lock(&head->batch_head->batch_lock);
 814		/* This batch list is already running */
 815		if (!stripe_can_batch(head)) {
 816			spin_unlock(&head->batch_head->batch_lock);
 817			goto unlock_out;
 818		}
 819
 820		/*
 821		 * at this point, head's BATCH_READY could be cleared, but we
 822		 * can still add the stripe to the batch list
 823		 */
 824		list_add(&sh->batch_list, &head->batch_list);
 825		spin_unlock(&head->batch_head->batch_lock);
 826
 827		sh->batch_head = head->batch_head;
 828	} else {
 829		head->batch_head = head;
 830		sh->batch_head = head->batch_head;
 831		spin_lock(&head->batch_lock);
 832		list_add_tail(&sh->batch_list, &head->batch_list);
 833		spin_unlock(&head->batch_lock);
 834	}
 835
 836	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 837		if (atomic_dec_return(&conf->preread_active_stripes)
 838		    < IO_THRESHOLD)
 839			md_wakeup_thread(conf->mddev->thread);
 840
 841	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
 842		int seq = sh->bm_seq;
 843		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
 844		    sh->batch_head->bm_seq > seq)
 845			seq = sh->batch_head->bm_seq;
 846		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
 847		sh->batch_head->bm_seq = seq;
 848	}
 849
 850	atomic_inc(&sh->count);
 851unlock_out:
 852	unlock_two_stripes(head, sh);
 853out:
 854	raid5_release_stripe(head);
 855}
 856
 857/* Determine if 'data_offset' or 'new_data_offset' should be used
 858 * in this stripe_head.
 859 */
 860static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 861{
 862	sector_t progress = conf->reshape_progress;
 863	/* Need a memory barrier to make sure we see the value
 864	 * of conf->generation, or ->data_offset that was set before
 865	 * reshape_progress was updated.
 866	 */
 867	smp_rmb();
 868	if (progress == MaxSector)
 869		return 0;
 870	if (sh->generation == conf->generation - 1)
 871		return 0;
 872	/* We are in a reshape, and this is a new-generation stripe,
 873	 * so use new_data_offset.
 874	 */
 875	return 1;
 876}
 877
 878static void
 879raid5_end_read_request(struct bio *bi);
 880static void
 881raid5_end_write_request(struct bio *bi);
 882
 883static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 884{
 885	struct r5conf *conf = sh->raid_conf;
 886	int i, disks = sh->disks;
 887	struct stripe_head *head_sh = sh;
 888
 889	might_sleep();
 890
 891	if (r5l_write_stripe(conf->log, sh) == 0)
 892		return;
 893	for (i = disks; i--; ) {
 894		int rw;
 895		int replace_only = 0;
 896		struct bio *bi, *rbi;
 897		struct md_rdev *rdev, *rrdev = NULL;
 898
 899		sh = head_sh;
 900		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 901			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
 902				rw = WRITE_FUA;
 903			else
 904				rw = WRITE;
 905			if (test_bit(R5_Discard, &sh->dev[i].flags))
 906				rw |= REQ_DISCARD;
 907		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 908			rw = READ;
 909		else if (test_and_clear_bit(R5_WantReplace,
 910					    &sh->dev[i].flags)) {
 911			rw = WRITE;
 912			replace_only = 1;
 913		} else
 914			continue;
 915		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
 916			rw |= REQ_SYNC;
 917
 918again:
 919		bi = &sh->dev[i].req;
 920		rbi = &sh->dev[i].rreq; /* For writing to replacement */
 921
 922		rcu_read_lock();
 923		rrdev = rcu_dereference(conf->disks[i].replacement);
 924		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
 925		rdev = rcu_dereference(conf->disks[i].rdev);
 926		if (!rdev) {
 927			rdev = rrdev;
 928			rrdev = NULL;
 929		}
 930		if (rw & WRITE) {
 931			if (replace_only)
 932				rdev = NULL;
 933			if (rdev == rrdev)
 934				/* We raced and saw duplicates */
 935				rrdev = NULL;
 936		} else {
 937			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
 938				rdev = rrdev;
 939			rrdev = NULL;
 940		}
 941
 942		if (rdev && test_bit(Faulty, &rdev->flags))
 943			rdev = NULL;
 944		if (rdev)
 945			atomic_inc(&rdev->nr_pending);
 946		if (rrdev && test_bit(Faulty, &rrdev->flags))
 947			rrdev = NULL;
 948		if (rrdev)
 949			atomic_inc(&rrdev->nr_pending);
 950		rcu_read_unlock();
 951
 952		/* We have already checked bad blocks for reads.  Now we
 953		 * need to check for writes.  We never accept write errors
 954		 * on the replacement, so we don't need to check rrdev.
 955		 */
 956		while ((rw & WRITE) && rdev &&
 957		       test_bit(WriteErrorSeen, &rdev->flags)) {
 958			sector_t first_bad;
 959			int bad_sectors;
 960			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 961					      &first_bad, &bad_sectors);
 962			if (!bad)
 963				break;
 964
 965			if (bad < 0) {
 966				set_bit(BlockedBadBlocks, &rdev->flags);
 967				if (!conf->mddev->external &&
 968				    conf->mddev->flags) {
 969					/* It is very unlikely, but we might
 970					 * still need to write out the
 971					 * bad block log - better give it
 972					 * a chance */
 973					md_check_recovery(conf->mddev);
 974				}
 975				/*
 976				 * Because md_wait_for_blocked_rdev
 977				 * will dec nr_pending, we must
 978				 * increment it first.
 979				 */
 980				atomic_inc(&rdev->nr_pending);
 981				md_wait_for_blocked_rdev(rdev, conf->mddev);
 982			} else {
 983				/* Acknowledged bad block - skip the write */
 984				rdev_dec_pending(rdev, conf->mddev);
 985				rdev = NULL;
 986			}
 987		}
 988
 989		if (rdev) {
 990			if (s->syncing || s->expanding || s->expanded
 991			    || s->replacing)
 992				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 993
 994			set_bit(STRIPE_IO_STARTED, &sh->state);
 995
 996			bio_reset(bi);
 997			bi->bi_bdev = rdev->bdev;
 998			bi->bi_rw = rw;
 999			bi->bi_end_io = (rw & WRITE)
1000				? raid5_end_write_request
1001				: raid5_end_read_request;
1002			bi->bi_private = sh;
1003
1004			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
1005				__func__, (unsigned long long)sh->sector,
1006				bi->bi_rw, i);
1007			atomic_inc(&sh->count);
1008			if (sh != head_sh)
1009				atomic_inc(&head_sh->count);
1010			if (use_new_offset(conf, sh))
1011				bi->bi_iter.bi_sector = (sh->sector
1012						 + rdev->new_data_offset);
1013			else
1014				bi->bi_iter.bi_sector = (sh->sector
1015						 + rdev->data_offset);
1016			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1017				bi->bi_rw |= REQ_NOMERGE;
1018
1019			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1020				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1021			sh->dev[i].vec.bv_page = sh->dev[i].page;
1022			bi->bi_vcnt = 1;
1023			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1024			bi->bi_io_vec[0].bv_offset = 0;
1025			bi->bi_iter.bi_size = STRIPE_SIZE;
1026			/*
1027			 * If this is a discard request, set bi_vcnt to 0. We don't
1028			 * want to confuse SCSI because SCSI will replace the payload
1029			 */
1030			if (rw & REQ_DISCARD)
1031				bi->bi_vcnt = 0;
1032			if (rrdev)
1033				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1034
1035			if (conf->mddev->gendisk)
1036				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
1037						      bi, disk_devt(conf->mddev->gendisk),
1038						      sh->dev[i].sector);
1039			generic_make_request(bi);
1040		}
1041		if (rrdev) {
1042			if (s->syncing || s->expanding || s->expanded
1043			    || s->replacing)
1044				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1045
1046			set_bit(STRIPE_IO_STARTED, &sh->state);
1047
1048			bio_reset(rbi);
1049			rbi->bi_bdev = rrdev->bdev;
1050			rbi->bi_rw = rw;
1051			BUG_ON(!(rw & WRITE));
1052			rbi->bi_end_io = raid5_end_write_request;
1053			rbi->bi_private = sh;
1054
1055			pr_debug("%s: for %llu schedule op %ld on "
1056				 "replacement disc %d\n",
1057				__func__, (unsigned long long)sh->sector,
1058				rbi->bi_rw, i);
1059			atomic_inc(&sh->count);
1060			if (sh != head_sh)
1061				atomic_inc(&head_sh->count);
1062			if (use_new_offset(conf, sh))
1063				rbi->bi_iter.bi_sector = (sh->sector
1064						  + rrdev->new_data_offset);
1065			else
1066				rbi->bi_iter.bi_sector = (sh->sector
1067						  + rrdev->data_offset);
1068			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1069				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1070			sh->dev[i].rvec.bv_page = sh->dev[i].page;
1071			rbi->bi_vcnt = 1;
1072			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1073			rbi->bi_io_vec[0].bv_offset = 0;
1074			rbi->bi_iter.bi_size = STRIPE_SIZE;
1075			/*
1076			 * If this is a discard request, set bi_vcnt to 0. We don't
1077			 * want to confuse SCSI because SCSI will replace the payload
1078			 */
1079			if (rw & REQ_DISCARD)
1080				rbi->bi_vcnt = 0;
1081			if (conf->mddev->gendisk)
1082				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
1083						      rbi, disk_devt(conf->mddev->gendisk),
1084						      sh->dev[i].sector);
1085			generic_make_request(rbi);
1086		}
1087		if (!rdev && !rrdev) {
1088			if (rw & WRITE)
1089				set_bit(STRIPE_DEGRADED, &sh->state);
1090			pr_debug("skip op %ld on disc %d for sector %llu\n",
1091				bi->bi_rw, i, (unsigned long long)sh->sector);
1092			clear_bit(R5_LOCKED, &sh->dev[i].flags);
1093			set_bit(STRIPE_HANDLE, &sh->state);
1094		}
1095
1096		if (!head_sh->batch_head)
1097			continue;
1098		sh = list_first_entry(&sh->batch_list, struct stripe_head,
1099				      batch_list);
1100		if (sh != head_sh)
1101			goto again;
1102	}
1103}
1104
1105static struct dma_async_tx_descriptor *
1106async_copy_data(int frombio, struct bio *bio, struct page **page,
1107	sector_t sector, struct dma_async_tx_descriptor *tx,
1108	struct stripe_head *sh)
1109{
1110	struct bio_vec bvl;
1111	struct bvec_iter iter;
1112	struct page *bio_page;
1113	int page_offset;
1114	struct async_submit_ctl submit;
1115	enum async_tx_flags flags = 0;
1116
1117	if (bio->bi_iter.bi_sector >= sector)
1118		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1119	else
1120		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1121
1122	if (frombio)
1123		flags |= ASYNC_TX_FENCE;
1124	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1125
1126	bio_for_each_segment(bvl, bio, iter) {
1127		int len = bvl.bv_len;
1128		int clen;
1129		int b_offset = 0;
1130
1131		if (page_offset < 0) {
1132			b_offset = -page_offset;
1133			page_offset += b_offset;
1134			len -= b_offset;
1135		}
1136
1137		if (len > 0 && page_offset + len > STRIPE_SIZE)
1138			clen = STRIPE_SIZE - page_offset;
1139		else
1140			clen = len;
1141
1142		if (clen > 0) {
1143			b_offset += bvl.bv_offset;
1144			bio_page = bvl.bv_page;
1145			if (frombio) {
1146				if (sh->raid_conf->skip_copy &&
1147				    b_offset == 0 && page_offset == 0 &&
1148				    clen == STRIPE_SIZE)
1149					*page = bio_page;
1150				else
1151					tx = async_memcpy(*page, bio_page, page_offset,
1152						  b_offset, clen, &submit);
1153			} else
1154				tx = async_memcpy(bio_page, *page, b_offset,
1155						  page_offset, clen, &submit);
1156		}
1157		/* chain the operations */
1158		submit.depend_tx = tx;
1159
1160		if (clen < len) /* hit end of page */
1161			break;
1162		page_offset +=  len;
1163	}
1164
1165	return tx;
1166}
1167
1168static void ops_complete_biofill(void *stripe_head_ref)
1169{
1170	struct stripe_head *sh = stripe_head_ref;
1171	struct bio_list return_bi = BIO_EMPTY_LIST;
1172	int i;
1173
1174	pr_debug("%s: stripe %llu\n", __func__,
1175		(unsigned long long)sh->sector);
1176
1177	/* clear completed biofills */
1178	for (i = sh->disks; i--; ) {
1179		struct r5dev *dev = &sh->dev[i];
1180
1181		/* acknowledge completion of a biofill operation */
1182		/* and check if we need to reply to a read request;
1183		 * new R5_Wantfill requests are held off until
1184		 * !STRIPE_BIOFILL_RUN
1185		 */
1186		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1187			struct bio *rbi, *rbi2;
1188
1189			BUG_ON(!dev->read);
1190			rbi = dev->read;
1191			dev->read = NULL;
1192			while (rbi && rbi->bi_iter.bi_sector <
1193				dev->sector + STRIPE_SECTORS) {
1194				rbi2 = r5_next_bio(rbi, dev->sector);
1195				if (!raid5_dec_bi_active_stripes(rbi))
1196					bio_list_add(&return_bi, rbi);
1197				rbi = rbi2;
1198			}
1199		}
1200	}
1201	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1202
1203	return_io(&return_bi);
1204
1205	set_bit(STRIPE_HANDLE, &sh->state);
1206	raid5_release_stripe(sh);
1207}
1208
1209static void ops_run_biofill(struct stripe_head *sh)
1210{
1211	struct dma_async_tx_descriptor *tx = NULL;
1212	struct async_submit_ctl submit;
1213	int i;
1214
1215	BUG_ON(sh->batch_head);
1216	pr_debug("%s: stripe %llu\n", __func__,
1217		(unsigned long long)sh->sector);
1218
1219	for (i = sh->disks; i--; ) {
1220		struct r5dev *dev = &sh->dev[i];
1221		if (test_bit(R5_Wantfill, &dev->flags)) {
1222			struct bio *rbi;
1223			spin_lock_irq(&sh->stripe_lock);
1224			dev->read = rbi = dev->toread;
1225			dev->toread = NULL;
1226			spin_unlock_irq(&sh->stripe_lock);
1227			while (rbi && rbi->bi_iter.bi_sector <
1228				dev->sector + STRIPE_SECTORS) {
1229				tx = async_copy_data(0, rbi, &dev->page,
1230					dev->sector, tx, sh);
1231				rbi = r5_next_bio(rbi, dev->sector);
1232			}
1233		}
1234	}
1235
1236	atomic_inc(&sh->count);
1237	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1238	async_trigger_callback(&submit);
1239}
1240
1241static void mark_target_uptodate(struct stripe_head *sh, int target)
1242{
1243	struct r5dev *tgt;
1244
1245	if (target < 0)
1246		return;
1247
1248	tgt = &sh->dev[target];
1249	set_bit(R5_UPTODATE, &tgt->flags);
1250	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1251	clear_bit(R5_Wantcompute, &tgt->flags);
1252}
1253
1254static void ops_complete_compute(void *stripe_head_ref)
1255{
1256	struct stripe_head *sh = stripe_head_ref;
1257
1258	pr_debug("%s: stripe %llu\n", __func__,
1259		(unsigned long long)sh->sector);
1260
1261	/* mark the computed target(s) as uptodate */
1262	mark_target_uptodate(sh, sh->ops.target);
1263	mark_target_uptodate(sh, sh->ops.target2);
1264
1265	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1266	if (sh->check_state == check_state_compute_run)
1267		sh->check_state = check_state_compute_result;
1268	set_bit(STRIPE_HANDLE, &sh->state);
1269	raid5_release_stripe(sh);
1270}
1271
1272/* return a pointer to the address conversion region of the scribble buffer */
1273static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1274				 struct raid5_percpu *percpu, int i)
1275{
1276	void *addr;
1277
1278	addr = flex_array_get(percpu->scribble, i);
1279	return addr + sizeof(struct page *) * (sh->disks + 2);
1280}
1281
1282/* return a pointer to the page list region of the scribble buffer */
1283static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1284{
1285	void *addr;
1286
1287	addr = flex_array_get(percpu->scribble, i);
1288	return addr;
1289}
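/*
 * Editorial note (illustrative, not part of the driver): each per-cpu
 * scribble element is one flat buffer whose first part is the
 * (struct page *) source/destination list and whose tail is the
 * address-conversion scratch area, so the two helpers above simply return
 * different offsets into the same element:
 *
 *	base      = flex_array_get(percpu->scribble, i);
 *	pages     = base;					// to_addr_page()
 *	addr_conv = base + sizeof(struct page *) * (disks + 2);	// to_addr_conv()
 */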
1290
1291static struct dma_async_tx_descriptor *
1292ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1293{
1294	int disks = sh->disks;
1295	struct page **xor_srcs = to_addr_page(percpu, 0);
1296	int target = sh->ops.target;
1297	struct r5dev *tgt = &sh->dev[target];
1298	struct page *xor_dest = tgt->page;
1299	int count = 0;
1300	struct dma_async_tx_descriptor *tx;
1301	struct async_submit_ctl submit;
1302	int i;
1303
1304	BUG_ON(sh->batch_head);
1305
1306	pr_debug("%s: stripe %llu block: %d\n",
1307		__func__, (unsigned long long)sh->sector, target);
1308	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1309
1310	for (i = disks; i--; )
1311		if (i != target)
1312			xor_srcs[count++] = sh->dev[i].page;
1313
1314	atomic_inc(&sh->count);
1315
1316	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1317			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1318	if (unlikely(count == 1))
1319		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1320	else
1321		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1322
1323	return tx;
1324}
1325
1326/* set_syndrome_sources - populate source buffers for gen_syndrome
1327 * @srcs - (struct page *) array of size sh->disks
1328 * @sh - stripe_head to parse
1329 *
1330 * Populates srcs in proper layout order for the stripe and returns the
1331 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
1332 * destination buffer is recorded in srcs[count] and the Q destination
1333 * is recorded in srcs[count+1].
1334 */
1335static int set_syndrome_sources(struct page **srcs,
1336				struct stripe_head *sh,
1337				int srctype)
1338{
1339	int disks = sh->disks;
1340	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1341	int d0_idx = raid6_d0(sh);
1342	int count;
1343	int i;
1344
1345	for (i = 0; i < disks; i++)
1346		srcs[i] = NULL;
1347
1348	count = 0;
1349	i = d0_idx;
1350	do {
1351		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1352		struct r5dev *dev = &sh->dev[i];
1353
1354		if (i == sh->qd_idx || i == sh->pd_idx ||
1355		    (srctype == SYNDROME_SRC_ALL) ||
1356		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
1357		     test_bit(R5_Wantdrain, &dev->flags)) ||
1358		    (srctype == SYNDROME_SRC_WRITTEN &&
1359		     dev->written))
1360			srcs[slot] = sh->dev[i].page;
1361		i = raid6_next_disk(i, disks);
1362	} while (i != d0_idx);
1363
1364	return syndrome_disks;
1365}
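/*
 * Illustrative usage (editorial addition): callers below, e.g.
 * ops_run_compute6_1() and ops_run_reconstruct6(), pair this helper with
 * async_gen_syndrome():
 *
 *	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
 *	tx = async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, &submit);
 *
 * where the "count + 2" covers the P and Q destinations recorded at
 * srcs[count] and srcs[count+1].
 */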
1366
1367static struct dma_async_tx_descriptor *
1368ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1369{
1370	int disks = sh->disks;
1371	struct page **blocks = to_addr_page(percpu, 0);
1372	int target;
1373	int qd_idx = sh->qd_idx;
1374	struct dma_async_tx_descriptor *tx;
1375	struct async_submit_ctl submit;
1376	struct r5dev *tgt;
1377	struct page *dest;
1378	int i;
1379	int count;
1380
1381	BUG_ON(sh->batch_head);
1382	if (sh->ops.target < 0)
1383		target = sh->ops.target2;
1384	else if (sh->ops.target2 < 0)
1385		target = sh->ops.target;
1386	else
1387		/* we should only have one valid target */
1388		BUG();
1389	BUG_ON(target < 0);
1390	pr_debug("%s: stripe %llu block: %d\n",
1391		__func__, (unsigned long long)sh->sector, target);
1392
1393	tgt = &sh->dev[target];
1394	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1395	dest = tgt->page;
1396
1397	atomic_inc(&sh->count);
1398
1399	if (target == qd_idx) {
1400		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1401		blocks[count] = NULL; /* regenerating p is not necessary */
1402		BUG_ON(blocks[count+1] != dest); /* q should already be set */
1403		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1404				  ops_complete_compute, sh,
1405				  to_addr_conv(sh, percpu, 0));
1406		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1407	} else {
1408		/* Compute any data- or p-drive using XOR */
1409		count = 0;
1410		for (i = disks; i-- ; ) {
1411			if (i == target || i == qd_idx)
1412				continue;
1413			blocks[count++] = sh->dev[i].page;
1414		}
1415
1416		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1417				  NULL, ops_complete_compute, sh,
1418				  to_addr_conv(sh, percpu, 0));
1419		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1420	}
1421
1422	return tx;
1423}
1424
1425static struct dma_async_tx_descriptor *
1426ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1427{
1428	int i, count, disks = sh->disks;
1429	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1430	int d0_idx = raid6_d0(sh);
1431	int faila = -1, failb = -1;
1432	int target = sh->ops.target;
1433	int target2 = sh->ops.target2;
1434	struct r5dev *tgt = &sh->dev[target];
1435	struct r5dev *tgt2 = &sh->dev[target2];
1436	struct dma_async_tx_descriptor *tx;
1437	struct page **blocks = to_addr_page(percpu, 0);
1438	struct async_submit_ctl submit;
1439
1440	BUG_ON(sh->batch_head);
1441	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1442		 __func__, (unsigned long long)sh->sector, target, target2);
1443	BUG_ON(target < 0 || target2 < 0);
1444	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1445	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1446
1447	/* we need to open-code set_syndrome_sources to handle the
1448	 * slot number conversion for 'faila' and 'failb'
1449	 */
1450	for (i = 0; i < disks ; i++)
1451		blocks[i] = NULL;
1452	count = 0;
1453	i = d0_idx;
1454	do {
1455		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1456
1457		blocks[slot] = sh->dev[i].page;
1458
1459		if (i == target)
1460			faila = slot;
1461		if (i == target2)
1462			failb = slot;
1463		i = raid6_next_disk(i, disks);
1464	} while (i != d0_idx);
1465
1466	BUG_ON(faila == failb);
1467	if (failb < faila)
1468		swap(faila, failb);
1469	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1470		 __func__, (unsigned long long)sh->sector, faila, failb);
1471
1472	atomic_inc(&sh->count);
1473
1474	if (failb == syndrome_disks+1) {
1475		/* Q disk is one of the missing disks */
1476		if (faila == syndrome_disks) {
1477			/* Missing P+Q, just recompute */
1478			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1479					  ops_complete_compute, sh,
1480					  to_addr_conv(sh, percpu, 0));
1481			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1482						  STRIPE_SIZE, &submit);
1483		} else {
1484			struct page *dest;
1485			int data_target;
1486			int qd_idx = sh->qd_idx;
1487
1488			/* Missing D+Q: recompute D from P, then recompute Q */
1489			if (target == qd_idx)
1490				data_target = target2;
1491			else
1492				data_target = target;
1493
1494			count = 0;
1495			for (i = disks; i-- ; ) {
1496				if (i == data_target || i == qd_idx)
1497					continue;
1498				blocks[count++] = sh->dev[i].page;
1499			}
1500			dest = sh->dev[data_target].page;
1501			init_async_submit(&submit,
1502					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1503					  NULL, NULL, NULL,
1504					  to_addr_conv(sh, percpu, 0));
1505			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1506				       &submit);
1507
1508			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1509			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1510					  ops_complete_compute, sh,
1511					  to_addr_conv(sh, percpu, 0));
1512			return async_gen_syndrome(blocks, 0, count+2,
1513						  STRIPE_SIZE, &submit);
1514		}
1515	} else {
1516		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1517				  ops_complete_compute, sh,
1518				  to_addr_conv(sh, percpu, 0));
1519		if (failb == syndrome_disks) {
1520			/* We're missing D+P. */
1521			return async_raid6_datap_recov(syndrome_disks+2,
1522						       STRIPE_SIZE, faila,
1523						       blocks, &submit);
1524		} else {
1525			/* We're missing D+D. */
1526			return async_raid6_2data_recov(syndrome_disks+2,
1527						       STRIPE_SIZE, faila, failb,
1528						       blocks, &submit);
1529		}
1530	}
1531}
1532
1533static void ops_complete_prexor(void *stripe_head_ref)
1534{
1535	struct stripe_head *sh = stripe_head_ref;
1536
1537	pr_debug("%s: stripe %llu\n", __func__,
1538		(unsigned long long)sh->sector);
1539}
1540
1541static struct dma_async_tx_descriptor *
1542ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1543		struct dma_async_tx_descriptor *tx)
1544{
1545	int disks = sh->disks;
1546	struct page **xor_srcs = to_addr_page(percpu, 0);
1547	int count = 0, pd_idx = sh->pd_idx, i;
1548	struct async_submit_ctl submit;
1549
1550	/* existing parity data subtracted */
1551	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1552
1553	BUG_ON(sh->batch_head);
1554	pr_debug("%s: stripe %llu\n", __func__,
1555		(unsigned long long)sh->sector);
1556
1557	for (i = disks; i--; ) {
1558		struct r5dev *dev = &sh->dev[i];
1559		/* Only process blocks that are known to be uptodate */
1560		if (test_bit(R5_Wantdrain, &dev->flags))
1561			xor_srcs[count++] = dev->page;
1562	}
1563
1564	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1565			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1566	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1567
1568	return tx;
1569}
1570
1571static struct dma_async_tx_descriptor *
1572ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1573		struct dma_async_tx_descriptor *tx)
1574{
1575	struct page **blocks = to_addr_page(percpu, 0);
1576	int count;
1577	struct async_submit_ctl submit;
1578
1579	pr_debug("%s: stripe %llu\n", __func__,
1580		(unsigned long long)sh->sector);
1581
1582	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1583
1584	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1585			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1586	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1587
1588	return tx;
1589}
1590
1591static struct dma_async_tx_descriptor *
1592ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1593{
1594	int disks = sh->disks;
1595	int i;
1596	struct stripe_head *head_sh = sh;
1597
1598	pr_debug("%s: stripe %llu\n", __func__,
1599		(unsigned long long)sh->sector);
1600
1601	for (i = disks; i--; ) {
1602		struct r5dev *dev;
1603		struct bio *chosen;
1604
1605		sh = head_sh;
1606		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1607			struct bio *wbi;
1608
1609again:
1610			dev = &sh->dev[i];
1611			spin_lock_irq(&sh->stripe_lock);
1612			chosen = dev->towrite;
1613			dev->towrite = NULL;
1614			sh->overwrite_disks = 0;
1615			BUG_ON(dev->written);
1616			wbi = dev->written = chosen;
1617			spin_unlock_irq(&sh->stripe_lock);
1618			WARN_ON(dev->page != dev->orig_page);
1619
1620			while (wbi && wbi->bi_iter.bi_sector <
1621				dev->sector + STRIPE_SECTORS) {
1622				if (wbi->bi_rw & REQ_FUA)
1623					set_bit(R5_WantFUA, &dev->flags);
1624				if (wbi->bi_rw & REQ_SYNC)
1625					set_bit(R5_SyncIO, &dev->flags);
1626				if (wbi->bi_rw & REQ_DISCARD)
1627					set_bit(R5_Discard, &dev->flags);
1628				else {
1629					tx = async_copy_data(1, wbi, &dev->page,
1630						dev->sector, tx, sh);
1631					if (dev->page != dev->orig_page) {
1632						set_bit(R5_SkipCopy, &dev->flags);
1633						clear_bit(R5_UPTODATE, &dev->flags);
1634						clear_bit(R5_OVERWRITE, &dev->flags);
1635					}
1636				}
1637				wbi = r5_next_bio(wbi, dev->sector);
1638			}
1639
1640			if (head_sh->batch_head) {
1641				sh = list_first_entry(&sh->batch_list,
1642						      struct stripe_head,
1643						      batch_list);
1644				if (sh == head_sh)
1645					continue;
1646				goto again;
1647			}
1648		}
1649	}
1650
1651	return tx;
1652}
1653
1654static void ops_complete_reconstruct(void *stripe_head_ref)
1655{
1656	struct stripe_head *sh = stripe_head_ref;
1657	int disks = sh->disks;
1658	int pd_idx = sh->pd_idx;
1659	int qd_idx = sh->qd_idx;
1660	int i;
1661	bool fua = false, sync = false, discard = false;
1662
1663	pr_debug("%s: stripe %llu\n", __func__,
1664		(unsigned long long)sh->sector);
1665
1666	for (i = disks; i--; ) {
1667		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1668		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1669		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1670	}
1671
1672	for (i = disks; i--; ) {
1673		struct r5dev *dev = &sh->dev[i];
1674
1675		if (dev->written || i == pd_idx || i == qd_idx) {
1676			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
1677				set_bit(R5_UPTODATE, &dev->flags);
1678			if (fua)
1679				set_bit(R5_WantFUA, &dev->flags);
1680			if (sync)
1681				set_bit(R5_SyncIO, &dev->flags);
1682		}
1683	}
1684
1685	if (sh->reconstruct_state == reconstruct_state_drain_run)
1686		sh->reconstruct_state = reconstruct_state_drain_result;
1687	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1688		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1689	else {
1690		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1691		sh->reconstruct_state = reconstruct_state_result;
1692	}
1693
1694	set_bit(STRIPE_HANDLE, &sh->state);
1695	raid5_release_stripe(sh);
1696}
1697
1698static void
1699ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1700		     struct dma_async_tx_descriptor *tx)
1701{
1702	int disks = sh->disks;
1703	struct page **xor_srcs;
1704	struct async_submit_ctl submit;
1705	int count, pd_idx = sh->pd_idx, i;
1706	struct page *xor_dest;
1707	int prexor = 0;
1708	unsigned long flags;
1709	int j = 0;
1710	struct stripe_head *head_sh = sh;
1711	int last_stripe;
1712
1713	pr_debug("%s: stripe %llu\n", __func__,
1714		(unsigned long long)sh->sector);
1715
1716	for (i = 0; i < sh->disks; i++) {
1717		if (pd_idx == i)
1718			continue;
1719		if (!test_bit(R5_Discard, &sh->dev[i].flags))
1720			break;
1721	}
1722	if (i >= sh->disks) {
1723		atomic_inc(&sh->count);
1724		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1725		ops_complete_reconstruct(sh);
1726		return;
1727	}
1728again:
1729	count = 0;
1730	xor_srcs = to_addr_page(percpu, j);
1731	/* check if prexor is active, which means we only process blocks
1732	 * that are part of a read-modify-write (i.e. blocks with ->written set)
1733	 */
1734	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1735		prexor = 1;
1736		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1737		for (i = disks; i--; ) {
1738			struct r5dev *dev = &sh->dev[i];
1739			if (head_sh->dev[i].written)
1740				xor_srcs[count++] = dev->page;
1741		}
1742	} else {
1743		xor_dest = sh->dev[pd_idx].page;
1744		for (i = disks; i--; ) {
1745			struct r5dev *dev = &sh->dev[i];
1746			if (i != pd_idx)
1747				xor_srcs[count++] = dev->page;
1748		}
1749	}
1750
1751	/* 1/ if we prexor'd then the dest is reused as a source.
1752	 * 2/ if we did not prexor then we are redoing the parity.
1753	 * Set ASYNC_TX_XOR_DROP_DST or ASYNC_TX_XOR_ZERO_DST respectively
1754	 * for the synchronous xor case.
1755	 */
1756	last_stripe = !head_sh->batch_head ||
1757		list_first_entry(&sh->batch_list,
1758				 struct stripe_head, batch_list) == head_sh;
1759	if (last_stripe) {
1760		flags = ASYNC_TX_ACK |
1761			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1762
1763		atomic_inc(&head_sh->count);
1764		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1765				  to_addr_conv(sh, percpu, j));
1766	} else {
1767		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1768		init_async_submit(&submit, flags, tx, NULL, NULL,
1769				  to_addr_conv(sh, percpu, j));
1770	}
1771
1772	if (unlikely(count == 1))
1773		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1774	else
1775		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1776	if (!last_stripe) {
1777		j++;
1778		sh = list_first_entry(&sh->batch_list, struct stripe_head,
1779				      batch_list);
1780		goto again;
1781	}
1782}
1783
1784static void
1785ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1786		     struct dma_async_tx_descriptor *tx)
1787{
1788	struct async_submit_ctl submit;
1789	struct page **blocks;
1790	int count, i, j = 0;
1791	struct stripe_head *head_sh = sh;
1792	int last_stripe;
1793	int synflags;
1794	unsigned long txflags;
1795
1796	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1797
1798	for (i = 0; i < sh->disks; i++) {
1799		if (sh->pd_idx == i || sh->qd_idx == i)
1800			continue;
1801		if (!test_bit(R5_Discard, &sh->dev[i].flags))
1802			break;
1803	}
1804	if (i >= sh->disks) {
1805		atomic_inc(&sh->count);
1806		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1807		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1808		ops_complete_reconstruct(sh);
1809		return;
1810	}
1811
1812again:
1813	blocks = to_addr_page(percpu, j);
1814
1815	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1816		synflags = SYNDROME_SRC_WRITTEN;
1817		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1818	} else {
1819		synflags = SYNDROME_SRC_ALL;
1820		txflags = ASYNC_TX_ACK;
1821	}
1822
1823	count = set_syndrome_sources(blocks, sh, synflags);
1824	last_stripe = !head_sh->batch_head ||
1825		list_first_entry(&sh->batch_list,
1826				 struct stripe_head, batch_list) == head_sh;
1827
1828	if (last_stripe) {
1829		atomic_inc(&head_sh->count);
1830		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1831				  head_sh, to_addr_conv(sh, percpu, j));
1832	} else
1833		init_async_submit(&submit, 0, tx, NULL, NULL,
1834				  to_addr_conv(sh, percpu, j));
1835	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1836	if (!last_stripe) {
1837		j++;
1838		sh = list_first_entry(&sh->batch_list, struct stripe_head,
1839				      batch_list);
1840		goto again;
1841	}
1842}
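/*
 * Background sketch (editorial addition, following the usual md/raid6
 * conventions rather than anything verified against this exact tree):
 * for data blocks D_0..D_{n-1} the generated syndrome pair is
 *
 *	P = D_0 ^ D_1 ^ ... ^ D_{n-1}
 *	Q = g^0*D_0 ^ g^1*D_1 ^ ... ^ g^{n-1}*D_{n-1}
 *
 * with the multiplications done per byte in GF(2^8) and g = {02}.  The
 * "+2" passed to async_gen_syndrome() above accounts for the P and Q
 * destination pages that set_syndrome_sources() places after the data
 * sources.
 */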
1843
1844static void ops_complete_check(void *stripe_head_ref)
1845{
1846	struct stripe_head *sh = stripe_head_ref;
1847
1848	pr_debug("%s: stripe %llu\n", __func__,
1849		(unsigned long long)sh->sector);
1850
1851	sh->check_state = check_state_check_result;
1852	set_bit(STRIPE_HANDLE, &sh->state);
1853	raid5_release_stripe(sh);
1854}
1855
1856static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1857{
1858	int disks = sh->disks;
1859	int pd_idx = sh->pd_idx;
1860	int qd_idx = sh->qd_idx;
1861	struct page *xor_dest;
1862	struct page **xor_srcs = to_addr_page(percpu, 0);
1863	struct dma_async_tx_descriptor *tx;
1864	struct async_submit_ctl submit;
1865	int count;
1866	int i;
1867
1868	pr_debug("%s: stripe %llu\n", __func__,
1869		(unsigned long long)sh->sector);
1870
1871	BUG_ON(sh->batch_head);
1872	count = 0;
1873	xor_dest = sh->dev[pd_idx].page;
1874	xor_srcs[count++] = xor_dest;
1875	for (i = disks; i--; ) {
1876		if (i == pd_idx || i == qd_idx)
1877			continue;
1878		xor_srcs[count++] = sh->dev[i].page;
1879	}
1880
1881	init_async_submit(&submit, 0, NULL, NULL, NULL,
1882			  to_addr_conv(sh, percpu, 0));
1883	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1884			   &sh->ops.zero_sum_result, &submit);
1885
1886	atomic_inc(&sh->count);
1887	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1888	tx = async_trigger_callback(&submit);
1889}
1890
1891static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1892{
1893	struct page **srcs = to_addr_page(percpu, 0);
1894	struct async_submit_ctl submit;
1895	int count;
1896
1897	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1898		(unsigned long long)sh->sector, checkp);
1899
1900	BUG_ON(sh->batch_head);
1901	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
1902	if (!checkp)
1903		srcs[count] = NULL;
1904
1905	atomic_inc(&sh->count);
1906	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1907			  sh, to_addr_conv(sh, percpu, 0));
1908	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1909			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1910}
1911
1912static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1913{
1914	int overlap_clear = 0, i, disks = sh->disks;
1915	struct dma_async_tx_descriptor *tx = NULL;
1916	struct r5conf *conf = sh->raid_conf;
1917	int level = conf->level;
1918	struct raid5_percpu *percpu;
1919	unsigned long cpu;
1920
1921	cpu = get_cpu();
1922	percpu = per_cpu_ptr(conf->percpu, cpu);
1923	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1924		ops_run_biofill(sh);
1925		overlap_clear++;
1926	}
1927
1928	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1929		if (level < 6)
1930			tx = ops_run_compute5(sh, percpu);
1931		else {
1932			if (sh->ops.target2 < 0 || sh->ops.target < 0)
1933				tx = ops_run_compute6_1(sh, percpu);
1934			else
1935				tx = ops_run_compute6_2(sh, percpu);
1936		}
1937		/* terminate the chain if reconstruct is not set to be run */
1938		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1939			async_tx_ack(tx);
1940	}
1941
1942	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
1943		if (level < 6)
1944			tx = ops_run_prexor5(sh, percpu, tx);
1945		else
1946			tx = ops_run_prexor6(sh, percpu, tx);
1947	}
1948
1949	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1950		tx = ops_run_biodrain(sh, tx);
1951		overlap_clear++;
1952	}
1953
1954	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1955		if (level < 6)
1956			ops_run_reconstruct5(sh, percpu, tx);
1957		else
1958			ops_run_reconstruct6(sh, percpu, tx);
1959	}
1960
1961	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1962		if (sh->check_state == check_state_run)
1963			ops_run_check_p(sh, percpu);
1964		else if (sh->check_state == check_state_run_q)
1965			ops_run_check_pq(sh, percpu, 0);
1966		else if (sh->check_state == check_state_run_pq)
1967			ops_run_check_pq(sh, percpu, 1);
1968		else
1969			BUG();
1970	}
1971
1972	if (overlap_clear && !sh->batch_head)
1973		for (i = disks; i--; ) {
1974			struct r5dev *dev = &sh->dev[i];
1975			if (test_and_clear_bit(R5_Overlap, &dev->flags))
1976				wake_up(&sh->raid_conf->wait_for_overlap);
1977		}
1978	put_cpu();
1979}
1980
1981static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
1982{
1983	struct stripe_head *sh;
1984
1985	sh = kmem_cache_zalloc(sc, gfp);
1986	if (sh) {
1987		spin_lock_init(&sh->stripe_lock);
1988		spin_lock_init(&sh->batch_lock);
1989		INIT_LIST_HEAD(&sh->batch_list);
1990		INIT_LIST_HEAD(&sh->lru);
1991		atomic_set(&sh->count, 1);
1992	}
1993	return sh;
1994}
1995static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
1996{
1997	struct stripe_head *sh;
1998
1999	sh = alloc_stripe(conf->slab_cache, gfp);
2000	if (!sh)
2001		return 0;
2002
2003	sh->raid_conf = conf;
2004
2005	if (grow_buffers(sh, gfp)) {
2006		shrink_buffers(sh);
2007		kmem_cache_free(conf->slab_cache, sh);
2008		return 0;
2009	}
2010	sh->hash_lock_index =
2011		conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2012	/* we just created an active stripe so... */
2013	atomic_inc(&conf->active_stripes);
2014
2015	raid5_release_stripe(sh);
2016	conf->max_nr_stripes++;
2017	return 1;
2018}
2019
2020static int grow_stripes(struct r5conf *conf, int num)
2021{
2022	struct kmem_cache *sc;
2023	int devs = max(conf->raid_disks, conf->previous_raid_disks);
2024
2025	if (conf->mddev->gendisk)
2026		sprintf(conf->cache_name[0],
2027			"raid%d-%s", conf->level, mdname(conf->mddev));
2028	else
2029		sprintf(conf->cache_name[0],
2030			"raid%d-%p", conf->level, conf->mddev);
2031	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
2032
2033	conf->active_name = 0;
2034	sc = kmem_cache_create(conf->cache_name[conf->active_name],
2035			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2036			       0, 0, NULL);
2037	if (!sc)
2038		return 1;
2039	conf->slab_cache = sc;
2040	conf->pool_size = devs;
2041	while (num--)
2042		if (!grow_one_stripe(conf, GFP_KERNEL))
2043			return 1;
2044
2045	return 0;
2046}
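/*
 * Rough sizing sketch (editorial addition, with assumed example numbers):
 * each slab object above is sizeof(struct stripe_head) plus (devs - 1)
 * extra struct r5dev slots, and grow_buffers() later attaches one
 * PAGE_SIZE page per device.  So with devs = 8 and 4KiB pages, the
 * default NR_STRIPES (256) entries pin roughly 256 * 8 * 4KiB = 8MiB of
 * cache pages on top of the stripe_head slab itself.
 */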
2047
2048/**
2049 * scribble_alloc - allocate a scribble region of the required size
2050 * @num: total number of disks in the array
2051 *
2052 * The size must be enough to contain:
2053 * 1/ a struct page pointer for each device in the array +2
2054 * 2/ room to convert each entry in (1) to its corresponding dma
2055 *    (dma_map_page()) or page (page_address()) address.
2056 *
2057 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2058 * calculate over all devices (not just the data blocks), using zeros in place
2059 * of the P and Q blocks.
2060 */
2061static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2062{
2063	struct flex_array *ret;
2064	size_t len;
2065
2066	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2067	ret = flex_array_alloc(len, cnt, flags);
2068	if (!ret)
2069		return NULL;
2070	/* always prealloc all elements, so no locking is required */
2071	if (flex_array_prealloc(ret, 0, cnt, flags)) {
2072		flex_array_free(ret);
2073		return NULL;
2074	}
2075	return ret;
2076}
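/*
 * Worked example (editorial addition, assuming 8-byte pointers and an
 * 8-byte addr_conv_t as on typical 64-bit builds): for num = 8 disks
 * each flex_array element is
 *
 *	len = 8 * (8 + 2) + 8 * (8 + 2) = 160 bytes
 *
 * and cnt such elements are preallocated so that every stripe in a
 * batch (at most one chunk's worth, see resize_chunks() below) gets its
 * own scratch slot.
 */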
2077
2078static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2079{
2080	unsigned long cpu;
2081	int err = 0;
2082
2083	/*
2084	 * Never shrink. And mddev_suspend() could deadlock if this is called
2085	 * from raid5d. In that case, scribble_disks and scribble_sectors
2086	 * should already equal new_disks and new_sectors
2087	 */
2088	if (conf->scribble_disks >= new_disks &&
2089	    conf->scribble_sectors >= new_sectors)
2090		return 0;
2091	mddev_suspend(conf->mddev);
2092	get_online_cpus();
2093	for_each_present_cpu(cpu) {
2094		struct raid5_percpu *percpu;
2095		struct flex_array *scribble;
2096
2097		percpu = per_cpu_ptr(conf->percpu, cpu);
2098		scribble = scribble_alloc(new_disks,
2099					  new_sectors / STRIPE_SECTORS,
2100					  GFP_NOIO);
2101
2102		if (scribble) {
2103			flex_array_free(percpu->scribble);
2104			percpu->scribble = scribble;
2105		} else {
2106			err = -ENOMEM;
2107			break;
2108		}
2109	}
2110	put_online_cpus();
2111	mddev_resume(conf->mddev);
2112	if (!err) {
2113		conf->scribble_disks = new_disks;
2114		conf->scribble_sectors = new_sectors;
2115	}
2116	return err;
2117}
2118
2119static int resize_stripes(struct r5conf *conf, int newsize)
2120{
2121	/* Make all the stripes able to hold 'newsize' devices.
2122	 * New slots in each stripe get 'page' set to a new page.
2123	 *
2124	 * This happens in stages:
2125	 * 1/ create a new kmem_cache and allocate the required number of
2126	 *    stripe_heads.
2127	 * 2/ gather all the old stripe_heads and transfer the pages across
2128	 *    to the new stripe_heads.  This will have the side effect of
2129	 *    freezing the array as once all stripe_heads have been collected,
2130	 *    no IO will be possible.  Old stripe heads are freed once their
2131	 *    pages have been transferred over, and the old kmem_cache is
2132	 *    freed when all stripes are done.
2133	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
2134	 *    we simply return a failure status - no need to clean anything up.
2135	 * 4/ allocate new pages for the new slots in the new stripe_heads.
2136	 *    If this fails, we don't bother trying to shrink the
2137	 *    stripe_heads down again, we just leave them as they are.
2138	 *    As each stripe_head is processed the new one is released into
2139	 *    active service.
2140	 *
2141	 * Once step2 is started, we cannot afford to wait for a write,
2142	 * so we use GFP_NOIO allocations.
2143	 */
2144	struct stripe_head *osh, *nsh;
2145	LIST_HEAD(newstripes);
2146	struct disk_info *ndisks;
2147	int err;
2148	struct kmem_cache *sc;
2149	int i;
2150	int hash, cnt;
2151
2152	if (newsize <= conf->pool_size)
2153		return 0; /* never bother to shrink */
2154
2155	err = md_allow_write(conf->mddev);
2156	if (err)
2157		return err;
2158
2159	/* Step 1 */
2160	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2161			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2162			       0, 0, NULL);
2163	if (!sc)
2164		return -ENOMEM;
2165
2166	/* Need to ensure auto-resizing doesn't interfere */
2167	mutex_lock(&conf->cache_size_mutex);
2168
2169	for (i = conf->max_nr_stripes; i; i--) {
2170		nsh = alloc_stripe(sc, GFP_KERNEL);
2171		if (!nsh)
2172			break;
2173
2174		nsh->raid_conf = conf;
2175		list_add(&nsh->lru, &newstripes);
2176	}
2177	if (i) {
2178		/* didn't get enough, give up */
2179		while (!list_empty(&newstripes)) {
2180			nsh = list_entry(newstripes.next, struct stripe_head, lru);
2181			list_del(&nsh->lru);
2182			kmem_cache_free(sc, nsh);
2183		}
2184		kmem_cache_destroy(sc);
2185		mutex_unlock(&conf->cache_size_mutex);
2186		return -ENOMEM;
2187	}
2188	/* Step 2 - Must use GFP_NOIO now.
2189	 * OK, we have enough stripes, start collecting inactive
2190	 * stripes and copying them over
2191	 */
2192	hash = 0;
2193	cnt = 0;
2194	list_for_each_entry(nsh, &newstripes, lru) {
2195		lock_device_hash_lock(conf, hash);
2196		wait_event_cmd(conf->wait_for_stripe,
2197				    !list_empty(conf->inactive_list + hash),
2198				    unlock_device_hash_lock(conf, hash),
2199				    lock_device_hash_lock(conf, hash));
2200		osh = get_free_stripe(conf, hash);
2201		unlock_device_hash_lock(conf, hash);
2202
2203		for(i=0; i<conf->pool_size; i++) {
2204			nsh->dev[i].page = osh->dev[i].page;
2205			nsh->dev[i].orig_page = osh->dev[i].page;
2206		}
2207		nsh->hash_lock_index = hash;
2208		kmem_cache_free(conf->slab_cache, osh);
2209		cnt++;
2210		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2211		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2212			hash++;
2213			cnt = 0;
2214		}
2215	}
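	/*
	 * Distribution sketch (editorial addition, assuming the usual
	 * NR_STRIPE_HASH_LOCKS of 8): with max_nr_stripes = 259 the loop
	 * above hands 259 / 8 + 1 = 33 new stripes to hash buckets 0..2
	 * (because 259 % 8 = 3 > hash there) and 32 to buckets 3..7,
	 * mirroring how grow_one_stripe() spread the old ones.
	 */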
2216	kmem_cache_destroy(conf->slab_cache);
2217
2218	/* Step 3.
2219	 * At this point, we are holding all the stripes so the array
2220	 * is completely stalled, so now is a good time to resize
2221	 * conf->disks and the scribble region
2222	 */
2223	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
2224	if (ndisks) {
2225		for (i=0; i<conf->raid_disks; i++)
2226			ndisks[i] = conf->disks[i];
2227		kfree(conf->disks);
2228		conf->disks = ndisks;
2229	} else
2230		err = -ENOMEM;
2231
2232	mutex_unlock(&conf->cache_size_mutex);
2233	/* Step 4, return new stripes to service */
2234	while(!list_empty(&newstripes)) {
2235		nsh = list_entry(newstripes.next, struct stripe_head, lru);
2236		list_del_init(&nsh->lru);
2237
2238		for (i=conf->raid_disks; i < newsize; i++)
2239			if (nsh->dev[i].page == NULL) {
2240				struct page *p = alloc_page(GFP_NOIO);
2241				nsh->dev[i].page = p;
2242				nsh->dev[i].orig_page = p;
2243				if (!p)
2244					err = -ENOMEM;
2245			}
2246		raid5_release_stripe(nsh);
2247	}
2248	/* critical section passed, GFP_NOIO no longer needed */
2249
2250	conf->slab_cache = sc;
2251	conf->active_name = 1-conf->active_name;
2252	if (!err)
2253		conf->pool_size = newsize;
2254	return err;
2255}
2256
2257static int drop_one_stripe(struct r5conf *conf)
2258{
2259	struct stripe_head *sh;
2260	int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2261
2262	spin_lock_irq(conf->hash_locks + hash);
2263	sh = get_free_stripe(conf, hash);
2264	spin_unlock_irq(conf->hash_locks + hash);
2265	if (!sh)
2266		return 0;
2267	BUG_ON(atomic_read(&sh->count));
2268	shrink_buffers(sh);
2269	kmem_cache_free(conf->slab_cache, sh);
2270	atomic_dec(&conf->active_stripes);
2271	conf->max_nr_stripes--;
2272	return 1;
2273}
2274
2275static void shrink_stripes(struct r5conf *conf)
2276{
2277	while (conf->max_nr_stripes &&
2278	       drop_one_stripe(conf))
2279		;
2280
2281	kmem_cache_destroy(conf->slab_cache);
2282	conf->slab_cache = NULL;
2283}
2284
2285static void raid5_end_read_request(struct bio * bi)
2286{
2287	struct stripe_head *sh = bi->bi_private;
2288	struct r5conf *conf = sh->raid_conf;
2289	int disks = sh->disks, i;
2290	char b[BDEVNAME_SIZE];
2291	struct md_rdev *rdev = NULL;
2292	sector_t s;
2293
2294	for (i=0 ; i<disks; i++)
2295		if (bi == &sh->dev[i].req)
2296			break;
2297
2298	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2299		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2300		bi->bi_error);
2301	if (i == disks) {
2302		BUG();
2303		return;
2304	}
2305	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2306		/* If replacement finished while this request was outstanding,
2307		 * 'replacement' might be NULL already.
2308		 * In that case it moved down to 'rdev'.
2309		 * rdev is not removed until all requests are finished.
2310		 */
2311		rdev = conf->disks[i].replacement;
2312	if (!rdev)
2313		rdev = conf->disks[i].rdev;
2314
2315	if (use_new_offset(conf, sh))
2316		s = sh->sector + rdev->new_data_offset;
2317	else
2318		s = sh->sector + rdev->data_offset;
2319	if (!bi->bi_error) {
2320		set_bit(R5_UPTODATE, &sh->dev[i].flags);
2321		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2322			/* Note that this cannot happen on a
2323			 * replacement device.  We just fail those on
2324			 * any error
2325			 */
2326			printk_ratelimited(
2327				KERN_INFO
2328				"md/raid:%s: read error corrected"
2329				" (%lu sectors at %llu on %s)\n",
2330				mdname(conf->mddev), STRIPE_SECTORS,
2331				(unsigned long long)s,
2332				bdevname(rdev->bdev, b));
2333			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2334			clear_bit(R5_ReadError, &sh->dev[i].flags);
2335			clear_bit(R5_ReWrite, &sh->dev[i].flags);
2336		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2337			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2338
2339		if (atomic_read(&rdev->read_errors))
2340			atomic_set(&rdev->read_errors, 0);
2341	} else {
2342		const char *bdn = bdevname(rdev->bdev, b);
2343		int retry = 0;
2344		int set_bad = 0;
2345
2346		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2347		atomic_inc(&rdev->read_errors);
2348		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2349			printk_ratelimited(
2350				KERN_WARNING
2351				"md/raid:%s: read error on replacement device "
2352				"(sector %llu on %s).\n",
2353				mdname(conf->mddev),
2354				(unsigned long long)s,
2355				bdn);
2356		else if (conf->mddev->degraded >= conf->max_degraded) {
2357			set_bad = 1;
2358			printk_ratelimited(
2359				KERN_WARNING
2360				"md/raid:%s: read error not correctable "
2361				"(sector %llu on %s).\n",
2362				mdname(conf->mddev),
2363				(unsigned long long)s,
2364				bdn);
2365		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2366			/* Oh, no!!! */
2367			set_bad = 1;
2368			printk_ratelimited(
2369				KERN_WARNING
2370				"md/raid:%s: read error NOT corrected!! "
2371				"(sector %llu on %s).\n",
2372				mdname(conf->mddev),
2373				(unsigned long long)s,
2374				bdn);
2375		} else if (atomic_read(&rdev->read_errors)
2376			 > conf->max_nr_stripes)
2377			printk(KERN_WARNING
2378			       "md/raid:%s: Too many read errors, failing device %s.\n",
2379			       mdname(conf->mddev), bdn);
2380		else
2381			retry = 1;
2382		if (set_bad && test_bit(In_sync, &rdev->flags)
2383		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2384			retry = 1;
2385		if (retry)
2386			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2387				set_bit(R5_ReadError, &sh->dev[i].flags);
2388				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2389			} else
2390				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2391		else {
2392			clear_bit(R5_ReadError, &sh->dev[i].flags);
2393			clear_bit(R5_ReWrite, &sh->dev[i].flags);
2394			if (!(set_bad
2395			      && test_bit(In_sync, &rdev->flags)
2396			      && rdev_set_badblocks(
2397				      rdev, sh->sector, STRIPE_SECTORS, 0)))
2398				md_error(conf->mddev, rdev);
2399		}
2400	}
2401	rdev_dec_pending(rdev, conf->mddev);
2402	clear_bit(R5_LOCKED, &sh->dev[i].flags);
2403	set_bit(STRIPE_HANDLE, &sh->state);
2404	raid5_release_stripe(sh);
2405}
2406
2407static void raid5_end_write_request(struct bio *bi)
2408{
2409	struct stripe_head *sh = bi->bi_private;
2410	struct r5conf *conf = sh->raid_conf;
2411	int disks = sh->disks, i;
2412	struct md_rdev *uninitialized_var(rdev);
2413	sector_t first_bad;
2414	int bad_sectors;
2415	int replacement = 0;
2416
2417	for (i = 0 ; i < disks; i++) {
2418		if (bi == &sh->dev[i].req) {
2419			rdev = conf->disks[i].rdev;
2420			break;
2421		}
2422		if (bi == &sh->dev[i].rreq) {
2423			rdev = conf->disks[i].replacement;
2424			if (rdev)
2425				replacement = 1;
2426			else
2427				/* rdev was removed and 'replacement'
2428				 * replaced it.  rdev is not removed
2429				 * until all requests are finished.
2430				 */
2431				rdev = conf->disks[i].rdev;
2432			break;
2433		}
2434	}
2435	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2436		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2437		bi->bi_error);
2438	if (i == disks) {
2439		BUG();
2440		return;
2441	}
2442
2443	if (replacement) {
2444		if (bi->bi_error)
2445			md_error(conf->mddev, rdev);
2446		else if (is_badblock(rdev, sh->sector,
2447				     STRIPE_SECTORS,
2448				     &first_bad, &bad_sectors))
2449			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2450	} else {
2451		if (bi->bi_error) {
2452			set_bit(STRIPE_DEGRADED, &sh->state);
2453			set_bit(WriteErrorSeen, &rdev->flags);
2454			set_bit(R5_WriteError, &sh->dev[i].flags);
2455			if (!test_and_set_bit(WantReplacement, &rdev->flags))
2456				set_bit(MD_RECOVERY_NEEDED,
2457					&rdev->mddev->recovery);
2458		} else if (is_badblock(rdev, sh->sector,
2459				       STRIPE_SECTORS,
2460				       &first_bad, &bad_sectors)) {
2461			set_bit(R5_MadeGood, &sh->dev[i].flags);
2462			if (test_bit(R5_ReadError, &sh->dev[i].flags))
2463				/* That was a successful write so make
2464				 * sure it looks like we already did
2465				 * a re-write.
2466				 */
2467				set_bit(R5_ReWrite, &sh->dev[i].flags);
2468		}
2469	}
2470	rdev_dec_pending(rdev, conf->mddev);
2471
2472	if (sh->batch_head && bi->bi_error && !replacement)
2473		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2474
2475	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2476		clear_bit(R5_LOCKED, &sh->dev[i].flags);
2477	set_bit(STRIPE_HANDLE, &sh->state);
2478	raid5_release_stripe(sh);
2479
2480	if (sh->batch_head && sh != sh->batch_head)
2481		raid5_release_stripe(sh->batch_head);
2482}
2483
2484static void raid5_build_block(struct stripe_head *sh, int i, int previous)
2485{
2486	struct r5dev *dev = &sh->dev[i];
2487
2488	bio_init(&dev->req);
2489	dev->req.bi_io_vec = &dev->vec;
2490	dev->req.bi_max_vecs = 1;
2491	dev->req.bi_private = sh;
2492
2493	bio_init(&dev->rreq);
2494	dev->rreq.bi_io_vec = &dev->rvec;
2495	dev->rreq.bi_max_vecs = 1;
2496	dev->rreq.bi_private = sh;
2497
2498	dev->flags = 0;
2499	dev->sector = raid5_compute_blocknr(sh, i, previous);
2500}
2501
2502static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2503{
2504	char b[BDEVNAME_SIZE];
2505	struct r5conf *conf = mddev->private;
2506	unsigned long flags;
2507	pr_debug("raid456: error called\n");
2508
2509	spin_lock_irqsave(&conf->device_lock, flags);
2510	clear_bit(In_sync, &rdev->flags);
2511	mddev->degraded = calc_degraded(conf);
2512	spin_unlock_irqrestore(&conf->device_lock, flags);
2513	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2514
2515	set_bit(Blocked, &rdev->flags);
2516	set_bit(Faulty, &rdev->flags);
2517	set_bit(MD_CHANGE_DEVS, &mddev->flags);
2518	set_bit(MD_CHANGE_PENDING, &mddev->flags);
2519	printk(KERN_ALERT
2520	       "md/raid:%s: Disk failure on %s, disabling device.\n"
2521	       "md/raid:%s: Operation continuing on %d devices.\n",
2522	       mdname(mddev),
2523	       bdevname(rdev->bdev, b),
2524	       mdname(mddev),
2525	       conf->raid_disks - mddev->degraded);
2526}
2527
2528/*
2529 * Input: a 'big' sector number,
2530 * Output: index of the data and parity disk, and the sector # in them.
2531 */
2532sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2533			      int previous, int *dd_idx,
2534			      struct stripe_head *sh)
2535{
2536	sector_t stripe, stripe2;
2537	sector_t chunk_number;
2538	unsigned int chunk_offset;
2539	int pd_idx, qd_idx;
2540	int ddf_layout = 0;
2541	sector_t new_sector;
2542	int algorithm = previous ? conf->prev_algo
2543				 : conf->algorithm;
2544	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2545					 : conf->chunk_sectors;
2546	int raid_disks = previous ? conf->previous_raid_disks
2547				  : conf->raid_disks;
2548	int data_disks = raid_disks - conf->max_degraded;
2549
2550	/* First compute the information on this sector */
2551
2552	/*
2553	 * Compute the chunk number and the sector offset inside the chunk
2554	 */
2555	chunk_offset = sector_div(r_sector, sectors_per_chunk);
2556	chunk_number = r_sector;
2557
2558	/*
2559	 * Compute the stripe number
2560	 */
2561	stripe = chunk_number;
2562	*dd_idx = sector_div(stripe, data_disks);
2563	stripe2 = stripe;
2564	/*
2565	 * Select the parity disk based on the user selected algorithm.
2566	 */
2567	pd_idx = qd_idx = -1;
2568	switch(conf->level) {
2569	case 4:
2570		pd_idx = data_disks;
2571		break;
2572	case 5:
2573		switch (algorithm) {
2574		case ALGORITHM_LEFT_ASYMMETRIC:
2575			pd_idx = data_disks - sector_div(stripe2, raid_disks);
2576			if (*dd_idx >= pd_idx)
2577				(*dd_idx)++;
2578			break;
2579		case ALGORITHM_RIGHT_ASYMMETRIC:
2580			pd_idx = sector_div(stripe2, raid_disks);
2581			if (*dd_idx >= pd_idx)
2582				(*dd_idx)++;
2583			break;
2584		case ALGORITHM_LEFT_SYMMETRIC:
2585			pd_idx = data_disks - sector_div(stripe2, raid_disks);
2586			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2587			break;
2588		case ALGORITHM_RIGHT_SYMMETRIC:
2589			pd_idx = sector_div(stripe2, raid_disks);
2590			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2591			break;
2592		case ALGORITHM_PARITY_0:
2593			pd_idx = 0;
2594			(*dd_idx)++;
2595			break;
2596		case ALGORITHM_PARITY_N:
2597			pd_idx = data_disks;
2598			break;
2599		default:
2600			BUG();
2601		}
2602		break;
2603	case 6:
2604
2605		switch (algorithm) {
2606		case ALGORITHM_LEFT_ASYMMETRIC:
2607			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2608			qd_idx = pd_idx + 1;
2609			if (pd_idx == raid_disks-1) {
2610				(*dd_idx)++;	/* Q D D D P */
2611				qd_idx = 0;
2612			} else if (*dd_idx >= pd_idx)
2613				(*dd_idx) += 2; /* D D P Q D */
2614			break;
2615		case ALGORITHM_RIGHT_ASYMMETRIC:
2616			pd_idx = sector_div(stripe2, raid_disks);
2617			qd_idx = pd_idx + 1;
2618			if (pd_idx == raid_disks-1) {
2619				(*dd_idx)++;	/* Q D D D P */
2620				qd_idx = 0;
2621			} else if (*dd_idx >= pd_idx)
2622				(*dd_idx) += 2; /* D D P Q D */
2623			break;
2624		case ALGORITHM_LEFT_SYMMETRIC:
2625			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2626			qd_idx = (pd_idx + 1) % raid_disks;
2627			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2628			break;
2629		case ALGORITHM_RIGHT_SYMMETRIC:
2630			pd_idx = sector_div(stripe2, raid_disks);
2631			qd_idx = (pd_idx + 1) % raid_disks;
2632			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2633			break;
2634
2635		case ALGORITHM_PARITY_0:
2636			pd_idx = 0;
2637			qd_idx = 1;
2638			(*dd_idx) += 2;
2639			break;
2640		case ALGORITHM_PARITY_N:
2641			pd_idx = data_disks;
2642			qd_idx = data_disks + 1;
2643			break;
2644
2645		case ALGORITHM_ROTATING_ZERO_RESTART:
2646			/* Exactly the same as RIGHT_ASYMMETRIC, but the order
2647			 * of blocks for computing Q is different.
2648			 */
2649			pd_idx = sector_div(stripe2, raid_disks);
2650			qd_idx = pd_idx + 1;
2651			if (pd_idx == raid_disks-1) {
2652				(*dd_idx)++;	/* Q D D D P */
2653				qd_idx = 0;
2654			} else if (*dd_idx >= pd_idx)
2655				(*dd_idx) += 2; /* D D P Q D */
2656			ddf_layout = 1;
2657			break;
2658
2659		case ALGORITHM_ROTATING_N_RESTART:
2660			/* Same as left_asymmetric, but the first stripe is
2661			 * D D D P Q  rather than
2662			 * Q D D D P
2663			 */
2664			stripe2 += 1;
2665			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2666			qd_idx = pd_idx + 1;
2667			if (pd_idx == raid_disks-1) {
2668				(*dd_idx)++;	/* Q D D D P */
2669				qd_idx = 0;
2670			} else if (*dd_idx >= pd_idx)
2671				(*dd_idx) += 2; /* D D P Q D */
2672			ddf_layout = 1;
2673			break;
2674
2675		case ALGORITHM_ROTATING_N_CONTINUE:
2676			/* Same as left_symmetric but Q is before P */
2677			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2678			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2679			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2680			ddf_layout = 1;
2681			break;
2682
2683		case ALGORITHM_LEFT_ASYMMETRIC_6:
2684			/* RAID5 left_asymmetric, with Q on last device */
2685			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2686			if (*dd_idx >= pd_idx)
2687				(*dd_idx)++;
2688			qd_idx = raid_disks - 1;
2689			break;
2690
2691		case ALGORITHM_RIGHT_ASYMMETRIC_6:
2692			pd_idx = sector_div(stripe2, raid_disks-1);
2693			if (*dd_idx >= pd_idx)
2694				(*dd_idx)++;
2695			qd_idx = raid_disks - 1;
2696			break;
2697
2698		case ALGORITHM_LEFT_SYMMETRIC_6:
2699			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2700			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2701			qd_idx = raid_disks - 1;
2702			break;
2703
2704		case ALGORITHM_RIGHT_SYMMETRIC_6:
2705			pd_idx = sector_div(stripe2, raid_disks-1);
2706			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2707			qd_idx = raid_disks - 1;
2708			break;
2709
2710		case ALGORITHM_PARITY_0_6:
2711			pd_idx = 0;
2712			(*dd_idx)++;
2713			qd_idx = raid_disks - 1;
2714			break;
2715
2716		default:
2717			BUG();
2718		}
2719		break;
2720	}
2721
2722	if (sh) {
2723		sh->pd_idx = pd_idx;
2724		sh->qd_idx = qd_idx;
2725		sh->ddf_layout = ddf_layout;
2726	}
2727	/*
2728	 * Finally, compute the new sector number
2729	 */
2730	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2731	return new_sector;
2732}
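/*
 * Worked example (editorial addition, with assumed parameters): for a
 * RAID5 array with raid_disks = 5 (so data_disks = 4), 64KiB chunks
 * (sectors_per_chunk = 128) and ALGORITHM_LEFT_SYMMETRIC, mapping
 * r_sector = 1000 gives
 *
 *	chunk_offset = 1000 % 128 = 104,  chunk_number = 1000 / 128 = 7
 *	dd_idx = 7 % 4 = 3,               stripe = 7 / 4 = 1
 *	pd_idx = 4 - (1 % 5) = 3
 *	dd_idx = (3 + 1 + 3) % 5 = 2
 *	new_sector = 1 * 128 + 104 = 232
 *
 * i.e. the block lives on device 2 at sector 232, with parity on
 * device 3 for that stripe.
 */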
2733
2734sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2735{
2736	struct r5conf *conf = sh->raid_conf;
2737	int raid_disks = sh->disks;
2738	int data_disks = raid_disks - conf->max_degraded;
2739	sector_t new_sector = sh->sector, check;
2740	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2741					 : conf->chunk_sectors;
2742	int algorithm = previous ? conf->prev_algo
2743				 : conf->algorithm;
2744	sector_t stripe;
2745	int chunk_offset;
2746	sector_t chunk_number;
2747	int dummy1, dd_idx = i;
2748	sector_t r_sector;
2749	struct stripe_head sh2;
2750
2751	chunk_offset = sector_div(new_sector, sectors_per_chunk);
2752	stripe = new_sector;
2753
2754	if (i == sh->pd_idx)
2755		return 0;
2756	switch(conf->level) {
2757	case 4: break;
2758	case 5:
2759		switch (algorithm) {
2760		case ALGORITHM_LEFT_ASYMMETRIC:
2761		case ALGORITHM_RIGHT_ASYMMETRIC:
2762			if (i > sh->pd_idx)
2763				i--;
2764			break;
2765		case ALGORITHM_LEFT_SYMMETRIC:
2766		case ALGORITHM_RIGHT_SYMMETRIC:
2767			if (i < sh->pd_idx)
2768				i += raid_disks;
2769			i -= (sh->pd_idx + 1);
2770			break;
2771		case ALGORITHM_PARITY_0:
2772			i -= 1;
2773			break;
2774		case ALGORITHM_PARITY_N:
2775			break;
2776		default:
2777			BUG();
2778		}
2779		break;
2780	case 6:
2781		if (i == sh->qd_idx)
2782			return 0; /* It is the Q disk */
2783		switch (algorithm) {
2784		case ALGORITHM_LEFT_ASYMMETRIC:
2785		case ALGORITHM_RIGHT_ASYMMETRIC:
2786		case ALGORITHM_ROTATING_ZERO_RESTART:
2787		case ALGORITHM_ROTATING_N_RESTART:
2788			if (sh->pd_idx == raid_disks-1)
2789				i--;	/* Q D D D P */
2790			else if (i > sh->pd_idx)
2791				i -= 2; /* D D P Q D */
2792			break;
2793		case ALGORITHM_LEFT_SYMMETRIC:
2794		case ALGORITHM_RIGHT_SYMMETRIC:
2795			if (sh->pd_idx == raid_disks-1)
2796				i--; /* Q D D D P */
2797			else {
2798				/* D D P Q D */
2799				if (i < sh->pd_idx)
2800					i += raid_disks;
2801				i -= (sh->pd_idx + 2);
2802			}
2803			break;
2804		case ALGORITHM_PARITY_0:
2805			i -= 2;
2806			break;
2807		case ALGORITHM_PARITY_N:
2808			break;
2809		case ALGORITHM_ROTATING_N_CONTINUE:
2810			/* Like left_symmetric, but P is before Q */
2811			if (sh->pd_idx == 0)
2812				i--;	/* P D D D Q */
2813			else {
2814				/* D D Q P D */
2815				if (i < sh->pd_idx)
2816					i += raid_disks;
2817				i -= (sh->pd_idx + 1);
2818			}
2819			break;
2820		case ALGORITHM_LEFT_ASYMMETRIC_6:
2821		case ALGORITHM_RIGHT_ASYMMETRIC_6:
2822			if (i > sh->pd_idx)
2823				i--;
2824			break;
2825		case ALGORITHM_LEFT_SYMMETRIC_6:
2826		case ALGORITHM_RIGHT_SYMMETRIC_6:
2827			if (i < sh->pd_idx)
2828				i += data_disks + 1;
2829			i -= (sh->pd_idx + 1);
2830			break;
2831		case ALGORITHM_PARITY_0_6:
2832			i -= 1;
2833			break;
2834		default:
2835			BUG();
2836		}
2837		break;
2838	}
2839
2840	chunk_number = stripe * data_disks + i;
2841	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2842
2843	check = raid5_compute_sector(conf, r_sector,
2844				     previous, &dummy1, &sh2);
2845	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2846		|| sh2.qd_idx != sh->qd_idx) {
2847		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
2848		       mdname(conf->mddev));
2849		return 0;
2850	}
2851	return r_sector;
2852}
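/*
 * Continuing the editorial example above, raid5_compute_blocknr() is the
 * inverse mapping: with the same layout, sh->sector = 232 and i = 2 give
 * chunk_offset = 104 and stripe = 1; the LEFT_SYMMETRIC case rewinds i to
 * (2 + 5) - (3 + 1) = 3, so chunk_number = 1 * 4 + 3 = 7 and
 * r_sector = 7 * 128 + 104 = 1000, which the final raid5_compute_sector()
 * call above re-checks.
 */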
2853
2854static void
2855schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2856			 int rcw, int expand)
2857{
2858	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
2859	struct r5conf *conf = sh->raid_conf;
2860	int level = conf->level;
2861
2862	if (rcw) {
2863
2864		for (i = disks; i--; ) {
2865			struct r5dev *dev = &sh->dev[i];
2866
2867			if (dev->towrite) {
2868				set_bit(R5_LOCKED, &dev->flags);
2869				set_bit(R5_Wantdrain, &dev->flags);
2870				if (!expand)
2871					clear_bit(R5_UPTODATE, &dev->flags);
2872				s->locked++;
2873			}
2874		}
2875		/* if we are not expanding this is a proper write request, and
2876		 * there will be bios with new data to be drained into the
2877		 * stripe cache
2878		 */
2879		if (!expand) {
2880			if (!s->locked)
2881				/* False alarm, nothing to do */
2882				return;
2883			sh->reconstruct_state = reconstruct_state_drain_run;
2884			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2885		} else
2886			sh->reconstruct_state = reconstruct_state_run;
2887
2888		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2889
2890		if (s->locked + conf->max_degraded == disks)
2891			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2892				atomic_inc(&conf->pending_full_writes);
2893	} else {
2894		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2895			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2896		BUG_ON(level == 6 &&
2897			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
2898			   test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
2899
2900		for (i = disks; i--; ) {
2901			struct r5dev *dev = &sh->dev[i];
2902			if (i == pd_idx || i == qd_idx)
2903				continue;
2904
2905			if (dev->towrite &&
2906			    (test_bit(R5_UPTODATE, &dev->flags) ||
2907			     test_bit(R5_Wantcompute, &dev->flags))) {
2908				set_bit(R5_Wantdrain, &dev->flags);
2909				set_bit(R5_LOCKED, &dev->flags);
2910				clear_bit(R5_UPTODATE, &dev->flags);
2911				s->locked++;
2912			}
2913		}
2914		if (!s->locked)
2915			/* False alarm - nothing to do */
2916			return;
2917		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2918		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2919		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2920		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2921	}
2922
2923	/* keep the parity disk(s) locked while asynchronous operations
2924	 * are in flight
2925	 */
2926	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2927	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2928	s->locked++;
2929
2930	if (level == 6) {
2931		int qd_idx = sh->qd_idx;
2932		struct r5dev *dev = &sh->dev[qd_idx];
2933
2934		set_bit(R5_LOCKED, &dev->flags);
2935		clear_bit(R5_UPTODATE, &dev->flags);
2936		s->locked++;
2937	}
2938
2939	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2940		__func__, (unsigned long long)sh->sector,
2941		s->locked, s->ops_request);
2942}
2943
2944/*
2945 * Each stripe/dev can have one or more bios attached.
2946 * toread/towrite point to the first in a chain.
2947 * The bi_next chain must be in order.
2948 */
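/*
 * Editorial illustration (assuming 512-byte sectors and a 4KiB
 * STRIPE_SIZE): two 2KiB writes landing in the same stripe unit of one
 * device, say sectors 96..99 and 100..103, end up with towrite pointing
 * at the sector-96 bio and its bi_next pointing at the sector-100 bio;
 * a third bio overlapping either range makes add_stripe_bio() bail out
 * with R5_Overlap set so the caller waits and retries.
 */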
2949static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
2950			  int forwrite, int previous)
2951{
2952	struct bio **bip;
2953	struct r5conf *conf = sh->raid_conf;
2954	int firstwrite=0;
2955
2956	pr_debug("adding bi b#%llu to stripe s#%llu\n",
2957		(unsigned long long)bi->bi_iter.bi_sector,
2958		(unsigned long long)sh->sector);
2959
2960	/*
2961	 * If several bios share a stripe, the bio bi_phys_segments field
2962	 * acts as a reference count to avoid races. The reference count
2963	 * should already have been increased before this function is called
2964	 * (for example, in raid5_make_request()), so other bios sharing this
2965	 * stripe will not free the stripe. If a stripe is owned by a single
2966	 * bio, the stripe lock will protect it.
2967	 */
2968	spin_lock_irq(&sh->stripe_lock);
2969	/* Don't allow new IO added to stripes in batch list */
2970	if (sh->batch_head)
2971		goto overlap;
2972	if (forwrite) {
2973		bip = &sh->dev[dd_idx].towrite;
2974		if (*bip == NULL)
2975			firstwrite = 1;
2976	} else
2977		bip = &sh->dev[dd_idx].toread;
2978	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
2979		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
2980			goto overlap;
2981		bip = & (*bip)->bi_next;
2982	}
2983	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
2984		goto overlap;
2985
2986	if (!forwrite || previous)
2987		clear_bit(STRIPE_BATCH_READY, &sh->state);
2988
2989	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2990	if (*bip)
2991		bi->bi_next = *bip;
2992	*bip = bi;
2993	raid5_inc_bi_active_stripes(bi);
2994
2995	if (forwrite) {
2996		/* check if page is covered */
2997		sector_t sector = sh->dev[dd_idx].sector;
2998		for (bi=sh->dev[dd_idx].towrite;
2999		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3000			     bi && bi->bi_iter.bi_sector <= sector;
3001		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3002			if (bio_end_sector(bi) >= sector)
3003				sector = bio_end_sector(bi);
3004		}
3005		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3006			if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3007				sh->overwrite_disks++;
3008	}
3009
3010	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3011		(unsigned long long)(*bip)->bi_iter.bi_sector,
3012		(unsigned long long)sh->sector, dd_idx);
3013
3014	if (conf->mddev->bitmap && firstwrite) {
3015		/* Cannot hold spinlock over bitmap_startwrite,
3016		 * but must ensure this isn't added to a batch until
3017		 * we have added to the bitmap and set bm_seq.
3018		 * So set STRIPE_BITMAP_PENDING to prevent
3019		 * batching.
3020		 * If multiple add_stripe_bio() calls race here they
3021		 * must all set STRIPE_BITMAP_PENDING.  So only the first one
3022		 * to complete "bitmap_startwrite" gets to set
3023		 * STRIPE_BIT_DELAY.  This is important as once a stripe
3024		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
3025		 * any more.
3026		 */
3027		set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3028		spin_unlock_irq(&sh->stripe_lock);
3029		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3030				  STRIPE_SECTORS, 0);
3031		spin_lock_irq(&sh->stripe_lock);
3032		clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3033		if (!sh->batch_head) {
3034			sh->bm_seq = conf->seq_flush+1;
3035			set_bit(STRIPE_BIT_DELAY, &sh->state);
3036		}
3037	}
3038	spin_unlock_irq(&sh->stripe_lock);
3039
3040	if (stripe_can_batch(sh))
3041		stripe_add_to_batch_list(conf, sh);
3042	return 1;
3043
3044 overlap:
3045	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3046	spin_unlock_irq(&sh->stripe_lock);
3047	return 0;
3048}
3049
3050static void end_reshape(struct r5conf *conf);
3051
3052static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3053			    struct stripe_head *sh)
3054{
3055	int sectors_per_chunk =
3056		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3057	int dd_idx;
3058	int chunk_offset = sector_div(stripe, sectors_per_chunk);
3059	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3060
3061	raid5_compute_sector(conf,
3062			     stripe * (disks - conf->max_degraded)
3063			     *sectors_per_chunk + chunk_offset,
3064			     previous,
3065			     &dd_idx, sh);
3066}
3067
3068static void
3069handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3070				struct stripe_head_state *s, int disks,
3071				struct bio_list *return_bi)
3072{
3073	int i;
3074	BUG_ON(sh->batch_head);
3075	for (i = disks; i--; ) {
3076		struct bio *bi;
3077		int bitmap_end = 0;
3078
3079		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3080			struct md_rdev *rdev;
3081			rcu_read_lock();
3082			rdev = rcu_dereference(conf->disks[i].rdev);
3083			if (rdev && test_bit(In_sync, &rdev->flags))
3084				atomic_inc(&rdev->nr_pending);
3085			else
3086				rdev = NULL;
3087			rcu_read_unlock();
3088			if (rdev) {
3089				if (!rdev_set_badblocks(
3090					    rdev,
3091					    sh->sector,
3092					    STRIPE_SECTORS, 0))
3093					md_error(conf->mddev, rdev);
3094				rdev_dec_pending(rdev, conf->mddev);
3095			}
3096		}
3097		spin_lock_irq(&sh->stripe_lock);
3098		/* fail all writes first */
3099		bi = sh->dev[i].towrite;
3100		sh->dev[i].towrite = NULL;
3101		sh->overwrite_disks = 0;
3102		spin_unlock_irq(&sh->stripe_lock);
3103		if (bi)
3104			bitmap_end = 1;
3105
3106		r5l_stripe_write_finished(sh);
3107
3108		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3109			wake_up(&conf->wait_for_overlap);
3110
3111		while (bi && bi->bi_iter.bi_sector <
3112			sh->dev[i].sector + STRIPE_SECTORS) {
3113			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3114
3115			bi->bi_error = -EIO;
3116			if (!raid5_dec_bi_active_stripes(bi)) {
3117				md_write_end(conf->mddev);
3118				bio_list_add(return_bi, bi);
3119			}
3120			bi = nextbi;
3121		}
3122		if (bitmap_end)
3123			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3124				STRIPE_SECTORS, 0, 0);
3125		bitmap_end = 0;
3126		/* and fail all 'written' */
3127		bi = sh->dev[i].written;
3128		sh->dev[i].written = NULL;
3129		if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3130			WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3131			sh->dev[i].page = sh->dev[i].orig_page;
3132		}
3133
3134		if (bi) bitmap_end = 1;
3135		while (bi && bi->bi_iter.bi_sector <
3136		       sh->dev[i].sector + STRIPE_SECTORS) {
3137			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3138
3139			bi->bi_error = -EIO;
3140			if (!raid5_dec_bi_active_stripes(bi)) {
3141				md_write_end(conf->mddev);
3142				bio_list_add(return_bi, bi);
3143			}
3144			bi = bi2;
3145		}
3146
3147		/* fail any reads if this device is non-operational and
3148		 * the data has not reached the cache yet.
3149		 */
3150		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3151		    s->failed > conf->max_degraded &&
3152		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3153		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
3154			spin_lock_irq(&sh->stripe_lock);
3155			bi = sh->dev[i].toread;
3156			sh->dev[i].toread = NULL;
3157			spin_unlock_irq(&sh->stripe_lock);
3158			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3159				wake_up(&conf->wait_for_overlap);
3160			if (bi)
3161				s->to_read--;
3162			while (bi && bi->bi_iter.bi_sector <
3163			       sh->dev[i].sector + STRIPE_SECTORS) {
3164				struct bio *nextbi =
3165					r5_next_bio(bi, sh->dev[i].sector);
3166
3167				bi->bi_error = -EIO;
3168				if (!raid5_dec_bi_active_stripes(bi))
3169					bio_list_add(return_bi, bi);
3170				bi = nextbi;
3171			}
3172		}
3173		if (bitmap_end)
3174			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3175					STRIPE_SECTORS, 0, 0);
3176		/* If we were in the middle of a write the parity block might
3177		 * still be locked - so just clear all R5_LOCKED flags
3178		 */
3179		clear_bit(R5_LOCKED, &sh->dev[i].flags);
3180	}
3181	s->to_write = 0;
3182	s->written = 0;
3183
3184	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3185		if (atomic_dec_and_test(&conf->pending_full_writes))
3186			md_wakeup_thread(conf->mddev->thread);
3187}
3188
3189static void
3190handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3191		   struct stripe_head_state *s)
3192{
3193	int abort = 0;
3194	int i;
3195
3196	BUG_ON(sh->batch_head);
3197	clear_bit(STRIPE_SYNCING, &sh->state);
3198	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3199		wake_up(&conf->wait_for_overlap);
3200	s->syncing = 0;
3201	s->replacing = 0;
3202	/* There is nothing more to do for sync/check/repair.
3203	 * Don't even need to abort as that is handled elsewhere
3204	 * if needed, and not always wanted e.g. if there is a known
3205	 * bad block here.
3206	 * For recover/replace we need to record a bad block on all
3207	 * non-sync devices, or abort the recovery
3208	 */
3209	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3210		/* During recovery devices cannot be removed, so
3211		 * locking and refcounting of rdevs is not needed
3212		 */
3213		for (i = 0; i < conf->raid_disks; i++) {
3214			struct md_rdev *rdev = conf->disks[i].rdev;
3215			if (rdev
3216			    && !test_bit(Faulty, &rdev->flags)
3217			    && !test_bit(In_sync, &rdev->flags)
3218			    && !rdev_set_badblocks(rdev, sh->sector,
3219						   STRIPE_SECTORS, 0))
3220				abort = 1;
3221			rdev = conf->disks[i].replacement;
3222			if (rdev
3223			    && !test_bit(Faulty, &rdev->flags)
3224			    && !test_bit(In_sync, &rdev->flags)
3225			    && !rdev_set_badblocks(rdev, sh->sector,
3226						   STRIPE_SECTORS, 0))
3227				abort = 1;
3228		}
3229		if (abort)
3230			conf->recovery_disabled =
3231				conf->mddev->recovery_disabled;
3232	}
3233	md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3234}
3235
3236static int want_replace(struct stripe_head *sh, int disk_idx)
3237{
3238	struct md_rdev *rdev;
3239	int rv = 0;
3240	/* Doing recovery so rcu locking not required */
3241	rdev = sh->raid_conf->disks[disk_idx].replacement;
3242	if (rdev
3243	    && !test_bit(Faulty, &rdev->flags)
3244	    && !test_bit(In_sync, &rdev->flags)
3245	    && (rdev->recovery_offset <= sh->sector
3246		|| rdev->mddev->recovery_cp <= sh->sector))
3247		rv = 1;
3248
3249	return rv;
3250}
3251
3252/* fetch_block - checks the given member device to see if its data needs
3253 * to be read or computed to satisfy a request.
3254 *
3255 * Returns 1 when no more member devices need to be checked, otherwise returns
3256 * 0 to tell the loop in handle_stripe_fill to continue
3257 */
3258
3259static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3260			   int disk_idx, int disks)
3261{
3262	struct r5dev *dev = &sh->dev[disk_idx];
3263	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3264				  &sh->dev[s->failed_num[1]] };
3265	int i;
3266
3267
3268	if (test_bit(R5_LOCKED, &dev->flags) ||
3269	    test_bit(R5_UPTODATE, &dev->flags))
3270		/* No point reading this as we already have it or have
3271		 * decided to get it.
3272		 */
3273		return 0;
3274
3275	if (dev->toread ||
3276	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3277		/* We need this block to directly satisfy a request */
3278		return 1;
3279
3280	if (s->syncing || s->expanding ||
3281	    (s->replacing && want_replace(sh, disk_idx)))
3282		/* When syncing or expanding we read everything.
3283		 * When replacing, we need the replaced block.
3284		 */
3285		return 1;
3286
3287	if ((s->failed >= 1 && fdev[0]->toread) ||
3288	    (s->failed >= 2 && fdev[1]->toread))
3289		/* If we want to read from a failed device, then
3290		 * we need to actually read every other device.
3291		 */
3292		return 1;
3293
3294	/* Sometimes neither read-modify-write nor reconstruct-write
3295	 * cycles can work.  In those cases we read every block we
3296	 * can.  Then the parity-update is certain to have enough to
3297	 * work with.
3298	 * This can only be a problem when we need to write something,
3299	 * and some device has failed.  If either of those tests
3300	 * fails, we need look no further.
3301	 */
3302	if (!s->failed || !s->to_write)
3303		return 0;
3304
3305	if (test_bit(R5_Insync, &dev->flags) &&
3306	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3307		/* Pre-reads are not permitted until after a short delay
3308		 * to gather multiple requests.  However if this
3309		 * device is not Insync, the block could only be computed
3310		 * and there is no need to delay that.
3311		 */
3312		return 0;
3313
3314	for (i = 0; i < s->failed && i < 2; i++) {
3315		if (fdev[i]->towrite &&
3316		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3317		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3318			/* If we have a partial write to a failed
3319			 * device, then we will need to reconstruct
3320			 * the content of that device, so all other
3321			 * devices must be read.
3322			 */
3323			return 1;
3324	}
3325
3326	/* If we are forced to do a reconstruct-write, either because
3327	 * the current RAID6 implementation only supports that, or
3328	 * because parity cannot be trusted and we are currently
3329	 * recovering it, there is extra need to be careful.
3330	 * If one of the devices that we would need to read, because
3331	 * it is not being overwritten (and maybe not written at all)
3332	 * is missing/faulty, then we need to read everything we can.
3333	 */
3334	if (sh->raid_conf->level != 6 &&
3335	    sh->sector < sh->raid_conf->mddev->recovery_cp)
3336		/* reconstruct-write isn't being forced */
3337		return 0;
3338	for (i = 0; i < s->failed && i < 2; i++) {
3339		if (s->failed_num[i] != sh->pd_idx &&
3340		    s->failed_num[i] != sh->qd_idx &&
3341		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3342		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3343			return 1;
3344	}
3345
3346	return 0;
3347}
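/*
 * Editorial example: on a 4-drive RAID5 that has lost one data drive, a
 * read aimed at the failed device makes need_this_block() return 1 for
 * every surviving device (the "failed device has toread" case above),
 * because the missing block can only be rebuilt by xor'ing the other
 * three blocks of the stripe.
 */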
3348
3349static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3350		       int disk_idx, int disks)
3351{
3352	struct r5dev *dev = &sh->dev[disk_idx];
3353
3354	/* is the data in this block needed, and can we get it? */
3355	if (need_this_block(sh, s, disk_idx, disks)) {
3356		/* we would like to get this block, possibly by computing it,
3357		 * otherwise read it if the backing disk is insync
3358		 */
3359		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3360		BUG_ON(test_bit(R5_Wantread, &dev->flags));
3361		BUG_ON(sh->batch_head);
3362		if ((s->uptodate == disks - 1) &&
3363		    (s->failed && (disk_idx == s->failed_num[0] ||
3364				   disk_idx == s->failed_num[1]))) {
3365			/* the disk has failed and we're requested to fetch it,
3366			 * so compute it
3367			 */
3368			pr_debug("Computing stripe %llu block %d\n",
3369			       (unsigned long long)sh->sector, disk_idx);
3370			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3371			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3372			set_bit(R5_Wantcompute, &dev->flags);
3373			sh->ops.target = disk_idx;
3374			sh->ops.target2 = -1; /* no 2nd target */
3375			s->req_compute = 1;
3376			/* Careful: from this point on 'uptodate' is in the eye
3377			 * of raid_run_ops which services 'compute' operations
3378			 * before writes. R5_Wantcompute flags a block that will
3379			 * be R5_UPTODATE by the time it is needed for a
3380			 * subsequent operation.
3381			 */
3382			s->uptodate++;
3383			return 1;
3384		} else if (s->uptodate == disks-2 && s->failed >= 2) {
3385			/* Computing 2-failure is *very* expensive; only
3386			 * do it if failed >= 2
3387			 */
3388			int other;
3389			for (other = disks; other--; ) {
3390				if (other == disk_idx)
3391					continue;
3392				if (!test_bit(R5_UPTODATE,
3393				      &sh->dev[other].flags))
3394					break;
3395			}
3396			BUG_ON(other < 0);
3397			pr_debug("Computing stripe %llu blocks %d,%d\n",
3398			       (unsigned long long)sh->sector,
3399			       disk_idx, other);
3400			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3401			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3402			set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3403			set_bit(R5_Wantcompute, &sh->dev[other].flags);
3404			sh->ops.target = disk_idx;
3405			sh->ops.target2 = other;
3406			s->uptodate += 2;
3407			s->req_compute = 1;
3408			return 1;
3409		} else if (test_bit(R5_Insync, &dev->flags)) {
3410			set_bit(R5_LOCKED, &dev->flags);
3411			set_bit(R5_Wantread, &dev->flags);
3412			s->locked++;
3413			pr_debug("Reading block %d (sync=%d)\n",
3414				disk_idx, s->syncing);
3415		}
3416	}
3417
3418	return 0;
3419}
3420
3421/**
3422 * handle_stripe_fill - read or compute data to satisfy pending requests.
3423 */
3424static void handle_stripe_fill(struct stripe_head *sh,
3425			       struct stripe_head_state *s,
3426			       int disks)
3427{
3428	int i;
3429
3430	/* look for blocks to read/compute, skip this if a compute
3431	 * is already in flight, or if the stripe contents are in the
3432	 * midst of changing due to a write
3433	 */
3434	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3435	    !sh->reconstruct_state)
3436		for (i = disks; i--; )
3437			if (fetch_block(sh, s, i, disks))
3438				break;
3439	set_bit(STRIPE_HANDLE, &sh->state);
3440}
3441
3442static void break_stripe_batch_list(struct stripe_head *head_sh,
3443				    unsigned long handle_flags);
3444/* handle_stripe_clean_event
3445 * any written block on an uptodate or failed drive can be returned.
3446 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
3447 * never LOCKED, so we don't need to test 'failed' directly.
3448 */
3449static void handle_stripe_clean_event(struct r5conf *conf,
3450	struct stripe_head *sh, int disks, struct bio_list *return_bi)
3451{
3452	int i;
3453	struct r5dev *dev;
3454	int discard_pending = 0;
3455	struct stripe_head *head_sh = sh;
3456	bool do_endio = false;
3457
3458	for (i = disks; i--; )
3459		if (sh->dev[i].written) {
3460			dev = &sh->dev[i];
3461			if (!test_bit(R5_LOCKED, &dev->flags) &&
3462			    (test_bit(R5_UPTODATE, &dev->flags) ||
3463			     test_bit(R5_Discard, &dev->flags) ||
3464			     test_bit(R5_SkipCopy, &dev->flags))) {
3465				/* We can return any write requests */
3466				struct bio *wbi, *wbi2;
3467				pr_debug("Return write for disc %d\n", i);
3468				if (test_and_clear_bit(R5_Discard, &dev->flags))
3469					clear_bit(R5_UPTODATE, &dev->flags);
3470				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3471					WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3472				}
3473				do_endio = true;
3474
3475returnbi:
3476				dev->page = dev->orig_page;
3477				wbi = dev->written;
3478				dev->written = NULL;
3479				while (wbi && wbi->bi_iter.bi_sector <
3480					dev->sector + STRIPE_SECTORS) {
3481					wbi2 = r5_next_bio(wbi, dev->sector);
3482					if (!raid5_dec_bi_active_stripes(wbi)) {
3483						md_write_end(conf->mddev);
3484						bio_list_add(return_bi, wbi);
3485					}
3486					wbi = wbi2;
3487				}
3488				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3489						STRIPE_SECTORS,
3490					 !test_bit(STRIPE_DEGRADED, &sh->state),
3491						0);
3492				if (head_sh->batch_head) {
3493					sh = list_first_entry(&sh->batch_list,
3494							      struct stripe_head,
3495							      batch_list);
3496					if (sh != head_sh) {
3497						dev = &sh->dev[i];
3498						goto returnbi;
3499					}
3500				}
3501				sh = head_sh;
3502				dev = &sh->dev[i];
3503			} else if (test_bit(R5_Discard, &dev->flags))
3504				discard_pending = 1;
3505		}
3506
3507	r5l_stripe_write_finished(sh);
3508
3509	if (!discard_pending &&
3510	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3511		int hash;
3512		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3513		clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3514		if (sh->qd_idx >= 0) {
3515			clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3516			clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3517		}
3518		/* now that discard is done we can proceed with any sync */
3519		clear_bit(STRIPE_DISCARD, &sh->state);
3520		/*
3521		 * SCSI discard will change some bio fields and the stripe has
3522		 * no updated data, so remove it from hash list and the stripe
3523		 * will be reinitialized
3524		 */
3525unhash:
3526		hash = sh->hash_lock_index;
3527		spin_lock_irq(conf->hash_locks + hash);
3528		remove_hash(sh);
3529		spin_unlock_irq(conf->hash_locks + hash);
3530		if (head_sh->batch_head) {
3531			sh = list_first_entry(&sh->batch_list,
3532					      struct stripe_head, batch_list);
3533			if (sh != head_sh)
3534				goto unhash;
3535		}
3536		sh = head_sh;
3537
3538		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3539			set_bit(STRIPE_HANDLE, &sh->state);
3540
3541	}
3542
3543	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3544		if (atomic_dec_and_test(&conf->pending_full_writes))
3545			md_wakeup_thread(conf->mddev->thread);
3546
3547	if (head_sh->batch_head && do_endio)
3548		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3549}
3550
3551static void handle_stripe_dirtying(struct r5conf *conf,
3552				   struct stripe_head *sh,
3553				   struct stripe_head_state *s,
3554				   int disks)
3555{
3556	int rmw = 0, rcw = 0, i;
3557	sector_t recovery_cp = conf->mddev->recovery_cp;
3558
3559	/* Check whether resync is now happening or should start.
3560	 * If yes, then the array is dirty (after unclean shutdown or
3561	 * initial creation), so parity in some stripes might be inconsistent.
3562	 * In this case, we need to always do reconstruct-write, to ensure
3563	 * that in case of drive failure or read-error correction, we
3564	 * generate correct data from the parity.
3565	 */
3566	if (conf->rmw_level == PARITY_DISABLE_RMW ||
3567	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3568	     s->failed == 0)) {
3569		/* Calculate the real rcw later - for now make it
3570		 * look like rcw is cheaper
3571		 */
3572		rcw = 1; rmw = 2;
3573		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3574			 conf->rmw_level, (unsigned long long)recovery_cp,
3575			 (unsigned long long)sh->sector);
3576	} else for (i = disks; i--; ) {
3577		/* would I have to read this buffer for read_modify_write */
3578		struct r5dev *dev = &sh->dev[i];
3579		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
3580		    !test_bit(R5_LOCKED, &dev->flags) &&
3581		    !(test_bit(R5_UPTODATE, &dev->flags) ||
3582		      test_bit(R5_Wantcompute, &dev->flags))) {
3583			if (test_bit(R5_Insync, &dev->flags))
3584				rmw++;
3585			else
3586				rmw += 2*disks;  /* cannot read it */
3587		}
3588		/* Would I have to read this buffer for reconstruct_write */
3589		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3590		    i != sh->pd_idx && i != sh->qd_idx &&
3591		    !test_bit(R5_LOCKED, &dev->flags) &&
3592		    !(test_bit(R5_UPTODATE, &dev->flags) ||
3593		    test_bit(R5_Wantcompute, &dev->flags))) {
3594			if (test_bit(R5_Insync, &dev->flags))
3595				rcw++;
3596			else
3597				rcw += 2*disks;
3598		}
3599	}
3600	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
3601		(unsigned long long)sh->sector, rmw, rcw);
3602	set_bit(STRIPE_HANDLE, &sh->state);
3603	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
3604		/* prefer read-modify-write, but need to get some data */
3605		if (conf->mddev->queue)
3606			blk_add_trace_msg(conf->mddev->queue,
3607					  "raid5 rmw %llu %d",
3608					  (unsigned long long)sh->sector, rmw);
3609		for (i = disks; i--; ) {
3610			struct r5dev *dev = &sh->dev[i];
3611			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
3612			    !test_bit(R5_LOCKED, &dev->flags) &&
3613			    !(test_bit(R5_UPTODATE, &dev->flags) ||
3614			    test_bit(R5_Wantcompute, &dev->flags)) &&
3615			    test_bit(R5_Insync, &dev->flags)) {
3616				if (test_bit(STRIPE_PREREAD_ACTIVE,
3617					     &sh->state)) {
3618					pr_debug("Read_old block %d for r-m-w\n",
3619						 i);
3620					set_bit(R5_LOCKED, &dev->flags);
3621					set_bit(R5_Wantread, &dev->flags);
3622					s->locked++;
3623				} else {
3624					set_bit(STRIPE_DELAYED, &sh->state);
3625					set_bit(STRIPE_HANDLE, &sh->state);
3626				}
3627			}
3628		}
3629	}
3630	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
3631		/* want reconstruct write, but need to get some data */
3632		int qread = 0;
3633		rcw = 0;
3634		for (i = disks; i--; ) {
3635			struct r5dev *dev = &sh->dev[i];
3636			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3637			    i != sh->pd_idx && i != sh->qd_idx &&
3638			    !test_bit(R5_LOCKED, &dev->flags) &&
3639			    !(test_bit(R5_UPTODATE, &dev->flags) ||
3640			      test_bit(R5_Wantcompute, &dev->flags))) {
3641				rcw++;
3642				if (test_bit(R5_Insync, &dev->flags) &&
3643				    test_bit(STRIPE_PREREAD_ACTIVE,
3644					     &sh->state)) {
3645					pr_debug("Read_old block "
3646						"%d for Reconstruct\n", i);
3647					set_bit(R5_LOCKED, &dev->flags);
3648					set_bit(R5_Wantread, &dev->flags);
3649					s->locked++;
3650					qread++;
3651				} else {
3652					set_bit(STRIPE_DELAYED, &sh->state);
3653					set_bit(STRIPE_HANDLE, &sh->state);
3654				}
3655			}
3656		}
3657		if (rcw && conf->mddev->queue)
3658			blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
3659					  (unsigned long long)sh->sector,
3660					  rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
3661	}
3662
3663	if (rcw > disks && rmw > disks &&
3664	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3665		set_bit(STRIPE_DELAYED, &sh->state);
3666
3667	/* now if nothing is locked, and if we have enough data,
3668	 * we can start a write request
3669	 */
3670	/* since handle_stripe can be called at any time we need to handle the
3671	 * case where a compute block operation has been submitted and then a
3672	 * subsequent call wants to start a write request.  raid_run_ops only
3673	 * handles the case where compute block and reconstruct are requested
3674	 * simultaneously.  If this is not the case then new writes need to be
3675	 * held off until the compute completes.
3676	 */
3677	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
3678	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
3679	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
3680		schedule_reconstruction(sh, s, rcw == 0, 0);
3681}
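/*
 * Editor's note (illustration only, not part of the original driver):
 * a worked example of the rmw/rcw costing above.  Assume a 5-drive
 * RAID-5 stripe (4 data blocks + P), every device In_sync, nothing
 * cached, and a write that fully overwrites one data block:
 *
 *   read-modify-write  needs the old copy of that block plus old P -> rmw = 2
 *   reconstruct-write  needs the three data blocks not overwritten -> rcw = 3
 *
 * rmw < rcw, so unless rmw_level forbids it the code issues Read_old
 * for the target block and the parity and later computes
 *   P_new = P_old xor D_old xor D_new.
 *
 * Below is a self-contained sketch of the same costing.  The helper and
 * its parameters are hypothetical; it ignores R5_LOCKED/R5_Wantcompute
 * and the 2*disks penalty applied to devices that are not In_sync.
 */
#if 0	/* editorial illustration, never compiled */
static void toy_count_rmw_rcw(const bool *towrite, const bool *overwrite,
			      const bool *uptodate, int disks, int pd_idx,
			      int *rmw, int *rcw)
{
	int i;

	*rmw = *rcw = 0;
	for (i = 0; i < disks; i++) {
		/* rmw must read the old data being rewritten plus old parity */
		if ((towrite[i] || i == pd_idx) && !uptodate[i])
			(*rmw)++;
		/* rcw must read every data block not fully overwritten */
		if (!overwrite[i] && i != pd_idx && !uptodate[i])
			(*rcw)++;
	}
}
#endif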
3682
3683static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
3684				struct stripe_head_state *s, int disks)
3685{
3686	struct r5dev *dev = NULL;
3687
3688	BUG_ON(sh->batch_head);
3689	set_bit(STRIPE_HANDLE, &sh->state);
3690
3691	switch (sh->check_state) {
3692	case check_state_idle:
3693		/* start a new check operation if there are no failures */
3694		if (s->failed == 0) {
3695			BUG_ON(s->uptodate != disks);
3696			sh->check_state = check_state_run;
3697			set_bit(STRIPE_OP_CHECK, &s->ops_request);
3698			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3699			s->uptodate--;
3700			break;
3701		}
3702		dev = &sh->dev[s->failed_num[0]];
3703		/* fall through */
3704	case check_state_compute_result:
3705		sh->check_state = check_state_idle;
3706		if (!dev)
3707			dev = &sh->dev[sh->pd_idx];
3708
3709		/* check that a write has not made the stripe insync */
3710		if (test_bit(STRIPE_INSYNC, &sh->state))
3711			break;
3712
3713		/* either failed parity check, or recovery is happening */
3714		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3715		BUG_ON(s->uptodate != disks);
3716
3717		set_bit(R5_LOCKED, &dev->flags);
3718		s->locked++;
3719		set_bit(R5_Wantwrite, &dev->flags);
3720
3721		clear_bit(STRIPE_DEGRADED, &sh->state);
3722		set_bit(STRIPE_INSYNC, &sh->state);
3723		break;
3724	case check_state_run:
3725		break; /* we will be called again upon completion */
3726	case check_state_check_result:
3727		sh->check_state = check_state_idle;
3728
3729		/* if a failure occurred during the check operation, leave
3730		 * STRIPE_INSYNC not set and let the stripe be handled again
3731		 */
3732		if (s->failed)
3733			break;
3734
3735		/* handle a successful check operation, if parity is correct
3736		 * we are done.  Otherwise update the mismatch count and repair
3737		 * parity if !MD_RECOVERY_CHECK
3738		 */
3739		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
3740			/* parity is correct (on disc,
3741			 * not in buffer any more)
3742			 */
3743			set_bit(STRIPE_INSYNC, &sh->state);
3744		else {
3745			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
3746			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3747				/* don't try to repair!! */
3748				set_bit(STRIPE_INSYNC, &sh->state);
3749			else {
3750				sh->check_state = check_state_compute_run;
3751				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3752				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3753				set_bit(R5_Wantcompute,
3754					&sh->dev[sh->pd_idx].flags);
3755				sh->ops.target = sh->pd_idx;
3756				sh->ops.target2 = -1;
3757				s->uptodate++;
3758			}
3759		}
3760		break;
3761	case check_state_compute_run:
3762		break;
3763	default:
3764		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
3765		       __func__, sh->check_state,
3766		       (unsigned long long) sh->sector);
3767		BUG();
3768	}
3769}
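/*
 * Editor's note (not in the original source): typical flow of the state
 * machine above during a 'repair' pass over a healthy stripe whose
 * parity happens to be stale: check_state_idle schedules STRIPE_OP_CHECK
 * (check_state_run); the xor zero-sum completes and we re-enter in
 * check_state_check_result with SUM_CHECK_P_RESULT set, so
 * resync_mismatches is bumped by STRIPE_SECTORS and, because
 * MD_RECOVERY_CHECK is clear, P is recomputed via check_state_compute_run
 * and finally written out from check_state_compute_result.
 */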
3770
3771static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3772				  struct stripe_head_state *s,
3773				  int disks)
3774{
3775	int pd_idx = sh->pd_idx;
3776	int qd_idx = sh->qd_idx;
3777	struct r5dev *dev;
3778
3779	BUG_ON(sh->batch_head);
3780	set_bit(STRIPE_HANDLE, &sh->state);
3781
3782	BUG_ON(s->failed > 2);
3783
3784	/* Want to check and possibly repair P and Q.
3785	 * However there could be one 'failed' device, in which
3786	 * case we can only check one of them, possibly using the
3787	 * other to generate missing data
3788	 */
3789
3790	switch (sh->check_state) {
3791	case check_state_idle:
3792		/* start a new check operation if there are < 2 failures */
3793		if (s->failed == s->q_failed) {
3794			/* The only possible failed device holds Q, so it
3795			 * makes sense to check P (If anything else were failed,
3796			 * we would have used P to recreate it).
3797			 */
3798			sh->check_state = check_state_run;
3799		}
3800		if (!s->q_failed && s->failed < 2) {
3801			/* Q is not failed, and we didn't use it to generate
3802			 * anything, so it makes sense to check it
3803			 */
3804			if (sh->check_state == check_state_run)
3805				sh->check_state = check_state_run_pq;
3806			else
3807				sh->check_state = check_state_run_q;
3808		}
3809
3810		/* discard potentially stale zero_sum_result */
3811		sh->ops.zero_sum_result = 0;
3812
3813		if (sh->check_state == check_state_run) {
3814			/* async_xor_zero_sum destroys the contents of P */
3815			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3816			s->uptodate--;
3817		}
3818		if (sh->check_state >= check_state_run &&
3819		    sh->check_state <= check_state_run_pq) {
3820			/* async_syndrome_zero_sum preserves P and Q, so
3821			 * no need to mark them !uptodate here
3822			 */
3823			set_bit(STRIPE_OP_CHECK, &s->ops_request);
3824			break;
3825		}
3826
3827		/* we have 2-disk failure */
3828		BUG_ON(s->failed != 2);
3829		/* fall through */
3830	case check_state_compute_result:
3831		sh->check_state = check_state_idle;
3832
3833		/* check that a write has not made the stripe insync */
3834		if (test_bit(STRIPE_INSYNC, &sh->state))
3835			break;
3836
3837		/* now write out any block on a failed drive,
3838		 * or P or Q if they were recomputed
3839		 */
3840		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
3841		if (s->failed == 2) {
3842			dev = &sh->dev[s->failed_num[1]];
3843			s->locked++;
3844			set_bit(R5_LOCKED, &dev->flags);
3845			set_bit(R5_Wantwrite, &dev->flags);
3846		}
3847		if (s->failed >= 1) {
3848			dev = &sh->dev[s->failed_num[0]];
3849			s->locked++;
3850			set_bit(R5_LOCKED, &dev->flags);
3851			set_bit(R5_Wantwrite, &dev->flags);
3852		}
3853		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
3854			dev = &sh->dev[pd_idx];
3855			s->locked++;
3856			set_bit(R5_LOCKED, &dev->flags);
3857			set_bit(R5_Wantwrite, &dev->flags);
3858		}
3859		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
3860			dev = &sh->dev[qd_idx];
3861			s->locked++;
3862			set_bit(R5_LOCKED, &dev->flags);
3863			set_bit(R5_Wantwrite, &dev->flags);
3864		}
3865		clear_bit(STRIPE_DEGRADED, &sh->state);
3866
3867		set_bit(STRIPE_INSYNC, &sh->state);
3868		break;
3869	case check_state_run:
3870	case check_state_run_q:
3871	case check_state_run_pq:
3872		break; /* we will be called again upon completion */
3873	case check_state_check_result:
3874		sh->check_state = check_state_idle;
3875
3876		/* handle a successful check operation, if parity is correct
3877		 * we are done.  Otherwise update the mismatch count and repair
3878		 * parity if !MD_RECOVERY_CHECK
3879		 */
3880		if (sh->ops.zero_sum_result == 0) {
3881			/* both parities are correct */
3882			if (!s->failed)
3883				set_bit(STRIPE_INSYNC, &sh->state);
3884			else {
3885				/* in contrast to the raid5 case we can validate
3886				 * parity, but still have a failure to write
3887				 * back
3888				 */
3889				sh->check_state = check_state_compute_result;
3890				/* Returning at this point means that we may go
3891				 * off and bring p and/or q uptodate again so
3892				 * we make sure to check zero_sum_result again
3893				 * to verify if p or q need writeback
3894				 */
3895			}
3896		} else {
3897			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
3898			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3899				/* don't try to repair!! */
3900				set_bit(STRIPE_INSYNC, &sh->state);
3901			else {
3902				int *target = &sh->ops.target;
3903
3904				sh->ops.target = -1;
3905				sh->ops.target2 = -1;
3906				sh->check_state = check_state_compute_run;
3907				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3908				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3909				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
3910					set_bit(R5_Wantcompute,
3911						&sh->dev[pd_idx].flags);
3912					*target = pd_idx;
3913					target = &sh->ops.target2;
3914					s->uptodate++;
3915				}
3916				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
3917					set_bit(R5_Wantcompute,
3918						&sh->dev[qd_idx].flags);
3919					*target = qd_idx;
3920					s->uptodate++;
3921				}
3922			}
3923		}
3924		break;
3925	case check_state_compute_run:
3926		break;
3927	default:
3928		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
3929		       __func__, sh->check_state,
3930		       (unsigned long long) sh->sector);
3931		BUG();
3932	}
3933}
3934
3935static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
3936{
3937	int i;
3938
3939	/* We have read all the blocks in this stripe and now we need to
3940	 * copy some of them into a target stripe for expand.
3941	 */
3942	struct dma_async_tx_descriptor *tx = NULL;
3943	BUG_ON(sh->batch_head);
3944	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3945	for (i = 0; i < sh->disks; i++)
3946		if (i != sh->pd_idx && i != sh->qd_idx) {
3947			int dd_idx, j;
3948			struct stripe_head *sh2;
3949			struct async_submit_ctl submit;
3950
3951			sector_t bn = raid5_compute_blocknr(sh, i, 1);
3952			sector_t s = raid5_compute_sector(conf, bn, 0,
3953							  &dd_idx, NULL);
3954			sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
3955			if (sh2 == NULL)
3956				/* so far only the early blocks of this stripe
3957				 * have been requested.  When later blocks
3958				 * get requested, we will try again
3959				 */
3960				continue;
3961			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
3962			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
3963				/* must have already done this block */
3964				raid5_release_stripe(sh2);
3965				continue;
3966			}
3967
3968			/* place all the copies on one channel */
3969			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
3970			tx = async_memcpy(sh2->dev[dd_idx].page,
3971					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
3972					  &submit);
3973
3974			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
3975			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
3976			for (j = 0; j < conf->raid_disks; j++)
3977				if (j != sh2->pd_idx &&
3978				    j != sh2->qd_idx &&
3979				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
3980					break;
3981			if (j == conf->raid_disks) {
3982				set_bit(STRIPE_EXPAND_READY, &sh2->state);
3983				set_bit(STRIPE_HANDLE, &sh2->state);
3984			}
3985			raid5_release_stripe(sh2);
3986
3987		}
3988	/* done submitting copies, wait for them to complete */
3989	async_tx_quiesce(&tx);
3990}
3991
3992/*
3993 * handle_stripe - do things to a stripe.
3994 *
3995 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
3996 * state of various bits to see what needs to be done.
3997 * Possible results:
3998 *    return some read requests which now have data
3999 *    return some write requests which are safely on storage
4000 *    schedule a read on some buffers
4001 *    schedule a write of some buffers
4002 *    return confirmation of parity correctness
4003 *
4004 */
4005
4006static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4007{
4008	struct r5conf *conf = sh->raid_conf;
4009	int disks = sh->disks;
4010	struct r5dev *dev;
4011	int i;
4012	int do_recovery = 0;
4013
4014	memset(s, 0, sizeof(*s));
4015
4016	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4017	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4018	s->failed_num[0] = -1;
4019	s->failed_num[1] = -1;
4020	s->log_failed = r5l_log_disk_error(conf);
4021
4022	/* Now to look around and see what can be done */
4023	rcu_read_lock();
4024	for (i=disks; i--; ) {
4025		struct md_rdev *rdev;
4026		sector_t first_bad;
4027		int bad_sectors;
4028		int is_bad = 0;
4029
4030		dev = &sh->dev[i];
4031
4032		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4033			 i, dev->flags,
4034			 dev->toread, dev->towrite, dev->written);
4035		/* maybe we can reply to a read
4036		 *
4037		 * new wantfill requests are only permitted while
4038		 * ops_complete_biofill is guaranteed to be inactive
4039		 */
4040		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4041		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4042			set_bit(R5_Wantfill, &dev->flags);
4043
4044		/* now count some things */
4045		if (test_bit(R5_LOCKED, &dev->flags))
4046			s->locked++;
4047		if (test_bit(R5_UPTODATE, &dev->flags))
4048			s->uptodate++;
4049		if (test_bit(R5_Wantcompute, &dev->flags)) {
4050			s->compute++;
4051			BUG_ON(s->compute > 2);
4052		}
4053
4054		if (test_bit(R5_Wantfill, &dev->flags))
4055			s->to_fill++;
4056		else if (dev->toread)
4057			s->to_read++;
4058		if (dev->towrite) {
4059			s->to_write++;
4060			if (!test_bit(R5_OVERWRITE, &dev->flags))
4061				s->non_overwrite++;
4062		}
4063		if (dev->written)
4064			s->written++;
4065		/* Prefer to use the replacement for reads, but only
4066		 * if it is recovered enough and has no bad blocks.
4067		 */
4068		rdev = rcu_dereference(conf->disks[i].replacement);
4069		if (rdev && !test_bit(Faulty, &rdev->flags) &&
4070		    rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4071		    !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4072				 &first_bad, &bad_sectors))
4073			set_bit(R5_ReadRepl, &dev->flags);
4074		else {
4075			if (rdev && !test_bit(Faulty, &rdev->flags))
4076				set_bit(R5_NeedReplace, &dev->flags);
4077			else
4078				clear_bit(R5_NeedReplace, &dev->flags);
4079			rdev = rcu_dereference(conf->disks[i].rdev);
4080			clear_bit(R5_ReadRepl, &dev->flags);
4081		}
4082		if (rdev && test_bit(Faulty, &rdev->flags))
4083			rdev = NULL;
4084		if (rdev) {
4085			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4086					     &first_bad, &bad_sectors);
4087			if (s->blocked_rdev == NULL
4088			    && (test_bit(Blocked, &rdev->flags)
4089				|| is_bad < 0)) {
4090				if (is_bad < 0)
4091					set_bit(BlockedBadBlocks,
4092						&rdev->flags);
4093				s->blocked_rdev = rdev;
4094				atomic_inc(&rdev->nr_pending);
4095			}
4096		}
4097		clear_bit(R5_Insync, &dev->flags);
4098		if (!rdev)
4099			/* Not in-sync */;
4100		else if (is_bad) {
4101			/* also not in-sync */
4102			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4103			    test_bit(R5_UPTODATE, &dev->flags)) {
4104				/* treat as in-sync, but with a read error
4105				 * which we can now try to correct
4106				 */
4107				set_bit(R5_Insync, &dev->flags);
4108				set_bit(R5_ReadError, &dev->flags);
4109			}
4110		} else if (test_bit(In_sync, &rdev->flags))
4111			set_bit(R5_Insync, &dev->flags);
4112		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4113			/* in sync if before recovery_offset */
4114			set_bit(R5_Insync, &dev->flags);
4115		else if (test_bit(R5_UPTODATE, &dev->flags) &&
4116			 test_bit(R5_Expanded, &dev->flags))
4117			/* If we've reshaped into here, we assume it is Insync.
4118			 * We will shortly update recovery_offset to make
4119			 * it official.
4120			 */
4121			set_bit(R5_Insync, &dev->flags);
4122
4123		if (test_bit(R5_WriteError, &dev->flags)) {
4124			/* This flag does not apply to '.replacement',
4125			 * only to '.rdev', so make sure to check that */
4126			struct md_rdev *rdev2 = rcu_dereference(
4127				conf->disks[i].rdev);
4128			if (rdev2 == rdev)
4129				clear_bit(R5_Insync, &dev->flags);
4130			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4131				s->handle_bad_blocks = 1;
4132				atomic_inc(&rdev2->nr_pending);
4133			} else
4134				clear_bit(R5_WriteError, &dev->flags);
4135		}
4136		if (test_bit(R5_MadeGood, &dev->flags)) {
4137			/* This flag does not apply to '.replacement',
4138			 * only to '.rdev', so make sure to check that */
4139			struct md_rdev *rdev2 = rcu_dereference(
4140				conf->disks[i].rdev);
4141			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4142				s->handle_bad_blocks = 1;
4143				atomic_inc(&rdev2->nr_pending);
4144			} else
4145				clear_bit(R5_MadeGood, &dev->flags);
4146		}
4147		if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4148			struct md_rdev *rdev2 = rcu_dereference(
4149				conf->disks[i].replacement);
4150			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4151				s->handle_bad_blocks = 1;
4152				atomic_inc(&rdev2->nr_pending);
4153			} else
4154				clear_bit(R5_MadeGoodRepl, &dev->flags);
4155		}
4156		if (!test_bit(R5_Insync, &dev->flags)) {
4157			/* The ReadError flag will just be confusing now */
4158			clear_bit(R5_ReadError, &dev->flags);
4159			clear_bit(R5_ReWrite, &dev->flags);
4160		}
4161		if (test_bit(R5_ReadError, &dev->flags))
4162			clear_bit(R5_Insync, &dev->flags);
4163		if (!test_bit(R5_Insync, &dev->flags)) {
4164			if (s->failed < 2)
4165				s->failed_num[s->failed] = i;
4166			s->failed++;
4167			if (rdev && !test_bit(Faulty, &rdev->flags))
4168				do_recovery = 1;
4169		}
4170	}
4171	if (test_bit(STRIPE_SYNCING, &sh->state)) {
4172		/* If there is a failed device being replaced,
4173		 *     we must be recovering.
4174		 * else if we are after recovery_cp, we must be syncing
4175		 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4176		 * else we can only be replacing
4177		 * sync and recovery both need to read all devices, and so
4178		 * use the same flag.
4179		 */
4180		if (do_recovery ||
4181		    sh->sector >= conf->mddev->recovery_cp ||
4182		    test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4183			s->syncing = 1;
4184		else
4185			s->replacing = 1;
4186	}
4187	rcu_read_unlock();
4188}
4189
4190static int clear_batch_ready(struct stripe_head *sh)
4191{
4192	/* Return '1' if this is a member of a batch, or
4193	 * '0' if it is a lone stripe or a head which can now be
4194	 * handled.
4195	 */
4196	struct stripe_head *tmp;
4197	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4198		return (sh->batch_head && sh->batch_head != sh);
4199	spin_lock(&sh->stripe_lock);
4200	if (!sh->batch_head) {
4201		spin_unlock(&sh->stripe_lock);
4202		return 0;
4203	}
4204
4205	/*
4206	 * this stripe could have been added to a batch list before we
4207	 * checked BATCH_READY, so skip it
4208	 */
4209	if (sh->batch_head != sh) {
4210		spin_unlock(&sh->stripe_lock);
4211		return 1;
4212	}
4213	spin_lock(&sh->batch_lock);
4214	list_for_each_entry(tmp, &sh->batch_list, batch_list)
4215		clear_bit(STRIPE_BATCH_READY, &tmp->state);
4216	spin_unlock(&sh->batch_lock);
4217	spin_unlock(&sh->stripe_lock);
4218
4219	/*
4220	 * BATCH_READY is cleared, no new stripes can be added.
4221	 * batch_list can be accessed without lock
4222	 */
4223	return 0;
4224}
4225
4226static void break_stripe_batch_list(struct stripe_head *head_sh,
4227				    unsigned long handle_flags)
4228{
4229	struct stripe_head *sh, *next;
4230	int i;
4231	int do_wakeup = 0;
4232
4233	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4234
4235		list_del_init(&sh->batch_list);
4236
4237		WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4238					  (1 << STRIPE_SYNCING) |
4239					  (1 << STRIPE_REPLACED) |
4240					  (1 << STRIPE_DELAYED) |
4241					  (1 << STRIPE_BIT_DELAY) |
4242					  (1 << STRIPE_FULL_WRITE) |
4243					  (1 << STRIPE_BIOFILL_RUN) |
4244					  (1 << STRIPE_COMPUTE_RUN)  |
4245					  (1 << STRIPE_OPS_REQ_PENDING) |
4246					  (1 << STRIPE_DISCARD) |
4247					  (1 << STRIPE_BATCH_READY) |
4248					  (1 << STRIPE_BATCH_ERR) |
4249					  (1 << STRIPE_BITMAP_PENDING)),
4250			"stripe state: %lx\n", sh->state);
4251		WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4252					      (1 << STRIPE_REPLACED)),
4253			"head stripe state: %lx\n", head_sh->state);
4254
4255		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4256					    (1 << STRIPE_PREREAD_ACTIVE) |
4257					    (1 << STRIPE_DEGRADED)),
4258			      head_sh->state & (1 << STRIPE_INSYNC));
4259
4260		sh->check_state = head_sh->check_state;
4261		sh->reconstruct_state = head_sh->reconstruct_state;
4262		for (i = 0; i < sh->disks; i++) {
4263			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4264				do_wakeup = 1;
4265			sh->dev[i].flags = head_sh->dev[i].flags &
4266				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
4267		}
4268		spin_lock_irq(&sh->stripe_lock);
4269		sh->batch_head = NULL;
4270		spin_unlock_irq(&sh->stripe_lock);
4271		if (handle_flags == 0 ||
4272		    sh->state & handle_flags)
4273			set_bit(STRIPE_HANDLE, &sh->state);
4274		raid5_release_stripe(sh);
4275	}
4276	spin_lock_irq(&head_sh->stripe_lock);
4277	head_sh->batch_head = NULL;
4278	spin_unlock_irq(&head_sh->stripe_lock);
4279	for (i = 0; i < head_sh->disks; i++)
4280		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4281			do_wakeup = 1;
4282	if (head_sh->state & handle_flags)
4283		set_bit(STRIPE_HANDLE, &head_sh->state);
4284
4285	if (do_wakeup)
4286		wake_up(&head_sh->raid_conf->wait_for_overlap);
4287}
4288
4289static void handle_stripe(struct stripe_head *sh)
4290{
4291	struct stripe_head_state s;
4292	struct r5conf *conf = sh->raid_conf;
4293	int i;
4294	int prexor;
4295	int disks = sh->disks;
4296	struct r5dev *pdev, *qdev;
4297
4298	clear_bit(STRIPE_HANDLE, &sh->state);
4299	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4300		/* already being handled, ensure it gets handled
4301		 * again when current action finishes */
4302		set_bit(STRIPE_HANDLE, &sh->state);
4303		return;
4304	}
4305
4306	if (clear_batch_ready(sh)) {
4307		clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4308		return;
4309	}
4310
4311	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4312		break_stripe_batch_list(sh, 0);
4313
4314	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4315		spin_lock(&sh->stripe_lock);
4316		/* Cannot process 'sync' concurrently with 'discard' */
4317		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
4318		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4319			set_bit(STRIPE_SYNCING, &sh->state);
4320			clear_bit(STRIPE_INSYNC, &sh->state);
4321			clear_bit(STRIPE_REPLACED, &sh->state);
4322		}
4323		spin_unlock(&sh->stripe_lock);
4324	}
4325	clear_bit(STRIPE_DELAYED, &sh->state);
4326
4327	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4328		"pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4329	       (unsigned long long)sh->sector, sh->state,
4330	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4331	       sh->check_state, sh->reconstruct_state);
4332
4333	analyse_stripe(sh, &s);
4334
4335	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4336		goto finish;
4337
4338	if (s.handle_bad_blocks) {
4339		set_bit(STRIPE_HANDLE, &sh->state);
4340		goto finish;
4341	}
4342
4343	if (unlikely(s.blocked_rdev)) {
4344		if (s.syncing || s.expanding || s.expanded ||
4345		    s.replacing || s.to_write || s.written) {
4346			set_bit(STRIPE_HANDLE, &sh->state);
4347			goto finish;
4348		}
4349		/* There is nothing for the blocked_rdev to block */
4350		rdev_dec_pending(s.blocked_rdev, conf->mddev);
4351		s.blocked_rdev = NULL;
4352	}
4353
4354	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4355		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4356		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4357	}
4358
4359	pr_debug("locked=%d uptodate=%d to_read=%d"
4360	       " to_write=%d failed=%d failed_num=%d,%d\n",
4361	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4362	       s.failed_num[0], s.failed_num[1]);
4363	/* check if the array has lost more than max_degraded devices and,
4364	 * if so, some requests might need to be failed.
4365	 */
4366	if (s.failed > conf->max_degraded || s.log_failed) {
4367		sh->check_state = 0;
4368		sh->reconstruct_state = 0;
4369		break_stripe_batch_list(sh, 0);
4370		if (s.to_read+s.to_write+s.written)
4371			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
4372		if (s.syncing + s.replacing)
4373			handle_failed_sync(conf, sh, &s);
4374	}
4375
4376	/* Now we check to see if any write operations have recently
4377	 * completed
4378	 */
4379	prexor = 0;
4380	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4381		prexor = 1;
4382	if (sh->reconstruct_state == reconstruct_state_drain_result ||
4383	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4384		sh->reconstruct_state = reconstruct_state_idle;
4385
4386		/* All the 'written' buffers and the parity block are ready to
4387		 * be written back to disk
4388		 */
4389		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4390		       !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4391		BUG_ON(sh->qd_idx >= 0 &&
4392		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4393		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4394		for (i = disks; i--; ) {
4395			struct r5dev *dev = &sh->dev[i];
4396			if (test_bit(R5_LOCKED, &dev->flags) &&
4397				(i == sh->pd_idx || i == sh->qd_idx ||
4398				 dev->written)) {
4399				pr_debug("Writing block %d\n", i);
4400				set_bit(R5_Wantwrite, &dev->flags);
4401				if (prexor)
4402					continue;
4403				if (s.failed > 1)
4404					continue;
4405				if (!test_bit(R5_Insync, &dev->flags) ||
4406				    ((i == sh->pd_idx || i == sh->qd_idx)  &&
4407				     s.failed == 0))
4408					set_bit(STRIPE_INSYNC, &sh->state);
4409			}
4410		}
4411		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4412			s.dec_preread_active = 1;
4413	}
4414
4415	/*
4416	 * might be able to return some write requests if the parity blocks
4417	 * are safe, or on a failed drive
4418	 */
4419	pdev = &sh->dev[sh->pd_idx];
4420	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
4421		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
4422	qdev = &sh->dev[sh->qd_idx];
4423	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
4424		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
4425		|| conf->level < 6;
4426
4427	if (s.written &&
4428	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
4429			     && !test_bit(R5_LOCKED, &pdev->flags)
4430			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
4431				 test_bit(R5_Discard, &pdev->flags))))) &&
4432	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4433			     && !test_bit(R5_LOCKED, &qdev->flags)
4434			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
4435				 test_bit(R5_Discard, &qdev->flags))))))
4436		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
4437
4438	/* Now we might consider reading some blocks, either to check/generate
4439	 * parity, or to satisfy requests
4440	 * or to load a block that is being partially written.
4441	 */
4442	if (s.to_read || s.non_overwrite
4443	    || (conf->level == 6 && s.to_write && s.failed)
4444	    || (s.syncing && (s.uptodate + s.compute < disks))
4445	    || s.replacing
4446	    || s.expanding)
4447		handle_stripe_fill(sh, &s, disks);
4448
4449	/* Now to consider new write requests and what else, if anything
4450	 * should be read.  We do not handle new writes when:
4451	 * 1/ A 'write' operation (copy+xor) is already in flight.
4452	 * 2/ A 'check' operation is in flight, as it may clobber the parity
4453	 *    block.
4454	 */
4455	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
4456		handle_stripe_dirtying(conf, sh, &s, disks);
4457
4458	/* maybe we need to check and possibly fix the parity for this stripe
4459	 * Any reads will already have been scheduled, so we just see if enough
4460	 * data is available.  The parity check is held off while parity
4461	 * dependent operations are in flight.
4462	 */
4463	if (sh->check_state ||
4464	    (s.syncing && s.locked == 0 &&
4465	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4466	     !test_bit(STRIPE_INSYNC, &sh->state))) {
4467		if (conf->level == 6)
4468			handle_parity_checks6(conf, sh, &s, disks);
4469		else
4470			handle_parity_checks5(conf, sh, &s, disks);
4471	}
4472
4473	if ((s.replacing || s.syncing) && s.locked == 0
4474	    && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
4475	    && !test_bit(STRIPE_REPLACED, &sh->state)) {
4476		/* Write out to replacement devices where possible */
4477		for (i = 0; i < conf->raid_disks; i++)
4478			if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
4479				WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
4480				set_bit(R5_WantReplace, &sh->dev[i].flags);
4481				set_bit(R5_LOCKED, &sh->dev[i].flags);
4482				s.locked++;
4483			}
4484		if (s.replacing)
4485			set_bit(STRIPE_INSYNC, &sh->state);
4486		set_bit(STRIPE_REPLACED, &sh->state);
4487	}
4488	if ((s.syncing || s.replacing) && s.locked == 0 &&
4489	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4490	    test_bit(STRIPE_INSYNC, &sh->state)) {
4491		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4492		clear_bit(STRIPE_SYNCING, &sh->state);
4493		if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
4494			wake_up(&conf->wait_for_overlap);
4495	}
4496
4497	/* If the failed drives are just a ReadError, then we might need
4498	 * to progress the repair/check process
4499	 */
4500	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
4501		for (i = 0; i < s.failed; i++) {
4502			struct r5dev *dev = &sh->dev[s.failed_num[i]];
4503			if (test_bit(R5_ReadError, &dev->flags)
4504			    && !test_bit(R5_LOCKED, &dev->flags)
4505			    && test_bit(R5_UPTODATE, &dev->flags)
4506				) {
4507				if (!test_bit(R5_ReWrite, &dev->flags)) {
4508					set_bit(R5_Wantwrite, &dev->flags);
4509					set_bit(R5_ReWrite, &dev->flags);
4510					set_bit(R5_LOCKED, &dev->flags);
4511					s.locked++;
4512				} else {
4513					/* let's read it back */
4514					set_bit(R5_Wantread, &dev->flags);
4515					set_bit(R5_LOCKED, &dev->flags);
4516					s.locked++;
4517				}
4518			}
4519		}
4520
4521	/* Finish reconstruct operations initiated by the expansion process */
4522	if (sh->reconstruct_state == reconstruct_state_result) {
4523		struct stripe_head *sh_src
4524			= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
4525		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
4526			/* sh cannot be written until sh_src has been read.
4527			 * so arrange for sh to be delayed a little
4528			 */
4529			set_bit(STRIPE_DELAYED, &sh->state);
4530			set_bit(STRIPE_HANDLE, &sh->state);
4531			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
4532					      &sh_src->state))
4533				atomic_inc(&conf->preread_active_stripes);
4534			raid5_release_stripe(sh_src);
4535			goto finish;
4536		}
4537		if (sh_src)
4538			raid5_release_stripe(sh_src);
4539
4540		sh->reconstruct_state = reconstruct_state_idle;
4541		clear_bit(STRIPE_EXPANDING, &sh->state);
4542		for (i = conf->raid_disks; i--; ) {
4543			set_bit(R5_Wantwrite, &sh->dev[i].flags);
4544			set_bit(R5_LOCKED, &sh->dev[i].flags);
4545			s.locked++;
4546		}
4547	}
4548
4549	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
4550	    !sh->reconstruct_state) {
4551		/* Need to write out all blocks after computing parity */
4552		sh->disks = conf->raid_disks;
4553		stripe_set_idx(sh->sector, conf, 0, sh);
4554		schedule_reconstruction(sh, &s, 1, 1);
4555	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
4556		clear_bit(STRIPE_EXPAND_READY, &sh->state);
4557		atomic_dec(&conf->reshape_stripes);
4558		wake_up(&conf->wait_for_overlap);
4559		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4560	}
4561
4562	if (s.expanding && s.locked == 0 &&
4563	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
4564		handle_stripe_expansion(conf, sh);
4565
4566finish:
4567	/* wait for this device to become unblocked */
4568	if (unlikely(s.blocked_rdev)) {
4569		if (conf->mddev->external)
4570			md_wait_for_blocked_rdev(s.blocked_rdev,
4571						 conf->mddev);
4572		else
4573			/* Internal metadata will immediately
4574			 * be written by raid5d, so we don't
4575			 * need to wait here.
4576			 */
4577			rdev_dec_pending(s.blocked_rdev,
4578					 conf->mddev);
4579	}
4580
4581	if (s.handle_bad_blocks)
4582		for (i = disks; i--; ) {
4583			struct md_rdev *rdev;
4584			struct r5dev *dev = &sh->dev[i];
4585			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
4586				/* We own a safe reference to the rdev */
4587				rdev = conf->disks[i].rdev;
4588				if (!rdev_set_badblocks(rdev, sh->sector,
4589							STRIPE_SECTORS, 0))
4590					md_error(conf->mddev, rdev);
4591				rdev_dec_pending(rdev, conf->mddev);
4592			}
4593			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
4594				rdev = conf->disks[i].rdev;
4595				rdev_clear_badblocks(rdev, sh->sector,
4596						     STRIPE_SECTORS, 0);
4597				rdev_dec_pending(rdev, conf->mddev);
4598			}
4599			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
4600				rdev = conf->disks[i].replacement;
4601				if (!rdev)
4602					/* rdev has been moved down */
4603					rdev = conf->disks[i].rdev;
4604				rdev_clear_badblocks(rdev, sh->sector,
4605						     STRIPE_SECTORS, 0);
4606				rdev_dec_pending(rdev, conf->mddev);
4607			}
4608		}
4609
4610	if (s.ops_request)
4611		raid_run_ops(sh, s.ops_request);
4612
4613	ops_run_io(sh, &s);
4614
4615	if (s.dec_preread_active) {
4616		/* We delay this until after ops_run_io so that if make_request
4617		 * is waiting on a flush, it won't continue until the writes
4618		 * have actually been submitted.
4619		 */
4620		atomic_dec(&conf->preread_active_stripes);
4621		if (atomic_read(&conf->preread_active_stripes) <
4622		    IO_THRESHOLD)
4623			md_wakeup_thread(conf->mddev->thread);
4624	}
4625
4626	if (!bio_list_empty(&s.return_bi)) {
4627		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
4628			spin_lock_irq(&conf->device_lock);
4629			bio_list_merge(&conf->return_bi, &s.return_bi);
4630			spin_unlock_irq(&conf->device_lock);
4631			md_wakeup_thread(conf->mddev->thread);
4632		} else
4633			return_io(&s.return_bi);
4634	}
4635
4636	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4637}
4638
4639static void raid5_activate_delayed(struct r5conf *conf)
4640{
4641	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
4642		while (!list_empty(&conf->delayed_list)) {
4643			struct list_head *l = conf->delayed_list.next;
4644			struct stripe_head *sh;
4645			sh = list_entry(l, struct stripe_head, lru);
4646			list_del_init(l);
4647			clear_bit(STRIPE_DELAYED, &sh->state);
4648			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4649				atomic_inc(&conf->preread_active_stripes);
4650			list_add_tail(&sh->lru, &conf->hold_list);
4651			raid5_wakeup_stripe_thread(sh);
4652		}
4653	}
4654}
4655
4656static void activate_bit_delay(struct r5conf *conf,
4657	struct list_head *temp_inactive_list)
4658{
4659	/* device_lock is held */
4660	struct list_head head;
4661	list_add(&head, &conf->bitmap_list);
4662	list_del_init(&conf->bitmap_list);
4663	while (!list_empty(&head)) {
4664		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4665		int hash;
4666		list_del_init(&sh->lru);
4667		atomic_inc(&sh->count);
4668		hash = sh->hash_lock_index;
4669		__release_stripe(conf, sh, &temp_inactive_list[hash]);
4670	}
4671}
4672
4673static int raid5_congested(struct mddev *mddev, int bits)
4674{
4675	struct r5conf *conf = mddev->private;
4676
4677	/* No difference between reads and writes.  Just check
4678	 * how busy the stripe_cache is
4679	 */
4680
4681	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
4682		return 1;
4683	if (conf->quiesce)
4684		return 1;
4685	if (atomic_read(&conf->empty_inactive_list_nr))
4686		return 1;
4687
4688	return 0;
4689}
4690
4691static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
4692{
4693	struct r5conf *conf = mddev->private;
4694	sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
4695	unsigned int chunk_sectors;
4696	unsigned int bio_sectors = bio_sectors(bio);
4697
4698	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
4699	return  chunk_sectors >=
4700		((sector & (chunk_sectors - 1)) + bio_sectors);
4701}
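/*
 * Editor's note (not in the original source): a worked example of the
 * check above with 512 KiB chunks (chunk_sectors = 1024).  A bio at
 * device sector 960 spanning 32 sectors gives (960 & 1023) + 32 = 992,
 * which is <= 1024, so it fits within one chunk; the same bio at sector
 * 1000 gives 1000 + 32 = 1032 > 1024, so it straddles a chunk boundary
 * and the chunk-aligned read fast path is refused.
 */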
4702
4703/*
4704 *  add bio to the retry LIFO (O(1), as we may be in interrupt context);
4705 *  it is later sampled by raid5d.
4706 */
4707static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
4708{
4709	unsigned long flags;
4710
4711	spin_lock_irqsave(&conf->device_lock, flags);
4712
4713	bi->bi_next = conf->retry_read_aligned_list;
4714	conf->retry_read_aligned_list = bi;
4715
4716	spin_unlock_irqrestore(&conf->device_lock, flags);
4717	md_wakeup_thread(conf->mddev->thread);
4718}
4719
4720static struct bio *remove_bio_from_retry(struct r5conf *conf)
4721{
4722	struct bio *bi;
4723
4724	bi = conf->retry_read_aligned;
4725	if (bi) {
4726		conf->retry_read_aligned = NULL;
4727		return bi;
4728	}
4729	bi = conf->retry_read_aligned_list;
4730	if (bi) {
4731		conf->retry_read_aligned_list = bi->bi_next;
4732		bi->bi_next = NULL;
4733		/*
4734		 * this sets the active stripe count to 1 and the processed
4735		 * stripe count to zero (upper 8 bits)
4736		 */
4737		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
4738	}
4739
4740	return bi;
4741}
4742
4743/*
4744 *  raid5_align_endio() checks whether the read succeeded and, if it
4745 *  did, calls bio_endio on the original bio (having bio_put the clone
4746 *  first).
4747 *  If the read failed, the original bio is queued for retry via raid5d.
4748 */
4749static void raid5_align_endio(struct bio *bi)
4750{
4751	struct bio* raid_bi  = bi->bi_private;
4752	struct mddev *mddev;
4753	struct r5conf *conf;
4754	struct md_rdev *rdev;
4755	int error = bi->bi_error;
4756
4757	bio_put(bi);
4758
4759	rdev = (void*)raid_bi->bi_next;
4760	raid_bi->bi_next = NULL;
4761	mddev = rdev->mddev;
4762	conf = mddev->private;
4763
4764	rdev_dec_pending(rdev, conf->mddev);
4765
4766	if (!error) {
4767		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
4768					 raid_bi, 0);
4769		bio_endio(raid_bi);
4770		if (atomic_dec_and_test(&conf->active_aligned_reads))
4771			wake_up(&conf->wait_for_quiescent);
4772		return;
4773	}
4774
4775	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
4776
4777	add_bio_to_retry(raid_bi, conf);
4778}
4779
4780static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
4781{
4782	struct r5conf *conf = mddev->private;
4783	int dd_idx;
4784	struct bio* align_bi;
4785	struct md_rdev *rdev;
4786	sector_t end_sector;
4787
4788	if (!in_chunk_boundary(mddev, raid_bio)) {
4789		pr_debug("%s: non aligned\n", __func__);
4790		return 0;
4791	}
4792	/*
4793	 * use bio_clone_mddev to make a copy of the bio
4794	 */
4795	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
4796	if (!align_bi)
4797		return 0;
4798	/*
4799	 *   set bi_end_io to a new function, and set bi_private to the
4800	 *     original bio.
4801	 */
4802	align_bi->bi_end_io  = raid5_align_endio;
4803	align_bi->bi_private = raid_bio;
4804	/*
4805	 *	compute position
4806	 */
4807	align_bi->bi_iter.bi_sector =
4808		raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
4809				     0, &dd_idx, NULL);
4810
4811	end_sector = bio_end_sector(align_bi);
4812	rcu_read_lock();
4813	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
4814	if (!rdev || test_bit(Faulty, &rdev->flags) ||
4815	    rdev->recovery_offset < end_sector) {
4816		rdev = rcu_dereference(conf->disks[dd_idx].rdev);
4817		if (rdev &&
4818		    (test_bit(Faulty, &rdev->flags) ||
4819		    !(test_bit(In_sync, &rdev->flags) ||
4820		      rdev->recovery_offset >= end_sector)))
4821			rdev = NULL;
4822	}
4823	if (rdev) {
4824		sector_t first_bad;
4825		int bad_sectors;
4826
4827		atomic_inc(&rdev->nr_pending);
4828		rcu_read_unlock();
4829		raid_bio->bi_next = (void*)rdev;
4830		align_bi->bi_bdev =  rdev->bdev;
4831		bio_clear_flag(align_bi, BIO_SEG_VALID);
4832
4833		if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
4834				bio_sectors(align_bi),
4835				&first_bad, &bad_sectors)) {
4836			bio_put(align_bi);
4837			rdev_dec_pending(rdev, mddev);
4838			return 0;
4839		}
4840
4841		/* No reshape active, so we can trust rdev->data_offset */
4842		align_bi->bi_iter.bi_sector += rdev->data_offset;
4843
4844		spin_lock_irq(&conf->device_lock);
4845		wait_event_lock_irq(conf->wait_for_quiescent,
4846				    conf->quiesce == 0,
4847				    conf->device_lock);
4848		atomic_inc(&conf->active_aligned_reads);
4849		spin_unlock_irq(&conf->device_lock);
4850
4851		if (mddev->gendisk)
4852			trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
4853					      align_bi, disk_devt(mddev->gendisk),
4854					      raid_bio->bi_iter.bi_sector);
4855		generic_make_request(align_bi);
4856		return 1;
4857	} else {
4858		rcu_read_unlock();
4859		bio_put(align_bi);
4860		return 0;
4861	}
4862}
4863
4864static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
4865{
4866	struct bio *split;
4867
4868	do {
4869		sector_t sector = raid_bio->bi_iter.bi_sector;
4870		unsigned chunk_sects = mddev->chunk_sectors;
4871		unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
4872
4873		if (sectors < bio_sectors(raid_bio)) {
4874			split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
4875			bio_chain(split, raid_bio);
4876		} else
4877			split = raid_bio;
4878
4879		if (!raid5_read_one_chunk(mddev, split)) {
4880			if (split != raid_bio)
4881				generic_make_request(raid_bio);
4882			return split;
4883		}
4884	} while (split != raid_bio);
4885
4886	return NULL;
4887}
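/*
 * Editor's note (not in the original source): with 512 KiB chunks, a
 * 100-sector read starting 24 sectors before a chunk boundary is split
 * into a 24-sector piece, which is attempted as a chunk-aligned read
 * against a single member disk, and a chained 76-sector remainder that
 * is handled on the next loop iteration.  If raid5_read_one_chunk()
 * refuses a piece, that piece is returned (and any remainder submitted)
 * so the caller can push it through the normal stripe-cache path.
 */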
4888
4889/* __get_priority_stripe - get the next stripe to process
4890 *
4891 * Full stripe writes are allowed to pass preread active stripes up until
4892 * the bypass_threshold is exceeded.  In general the bypass_count
4893 * increments when the handle_list is handled before the hold_list; however, it
4894 * will not be incremented when STRIPE_IO_STARTED is sampled as set, signifying a
4895 * stripe with in-flight i/o.  The bypass_count will be reset when the
4896 * head of the hold_list has changed, i.e. the head was promoted to the
4897 * handle_list.
4898 */
4899static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
4900{
4901	struct stripe_head *sh = NULL, *tmp;
4902	struct list_head *handle_list = NULL;
4903	struct r5worker_group *wg = NULL;
4904
4905	if (conf->worker_cnt_per_group == 0) {
4906		handle_list = &conf->handle_list;
4907	} else if (group != ANY_GROUP) {
4908		handle_list = &conf->worker_groups[group].handle_list;
4909		wg = &conf->worker_groups[group];
4910	} else {
4911		int i;
4912		for (i = 0; i < conf->group_cnt; i++) {
4913			handle_list = &conf->worker_groups[i].handle_list;
4914			wg = &conf->worker_groups[i];
4915			if (!list_empty(handle_list))
4916				break;
4917		}
4918	}
4919
4920	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
4921		  __func__,
4922		  list_empty(handle_list) ? "empty" : "busy",
4923		  list_empty(&conf->hold_list) ? "empty" : "busy",
4924		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
4925
4926	if (!list_empty(handle_list)) {
4927		sh = list_entry(handle_list->next, typeof(*sh), lru);
4928
4929		if (list_empty(&conf->hold_list))
4930			conf->bypass_count = 0;
4931		else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
4932			if (conf->hold_list.next == conf->last_hold)
4933				conf->bypass_count++;
4934			else {
4935				conf->last_hold = conf->hold_list.next;
4936				conf->bypass_count -= conf->bypass_threshold;
4937				if (conf->bypass_count < 0)
4938					conf->bypass_count = 0;
4939			}
4940		}
4941	} else if (!list_empty(&conf->hold_list) &&
4942		   ((conf->bypass_threshold &&
4943		     conf->bypass_count > conf->bypass_threshold) ||
4944		    atomic_read(&conf->pending_full_writes) == 0)) {
4945
4946		list_for_each_entry(tmp, &conf->hold_list,  lru) {
4947			if (conf->worker_cnt_per_group == 0 ||
4948			    group == ANY_GROUP ||
4949			    !cpu_online(tmp->cpu) ||
4950			    cpu_to_group(tmp->cpu) == group) {
4951				sh = tmp;
4952				break;
4953			}
4954		}
4955
4956		if (sh) {
4957			conf->bypass_count -= conf->bypass_threshold;
4958			if (conf->bypass_count < 0)
4959				conf->bypass_count = 0;
4960		}
4961		wg = NULL;
4962	}
4963
4964	if (!sh)
4965		return NULL;
4966
4967	if (wg) {
4968		wg->stripes_cnt--;
4969		sh->group = NULL;
4970	}
4971	list_del_init(&sh->lru);
4972	BUG_ON(atomic_inc_return(&sh->count) != 1);
4973	return sh;
4974}
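/*
 * Editor's note (not in the original source): with the default
 * BYPASS_THRESHOLD of 1 and a full-stripe write pending, an emptied
 * handle_list does not immediately hand out the hold_list head; only
 * after handle_list stripes have been serviced at least twice while the
 * same stripe sat at the head of hold_list (bypass_count > 1) is the
 * delayed preread stripe finally chosen.
 */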
4975
4976struct raid5_plug_cb {
4977	struct blk_plug_cb	cb;
4978	struct list_head	list;
4979	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
4980};
4981
4982static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
4983{
4984	struct raid5_plug_cb *cb = container_of(
4985		blk_cb, struct raid5_plug_cb, cb);
4986	struct stripe_head *sh;
4987	struct mddev *mddev = cb->cb.data;
4988	struct r5conf *conf = mddev->private;
4989	int cnt = 0;
4990	int hash;
4991
4992	if (cb->list.next && !list_empty(&cb->list)) {
4993		spin_lock_irq(&conf->device_lock);
4994		while (!list_empty(&cb->list)) {
4995			sh = list_first_entry(&cb->list, struct stripe_head, lru);
4996			list_del_init(&sh->lru);
4997			/*
4998			 * avoid the race where release_stripe_plug() sees
4999			 * STRIPE_ON_UNPLUG_LIST clear while the stripe
5000			 * is still on our list
5001			 */
5002			smp_mb__before_atomic();
5003			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5004			/*
5005			 * STRIPE_ON_RELEASE_LIST could be set here. In that
5006			 * case, the count is always > 1 here
5007			 */
5008			hash = sh->hash_lock_index;
5009			__release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5010			cnt++;
5011		}
5012		spin_unlock_irq(&conf->device_lock);
5013	}
5014	release_inactive_stripe_list(conf, cb->temp_inactive_list,
5015				     NR_STRIPE_HASH_LOCKS);
5016	if (mddev->queue)
5017		trace_block_unplug(mddev->queue, cnt, !from_schedule);
5018	kfree(cb);
5019}
5020
5021static void release_stripe_plug(struct mddev *mddev,
5022				struct stripe_head *sh)
5023{
5024	struct blk_plug_cb *blk_cb = blk_check_plugged(
5025		raid5_unplug, mddev,
5026		sizeof(struct raid5_plug_cb));
5027	struct raid5_plug_cb *cb;
5028
5029	if (!blk_cb) {
5030		raid5_release_stripe(sh);
5031		return;
5032	}
5033
5034	cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5035
5036	if (cb->list.next == NULL) {
5037		int i;
5038		INIT_LIST_HEAD(&cb->list);
5039		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5040			INIT_LIST_HEAD(cb->temp_inactive_list + i);
5041	}
5042
5043	if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5044		list_add_tail(&sh->lru, &cb->list);
5045	else
5046		raid5_release_stripe(sh);
5047}
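/*
 * Editor's note (not in the original source): while the submitting task
 * holds a blk_plug, release_stripe_plug() parks each stripe on the
 * per-plug cb->list instead of releasing it; when the plug is flushed,
 * raid5_unplug() above moves all of them to the conf lists under a
 * single device_lock acquisition, which is why the unplug trace event
 * reports one batch with 'cnt' queued stripes.
 */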
5048
5049static void make_discard_request(struct mddev *mddev, struct bio *bi)
5050{
5051	struct r5conf *conf = mddev->private;
5052	sector_t logical_sector, last_sector;
5053	struct stripe_head *sh;
5054	int remaining;
5055	int stripe_sectors;
5056
5057	if (mddev->reshape_position != MaxSector)
5058		/* Skip discard while reshape is happening */
5059		return;
5060
5061	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5062	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5063
5064	bi->bi_next = NULL;
5065	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
5066
5067	stripe_sectors = conf->chunk_sectors *
5068		(conf->raid_disks - conf->max_degraded);
5069	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5070					       stripe_sectors);
5071	sector_div(last_sector, stripe_sectors);
5072
5073	logical_sector *= conf->chunk_sectors;
5074	last_sector *= conf->chunk_sectors;
5075
5076	for (; logical_sector < last_sector;
5077	     logical_sector += STRIPE_SECTORS) {
5078		DEFINE_WAIT(w);
5079		int d;
5080	again:
5081		sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5082		prepare_to_wait(&conf->wait_for_overlap, &w,
5083				TASK_UNINTERRUPTIBLE);
5084		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5085		if (test_bit(STRIPE_SYNCING, &sh->state)) {
5086			raid5_release_stripe(sh);
5087			schedule();
5088			goto again;
5089		}
5090		clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5091		spin_lock_irq(&sh->stripe_lock);
5092		for (d = 0; d < conf->raid_disks; d++) {
5093			if (d == sh->pd_idx || d == sh->qd_idx)
5094				continue;
5095			if (sh->dev[d].towrite || sh->dev[d].toread) {
5096				set_bit(R5_Overlap, &sh->dev[d].flags);
5097				spin_unlock_irq(&sh->stripe_lock);
5098				raid5_release_stripe(sh);
5099				schedule();
5100				goto again;
5101			}
5102		}
5103		set_bit(STRIPE_DISCARD, &sh->state);
5104		finish_wait(&conf->wait_for_overlap, &w);
5105		sh->overwrite_disks = 0;
5106		for (d = 0; d < conf->raid_disks; d++) {
5107			if (d == sh->pd_idx || d == sh->qd_idx)
5108				continue;
5109			sh->dev[d].towrite = bi;
5110			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5111			raid5_inc_bi_active_stripes(bi);
5112			sh->overwrite_disks++;
5113		}
5114		spin_unlock_irq(&sh->stripe_lock);
5115		if (conf->mddev->bitmap) {
5116			for (d = 0;
5117			     d < conf->raid_disks - conf->max_degraded;
5118			     d++)
5119				bitmap_startwrite(mddev->bitmap,
5120						  sh->sector,
5121						  STRIPE_SECTORS,
5122						  0);
5123			sh->bm_seq = conf->seq_flush + 1;
5124			set_bit(STRIPE_BIT_DELAY, &sh->state);
5125		}
5126
5127		set_bit(STRIPE_HANDLE, &sh->state);
5128		clear_bit(STRIPE_DELAYED, &sh->state);
5129		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5130			atomic_inc(&conf->preread_active_stripes);
5131		release_stripe_plug(mddev, sh);
5132	}
5133
5134	remaining = raid5_dec_bi_active_stripes(bi);
5135	if (remaining == 0) {
5136		md_write_end(mddev);
5137		bio_endio(bi);
5138	}
5139}
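/*
 * Editor's note (not in the original source): only stripes completely
 * covered by the discard are processed above.  For example, with
 * 4 data disks and chunk_sectors = 1024, stripe_sectors = 4096; a
 * discard of array sectors 3000..20000 rounds the start up
 * (DIV_ROUND_UP(3000, 4096) = 1) and the end down (20000 / 4096 = 4),
 * so full stripes 1-3 are discarded and the partial stripes at either
 * end are simply skipped.
 */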
5140
5141static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5142{
5143	struct r5conf *conf = mddev->private;
5144	int dd_idx;
5145	sector_t new_sector;
5146	sector_t logical_sector, last_sector;
5147	struct stripe_head *sh;
5148	const int rw = bio_data_dir(bi);
5149	int remaining;
5150	DEFINE_WAIT(w);
5151	bool do_prepare;
5152
5153	if (unlikely(bi->bi_rw & REQ_FLUSH)) {
5154		int ret = r5l_handle_flush_request(conf->log, bi);
5155
5156		if (ret == 0)
5157			return;
5158		if (ret == -ENODEV) {
5159			md_flush_request(mddev, bi);
5160			return;
5161		}
5162		/* ret == -EAGAIN, fallback */
5163	}
5164
5165	md_write_start(mddev, bi);
5166
5167	/*
5168	 * If array is degraded, better not do chunk aligned read because
5169	 * later we might have to read it again in order to reconstruct
5170	 * data on failed drives.
5171	 */
5172	if (rw == READ && mddev->degraded == 0 &&
5173	    mddev->reshape_position == MaxSector) {
5174		bi = chunk_aligned_read(mddev, bi);
5175		if (!bi)
5176			return;
5177	}
5178
5179	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
5180		make_discard_request(mddev, bi);
5181		return;
5182	}
5183
5184	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5185	last_sector = bio_end_sector(bi);
5186	bi->bi_next = NULL;
5187	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
5188
5189	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5190	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5191		int previous;
5192		int seq;
5193
5194		do_prepare = false;
5195	retry:
5196		seq = read_seqcount_begin(&conf->gen_lock);
5197		previous = 0;
5198		if (do_prepare)
5199			prepare_to_wait(&conf->wait_for_overlap, &w,
5200				TASK_UNINTERRUPTIBLE);
5201		if (unlikely(conf->reshape_progress != MaxSector)) {
5202			/* spinlock is needed as reshape_progress may be
5203			 * 64bit on a 32bit platform, and so it might be
5204			 * possible to see a half-updated value.
5205			 * Of course reshape_progress could change after
5206			 * the lock is dropped, so once we get a reference
5207			 * to the stripe that we think it is, we will have
5208			 * to check again.
5209			 */
5210			spin_lock_irq(&conf->device_lock);
5211			if (mddev->reshape_backwards
5212			    ? logical_sector < conf->reshape_progress
5213			    : logical_sector >= conf->reshape_progress) {
5214				previous = 1;
5215			} else {
5216				if (mddev->reshape_backwards
5217				    ? logical_sector < conf->reshape_safe
5218				    : logical_sector >= conf->reshape_safe) {
5219					spin_unlock_irq(&conf->device_lock);
5220					schedule();
5221					do_prepare = true;
5222					goto retry;
5223				}
5224			}
5225			spin_unlock_irq(&conf->device_lock);
5226		}
5227
5228		new_sector = raid5_compute_sector(conf, logical_sector,
5229						  previous,
5230						  &dd_idx, NULL);
5231		pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5232			(unsigned long long)new_sector,
5233			(unsigned long long)logical_sector);
5234
5235		sh = raid5_get_active_stripe(conf, new_sector, previous,
5236				       (bi->bi_rw&RWA_MASK), 0);
5237		if (sh) {
5238			if (unlikely(previous)) {
5239				/* expansion might have moved on while waiting for a
5240				 * stripe, so we must do the range check again.
5241				 * Expansion could still move past after this
5242				 * test, but as we are holding a reference to
5243				 * 'sh', we know that if that happens,
5244				 *  STRIPE_EXPANDING will get set and the expansion
5245				 * won't proceed until we finish with the stripe.
5246				 */
5247				int must_retry = 0;
5248				spin_lock_irq(&conf->device_lock);
5249				if (mddev->reshape_backwards
5250				    ? logical_sector >= conf->reshape_progress
5251				    : logical_sector < conf->reshape_progress)
5252					/* mismatch, need to try again */
5253					must_retry = 1;
5254				spin_unlock_irq(&conf->device_lock);
5255				if (must_retry) {
5256					raid5_release_stripe(sh);
5257					schedule();
5258					do_prepare = true;
5259					goto retry;
5260				}
5261			}
5262			if (read_seqcount_retry(&conf->gen_lock, seq)) {
5263				/* Might have got the wrong stripe_head
5264				 * by accident
5265				 */
5266				raid5_release_stripe(sh);
5267				goto retry;
5268			}
5269
5270			if (rw == WRITE &&
5271			    logical_sector >= mddev->suspend_lo &&
5272			    logical_sector < mddev->suspend_hi) {
5273				raid5_release_stripe(sh);
5274				/* As the suspend_* range is controlled by
5275				 * userspace, we want an interruptible
5276				 * wait.
5277				 */
5278				flush_signals(current);
5279				prepare_to_wait(&conf->wait_for_overlap,
5280						&w, TASK_INTERRUPTIBLE);
5281				if (logical_sector >= mddev->suspend_lo &&
5282				    logical_sector < mddev->suspend_hi) {
5283					schedule();
5284					do_prepare = true;
5285				}
5286				goto retry;
5287			}
5288
5289			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5290			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5291				/* Stripe is busy expanding or
5292				 * add failed due to overlap.  Flush everything
5293				 * and wait a while
5294				 */
5295				md_wakeup_thread(mddev->thread);
5296				raid5_release_stripe(sh);
5297				schedule();
5298				do_prepare = true;
5299				goto retry;
5300			}
5301			set_bit(STRIPE_HANDLE, &sh->state);
5302			clear_bit(STRIPE_DELAYED, &sh->state);
5303			if ((!sh->batch_head || sh == sh->batch_head) &&
5304			    (bi->bi_rw & REQ_SYNC) &&
5305			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5306				atomic_inc(&conf->preread_active_stripes);
5307			release_stripe_plug(mddev, sh);
5308		} else {
5309			/* cannot get stripe for read-ahead, just give-up */
5310			bi->bi_error = -EIO;
5311			break;
5312		}
5313	}
5314	finish_wait(&conf->wait_for_overlap, &w);
5315
5316	remaining = raid5_dec_bi_active_stripes(bi);
5317	if (remaining == 0) {
5318
5319		if ( rw == WRITE )
5320			md_write_end(mddev);
5321
5322		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
5323					 bi, 0);
5324		bio_endio(bi);
5325	}
5326}
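/*
 * Illustrative sketch (not used by the driver): raid5_make_request() above
 * rounds a bio's starting sector down to a STRIPE_SECTORS boundary before
 * walking it one stripe at a time.  With PAGE_SIZE == 4096, STRIPE_SECTORS
 * is 8, so e.g. sector 1234 is handled as part of the stripe starting at
 * sector 1232.
 */
static inline sector_t example_stripe_aligned_sector(sector_t logical_sector)
{
	return logical_sector & ~((sector_t)STRIPE_SECTORS - 1);
}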
5327
5328static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5329
5330static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5331{
5332	/* reshaping is quite different to recovery/resync so it is
5333	 * handled quite separately ... here.
5334	 *
5335	 * On each call to sync_request, we gather one chunk worth of
5336	 * destination stripes and flag them as expanding.
5337	 * Then we find all the source stripes and request reads.
5338	 * As the reads complete, handle_stripe will copy the data
5339	 * into the destination stripe and release that stripe.
5340	 */
5341	struct r5conf *conf = mddev->private;
5342	struct stripe_head *sh;
5343	sector_t first_sector, last_sector;
5344	int raid_disks = conf->previous_raid_disks;
5345	int data_disks = raid_disks - conf->max_degraded;
5346	int new_data_disks = conf->raid_disks - conf->max_degraded;
5347	int i;
5348	int dd_idx;
5349	sector_t writepos, readpos, safepos;
5350	sector_t stripe_addr;
5351	int reshape_sectors;
5352	struct list_head stripes;
5353	sector_t retn;
5354
5355	if (sector_nr == 0) {
5356		/* If restarting in the middle, skip the initial sectors */
5357		if (mddev->reshape_backwards &&
5358		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5359			sector_nr = raid5_size(mddev, 0, 0)
5360				- conf->reshape_progress;
5361		} else if (mddev->reshape_backwards &&
5362			   conf->reshape_progress == MaxSector) {
5363			/* shouldn't happen, but just in case, finish up.*/
5364			sector_nr = MaxSector;
5365		} else if (!mddev->reshape_backwards &&
5366			   conf->reshape_progress > 0)
5367			sector_nr = conf->reshape_progress;
5368		sector_div(sector_nr, new_data_disks);
5369		if (sector_nr) {
5370			mddev->curr_resync_completed = sector_nr;
5371			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5372			*skipped = 1;
5373			retn = sector_nr;
5374			goto finish;
5375		}
5376	}
5377
5378	/* We need to process a full chunk at a time.
5379	 * If old and new chunk sizes differ, we need to process the
5380	 * largest of these
5381	 */
5382
5383	reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5384
5385	/* We update the metadata at least every 10 seconds, or when
5386	 * the data about to be copied would over-write the source of
5387	 * the data at the front of the range, i.e. when the stripe one
5388	 * reshape unit along from reshape_progress maps (in the new layout)
5389	 * to after where reshape_safe maps in the old layout.
5390	 */
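	/*
	 * Worked example (illustrative): for a forward reshape from 4 to 5
	 * devices (3 old and 4 new data disks) with reshape_progress and
	 * reshape_safe both at 8192 array sectors, the divisions below give
	 * writepos = 8192 / 4 = 2048 and readpos = safepos = 8192 / 3 = 2730
	 * device sectors, before the reshape_sectors adjustment that follows.
	 */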
5391	writepos = conf->reshape_progress;
5392	sector_div(writepos, new_data_disks);
5393	readpos = conf->reshape_progress;
5394	sector_div(readpos, data_disks);
5395	safepos = conf->reshape_safe;
5396	sector_div(safepos, data_disks);
5397	if (mddev->reshape_backwards) {
5398		BUG_ON(writepos < reshape_sectors);
5399		writepos -= reshape_sectors;
5400		readpos += reshape_sectors;
5401		safepos += reshape_sectors;
5402	} else {
5403		writepos += reshape_sectors;
5404		/* readpos and safepos are worst-case calculations.
5405		 * A negative number is overly pessimistic, and causes
5406		 * obvious problems for unsigned storage.  So clip to 0.
5407		 */
5408		readpos -= min_t(sector_t, reshape_sectors, readpos);
5409		safepos -= min_t(sector_t, reshape_sectors, safepos);
5410	}
5411
5412	/* Having calculated the 'writepos' possibly use it
5413	 * to set 'stripe_addr' which is where we will write to.
5414	 */
5415	if (mddev->reshape_backwards) {
5416		BUG_ON(conf->reshape_progress == 0);
5417		stripe_addr = writepos;
5418		BUG_ON((mddev->dev_sectors &
5419			~((sector_t)reshape_sectors - 1))
5420		       - reshape_sectors - stripe_addr
5421		       != sector_nr);
5422	} else {
5423		BUG_ON(writepos != sector_nr + reshape_sectors);
5424		stripe_addr = sector_nr;
5425	}
5426
5427	/* 'writepos' is the most advanced device address we might write.
5428	 * 'readpos' is the least advanced device address we might read.
5429	 * 'safepos' is the least address recorded in the metadata as having
5430	 *     been reshaped.
5431	 * If there is a min_offset_diff, these are adjusted either by
5432	 * increasing the safepos/readpos if diff is negative, or
5433	 * increasing writepos if diff is positive.
5434	 * If 'readpos' is then behind 'writepos', there is no way that we can
5435	 * ensure safety in the face of a crash - that must be done by userspace
5436	 * making a backup of the data.  So in that case there is no particular
5437	 * rush to update metadata.
5438	 * Otherwise if 'safepos' is behind 'writepos', then we really need to
5439	 * update the metadata to advance 'safepos' to match 'readpos' so that
5440	 * we can be safe in the event of a crash.
5441	 * So we insist on updating metadata if safepos is behind writepos and
5442	 * readpos is beyond writepos.
5443	 * In any case, update the metadata every 10 seconds.
5444	 * Maybe that number should be configurable, but I'm not sure it is
5445	 * worth it.... maybe it could be a multiple of safemode_delay???
5446	 */
5447	if (conf->min_offset_diff < 0) {
5448		safepos += -conf->min_offset_diff;
5449		readpos += -conf->min_offset_diff;
5450	} else
5451		writepos += conf->min_offset_diff;
5452
5453	if ((mddev->reshape_backwards
5454	     ? (safepos > writepos && readpos < writepos)
5455	     : (safepos < writepos && readpos > writepos)) ||
5456	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
5457		/* Cannot proceed until we've updated the superblock... */
5458		wait_event(conf->wait_for_overlap,
5459			   atomic_read(&conf->reshape_stripes)==0
5460			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5461		if (atomic_read(&conf->reshape_stripes) != 0)
5462			return 0;
5463		mddev->reshape_position = conf->reshape_progress;
5464		mddev->curr_resync_completed = sector_nr;
5465		conf->reshape_checkpoint = jiffies;
5466		set_bit(MD_CHANGE_DEVS, &mddev->flags);
5467		md_wakeup_thread(mddev->thread);
5468		wait_event(mddev->sb_wait, mddev->flags == 0 ||
5469			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5470		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5471			return 0;
5472		spin_lock_irq(&conf->device_lock);
5473		conf->reshape_safe = mddev->reshape_position;
5474		spin_unlock_irq(&conf->device_lock);
5475		wake_up(&conf->wait_for_overlap);
5476		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5477	}
5478
5479	INIT_LIST_HEAD(&stripes);
5480	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
5481		int j;
5482		int skipped_disk = 0;
5483		sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
5484		set_bit(STRIPE_EXPANDING, &sh->state);
5485		atomic_inc(&conf->reshape_stripes);
5486		/* If any of this stripe is beyond the end of the old
5487		 * array, then we need to zero those blocks
5488		 */
5489		for (j=sh->disks; j--;) {
5490			sector_t s;
5491			if (j == sh->pd_idx)
5492				continue;
5493			if (conf->level == 6 &&
5494			    j == sh->qd_idx)
5495				continue;
5496			s = raid5_compute_blocknr(sh, j, 0);
5497			if (s < raid5_size(mddev, 0, 0)) {
5498				skipped_disk = 1;
5499				continue;
5500			}
5501			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
5502			set_bit(R5_Expanded, &sh->dev[j].flags);
5503			set_bit(R5_UPTODATE, &sh->dev[j].flags);
5504		}
5505		if (!skipped_disk) {
5506			set_bit(STRIPE_EXPAND_READY, &sh->state);
5507			set_bit(STRIPE_HANDLE, &sh->state);
5508		}
5509		list_add(&sh->lru, &stripes);
5510	}
5511	spin_lock_irq(&conf->device_lock);
5512	if (mddev->reshape_backwards)
5513		conf->reshape_progress -= reshape_sectors * new_data_disks;
5514	else
5515		conf->reshape_progress += reshape_sectors * new_data_disks;
5516	spin_unlock_irq(&conf->device_lock);
5517	/* Ok, those stripes are ready. We can start scheduling
5518	 * reads on the source stripes.
5519	 * The source stripes are determined by mapping the first and last
5520	 * block on the destination stripes.
5521	 */
5522	first_sector =
5523		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
5524				     1, &dd_idx, NULL);
5525	last_sector =
5526		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
5527					    * new_data_disks - 1),
5528				     1, &dd_idx, NULL);
5529	if (last_sector >= mddev->dev_sectors)
5530		last_sector = mddev->dev_sectors - 1;
5531	while (first_sector <= last_sector) {
5532		sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
5533		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
5534		set_bit(STRIPE_HANDLE, &sh->state);
5535		raid5_release_stripe(sh);
5536		first_sector += STRIPE_SECTORS;
5537	}
5538	/* Now that the sources are clearly marked, we can release
5539	 * the destination stripes
5540	 */
5541	while (!list_empty(&stripes)) {
5542		sh = list_entry(stripes.next, struct stripe_head, lru);
5543		list_del_init(&sh->lru);
5544		raid5_release_stripe(sh);
5545	}
5546	/* If this takes us to the resync_max point where we have to pause,
5547	 * then we need to write out the superblock.
5548	 */
5549	sector_nr += reshape_sectors;
5550	retn = reshape_sectors;
5551finish:
5552	if (mddev->curr_resync_completed > mddev->resync_max ||
5553	    (sector_nr - mddev->curr_resync_completed) * 2
5554	    >= mddev->resync_max - mddev->curr_resync_completed) {
5555		/* Cannot proceed until we've updated the superblock... */
5556		wait_event(conf->wait_for_overlap,
5557			   atomic_read(&conf->reshape_stripes) == 0
5558			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5559		if (atomic_read(&conf->reshape_stripes) != 0)
5560			goto ret;
5561		mddev->reshape_position = conf->reshape_progress;
5562		mddev->curr_resync_completed = sector_nr;
5563		conf->reshape_checkpoint = jiffies;
5564		set_bit(MD_CHANGE_DEVS, &mddev->flags);
5565		md_wakeup_thread(mddev->thread);
5566		wait_event(mddev->sb_wait,
5567			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
5568			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5569		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5570			goto ret;
5571		spin_lock_irq(&conf->device_lock);
5572		conf->reshape_safe = mddev->reshape_position;
5573		spin_unlock_irq(&conf->device_lock);
5574		wake_up(&conf->wait_for_overlap);
5575		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5576	}
5577ret:
5578	return retn;
5579}
5580
5581static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
5582					  int *skipped)
5583{
5584	struct r5conf *conf = mddev->private;
5585	struct stripe_head *sh;
5586	sector_t max_sector = mddev->dev_sectors;
5587	sector_t sync_blocks;
5588	int still_degraded = 0;
5589	int i;
5590
5591	if (sector_nr >= max_sector) {
5592		/* just being told to finish up .. nothing much to do */
5593
5594		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
5595			end_reshape(conf);
5596			return 0;
5597		}
5598
5599		if (mddev->curr_resync < max_sector) /* aborted */
5600			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
5601					&sync_blocks, 1);
5602		else /* completed sync */
5603			conf->fullsync = 0;
5604		bitmap_close_sync(mddev->bitmap);
5605
5606		return 0;
5607	}
5608
5609	/* Allow raid5_quiesce to complete */
5610	wait_event(conf->wait_for_overlap, conf->quiesce != 2);
5611
5612	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5613		return reshape_request(mddev, sector_nr, skipped);
5614
5615	/* No need to check resync_max as we never do more than one
5616	 * stripe, and as resync_max will always be on a chunk boundary,
5617	 * if the check in md_do_sync didn't fire, there is no chance
5618	 * of overstepping resync_max here
5619	 */
5620
5621	/* if there are too many failed drives and we are trying
5622	 * to resync, then assert that we are finished, because there is
5623	 * nothing we can do.
5624	 */
5625	if (mddev->degraded >= conf->max_degraded &&
5626	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5627		sector_t rv = mddev->dev_sectors - sector_nr;
5628		*skipped = 1;
5629		return rv;
5630	}
5631	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
5632	    !conf->fullsync &&
5633	    !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
5634	    sync_blocks >= STRIPE_SECTORS) {
5635		/* we can skip this block, and probably more */
5636		sync_blocks /= STRIPE_SECTORS;
5637		*skipped = 1;
5638		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
5639	}
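	/*
	 * Illustrative example: with 4K pages STRIPE_SECTORS is 8, so if
	 * the bitmap says the next 1001 sectors need no sync, the rounding
	 * above skips 125 whole stripes, i.e. 1000 sectors.
	 */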
5640
5641	bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
5642
5643	sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
5644	if (sh == NULL) {
5645		sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
5646		/* make sure we don't swamp the stripe cache if someone else
5647		 * is trying to get access
5648		 */
5649		schedule_timeout_uninterruptible(1);
5650	}
5651	/* Need to check if the array will still be degraded after recovery/resync.
5652	 * Note that with more than one drive failure it's possible we're rebuilding
5653	 * one drive while leaving another faulty drive in the array.
5654	 */
5655	rcu_read_lock();
5656	for (i = 0; i < conf->raid_disks; i++) {
5657		struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
5658
5659		if (rdev == NULL || test_bit(Faulty, &rdev->flags))
5660			still_degraded = 1;
5661	}
5662	rcu_read_unlock();
5663
5664	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
5665
5666	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
5667	set_bit(STRIPE_HANDLE, &sh->state);
5668
5669	raid5_release_stripe(sh);
5670
5671	return STRIPE_SECTORS;
5672}
5673
5674static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
5675{
5676	/* We may not be able to submit a whole bio at once as there
5677	 * may not be enough stripe_heads available.
5678	 * We cannot pre-allocate enough stripe_heads as we may need
5679	 * more than exist in the cache (if we ever allow larger chunks).
5680	 * So we do one stripe head at a time and record in
5681	 * ->bi_phys_segments how many have been done.
5682	 *
5683	 * We *know* that this entire raid_bio is in one chunk, so
5684	 * there will be only one 'dd_idx' and only one call to raid5_compute_sector is needed.
5685	 */
5686	struct stripe_head *sh;
5687	int dd_idx;
5688	sector_t sector, logical_sector, last_sector;
5689	int scnt = 0;
5690	int remaining;
5691	int handled = 0;
5692
5693	logical_sector = raid_bio->bi_iter.bi_sector &
5694		~((sector_t)STRIPE_SECTORS-1);
5695	sector = raid5_compute_sector(conf, logical_sector,
5696				      0, &dd_idx, NULL);
5697	last_sector = bio_end_sector(raid_bio);
5698
5699	for (; logical_sector < last_sector;
5700	     logical_sector += STRIPE_SECTORS,
5701		     sector += STRIPE_SECTORS,
5702		     scnt++) {
5703
5704		if (scnt < raid5_bi_processed_stripes(raid_bio))
5705			/* already done this stripe */
5706			continue;
5707
5708		sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
5709
5710		if (!sh) {
5711			/* failed to get a stripe - must wait */
5712			raid5_set_bi_processed_stripes(raid_bio, scnt);
5713			conf->retry_read_aligned = raid_bio;
5714			return handled;
5715		}
5716
5717		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
5718			raid5_release_stripe(sh);
5719			raid5_set_bi_processed_stripes(raid_bio, scnt);
5720			conf->retry_read_aligned = raid_bio;
5721			return handled;
5722		}
5723
5724		set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
5725		handle_stripe(sh);
5726		raid5_release_stripe(sh);
5727		handled++;
5728	}
5729	remaining = raid5_dec_bi_active_stripes(raid_bio);
5730	if (remaining == 0) {
5731		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
5732					 raid_bio, 0);
5733		bio_endio(raid_bio);
5734	}
5735	if (atomic_dec_and_test(&conf->active_aligned_reads))
5736		wake_up(&conf->wait_for_quiescent);
5737	return handled;
5738}
5739
5740static int handle_active_stripes(struct r5conf *conf, int group,
5741				 struct r5worker *worker,
5742				 struct list_head *temp_inactive_list)
5743{
5744	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
5745	int i, batch_size = 0, hash;
5746	bool release_inactive = false;
5747
5748	while (batch_size < MAX_STRIPE_BATCH &&
5749			(sh = __get_priority_stripe(conf, group)) != NULL)
5750		batch[batch_size++] = sh;
5751
5752	if (batch_size == 0) {
5753		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5754			if (!list_empty(temp_inactive_list + i))
5755				break;
5756		if (i == NR_STRIPE_HASH_LOCKS) {
5757			spin_unlock_irq(&conf->device_lock);
5758			r5l_flush_stripe_to_raid(conf->log);
5759			spin_lock_irq(&conf->device_lock);
5760			return batch_size;
5761		}
5762		release_inactive = true;
5763	}
5764	spin_unlock_irq(&conf->device_lock);
5765
5766	release_inactive_stripe_list(conf, temp_inactive_list,
5767				     NR_STRIPE_HASH_LOCKS);
5768
5769	r5l_flush_stripe_to_raid(conf->log);
5770	if (release_inactive) {
5771		spin_lock_irq(&conf->device_lock);
5772		return 0;
5773	}
5774
5775	for (i = 0; i < batch_size; i++)
5776		handle_stripe(batch[i]);
5777	r5l_write_stripe_run(conf->log);
5778
5779	cond_resched();
5780
5781	spin_lock_irq(&conf->device_lock);
5782	for (i = 0; i < batch_size; i++) {
5783		hash = batch[i]->hash_lock_index;
5784		__release_stripe(conf, batch[i], &temp_inactive_list[hash]);
5785	}
5786	return batch_size;
5787}
5788
5789static void raid5_do_work(struct work_struct *work)
5790{
5791	struct r5worker *worker = container_of(work, struct r5worker, work);
5792	struct r5worker_group *group = worker->group;
5793	struct r5conf *conf = group->conf;
5794	int group_id = group - conf->worker_groups;
5795	int handled;
5796	struct blk_plug plug;
5797
5798	pr_debug("+++ raid5worker active\n");
5799
5800	blk_start_plug(&plug);
5801	handled = 0;
5802	spin_lock_irq(&conf->device_lock);
5803	while (1) {
5804		int batch_size, released;
5805
5806		released = release_stripe_list(conf, worker->temp_inactive_list);
5807
5808		batch_size = handle_active_stripes(conf, group_id, worker,
5809						   worker->temp_inactive_list);
5810		worker->working = false;
5811		if (!batch_size && !released)
5812			break;
5813		handled += batch_size;
5814	}
5815	pr_debug("%d stripes handled\n", handled);
5816
5817	spin_unlock_irq(&conf->device_lock);
5818	blk_finish_plug(&plug);
5819
5820	pr_debug("--- raid5worker inactive\n");
5821}
5822
5823/*
5824 * This is our raid5 kernel thread.
5825 *
5826 * We scan the hash table for stripes which can be handled now.
5827 * During the scan, completed stripes are saved for us by the interrupt
5828 * handler, so that they will not have to wait for our next wakeup.
5829 */
5830static void raid5d(struct md_thread *thread)
5831{
5832	struct mddev *mddev = thread->mddev;
5833	struct r5conf *conf = mddev->private;
5834	int handled;
5835	struct blk_plug plug;
5836
5837	pr_debug("+++ raid5d active\n");
5838
5839	md_check_recovery(mddev);
5840
5841	if (!bio_list_empty(&conf->return_bi) &&
5842	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
5843		struct bio_list tmp = BIO_EMPTY_LIST;
5844		spin_lock_irq(&conf->device_lock);
5845		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
5846			bio_list_merge(&tmp, &conf->return_bi);
5847			bio_list_init(&conf->return_bi);
5848		}
5849		spin_unlock_irq(&conf->device_lock);
5850		return_io(&tmp);
5851	}
5852
5853	blk_start_plug(&plug);
5854	handled = 0;
5855	spin_lock_irq(&conf->device_lock);
5856	while (1) {
5857		struct bio *bio;
5858		int batch_size, released;
5859
5860		released = release_stripe_list(conf, conf->temp_inactive_list);
5861		if (released)
5862			clear_bit(R5_DID_ALLOC, &conf->cache_state);
5863
5864		if (!list_empty(&conf->bitmap_list)) {
5866			/* Now is a good time to flush some bitmap updates */
5867			conf->seq_flush++;
5868			spin_unlock_irq(&conf->device_lock);
5869			bitmap_unplug(mddev->bitmap);
5870			spin_lock_irq(&conf->device_lock);
5871			conf->seq_write = conf->seq_flush;
5872			activate_bit_delay(conf, conf->temp_inactive_list);
5873		}
5874		raid5_activate_delayed(conf);
5875
5876		while ((bio = remove_bio_from_retry(conf))) {
5877			int ok;
5878			spin_unlock_irq(&conf->device_lock);
5879			ok = retry_aligned_read(conf, bio);
5880			spin_lock_irq(&conf->device_lock);
5881			if (!ok)
5882				break;
5883			handled++;
5884		}
5885
5886		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
5887						   conf->temp_inactive_list);
5888		if (!batch_size && !released)
5889			break;
5890		handled += batch_size;
5891
5892		if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
5893			spin_unlock_irq(&conf->device_lock);
5894			md_check_recovery(mddev);
5895			spin_lock_irq(&conf->device_lock);
5896		}
5897	}
5898	pr_debug("%d stripes handled\n", handled);
5899
5900	spin_unlock_irq(&conf->device_lock);
5901	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
5902	    mutex_trylock(&conf->cache_size_mutex)) {
5903		grow_one_stripe(conf, __GFP_NOWARN);
5904		/* Set flag even if allocation failed.  This helps
5905		 * slow down allocation requests when mem is short
5906		 */
5907		set_bit(R5_DID_ALLOC, &conf->cache_state);
5908		mutex_unlock(&conf->cache_size_mutex);
5909	}
5910
5911	r5l_flush_stripe_to_raid(conf->log);
5912
5913	async_tx_issue_pending_all();
5914	blk_finish_plug(&plug);
5915
5916	pr_debug("--- raid5d inactive\n");
5917}
5918
5919static ssize_t
5920raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
5921{
5922	struct r5conf *conf;
5923	int ret = 0;
5924	spin_lock(&mddev->lock);
5925	conf = mddev->private;
5926	if (conf)
5927		ret = sprintf(page, "%d\n", conf->min_nr_stripes);
5928	spin_unlock(&mddev->lock);
5929	return ret;
5930}
5931
5932int
5933raid5_set_cache_size(struct mddev *mddev, int size)
5934{
5935	struct r5conf *conf = mddev->private;
5936	int err;
5937
5938	if (size <= 16 || size > 32768)
5939		return -EINVAL;
5940
5941	conf->min_nr_stripes = size;
5942	mutex_lock(&conf->cache_size_mutex);
5943	while (size < conf->max_nr_stripes &&
5944	       drop_one_stripe(conf))
5945		;
5946	mutex_unlock(&conf->cache_size_mutex);
5947
5948
5949	err = md_allow_write(mddev);
5950	if (err)
5951		return err;
5952
5953	mutex_lock(&conf->cache_size_mutex);
5954	while (size > conf->max_nr_stripes)
5955		if (!grow_one_stripe(conf, GFP_KERNEL))
5956			break;
5957	mutex_unlock(&conf->cache_size_mutex);
5958
5959	return 0;
5960}
5961EXPORT_SYMBOL(raid5_set_cache_size);
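/*
 * Usage note (illustrative): raid5_set_cache_size() is what a write to the
 * stripe_cache_size sysfs attribute below ends up calling, roughly:
 *
 *	err = raid5_set_cache_size(mddev, 4096);
 *
 * Values of 16 or less and values above 32768 are rejected; growing the
 * cache allocates stripes with GFP_KERNEL, shrinking drops them one at a
 * time under cache_size_mutex.
 */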
5962
5963static ssize_t
5964raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
5965{
5966	struct r5conf *conf;
5967	unsigned long new;
5968	int err;
5969
5970	if (len >= PAGE_SIZE)
5971		return -EINVAL;
5972	if (kstrtoul(page, 10, &new))
5973		return -EINVAL;
5974	err = mddev_lock(mddev);
5975	if (err)
5976		return err;
5977	conf = mddev->private;
5978	if (!conf)
5979		err = -ENODEV;
5980	else
5981		err = raid5_set_cache_size(mddev, new);
5982	mddev_unlock(mddev);
5983
5984	return err ?: len;
5985}
5986
5987static struct md_sysfs_entry
5988raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
5989				raid5_show_stripe_cache_size,
5990				raid5_store_stripe_cache_size);
5991
5992static ssize_t
5993raid5_show_rmw_level(struct mddev  *mddev, char *page)
5994{
5995	struct r5conf *conf = mddev->private;
5996	if (conf)
5997		return sprintf(page, "%d\n", conf->rmw_level);
5998	else
5999		return 0;
6000}
6001
6002static ssize_t
6003raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
6004{
6005	struct r5conf *conf = mddev->private;
6006	unsigned long new;
6007
6008	if (!conf)
6009		return -ENODEV;
6010
6011	if (len >= PAGE_SIZE)
6012		return -EINVAL;
6013
6014	if (kstrtoul(page, 10, &new))
6015		return -EINVAL;
6016
6017	if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6018		return -EINVAL;
6019
6020	if (new != PARITY_DISABLE_RMW &&
6021	    new != PARITY_ENABLE_RMW &&
6022	    new != PARITY_PREFER_RMW)
6023		return -EINVAL;
6024
6025	conf->rmw_level = new;
6026	return len;
6027}
6028
6029static struct md_sysfs_entry
6030raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6031			 raid5_show_rmw_level,
6032			 raid5_store_rmw_level);
6033
6034
6035static ssize_t
6036raid5_show_preread_threshold(struct mddev *mddev, char *page)
6037{
6038	struct r5conf *conf;
6039	int ret = 0;
6040	spin_lock(&mddev->lock);
6041	conf = mddev->private;
6042	if (conf)
6043		ret = sprintf(page, "%d\n", conf->bypass_threshold);
6044	spin_unlock(&mddev->lock);
6045	return ret;
6046}
6047
6048static ssize_t
6049raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6050{
6051	struct r5conf *conf;
6052	unsigned long new;
6053	int err;
6054
6055	if (len >= PAGE_SIZE)
6056		return -EINVAL;
6057	if (kstrtoul(page, 10, &new))
6058		return -EINVAL;
6059
6060	err = mddev_lock(mddev);
6061	if (err)
6062		return err;
6063	conf = mddev->private;
6064	if (!conf)
6065		err = -ENODEV;
6066	else if (new > conf->min_nr_stripes)
6067		err = -EINVAL;
6068	else
6069		conf->bypass_threshold = new;
6070	mddev_unlock(mddev);
6071	return err ?: len;
6072}
6073
6074static struct md_sysfs_entry
6075raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6076					S_IRUGO | S_IWUSR,
6077					raid5_show_preread_threshold,
6078					raid5_store_preread_threshold);
6079
6080static ssize_t
6081raid5_show_skip_copy(struct mddev *mddev, char *page)
6082{
6083	struct r5conf *conf;
6084	int ret = 0;
6085	spin_lock(&mddev->lock);
6086	conf = mddev->private;
6087	if (conf)
6088		ret = sprintf(page, "%d\n", conf->skip_copy);
6089	spin_unlock(&mddev->lock);
6090	return ret;
6091}
6092
6093static ssize_t
6094raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6095{
6096	struct r5conf *conf;
6097	unsigned long new;
6098	int err;
6099
6100	if (len >= PAGE_SIZE)
6101		return -EINVAL;
6102	if (kstrtoul(page, 10, &new))
6103		return -EINVAL;
6104	new = !!new;
6105
6106	err = mddev_lock(mddev);
6107	if (err)
6108		return err;
6109	conf = mddev->private;
6110	if (!conf)
6111		err = -ENODEV;
6112	else if (new != conf->skip_copy) {
6113		mddev_suspend(mddev);
6114		conf->skip_copy = new;
6115		if (new)
6116			mddev->queue->backing_dev_info.capabilities |=
6117				BDI_CAP_STABLE_WRITES;
6118		else
6119			mddev->queue->backing_dev_info.capabilities &=
6120				~BDI_CAP_STABLE_WRITES;
6121		mddev_resume(mddev);
6122	}
6123	mddev_unlock(mddev);
6124	return err ?: len;
6125}
6126
6127static struct md_sysfs_entry
6128raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6129					raid5_show_skip_copy,
6130					raid5_store_skip_copy);
6131
6132static ssize_t
6133stripe_cache_active_show(struct mddev *mddev, char *page)
6134{
6135	struct r5conf *conf = mddev->private;
6136	if (conf)
6137		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6138	else
6139		return 0;
6140}
6141
6142static struct md_sysfs_entry
6143raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6144
6145static ssize_t
6146raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6147{
6148	struct r5conf *conf;
6149	int ret = 0;
6150	spin_lock(&mddev->lock);
6151	conf = mddev->private;
6152	if (conf)
6153		ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6154	spin_unlock(&mddev->lock);
6155	return ret;
6156}
6157
6158static int alloc_thread_groups(struct r5conf *conf, int cnt,
6159			       int *group_cnt,
6160			       int *worker_cnt_per_group,
6161			       struct r5worker_group **worker_groups);
6162static ssize_t
6163raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6164{
6165	struct r5conf *conf;
6166	unsigned long new;
6167	int err;
6168	struct r5worker_group *new_groups, *old_groups;
6169	int group_cnt, worker_cnt_per_group;
6170
6171	if (len >= PAGE_SIZE)
6172		return -EINVAL;
6173	if (kstrtoul(page, 10, &new))
6174		return -EINVAL;
6175
6176	err = mddev_lock(mddev);
6177	if (err)
6178		return err;
6179	conf = mddev->private;
6180	if (!conf)
6181		err = -ENODEV;
6182	else if (new != conf->worker_cnt_per_group) {
6183		mddev_suspend(mddev);
6184
6185		old_groups = conf->worker_groups;
6186		if (old_groups)
6187			flush_workqueue(raid5_wq);
6188
6189		err = alloc_thread_groups(conf, new,
6190					  &group_cnt, &worker_cnt_per_group,
6191					  &new_groups);
6192		if (!err) {
6193			spin_lock_irq(&conf->device_lock);
6194			conf->group_cnt = group_cnt;
6195			conf->worker_cnt_per_group = worker_cnt_per_group;
6196			conf->worker_groups = new_groups;
6197			spin_unlock_irq(&conf->device_lock);
6198
6199			if (old_groups)
6200				kfree(old_groups[0].workers);
6201			kfree(old_groups);
6202		}
6203		mddev_resume(mddev);
6204	}
6205	mddev_unlock(mddev);
6206
6207	return err ?: len;
6208}
6209
6210static struct md_sysfs_entry
6211raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6212				raid5_show_group_thread_cnt,
6213				raid5_store_group_thread_cnt);
6214
6215static struct attribute *raid5_attrs[] =  {
6216	&raid5_stripecache_size.attr,
6217	&raid5_stripecache_active.attr,
6218	&raid5_preread_bypass_threshold.attr,
6219	&raid5_group_thread_cnt.attr,
6220	&raid5_skip_copy.attr,
6221	&raid5_rmw_level.attr,
6222	NULL,
6223};
6224static struct attribute_group raid5_attrs_group = {
6225	.name = NULL,
6226	.attrs = raid5_attrs,
6227};
6228
6229static int alloc_thread_groups(struct r5conf *conf, int cnt,
6230			       int *group_cnt,
6231			       int *worker_cnt_per_group,
6232			       struct r5worker_group **worker_groups)
6233{
6234	int i, j, k;
6235	ssize_t size;
6236	struct r5worker *workers;
6237
6238	*worker_cnt_per_group = cnt;
6239	if (cnt == 0) {
6240		*group_cnt = 0;
6241		*worker_groups = NULL;
6242		return 0;
6243	}
6244	*group_cnt = num_possible_nodes();
6245	size = sizeof(struct r5worker) * cnt;
6246	workers = kzalloc(size * *group_cnt, GFP_NOIO);
6247	*worker_groups = kzalloc(sizeof(struct r5worker_group) *
6248				*group_cnt, GFP_NOIO);
6249	if (!*worker_groups || !workers) {
6250		kfree(workers);
6251		kfree(*worker_groups);
6252		return -ENOMEM;
6253	}
6254
6255	for (i = 0; i < *group_cnt; i++) {
6256		struct r5worker_group *group;
6257
6258		group = &(*worker_groups)[i];
6259		INIT_LIST_HEAD(&group->handle_list);
6260		group->conf = conf;
6261		group->workers = workers + i * cnt;
6262
6263		for (j = 0; j < cnt; j++) {
6264			struct r5worker *worker = group->workers + j;
6265			worker->group = group;
6266			INIT_WORK(&worker->work, raid5_do_work);
6267
6268			for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6269				INIT_LIST_HEAD(worker->temp_inactive_list + k);
6270		}
6271	}
6272
6273	return 0;
6274}
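/*
 * Illustrative example: on a machine with two NUMA nodes and
 * group_thread_cnt set to 4, alloc_thread_groups() returns
 * *group_cnt == 2 and allocates 2 * 4 r5worker structures;
 * workers 0-3 belong to node 0's group and workers 4-7 to node 1's group.
 */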
6275
6276static void free_thread_groups(struct r5conf *conf)
6277{
6278	if (conf->worker_groups)
6279		kfree(conf->worker_groups[0].workers);
6280	kfree(conf->worker_groups);
6281	conf->worker_groups = NULL;
6282}
6283
6284static sector_t
6285raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6286{
6287	struct r5conf *conf = mddev->private;
6288
6289	if (!sectors)
6290		sectors = mddev->dev_sectors;
6291	if (!raid_disks)
6292		/* size is defined by the smallest of previous and new size */
6293		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6294
6295	sectors &= ~((sector_t)conf->chunk_sectors - 1);
6296	sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
6297	return sectors * (raid_disks - conf->max_degraded);
6298}
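/*
 * Worked example (illustrative, assuming old and new chunk sizes are
 * equal): a 5-device RAID5 (max_degraded == 1) built from
 * 1953525168-sector components with 1024-sector chunks rounds each
 * component down to 1953524736 sectors and exports
 * 1953524736 * 4 = 7814098944 sectors (about 3.64 TiB).
 */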
6299
6300static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6301{
6302	safe_put_page(percpu->spare_page);
6303	if (percpu->scribble)
6304		flex_array_free(percpu->scribble);
6305	percpu->spare_page = NULL;
6306	percpu->scribble = NULL;
6307}
6308
6309static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6310{
6311	if (conf->level == 6 && !percpu->spare_page)
6312		percpu->spare_page = alloc_page(GFP_KERNEL);
6313	if (!percpu->scribble)
6314		percpu->scribble = scribble_alloc(max(conf->raid_disks,
6315						      conf->previous_raid_disks),
6316						  max(conf->chunk_sectors,
6317						      conf->prev_chunk_sectors)
6318						   / STRIPE_SECTORS,
6319						  GFP_KERNEL);
6320
6321	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
6322		free_scratch_buffer(conf, percpu);
6323		return -ENOMEM;
6324	}
6325
6326	return 0;
6327}
6328
6329static void raid5_free_percpu(struct r5conf *conf)
6330{
6331	unsigned long cpu;
6332
6333	if (!conf->percpu)
6334		return;
6335
6336#ifdef CONFIG_HOTPLUG_CPU
6337	unregister_cpu_notifier(&conf->cpu_notify);
6338#endif
6339
6340	get_online_cpus();
6341	for_each_possible_cpu(cpu)
6342		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6343	put_online_cpus();
6344
6345	free_percpu(conf->percpu);
6346}
6347
6348static void free_conf(struct r5conf *conf)
6349{
6350	if (conf->log)
6351		r5l_exit_log(conf->log);
6352	if (conf->shrinker.seeks)
6353		unregister_shrinker(&conf->shrinker);
6354
6355	free_thread_groups(conf);
6356	shrink_stripes(conf);
6357	raid5_free_percpu(conf);
6358	kfree(conf->disks);
6359	kfree(conf->stripe_hashtbl);
6360	kfree(conf);
6361}
6362
6363#ifdef CONFIG_HOTPLUG_CPU
6364static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
6365			      void *hcpu)
6366{
6367	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
6368	long cpu = (long)hcpu;
6369	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6370
6371	switch (action) {
6372	case CPU_UP_PREPARE:
6373	case CPU_UP_PREPARE_FROZEN:
6374		if (alloc_scratch_buffer(conf, percpu)) {
6375			pr_err("%s: failed memory allocation for cpu%ld\n",
6376			       __func__, cpu);
6377			return notifier_from_errno(-ENOMEM);
6378		}
6379		break;
6380	case CPU_DEAD:
6381	case CPU_DEAD_FROZEN:
6382	case CPU_UP_CANCELED:
6383	case CPU_UP_CANCELED_FROZEN:
6384		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6385		break;
6386	default:
6387		break;
6388	}
6389	return NOTIFY_OK;
6390}
6391#endif
6392
6393static int raid5_alloc_percpu(struct r5conf *conf)
6394{
6395	unsigned long cpu;
6396	int err = 0;
6397
6398	conf->percpu = alloc_percpu(struct raid5_percpu);
6399	if (!conf->percpu)
6400		return -ENOMEM;
6401
6402#ifdef CONFIG_HOTPLUG_CPU
6403	conf->cpu_notify.notifier_call = raid456_cpu_notify;
6404	conf->cpu_notify.priority = 0;
6405	err = register_cpu_notifier(&conf->cpu_notify);
6406	if (err)
6407		return err;
6408#endif
6409
6410	get_online_cpus();
6411	for_each_present_cpu(cpu) {
6412		err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6413		if (err) {
6414			pr_err("%s: failed memory allocation for cpu%ld\n",
6415			       __func__, cpu);
6416			break;
6417		}
6418	}
6419	put_online_cpus();
6420
6421	if (!err) {
6422		conf->scribble_disks = max(conf->raid_disks,
6423			conf->previous_raid_disks);
6424		conf->scribble_sectors = max(conf->chunk_sectors,
6425			conf->prev_chunk_sectors);
6426	}
6427	return err;
6428}
6429
6430static unsigned long raid5_cache_scan(struct shrinker *shrink,
6431				      struct shrink_control *sc)
6432{
6433	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6434	unsigned long ret = SHRINK_STOP;
6435
6436	if (mutex_trylock(&conf->cache_size_mutex)) {
6437		ret = 0;
6438		while (ret < sc->nr_to_scan &&
6439		       conf->max_nr_stripes > conf->min_nr_stripes) {
6440			if (drop_one_stripe(conf) == 0) {
6441				ret = SHRINK_STOP;
6442				break;
6443			}
6444			ret++;
6445		}
6446		mutex_unlock(&conf->cache_size_mutex);
6447	}
6448	return ret;
6449}
6450
6451static unsigned long raid5_cache_count(struct shrinker *shrink,
6452				       struct shrink_control *sc)
6453{
6454	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6455
6456	if (conf->max_nr_stripes < conf->min_nr_stripes)
6457		/* unlikely, but not impossible */
6458		return 0;
6459	return conf->max_nr_stripes - conf->min_nr_stripes;
6460}
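/*
 * Illustrative example: with min_nr_stripes == 256 and a cache that has
 * grown to max_nr_stripes == 1024, raid5_cache_count() reports 768
 * reclaimable objects and raid5_cache_scan() will free at most that many,
 * never shrinking the cache below min_nr_stripes.
 */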
6461
6462static struct r5conf *setup_conf(struct mddev *mddev)
6463{
6464	struct r5conf *conf;
6465	int raid_disk, memory, max_disks;
6466	struct md_rdev *rdev;
6467	struct disk_info *disk;
6468	char pers_name[6];
6469	int i;
6470	int group_cnt, worker_cnt_per_group;
6471	struct r5worker_group *new_group;
6472
6473	if (mddev->new_level != 5
6474	    && mddev->new_level != 4
6475	    && mddev->new_level != 6) {
6476		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6477		       mdname(mddev), mddev->new_level);
6478		return ERR_PTR(-EIO);
6479	}
6480	if ((mddev->new_level == 5
6481	     && !algorithm_valid_raid5(mddev->new_layout)) ||
6482	    (mddev->new_level == 6
6483	     && !algorithm_valid_raid6(mddev->new_layout))) {
6484		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
6485		       mdname(mddev), mddev->new_layout);
6486		return ERR_PTR(-EIO);
6487	}
6488	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6489		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6490		       mdname(mddev), mddev->raid_disks);
6491		return ERR_PTR(-EINVAL);
6492	}
6493
6494	if (!mddev->new_chunk_sectors ||
6495	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6496	    !is_power_of_2(mddev->new_chunk_sectors)) {
6497		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
6498		       mdname(mddev), mddev->new_chunk_sectors << 9);
6499		return ERR_PTR(-EINVAL);
6500	}
6501
6502	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
6503	if (conf == NULL)
6504		goto abort;
6505	/* Don't enable multi-threading by default */
6506	if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6507				 &new_group)) {
6508		conf->group_cnt = group_cnt;
6509		conf->worker_cnt_per_group = worker_cnt_per_group;
6510		conf->worker_groups = new_group;
6511	} else
6512		goto abort;
6513	spin_lock_init(&conf->device_lock);
6514	seqcount_init(&conf->gen_lock);
6515	mutex_init(&conf->cache_size_mutex);
6516	init_waitqueue_head(&conf->wait_for_quiescent);
6517	init_waitqueue_head(&conf->wait_for_stripe);
6518	init_waitqueue_head(&conf->wait_for_overlap);
6519	INIT_LIST_HEAD(&conf->handle_list);
6520	INIT_LIST_HEAD(&conf->hold_list);
6521	INIT_LIST_HEAD(&conf->delayed_list);
6522	INIT_LIST_HEAD(&conf->bitmap_list);
6523	bio_list_init(&conf->return_bi);
6524	init_llist_head(&conf->released_stripes);
6525	atomic_set(&conf->active_stripes, 0);
6526	atomic_set(&conf->preread_active_stripes, 0);
6527	atomic_set(&conf->active_aligned_reads, 0);
6528	conf->bypass_threshold = BYPASS_THRESHOLD;
6529	conf->recovery_disabled = mddev->recovery_disabled - 1;
6530
6531	conf->raid_disks = mddev->raid_disks;
6532	if (mddev->reshape_position == MaxSector)
6533		conf->previous_raid_disks = mddev->raid_disks;
6534	else
6535		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
6536	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
6537
6538	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
6539			      GFP_KERNEL);
6540	if (!conf->disks)
6541		goto abort;
6542
6543	conf->mddev = mddev;
6544
6545	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
6546		goto abort;
6547
6548	/* We init hash_locks[0] separately so that it can be used
6549	 * as the reference lock in the spin_lock_nest_lock() call
6550	 * in lock_all_device_hash_locks_irq in order to convince
6551	 * lockdep that we know what we are doing.
6552	 */
6553	spin_lock_init(conf->hash_locks);
6554	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
6555		spin_lock_init(conf->hash_locks + i);
6556
6557	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6558		INIT_LIST_HEAD(conf->inactive_list + i);
6559
6560	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6561		INIT_LIST_HEAD(conf->temp_inactive_list + i);
6562
6563	conf->level = mddev->new_level;
6564	conf->chunk_sectors = mddev->new_chunk_sectors;
6565	if (raid5_alloc_percpu(conf) != 0)
6566		goto abort;
6567
6568	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
6569
6570	rdev_for_each(rdev, mddev) {
6571		raid_disk = rdev->raid_disk;
6572		if (raid_disk >= max_disks
6573		    || raid_disk < 0 || test_bit(Journal, &rdev->flags))
6574			continue;
6575		disk = conf->disks + raid_disk;
6576
6577		if (test_bit(Replacement, &rdev->flags)) {
6578			if (disk->replacement)
6579				goto abort;
6580			disk->replacement = rdev;
6581		} else {
6582			if (disk->rdev)
6583				goto abort;
6584			disk->rdev = rdev;
6585		}
6586
6587		if (test_bit(In_sync, &rdev->flags)) {
6588			char b[BDEVNAME_SIZE];
6589			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
6590			       " disk %d\n",
6591			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
6592		} else if (rdev->saved_raid_disk != raid_disk)
6593			/* Cannot rely on bitmap to complete recovery */
6594			conf->fullsync = 1;
6595	}
6596
6597	conf->level = mddev->new_level;
6598	if (conf->level == 6) {
6599		conf->max_degraded = 2;
6600		if (raid6_call.xor_syndrome)
6601			conf->rmw_level = PARITY_ENABLE_RMW;
6602		else
6603			conf->rmw_level = PARITY_DISABLE_RMW;
6604	} else {
6605		conf->max_degraded = 1;
6606		conf->rmw_level = PARITY_ENABLE_RMW;
6607	}
6608	conf->algorithm = mddev->new_layout;
6609	conf->reshape_progress = mddev->reshape_position;
6610	if (conf->reshape_progress != MaxSector) {
6611		conf->prev_chunk_sectors = mddev->chunk_sectors;
6612		conf->prev_algo = mddev->layout;
6613	} else {
6614		conf->prev_chunk_sectors = conf->chunk_sectors;
6615		conf->prev_algo = conf->algorithm;
6616	}
6617
6618	conf->min_nr_stripes = NR_STRIPES;
6619	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
6620		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
6621	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
6622	if (grow_stripes(conf, conf->min_nr_stripes)) {
6623		printk(KERN_ERR
6624		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
6625		       mdname(mddev), memory);
6626		goto abort;
6627	} else
6628		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
6629		       mdname(mddev), memory);
6630	/*
6631	 * Losing a stripe head costs more than the time to refill it,
6632	 * as it reduces the queue depth and so can hurt throughput.
6633	 * So set it rather large, scaled by number of devices.
6634	 */
6635	conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
6636	conf->shrinker.scan_objects = raid5_cache_scan;
6637	conf->shrinker.count_objects = raid5_cache_count;
6638	conf->shrinker.batch = 128;
6639	conf->shrinker.flags = 0;
6640	register_shrinker(&conf->shrinker);
6641
6642	sprintf(pers_name, "raid%d", mddev->new_level);
6643	conf->thread = md_register_thread(raid5d, mddev, pers_name);
6644	if (!conf->thread) {
6645		printk(KERN_ERR
6646		       "md/raid:%s: couldn't allocate thread.\n",
6647		       mdname(mddev));
6648		goto abort;
6649	}
6650
6651	return conf;
6652
6653 abort:
6654	if (conf) {
6655		free_conf(conf);
6656		return ERR_PTR(-EIO);
6657	} else
6658		return ERR_PTR(-ENOMEM);
6659}
6660
6661static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
6662{
6663	switch (algo) {
6664	case ALGORITHM_PARITY_0:
6665		if (raid_disk < max_degraded)
6666			return 1;
6667		break;
6668	case ALGORITHM_PARITY_N:
6669		if (raid_disk >= raid_disks - max_degraded)
6670			return 1;
6671		break;
6672	case ALGORITHM_PARITY_0_6:
6673		if (raid_disk == 0 ||
6674		    raid_disk == raid_disks - 1)
6675			return 1;
6676		break;
6677	case ALGORITHM_LEFT_ASYMMETRIC_6:
6678	case ALGORITHM_RIGHT_ASYMMETRIC_6:
6679	case ALGORITHM_LEFT_SYMMETRIC_6:
6680	case ALGORITHM_RIGHT_SYMMETRIC_6:
6681		if (raid_disk == raid_disks - 1)
6682			return 1;
6683	}
6684	return 0;
6685}
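/*
 * Illustrative example: for a RAID6 array using ALGORITHM_PARITY_0
 * (max_degraded == 2), devices 0 and 1 hold only parity, so only_parity()
 * returns 1 for raid_disk 0 or 1 and 0 for the data devices; raid5_run()
 * uses this to avoid treating out-of-date parity-only devices as making
 * the array dirty.
 */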
6686
6687static int raid5_run(struct mddev *mddev)
6688{
6689	struct r5conf *conf;
6690	int working_disks = 0;
6691	int dirty_parity_disks = 0;
6692	struct md_rdev *rdev;
6693	struct md_rdev *journal_dev = NULL;
6694	sector_t reshape_offset = 0;
6695	int i;
6696	long long min_offset_diff = 0;
6697	int first = 1;
6698
6699	if (mddev->recovery_cp != MaxSector)
6700		printk(KERN_NOTICE "md/raid:%s: not clean"
6701		       " -- starting background reconstruction\n",
6702		       mdname(mddev));
6703
6704	rdev_for_each(rdev, mddev) {
6705		long long diff;
6706
6707		if (test_bit(Journal, &rdev->flags)) {
6708			journal_dev = rdev;
6709			continue;
6710		}
6711		if (rdev->raid_disk < 0)
6712			continue;
6713		diff = (rdev->new_data_offset - rdev->data_offset);
6714		if (first) {
6715			min_offset_diff = diff;
6716			first = 0;
6717		} else if (mddev->reshape_backwards &&
6718			 diff < min_offset_diff)
6719			min_offset_diff = diff;
6720		else if (!mddev->reshape_backwards &&
6721			 diff > min_offset_diff)
6722			min_offset_diff = diff;
6723	}
6724
6725	if (mddev->reshape_position != MaxSector) {
6726		/* Check that we can continue the reshape.
6727		 * Difficulties arise if the stripe we would write to
6728		 * next is at or after the stripe we would read from next.
6729		 * For a reshape that changes the number of devices, this
6730		 * is only possible for a very short time, and mdadm makes
6731		 * sure that time appears to have passed before assembling
6732		 * the array.  So we fail if that time hasn't passed.
6733		 * For a reshape that keeps the number of devices the same,
6734		 * mdadm must be monitoring the reshape and keeping the
6735		 * critical areas read-only and backed up.  It will start
6736		 * the array in read-only mode, so we check for that.
6737		 */
6738		sector_t here_new, here_old;
6739		int old_disks;
6740		int max_degraded = (mddev->level == 6 ? 2 : 1);
6741		int chunk_sectors;
6742		int new_data_disks;
6743
6744		if (journal_dev) {
6745			printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
6746			       mdname(mddev));
6747			return -EINVAL;
6748		}
6749
6750		if (mddev->new_level != mddev->level) {
6751			printk(KERN_ERR "md/raid:%s: unsupported reshape "
6752			       "required - aborting.\n",
6753			       mdname(mddev));
6754			return -EINVAL;
6755		}
6756		old_disks = mddev->raid_disks - mddev->delta_disks;
6757		/* reshape_position must be on a new-stripe boundary, and one
6758		 * further up in new geometry must map after here in old
6759		 * geometry.
6760		 * If the chunk sizes are different, then as we perform reshape
6761		 * in units of the largest of the two, reshape_position needs
6762		 * to be a multiple of the largest chunk size times new data disks.
6763		 */
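		/*
		 * Worked example (illustrative): growing to 5 devices with a
		 * 512 KiB (1024-sector) chunk gives new_data_disks == 4, so
		 * reshape_position must be a multiple of 1024 * 4 = 4096
		 * sectors (2 MiB) or the sector_div() check below fails.
		 */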
6764		here_new = mddev->reshape_position;
6765		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
6766		new_data_disks = mddev->raid_disks - max_degraded;
6767		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
6768			printk(KERN_ERR "md/raid:%s: reshape_position not "
6769			       "on a stripe boundary\n", mdname(mddev));
6770			return -EINVAL;
6771		}
6772		reshape_offset = here_new * chunk_sectors;
6773		/* here_new is the stripe we will write to */
6774		here_old = mddev->reshape_position;
6775		sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
6776		/* here_old is the first stripe that we might need to read
6777		 * from */
6778		if (mddev->delta_disks == 0) {
6779			/* We cannot be sure it is safe to start an in-place
6780			 * reshape.  It is only safe if user-space is monitoring
6781			 * and taking constant backups.
6782			 * mdadm always starts a situation like this in
6783			 * readonly mode so it can take control before
6784			 * allowing any writes.  So just check for that.
6785			 */
6786			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
6787			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
6788				/* not really in-place - so OK */;
6789			else if (mddev->ro == 0) {
6790				printk(KERN_ERR "md/raid:%s: in-place reshape "
6791				       "must be started in read-only mode "
6792				       "- aborting\n",
6793				       mdname(mddev));
6794				return -EINVAL;
6795			}
6796		} else if (mddev->reshape_backwards
6797		    ? (here_new * chunk_sectors + min_offset_diff <=
6798		       here_old * chunk_sectors)
6799		    : (here_new * chunk_sectors >=
6800		       here_old * chunk_sectors + (-min_offset_diff))) {
6801			/* Reading from the same stripe as writing to - bad */
6802			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
6803			       "auto-recovery - aborting.\n",
6804			       mdname(mddev));
6805			return -EINVAL;
6806		}
6807		printk(KERN_INFO "md/raid:%s: reshape will continue\n",
6808		       mdname(mddev));
6809		/* OK, we should be able to continue; */
6810	} else {
6811		BUG_ON(mddev->level != mddev->new_level);
6812		BUG_ON(mddev->layout != mddev->new_layout);
6813		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
6814		BUG_ON(mddev->delta_disks != 0);
6815	}
6816
6817	if (mddev->private == NULL)
6818		conf = setup_conf(mddev);
6819	else
6820		conf = mddev->private;
6821
6822	if (IS_ERR(conf))
6823		return PTR_ERR(conf);
6824
6825	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
6826		printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
6827		       mdname(mddev));
6828		mddev->ro = 1;
6829		set_disk_ro(mddev->gendisk, 1);
6830	}
6831
6832	conf->min_offset_diff = min_offset_diff;
6833	mddev->thread = conf->thread;
6834	conf->thread = NULL;
6835	mddev->private = conf;
6836
6837	for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
6838	     i++) {
6839		rdev = conf->disks[i].rdev;
6840		if (!rdev && conf->disks[i].replacement) {
6841			/* The replacement is all we have yet */
6842			rdev = conf->disks[i].replacement;
6843			conf->disks[i].replacement = NULL;
6844			clear_bit(Replacement, &rdev->flags);
6845			conf->disks[i].rdev = rdev;
6846		}
6847		if (!rdev)
6848			continue;
6849		if (conf->disks[i].replacement &&
6850		    conf->reshape_progress != MaxSector) {
6851			/* replacements and reshape simply do not mix. */
6852			printk(KERN_ERR "md: cannot handle concurrent "
6853			       "replacement and reshape.\n");
6854			goto abort;
6855		}
6856		if (test_bit(In_sync, &rdev->flags)) {
6857			working_disks++;
6858			continue;
6859		}
6860		/* This disc is not fully in-sync.  However if it
6861		 * just stored parity (beyond the recovery_offset),
6862		 * then we don't need to be concerned about the
6863		 * array being dirty.
6864		 * When reshape goes 'backwards', we never have
6865		 * partially completed devices, so we only need
6866		 * to worry about reshape going forwards.
6867		 */
6868		/* Hack because v0.91 doesn't store recovery_offset properly. */
6869		if (mddev->major_version == 0 &&
6870		    mddev->minor_version > 90)
6871			rdev->recovery_offset = reshape_offset;
6872
6873		if (rdev->recovery_offset < reshape_offset) {
6874			/* We need to check old and new layout */
6875			if (!only_parity(rdev->raid_disk,
6876					 conf->algorithm,
6877					 conf->raid_disks,
6878					 conf->max_degraded))
6879				continue;
6880		}
6881		if (!only_parity(rdev->raid_disk,
6882				 conf->prev_algo,
6883				 conf->previous_raid_disks,
6884				 conf->max_degraded))
6885			continue;
6886		dirty_parity_disks++;
6887	}
6888
6889	/*
6890	 * 0 for a fully functional array, 1 or 2 for a degraded array.
6891	 */
6892	mddev->degraded = calc_degraded(conf);
6893
6894	if (has_failed(conf)) {
6895		printk(KERN_ERR "md/raid:%s: not enough operational devices"
6896			" (%d/%d failed)\n",
6897			mdname(mddev), mddev->degraded, conf->raid_disks);
6898		goto abort;
6899	}
6900
6901	/* device size must be a multiple of chunk size */
6902	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
6903	mddev->resync_max_sectors = mddev->dev_sectors;
6904
6905	if (mddev->degraded > dirty_parity_disks &&
6906	    mddev->recovery_cp != MaxSector) {
6907		if (mddev->ok_start_degraded)
6908			printk(KERN_WARNING
6909			       "md/raid:%s: starting dirty degraded array"
6910			       " - data corruption possible.\n",
6911			       mdname(mddev));
6912		else {
6913			printk(KERN_ERR
6914			       "md/raid:%s: cannot start dirty degraded array.\n",
6915			       mdname(mddev));
6916			goto abort;
6917		}
6918	}
6919
6920	if (mddev->degraded == 0)
6921		printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
6922		       " devices, algorithm %d\n", mdname(mddev), conf->level,
6923		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
6924		       mddev->new_layout);
6925	else
6926		printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
6927		       " out of %d devices, algorithm %d\n",
6928		       mdname(mddev), conf->level,
6929		       mddev->raid_disks - mddev->degraded,
6930		       mddev->raid_disks, mddev->new_layout);
6931
6932	print_raid5_conf(conf);
6933
6934	if (conf->reshape_progress != MaxSector) {
6935		conf->reshape_safe = conf->reshape_progress;
6936		atomic_set(&conf->reshape_stripes, 0);
6937		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6938		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6939		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6940		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6941		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
6942							"reshape");
6943	}
6944
6945	/* Ok, everything is just fine now */
6946	if (mddev->to_remove == &raid5_attrs_group)
6947		mddev->to_remove = NULL;
6948	else if (mddev->kobj.sd &&
6949	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
6950		printk(KERN_WARNING
6951		       "raid5: failed to create sysfs attributes for %s\n",
6952		       mdname(mddev));
6953	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
6954
6955	if (mddev->queue) {
6956		int chunk_size;
6957		bool discard_supported = true;
6958		/* read-ahead size must cover two whole stripes, which
6959		 * is 2 * (number of data disks) * chunksize, where the
6960		 * data disks exclude the parity devices
6961		 */
6962		int data_disks = conf->previous_raid_disks - conf->max_degraded;
6963		int stripe = data_disks *
6964			((mddev->chunk_sectors << 9) / PAGE_SIZE);
6965		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
6966			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
6967
6968		chunk_size = mddev->chunk_sectors << 9;
6969		blk_queue_io_min(mddev->queue, chunk_size);
6970		blk_queue_io_opt(mddev->queue, chunk_size *
6971				 (conf->raid_disks - conf->max_degraded));
6972		mddev->queue->limits.raid_partial_stripes_expensive = 1;
6973		/*
6974		 * We can only discard a whole stripe. It doesn't make sense to
6975		 * discard the data disks but write the parity disk
6976		 */
6977		stripe = stripe * PAGE_SIZE;
6978		/* Round up to a power of 2, as discard handling
6979		 * currently assumes that */
6980		while ((stripe-1) & stripe)
6981			stripe = (stripe | (stripe-1)) + 1;
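		/*
		 * Worked example: with 3 data disks and a 64K chunk,
		 * stripe = 3 * 65536 = 0x30000; one pass of the loop gives
		 * (0x30000 | 0x2ffff) + 1 = 0x40000, so a 192K stripe is
		 * rounded up to 256K for the discard limits below.
		 */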
6982		mddev->queue->limits.discard_alignment = stripe;
6983		mddev->queue->limits.discard_granularity = stripe;
6984		/*
6985		 * the unaligned part of a discard request will be ignored, so we
6986		 * can't guarantee discard_zeroes_data
6987		 */
6988		mddev->queue->limits.discard_zeroes_data = 0;
6989
6990		blk_queue_max_write_same_sectors(mddev->queue, 0);
6991
6992		rdev_for_each(rdev, mddev) {
6993			disk_stack_limits(mddev->gendisk, rdev->bdev,
6994					  rdev->data_offset << 9);
6995			disk_stack_limits(mddev->gendisk, rdev->bdev,
6996					  rdev->new_data_offset << 9);
6997			/*
6998			 * discard_zeroes_data is required, otherwise data
6999			 * could be lost. Consider a scenario: discard a stripe
7000			 * (the stripe could be inconsistent if
7001			 * discard_zeroes_data is 0); write one disk of the
7002			 * stripe (the stripe could be inconsistent again
7003			 * depending on which disks are used to calculate
7004			 * parity); then the disk breaks; the stripe data of this
7005			 * disk is lost.
7006			 */
7007			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
7008			    !bdev_get_queue(rdev->bdev)->
7009						limits.discard_zeroes_data)
7010				discard_supported = false;
7011			/* Unfortunately, discard_zeroes_data is not currently
7012			 * a guarantee - just a hint.  So we only allow DISCARD
7013			 * if the sysadmin has confirmed that only safe devices
7014			 * are in use by setting a module parameter.
7015			 */
7016			if (!devices_handle_discard_safely) {
7017				if (discard_supported) {
7018					pr_info("md/raid456: discard support disabled due to uncertainty.\n");
7019					pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
7020				}
7021				discard_supported = false;
7022			}
7023		}
7024
7025		if (discard_supported &&
7026		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7027		    mddev->queue->limits.discard_granularity >= stripe)
7028			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
7029						mddev->queue);
7030		else
7031			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
7032						mddev->queue);
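		/*
		 * For example, an administrator who has verified that every
		 * member device reliably zeroes discarded regions can enable
		 * DISCARD with the module parameter mentioned above,
		 * "raid456.devices_handle_discard_safely=Y", or, on typical
		 * configurations, at runtime via
		 * /sys/module/raid456/parameters/devices_handle_discard_safely.
		 */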
7033	}
7034
7035	if (journal_dev) {
7036		char b[BDEVNAME_SIZE];
7037
7038		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
7039		       mdname(mddev), bdevname(journal_dev->bdev, b));
7040		r5l_init_log(conf, journal_dev);
7041	}
7042
7043	return 0;
7044abort:
7045	md_unregister_thread(&mddev->thread);
7046	print_raid5_conf(conf);
7047	free_conf(conf);
7048	mddev->private = NULL;
7049	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
7050	return -EIO;
7051}
7052
7053static void raid5_free(struct mddev *mddev, void *priv)
7054{
7055	struct r5conf *conf = priv;
7056
7057	free_conf(conf);
7058	mddev->to_remove = &raid5_attrs_group;
7059}
7060
7061static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7062{
7063	struct r5conf *conf = mddev->private;
7064	int i;
7065
7066	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7067		conf->chunk_sectors / 2, mddev->layout);
7068	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7069	for (i = 0; i < conf->raid_disks; i++)
7070		seq_printf (seq, "%s",
7071			       conf->disks[i].rdev &&
7072			       test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
7073	seq_printf (seq, "]");
7074}
7075
7076static void print_raid5_conf (struct r5conf *conf)
7077{
7078	int i;
7079	struct disk_info *tmp;
7080
7081	printk(KERN_DEBUG "RAID conf printout:\n");
7082	if (!conf) {
7083		printk("(conf==NULL)\n");
7084		return;
7085	}
7086	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
7087	       conf->raid_disks,
7088	       conf->raid_disks - conf->mddev->degraded);
7089
7090	for (i = 0; i < conf->raid_disks; i++) {
7091		char b[BDEVNAME_SIZE];
7092		tmp = conf->disks + i;
7093		if (tmp->rdev)
7094			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
7095			       i, !test_bit(Faulty, &tmp->rdev->flags),
7096			       bdevname(tmp->rdev->bdev, b));
7097	}
7098}
7099
7100static int raid5_spare_active(struct mddev *mddev)
7101{
7102	int i;
7103	struct r5conf *conf = mddev->private;
7104	struct disk_info *tmp;
7105	int count = 0;
7106	unsigned long flags;
7107
7108	for (i = 0; i < conf->raid_disks; i++) {
7109		tmp = conf->disks + i;
7110		if (tmp->replacement
7111		    && tmp->replacement->recovery_offset == MaxSector
7112		    && !test_bit(Faulty, &tmp->replacement->flags)
7113		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
7114			/* Replacement has just become active. */
7115			if (!tmp->rdev
7116			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7117				count++;
7118			if (tmp->rdev) {
7119				/* The replaced device is not technically faulty,
7120				 * but we need to be sure it gets removed
7121				 * and is never re-added.
7122				 */
7123				set_bit(Faulty, &tmp->rdev->flags);
7124				sysfs_notify_dirent_safe(
7125					tmp->rdev->sysfs_state);
7126			}
7127			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7128		} else if (tmp->rdev
7129		    && tmp->rdev->recovery_offset == MaxSector
7130		    && !test_bit(Faulty, &tmp->rdev->flags)
7131		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7132			count++;
7133			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7134		}
7135	}
7136	spin_lock_irqsave(&conf->device_lock, flags);
7137	mddev->degraded = calc_degraded(conf);
7138	spin_unlock_irqrestore(&conf->device_lock, flags);
7139	print_raid5_conf(conf);
7140	return count;
7141}
7142
7143static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7144{
7145	struct r5conf *conf = mddev->private;
7146	int err = 0;
7147	int number = rdev->raid_disk;
7148	struct md_rdev **rdevp;
7149	struct disk_info *p = conf->disks + number;
7150
7151	print_raid5_conf(conf);
7152	if (test_bit(Journal, &rdev->flags) && conf->log) {
7153		struct r5l_log *log;
7154		/*
7155		 * we can't wait for pending writes here, as this is called
7156		 * from raid5d; waiting would deadlock.
7157		 */
7158		if (atomic_read(&mddev->writes_pending))
7159			return -EBUSY;
7160		log = conf->log;
7161		conf->log = NULL;
7162		synchronize_rcu();
7163		r5l_exit_log(log);
7164		return 0;
7165	}
7166	if (rdev == p->rdev)
7167		rdevp = &p->rdev;
7168	else if (rdev == p->replacement)
7169		rdevp = &p->replacement;
7170	else
7171		return 0;
7172
7173	if (number >= conf->raid_disks &&
7174	    conf->reshape_progress == MaxSector)
7175		clear_bit(In_sync, &rdev->flags);
7176
7177	if (test_bit(In_sync, &rdev->flags) ||
7178	    atomic_read(&rdev->nr_pending)) {
7179		err = -EBUSY;
7180		goto abort;
7181	}
7182	/* Only remove non-faulty devices if recovery
7183	 * isn't possible.
7184	 */
7185	if (!test_bit(Faulty, &rdev->flags) &&
7186	    mddev->recovery_disabled != conf->recovery_disabled &&
7187	    !has_failed(conf) &&
7188	    (!p->replacement || p->replacement == rdev) &&
7189	    number < conf->raid_disks) {
7190		err = -EBUSY;
7191		goto abort;
7192	}
7193	*rdevp = NULL;
7194	synchronize_rcu();
7195	if (atomic_read(&rdev->nr_pending)) {
7196		/* lost the race, try later */
7197		err = -EBUSY;
7198		*rdevp = rdev;
7199	} else if (p->replacement) {
7200		/* We must have just cleared 'rdev' */
7201		p->rdev = p->replacement;
7202		clear_bit(Replacement, &p->replacement->flags);
7203		smp_mb(); /* Make sure other CPUs may see both as identical
7204			   * but will never see neither - if they are careful
7205			   */
7206		p->replacement = NULL;
7207		clear_bit(WantReplacement, &rdev->flags);
7208	} else
7209		/* We might have just removed the Replacement as faulty;
7210		 * clear the bit just in case
7211		 */
7212		clear_bit(WantReplacement, &rdev->flags);
7213abort:
7214
7215	print_raid5_conf(conf);
7216	return err;
7217}
7218
7219static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7220{
7221	struct r5conf *conf = mddev->private;
7222	int err = -EEXIST;
7223	int disk;
7224	struct disk_info *p;
7225	int first = 0;
7226	int last = conf->raid_disks - 1;
7227
7228	if (test_bit(Journal, &rdev->flags)) {
7229		char b[BDEVNAME_SIZE];
7230		if (conf->log)
7231			return -EBUSY;
7232
7233		rdev->raid_disk = 0;
7234		/*
7235		 * The array is in read-only mode if the journal is missing, so no
7236		 * write requests are running. We should be safe
7237		 */
7238		r5l_init_log(conf, rdev);
7239		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
7240		       mdname(mddev), bdevname(rdev->bdev, b));
7241		return 0;
7242	}
7243	if (mddev->recovery_disabled == conf->recovery_disabled)
7244		return -EBUSY;
7245
7246	if (rdev->saved_raid_disk < 0 && has_failed(conf))
7247		/* no point adding a device */
7248		return -EINVAL;
7249
7250	if (rdev->raid_disk >= 0)
7251		first = last = rdev->raid_disk;
7252
7253	/*
7254	 * find the disk ... but prefer rdev->saved_raid_disk
7255	 * if possible.
7256	 */
7257	if (rdev->saved_raid_disk >= 0 &&
7258	    rdev->saved_raid_disk >= first &&
7259	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
7260		first = rdev->saved_raid_disk;
7261
7262	for (disk = first; disk <= last; disk++) {
7263		p = conf->disks + disk;
7264		if (p->rdev == NULL) {
7265			clear_bit(In_sync, &rdev->flags);
7266			rdev->raid_disk = disk;
7267			err = 0;
7268			if (rdev->saved_raid_disk != disk)
7269				conf->fullsync = 1;
7270			rcu_assign_pointer(p->rdev, rdev);
7271			goto out;
7272		}
7273	}
7274	for (disk = first; disk <= last; disk++) {
7275		p = conf->disks + disk;
7276		if (test_bit(WantReplacement, &p->rdev->flags) &&
7277		    p->replacement == NULL) {
7278			clear_bit(In_sync, &rdev->flags);
7279			set_bit(Replacement, &rdev->flags);
7280			rdev->raid_disk = disk;
7281			err = 0;
7282			conf->fullsync = 1;
7283			rcu_assign_pointer(p->replacement, rdev);
7284			break;
7285		}
7286	}
7287out:
7288	print_raid5_conf(conf);
7289	return err;
7290}
7291
7292static int raid5_resize(struct mddev *mddev, sector_t sectors)
7293{
7294	/* no resync is happening, and there is enough space
7295	 * on all devices, so we can resize.
7296	 * We need to make sure resync covers any new space.
7297	 * If the array is shrinking we should possibly wait until
7298	 * any io in the removed space completes, but it hardly seems
7299	 * worth it.
7300	 */
7301	sector_t newsize;
7302	struct r5conf *conf = mddev->private;
7303
7304	if (conf->log)
7305		return -EINVAL;
7306	sectors &= ~((sector_t)conf->chunk_sectors - 1);
7307	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
7308	if (mddev->external_size &&
7309	    mddev->array_sectors > newsize)
7310		return -EINVAL;
7311	if (mddev->bitmap) {
7312		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
7313		if (ret)
7314			return ret;
7315	}
7316	md_set_array_sectors(mddev, newsize);
7317	set_capacity(mddev->gendisk, mddev->array_sectors);
7318	revalidate_disk(mddev->gendisk);
7319	if (sectors > mddev->dev_sectors &&
7320	    mddev->recovery_cp > mddev->dev_sectors) {
7321		mddev->recovery_cp = mddev->dev_sectors;
7322		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7323	}
7324	mddev->dev_sectors = sectors;
7325	mddev->resync_max_sectors = sectors;
7326	return 0;
7327}
7328
7329static int check_stripe_cache(struct mddev *mddev)
7330{
7331	/* Can only proceed if there are plenty of stripe_heads.
7332	 * We need a minimum of one full stripe, and for sensible progress
7333	 * it is best to have about 4 times that.
7334	 * If we require 4 times, then the default 256 4K stripe_heads will
7335	 * allow for chunk sizes up to 256K, which is probably OK.
7336	 * If the chunk size is greater, user-space should request more
7337	 * stripe_heads first.
7338	 */
7339	struct r5conf *conf = mddev->private;
7340	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
7341	    > conf->min_nr_stripes ||
7342	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7343	    > conf->min_nr_stripes) {
7344		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
7345		       mdname(mddev),
7346		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7347			/ STRIPE_SIZE)*4);
7348		return 0;
7349	}
7350	return 1;
7351}
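/*
 * Worked example: reshaping to a 1MiB chunk needs (1MiB / 4KiB) * 4 = 1024
 * stripe_heads, so with the default 256 the check above fails until
 * user-space grows the cache, typically with something like
 * "echo 1024 > /sys/block/md0/md/stripe_cache_size" (md0 is only an
 * example device name).
 */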
7352
7353static int check_reshape(struct mddev *mddev)
7354{
7355	struct r5conf *conf = mddev->private;
7356
7357	if (conf->log)
7358		return -EINVAL;
7359	if (mddev->delta_disks == 0 &&
7360	    mddev->new_layout == mddev->layout &&
7361	    mddev->new_chunk_sectors == mddev->chunk_sectors)
7362		return 0; /* nothing to do */
7363	if (has_failed(conf))
7364		return -EINVAL;
7365	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
7366		/* We might be able to shrink, but the devices must
7367		 * be made bigger first.
7368		 * For raid6, 4 is the minimum size.
7369		 * Otherwise 2 is the minimum
7370		 */
7371		int min = 2;
7372		if (mddev->level == 6)
7373			min = 4;
7374		if (mddev->raid_disks + mddev->delta_disks < min)
7375			return -EINVAL;
7376	}
7377
7378	if (!check_stripe_cache(mddev))
7379		return -ENOSPC;
7380
7381	if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
7382	    mddev->delta_disks > 0)
7383		if (resize_chunks(conf,
7384				  conf->previous_raid_disks
7385				  + max(0, mddev->delta_disks),
7386				  max(mddev->new_chunk_sectors,
7387				      mddev->chunk_sectors)
7388			    ) < 0)
7389			return -ENOMEM;
7390	return resize_stripes(conf, (conf->previous_raid_disks
7391				     + mddev->delta_disks));
7392}
7393
7394static int raid5_start_reshape(struct mddev *mddev)
7395{
7396	struct r5conf *conf = mddev->private;
7397	struct md_rdev *rdev;
7398	int spares = 0;
7399	unsigned long flags;
7400
7401	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7402		return -EBUSY;
7403
7404	if (!check_stripe_cache(mddev))
7405		return -ENOSPC;
7406
7407	if (has_failed(conf))
7408		return -EINVAL;
7409
7410	rdev_for_each(rdev, mddev) {
7411		if (!test_bit(In_sync, &rdev->flags)
7412		    && !test_bit(Faulty, &rdev->flags))
7413			spares++;
7414	}
7415
7416	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
7417		/* Not enough devices even to make a degraded array
7418		 * of that size
7419		 */
7420		return -EINVAL;
7421
7422	/* Refuse to reduce size of the array.  Any reductions in
7423	 * array size must be through explicit setting of array_size
7424	 * attribute.
7425	 */
7426	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7427	    < mddev->array_sectors) {
7428		printk(KERN_ERR "md/raid:%s: array size must be reduced "
7429		       "before number of disks\n", mdname(mddev));
7430		return -EINVAL;
7431	}
7432
7433	atomic_set(&conf->reshape_stripes, 0);
7434	spin_lock_irq(&conf->device_lock);
7435	write_seqcount_begin(&conf->gen_lock);
7436	conf->previous_raid_disks = conf->raid_disks;
7437	conf->raid_disks += mddev->delta_disks;
7438	conf->prev_chunk_sectors = conf->chunk_sectors;
7439	conf->chunk_sectors = mddev->new_chunk_sectors;
7440	conf->prev_algo = conf->algorithm;
7441	conf->algorithm = mddev->new_layout;
7442	conf->generation++;
7443	/* Code that selects data_offset needs to see the generation update
7444	 * if reshape_progress has been set - so a memory barrier needed.
7445	 */
7446	smp_mb();
7447	if (mddev->reshape_backwards)
7448		conf->reshape_progress = raid5_size(mddev, 0, 0);
7449	else
7450		conf->reshape_progress = 0;
7451	conf->reshape_safe = conf->reshape_progress;
7452	write_seqcount_end(&conf->gen_lock);
7453	spin_unlock_irq(&conf->device_lock);
7454
7455	/* Now make sure any requests that proceeded on the assumption
7456	 * the reshape wasn't running - like Discard or Read - have
7457	 * completed.
7458	 */
7459	mddev_suspend(mddev);
7460	mddev_resume(mddev);
7461
7462	/* Add some new drives, as many as will fit.
7463	 * We know there are enough to make the newly sized array work.
7464	 * Don't add devices if we are reducing the number of
7465	 * devices in the array.  This is because it is not possible
7466	 * to correctly record the "partially reconstructed" state of
7467	 * such devices during the reshape and confusion could result.
7468	 */
7469	if (mddev->delta_disks >= 0) {
7470		rdev_for_each(rdev, mddev)
7471			if (rdev->raid_disk < 0 &&
7472			    !test_bit(Faulty, &rdev->flags)) {
7473				if (raid5_add_disk(mddev, rdev) == 0) {
7474					if (rdev->raid_disk
7475					    >= conf->previous_raid_disks)
7476						set_bit(In_sync, &rdev->flags);
7477					else
7478						rdev->recovery_offset = 0;
7479
7480					if (sysfs_link_rdev(mddev, rdev))
7481						/* Failure here is OK */;
7482				}
7483			} else if (rdev->raid_disk >= conf->previous_raid_disks
7484				   && !test_bit(Faulty, &rdev->flags)) {
7485				/* This is a spare that was manually added */
7486				set_bit(In_sync, &rdev->flags);
7487			}
7488
7489		/* When a reshape changes the number of devices,
7490		 * ->degraded is measured against the larger of the
7491		 * pre and post number of devices.
7492		 */
7493		spin_lock_irqsave(&conf->device_lock, flags);
7494		mddev->degraded = calc_degraded(conf);
7495		spin_unlock_irqrestore(&conf->device_lock, flags);
7496	}
7497	mddev->raid_disks = conf->raid_disks;
7498	mddev->reshape_position = conf->reshape_progress;
7499	set_bit(MD_CHANGE_DEVS, &mddev->flags);
7500
7501	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7502	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7503	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7504	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7505	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7506	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7507						"reshape");
7508	if (!mddev->sync_thread) {
7509		mddev->recovery = 0;
7510		spin_lock_irq(&conf->device_lock);
7511		write_seqcount_begin(&conf->gen_lock);
7512		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
7513		mddev->new_chunk_sectors =
7514			conf->chunk_sectors = conf->prev_chunk_sectors;
7515		mddev->new_layout = conf->algorithm = conf->prev_algo;
7516		rdev_for_each(rdev, mddev)
7517			rdev->new_data_offset = rdev->data_offset;
7518		smp_wmb();
7519		conf->generation --;
7520		conf->reshape_progress = MaxSector;
7521		mddev->reshape_position = MaxSector;
7522		write_seqcount_end(&conf->gen_lock);
7523		spin_unlock_irq(&conf->device_lock);
7524		return -EAGAIN;
7525	}
7526	conf->reshape_checkpoint = jiffies;
7527	md_wakeup_thread(mddev->sync_thread);
7528	md_new_event(mddev);
7529	return 0;
7530}
7531
7532/* This is called from the reshape thread and should make any
7533 * changes needed in 'conf'
7534 */
7535static void end_reshape(struct r5conf *conf)
7536{
7537
7538	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
7539		struct md_rdev *rdev;
7540
7541		spin_lock_irq(&conf->device_lock);
7542		conf->previous_raid_disks = conf->raid_disks;
7543		rdev_for_each(rdev, conf->mddev)
7544			rdev->data_offset = rdev->new_data_offset;
7545		smp_wmb();
7546		conf->reshape_progress = MaxSector;
7547		conf->mddev->reshape_position = MaxSector;
7548		spin_unlock_irq(&conf->device_lock);
7549		wake_up(&conf->wait_for_overlap);
7550
7551		/* read-ahead size must cover two whole stripes, which is
7552		 * 2 * (number of data disks) * chunksize
7553		 */
7554		if (conf->mddev->queue) {
7555			int data_disks = conf->raid_disks - conf->max_degraded;
7556			int stripe = data_disks * ((conf->chunk_sectors << 9)
7557						   / PAGE_SIZE);
7558			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
7559				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
7560		}
7561	}
7562}
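/*
 * Worked example: with 4 data disks and a 512K chunk, stripe comes to
 * 4 * (512K / 4K) = 512 pages, so ra_pages is raised to at least 1024
 * pages, i.e. 4MiB of read-ahead on systems with 4K pages.
 */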
7563
7564/* This is called from the raid5d thread with mddev_lock held.
7565 * It makes config changes to the device.
7566 */
7567static void raid5_finish_reshape(struct mddev *mddev)
7568{
7569	struct r5conf *conf = mddev->private;
7570
7571	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7572
7573		if (mddev->delta_disks > 0) {
7574			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7575			set_capacity(mddev->gendisk, mddev->array_sectors);
7576			revalidate_disk(mddev->gendisk);
7577		} else {
7578			int d;
7579			spin_lock_irq(&conf->device_lock);
7580			mddev->degraded = calc_degraded(conf);
7581			spin_unlock_irq(&conf->device_lock);
7582			for (d = conf->raid_disks ;
7583			     d < conf->raid_disks - mddev->delta_disks;
7584			     d++) {
7585				struct md_rdev *rdev = conf->disks[d].rdev;
7586				if (rdev)
7587					clear_bit(In_sync, &rdev->flags);
7588				rdev = conf->disks[d].replacement;
7589				if (rdev)
7590					clear_bit(In_sync, &rdev->flags);
7591			}
7592		}
7593		mddev->layout = conf->algorithm;
7594		mddev->chunk_sectors = conf->chunk_sectors;
7595		mddev->reshape_position = MaxSector;
7596		mddev->delta_disks = 0;
7597		mddev->reshape_backwards = 0;
7598	}
7599}
7600
7601static void raid5_quiesce(struct mddev *mddev, int state)
7602{
7603	struct r5conf *conf = mddev->private;
7604
7605	switch(state) {
7606	case 2: /* resume for a suspend */
7607		wake_up(&conf->wait_for_overlap);
7608		break;
7609
7610	case 1: /* stop all writes */
7611		lock_all_device_hash_locks_irq(conf);
7612		/* '2' tells resync/reshape to pause so that all
7613		 * active stripes can drain
7614		 */
7615		conf->quiesce = 2;
7616		wait_event_cmd(conf->wait_for_quiescent,
7617				    atomic_read(&conf->active_stripes) == 0 &&
7618				    atomic_read(&conf->active_aligned_reads) == 0,
7619				    unlock_all_device_hash_locks_irq(conf),
7620				    lock_all_device_hash_locks_irq(conf));
7621		conf->quiesce = 1;
7622		unlock_all_device_hash_locks_irq(conf);
7623		/* allow reshape to continue */
7624		wake_up(&conf->wait_for_overlap);
7625		break;
7626
7627	case 0: /* re-enable writes */
7628		lock_all_device_hash_locks_irq(conf);
7629		conf->quiesce = 0;
7630		wake_up(&conf->wait_for_quiescent);
7631		wake_up(&conf->wait_for_overlap);
7632		unlock_all_device_hash_locks_irq(conf);
7633		break;
7634	}
7635	r5l_quiesce(conf->log, state);
7636}
7637
7638static void *raid45_takeover_raid0(struct mddev *mddev, int level)
7639{
7640	struct r0conf *raid0_conf = mddev->private;
7641	sector_t sectors;
7642
7643	/* for raid0 takeover only one zone is supported */
7644	if (raid0_conf->nr_strip_zones > 1) {
7645		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
7646		       mdname(mddev));
7647		return ERR_PTR(-EINVAL);
7648	}
7649
7650	sectors = raid0_conf->strip_zone[0].zone_end;
7651	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
7652	mddev->dev_sectors = sectors;
7653	mddev->new_level = level;
7654	mddev->new_layout = ALGORITHM_PARITY_N;
7655	mddev->new_chunk_sectors = mddev->chunk_sectors;
7656	mddev->raid_disks += 1;
7657	mddev->delta_disks = 1;
7658	/* make sure it will not be marked as dirty */
7659	mddev->recovery_cp = MaxSector;
7660
7661	return setup_conf(mddev);
7662}
7663
7664static void *raid5_takeover_raid1(struct mddev *mddev)
7665{
7666	int chunksect;
7667
7668	if (mddev->raid_disks != 2 ||
7669	    mddev->degraded > 1)
7670		return ERR_PTR(-EINVAL);
7671
7672	/* Should check if there are write-behind devices? */
7673
7674	chunksect = 64*2; /* 64K by default */
7675
7676	/* The array must be an exact multiple of chunksize */
7677	while (chunksect && (mddev->array_sectors & (chunksect-1)))
7678		chunksect >>= 1;
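	/*
	 * For example, a RAID1 of 1000000 sectors is not a multiple of 128
	 * sectors but is a multiple of 64, so the loop above settles on a
	 * 32K chunk, which still satisfies the STRIPE_SIZE check below.
	 */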
7679
7680	if ((chunksect<<9) < STRIPE_SIZE)
7681		/* array size does not allow a suitable chunk size */
7682		return ERR_PTR(-EINVAL);
7683
7684	mddev->new_level = 5;
7685	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
7686	mddev->new_chunk_sectors = chunksect;
7687
7688	return setup_conf(mddev);
7689}
7690
7691static void *raid5_takeover_raid6(struct mddev *mddev)
7692{
7693	int new_layout;
7694
7695	switch (mddev->layout) {
7696	case ALGORITHM_LEFT_ASYMMETRIC_6:
7697		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
7698		break;
7699	case ALGORITHM_RIGHT_ASYMMETRIC_6:
7700		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
7701		break;
7702	case ALGORITHM_LEFT_SYMMETRIC_6:
7703		new_layout = ALGORITHM_LEFT_SYMMETRIC;
7704		break;
7705	case ALGORITHM_RIGHT_SYMMETRIC_6:
7706		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
7707		break;
7708	case ALGORITHM_PARITY_0_6:
7709		new_layout = ALGORITHM_PARITY_0;
7710		break;
7711	case ALGORITHM_PARITY_N:
7712		new_layout = ALGORITHM_PARITY_N;
7713		break;
7714	default:
7715		return ERR_PTR(-EINVAL);
7716	}
7717	mddev->new_level = 5;
7718	mddev->new_layout = new_layout;
7719	mddev->delta_disks = -1;
7720	mddev->raid_disks -= 1;
7721	return setup_conf(mddev);
7722}
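/*
 * For example, a 5-device RAID6 using ALGORITHM_LEFT_SYMMETRIC_6 is taken
 * over as a 4-device RAID5 using ALGORITHM_LEFT_SYMMETRIC; raid6 layouts
 * other than the *_6 variants and ALGORITHM_PARITY_N are rejected with
 * -EINVAL.
 */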
7723
7724static int raid5_check_reshape(struct mddev *mddev)
7725{
7726	/* For a 2-drive array, the layout and chunk size can be changed
7727	 * immediately, as no restriping is needed.
7728	 * For larger arrays we record the new value - after validation
7729	 * to be used by a reshape pass.
7730	 */
7731	struct r5conf *conf = mddev->private;
7732	int new_chunk = mddev->new_chunk_sectors;
7733
7734	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
7735		return -EINVAL;
7736	if (new_chunk > 0) {
7737		if (!is_power_of_2(new_chunk))
7738			return -EINVAL;
7739		if (new_chunk < (PAGE_SIZE>>9))
7740			return -EINVAL;
7741		if (mddev->array_sectors & (new_chunk-1))
7742			/* not factor of array size */
7743			return -EINVAL;
7744	}
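	/*
	 * For example, a requested chunk of 384K (768 sectors) fails the
	 * power-of-two test above, and a 2K chunk fails the PAGE_SIZE test
	 * on systems with 4K pages.
	 */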
7745
7746	/* They look valid */
7747
7748	if (mddev->raid_disks == 2) {
7749		/* can make the change immediately */
7750		if (mddev->new_layout >= 0) {
7751			conf->algorithm = mddev->new_layout;
7752			mddev->layout = mddev->new_layout;
7753		}
7754		if (new_chunk > 0) {
7755			conf->chunk_sectors = new_chunk ;
7756			mddev->chunk_sectors = new_chunk;
7757		}
7758		set_bit(MD_CHANGE_DEVS, &mddev->flags);
7759		md_wakeup_thread(mddev->thread);
7760	}
7761	return check_reshape(mddev);
7762}
7763
7764static int raid6_check_reshape(struct mddev *mddev)
7765{
7766	int new_chunk = mddev->new_chunk_sectors;
7767
7768	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
7769		return -EINVAL;
7770	if (new_chunk > 0) {
7771		if (!is_power_of_2(new_chunk))
7772			return -EINVAL;
7773		if (new_chunk < (PAGE_SIZE >> 9))
7774			return -EINVAL;
7775		if (mddev->array_sectors & (new_chunk-1))
7776			/* not factor of array size */
7777			return -EINVAL;
7778	}
7779
7780	/* They look valid */
7781	return check_reshape(mddev);
7782}
7783
7784static void *raid5_takeover(struct mddev *mddev)
7785{
7786	/* raid5 can take over:
7787	 *  raid0 - if there is only one strip zone - make it a raid4 layout
7788	 *  raid1 - if there are two drives.  We need to know the chunk size
7789	 *  raid4 - trivial - just use a raid4 layout.
7790	 *  raid6 - Providing it is a *_6 layout
7791	 */
7792	if (mddev->level == 0)
7793		return raid45_takeover_raid0(mddev, 5);
7794	if (mddev->level == 1)
7795		return raid5_takeover_raid1(mddev);
7796	if (mddev->level == 4) {
7797		mddev->new_layout = ALGORITHM_PARITY_N;
7798		mddev->new_level = 5;
7799		return setup_conf(mddev);
7800	}
7801	if (mddev->level == 6)
7802		return raid5_takeover_raid6(mddev);
7803
7804	return ERR_PTR(-EINVAL);
7805}
7806
7807static void *raid4_takeover(struct mddev *mddev)
7808{
7809	/* raid4 can take over:
7810	 *  raid0 - if there is only one strip zone
7811	 *  raid5 - if layout is right
7812	 */
7813	if (mddev->level == 0)
7814		return raid45_takeover_raid0(mddev, 4);
7815	if (mddev->level == 5 &&
7816	    mddev->layout == ALGORITHM_PARITY_N) {
7817		mddev->new_layout = 0;
7818		mddev->new_level = 4;
7819		return setup_conf(mddev);
7820	}
7821	return ERR_PTR(-EINVAL);
7822}
7823
7824static struct md_personality raid5_personality;
7825
7826static void *raid6_takeover(struct mddev *mddev)
7827{
7828	/* Currently can only take over a raid5.  We map the
7829	 * personality to an equivalent raid6 personality
7830	 * with the Q block at the end.
7831	 */
7832	int new_layout;
7833
7834	if (mddev->pers != &raid5_personality)
7835		return ERR_PTR(-EINVAL);
7836	if (mddev->degraded > 1)
7837		return ERR_PTR(-EINVAL);
7838	if (mddev->raid_disks > 253)
7839		return ERR_PTR(-EINVAL);
7840	if (mddev->raid_disks < 3)
7841		return ERR_PTR(-EINVAL);
7842
7843	switch (mddev->layout) {
7844	case ALGORITHM_LEFT_ASYMMETRIC:
7845		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
7846		break;
7847	case ALGORITHM_RIGHT_ASYMMETRIC:
7848		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
7849		break;
7850	case ALGORITHM_LEFT_SYMMETRIC:
7851		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
7852		break;
7853	case ALGORITHM_RIGHT_SYMMETRIC:
7854		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
7855		break;
7856	case ALGORITHM_PARITY_0:
7857		new_layout = ALGORITHM_PARITY_0_6;
7858		break;
7859	case ALGORITHM_PARITY_N:
7860		new_layout = ALGORITHM_PARITY_N;
7861		break;
7862	default:
7863		return ERR_PTR(-EINVAL);
7864	}
7865	mddev->new_level = 6;
7866	mddev->new_layout = new_layout;
7867	mddev->delta_disks = 1;
7868	mddev->raid_disks += 1;
7869	return setup_conf(mddev);
7870}
7871
7872static struct md_personality raid6_personality =
7873{
7874	.name		= "raid6",
7875	.level		= 6,
7876	.owner		= THIS_MODULE,
7877	.make_request	= raid5_make_request,
7878	.run		= raid5_run,
7879	.free		= raid5_free,
7880	.status		= raid5_status,
7881	.error_handler	= raid5_error,
7882	.hot_add_disk	= raid5_add_disk,
7883	.hot_remove_disk= raid5_remove_disk,
7884	.spare_active	= raid5_spare_active,
7885	.sync_request	= raid5_sync_request,
7886	.resize		= raid5_resize,
7887	.size		= raid5_size,
7888	.check_reshape	= raid6_check_reshape,
7889	.start_reshape  = raid5_start_reshape,
7890	.finish_reshape = raid5_finish_reshape,
7891	.quiesce	= raid5_quiesce,
7892	.takeover	= raid6_takeover,
7893	.congested	= raid5_congested,
7894};
7895static struct md_personality raid5_personality =
7896{
7897	.name		= "raid5",
7898	.level		= 5,
7899	.owner		= THIS_MODULE,
7900	.make_request	= raid5_make_request,
7901	.run		= raid5_run,
7902	.free		= raid5_free,
7903	.status		= raid5_status,
7904	.error_handler	= raid5_error,
7905	.hot_add_disk	= raid5_add_disk,
7906	.hot_remove_disk= raid5_remove_disk,
7907	.spare_active	= raid5_spare_active,
7908	.sync_request	= raid5_sync_request,
7909	.resize		= raid5_resize,
7910	.size		= raid5_size,
7911	.check_reshape	= raid5_check_reshape,
7912	.start_reshape  = raid5_start_reshape,
7913	.finish_reshape = raid5_finish_reshape,
7914	.quiesce	= raid5_quiesce,
7915	.takeover	= raid5_takeover,
7916	.congested	= raid5_congested,
7917};
7918
7919static struct md_personality raid4_personality =
7920{
7921	.name		= "raid4",
7922	.level		= 4,
7923	.owner		= THIS_MODULE,
7924	.make_request	= raid5_make_request,
7925	.run		= raid5_run,
7926	.free		= raid5_free,
7927	.status		= raid5_status,
7928	.error_handler	= raid5_error,
7929	.hot_add_disk	= raid5_add_disk,
7930	.hot_remove_disk= raid5_remove_disk,
7931	.spare_active	= raid5_spare_active,
7932	.sync_request	= raid5_sync_request,
7933	.resize		= raid5_resize,
7934	.size		= raid5_size,
7935	.check_reshape	= raid5_check_reshape,
7936	.start_reshape  = raid5_start_reshape,
7937	.finish_reshape = raid5_finish_reshape,
7938	.quiesce	= raid5_quiesce,
7939	.takeover	= raid4_takeover,
7940	.congested	= raid5_congested,
7941};
7942
7943static int __init raid5_init(void)
7944{
7945	raid5_wq = alloc_workqueue("raid5wq",
7946		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
7947	if (!raid5_wq)
7948		return -ENOMEM;
7949	register_md_personality(&raid6_personality);
7950	register_md_personality(&raid5_personality);
7951	register_md_personality(&raid4_personality);
7952	return 0;
7953}
7954
7955static void raid5_exit(void)
7956{
7957	unregister_md_personality(&raid6_personality);
7958	unregister_md_personality(&raid5_personality);
7959	unregister_md_personality(&raid4_personality);
7960	destroy_workqueue(raid5_wq);
7961}
7962
7963module_init(raid5_init);
7964module_exit(raid5_exit);
7965MODULE_LICENSE("GPL");
7966MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
7967MODULE_ALIAS("md-personality-4"); /* RAID5 */
7968MODULE_ALIAS("md-raid5");
7969MODULE_ALIAS("md-raid4");
7970MODULE_ALIAS("md-level-5");
7971MODULE_ALIAS("md-level-4");
7972MODULE_ALIAS("md-personality-8"); /* RAID6 */
7973MODULE_ALIAS("md-raid6");
7974MODULE_ALIAS("md-level-6");
7975
7976/* This used to be two separate modules, they were: */
7977MODULE_ALIAS("raid5");
7978MODULE_ALIAS("raid6");
v4.10.11
  65#define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)
  66
  67#define cpu_to_group(cpu) cpu_to_node(cpu)
  68#define ANY_GROUP NUMA_NO_NODE
  69
  70static bool devices_handle_discard_safely = false;
  71module_param(devices_handle_discard_safely, bool, 0644);
  72MODULE_PARM_DESC(devices_handle_discard_safely,
  73		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
  74static struct workqueue_struct *raid5_wq;
  75
  76static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  77{
  78	int hash = (sect >> STRIPE_SHIFT) & HASH_MASK;
  79	return &conf->stripe_hashtbl[hash];
  80}
  81
  82static inline int stripe_hash_locks_hash(sector_t sect)
  83{
  84	return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK;
  85}
  86
  87static inline void lock_device_hash_lock(struct r5conf *conf, int hash)
  88{
  89	spin_lock_irq(conf->hash_locks + hash);
  90	spin_lock(&conf->device_lock);
  91}
  92
  93static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
  94{
  95	spin_unlock(&conf->device_lock);
  96	spin_unlock_irq(conf->hash_locks + hash);
  97}
  98
  99static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
 100{
 101	int i;
 102	local_irq_disable();
 103	spin_lock(conf->hash_locks);
 104	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 105		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 106	spin_lock(&conf->device_lock);
 107}
 108
 109static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 110{
 111	int i;
 112	spin_unlock(&conf->device_lock);
 113	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
 114		spin_unlock(conf->hash_locks + i - 1);
 115	local_irq_enable();
 116}
 117
 118/* Find first data disk in a raid6 stripe */
 119static inline int raid6_d0(struct stripe_head *sh)
 120{
 121	if (sh->ddf_layout)
  122		/* ddf always starts from the first device */
 123		return 0;
 124	/* md starts just after Q block */
 125	if (sh->qd_idx == sh->disks - 1)
 126		return 0;
 127	else
 128		return sh->qd_idx + 1;
 129}
 130static inline int raid6_next_disk(int disk, int raid_disks)
 131{
 132	disk++;
 133	return (disk < raid_disks) ? disk : 0;
 134}
 135
 136/* When walking through the disks in a raid5, starting at raid6_d0,
  137 * we need to map each disk to a 'slot', where the data disks are slots
  138 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
  139 * is raid_disks-1.  This helper does that mapping.
 140 */
 141static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 142			     int *count, int syndrome_disks)
 143{
 144	int slot = *count;
 145
 146	if (sh->ddf_layout)
 147		(*count)++;
 148	if (idx == sh->pd_idx)
 149		return syndrome_disks;
 150	if (idx == sh->qd_idx)
 151		return syndrome_disks + 1;
 152	if (!sh->ddf_layout)
 153		(*count)++;
 154	return slot;
 155}
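/*
 * Example: for a 5-device md-layout stripe with pd_idx == 3 and
 * qd_idx == 4 (syndrome_disks == 3), walking from raid6_d0() maps the
 * data devices to slots 0..2, the parity device to slot 3 and the Q
 * device to slot 4, as described above.
 */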
 156
 157static void return_io(struct bio_list *return_bi)
 158{
 159	struct bio *bi;
 160	while ((bi = bio_list_pop(return_bi)) != NULL) {
 161		bi->bi_iter.bi_size = 0;
 162		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 163					 bi, 0);
 164		bio_endio(bi);
 165	}
 166}
 167
 168static void print_raid5_conf (struct r5conf *conf);
 169
 170static int stripe_operations_active(struct stripe_head *sh)
 171{
 172	return sh->check_state || sh->reconstruct_state ||
 173	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
 174	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 175}
 176
 177static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 178{
 179	struct r5conf *conf = sh->raid_conf;
 180	struct r5worker_group *group;
 181	int thread_cnt;
 182	int i, cpu = sh->cpu;
 183
 184	if (!cpu_online(cpu)) {
 185		cpu = cpumask_any(cpu_online_mask);
 186		sh->cpu = cpu;
 187	}
 188
 189	if (list_empty(&sh->lru)) {
 190		struct r5worker_group *group;
 191		group = conf->worker_groups + cpu_to_group(cpu);
 192		list_add_tail(&sh->lru, &group->handle_list);
 193		group->stripes_cnt++;
 194		sh->group = group;
 195	}
 196
 197	if (conf->worker_cnt_per_group == 0) {
 198		md_wakeup_thread(conf->mddev->thread);
 199		return;
 200	}
 201
 202	group = conf->worker_groups + cpu_to_group(sh->cpu);
 203
 204	group->workers[0].working = true;
 205	/* at least one worker should run to avoid race */
 206	queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work);
 207
 208	thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1;
 209	/* wakeup more workers */
 210	for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) {
 211		if (group->workers[i].working == false) {
 212			group->workers[i].working = true;
 213			queue_work_on(sh->cpu, raid5_wq,
 214				      &group->workers[i].work);
 215			thread_cnt--;
 216		}
 217	}
 218}
 219
 220static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 221			      struct list_head *temp_inactive_list)
 222{
 223	int i;
  224	int injournal = 0;	/* number of data pages with R5_InJournal */
 225
 226	BUG_ON(!list_empty(&sh->lru));
 227	BUG_ON(atomic_read(&conf->active_stripes)==0);
 228
 229	if (r5c_is_writeback(conf->log))
 230		for (i = sh->disks; i--; )
 231			if (test_bit(R5_InJournal, &sh->dev[i].flags))
 232				injournal++;
 233	/*
  234	 * When quiescing in r5c writeback mode, set STRIPE_HANDLE for stripes
  235	 * with data in the journal, so they are not released to the cached lists
 236	 */
 237	if (conf->quiesce && r5c_is_writeback(conf->log) &&
 238	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
 239		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 240			r5c_make_stripe_write_out(sh);
 241		set_bit(STRIPE_HANDLE, &sh->state);
 242	}
 243
 244	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 245		if (test_bit(STRIPE_DELAYED, &sh->state) &&
 246		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 247			list_add_tail(&sh->lru, &conf->delayed_list);
 248		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 249			   sh->bm_seq - conf->seq_write > 0)
 250			list_add_tail(&sh->lru, &conf->bitmap_list);
 251		else {
 252			clear_bit(STRIPE_DELAYED, &sh->state);
 253			clear_bit(STRIPE_BIT_DELAY, &sh->state);
 254			if (conf->worker_cnt_per_group == 0) {
 255				list_add_tail(&sh->lru, &conf->handle_list);
 256			} else {
 257				raid5_wakeup_stripe_thread(sh);
 258				return;
 259			}
 260		}
 261		md_wakeup_thread(conf->mddev->thread);
 262	} else {
 263		BUG_ON(stripe_operations_active(sh));
 264		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 265			if (atomic_dec_return(&conf->preread_active_stripes)
 266			    < IO_THRESHOLD)
 267				md_wakeup_thread(conf->mddev->thread);
 268		atomic_dec(&conf->active_stripes);
 269		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 270			if (!r5c_is_writeback(conf->log))
 271				list_add_tail(&sh->lru, temp_inactive_list);
 272			else {
 273				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
 274				if (injournal == 0)
 275					list_add_tail(&sh->lru, temp_inactive_list);
 276				else if (injournal == conf->raid_disks - conf->max_degraded) {
 277					/* full stripe */
 278					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
 279						atomic_inc(&conf->r5c_cached_full_stripes);
 280					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
 281						atomic_dec(&conf->r5c_cached_partial_stripes);
 282					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
 283					r5c_check_cached_full_stripe(conf);
 284				} else {
 285					/* partial stripe */
 286					if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
 287							      &sh->state))
 288						atomic_inc(&conf->r5c_cached_partial_stripes);
 289					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
 290				}
 291			}
 292		}
 293	}
 294}
 295
 296static void __release_stripe(struct r5conf *conf, struct stripe_head *sh,
 297			     struct list_head *temp_inactive_list)
 298{
 299	if (atomic_dec_and_test(&sh->count))
 300		do_release_stripe(conf, sh, temp_inactive_list);
 301}
 302
 303/*
  304 * @hash can be NR_STRIPE_HASH_LOCKS, in which case temp_inactive_list is
  305 * an array with one list per hash lock.
  306 * Be careful: only one task can add/delete stripes from temp_inactive_list
  307 * at a given time. Adding stripes only takes the device lock, while
  308 * deleting stripes only takes the hash lock.
 309 */
 310static void release_inactive_stripe_list(struct r5conf *conf,
 311					 struct list_head *temp_inactive_list,
 312					 int hash)
 313{
 314	int size;
 315	bool do_wakeup = false;
 316	unsigned long flags;
 317
 318	if (hash == NR_STRIPE_HASH_LOCKS) {
 319		size = NR_STRIPE_HASH_LOCKS;
 320		hash = NR_STRIPE_HASH_LOCKS - 1;
 321	} else
 322		size = 1;
 323	while (size) {
 324		struct list_head *list = &temp_inactive_list[size - 1];
 325
 326		/*
  327		 * We don't hold any lock here yet, so raid5_get_active_stripe() might
 328		 * remove stripes from the list
 329		 */
 330		if (!list_empty_careful(list)) {
 331			spin_lock_irqsave(conf->hash_locks + hash, flags);
 332			if (list_empty(conf->inactive_list + hash) &&
 333			    !list_empty(list))
 334				atomic_dec(&conf->empty_inactive_list_nr);
 335			list_splice_tail_init(list, conf->inactive_list + hash);
 336			do_wakeup = true;
 337			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
 338		}
 339		size--;
 340		hash--;
 341	}
 342
 343	if (do_wakeup) {
 344		wake_up(&conf->wait_for_stripe);
 345		if (atomic_read(&conf->active_stripes) == 0)
 346			wake_up(&conf->wait_for_quiescent);
 347		if (conf->retry_read_aligned)
 348			md_wakeup_thread(conf->mddev->thread);
 349	}
 350}
 351
 352/* should hold conf->device_lock already */
 353static int release_stripe_list(struct r5conf *conf,
 354			       struct list_head *temp_inactive_list)
 355{
 356	struct stripe_head *sh;
 357	int count = 0;
 358	struct llist_node *head;
 359
 360	head = llist_del_all(&conf->released_stripes);
 361	head = llist_reverse_order(head);
 362	while (head) {
 363		int hash;
 364
 365		sh = llist_entry(head, struct stripe_head, release_list);
 366		head = llist_next(head);
  367		/* sh could be re-added after STRIPE_ON_RELEASE_LIST is cleared */
 368		smp_mb();
 369		clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state);
 370		/*
  371		 * Don't worry that the bit is set here, because if the bit is set
 372		 * again, the count is always > 1. This is true for
 373		 * STRIPE_ON_UNPLUG_LIST bit too.
 374		 */
 375		hash = sh->hash_lock_index;
 376		__release_stripe(conf, sh, &temp_inactive_list[hash]);
 377		count++;
 378	}
 379
 380	return count;
 381}
 382
 383void raid5_release_stripe(struct stripe_head *sh)
 384{
 385	struct r5conf *conf = sh->raid_conf;
 386	unsigned long flags;
 387	struct list_head list;
 388	int hash;
 389	bool wakeup;
 390
 391	/* Avoid release_list until the last reference.
 392	 */
 393	if (atomic_add_unless(&sh->count, -1, 1))
 394		return;
 395
 396	if (unlikely(!conf->mddev->thread) ||
 397		test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
 398		goto slow_path;
 399	wakeup = llist_add(&sh->release_list, &conf->released_stripes);
 400	if (wakeup)
 401		md_wakeup_thread(conf->mddev->thread);
 402	return;
 403slow_path:
 404	local_irq_save(flags);
 405	/* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */
 406	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
 407		INIT_LIST_HEAD(&list);
 408		hash = sh->hash_lock_index;
 409		do_release_stripe(conf, sh, &list);
 410		spin_unlock(&conf->device_lock);
 411		release_inactive_stripe_list(conf, &list, hash);
 412	}
 413	local_irq_restore(flags);
 414}
 415
 416static inline void remove_hash(struct stripe_head *sh)
 417{
 418	pr_debug("remove_hash(), stripe %llu\n",
 419		(unsigned long long)sh->sector);
 420
 421	hlist_del_init(&sh->hash);
 422}
 423
 424static inline void insert_hash(struct r5conf *conf, struct stripe_head *sh)
 425{
 426	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 427
 428	pr_debug("insert_hash(), stripe %llu\n",
 429		(unsigned long long)sh->sector);
 430
 431	hlist_add_head(&sh->hash, hp);
 432}
 433
 434/* find an idle stripe, make sure it is unhashed, and return it. */
 435static struct stripe_head *get_free_stripe(struct r5conf *conf, int hash)
 436{
 437	struct stripe_head *sh = NULL;
 438	struct list_head *first;
 439
 440	if (list_empty(conf->inactive_list + hash))
 441		goto out;
 442	first = (conf->inactive_list + hash)->next;
 443	sh = list_entry(first, struct stripe_head, lru);
 444	list_del_init(first);
 445	remove_hash(sh);
 446	atomic_inc(&conf->active_stripes);
 447	BUG_ON(hash != sh->hash_lock_index);
 448	if (list_empty(conf->inactive_list + hash))
 449		atomic_inc(&conf->empty_inactive_list_nr);
 450out:
 451	return sh;
 452}
 453
 454static void shrink_buffers(struct stripe_head *sh)
 455{
 456	struct page *p;
 457	int i;
 458	int num = sh->raid_conf->pool_size;
 459
 460	for (i = 0; i < num ; i++) {
 461		WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
 462		p = sh->dev[i].page;
 463		if (!p)
 464			continue;
 465		sh->dev[i].page = NULL;
 466		put_page(p);
 467	}
 468}
 469
 470static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 471{
 472	int i;
 473	int num = sh->raid_conf->pool_size;
 474
 475	for (i = 0; i < num; i++) {
 476		struct page *page;
 477
 478		if (!(page = alloc_page(gfp))) {
 479			return 1;
 480		}
 481		sh->dev[i].page = page;
 482		sh->dev[i].orig_page = page;
 483	}
 484	return 0;
 485}
 486
 487static void raid5_build_block(struct stripe_head *sh, int i, int previous);
 488static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 489			    struct stripe_head *sh);
 490
 491static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 492{
 493	struct r5conf *conf = sh->raid_conf;
 494	int i, seq;
 495
 496	BUG_ON(atomic_read(&sh->count) != 0);
 497	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 498	BUG_ON(stripe_operations_active(sh));
 499	BUG_ON(sh->batch_head);
 500
 501	pr_debug("init_stripe called, stripe %llu\n",
 502		(unsigned long long)sector);
 503retry:
 504	seq = read_seqcount_begin(&conf->gen_lock);
 505	sh->generation = conf->generation - previous;
 506	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 507	sh->sector = sector;
 508	stripe_set_idx(sector, conf, previous, sh);
 509	sh->state = 0;
 510
 511	for (i = sh->disks; i--; ) {
 512		struct r5dev *dev = &sh->dev[i];
 513
 514		if (dev->toread || dev->read || dev->towrite || dev->written ||
 515		    test_bit(R5_LOCKED, &dev->flags)) {
 516			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
 517			       (unsigned long long)sh->sector, i, dev->toread,
 518			       dev->read, dev->towrite, dev->written,
 519			       test_bit(R5_LOCKED, &dev->flags));
 520			WARN_ON(1);
 521		}
 522		dev->flags = 0;
 523		raid5_build_block(sh, i, previous);
 524	}
 525	if (read_seqcount_retry(&conf->gen_lock, seq))
 526		goto retry;
 527	sh->overwrite_disks = 0;
 528	insert_hash(conf, sh);
 529	sh->cpu = smp_processor_id();
 530	set_bit(STRIPE_BATCH_READY, &sh->state);
 531}
 532
 533static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 534					 short generation)
 535{
 536	struct stripe_head *sh;
 537
 538	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 539	hlist_for_each_entry(sh, stripe_hash(conf, sector), hash)
 540		if (sh->sector == sector && sh->generation == generation)
 541			return sh;
 542	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 543	return NULL;
 544}
 545
 546/*
 547 * Need to check if array has failed when deciding whether to:
 548 *  - start an array
 549 *  - remove non-faulty devices
 550 *  - add a spare
 551 *  - allow a reshape
 552 * This determination is simple when no reshape is happening.
 553 * However if there is a reshape, we need to carefully check
 554 * both the before and after sections.
 555 * This is because some failed devices may only affect one
 556 * of the two sections, and some non-in_sync devices may
  557 * be in_sync in the section most affected by failed devices.
 558 */
 559int raid5_calc_degraded(struct r5conf *conf)
 560{
 561	int degraded, degraded2;
 562	int i;
 563
 564	rcu_read_lock();
 565	degraded = 0;
 566	for (i = 0; i < conf->previous_raid_disks; i++) {
 567		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 568		if (rdev && test_bit(Faulty, &rdev->flags))
 569			rdev = rcu_dereference(conf->disks[i].replacement);
 570		if (!rdev || test_bit(Faulty, &rdev->flags))
 571			degraded++;
 572		else if (test_bit(In_sync, &rdev->flags))
 573			;
 574		else
 575			/* not in-sync or faulty.
 576			 * If the reshape increases the number of devices,
 577			 * this is being recovered by the reshape, so
 578			 * this 'previous' section is not in_sync.
 579			 * If the number of devices is being reduced however,
 580			 * the device can only be part of the array if
 581			 * we are reverting a reshape, so this section will
 582			 * be in-sync.
 583			 */
 584			if (conf->raid_disks >= conf->previous_raid_disks)
 585				degraded++;
 586	}
 587	rcu_read_unlock();
 588	if (conf->raid_disks == conf->previous_raid_disks)
 589		return degraded;
 590	rcu_read_lock();
 591	degraded2 = 0;
 592	for (i = 0; i < conf->raid_disks; i++) {
 593		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
 594		if (rdev && test_bit(Faulty, &rdev->flags))
 595			rdev = rcu_dereference(conf->disks[i].replacement);
 596		if (!rdev || test_bit(Faulty, &rdev->flags))
 597			degraded2++;
 598		else if (test_bit(In_sync, &rdev->flags))
 599			;
 600		else
 601			/* not in-sync or faulty.
 602			 * If reshape increases the number of devices, this
 603			 * section has already been recovered, else it
 604			 * almost certainly hasn't.
 605			 */
 606			if (conf->raid_disks <= conf->previous_raid_disks)
 607				degraded2++;
 608	}
 609	rcu_read_unlock();
 610	if (degraded2 > degraded)
 611		return degraded2;
 612	return degraded;
 613}
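/*
 * Illustration (hypothetical scenario): growing a 4-device RAID5 to 5
 * devices while one original member has failed gives degraded == 1 in the
 * 'previous' pass; the new, still-recovering 5th device is not counted in
 * the second pass because the reshape itself rebuilds it, so the result is
 * 1 and has_failed() still treats the array as usable.
 */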
 614
 615static int has_failed(struct r5conf *conf)
 616{
 617	int degraded;
 618
 619	if (conf->mddev->reshape_position == MaxSector)
 620		return conf->mddev->degraded > conf->max_degraded;
 621
 622	degraded = raid5_calc_degraded(conf);
 623	if (degraded > conf->max_degraded)
 624		return 1;
 625	return 0;
 626}
 627
 628struct stripe_head *
 629raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 630			int previous, int noblock, int noquiesce)
 631{
 632	struct stripe_head *sh;
 633	int hash = stripe_hash_locks_hash(sector);
 634	int inc_empty_inactive_list_flag;
 635
 636	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 637
 638	spin_lock_irq(conf->hash_locks + hash);
 639
 640	do {
 641		wait_event_lock_irq(conf->wait_for_quiescent,
 642				    conf->quiesce == 0 || noquiesce,
 643				    *(conf->hash_locks + hash));
 644		sh = __find_stripe(conf, sector, conf->generation - previous);
 645		if (!sh) {
 646			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
 647				sh = get_free_stripe(conf, hash);
 648				if (!sh && !test_bit(R5_DID_ALLOC,
 649						     &conf->cache_state))
 650					set_bit(R5_ALLOC_MORE,
 651						&conf->cache_state);
 652			}
 653			if (noblock && sh == NULL)
 654				break;
 655
 656			r5c_check_stripe_cache_usage(conf);
 657			if (!sh) {
 658				set_bit(R5_INACTIVE_BLOCKED,
 659					&conf->cache_state);
 660				r5l_wake_reclaim(conf->log, 0);
 661				wait_event_lock_irq(
 662					conf->wait_for_stripe,
 663					!list_empty(conf->inactive_list + hash) &&
 664					(atomic_read(&conf->active_stripes)
 665					 < (conf->max_nr_stripes * 3 / 4)
 666					 || !test_bit(R5_INACTIVE_BLOCKED,
 667						      &conf->cache_state)),
 668					*(conf->hash_locks + hash));
 669				clear_bit(R5_INACTIVE_BLOCKED,
 670					  &conf->cache_state);
 671			} else {
 672				init_stripe(sh, sector, previous);
 673				atomic_inc(&sh->count);
 674			}
 675		} else if (!atomic_inc_not_zero(&sh->count)) {
 676			spin_lock(&conf->device_lock);
 677			if (!atomic_read(&sh->count)) {
 678				if (!test_bit(STRIPE_HANDLE, &sh->state))
 679					atomic_inc(&conf->active_stripes);
 680				BUG_ON(list_empty(&sh->lru) &&
 681				       !test_bit(STRIPE_EXPANDING, &sh->state));
 682				inc_empty_inactive_list_flag = 0;
 683				if (!list_empty(conf->inactive_list + hash))
 684					inc_empty_inactive_list_flag = 1;
 685				list_del_init(&sh->lru);
 686				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
 687					atomic_inc(&conf->empty_inactive_list_nr);
 688				if (sh->group) {
 689					sh->group->stripes_cnt--;
 690					sh->group = NULL;
 691				}
 692			}
 693			atomic_inc(&sh->count);
 694			spin_unlock(&conf->device_lock);
 695		}
 696	} while (sh == NULL);
 697
 698	spin_unlock_irq(conf->hash_locks + hash);
 699	return sh;
 700}
 701
 702static bool is_full_stripe_write(struct stripe_head *sh)
 703{
 704	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
 705	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
 706}
 707
 708static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 709{
 710	local_irq_disable();
 711	if (sh1 > sh2) {
 712		spin_lock(&sh2->stripe_lock);
 713		spin_lock_nested(&sh1->stripe_lock, 1);
 714	} else {
 715		spin_lock(&sh1->stripe_lock);
 716		spin_lock_nested(&sh2->stripe_lock, 1);
 717	}
 718}
 719
 720static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 721{
 722	spin_unlock(&sh1->stripe_lock);
 723	spin_unlock(&sh2->stripe_lock);
 724	local_irq_enable();
 725}
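
/*
 * Illustrative sketch, not part of the driver: lock_two_stripes() always
 * takes the lower-addressed stripe_lock first, so two contexts locking
 * the same pair in opposite argument order still acquire the locks in
 * the same order and cannot ABBA-deadlock.  The generic pattern, with
 * hypothetical names, looks like this:
 */
#if 0
static void example_lock_pair_in_order(spinlock_t *a, spinlock_t *b)
{
	if (a > b)
		swap(a, b);	/* impose a global order: lower address first */
	spin_lock(a);
	spin_lock_nested(b, SINGLE_DEPTH_NESTING);
}
#endif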
 726
 727	/* Only a freshly initialized, full-stripe normal write can be added to a batch list */
 728static bool stripe_can_batch(struct stripe_head *sh)
 729{
 730	struct r5conf *conf = sh->raid_conf;
 731
 732	if (conf->log)
 733		return false;
 734	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 735		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
 736		is_full_stripe_write(sh);
 737}
 738
 739	/* we only search backwards for a batch head */
 740static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
 741{
 742	struct stripe_head *head;
 743	sector_t head_sector, tmp_sec;
 744	int hash;
 745	int dd_idx;
 746	int inc_empty_inactive_list_flag;
 747
 748	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
 749	tmp_sec = sh->sector;
 750	if (!sector_div(tmp_sec, conf->chunk_sectors))
 751		return;
 752	head_sector = sh->sector - STRIPE_SECTORS;
 753
 754	hash = stripe_hash_locks_hash(head_sector);
 755	spin_lock_irq(conf->hash_locks + hash);
 756	head = __find_stripe(conf, head_sector, conf->generation);
 757	if (head && !atomic_inc_not_zero(&head->count)) {
 758		spin_lock(&conf->device_lock);
 759		if (!atomic_read(&head->count)) {
 760			if (!test_bit(STRIPE_HANDLE, &head->state))
 761				atomic_inc(&conf->active_stripes);
 762			BUG_ON(list_empty(&head->lru) &&
 763			       !test_bit(STRIPE_EXPANDING, &head->state));
 764			inc_empty_inactive_list_flag = 0;
 765			if (!list_empty(conf->inactive_list + hash))
 766				inc_empty_inactive_list_flag = 1;
 767			list_del_init(&head->lru);
 768			if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
 769				atomic_inc(&conf->empty_inactive_list_nr);
 770			if (head->group) {
 771				head->group->stripes_cnt--;
 772				head->group = NULL;
 773			}
 774		}
 775		atomic_inc(&head->count);
 776		spin_unlock(&conf->device_lock);
 777	}
 778	spin_unlock_irq(conf->hash_locks + hash);
 779
 780	if (!head)
 781		return;
 782	if (!stripe_can_batch(head))
 783		goto out;
 784
 785	lock_two_stripes(head, sh);
 786	/* clear_batch_ready clears the flag */
 787	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
 788		goto unlock_out;
 789
 790	if (sh->batch_head)
 791		goto unlock_out;
 792
 793	dd_idx = 0;
 794	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
 795		dd_idx++;
 796	if (head->dev[dd_idx].towrite->bi_opf != sh->dev[dd_idx].towrite->bi_opf ||
 797	    bio_op(head->dev[dd_idx].towrite) != bio_op(sh->dev[dd_idx].towrite))
 798		goto unlock_out;
 799
 800	if (head->batch_head) {
 801		spin_lock(&head->batch_head->batch_lock);
 802		/* This batch list is already running */
 803		if (!stripe_can_batch(head)) {
 804			spin_unlock(&head->batch_head->batch_lock);
 805			goto unlock_out;
 806		}
 807
 808		/*
 809		 * at this point, head's BATCH_READY could be cleared, but we
 810		 * can still add the stripe to batch list
 811		 */
 812		list_add(&sh->batch_list, &head->batch_list);
 813		spin_unlock(&head->batch_head->batch_lock);
 814
 815		sh->batch_head = head->batch_head;
 816	} else {
 817		head->batch_head = head;
 818		sh->batch_head = head->batch_head;
 819		spin_lock(&head->batch_lock);
 820		list_add_tail(&sh->batch_list, &head->batch_list);
 821		spin_unlock(&head->batch_lock);
 822	}
 823
 824	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 825		if (atomic_dec_return(&conf->preread_active_stripes)
 826		    < IO_THRESHOLD)
 827			md_wakeup_thread(conf->mddev->thread);
 828
 829	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
 830		int seq = sh->bm_seq;
 831		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
 832		    sh->batch_head->bm_seq > seq)
 833			seq = sh->batch_head->bm_seq;
 834		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
 835		sh->batch_head->bm_seq = seq;
 836	}
 837
 838	atomic_inc(&sh->count);
 839unlock_out:
 840	unlock_two_stripes(head, sh);
 841out:
 842	raid5_release_stripe(head);
 843}
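
/*
 * Illustrative worked example, not part of the driver: batching only
 * links stripes within one chunk, stepping backwards one stripe at a
 * time (head_sector = sh->sector - STRIPE_SECTORS).  With, say, a
 * 512KiB chunk (chunk_sectors == 1024) and 4KiB stripes
 * (STRIPE_SECTORS == 8), up to 1024 / 8 == 128 consecutive full-stripe
 * writes can end up chained to the same batch head.
 */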
 844
 845/* Determine if 'data_offset' or 'new_data_offset' should be used
 846 * in this stripe_head.
 847 */
 848static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 849{
 850	sector_t progress = conf->reshape_progress;
 851	/* Need a memory barrier to make sure we see the value
 852	 * of conf->generation, or ->data_offset that was set before
 853	 * reshape_progress was updated.
 854	 */
 855	smp_rmb();
 856	if (progress == MaxSector)
 857		return 0;
 858	if (sh->generation == conf->generation - 1)
 859		return 0;
 860	/* We are in a reshape, and this is a new-generation stripe,
 861	 * so use new_data_offset.
 862	 */
 863	return 1;
 864}
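
/*
 * Illustrative note, not part of the driver: the caller applies the
 * result by adding either data_offset or new_data_offset to the stripe
 * sector, as ops_run_io() does below:
 *
 *	if (use_new_offset(conf, sh))
 *		bi->bi_iter.bi_sector = sh->sector + rdev->new_data_offset;
 *	else
 *		bi->bi_iter.bi_sector = sh->sector + rdev->data_offset;
 *
 * A stripe initialised under the previous geometry (generation - 1)
 * therefore keeps using the old on-disk offset.
 */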
 865
 866static void
 867raid5_end_read_request(struct bio *bi);
 868static void
 869raid5_end_write_request(struct bio *bi);
 870
 871static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 872{
 873	struct r5conf *conf = sh->raid_conf;
 874	int i, disks = sh->disks;
 875	struct stripe_head *head_sh = sh;
 876
 877	might_sleep();
 878
 879	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
 880		/* writing out phase */
 881		if (s->waiting_extra_page)
 882			return;
 883		if (r5l_write_stripe(conf->log, sh) == 0)
 884			return;
 885	} else {  /* caching phase */
 886		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
 887			r5c_cache_data(conf->log, sh, s);
 888			return;
 889		}
 890	}
 891
 892	for (i = disks; i--; ) {
 893		int op, op_flags = 0;
 894		int replace_only = 0;
 895		struct bio *bi, *rbi;
 896		struct md_rdev *rdev, *rrdev = NULL;
 897
 898		sh = head_sh;
 899		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 900			op = REQ_OP_WRITE;
 901			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
 902				op_flags = REQ_FUA;
 903			if (test_bit(R5_Discard, &sh->dev[i].flags))
 904				op = REQ_OP_DISCARD;
 905		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 906			op = REQ_OP_READ;
 907		else if (test_and_clear_bit(R5_WantReplace,
 908					    &sh->dev[i].flags)) {
 909			op = REQ_OP_WRITE;
 910			replace_only = 1;
 911		} else
 912			continue;
 913		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
 914			op_flags |= REQ_SYNC;
 915
 916again:
 917		bi = &sh->dev[i].req;
 918		rbi = &sh->dev[i].rreq; /* For writing to replacement */
 919
 920		rcu_read_lock();
 921		rrdev = rcu_dereference(conf->disks[i].replacement);
 922		smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
 923		rdev = rcu_dereference(conf->disks[i].rdev);
 924		if (!rdev) {
 925			rdev = rrdev;
 926			rrdev = NULL;
 927		}
 928		if (op_is_write(op)) {
 929			if (replace_only)
 930				rdev = NULL;
 931			if (rdev == rrdev)
 932				/* We raced and saw duplicates */
 933				rrdev = NULL;
 934		} else {
 935			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
 936				rdev = rrdev;
 937			rrdev = NULL;
 938		}
 939
 940		if (rdev && test_bit(Faulty, &rdev->flags))
 941			rdev = NULL;
 942		if (rdev)
 943			atomic_inc(&rdev->nr_pending);
 944		if (rrdev && test_bit(Faulty, &rrdev->flags))
 945			rrdev = NULL;
 946		if (rrdev)
 947			atomic_inc(&rrdev->nr_pending);
 948		rcu_read_unlock();
 949
 950		/* We have already checked bad blocks for reads.  Now we
 951		 * need to check for writes.  We never accept write errors
 952		 * on the replacement, so we don't need to check rrdev.
 953		 */
 954		while (op_is_write(op) && rdev &&
 955		       test_bit(WriteErrorSeen, &rdev->flags)) {
 956			sector_t first_bad;
 957			int bad_sectors;
 958			int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 959					      &first_bad, &bad_sectors);
 960			if (!bad)
 961				break;
 962
 963			if (bad < 0) {
 964				set_bit(BlockedBadBlocks, &rdev->flags);
 965				if (!conf->mddev->external &&
 966				    conf->mddev->sb_flags) {
 967				/* It is very unlikely, but we might
 968				 * still need to write out the
 969				 * bad block log - better give it
 970				 * a chance. */
 971					md_check_recovery(conf->mddev);
 972				}
 973				/*
 974				 * Because md_wait_for_blocked_rdev
 975				 * will dec nr_pending, we must
 976				 * increment it first.
 977				 */
 978				atomic_inc(&rdev->nr_pending);
 979				md_wait_for_blocked_rdev(rdev, conf->mddev);
 980			} else {
 981				/* Acknowledged bad block - skip the write */
 982				rdev_dec_pending(rdev, conf->mddev);
 983				rdev = NULL;
 984			}
 985		}
 986
 987		if (rdev) {
 988			if (s->syncing || s->expanding || s->expanded
 989			    || s->replacing)
 990				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 991
 992			set_bit(STRIPE_IO_STARTED, &sh->state);
 993
 994			bi->bi_bdev = rdev->bdev;
 995			bio_set_op_attrs(bi, op, op_flags);
 996			bi->bi_end_io = op_is_write(op)
 997				? raid5_end_write_request
 998				: raid5_end_read_request;
 999			bi->bi_private = sh;
1000
1001			pr_debug("%s: for %llu schedule op %d on disc %d\n",
1002				__func__, (unsigned long long)sh->sector,
1003				bi->bi_opf, i);
1004			atomic_inc(&sh->count);
1005			if (sh != head_sh)
1006				atomic_inc(&head_sh->count);
1007			if (use_new_offset(conf, sh))
1008				bi->bi_iter.bi_sector = (sh->sector
1009						 + rdev->new_data_offset);
1010			else
1011				bi->bi_iter.bi_sector = (sh->sector
1012						 + rdev->data_offset);
1013			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
1014				bi->bi_opf |= REQ_NOMERGE;
1015
1016			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1017				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1018
1019			if (!op_is_write(op) &&
1020			    test_bit(R5_InJournal, &sh->dev[i].flags))
1021				/*
1022				 * Issuing a read for a page in the journal:
1023				 * this must be preparing for a prexor in
1024				 * rmw; read the data into orig_page.
1025				 */
1026				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
1027			else
1028				sh->dev[i].vec.bv_page = sh->dev[i].page;
1029			bi->bi_vcnt = 1;
1030			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1031			bi->bi_io_vec[0].bv_offset = 0;
1032			bi->bi_iter.bi_size = STRIPE_SIZE;
1033			/*
1034			 * If this is a discard request, set bi_vcnt to 0.  We don't
1035			 * want to confuse SCSI because SCSI will replace the payload.
1036			 */
1037			if (op == REQ_OP_DISCARD)
1038				bi->bi_vcnt = 0;
1039			if (rrdev)
1040				set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
1041
1042			if (conf->mddev->gendisk)
1043				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
1044						      bi, disk_devt(conf->mddev->gendisk),
1045						      sh->dev[i].sector);
1046			generic_make_request(bi);
1047		}
1048		if (rrdev) {
1049			if (s->syncing || s->expanding || s->expanded
1050			    || s->replacing)
1051				md_sync_acct(rrdev->bdev, STRIPE_SECTORS);
1052
1053			set_bit(STRIPE_IO_STARTED, &sh->state);
1054
1055			rbi->bi_bdev = rrdev->bdev;
1056			bio_set_op_attrs(rbi, op, op_flags);
1057			BUG_ON(!op_is_write(op));
1058			rbi->bi_end_io = raid5_end_write_request;
1059			rbi->bi_private = sh;
1060
1061			pr_debug("%s: for %llu schedule op %d on "
1062				 "replacement disc %d\n",
1063				__func__, (unsigned long long)sh->sector,
1064				rbi->bi_opf, i);
1065			atomic_inc(&sh->count);
1066			if (sh != head_sh)
1067				atomic_inc(&head_sh->count);
1068			if (use_new_offset(conf, sh))
1069				rbi->bi_iter.bi_sector = (sh->sector
1070						  + rrdev->new_data_offset);
1071			else
1072				rbi->bi_iter.bi_sector = (sh->sector
1073						  + rrdev->data_offset);
1074			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
1075				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
1076			sh->dev[i].rvec.bv_page = sh->dev[i].page;
1077			rbi->bi_vcnt = 1;
1078			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1079			rbi->bi_io_vec[0].bv_offset = 0;
1080			rbi->bi_iter.bi_size = STRIPE_SIZE;
1081			/*
1082			 * If this is a discard request, set bi_vcnt to 0.  We don't
1083			 * want to confuse SCSI because SCSI will replace the payload.
1084			 */
1085			if (op == REQ_OP_DISCARD)
1086				rbi->bi_vcnt = 0;
1087			if (conf->mddev->gendisk)
1088				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
1089						      rbi, disk_devt(conf->mddev->gendisk),
1090						      sh->dev[i].sector);
1091			generic_make_request(rbi);
1092		}
1093		if (!rdev && !rrdev) {
1094			if (op_is_write(op))
1095				set_bit(STRIPE_DEGRADED, &sh->state);
1096			pr_debug("skip op %d on disc %d for sector %llu\n",
1097				bi->bi_opf, i, (unsigned long long)sh->sector);
1098			clear_bit(R5_LOCKED, &sh->dev[i].flags);
1099			set_bit(STRIPE_HANDLE, &sh->state);
1100		}
1101
1102		if (!head_sh->batch_head)
1103			continue;
1104		sh = list_first_entry(&sh->batch_list, struct stripe_head,
1105				      batch_list);
1106		if (sh != head_sh)
1107			goto again;
1108	}
1109}
1110
1111static struct dma_async_tx_descriptor *
1112async_copy_data(int frombio, struct bio *bio, struct page **page,
1113	sector_t sector, struct dma_async_tx_descriptor *tx,
1114	struct stripe_head *sh, int no_skipcopy)
1115{
1116	struct bio_vec bvl;
1117	struct bvec_iter iter;
1118	struct page *bio_page;
1119	int page_offset;
1120	struct async_submit_ctl submit;
1121	enum async_tx_flags flags = 0;
1122
1123	if (bio->bi_iter.bi_sector >= sector)
1124		page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512;
1125	else
1126		page_offset = (signed)(sector - bio->bi_iter.bi_sector) * -512;
1127
1128	if (frombio)
1129		flags |= ASYNC_TX_FENCE;
1130	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
1131
1132	bio_for_each_segment(bvl, bio, iter) {
1133		int len = bvl.bv_len;
1134		int clen;
1135		int b_offset = 0;
1136
1137		if (page_offset < 0) {
1138			b_offset = -page_offset;
1139			page_offset += b_offset;
1140			len -= b_offset;
1141		}
1142
1143		if (len > 0 && page_offset + len > STRIPE_SIZE)
1144			clen = STRIPE_SIZE - page_offset;
1145		else
1146			clen = len;
1147
1148		if (clen > 0) {
1149			b_offset += bvl.bv_offset;
1150			bio_page = bvl.bv_page;
1151			if (frombio) {
1152				if (sh->raid_conf->skip_copy &&
1153				    b_offset == 0 && page_offset == 0 &&
1154				    clen == STRIPE_SIZE &&
1155				    !no_skipcopy)
1156					*page = bio_page;
1157				else
1158					tx = async_memcpy(*page, bio_page, page_offset,
1159						  b_offset, clen, &submit);
1160			} else
1161				tx = async_memcpy(bio_page, *page, b_offset,
1162						  page_offset, clen, &submit);
1163		}
1164		/* chain the operations */
1165		submit.depend_tx = tx;
1166
1167		if (clen < len) /* hit end of page */
1168			break;
1169		page_offset +=  len;
1170	}
1171
1172	return tx;
1173}
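
/*
 * Illustrative worked example, not part of the driver: page_offset is
 * the signed byte offset of the bio relative to the stripe page, at
 * 512 bytes per sector.  A bio starting 2 sectors after 'sector' gets
 * page_offset = 2 * 512 = 1024, so copying lands 1KiB into the stripe
 * page; a bio starting 2 sectors before 'sector' gets
 * page_offset = -1024, and b_offset skips the first 1024 bytes of the
 * bio segment before copying begins at offset 0 of the stripe page.
 */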
1174
1175static void ops_complete_biofill(void *stripe_head_ref)
1176{
1177	struct stripe_head *sh = stripe_head_ref;
1178	struct bio_list return_bi = BIO_EMPTY_LIST;
1179	int i;
1180
1181	pr_debug("%s: stripe %llu\n", __func__,
1182		(unsigned long long)sh->sector);
1183
1184	/* clear completed biofills */
1185	for (i = sh->disks; i--; ) {
1186		struct r5dev *dev = &sh->dev[i];
1187
1188		/* acknowledge completion of a biofill operation */
1189		/* and check if we need to reply to a read request,
1190		 * new R5_Wantfill requests are held off until
1191		 * !STRIPE_BIOFILL_RUN
1192		 */
1193		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
1194			struct bio *rbi, *rbi2;
1195
1196			BUG_ON(!dev->read);
1197			rbi = dev->read;
1198			dev->read = NULL;
1199			while (rbi && rbi->bi_iter.bi_sector <
1200				dev->sector + STRIPE_SECTORS) {
1201				rbi2 = r5_next_bio(rbi, dev->sector);
1202				if (!raid5_dec_bi_active_stripes(rbi))
1203					bio_list_add(&return_bi, rbi);
1204				rbi = rbi2;
1205			}
1206		}
1207	}
1208	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
1209
1210	return_io(&return_bi);
1211
1212	set_bit(STRIPE_HANDLE, &sh->state);
1213	raid5_release_stripe(sh);
1214}
1215
1216static void ops_run_biofill(struct stripe_head *sh)
1217{
1218	struct dma_async_tx_descriptor *tx = NULL;
1219	struct async_submit_ctl submit;
1220	int i;
1221
1222	BUG_ON(sh->batch_head);
1223	pr_debug("%s: stripe %llu\n", __func__,
1224		(unsigned long long)sh->sector);
1225
1226	for (i = sh->disks; i--; ) {
1227		struct r5dev *dev = &sh->dev[i];
1228		if (test_bit(R5_Wantfill, &dev->flags)) {
1229			struct bio *rbi;
1230			spin_lock_irq(&sh->stripe_lock);
1231			dev->read = rbi = dev->toread;
1232			dev->toread = NULL;
1233			spin_unlock_irq(&sh->stripe_lock);
1234			while (rbi && rbi->bi_iter.bi_sector <
1235				dev->sector + STRIPE_SECTORS) {
1236				tx = async_copy_data(0, rbi, &dev->page,
1237						     dev->sector, tx, sh, 0);
1238				rbi = r5_next_bio(rbi, dev->sector);
1239			}
1240		}
1241	}
1242
1243	atomic_inc(&sh->count);
1244	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
1245	async_trigger_callback(&submit);
1246}
1247
1248static void mark_target_uptodate(struct stripe_head *sh, int target)
1249{
1250	struct r5dev *tgt;
1251
1252	if (target < 0)
1253		return;
1254
1255	tgt = &sh->dev[target];
1256	set_bit(R5_UPTODATE, &tgt->flags);
1257	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1258	clear_bit(R5_Wantcompute, &tgt->flags);
1259}
1260
1261static void ops_complete_compute(void *stripe_head_ref)
1262{
1263	struct stripe_head *sh = stripe_head_ref;
1264
1265	pr_debug("%s: stripe %llu\n", __func__,
1266		(unsigned long long)sh->sector);
1267
1268	/* mark the computed target(s) as uptodate */
1269	mark_target_uptodate(sh, sh->ops.target);
1270	mark_target_uptodate(sh, sh->ops.target2);
1271
1272	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
1273	if (sh->check_state == check_state_compute_run)
1274		sh->check_state = check_state_compute_result;
1275	set_bit(STRIPE_HANDLE, &sh->state);
1276	raid5_release_stripe(sh);
1277}
1278
1279/* return a pointer to the address conversion region of the scribble buffer */
1280static addr_conv_t *to_addr_conv(struct stripe_head *sh,
1281				 struct raid5_percpu *percpu, int i)
1282{
1283	void *addr;
1284
1285	addr = flex_array_get(percpu->scribble, i);
1286	return addr + sizeof(struct page *) * (sh->disks + 2);
1287}
1288
1289/* return a pointer to the page list at the start of the scribble buffer */
1290static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
1291{
1292	void *addr;
1293
1294	addr = flex_array_get(percpu->scribble, i);
1295	return addr;
1296}
1297
1298static struct dma_async_tx_descriptor *
1299ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
1300{
1301	int disks = sh->disks;
1302	struct page **xor_srcs = to_addr_page(percpu, 0);
1303	int target = sh->ops.target;
1304	struct r5dev *tgt = &sh->dev[target];
1305	struct page *xor_dest = tgt->page;
1306	int count = 0;
1307	struct dma_async_tx_descriptor *tx;
1308	struct async_submit_ctl submit;
1309	int i;
1310
1311	BUG_ON(sh->batch_head);
1312
1313	pr_debug("%s: stripe %llu block: %d\n",
1314		__func__, (unsigned long long)sh->sector, target);
1315	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1316
1317	for (i = disks; i--; )
1318		if (i != target)
1319			xor_srcs[count++] = sh->dev[i].page;
1320
1321	atomic_inc(&sh->count);
1322
1323	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
1324			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
1325	if (unlikely(count == 1))
1326		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1327	else
1328		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1329
1330	return tx;
1331}
1332
1333/* set_syndrome_sources - populate source buffers for gen_syndrome
1334 * @srcs - (struct page *) array of size sh->disks
1335 * @sh - stripe_head to parse
1336 *
1337 * Populates srcs in proper layout order for the stripe and returns the
1338 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
1339 * destination buffer is recorded in srcs[count] and the Q destination
1340	 * is recorded in srcs[count+1].
1341 */
1342static int set_syndrome_sources(struct page **srcs,
1343				struct stripe_head *sh,
1344				int srctype)
1345{
1346	int disks = sh->disks;
1347	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
1348	int d0_idx = raid6_d0(sh);
1349	int count;
1350	int i;
1351
1352	for (i = 0; i < disks; i++)
1353		srcs[i] = NULL;
1354
1355	count = 0;
1356	i = d0_idx;
1357	do {
1358		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1359		struct r5dev *dev = &sh->dev[i];
1360
1361		if (i == sh->qd_idx || i == sh->pd_idx ||
1362		    (srctype == SYNDROME_SRC_ALL) ||
1363		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
1364		     (test_bit(R5_Wantdrain, &dev->flags) ||
1365		      test_bit(R5_InJournal, &dev->flags))) ||
1366		    (srctype == SYNDROME_SRC_WRITTEN &&
1367		     (dev->written ||
1368		      test_bit(R5_InJournal, &dev->flags)))) {
1369			if (test_bit(R5_InJournal, &dev->flags))
1370				srcs[slot] = sh->dev[i].orig_page;
1371			else
1372				srcs[slot] = sh->dev[i].page;
1373		}
1374		i = raid6_next_disk(i, disks);
1375	} while (i != d0_idx);
1376
1377	return syndrome_disks;
1378}
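
/*
 * Illustrative example, not part of the driver: for a 6-device RAID6
 * stripe with the default (non-DDF) layout, syndrome_disks is 4, the
 * four data pages land in srcs[0..3] in syndrome order, P is placed in
 * srcs[4] and Q in srcs[5].  A caller such as ops_run_prexor6() then
 * hands count + 2 == 6 pages to async_gen_syndrome().
 */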
1379
1380static struct dma_async_tx_descriptor *
1381ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
1382{
1383	int disks = sh->disks;
1384	struct page **blocks = to_addr_page(percpu, 0);
1385	int target;
1386	int qd_idx = sh->qd_idx;
1387	struct dma_async_tx_descriptor *tx;
1388	struct async_submit_ctl submit;
1389	struct r5dev *tgt;
1390	struct page *dest;
1391	int i;
1392	int count;
1393
1394	BUG_ON(sh->batch_head);
1395	if (sh->ops.target < 0)
1396		target = sh->ops.target2;
1397	else if (sh->ops.target2 < 0)
1398		target = sh->ops.target;
1399	else
1400		/* we should only have one valid target */
1401		BUG();
1402	BUG_ON(target < 0);
1403	pr_debug("%s: stripe %llu block: %d\n",
1404		__func__, (unsigned long long)sh->sector, target);
1405
1406	tgt = &sh->dev[target];
1407	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1408	dest = tgt->page;
1409
1410	atomic_inc(&sh->count);
1411
1412	if (target == qd_idx) {
1413		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1414		blocks[count] = NULL; /* regenerating p is not necessary */
1415		BUG_ON(blocks[count+1] != dest); /* q should already be set */
1416		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1417				  ops_complete_compute, sh,
1418				  to_addr_conv(sh, percpu, 0));
1419		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1420	} else {
1421		/* Compute any data- or p-drive using XOR */
1422		count = 0;
1423		for (i = disks; i-- ; ) {
1424			if (i == target || i == qd_idx)
1425				continue;
1426			blocks[count++] = sh->dev[i].page;
1427		}
1428
1429		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1430				  NULL, ops_complete_compute, sh,
1431				  to_addr_conv(sh, percpu, 0));
1432		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
1433	}
1434
1435	return tx;
1436}
1437
1438static struct dma_async_tx_descriptor *
1439ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
1440{
1441	int i, count, disks = sh->disks;
1442	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
1443	int d0_idx = raid6_d0(sh);
1444	int faila = -1, failb = -1;
1445	int target = sh->ops.target;
1446	int target2 = sh->ops.target2;
1447	struct r5dev *tgt = &sh->dev[target];
1448	struct r5dev *tgt2 = &sh->dev[target2];
1449	struct dma_async_tx_descriptor *tx;
1450	struct page **blocks = to_addr_page(percpu, 0);
1451	struct async_submit_ctl submit;
1452
1453	BUG_ON(sh->batch_head);
1454	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
1455		 __func__, (unsigned long long)sh->sector, target, target2);
1456	BUG_ON(target < 0 || target2 < 0);
1457	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
1458	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
1459
1460	/* we need to open-code set_syndrome_sources to handle the
1461	 * slot number conversion for 'faila' and 'failb'
1462	 */
1463	for (i = 0; i < disks ; i++)
1464		blocks[i] = NULL;
1465	count = 0;
1466	i = d0_idx;
1467	do {
1468		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
1469
1470		blocks[slot] = sh->dev[i].page;
1471
1472		if (i == target)
1473			faila = slot;
1474		if (i == target2)
1475			failb = slot;
1476		i = raid6_next_disk(i, disks);
1477	} while (i != d0_idx);
1478
1479	BUG_ON(faila == failb);
1480	if (failb < faila)
1481		swap(faila, failb);
1482	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
1483		 __func__, (unsigned long long)sh->sector, faila, failb);
1484
1485	atomic_inc(&sh->count);
1486
1487	if (failb == syndrome_disks+1) {
1488		/* Q disk is one of the missing disks */
1489		if (faila == syndrome_disks) {
1490			/* Missing P+Q, just recompute */
1491			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1492					  ops_complete_compute, sh,
1493					  to_addr_conv(sh, percpu, 0));
1494			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
1495						  STRIPE_SIZE, &submit);
1496		} else {
1497			struct page *dest;
1498			int data_target;
1499			int qd_idx = sh->qd_idx;
1500
1501			/* Missing D+Q: recompute D from P, then recompute Q */
1502			if (target == qd_idx)
1503				data_target = target2;
1504			else
1505				data_target = target;
1506
1507			count = 0;
1508			for (i = disks; i-- ; ) {
1509				if (i == data_target || i == qd_idx)
1510					continue;
1511				blocks[count++] = sh->dev[i].page;
1512			}
1513			dest = sh->dev[data_target].page;
1514			init_async_submit(&submit,
1515					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
1516					  NULL, NULL, NULL,
1517					  to_addr_conv(sh, percpu, 0));
1518			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
1519				       &submit);
1520
1521			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
1522			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
1523					  ops_complete_compute, sh,
1524					  to_addr_conv(sh, percpu, 0));
1525			return async_gen_syndrome(blocks, 0, count+2,
1526						  STRIPE_SIZE, &submit);
1527		}
1528	} else {
1529		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
1530				  ops_complete_compute, sh,
1531				  to_addr_conv(sh, percpu, 0));
1532		if (failb == syndrome_disks) {
1533			/* We're missing D+P. */
1534			return async_raid6_datap_recov(syndrome_disks+2,
1535						       STRIPE_SIZE, faila,
1536						       blocks, &submit);
1537		} else {
1538			/* We're missing D+D. */
1539			return async_raid6_2data_recov(syndrome_disks+2,
1540						       STRIPE_SIZE, faila, failb,
1541						       blocks, &submit);
1542		}
1543	}
1544}
1545
1546static void ops_complete_prexor(void *stripe_head_ref)
1547{
1548	struct stripe_head *sh = stripe_head_ref;
1549
1550	pr_debug("%s: stripe %llu\n", __func__,
1551		(unsigned long long)sh->sector);
1552
1553	if (r5c_is_writeback(sh->raid_conf->log))
1554		/*
1555		 * raid5-cache write back uses orig_page during prexor.
1556		 * After prexor, it is time to free orig_page
1557		 */
1558		r5c_release_extra_page(sh);
1559}
1560
1561static struct dma_async_tx_descriptor *
1562ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
1563		struct dma_async_tx_descriptor *tx)
1564{
1565	int disks = sh->disks;
1566	struct page **xor_srcs = to_addr_page(percpu, 0);
1567	int count = 0, pd_idx = sh->pd_idx, i;
1568	struct async_submit_ctl submit;
1569
1570	/* existing parity data subtracted */
1571	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1572
1573	BUG_ON(sh->batch_head);
1574	pr_debug("%s: stripe %llu\n", __func__,
1575		(unsigned long long)sh->sector);
1576
1577	for (i = disks; i--; ) {
1578		struct r5dev *dev = &sh->dev[i];
1579		/* Only process blocks that are known to be uptodate */
1580		if (test_bit(R5_InJournal, &dev->flags))
1581			xor_srcs[count++] = dev->orig_page;
1582		else if (test_bit(R5_Wantdrain, &dev->flags))
1583			xor_srcs[count++] = dev->page;
1584	}
1585
1586	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1587			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1588	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1589
1590	return tx;
1591}
1592
1593static struct dma_async_tx_descriptor *
1594ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
1595		struct dma_async_tx_descriptor *tx)
1596{
1597	struct page **blocks = to_addr_page(percpu, 0);
1598	int count;
1599	struct async_submit_ctl submit;
1600
1601	pr_debug("%s: stripe %llu\n", __func__,
1602		(unsigned long long)sh->sector);
1603
1604	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
1605
1606	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
1607			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
1608	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1609
1610	return tx;
1611}
1612
1613static struct dma_async_tx_descriptor *
1614ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1615{
1616	struct r5conf *conf = sh->raid_conf;
1617	int disks = sh->disks;
1618	int i;
1619	struct stripe_head *head_sh = sh;
1620
1621	pr_debug("%s: stripe %llu\n", __func__,
1622		(unsigned long long)sh->sector);
1623
1624	for (i = disks; i--; ) {
1625		struct r5dev *dev;
1626		struct bio *chosen;
1627
1628		sh = head_sh;
1629		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
1630			struct bio *wbi;
1631
1632again:
1633			dev = &sh->dev[i];
1634			/*
1635			 * Clear R5_InJournal, so that when rewriting a page in
1636			 * the journal, it is not skipped by r5l_log_stripe().
1637			 */
1638			clear_bit(R5_InJournal, &dev->flags);
1639			spin_lock_irq(&sh->stripe_lock);
1640			chosen = dev->towrite;
1641			dev->towrite = NULL;
1642			sh->overwrite_disks = 0;
1643			BUG_ON(dev->written);
1644			wbi = dev->written = chosen;
1645			spin_unlock_irq(&sh->stripe_lock);
1646			WARN_ON(dev->page != dev->orig_page);
1647
1648			while (wbi && wbi->bi_iter.bi_sector <
1649				dev->sector + STRIPE_SECTORS) {
1650				if (wbi->bi_opf & REQ_FUA)
1651					set_bit(R5_WantFUA, &dev->flags);
1652				if (wbi->bi_opf & REQ_SYNC)
1653					set_bit(R5_SyncIO, &dev->flags);
1654				if (bio_op(wbi) == REQ_OP_DISCARD)
1655					set_bit(R5_Discard, &dev->flags);
1656				else {
1657					tx = async_copy_data(1, wbi, &dev->page,
1658							     dev->sector, tx, sh,
1659							     r5c_is_writeback(conf->log));
1660					if (dev->page != dev->orig_page &&
1661					    !r5c_is_writeback(conf->log)) {
1662						set_bit(R5_SkipCopy, &dev->flags);
1663						clear_bit(R5_UPTODATE, &dev->flags);
1664						clear_bit(R5_OVERWRITE, &dev->flags);
1665					}
1666				}
1667				wbi = r5_next_bio(wbi, dev->sector);
1668			}
1669
1670			if (head_sh->batch_head) {
1671				sh = list_first_entry(&sh->batch_list,
1672						      struct stripe_head,
1673						      batch_list);
1674				if (sh == head_sh)
1675					continue;
1676				goto again;
1677			}
1678		}
1679	}
1680
1681	return tx;
1682}
1683
1684static void ops_complete_reconstruct(void *stripe_head_ref)
1685{
1686	struct stripe_head *sh = stripe_head_ref;
1687	int disks = sh->disks;
1688	int pd_idx = sh->pd_idx;
1689	int qd_idx = sh->qd_idx;
1690	int i;
1691	bool fua = false, sync = false, discard = false;
1692
1693	pr_debug("%s: stripe %llu\n", __func__,
1694		(unsigned long long)sh->sector);
1695
1696	for (i = disks; i--; ) {
1697		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1698		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1699		discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1700	}
1701
1702	for (i = disks; i--; ) {
1703		struct r5dev *dev = &sh->dev[i];
1704
1705		if (dev->written || i == pd_idx || i == qd_idx) {
1706			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
1707				set_bit(R5_UPTODATE, &dev->flags);
1708			if (fua)
1709				set_bit(R5_WantFUA, &dev->flags);
1710			if (sync)
1711				set_bit(R5_SyncIO, &dev->flags);
1712		}
1713	}
1714
1715	if (sh->reconstruct_state == reconstruct_state_drain_run)
1716		sh->reconstruct_state = reconstruct_state_drain_result;
1717	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1718		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1719	else {
1720		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1721		sh->reconstruct_state = reconstruct_state_result;
1722	}
1723
1724	set_bit(STRIPE_HANDLE, &sh->state);
1725	raid5_release_stripe(sh);
1726}
1727
1728static void
1729ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1730		     struct dma_async_tx_descriptor *tx)
1731{
1732	int disks = sh->disks;
1733	struct page **xor_srcs;
1734	struct async_submit_ctl submit;
1735	int count, pd_idx = sh->pd_idx, i;
1736	struct page *xor_dest;
1737	int prexor = 0;
1738	unsigned long flags;
1739	int j = 0;
1740	struct stripe_head *head_sh = sh;
1741	int last_stripe;
1742
1743	pr_debug("%s: stripe %llu\n", __func__,
1744		(unsigned long long)sh->sector);
1745
1746	for (i = 0; i < sh->disks; i++) {
1747		if (pd_idx == i)
1748			continue;
1749		if (!test_bit(R5_Discard, &sh->dev[i].flags))
1750			break;
1751	}
1752	if (i >= sh->disks) {
1753		atomic_inc(&sh->count);
1754		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1755		ops_complete_reconstruct(sh);
1756		return;
1757	}
1758again:
1759	count = 0;
1760	xor_srcs = to_addr_page(percpu, j);
1761	/* Check if prexor is active, which means we only process blocks
1762	 * that are part of a read-modify-write (i.e. blocks with 'written' set)
1763	 */
1764	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1765		prexor = 1;
1766		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1767		for (i = disks; i--; ) {
1768			struct r5dev *dev = &sh->dev[i];
1769			if (head_sh->dev[i].written ||
1770			    test_bit(R5_InJournal, &head_sh->dev[i].flags))
1771				xor_srcs[count++] = dev->page;
1772		}
1773	} else {
1774		xor_dest = sh->dev[pd_idx].page;
1775		for (i = disks; i--; ) {
1776			struct r5dev *dev = &sh->dev[i];
1777			if (i != pd_idx)
1778				xor_srcs[count++] = dev->page;
1779		}
1780	}
1781
1782	/* 1/ if we prexor'd then the dest is reused as a source, so set
1783	 *    ASYNC_TX_XOR_DROP_DST
1784	 * 2/ if we did not prexor then we are redoing the parity, so set
1785	 *    ASYNC_TX_XOR_ZERO_DST for the synchronous xor case
1786	 */
1787	last_stripe = !head_sh->batch_head ||
1788		list_first_entry(&sh->batch_list,
1789				 struct stripe_head, batch_list) == head_sh;
1790	if (last_stripe) {
1791		flags = ASYNC_TX_ACK |
1792			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1793
1794		atomic_inc(&head_sh->count);
1795		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
1796				  to_addr_conv(sh, percpu, j));
1797	} else {
1798		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
1799		init_async_submit(&submit, flags, tx, NULL, NULL,
1800				  to_addr_conv(sh, percpu, j));
1801	}
1802
1803	if (unlikely(count == 1))
1804		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1805	else
1806		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1807	if (!last_stripe) {
1808		j++;
1809		sh = list_first_entry(&sh->batch_list, struct stripe_head,
1810				      batch_list);
1811		goto again;
1812	}
1813}
1814
1815static void
1816ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1817		     struct dma_async_tx_descriptor *tx)
1818{
1819	struct async_submit_ctl submit;
1820	struct page **blocks;
1821	int count, i, j = 0;
1822	struct stripe_head *head_sh = sh;
1823	int last_stripe;
1824	int synflags;
1825	unsigned long txflags;
1826
1827	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1828
1829	for (i = 0; i < sh->disks; i++) {
1830		if (sh->pd_idx == i || sh->qd_idx == i)
1831			continue;
1832		if (!test_bit(R5_Discard, &sh->dev[i].flags))
1833			break;
1834	}
1835	if (i >= sh->disks) {
1836		atomic_inc(&sh->count);
1837		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1838		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1839		ops_complete_reconstruct(sh);
1840		return;
1841	}
1842
1843again:
1844	blocks = to_addr_page(percpu, j);
1845
1846	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1847		synflags = SYNDROME_SRC_WRITTEN;
1848		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
1849	} else {
1850		synflags = SYNDROME_SRC_ALL;
1851		txflags = ASYNC_TX_ACK;
1852	}
1853
1854	count = set_syndrome_sources(blocks, sh, synflags);
1855	last_stripe = !head_sh->batch_head ||
1856		list_first_entry(&sh->batch_list,
1857				 struct stripe_head, batch_list) == head_sh;
1858
1859	if (last_stripe) {
1860		atomic_inc(&head_sh->count);
1861		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
1862				  head_sh, to_addr_conv(sh, percpu, j));
1863	} else
1864		init_async_submit(&submit, 0, tx, NULL, NULL,
1865				  to_addr_conv(sh, percpu, j));
1866	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1867	if (!last_stripe) {
1868		j++;
1869		sh = list_first_entry(&sh->batch_list, struct stripe_head,
1870				      batch_list);
1871		goto again;
1872	}
1873}
1874
1875static void ops_complete_check(void *stripe_head_ref)
1876{
1877	struct stripe_head *sh = stripe_head_ref;
1878
1879	pr_debug("%s: stripe %llu\n", __func__,
1880		(unsigned long long)sh->sector);
1881
1882	sh->check_state = check_state_check_result;
1883	set_bit(STRIPE_HANDLE, &sh->state);
1884	raid5_release_stripe(sh);
1885}
1886
1887static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1888{
1889	int disks = sh->disks;
1890	int pd_idx = sh->pd_idx;
1891	int qd_idx = sh->qd_idx;
1892	struct page *xor_dest;
1893	struct page **xor_srcs = to_addr_page(percpu, 0);
1894	struct dma_async_tx_descriptor *tx;
1895	struct async_submit_ctl submit;
1896	int count;
1897	int i;
1898
1899	pr_debug("%s: stripe %llu\n", __func__,
1900		(unsigned long long)sh->sector);
1901
1902	BUG_ON(sh->batch_head);
1903	count = 0;
1904	xor_dest = sh->dev[pd_idx].page;
1905	xor_srcs[count++] = xor_dest;
1906	for (i = disks; i--; ) {
1907		if (i == pd_idx || i == qd_idx)
1908			continue;
1909		xor_srcs[count++] = sh->dev[i].page;
1910	}
1911
1912	init_async_submit(&submit, 0, NULL, NULL, NULL,
1913			  to_addr_conv(sh, percpu, 0));
1914	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1915			   &sh->ops.zero_sum_result, &submit);
1916
1917	atomic_inc(&sh->count);
1918	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1919	tx = async_trigger_callback(&submit);
1920}
1921
1922static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1923{
1924	struct page **srcs = to_addr_page(percpu, 0);
1925	struct async_submit_ctl submit;
1926	int count;
1927
1928	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1929		(unsigned long long)sh->sector, checkp);
1930
1931	BUG_ON(sh->batch_head);
1932	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
1933	if (!checkp)
1934		srcs[count] = NULL;
1935
1936	atomic_inc(&sh->count);
1937	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1938			  sh, to_addr_conv(sh, percpu, 0));
1939	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1940			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1941}
1942
1943static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1944{
1945	int overlap_clear = 0, i, disks = sh->disks;
1946	struct dma_async_tx_descriptor *tx = NULL;
1947	struct r5conf *conf = sh->raid_conf;
1948	int level = conf->level;
1949	struct raid5_percpu *percpu;
1950	unsigned long cpu;
1951
1952	cpu = get_cpu();
1953	percpu = per_cpu_ptr(conf->percpu, cpu);
1954	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1955		ops_run_biofill(sh);
1956		overlap_clear++;
1957	}
1958
1959	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1960		if (level < 6)
1961			tx = ops_run_compute5(sh, percpu);
1962		else {
1963			if (sh->ops.target2 < 0 || sh->ops.target < 0)
1964				tx = ops_run_compute6_1(sh, percpu);
1965			else
1966				tx = ops_run_compute6_2(sh, percpu);
1967		}
1968		/* terminate the chain if reconstruct is not set to be run */
1969		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1970			async_tx_ack(tx);
1971	}
1972
1973	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
1974		if (level < 6)
1975			tx = ops_run_prexor5(sh, percpu, tx);
1976		else
1977			tx = ops_run_prexor6(sh, percpu, tx);
1978	}
1979
1980	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1981		tx = ops_run_biodrain(sh, tx);
1982		overlap_clear++;
1983	}
1984
1985	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1986		if (level < 6)
1987			ops_run_reconstruct5(sh, percpu, tx);
1988		else
1989			ops_run_reconstruct6(sh, percpu, tx);
1990	}
1991
1992	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1993		if (sh->check_state == check_state_run)
1994			ops_run_check_p(sh, percpu);
1995		else if (sh->check_state == check_state_run_q)
1996			ops_run_check_pq(sh, percpu, 0);
1997		else if (sh->check_state == check_state_run_pq)
1998			ops_run_check_pq(sh, percpu, 1);
1999		else
2000			BUG();
2001	}
2002
2003	if (overlap_clear && !sh->batch_head)
2004		for (i = disks; i--; ) {
2005			struct r5dev *dev = &sh->dev[i];
2006			if (test_and_clear_bit(R5_Overlap, &dev->flags))
2007				wake_up(&sh->raid_conf->wait_for_overlap);
2008		}
2009	put_cpu();
2010}
2011
2012static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
2013	int disks)
2014{
2015	struct stripe_head *sh;
2016	int i;
2017
2018	sh = kmem_cache_zalloc(sc, gfp);
2019	if (sh) {
2020		spin_lock_init(&sh->stripe_lock);
2021		spin_lock_init(&sh->batch_lock);
2022		INIT_LIST_HEAD(&sh->batch_list);
2023		INIT_LIST_HEAD(&sh->lru);
2024		INIT_LIST_HEAD(&sh->r5c);
2025		INIT_LIST_HEAD(&sh->log_list);
2026		atomic_set(&sh->count, 1);
2027		sh->log_start = MaxSector;
2028		for (i = 0; i < disks; i++) {
2029			struct r5dev *dev = &sh->dev[i];
2030
2031			bio_init(&dev->req, &dev->vec, 1);
2032			bio_init(&dev->rreq, &dev->rvec, 1);
2033		}
2034	}
2035	return sh;
2036}
2037static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
2038{
2039	struct stripe_head *sh;
2040
2041	sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
2042	if (!sh)
2043		return 0;
2044
2045	sh->raid_conf = conf;
2046
2047	if (grow_buffers(sh, gfp)) {
2048		shrink_buffers(sh);
2049		kmem_cache_free(conf->slab_cache, sh);
2050		return 0;
2051	}
2052	sh->hash_lock_index =
2053		conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
2054	/* we just created an active stripe so... */
2055	atomic_inc(&conf->active_stripes);
2056
2057	raid5_release_stripe(sh);
2058	conf->max_nr_stripes++;
2059	return 1;
2060}
2061
2062static int grow_stripes(struct r5conf *conf, int num)
2063{
2064	struct kmem_cache *sc;
2065	int devs = max(conf->raid_disks, conf->previous_raid_disks);
2066
2067	if (conf->mddev->gendisk)
2068		sprintf(conf->cache_name[0],
2069			"raid%d-%s", conf->level, mdname(conf->mddev));
2070	else
2071		sprintf(conf->cache_name[0],
2072			"raid%d-%p", conf->level, conf->mddev);
2073	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
2074
2075	conf->active_name = 0;
2076	sc = kmem_cache_create(conf->cache_name[conf->active_name],
2077			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
2078			       0, 0, NULL);
2079	if (!sc)
2080		return 1;
2081	conf->slab_cache = sc;
2082	conf->pool_size = devs;
2083	while (num--)
2084		if (!grow_one_stripe(conf, GFP_KERNEL))
2085			return 1;
2086
2087	return 0;
2088}
2089
2090/**
2091 * scribble_alloc - allocate the scribble region
2092 * @num - total number of disks in the array
2093 *
2094 * The size must be enough to contain:
2095 * 1/ a struct page pointer for each device in the array +2
2096 * 2/ room to convert each entry in (1) to its corresponding dma
2097 *    (dma_map_page()) or page (page_address()) address.
2098 *
2099 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
2100 * calculate over all devices (not just the data blocks), using zeros in place
2101 * of the P and Q blocks.
2102 */
2103static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
2104{
2105	struct flex_array *ret;
2106	size_t len;
2107
2108	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
2109	ret = flex_array_alloc(len, cnt, flags);
2110	if (!ret)
2111		return NULL;
2112	/* always prealloc all elements, so no locking is required */
2113	if (flex_array_prealloc(ret, 0, cnt, flags)) {
2114		flex_array_free(ret);
2115		return NULL;
2116	}
2117	return ret;
2118}
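
/*
 * Illustrative worked example, not part of the driver: for an
 * 8-device array, each flex_array element holds (8 + 2) struct page
 * pointers plus (8 + 2) addr_conv_t slots.  Assuming 8-byte pointers
 * and an 8-byte addr_conv_t on a 64-bit build, that is
 * 80 + 80 = 160 bytes per element, and resize_chunks() below asks for
 * new_sectors / STRIPE_SECTORS such elements per CPU.
 */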
2119
2120static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
2121{
2122	unsigned long cpu;
2123	int err = 0;
2124
2125	/*
2126	 * Never shrink.  Also, mddev_suspend() could deadlock if this is
2127	 * called from raid5d; in that case scribble_disks and
2128	 * scribble_sectors should already equal new_disks and new_sectors.
2129	 */
2130	if (conf->scribble_disks >= new_disks &&
2131	    conf->scribble_sectors >= new_sectors)
2132		return 0;
2133	mddev_suspend(conf->mddev);
2134	get_online_cpus();
2135	for_each_present_cpu(cpu) {
2136		struct raid5_percpu *percpu;
2137		struct flex_array *scribble;
2138
2139		percpu = per_cpu_ptr(conf->percpu, cpu);
2140		scribble = scribble_alloc(new_disks,
2141					  new_sectors / STRIPE_SECTORS,
2142					  GFP_NOIO);
2143
2144		if (scribble) {
2145			flex_array_free(percpu->scribble);
2146			percpu->scribble = scribble;
2147		} else {
2148			err = -ENOMEM;
2149			break;
2150		}
2151	}
2152	put_online_cpus();
2153	mddev_resume(conf->mddev);
2154	if (!err) {
2155		conf->scribble_disks = new_disks;
2156		conf->scribble_sectors = new_sectors;
2157	}
2158	return err;
2159}
2160
2161static int resize_stripes(struct r5conf *conf, int newsize)
2162{
2163	/* Make all the stripes able to hold 'newsize' devices.
2164	 * New slots in each stripe get 'page' set to a new page.
2165	 *
2166	 * This happens in stages:
2167	 * 1/ create a new kmem_cache and allocate the required number of
2168	 *    stripe_heads.
2169	 * 2/ gather all the old stripe_heads and transfer the pages across
2170	 *    to the new stripe_heads.  This will have the side effect of
2171	 *    freezing the array as once all stripe_heads have been collected,
2172	 *    no IO will be possible.  Old stripe heads are freed once their
2173	 *    pages have been transferred over, and the old kmem_cache is
2174	 *    freed when all stripes are done.
2175	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
2176	 *    we simply return a failure status - no need to clean anything up.
2177	 * 4/ allocate new pages for the new slots in the new stripe_heads.
2178	 *    If this fails, we don't bother trying to shrink the
2179	 *    stripe_heads down again, we just leave them as they are.
2180	 *    As each stripe_head is processed the new one is released into
2181	 *    active service.
2182	 *
2183	 * Once step2 is started, we cannot afford to wait for a write,
2184	 * so we use GFP_NOIO allocations.
2185	 */
2186	struct stripe_head *osh, *nsh;
2187	LIST_HEAD(newstripes);
2188	struct disk_info *ndisks;
2189	int err;
2190	struct kmem_cache *sc;
2191	int i;
2192	int hash, cnt;
2193
2194	if (newsize <= conf->pool_size)
2195		return 0; /* never bother to shrink */
2196
2197	err = md_allow_write(conf->mddev);
2198	if (err)
2199		return err;
2200
2201	/* Step 1 */
2202	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
2203			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
2204			       0, 0, NULL);
2205	if (!sc)
2206		return -ENOMEM;
2207
2208	/* Need to ensure auto-resizing doesn't interfere */
2209	mutex_lock(&conf->cache_size_mutex);
2210
2211	for (i = conf->max_nr_stripes; i; i--) {
2212		nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
2213		if (!nsh)
2214			break;
2215
2216		nsh->raid_conf = conf;
2217		list_add(&nsh->lru, &newstripes);
2218	}
2219	if (i) {
2220		/* didn't get enough, give up */
2221		while (!list_empty(&newstripes)) {
2222			nsh = list_entry(newstripes.next, struct stripe_head, lru);
2223			list_del(&nsh->lru);
2224			kmem_cache_free(sc, nsh);
2225		}
2226		kmem_cache_destroy(sc);
2227		mutex_unlock(&conf->cache_size_mutex);
2228		return -ENOMEM;
2229	}
2230	/* Step 2 - Must use GFP_NOIO now.
2231	 * OK, we have enough stripes, start collecting inactive
2232	 * stripes and copying them over
2233	 */
2234	hash = 0;
2235	cnt = 0;
2236	list_for_each_entry(nsh, &newstripes, lru) {
2237		lock_device_hash_lock(conf, hash);
2238		wait_event_cmd(conf->wait_for_stripe,
2239				    !list_empty(conf->inactive_list + hash),
2240				    unlock_device_hash_lock(conf, hash),
2241				    lock_device_hash_lock(conf, hash));
2242		osh = get_free_stripe(conf, hash);
2243		unlock_device_hash_lock(conf, hash);
2244
2245		for(i=0; i<conf->pool_size; i++) {
2246			nsh->dev[i].page = osh->dev[i].page;
2247			nsh->dev[i].orig_page = osh->dev[i].page;
2248		}
2249		nsh->hash_lock_index = hash;
2250		kmem_cache_free(conf->slab_cache, osh);
2251		cnt++;
2252		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
2253		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
2254			hash++;
2255			cnt = 0;
2256		}
2257	}
2258	kmem_cache_destroy(conf->slab_cache);
2259
2260	/* Step 3.
2261	 * At this point, we are holding all the stripes so the array
2262	 * is completely stalled, so now is a good time to resize
2263	 * conf->disks and the scribble region
2264	 */
2265	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
2266	if (ndisks) {
2267		for (i = 0; i < conf->pool_size; i++)
2268			ndisks[i] = conf->disks[i];
2269
2270		for (i = conf->pool_size; i < newsize; i++) {
2271			ndisks[i].extra_page = alloc_page(GFP_NOIO);
2272			if (!ndisks[i].extra_page)
2273				err = -ENOMEM;
2274		}
2275
2276		if (err) {
2277			for (i = conf->pool_size; i < newsize; i++)
2278				if (ndisks[i].extra_page)
2279					put_page(ndisks[i].extra_page);
2280			kfree(ndisks);
2281		} else {
2282			kfree(conf->disks);
2283			conf->disks = ndisks;
2284		}
2285	} else
2286		err = -ENOMEM;
2287
2288	mutex_unlock(&conf->cache_size_mutex);
2289	/* Step 4, return new stripes to service */
2290	while(!list_empty(&newstripes)) {
2291		nsh = list_entry(newstripes.next, struct stripe_head, lru);
2292		list_del_init(&nsh->lru);
2293
2294		for (i=conf->raid_disks; i < newsize; i++)
2295			if (nsh->dev[i].page == NULL) {
2296				struct page *p = alloc_page(GFP_NOIO);
2297				nsh->dev[i].page = p;
2298				nsh->dev[i].orig_page = p;
2299				if (!p)
2300					err = -ENOMEM;
2301			}
2302		raid5_release_stripe(nsh);
2303	}
2304	/* critical section passed, GFP_NOIO no longer needed */
2305
2306	conf->slab_cache = sc;
2307	conf->active_name = 1-conf->active_name;
2308	if (!err)
2309		conf->pool_size = newsize;
2310	return err;
2311}
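
/*
 * Illustrative worked example, not part of the driver: step 2 of
 * resize_stripes() spreads the old stripes evenly over the hash locks.
 * With, say, max_nr_stripes == 256 and NR_STRIPE_HASH_LOCKS == 8, each
 * hash receives 256 / 8 == 32 stripes; with 260 stripes, the remainder
 * term "!!((max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)" gives the
 * first four hash values one extra stripe each (4 * 33 + 4 * 32 == 260).
 */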
2312
2313static int drop_one_stripe(struct r5conf *conf)
2314{
2315	struct stripe_head *sh;
2316	int hash = (conf->max_nr_stripes - 1) & STRIPE_HASH_LOCKS_MASK;
2317
2318	spin_lock_irq(conf->hash_locks + hash);
2319	sh = get_free_stripe(conf, hash);
2320	spin_unlock_irq(conf->hash_locks + hash);
2321	if (!sh)
2322		return 0;
2323	BUG_ON(atomic_read(&sh->count));
2324	shrink_buffers(sh);
2325	kmem_cache_free(conf->slab_cache, sh);
2326	atomic_dec(&conf->active_stripes);
2327	conf->max_nr_stripes--;
2328	return 1;
2329}
2330
2331static void shrink_stripes(struct r5conf *conf)
2332{
2333	while (conf->max_nr_stripes &&
2334	       drop_one_stripe(conf))
2335		;
2336
2337	kmem_cache_destroy(conf->slab_cache);
2338	conf->slab_cache = NULL;
2339}
2340
2341static void raid5_end_read_request(struct bio * bi)
2342{
2343	struct stripe_head *sh = bi->bi_private;
2344	struct r5conf *conf = sh->raid_conf;
2345	int disks = sh->disks, i;
2346	char b[BDEVNAME_SIZE];
2347	struct md_rdev *rdev = NULL;
2348	sector_t s;
2349
2350	for (i=0 ; i<disks; i++)
2351		if (bi == &sh->dev[i].req)
2352			break;
2353
2354	pr_debug("end_read_request %llu/%d, count: %d, error %d.\n",
2355		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2356		bi->bi_error);
2357	if (i == disks) {
2358		bio_reset(bi);
2359		BUG();
2360		return;
2361	}
2362	if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2363		/* If replacement finished while this request was outstanding,
2364		 * 'replacement' might be NULL already.
2365		 * In that case it moved down to 'rdev'.
2366		 * rdev is not removed until all requests are finished.
2367		 */
2368		rdev = conf->disks[i].replacement;
2369	if (!rdev)
2370		rdev = conf->disks[i].rdev;
2371
2372	if (use_new_offset(conf, sh))
2373		s = sh->sector + rdev->new_data_offset;
2374	else
2375		s = sh->sector + rdev->data_offset;
2376	if (!bi->bi_error) {
2377		set_bit(R5_UPTODATE, &sh->dev[i].flags);
2378		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2379			/* Note that this cannot happen on a
2380			 * replacement device.  We just fail those on
2381			 * any error
2382			 */
2383			pr_info_ratelimited(
2384				"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
2385				mdname(conf->mddev), STRIPE_SECTORS,
2386				(unsigned long long)s,
2387				bdevname(rdev->bdev, b));
2388			atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
2389			clear_bit(R5_ReadError, &sh->dev[i].flags);
2390			clear_bit(R5_ReWrite, &sh->dev[i].flags);
2391		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2392			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2393
2394		if (test_bit(R5_InJournal, &sh->dev[i].flags))
2395			/*
2396			 * End of a read for a page in the journal:
2397			 * this must be preparing for a prexor in rmw.
2398			 */
2399			set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
2400
2401		if (atomic_read(&rdev->read_errors))
2402			atomic_set(&rdev->read_errors, 0);
2403	} else {
2404		const char *bdn = bdevname(rdev->bdev, b);
2405		int retry = 0;
2406		int set_bad = 0;
2407
2408		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
2409		atomic_inc(&rdev->read_errors);
2410		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
2411			pr_warn_ratelimited(
2412				"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
2413				mdname(conf->mddev),
2414				(unsigned long long)s,
2415				bdn);
2416		else if (conf->mddev->degraded >= conf->max_degraded) {
2417			set_bad = 1;
2418			pr_warn_ratelimited(
2419				"md/raid:%s: read error not correctable (sector %llu on %s).\n",
2420				mdname(conf->mddev),
2421				(unsigned long long)s,
2422				bdn);
2423		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
2424			/* Oh, no!!! */
2425			set_bad = 1;
2426			pr_warn_ratelimited(
2427				"md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
2428				mdname(conf->mddev),
2429				(unsigned long long)s,
2430				bdn);
2431		} else if (atomic_read(&rdev->read_errors)
2432			 > conf->max_nr_stripes)
2433			pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
2434			       mdname(conf->mddev), bdn);
2435		else
2436			retry = 1;
2437		if (set_bad && test_bit(In_sync, &rdev->flags)
2438		    && !test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
2439			retry = 1;
2440		if (retry)
2441			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
2442				set_bit(R5_ReadError, &sh->dev[i].flags);
2443				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2444			} else
2445				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
2446		else {
2447			clear_bit(R5_ReadError, &sh->dev[i].flags);
2448			clear_bit(R5_ReWrite, &sh->dev[i].flags);
2449			if (!(set_bad
2450			      && test_bit(In_sync, &rdev->flags)
2451			      && rdev_set_badblocks(
2452				      rdev, sh->sector, STRIPE_SECTORS, 0)))
2453				md_error(conf->mddev, rdev);
2454		}
2455	}
2456	rdev_dec_pending(rdev, conf->mddev);
2457	bio_reset(bi);
2458	clear_bit(R5_LOCKED, &sh->dev[i].flags);
2459	set_bit(STRIPE_HANDLE, &sh->state);
2460	raid5_release_stripe(sh);
2461}
2462
2463static void raid5_end_write_request(struct bio *bi)
2464{
2465	struct stripe_head *sh = bi->bi_private;
2466	struct r5conf *conf = sh->raid_conf;
2467	int disks = sh->disks, i;
2468	struct md_rdev *uninitialized_var(rdev);
2469	sector_t first_bad;
2470	int bad_sectors;
2471	int replacement = 0;
2472
2473	for (i = 0 ; i < disks; i++) {
2474		if (bi == &sh->dev[i].req) {
2475			rdev = conf->disks[i].rdev;
2476			break;
2477		}
2478		if (bi == &sh->dev[i].rreq) {
2479			rdev = conf->disks[i].replacement;
2480			if (rdev)
2481				replacement = 1;
2482			else
2483				/* rdev was removed and 'replacement'
2484				 * replaced it.  rdev is not removed
2485				 * until all requests are finished.
2486				 */
2487				rdev = conf->disks[i].rdev;
2488			break;
2489		}
2490	}
2491	pr_debug("end_write_request %llu/%d, count %d, error: %d.\n",
2492		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
2493		bi->bi_error);
2494	if (i == disks) {
2495		bio_reset(bi);
2496		BUG();
2497		return;
2498	}
2499
2500	if (replacement) {
2501		if (bi->bi_error)
2502			md_error(conf->mddev, rdev);
2503		else if (is_badblock(rdev, sh->sector,
2504				     STRIPE_SECTORS,
2505				     &first_bad, &bad_sectors))
2506			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
2507	} else {
2508		if (bi->bi_error) {
2509			set_bit(STRIPE_DEGRADED, &sh->state);
2510			set_bit(WriteErrorSeen, &rdev->flags);
2511			set_bit(R5_WriteError, &sh->dev[i].flags);
2512			if (!test_and_set_bit(WantReplacement, &rdev->flags))
2513				set_bit(MD_RECOVERY_NEEDED,
2514					&rdev->mddev->recovery);
2515		} else if (is_badblock(rdev, sh->sector,
2516				       STRIPE_SECTORS,
2517				       &first_bad, &bad_sectors)) {
2518			set_bit(R5_MadeGood, &sh->dev[i].flags);
2519			if (test_bit(R5_ReadError, &sh->dev[i].flags))
2520				/* That was a successful write so make
2521				 * sure it looks like we already did
2522				 * a re-write.
2523				 */
2524				set_bit(R5_ReWrite, &sh->dev[i].flags);
2525		}
2526	}
2527	rdev_dec_pending(rdev, conf->mddev);
2528
2529	if (sh->batch_head && bi->bi_error && !replacement)
2530		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
2531
2532	bio_reset(bi);
2533	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
2534		clear_bit(R5_LOCKED, &sh->dev[i].flags);
2535	set_bit(STRIPE_HANDLE, &sh->state);
2536	raid5_release_stripe(sh);
2537
2538	if (sh->batch_head && sh != sh->batch_head)
2539		raid5_release_stripe(sh->batch_head);
2540}
2541
2542static void raid5_build_block(struct stripe_head *sh, int i, int previous)
2543{
2544	struct r5dev *dev = &sh->dev[i];
2545
2546	dev->flags = 0;
2547	dev->sector = raid5_compute_blocknr(sh, i, previous);
2548}
2549
2550static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
2551{
2552	char b[BDEVNAME_SIZE];
2553	struct r5conf *conf = mddev->private;
2554	unsigned long flags;
2555	pr_debug("raid456: error called\n");
2556
2557	spin_lock_irqsave(&conf->device_lock, flags);
2558	clear_bit(In_sync, &rdev->flags);
2559	mddev->degraded = raid5_calc_degraded(conf);
2560	spin_unlock_irqrestore(&conf->device_lock, flags);
2561	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2562
2563	set_bit(Blocked, &rdev->flags);
2564	set_bit(Faulty, &rdev->flags);
2565	set_mask_bits(&mddev->sb_flags, 0,
2566		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
2567	pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
2568		"md/raid:%s: Operation continuing on %d devices.\n",
2569		mdname(mddev),
2570		bdevname(rdev->bdev, b),
2571		mdname(mddev),
2572		conf->raid_disks - mddev->degraded);
2573	r5c_update_on_rdev_error(mddev);
2574}
2575
2576/*
2577 * Input: a 'big' sector number,
2578 * Output: index of the data and parity disk, and the sector # in them.
2579 */
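/*
 * Worked example (illustration with assumed parameters, not taken from
 * the original source): a 4-disk RAID5 using ALGORITHM_LEFT_ASYMMETRIC
 * with 64KiB chunks (sectors_per_chunk = 128) has data_disks = 3.
 * For r_sector = 1000: chunk_offset = 1000 % 128 = 104 and
 * chunk_number = 7; dd_idx = 7 % 3 = 1 and stripe = 2;
 * pd_idx = 3 - (2 % 4) = 1, and since dd_idx >= pd_idx the data moves
 * to disk 2.  new_sector = 2 * 128 + 104 = 360, so array sector 1000
 * lives at device sector 360 on disk 2, with that stripe's parity on
 * disk 1.
 */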
2580sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
2581			      int previous, int *dd_idx,
2582			      struct stripe_head *sh)
2583{
2584	sector_t stripe, stripe2;
2585	sector_t chunk_number;
2586	unsigned int chunk_offset;
2587	int pd_idx, qd_idx;
2588	int ddf_layout = 0;
2589	sector_t new_sector;
2590	int algorithm = previous ? conf->prev_algo
2591				 : conf->algorithm;
2592	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2593					 : conf->chunk_sectors;
2594	int raid_disks = previous ? conf->previous_raid_disks
2595				  : conf->raid_disks;
2596	int data_disks = raid_disks - conf->max_degraded;
2597
2598	/* First compute the information on this sector */
2599
2600	/*
2601	 * Compute the chunk number and the sector offset inside the chunk
2602	 */
2603	chunk_offset = sector_div(r_sector, sectors_per_chunk);
2604	chunk_number = r_sector;
2605
2606	/*
2607	 * Compute the stripe number
2608	 */
2609	stripe = chunk_number;
2610	*dd_idx = sector_div(stripe, data_disks);
2611	stripe2 = stripe;
2612	/*
2613	 * Select the parity disk based on the user selected algorithm.
2614	 */
2615	pd_idx = qd_idx = -1;
2616	switch(conf->level) {
2617	case 4:
2618		pd_idx = data_disks;
2619		break;
2620	case 5:
2621		switch (algorithm) {
2622		case ALGORITHM_LEFT_ASYMMETRIC:
2623			pd_idx = data_disks - sector_div(stripe2, raid_disks);
2624			if (*dd_idx >= pd_idx)
2625				(*dd_idx)++;
2626			break;
2627		case ALGORITHM_RIGHT_ASYMMETRIC:
2628			pd_idx = sector_div(stripe2, raid_disks);
2629			if (*dd_idx >= pd_idx)
2630				(*dd_idx)++;
2631			break;
2632		case ALGORITHM_LEFT_SYMMETRIC:
2633			pd_idx = data_disks - sector_div(stripe2, raid_disks);
2634			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2635			break;
2636		case ALGORITHM_RIGHT_SYMMETRIC:
2637			pd_idx = sector_div(stripe2, raid_disks);
2638			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2639			break;
2640		case ALGORITHM_PARITY_0:
2641			pd_idx = 0;
2642			(*dd_idx)++;
2643			break;
2644		case ALGORITHM_PARITY_N:
2645			pd_idx = data_disks;
2646			break;
2647		default:
2648			BUG();
2649		}
2650		break;
2651	case 6:
2652
2653		switch (algorithm) {
2654		case ALGORITHM_LEFT_ASYMMETRIC:
2655			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2656			qd_idx = pd_idx + 1;
2657			if (pd_idx == raid_disks-1) {
2658				(*dd_idx)++;	/* Q D D D P */
2659				qd_idx = 0;
2660			} else if (*dd_idx >= pd_idx)
2661				(*dd_idx) += 2; /* D D P Q D */
2662			break;
2663		case ALGORITHM_RIGHT_ASYMMETRIC:
2664			pd_idx = sector_div(stripe2, raid_disks);
2665			qd_idx = pd_idx + 1;
2666			if (pd_idx == raid_disks-1) {
2667				(*dd_idx)++;	/* Q D D D P */
2668				qd_idx = 0;
2669			} else if (*dd_idx >= pd_idx)
2670				(*dd_idx) += 2; /* D D P Q D */
2671			break;
2672		case ALGORITHM_LEFT_SYMMETRIC:
2673			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2674			qd_idx = (pd_idx + 1) % raid_disks;
2675			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2676			break;
2677		case ALGORITHM_RIGHT_SYMMETRIC:
2678			pd_idx = sector_div(stripe2, raid_disks);
2679			qd_idx = (pd_idx + 1) % raid_disks;
2680			*dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
2681			break;
2682
2683		case ALGORITHM_PARITY_0:
2684			pd_idx = 0;
2685			qd_idx = 1;
2686			(*dd_idx) += 2;
2687			break;
2688		case ALGORITHM_PARITY_N:
2689			pd_idx = data_disks;
2690			qd_idx = data_disks + 1;
2691			break;
2692
2693		case ALGORITHM_ROTATING_ZERO_RESTART:
2694			/* Exactly the same as RIGHT_ASYMMETRIC, but the
2695			 * order of blocks for computing Q is different.
2696			 */
2697			pd_idx = sector_div(stripe2, raid_disks);
2698			qd_idx = pd_idx + 1;
2699			if (pd_idx == raid_disks-1) {
2700				(*dd_idx)++;	/* Q D D D P */
2701				qd_idx = 0;
2702			} else if (*dd_idx >= pd_idx)
2703				(*dd_idx) += 2; /* D D P Q D */
2704			ddf_layout = 1;
2705			break;
2706
2707		case ALGORITHM_ROTATING_N_RESTART:
2708			/* Same as left_asymmetric, but the first stripe is
2709			 * D D D P Q  rather than
2710			 * Q D D D P
2711			 */
2712			stripe2 += 1;
2713			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2714			qd_idx = pd_idx + 1;
2715			if (pd_idx == raid_disks-1) {
2716				(*dd_idx)++;	/* Q D D D P */
2717				qd_idx = 0;
2718			} else if (*dd_idx >= pd_idx)
2719				(*dd_idx) += 2; /* D D P Q D */
2720			ddf_layout = 1;
2721			break;
2722
2723		case ALGORITHM_ROTATING_N_CONTINUE:
2724			/* Same as left_symmetric but Q is before P */
2725			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
2726			qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
2727			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
2728			ddf_layout = 1;
2729			break;
2730
2731		case ALGORITHM_LEFT_ASYMMETRIC_6:
2732			/* RAID5 left_asymmetric, with Q on last device */
2733			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2734			if (*dd_idx >= pd_idx)
2735				(*dd_idx)++;
2736			qd_idx = raid_disks - 1;
2737			break;
2738
2739		case ALGORITHM_RIGHT_ASYMMETRIC_6:
2740			pd_idx = sector_div(stripe2, raid_disks-1);
2741			if (*dd_idx >= pd_idx)
2742				(*dd_idx)++;
2743			qd_idx = raid_disks - 1;
2744			break;
2745
2746		case ALGORITHM_LEFT_SYMMETRIC_6:
2747			pd_idx = data_disks - sector_div(stripe2, raid_disks-1);
2748			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2749			qd_idx = raid_disks - 1;
2750			break;
2751
2752		case ALGORITHM_RIGHT_SYMMETRIC_6:
2753			pd_idx = sector_div(stripe2, raid_disks-1);
2754			*dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
2755			qd_idx = raid_disks - 1;
2756			break;
2757
2758		case ALGORITHM_PARITY_0_6:
2759			pd_idx = 0;
2760			(*dd_idx)++;
2761			qd_idx = raid_disks - 1;
2762			break;
2763
2764		default:
2765			BUG();
2766		}
2767		break;
2768	}
2769
2770	if (sh) {
2771		sh->pd_idx = pd_idx;
2772		sh->qd_idx = qd_idx;
2773		sh->ddf_layout = ddf_layout;
2774	}
2775	/*
2776	 * Finally, compute the new sector number
2777	 */
2778	new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
2779	return new_sector;
2780}
2781
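/*
 * Inverse of raid5_compute_sector(): given a stripe head and a device
 * index, recover the array ('big') sector.  Continuing the illustrative
 * example above (same assumed 4-disk LEFT_ASYMMETRIC layout):
 * sh->sector = 360, i = 2, pd_idx = 1; chunk_offset = 360 % 128 = 104
 * and stripe = 2; i > pd_idx so i becomes 1; chunk_number = 2 * 3 + 1 = 7
 * and r_sector = 7 * 128 + 104 = 1000, matching the forward mapping.
 */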
2782sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
2783{
2784	struct r5conf *conf = sh->raid_conf;
2785	int raid_disks = sh->disks;
2786	int data_disks = raid_disks - conf->max_degraded;
2787	sector_t new_sector = sh->sector, check;
2788	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
2789					 : conf->chunk_sectors;
2790	int algorithm = previous ? conf->prev_algo
2791				 : conf->algorithm;
2792	sector_t stripe;
2793	int chunk_offset;
2794	sector_t chunk_number;
2795	int dummy1, dd_idx = i;
2796	sector_t r_sector;
2797	struct stripe_head sh2;
2798
2799	chunk_offset = sector_div(new_sector, sectors_per_chunk);
2800	stripe = new_sector;
2801
2802	if (i == sh->pd_idx)
2803		return 0;
2804	switch(conf->level) {
2805	case 4: break;
2806	case 5:
2807		switch (algorithm) {
2808		case ALGORITHM_LEFT_ASYMMETRIC:
2809		case ALGORITHM_RIGHT_ASYMMETRIC:
2810			if (i > sh->pd_idx)
2811				i--;
2812			break;
2813		case ALGORITHM_LEFT_SYMMETRIC:
2814		case ALGORITHM_RIGHT_SYMMETRIC:
2815			if (i < sh->pd_idx)
2816				i += raid_disks;
2817			i -= (sh->pd_idx + 1);
2818			break;
2819		case ALGORITHM_PARITY_0:
2820			i -= 1;
2821			break;
2822		case ALGORITHM_PARITY_N:
2823			break;
2824		default:
2825			BUG();
2826		}
2827		break;
2828	case 6:
2829		if (i == sh->qd_idx)
2830			return 0; /* It is the Q disk */
2831		switch (algorithm) {
2832		case ALGORITHM_LEFT_ASYMMETRIC:
2833		case ALGORITHM_RIGHT_ASYMMETRIC:
2834		case ALGORITHM_ROTATING_ZERO_RESTART:
2835		case ALGORITHM_ROTATING_N_RESTART:
2836			if (sh->pd_idx == raid_disks-1)
2837				i--;	/* Q D D D P */
2838			else if (i > sh->pd_idx)
2839				i -= 2; /* D D P Q D */
2840			break;
2841		case ALGORITHM_LEFT_SYMMETRIC:
2842		case ALGORITHM_RIGHT_SYMMETRIC:
2843			if (sh->pd_idx == raid_disks-1)
2844				i--; /* Q D D D P */
2845			else {
2846				/* D D P Q D */
2847				if (i < sh->pd_idx)
2848					i += raid_disks;
2849				i -= (sh->pd_idx + 2);
2850			}
2851			break;
2852		case ALGORITHM_PARITY_0:
2853			i -= 2;
2854			break;
2855		case ALGORITHM_PARITY_N:
2856			break;
2857		case ALGORITHM_ROTATING_N_CONTINUE:
2858			/* Like left_symmetric, but P is before Q */
2859			if (sh->pd_idx == 0)
2860				i--;	/* P D D D Q */
2861			else {
2862				/* D D Q P D */
2863				if (i < sh->pd_idx)
2864					i += raid_disks;
2865				i -= (sh->pd_idx + 1);
2866			}
2867			break;
2868		case ALGORITHM_LEFT_ASYMMETRIC_6:
2869		case ALGORITHM_RIGHT_ASYMMETRIC_6:
2870			if (i > sh->pd_idx)
2871				i--;
2872			break;
2873		case ALGORITHM_LEFT_SYMMETRIC_6:
2874		case ALGORITHM_RIGHT_SYMMETRIC_6:
2875			if (i < sh->pd_idx)
2876				i += data_disks + 1;
2877			i -= (sh->pd_idx + 1);
2878			break;
2879		case ALGORITHM_PARITY_0_6:
2880			i -= 1;
2881			break;
2882		default:
2883			BUG();
2884		}
2885		break;
2886	}
2887
2888	chunk_number = stripe * data_disks + i;
2889	r_sector = chunk_number * sectors_per_chunk + chunk_offset;
2890
2891	check = raid5_compute_sector(conf, r_sector,
2892				     previous, &dummy1, &sh2);
2893	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
2894		|| sh2.qd_idx != sh->qd_idx) {
2895		pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
2896			mdname(conf->mddev));
2897		return 0;
2898	}
2899	return r_sector;
2900}
2901
2902/*
2903 * There are cases where we want handle_stripe_dirtying() and
2904 * schedule_reconstruction() to delay towrite to some dev of a stripe.
2905 *
2906 * This function checks whether we want to delay the towrite. Specifically,
2907 * we delay the towrite when:
2908 *
2909 *   1. degraded stripe has a non-overwrite to the missing dev, AND this
2910 *      stripe has data in journal (for other devices).
2911 *
2912 *      In this case, when reading data for the non-overwrite dev, it is
2913 *      necessary to handle complex rmw of write back cache (prexor with
2914 *      orig_page, and xor with page). To keep the read path simple, we
2915 *      would like to flush data in the journal to the RAID disks first,
2916 *      so the complex rmw is handled in the write path (handle_stripe_dirtying).
2917 *
2918 */
2919static inline bool delay_towrite(struct r5dev *dev,
2920				   struct stripe_head_state *s)
2921{
2922	return !test_bit(R5_OVERWRITE, &dev->flags) &&
2923		!test_bit(R5_Insync, &dev->flags) && s->injournal;
2924}
2925
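/*
 * Summary added for clarity: when called with rcw != 0 this schedules a
 * reconstruct-write, where new data is drained into the stripe cache and
 * the parity is recomputed from all data blocks; with rcw == 0 it
 * schedules a read-modify-write, where the old data is xor-ed out of the
 * parity (prexor), the new data drained in, and the parity xor-ed back.
 * In both cases the parity block(s) stay R5_LOCKED until the asynchronous
 * operations complete.
 */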
2926static void
2927schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
2928			 int rcw, int expand)
2929{
2930	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
2931	struct r5conf *conf = sh->raid_conf;
2932	int level = conf->level;
2933
2934	if (rcw) {
2935		/*
2936		 * In some cases, handle_stripe_dirtying initially decided to
2937		 * run rmw and allocated an extra page for prexor. However, rcw
2938		 * turned out to be cheaper later on. We need to free the extra
2939		 * page now, because we won't be able to do that in ops_complete_prexor().
2940		 */
2941		r5c_release_extra_page(sh);
2942
2943		for (i = disks; i--; ) {
2944			struct r5dev *dev = &sh->dev[i];
2945
2946			if (dev->towrite && !delay_towrite(dev, s)) {
2947				set_bit(R5_LOCKED, &dev->flags);
2948				set_bit(R5_Wantdrain, &dev->flags);
2949				if (!expand)
2950					clear_bit(R5_UPTODATE, &dev->flags);
2951				s->locked++;
2952			} else if (test_bit(R5_InJournal, &dev->flags)) {
2953				set_bit(R5_LOCKED, &dev->flags);
2954				s->locked++;
2955			}
2956		}
2957		/* if we are not expanding this is a proper write request, and
2958		 * there will be bios with new data to be drained into the
2959		 * stripe cache
2960		 */
2961		if (!expand) {
2962			if (!s->locked)
2963				/* False alarm, nothing to do */
2964				return;
2965			sh->reconstruct_state = reconstruct_state_drain_run;
2966			set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2967		} else
2968			sh->reconstruct_state = reconstruct_state_run;
2969
2970		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2971
2972		if (s->locked + conf->max_degraded == disks)
2973			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2974				atomic_inc(&conf->pending_full_writes);
2975	} else {
2976		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2977			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2978		BUG_ON(level == 6 &&
2979			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
2980			   test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
2981
2982		for (i = disks; i--; ) {
2983			struct r5dev *dev = &sh->dev[i];
2984			if (i == pd_idx || i == qd_idx)
2985				continue;
2986
2987			if (dev->towrite &&
2988			    (test_bit(R5_UPTODATE, &dev->flags) ||
2989			     test_bit(R5_Wantcompute, &dev->flags))) {
2990				set_bit(R5_Wantdrain, &dev->flags);
2991				set_bit(R5_LOCKED, &dev->flags);
2992				clear_bit(R5_UPTODATE, &dev->flags);
2993				s->locked++;
2994			} else if (test_bit(R5_InJournal, &dev->flags)) {
2995				set_bit(R5_LOCKED, &dev->flags);
2996				s->locked++;
2997			}
2998		}
2999		if (!s->locked)
3000			/* False alarm - nothing to do */
3001			return;
3002		sh->reconstruct_state = reconstruct_state_prexor_drain_run;
3003		set_bit(STRIPE_OP_PREXOR, &s->ops_request);
3004		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
3005		set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
3006	}
3007
3008	/* keep the parity disk(s) locked while asynchronous operations
3009	 * are in flight
3010	 */
3011	set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
3012	clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3013	s->locked++;
3014
3015	if (level == 6) {
3016		int qd_idx = sh->qd_idx;
3017		struct r5dev *dev = &sh->dev[qd_idx];
3018
3019		set_bit(R5_LOCKED, &dev->flags);
3020		clear_bit(R5_UPTODATE, &dev->flags);
3021		s->locked++;
3022	}
3023
3024	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
3025		__func__, (unsigned long long)sh->sector,
3026		s->locked, s->ops_request);
3027}
3028
3029/*
3030 * Each stripe/dev can have one or more bios attached.
3031 * toread/towrite point to the first in a chain.
3032 * The bi_next chain must be in order.
3033 */
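/*
 * For example (illustration with assumed sector numbers): bios covering
 * sectors 8, 0 and 4 of the same r5dev end up chained through bi_next
 * as 0 -> 4 -> 8; add_stripe_bio() walks the chain to find the sorted
 * insertion point and rejects, via the overlap path, any bio that
 * overlaps one already queued.
 */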
3034static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
3035			  int forwrite, int previous)
3036{
3037	struct bio **bip;
3038	struct r5conf *conf = sh->raid_conf;
3039	int firstwrite=0;
3040
3041	pr_debug("adding bi b#%llu to stripe s#%llu\n",
3042		(unsigned long long)bi->bi_iter.bi_sector,
3043		(unsigned long long)sh->sector);
3044
3045	/*
3046	 * If several bios share a stripe, the bio bi_phys_segments field acts
3047	 * as a reference count to avoid races. The reference count should
3048	 * already be increased before this function is called (for example, in
3049	 * raid5_make_request()), so other bios sharing this stripe will not
3050	 * free the stripe. If a stripe is owned by a single bio, the stripe
3051	 * lock will protect it.
3052	 */
3053	spin_lock_irq(&sh->stripe_lock);
3054	/* Don't allow new IO added to stripes in batch list */
3055	if (sh->batch_head)
3056		goto overlap;
3057	if (forwrite) {
3058		bip = &sh->dev[dd_idx].towrite;
3059		if (*bip == NULL)
3060			firstwrite = 1;
3061	} else
3062		bip = &sh->dev[dd_idx].toread;
3063	while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) {
3064		if (bio_end_sector(*bip) > bi->bi_iter.bi_sector)
3065			goto overlap;
3066		bip = & (*bip)->bi_next;
3067	}
3068	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
3069		goto overlap;
3070
3071	if (!forwrite || previous)
3072		clear_bit(STRIPE_BATCH_READY, &sh->state);
3073
3074	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
3075	if (*bip)
3076		bi->bi_next = *bip;
3077	*bip = bi;
3078	raid5_inc_bi_active_stripes(bi);
3079
3080	if (forwrite) {
3081		/* check if page is covered */
3082		sector_t sector = sh->dev[dd_idx].sector;
3083		for (bi=sh->dev[dd_idx].towrite;
3084		     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
3085			     bi && bi->bi_iter.bi_sector <= sector;
3086		     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
3087			if (bio_end_sector(bi) >= sector)
3088				sector = bio_end_sector(bi);
3089		}
3090		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
3091			if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
3092				sh->overwrite_disks++;
3093	}
3094
3095	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
3096		(unsigned long long)(*bip)->bi_iter.bi_sector,
3097		(unsigned long long)sh->sector, dd_idx);
3098
3099	if (conf->mddev->bitmap && firstwrite) {
3100		/* Cannot hold spinlock over bitmap_startwrite,
3101		 * but must ensure this isn't added to a batch until
3102		 * we have added to the bitmap and set bm_seq.
3103		 * So set STRIPE_BITMAP_PENDING to prevent
3104		 * batching.
3105		 * If multiple add_stripe_bio() calls race here they
3106		 * must all set STRIPE_BITMAP_PENDING.  So only the first one
3107		 * to complete "bitmap_startwrite" gets to set
3108		 * STRIPE_BIT_DELAY.  This is important as once a stripe
3109		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
3110		 * any more.
3111		 */
3112		set_bit(STRIPE_BITMAP_PENDING, &sh->state);
3113		spin_unlock_irq(&sh->stripe_lock);
3114		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
3115				  STRIPE_SECTORS, 0);
3116		spin_lock_irq(&sh->stripe_lock);
3117		clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
3118		if (!sh->batch_head) {
3119			sh->bm_seq = conf->seq_flush+1;
3120			set_bit(STRIPE_BIT_DELAY, &sh->state);
3121		}
3122	}
3123	spin_unlock_irq(&sh->stripe_lock);
3124
3125	if (stripe_can_batch(sh))
3126		stripe_add_to_batch_list(conf, sh);
3127	return 1;
3128
3129 overlap:
3130	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
3131	spin_unlock_irq(&sh->stripe_lock);
3132	return 0;
3133}
3134
3135static void end_reshape(struct r5conf *conf);
3136
3137static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
3138			    struct stripe_head *sh)
3139{
3140	int sectors_per_chunk =
3141		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
3142	int dd_idx;
3143	int chunk_offset = sector_div(stripe, sectors_per_chunk);
3144	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
3145
3146	raid5_compute_sector(conf,
3147			     stripe * (disks - conf->max_degraded)
3148			     *sectors_per_chunk + chunk_offset,
3149			     previous,
3150			     &dd_idx, sh);
3151}
3152
3153static void
3154handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
3155				struct stripe_head_state *s, int disks,
3156				struct bio_list *return_bi)
3157{
3158	int i;
3159	BUG_ON(sh->batch_head);
3160	for (i = disks; i--; ) {
3161		struct bio *bi;
3162		int bitmap_end = 0;
3163
3164		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
3165			struct md_rdev *rdev;
3166			rcu_read_lock();
3167			rdev = rcu_dereference(conf->disks[i].rdev);
3168			if (rdev && test_bit(In_sync, &rdev->flags) &&
3169			    !test_bit(Faulty, &rdev->flags))
3170				atomic_inc(&rdev->nr_pending);
3171			else
3172				rdev = NULL;
3173			rcu_read_unlock();
3174			if (rdev) {
3175				if (!rdev_set_badblocks(
3176					    rdev,
3177					    sh->sector,
3178					    STRIPE_SECTORS, 0))
3179					md_error(conf->mddev, rdev);
3180				rdev_dec_pending(rdev, conf->mddev);
3181			}
3182		}
3183		spin_lock_irq(&sh->stripe_lock);
3184		/* fail all writes first */
3185		bi = sh->dev[i].towrite;
3186		sh->dev[i].towrite = NULL;
3187		sh->overwrite_disks = 0;
3188		spin_unlock_irq(&sh->stripe_lock);
3189		if (bi)
3190			bitmap_end = 1;
3191
3192		r5l_stripe_write_finished(sh);
3193
3194		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3195			wake_up(&conf->wait_for_overlap);
3196
3197		while (bi && bi->bi_iter.bi_sector <
3198			sh->dev[i].sector + STRIPE_SECTORS) {
3199			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
3200
3201			bi->bi_error = -EIO;
3202			if (!raid5_dec_bi_active_stripes(bi)) {
3203				md_write_end(conf->mddev);
3204				bio_list_add(return_bi, bi);
3205			}
3206			bi = nextbi;
3207		}
3208		if (bitmap_end)
3209			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3210				STRIPE_SECTORS, 0, 0);
3211		bitmap_end = 0;
3212		/* and fail all 'written' */
3213		bi = sh->dev[i].written;
3214		sh->dev[i].written = NULL;
3215		if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
3216			WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
3217			sh->dev[i].page = sh->dev[i].orig_page;
3218		}
3219
3220		if (bi) bitmap_end = 1;
3221		while (bi && bi->bi_iter.bi_sector <
3222		       sh->dev[i].sector + STRIPE_SECTORS) {
3223			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
3224
3225			bi->bi_error = -EIO;
3226			if (!raid5_dec_bi_active_stripes(bi)) {
3227				md_write_end(conf->mddev);
3228				bio_list_add(return_bi, bi);
3229			}
3230			bi = bi2;
3231		}
3232
3233		/* fail any reads if this device is non-operational and
3234		 * the data has not reached the cache yet.
3235		 */
3236		if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
3237		    s->failed > conf->max_degraded &&
3238		    (!test_bit(R5_Insync, &sh->dev[i].flags) ||
3239		      test_bit(R5_ReadError, &sh->dev[i].flags))) {
3240			spin_lock_irq(&sh->stripe_lock);
3241			bi = sh->dev[i].toread;
3242			sh->dev[i].toread = NULL;
3243			spin_unlock_irq(&sh->stripe_lock);
3244			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
3245				wake_up(&conf->wait_for_overlap);
3246			if (bi)
3247				s->to_read--;
3248			while (bi && bi->bi_iter.bi_sector <
3249			       sh->dev[i].sector + STRIPE_SECTORS) {
3250				struct bio *nextbi =
3251					r5_next_bio(bi, sh->dev[i].sector);
3252
3253				bi->bi_error = -EIO;
3254				if (!raid5_dec_bi_active_stripes(bi))
3255					bio_list_add(return_bi, bi);
3256				bi = nextbi;
3257			}
3258		}
3259		if (bitmap_end)
3260			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3261					STRIPE_SECTORS, 0, 0);
3262		/* If we were in the middle of a write the parity block might
3263		 * still be locked - so just clear all R5_LOCKED flags
3264		 */
3265		clear_bit(R5_LOCKED, &sh->dev[i].flags);
3266	}
3267	s->to_write = 0;
3268	s->written = 0;
3269
3270	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3271		if (atomic_dec_and_test(&conf->pending_full_writes))
3272			md_wakeup_thread(conf->mddev->thread);
3273}
3274
3275static void
3276handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
3277		   struct stripe_head_state *s)
3278{
3279	int abort = 0;
3280	int i;
3281
3282	BUG_ON(sh->batch_head);
3283	clear_bit(STRIPE_SYNCING, &sh->state);
3284	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
3285		wake_up(&conf->wait_for_overlap);
3286	s->syncing = 0;
3287	s->replacing = 0;
3288	/* There is nothing more to do for sync/check/repair.
3289	 * Don't even need to abort as that is handled elsewhere
3290	 * if needed, and not always wanted e.g. if there is a known
3291	 * bad block here.
3292	 * For recover/replace we need to record a bad block on all
3293	 * non-sync devices, or abort the recovery
3294	 */
3295	if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) {
3296		/* During recovery devices cannot be removed, so
3297		 * locking and refcounting of rdevs is not needed
3298		 */
3299		rcu_read_lock();
3300		for (i = 0; i < conf->raid_disks; i++) {
3301			struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
3302			if (rdev
3303			    && !test_bit(Faulty, &rdev->flags)
3304			    && !test_bit(In_sync, &rdev->flags)
3305			    && !rdev_set_badblocks(rdev, sh->sector,
3306						   STRIPE_SECTORS, 0))
3307				abort = 1;
3308			rdev = rcu_dereference(conf->disks[i].replacement);
3309			if (rdev
3310			    && !test_bit(Faulty, &rdev->flags)
3311			    && !test_bit(In_sync, &rdev->flags)
3312			    && !rdev_set_badblocks(rdev, sh->sector,
3313						   STRIPE_SECTORS, 0))
3314				abort = 1;
3315		}
3316		rcu_read_unlock();
3317		if (abort)
3318			conf->recovery_disabled =
3319				conf->mddev->recovery_disabled;
3320	}
3321	md_done_sync(conf->mddev, STRIPE_SECTORS, !abort);
3322}
3323
3324static int want_replace(struct stripe_head *sh, int disk_idx)
3325{
3326	struct md_rdev *rdev;
3327	int rv = 0;
3328
3329	rcu_read_lock();
3330	rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
3331	if (rdev
3332	    && !test_bit(Faulty, &rdev->flags)
3333	    && !test_bit(In_sync, &rdev->flags)
3334	    && (rdev->recovery_offset <= sh->sector
3335		|| rdev->mddev->recovery_cp <= sh->sector))
3336		rv = 1;
3337	rcu_read_unlock();
3338	return rv;
3339}
3340
3341static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
3342			   int disk_idx, int disks)
3343{
3344	struct r5dev *dev = &sh->dev[disk_idx];
3345	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
3346				  &sh->dev[s->failed_num[1]] };
3347	int i;
3348
3349
3350	if (test_bit(R5_LOCKED, &dev->flags) ||
3351	    test_bit(R5_UPTODATE, &dev->flags))
3352		/* No point reading this as we already have it or have
3353		 * decided to get it.
3354		 */
3355		return 0;
3356
3357	if (dev->toread ||
3358	    (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)))
3359		/* We need this block to directly satisfy a request */
3360		return 1;
3361
3362	if (s->syncing || s->expanding ||
3363	    (s->replacing && want_replace(sh, disk_idx)))
3364		/* When syncing, or expanding we read everything.
3365		 * When replacing, we need the replaced block.
3366		 */
3367		return 1;
3368
3369	if ((s->failed >= 1 && fdev[0]->toread) ||
3370	    (s->failed >= 2 && fdev[1]->toread))
3371		/* If we want to read from a failed device, then
3372		 * we need to actually read every other device.
3373		 */
3374		return 1;
3375
3376	/* Sometimes neither read-modify-write nor reconstruct-write
3377	 * cycles can work.  In those cases we read every block we
3378	 * can.  Then the parity-update is certain to have enough to
3379	 * work with.
3380	 * This can only be a problem when we need to write something,
3381	 * and some device has failed.  If either of those tests
3382	 * fail we need look no further.
3383	 * fails, we need look no further.
3384	if (!s->failed || !s->to_write)
3385		return 0;
3386
3387	if (test_bit(R5_Insync, &dev->flags) &&
3388	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3389		/* Pre-reads are not permitted until after a short delay
3390		 * to gather multiple requests.  However if this
3391		 * device is not Insync, the block can only be computed
3392		 * and there is no need to delay that.
3393		 */
3394		return 0;
3395
3396	for (i = 0; i < s->failed && i < 2; i++) {
3397		if (fdev[i]->towrite &&
3398		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3399		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3400			/* If we have a partial write to a failed
3401			 * device, then we will need to reconstruct
3402			 * the content of that device, so all other
3403			 * devices must be read.
3404			 */
3405			return 1;
3406	}
3407
3408	/* If we are forced to do a reconstruct-write, either because
3409	 * the current RAID6 implementation only supports that, or
3410	 * because parity cannot be trusted and we are currently
3411	 * recovering it, there is extra need to be careful.
3412	 * If one of the devices that we would need to read, because
3413	 * it is not being overwritten (and maybe not written at all)
3414	 * is missing/faulty, then we need to read everything we can.
3415	 */
3416	if (sh->raid_conf->level != 6 &&
3417	    sh->sector < sh->raid_conf->mddev->recovery_cp)
3418		/* reconstruct-write isn't being forced */
3419		return 0;
3420	for (i = 0; i < s->failed && i < 2; i++) {
3421		if (s->failed_num[i] != sh->pd_idx &&
3422		    s->failed_num[i] != sh->qd_idx &&
3423		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
3424		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
3425			return 1;
3426	}
3427
3428	return 0;
3429}
3430
3431/* fetch_block - checks the given member device to see if its data needs
3432 * to be read or computed to satisfy a request.
3433 *
3434 * Returns 1 when no more member devices need to be checked, otherwise returns
3435 * 0 to tell the loop in handle_stripe_fill to continue
3436 */
3437static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
3438		       int disk_idx, int disks)
3439{
3440	struct r5dev *dev = &sh->dev[disk_idx];
3441
3442	/* is the data in this block needed, and can we get it? */
3443	if (need_this_block(sh, s, disk_idx, disks)) {
3444		/* we would like to get this block, possibly by computing it,
3445		 * otherwise read it if the backing disk is insync
3446		 */
3447		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
3448		BUG_ON(test_bit(R5_Wantread, &dev->flags));
3449		BUG_ON(sh->batch_head);
3450		if ((s->uptodate == disks - 1) &&
3451		    (s->failed && (disk_idx == s->failed_num[0] ||
3452				   disk_idx == s->failed_num[1]))) {
3453			/* the disk has failed, and we're requested to fetch it;
3454			 * so compute it
3455			 */
3456			pr_debug("Computing stripe %llu block %d\n",
3457			       (unsigned long long)sh->sector, disk_idx);
3458			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3459			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3460			set_bit(R5_Wantcompute, &dev->flags);
3461			sh->ops.target = disk_idx;
3462			sh->ops.target2 = -1; /* no 2nd target */
3463			s->req_compute = 1;
3464			/* Careful: from this point on 'uptodate' is in the eye
3465			 * of raid_run_ops which services 'compute' operations
3466			 * before writes. R5_Wantcompute flags a block that will
3467			 * be R5_UPTODATE by the time it is needed for a
3468			 * subsequent operation.
3469			 */
3470			s->uptodate++;
3471			return 1;
3472		} else if (s->uptodate == disks-2 && s->failed >= 2) {
3473			/* Computing 2-failure is *very* expensive; only
3474			 * do it if failed >= 2
3475			 */
3476			int other;
3477			for (other = disks; other--; ) {
3478				if (other == disk_idx)
3479					continue;
3480				if (!test_bit(R5_UPTODATE,
3481				      &sh->dev[other].flags))
3482					break;
3483			}
3484			BUG_ON(other < 0);
3485			pr_debug("Computing stripe %llu blocks %d,%d\n",
3486			       (unsigned long long)sh->sector,
3487			       disk_idx, other);
3488			set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3489			set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3490			set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
3491			set_bit(R5_Wantcompute, &sh->dev[other].flags);
3492			sh->ops.target = disk_idx;
3493			sh->ops.target2 = other;
3494			s->uptodate += 2;
3495			s->req_compute = 1;
3496			return 1;
3497		} else if (test_bit(R5_Insync, &dev->flags)) {
3498			set_bit(R5_LOCKED, &dev->flags);
3499			set_bit(R5_Wantread, &dev->flags);
3500			s->locked++;
3501			pr_debug("Reading block %d (sync=%d)\n",
3502				disk_idx, s->syncing);
3503		}
3504	}
3505
3506	return 0;
3507}
3508
3509/**
3510 * handle_stripe_fill - read or compute data to satisfy pending requests.
3511 */
3512static void handle_stripe_fill(struct stripe_head *sh,
3513			       struct stripe_head_state *s,
3514			       int disks)
3515{
3516	int i;
3517
3518	/* look for blocks to read/compute, skip this if a compute
3519	 * is already in flight, or if the stripe contents are in the
3520	 * midst of changing due to a write
3521	 */
3522	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
3523	    !sh->reconstruct_state) {
3524
3525		/*
3526		 * For a degraded stripe with data in the journal, do not handle
3527		 * read requests yet; instead, flush the stripe to the raid
3528		 * disks first.  This avoids handling complex rmw of the write
3529		 * back cache (prexor with orig_page, and then xor with
3530		 * page) in the read path.
3531		 */
3532		if (s->injournal && s->failed) {
3533			if (test_bit(STRIPE_R5C_CACHING, &sh->state))
3534				r5c_make_stripe_write_out(sh);
3535			goto out;
3536		}
3537
3538		for (i = disks; i--; )
3539			if (fetch_block(sh, s, i, disks))
3540				break;
3541	}
3542out:
3543	set_bit(STRIPE_HANDLE, &sh->state);
3544}
3545
3546static void break_stripe_batch_list(struct stripe_head *head_sh,
3547				    unsigned long handle_flags);
3548/* handle_stripe_clean_event
3549 * any written block on an uptodate or failed drive can be returned.
3550 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
3551 * never LOCKED, so we don't need to test 'failed' directly.
3552 */
3553static void handle_stripe_clean_event(struct r5conf *conf,
3554	struct stripe_head *sh, int disks, struct bio_list *return_bi)
3555{
3556	int i;
3557	struct r5dev *dev;
3558	int discard_pending = 0;
3559	struct stripe_head *head_sh = sh;
3560	bool do_endio = false;
3561
3562	for (i = disks; i--; )
3563		if (sh->dev[i].written) {
3564			dev = &sh->dev[i];
3565			if (!test_bit(R5_LOCKED, &dev->flags) &&
3566			    (test_bit(R5_UPTODATE, &dev->flags) ||
3567			     test_bit(R5_Discard, &dev->flags) ||
3568			     test_bit(R5_SkipCopy, &dev->flags))) {
3569				/* We can return any write requests */
3570				struct bio *wbi, *wbi2;
3571				pr_debug("Return write for disc %d\n", i);
3572				if (test_and_clear_bit(R5_Discard, &dev->flags))
3573					clear_bit(R5_UPTODATE, &dev->flags);
3574				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
3575					WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
3576				}
3577				do_endio = true;
3578
3579returnbi:
3580				dev->page = dev->orig_page;
3581				wbi = dev->written;
3582				dev->written = NULL;
3583				while (wbi && wbi->bi_iter.bi_sector <
3584					dev->sector + STRIPE_SECTORS) {
3585					wbi2 = r5_next_bio(wbi, dev->sector);
3586					if (!raid5_dec_bi_active_stripes(wbi)) {
3587						md_write_end(conf->mddev);
3588						bio_list_add(return_bi, wbi);
3589					}
3590					wbi = wbi2;
3591				}
3592				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
3593						STRIPE_SECTORS,
3594					 !test_bit(STRIPE_DEGRADED, &sh->state),
3595						0);
3596				if (head_sh->batch_head) {
3597					sh = list_first_entry(&sh->batch_list,
3598							      struct stripe_head,
3599							      batch_list);
3600					if (sh != head_sh) {
3601						dev = &sh->dev[i];
3602						goto returnbi;
3603					}
3604				}
3605				sh = head_sh;
3606				dev = &sh->dev[i];
3607			} else if (test_bit(R5_Discard, &dev->flags))
3608				discard_pending = 1;
3609		}
3610
3611	r5l_stripe_write_finished(sh);
3612
3613	if (!discard_pending &&
3614	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
3615		int hash;
3616		clear_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
3617		clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3618		if (sh->qd_idx >= 0) {
3619			clear_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
3620			clear_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags);
3621		}
3622		/* now that discard is done we can proceed with any sync */
3623		clear_bit(STRIPE_DISCARD, &sh->state);
3624		/*
3625		 * SCSI discard will change some bio fields and the stripe has
3626		 * no updated data, so remove it from hash list and the stripe
3627		 * will be reinitialized
3628		 */
3629unhash:
3630		hash = sh->hash_lock_index;
3631		spin_lock_irq(conf->hash_locks + hash);
3632		remove_hash(sh);
3633		spin_unlock_irq(conf->hash_locks + hash);
3634		if (head_sh->batch_head) {
3635			sh = list_first_entry(&sh->batch_list,
3636					      struct stripe_head, batch_list);
3637			if (sh != head_sh)
3638					goto unhash;
3639		}
3640		sh = head_sh;
3641
3642		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
3643			set_bit(STRIPE_HANDLE, &sh->state);
3644
3645	}
3646
3647	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
3648		if (atomic_dec_and_test(&conf->pending_full_writes))
3649			md_wakeup_thread(conf->mddev->thread);
3650
3651	if (head_sh->batch_head && do_endio)
3652		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
3653}
3654
3655/*
3656 * For RMW in write back cache, we need extra page in prexor to store the
3657 * old data. This page is stored in dev->orig_page.
3658 *
3659 * This function checks whether we have data for prexor. The exact logic
3660 * is:
3661 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
3662 */
3663static inline bool uptodate_for_rmw(struct r5dev *dev)
3664{
3665	return (test_bit(R5_UPTODATE, &dev->flags)) &&
3666		(!test_bit(R5_InJournal, &dev->flags) ||
3667		 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
3668}
3669
3670static int handle_stripe_dirtying(struct r5conf *conf,
3671				  struct stripe_head *sh,
3672				  struct stripe_head_state *s,
3673				  int disks)
3674{
3675	int rmw = 0, rcw = 0, i;
3676	sector_t recovery_cp = conf->mddev->recovery_cp;
3677
3678	/* Check whether resync is now happening or should start.
3679	 * If yes, then the array is dirty (after unclean shutdown or
3680	 * initial creation), so parity in some stripes might be inconsistent.
3681	 * In this case, we need to always do reconstruct-write, to ensure
3682	 * that in case of drive failure or read-error correction, we
3683	 * generate correct data from the parity.
3684	 */
3685	if (conf->rmw_level == PARITY_DISABLE_RMW ||
3686	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
3687	     s->failed == 0)) {
3688		/* Calculate the real rcw later - for now make it
3689		 * look like rcw is cheaper
3690		 */
3691		rcw = 1; rmw = 2;
3692		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
3693			 conf->rmw_level, (unsigned long long)recovery_cp,
3694			 (unsigned long long)sh->sector);
3695	} else for (i = disks; i--; ) {
3696		/* would I have to read this buffer for read_modify_write */
3697		struct r5dev *dev = &sh->dev[i];
3698		if (((dev->towrite && !delay_towrite(dev, s)) ||
3699		     i == sh->pd_idx || i == sh->qd_idx ||
3700		     test_bit(R5_InJournal, &dev->flags)) &&
3701		    !test_bit(R5_LOCKED, &dev->flags) &&
3702		    !(uptodate_for_rmw(dev) ||
3703		      test_bit(R5_Wantcompute, &dev->flags))) {
3704			if (test_bit(R5_Insync, &dev->flags))
3705				rmw++;
3706			else
3707				rmw += 2*disks;  /* cannot read it */
3708		}
3709		/* Would I have to read this buffer for reconstruct_write */
3710		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3711		    i != sh->pd_idx && i != sh->qd_idx &&
3712		    !test_bit(R5_LOCKED, &dev->flags) &&
3713		    !(test_bit(R5_UPTODATE, &dev->flags) ||
3714		      test_bit(R5_Wantcompute, &dev->flags))) {
3715			if (test_bit(R5_Insync, &dev->flags))
3716				rcw++;
3717			else
3718				rcw += 2*disks;
3719		}
3720	}
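	/*
	 * Illustration of the cost counts above (assumed numbers, not from
	 * the original source): on a 5-disk RAID5 with nothing cached, a
	 * write that fully overwrites one data block needs the old data plus
	 * parity for read-modify-write (rmw == 2) but the three untouched
	 * data blocks for reconstruct-write (rcw == 3), so rmw is preferred;
	 * a full-stripe overwrite leaves rcw == 0 and proceeds to
	 * reconstruct-write with no pre-reads at all.
	 */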
3721
3722	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
3723		(unsigned long long)sh->sector, rmw, rcw);
3724	set_bit(STRIPE_HANDLE, &sh->state);
3725	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
3726		/* prefer read-modify-write, but need to get some data */
3727		if (conf->mddev->queue)
3728			blk_add_trace_msg(conf->mddev->queue,
3729					  "raid5 rmw %llu %d",
3730					  (unsigned long long)sh->sector, rmw);
3731		for (i = disks; i--; ) {
3732			struct r5dev *dev = &sh->dev[i];
3733			if (test_bit(R5_InJournal, &dev->flags) &&
3734			    dev->page == dev->orig_page &&
3735			    !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
3736				/* alloc page for prexor */
3737				struct page *p = alloc_page(GFP_NOIO);
3738
3739				if (p) {
3740					dev->orig_page = p;
3741					continue;
3742				}
3743
3744				/*
3745				 * alloc_page() failed, try use
3746				 * disk_info->extra_page
3747				 */
3748				if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
3749						      &conf->cache_state)) {
3750					r5c_use_extra_page(sh);
3751					break;
3752				}
3753
3754				/* extra_page in use, add to delayed_list */
3755				set_bit(STRIPE_DELAYED, &sh->state);
3756				s->waiting_extra_page = 1;
3757				return -EAGAIN;
3758			}
3759		}
3760
3761		for (i = disks; i--; ) {
3762			struct r5dev *dev = &sh->dev[i];
3763			if (((dev->towrite && !delay_towrite(dev, s)) ||
3764			     i == sh->pd_idx || i == sh->qd_idx ||
3765			     test_bit(R5_InJournal, &dev->flags)) &&
3766			    !test_bit(R5_LOCKED, &dev->flags) &&
3767			    !(uptodate_for_rmw(dev) ||
3768			      test_bit(R5_Wantcompute, &dev->flags)) &&
3769			    test_bit(R5_Insync, &dev->flags)) {
3770				if (test_bit(STRIPE_PREREAD_ACTIVE,
3771					     &sh->state)) {
3772					pr_debug("Read_old block %d for r-m-w\n",
3773						 i);
3774					set_bit(R5_LOCKED, &dev->flags);
3775					set_bit(R5_Wantread, &dev->flags);
3776					s->locked++;
3777				} else {
3778					set_bit(STRIPE_DELAYED, &sh->state);
3779					set_bit(STRIPE_HANDLE, &sh->state);
3780				}
3781			}
3782		}
3783	}
3784	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_PREFER_RMW)) && rcw > 0) {
3785		/* want reconstruct write, but need to get some data */
3786		int qread =0;
3787		rcw = 0;
3788		for (i = disks; i--; ) {
3789			struct r5dev *dev = &sh->dev[i];
3790			if (!test_bit(R5_OVERWRITE, &dev->flags) &&
3791			    i != sh->pd_idx && i != sh->qd_idx &&
3792			    !test_bit(R5_LOCKED, &dev->flags) &&
3793			    !(test_bit(R5_UPTODATE, &dev->flags) ||
3794			      test_bit(R5_Wantcompute, &dev->flags))) {
3795				rcw++;
3796				if (test_bit(R5_Insync, &dev->flags) &&
3797				    test_bit(STRIPE_PREREAD_ACTIVE,
3798					     &sh->state)) {
3799					pr_debug("Read_old block "
3800						"%d for Reconstruct\n", i);
3801					set_bit(R5_LOCKED, &dev->flags);
3802					set_bit(R5_Wantread, &dev->flags);
3803					s->locked++;
3804					qread++;
3805				} else {
3806					set_bit(STRIPE_DELAYED, &sh->state);
3807					set_bit(STRIPE_HANDLE, &sh->state);
3808				}
3809			}
3810		}
3811		if (rcw && conf->mddev->queue)
3812			blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
3813					  (unsigned long long)sh->sector,
3814					  rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
3815	}
3816
3817	if (rcw > disks && rmw > disks &&
3818	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3819		set_bit(STRIPE_DELAYED, &sh->state);
3820
3821	/* now if nothing is locked, and if we have enough data,
3822	 * we can start a write request
3823	 */
3824	/* since handle_stripe can be called at any time we need to handle the
3825	 * case where a compute block operation has been submitted and then a
3826	 * subsequent call wants to start a write request.  raid_run_ops only
3827	 * handles the case where compute block and reconstruct are requested
3828	 * simultaneously.  If this is not the case then new writes need to be
3829	 * held off until the compute completes.
3830	 */
3831	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
3832	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
3833	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
3834		schedule_reconstruction(sh, s, rcw == 0, 0);
3835	return 0;
3836}
3837
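/*
 * Summary of the check state machine below, added for clarity:
 * check_state_idle starts a parity check (check_state_run) when there
 * are no failures; check_state_check_result either marks the stripe
 * STRIPE_INSYNC when the parity matched or bumps resync_mismatches and,
 * unless MD_RECOVERY_CHECK is set, recomputes the parity block via
 * check_state_compute_run / check_state_compute_result before writing
 * it back.
 */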
3838static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
3839				struct stripe_head_state *s, int disks)
3840{
3841	struct r5dev *dev = NULL;
3842
3843	BUG_ON(sh->batch_head);
3844	set_bit(STRIPE_HANDLE, &sh->state);
3845
3846	switch (sh->check_state) {
3847	case check_state_idle:
3848		/* start a new check operation if there are no failures */
3849		if (s->failed == 0) {
3850			BUG_ON(s->uptodate != disks);
3851			sh->check_state = check_state_run;
3852			set_bit(STRIPE_OP_CHECK, &s->ops_request);
3853			clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
3854			s->uptodate--;
3855			break;
3856		}
3857		dev = &sh->dev[s->failed_num[0]];
3858		/* fall through */
3859	case check_state_compute_result:
3860		sh->check_state = check_state_idle;
3861		if (!dev)
3862			dev = &sh->dev[sh->pd_idx];
3863
3864		/* check that a write has not made the stripe insync */
3865		if (test_bit(STRIPE_INSYNC, &sh->state))
3866			break;
3867
3868		/* either failed parity check, or recovery is happening */
3869		BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3870		BUG_ON(s->uptodate != disks);
3871
3872		set_bit(R5_LOCKED, &dev->flags);
3873		s->locked++;
3874		set_bit(R5_Wantwrite, &dev->flags);
3875
3876		clear_bit(STRIPE_DEGRADED, &sh->state);
3877		set_bit(STRIPE_INSYNC, &sh->state);
3878		break;
3879	case check_state_run:
3880		break; /* we will be called again upon completion */
3881	case check_state_check_result:
3882		sh->check_state = check_state_idle;
3883
3884		/* if a failure occurred during the check operation, leave
3885		 * STRIPE_INSYNC not set and let the stripe be handled again
3886		 */
3887		if (s->failed)
3888			break;
3889
3890		/* handle a successful check operation, if parity is correct
3891		 * we are done.  Otherwise update the mismatch count and repair
3892		 * parity if !MD_RECOVERY_CHECK
3893		 */
3894		if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
3895			/* parity is correct (on disc,
3896			 * not in buffer any more)
3897			 */
3898			set_bit(STRIPE_INSYNC, &sh->state);
3899		else {
3900			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
3901			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3902				/* don't try to repair!! */
3903				set_bit(STRIPE_INSYNC, &sh->state);
3904			else {
3905				sh->check_state = check_state_compute_run;
3906				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
3907				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
3908				set_bit(R5_Wantcompute,
3909					&sh->dev[sh->pd_idx].flags);
3910				sh->ops.target = sh->pd_idx;
3911				sh->ops.target2 = -1;
3912				s->uptodate++;
3913			}
3914		}
3915		break;
3916	case check_state_compute_run:
3917		break;
3918	default:
3919		pr_err("%s: unknown check_state: %d sector: %llu\n",
3920		       __func__, sh->check_state,
3921		       (unsigned long long) sh->sector);
3922		BUG();
3923	}
3924}
3925
3926static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3927				  struct stripe_head_state *s,
3928				  int disks)
3929{
3930	int pd_idx = sh->pd_idx;
3931	int qd_idx = sh->qd_idx;
3932	struct r5dev *dev;
3933
3934	BUG_ON(sh->batch_head);
3935	set_bit(STRIPE_HANDLE, &sh->state);
3936
3937	BUG_ON(s->failed > 2);
3938
3939	/* Want to check and possibly repair P and Q.
3940	 * However there could be one 'failed' device, in which
3941	 * case we can only check one of them, possibly using the
3942	 * other to generate missing data
3943	 */
3944
3945	switch (sh->check_state) {
3946	case check_state_idle:
3947		/* start a new check operation if there are < 2 failures */
3948		if (s->failed == s->q_failed) {
3949			/* The only possible failed device holds Q, so it
3950			 * makes sense to check P (if anything else had failed,
3951			 * we would have used P to recreate it).
3952			 */
3953			sh->check_state = check_state_run;
3954		}
3955		if (!s->q_failed && s->failed < 2) {
3956			/* Q is not failed, and we didn't use it to generate
3957			 * anything, so it makes sense to check it
3958			 */
3959			if (sh->check_state == check_state_run)
3960				sh->check_state = check_state_run_pq;
3961			else
3962				sh->check_state = check_state_run_q;
3963		}
3964
3965		/* discard potentially stale zero_sum_result */
3966		sh->ops.zero_sum_result = 0;
3967
3968		if (sh->check_state == check_state_run) {
3969			/* async_xor_zero_sum destroys the contents of P */
3970			clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
3971			s->uptodate--;
3972		}
3973		if (sh->check_state >= check_state_run &&
3974		    sh->check_state <= check_state_run_pq) {
3975			/* async_syndrome_zero_sum preserves P and Q, so
3976			 * no need to mark them !uptodate here
3977			 */
3978			set_bit(STRIPE_OP_CHECK, &s->ops_request);
3979			break;
3980		}
3981
3982		/* we have 2-disk failure */
3983		BUG_ON(s->failed != 2);
3984		/* fall through */
3985	case check_state_compute_result:
3986		sh->check_state = check_state_idle;
3987
3988		/* check that a write has not made the stripe insync */
3989		if (test_bit(STRIPE_INSYNC, &sh->state))
3990			break;
3991
3992		/* now write out any block on a failed drive,
3993		 * or P or Q if they were recomputed
3994		 */
3995		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
3996		if (s->failed == 2) {
3997			dev = &sh->dev[s->failed_num[1]];
3998			s->locked++;
3999			set_bit(R5_LOCKED, &dev->flags);
4000			set_bit(R5_Wantwrite, &dev->flags);
4001		}
4002		if (s->failed >= 1) {
4003			dev = &sh->dev[s->failed_num[0]];
4004			s->locked++;
4005			set_bit(R5_LOCKED, &dev->flags);
4006			set_bit(R5_Wantwrite, &dev->flags);
4007		}
4008		if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4009			dev = &sh->dev[pd_idx];
4010			s->locked++;
4011			set_bit(R5_LOCKED, &dev->flags);
4012			set_bit(R5_Wantwrite, &dev->flags);
4013		}
4014		if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4015			dev = &sh->dev[qd_idx];
4016			s->locked++;
4017			set_bit(R5_LOCKED, &dev->flags);
4018			set_bit(R5_Wantwrite, &dev->flags);
4019		}
4020		clear_bit(STRIPE_DEGRADED, &sh->state);
4021
4022		set_bit(STRIPE_INSYNC, &sh->state);
4023		break;
4024	case check_state_run:
4025	case check_state_run_q:
4026	case check_state_run_pq:
4027		break; /* we will be called again upon completion */
4028	case check_state_check_result:
4029		sh->check_state = check_state_idle;
4030
4031		/* handle a successful check operation, if parity is correct
4032		 * we are done.  Otherwise update the mismatch count and repair
4033		 * parity if !MD_RECOVERY_CHECK
4034		 */
4035		if (sh->ops.zero_sum_result == 0) {
4036			/* both parities are correct */
4037			if (!s->failed)
4038				set_bit(STRIPE_INSYNC, &sh->state);
4039			else {
4040				/* in contrast to the raid5 case we can validate
4041				 * parity, but still have a failure to write
4042				 * back
4043				 */
4044				sh->check_state = check_state_compute_result;
4045				/* Returning at this point means that we may go
4046				 * off and bring p and/or q uptodate again so
4047				 * we make sure to check zero_sum_result again
4048				 * to verify if p or q need writeback
4049				 */
4050			}
4051		} else {
4052			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
4053			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
4054				/* don't try to repair!! */
4055				set_bit(STRIPE_INSYNC, &sh->state);
4056			else {
4057				int *target = &sh->ops.target;
4058
4059				sh->ops.target = -1;
4060				sh->ops.target2 = -1;
4061				sh->check_state = check_state_compute_run;
4062				set_bit(STRIPE_COMPUTE_RUN, &sh->state);
4063				set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
4064				if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
4065					set_bit(R5_Wantcompute,
4066						&sh->dev[pd_idx].flags);
4067					*target = pd_idx;
4068					target = &sh->ops.target2;
4069					s->uptodate++;
4070				}
4071				if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
4072					set_bit(R5_Wantcompute,
4073						&sh->dev[qd_idx].flags);
4074					*target = qd_idx;
4075					s->uptodate++;
4076				}
4077			}
4078		}
4079		break;
4080	case check_state_compute_run:
4081		break;
4082	default:
4083		pr_warn("%s: unknown check_state: %d sector: %llu\n",
4084			__func__, sh->check_state,
4085			(unsigned long long) sh->sector);
4086		BUG();
4087	}
4088}
4089
4090static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
4091{
4092	int i;
4093
4094	/* We have read all the blocks in this stripe and now we need to
4095	 * copy some of them into a target stripe for expand.
4096	 */
4097	struct dma_async_tx_descriptor *tx = NULL;
4098	BUG_ON(sh->batch_head);
4099	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4100	for (i = 0; i < sh->disks; i++)
4101		if (i != sh->pd_idx && i != sh->qd_idx) {
4102			int dd_idx, j;
4103			struct stripe_head *sh2;
4104			struct async_submit_ctl submit;
4105
4106			sector_t bn = raid5_compute_blocknr(sh, i, 1);
4107			sector_t s = raid5_compute_sector(conf, bn, 0,
4108							  &dd_idx, NULL);
4109			sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
4110			if (sh2 == NULL)
4111				/* so far only the early blocks of this stripe
4112				 * have been requested.  When later blocks
4113				 * get requested, we will try again
4114				 */
4115				continue;
4116			if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
4117			   test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
4118				/* must have already done this block */
4119				raid5_release_stripe(sh2);
4120				continue;
4121			}
4122
4123			/* place all the copies on one channel */
4124			init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
4125			tx = async_memcpy(sh2->dev[dd_idx].page,
4126					  sh->dev[i].page, 0, 0, STRIPE_SIZE,
4127					  &submit);
4128
4129			set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
4130			set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
4131			for (j = 0; j < conf->raid_disks; j++)
4132				if (j != sh2->pd_idx &&
4133				    j != sh2->qd_idx &&
4134				    !test_bit(R5_Expanded, &sh2->dev[j].flags))
4135					break;
4136			if (j == conf->raid_disks) {
4137				set_bit(STRIPE_EXPAND_READY, &sh2->state);
4138				set_bit(STRIPE_HANDLE, &sh2->state);
4139			}
4140			raid5_release_stripe(sh2);
4141
4142		}
4143	/* done submitting copies, wait for them to complete */
4144	async_tx_quiesce(&tx);
4145}
4146
4147/*
4148 * handle_stripe - do things to a stripe.
4149 *
4150 * We lock the stripe by setting STRIPE_ACTIVE and then examine the
4151 * state of various bits to see what needs to be done.
4152 * Possible results:
4153 *    return some read requests which now have data
4154 *    return some write requests which are safely on storage
4155 *    schedule a read on some buffers
4156 *    schedule a write of some buffers
4157 *    return confirmation of parity correctness
4158 *
4159 */
4160
4161static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
4162{
4163	struct r5conf *conf = sh->raid_conf;
4164	int disks = sh->disks;
4165	struct r5dev *dev;
4166	int i;
4167	int do_recovery = 0;
4168
4169	memset(s, 0, sizeof(*s));
4170
4171	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
4172	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
4173	s->failed_num[0] = -1;
4174	s->failed_num[1] = -1;
4175	s->log_failed = r5l_log_disk_error(conf);
4176
4177	/* Now to look around and see what can be done */
4178	rcu_read_lock();
4179	for (i=disks; i--; ) {
4180		struct md_rdev *rdev;
4181		sector_t first_bad;
4182		int bad_sectors;
4183		int is_bad = 0;
4184
4185		dev = &sh->dev[i];
4186
4187		pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
4188			 i, dev->flags,
4189			 dev->toread, dev->towrite, dev->written);
4190		/* maybe we can reply to a read
4191		 *
4192		 * new wantfill requests are only permitted while
4193		 * ops_complete_biofill is guaranteed to be inactive
4194		 */
4195		if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
4196		    !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
4197			set_bit(R5_Wantfill, &dev->flags);
4198
4199		/* now count some things */
4200		if (test_bit(R5_LOCKED, &dev->flags))
4201			s->locked++;
4202		if (test_bit(R5_UPTODATE, &dev->flags))
4203			s->uptodate++;
4204		if (test_bit(R5_Wantcompute, &dev->flags)) {
4205			s->compute++;
4206			BUG_ON(s->compute > 2);
4207		}
4208
4209		if (test_bit(R5_Wantfill, &dev->flags))
4210			s->to_fill++;
4211		else if (dev->toread)
4212			s->to_read++;
4213		if (dev->towrite) {
4214			s->to_write++;
4215			if (!test_bit(R5_OVERWRITE, &dev->flags))
4216				s->non_overwrite++;
4217		}
4218		if (dev->written)
4219			s->written++;
4220		/* Prefer to use the replacement for reads, but only
4221		 * if it is recovered enough and has no bad blocks.
4222		 */
4223		rdev = rcu_dereference(conf->disks[i].replacement);
4224		if (rdev && !test_bit(Faulty, &rdev->flags) &&
4225		    rdev->recovery_offset >= sh->sector + STRIPE_SECTORS &&
4226		    !is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4227				 &first_bad, &bad_sectors))
4228			set_bit(R5_ReadRepl, &dev->flags);
4229		else {
4230			if (rdev && !test_bit(Faulty, &rdev->flags))
4231				set_bit(R5_NeedReplace, &dev->flags);
4232			else
4233				clear_bit(R5_NeedReplace, &dev->flags);
4234			rdev = rcu_dereference(conf->disks[i].rdev);
4235			clear_bit(R5_ReadRepl, &dev->flags);
4236		}
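		/* From here on a Faulty rdev is treated as if it were absent */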
4237		if (rdev && test_bit(Faulty, &rdev->flags))
4238			rdev = NULL;
4239		if (rdev) {
4240			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
4241					     &first_bad, &bad_sectors);
4242			if (s->blocked_rdev == NULL
4243			    && (test_bit(Blocked, &rdev->flags)
4244				|| is_bad < 0)) {
4245				if (is_bad < 0)
4246					set_bit(BlockedBadBlocks,
4247						&rdev->flags);
4248				s->blocked_rdev = rdev;
4249				atomic_inc(&rdev->nr_pending);
4250			}
4251		}
4252		clear_bit(R5_Insync, &dev->flags);
4253		if (!rdev)
4254			/* Not in-sync */;
4255		else if (is_bad) {
4256			/* also not in-sync */
4257			if (!test_bit(WriteErrorSeen, &rdev->flags) &&
4258			    test_bit(R5_UPTODATE, &dev->flags)) {
4259				/* treat as in-sync, but with a read error
4260				 * which we can now try to correct
4261				 */
4262				set_bit(R5_Insync, &dev->flags);
4263				set_bit(R5_ReadError, &dev->flags);
4264			}
4265		} else if (test_bit(In_sync, &rdev->flags))
4266			set_bit(R5_Insync, &dev->flags);
4267		else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
4268			/* in sync if before recovery_offset */
4269			set_bit(R5_Insync, &dev->flags);
4270		else if (test_bit(R5_UPTODATE, &dev->flags) &&
4271			 test_bit(R5_Expanded, &dev->flags))
4272			/* If we've reshaped into here, we assume it is Insync.
4273			 * We will shortly update recovery_offset to make
4274			 * it official.
4275			 */
4276			set_bit(R5_Insync, &dev->flags);
4277
4278		if (test_bit(R5_WriteError, &dev->flags)) {
4279			/* This flag does not apply to '.replacement',
4280			 * only to '.rdev', so make sure to check that. */
4281			struct md_rdev *rdev2 = rcu_dereference(
4282				conf->disks[i].rdev);
4283			if (rdev2 == rdev)
4284				clear_bit(R5_Insync, &dev->flags);
4285			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4286				s->handle_bad_blocks = 1;
4287				atomic_inc(&rdev2->nr_pending);
4288			} else
4289				clear_bit(R5_WriteError, &dev->flags);
4290		}
4291		if (test_bit(R5_MadeGood, &dev->flags)) {
4292			/* This flag does not apply to '.replacement',
4293			 * only to '.rdev', so make sure to check that. */
4294			struct md_rdev *rdev2 = rcu_dereference(
4295				conf->disks[i].rdev);
4296			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4297				s->handle_bad_blocks = 1;
4298				atomic_inc(&rdev2->nr_pending);
4299			} else
4300				clear_bit(R5_MadeGood, &dev->flags);
4301		}
4302		if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
4303			struct md_rdev *rdev2 = rcu_dereference(
4304				conf->disks[i].replacement);
4305			if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
4306				s->handle_bad_blocks = 1;
4307				atomic_inc(&rdev2->nr_pending);
4308			} else
4309				clear_bit(R5_MadeGoodRepl, &dev->flags);
4310		}
4311		if (!test_bit(R5_Insync, &dev->flags)) {
4312			/* The ReadError flag will just be confusing now */
4313			clear_bit(R5_ReadError, &dev->flags);
4314			clear_bit(R5_ReWrite, &dev->flags);
4315		}
4316		if (test_bit(R5_ReadError, &dev->flags))
4317			clear_bit(R5_Insync, &dev->flags);
4318		if (!test_bit(R5_Insync, &dev->flags)) {
4319			if (s->failed < 2)
4320				s->failed_num[s->failed] = i;
4321			s->failed++;
4322			if (rdev && !test_bit(Faulty, &rdev->flags))
4323				do_recovery = 1;
4324		}
4325
4326		if (test_bit(R5_InJournal, &dev->flags))
4327			s->injournal++;
4328		if (test_bit(R5_InJournal, &dev->flags) && dev->written)
4329			s->just_cached++;
4330	}
4331	if (test_bit(STRIPE_SYNCING, &sh->state)) {
4332		/* If there is a failed device being replaced,
4333		 *     we must be recovering.
4334		 * else if we are after recovery_cp, we must be syncing
4335		 * else if MD_RECOVERY_REQUESTED is set, we also are syncing.
4336		 * else we can only be replacing
4337		 * sync and recovery both need to read all devices, and so
4338		 * use the same flag.
4339		 */
4340		if (do_recovery ||
4341		    sh->sector >= conf->mddev->recovery_cp ||
4342		    test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery)))
4343			s->syncing = 1;
4344		else
4345			s->replacing = 1;
4346	}
4347	rcu_read_unlock();
4348}
4349
4350static int clear_batch_ready(struct stripe_head *sh)
4351{
4352	/* Return '1' if this is a member of a batch, or
4353	 * '0' if it is a lone stripe or a batch head which can now be
4354	 * handled.
4355	 */
4356	struct stripe_head *tmp;
4357	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
4358		return (sh->batch_head && sh->batch_head != sh);
4359	spin_lock(&sh->stripe_lock);
4360	if (!sh->batch_head) {
4361		spin_unlock(&sh->stripe_lock);
4362		return 0;
4363	}
4364
4365	/*
4366	 * this stripe could have been added to a batch list before we
4367	 * checked BATCH_READY, so skip it
4368	 */
4369	if (sh->batch_head != sh) {
4370		spin_unlock(&sh->stripe_lock);
4371		return 1;
4372	}
4373	spin_lock(&sh->batch_lock);
4374	list_for_each_entry(tmp, &sh->batch_list, batch_list)
4375		clear_bit(STRIPE_BATCH_READY, &tmp->state);
4376	spin_unlock(&sh->batch_lock);
4377	spin_unlock(&sh->stripe_lock);
4378
4379	/*
4380	 * BATCH_READY is cleared, no new stripes can be added.
4381	 * batch_list can be accessed without lock
4382	 */
4383	return 0;
4384}
4385
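/*
 * break_stripe_batch_list - detach every stripe from a write batch so that
 * each one can be handled (or failed) individually again.
 */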
4386static void break_stripe_batch_list(struct stripe_head *head_sh,
4387				    unsigned long handle_flags)
4388{
4389	struct stripe_head *sh, *next;
4390	int i;
4391	int do_wakeup = 0;
4392
4393	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
4394
4395		list_del_init(&sh->batch_list);
4396
4397		WARN_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
4398					  (1 << STRIPE_SYNCING) |
4399					  (1 << STRIPE_REPLACED) |
4400					  (1 << STRIPE_DELAYED) |
4401					  (1 << STRIPE_BIT_DELAY) |
4402					  (1 << STRIPE_FULL_WRITE) |
4403					  (1 << STRIPE_BIOFILL_RUN) |
4404					  (1 << STRIPE_COMPUTE_RUN)  |
4405					  (1 << STRIPE_OPS_REQ_PENDING) |
4406					  (1 << STRIPE_DISCARD) |
4407					  (1 << STRIPE_BATCH_READY) |
4408					  (1 << STRIPE_BATCH_ERR) |
4409					  (1 << STRIPE_BITMAP_PENDING)),
4410			"stripe state: %lx\n", sh->state);
4411		WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
4412					      (1 << STRIPE_REPLACED)),
4413			"head stripe state: %lx\n", head_sh->state);
4414
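		/* Clear all state bits except the expand/sync, preread-active
		 * and degraded flags, and inherit STRIPE_INSYNC from the
		 * batch head.
		 */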
4415		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
4416					    (1 << STRIPE_PREREAD_ACTIVE) |
4417					    (1 << STRIPE_DEGRADED)),
4418			      head_sh->state & (1 << STRIPE_INSYNC));
4419
4420		sh->check_state = head_sh->check_state;
4421		sh->reconstruct_state = head_sh->reconstruct_state;
4422		for (i = 0; i < sh->disks; i++) {
4423			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
4424				do_wakeup = 1;
4425			sh->dev[i].flags = head_sh->dev[i].flags &
4426				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
4427		}
4428		spin_lock_irq(&sh->stripe_lock);
4429		sh->batch_head = NULL;
4430		spin_unlock_irq(&sh->stripe_lock);
4431		if (handle_flags == 0 ||
4432		    sh->state & handle_flags)
4433			set_bit(STRIPE_HANDLE, &sh->state);
4434		raid5_release_stripe(sh);
4435	}
4436	spin_lock_irq(&head_sh->stripe_lock);
4437	head_sh->batch_head = NULL;
4438	spin_unlock_irq(&head_sh->stripe_lock);
4439	for (i = 0; i < head_sh->disks; i++)
4440		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
4441			do_wakeup = 1;
4442	if (head_sh->state & handle_flags)
4443		set_bit(STRIPE_HANDLE, &head_sh->state);
4444
4445	if (do_wakeup)
4446		wake_up(&head_sh->raid_conf->wait_for_overlap);
4447}
4448
4449static void handle_stripe(struct stripe_head *sh)
4450{
4451	struct stripe_head_state s;
4452	struct r5conf *conf = sh->raid_conf;
4453	int i;
4454	int prexor;
4455	int disks = sh->disks;
4456	struct r5dev *pdev, *qdev;
4457
4458	clear_bit(STRIPE_HANDLE, &sh->state);
4459	if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) {
4460		/* already being handled, ensure it gets handled
4461		 * again when current action finishes */
4462		set_bit(STRIPE_HANDLE, &sh->state);
4463		return;
4464	}
4465
4466	if (clear_batch_ready(sh)) {
4467		clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4468		return;
4469	}
4470
4471	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
4472		break_stripe_batch_list(sh, 0);
4473
4474	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
4475		spin_lock(&sh->stripe_lock);
4476		/* Cannot process 'sync' concurrently with 'discard' */
4477		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
4478		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
4479			set_bit(STRIPE_SYNCING, &sh->state);
4480			clear_bit(STRIPE_INSYNC, &sh->state);
4481			clear_bit(STRIPE_REPLACED, &sh->state);
4482		}
4483		spin_unlock(&sh->stripe_lock);
4484	}
4485	clear_bit(STRIPE_DELAYED, &sh->state);
4486
4487	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
4488		"pd_idx=%d, qd_idx=%d, check:%d, reconstruct:%d\n",
4489	       (unsigned long long)sh->sector, sh->state,
4490	       atomic_read(&sh->count), sh->pd_idx, sh->qd_idx,
4491	       sh->check_state, sh->reconstruct_state);
4492
4493	analyse_stripe(sh, &s);
4494
4495	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
4496		goto finish;
4497
4498	if (s.handle_bad_blocks) {
4499		set_bit(STRIPE_HANDLE, &sh->state);
4500		goto finish;
4501	}
4502
4503	if (unlikely(s.blocked_rdev)) {
4504		if (s.syncing || s.expanding || s.expanded ||
4505		    s.replacing || s.to_write || s.written) {
4506			set_bit(STRIPE_HANDLE, &sh->state);
4507			goto finish;
4508		}
4509		/* There is nothing for the blocked_rdev to block */
4510		rdev_dec_pending(s.blocked_rdev, conf->mddev);
4511		s.blocked_rdev = NULL;
4512	}
4513
4514	if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
4515		set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
4516		set_bit(STRIPE_BIOFILL_RUN, &sh->state);
4517	}
4518
4519	pr_debug("locked=%d uptodate=%d to_read=%d"
4520	       " to_write=%d failed=%d failed_num=%d,%d\n",
4521	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
4522	       s.failed_num[0], s.failed_num[1]);
4523	/* If the array has lost more than max_degraded devices then
4524	 * some requests might need to be failed.
4525	 */
4526	if (s.failed > conf->max_degraded || s.log_failed) {
4527		sh->check_state = 0;
4528		sh->reconstruct_state = 0;
4529		break_stripe_batch_list(sh, 0);
4530		if (s.to_read+s.to_write+s.written)
4531			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
4532		if (s.syncing + s.replacing)
4533			handle_failed_sync(conf, sh, &s);
4534	}
4535
4536	/* Now we check to see if any write operations have recently
4537	 * completed
4538	 */
4539	prexor = 0;
4540	if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
4541		prexor = 1;
4542	if (sh->reconstruct_state == reconstruct_state_drain_result ||
4543	    sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
4544		sh->reconstruct_state = reconstruct_state_idle;
4545
4546		/* All the 'written' buffers and the parity block are ready to
4547		 * be written back to disk
4548		 */
4549		BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
4550		       !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
4551		BUG_ON(sh->qd_idx >= 0 &&
4552		       !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
4553		       !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
4554		for (i = disks; i--; ) {
4555			struct r5dev *dev = &sh->dev[i];
4556			if (test_bit(R5_LOCKED, &dev->flags) &&
4557				(i == sh->pd_idx || i == sh->qd_idx ||
4558				 dev->written || test_bit(R5_InJournal,
4559							  &dev->flags))) {
4560				pr_debug("Writing block %d\n", i);
4561				set_bit(R5_Wantwrite, &dev->flags);
4562				if (prexor)
4563					continue;
4564				if (s.failed > 1)
4565					continue;
4566				if (!test_bit(R5_Insync, &dev->flags) ||
4567				    ((i == sh->pd_idx || i == sh->qd_idx)  &&
4568				     s.failed == 0))
4569					set_bit(STRIPE_INSYNC, &sh->state);
4570			}
4571		}
4572		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4573			s.dec_preread_active = 1;
4574	}
4575
4576	/*
4577	 * might be able to return some write requests if the parity blocks
4578	 * are safe, or on a failed drive
4579	 */
4580	pdev = &sh->dev[sh->pd_idx];
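	/* The parity (and, for RAID6, Q) block counts as failed if its device
	 * is among the failed ones; below level 6 there is no Q block, so
	 * q_failed is forced true.
	 */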
4581	s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx)
4582		|| (s.failed >= 2 && s.failed_num[1] == sh->pd_idx);
4583	qdev = &sh->dev[sh->qd_idx];
4584	s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx)
4585		|| (s.failed >= 2 && s.failed_num[1] == sh->qd_idx)
4586		|| conf->level < 6;
4587
4588	if (s.written &&
4589	    (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
4590			     && !test_bit(R5_LOCKED, &pdev->flags)
4591			     && (test_bit(R5_UPTODATE, &pdev->flags) ||
4592				 test_bit(R5_Discard, &pdev->flags))))) &&
4593	    (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
4594			     && !test_bit(R5_LOCKED, &qdev->flags)
4595			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
4596				 test_bit(R5_Discard, &qdev->flags))))))
4597		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
4598
4599	if (s.just_cached)
4600		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
4601	r5l_stripe_write_finished(sh);
4602
4603	/* Now we might consider reading some blocks, either to check/generate
4604	 * parity, or to satisfy requests
4605	 * or to load a block that is being partially written.
4606	 */
4607	if (s.to_read || s.non_overwrite
4608	    || (conf->level == 6 && s.to_write && s.failed)
4609	    || (s.syncing && (s.uptodate + s.compute < disks))
4610	    || s.replacing
4611	    || s.expanding)
4612		handle_stripe_fill(sh, &s, disks);
4613
4614	/*
4615	 * When the stripe finishes full journal write cycle (write to journal
4616	 * and raid disk), this is the clean up procedure so it is ready for
4617	 * next operation.
4618	 */
4619	r5c_finish_stripe_write_out(conf, sh, &s);
4620
4621	/*
4622	 * Now to consider new write requests, cache write back, and what else,
4623	 * if anything, should be read.  We do not handle new writes when:
4624	 * 1/ A 'write' operation (copy+xor) is already in flight.
4625	 * 2/ A 'check' operation is in flight, as it may clobber the parity
4626	 *    block.
4627	 * 3/ A r5c cache log write is in flight.
4628	 */
4629
4630	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
4631		if (!r5c_is_writeback(conf->log)) {
4632			if (s.to_write)
4633				handle_stripe_dirtying(conf, sh, &s, disks);
4634		} else { /* write back cache */
4635			int ret = 0;
4636
4637			/* First, try handle writes in caching phase */
4638			if (s.to_write)
4639				ret = r5c_try_caching_write(conf, sh, &s,
4640							    disks);
4641			/*
4642			 * If caching phase failed: ret == -EAGAIN
4643			 *    OR
4644			 * stripe under reclaim: !caching && injournal
4645			 *
4646			 * fall back to handle_stripe_dirtying()
4647			 */
4648			if (ret == -EAGAIN ||
4649			    /* stripe under reclaim: !caching && injournal */
4650			    (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
4651			     s.injournal > 0)) {
4652				ret = handle_stripe_dirtying(conf, sh, &s,
4653							     disks);
4654				if (ret == -EAGAIN)
4655					goto finish;
4656			}
4657		}
4658	}
4659
4660	/* maybe we need to check and possibly fix the parity for this stripe.
4661	 * Any reads will already have been scheduled, so we just see if enough
4662	 * data is available.  The parity check is held off while parity
4663	 * dependent operations are in flight.
4664	 */
4665	if (sh->check_state ||
4666	    (s.syncing && s.locked == 0 &&
4667	     !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4668	     !test_bit(STRIPE_INSYNC, &sh->state))) {
4669		if (conf->level == 6)
4670			handle_parity_checks6(conf, sh, &s, disks);
4671		else
4672			handle_parity_checks5(conf, sh, &s, disks);
4673	}
4674
4675	if ((s.replacing || s.syncing) && s.locked == 0
4676	    && !test_bit(STRIPE_COMPUTE_RUN, &sh->state)
4677	    && !test_bit(STRIPE_REPLACED, &sh->state)) {
4678		/* Write out to replacement devices where possible */
4679		for (i = 0; i < conf->raid_disks; i++)
4680			if (test_bit(R5_NeedReplace, &sh->dev[i].flags)) {
4681				WARN_ON(!test_bit(R5_UPTODATE, &sh->dev[i].flags));
4682				set_bit(R5_WantReplace, &sh->dev[i].flags);
4683				set_bit(R5_LOCKED, &sh->dev[i].flags);
4684				s.locked++;
4685			}
4686		if (s.replacing)
4687			set_bit(STRIPE_INSYNC, &sh->state);
4688		set_bit(STRIPE_REPLACED, &sh->state);
4689	}
4690	if ((s.syncing || s.replacing) && s.locked == 0 &&
4691	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
4692	    test_bit(STRIPE_INSYNC, &sh->state)) {
4693		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4694		clear_bit(STRIPE_SYNCING, &sh->state);
4695		if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
4696			wake_up(&conf->wait_for_overlap);
4697	}
4698
4699	/* If the failed drives are just a ReadError, then we might need
4700	 * to progress the repair/check process
4701	 */
4702	if (s.failed <= conf->max_degraded && !conf->mddev->ro)
4703		for (i = 0; i < s.failed; i++) {
4704			struct r5dev *dev = &sh->dev[s.failed_num[i]];
4705			if (test_bit(R5_ReadError, &dev->flags)
4706			    && !test_bit(R5_LOCKED, &dev->flags)
4707			    && test_bit(R5_UPTODATE, &dev->flags)
4708				) {
4709				if (!test_bit(R5_ReWrite, &dev->flags)) {
4710					set_bit(R5_Wantwrite, &dev->flags);
4711					set_bit(R5_ReWrite, &dev->flags);
4712					set_bit(R5_LOCKED, &dev->flags);
4713					s.locked++;
4714				} else {
4715					/* let's read it back */
4716					set_bit(R5_Wantread, &dev->flags);
4717					set_bit(R5_LOCKED, &dev->flags);
4718					s.locked++;
4719				}
4720			}
4721		}
4722
4723	/* Finish reconstruct operations initiated by the expansion process */
4724	if (sh->reconstruct_state == reconstruct_state_result) {
4725		struct stripe_head *sh_src
4726			= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
4727		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
4728			/* sh cannot be written until sh_src has been read.
4729			 * so arrange for sh to be delayed a little
4730			 */
4731			set_bit(STRIPE_DELAYED, &sh->state);
4732			set_bit(STRIPE_HANDLE, &sh->state);
4733			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
4734					      &sh_src->state))
4735				atomic_inc(&conf->preread_active_stripes);
4736			raid5_release_stripe(sh_src);
4737			goto finish;
4738		}
4739		if (sh_src)
4740			raid5_release_stripe(sh_src);
4741
4742		sh->reconstruct_state = reconstruct_state_idle;
4743		clear_bit(STRIPE_EXPANDING, &sh->state);
4744		for (i = conf->raid_disks; i--; ) {
4745			set_bit(R5_Wantwrite, &sh->dev[i].flags);
4746			set_bit(R5_LOCKED, &sh->dev[i].flags);
4747			s.locked++;
4748		}
4749	}
4750
4751	if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
4752	    !sh->reconstruct_state) {
4753		/* Need to write out all blocks after computing parity */
4754		sh->disks = conf->raid_disks;
4755		stripe_set_idx(sh->sector, conf, 0, sh);
4756		schedule_reconstruction(sh, &s, 1, 1);
4757	} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
4758		clear_bit(STRIPE_EXPAND_READY, &sh->state);
4759		atomic_dec(&conf->reshape_stripes);
4760		wake_up(&conf->wait_for_overlap);
4761		md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
4762	}
4763
4764	if (s.expanding && s.locked == 0 &&
4765	    !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
4766		handle_stripe_expansion(conf, sh);
4767
4768finish:
4769	/* wait for this device to become unblocked */
4770	if (unlikely(s.blocked_rdev)) {
4771		if (conf->mddev->external)
4772			md_wait_for_blocked_rdev(s.blocked_rdev,
4773						 conf->mddev);
4774		else
4775			/* Internal metadata will immediately
4776			 * be written by raid5d, so we don't
4777			 * need to wait here.
4778			 */
4779			rdev_dec_pending(s.blocked_rdev,
4780					 conf->mddev);
4781	}
4782
4783	if (s.handle_bad_blocks)
4784		for (i = disks; i--; ) {
4785			struct md_rdev *rdev;
4786			struct r5dev *dev = &sh->dev[i];
4787			if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
4788				/* We own a safe reference to the rdev */
4789				rdev = conf->disks[i].rdev;
4790				if (!rdev_set_badblocks(rdev, sh->sector,
4791							STRIPE_SECTORS, 0))
4792					md_error(conf->mddev, rdev);
4793				rdev_dec_pending(rdev, conf->mddev);
4794			}
4795			if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
4796				rdev = conf->disks[i].rdev;
4797				rdev_clear_badblocks(rdev, sh->sector,
4798						     STRIPE_SECTORS, 0);
4799				rdev_dec_pending(rdev, conf->mddev);
4800			}
4801			if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
4802				rdev = conf->disks[i].replacement;
4803				if (!rdev)
4804				/* rdev has been moved down */
4805					rdev = conf->disks[i].rdev;
4806				rdev_clear_badblocks(rdev, sh->sector,
4807						     STRIPE_SECTORS, 0);
4808				rdev_dec_pending(rdev, conf->mddev);
4809			}
4810		}
4811
4812	if (s.ops_request)
4813		raid_run_ops(sh, s.ops_request);
4814
4815	ops_run_io(sh, &s);
4816
4817	if (s.dec_preread_active) {
4818		/* We delay this until after ops_run_io so that if make_request
4819		 * is waiting on a flush, it won't continue until the writes
4820		 * have actually been submitted.
4821		 */
4822		atomic_dec(&conf->preread_active_stripes);
4823		if (atomic_read(&conf->preread_active_stripes) <
4824		    IO_THRESHOLD)
4825			md_wakeup_thread(conf->mddev->thread);
4826	}
4827
4828	if (!bio_list_empty(&s.return_bi)) {
4829		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
4830			spin_lock_irq(&conf->device_lock);
4831			bio_list_merge(&conf->return_bi, &s.return_bi);
4832			spin_unlock_irq(&conf->device_lock);
4833			md_wakeup_thread(conf->mddev->thread);
4834		} else
4835			return_io(&s.return_bi);
4836	}
4837
4838	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
4839}
4840
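/*
 * Move stripes from the delayed list onto the hold list once the number of
 * preread-active stripes has dropped below IO_THRESHOLD.
 */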
4841static void raid5_activate_delayed(struct r5conf *conf)
4842{
4843	if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
4844		while (!list_empty(&conf->delayed_list)) {
4845			struct list_head *l = conf->delayed_list.next;
4846			struct stripe_head *sh;
4847			sh = list_entry(l, struct stripe_head, lru);
4848			list_del_init(l);
4849			clear_bit(STRIPE_DELAYED, &sh->state);
4850			if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4851				atomic_inc(&conf->preread_active_stripes);
4852			list_add_tail(&sh->lru, &conf->hold_list);
4853			raid5_wakeup_stripe_thread(sh);
4854		}
4855	}
4856}
4857
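/*
 * Re-release every stripe on conf->bitmap_list (stripes that were waiting
 * for a bitmap update) so that they can be processed again.
 */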
4858static void activate_bit_delay(struct r5conf *conf,
4859	struct list_head *temp_inactive_list)
4860{
4861	/* device_lock is held */
4862	struct list_head head;
4863	list_add(&head, &conf->bitmap_list);
4864	list_del_init(&conf->bitmap_list);
4865	while (!list_empty(&head)) {
4866		struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
4867		int hash;
4868		list_del_init(&sh->lru);
4869		atomic_inc(&sh->count);
4870		hash = sh->hash_lock_index;
4871		__release_stripe(conf, sh, &temp_inactive_list[hash]);
4872	}
4873}
4874
4875static int raid5_congested(struct mddev *mddev, int bits)
4876{
4877	struct r5conf *conf = mddev->private;
4878
4879	/* No difference between reads and writes.  Just check
4880	 * how busy the stripe_cache is
4881	 */
4882
4883	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
4884		return 1;
4885
4886	/* Also checks whether there is pressure on r5cache log space */
4887	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
4888		return 1;
4889	if (conf->quiesce)
4890		return 1;
4891	if (atomic_read(&conf->empty_inactive_list_nr))
4892		return 1;
4893
4894	return 0;
4895}
4896
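/*
 * Return true if the bio lies entirely within a single chunk, using the
 * smaller of the current and previous chunk sizes while a reshape is
 * in progress.
 */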
4897static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
4898{
4899	struct r5conf *conf = mddev->private;
4900	sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
4901	unsigned int chunk_sectors;
4902	unsigned int bio_sectors = bio_sectors(bio);
4903
4904	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
4905	return  chunk_sectors >=
4906		((sector & (chunk_sectors - 1)) + bio_sectors);
4907}
4908
4909/*
4910 *  add bio to the retry LIFO (in O(1) ... we may be in interrupt context),
4911 *  to be sampled later by raid5d.
4912 */
4913static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
4914{
4915	unsigned long flags;
4916
4917	spin_lock_irqsave(&conf->device_lock, flags);
4918
4919	bi->bi_next = conf->retry_read_aligned_list;
4920	conf->retry_read_aligned_list = bi;
4921
4922	spin_unlock_irqrestore(&conf->device_lock, flags);
4923	md_wakeup_thread(conf->mddev->thread);
4924}
4925
4926static struct bio *remove_bio_from_retry(struct r5conf *conf)
4927{
4928	struct bio *bi;
4929
4930	bi = conf->retry_read_aligned;
4931	if (bi) {
4932		conf->retry_read_aligned = NULL;
4933		return bi;
4934	}
4935	bi = conf->retry_read_aligned_list;
4936	if (bi) {
4937		conf->retry_read_aligned_list = bi->bi_next;
4938		bi->bi_next = NULL;
4939		/*
4940		 * this sets the active stripe count to 1 and the processed
4941		 * stripe count to zero (upper 8 bits)
4942		 */
4943		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
4944	}
4945
4946	return bi;
4947}
4948
4949/*
4950 *  The "raid5_align_endio" should check if the read succeeded and if it
4951 *  did, call bio_endio on the original bio (having bio_put the new bio
4952 *  first).
4953 *  If the read failed, queue the original bio for a retry via add_bio_to_retry().
4954 */
4955static void raid5_align_endio(struct bio *bi)
4956{
4957	struct bio* raid_bi  = bi->bi_private;
4958	struct mddev *mddev;
4959	struct r5conf *conf;
4960	struct md_rdev *rdev;
4961	int error = bi->bi_error;
4962
4963	bio_put(bi);
4964
4965	rdev = (void*)raid_bi->bi_next;
4966	raid_bi->bi_next = NULL;
4967	mddev = rdev->mddev;
4968	conf = mddev->private;
4969
4970	rdev_dec_pending(rdev, conf->mddev);
4971
4972	if (!error) {
4973		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
4974					 raid_bi, 0);
4975		bio_endio(raid_bi);
4976		if (atomic_dec_and_test(&conf->active_aligned_reads))
4977			wake_up(&conf->wait_for_quiescent);
4978		return;
4979	}
4980
4981	pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
4982
4983	add_bio_to_retry(raid_bi, conf);
4984}
4985
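/*
 * Try to service a read that fits inside one chunk by cloning the bio and
 * submitting it directly to the underlying device, bypassing the stripe
 * cache.  Returns 1 if the clone was submitted, 0 if the caller must fall
 * back to the normal stripe-cache path.
 */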
4986static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
4987{
4988	struct r5conf *conf = mddev->private;
4989	int dd_idx;
4990	struct bio* align_bi;
4991	struct md_rdev *rdev;
4992	sector_t end_sector;
4993
4994	if (!in_chunk_boundary(mddev, raid_bio)) {
4995		pr_debug("%s: non aligned\n", __func__);
4996		return 0;
4997	}
4998	/*
4999	 * use bio_clone_mddev to make a copy of the bio
5000	 */
5001	align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev);
5002	if (!align_bi)
5003		return 0;
5004	/*
5005	 * set bi_end_io to a new function, and set bi_private to the
5006	 * original bio.
5007	 */
5008	align_bi->bi_end_io  = raid5_align_endio;
5009	align_bi->bi_private = raid_bio;
5010	/*
5011	 *	compute position
5012	 */
5013	align_bi->bi_iter.bi_sector =
5014		raid5_compute_sector(conf, raid_bio->bi_iter.bi_sector,
5015				     0, &dd_idx, NULL);
5016
5017	end_sector = bio_end_sector(align_bi);
5018	rcu_read_lock();
5019	rdev = rcu_dereference(conf->disks[dd_idx].replacement);
5020	if (!rdev || test_bit(Faulty, &rdev->flags) ||
5021	    rdev->recovery_offset < end_sector) {
5022		rdev = rcu_dereference(conf->disks[dd_idx].rdev);
5023		if (rdev &&
5024		    (test_bit(Faulty, &rdev->flags) ||
5025		    !(test_bit(In_sync, &rdev->flags) ||
5026		      rdev->recovery_offset >= end_sector)))
5027			rdev = NULL;
5028	}
5029	if (rdev) {
5030		sector_t first_bad;
5031		int bad_sectors;
5032
5033		atomic_inc(&rdev->nr_pending);
5034		rcu_read_unlock();
5035		raid_bio->bi_next = (void*)rdev;
5036		align_bi->bi_bdev =  rdev->bdev;
5037		bio_clear_flag(align_bi, BIO_SEG_VALID);
5038
5039		if (is_badblock(rdev, align_bi->bi_iter.bi_sector,
5040				bio_sectors(align_bi),
5041				&first_bad, &bad_sectors)) {
5042			bio_put(align_bi);
5043			rdev_dec_pending(rdev, mddev);
5044			return 0;
5045		}
5046
5047		/* No reshape active, so we can trust rdev->data_offset */
5048		align_bi->bi_iter.bi_sector += rdev->data_offset;
5049
5050		spin_lock_irq(&conf->device_lock);
5051		wait_event_lock_irq(conf->wait_for_quiescent,
5052				    conf->quiesce == 0,
5053				    conf->device_lock);
5054		atomic_inc(&conf->active_aligned_reads);
5055		spin_unlock_irq(&conf->device_lock);
5056
5057		if (mddev->gendisk)
5058			trace_block_bio_remap(bdev_get_queue(align_bi->bi_bdev),
5059					      align_bi, disk_devt(mddev->gendisk),
5060					      raid_bio->bi_iter.bi_sector);
5061		generic_make_request(align_bi);
5062		return 1;
5063	} else {
5064		rcu_read_unlock();
5065		bio_put(align_bi);
5066		return 0;
5067	}
5068}
5069
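/*
 * Split the bio at chunk boundaries and hand each piece to
 * raid5_read_one_chunk().  Returns the first piece that could not be
 * submitted directly (so the caller can route it through the stripe
 * cache), or NULL if everything was submitted.
 */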
5070static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
5071{
5072	struct bio *split;
5073
5074	do {
5075		sector_t sector = raid_bio->bi_iter.bi_sector;
5076		unsigned chunk_sects = mddev->chunk_sectors;
5077		unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
5078
5079		if (sectors < bio_sectors(raid_bio)) {
5080			split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
5081			bio_chain(split, raid_bio);
5082		} else
5083			split = raid_bio;
5084
5085		if (!raid5_read_one_chunk(mddev, split)) {
5086			if (split != raid_bio)
5087				generic_make_request(raid_bio);
5088			return split;
5089		}
5090	} while (split != raid_bio);
5091
5092	return NULL;
5093}
5094
5095/* __get_priority_stripe - get the next stripe to process
5096 *
5097 * Full stripe writes are allowed to pass preread active stripes up until
5098 * the bypass_threshold is exceeded.  In general the bypass_count
5099 * increments when the handle_list is handled before the hold_list; however, it
5100 * will not be incremented when STRIPE_IO_STARTED is sampled as set, signifying a
5101 * stripe with in-flight i/o.  The bypass_count will be reset when the
5102 * head of the hold_list has changed, i.e. the head was promoted to the
5103 * handle_list.
5104 */
5105static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
5106{
5107	struct stripe_head *sh = NULL, *tmp;
5108	struct list_head *handle_list = NULL;
5109	struct r5worker_group *wg = NULL;
5110
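	/* Pick the handle_list to pull from: the shared list if there are no
	 * worker groups, the requested group's list, or else the first group
	 * that has work queued.
	 */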
5111	if (conf->worker_cnt_per_group == 0) {
5112		handle_list = &conf->handle_list;
5113	} else if (group != ANY_GROUP) {
5114		handle_list = &conf->worker_groups[group].handle_list;
5115		wg = &conf->worker_groups[group];
5116	} else {
5117		int i;
5118		for (i = 0; i < conf->group_cnt; i++) {
5119			handle_list = &conf->worker_groups[i].handle_list;
5120			wg = &conf->worker_groups[i];
5121			if (!list_empty(handle_list))
5122				break;
5123		}
5124	}
5125
5126	pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
5127		  __func__,
5128		  list_empty(handle_list) ? "empty" : "busy",
5129		  list_empty(&conf->hold_list) ? "empty" : "busy",
5130		  atomic_read(&conf->pending_full_writes), conf->bypass_count);
5131
5132	if (!list_empty(handle_list)) {
5133		sh = list_entry(handle_list->next, typeof(*sh), lru);
5134
5135		if (list_empty(&conf->hold_list))
5136			conf->bypass_count = 0;
5137		else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
5138			if (conf->hold_list.next == conf->last_hold)
5139				conf->bypass_count++;
5140			else {
5141				conf->last_hold = conf->hold_list.next;
5142				conf->bypass_count -= conf->bypass_threshold;
5143				if (conf->bypass_count < 0)
5144					conf->bypass_count = 0;
5145			}
5146		}
5147	} else if (!list_empty(&conf->hold_list) &&
5148		   ((conf->bypass_threshold &&
5149		     conf->bypass_count > conf->bypass_threshold) ||
5150		    atomic_read(&conf->pending_full_writes) == 0)) {
5151
5152		list_for_each_entry(tmp, &conf->hold_list,  lru) {
5153			if (conf->worker_cnt_per_group == 0 ||
5154			    group == ANY_GROUP ||
5155			    !cpu_online(tmp->cpu) ||
5156			    cpu_to_group(tmp->cpu) == group) {
5157				sh = tmp;
5158				break;
5159			}
5160		}
5161
5162		if (sh) {
5163			conf->bypass_count -= conf->bypass_threshold;
5164			if (conf->bypass_count < 0)
5165				conf->bypass_count = 0;
5166		}
5167		wg = NULL;
5168	}
5169
5170	if (!sh)
5171		return NULL;
5172
5173	if (wg) {
5174		wg->stripes_cnt--;
5175		sh->group = NULL;
5176	}
5177	list_del_init(&sh->lru);
5178	BUG_ON(atomic_inc_return(&sh->count) != 1);
5179	return sh;
5180}
5181
5182struct raid5_plug_cb {
5183	struct blk_plug_cb	cb;
5184	struct list_head	list;
5185	struct list_head	temp_inactive_list[NR_STRIPE_HASH_LOCKS];
5186};
5187
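/*
 * Block-plug callback: release every stripe that was queued on this plug's
 * list now that the plug is being flushed.
 */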
5188static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
5189{
5190	struct raid5_plug_cb *cb = container_of(
5191		blk_cb, struct raid5_plug_cb, cb);
5192	struct stripe_head *sh;
5193	struct mddev *mddev = cb->cb.data;
5194	struct r5conf *conf = mddev->private;
5195	int cnt = 0;
5196	int hash;
5197
5198	if (cb->list.next && !list_empty(&cb->list)) {
5199		spin_lock_irq(&conf->device_lock);
5200		while (!list_empty(&cb->list)) {
5201			sh = list_first_entry(&cb->list, struct stripe_head, lru);
5202			list_del_init(&sh->lru);
5203			/*
5204			 * avoid a race where release_stripe_plug() sees
5205			 * STRIPE_ON_UNPLUG_LIST clear but the stripe
5206			 * is still in our list
5207			 */
5208			smp_mb__before_atomic();
5209			clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state);
5210			/*
5211			 * STRIPE_ON_RELEASE_LIST could be set here. In that
5212			 * case, the count is always > 1 here
5213			 */
5214			hash = sh->hash_lock_index;
5215			__release_stripe(conf, sh, &cb->temp_inactive_list[hash]);
5216			cnt++;
5217		}
5218		spin_unlock_irq(&conf->device_lock);
5219	}
5220	release_inactive_stripe_list(conf, cb->temp_inactive_list,
5221				     NR_STRIPE_HASH_LOCKS);
5222	if (mddev->queue)
5223		trace_block_unplug(mddev->queue, cnt, !from_schedule);
5224	kfree(cb);
5225}
5226
5227static void release_stripe_plug(struct mddev *mddev,
5228				struct stripe_head *sh)
5229{
5230	struct blk_plug_cb *blk_cb = blk_check_plugged(
5231		raid5_unplug, mddev,
5232		sizeof(struct raid5_plug_cb));
5233	struct raid5_plug_cb *cb;
5234
5235	if (!blk_cb) {
5236		raid5_release_stripe(sh);
5237		return;
5238	}
5239
5240	cb = container_of(blk_cb, struct raid5_plug_cb, cb);
5241
5242	if (cb->list.next == NULL) {
5243		int i;
5244		INIT_LIST_HEAD(&cb->list);
5245		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5246			INIT_LIST_HEAD(cb->temp_inactive_list + i);
5247	}
5248
5249	if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
5250		list_add_tail(&sh->lru, &cb->list);
5251	else
5252		raid5_release_stripe(sh);
5253}
5254
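/*
 * Handle a DISCARD bio: round the range to whole stripes and mark every
 * data block of each stripe as overwritten, so the stripe can be handled
 * as a full-stripe write.
 */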
5255static void make_discard_request(struct mddev *mddev, struct bio *bi)
5256{
5257	struct r5conf *conf = mddev->private;
5258	sector_t logical_sector, last_sector;
5259	struct stripe_head *sh;
5260	int remaining;
5261	int stripe_sectors;
5262
5263	if (mddev->reshape_position != MaxSector)
5264		/* Skip discard while reshape is happening */
5265		return;
5266
5267	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5268	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
5269
5270	bi->bi_next = NULL;
5271	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
5272
5273	stripe_sectors = conf->chunk_sectors *
5274		(conf->raid_disks - conf->max_degraded);
5275	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
5276					       stripe_sectors);
5277	sector_div(last_sector, stripe_sectors);
5278
5279	logical_sector *= conf->chunk_sectors;
5280	last_sector *= conf->chunk_sectors;
5281
5282	for (; logical_sector < last_sector;
5283	     logical_sector += STRIPE_SECTORS) {
5284		DEFINE_WAIT(w);
5285		int d;
5286	again:
5287		sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
5288		prepare_to_wait(&conf->wait_for_overlap, &w,
5289				TASK_UNINTERRUPTIBLE);
5290		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5291		if (test_bit(STRIPE_SYNCING, &sh->state)) {
5292			raid5_release_stripe(sh);
5293			schedule();
5294			goto again;
5295		}
5296		clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
5297		spin_lock_irq(&sh->stripe_lock);
5298		for (d = 0; d < conf->raid_disks; d++) {
5299			if (d == sh->pd_idx || d == sh->qd_idx)
5300				continue;
5301			if (sh->dev[d].towrite || sh->dev[d].toread) {
5302				set_bit(R5_Overlap, &sh->dev[d].flags);
5303				spin_unlock_irq(&sh->stripe_lock);
5304				raid5_release_stripe(sh);
5305				schedule();
5306				goto again;
5307			}
5308		}
5309		set_bit(STRIPE_DISCARD, &sh->state);
5310		finish_wait(&conf->wait_for_overlap, &w);
5311		sh->overwrite_disks = 0;
5312		for (d = 0; d < conf->raid_disks; d++) {
5313			if (d == sh->pd_idx || d == sh->qd_idx)
5314				continue;
5315			sh->dev[d].towrite = bi;
5316			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
5317			raid5_inc_bi_active_stripes(bi);
5318			sh->overwrite_disks++;
5319		}
5320		spin_unlock_irq(&sh->stripe_lock);
5321		if (conf->mddev->bitmap) {
5322			for (d = 0;
5323			     d < conf->raid_disks - conf->max_degraded;
5324			     d++)
5325				bitmap_startwrite(mddev->bitmap,
5326						  sh->sector,
5327						  STRIPE_SECTORS,
5328						  0);
5329			sh->bm_seq = conf->seq_flush + 1;
5330			set_bit(STRIPE_BIT_DELAY, &sh->state);
5331		}
5332
5333		set_bit(STRIPE_HANDLE, &sh->state);
5334		clear_bit(STRIPE_DELAYED, &sh->state);
5335		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5336			atomic_inc(&conf->preread_active_stripes);
5337		release_stripe_plug(mddev, sh);
5338	}
5339
5340	remaining = raid5_dec_bi_active_stripes(bi);
5341	if (remaining == 0) {
5342		md_write_end(mddev);
5343		bio_endio(bi);
5344	}
5345}
5346
5347static void raid5_make_request(struct mddev *mddev, struct bio * bi)
5348{
5349	struct r5conf *conf = mddev->private;
5350	int dd_idx;
5351	sector_t new_sector;
5352	sector_t logical_sector, last_sector;
5353	struct stripe_head *sh;
5354	const int rw = bio_data_dir(bi);
5355	int remaining;
5356	DEFINE_WAIT(w);
5357	bool do_prepare;
5358	bool do_flush = false;
5359
5360	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
5361		int ret = r5l_handle_flush_request(conf->log, bi);
5362
5363		if (ret == 0)
5364			return;
5365		if (ret == -ENODEV) {
5366			md_flush_request(mddev, bi);
5367			return;
5368		}
5369		/* ret == -EAGAIN, fallback */
5370		/*
5371		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
5372		 * we need to flush the journal device
5373		 */
5374		do_flush = bi->bi_opf & REQ_PREFLUSH;
5375	}
5376
5377	md_write_start(mddev, bi);
5378
5379	/*
5380	 * If array is degraded, better not do chunk aligned read because
5381	 * later we might have to read it again in order to reconstruct
5382	 * data on failed drives.
5383	 */
5384	if (rw == READ && mddev->degraded == 0 &&
5385	    !r5c_is_writeback(conf->log) &&
5386	    mddev->reshape_position == MaxSector) {
5387		bi = chunk_aligned_read(mddev, bi);
5388		if (!bi)
5389			return;
5390	}
5391
5392	if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) {
5393		make_discard_request(mddev, bi);
5394		return;
5395	}
5396
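	/* Walk the request one STRIPE_SECTORS-sized piece at a time, attaching
	 * each piece to the stripe_head that covers it.
	 */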
5397	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
5398	last_sector = bio_end_sector(bi);
5399	bi->bi_next = NULL;
5400	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
5401
5402	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
5403	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
5404		int previous;
5405		int seq;
5406
5407		do_prepare = false;
5408	retry:
5409		seq = read_seqcount_begin(&conf->gen_lock);
5410		previous = 0;
5411		if (do_prepare)
5412			prepare_to_wait(&conf->wait_for_overlap, &w,
5413				TASK_UNINTERRUPTIBLE);
5414		if (unlikely(conf->reshape_progress != MaxSector)) {
5415			/* spinlock is needed as reshape_progress may be
5416			 * 64bit on a 32bit platform, and so it might be
5417			 * possible to see a half-updated value
5418			 * Of course reshape_progress could change after
5419			 * the lock is dropped, so once we get a reference
5420			 * to the stripe that we think it is, we will have
5421			 * to check again.
5422			 */
5423			spin_lock_irq(&conf->device_lock);
5424			if (mddev->reshape_backwards
5425			    ? logical_sector < conf->reshape_progress
5426			    : logical_sector >= conf->reshape_progress) {
5427				previous = 1;
5428			} else {
5429				if (mddev->reshape_backwards
5430				    ? logical_sector < conf->reshape_safe
5431				    : logical_sector >= conf->reshape_safe) {
5432					spin_unlock_irq(&conf->device_lock);
5433					schedule();
5434					do_prepare = true;
5435					goto retry;
5436				}
5437			}
5438			spin_unlock_irq(&conf->device_lock);
5439		}
5440
5441		new_sector = raid5_compute_sector(conf, logical_sector,
5442						  previous,
5443						  &dd_idx, NULL);
5444		pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n",
5445			(unsigned long long)new_sector,
5446			(unsigned long long)logical_sector);
5447
5448		sh = raid5_get_active_stripe(conf, new_sector, previous,
5449				       (bi->bi_opf & REQ_RAHEAD), 0);
5450		if (sh) {
5451			if (unlikely(previous)) {
5452				/* expansion might have moved on while waiting for a
5453				 * stripe, so we must do the range check again.
5454				 * Expansion could still move past after this
5455				 * test, but as we are holding a reference to
5456				 * 'sh', we know that if that happens,
5457				 *  STRIPE_EXPANDING will get set and the expansion
5458				 * won't proceed until we finish with the stripe.
5459				 */
5460				int must_retry = 0;
5461				spin_lock_irq(&conf->device_lock);
5462				if (mddev->reshape_backwards
5463				    ? logical_sector >= conf->reshape_progress
5464				    : logical_sector < conf->reshape_progress)
5465					/* mismatch, need to try again */
5466					must_retry = 1;
5467				spin_unlock_irq(&conf->device_lock);
5468				if (must_retry) {
5469					raid5_release_stripe(sh);
5470					schedule();
5471					do_prepare = true;
5472					goto retry;
5473				}
5474			}
5475			if (read_seqcount_retry(&conf->gen_lock, seq)) {
5476				/* Might have got the wrong stripe_head
5477				 * by accident
5478				 */
5479				raid5_release_stripe(sh);
5480				goto retry;
5481			}
5482
5483			if (rw == WRITE &&
5484			    logical_sector >= mddev->suspend_lo &&
5485			    logical_sector < mddev->suspend_hi) {
5486				raid5_release_stripe(sh);
5487				/* As the suspend_* range is controlled by
5488				 * userspace, we want an interruptible
5489				 * wait.
5490				 */
5491				flush_signals(current);
5492				prepare_to_wait(&conf->wait_for_overlap,
5493						&w, TASK_INTERRUPTIBLE);
5494				if (logical_sector >= mddev->suspend_lo &&
5495				    logical_sector < mddev->suspend_hi) {
5496					schedule();
5497					do_prepare = true;
5498				}
5499				goto retry;
5500			}
5501
5502			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
5503			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
5504				/* Stripe is busy expanding or
5505				 * add failed due to overlap.  Flush everything
5506				 * and wait a while
5507				 */
5508				md_wakeup_thread(mddev->thread);
5509				raid5_release_stripe(sh);
5510				schedule();
5511				do_prepare = true;
5512				goto retry;
5513			}
5514			if (do_flush) {
5515				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
5516				/* we only need flush for one stripe */
5517				do_flush = false;
5518			}
5519
5520			set_bit(STRIPE_HANDLE, &sh->state);
5521			clear_bit(STRIPE_DELAYED, &sh->state);
5522			if ((!sh->batch_head || sh == sh->batch_head) &&
5523			    (bi->bi_opf & REQ_SYNC) &&
5524			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
5525				atomic_inc(&conf->preread_active_stripes);
5526			release_stripe_plug(mddev, sh);
5527		} else {
5528			/* cannot get stripe for read-ahead, just give-up */
5529			bi->bi_error = -EIO;
5530			break;
5531		}
5532	}
5533	finish_wait(&conf->wait_for_overlap, &w);
5534
5535	remaining = raid5_dec_bi_active_stripes(bi);
5536	if (remaining == 0) {
5537
5538		if (rw == WRITE)
5539			md_write_end(mddev);
5540
5541		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
5542					 bi, 0);
5543		bio_endio(bi);
5544	}
5545}
5546
5547static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
5548
5549static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
5550{
5551	/* reshaping is quite different from recovery/resync, so it is
5552	 * handled quite separately ... here.
5553	 *
5554	 * On each call to sync_request, we gather one chunk worth of
5555	 * destination stripes and flag them as expanding.
5556	 * Then we find all the source stripes and request reads.
5557	 * As the reads complete, handle_stripe will copy the data
5558	 * into the destination stripe and release that stripe.
5559	 */
5560	struct r5conf *conf = mddev->private;
5561	struct stripe_head *sh;
5562	sector_t first_sector, last_sector;
5563	int raid_disks = conf->previous_raid_disks;
5564	int data_disks = raid_disks - conf->max_degraded;
5565	int new_data_disks = conf->raid_disks - conf->max_degraded;
5566	int i;
5567	int dd_idx;
5568	sector_t writepos, readpos, safepos;
5569	sector_t stripe_addr;
5570	int reshape_sectors;
5571	struct list_head stripes;
5572	sector_t retn;
5573
5574	if (sector_nr == 0) {
5575		/* If restarting in the middle, skip the initial sectors */
5576		if (mddev->reshape_backwards &&
5577		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
5578			sector_nr = raid5_size(mddev, 0, 0)
5579				- conf->reshape_progress;
5580		} else if (mddev->reshape_backwards &&
5581			   conf->reshape_progress == MaxSector) {
5582			/* shouldn't happen, but just in case, finish up. */
5583			sector_nr = MaxSector;
5584		} else if (!mddev->reshape_backwards &&
5585			   conf->reshape_progress > 0)
5586			sector_nr = conf->reshape_progress;
5587		sector_div(sector_nr, new_data_disks);
5588		if (sector_nr) {
5589			mddev->curr_resync_completed = sector_nr;
5590			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5591			*skipped = 1;
5592			retn = sector_nr;
5593			goto finish;
5594		}
5595	}
5596
5597	/* We need to process a full chunk at a time.
5598	 * If old and new chunk sizes differ, we need to process the
5599	 * largest of these
5600	 */
5601
5602	reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
5603
5604	/* We update the metadata at least every 10 seconds, or when
5605	 * the data about to be copied would over-write the source of
5606	 * the data at the front of the range.  i.e. when the point one new_stripe
5607	 * along from reshape_progress maps, under the new layout, to after
5608	 * where reshape_safe maps under the old layout.
5609	 */
5610	writepos = conf->reshape_progress;
5611	sector_div(writepos, new_data_disks);
5612	readpos = conf->reshape_progress;
5613	sector_div(readpos, data_disks);
5614	safepos = conf->reshape_safe;
5615	sector_div(safepos, data_disks);
5616	if (mddev->reshape_backwards) {
5617		BUG_ON(writepos < reshape_sectors);
5618		writepos -= reshape_sectors;
5619		readpos += reshape_sectors;
5620		safepos += reshape_sectors;
5621	} else {
5622		writepos += reshape_sectors;
5623		/* readpos and safepos are worst-case calculations.
5624		 * A negative number is overly pessimistic, and causes
5625		 * obvious problems for unsigned storage.  So clip to 0.
5626		 */
5627		readpos -= min_t(sector_t, reshape_sectors, readpos);
5628		safepos -= min_t(sector_t, reshape_sectors, safepos);
5629	}
5630
5631	/* Having calculated the 'writepos' possibly use it
5632	 * to set 'stripe_addr' which is where we will write to.
5633	 */
5634	if (mddev->reshape_backwards) {
5635		BUG_ON(conf->reshape_progress == 0);
5636		stripe_addr = writepos;
5637		BUG_ON((mddev->dev_sectors &
5638			~((sector_t)reshape_sectors - 1))
5639		       - reshape_sectors - stripe_addr
5640		       != sector_nr);
5641	} else {
5642		BUG_ON(writepos != sector_nr + reshape_sectors);
5643		stripe_addr = sector_nr;
5644	}
5645
5646	/* 'writepos' is the most advanced device address we might write.
5647	 * 'readpos' is the least advanced device address we might read.
5648	 * 'safepos' is the least address recorded in the metadata as having
5649	 *     been reshaped.
5650	 * If there is a min_offset_diff, these are adjusted either by
5651	 * increasing the safepos/readpos if diff is negative, or
5652	 * increasing writepos if diff is positive.
5653	 * If 'readpos' is then behind 'writepos', there is no way that we can
5654	 * ensure safety in the face of a crash - that must be done by userspace
5655	 * making a backup of the data.  So in that case there is no particular
5656	 * rush to update metadata.
5657	 * Otherwise if 'safepos' is behind 'writepos', then we really need to
5658	 * update the metadata to advance 'safepos' to match 'readpos' so that
5659	 * we can be safe in the event of a crash.
5660	 * So we insist on updating metadata if safepos is behind writepos and
5661	 * readpos is beyond writepos.
5662	 * In any case, update the metadata every 10 seconds.
5663	 * Maybe that number should be configurable, but I'm not sure it is
5664	 * worth it.... maybe it could be a multiple of safemode_delay???
5665	 */
5666	if (conf->min_offset_diff < 0) {
5667		safepos += -conf->min_offset_diff;
5668		readpos += -conf->min_offset_diff;
5669	} else
5670		writepos += conf->min_offset_diff;
5671
5672	if ((mddev->reshape_backwards
5673	     ? (safepos > writepos && readpos < writepos)
5674	     : (safepos < writepos && readpos > writepos)) ||
5675	    time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
5676		/* Cannot proceed until we've updated the superblock... */
5677		wait_event(conf->wait_for_overlap,
5678			   atomic_read(&conf->reshape_stripes)==0
5679			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5680		if (atomic_read(&conf->reshape_stripes) != 0)
5681			return 0;
5682		mddev->reshape_position = conf->reshape_progress;
5683		mddev->curr_resync_completed = sector_nr;
5684		conf->reshape_checkpoint = jiffies;
5685		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5686		md_wakeup_thread(mddev->thread);
5687		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
5688			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5689		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5690			return 0;
5691		spin_lock_irq(&conf->device_lock);
5692		conf->reshape_safe = mddev->reshape_position;
5693		spin_unlock_irq(&conf->device_lock);
5694		wake_up(&conf->wait_for_overlap);
5695		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5696	}
5697
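	/* Grab the destination stripes for this chunk, flag them as expanding,
	 * and zero any blocks that lie beyond the end of the old array.
	 */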
5698	INIT_LIST_HEAD(&stripes);
5699	for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
5700		int j;
5701		int skipped_disk = 0;
5702		sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
5703		set_bit(STRIPE_EXPANDING, &sh->state);
5704		atomic_inc(&conf->reshape_stripes);
5705		/* If any of this stripe is beyond the end of the old
5706		 * array, then we need to zero those blocks
5707		 */
5708		for (j=sh->disks; j--;) {
5709			sector_t s;
5710			if (j == sh->pd_idx)
5711				continue;
5712			if (conf->level == 6 &&
5713			    j == sh->qd_idx)
5714				continue;
5715			s = raid5_compute_blocknr(sh, j, 0);
5716			if (s < raid5_size(mddev, 0, 0)) {
5717				skipped_disk = 1;
5718				continue;
5719			}
5720			memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
5721			set_bit(R5_Expanded, &sh->dev[j].flags);
5722			set_bit(R5_UPTODATE, &sh->dev[j].flags);
5723		}
5724		if (!skipped_disk) {
5725			set_bit(STRIPE_EXPAND_READY, &sh->state);
5726			set_bit(STRIPE_HANDLE, &sh->state);
5727		}
5728		list_add(&sh->lru, &stripes);
5729	}
5730	spin_lock_irq(&conf->device_lock);
5731	if (mddev->reshape_backwards)
5732		conf->reshape_progress -= reshape_sectors * new_data_disks;
5733	else
5734		conf->reshape_progress += reshape_sectors * new_data_disks;
5735	spin_unlock_irq(&conf->device_lock);
5736	/* Ok, those stripes are ready. We can start scheduling
5737	 * reads on the source stripes.
5738	 * The source stripes are determined by mapping the first and last
5739	 * block on the destination stripes.
5740	 */
5741	first_sector =
5742		raid5_compute_sector(conf, stripe_addr*(new_data_disks),
5743				     1, &dd_idx, NULL);
5744	last_sector =
5745		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
5746					    * new_data_disks - 1),
5747				     1, &dd_idx, NULL);
5748	if (last_sector >= mddev->dev_sectors)
5749		last_sector = mddev->dev_sectors - 1;
5750	while (first_sector <= last_sector) {
5751		sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
5752		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
5753		set_bit(STRIPE_HANDLE, &sh->state);
5754		raid5_release_stripe(sh);
5755		first_sector += STRIPE_SECTORS;
5756	}
5757	/* Now that the sources are clearly marked, we can release
5758	 * the destination stripes
5759	 */
5760	while (!list_empty(&stripes)) {
5761		sh = list_entry(stripes.next, struct stripe_head, lru);
5762		list_del_init(&sh->lru);
5763		raid5_release_stripe(sh);
5764	}
5765	/* If this takes us to the resync_max point where we have to pause,
5766	 * then we need to write out the superblock.
5767	 */
5768	sector_nr += reshape_sectors;
5769	retn = reshape_sectors;
5770finish:
5771	if (mddev->curr_resync_completed > mddev->resync_max ||
5772	    (sector_nr - mddev->curr_resync_completed) * 2
5773	    >= mddev->resync_max - mddev->curr_resync_completed) {
5774		/* Cannot proceed until we've updated the superblock... */
5775		wait_event(conf->wait_for_overlap,
5776			   atomic_read(&conf->reshape_stripes) == 0
5777			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5778		if (atomic_read(&conf->reshape_stripes) != 0)
5779			goto ret;
5780		mddev->reshape_position = conf->reshape_progress;
5781		mddev->curr_resync_completed = sector_nr;
5782		conf->reshape_checkpoint = jiffies;
5783		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
5784		md_wakeup_thread(mddev->thread);
5785		wait_event(mddev->sb_wait,
5786			   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
5787			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
5788		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
5789			goto ret;
5790		spin_lock_irq(&conf->device_lock);
5791		conf->reshape_safe = mddev->reshape_position;
5792		spin_unlock_irq(&conf->device_lock);
5793		wake_up(&conf->wait_for_overlap);
5794		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
5795	}
5796ret:
5797	return retn;
5798}
5799
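/*
 * raid5_sync_request - handle one stripe's worth of resync/recovery, or
 * delegate to reshape_request() when a reshape is in progress.
 */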
5800static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_nr,
5801					  int *skipped)
5802{
5803	struct r5conf *conf = mddev->private;
5804	struct stripe_head *sh;
5805	sector_t max_sector = mddev->dev_sectors;
5806	sector_t sync_blocks;
5807	int still_degraded = 0;
5808	int i;
5809
5810	if (sector_nr >= max_sector) {
5811		/* just being told to finish up .. nothing much to do */
5812
5813		if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
5814			end_reshape(conf);
5815			return 0;
5816		}
5817
5818		if (mddev->curr_resync < max_sector) /* aborted */
5819			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
5820					&sync_blocks, 1);
5821		else /* completed sync */
5822			conf->fullsync = 0;
5823		bitmap_close_sync(mddev->bitmap);
5824
5825		return 0;
5826	}
5827
5828	/* Allow raid5_quiesce to complete */
5829	wait_event(conf->wait_for_overlap, conf->quiesce != 2);
5830
5831	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
5832		return reshape_request(mddev, sector_nr, skipped);
5833
5834	/* No need to check resync_max as we never do more than one
5835	 * stripe, and as resync_max will always be on a chunk boundary,
5836	 * if the check in md_do_sync didn't fire, there is no chance
5837	 * of overstepping resync_max here
5838	 */
5839
5840	/* if there are too many failed drives and we are trying
5841	 * to resync, then assert that we are finished, because there is
5842	 * nothing we can do.
5843	 */
5844	if (mddev->degraded >= conf->max_degraded &&
5845	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
5846		sector_t rv = mddev->dev_sectors - sector_nr;
5847		*skipped = 1;
5848		return rv;
5849	}
5850	if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
5851	    !conf->fullsync &&
5852	    !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
5853	    sync_blocks >= STRIPE_SECTORS) {
5854		/* we can skip this block, and probably more */
5855		sync_blocks /= STRIPE_SECTORS;
5856		*skipped = 1;
5857		return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
5858	}
5859
5860	bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
5861
5862	sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
5863	if (sh == NULL) {
5864		sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
5865		/* make sure we don't swamp the stripe cache if someone else
5866		 * is trying to get access
5867		 */
5868		schedule_timeout_uninterruptible(1);
5869	}
5870	/* Need to check if array will still be degraded after recovery/resync
5871	 * Note in case of > 1 drive failures it's possible we're rebuilding
5872	 * one drive while leaving another faulty drive in array.
5873	 */
5874	rcu_read_lock();
5875	for (i = 0; i < conf->raid_disks; i++) {
5876		struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
5877
5878		if (rdev == NULL || test_bit(Faulty, &rdev->flags))
5879			still_degraded = 1;
5880	}
5881	rcu_read_unlock();
5882
5883	bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
5884
5885	set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
5886	set_bit(STRIPE_HANDLE, &sh->state);
5887
5888	raid5_release_stripe(sh);
5889
5890	return STRIPE_SECTORS;
5891}
5892
5893static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
5894{
5895	/* We may not be able to submit a whole bio at once as there
5896	 * may not be enough stripe_heads available.
5897	 * We cannot pre-allocate enough stripe_heads as we may need
5898	 * more than exist in the cache (if we allow ever larger chunks).
5899	 * So we do one stripe head at a time and record in
5900	 * ->bi_hw_segments how many have been done.
5901	 *
5902	 * We *know* that this entire raid_bio is in one chunk, so
5903	 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
5904	 */
5905	struct stripe_head *sh;
5906	int dd_idx;
5907	sector_t sector, logical_sector, last_sector;
5908	int scnt = 0;
5909	int remaining;
5910	int handled = 0;
5911
5912	logical_sector = raid_bio->bi_iter.bi_sector &
5913		~((sector_t)STRIPE_SECTORS-1);
5914	sector = raid5_compute_sector(conf, logical_sector,
5915				      0, &dd_idx, NULL);
5916	last_sector = bio_end_sector(raid_bio);
5917
5918	for (; logical_sector < last_sector;
5919	     logical_sector += STRIPE_SECTORS,
5920		     sector += STRIPE_SECTORS,
5921		     scnt++) {
5922
5923		if (scnt < raid5_bi_processed_stripes(raid_bio))
5924			/* already done this stripe */
5925			continue;
5926
5927		sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
5928
5929		if (!sh) {
5930			/* failed to get a stripe - must wait */
5931			raid5_set_bi_processed_stripes(raid_bio, scnt);
5932			conf->retry_read_aligned = raid_bio;
5933			return handled;
5934		}
5935
5936		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
5937			raid5_release_stripe(sh);
5938			raid5_set_bi_processed_stripes(raid_bio, scnt);
5939			conf->retry_read_aligned = raid_bio;
5940			return handled;
5941		}
5942
5943		set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
5944		handle_stripe(sh);
5945		raid5_release_stripe(sh);
5946		handled++;
5947	}
5948	remaining = raid5_dec_bi_active_stripes(raid_bio);
5949	if (remaining == 0) {
5950		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
5951					 raid_bio, 0);
5952		bio_endio(raid_bio);
5953	}
5954	if (atomic_dec_and_test(&conf->active_aligned_reads))
5955		wake_up(&conf->wait_for_quiescent);
5956	return handled;
5957}
5958
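/*
 * Pull up to MAX_STRIPE_BATCH stripes off the priority lists and run
 * handle_stripe() on them with conf->device_lock temporarily dropped.
 * Called with device_lock held; returns the number of stripes handled.
 */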
5959static int handle_active_stripes(struct r5conf *conf, int group,
5960				 struct r5worker *worker,
5961				 struct list_head *temp_inactive_list)
5962{
5963	struct stripe_head *batch[MAX_STRIPE_BATCH], *sh;
5964	int i, batch_size = 0, hash;
5965	bool release_inactive = false;
5966
5967	while (batch_size < MAX_STRIPE_BATCH &&
5968			(sh = __get_priority_stripe(conf, group)) != NULL)
5969		batch[batch_size++] = sh;
5970
5971	if (batch_size == 0) {
5972		for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
5973			if (!list_empty(temp_inactive_list + i))
5974				break;
5975		if (i == NR_STRIPE_HASH_LOCKS) {
5976			spin_unlock_irq(&conf->device_lock);
5977			r5l_flush_stripe_to_raid(conf->log);
5978			spin_lock_irq(&conf->device_lock);
5979			return batch_size;
5980		}
5981		release_inactive = true;
5982	}
5983	spin_unlock_irq(&conf->device_lock);
5984
5985	release_inactive_stripe_list(conf, temp_inactive_list,
5986				     NR_STRIPE_HASH_LOCKS);
5987
5988	r5l_flush_stripe_to_raid(conf->log);
5989	if (release_inactive) {
5990		spin_lock_irq(&conf->device_lock);
5991		return 0;
5992	}
5993
5994	for (i = 0; i < batch_size; i++)
5995		handle_stripe(batch[i]);
5996	r5l_write_stripe_run(conf->log);
5997
5998	cond_resched();
5999
6000	spin_lock_irq(&conf->device_lock);
6001	for (i = 0; i < batch_size; i++) {
6002		hash = batch[i]->hash_lock_index;
6003		__release_stripe(conf, batch[i], &temp_inactive_list[hash]);
6004	}
6005	return batch_size;
6006}
6007
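/*
 * Work function for the per-group r5worker threads (group_thread_cnt > 0).
 * Mirrors the main raid5d loop but only services stripes that belong to
 * this worker's group.
 */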
6008static void raid5_do_work(struct work_struct *work)
6009{
6010	struct r5worker *worker = container_of(work, struct r5worker, work);
6011	struct r5worker_group *group = worker->group;
6012	struct r5conf *conf = group->conf;
6013	int group_id = group - conf->worker_groups;
6014	int handled;
6015	struct blk_plug plug;
6016
6017	pr_debug("+++ raid5worker active\n");
6018
6019	blk_start_plug(&plug);
6020	handled = 0;
6021	spin_lock_irq(&conf->device_lock);
6022	while (1) {
6023		int batch_size, released;
6024
6025		released = release_stripe_list(conf, worker->temp_inactive_list);
6026
6027		batch_size = handle_active_stripes(conf, group_id, worker,
6028						   worker->temp_inactive_list);
6029		worker->working = false;
6030		if (!batch_size && !released)
6031			break;
6032		handled += batch_size;
6033	}
6034	pr_debug("%d stripes handled\n", handled);
6035
6036	spin_unlock_irq(&conf->device_lock);
6037	blk_finish_plug(&plug);
6038
6039	pr_debug("--- raid5worker inactive\n");
6040}
6041
6042/*
6043 * This is our raid5 kernel thread.
6044 *
6045 * We scan the hash table for stripes which can be handled now.
6046 * During the scan, completed stripes are saved for us by the interrupt
6047 * handler, so that they will not have to wait for our next wakeup.
6048 */
6049static void raid5d(struct md_thread *thread)
6050{
6051	struct mddev *mddev = thread->mddev;
6052	struct r5conf *conf = mddev->private;
6053	int handled;
6054	struct blk_plug plug;
6055
6056	pr_debug("+++ raid5d active\n");
6057
6058	md_check_recovery(mddev);
6059
6060	if (!bio_list_empty(&conf->return_bi) &&
6061	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
6062		struct bio_list tmp = BIO_EMPTY_LIST;
6063		spin_lock_irq(&conf->device_lock);
6064		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
6065			bio_list_merge(&tmp, &conf->return_bi);
6066			bio_list_init(&conf->return_bi);
6067		}
6068		spin_unlock_irq(&conf->device_lock);
6069		return_io(&tmp);
6070	}
6071
6072	blk_start_plug(&plug);
6073	handled = 0;
6074	spin_lock_irq(&conf->device_lock);
6075	while (1) {
6076		struct bio *bio;
6077		int batch_size, released;
6078
6079		released = release_stripe_list(conf, conf->temp_inactive_list);
6080		if (released)
6081			clear_bit(R5_DID_ALLOC, &conf->cache_state);
6082
6083		if (!list_empty(&conf->bitmap_list)) {
6085			/* Now is a good time to flush some bitmap updates */
6086			conf->seq_flush++;
6087			spin_unlock_irq(&conf->device_lock);
6088			bitmap_unplug(mddev->bitmap);
6089			spin_lock_irq(&conf->device_lock);
6090			conf->seq_write = conf->seq_flush;
6091			activate_bit_delay(conf, conf->temp_inactive_list);
6092		}
6093		raid5_activate_delayed(conf);
6094
6095		while ((bio = remove_bio_from_retry(conf))) {
6096			int ok;
6097			spin_unlock_irq(&conf->device_lock);
6098			ok = retry_aligned_read(conf, bio);
6099			spin_lock_irq(&conf->device_lock);
6100			if (!ok)
6101				break;
6102			handled++;
6103		}
6104
6105		batch_size = handle_active_stripes(conf, ANY_GROUP, NULL,
6106						   conf->temp_inactive_list);
6107		if (!batch_size && !released)
6108			break;
6109		handled += batch_size;
6110
6111		if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
6112			spin_unlock_irq(&conf->device_lock);
6113			md_check_recovery(mddev);
6114			spin_lock_irq(&conf->device_lock);
6115		}
6116	}
6117	pr_debug("%d stripes handled\n", handled);
6118
6119	spin_unlock_irq(&conf->device_lock);
6120	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state) &&
6121	    mutex_trylock(&conf->cache_size_mutex)) {
6122		grow_one_stripe(conf, __GFP_NOWARN);
6123		/* Set flag even if allocation failed.  This helps
6124		 * slow down allocation requests when mem is short
6125		 */
6126		set_bit(R5_DID_ALLOC, &conf->cache_state);
6127		mutex_unlock(&conf->cache_size_mutex);
6128	}
6129
6130	r5l_flush_stripe_to_raid(conf->log);
6131
6132	async_tx_issue_pending_all();
6133	blk_finish_plug(&plug);
6134
6135	pr_debug("--- raid5d inactive\n");
6136}
6137
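/*
 * sysfs: stripe_cache_size -- reports/sets conf->min_nr_stripes.
 * Typical usage from user space (the md device name is illustrative):
 *   cat /sys/block/md0/md/stripe_cache_size
 *   echo 4096 > /sys/block/md0/md/stripe_cache_size
 */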
6138static ssize_t
6139raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
6140{
6141	struct r5conf *conf;
6142	int ret = 0;
6143	spin_lock(&mddev->lock);
6144	conf = mddev->private;
6145	if (conf)
6146		ret = sprintf(page, "%d\n", conf->min_nr_stripes);
6147	spin_unlock(&mddev->lock);
6148	return ret;
6149}
6150
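/*
 * Resize the minimum stripe cache.  Valid range is 17..32768 stripe_heads;
 * shrinking is applied immediately, growing is best-effort and stops on
 * the first failed allocation.
 */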
6151int
6152raid5_set_cache_size(struct mddev *mddev, int size)
6153{
6154	struct r5conf *conf = mddev->private;
6155	int err;
6156
6157	if (size <= 16 || size > 32768)
6158		return -EINVAL;
6159
6160	conf->min_nr_stripes = size;
6161	mutex_lock(&conf->cache_size_mutex);
6162	while (size < conf->max_nr_stripes &&
6163	       drop_one_stripe(conf))
6164		;
6165	mutex_unlock(&conf->cache_size_mutex);
6166
6167
6168	err = md_allow_write(mddev);
6169	if (err)
6170		return err;
6171
6172	mutex_lock(&conf->cache_size_mutex);
6173	while (size > conf->max_nr_stripes)
6174		if (!grow_one_stripe(conf, GFP_KERNEL))
6175			break;
6176	mutex_unlock(&conf->cache_size_mutex);
6177
6178	return 0;
6179}
6180EXPORT_SYMBOL(raid5_set_cache_size);
6181
6182static ssize_t
6183raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len)
6184{
6185	struct r5conf *conf;
6186	unsigned long new;
6187	int err;
6188
6189	if (len >= PAGE_SIZE)
6190		return -EINVAL;
6191	if (kstrtoul(page, 10, &new))
6192		return -EINVAL;
6193	err = mddev_lock(mddev);
6194	if (err)
6195		return err;
6196	conf = mddev->private;
6197	if (!conf)
6198		err = -ENODEV;
6199	else
6200		err = raid5_set_cache_size(mddev, new);
6201	mddev_unlock(mddev);
6202
6203	return err ?: len;
6204}
6205
6206static struct md_sysfs_entry
6207raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
6208				raid5_show_stripe_cache_size,
6209				raid5_store_stripe_cache_size);
6210
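/*
 * sysfs: rmw_level -- accepts PARITY_DISABLE_RMW, PARITY_ENABLE_RMW or
 * PARITY_PREFER_RMW.  Anything other than "disable" additionally requires
 * a raid6_call.xor_syndrome implementation.
 */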
6211static ssize_t
6212raid5_show_rmw_level(struct mddev  *mddev, char *page)
6213{
6214	struct r5conf *conf = mddev->private;
6215	if (conf)
6216		return sprintf(page, "%d\n", conf->rmw_level);
6217	else
6218		return 0;
6219}
6220
6221static ssize_t
6222raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
6223{
6224	struct r5conf *conf = mddev->private;
6225	unsigned long new;
6226
6227	if (!conf)
6228		return -ENODEV;
6229
6230	if (len >= PAGE_SIZE)
6231		return -EINVAL;
6232
6233	if (kstrtoul(page, 10, &new))
6234		return -EINVAL;
6235
6236	if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
6237		return -EINVAL;
6238
6239	if (new != PARITY_DISABLE_RMW &&
6240	    new != PARITY_ENABLE_RMW &&
6241	    new != PARITY_PREFER_RMW)
6242		return -EINVAL;
6243
6244	conf->rmw_level = new;
6245	return len;
6246}
6247
6248static struct md_sysfs_entry
6249raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
6250			 raid5_show_rmw_level,
6251			 raid5_store_rmw_level);
6252
6253
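/*
 * sysfs: preread_bypass_threshold -- roughly, how many times full stripe
 * writes may bypass stripes waiting for pre-read before the latter are
 * serviced.  May not exceed stripe_cache_size (min_nr_stripes).
 */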
6254static ssize_t
6255raid5_show_preread_threshold(struct mddev *mddev, char *page)
6256{
6257	struct r5conf *conf;
6258	int ret = 0;
6259	spin_lock(&mddev->lock);
6260	conf = mddev->private;
6261	if (conf)
6262		ret = sprintf(page, "%d\n", conf->bypass_threshold);
6263	spin_unlock(&mddev->lock);
6264	return ret;
6265}
6266
6267static ssize_t
6268raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
6269{
6270	struct r5conf *conf;
6271	unsigned long new;
6272	int err;
6273
6274	if (len >= PAGE_SIZE)
6275		return -EINVAL;
6276	if (kstrtoul(page, 10, &new))
6277		return -EINVAL;
6278
6279	err = mddev_lock(mddev);
6280	if (err)
6281		return err;
6282	conf = mddev->private;
6283	if (!conf)
6284		err = -ENODEV;
6285	else if (new > conf->min_nr_stripes)
6286		err = -EINVAL;
6287	else
6288		conf->bypass_threshold = new;
6289	mddev_unlock(mddev);
6290	return err ?: len;
6291}
6292
6293static struct md_sysfs_entry
6294raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
6295					S_IRUGO | S_IWUSR,
6296					raid5_show_preread_threshold,
6297					raid5_store_preread_threshold);
6298
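/*
 * sysfs: skip_copy -- when set, write data is taken directly from the bio
 * pages rather than copied into the stripe cache, which is why the queue
 * is flagged with BDI_CAP_STABLE_WRITES below.
 */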
6299static ssize_t
6300raid5_show_skip_copy(struct mddev *mddev, char *page)
6301{
6302	struct r5conf *conf;
6303	int ret = 0;
6304	spin_lock(&mddev->lock);
6305	conf = mddev->private;
6306	if (conf)
6307		ret = sprintf(page, "%d\n", conf->skip_copy);
6308	spin_unlock(&mddev->lock);
6309	return ret;
6310}
6311
6312static ssize_t
6313raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
6314{
6315	struct r5conf *conf;
6316	unsigned long new;
6317	int err;
6318
6319	if (len >= PAGE_SIZE)
6320		return -EINVAL;
6321	if (kstrtoul(page, 10, &new))
6322		return -EINVAL;
6323	new = !!new;
6324
6325	err = mddev_lock(mddev);
6326	if (err)
6327		return err;
6328	conf = mddev->private;
6329	if (!conf)
6330		err = -ENODEV;
6331	else if (new != conf->skip_copy) {
6332		mddev_suspend(mddev);
6333		conf->skip_copy = new;
6334		if (new)
6335			mddev->queue->backing_dev_info.capabilities |=
6336				BDI_CAP_STABLE_WRITES;
6337		else
6338			mddev->queue->backing_dev_info.capabilities &=
6339				~BDI_CAP_STABLE_WRITES;
6340		mddev_resume(mddev);
6341	}
6342	mddev_unlock(mddev);
6343	return err ?: len;
6344}
6345
6346static struct md_sysfs_entry
6347raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
6348					raid5_show_skip_copy,
6349					raid5_store_skip_copy);
6350
6351static ssize_t
6352stripe_cache_active_show(struct mddev *mddev, char *page)
6353{
6354	struct r5conf *conf = mddev->private;
6355	if (conf)
6356		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
6357	else
6358		return 0;
6359}
6360
6361static struct md_sysfs_entry
6362raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
6363
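/*
 * sysfs: group_thread_cnt -- number of worker threads per group (one group
 * per NUMA node, see alloc_thread_groups()).  0, the default, means all
 * stripe handling is done by the single raid5d thread.
 */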
6364static ssize_t
6365raid5_show_group_thread_cnt(struct mddev *mddev, char *page)
6366{
6367	struct r5conf *conf;
6368	int ret = 0;
6369	spin_lock(&mddev->lock);
6370	conf = mddev->private;
6371	if (conf)
6372		ret = sprintf(page, "%d\n", conf->worker_cnt_per_group);
6373	spin_unlock(&mddev->lock);
6374	return ret;
6375}
6376
6377static int alloc_thread_groups(struct r5conf *conf, int cnt,
6378			       int *group_cnt,
6379			       int *worker_cnt_per_group,
6380			       struct r5worker_group **worker_groups);
6381static ssize_t
6382raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
6383{
6384	struct r5conf *conf;
6385	unsigned long new;
6386	int err;
6387	struct r5worker_group *new_groups, *old_groups;
6388	int group_cnt, worker_cnt_per_group;
6389
6390	if (len >= PAGE_SIZE)
6391		return -EINVAL;
6392	if (kstrtoul(page, 10, &new))
6393		return -EINVAL;
6394
6395	err = mddev_lock(mddev);
6396	if (err)
6397		return err;
6398	conf = mddev->private;
6399	if (!conf)
6400		err = -ENODEV;
6401	else if (new != conf->worker_cnt_per_group) {
6402		mddev_suspend(mddev);
6403
6404		old_groups = conf->worker_groups;
6405		if (old_groups)
6406			flush_workqueue(raid5_wq);
6407
6408		err = alloc_thread_groups(conf, new,
6409					  &group_cnt, &worker_cnt_per_group,
6410					  &new_groups);
6411		if (!err) {
6412			spin_lock_irq(&conf->device_lock);
6413			conf->group_cnt = group_cnt;
6414			conf->worker_cnt_per_group = worker_cnt_per_group;
6415			conf->worker_groups = new_groups;
6416			spin_unlock_irq(&conf->device_lock);
6417
6418			if (old_groups)
6419				kfree(old_groups[0].workers);
6420			kfree(old_groups);
6421		}
6422		mddev_resume(mddev);
6423	}
6424	mddev_unlock(mddev);
6425
6426	return err ?: len;
6427}
6428
6429static struct md_sysfs_entry
6430raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
6431				raid5_show_group_thread_cnt,
6432				raid5_store_group_thread_cnt);
6433
6434static struct attribute *raid5_attrs[] =  {
6435	&raid5_stripecache_size.attr,
6436	&raid5_stripecache_active.attr,
6437	&raid5_preread_bypass_threshold.attr,
6438	&raid5_group_thread_cnt.attr,
6439	&raid5_skip_copy.attr,
6440	&raid5_rmw_level.attr,
6441	&r5c_journal_mode.attr,
6442	NULL,
6443};
6444static struct attribute_group raid5_attrs_group = {
6445	.name = NULL,
6446	.attrs = raid5_attrs,
6447};
6448
6449static int alloc_thread_groups(struct r5conf *conf, int cnt,
6450			       int *group_cnt,
6451			       int *worker_cnt_per_group,
6452			       struct r5worker_group **worker_groups)
6453{
6454	int i, j, k;
6455	ssize_t size;
6456	struct r5worker *workers;
6457
6458	*worker_cnt_per_group = cnt;
6459	if (cnt == 0) {
6460		*group_cnt = 0;
6461		*worker_groups = NULL;
6462		return 0;
6463	}
6464	*group_cnt = num_possible_nodes();
6465	size = sizeof(struct r5worker) * cnt;
6466	workers = kzalloc(size * *group_cnt, GFP_NOIO);
6467	*worker_groups = kzalloc(sizeof(struct r5worker_group) *
6468				*group_cnt, GFP_NOIO);
6469	if (!*worker_groups || !workers) {
6470		kfree(workers);
6471		kfree(*worker_groups);
6472		return -ENOMEM;
6473	}
6474
6475	for (i = 0; i < *group_cnt; i++) {
6476		struct r5worker_group *group;
6477
6478		group = &(*worker_groups)[i];
6479		INIT_LIST_HEAD(&group->handle_list);
6480		group->conf = conf;
6481		group->workers = workers + i * cnt;
6482
6483		for (j = 0; j < cnt; j++) {
6484			struct r5worker *worker = group->workers + j;
6485			worker->group = group;
6486			INIT_WORK(&worker->work, raid5_do_work);
6487
6488			for (k = 0; k < NR_STRIPE_HASH_LOCKS; k++)
6489				INIT_LIST_HEAD(worker->temp_inactive_list + k);
6490		}
6491	}
6492
6493	return 0;
6494}
6495
6496static void free_thread_groups(struct r5conf *conf)
6497{
6498	if (conf->worker_groups)
6499		kfree(conf->worker_groups[0].workers);
6500	kfree(conf->worker_groups);
6501	conf->worker_groups = NULL;
6502}
6503
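/*
 * Array capacity in sectors: per-device size rounded down to whole chunks
 * of both the old and new geometry, times the number of data disks.
 */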
6504static sector_t
6505raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
6506{
6507	struct r5conf *conf = mddev->private;
6508
6509	if (!sectors)
6510		sectors = mddev->dev_sectors;
6511	if (!raid_disks)
6512		/* size is defined by the smallest of previous and new size */
6513		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
6514
6515	sectors &= ~((sector_t)conf->chunk_sectors - 1);
6516	sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
6517	return sectors * (raid_disks - conf->max_degraded);
6518}
6519
6520static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6521{
6522	safe_put_page(percpu->spare_page);
6523	if (percpu->scribble)
6524		flex_array_free(percpu->scribble);
6525	percpu->spare_page = NULL;
6526	percpu->scribble = NULL;
6527}
6528
6529static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
6530{
6531	if (conf->level == 6 && !percpu->spare_page)
6532		percpu->spare_page = alloc_page(GFP_KERNEL);
6533	if (!percpu->scribble)
6534		percpu->scribble = scribble_alloc(max(conf->raid_disks,
6535						      conf->previous_raid_disks),
6536						  max(conf->chunk_sectors,
6537						      conf->prev_chunk_sectors)
6538						   / STRIPE_SECTORS,
6539						  GFP_KERNEL);
6540
6541	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
6542		free_scratch_buffer(conf, percpu);
6543		return -ENOMEM;
6544	}
6545
6546	return 0;
6547}
6548
6549static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
6550{
6551	struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6552
6553	free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
6554	return 0;
6555}
6556
6557static void raid5_free_percpu(struct r5conf *conf)
6558{
6559	if (!conf->percpu)
6560		return;
6561
6562	cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6563	free_percpu(conf->percpu);
6564}
6565
6566static void free_conf(struct r5conf *conf)
6567{
6568	int i;
6569
6570	if (conf->log)
6571		r5l_exit_log(conf->log);
6572	if (conf->shrinker.nr_deferred)
6573		unregister_shrinker(&conf->shrinker);
6574
6575	free_thread_groups(conf);
6576	shrink_stripes(conf);
6577	raid5_free_percpu(conf);
6578	for (i = 0; i < conf->pool_size; i++)
6579		if (conf->disks[i].extra_page)
6580			put_page(conf->disks[i].extra_page);
6581	kfree(conf->disks);
6582	kfree(conf->stripe_hashtbl);
6583	kfree(conf);
6584}
6585
6586static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
6587{
6588	struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
6589	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
6590
6591	if (alloc_scratch_buffer(conf, percpu)) {
6592		pr_warn("%s: failed memory allocation for cpu%u\n",
6593			__func__, cpu);
6594		return -ENOMEM;
6595	}
6596	return 0;
6597}
6598
6599static int raid5_alloc_percpu(struct r5conf *conf)
6600{
6601	int err = 0;
6602
6603	conf->percpu = alloc_percpu(struct raid5_percpu);
6604	if (!conf->percpu)
6605		return -ENOMEM;
6606
6607	err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
6608	if (!err) {
6609		conf->scribble_disks = max(conf->raid_disks,
6610			conf->previous_raid_disks);
6611		conf->scribble_sectors = max(conf->chunk_sectors,
6612			conf->prev_chunk_sectors);
6613	}
6614	return err;
6615}
6616
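/*
 * Shrinker callbacks: under memory pressure the stripe cache may be shrunk
 * back towards min_nr_stripes.  scan returns SHRINK_STOP if the
 * cache_size_mutex is contended or no stripe could be dropped.
 */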
6617static unsigned long raid5_cache_scan(struct shrinker *shrink,
6618				      struct shrink_control *sc)
6619{
6620	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6621	unsigned long ret = SHRINK_STOP;
6622
6623	if (mutex_trylock(&conf->cache_size_mutex)) {
6624		ret = 0;
6625		while (ret < sc->nr_to_scan &&
6626		       conf->max_nr_stripes > conf->min_nr_stripes) {
6627			if (drop_one_stripe(conf) == 0) {
6628				ret = SHRINK_STOP;
6629				break;
6630			}
6631			ret++;
6632		}
6633		mutex_unlock(&conf->cache_size_mutex);
6634	}
6635	return ret;
6636}
6637
6638static unsigned long raid5_cache_count(struct shrinker *shrink,
6639				       struct shrink_control *sc)
6640{
6641	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
6642
6643	if (conf->max_nr_stripes < conf->min_nr_stripes)
6644		/* unlikely, but not impossible */
6645		return 0;
6646	return conf->max_nr_stripes - conf->min_nr_stripes;
6647}
6648
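/*
 * Allocate and initialise the r5conf for this array: validate level/layout/
 * chunk size, set up thread groups, hash locks, stripe lists and per-cpu
 * scratch buffers, grow the initial stripe cache, register the shrinker
 * and create the raid5d thread.
 */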
6649static struct r5conf *setup_conf(struct mddev *mddev)
6650{
6651	struct r5conf *conf;
6652	int raid_disk, memory, max_disks;
6653	struct md_rdev *rdev;
6654	struct disk_info *disk;
6655	char pers_name[6];
6656	int i;
6657	int group_cnt, worker_cnt_per_group;
6658	struct r5worker_group *new_group;
6659
6660	if (mddev->new_level != 5
6661	    && mddev->new_level != 4
6662	    && mddev->new_level != 6) {
6663		pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
6664			mdname(mddev), mddev->new_level);
6665		return ERR_PTR(-EIO);
6666	}
6667	if ((mddev->new_level == 5
6668	     && !algorithm_valid_raid5(mddev->new_layout)) ||
6669	    (mddev->new_level == 6
6670	     && !algorithm_valid_raid6(mddev->new_layout))) {
6671		pr_warn("md/raid:%s: layout %d not supported\n",
6672			mdname(mddev), mddev->new_layout);
6673		return ERR_PTR(-EIO);
6674	}
6675	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
6676		pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
6677			mdname(mddev), mddev->raid_disks);
6678		return ERR_PTR(-EINVAL);
6679	}
6680
6681	if (!mddev->new_chunk_sectors ||
6682	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
6683	    !is_power_of_2(mddev->new_chunk_sectors)) {
6684		pr_warn("md/raid:%s: invalid chunk size %d\n",
6685			mdname(mddev), mddev->new_chunk_sectors << 9);
6686		return ERR_PTR(-EINVAL);
6687	}
6688
6689	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
6690	if (conf == NULL)
6691		goto abort;
6692	/* Don't enable multi-threading by default */
6693	if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
6694				 &new_group)) {
6695		conf->group_cnt = group_cnt;
6696		conf->worker_cnt_per_group = worker_cnt_per_group;
6697		conf->worker_groups = new_group;
6698	} else
6699		goto abort;
6700	spin_lock_init(&conf->device_lock);
6701	seqcount_init(&conf->gen_lock);
6702	mutex_init(&conf->cache_size_mutex);
6703	init_waitqueue_head(&conf->wait_for_quiescent);
6704	init_waitqueue_head(&conf->wait_for_stripe);
6705	init_waitqueue_head(&conf->wait_for_overlap);
6706	INIT_LIST_HEAD(&conf->handle_list);
6707	INIT_LIST_HEAD(&conf->hold_list);
6708	INIT_LIST_HEAD(&conf->delayed_list);
6709	INIT_LIST_HEAD(&conf->bitmap_list);
6710	bio_list_init(&conf->return_bi);
6711	init_llist_head(&conf->released_stripes);
6712	atomic_set(&conf->active_stripes, 0);
6713	atomic_set(&conf->preread_active_stripes, 0);
6714	atomic_set(&conf->active_aligned_reads, 0);
6715	conf->bypass_threshold = BYPASS_THRESHOLD;
6716	conf->recovery_disabled = mddev->recovery_disabled - 1;
6717
6718	conf->raid_disks = mddev->raid_disks;
6719	if (mddev->reshape_position == MaxSector)
6720		conf->previous_raid_disks = mddev->raid_disks;
6721	else
6722		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
6723	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
6724
6725	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
6726			      GFP_KERNEL);
6727
6728	if (!conf->disks)
6729		goto abort;
6730
6731	for (i = 0; i < max_disks; i++) {
6732		conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
6733		if (!conf->disks[i].extra_page)
6734			goto abort;
6735	}
6736
6737	conf->mddev = mddev;
6738
6739	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
6740		goto abort;
6741
6742	/* We init hash_locks[0] separately so that it can be used
6743	 * as the reference lock in the spin_lock_nest_lock() call
6744	 * in lock_all_device_hash_locks_irq in order to convince
6745	 * lockdep that we know what we are doing.
6746	 */
6747	spin_lock_init(conf->hash_locks);
6748	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
6749		spin_lock_init(conf->hash_locks + i);
6750
6751	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6752		INIT_LIST_HEAD(conf->inactive_list + i);
6753
6754	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
6755		INIT_LIST_HEAD(conf->temp_inactive_list + i);
6756
6757	atomic_set(&conf->r5c_cached_full_stripes, 0);
6758	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
6759	atomic_set(&conf->r5c_cached_partial_stripes, 0);
6760	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
6761
6762	conf->level = mddev->new_level;
6763	conf->chunk_sectors = mddev->new_chunk_sectors;
6764	if (raid5_alloc_percpu(conf) != 0)
6765		goto abort;
6766
6767	pr_debug("raid456: run(%s) called.\n", mdname(mddev));
6768
6769	rdev_for_each(rdev, mddev) {
6770		raid_disk = rdev->raid_disk;
6771		if (raid_disk >= max_disks
6772		    || raid_disk < 0 || test_bit(Journal, &rdev->flags))
6773			continue;
6774		disk = conf->disks + raid_disk;
6775
6776		if (test_bit(Replacement, &rdev->flags)) {
6777			if (disk->replacement)
6778				goto abort;
6779			disk->replacement = rdev;
6780		} else {
6781			if (disk->rdev)
6782				goto abort;
6783			disk->rdev = rdev;
6784		}
6785
6786		if (test_bit(In_sync, &rdev->flags)) {
6787			char b[BDEVNAME_SIZE];
6788			pr_info("md/raid:%s: device %s operational as raid disk %d\n",
6789				mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
6790		} else if (rdev->saved_raid_disk != raid_disk)
6791			/* Cannot rely on bitmap to complete recovery */
6792			conf->fullsync = 1;
6793	}
6794
6795	conf->level = mddev->new_level;
6796	if (conf->level == 6) {
6797		conf->max_degraded = 2;
6798		if (raid6_call.xor_syndrome)
6799			conf->rmw_level = PARITY_ENABLE_RMW;
6800		else
6801			conf->rmw_level = PARITY_DISABLE_RMW;
6802	} else {
6803		conf->max_degraded = 1;
6804		conf->rmw_level = PARITY_ENABLE_RMW;
6805	}
6806	conf->algorithm = mddev->new_layout;
6807	conf->reshape_progress = mddev->reshape_position;
6808	if (conf->reshape_progress != MaxSector) {
6809		conf->prev_chunk_sectors = mddev->chunk_sectors;
6810		conf->prev_algo = mddev->layout;
6811	} else {
6812		conf->prev_chunk_sectors = conf->chunk_sectors;
6813		conf->prev_algo = conf->algorithm;
6814	}
6815
6816	conf->min_nr_stripes = NR_STRIPES;
6817	if (mddev->reshape_position != MaxSector) {
6818		int stripes = max_t(int,
6819			((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
6820			((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
6821		conf->min_nr_stripes = max(NR_STRIPES, stripes);
6822		if (conf->min_nr_stripes != NR_STRIPES)
6823			pr_info("md/raid:%s: force stripe size %d for reshape\n",
6824				mdname(mddev), conf->min_nr_stripes);
6825	}
6826	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
6827		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
6828	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
6829	if (grow_stripes(conf, conf->min_nr_stripes)) {
6830		pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
6831			mdname(mddev), memory);
6832		goto abort;
6833	} else
6834		pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
6835	/*
6836	 * Losing a stripe head costs more than the time to refill it,
6837	 * it reduces the queue depth and so can hurt throughput.
6838	 * So set it rather large, scaled by number of devices.
6839	 */
6840	conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
6841	conf->shrinker.scan_objects = raid5_cache_scan;
6842	conf->shrinker.count_objects = raid5_cache_count;
6843	conf->shrinker.batch = 128;
6844	conf->shrinker.flags = 0;
6845	if (register_shrinker(&conf->shrinker)) {
6846		pr_warn("md/raid:%s: couldn't register shrinker.\n",
6847			mdname(mddev));
6848		goto abort;
6849	}
6850
6851	sprintf(pers_name, "raid%d", mddev->new_level);
6852	conf->thread = md_register_thread(raid5d, mddev, pers_name);
6853	if (!conf->thread) {
6854		pr_warn("md/raid:%s: couldn't allocate thread.\n",
6855			mdname(mddev));
6856		goto abort;
6857	}
6858
6859	return conf;
6860
6861 abort:
6862	if (conf) {
6863		free_conf(conf);
6864		return ERR_PTR(-EIO);
6865	} else
6866		return ERR_PTR(-ENOMEM);
6867}
6868
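/*
 * Return 1 if, in the given layout, this slot only ever holds parity, so
 * an out-of-date recovery_offset on it cannot mean lost data.
 */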
6869static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
6870{
6871	switch (algo) {
6872	case ALGORITHM_PARITY_0:
6873		if (raid_disk < max_degraded)
6874			return 1;
6875		break;
6876	case ALGORITHM_PARITY_N:
6877		if (raid_disk >= raid_disks - max_degraded)
6878			return 1;
6879		break;
6880	case ALGORITHM_PARITY_0_6:
6881		if (raid_disk == 0 ||
6882		    raid_disk == raid_disks - 1)
6883			return 1;
6884		break;
6885	case ALGORITHM_LEFT_ASYMMETRIC_6:
6886	case ALGORITHM_RIGHT_ASYMMETRIC_6:
6887	case ALGORITHM_LEFT_SYMMETRIC_6:
6888	case ALGORITHM_RIGHT_SYMMETRIC_6:
6889		if (raid_disk == raid_disks - 1)
6890			return 1;
6891	}
6892	return 0;
6893}
6894
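/*
 * Personality .run method: sanity-check any in-progress reshape, build or
 * adopt the r5conf, count working and dirty-parity members, then set up
 * queue limits (io_min/io_opt, read-ahead, discard) before going live.
 */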
6895static int raid5_run(struct mddev *mddev)
6896{
6897	struct r5conf *conf;
6898	int working_disks = 0;
6899	int dirty_parity_disks = 0;
6900	struct md_rdev *rdev;
6901	struct md_rdev *journal_dev = NULL;
6902	sector_t reshape_offset = 0;
6903	int i;
6904	long long min_offset_diff = 0;
6905	int first = 1;
6906
6907	if (mddev->recovery_cp != MaxSector)
6908		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
6909			  mdname(mddev));
6910
6911	rdev_for_each(rdev, mddev) {
6912		long long diff;
6913
6914		if (test_bit(Journal, &rdev->flags)) {
6915			journal_dev = rdev;
6916			continue;
6917		}
6918		if (rdev->raid_disk < 0)
6919			continue;
6920		diff = (rdev->new_data_offset - rdev->data_offset);
6921		if (first) {
6922			min_offset_diff = diff;
6923			first = 0;
6924		} else if (mddev->reshape_backwards &&
6925			 diff < min_offset_diff)
6926			min_offset_diff = diff;
6927		else if (!mddev->reshape_backwards &&
6928			 diff > min_offset_diff)
6929			min_offset_diff = diff;
6930	}
6931
6932	if (mddev->reshape_position != MaxSector) {
6933		/* Check that we can continue the reshape.
6934		 * Difficulties arise if the stripe we would write to
6935		 * next is at or after the stripe we would read from next.
6936		 * For a reshape that changes the number of devices, this
6937		 * is only possible for a very short time, and mdadm makes
6938		 * sure that time appears to have passed before assembling
6939		 * the array.  So we fail if that time hasn't passed.
6940		 * For a reshape that keeps the number of devices the same
6941		 * mdadm must be monitoring the reshape and keeping the
6942		 * critical areas read-only and backed up.  It will start
6943		 * the array in read-only mode, so we check for that.
6944		 */
6945		sector_t here_new, here_old;
6946		int old_disks;
6947		int max_degraded = (mddev->level == 6 ? 2 : 1);
6948		int chunk_sectors;
6949		int new_data_disks;
6950
6951		if (journal_dev) {
6952			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
6953				mdname(mddev));
6954			return -EINVAL;
6955		}
6956
6957		if (mddev->new_level != mddev->level) {
6958			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
6959				mdname(mddev));
6960			return -EINVAL;
6961		}
6962		old_disks = mddev->raid_disks - mddev->delta_disks;
6963		/* reshape_position must be on a new-stripe boundary, and one
6964		 * further up in new geometry must map after here in old
6965		 * geometry.
6966		 * If the chunk sizes are different, then as we perform reshape
6967		 * in units of the largest of the two, reshape_position needs
6968		 * be a multiple of the largest chunk size times new data disks.
6969		 */
6970		here_new = mddev->reshape_position;
6971		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
6972		new_data_disks = mddev->raid_disks - max_degraded;
6973		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
6974			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
6975				mdname(mddev));
6976			return -EINVAL;
6977		}
6978		reshape_offset = here_new * chunk_sectors;
6979		/* here_new is the stripe we will write to */
6980		here_old = mddev->reshape_position;
6981		sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
6982		/* here_old is the first stripe that we might need to read
6983		 * from */
6984		if (mddev->delta_disks == 0) {
6985			/* We cannot be sure it is safe to start an in-place
6986			 * reshape.  It is only safe if user-space is monitoring
6987			 * and taking constant backups.
6988			 * mdadm always starts a situation like this in
6989			 * readonly mode so it can take control before
6990			 * allowing any writes.  So just check for that.
6991			 */
6992			if (abs(min_offset_diff) >= mddev->chunk_sectors &&
6993			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
6994				/* not really in-place - so OK */;
6995			else if (mddev->ro == 0) {
6996				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
6997					mdname(mddev));
6998				return -EINVAL;
6999			}
7000		} else if (mddev->reshape_backwards
7001		    ? (here_new * chunk_sectors + min_offset_diff <=
7002		       here_old * chunk_sectors)
7003		    : (here_new * chunk_sectors >=
7004		       here_old * chunk_sectors + (-min_offset_diff))) {
7005			/* Reading from the same stripe as writing to - bad */
7006			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
7007				mdname(mddev));
7008			return -EINVAL;
7009		}
7010		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
7011		/* OK, we should be able to continue; */
7012	} else {
7013		BUG_ON(mddev->level != mddev->new_level);
7014		BUG_ON(mddev->layout != mddev->new_layout);
7015		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
7016		BUG_ON(mddev->delta_disks != 0);
7017	}
7018
7019	if (mddev->private == NULL)
7020		conf = setup_conf(mddev);
7021	else
7022		conf = mddev->private;
7023
7024	if (IS_ERR(conf))
7025		return PTR_ERR(conf);
7026
7027	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
7028		if (!journal_dev) {
7029			pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
7030				mdname(mddev));
7031			mddev->ro = 1;
7032			set_disk_ro(mddev->gendisk, 1);
7033		} else if (mddev->recovery_cp == MaxSector)
7034			set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
7035	}
7036
7037	conf->min_offset_diff = min_offset_diff;
7038	mddev->thread = conf->thread;
7039	conf->thread = NULL;
7040	mddev->private = conf;
7041
7042	for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
7043	     i++) {
7044		rdev = conf->disks[i].rdev;
7045		if (!rdev && conf->disks[i].replacement) {
7046			/* The replacement is all we have yet */
7047			rdev = conf->disks[i].replacement;
7048			conf->disks[i].replacement = NULL;
7049			clear_bit(Replacement, &rdev->flags);
7050			conf->disks[i].rdev = rdev;
7051		}
7052		if (!rdev)
7053			continue;
7054		if (conf->disks[i].replacement &&
7055		    conf->reshape_progress != MaxSector) {
7056			/* replacements and reshape simply do not mix. */
7057			pr_warn("md: cannot handle concurrent replacement and reshape.\n");
7058			goto abort;
7059		}
7060		if (test_bit(In_sync, &rdev->flags)) {
7061			working_disks++;
7062			continue;
7063		}
7064		/* This disk is not fully in-sync.  However, if it
7065		 * just stored parity (beyond the recovery_offset),
7066		 * then we don't need to be concerned about the
7067		 * array being dirty.
7068		 * When reshape goes 'backwards', we never have
7069		 * partially completed devices, so we only need
7070		 * to worry about reshape going forwards.
7071		 */
7072		/* Hack because v0.91 doesn't store recovery_offset properly. */
7073		if (mddev->major_version == 0 &&
7074		    mddev->minor_version > 90)
7075			rdev->recovery_offset = reshape_offset;
7076
7077		if (rdev->recovery_offset < reshape_offset) {
7078			/* We need to check old and new layout */
7079			if (!only_parity(rdev->raid_disk,
7080					 conf->algorithm,
7081					 conf->raid_disks,
7082					 conf->max_degraded))
7083				continue;
7084		}
7085		if (!only_parity(rdev->raid_disk,
7086				 conf->prev_algo,
7087				 conf->previous_raid_disks,
7088				 conf->max_degraded))
7089			continue;
7090		dirty_parity_disks++;
7091	}
7092
7093	/*
7094	 * 0 for a fully functional array, 1 or 2 for a degraded array.
7095	 */
7096	mddev->degraded = raid5_calc_degraded(conf);
7097
7098	if (has_failed(conf)) {
7099		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
7100			mdname(mddev), mddev->degraded, conf->raid_disks);
7101		goto abort;
7102	}
7103
7104	/* device size must be a multiple of chunk size */
7105	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
7106	mddev->resync_max_sectors = mddev->dev_sectors;
7107
7108	if (mddev->degraded > dirty_parity_disks &&
7109	    mddev->recovery_cp != MaxSector) {
7110		if (mddev->ok_start_degraded)
7111			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
7112				mdname(mddev));
7113		else {
7114			pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
7115				mdname(mddev));
7116			goto abort;
7117		}
7118	}
7119
7120	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
7121		mdname(mddev), conf->level,
7122		mddev->raid_disks-mddev->degraded, mddev->raid_disks,
7123		mddev->new_layout);
7124
7125	print_raid5_conf(conf);
7126
7127	if (conf->reshape_progress != MaxSector) {
7128		conf->reshape_safe = conf->reshape_progress;
7129		atomic_set(&conf->reshape_stripes, 0);
7130		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7131		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7132		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7133		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7134		mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7135							"reshape");
7136	}
7137
7138	/* Ok, everything is just fine now */
7139	if (mddev->to_remove == &raid5_attrs_group)
7140		mddev->to_remove = NULL;
7141	else if (mddev->kobj.sd &&
7142	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
7143		pr_warn("raid5: failed to create sysfs attributes for %s\n",
7144			mdname(mddev));
7145	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7146
7147	if (mddev->queue) {
7148		int chunk_size;
7149		bool discard_supported = true;
7150		/* read-ahead size must cover two whole stripes, which
7151		 * is 2 * (data disks) * chunksize, where 'data disks' is
7152		 * the number of raid devices minus the parity devices
7153		 */
7154		int data_disks = conf->previous_raid_disks - conf->max_degraded;
7155		int stripe = data_disks *
7156			((mddev->chunk_sectors << 9) / PAGE_SIZE);
7157		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
7158			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
7159
7160		chunk_size = mddev->chunk_sectors << 9;
7161		blk_queue_io_min(mddev->queue, chunk_size);
7162		blk_queue_io_opt(mddev->queue, chunk_size *
7163				 (conf->raid_disks - conf->max_degraded));
7164		mddev->queue->limits.raid_partial_stripes_expensive = 1;
7165		/*
7166		 * We can only discard a whole stripe. It doesn't make sense to
7167		 * discard a data disk but write the parity disk
7168		 */
7169		stripe = stripe * PAGE_SIZE;
7170		/* Round up to power of 2, as discard handling
7171		 * currently assumes that */
7172		while ((stripe-1) & stripe)
7173			stripe = (stripe | (stripe-1)) + 1;
7174		mddev->queue->limits.discard_alignment = stripe;
7175		mddev->queue->limits.discard_granularity = stripe;
7176
7177		/*
7178		 * We use 16-bit counter of active stripes in bi_phys_segments
7179		 * (minus one for over-loaded initialization)
7180		 */
7181		blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
7182		blk_queue_max_discard_sectors(mddev->queue,
7183					      0xfffe * STRIPE_SECTORS);
7184
7185		/*
7186		 * unaligned part of discard request will be ignored, so can't
7187		 * guarantee discard_zeroes_data
7188		 */
7189		mddev->queue->limits.discard_zeroes_data = 0;
7190
7191		blk_queue_max_write_same_sectors(mddev->queue, 0);
7192
7193		rdev_for_each(rdev, mddev) {
7194			disk_stack_limits(mddev->gendisk, rdev->bdev,
7195					  rdev->data_offset << 9);
7196			disk_stack_limits(mddev->gendisk, rdev->bdev,
7197					  rdev->new_data_offset << 9);
7198			/*
7199			 * discard_zeroes_data is required, otherwise data
7200			 * could be lost. Consider a scenario: discard a stripe
7201			 * (the stripe could be inconsistent if
7202			 * discard_zeroes_data is 0); write one disk of the
7203			 * stripe (the stripe could be inconsistent again
7204			 * depending on which disks are used to calculate
7205			 * parity); the disk is broken; The stripe data of this
7206			 * disk is lost.
7207			 */
7208			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
7209			    !bdev_get_queue(rdev->bdev)->
7210						limits.discard_zeroes_data)
7211				discard_supported = false;
7212			/* Unfortunately, discard_zeroes_data is not currently
7213			 * a guarantee - just a hint.  So we only allow DISCARD
7214			 * if the sysadmin has confirmed that only safe devices
7215			 * are in use by setting a module parameter.
7216			 */
7217			if (!devices_handle_discard_safely) {
7218				if (discard_supported) {
7219					pr_info("md/raid456: discard support disabled due to uncertainty.\n");
7220					pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
7221				}
7222				discard_supported = false;
7223			}
7224		}
7225
7226		if (discard_supported &&
7227		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
7228		    mddev->queue->limits.discard_granularity >= stripe)
7229			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
7230						mddev->queue);
7231		else
7232			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
7233						mddev->queue);
7234
7235		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
7236	}
7237
7238	if (journal_dev) {
7239		char b[BDEVNAME_SIZE];
7240
7241		pr_debug("md/raid:%s: using device %s as journal\n",
7242			 mdname(mddev), bdevname(journal_dev->bdev, b));
7243		if (r5l_init_log(conf, journal_dev))
7244			goto abort;
7245	}
7246
7247	return 0;
7248abort:
7249	md_unregister_thread(&mddev->thread);
7250	print_raid5_conf(conf);
7251	free_conf(conf);
7252	mddev->private = NULL;
7253	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
7254	return -EIO;
7255}
7256
7257static void raid5_free(struct mddev *mddev, void *priv)
7258{
7259	struct r5conf *conf = priv;
7260
7261	free_conf(conf);
7262	mddev->to_remove = &raid5_attrs_group;
7263}
7264
7265static void raid5_status(struct seq_file *seq, struct mddev *mddev)
7266{
7267	struct r5conf *conf = mddev->private;
7268	int i;
7269
7270	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
7271		conf->chunk_sectors / 2, mddev->layout);
7272	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
7273	rcu_read_lock();
7274	for (i = 0; i < conf->raid_disks; i++) {
7275		struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
7276		seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
7277	}
7278	rcu_read_unlock();
7279	seq_printf (seq, "]");
7280}
7281
7282static void print_raid5_conf (struct r5conf *conf)
7283{
7284	int i;
7285	struct disk_info *tmp;
7286
7287	pr_debug("RAID conf printout:\n");
7288	if (!conf) {
7289		pr_debug("(conf==NULL)\n");
7290		return;
7291	}
7292	pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
7293	       conf->raid_disks,
7294	       conf->raid_disks - conf->mddev->degraded);
7295
7296	for (i = 0; i < conf->raid_disks; i++) {
7297		char b[BDEVNAME_SIZE];
7298		tmp = conf->disks + i;
7299		if (tmp->rdev)
7300			pr_debug(" disk %d, o:%d, dev:%s\n",
7301			       i, !test_bit(Faulty, &tmp->rdev->flags),
7302			       bdevname(tmp->rdev->bdev, b));
7303	}
7304}
7305
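/*
 * Called when recovery finishes: promote completed replacements and
 * freshly recovered spares to In_sync, recompute ->degraded and return
 * how many devices became active.
 */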
7306static int raid5_spare_active(struct mddev *mddev)
7307{
7308	int i;
7309	struct r5conf *conf = mddev->private;
7310	struct disk_info *tmp;
7311	int count = 0;
7312	unsigned long flags;
7313
7314	for (i = 0; i < conf->raid_disks; i++) {
7315		tmp = conf->disks + i;
7316		if (tmp->replacement
7317		    && tmp->replacement->recovery_offset == MaxSector
7318		    && !test_bit(Faulty, &tmp->replacement->flags)
7319		    && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
7320			/* Replacement has just become active. */
7321			if (!tmp->rdev
7322			    || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
7323				count++;
7324			if (tmp->rdev) {
7325				/* Replaced device not technically faulty,
7326				 * but we need to be sure it gets removed
7327				 * and never re-added.
7328				 */
7329				set_bit(Faulty, &tmp->rdev->flags);
7330				sysfs_notify_dirent_safe(
7331					tmp->rdev->sysfs_state);
7332			}
7333			sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
7334		} else if (tmp->rdev
7335		    && tmp->rdev->recovery_offset == MaxSector
7336		    && !test_bit(Faulty, &tmp->rdev->flags)
7337		    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
7338			count++;
7339			sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
7340		}
7341	}
7342	spin_lock_irqsave(&conf->device_lock, flags);
7343	mddev->degraded = raid5_calc_degraded(conf);
7344	spin_unlock_irqrestore(&conf->device_lock, flags);
7345	print_raid5_conf(conf);
7346	return count;
7347}
7348
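/*
 * Hot-remove a member device or the journal.  Fails with -EBUSY while the
 * device is In_sync or still has I/O pending, and keeps non-faulty devices
 * for which recovery is still a possibility.
 */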
7349static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
7350{
7351	struct r5conf *conf = mddev->private;
7352	int err = 0;
7353	int number = rdev->raid_disk;
7354	struct md_rdev **rdevp;
7355	struct disk_info *p = conf->disks + number;
7356
7357	print_raid5_conf(conf);
7358	if (test_bit(Journal, &rdev->flags) && conf->log) {
7359		struct r5l_log *log;
7360		/*
7361		 * we can't wait for pending writes here, as this is called in
7362		 * raid5d and waiting would deadlock.
7363		 */
7364		if (atomic_read(&mddev->writes_pending))
7365			return -EBUSY;
7366		log = conf->log;
7367		conf->log = NULL;
7368		synchronize_rcu();
7369		r5l_exit_log(log);
7370		return 0;
7371	}
7372	if (rdev == p->rdev)
7373		rdevp = &p->rdev;
7374	else if (rdev == p->replacement)
7375		rdevp = &p->replacement;
7376	else
7377		return 0;
7378
7379	if (number >= conf->raid_disks &&
7380	    conf->reshape_progress == MaxSector)
7381		clear_bit(In_sync, &rdev->flags);
7382
7383	if (test_bit(In_sync, &rdev->flags) ||
7384	    atomic_read(&rdev->nr_pending)) {
7385		err = -EBUSY;
7386		goto abort;
7387	}
7388	/* Only remove non-faulty devices if recovery
7389	 * isn't possible.
7390	 */
7391	if (!test_bit(Faulty, &rdev->flags) &&
7392	    mddev->recovery_disabled != conf->recovery_disabled &&
7393	    !has_failed(conf) &&
7394	    (!p->replacement || p->replacement == rdev) &&
7395	    number < conf->raid_disks) {
7396		err = -EBUSY;
7397		goto abort;
7398	}
7399	*rdevp = NULL;
7400	if (!test_bit(RemoveSynchronized, &rdev->flags)) {
7401		synchronize_rcu();
7402		if (atomic_read(&rdev->nr_pending)) {
7403			/* lost the race, try later */
7404			err = -EBUSY;
7405			*rdevp = rdev;
7406		}
7407	}
7408	if (p->replacement) {
7409		/* We must have just cleared 'rdev' */
7410		p->rdev = p->replacement;
7411		clear_bit(Replacement, &p->replacement->flags);
7412		smp_mb(); /* Make sure other CPUs may see both as identical
7413			   * but will never see neither - if they are careful
7414			   */
7415		p->replacement = NULL;
7416		clear_bit(WantReplacement, &rdev->flags);
7417	} else
7418		/* We might have just removed the Replacement as faulty -
7419		 * clear the bit just in case
7420		 */
7421		clear_bit(WantReplacement, &rdev->flags);
7422abort:
7423
7424	print_raid5_conf(conf);
7425	return err;
7426}
7427
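/*
 * Hot-add a device: as the journal, into a free slot (preferring
 * rdev->saved_raid_disk), or as a replacement for a member flagged
 * WantReplacement.
 */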
7428static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
7429{
7430	struct r5conf *conf = mddev->private;
7431	int err = -EEXIST;
7432	int disk;
7433	struct disk_info *p;
7434	int first = 0;
7435	int last = conf->raid_disks - 1;
7436
7437	if (test_bit(Journal, &rdev->flags)) {
7438		char b[BDEVNAME_SIZE];
7439		if (conf->log)
7440			return -EBUSY;
7441
7442		rdev->raid_disk = 0;
7443		/*
7444		 * The array is in readonly mode if journal is missing, so no
7445		 * write requests are running. We should be safe.
7446		 */
7447		r5l_init_log(conf, rdev);
7448		pr_debug("md/raid:%s: using device %s as journal\n",
7449			 mdname(mddev), bdevname(rdev->bdev, b));
7450		return 0;
7451	}
7452	if (mddev->recovery_disabled == conf->recovery_disabled)
7453		return -EBUSY;
7454
7455	if (rdev->saved_raid_disk < 0 && has_failed(conf))
7456		/* no point adding a device */
7457		return -EINVAL;
7458
7459	if (rdev->raid_disk >= 0)
7460		first = last = rdev->raid_disk;
7461
7462	/*
7463	 * find the disk ... but prefer rdev->saved_raid_disk
7464	 * if possible.
7465	 */
7466	if (rdev->saved_raid_disk >= 0 &&
7467	    rdev->saved_raid_disk >= first &&
7468	    conf->disks[rdev->saved_raid_disk].rdev == NULL)
7469		first = rdev->saved_raid_disk;
7470
7471	for (disk = first; disk <= last; disk++) {
7472		p = conf->disks + disk;
7473		if (p->rdev == NULL) {
7474			clear_bit(In_sync, &rdev->flags);
7475			rdev->raid_disk = disk;
7476			err = 0;
7477			if (rdev->saved_raid_disk != disk)
7478				conf->fullsync = 1;
7479			rcu_assign_pointer(p->rdev, rdev);
7480			goto out;
7481		}
7482	}
7483	for (disk = first; disk <= last; disk++) {
7484		p = conf->disks + disk;
7485		if (test_bit(WantReplacement, &p->rdev->flags) &&
7486		    p->replacement == NULL) {
7487			clear_bit(In_sync, &rdev->flags);
7488			set_bit(Replacement, &rdev->flags);
7489			rdev->raid_disk = disk;
7490			err = 0;
7491			conf->fullsync = 1;
7492			rcu_assign_pointer(p->replacement, rdev);
7493			break;
7494		}
7495	}
7496out:
7497	print_raid5_conf(conf);
7498	return err;
7499}
7500
7501static int raid5_resize(struct mddev *mddev, sector_t sectors)
7502{
7503	/* no resync is happening, and there is enough space
7504	 * on all devices, so we can resize.
7505	 * We need to make sure resync covers any new space.
7506	 * If the array is shrinking we should possibly wait until
7507	 * any io in the removed space completes, but it hardly seems
7508	 * worth it.
7509	 */
7510	sector_t newsize;
7511	struct r5conf *conf = mddev->private;
7512
7513	if (conf->log)
7514		return -EINVAL;
7515	sectors &= ~((sector_t)conf->chunk_sectors - 1);
7516	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
7517	if (mddev->external_size &&
7518	    mddev->array_sectors > newsize)
7519		return -EINVAL;
7520	if (mddev->bitmap) {
7521		int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
7522		if (ret)
7523			return ret;
7524	}
7525	md_set_array_sectors(mddev, newsize);
7526	set_capacity(mddev->gendisk, mddev->array_sectors);
7527	revalidate_disk(mddev->gendisk);
7528	if (sectors > mddev->dev_sectors &&
7529	    mddev->recovery_cp > mddev->dev_sectors) {
7530		mddev->recovery_cp = mddev->dev_sectors;
7531		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7532	}
7533	mddev->dev_sectors = sectors;
7534	mddev->resync_max_sectors = sectors;
7535	return 0;
7536}
7537
7538static int check_stripe_cache(struct mddev *mddev)
7539{
7540	/* Can only proceed if there are plenty of stripe_heads.
7541	 * We need a minimum of one full stripe, and for sensible progress
7542	 * it is best to have about 4 times that.
7543	 * If we require 4 times, then the default 256 4K stripe_heads will
7544	 * allow for chunk sizes up to 256K, which is probably OK.
7545	 * If the chunk size is greater, user-space should request more
7546	 * stripe_heads first.
7547	 */
7548	struct r5conf *conf = mddev->private;
7549	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
7550	    > conf->min_nr_stripes ||
7551	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
7552	    > conf->min_nr_stripes) {
7553		pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
7554			mdname(mddev),
7555			((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
7556			 / STRIPE_SIZE)*4);
7557		return 0;
7558	}
7559	return 1;
7560}
7561
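/*
 * Validate a requested reshape: the geometry must actually change, the
 * array must not have failed, the stripe cache must be big enough, and
 * the scribble/stripe buffers must be resizable to the new geometry.
 */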
7562static int check_reshape(struct mddev *mddev)
7563{
7564	struct r5conf *conf = mddev->private;
7565
7566	if (conf->log)
7567		return -EINVAL;
7568	if (mddev->delta_disks == 0 &&
7569	    mddev->new_layout == mddev->layout &&
7570	    mddev->new_chunk_sectors == mddev->chunk_sectors)
7571		return 0; /* nothing to do */
7572	if (has_failed(conf))
7573		return -EINVAL;
7574	if (mddev->delta_disks < 0 && mddev->reshape_position == MaxSector) {
7575		/* We might be able to shrink, but the devices must
7576		 * be made bigger first.
7577		 * For raid6, 4 is the minimum size.
7578		 * Otherwise 2 is the minimum
7579		 */
7580		int min = 2;
7581		if (mddev->level == 6)
7582			min = 4;
7583		if (mddev->raid_disks + mddev->delta_disks < min)
7584			return -EINVAL;
7585	}
7586
7587	if (!check_stripe_cache(mddev))
7588		return -ENOSPC;
7589
7590	if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
7591	    mddev->delta_disks > 0)
7592		if (resize_chunks(conf,
7593				  conf->previous_raid_disks
7594				  + max(0, mddev->delta_disks),
7595				  max(mddev->new_chunk_sectors,
7596				      mddev->chunk_sectors)
7597			    ) < 0)
7598			return -ENOMEM;
7599	return resize_stripes(conf, (conf->previous_raid_disks
7600				     + mddev->delta_disks));
7601}
7602
7603static int raid5_start_reshape(struct mddev *mddev)
7604{
7605	struct r5conf *conf = mddev->private;
7606	struct md_rdev *rdev;
7607	int spares = 0;
7608	unsigned long flags;
7609
7610	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
7611		return -EBUSY;
7612
7613	if (!check_stripe_cache(mddev))
7614		return -ENOSPC;
7615
7616	if (has_failed(conf))
7617		return -EINVAL;
7618
7619	rdev_for_each(rdev, mddev) {
7620		if (!test_bit(In_sync, &rdev->flags)
7621		    && !test_bit(Faulty, &rdev->flags))
7622			spares++;
7623	}
7624
7625	if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
7626		/* Not enough devices even to make a degraded array
7627		 * of that size
7628		 */
7629		return -EINVAL;
7630
7631	/* Refuse to reduce size of the array.  Any reductions in
7632	 * array size must be through explicit setting of array_size
7633	 * attribute.
7634	 */
7635	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
7636	    < mddev->array_sectors) {
7637		pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
7638			mdname(mddev));
7639		return -EINVAL;
7640	}
7641
7642	atomic_set(&conf->reshape_stripes, 0);
7643	spin_lock_irq(&conf->device_lock);
7644	write_seqcount_begin(&conf->gen_lock);
7645	conf->previous_raid_disks = conf->raid_disks;
7646	conf->raid_disks += mddev->delta_disks;
7647	conf->prev_chunk_sectors = conf->chunk_sectors;
7648	conf->chunk_sectors = mddev->new_chunk_sectors;
7649	conf->prev_algo = conf->algorithm;
7650	conf->algorithm = mddev->new_layout;
7651	conf->generation++;
7652	/* Code that selects data_offset needs to see the generation update
7653	 * if reshape_progress has been set - so a memory barrier is needed.
7654	 */
7655	smp_mb();
7656	if (mddev->reshape_backwards)
7657		conf->reshape_progress = raid5_size(mddev, 0, 0);
7658	else
7659		conf->reshape_progress = 0;
7660	conf->reshape_safe = conf->reshape_progress;
7661	write_seqcount_end(&conf->gen_lock);
7662	spin_unlock_irq(&conf->device_lock);
7663
7664	/* Now make sure any requests that proceeded on the assumption
7665	 * the reshape wasn't running - like Discard or Read - have
7666	 * completed.
7667	 */
7668	mddev_suspend(mddev);
7669	mddev_resume(mddev);
7670
7671	/* Add some new drives, as many as will fit.
7672	 * We know there are enough to make the newly sized array work.
7673	 * Don't add devices if we are reducing the number of
7674	 * devices in the array.  This is because it is not possible
7675	 * to correctly record the "partially reconstructed" state of
7676	 * such devices during the reshape and confusion could result.
7677	 */
7678	if (mddev->delta_disks >= 0) {
7679		rdev_for_each(rdev, mddev)
7680			if (rdev->raid_disk < 0 &&
7681			    !test_bit(Faulty, &rdev->flags)) {
7682				if (raid5_add_disk(mddev, rdev) == 0) {
7683					if (rdev->raid_disk
7684					    >= conf->previous_raid_disks)
7685						set_bit(In_sync, &rdev->flags);
7686					else
7687						rdev->recovery_offset = 0;
7688
7689					if (sysfs_link_rdev(mddev, rdev))
7690						/* Failure here is OK */;
7691				}
7692			} else if (rdev->raid_disk >= conf->previous_raid_disks
7693				   && !test_bit(Faulty, &rdev->flags)) {
7694				/* This is a spare that was manually added */
7695				set_bit(In_sync, &rdev->flags);
7696			}
7697
7698		/* When a reshape changes the number of devices,
7699		 * ->degraded is measured against the larger of the
7700		 * pre and post number of devices.
7701		 */
7702		spin_lock_irqsave(&conf->device_lock, flags);
7703		mddev->degraded = raid5_calc_degraded(conf);
7704		spin_unlock_irqrestore(&conf->device_lock, flags);
7705	}
7706	mddev->raid_disks = conf->raid_disks;
7707	mddev->reshape_position = conf->reshape_progress;
7708	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7709
7710	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7711	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7712	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7713	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7714	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7715	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
7716						"reshape");
7717	if (!mddev->sync_thread) {
7718		mddev->recovery = 0;
7719		spin_lock_irq(&conf->device_lock);
7720		write_seqcount_begin(&conf->gen_lock);
7721		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
7722		mddev->new_chunk_sectors =
7723			conf->chunk_sectors = conf->prev_chunk_sectors;
7724		mddev->new_layout = conf->algorithm = conf->prev_algo;
7725		rdev_for_each(rdev, mddev)
7726			rdev->new_data_offset = rdev->data_offset;
7727		smp_wmb();
7728		conf->generation--;
7729		conf->reshape_progress = MaxSector;
7730		mddev->reshape_position = MaxSector;
7731		write_seqcount_end(&conf->gen_lock);
7732		spin_unlock_irq(&conf->device_lock);
7733		return -EAGAIN;
7734	}
7735	conf->reshape_checkpoint = jiffies;
7736	md_wakeup_thread(mddev->sync_thread);
7737	md_new_event(mddev);
7738	return 0;
7739}
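
/*
 * Informational sketch of how this is normally reached: user space
 * (e.g. "mdadm --grow /dev/md0 --raid-devices=5") first pushes the new
 * geometry through md's sysfs attributes, which ends up in
 * check_reshape(), and then requests the reshape itself (e.g. by writing
 * "reshape" to the md sync_action attribute), which calls
 * ->start_reshape() above and finally wakes the registered "reshape"
 * sync thread.
 */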
7740
7741/* This is called from the reshape thread and should make any
7742 * changes needed in 'conf'
7743 */
7744static void end_reshape(struct r5conf *conf)
7745{
7746
7747	if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
7748		struct md_rdev *rdev;
7749
7750		spin_lock_irq(&conf->device_lock);
7751		conf->previous_raid_disks = conf->raid_disks;
7752		rdev_for_each(rdev, conf->mddev)
7753			rdev->data_offset = rdev->new_data_offset;
7754		smp_wmb();
7755		conf->reshape_progress = MaxSector;
7756		conf->mddev->reshape_position = MaxSector;
7757		spin_unlock_irq(&conf->device_lock);
7758		wake_up(&conf->wait_for_overlap);
7759
7760		/* read-ahead size must cover two whole stripes, which is
7761		 * 2 * data_disks * chunksize, where data_disks is the number of
7762		 * raid devices minus the parity devices (max_degraded). */
7763		if (conf->mddev->queue) {
7764			int data_disks = conf->raid_disks - conf->max_degraded;
7765			int stripe = data_disks * ((conf->chunk_sectors << 9)
7766						   / PAGE_SIZE);
7767			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
7768				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
7769		}
7770	}
7771}
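
/*
 * Worked example for the read-ahead update above (informational only):
 * with 6 raid devices, max_degraded == 2 and a 512K chunk, data_disks is
 * 4 and 'stripe' is 4 * (524288 / 4096) == 512 pages, so ra_pages is
 * raised to at least 1024 pages (4 MiB with 4K pages).
 */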
7772
7773/* This is called from the raid5d thread with mddev_lock held.
7774 * It makes config changes to the device.
7775 */
7776static void raid5_finish_reshape(struct mddev *mddev)
7777{
7778	struct r5conf *conf = mddev->private;
7779
7780	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7781
7782		if (mddev->delta_disks > 0) {
7783			md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
7784			if (mddev->queue) {
7785				set_capacity(mddev->gendisk, mddev->array_sectors);
7786				revalidate_disk(mddev->gendisk);
7787			}
7788		} else {
7789			int d;
7790			spin_lock_irq(&conf->device_lock);
7791			mddev->degraded = raid5_calc_degraded(conf);
7792			spin_unlock_irq(&conf->device_lock);
7793			for (d = conf->raid_disks;
7794			     d < conf->raid_disks - mddev->delta_disks;
7795			     d++) {
7796				struct md_rdev *rdev = conf->disks[d].rdev;
7797				if (rdev)
7798					clear_bit(In_sync, &rdev->flags);
7799				rdev = conf->disks[d].replacement;
7800				if (rdev)
7801					clear_bit(In_sync, &rdev->flags);
7802			}
7803		}
7804		mddev->layout = conf->algorithm;
7805		mddev->chunk_sectors = conf->chunk_sectors;
7806		mddev->reshape_position = MaxSector;
7807		mddev->delta_disks = 0;
7808		mddev->reshape_backwards = 0;
7809	}
7810}
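
/*
 * Informational note: on a shrinking reshape the else-branch above clears
 * In_sync on the devices whose slots fall beyond the new conf->raid_disks
 * (up to the old device count), so they can later be hot-removed; the
 * array size itself is not reduced here - that only happens through an
 * explicit resize from user space, as noted in raid5_start_reshape().
 */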
7811
7812static void raid5_quiesce(struct mddev *mddev, int state)
7813{
7814	struct r5conf *conf = mddev->private;
7815
7816	switch(state) {
7817	case 2: /* resume for a suspend */
7818		wake_up(&conf->wait_for_overlap);
7819		break;
7820
7821	case 1: /* stop all writes */
7822		lock_all_device_hash_locks_irq(conf);
7823		/* '2' tells resync/reshape to pause so that all
7824		 * active stripes can drain
7825		 */
7826		r5c_flush_cache(conf, INT_MAX);
7827		conf->quiesce = 2;
7828		wait_event_cmd(conf->wait_for_quiescent,
7829				    atomic_read(&conf->active_stripes) == 0 &&
7830				    atomic_read(&conf->active_aligned_reads) == 0,
7831				    unlock_all_device_hash_locks_irq(conf),
7832				    lock_all_device_hash_locks_irq(conf));
7833		conf->quiesce = 1;
7834		unlock_all_device_hash_locks_irq(conf);
7835		/* allow reshape to continue */
7836		wake_up(&conf->wait_for_overlap);
7837		break;
7838
7839	case 0: /* re-enable writes */
7840		lock_all_device_hash_locks_irq(conf);
7841		conf->quiesce = 0;
7842		wake_up(&conf->wait_for_quiescent);
7843		wake_up(&conf->wait_for_overlap);
7844		unlock_all_device_hash_locks_irq(conf);
7845		break;
7846	}
7847	r5l_quiesce(conf->log, state);
7848}
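
/*
 * Usage note (informational): the md core drives these states, e.g.
 * mddev_suspend() calls ->quiesce(mddev, 1) to drain all active stripes
 * and aligned reads, and mddev_resume() calls ->quiesce(mddev, 0) to let
 * writes flow again; state 2 only wakes wait_for_overlap waiters
 * ("resume for a suspend") without clearing the quiesce.
 */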
7849
7850static void *raid45_takeover_raid0(struct mddev *mddev, int level)
7851{
7852	struct r0conf *raid0_conf = mddev->private;
7853	sector_t sectors;
7854
7855	/* for raid0 takeover only one zone is supported */
7856	if (raid0_conf->nr_strip_zones > 1) {
7857		pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
7858			mdname(mddev));
7859		return ERR_PTR(-EINVAL);
7860	}
7861
7862	sectors = raid0_conf->strip_zone[0].zone_end;
7863	sector_div(sectors, raid0_conf->strip_zone[0].nb_dev);
7864	mddev->dev_sectors = sectors;
7865	mddev->new_level = level;
7866	mddev->new_layout = ALGORITHM_PARITY_N;
7867	mddev->new_chunk_sectors = mddev->chunk_sectors;
7868	mddev->raid_disks += 1;
7869	mddev->delta_disks = 1;
7870	/* make sure it will not be marked as dirty */
7871	mddev->recovery_cp = MaxSector;
7872
7873	return setup_conf(mddev);
7874}
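
/*
 * Example (informational only): a 2-disk raid0 whose single zone ends at
 * 2000000 sectors gives dev_sectors == 1000000 per device; raid_disks is
 * bumped by one so the extra slot can hold the parity device, and
 * recovery_cp is set to MaxSector so the converted array does not start
 * a full resync merely because it looks "dirty".
 */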
7875
7876static void *raid5_takeover_raid1(struct mddev *mddev)
7877{
7878	int chunksect;
7879	void *ret;
7880
7881	if (mddev->raid_disks != 2 ||
7882	    mddev->degraded > 1)
7883		return ERR_PTR(-EINVAL);
7884
7885	/* Should check if there are write-behind devices? */
7886
7887	chunksect = 64*2; /* 64K by default */
7888
7889	/* The array must be an exact multiple of chunksize */
7890	while (chunksect && (mddev->array_sectors & (chunksect-1)))
7891		chunksect >>= 1;
7892
7893	if ((chunksect<<9) < STRIPE_SIZE)
7894		/* array size does not allow a suitable chunk size */
7895		return ERR_PTR(-EINVAL);
7896
7897	mddev->new_level = 5;
7898	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
7899	mddev->new_chunk_sectors = chunksect;
7900
7901	ret = setup_conf(mddev);
7902	if (!IS_ERR(ret))
7903		mddev_clear_unsupported_flags(mddev,
7904			UNSUPPORTED_MDDEV_FLAGS);
7905	return ret;
7906}
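
/*
 * Worked example of the chunk-size search above (informational only):
 * for a raid1 of 1000000 sectors, chunksect starts at 128 (64K);
 * 1000000 & 127 != 0, so it is halved to 64, and 1000000 & 63 == 0, so
 * the takeover uses a 32K chunk.  If halving reaches a chunk smaller
 * than STRIPE_SIZE, the conversion is refused instead.
 */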
7907
7908static void *raid5_takeover_raid6(struct mddev *mddev)
7909{
7910	int new_layout;
7911
7912	switch (mddev->layout) {
7913	case ALGORITHM_LEFT_ASYMMETRIC_6:
7914		new_layout = ALGORITHM_LEFT_ASYMMETRIC;
7915		break;
7916	case ALGORITHM_RIGHT_ASYMMETRIC_6:
7917		new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
7918		break;
7919	case ALGORITHM_LEFT_SYMMETRIC_6:
7920		new_layout = ALGORITHM_LEFT_SYMMETRIC;
7921		break;
7922	case ALGORITHM_RIGHT_SYMMETRIC_6:
7923		new_layout = ALGORITHM_RIGHT_SYMMETRIC;
7924		break;
7925	case ALGORITHM_PARITY_0_6:
7926		new_layout = ALGORITHM_PARITY_0;
7927		break;
7928	case ALGORITHM_PARITY_N:
7929		new_layout = ALGORITHM_PARITY_N;
7930		break;
7931	default:
7932		return ERR_PTR(-EINVAL);
7933	}
7934	mddev->new_level = 5;
7935	mddev->new_layout = new_layout;
7936	mddev->delta_disks = -1;
7937	mddev->raid_disks -= 1;
7938	return setup_conf(mddev);
7939}
7940
7941static int raid5_check_reshape(struct mddev *mddev)
7942{
7943	/* For a 2-drive array, the layout and chunk size can be changed
7944	 * immediately as no restriping is needed.
7945	 * For larger arrays we record the new value - after validation
7946	 * to be used by a reshape pass.
7947	 */
7948	struct r5conf *conf = mddev->private;
7949	int new_chunk = mddev->new_chunk_sectors;
7950
7951	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
7952		return -EINVAL;
7953	if (new_chunk > 0) {
7954		if (!is_power_of_2(new_chunk))
7955			return -EINVAL;
7956		if (new_chunk < (PAGE_SIZE>>9))
7957			return -EINVAL;
7958		if (mddev->array_sectors & (new_chunk-1))
7959			/* not factor of array size */
7960			return -EINVAL;
7961	}
7962
7963	/* They look valid */
7964
7965	if (mddev->raid_disks == 2) {
7966		/* can make the change immediately */
7967		if (mddev->new_layout >= 0) {
7968			conf->algorithm = mddev->new_layout;
7969			mddev->layout = mddev->new_layout;
7970		}
7971		if (new_chunk > 0) {
7972			conf->chunk_sectors = new_chunk;
7973			mddev->chunk_sectors = new_chunk;
7974		}
7975		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
7976		md_wakeup_thread(mddev->thread);
7977	}
7978	return check_reshape(mddev);
7979}
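
/*
 * Example of the validation above (informational only): a requested
 * chunk of 384 sectors is rejected because it is not a power of two,
 * while 1024 sectors (512K) is accepted only if the array size is an
 * exact multiple of 1024 sectors; on a 2-drive raid5 the new layout and
 * chunk take effect immediately since no data has to be restriped.
 */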
7980
7981static int raid6_check_reshape(struct mddev *mddev)
7982{
7983	int new_chunk = mddev->new_chunk_sectors;
7984
7985	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
7986		return -EINVAL;
7987	if (new_chunk > 0) {
7988		if (!is_power_of_2(new_chunk))
7989			return -EINVAL;
7990		if (new_chunk < (PAGE_SIZE >> 9))
7991			return -EINVAL;
7992		if (mddev->array_sectors & (new_chunk-1))
7993			/* not factor of array size */
7994			return -EINVAL;
7995	}
7996
7997	/* They look valid */
7998	return check_reshape(mddev);
7999}
8000
8001static void *raid5_takeover(struct mddev *mddev)
8002{
8003	/* raid5 can take over:
8004	 *  raid0 - if there is only one strip zone - make it a raid4 layout
8005	 *  raid1 - if there are two drives.  We need to know the chunk size
8006	 *  raid4 - trivial - just use a raid4 layout.
8007	 *  raid6 - Providing it is a *_6 layout
8008	 */
8009	if (mddev->level == 0)
8010		return raid45_takeover_raid0(mddev, 5);
8011	if (mddev->level == 1)
8012		return raid5_takeover_raid1(mddev);
8013	if (mddev->level == 4) {
8014		mddev->new_layout = ALGORITHM_PARITY_N;
8015		mddev->new_level = 5;
8016		return setup_conf(mddev);
8017	}
8018	if (mddev->level == 6)
8019		return raid5_takeover_raid6(mddev);
8020
8021	return ERR_PTR(-EINVAL);
8022}
8023
8024static void *raid4_takeover(struct mddev *mddev)
8025{
8026	/* raid4 can take over:
8027	 *  raid0 - if there is only one strip zone
8028	 *  raid5 - if layout is right
8029	 */
8030	if (mddev->level == 0)
8031		return raid45_takeover_raid0(mddev, 4);
8032	if (mddev->level == 5 &&
8033	    mddev->layout == ALGORITHM_PARITY_N) {
8034		mddev->new_layout = 0;
8035		mddev->new_level = 4;
8036		return setup_conf(mddev);
8037	}
8038	return ERR_PTR(-EINVAL);
8039}
8040
8041static struct md_personality raid5_personality;
8042
8043static void *raid6_takeover(struct mddev *mddev)
8044{
8045	/* Currently can only take over a raid5.  We map the
8046	 * personality to an equivalent raid6 personality
8047	 * with the Q block at the end.
8048	 */
8049	int new_layout;
8050
8051	if (mddev->pers != &raid5_personality)
8052		return ERR_PTR(-EINVAL);
8053	if (mddev->degraded > 1)
8054		return ERR_PTR(-EINVAL);
8055	if (mddev->raid_disks > 253)
8056		return ERR_PTR(-EINVAL);
8057	if (mddev->raid_disks < 3)
8058		return ERR_PTR(-EINVAL);
8059
8060	switch (mddev->layout) {
8061	case ALGORITHM_LEFT_ASYMMETRIC:
8062		new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
8063		break;
8064	case ALGORITHM_RIGHT_ASYMMETRIC:
8065		new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
8066		break;
8067	case ALGORITHM_LEFT_SYMMETRIC:
8068		new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
8069		break;
8070	case ALGORITHM_RIGHT_SYMMETRIC:
8071		new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
8072		break;
8073	case ALGORITHM_PARITY_0:
8074		new_layout = ALGORITHM_PARITY_0_6;
8075		break;
8076	case ALGORITHM_PARITY_N:
8077		new_layout = ALGORITHM_PARITY_N;
8078		break;
8079	default:
8080		return ERR_PTR(-EINVAL);
8081	}
8082	mddev->new_level = 6;
8083	mddev->new_layout = new_layout;
8084	mddev->delta_disks = 1;
8085	mddev->raid_disks += 1;
8086	return setup_conf(mddev);
8087}
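
/*
 * Informational note: the mapping above picks the ALGORITHM_*_6 variant
 * of the current raid5 layout, i.e. the same data/parity rotation with
 * the Q syndrome kept on the last device, so the existing data layout is
 * unchanged and only the newly added device needs to be populated.
 */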
8088
8089static struct md_personality raid6_personality =
8090{
8091	.name		= "raid6",
8092	.level		= 6,
8093	.owner		= THIS_MODULE,
8094	.make_request	= raid5_make_request,
8095	.run		= raid5_run,
8096	.free		= raid5_free,
8097	.status		= raid5_status,
8098	.error_handler	= raid5_error,
8099	.hot_add_disk	= raid5_add_disk,
8100	.hot_remove_disk= raid5_remove_disk,
8101	.spare_active	= raid5_spare_active,
8102	.sync_request	= raid5_sync_request,
8103	.resize		= raid5_resize,
8104	.size		= raid5_size,
8105	.check_reshape	= raid6_check_reshape,
8106	.start_reshape  = raid5_start_reshape,
8107	.finish_reshape = raid5_finish_reshape,
8108	.quiesce	= raid5_quiesce,
8109	.takeover	= raid6_takeover,
8110	.congested	= raid5_congested,
8111};
8112static struct md_personality raid5_personality =
8113{
8114	.name		= "raid5",
8115	.level		= 5,
8116	.owner		= THIS_MODULE,
8117	.make_request	= raid5_make_request,
8118	.run		= raid5_run,
8119	.free		= raid5_free,
8120	.status		= raid5_status,
8121	.error_handler	= raid5_error,
8122	.hot_add_disk	= raid5_add_disk,
8123	.hot_remove_disk= raid5_remove_disk,
8124	.spare_active	= raid5_spare_active,
8125	.sync_request	= raid5_sync_request,
8126	.resize		= raid5_resize,
8127	.size		= raid5_size,
8128	.check_reshape	= raid5_check_reshape,
8129	.start_reshape  = raid5_start_reshape,
8130	.finish_reshape = raid5_finish_reshape,
8131	.quiesce	= raid5_quiesce,
8132	.takeover	= raid5_takeover,
8133	.congested	= raid5_congested,
8134};
8135
8136static struct md_personality raid4_personality =
8137{
8138	.name		= "raid4",
8139	.level		= 4,
8140	.owner		= THIS_MODULE,
8141	.make_request	= raid5_make_request,
8142	.run		= raid5_run,
8143	.free		= raid5_free,
8144	.status		= raid5_status,
8145	.error_handler	= raid5_error,
8146	.hot_add_disk	= raid5_add_disk,
8147	.hot_remove_disk= raid5_remove_disk,
8148	.spare_active	= raid5_spare_active,
8149	.sync_request	= raid5_sync_request,
8150	.resize		= raid5_resize,
8151	.size		= raid5_size,
8152	.check_reshape	= raid5_check_reshape,
8153	.start_reshape  = raid5_start_reshape,
8154	.finish_reshape = raid5_finish_reshape,
8155	.quiesce	= raid5_quiesce,
8156	.takeover	= raid4_takeover,
8157	.congested	= raid5_congested,
8158};
8159
8160static int __init raid5_init(void)
8161{
8162	int ret;
8163
8164	raid5_wq = alloc_workqueue("raid5wq",
8165		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
8166	if (!raid5_wq)
8167		return -ENOMEM;
8168
8169	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
8170				      "md/raid5:prepare",
8171				      raid456_cpu_up_prepare,
8172				      raid456_cpu_dead);
8173	if (ret) {
8174		destroy_workqueue(raid5_wq);
8175		return ret;
8176	}
8177	register_md_personality(&raid6_personality);
8178	register_md_personality(&raid5_personality);
8179	register_md_personality(&raid4_personality);
8180	return 0;
8181}
8182
8183static void raid5_exit(void)
8184{
8185	unregister_md_personality(&raid6_personality);
8186	unregister_md_personality(&raid5_personality);
8187	unregister_md_personality(&raid4_personality);
8188	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
8189	destroy_workqueue(raid5_wq);
8190}
8191
8192module_init(raid5_init);
8193module_exit(raid5_exit);
8194MODULE_LICENSE("GPL");
8195MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD");
8196MODULE_ALIAS("md-personality-4"); /* RAID5 */
8197MODULE_ALIAS("md-raid5");
8198MODULE_ALIAS("md-raid4");
8199MODULE_ALIAS("md-level-5");
8200MODULE_ALIAS("md-level-4");
8201MODULE_ALIAS("md-personality-8"); /* RAID6 */
8202MODULE_ALIAS("md-raid6");
8203MODULE_ALIAS("md-level-6");
8204
8205/* This used to be two separate modules, they were: */
8206MODULE_ALIAS("raid5");
8207MODULE_ALIAS("raid6");