   1/*
   2 * Copyright (C) 2012 Red Hat. All rights reserved.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-cache-policy.h"
   8#include "dm.h"
   9
  10#include <linux/hash.h>
  11#include <linux/module.h>
  12#include <linux/mutex.h>
  13#include <linux/slab.h>
  14#include <linux/vmalloc.h>
  15
  16#define DM_MSG_PREFIX "cache-policy-mq"
  17
  18static struct kmem_cache *mq_entry_cache;
  19
  20/*----------------------------------------------------------------*/
  21
  22static unsigned next_power(unsigned n, unsigned min)
  23{
  24	return roundup_pow_of_two(max(n, min));
  25}
  26
  27/*----------------------------------------------------------------*/
  28
  29/*
  30 * Large, sequential ios are probably better left on the origin device since
  31 * spindles tend to have good bandwidth.
  32 *
  33 * The io_tracker tries to spot when the io is in one of these sequential
  34 * modes.
  35 *
  36 * Two thresholds switch between random and sequential io mode; they default
  37 * as follows and can be adjusted via the constructor and message interfaces.
  38 */
  39#define RANDOM_THRESHOLD_DEFAULT 4
  40#define SEQUENTIAL_THRESHOLD_DEFAULT 512
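/*
 * As an illustration, assuming the standard dm-cache message plumbing, these
 * thresholds can be tuned at runtime with something like
 *
 *   dmsetup message <cache-device> 0 sequential_threshold 1024
 *   dmsetup message <cache-device> 0 random_threshold 8
 *
 * which ends up in mq_set_config_value() below.
 */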
  41
  42enum io_pattern {
  43	PATTERN_SEQUENTIAL,
  44	PATTERN_RANDOM
  45};
  46
  47struct io_tracker {
  48	enum io_pattern pattern;
  49
  50	unsigned nr_seq_samples;
  51	unsigned nr_rand_samples;
  52	unsigned thresholds[2];
  53
  54	dm_oblock_t last_end_oblock;
  55};
  56
  57static void iot_init(struct io_tracker *t,
  58		     int sequential_threshold, int random_threshold)
  59{
  60	t->pattern = PATTERN_RANDOM;
  61	t->nr_seq_samples = 0;
  62	t->nr_rand_samples = 0;
  63	t->last_end_oblock = 0;
  64	t->thresholds[PATTERN_RANDOM] = random_threshold;
  65	t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
  66}
  67
  68static enum io_pattern iot_pattern(struct io_tracker *t)
  69{
  70	return t->pattern;
  71}
  72
  73static void iot_update_stats(struct io_tracker *t, struct bio *bio)
  74{
  75	if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
  76		t->nr_seq_samples++;
  77	else {
  78		/*
  79		 * Just one non-sequential IO is enough to reset the
  80		 * counters.
  81		 */
  82		if (t->nr_seq_samples) {
  83			t->nr_seq_samples = 0;
  84			t->nr_rand_samples = 0;
  85		}
  86
  87		t->nr_rand_samples++;
  88	}
  89
  90	t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
  91}
  92
  93static void iot_check_for_pattern_switch(struct io_tracker *t)
  94{
  95	switch (t->pattern) {
  96	case PATTERN_SEQUENTIAL:
  97		if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
  98			t->pattern = PATTERN_RANDOM;
  99			t->nr_seq_samples = t->nr_rand_samples = 0;
 100		}
 101		break;
 102
 103	case PATTERN_RANDOM:
 104		if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
 105			t->pattern = PATTERN_SEQUENTIAL;
 106			t->nr_seq_samples = t->nr_rand_samples = 0;
 107		}
 108		break;
 109	}
 110}
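/*
 * With the default thresholds this means 512 consecutive sequential samples
 * move the tracker into sequential mode, while a run of 4 random samples
 * moves it back; note that a single non-sequential io resets the sequential
 * count (see iot_update_stats()).
 */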
 111
 112static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
 113{
 114	iot_update_stats(t, bio);
 115	iot_check_for_pattern_switch(t);
 116}
 117
 118/*----------------------------------------------------------------*/
 119
 120
 121/*
 122 * This queue is divided up into different levels, allowing us to push
 123 * entries to the back of any of the levels.  Think of it as a partially
 124 * sorted queue.
 125 */
 126#define NR_QUEUE_LEVELS 16u
 127
 128struct queue {
 129	struct list_head qs[NR_QUEUE_LEVELS];
 130};
 131
 132static void queue_init(struct queue *q)
 133{
 134	unsigned i;
 135
 136	for (i = 0; i < NR_QUEUE_LEVELS; i++)
 137		INIT_LIST_HEAD(q->qs + i);
 138}
 139
 140/*
 141 * Checks to see if the queue is empty.
 142 * FIXME: reduce cpu usage.
 143 */
 144static bool queue_empty(struct queue *q)
 145{
 146	unsigned i;
 147
 148	for (i = 0; i < NR_QUEUE_LEVELS; i++)
 149		if (!list_empty(q->qs + i))
 150			return false;
 151
 152	return true;
 153}
 154
 155/*
 156 * Insert an entry to the back of the given level.
 157 */
 158static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
 159{
 160	list_add_tail(elt, q->qs + level);
 161}
 162
 163static void queue_remove(struct list_head *elt)
 164{
 165	list_del(elt);
 166}
 167
 168/*
 169 * Shifts all regions down one level.  This has no effect on the order of
 170 * the queue.
 171 */
 172static void queue_shift_down(struct queue *q)
 173{
 174	unsigned level;
 175
 176	for (level = 1; level < NR_QUEUE_LEVELS; level++)
 177		list_splice_init(q->qs + level, q->qs + level - 1);
 178}
 179
 180/*
 181 * Gives us the oldest entry of the lowest populated level.  If the first
 182 * level is emptied then we shift down one level.
 183 */
 184static struct list_head *queue_pop(struct queue *q)
 185{
 186	unsigned level;
 187	struct list_head *r;
 188
 189	for (level = 0; level < NR_QUEUE_LEVELS; level++)
 190		if (!list_empty(q->qs + level)) {
 191			r = q->qs[level].next;
 192			list_del(r);
 193
 194			/* have we just emptied the bottom level? */
 195			if (level == 0 && list_empty(q->qs))
 196				queue_shift_down(q);
 197
 198			return r;
 199		}
 200
 201	return NULL;
 202}
 203
 204static struct list_head *list_pop(struct list_head *lh)
 205{
 206	struct list_head *r = lh->next;
 207
 208	BUG_ON(!r);
 209	list_del_init(r);
 210
 211	return r;
 212}
 213
 214/*----------------------------------------------------------------*/
 215
 216/*
 217 * Describes a cache entry.  Used in both the cache and the pre_cache.
 218 */
 219struct entry {
 220	struct hlist_node hlist;
 221	struct list_head list;
 222	dm_oblock_t oblock;
 223
 224	/*
 225	 * FIXME: pack these better
 226	 */
 227	bool dirty:1;
 228	unsigned hit_count;
 229	unsigned generation;
 230	unsigned tick;
 231};
 232
 233/*
 234 * Rather than storing the cblock in an entry, we allocate all entries in
 235 * an array, and infer the cblock from the entry position.
 236 *
 237 * Free entries are linked together into a list.
 238 */
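/*
 * For example, the entry at ep->entries[3] always represents cblock 3;
 * infer_cblock() below simply turns an entry pointer back into that index.
 */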
 239struct entry_pool {
 240	struct entry *entries, *entries_end;
 241	struct list_head free;
 242	unsigned nr_allocated;
 243};
 244
 245static int epool_init(struct entry_pool *ep, unsigned nr_entries)
 246{
 247	unsigned i;
 248
 249	ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
 250	if (!ep->entries)
 251		return -ENOMEM;
 252
 253	ep->entries_end = ep->entries + nr_entries;
 254
 255	INIT_LIST_HEAD(&ep->free);
 256	for (i = 0; i < nr_entries; i++)
 257		list_add(&ep->entries[i].list, &ep->free);
 258
 259	ep->nr_allocated = 0;
 260
 261	return 0;
 262}
 263
 264static void epool_exit(struct entry_pool *ep)
 265{
 266	vfree(ep->entries);
 267}
 268
 269static struct entry *alloc_entry(struct entry_pool *ep)
 270{
 271	struct entry *e;
 272
 273	if (list_empty(&ep->free))
 274		return NULL;
 275
 276	e = list_entry(list_pop(&ep->free), struct entry, list);
 277	INIT_LIST_HEAD(&e->list);
 278	INIT_HLIST_NODE(&e->hlist);
 279	ep->nr_allocated++;
 280
 281	return e;
 282}
 283
 284/*
 285 * This assumes the cblock hasn't already been allocated.
 286 */
 287static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
 288{
 289	struct entry *e = ep->entries + from_cblock(cblock);
 290
 291	list_del_init(&e->list);
 292	INIT_HLIST_NODE(&e->hlist);
 293	ep->nr_allocated++;
 294
 295	return e;
 296}
 297
 298static void free_entry(struct entry_pool *ep, struct entry *e)
 299{
 300	BUG_ON(!ep->nr_allocated);
 301	ep->nr_allocated--;
 302	INIT_HLIST_NODE(&e->hlist);
 303	list_add(&e->list, &ep->free);
 304}
 305
 306/*
 307 * Returns NULL if the entry is free.
 308 */
 309static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
 310{
 311	struct entry *e = ep->entries + from_cblock(cblock);
 312	return !hlist_unhashed(&e->hlist) ? e : NULL;
 313}
 314
 315static bool epool_empty(struct entry_pool *ep)
 316{
 317	return list_empty(&ep->free);
 318}
 319
 320static bool in_pool(struct entry_pool *ep, struct entry *e)
 321{
 322	return e >= ep->entries && e < ep->entries_end;
 323}
 324
 325static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
 326{
 327	return to_cblock(e - ep->entries);
 328}
 329
 330/*----------------------------------------------------------------*/
 331
 332struct mq_policy {
 333	struct dm_cache_policy policy;
 334
 335	/* protects everything */
 336	struct mutex lock;
 337	dm_cblock_t cache_size;
 338	struct io_tracker tracker;
 339
 340	/*
 341	 * Entries come from two pools, one of pre-cache entries, and one
 342	 * for the cache proper.
 343	 */
 344	struct entry_pool pre_cache_pool;
 345	struct entry_pool cache_pool;
 346
 347	/*
 348	 * We maintain three queues of entries.  The cache proper,
 349	 * consisting of a clean and dirty queue, contains the currently
 350	 * active mappings, whereas the pre_cache tracks blocks that
 351	 * are being hit frequently and are potential candidates for promotion
 352	 * to the cache.
 353	 */
 354	struct queue pre_cache;
 355	struct queue cache_clean;
 356	struct queue cache_dirty;
 357
 358	/*
 359	 * Keeps track of time, incremented by the core.  We use this to
 360	 * avoid attributing multiple hits within the same tick.
 361	 *
 362	 * Access to tick_protected should be done with the spin lock held.
 363	 * It's copied to tick at the start of the map function (within the
 364	 * mutex).
 365	 */
 366	spinlock_t tick_lock;
 367	unsigned tick_protected;
 368	unsigned tick;
 369
 370	/*
 371	 * A count of the number of times the map function has been called
 372	 * and found an entry in the pre_cache or cache.  Currently used to
 373	 * calculate the generation.
 374	 */
 375	unsigned hit_count;
 376
 377	/*
 378	 * A generation is a longish period that is used to trigger some
 379	 * bookkeeping effects, e.g. decrementing hit counts on entries.
 380	 * This is needed to allow the cache to evolve as io patterns
 381	 * change.
 382	 */
 383	unsigned generation;
 384	unsigned generation_period; /* in lookups (will probably change) */
 385
 386	/*
 387	 * Entries in the pre_cache whose hit count passes the promotion
 388	 * threshold move to the cache proper.  Working out the correct
 389	 * value for promote_threshold is crucial to this policy.
 390	 */
 391	unsigned promote_threshold;
 392
 393	unsigned discard_promote_adjustment;
 394	unsigned read_promote_adjustment;
 395	unsigned write_promote_adjustment;
 396
 397	/*
 398	 * The hash table allows us to quickly find an entry by origin
 399	 * block.  Both pre_cache and cache entries are in here.
 400	 */
 401	unsigned nr_buckets;
 402	dm_block_t hash_bits;
 403	struct hlist_head *table;
 404};
 405
 406#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
 407#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
 408#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
 409
 410/*----------------------------------------------------------------*/
 411
 412/*
 413 * Simple hash table implementation.  Should be replaced with the standard hash
 414 * table that's making its way upstream.
 415 */
 416static void hash_insert(struct mq_policy *mq, struct entry *e)
 417{
 418	unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
 419
 420	hlist_add_head(&e->hlist, mq->table + h);
 421}
 422
 423static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
 424{
 425	unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
 426	struct hlist_head *bucket = mq->table + h;
 427	struct entry *e;
 428
 429	hlist_for_each_entry(e, bucket, hlist)
 430		if (e->oblock == oblock) {
 431			hlist_del(&e->hlist);
 432			hlist_add_head(&e->hlist, bucket);
 433			return e;
 434		}
 435
 436	return NULL;
 437}
 438
 439static void hash_remove(struct entry *e)
 440{
 441	hlist_del(&e->hlist);
 442}
 443
 444/*----------------------------------------------------------------*/
 445
 446static bool any_free_cblocks(struct mq_policy *mq)
 447{
 448	return !epool_empty(&mq->cache_pool);
 449}
 450
 451static bool any_clean_cblocks(struct mq_policy *mq)
 452{
 453	return !queue_empty(&mq->cache_clean);
 454}
 455
 456/*----------------------------------------------------------------*/
 457
 458/*
 459 * Now we get to the meat of the policy.  This section deals with deciding
 460 * when to add entries to the pre_cache and cache, and move between
 461 * them.
 462 */
 463
 464/*
 465 * The queue level is based on the log2 of the hit count.
 466 */
 467static unsigned queue_level(struct entry *e)
 468{
 469	return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
 470}
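/*
 * e.g. a hit count of 1 maps to level 0, a hit count of 1000 to level 9
 * (ilog2(1000) == 9), and counts of 32768 and above all end up in level 15.
 */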
 471
 472static bool in_cache(struct mq_policy *mq, struct entry *e)
 473{
 474	return in_pool(&mq->cache_pool, e);
 475}
 476
 477/*
 478 * Inserts the entry into the pre_cache or the cache.  Ensures the cache
 479 * block is marked as allocated if necessary.  Inserts into the hash table.
 480 * Sets the tick which records when the entry was last moved about.
 481 */
 482static void push(struct mq_policy *mq, struct entry *e)
 483{
 484	e->tick = mq->tick;
 485	hash_insert(mq, e);
 486
 487	if (in_cache(mq, e))
 488		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
 489			   queue_level(e), &e->list);
 490	else
 491		queue_push(&mq->pre_cache, queue_level(e), &e->list);
 492}
 493
 494/*
 495 * Removes an entry from pre_cache or cache.  Removes from the hash table.
 496 */
 497static void del(struct mq_policy *mq, struct entry *e)
 498{
 499	queue_remove(&e->list);
 500	hash_remove(e);
 501}
 502
 503/*
 504 * Like del, except it removes the first entry in the queue (ie. the least
 505 * recently used).
 506 */
 507static struct entry *pop(struct mq_policy *mq, struct queue *q)
 508{
 509	struct entry *e;
 510	struct list_head *h = queue_pop(q);
 511
 512	if (!h)
 513		return NULL;
 514
 515	e = container_of(h, struct entry, list);
 516	hash_remove(e);
 517
 518	return e;
 519}
 520
 521/*
 522 * Has this entry already been updated?
 523 */
 524static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
 525{
 526	return mq->tick == e->tick;
 527}
 528
 529/*
 530 * The promotion threshold is adjusted every generation, as are the hit
 531 * counts of the entries.
 532 *
 533 * At the moment the threshold is taken by averaging the hit counts of some
 534 * of the entries in the cache (the first 20 entries across all levels in
 535 * ascending order, giving preference to the clean entries at each level).
 536 *
 537 * We can be much cleverer than this though.  For example, each promotion
 538 * could bump up the threshold helping to prevent churn.  Much more to do
 539 * here.
 540 */
 541
 542#define MAX_TO_AVERAGE 20
 543
 544static void check_generation(struct mq_policy *mq)
 545{
 546	unsigned total = 0, nr = 0, count = 0, level;
 547	struct list_head *head;
 548	struct entry *e;
 549
 550	if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
 551		mq->hit_count = 0;
 552		mq->generation++;
 553
 554		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
 555			head = mq->cache_clean.qs + level;
 556			list_for_each_entry(e, head, list) {
 557				nr++;
 558				total += e->hit_count;
 559
 560				if (++count >= MAX_TO_AVERAGE)
 561					break;
 562			}
 563
 564			head = mq->cache_dirty.qs + level;
 565			list_for_each_entry(e, head, list) {
 566				nr++;
 567				total += e->hit_count;
 568
 569				if (++count >= MAX_TO_AVERAGE)
 570					break;
 571			}
 572		}
 573
 574		mq->promote_threshold = nr ? total / nr : 1;
 575		if (mq->promote_threshold * nr < total)
 576			mq->promote_threshold++;
 577	}
 578}
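/*
 * Worked example: if the (up to) 20 sampled entries have a combined hit
 * count of 45, the new promote_threshold is the rounded-up average,
 * i.e. 45 / 20 rounded up to 3.
 */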
 579
 580/*
 581 * Whenever we use an entry we bump up its hit counter, and push it to the
 582 * back of its current level.
 583 */
 584static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
 585{
 586	if (updated_this_tick(mq, e))
 587		return;
 588
 589	e->hit_count++;
 590	mq->hit_count++;
 591	check_generation(mq);
 592
 593	/* generation adjustment, to stop the counts increasing forever. */
 594	/* FIXME: divide? */
 595	/* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
 596	e->generation = mq->generation;
 597
 598	del(mq, e);
 599	push(mq, e);
 600}
 601
 602/*
 603 * Demote the least recently used entry from the cache to the pre_cache.
 604 * Returns the new cache entry to use, and the old origin block it was
 605 * mapped to.
 606 *
 607 * We drop the hit count on the demoted entry back to 1 to stop it bouncing
 608 * straight back into the cache if it's subsequently hit.  There are
 609 * various options here, and more experimentation would be good:
 610 *
 611 * - just forget about the demoted entry completely (ie. don't insert it
 612 *   into the pre_cache).
 613 * - divide the hit count rather than setting it to some hard coded value.
 614 * - set the hit count to a hard coded value other than 1, eg, is it better
 615 *   if it goes in at level 2?
 616 */
 617static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
 618{
 619	struct entry *demoted = pop(mq, &mq->cache_clean);
 620
 621	if (!demoted)
 622		/*
 623		 * We could get a block from mq->cache_dirty, but that
 624		 * would add extra latency to the triggering bio as it
 625		 * waits for the writeback.  Better to not promote this
 626		 * time and hope there's a clean block next time this block
 627		 * is hit.
 628		 */
 629		return -ENOSPC;
 630
 631	*oblock = demoted->oblock;
 632	free_entry(&mq->cache_pool, demoted);
 633
 634	/*
 635	 * We used to put the demoted block into the pre-cache, but I think
 636	 * it's simpler to just let it work its way up from zero again.
 637	 * Stops blocks flickering in and out of the cache.
 638	 */
 639
 640	return 0;
 641}
 642
 643/*
 644 * We modify the basic promote_threshold depending on the specific io.
 645 *
 646 * If the origin block has been discarded then there's no cost to copy it
 647 * to the cache.
 648 *
 649 * We bias towards reads, since they can be demoted at no cost if they
 650 * haven't been dirtied.
 651 */
 652static unsigned adjusted_promote_threshold(struct mq_policy *mq,
 653					   bool discarded_oblock, int data_dir)
 654{
 655	if (data_dir == READ)
 656		return mq->promote_threshold + mq->read_promote_adjustment;
 657
 658	if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
 659		/*
 660		 * We don't need to do any copying at all, so give this a
 661		 * very low threshold.
 662		 */
 663		return mq->discard_promote_adjustment;
 664	}
 665
 666	return mq->promote_threshold + mq->write_promote_adjustment;
 667}
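/*
 * With the default adjustments this means a read must accumulate
 * promote_threshold + 4 hits before being promoted, a write
 * promote_threshold + 8, and a write to a discarded block only 1 hit,
 * provided a free or clean cache block is available.
 */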
 668
 669static bool should_promote(struct mq_policy *mq, struct entry *e,
 670			   bool discarded_oblock, int data_dir)
 671{
 672	return e->hit_count >=
 673		adjusted_promote_threshold(mq, discarded_oblock, data_dir);
 674}
 675
 676static int cache_entry_found(struct mq_policy *mq,
 677			     struct entry *e,
 678			     struct policy_result *result)
 679{
 680	requeue_and_update_tick(mq, e);
 681
 682	if (in_cache(mq, e)) {
 683		result->op = POLICY_HIT;
 684		result->cblock = infer_cblock(&mq->cache_pool, e);
 685	}
 686
 687	return 0;
 688}
 689
 690/*
 691 * Moves an entry from the pre_cache to the cache.  The main work is
 692 * finding which cache block to use.
 693 */
 694static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 695			      struct policy_result *result)
 696{
 697	int r;
 698	struct entry *new_e;
 699
 700	/* Ensure there's a free cblock in the cache */
 701	if (epool_empty(&mq->cache_pool)) {
 702		result->op = POLICY_REPLACE;
 703		r = demote_cblock(mq, &result->old_oblock);
 704		if (r) {
 705			result->op = POLICY_MISS;
 706			return 0;
 707		}
 708	} else
 709		result->op = POLICY_NEW;
 710
 711	new_e = alloc_entry(&mq->cache_pool);
 712	BUG_ON(!new_e);
 713
 714	new_e->oblock = e->oblock;
 715	new_e->dirty = false;
 716	new_e->hit_count = e->hit_count;
 717	new_e->generation = e->generation;
 718	new_e->tick = e->tick;
 719
 720	del(mq, e);
 721	free_entry(&mq->pre_cache_pool, e);
 722	push(mq, new_e);
 723
 724	result->cblock = infer_cblock(&mq->cache_pool, new_e);
 725
 726	return 0;
 727}
 728
 729static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 730				 bool can_migrate, bool discarded_oblock,
 731				 int data_dir, struct policy_result *result)
 732{
 733	int r = 0;
 734	bool updated = updated_this_tick(mq, e);
 735
 736	if ((!discarded_oblock && updated) ||
 737	    !should_promote(mq, e, discarded_oblock, data_dir)) {
 738		requeue_and_update_tick(mq, e);
 739		result->op = POLICY_MISS;
 740
 741	} else if (!can_migrate)
 742		r = -EWOULDBLOCK;
 743
 744	else {
 745		requeue_and_update_tick(mq, e);
 746		r = pre_cache_to_cache(mq, e, result);
 747	}
 748
 749	return r;
 750}
 751
 752static void insert_in_pre_cache(struct mq_policy *mq,
 753				dm_oblock_t oblock)
 754{
 755	struct entry *e = alloc_entry(&mq->pre_cache_pool);
 756
 757	if (!e)
 758		/*
 759		 * There's no spare entry structure, so we grab the least
 760		 * used one from the pre_cache.
 761		 */
 762		e = pop(mq, &mq->pre_cache);
 763
 764	if (unlikely(!e)) {
 765		DMWARN("couldn't pop from pre cache");
 766		return;
 767	}
 768
 769	e->dirty = false;
 770	e->oblock = oblock;
 771	e->hit_count = 1;
 772	e->generation = mq->generation;
 773	push(mq, e);
 774}
 775
 776static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 777			    struct policy_result *result)
 778{
 779	int r;
 780	struct entry *e;
 781
 782	if (epool_empty(&mq->cache_pool)) {
 783		result->op = POLICY_REPLACE;
 784		r = demote_cblock(mq, &result->old_oblock);
 785		if (unlikely(r)) {
 786			result->op = POLICY_MISS;
 787			insert_in_pre_cache(mq, oblock);
 788			return;
 789		}
 790
 791		/*
 792		 * This will always succeed, since we've just demoted.
 793		 */
 794		e = alloc_entry(&mq->cache_pool);
 795		BUG_ON(!e);
 796
 797	} else {
 798		e = alloc_entry(&mq->cache_pool);
 799		result->op = POLICY_NEW;
 800	}
 801
 802	e->oblock = oblock;
 803	e->dirty = false;
 804	e->hit_count = 1;
 805	e->generation = mq->generation;
 806	push(mq, e);
 807
 808	result->cblock = infer_cblock(&mq->cache_pool, e);
 809}
 810
 811static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
 812			  bool can_migrate, bool discarded_oblock,
 813			  int data_dir, struct policy_result *result)
 814{
 815	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
 816		if (can_migrate)
 817			insert_in_cache(mq, oblock, result);
 818		else
 819			return -EWOULDBLOCK;
 820	} else {
 821		insert_in_pre_cache(mq, oblock);
 822		result->op = POLICY_MISS;
 823	}
 824
 825	return 0;
 826}
 827
 828/*
 829 * Looks the oblock up in the hash table, then decides whether to put it in
 830 * the pre_cache, the cache, etc.
 831 */
 832static int map(struct mq_policy *mq, dm_oblock_t oblock,
 833	       bool can_migrate, bool discarded_oblock,
 834	       int data_dir, struct policy_result *result)
 835{
 836	int r = 0;
 837	struct entry *e = hash_lookup(mq, oblock);
 838
 839	if (e && in_cache(mq, e))
 840		r = cache_entry_found(mq, e, result);
 841
 842	else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
 843		result->op = POLICY_MISS;
 844
 845	else if (e)
 846		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
 847					  data_dir, result);
 848
 849	else
 850		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
 851				   data_dir, result);
 852
 853	if (r == -EWOULDBLOCK)
 854		result->op = POLICY_MISS;
 855
 856	return r;
 857}
 858
 859/*----------------------------------------------------------------*/
 860
 861/*
 862 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 863 * description of these.
 864 */
 865
 866static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
 867{
 868	return container_of(p, struct mq_policy, policy);
 869}
 870
 871static void mq_destroy(struct dm_cache_policy *p)
 872{
 873	struct mq_policy *mq = to_mq_policy(p);
 874
 875	vfree(mq->table);
 876	epool_exit(&mq->cache_pool);
 877	epool_exit(&mq->pre_cache_pool);
 878	kfree(mq);
 879}
 880
 881static void copy_tick(struct mq_policy *mq)
 882{
 883	unsigned long flags;
 884
 885	spin_lock_irqsave(&mq->tick_lock, flags);
 886	mq->tick = mq->tick_protected;
 887	spin_unlock_irqrestore(&mq->tick_lock, flags);
 888}
 889
 890static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 891		  bool can_block, bool can_migrate, bool discarded_oblock,
 892		  struct bio *bio, struct policy_result *result)
 893{
 894	int r;
 895	struct mq_policy *mq = to_mq_policy(p);
 896
 897	result->op = POLICY_MISS;
 898
 899	if (can_block)
 900		mutex_lock(&mq->lock);
 901	else if (!mutex_trylock(&mq->lock))
 902		return -EWOULDBLOCK;
 903
 904	copy_tick(mq);
 905
 906	iot_examine_bio(&mq->tracker, bio);
 907	r = map(mq, oblock, can_migrate, discarded_oblock,
 908		bio_data_dir(bio), result);
 909
 910	mutex_unlock(&mq->lock);
 911
 912	return r;
 913}
 914
 915static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
 916{
 917	int r;
 918	struct mq_policy *mq = to_mq_policy(p);
 919	struct entry *e;
 920
 921	if (!mutex_trylock(&mq->lock))
 922		return -EWOULDBLOCK;
 923
 924	e = hash_lookup(mq, oblock);
 925	if (e && in_cache(mq, e)) {
 926		*cblock = infer_cblock(&mq->cache_pool, e);
 927		r = 0;
 928	} else
 929		r = -ENOENT;
 930
 931	mutex_unlock(&mq->lock);
 932
 933	return r;
 934}
 935
 936static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
 937{
 938	struct entry *e;
 939
 940	e = hash_lookup(mq, oblock);
 941	BUG_ON(!e || !in_cache(mq, e));
 942
 943	del(mq, e);
 944	e->dirty = set;
 945	push(mq, e);
 946}
 947
 948static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
 949{
 950	struct mq_policy *mq = to_mq_policy(p);
 951
 952	mutex_lock(&mq->lock);
 953	__mq_set_clear_dirty(mq, oblock, true);
 954	mutex_unlock(&mq->lock);
 955}
 956
 957static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
 958{
 959	struct mq_policy *mq = to_mq_policy(p);
 960
 961	mutex_lock(&mq->lock);
 962	__mq_set_clear_dirty(mq, oblock, false);
 963	mutex_unlock(&mq->lock);
 964}
 965
 966static int mq_load_mapping(struct dm_cache_policy *p,
 967			   dm_oblock_t oblock, dm_cblock_t cblock,
 968			   uint32_t hint, bool hint_valid)
 969{
 970	struct mq_policy *mq = to_mq_policy(p);
 971	struct entry *e;
 972
 973	e = alloc_particular_entry(&mq->cache_pool, cblock);
 974	e->oblock = oblock;
 975	e->dirty = false;	/* this gets corrected in a minute */
 976	e->hit_count = hint_valid ? hint : 1;
 977	e->generation = mq->generation;
 978	push(mq, e);
 979
 980	return 0;
 981}
 982
 983static int mq_save_hints(struct mq_policy *mq, struct queue *q,
 984			 policy_walk_fn fn, void *context)
 985{
 986	int r;
 987	unsigned level;
 988	struct entry *e;
 989
 990	for (level = 0; level < NR_QUEUE_LEVELS; level++)
 991		list_for_each_entry(e, q->qs + level, list) {
 992			r = fn(context, infer_cblock(&mq->cache_pool, e),
 993			       e->oblock, e->hit_count);
 994			if (r)
 995				return r;
 996		}
 997
 998	return 0;
 999}
1000
1001static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
1002			    void *context)
1003{
1004	struct mq_policy *mq = to_mq_policy(p);
1005	int r = 0;
1006
1007	mutex_lock(&mq->lock);
1008
1009	r = mq_save_hints(mq, &mq->cache_clean, fn, context);
1010	if (!r)
1011		r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
1012
1013	mutex_unlock(&mq->lock);
1014
1015	return r;
1016}
1017
1018static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
1019{
1020	struct entry *e;
1021
1022	e = hash_lookup(mq, oblock);
1023	BUG_ON(!e || !in_cache(mq, e));
1024
1025	del(mq, e);
1026	free_entry(&mq->cache_pool, e);
1027}
1028
1029static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
1030{
1031	struct mq_policy *mq = to_mq_policy(p);
1032
1033	mutex_lock(&mq->lock);
1034	__remove_mapping(mq, oblock);
1035	mutex_unlock(&mq->lock);
1036}
1037
1038static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
1039{
1040	struct entry *e = epool_find(&mq->cache_pool, cblock);
1041
1042	if (!e)
1043		return -ENODATA;
1044
1045	del(mq, e);
1046	free_entry(&mq->cache_pool, e);
1047
1048	return 0;
1049}
1050
1051static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
1052{
1053	int r;
1054	struct mq_policy *mq = to_mq_policy(p);
1055
1056	mutex_lock(&mq->lock);
1057	r = __remove_cblock(mq, cblock);
1058	mutex_unlock(&mq->lock);
1059
1060	return r;
1061}
1062
1063static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
1064			      dm_cblock_t *cblock)
1065{
1066	struct entry *e = pop(mq, &mq->cache_dirty);
1067
1068	if (!e)
1069		return -ENODATA;
1070
1071	*oblock = e->oblock;
1072	*cblock = infer_cblock(&mq->cache_pool, e);
1073	e->dirty = false;
1074	push(mq, e);
1075
1076	return 0;
1077}
1078
1079static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
1080			     dm_cblock_t *cblock)
1081{
1082	int r;
1083	struct mq_policy *mq = to_mq_policy(p);
1084
1085	mutex_lock(&mq->lock);
1086	r = __mq_writeback_work(mq, oblock, cblock);
1087	mutex_unlock(&mq->lock);
1088
1089	return r;
1090}
1091
1092static void __force_mapping(struct mq_policy *mq,
1093			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1094{
1095	struct entry *e = hash_lookup(mq, current_oblock);
1096
1097	if (e && in_cache(mq, e)) {
1098		del(mq, e);
1099		e->oblock = new_oblock;
1100		e->dirty = true;
1101		push(mq, e);
1102	}
1103}
1104
1105static void mq_force_mapping(struct dm_cache_policy *p,
1106			     dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1107{
1108	struct mq_policy *mq = to_mq_policy(p);
1109
1110	mutex_lock(&mq->lock);
1111	__force_mapping(mq, current_oblock, new_oblock);
1112	mutex_unlock(&mq->lock);
1113}
1114
1115static dm_cblock_t mq_residency(struct dm_cache_policy *p)
1116{
1117	dm_cblock_t r;
1118	struct mq_policy *mq = to_mq_policy(p);
1119
1120	mutex_lock(&mq->lock);
1121	r = to_cblock(mq->cache_pool.nr_allocated);
1122	mutex_unlock(&mq->lock);
1123
1124	return r;
1125}
1126
1127static void mq_tick(struct dm_cache_policy *p)
1128{
1129	struct mq_policy *mq = to_mq_policy(p);
1130	unsigned long flags;
1131
1132	spin_lock_irqsave(&mq->tick_lock, flags);
1133	mq->tick_protected++;
1134	spin_unlock_irqrestore(&mq->tick_lock, flags);
1135}
1136
1137static int mq_set_config_value(struct dm_cache_policy *p,
1138			       const char *key, const char *value)
1139{
1140	struct mq_policy *mq = to_mq_policy(p);
1141	unsigned long tmp;
1142
1143	if (kstrtoul(value, 10, &tmp))
1144		return -EINVAL;
1145
1146	if (!strcasecmp(key, "random_threshold")) {
1147		mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
1148
1149	} else if (!strcasecmp(key, "sequential_threshold")) {
1150		mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
1151
1152	} else if (!strcasecmp(key, "discard_promote_adjustment"))
1153		mq->discard_promote_adjustment = tmp;
1154
1155	else if (!strcasecmp(key, "read_promote_adjustment"))
1156		mq->read_promote_adjustment = tmp;
1157
1158	else if (!strcasecmp(key, "write_promote_adjustment"))
1159		mq->write_promote_adjustment = tmp;
1160
1161	else
1162		return -EINVAL;
1163
1164	return 0;
1165}
1166
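/*
 * Emits the tunables as "<#args> <key> <value> ...": the leading "10" below
 * is the number of words that follow (five key/value pairs).
 */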
1167static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
1168{
1169	ssize_t sz = 0;
1170	struct mq_policy *mq = to_mq_policy(p);
1171
1172	DMEMIT("10 random_threshold %u "
1173	       "sequential_threshold %u "
1174	       "discard_promote_adjustment %u "
1175	       "read_promote_adjustment %u "
1176	       "write_promote_adjustment %u",
1177	       mq->tracker.thresholds[PATTERN_RANDOM],
1178	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
1179	       mq->discard_promote_adjustment,
1180	       mq->read_promote_adjustment,
1181	       mq->write_promote_adjustment);
1182
1183	return 0;
1184}
1185
1186/* Init the policy plugin interface function pointers. */
1187static void init_policy_functions(struct mq_policy *mq)
1188{
1189	mq->policy.destroy = mq_destroy;
1190	mq->policy.map = mq_map;
1191	mq->policy.lookup = mq_lookup;
1192	mq->policy.set_dirty = mq_set_dirty;
1193	mq->policy.clear_dirty = mq_clear_dirty;
1194	mq->policy.load_mapping = mq_load_mapping;
1195	mq->policy.walk_mappings = mq_walk_mappings;
1196	mq->policy.remove_mapping = mq_remove_mapping;
1197	mq->policy.remove_cblock = mq_remove_cblock;
1198	mq->policy.writeback_work = mq_writeback_work;
1199	mq->policy.force_mapping = mq_force_mapping;
1200	mq->policy.residency = mq_residency;
1201	mq->policy.tick = mq_tick;
1202	mq->policy.emit_config_values = mq_emit_config_values;
1203	mq->policy.set_config_value = mq_set_config_value;
1204}
1205
1206static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1207					 sector_t origin_size,
1208					 sector_t cache_block_size)
1209{
1210	struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1211
1212	if (!mq)
1213		return NULL;
1214
1215	init_policy_functions(mq);
1216	iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
1217	mq->cache_size = cache_size;
1218
1219	if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
1220		DMERR("couldn't initialize pool of pre-cache entries");
1221		goto bad_pre_cache_init;
1222	}
1223
1224	if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
1225		DMERR("couldn't initialize pool of cache entries");
1226		goto bad_cache_init;
1227	}
1228
1229	mq->tick_protected = 0;
1230	mq->tick = 0;
1231	mq->hit_count = 0;
1232	mq->generation = 0;
1233	mq->promote_threshold = 0;
1234	mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
1235	mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
1236	mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
1237	mutex_init(&mq->lock);
1238	spin_lock_init(&mq->tick_lock);
1239
1240	queue_init(&mq->pre_cache);
1241	queue_init(&mq->cache_clean);
1242	queue_init(&mq->cache_dirty);
1243
1244	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
1245
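	/*
	 * next_power() returns a power of two (at least 16), so hash_bits is
	 * simply log2 of the bucket count.  Both pre-cache and cache entries
	 * are hashed into this one table.
	 */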
1246	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1247	mq->hash_bits = ffs(mq->nr_buckets) - 1;
1248	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
1249	if (!mq->table)
1250		goto bad_alloc_table;
1251
1252	return &mq->policy;
1253
1254bad_alloc_table:
1255	epool_exit(&mq->cache_pool);
1256bad_cache_init:
1257	epool_exit(&mq->pre_cache_pool);
1258bad_pre_cache_init:
1259	kfree(mq);
1260
1261	return NULL;
1262}
1263
1264/*----------------------------------------------------------------*/
1265
1266static struct dm_cache_policy_type mq_policy_type = {
1267	.name = "mq",
1268	.version = {1, 2, 0},
1269	.hint_size = 4,
1270	.owner = THIS_MODULE,
1271	.create = mq_create
1272};
1273
1274static struct dm_cache_policy_type default_policy_type = {
1275	.name = "default",
1276	.version = {1, 2, 0},
1277	.hint_size = 4,
1278	.owner = THIS_MODULE,
1279	.create = mq_create,
1280	.real = &mq_policy_type
1281};
1282
1283static int __init mq_init(void)
1284{
1285	int r;
1286
1287	mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
1288					   sizeof(struct entry),
1289					   __alignof__(struct entry),
1290					   0, NULL);
1291	if (!mq_entry_cache)
1292		goto bad;
1293
1294	r = dm_cache_policy_register(&mq_policy_type);
1295	if (r) {
1296		DMERR("register failed %d", r);
1297		goto bad_register_mq;
1298	}
1299
1300	r = dm_cache_policy_register(&default_policy_type);
1301	if (!r) {
1302		DMINFO("version %u.%u.%u loaded",
1303		       mq_policy_type.version[0],
1304		       mq_policy_type.version[1],
1305		       mq_policy_type.version[2]);
1306		return 0;
1307	}
1308
1309	DMERR("register failed (as default) %d", r);
1310
1311	dm_cache_policy_unregister(&mq_policy_type);
1312bad_register_mq:
1313	kmem_cache_destroy(mq_entry_cache);
1314bad:
1315	return -ENOMEM;
1316}
1317
1318static void __exit mq_exit(void)
1319{
1320	dm_cache_policy_unregister(&mq_policy_type);
1321	dm_cache_policy_unregister(&default_policy_type);
1322
1323	kmem_cache_destroy(mq_entry_cache);
1324}
1325
1326module_init(mq_init);
1327module_exit(mq_exit);
1328
1329MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1330MODULE_LICENSE("GPL");
1331MODULE_DESCRIPTION("mq cache policy");
1332
1333MODULE_ALIAS("dm-cache-default");