/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/kthread.h>
#include <trace/events/bcache.h>

/* Rate limiting */

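/*
 * Recompute the background writeback rate with a simple PD controller:
 * the proportional term is the distance between the current amount of
 * dirty data and the target (writeback_percent of the cache, scaled by
 * this backing device's share of cached sectors), and the derivative
 * term is the EWMA-smoothed change in dirty data since the last update.
 * The resulting rate is clamped to [1, NSEC_PER_MSEC] sectors per
 * second. Called with dc->writeback_lock held (see
 * update_writeback_rate()).
 */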
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
	int64_t proportional = dirty - target;
	int64_t change;

	dc->disk.sectors_dirty_last = dirty;

	/* Scale to sectors per second */

	proportional *= dc->writeback_rate_update_seconds;
	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);

	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      (dc->writeback_rate_d_term /
			       dc->writeback_rate_update_seconds) ?: 1, 0);

	derivative *= dc->writeback_rate_d_term;
	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);

	change = proportional + derivative;

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);

	dc->writeback_rate_proportional = proportional;
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;
}

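/*
 * Delayed work handler: periodically recompute the writeback rate while
 * there is dirty data and writeback_percent is set, then reschedule
 * itself writeback_rate_update_seconds later.
 */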
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

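/*
 * Returns how long read_dirty() should throttle before issuing the next
 * writeback IO; zero (no throttling) when the device is detaching or
 * writeback_percent is unset.
 */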
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors);
}

struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};

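/*
 * Initialize the bio embedded in a dirty_io for the key being written
 * back: it uses the inline bvecs allocated alongside the struct, and is
 * demoted to idle IO priority when writeback_percent is unset (i.e.
 * when writeback is not being rate limited).
 */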
static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio, bio->bi_inline_vecs,
		 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
	bio->bi_private		= w;
	bch_bio_map(bio, NULL);
}

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

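/*
 * Runs after the write to the backing device completes: if the IO
 * succeeded (dirty_endio() clears KEY_DIRTY on failure), insert a
 * non-dirty copy of the key into the btree, unless the original key was
 * overwritten in the meantime (counted as a writeback collision).
 * Finally drop the keybuf entry and release an in_flight slot.
 */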
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;

	bio_free_pages(&io->bio);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (bio->bi_error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

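/*
 * Second stage of a writeback IO: the dirty data has been read from the
 * cache, so reuse the same bio to write it to the corresponding offset
 * on the backing device, then continue in write_dirty_finish().
 */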
static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
	io->bio.bi_bdev		= io->dc->bdev;
	io->bio.bi_end_io	= dirty_endio;

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty_finish, system_wq);
}

static void read_dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    bio->bi_error, "reading dirty data from cache");

	dirty_endio(bio);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty, system_wq);
}

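/*
 * Main writeback loop body: walk the keybuf of dirty keys, allocate a
 * dirty_io for each one, read the data from the cache device, and kick
 * off the write to the backing device via the closure chain
 * read_dirty_submit -> write_dirty -> write_dirty_finish, throttling
 * between keys according to writeback_delay(). The in_flight semaphore
 * bounds the number of outstanding writeback IOs.
 */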
static void read_dirty(struct cached_dev *dc)
{
	unsigned delay = 0;
	struct keybuf_key *w;
	struct dirty_io *io;
	struct closure cl;

	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (!kthread_should_stop()) {

		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (KEY_START(&w->key) != dc->last_read ||
		    jiffies_to_msecs(delay) > 50)
			while (!kthread_should_stop() && delay)
				delay = schedule_timeout_interruptible(delay);

		dc->last_read	= KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		dirty_init(w);
		bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
						    &w->key, 0)->bdev;
		io->bio.bi_end_io	= read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, &cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);
}

/* Scan for dirty data */

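/*
 * Adjust the per-stripe dirty sector counts for a device by nr_sectors
 * (which may be negative), maintaining the full_dirty_stripes bitmap
 * used by refill_full_stripes().
 */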
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);

	BUG_ON(KEY_INODE(k) != dc->disk.id);

	return KEY_DIRTY(k);
}

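/*
 * When partial_stripes_expensive is set, prefer writing back stripes
 * that are completely dirty: scan the full_dirty_stripes bitmap,
 * starting from the last scanned position and wrapping around once, and
 * refill the keybuf from those ranges only, until the keybuf freelist
 * is exhausted.
 */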
static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));

	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}

/*
 * Returns true if we scanned the entire disk
 */
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey start = KEY(dc->disk.id, 0, 0);
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	struct bkey start_pos;

	/*
	 * make sure keybuf pos is inside the range for this disk - at bringup
	 * we might not be attached yet so this disk's inode nr isn't
	 * initialized then
	 */
	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
	    bkey_cmp(&buf->last_scanned, &end) > 0)
		buf->last_scanned = start;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	start_pos = buf->last_scanned;
	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	if (bkey_cmp(&buf->last_scanned, &end) < 0)
		return false;

	/*
	 * If we get to the end start scanning again from the beginning, and
	 * only scan up to where we initially started scanning from:
	 */
	buf->last_scanned = start;
	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}

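/*
 * The per-device writeback thread: sleep while there is no dirty data
 * (or writeback is disabled and the device isn't detaching), otherwise
 * refill the keybuf with dirty keys and write them back via
 * read_dirty(). When a full scan of the index finds nothing left dirty,
 * clear has_dirty and mark the backing device clean in its superblock;
 * after a full scan, optionally sleep writeback_delay seconds before
 * scanning again.
 */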
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	bool searched_full_index;

	while (!kthread_should_stop()) {
		down_write(&dc->writeback_lock);
		if (!atomic_read(&dc->has_dirty) ||
		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		     !dc->writeback_running)) {
			up_write(&dc->writeback_lock);
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop())
				return 0;

			schedule();
			continue;
		}

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
		}

		up_write(&dc->writeback_lock);

		bch_ratelimit_reset(&dc->writeback_rate);
		read_dirty(dc);

		if (searched_full_index) {
			unsigned delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);
		}
	}

	return 0;
}

/* Init */

struct sectors_dirty_init {
	struct btree_op	op;
	unsigned	inode;
};

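/*
 * btree map callback for bch_sectors_dirty_init(): for every dirty key
 * belonging to this device, account its sectors in the per-stripe dirty
 * counters; stop once keys for a higher inode are reached.
 */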
static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
						struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	return MAP_CONTINUE;
}

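/*
 * Walk the btree to rebuild the in-memory dirty sector counts for this
 * backing device, and record the resulting total as the starting point
 * for the rate controller's derivative term.
 */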
void bch_sectors_dirty_init(struct cached_dev *dc)
{
	struct sectors_dirty_init op;

	bch_btree_op_init(&op.op, -1);
	op.inode = dc->disk.id;

	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn, 0);

	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
}

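/*
 * Set the writeback defaults for a newly allocated cached_dev: allow up
 * to 64 writeback IOs in flight, start with writeback_percent 10 and an
 * initial rate of 1024, and set up the delayed work that updates the
 * rate every writeback_rate_update_seconds.
 */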
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

	dc->writeback_rate_update_seconds = 5;
	dc->writeback_rate_d_term	= 30;
	dc->writeback_rate_p_term_inverse = 6000;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}

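/*
 * Create the writeback thread, start the periodic rate update worker,
 * and kick off an initial writeback pass via bch_writeback_queue().
 */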
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	bch_writeback_queue(dc);

	return 0;
}