// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "trace.h"

#include <linux/sched/mm.h>

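/*
 * Only the subvolumes btree uses six locks with per-cpu reader counts for its
 * cached keys; everything else uses plain six locks. Freed entries are kept on
 * separate pcpu/nonpcpu freelists below, presumably because the reader-count
 * mode is fixed when the lock is initialized (see bch2_btree_lock_init()).
 */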
static inline bool btree_uses_pcpu_readers(enum btree_id id)
{
	return id == BTREE_ID_subvolumes;
}

static struct kmem_cache *bch2_key_cache;

static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
				       const void *obj)
{
	const struct bkey_cached *ck = obj;
	const struct bkey_cached_key *key = arg->key;

	return ck->key.btree_id != key->btree_id ||
		!bpos_eq(ck->key.pos, key->pos);
}

static const struct rhashtable_params bch2_btree_key_cache_params = {
	.head_offset	= offsetof(struct bkey_cached, hash),
	.key_offset	= offsetof(struct bkey_cached, key),
	.key_len	= sizeof(struct bkey_cached_key),
	.obj_cmpfn	= bch2_btree_key_cache_cmp_fn,
};

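/*
 * Look up a cached key by (btree_id, pos) in the key cache rhashtable; returns
 * NULL if no entry exists. No locks on the entry are taken here - callers take
 * ck->c.lock themselves.
 */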
__flatten
inline struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
	struct bkey_cached_key key = {
		.btree_id	= btree_id,
		.pos		= pos,
	};

	return rhashtable_lookup_fast(&c->btree_key_cache.table, &key,
				      bch2_btree_key_cache_params);
}

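/*
 * Try to take the intent and write locks without blocking; fails if the entry
 * is dirty or if either trylock fails. On success the caller holds both locks
 * and may evict the entry.
 */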
static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
{
	if (!six_trylock_intent(&ck->c.lock))
		return false;

	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	if (!six_trylock_write(&ck->c.lock)) {
		six_unlock_intent(&ck->c.lock);
		return false;
	}

	return true;
}

static void bkey_cached_evict(struct btree_key_cache *c,
			      struct bkey_cached *ck)
{
	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
				      bch2_btree_key_cache_params));
	memset(&ck->key, ~0, sizeof(ck->key));

	atomic_long_dec(&c->nr_keys);
}

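/*
 * Move a clean entry (intent + write locks held) to the appropriate freelist.
 * The SRCU sequence recorded here keeps the shrinker from actually freeing the
 * memory until btree transactions that might still be looking at this entry
 * have finished their SRCU read-side sections.
 */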
static void bkey_cached_free(struct btree_key_cache *bc,
			     struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	if (ck->c.lock.readers) {
		list_move_tail(&ck->list, &bc->freed_pcpu);
		bc->nr_freed_pcpu++;
	} else {
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
	}
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

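/*
 * Insert an entry into the nonpcpu freelist, keeping the list ordered by SRCU
 * barrier sequence (oldest first) so that the shrinker can stop scanning at
 * the first entry whose grace period hasn't elapsed yet.
 */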
#ifdef __KERNEL__
static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
						   struct bkey_cached *ck)
{
	struct bkey_cached *pos;

	bc->nr_freed_nonpcpu++;

	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
				 pos->btree_trans_barrier_seq)) {
			list_move(&ck->list, &pos->list);
			return;
		}
	}

	list_move(&ck->list, &bc->freed_nonpcpu);
}
#endif

static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
					 struct bkey_cached *ck)
{
	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));

	if (!ck->c.lock.readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;
		bool freed = false;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);

		if (f->nr < ARRAY_SIZE(f->objs)) {
			f->objs[f->nr++] = ck;
			freed = true;
		}
		preempt_enable();

		if (!freed) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
				struct bkey_cached *ck2 = f->objs[--f->nr];

				__bkey_cached_move_to_freelist_ordered(bc, ck2);
			}
			preempt_enable();

			__bkey_cached_move_to_freelist_ordered(bc, ck);
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_nonpcpu);
		bc->nr_freed_nonpcpu++;
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		list_move_tail(&ck->list, &bc->freed_pcpu);
		mutex_unlock(&bc->lock);
	}
}

static void bkey_cached_free_fast(struct btree_key_cache *bc,
				  struct bkey_cached *ck)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);

	ck->btree_trans_barrier_seq =
		start_poll_synchronize_srcu(&c->btree_trans_barrier);

	list_del_init(&ck->list);
	atomic_long_inc(&bc->nr_freed);

	kfree(ck->k);
	ck->k		= NULL;
	ck->u64s	= 0;

	bkey_cached_move_to_freelist(bc, ck);

	six_unlock_write(&ck->c.lock);
	six_unlock_intent(&ck->c.lock);
}

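/*
 * Allocate a bkey_cached, preferring recycled entries: first the per-cpu
 * freelist (refilled in batches from the shared nonpcpu list), then the shared
 * freelists, and only then a fresh slab allocation, dropping btree locks if
 * the allocation might block. Returns an ERR_PTR on lock restart or allocation
 * error, NULL if no memory could be allocated (the caller then tries to reuse
 * an existing entry), or an entry locked for intent and write on success.
 */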
static struct bkey_cached *
bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
		  bool *was_new)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck = NULL;
	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
	int ret;

	if (!pcpu_readers) {
#ifdef __KERNEL__
		struct btree_key_cache_freelist *f;

		preempt_disable();
		f = this_cpu_ptr(bc->pcpu_freed);
		if (f->nr)
			ck = f->objs[--f->nr];
		preempt_enable();

		if (!ck) {
			mutex_lock(&bc->lock);
			preempt_disable();
			f = this_cpu_ptr(bc->pcpu_freed);

			while (!list_empty(&bc->freed_nonpcpu) &&
			       f->nr < ARRAY_SIZE(f->objs) / 2) {
				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
				list_del_init(&ck->list);
				bc->nr_freed_nonpcpu--;
				f->objs[f->nr++] = ck;
			}

			ck = f->nr ? f->objs[--f->nr] : NULL;
			preempt_enable();
			mutex_unlock(&bc->lock);
		}
#else
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_nonpcpu)) {
			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
			bc->nr_freed_nonpcpu--;
		}
		mutex_unlock(&bc->lock);
#endif
	} else {
		mutex_lock(&bc->lock);
		if (!list_empty(&bc->freed_pcpu)) {
			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
			list_del_init(&ck->list);
		}
		mutex_unlock(&bc->lock);
	}

	if (ck) {
		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
		if (unlikely(ret)) {
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		path->l[0].b = (void *) ck;
		path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);

		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
		if (unlikely(ret)) {
			btree_node_unlock(trans, path, 0);
			bkey_cached_move_to_freelist(bc, ck);
			return ERR_PTR(ret);
		}

		return ck;
	}

	ck = allocate_dropping_locks(trans, ret,
			kmem_cache_zalloc(bch2_key_cache, _gfp));
	if (ret) {
		kmem_cache_free(bch2_key_cache, ck);
		return ERR_PTR(ret);
	}

	if (!ck)
		return NULL;

	INIT_LIST_HEAD(&ck->list);
	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);

	ck->c.cached = true;
	BUG_ON(!six_trylock_intent(&ck->c.lock));
	BUG_ON(!six_trylock_write(&ck->c.lock));
	*was_new = true;
	return ck;
}

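/*
 * Fallback when allocation fails: walk the hash table looking for a clean
 * entry that can be trylocked, evict it from the table, and hand it back for
 * reuse (still locked for intent and write).
 */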
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
	struct bucket_table *tbl;
	struct rhash_head *pos;
	struct bkey_cached *ck;
	unsigned i;

	mutex_lock(&c->lock);
	rcu_read_lock();
	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
	for (i = 0; i < tbl->size; i++)
		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
			    bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(c, ck);
				goto out;
			}
		}
	ck = NULL;
out:
	rcu_read_unlock();
	mutex_unlock(&c->lock);
	return ck;
}

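/*
 * Allocate (or steal) an entry for path's position and insert it into the
 * hash table, not yet filled in from the btree (ck->valid = false). Returns
 * NULL if we raced with another thread inserting the same key, in which case
 * the caller retries the lookup.
 */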
static struct bkey_cached *
btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bkey_cached *ck;
	bool was_new = false;

	ck = bkey_cached_alloc(trans, path, &was_new);
	if (IS_ERR(ck))
		return ck;

	if (unlikely(!ck)) {
		ck = bkey_cached_reuse(bc);
		if (unlikely(!ck)) {
			bch_err(c, "error allocating memory for key cache item, btree %s",
				bch2_btree_id_str(path->btree_id));
			return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
	}

	ck->c.level		= 0;
	ck->c.btree_id		= path->btree_id;
	ck->key.btree_id	= path->btree_id;
	ck->key.pos		= path->pos;
	ck->valid		= false;
	ck->flags		= 1U << BKEY_CACHED_ACCESSED;

	if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
					  &ck->hash,
					  bch2_btree_key_cache_params))) {
		/* We raced with another fill: */

		if (likely(was_new)) {
			six_unlock_write(&ck->c.lock);
			six_unlock_intent(&ck->c.lock);
			kfree(ck);
		} else {
			bkey_cached_free_fast(bc, ck);
		}

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
		return NULL;
	}

	atomic_long_inc(&bc->nr_keys);

	six_unlock_write(&ck->c.lock);

	return ck;
}

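/*
 * Fill a cached key from the underlying btree: read the current key at
 * ck->key.pos, size ck->k with some slack so the commit path is less likely
 * to need a reallocation, and copy the key in while holding the write lock on
 * the cached entry.
 */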
static int btree_key_cache_fill(struct btree_trans *trans,
				struct btree_path *ck_path,
				struct bkey_cached *ck)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned new_u64s = 0;
	struct bkey_i *new_k = NULL;
	int ret;

	k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
			       BTREE_ITER_KEY_CACHE_FILL|
			       BTREE_ITER_CACHED_NOFILL);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (!bch2_btree_node_relock(trans, ck_path, 0)) {
		trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
		goto err;
	}

	/*
	 * bch2_varint_decode can read past the end of the buffer by at
	 * most 7 bytes (it won't be used):
	 */
	new_u64s = k.k->u64s + 1;

	/*
	 * Allocate some extra space so that the transaction commit path is less
	 * likely to have to reallocate, since that requires a transaction
	 * restart:
	 */
	new_u64s = min(256U, (new_u64s * 3) / 2);

	if (new_u64s > ck->u64s) {
		new_u64s = roundup_pow_of_two(new_u64s);
		new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
		if (!new_k) {
			bch2_trans_unlock(trans);

			new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
			if (!new_k) {
				bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
					bch2_btree_id_str(ck->key.btree_id), new_u64s);
				ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
				goto err;
			}

			if (!bch2_btree_node_relock(trans, ck_path, 0)) {
				kfree(new_k);
				trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
				ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
				goto err;
			}

			ret = bch2_trans_relock(trans);
			if (ret) {
				kfree(new_k);
				goto err;
			}
		}
	}

	ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c);
	if (ret) {
		kfree(new_k);
		goto err;
	}

	if (new_k) {
		kfree(ck->k);
		ck->u64s = new_u64s;
		ck->k = new_k;
	}

	bkey_reassemble(ck->k, k);
	ck->valid = true;
	bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);

	/* We're not likely to need this iterator again: */
	set_btree_iter_dontneed(&iter);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

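/*
 * Slowpath for traversing a btree path to a cached key: creates the cache
 * entry if it doesn't exist yet and fills it from the btree if it isn't valid.
 * bch2_btree_path_traverse_cached() below handles the common case where the
 * entry already exists and is valid.
 */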
static noinline int
bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path,
					 unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	BUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		ck = btree_key_cache_create(trans, path);
		ret = PTR_ERR_OR_ZERO(ck);
		if (ret)
			goto err;
		if (!ck)
			goto retry;

		mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
		path->locks_want = 1;
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto err;

		BUG_ON(ret);

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	path->uptodate = BTREE_ITER_UPTODATE;

	if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
		/*
		 * Using the underscore version because we haven't set
		 * path->uptodate yet:
		 */
		if (!path->locks_want &&
		    !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
			trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
			ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
			goto err;
		}

		ret = btree_key_cache_fill(trans, path, ck);
		if (ret)
			goto err;

		ret = bch2_btree_path_relock(trans, path, _THIS_IP_);
		if (ret)
			goto err;

		path->uptodate = BTREE_ITER_UPTODATE;
	}

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
	BUG_ON(path->uptodate);

	return ret;
err:
	path->uptodate = BTREE_ITER_NEED_TRAVERSE;
	if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
		btree_node_unlock(trans, path, 0);
		path->l[0].b = ERR_PTR(ret);
	}
	return ret;
}

int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
				    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck;
	int ret = 0;

	EBUG_ON(path->level);

	path->l[1].b = NULL;

	if (bch2_btree_node_relock_notrace(trans, path, 0)) {
		ck = (void *) path->l[0].b;
		goto fill;
	}
retry:
	ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
	if (!ck) {
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);
	} else {
		enum six_lock_type lock_want = __btree_lock_want(path, 0);

		ret = btree_node_lock(trans, path, (void *) ck, 0,
				      lock_want, _THIS_IP_);
		EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart));

		if (ret)
			return ret;

		if (ck->key.btree_id != path->btree_id ||
		    !bpos_eq(ck->key.pos, path->pos)) {
			six_unlock_type(&ck->c.lock, lock_want);
			goto retry;
		}

		mark_btree_node_locked(trans, path, 0,
				       (enum btree_node_locked_type) lock_want);
	}

	path->l[0].lock_seq	= six_lock_seq(&ck->c.lock);
	path->l[0].b		= (void *) ck;
fill:
	if (!ck->valid)
		return bch2_btree_path_traverse_cached_slowpath(trans, path, flags);

	if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
		set_bit(BKEY_CACHED_ACCESSED, &ck->flags);

	path->uptodate = BTREE_ITER_UPTODATE;
	EBUG_ON(!ck->valid);
	EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));

	return ret;
}

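/*
 * Write a dirty cached key back to the underlying btree via a regular
 * (non-key-cache) btree update, then drop its journal pin; if @evict is set
 * the entry is also removed from the hash table and freed. Called from
 * journal reclaim via bch2_btree_key_cache_journal_flush().
 */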
static int btree_key_cache_flush_pos(struct btree_trans *trans,
				     struct bkey_cached_key key,
				     u64 journal_seq,
				     unsigned commit_flags,
				     bool evict)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree_iter c_iter, b_iter;
	struct bkey_cached *ck = NULL;
	int ret;

	bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
			     BTREE_ITER_SLOTS|
			     BTREE_ITER_INTENT|
			     BTREE_ITER_ALL_SNAPSHOTS);
	bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_INTENT);
	b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;

	ret = bch2_btree_iter_traverse(&c_iter);
	if (ret)
		goto out;

	ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
	if (!ck)
		goto out;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		if (evict)
			goto evict;
		goto out;
	}

	BUG_ON(!ck->valid);

	if (journal_seq && ck->journal.seq != journal_seq)
		goto out;

	trans->journal_res.seq = ck->journal.seq;

	/*
	 * If we're at the end of the journal, we really want to free up space
	 * in the journal right away - we don't want to pin that old journal
	 * sequence number with a new btree node write, we want to re-journal
	 * the update
	 */
	if (ck->journal.seq == journal_last_seq(j))
		commit_flags |= BCH_WATERMARK_reclaim;

	if (ck->journal.seq != journal_last_seq(j) ||
	    j->watermark == BCH_WATERMARK_stripe)
		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

	ret   = bch2_btree_iter_traverse(&b_iter) ?:
		bch2_trans_update(trans, &b_iter, ck->k,
				  BTREE_UPDATE_KEY_CACHE_RECLAIM|
				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
				  BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_TRANS_COMMIT_no_check_rw|
				  BCH_TRANS_COMMIT_no_enospc|
				  commit_flags);

	bch2_fs_fatal_err_on(ret &&
			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
			     !bch2_journal_error(j), c,
			     "error flushing key cache: %s", bch2_err_str(ret));
	if (ret)
		goto out;

	bch2_journal_pin_drop(j, &ck->journal);

	struct btree_path *path = btree_iter_path(trans, &c_iter);
	BUG_ON(!btree_node_locked(path, 0));

	if (!evict) {
		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}
	} else {
		struct btree_path *path2;
		unsigned i;
evict:
		trans_for_each_path(trans, path2, i)
			if (path2 != path)
				__bch2_btree_path_unlock(trans, path2);

		bch2_btree_node_lock_write_nofail(trans, path, &ck->c);

		if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
			clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
			atomic_long_dec(&c->btree_key_cache.nr_dirty);
		}

		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
		bkey_cached_evict(&c->btree_key_cache, ck);
		bkey_cached_free_fast(&c->btree_key_cache, ck);
	}
out:
	bch2_trans_iter_exit(trans, &b_iter);
	bch2_trans_iter_exit(trans, &c_iter);
	return ret;
}

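/*
 * Journal pin flush callback: if the entry is still dirty and pinned at @seq,
 * flush it; if the key has since been re-dirtied at a newer sequence number
 * (ck->seq != seq), just move the pin forward instead of flushing.
 */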
int bch2_btree_key_cache_journal_flush(struct journal *j,
				struct journal_entry_pin *pin, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bkey_cached *ck =
		container_of(pin, struct bkey_cached, journal);
	struct bkey_cached_key key;
	struct btree_trans *trans = bch2_trans_get(c);
	int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	int ret = 0;

	btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
	key = ck->key;

	if (ck->journal.seq != seq ||
	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}

	if (ck->seq != seq) {
		bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal,
					bch2_btree_key_cache_journal_flush);
		six_unlock_read(&ck->c.lock);
		goto unlock;
	}
	six_unlock_read(&ck->c.lock);

	ret = lockrestart_do(trans,
		btree_key_cache_flush_pos(trans, key, seq,
				BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);

	bch2_trans_put(trans);
	return ret;
}

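/*
 * Transaction commit path for updates going through the key cache: copy the
 * new key into the cached entry, mark it dirty, and pin the journal sequence
 * number the update was journalled at so journal reclaim will eventually
 * flush it back to the btree.
 */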
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
				  unsigned flags,
				  struct btree_insert_entry *insert_entry)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
	struct bkey_i *insert = insert_entry->k;
	bool kick_reclaim = false;

	BUG_ON(insert->k.u64s > ck->u64s);

	bkey_copy(ck->k, insert);
	ck->valid = true;

	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
		set_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_inc(&c->btree_key_cache.nr_dirty);

		if (bch2_nr_btree_keys_need_flush(c))
			kick_reclaim = true;
	}

	/*
	 * To minimize lock contention, we only add the journal pin here and
	 * defer pin updates to the flush callback via ->seq. Be careful not to
	 * update ->seq on nojournal commits because we don't want to update the
	 * pin to a seq that doesn't include journal updates on disk. Otherwise
	 * we risk losing the update after a crash.
	 *
	 * The only exception is if the pin is not active in the first place. We
	 * have to add the pin because journal reclaim drives key cache
	 * flushing. The flush callback will not proceed unless ->seq matches
	 * the latest pin, so make sure it starts with a consistent value.
	 */
	if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) ||
	    !journal_pin_active(&ck->journal)) {
		ck->seq = trans->journal_res.seq;
	}
	bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
			     &ck->journal, bch2_btree_key_cache_journal_flush);

	if (kick_reclaim)
		journal_reclaim_kick(&c->journal);
	return true;
}

void bch2_btree_key_cache_drop(struct btree_trans *trans,
			       struct btree_path *path)
{
	struct bch_fs *c = trans->c;
	struct bkey_cached *ck = (void *) path->l[0].b;

	BUG_ON(!ck->valid);

	/*
	 * We just did an update to the btree, bypassing the key cache: the key
	 * cache key is now stale and must be dropped, even if dirty:
	 */
	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
		clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
		atomic_long_dec(&c->btree_key_cache.nr_dirty);
		bch2_journal_pin_drop(&c->journal, &ck->journal);
	}

	ck->valid = false;
}

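/*
 * Shrinker scan: first free entries on the freelists whose SRCU grace period
 * has elapsed, then walk the hash table evicting clean entries, giving
 * recently used ones a second chance via the ACCESSED bit.
 */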
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	struct bucket_table *tbl;
	struct bkey_cached *ck, *t;
	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
	unsigned start, flags;
	int srcu_idx;

	mutex_lock(&bc->lock);
	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	flags = memalloc_nofs_save();

	/*
	 * Newest freed entries are at the end of the list - once we hit one
	 * that's too new to be freed, we can bail out:
	 */
	scanned += bc->nr_freed_nonpcpu;

	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_nonpcpu--;
	}

	if (scanned >= nr)
		goto out;

	scanned += bc->nr_freed_pcpu;

	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
						 ck->btree_trans_barrier_seq))
			break;

		list_del(&ck->list);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
		atomic_long_dec(&bc->nr_freed);
		freed++;
		bc->nr_freed_pcpu--;
	}

	if (scanned >= nr)
		goto out;

	rcu_read_lock();
	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
	if (bc->shrink_iter >= tbl->size)
		bc->shrink_iter = 0;
	start = bc->shrink_iter;

	do {
		struct rhash_head *pos, *next;

		pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));

		while (!rht_is_a_nulls(pos)) {
			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
			ck = container_of(pos, struct bkey_cached, hash);

			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
				goto next;

			if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
				clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
			else if (bkey_cached_lock_for_evict(ck)) {
				bkey_cached_evict(bc, ck);
				bkey_cached_free(bc, ck);
			}

			scanned++;
			if (scanned >= nr)
				break;
next:
			pos = next;
		}

		bc->shrink_iter++;
		if (bc->shrink_iter >= tbl->size)
			bc->shrink_iter = 0;
	} while (scanned < nr && bc->shrink_iter != start);

	rcu_read_unlock();
out:
	memalloc_nofs_restore(flags);
	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
	mutex_unlock(&bc->lock);

	return freed;
}

static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
{
	struct bch_fs *c = shrink->private_data;
	struct btree_key_cache *bc = &c->btree_key_cache;
	long nr = atomic_long_read(&bc->nr_keys) -
		atomic_long_read(&bc->nr_dirty);

	return max(0L, nr);
}

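/*
 * Filesystem shutdown: evict everything from the hash table and the percpu
 * freelists onto one list, free it all, and complain loudly if any keys were
 * leaked or are still dirty.
 */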
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct bucket_table *tbl;
	struct bkey_cached *ck, *n;
	struct rhash_head *pos;
	LIST_HEAD(items);
	unsigned i;
#ifdef __KERNEL__
	int cpu;
#endif

	shrinker_free(bc->shrink);

	mutex_lock(&bc->lock);

	/*
	 * The loop is needed to guard against racing with rehash:
	 */
	while (atomic_long_read(&bc->nr_keys)) {
		rcu_read_lock();
		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
		if (tbl)
			for (i = 0; i < tbl->size; i++)
				rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
					bkey_cached_evict(bc, ck);
					list_add(&ck->list, &items);
				}
		rcu_read_unlock();
	}

#ifdef __KERNEL__
	for_each_possible_cpu(cpu) {
		struct btree_key_cache_freelist *f =
			per_cpu_ptr(bc->pcpu_freed, cpu);

		for (i = 0; i < f->nr; i++) {
			ck = f->objs[i];
			list_add(&ck->list, &items);
		}
	}
#endif

	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);

	list_splice(&bc->freed_pcpu,	&items);
	list_splice(&bc->freed_nonpcpu,	&items);

	mutex_unlock(&bc->lock);

	list_for_each_entry_safe(ck, n, &items, list) {
		cond_resched();

		list_del(&ck->list);
		kfree(ck->k);
		six_lock_exit(&ck->c.lock);
		kmem_cache_free(bch2_key_cache, ck);
	}

	if (atomic_long_read(&bc->nr_dirty) &&
	    !bch2_journal_error(&c->journal) &&
	    test_bit(BCH_FS_was_rw, &c->flags))
		panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
		      atomic_long_read(&bc->nr_dirty));

	if (atomic_long_read(&bc->nr_keys))
		panic("btree key cache shutdown error: nr_keys nonzero (%li)\n",
		      atomic_long_read(&bc->nr_keys));

	if (bc->table_init_done)
		rhashtable_destroy(&bc->table);

	free_percpu(bc->pcpu_freed);
}

void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
{
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->freed_pcpu);
	INIT_LIST_HEAD(&c->freed_nonpcpu);
}

int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
	struct shrinker *shrink;

#ifdef __KERNEL__
	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
	if (!bc->pcpu_freed)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
#endif

	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;

	bc->table_init_done = true;

	shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
	if (!shrink)
		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
	bc->shrink = shrink;
	shrink->seeks		= 0;
	shrink->count_objects	= bch2_btree_key_cache_count;
	shrink->scan_objects	= bch2_btree_key_cache_scan;
	shrink->private_data	= c;
	shrinker_register(shrink);
	return 0;
}

void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
{
	prt_printf(out, "nr_freed:\t%lu",	atomic_long_read(&c->nr_freed));
	prt_newline(out);
	prt_printf(out, "nr_keys:\t%lu",	atomic_long_read(&c->nr_keys));
	prt_newline(out);
	prt_printf(out, "nr_dirty:\t%lu",	atomic_long_read(&c->nr_dirty));
	prt_newline(out);
}

void bch2_btree_key_cache_exit(void)
{
	kmem_cache_destroy(bch2_key_cache);
}

int __init bch2_btree_key_cache_init(void)
{
	bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT);
	if (!bch2_key_cache)
		return -ENOMEM;

	return 0;
}