   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include "bcachefs.h"
   4#include "btree_key_cache.h"
   5#include "btree_write_buffer.h"
   6#include "bkey_methods.h"
   7#include "btree_update.h"
   8#include "buckets.h"
   9#include "compress.h"
  10#include "dirent.h"
  11#include "disk_accounting.h"
  12#include "error.h"
  13#include "extents.h"
  14#include "extent_update.h"
  15#include "fs.h"
  16#include "inode.h"
  17#include "str_hash.h"
  18#include "snapshot.h"
  19#include "subvolume.h"
  20#include "varint.h"
  21
  22#include <linux/random.h>
  23
  24#include <linux/unaligned.h>
  25
  26#define x(name, ...)	#name,
  27const char * const bch2_inode_opts[] = {
  28	BCH_INODE_OPTS()
  29	NULL,
  30};
  31
  32static const char * const bch2_inode_flag_strs[] = {
  33	BCH_INODE_FLAGS()
  34	NULL
  35};
  36#undef  x
  37
  38static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
  39
  40static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
  41
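/*
 * Decode one field in the old-style (pre-varint) inode encoding: the
 * position of the highest set bit in the first byte selects the encoded
 * length via byte_table[], the marker bit is stripped, and the remaining
 * bytes are read as a big-endian value of up to 128 bits (high word in
 * out[0], low word in out[1]).  Returns the number of bytes consumed, or
 * -1 on truncated or malformed input.
 */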
  42static int inode_decode_field(const u8 *in, const u8 *end,
  43			      u64 out[2], unsigned *out_bits)
  44{
  45	__be64 be[2] = { 0, 0 };
  46	unsigned bytes, shift;
  47	u8 *p;
  48
  49	if (in >= end)
  50		return -1;
  51
  52	if (!*in)
  53		return -1;
  54
  55	/*
  56	 * position of highest set bit indicates number of bytes:
  57	 * shift = number of bits to remove in high byte:
  58	 */
  59	shift	= 8 - __fls(*in); /* 1 <= shift <= 8 */
  60	bytes	= byte_table[shift - 1];
  61
  62	if (in + bytes > end)
  63		return -1;
  64
  65	p = (u8 *) be + 16 - bytes;
  66	memcpy(p, in, bytes);
  67	*p ^= (1 << 8) >> shift;
  68
  69	out[0] = be64_to_cpu(be[0]);
  70	out[1] = be64_to_cpu(be[1]);
  71	*out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
  72
  73	return bytes;
  74}
  75
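/*
 * Pack an in-memory inode into the current on-disk format (inode_v3): the
 * fixed fields are stored directly in struct bch_inode_v3, then each field
 * from BCH_INODE_FIELDS_v3() is appended as a varint, with trailing
 * all-zero fields dropped (tracked via last_nonzero_field).  With
 * CONFIG_BCACHEFS_DEBUG the result is unpacked again and compared against
 * the original as a sanity check.
 */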
  76static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
  77					   const struct bch_inode_unpacked *inode)
  78{
  79	struct bkey_i_inode_v3 *k = &packed->inode;
  80	u8 *out = k->v.fields;
  81	u8 *end = (void *) &packed[1];
  82	u8 *last_nonzero_field = out;
  83	unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
  84	unsigned bytes;
  85	int ret;
  86
  87	bkey_inode_v3_init(&packed->inode.k_i);
  88	packed->inode.k.p.offset	= inode->bi_inum;
  89	packed->inode.v.bi_journal_seq	= cpu_to_le64(inode->bi_journal_seq);
  90	packed->inode.v.bi_hash_seed	= inode->bi_hash_seed;
  91	packed->inode.v.bi_flags	= cpu_to_le64(inode->bi_flags);
  92	packed->inode.v.bi_sectors	= cpu_to_le64(inode->bi_sectors);
  93	packed->inode.v.bi_size		= cpu_to_le64(inode->bi_size);
  94	packed->inode.v.bi_version	= cpu_to_le64(inode->bi_version);
  95	SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
  96	SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);
  97
  99#define x(_name, _bits)							\
 100	nr_fields++;							\
 101									\
 102	if (inode->_name) {						\
 103		ret = bch2_varint_encode_fast(out, inode->_name);	\
 104		out += ret;						\
 105									\
 106		if (_bits > 64)						\
 107			*out++ = 0;					\
 108									\
 109		last_nonzero_field = out;				\
 110		last_nonzero_fieldnr = nr_fields;			\
 111	} else {							\
 112		*out++ = 0;						\
 113									\
 114		if (_bits > 64)						\
 115			*out++ = 0;					\
 116	}
 117
 118	BCH_INODE_FIELDS_v3()
 119#undef  x
 120	BUG_ON(out > end);
 121
 122	out = last_nonzero_field;
 123	nr_fields = last_nonzero_fieldnr;
 124
 125	bytes = out - (u8 *) &packed->inode.v;
 126	set_bkey_val_bytes(&packed->inode.k, bytes);
 127	memset_u64s_tail(&packed->inode.v, 0, bytes);
 128
 129	SET_INODEv3_NR_FIELDS(&k->v, nr_fields);
 130
 131	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
 132		struct bch_inode_unpacked unpacked;
 133
 134		ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
 135		BUG_ON(ret);
 136		BUG_ON(unpacked.bi_inum		!= inode->bi_inum);
 137		BUG_ON(unpacked.bi_hash_seed	!= inode->bi_hash_seed);
 138		BUG_ON(unpacked.bi_sectors	!= inode->bi_sectors);
 139		BUG_ON(unpacked.bi_size		!= inode->bi_size);
 140		BUG_ON(unpacked.bi_version	!= inode->bi_version);
 141		BUG_ON(unpacked.bi_mode		!= inode->bi_mode);
 142
 143#define x(_name, _bits)	if (unpacked._name != inode->_name)		\
 144			panic("unpacked %llu should be %llu",		\
 145			      (u64) unpacked._name, (u64) inode->_name);
 146		BCH_INODE_FIELDS_v3()
 147#undef  x
 148	}
 149}
 150
 151void bch2_inode_pack(struct bkey_inode_buf *packed,
 152		     const struct bch_inode_unpacked *inode)
 153{
 154	bch2_inode_pack_inlined(packed, inode);
 155}
 156
 157static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
 158				struct bch_inode_unpacked *unpacked)
 159{
 160	const u8 *in = inode.v->fields;
 161	const u8 *end = bkey_val_end(inode);
 162	u64 field[2];
 163	unsigned fieldnr = 0, field_bits;
 164	int ret;
 165
 166#define x(_name, _bits)							\
 167	if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) {			\
 168		unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
 169		memset((void *) unpacked + offset, 0,			\
 170		       sizeof(*unpacked) - offset);			\
 171		return 0;						\
 172	}								\
 173									\
 174	ret = inode_decode_field(in, end, field, &field_bits);		\
 175	if (ret < 0)							\
 176		return ret;						\
 177									\
 178	if (field_bits > sizeof(unpacked->_name) * 8)			\
 179		return -1;						\
 180									\
 181	unpacked->_name = field[1];					\
 182	in += ret;
 183
 184	BCH_INODE_FIELDS_v2()
 185#undef  x
 186
 187	/* XXX: signal if there were more fields than expected? */
 188	return 0;
 189}
 190
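/*
 * The newer encodings (v2/v3) store each field from the field list as one
 * varint (two for fields wider than 64 bits); fields beyond nr_fields
 * decode as zero.  A decoded value that doesn't fit in the corresponding
 * member of bch_inode_unpacked is reported as an error.
 */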
 191static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
 192				const u8 *in, const u8 *end,
 193				unsigned nr_fields)
 194{
 195	unsigned fieldnr = 0;
 196	int ret;
 197	u64 v[2];
 198
 199#define x(_name, _bits)							\
 200	if (fieldnr < nr_fields) {					\
 201		ret = bch2_varint_decode_fast(in, end, &v[0]);		\
 202		if (ret < 0)						\
 203			return ret;					\
 204		in += ret;						\
 205									\
 206		if (_bits > 64) {					\
 207			ret = bch2_varint_decode_fast(in, end, &v[1]);	\
 208			if (ret < 0)					\
 209				return ret;				\
 210			in += ret;					\
 211		} else {						\
 212			v[1] = 0;					\
 213		}							\
 214	} else {							\
 215		v[0] = v[1] = 0;					\
 216	}								\
 217									\
 218	unpacked->_name = v[0];						\
 219	if (v[1] || v[0] != unpacked->_name)				\
 220		return -1;						\
 221	fieldnr++;
 222
 223	BCH_INODE_FIELDS_v2()
 224#undef  x
 225
 226	/* XXX: signal if there were more fields than expected? */
 227	return 0;
 228}
 229
 230static int bch2_inode_unpack_v3(struct bkey_s_c k,
 231				struct bch_inode_unpacked *unpacked)
 232{
 233	struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
 234	const u8 *in = inode.v->fields;
 235	const u8 *end = bkey_val_end(inode);
 236	unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
 237	unsigned fieldnr = 0;
 238	int ret;
 239	u64 v[2];
 240
 241	unpacked->bi_inum	= inode.k->p.offset;
 242	unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
 243	unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
 244	unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
 245	unpacked->bi_sectors	= le64_to_cpu(inode.v->bi_sectors);
 246	unpacked->bi_size	= le64_to_cpu(inode.v->bi_size);
 247	unpacked->bi_version	= le64_to_cpu(inode.v->bi_version);
 248	unpacked->bi_mode	= INODEv3_MODE(inode.v);
 249
 250#define x(_name, _bits)							\
 251	if (fieldnr < nr_fields) {					\
 252		ret = bch2_varint_decode_fast(in, end, &v[0]);		\
 253		if (ret < 0)						\
 254			return ret;					\
 255		in += ret;						\
 256									\
 257		if (_bits > 64) {					\
 258			ret = bch2_varint_decode_fast(in, end, &v[1]);	\
 259			if (ret < 0)					\
 260				return ret;				\
 261			in += ret;					\
 262		} else {						\
 263			v[1] = 0;					\
 264		}							\
 265	} else {							\
 266		v[0] = v[1] = 0;					\
 267	}								\
 268									\
 269	unpacked->_name = v[0];						\
 270	if (v[1] || v[0] != unpacked->_name)				\
 271		return -1;						\
 272	fieldnr++;
 273
 274	BCH_INODE_FIELDS_v3()
 275#undef  x
 276
 277	/* XXX: signal if there were more fields than expected? */
 278	return 0;
 279}
 280
 281static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
 282					       struct bch_inode_unpacked *unpacked)
 283{
 284	memset(unpacked, 0, sizeof(*unpacked));
 285
 286	unpacked->bi_snapshot = k.k->p.snapshot;
 287
 288	switch (k.k->type) {
 289	case KEY_TYPE_inode: {
 290		struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
 291
 292		unpacked->bi_inum	= inode.k->p.offset;
 293		unpacked->bi_journal_seq= 0;
 294		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
 295		unpacked->bi_flags	= le32_to_cpu(inode.v->bi_flags);
 296		unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);
 297
 298		if (INODEv1_NEW_VARINT(inode.v)) {
 299			return bch2_inode_unpack_v2(unpacked, inode.v->fields,
 300						    bkey_val_end(inode),
 301						    INODEv1_NR_FIELDS(inode.v));
 302		} else {
 303			return bch2_inode_unpack_v1(inode, unpacked);
 304		}
 305		break;
 306	}
 307	case KEY_TYPE_inode_v2: {
 308		struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
 309
 310		unpacked->bi_inum	= inode.k->p.offset;
 311		unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
 312		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
 313		unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
 314		unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);
 315
 316		return bch2_inode_unpack_v2(unpacked, inode.v->fields,
 317					    bkey_val_end(inode),
 318					    INODEv2_NR_FIELDS(inode.v));
 319	}
 320	default:
 321		BUG();
 322	}
 323}
 324
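/* Fast path for the current KEY_TYPE_inode_v3 format; older formats take the slowpath: */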
 325int bch2_inode_unpack(struct bkey_s_c k,
 326		      struct bch_inode_unpacked *unpacked)
 327{
 328	unpacked->bi_snapshot = k.k->p.snapshot;
 329
 330	return likely(k.k->type == KEY_TYPE_inode_v3)
 331		? bch2_inode_unpack_v3(k, unpacked)
 332		: bch2_inode_unpack_slowpath(k, unpacked);
 333}
 334
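/*
 * Look up and unpack an inode by subvolume:inum: the subvolume is first
 * resolved to a snapshot ID, then the inode is fetched via a cached btree
 * iterator.  On success the iterator is left positioned for a subsequent
 * update; on error it is exited.  @warn controls whether lookup failures
 * are logged.
 */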
 335int __bch2_inode_peek(struct btree_trans *trans,
 336		      struct btree_iter *iter,
 337		      struct bch_inode_unpacked *inode,
 338		      subvol_inum inum, unsigned flags,
 339		      bool warn)
 340{
 341	u32 snapshot;
 342	int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
 343	if (ret)
 344		return ret;
 345
 346	struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
 347					       SPOS(0, inum.inum, snapshot),
 348					       flags|BTREE_ITER_cached);
 349	ret = bkey_err(k);
 350	if (ret)
 351		return ret;
 352
 353	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
 354	if (ret)
 355		goto err;
 356
 357	ret = bch2_inode_unpack(k, inode);
 358	if (ret)
 359		goto err;
 360
 361	return 0;
 362err:
 363	if (warn)
 364		bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
 365	bch2_trans_iter_exit(trans, iter);
 366	return ret;
 367}
 368
 369int bch2_inode_write_flags(struct btree_trans *trans,
 370		     struct btree_iter *iter,
 371		     struct bch_inode_unpacked *inode,
 372		     enum btree_iter_update_trigger_flags flags)
 373{
 374	struct bkey_inode_buf *inode_p;
 375
 376	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
 377	if (IS_ERR(inode_p))
 378		return PTR_ERR(inode_p);
 379
 380	bch2_inode_pack_inlined(inode_p, inode);
 381	inode_p->inode.k.p.snapshot = iter->snapshot;
 382	return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
 383}
 384
 385int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
 386{
 387	struct bkey_inode_buf *inode_p =
 388		bch2_trans_kmalloc(trans, sizeof(*inode_p));
 389
 390	if (IS_ERR(inode_p))
 391		return PTR_ERR(inode_p);
 392
 393	bch2_inode_pack(inode_p, inode);
 394	inode_p->inode.k.p.snapshot = inode->bi_snapshot;
 395
 396	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
 397				&inode_p->inode.k_i,
 398				BTREE_UPDATE_internal_snapshot_node);
 399}
 400
 401int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
 402{
 403	int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 404			    __bch2_fsck_write_inode(trans, inode));
 405	bch_err_fn(trans->c, ret);
 406	return ret;
 407}
 408
 409struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
 410{
 411	struct bch_inode_unpacked u;
 412	struct bkey_inode_buf *inode_p;
 413	int ret;
 414
 415	if (!bkey_is_inode(&k->k))
 416		return ERR_PTR(-ENOENT);
 417
 418	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
 419	if (IS_ERR(inode_p))
 420		return ERR_CAST(inode_p);
 421
 422	ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
 423	if (ret)
 424		return ERR_PTR(ret);
 425
 426	bch2_inode_pack(inode_p, &u);
 427	return &inode_p->inode.k_i;
 428}
 429
 430static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
 431				 enum bch_validate_flags flags)
 432{
 433	struct bch_inode_unpacked unpacked;
 434	int ret = 0;
 435
 436	bkey_fsck_err_on(k.k->p.inode,
 437			 c, inode_pos_inode_nonzero,
 438			 "nonzero k.p.inode");
 439
 440	bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX,
 441			 c, inode_pos_blockdev_range,
 442			 "fs inode in blockdev range");
 443
 444	bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked),
 445			 c, inode_unpack_error,
 446			 "invalid variable length fields");
 447
 448	bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1,
 449			 c, inode_checksum_type_invalid,
 450			 "invalid data checksum type (%u >= %u)",
 451			 unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
 452
 453	bkey_fsck_err_on(unpacked.bi_compression &&
 454			 !bch2_compression_opt_valid(unpacked.bi_compression - 1),
 455			 c, inode_compression_type_invalid,
 456			 "invalid compression opt %u", unpacked.bi_compression - 1);
 457
 458	bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
 459			 unpacked.bi_nlink != 0,
 460			 c, inode_unlinked_but_nlink_nonzero,
 461			 "flagged as unlinked but bi_nlink != 0");
 462
 463	bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode),
 464			 c, inode_subvol_root_but_not_dir,
 465			 "subvolume root but not a directory");
 466fsck_err:
 467	return ret;
 468}
 469
 470int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
 471			enum bch_validate_flags flags)
 472{
 473	struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
 474	int ret = 0;
 475
 476	bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
 477			 c, inode_str_hash_invalid,
 478			 "invalid str hash type (%llu >= %u)",
 479			 INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
 480
 481	ret = __bch2_inode_validate(c, k, flags);
 482fsck_err:
 483	return ret;
 484}
 485
 486int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k,
 487			   enum bch_validate_flags flags)
 488{
 489	struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
 490	int ret = 0;
 491
 492	bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
 493			 c, inode_str_hash_invalid,
 494			 "invalid str hash type (%llu >= %u)",
 495			 INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
 496
 497	ret = __bch2_inode_validate(c, k, flags);
 498fsck_err:
 499	return ret;
 500}
 501
 502int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k,
 503			   enum bch_validate_flags flags)
 504{
 505	struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
 506	int ret = 0;
 507
 508	bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
 509			 INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k),
 510			 c, inode_v3_fields_start_bad,
 511			 "invalid fields_start (got %llu, min %u max %zu)",
 512			 INODEv3_FIELDS_START(inode.v),
 513			 INODEv3_FIELDS_START_INITIAL,
 514			 bkey_val_u64s(inode.k));
 515
 516	bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
 517			 c, inode_str_hash_invalid,
 518			 "invalid str hash type (%llu >= %u)",
 519			 INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
 520
 521	ret = __bch2_inode_validate(c, k, flags);
 522fsck_err:
 523	return ret;
 524}
 525
 526static void __bch2_inode_unpacked_to_text(struct printbuf *out,
 527					  struct bch_inode_unpacked *inode)
 528{
 529	prt_printf(out, "\n");
 530	printbuf_indent_add(out, 2);
 531	prt_printf(out, "mode=%o\n", inode->bi_mode);
 532
 533	prt_str(out, "flags=");
 534	prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
 535	prt_printf(out, "(%x)\n", inode->bi_flags);
 536
 537	prt_printf(out, "journal_seq=%llu\n",	inode->bi_journal_seq);
 538	prt_printf(out, "hash_seed=%llx\n",	inode->bi_hash_seed);
 539	prt_printf(out, "hash_type=");
 540	bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
 541	prt_newline(out);
 542	prt_printf(out, "bi_size=%llu\n",	inode->bi_size);
 543	prt_printf(out, "bi_sectors=%llu\n",	inode->bi_sectors);
 544	prt_printf(out, "bi_version=%llu\n",	inode->bi_version);
 545
 546#define x(_name, _bits)						\
 547	prt_printf(out, #_name "=%llu\n", (u64) inode->_name);
 548	BCH_INODE_FIELDS_v3()
 549#undef  x
 550
 551	bch2_printbuf_strip_trailing_newline(out);
 552	printbuf_indent_sub(out, 2);
 553}
 554
 555void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
 556{
 557	prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
 558	__bch2_inode_unpacked_to_text(out, inode);
 559}
 560
 561void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
 562{
 563	struct bch_inode_unpacked inode;
 564
 565	if (bch2_inode_unpack(k, &inode)) {
 566		prt_printf(out, "(unpack error)");
 567		return;
 568	}
 569
 570	__bch2_inode_unpacked_to_text(out, &inode);
 571}
 572
 573static inline u64 bkey_inode_flags(struct bkey_s_c k)
 574{
 575	switch (k.k->type) {
 576	case KEY_TYPE_inode:
 577		return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
 578	case KEY_TYPE_inode_v2:
 579		return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
 580	case KEY_TYPE_inode_v3:
 581		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
 582	default:
 583		return 0;
 584	}
 585}
 586
 587static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
 588{
 589	switch (k.k->type) {
 590	case KEY_TYPE_inode:
 591		bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
 592		return;
 593	case KEY_TYPE_inode_v2:
 594		bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
 595		return;
 596	case KEY_TYPE_inode_v3:
 597		bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
 598		return;
 599	default:
 600		BUG();
 601	}
 602}
 603
 604static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
 605{
 606	unsigned f = bkey_inode_flags(k);
 607
 608	return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
 609}
 610
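/*
 * Find the next key at the same inode number in an ancestor snapshot of
 * pos.snapshot, if any.  The inode variant below additionally skips keys
 * that aren't inodes until an actual inode (or nothing) is found.
 */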
 611static struct bkey_s_c
 612bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
 613				   enum btree_id btree, struct bpos pos,
 614				   unsigned flags)
 615{
 616	struct bch_fs *c = trans->c;
 617	struct bkey_s_c k;
 618	int ret = 0;
 619
 620	for_each_btree_key_upto_norestart(trans, *iter, btree,
 621					  bpos_successor(pos),
 622					  SPOS(pos.inode, pos.offset, U32_MAX),
 623					  flags|BTREE_ITER_all_snapshots, k, ret)
 624		if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
 625			return k;
 626
 627	bch2_trans_iter_exit(trans, iter);
 628	return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
 629}
 630
 631static struct bkey_s_c
 632bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
 633				    struct bpos pos, unsigned flags)
 634{
 635	struct bkey_s_c k;
 636again:
 637	k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
 638	if (!k.k ||
 639	    bkey_err(k) ||
 640	    bkey_is_inode(k.k))
 641		return k;
 642
 643	bch2_trans_iter_exit(trans, iter);
 644	pos = k.k->p;
 645	goto again;
 646}
 647
 648int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
 649{
 650	struct bch_fs *c = trans->c;
 651	struct btree_iter iter;
 652	struct bkey_s_c k;
 653	int ret = 0;
 654
 655	for_each_btree_key_upto_norestart(trans, iter,
 656			BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
 657			BTREE_ITER_all_snapshots|
 658			BTREE_ITER_with_updates, k, ret)
 659		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
 660		    bkey_is_inode(k.k)) {
 661			ret = 1;
 662			break;
 663		}
 664	bch2_trans_iter_exit(trans, &iter);
 665	return ret;
 666}
 667
 668static int update_inode_has_children(struct btree_trans *trans,
 669				     struct bkey_s k,
 670				     bool have_child)
 671{
 672	if (!have_child) {
 673		int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
 674		if (ret)
 675			return ret < 0 ? ret : 0;
 676	}
 677
 678	u64 f = bkey_inode_flags(k.s_c);
 679	if (have_child != !!(f & BCH_INODE_has_child_snapshot))
 680		bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
 681
 682	return 0;
 683}
 684
 685static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
 686					    bool have_child)
 687{
 688	struct btree_iter iter;
 689	struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
 690						&iter, pos, BTREE_ITER_with_updates);
 691	int ret = bkey_err(k);
 692	if (ret)
 693		return ret;
 694	if (!k.k)
 695		return 0;
 696
 697	if (!have_child) {
 698		ret = bch2_inode_has_child_snapshots(trans, k.k->p);
 699		if (ret) {
 700			ret = ret < 0 ? ret : 0;
 701			goto err;
 702		}
 703	}
 704
 705	u64 f = bkey_inode_flags(k);
 706	if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
 707		struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
 708					     BTREE_UPDATE_internal_snapshot_node);
 709		ret = PTR_ERR_OR_ZERO(update);
 710		if (ret)
 711			goto err;
 712
 713		bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
 714	}
 715err:
 716	bch2_trans_iter_exit(trans, &iter);
 717	return ret;
 718}
 719
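/*
 * Inode trigger: at commit time (atomic insert) the inode's bi_journal_seq
 * is stamped with the journal sequence of the commit; transactional and gc
 * triggers keep the nr_inodes accounting counter up to date; transactional
 * triggers also maintain the deleted_inodes btree for unlinked inodes and
 * the has_child_snapshot flag on this inode and on inodes in parent
 * snapshots.
 */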
 720int bch2_trigger_inode(struct btree_trans *trans,
 721		       enum btree_id btree_id, unsigned level,
 722		       struct bkey_s_c old,
 723		       struct bkey_s new,
 724		       enum btree_iter_update_trigger_flags flags)
 725{
 726	struct bch_fs *c = trans->c;
 727
 728	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
 729		BUG_ON(!trans->journal_res.seq);
 730		bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
 731	}
 732
 733	s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
 734	if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
 735		struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
 736		int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
 737		if (ret)
 738			return ret;
 739	}
 740
 741	if (flags & BTREE_TRIGGER_transactional) {
 742		int unlinked_delta =	(int) bkey_is_unlinked_inode(new.s_c) -
 743					(int) bkey_is_unlinked_inode(old);
 744		if (unlinked_delta) {
 745			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
 746							      new.k->p, unlinked_delta > 0);
 747			if (ret)
 748				return ret;
 749		}
 750
 751		/*
 752		 * If we're creating or deleting an inode at this snapshot ID,
 753		 * and there might be an inode in a parent snapshot ID, we might
 754		 * need to set or clear the has_child_snapshot flag on the
 755		 * parent.
 756		 */
 757		int deleted_delta = (int) bkey_is_inode(new.k) -
 758				    (int) bkey_is_inode(old.k);
 759		if (deleted_delta &&
 760		    bch2_snapshot_parent(c, new.k->p.snapshot)) {
 761			int ret = update_parent_inode_has_children(trans, new.k->p,
 762								   deleted_delta > 0);
 763			if (ret)
 764				return ret;
 765		}
 766
 767		/*
 768		 * When an inode is first updated in a new snapshot, we may need
 769		 * to clear has_child_snapshot
 770		 */
 771		if (deleted_delta > 0) {
 772			int ret = update_inode_has_children(trans, new, false);
 773			if (ret)
 774				return ret;
 775		}
 776	}
 777
 778	return 0;
 779}
 780
 781int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k,
 782				   enum bch_validate_flags flags)
 783{
 784	int ret = 0;
 785
 786	bkey_fsck_err_on(k.k->p.inode,
 787			 c, inode_pos_inode_nonzero,
 788			 "nonzero k.p.inode");
 789fsck_err:
 790	return ret;
 791}
 792
 793void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
 794				   struct bkey_s_c k)
 795{
 796	struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
 797
 798	prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
 799}
 800
 801void bch2_inode_init_early(struct bch_fs *c,
 802			   struct bch_inode_unpacked *inode_u)
 803{
 804	enum bch_str_hash_type str_hash =
 805		bch2_str_hash_opt_to_type(c, c->opts.str_hash);
 806
 807	memset(inode_u, 0, sizeof(*inode_u));
 808
 809	SET_INODE_STR_HASH(inode_u, str_hash);
 810	get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
 811}
 812
 813void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
 814			  uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
 815			  struct bch_inode_unpacked *parent)
 816{
 817	inode_u->bi_mode	= mode;
 818	inode_u->bi_uid		= uid;
 819	inode_u->bi_gid		= gid;
 820	inode_u->bi_dev		= rdev;
 821	inode_u->bi_atime	= now;
 822	inode_u->bi_mtime	= now;
 823	inode_u->bi_ctime	= now;
 824	inode_u->bi_otime	= now;
 825
 826	if (parent && parent->bi_mode & S_ISGID) {
 827		inode_u->bi_gid = parent->bi_gid;
 828		if (S_ISDIR(mode))
 829			inode_u->bi_mode |= S_ISGID;
 830	}
 831
 832	if (parent) {
 833#define x(_name, ...)	inode_u->bi_##_name = parent->bi_##_name;
 834		BCH_INODE_OPTS()
 835#undef x
 836	}
 837}
 838
 839void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
 840		     uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
 841		     struct bch_inode_unpacked *parent)
 842{
 843	bch2_inode_init_early(c, inode_u);
 844	bch2_inode_init_late(inode_u, bch2_current_time(c),
 845			     uid, gid, mode, rdev, parent);
 846}
 847
 848static inline u32 bkey_generation(struct bkey_s_c k)
 849{
 850	switch (k.k->type) {
 851	case KEY_TYPE_inode:
 852	case KEY_TYPE_inode_v2:
 853		BUG();
 854	case KEY_TYPE_inode_generation:
 855		return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
 856	default:
 857		return 0;
 858	}
 859}
 860
 861/*
 862 * This just finds an empty slot:
 863 */
 864int bch2_inode_create(struct btree_trans *trans,
 865		      struct btree_iter *iter,
 866		      struct bch_inode_unpacked *inode_u,
 867		      u32 snapshot, u64 cpu)
 868{
 869	struct bch_fs *c = trans->c;
 870	struct bkey_s_c k;
 871	u64 min, max, start, pos, *hint;
 872	int ret = 0;
 873	unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
 874
 875	if (c->opts.shard_inode_numbers) {
 876		bits -= c->inode_shard_bits;
 877
 878		min = (cpu << bits);
 879		max = (cpu << bits) | ~(ULLONG_MAX << bits);
 880
 881		min = max_t(u64, min, BLOCKDEV_INODE_MAX);
 882		hint = c->unused_inode_hints + cpu;
 883	} else {
 884		min = BLOCKDEV_INODE_MAX;
 885		max = ~(ULLONG_MAX << bits);
 886		hint = c->unused_inode_hints;
 887	}
 888
 889	start = READ_ONCE(*hint);
 890
 891	if (start >= max || start < min)
 892		start = min;
 893
 894	pos = start;
 895	bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
 896			     BTREE_ITER_all_snapshots|
 897			     BTREE_ITER_intent);
 898again:
 899	while ((k = bch2_btree_iter_peek(iter)).k &&
 900	       !(ret = bkey_err(k)) &&
 901	       bkey_lt(k.k->p, POS(0, max))) {
 902		if (pos < iter->pos.offset)
 903			goto found_slot;
 904
 905		/*
 906		 * We don't need to iterate over keys in every snapshot once
 907		 * we've found just one:
 908		 */
 909		pos = iter->pos.offset + 1;
 910		bch2_btree_iter_set_pos(iter, POS(0, pos));
 911	}
 912
 913	if (!ret && pos < max)
 914		goto found_slot;
 915
 916	if (!ret && start == min)
 917		ret = -BCH_ERR_ENOSPC_inode_create;
 918
 919	if (ret) {
 920		bch2_trans_iter_exit(trans, iter);
 921		return ret;
 922	}
 923
 924	/* Retry from start */
 925	pos = start = min;
 926	bch2_btree_iter_set_pos(iter, POS(0, pos));
 927	goto again;
 928found_slot:
 929	bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
 930	k = bch2_btree_iter_peek_slot(iter);
 931	ret = bkey_err(k);
 932	if (ret) {
 933		bch2_trans_iter_exit(trans, iter);
 934		return ret;
 935	}
 936
 937	*hint			= k.k->p.offset;
 938	inode_u->bi_inum	= k.k->p.offset;
 939	inode_u->bi_generation	= bkey_generation(k);
 940	return 0;
 941}
 942
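/*
 * Delete every key belonging to this inode in the given btree, committing
 * after each deletion and restarting cleanly on transaction restarts.
 */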
 943static int bch2_inode_delete_keys(struct btree_trans *trans,
 944				  subvol_inum inum, enum btree_id id)
 945{
 946	struct btree_iter iter;
 947	struct bkey_s_c k;
 948	struct bkey_i delete;
 949	struct bpos end = POS(inum.inum, U64_MAX);
 950	u32 snapshot;
 951	int ret = 0;
 952
 953	/*
 954	 * We're never going to be deleting partial extents, no need to use an
 955	 * extent iterator:
 956	 */
 957	bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
 958			     BTREE_ITER_intent);
 959
 960	while (1) {
 961		bch2_trans_begin(trans);
 962
 963		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 964		if (ret)
 965			goto err;
 966
 967		bch2_btree_iter_set_snapshot(&iter, snapshot);
 968
 969		k = bch2_btree_iter_peek_upto(&iter, end);
 970		ret = bkey_err(k);
 971		if (ret)
 972			goto err;
 973
 974		if (!k.k)
 975			break;
 976
 977		bkey_init(&delete.k);
 978		delete.k.p = iter.pos;
 979
 980		if (iter.flags & BTREE_ITER_is_extents)
 981			bch2_key_resize(&delete.k,
 982					bpos_min(end, k.k->p).offset -
 983					iter.pos.offset);
 984
 985		ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
 986		      bch2_trans_commit(trans, NULL, NULL,
 987					BCH_TRANS_COMMIT_no_enospc);
 988err:
 989		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
 990			break;
 991	}
 992
 993	bch2_trans_iter_exit(trans, &iter);
 994	return ret;
 995}
 996
 997int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 998{
 999	struct btree_trans *trans = bch2_trans_get(c);
1000	struct btree_iter iter = { NULL };
1001	struct bkey_i_inode_generation delete;
1002	struct bch_inode_unpacked inode_u;
1003	struct bkey_s_c k;
1004	u32 snapshot;
1005	int ret;
1006
1007	/*
1008	 * If this was a directory, there shouldn't be any real dirents left -
1009	 * but there could be whiteouts (from hash collisions) that we should
1010	 * delete:
1011	 *
1012	 * XXX: the dirent code ideally would delete whiteouts when they're no
1013	 * longer needed
1014	 */
1015	ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
1016		bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
1017		bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
1018	if (ret)
1019		goto err;
1020retry:
1021	bch2_trans_begin(trans);
1022
1023	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
1024	if (ret)
1025		goto err;
1026
1027	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
1028			       SPOS(0, inum.inum, snapshot),
1029			       BTREE_ITER_intent|BTREE_ITER_cached);
1030	ret = bkey_err(k);
1031	if (ret)
1032		goto err;
1033
1034	if (!bkey_is_inode(k.k)) {
1035		bch2_fs_inconsistent(c,
1036				     "inode %llu:%u not found when deleting",
1037				     inum.inum, snapshot);
1038		ret = -EIO;
1039		goto err;
1040	}
1041
1042	bch2_inode_unpack(k, &inode_u);
1043
1044	bkey_inode_generation_init(&delete.k_i);
1045	delete.k.p = iter.pos;
1046	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
1047
1048	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
1049		bch2_trans_commit(trans, NULL, NULL,
1050				BCH_TRANS_COMMIT_no_enospc);
1051err:
1052	bch2_trans_iter_exit(trans, &iter);
1053	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1054		goto retry;
1055
1056	if (ret)
1057		goto err2;
1058
1059	ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
1060err2:
1061	bch2_trans_put(trans);
1062	return ret;
1063}
1064
1065int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
1066				  subvol_inum inum,
1067				  struct bch_inode_unpacked *inode)
1068{
1069	struct btree_iter iter;
1070	int ret;
1071
1072	ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
1073	if (!ret)
1074		bch2_trans_iter_exit(trans, &iter);
1075	return ret;
1076}
1077
1078int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
1079				  subvol_inum inum,
1080				  struct bch_inode_unpacked *inode)
1081{
1082	struct btree_iter iter;
1083	int ret;
1084
1085	ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
1086	if (!ret)
1087		bch2_trans_iter_exit(trans, &iter);
1088	return ret;
1089}
1090
1091int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
1092			    struct bch_inode_unpacked *inode)
1093{
1094	return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
1095}
1096
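/*
 * Link count handling: a link count of zero is represented by the
 * BCH_INODE_unlinked flag rather than by bi_nlink itself, so incrementing
 * from zero clears the flag and decrementing to zero sets it.
 */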
1097int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
1098{
1099	if (bi->bi_flags & BCH_INODE_unlinked)
1100		bi->bi_flags &= ~BCH_INODE_unlinked;
1101	else {
1102		if (bi->bi_nlink == U32_MAX)
1103			return -EINVAL;
1104
1105		bi->bi_nlink++;
1106	}
1107
1108	return 0;
1109}
1110
1111void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
1112{
1113	if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
1114		bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
1115					bi->bi_inum);
1116		return;
1117	}
1118
1119	if (bi->bi_flags & BCH_INODE_unlinked) {
1120		bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
1121		return;
1122	}
1123
1124	if (bi->bi_nlink)
1125		bi->bi_nlink--;
1126	else
1127		bi->bi_flags |= BCH_INODE_unlinked;
1128}
1129
1130struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
1131{
1132	struct bch_opts ret = { 0 };
1133#define x(_name, _bits)							\
1134	if (inode->bi_##_name)						\
1135		opt_set(ret, _name, inode->bi_##_name - 1);
1136	BCH_INODE_OPTS()
1137#undef x
1138	return ret;
1139}
1140
1141void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
1142			 struct bch_inode_unpacked *inode)
1143{
1144#define x(_name, _bits)		opts->_name = inode_opt_get(c, inode, _name);
1145	BCH_INODE_OPTS()
1146#undef x
1147
1148	if (opts->nocow)
1149		opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
1150}
1151
1152int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
1153{
1154	struct bch_inode_unpacked inode;
1155	int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
1156
1157	if (ret)
1158		return ret;
1159
1160	bch2_inode_opts_get(opts, trans->c, &inode);
1161	return 0;
1162}
1163
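/*
 * Remove an inode at a specific snapshot ID: delete its extents, dirents
 * and xattrs in that snapshot, then replace the inode key with an
 * inode_generation key so the generation number is preserved if the inode
 * number is reused.
 */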
1164static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
1165{
1166	struct bch_fs *c = trans->c;
1167	struct btree_iter iter = { NULL };
1168	struct bkey_i_inode_generation delete;
1169	struct bch_inode_unpacked inode_u;
1170	struct bkey_s_c k;
1171	int ret;
1172
1173	do {
1174		ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1175						      SPOS(inum, 0, snapshot),
1176						      SPOS(inum, U64_MAX, snapshot),
1177						      0, NULL) ?:
1178			bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
1179						      SPOS(inum, 0, snapshot),
1180						      SPOS(inum, U64_MAX, snapshot),
1181						      0, NULL) ?:
1182			bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
1183						      SPOS(inum, 0, snapshot),
1184						      SPOS(inum, U64_MAX, snapshot),
1185						      0, NULL);
1186	} while (ret == -BCH_ERR_transaction_restart_nested);
1187	if (ret)
1188		goto err;
1189retry:
1190	bch2_trans_begin(trans);
1191
1192	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
1193			       SPOS(0, inum, snapshot), BTREE_ITER_intent);
1194	ret = bkey_err(k);
1195	if (ret)
1196		goto err;
1197
1198	if (!bkey_is_inode(k.k)) {
1199		bch2_fs_inconsistent(c,
1200				     "inode %llu:%u not found when deleting",
1201				     inum, snapshot);
1202		ret = -EIO;
1203		goto err;
1204	}
1205
1206	bch2_inode_unpack(k, &inode_u);
1207
1208	/* Subvolume root? */
1209	if (inode_u.bi_subvol)
1210		bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
1211
1212	bkey_inode_generation_init(&delete.k_i);
1213	delete.k.p = iter.pos;
1214	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
1215
1216	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
1217		bch2_trans_commit(trans, NULL, NULL,
1218				BCH_TRANS_COMMIT_no_enospc);
1219err:
1220	bch2_trans_iter_exit(trans, &iter);
1221	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1222		goto retry;
1223
1224	return ret ?: -BCH_ERR_transaction_restart_nested;
1225}
1226
1227/*
1228 * After deleting an inode, there may be versions in older snapshots that should
1229 * also be deleted - if they're not referenced by sibling snapshots and not open
1230 * in other subvolumes:
1231 */
1232static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
1233{
1234	struct btree_iter iter;
1235	struct bkey_s_c k;
1236	int ret;
1237next_parent:
1238	ret = lockrestart_do(trans,
1239		bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
1240	if (ret || !k.k)
1241		return ret;
1242
1243	bool unlinked = bkey_is_unlinked_inode(k);
1244	pos = k.k->p;
1245	bch2_trans_iter_exit(trans, &iter);
1246
1247	if (!unlinked)
1248		return 0;
1249
1250	ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
1251	if (ret)
1252		return ret < 0 ? ret : 0;
1253
1254	ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
1255	if (ret)
1256		return ret;
1257	goto next_parent;
1258}
1259
1260int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
1261{
1262	return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
1263		delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
1264}
1265
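/*
 * Check whether an entry in the deleted_inodes btree still refers to an
 * inode that should be removed: the inode must exist, be unlinked, have no
 * child snapshots, and (if a directory) be empty.  Stale or incorrect
 * entries are repaired or dropped via fsck_err.  Returns 1 if the inode
 * should be deleted, 0 to skip it, or a negative error code.
 */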
1266static int may_delete_deleted_inode(struct btree_trans *trans,
1267				    struct btree_iter *iter,
1268				    struct bpos pos,
1269				    bool *need_another_pass)
1270{
1271	struct bch_fs *c = trans->c;
1272	struct btree_iter inode_iter;
1273	struct bkey_s_c k;
1274	struct bch_inode_unpacked inode;
1275	struct printbuf buf = PRINTBUF;
1276	int ret;
1277
1278	k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
1279	ret = bkey_err(k);
1280	if (ret)
1281		return ret;
1282
1283	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
1284	if (fsck_err_on(!bkey_is_inode(k.k),
1285			trans, deleted_inode_missing,
1286			"nonexistent inode %llu:%u in deleted_inodes btree",
1287			pos.offset, pos.snapshot))
1288		goto delete;
1289
1290	ret = bch2_inode_unpack(k, &inode);
1291	if (ret)
1292		goto out;
1293
1294	if (S_ISDIR(inode.bi_mode)) {
1295		ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
1296		if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
1297				trans, deleted_inode_is_dir,
1298				"non empty directory %llu:%u in deleted_inodes btree",
1299				pos.offset, pos.snapshot))
1300			goto delete;
1301		if (ret)
1302			goto out;
1303	}
1304
1305	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
1306			trans, deleted_inode_not_unlinked,
1307			"non-deleted inode %llu:%u in deleted_inodes btree",
1308			pos.offset, pos.snapshot))
1309		goto delete;
1310
1311	if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
1312			trans, deleted_inode_has_child_snapshots,
1313			"inode with child snapshots %llu:%u in deleted_inodes btree",
1314			pos.offset, pos.snapshot))
1315		goto delete;
1316
1317	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
1318	if (ret < 0)
1319		goto out;
1320
1321	if (ret) {
1322		if (fsck_err(trans, inode_has_child_snapshots_wrong,
1323			     "inode has_child_snapshots flag wrong (should be set)\n%s",
1324			     (printbuf_reset(&buf),
1325			      bch2_inode_unpacked_to_text(&buf, &inode),
1326			      buf.buf))) {
1327			inode.bi_flags |= BCH_INODE_has_child_snapshot;
1328			ret = __bch2_fsck_write_inode(trans, &inode);
1329			if (ret)
1330				goto out;
1331		}
1332		goto delete;
1334	}
1335
1336	if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
1337	    !fsck_err(trans, deleted_inode_but_clean,
1338		      "filesystem marked as clean but have deleted inode %llu:%u",
1339		      pos.offset, pos.snapshot)) {
1340		ret = 0;
1341		goto out;
1342	}
1343
1344	ret = 1;
1345out:
1346fsck_err:
1347	bch2_trans_iter_exit(trans, &inode_iter);
1348	printbuf_exit(&buf);
1349	return ret;
1350delete:
1351	ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
1352	goto out;
1353}
1354
1355int bch2_delete_dead_inodes(struct bch_fs *c)
1356{
1357	struct btree_trans *trans = bch2_trans_get(c);
1358	bool need_another_pass;
1359	int ret;
1360again:
1361	/*
1362	 * if we ran check_inodes() unlinked inodes will have already been
1363	 * cleaned up but the write buffer will be out of sync; therefore we
1364	 * always need a write buffer flush
1365	 */
1366	ret = bch2_btree_write_buffer_flush_sync(trans);
1367	if (ret)
1368		goto err;
1369
1370	need_another_pass = false;
1371
1372	/*
1373	 * Weird transaction restart handling here because on successful delete,
1374	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
1375	 * but we can't retry because the btree write buffer won't have been
1376	 * flushed and we'd spin:
1377	 */
1378	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
1379					BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
1380					NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
1381		ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
1382		if (ret > 0) {
1383			bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
1384
1385			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
1386			/*
1387			 * We don't want to loop here: a transaction restart
1388			 * error here means we handled a transaction restart and
1389			 * we're actually done, but if we loop we'll retry the
1390			 * same key because the write buffer hasn't been flushed
1391			 * yet
1392			 */
1393			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1394				ret = 0;
1395				continue;
1396			}
1397		}
1398
1399		ret;
1400	}));
1401
1402	if (!ret && need_another_pass)
1403		goto again;
1404err:
1405	bch2_trans_put(trans);
1406	return ret;
1407}