Linux Audio

Check our new training course

Loading...
Note: File does not exist in v4.6.
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include "bcachefs.h"
   4#include "btree_key_cache.h"
   5#include "btree_write_buffer.h"
   6#include "bkey_methods.h"
   7#include "btree_update.h"
   8#include "buckets.h"
   9#include "compress.h"
  10#include "dirent.h"
  11#include "error.h"
  12#include "extents.h"
  13#include "extent_update.h"
  14#include "inode.h"
  15#include "str_hash.h"
  16#include "snapshot.h"
  17#include "subvolume.h"
  18#include "varint.h"
  19
  20#include <linux/random.h>
  21
  22#include <asm/unaligned.h>
  23
#define x(name, ...)	#name,
/* Option names, generated from BCH_INODE_OPTS(); NULL-terminated. */
const char * const bch2_inode_opts[] = {
	BCH_INODE_OPTS()
	NULL,
};

/* Inode flag bit names for printing, generated from BCH_INODE_FLAGS(). */
static const char * const bch2_inode_flag_strs[] = {
	BCH_INODE_FLAGS()
	NULL
};
#undef  x
  35
/*
 * Total encoded length (in bytes) of an old-style varint field, indexed by
 * shift - 1 where shift = 8 - __fls(first byte); see inode_decode_field().
 */
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
  37
/*
 * Decode one old-style (KEY_TYPE_inode v1) variable-length field.
 *
 * The position of the highest set bit in the first byte encodes the total
 * field length (via byte_table); that marker bit is stripped before the
 * big-endian value is assembled into out[].
 *
 * Returns the number of bytes consumed, or -1 on truncated/invalid input.
 * out[0] receives the high 64 bits, out[1] the low 64 bits; *out_bits is
 * set to the bit width of the decoded value (position of highest set bit).
 */
static int inode_decode_field(const u8 *in, const u8 *end,
			      u64 out[2], unsigned *out_bits)
{
	__be64 be[2] = { 0, 0 };
	unsigned bytes, shift;
	u8 *p;

	if (in >= end)
		return -1;

	/* A zero first byte has no marker bit and is invalid: */
	if (!*in)
		return -1;

	/*
	 * position of highest set bit indicates number of bytes:
	 * shift = number of bits to remove in high byte:
	 */
	shift	= 8 - __fls(*in); /* 1 <= shift <= 8 */
	bytes	= byte_table[shift - 1];

	if (in + bytes > end)
		return -1;

	/* Copy into the tail of be[] so the value ends up right-aligned: */
	p = (u8 *) be + 16 - bytes;
	memcpy(p, in, bytes);
	/* Clear the length-marker bit in the high byte: */
	*p ^= (1 << 8) >> shift;

	out[0] = be64_to_cpu(be[0]);
	out[1] = be64_to_cpu(be[1]);
	*out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);

	return bytes;
}
  71
/*
 * Pack @inode into @packed as a KEY_TYPE_inode_v3 key: fixed fields are
 * stored directly in the value, the BCH_INODE_FIELDS_v3() fields are
 * varint-encoded after them, and trailing all-zero fields are elided
 * (via last_nonzero_field/last_nonzero_fieldnr).
 *
 * With CONFIG_BCACHEFS_DEBUG, the result is immediately unpacked again and
 * compared field-by-field against the input as a round-trip sanity check.
 */
static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
					   const struct bch_inode_unpacked *inode)
{
	struct bkey_i_inode_v3 *k = &packed->inode;
	u8 *out = k->v.fields;
	u8 *end = (void *) &packed[1];
	u8 *last_nonzero_field = out;
	unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
	unsigned bytes;
	int ret;

	bkey_inode_v3_init(&packed->inode.k_i);
	packed->inode.k.p.offset	= inode->bi_inum;
	packed->inode.v.bi_journal_seq	= cpu_to_le64(inode->bi_journal_seq);
	packed->inode.v.bi_hash_seed	= inode->bi_hash_seed;
	packed->inode.v.bi_flags	= cpu_to_le64(inode->bi_flags);
	packed->inode.v.bi_sectors	= cpu_to_le64(inode->bi_sectors);
	packed->inode.v.bi_size		= cpu_to_le64(inode->bi_size);
	packed->inode.v.bi_version	= cpu_to_le64(inode->bi_version);
	SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode);
	SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR);


/* Zero fields are encoded as a single 0 byte; >64 bit fields get a second varint. */
#define x(_name, _bits)							\
	nr_fields++;							\
									\
	if (inode->_name) {						\
		ret = bch2_varint_encode_fast(out, inode->_name);	\
		out += ret;						\
									\
		if (_bits > 64)						\
			*out++ = 0;					\
									\
		last_nonzero_field = out;				\
		last_nonzero_fieldnr = nr_fields;			\
	} else {							\
		*out++ = 0;						\
									\
		if (_bits > 64)						\
			*out++ = 0;					\
	}

	BCH_INODE_FIELDS_v3()
#undef  x
	BUG_ON(out > end);

	/* Drop trailing zero fields: */
	out = last_nonzero_field;
	nr_fields = last_nonzero_fieldnr;

	bytes = out - (u8 *) &packed->inode.v;
	set_bkey_val_bytes(&packed->inode.k, bytes);
	memset_u64s_tail(&packed->inode.v, 0, bytes);

	SET_INODEv3_NR_FIELDS(&k->v, nr_fields);

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
		struct bch_inode_unpacked unpacked;

		ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
		BUG_ON(ret);
		BUG_ON(unpacked.bi_inum		!= inode->bi_inum);
		BUG_ON(unpacked.bi_hash_seed	!= inode->bi_hash_seed);
		BUG_ON(unpacked.bi_sectors	!= inode->bi_sectors);
		BUG_ON(unpacked.bi_size		!= inode->bi_size);
		BUG_ON(unpacked.bi_version	!= inode->bi_version);
		BUG_ON(unpacked.bi_mode		!= inode->bi_mode);

#define x(_name, _bits)	if (unpacked._name != inode->_name)		\
			panic("unpacked %llu should be %llu",		\
			      (u64) unpacked._name, (u64) inode->_name);
		BCH_INODE_FIELDS_v3()
#undef  x
	}
}
 146
/* Out-of-line entry point for bch2_inode_pack_inlined(). */
void bch2_inode_pack(struct bkey_inode_buf *packed,
		     const struct bch_inode_unpacked *inode)
{
	bch2_inode_pack_inlined(packed, inode);
}
 152
/*
 * Unpack the original (pre-varint) KEY_TYPE_inode field encoding, decoded
 * with inode_decode_field().  Fields past INODE_NR_FIELDS() are zeroed in
 * one go (the unpacked struct's field order matches the encoding order).
 * Returns 0 on success, negative on decode error or field overflow.
 */
static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
				struct bch_inode_unpacked *unpacked)
{
	const u8 *in = inode.v->fields;
	const u8 *end = bkey_val_end(inode);
	u64 field[2];
	unsigned fieldnr = 0, field_bits;
	int ret;

#define x(_name, _bits)					\
	if (fieldnr++ == INODE_NR_FIELDS(inode.v)) {			\
		unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
		memset((void *) unpacked + offset, 0,			\
		       sizeof(*unpacked) - offset);			\
		return 0;						\
	}								\
									\
	ret = inode_decode_field(in, end, field, &field_bits);		\
	if (ret < 0)							\
		return ret;						\
									\
	if (field_bits > sizeof(unpacked->_name) * 8)			\
		return -1;						\
									\
	unpacked->_name = field[1];					\
	in += ret;

	BCH_INODE_FIELDS_v2()
#undef  x

	/* XXX: signal if there were more fields than expected? */
	return 0;
}
 186
/*
 * Unpack the varint field encoding shared by new-style KEY_TYPE_inode and
 * KEY_TYPE_inode_v2 keys (field set BCH_INODE_FIELDS_v2).  Fields beyond
 * @nr_fields decode as zero; a decoded value that doesn't fit the
 * destination field (checked via the assign-and-compare below) returns -1.
 */
static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
				const u8 *in, const u8 *end,
				unsigned nr_fields)
{
	unsigned fieldnr = 0;
	int ret;
	u64 v[2];

#define x(_name, _bits)							\
	if (fieldnr < nr_fields) {					\
		ret = bch2_varint_decode_fast(in, end, &v[0]);		\
		if (ret < 0)						\
			return ret;					\
		in += ret;						\
									\
		if (_bits > 64) {					\
			ret = bch2_varint_decode_fast(in, end, &v[1]);	\
			if (ret < 0)					\
				return ret;				\
			in += ret;					\
		} else {						\
			v[1] = 0;					\
		}							\
	} else {							\
		v[0] = v[1] = 0;					\
	}								\
									\
	unpacked->_name = v[0];						\
	if (v[1] || v[0] != unpacked->_name)				\
		return -1;						\
	fieldnr++;

	BCH_INODE_FIELDS_v2()
#undef  x

	/* XXX: signal if there were more fields than expected? */
	return 0;
}
 225
/*
 * Unpack a KEY_TYPE_inode_v3 key: fixed fields are read straight from the
 * value, then the BCH_INODE_FIELDS_v3() fields are varint-decoded with the
 * same scheme as bch2_inode_unpack_v2() (missing trailing fields read as
 * zero, over-wide values return -1).
 */
static int bch2_inode_unpack_v3(struct bkey_s_c k,
				struct bch_inode_unpacked *unpacked)
{
	struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
	const u8 *in = inode.v->fields;
	const u8 *end = bkey_val_end(inode);
	unsigned nr_fields = INODEv3_NR_FIELDS(inode.v);
	unsigned fieldnr = 0;
	int ret;
	u64 v[2];

	unpacked->bi_inum	= inode.k->p.offset;
	unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
	unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
	unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
	unpacked->bi_sectors	= le64_to_cpu(inode.v->bi_sectors);
	unpacked->bi_size	= le64_to_cpu(inode.v->bi_size);
	unpacked->bi_version	= le64_to_cpu(inode.v->bi_version);
	unpacked->bi_mode	= INODEv3_MODE(inode.v);

#define x(_name, _bits)							\
	if (fieldnr < nr_fields) {					\
		ret = bch2_varint_decode_fast(in, end, &v[0]);		\
		if (ret < 0)						\
			return ret;					\
		in += ret;						\
									\
		if (_bits > 64) {					\
			ret = bch2_varint_decode_fast(in, end, &v[1]);	\
			if (ret < 0)					\
				return ret;				\
			in += ret;					\
		} else {						\
			v[1] = 0;					\
		}							\
	} else {							\
		v[0] = v[1] = 0;					\
	}								\
									\
	unpacked->_name = v[0];						\
	if (v[1] || v[0] != unpacked->_name)				\
		return -1;						\
	fieldnr++;

	BCH_INODE_FIELDS_v3()
#undef  x

	/* XXX: signal if there were more fields than expected? */
	return 0;
}
 276
/*
 * Unpack legacy inode key types (KEY_TYPE_inode and KEY_TYPE_inode_v2);
 * KEY_TYPE_inode_v3 is handled on the fast path in bch2_inode_unpack().
 * Unknown key types are a caller bug (BUG()).
 */
static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
					       struct bch_inode_unpacked *unpacked)
{
	memset(unpacked, 0, sizeof(*unpacked));

	switch (k.k->type) {
	case KEY_TYPE_inode: {
		struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);

		unpacked->bi_inum	= inode.k->p.offset;
		unpacked->bi_journal_seq= 0;
		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
		unpacked->bi_flags	= le32_to_cpu(inode.v->bi_flags);
		unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);

		/* v1 keys carry either the old or the varint field encoding: */
		if (INODE_NEW_VARINT(inode.v)) {
			return bch2_inode_unpack_v2(unpacked, inode.v->fields,
						    bkey_val_end(inode),
						    INODE_NR_FIELDS(inode.v));
		} else {
			return bch2_inode_unpack_v1(inode, unpacked);
		}
		break;
	}
	case KEY_TYPE_inode_v2: {
		struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);

		unpacked->bi_inum	= inode.k->p.offset;
		unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
		unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
		unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);

		return bch2_inode_unpack_v2(unpacked, inode.v->fields,
					    bkey_val_end(inode),
					    INODEv2_NR_FIELDS(inode.v));
	}
	default:
		BUG();
	}
}
 318
 319int bch2_inode_unpack(struct bkey_s_c k,
 320		      struct bch_inode_unpacked *unpacked)
 321{
 322	if (likely(k.k->type == KEY_TYPE_inode_v3))
 323		return bch2_inode_unpack_v3(k, unpacked);
 324	return bch2_inode_unpack_slowpath(k, unpacked);
 325}
 326
/*
 * Look up and unpack inode @inum; on success @iter is left pointing at the
 * inode key and the caller owns it (must bch2_trans_iter_exit()).  On any
 * failure the iterator is exited here.  Does not log errors - see
 * bch2_inode_peek() for the warning variant.
 */
static int bch2_inode_peek_nowarn(struct btree_trans *trans,
		    struct btree_iter *iter,
		    struct bch_inode_unpacked *inode,
		    subvol_inum inum, unsigned flags)
{
	struct bkey_s_c k;
	u32 snapshot;
	int ret;

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		return ret;

	k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
			       SPOS(0, inum.inum, snapshot),
			       flags|BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (ret)
		return ret;

	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
	if (ret)
		goto err;

	ret = bch2_inode_unpack(k, inode);
	if (ret)
		goto err;

	return 0;
err:
	bch2_trans_iter_exit(trans, iter);
	return ret;
}
 360
/*
 * As bch2_inode_peek_nowarn(), but logs an error message with the
 * subvolume:inum being looked up on failure.
 */
int bch2_inode_peek(struct btree_trans *trans,
		    struct btree_iter *iter,
		    struct bch_inode_unpacked *inode,
		    subvol_inum inum, unsigned flags)
{
	int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
	bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
	return ret;
}
 370
/*
 * Pack @inode and queue it as a btree update at @iter's position
 * (snapshot taken from the iterator).  The packed key is allocated from
 * the transaction's memory pool, so it lives until the transaction ends.
 */
int bch2_inode_write_flags(struct btree_trans *trans,
		     struct btree_iter *iter,
		     struct bch_inode_unpacked *inode,
		     enum btree_update_flags flags)
{
	struct bkey_inode_buf *inode_p;

	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
	if (IS_ERR(inode_p))
		return PTR_ERR(inode_p);

	bch2_inode_pack_inlined(inode_p, inode);
	inode_p->inode.k.p.snapshot = iter->snapshot;
	return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
 386
/*
 * Convert any inode key @k to a KEY_TYPE_inode_v3 key by unpacking and
 * repacking it.  The result is allocated from the transaction's memory
 * pool; returns an ERR_PTR on failure (-ENOENT if @k isn't an inode).
 */
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
{
	struct bch_inode_unpacked u;
	struct bkey_inode_buf *inode_p;
	int ret;

	if (!bkey_is_inode(&k->k))
		return ERR_PTR(-ENOENT);

	inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
	if (IS_ERR(inode_p))
		return ERR_CAST(inode_p);

	ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u);
	if (ret)
		return ERR_PTR(ret);

	bch2_inode_pack(inode_p, &u);
	return &inode_p->inode.k_i;
}
 407
/*
 * Validation checks common to all inode key versions; version-specific
 * checks live in the bch2_inode_*_invalid() callers.  bkey_fsck_err_on()
 * jumps to fsck_err with ret set when a check fails (depending on fsck
 * policy).
 */
static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
{
	struct bch_inode_unpacked unpacked;
	int ret = 0;

	bkey_fsck_err_on(k.k->p.inode, c, err,
			 inode_pos_inode_nonzero,
			 "nonzero k.p.inode");

	bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
			 inode_pos_blockdev_range,
			 "fs inode in blockdev range");

	bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
			 inode_unpack_error,
			 "invalid variable length fields");

	bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
			 inode_checksum_type_invalid,
			 "invalid data checksum type (%u >= %u",
			 unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);

	bkey_fsck_err_on(unpacked.bi_compression &&
			 !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
			 inode_compression_type_invalid,
			 "invalid compression opt %u", unpacked.bi_compression - 1);

	bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
			 unpacked.bi_nlink != 0, c, err,
			 inode_unlinked_but_nlink_nonzero,
			 "flagged as unlinked but bi_nlink != 0");

	bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
			 inode_subvol_root_but_not_dir,
			 "subvolume root but not a directory");
fsck_err:
	return ret;
}
 446
/* Validate a KEY_TYPE_inode key: v1-specific str_hash check + common checks. */
int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
		       enum bkey_invalid_flags flags,
		       struct printbuf *err)
{
	struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
	int ret = 0;

	bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
			 inode_str_hash_invalid,
			 "invalid str hash type (%llu >= %u)",
			 INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);

	ret = __bch2_inode_invalid(c, k, err);
fsck_err:
	return ret;
}
 463
/* Validate a KEY_TYPE_inode_v2 key: str_hash check + common checks. */
int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags,
			  struct printbuf *err)
{
	struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
	int ret = 0;

	bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
			 inode_str_hash_invalid,
			 "invalid str hash type (%llu >= %u)",
			 INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);

	ret = __bch2_inode_invalid(c, k, err);
fsck_err:
	return ret;
}
 480
/*
 * Validate a KEY_TYPE_inode_v3 key: check fields_start is within the
 * allowed range and the value, then str_hash + common checks.
 */
int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags,
			  struct printbuf *err)
{
	struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
	int ret = 0;

	bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
			 INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
			 inode_v3_fields_start_bad,
			 "invalid fields_start (got %llu, min %u max %zu)",
			 INODEv3_FIELDS_START(inode.v),
			 INODEv3_FIELDS_START_INITIAL,
			 bkey_val_u64s(inode.k));

	bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
			 inode_str_hash_invalid,
			 "invalid str hash type (%llu >= %u)",
			 INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);

	ret = __bch2_inode_invalid(c, k, err);
fsck_err:
	return ret;
}
 505
 506static void __bch2_inode_unpacked_to_text(struct printbuf *out,
 507					  struct bch_inode_unpacked *inode)
 508{
 509	printbuf_indent_add(out, 2);
 510	prt_printf(out, "mode=%o", inode->bi_mode);
 511	prt_newline(out);
 512
 513	prt_str(out, "flags=");
 514	prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
 515	prt_printf(out, " (%x)", inode->bi_flags);
 516	prt_newline(out);
 517
 518	prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
 519	prt_newline(out);
 520
 521	prt_printf(out, "bi_size=%llu", inode->bi_size);
 522	prt_newline(out);
 523
 524	prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
 525	prt_newline(out);
 526
 527	prt_newline(out);
 528	prt_printf(out, "bi_version=%llu", inode->bi_version);
 529
 530#define x(_name, _bits)						\
 531	prt_printf(out, #_name "=%llu", (u64) inode->_name);	\
 532	prt_newline(out);
 533	BCH_INODE_FIELDS_v3()
 534#undef  x
 535	printbuf_indent_sub(out, 2);
 536}
 537
/* Print an unpacked inode prefixed with its inode number. */
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{
	prt_printf(out, "inum: %llu ", inode->bi_inum);
	__bch2_inode_unpacked_to_text(out, inode);
}
 543
 544void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
 545{
 546	struct bch_inode_unpacked inode;
 547
 548	if (bch2_inode_unpack(k, &inode)) {
 549		prt_printf(out, "(unpack error)");
 550		return;
 551	}
 552
 553	__bch2_inode_unpacked_to_text(out, &inode);
 554}
 555
/*
 * Read bi_flags from any inode key version without a full unpack
 * (v1 stores flags as 32 bit, v2/v3 as 64 bit); 0 for non-inode keys.
 */
static inline u64 bkey_inode_flags(struct bkey_s_c k)
{
	switch (k.k->type) {
	case KEY_TYPE_inode:
		return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
	case KEY_TYPE_inode_v2:
		return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
	case KEY_TYPE_inode_v3:
		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
	default:
		return 0;
	}
}
 569
 570static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
 571{
 572	return bkey_inode_flags(k) & BCH_INODE_unlinked;
 573}
 574
/*
 * Btree trigger for inode keys: maintains the filesystem inode count,
 * keeps the deleted_inodes btree in sync with the unlinked flag, and
 * stamps the journal sequence number into v3 inodes at commit time.
 */
int bch2_trigger_inode(struct btree_trans *trans,
		       enum btree_id btree_id, unsigned level,
		       struct bkey_s_c old,
		       struct bkey_s new,
		       unsigned flags)
{
	/* +1 if an inode is being created, -1 if deleted, 0 if overwritten: */
	s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);

	if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
		if (nr) {
			int ret = bch2_replicas_deltas_realloc(trans, 0);
			if (ret)
				return ret;

			trans->fs_usage_deltas->nr_inodes += nr;
		}

		/* Track unlinked-flag transitions in the deleted_inodes btree: */
		bool old_deleted = bkey_is_deleted_inode(old);
		bool new_deleted = bkey_is_deleted_inode(new.s_c);
		if (old_deleted != new_deleted) {
			int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
			if (ret)
				return ret;
		}
	}

	if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
		BUG_ON(!trans->journal_res.seq);

		/* Record the journal seq this inode update lands in: */
		bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
	}

	if (flags & BTREE_TRIGGER_GC) {
		struct bch_fs *c = trans->c;

		percpu_down_read(&c->mark_lock);
		this_cpu_add(c->usage_gc->b.nr_inodes, nr);
		percpu_up_read(&c->mark_lock);
	}

	return 0;
}
 617
/* Validate a KEY_TYPE_inode_generation key: only the pos.inode field is checked. */
int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
				  enum bkey_invalid_flags flags,
				  struct printbuf *err)
{
	int ret = 0;

	bkey_fsck_err_on(k.k->p.inode, c, err,
			 inode_pos_inode_nonzero,
			 "nonzero k.p.inode");
fsck_err:
	return ret;
}
 630
/* Print a KEY_TYPE_inode_generation key. */
void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
				   struct bkey_s_c k)
{
	struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);

	prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}
 638
/*
 * Zero-initialize a new inode and set the fields that don't depend on the
 * parent: the filesystem's str_hash type (stored in bi_flags) and a random
 * hash seed.
 */
void bch2_inode_init_early(struct bch_fs *c,
			   struct bch_inode_unpacked *inode_u)
{
	enum bch_str_hash_type str_hash =
		bch2_str_hash_opt_to_type(c, c->opts.str_hash);

	memset(inode_u, 0, sizeof(*inode_u));

	/* ick */
	inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
	get_random_bytes(&inode_u->bi_hash_seed,
			 sizeof(inode_u->bi_hash_seed));
}
 652
/*
 * Fill in the creation-time fields of a new inode: mode/ownership, device
 * number, timestamps, setgid-directory inheritance, and per-inode options
 * inherited from @parent (if any).
 */
void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
			  uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
			  struct bch_inode_unpacked *parent)
{
	inode_u->bi_mode	= mode;
	inode_u->bi_uid		= uid;
	inode_u->bi_gid		= gid;
	inode_u->bi_dev		= rdev;
	inode_u->bi_atime	= now;
	inode_u->bi_mtime	= now;
	inode_u->bi_ctime	= now;
	inode_u->bi_otime	= now;

	/* Setgid directory: inherit group, and propagate setgid to subdirs: */
	if (parent && parent->bi_mode & S_ISGID) {
		inode_u->bi_gid = parent->bi_gid;
		if (S_ISDIR(mode))
			inode_u->bi_mode |= S_ISGID;
	}

	/* Inherit all per-inode options from the parent directory: */
	if (parent) {
#define x(_name, ...)	inode_u->bi_##_name = parent->bi_##_name;
		BCH_INODE_OPTS()
#undef x
	}
}
 678
/* Convenience wrapper: init_early + init_late with the current time. */
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
		     uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
		     struct bch_inode_unpacked *parent)
{
	bch2_inode_init_early(c, inode_u);
	bch2_inode_init_late(inode_u, bch2_current_time(c),
			     uid, gid, mode, rdev, parent);
}
 687
/*
 * Generation number to assign to a new inode created at @k's slot: taken
 * from a KEY_TYPE_inode_generation key left by a prior delete, 0 for an
 * empty slot.  Live inode keys would be a caller bug (the slot isn't free).
 */
static inline u32 bkey_generation(struct bkey_s_c k)
{
	switch (k.k->type) {
	case KEY_TYPE_inode:
	case KEY_TYPE_inode_v2:
		BUG();
	case KEY_TYPE_inode_generation:
		return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
	default:
		return 0;
	}
}
 700
 701/*
 702 * This just finds an empty slot:
 703 */
 704int bch2_inode_create(struct btree_trans *trans,
 705		      struct btree_iter *iter,
 706		      struct bch_inode_unpacked *inode_u,
 707		      u32 snapshot, u64 cpu)
 708{
 709	struct bch_fs *c = trans->c;
 710	struct bkey_s_c k;
 711	u64 min, max, start, pos, *hint;
 712	int ret = 0;
 713	unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
 714
 715	if (c->opts.shard_inode_numbers) {
 716		bits -= c->inode_shard_bits;
 717
 718		min = (cpu << bits);
 719		max = (cpu << bits) | ~(ULLONG_MAX << bits);
 720
 721		min = max_t(u64, min, BLOCKDEV_INODE_MAX);
 722		hint = c->unused_inode_hints + cpu;
 723	} else {
 724		min = BLOCKDEV_INODE_MAX;
 725		max = ~(ULLONG_MAX << bits);
 726		hint = c->unused_inode_hints;
 727	}
 728
 729	start = READ_ONCE(*hint);
 730
 731	if (start >= max || start < min)
 732		start = min;
 733
 734	pos = start;
 735	bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
 736			     BTREE_ITER_ALL_SNAPSHOTS|
 737			     BTREE_ITER_INTENT);
 738again:
 739	while ((k = bch2_btree_iter_peek(iter)).k &&
 740	       !(ret = bkey_err(k)) &&
 741	       bkey_lt(k.k->p, POS(0, max))) {
 742		if (pos < iter->pos.offset)
 743			goto found_slot;
 744
 745		/*
 746		 * We don't need to iterate over keys in every snapshot once
 747		 * we've found just one:
 748		 */
 749		pos = iter->pos.offset + 1;
 750		bch2_btree_iter_set_pos(iter, POS(0, pos));
 751	}
 752
 753	if (!ret && pos < max)
 754		goto found_slot;
 755
 756	if (!ret && start == min)
 757		ret = -BCH_ERR_ENOSPC_inode_create;
 758
 759	if (ret) {
 760		bch2_trans_iter_exit(trans, iter);
 761		return ret;
 762	}
 763
 764	/* Retry from start */
 765	pos = start = min;
 766	bch2_btree_iter_set_pos(iter, POS(0, pos));
 767	goto again;
 768found_slot:
 769	bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
 770	k = bch2_btree_iter_peek_slot(iter);
 771	ret = bkey_err(k);
 772	if (ret) {
 773		bch2_trans_iter_exit(trans, iter);
 774		return ret;
 775	}
 776
 777	*hint			= k.k->p.offset;
 778	inode_u->bi_inum	= k.k->p.offset;
 779	inode_u->bi_generation	= bkey_generation(k);
 780	return 0;
 781}
 782
/*
 * Delete all keys belonging to inode @inum in btree @id, one key per
 * commit, retrying each iteration on transaction restart.  Extents are
 * whole-key deleted (the key is resized to cover the full extent), never
 * split.
 */
static int bch2_inode_delete_keys(struct btree_trans *trans,
				  subvol_inum inum, enum btree_id id)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i delete;
	struct bpos end = POS(inum.inum, U64_MAX);
	u32 snapshot;
	int ret = 0;

	/*
	 * We're never going to be deleting partial extents, no need to use an
	 * extent iterator:
	 */
	bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
			     BTREE_ITER_INTENT);

	while (1) {
		bch2_trans_begin(trans);

		/* Snapshot may have changed across a restart - re-resolve it: */
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto err;

		bch2_btree_iter_set_snapshot(&iter, snapshot);

		k = bch2_btree_iter_peek_upto(&iter, end);
		ret = bkey_err(k);
		if (ret)
			goto err;

		if (!k.k)
			break;

		bkey_init(&delete.k);
		delete.k.p = iter.pos;

		/* For extents, size the whiteout to cover the whole key: */
		if (iter.flags & BTREE_ITER_IS_EXTENTS)
			bch2_key_resize(&delete.k,
					bpos_min(end, k.k->p).offset -
					iter.pos.offset);

		ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
		      bch2_trans_commit(trans, NULL, NULL,
					BCH_TRANS_COMMIT_no_enospc);
err:
		/* Transaction restarts loop back and retry; other errors bail: */
		if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
 836
 837int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 838{
 839	struct btree_trans *trans = bch2_trans_get(c);
 840	struct btree_iter iter = { NULL };
 841	struct bkey_i_inode_generation delete;
 842	struct bch_inode_unpacked inode_u;
 843	struct bkey_s_c k;
 844	u32 snapshot;
 845	int ret;
 846
 847	/*
 848	 * If this was a directory, there shouldn't be any real dirents left -
 849	 * but there could be whiteouts (from hash collisions) that we should
 850	 * delete:
 851	 *
 852	 * XXX: the dirent could ideally would delete whiteouts when they're no
 853	 * longer needed
 854	 */
 855	ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
 856		bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
 857		bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
 858	if (ret)
 859		goto err;
 860retry:
 861	bch2_trans_begin(trans);
 862
 863	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
 864	if (ret)
 865		goto err;
 866
 867	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
 868			       SPOS(0, inum.inum, snapshot),
 869			       BTREE_ITER_INTENT|BTREE_ITER_CACHED);
 870	ret = bkey_err(k);
 871	if (ret)
 872		goto err;
 873
 874	if (!bkey_is_inode(k.k)) {
 875		bch2_fs_inconsistent(c,
 876				     "inode %llu:%u not found when deleting",
 877				     inum.inum, snapshot);
 878		ret = -EIO;
 879		goto err;
 880	}
 881
 882	bch2_inode_unpack(k, &inode_u);
 883
 884	bkey_inode_generation_init(&delete.k_i);
 885	delete.k.p = iter.pos;
 886	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
 887
 888	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
 889		bch2_trans_commit(trans, NULL, NULL,
 890				BCH_TRANS_COMMIT_no_enospc);
 891err:
 892	bch2_trans_iter_exit(trans, &iter);
 893	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 894		goto retry;
 895
 896	bch2_trans_put(trans);
 897	return ret;
 898}
 899
/* Look up and unpack inode @inum without logging on failure; iterator not retained. */
int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
				  subvol_inum inum,
				  struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	int ret;

	ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(trans, &iter);
	return ret;
}
 912
/* Look up and unpack inode @inum, logging on failure; iterator not retained. */
int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
				  subvol_inum inum,
				  struct bch_inode_unpacked *inode)
{
	struct btree_iter iter;
	int ret;

	ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
	if (!ret)
		bch2_trans_iter_exit(trans, &iter);
	return ret;
}
 925
/* Non-transactional wrapper: look up inode @inum in its own transaction. */
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
			    struct bch_inode_unpacked *inode)
{
	return bch2_trans_do(c, NULL, NULL, 0,
		bch2_inode_find_by_inum_trans(trans, inum, inode));
}
 932
 933int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
 934{
 935	if (bi->bi_flags & BCH_INODE_unlinked)
 936		bi->bi_flags &= ~BCH_INODE_unlinked;
 937	else {
 938		if (bi->bi_nlink == U32_MAX)
 939			return -EINVAL;
 940
 941		bi->bi_nlink++;
 942	}
 943
 944	return 0;
 945}
 946
/*
 * Decrement an inode's link count; dropping the last link sets the
 * unlinked flag.  Inconsistent states (nonzero nlink while unlinked, or
 * decrementing an already-unlinked inode) are reported and left unchanged.
 */
void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
{
	if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
		bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
					bi->bi_inum);
		return;
	}

	if (bi->bi_flags & BCH_INODE_unlinked) {
		bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
		return;
	}

	if (bi->bi_nlink)
		bi->bi_nlink--;
	else
		bi->bi_flags |= BCH_INODE_unlinked;
}
 965
/*
 * Convert per-inode option fields to a struct bch_opts.  Inode option
 * fields are stored biased by 1 so that 0 means "unset"; only set options
 * are copied (unbiased).
 */
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
{
	struct bch_opts ret = { 0 };
#define x(_name, _bits)							\
	if (inode->bi_##_name)						\
		opt_set(ret, _name, inode->bi_##_name - 1);
	BCH_INODE_OPTS()
#undef x
	return ret;
}
 976
/*
 * Fill @opts with the effective IO options for @inode (per-inode value if
 * set, filesystem default otherwise).  nocow overrides the options that
 * are incompatible with it.
 */
void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
			 struct bch_inode_unpacked *inode)
{
#define x(_name, _bits)		opts->_name = inode_opt_get(c, inode, _name);
	BCH_INODE_OPTS()
#undef x

	if (opts->nocow)
		opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}
 987
/* Look up inode @inum and return its effective IO options in @opts. */
int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
{
	struct bch_inode_unpacked inode;
	int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));

	if (ret)
		return ret;

	bch2_inode_opts_get(opts, trans->c, &inode);
	return 0;
}
 999
1000int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
1001{
1002	struct bch_fs *c = trans->c;
1003	struct btree_iter iter = { NULL };
1004	struct bkey_i_inode_generation delete;
1005	struct bch_inode_unpacked inode_u;
1006	struct bkey_s_c k;
1007	int ret;
1008
1009	do {
1010		ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
1011						      SPOS(inum, 0, snapshot),
1012						      SPOS(inum, U64_MAX, snapshot),
1013						      0, NULL) ?:
1014			bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
1015						      SPOS(inum, 0, snapshot),
1016						      SPOS(inum, U64_MAX, snapshot),
1017						      0, NULL) ?:
1018			bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
1019						      SPOS(inum, 0, snapshot),
1020						      SPOS(inum, U64_MAX, snapshot),
1021						      0, NULL);
1022	} while (ret == -BCH_ERR_transaction_restart_nested);
1023	if (ret)
1024		goto err;
1025retry:
1026	bch2_trans_begin(trans);
1027
1028	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
1029			       SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
1030	ret = bkey_err(k);
1031	if (ret)
1032		goto err;
1033
1034	if (!bkey_is_inode(k.k)) {
1035		bch2_fs_inconsistent(c,
1036				     "inode %llu:%u not found when deleting",
1037				     inum, snapshot);
1038		ret = -EIO;
1039		goto err;
1040	}
1041
1042	bch2_inode_unpack(k, &inode_u);
1043
1044	/* Subvolume root? */
1045	if (inode_u.bi_subvol)
1046		bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum);
1047
1048	bkey_inode_generation_init(&delete.k_i);
1049	delete.k.p = iter.pos;
1050	delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
1051
1052	ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
1053		bch2_trans_commit(trans, NULL, NULL,
1054				BCH_TRANS_COMMIT_no_enospc);
1055err:
1056	bch2_trans_iter_exit(trans, &iter);
1057	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
1058		goto retry;
1059
1060	return ret ?: -BCH_ERR_transaction_restart_nested;
1061}
1062
/*
 * Decide what to do with an entry in the deleted_inodes btree:
 * returns 1 if the inode should be deleted, 0 to skip it, negative on
 * error.  Stale/bogus entries found by the fsck checks are removed from
 * the deleted_inodes btree (goto delete).  Internal snapshot nodes are
 * not deleted directly: the key is propagated to the snapshot leaves and
 * *need_another_pass is set so the caller rescans.
 */
static int may_delete_deleted_inode(struct btree_trans *trans,
				    struct btree_iter *iter,
				    struct bpos pos,
				    bool *need_another_pass)
{
	struct bch_fs *c = trans->c;
	struct btree_iter inode_iter;
	struct bkey_s_c k;
	struct bch_inode_unpacked inode;
	int ret;

	k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (ret)
		return ret;

	ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
	if (fsck_err_on(!bkey_is_inode(k.k), c,
			deleted_inode_missing,
			"nonexistent inode %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	ret = bch2_inode_unpack(k, &inode);
	if (ret)
		goto out;

	/* Directories must be empty before they can be deleted: */
	if (S_ISDIR(inode.bi_mode)) {
		ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
		if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
				"non empty directory %llu:%u in deleted_inodes btree",
				pos.offset, pos.snapshot))
			goto delete;
		if (ret)
			goto out;
	}

	if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
			deleted_inode_not_unlinked,
			"non-deleted inode %llu:%u in deleted_inodes btree",
			pos.offset, pos.snapshot))
		goto delete;

	/* A clean filesystem shouldn't have deleted inodes pending: */
	if (c->sb.clean &&
	    !fsck_err(c,
		      deleted_inode_but_clean,
		      "filesystem marked as clean but have deleted inode %llu:%u",
		      pos.offset, pos.snapshot)) {
		ret = 0;
		goto out;
	}

	if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
		struct bpos new_min_pos;

		ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
		if (ret)
			goto out;

		inode.bi_flags &= ~BCH_INODE_unlinked;

		ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
					     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
		bch_err_msg(c, ret, "clearing inode unlinked flag");
		if (ret)
			goto out;

		/*
		 * We'll need another write buffer flush to pick up the new
		 * unlinked inodes in the snapshot leaves:
		 */
		*need_another_pass = true;
		goto out;
	}

	ret = 1;
out:
fsck_err:
	bch2_trans_iter_exit(trans, &inode_iter);
	return ret;
delete:
	ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
	goto out;
}
1147
/*
 * Walk the deleted_inodes btree and delete every inode it references,
 * re-scanning (after a write buffer flush) if may_delete_deleted_inode()
 * pushed work down to snapshot leaves.
 */
int bch2_delete_dead_inodes(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	bool need_another_pass;
	int ret;
again:
	need_another_pass = false;

	/*
	 * Weird transaction restart handling here because on successful delete,
	 * bch2_inode_rm_snapshot() will return a nested transaction restart,
	 * but we can't retry because the btree write buffer won't have been
	 * flushed and we'd spin:
	 */
	ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
					BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
					NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
		ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
		if (ret > 0) {
			bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);

			ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
			/*
			 * We don't want to loop here: a transaction restart
			 * error here means we handled a transaction restart and
			 * we're actually done, but if we loop we'll retry the
			 * same key because the write buffer hasn't been flushed
			 * yet
			 */
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
				ret = 0;
				continue;
			}
		}

		ret;
	}));

	if (!ret && need_another_pass) {
		/* Flush so the leaf-propagated unlinked inodes become visible: */
		ret = bch2_btree_write_buffer_flush_sync(trans);
		if (ret)
			goto err;
		goto again;
	}
err:
	bch2_trans_put(trans);
	return ret;
}