// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the way we want to only ever hold the
 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 * fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try to increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node, adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no
 * longer contains the desired subtree.  For this reason this rebalancing
 * is performed on the children of the current node.  This also avoids
 * having a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child node
 * for internal nodes, or delete the entry for leaf nodes.
 */
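
/*
 * A rough map from the cases above to the code below: DEL_THRESHOLD is
 * descriptive only and is not used by name in this file; the actual
 * cut-offs come from merge_threshold().  Case [A] is handled directly in
 * rebalance_children(), cases [B] and [C] go through rebalance2(), and
 * cases [D] and [E] through rebalance3(), which picks delete_center_node()
 * or redistribute3().
 */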

/*
 * Some little utilities for moving node data around.
 */
static void node_shift(struct btree_node *n, int shift)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
	uint32_t value_size = le32_to_cpu(n->header.value_size);

	if (shift < 0) {
		shift = -shift;
		BUG_ON(shift > nr_entries);
		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
		memmove(key_ptr(n, 0),
			key_ptr(n, shift),
			(nr_entries - shift) * sizeof(__le64));
		memmove(value_ptr(n, 0),
			value_ptr(n, shift),
			(nr_entries - shift) * value_size);
	} else {
		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
		memmove(key_ptr(n, shift),
			key_ptr(n, 0),
			nr_entries * sizeof(__le64));
		memmove(value_ptr(n, shift),
			value_ptr(n, 0),
			nr_entries * value_size);
	}
}

static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
{
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t value_size = le32_to_cpu(left->header.value_size);

	if (value_size != le32_to_cpu(right->header.value_size)) {
		DMERR("mismatched value size");
		return -EILSEQ;
	}

	if (shift < 0) {
		shift = -shift;

		if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
			DMERR("bad shift");
			return -EINVAL;
		}

		memcpy(key_ptr(left, nr_left),
		       key_ptr(right, 0),
		       shift * sizeof(__le64));
		memcpy(value_ptr(left, nr_left),
		       value_ptr(right, 0),
		       shift * value_size);
	} else {
		if (shift > le32_to_cpu(right->header.max_entries)) {
			DMERR("bad shift");
			return -EINVAL;
		}

		memcpy(key_ptr(right, 0),
		       key_ptr(left, nr_left - shift),
		       shift * sizeof(__le64));
		memcpy(value_ptr(right, 0),
		       value_ptr(left, nr_left - shift),
		       shift * value_size);
	}
	return 0;
}
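
/*
 * Sign conventions for the helpers above: node_shift(n, s) with s > 0
 * opens a gap of s entries at the front of n, while s < 0 drops the first
 * |s| entries.  node_copy(left, right, s) with s < 0 appends the first |s|
 * entries of right to the end of left, and with s > 0 copies the last s
 * entries of left into the front of right.
 */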

/*
 * Delete a specific entry from a leaf node.
 */
static void delete_at(struct btree_node *n, unsigned int index)
{
	unsigned int nr_entries = le32_to_cpu(n->header.nr_entries);
	unsigned int nr_to_copy = nr_entries - (index + 1);
	uint32_t value_size = le32_to_cpu(n->header.value_size);

	BUG_ON(index >= nr_entries);

	if (nr_to_copy) {
		memmove(key_ptr(n, index),
			key_ptr(n, index + 1),
			nr_to_copy * sizeof(__le64));

		memmove(value_ptr(n, index),
			value_ptr(n, index + 1),
			nr_to_copy * value_size);
	}

	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}

static unsigned int merge_threshold(struct btree_node *n)
{
	return le32_to_cpu(n->header.max_entries) / 3;
}
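
/*
 * Worked example with illustrative numbers: if max_entries is 126 then
 * merge_threshold() is 42.  __rebalance2() below merges two siblings when
 * they hold fewer than 2 * (42 + 1) = 86 entries between them, and
 * __rebalance3() deletes the centre node when the three siblings hold
 * fewer than 4 * 42 + 1 = 169 entries in total.
 */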

struct child {
	unsigned int index;
	struct dm_block *block;
	struct btree_node *n;
};

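/*
 * init_child() shadows the child block referenced by parent[index] so it
 * can be modified, bumps the reference counts of its children if the
 * shadow operation made a copy, and patches the parent's value to point at
 * the new location.  exit_child() drops the block lock again.
 */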
static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
		      struct btree_node *parent,
		      unsigned int index, struct child *result)
{
	int r, inc;
	dm_block_t root;

	result->index = index;
	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       &result->block, &inc);
	if (r)
		return r;

	result->n = dm_block_data(result->block);

	if (inc)
		inc_children(info->tm, result->n, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(result->block));

	return 0;
}

static void exit_child(struct dm_btree_info *info, struct child *c)
{
	dm_tm_unlock(info->tm, c->block);
}

static int shift(struct btree_node *left, struct btree_node *right, int count)
{
	int r;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);

	if (max_entries != r_max_entries) {
		DMERR("node max_entries mismatch");
		return -EILSEQ;
	}

	if (nr_left - count > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	if (nr_right + count > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	if (!count)
		return 0;

	if (count > 0) {
		node_shift(right, count);
		r = node_copy(left, right, count);
		if (r)
			return r;
	} else {
		r = node_copy(left, right, count);
		if (r)
			return r;
		node_shift(right, count);
	}

	left->header.nr_entries = cpu_to_le32(nr_left - count);
	right->header.nr_entries = cpu_to_le32(nr_right + count);

	return 0;
}
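
/*
 * shift(left, right, count) moves count entries from the tail of left to
 * the head of right when count > 0, and |count| entries the other way when
 * count < 0.  Note the ordering: moving left to right opens the gap in
 * right before copying, whereas moving right to left copies first and then
 * closes the gap.
 */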

static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
			struct child *l, struct child *r)
{
	int ret;
	struct btree_node *left = l->n;
	struct btree_node *right = r->n;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
	/*
	 * Ensure the number of entries in each child will be greater
	 * than or equal to (max_entries / 3 + 1), so no matter which
	 * child is used for removal, the number will still be no
	 * less than (max_entries / 3).
	 */
	unsigned int threshold = 2 * (merge_threshold(left) + 1);

	if (nr_left + nr_right < threshold) {
		/*
		 * Merge
		 */
		node_copy(left, right, -nr_right);
		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
		delete_at(parent, r->index);

		/*
		 * We need to decrement the right block, but not its
		 * children, since they're still referenced by left.
		 */
		dm_tm_dec(info->tm, dm_block_location(r->block));
	} else {
		/*
		 * Rebalance.
		 */
		unsigned int target_left = (nr_left + nr_right) / 2;

		ret = shift(left, right, nr_left - target_left);
		if (ret)
			return ret;
		*key_ptr(parent, r->index) = right->keys[0];
	}
	return 0;
}

static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned int left_index)
{
	int r;
	struct btree_node *parent;
	struct child left, right;

	parent = dm_block_data(shadow_current(s));

	r = init_child(info, vt, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, vt, parent, left_index + 1, &right);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = __rebalance2(info, parent, &left, &right);

	exit_child(info, &left);
	exit_child(info, &right);

	return r;
}

/*
 * We dump as many entries from center as possible into left, then the rest
 * into right, then rebalance2.  This wastes some cpu, but I want something
 * simple atm.
 */
static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
			      struct child *l, struct child *c, struct child *r,
			      struct btree_node *left, struct btree_node *center, struct btree_node *right,
			      uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	unsigned int shift = min(max_entries - nr_left, nr_center);

	if (nr_left + shift > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	node_copy(left, center, -shift);
	left->header.nr_entries = cpu_to_le32(nr_left + shift);

	if (shift != nr_center) {
		shift = nr_center - shift;

		if ((nr_right + shift) > max_entries) {
			DMERR("node shift out of bounds");
			return -EINVAL;
		}

		node_shift(right, shift);
		node_copy(center, right, shift);
		right->header.nr_entries = cpu_to_le32(nr_right + shift);
	}
	*key_ptr(parent, r->index) = right->keys[0];

	delete_at(parent, c->index);
	r->index--;

	dm_tm_dec(info->tm, dm_block_location(c->block));
	return __rebalance2(info, parent, l, r);
}

/*
 * Redistributes entries among 3 sibling nodes.
 */
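/*
 * For example, with 100 entries spread across the three nodes,
 * target_right below is 100 / 3 = 33, the remainder flag is 1 and
 * target_left is 34; after the shifts left holds 34 entries, right holds
 * 33 and center keeps the remaining 33.
 */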
static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
			 struct child *l, struct child *c, struct child *r,
			 struct btree_node *left, struct btree_node *center, struct btree_node *right,
			 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
	int s, ret;
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	unsigned int total = nr_left + nr_center + nr_right;
	unsigned int target_right = total / 3;
	unsigned int remainder = (target_right * 3) != total;
	unsigned int target_left = target_right + remainder;

	BUG_ON(target_left > max_entries);
	BUG_ON(target_right > max_entries);

	if (nr_left < nr_right) {
		s = nr_left - target_left;

		if (s < 0 && nr_center < -s) {
			/* not enough in central node */
			ret = shift(left, center, -nr_center);
			if (ret)
				return ret;

			s += nr_center;
			ret = shift(left, right, s);
			if (ret)
				return ret;

			nr_right += s;
		} else {
			ret = shift(left, center, s);
			if (ret)
				return ret;
		}

		ret = shift(center, right, target_right - nr_right);
		if (ret)
			return ret;
	} else {
		s = target_right - nr_right;
		if (s > 0 && nr_center < s) {
			/* not enough in central node */
			ret = shift(center, right, nr_center);
			if (ret)
				return ret;
			s -= nr_center;
			ret = shift(left, right, s);
			if (ret)
				return ret;
			nr_left -= s;
		} else {
			ret = shift(center, right, s);
			if (ret)
				return ret;
		}

		ret = shift(left, center, nr_left - target_left);
		if (ret)
			return ret;
	}

	*key_ptr(parent, c->index) = center->keys[0];
	*key_ptr(parent, r->index) = right->keys[0];
	return 0;
}

static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
			struct child *l, struct child *c, struct child *r)
{
	struct btree_node *left = l->n;
	struct btree_node *center = c->n;
	struct btree_node *right = r->n;

	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

	unsigned int threshold = merge_threshold(left) * 4 + 1;

	if ((left->header.max_entries != center->header.max_entries) ||
	    (center->header.max_entries != right->header.max_entries)) {
		DMERR("bad btree metadata, max_entries differ");
		return -EILSEQ;
	}

	if ((nr_left + nr_center + nr_right) < threshold) {
		return delete_center_node(info, parent, l, c, r, left, center, right,
					  nr_left, nr_center, nr_right);
	}

	return redistribute3(info, parent, l, c, r, left, center, right,
			     nr_left, nr_center, nr_right);
}

static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned int left_index)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_current(s));
	struct child left, center, right;

	/*
	 * FIXME: fill out an array?
	 */
	r = init_child(info, vt, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, vt, parent, left_index + 1, &center);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = init_child(info, vt, parent, left_index + 2, &right);
	if (r) {
		exit_child(info, &left);
		exit_child(info, &center);
		return r;
	}

	r = __rebalance3(info, parent, &left, &center, &right);

	exit_child(info, &left);
	exit_child(info, &center);
	exit_child(info, &right);

	return r;
}

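/*
 * This implements cases [A] to [E] from the comment at the top of the
 * file: a node with a single entry has its child copied over it, a child
 * with no left sibling is rebalanced against its right sibling, one with
 * no right sibling against its left, and a child with both siblings goes
 * through the three way rebalance.
 */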
static int rebalance_children(struct shadow_spine *s,
			      struct dm_btree_info *info,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int i, r, has_left_sibling, has_right_sibling;
	struct btree_node *n;

	n = dm_block_data(shadow_current(s));

	if (le32_to_cpu(n->header.nr_entries) == 1) {
		struct dm_block *child;
		dm_block_t b = value64(n, 0);

		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
		if (r)
			return r;

		memcpy(n, dm_block_data(child),
		       dm_bm_block_size(dm_tm_get_bm(info->tm)));

		dm_tm_dec(info->tm, dm_block_location(child));
		dm_tm_unlock(info->tm, child);
		return 0;
	}

	i = lower_bound(n, key);
	if (i < 0)
		return -ENODATA;

	has_left_sibling = i > 0;
	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

	if (!has_left_sibling)
		r = rebalance2(s, info, vt, i);

	else if (!has_right_sibling)
		r = rebalance2(s, info, vt, i - 1);

	else
		r = rebalance3(s, info, vt, i - 1);

	return r;
}

static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index)
{
	int i = lower_bound(n, key);

	if ((i < 0) ||
	    (i >= le32_to_cpu(n->header.nr_entries)) ||
	    (le64_to_cpu(n->keys[i]) != key))
		return -ENODATA;

	*index = i;

	return 0;
}

/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, dm_block_t root,
		      uint64_t key, unsigned int *index)
{
	int i = *index, r;
	struct btree_node *n;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));

		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		r = rebalance_children(s, info, vt, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA
		 */
		root = value64(n, i);
	}

	return r;
}

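/*
 * Removes the entry addressed by keys[0..info->levels - 1], rebalancing on
 * the way down so the final delete_at() cannot leave a node below the
 * minimum.  On success *new_root holds the shadowed root; -ENODATA is
 * returned if the key is not present.
 */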
int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, dm_block_t *new_root)
{
	unsigned int level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = remove_raw(&spine, info,
			       (level == last_level ?
				&info->value_type : &le64_vt),
			       root, keys[level], (unsigned int *)&index);
		if (r < 0)
			break;

		n = dm_block_data(shadow_current(&spine));
		if (level != last_level) {
			root = value64(n, index);
			continue;
		}

		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);

		delete_at(n, index);
	}

	if (!r)
		*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);

/*----------------------------------------------------------------*/

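/*
 * remove_nearest() is the same walk as remove_raw(), except that at the
 * leaf it reports the lower_bound() index for the key rather than
 * requiring an exact match.
 */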
static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
			  struct dm_btree_value_type *vt, dm_block_t root,
			  uint64_t key, int *index)
{
	int i = *index, r;
	struct btree_node *n;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));

		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
			*index = lower_bound(n, key);
			return 0;
		}

		r = rebalance_children(s, info, vt, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
			*index = lower_bound(n, key);
			return 0;
		}

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA
		 */
		root = value64(n, i);
	}

	return r;
}

static int remove_one(struct dm_btree_info *info, dm_block_t root,
		      uint64_t *keys, uint64_t end_key,
		      dm_block_t *new_root, unsigned int *nr_removed)
{
	unsigned int level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;
	uint64_t k;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < last_level; level++) {
		r = remove_raw(&spine, info, &le64_vt,
			       root, keys[level], (unsigned int *) &index);
		if (r < 0)
			goto out;

		n = dm_block_data(shadow_current(&spine));
		root = value64(n, index);
	}

	r = remove_nearest(&spine, info, &info->value_type,
			   root, keys[last_level], &index);
	if (r < 0)
		goto out;

	n = dm_block_data(shadow_current(&spine));

	if (index < 0)
		index = 0;

	if (index >= le32_to_cpu(n->header.nr_entries)) {
		r = -ENODATA;
		goto out;
	}

	k = le64_to_cpu(n->keys[index]);
	if (k >= keys[last_level] && k < end_key) {
		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);

		delete_at(n, index);
		keys[last_level] = k + 1ull;

	} else
		r = -ENODATA;

out:
	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}

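/*
 * Removes leaf entries whose keys lie in [*first_key, end_key), one entry
 * per call to remove_one(), advancing *first_key and counting the removals
 * in *nr_removed.  The loop stops when remove_one() returns -ENODATA,
 * which is reported as success.
 */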
int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *first_key, uint64_t end_key,
			   dm_block_t *new_root, unsigned int *nr_removed)
{
	int r;

	*nr_removed = 0;
	do {
		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
		if (!r)
			(*nr_removed)++;
	} while (!r);

	*new_root = root;
	return r == -ENODATA ? 0 : r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);