Linux Audio

Check our new training course

Loading...
v6.8
  1// SPDX-License-Identifier: GPL-2.0-only
  2/*
  3 * Copyright (C) 2011 Red Hat, Inc.
  4 *
  5 * This file is released under the GPL.
  6 */
  7
  8#include "dm-btree.h"
  9#include "dm-btree-internal.h"
 10#include "dm-transaction-manager.h"
 11
 12#include <linux/export.h>
 13#include <linux/device-mapper.h>
 14
 15#define DM_MSG_PREFIX "btree"
 16
 17/*
 18 * Removing an entry from a btree
 19 * ==============================
 20 *
 21 * A very important constraint for our btree is that no node, except the
 22 * root, may have fewer than a certain number of entries.
 23 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 24 *
 25 * Ensuring this is complicated by the way we want to only ever hold the
 26 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 27 * fashion.
 28 *
 29 * Each node may have a left or right sibling.  When decending the spine,
 30 * if a node contains only MIN_ENTRIES then we try and increase this to at
 31 * least MIN_ENTRIES + 1.  We do this in the following ways:
 32 *
 33 * [A] No siblings => this can only happen if the node is the root, in which
 34 *     case we copy the childs contents over the root.
 35 *
 36 * [B] No left sibling
 37 *     ==> rebalance(node, right sibling)
 38 *
 39 * [C] No right sibling
 40 *     ==> rebalance(left sibling, node)
 41 *
 42 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 43 *     ==> delete node adding it's contents to left and right
 44 *
 45 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 46 *     ==> rebalance(left, node, right)
 47 *
 48 * After these operations it's possible that the our original node no
 49 * longer contains the desired sub tree.  For this reason this rebalancing
 50 * is performed on the children of the current node.  This also avoids
 51 * having a special case for the root.
 52 *
 53 * Once this rebalancing has occurred we can then step into the child node
 54 * for internal nodes.  Or delete the entry for leaf nodes.
 55 */
 56
 57/*
 58 * Some little utilities for moving node data around.
 59 */
 60static void node_shift(struct btree_node *n, int shift)
 61{
 62	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
 63	uint32_t value_size = le32_to_cpu(n->header.value_size);
 64
 65	if (shift < 0) {
 66		shift = -shift;
 67		BUG_ON(shift > nr_entries);
 68		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
 69		memmove(key_ptr(n, 0),
 70			key_ptr(n, shift),
 71			(nr_entries - shift) * sizeof(__le64));
 72		memmove(value_ptr(n, 0),
 73			value_ptr(n, shift),
 74			(nr_entries - shift) * value_size);
 75	} else {
 76		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
 77		memmove(key_ptr(n, shift),
 78			key_ptr(n, 0),
 79			nr_entries * sizeof(__le64));
 80		memmove(value_ptr(n, shift),
 81			value_ptr(n, 0),
 82			nr_entries * value_size);
 83	}
 84}
 85
 86static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
 87{
 88	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 89	uint32_t value_size = le32_to_cpu(left->header.value_size);
 90
 91	if (value_size != le32_to_cpu(right->header.value_size)) {
 92		DMERR("mismatched value size");
 93		return -EILSEQ;
 94	}
 95
 96	if (shift < 0) {
 97		shift = -shift;
 98
 99		if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
100			DMERR("bad shift");
101			return -EINVAL;
102		}
103
104		memcpy(key_ptr(left, nr_left),
105		       key_ptr(right, 0),
106		       shift * sizeof(__le64));
107		memcpy(value_ptr(left, nr_left),
108		       value_ptr(right, 0),
109		       shift * value_size);
110	} else {
111		if (shift > le32_to_cpu(right->header.max_entries)) {
112			DMERR("bad shift");
113			return -EINVAL;
114		}
115
116		memcpy(key_ptr(right, 0),
117		       key_ptr(left, nr_left - shift),
118		       shift * sizeof(__le64));
119		memcpy(value_ptr(right, 0),
120		       value_ptr(left, nr_left - shift),
121		       shift * value_size);
122	}
123	return 0;
124}
125
126/*
127 * Delete a specific entry from a leaf node.
128 */
129static void delete_at(struct btree_node *n, unsigned int index)
130{
131	unsigned int nr_entries = le32_to_cpu(n->header.nr_entries);
132	unsigned int nr_to_copy = nr_entries - (index + 1);
133	uint32_t value_size = le32_to_cpu(n->header.value_size);
134
135	BUG_ON(index >= nr_entries);
136
137	if (nr_to_copy) {
138		memmove(key_ptr(n, index),
139			key_ptr(n, index + 1),
140			nr_to_copy * sizeof(__le64));
141
142		memmove(value_ptr(n, index),
143			value_ptr(n, index + 1),
144			nr_to_copy * value_size);
145	}
146
147	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
148}
149
150static unsigned int merge_threshold(struct btree_node *n)
151{
152	return le32_to_cpu(n->header.max_entries) / 3;
153}
154
155struct child {
156	unsigned int index;
157	struct dm_block *block;
158	struct btree_node *n;
159};
160
161static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
162		      struct btree_node *parent,
163		      unsigned int index, struct child *result)
164{
165	int r, inc;
166	dm_block_t root;
167
168	result->index = index;
169	root = value64(parent, index);
170
171	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
172			       &result->block, &inc);
173	if (r)
174		return r;
175
176	result->n = dm_block_data(result->block);
177
178	if (inc)
179		inc_children(info->tm, result->n, vt);
180
181	*((__le64 *) value_ptr(parent, index)) =
182		cpu_to_le64(dm_block_location(result->block));
183
184	return 0;
185}
186
187static void exit_child(struct dm_btree_info *info, struct child *c)
188{
189	dm_tm_unlock(info->tm, c->block);
190}
191
192static int shift(struct btree_node *left, struct btree_node *right, int count)
193{
194	int r;
195	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
196	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
197	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
198	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
199
200	if (max_entries != r_max_entries) {
201		DMERR("node max_entries mismatch");
202		return -EILSEQ;
203	}
204
205	if (nr_left - count > max_entries) {
206		DMERR("node shift out of bounds");
207		return -EINVAL;
208	}
209
210	if (nr_right + count > max_entries) {
211		DMERR("node shift out of bounds");
212		return -EINVAL;
213	}
214
215	if (!count)
216		return 0;
217
218	if (count > 0) {
219		node_shift(right, count);
220		r = node_copy(left, right, count);
221		if (r)
222			return r;
223	} else {
224		r = node_copy(left, right, count);
225		if (r)
226			return r;
227		node_shift(right, count);
228	}
229
230	left->header.nr_entries = cpu_to_le32(nr_left - count);
231	right->header.nr_entries = cpu_to_le32(nr_right + count);
232
233	return 0;
234}
235
236static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
237			struct child *l, struct child *r)
238{
239	int ret;
240	struct btree_node *left = l->n;
241	struct btree_node *right = r->n;
242	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
243	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
244	/*
245	 * Ensure the number of entries in each child will be greater
246	 * than or equal to (max_entries / 3 + 1), so no matter which
247	 * child is used for removal, the number will still be not
248	 * less than (max_entries / 3).
249	 */
250	unsigned int threshold = 2 * (merge_threshold(left) + 1);
251
252	if (nr_left + nr_right < threshold) {
253		/*
254		 * Merge
255		 */
256		node_copy(left, right, -nr_right);
257		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
258		delete_at(parent, r->index);
259
260		/*
261		 * We need to decrement the right block, but not it's
262		 * children, since they're still referenced by left.
263		 */
264		dm_tm_dec(info->tm, dm_block_location(r->block));
265	} else {
266		/*
267		 * Rebalance.
268		 */
269		unsigned int target_left = (nr_left + nr_right) / 2;
270
271		ret = shift(left, right, nr_left - target_left);
272		if (ret)
273			return ret;
274		*key_ptr(parent, r->index) = right->keys[0];
275	}
276	return 0;
277}
278
279static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
280		      struct dm_btree_value_type *vt, unsigned int left_index)
281{
282	int r;
283	struct btree_node *parent;
284	struct child left, right;
285
286	parent = dm_block_data(shadow_current(s));
287
288	r = init_child(info, vt, parent, left_index, &left);
289	if (r)
290		return r;
291
292	r = init_child(info, vt, parent, left_index + 1, &right);
293	if (r) {
294		exit_child(info, &left);
295		return r;
296	}
297
298	r = __rebalance2(info, parent, &left, &right);
299
300	exit_child(info, &left);
301	exit_child(info, &right);
302
303	return r;
304}
305
306/*
307 * We dump as many entries from center as possible into left, then the rest
308 * in right, then rebalance2.  This wastes some cpu, but I want something
309 * simple atm.
310 */
311static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
312			      struct child *l, struct child *c, struct child *r,
313			      struct btree_node *left, struct btree_node *center, struct btree_node *right,
314			      uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
315{
316	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
317	unsigned int shift = min(max_entries - nr_left, nr_center);
318
319	if (nr_left + shift > max_entries) {
320		DMERR("node shift out of bounds");
321		return -EINVAL;
322	}
323
 
324	node_copy(left, center, -shift);
325	left->header.nr_entries = cpu_to_le32(nr_left + shift);
326
327	if (shift != nr_center) {
328		shift = nr_center - shift;
329
330		if ((nr_right + shift) > max_entries) {
331			DMERR("node shift out of bounds");
332			return -EINVAL;
333		}
334
335		node_shift(right, shift);
336		node_copy(center, right, shift);
337		right->header.nr_entries = cpu_to_le32(nr_right + shift);
338	}
339	*key_ptr(parent, r->index) = right->keys[0];
340
341	delete_at(parent, c->index);
342	r->index--;
343
344	dm_tm_dec(info->tm, dm_block_location(c->block));
345	return __rebalance2(info, parent, l, r);
346}
347
348/*
349 * Redistributes entries among 3 sibling nodes.
350 */
351static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
352			 struct child *l, struct child *c, struct child *r,
353			 struct btree_node *left, struct btree_node *center, struct btree_node *right,
354			 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
355{
356	int s, ret;
357	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
358	unsigned int total = nr_left + nr_center + nr_right;
359	unsigned int target_right = total / 3;
360	unsigned int remainder = (target_right * 3) != total;
361	unsigned int target_left = target_right + remainder;
362
363	BUG_ON(target_left > max_entries);
364	BUG_ON(target_right > max_entries);
365
366	if (nr_left < nr_right) {
367		s = nr_left - target_left;
368
369		if (s < 0 && nr_center < -s) {
370			/* not enough in central node */
371			ret = shift(left, center, -nr_center);
372			if (ret)
373				return ret;
374
375			s += nr_center;
376			ret = shift(left, right, s);
377			if (ret)
378				return ret;
379
380			nr_right += s;
381		} else {
382			ret = shift(left, center, s);
383			if (ret)
384				return ret;
385		}
386
387		ret = shift(center, right, target_right - nr_right);
388		if (ret)
389			return ret;
390	} else {
391		s = target_right - nr_right;
392		if (s > 0 && nr_center < s) {
393			/* not enough in central node */
394			ret = shift(center, right, nr_center);
395			if (ret)
396				return ret;
397			s -= nr_center;
398			ret = shift(left, right, s);
399			if (ret)
400				return ret;
401			nr_left -= s;
402		} else {
403			ret = shift(center, right, s);
404			if (ret)
405				return ret;
406		}
407
408		ret = shift(left, center, nr_left - target_left);
409		if (ret)
410			return ret;
411	}
412
413	*key_ptr(parent, c->index) = center->keys[0];
414	*key_ptr(parent, r->index) = right->keys[0];
415	return 0;
416}
417
418static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
419			struct child *l, struct child *c, struct child *r)
420{
421	struct btree_node *left = l->n;
422	struct btree_node *center = c->n;
423	struct btree_node *right = r->n;
424
425	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
426	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
427	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
428
429	unsigned int threshold = merge_threshold(left) * 4 + 1;
430
431	if ((left->header.max_entries != center->header.max_entries) ||
432	    (center->header.max_entries != right->header.max_entries)) {
433		DMERR("bad btree metadata, max_entries differ");
434		return -EILSEQ;
435	}
436
437	if ((nr_left + nr_center + nr_right) < threshold) {
438		return delete_center_node(info, parent, l, c, r, left, center, right,
439					  nr_left, nr_center, nr_right);
440	}
441
442	return redistribute3(info, parent, l, c, r, left, center, right,
443			     nr_left, nr_center, nr_right);
 
 
 
 
444}
445
446static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
447		      struct dm_btree_value_type *vt, unsigned int left_index)
448{
449	int r;
450	struct btree_node *parent = dm_block_data(shadow_current(s));
451	struct child left, center, right;
452
453	/*
454	 * FIXME: fill out an array?
455	 */
456	r = init_child(info, vt, parent, left_index, &left);
457	if (r)
458		return r;
459
460	r = init_child(info, vt, parent, left_index + 1, &center);
461	if (r) {
462		exit_child(info, &left);
463		return r;
464	}
465
466	r = init_child(info, vt, parent, left_index + 2, &right);
467	if (r) {
468		exit_child(info, &left);
469		exit_child(info, &center);
470		return r;
471	}
472
473	r = __rebalance3(info, parent, &left, &center, &right);
474
475	exit_child(info, &left);
476	exit_child(info, &center);
477	exit_child(info, &right);
478
479	return r;
480}
481
482static int rebalance_children(struct shadow_spine *s,
483			      struct dm_btree_info *info,
484			      struct dm_btree_value_type *vt, uint64_t key)
485{
486	int i, r, has_left_sibling, has_right_sibling;
487	struct btree_node *n;
488
489	n = dm_block_data(shadow_current(s));
490
491	if (le32_to_cpu(n->header.nr_entries) == 1) {
492		struct dm_block *child;
493		dm_block_t b = value64(n, 0);
494
495		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
496		if (r)
497			return r;
498
499		memcpy(n, dm_block_data(child),
500		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
 
501
502		dm_tm_dec(info->tm, dm_block_location(child));
503		dm_tm_unlock(info->tm, child);
504		return 0;
505	}
506
507	i = lower_bound(n, key);
508	if (i < 0)
509		return -ENODATA;
510
511	has_left_sibling = i > 0;
512	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
513
514	if (!has_left_sibling)
515		r = rebalance2(s, info, vt, i);
516
517	else if (!has_right_sibling)
518		r = rebalance2(s, info, vt, i - 1);
519
520	else
521		r = rebalance3(s, info, vt, i - 1);
522
523	return r;
524}
525
526static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index)
527{
528	int i = lower_bound(n, key);
529
530	if ((i < 0) ||
531	    (i >= le32_to_cpu(n->header.nr_entries)) ||
532	    (le64_to_cpu(n->keys[i]) != key))
533		return -ENODATA;
534
535	*index = i;
536
537	return 0;
538}
539
540/*
541 * Prepares for removal from one level of the hierarchy.  The caller must
542 * call delete_at() to remove the entry at index.
543 */
544static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
545		      struct dm_btree_value_type *vt, dm_block_t root,
546		      uint64_t key, unsigned int *index)
547{
548	int i = *index, r;
549	struct btree_node *n;
550
551	for (;;) {
552		r = shadow_step(s, root, vt);
553		if (r < 0)
554			break;
555
556		/*
557		 * We have to patch up the parent node, ugly, but I don't
558		 * see a way to do this automatically as part of the spine
559		 * op.
560		 */
561		if (shadow_has_parent(s)) {
562			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
563
564			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
565			       &location, sizeof(__le64));
566		}
567
568		n = dm_block_data(shadow_current(s));
569
570		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
571			return do_leaf(n, key, index);
572
573		r = rebalance_children(s, info, vt, key);
574		if (r)
575			break;
576
577		n = dm_block_data(shadow_current(s));
578		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
579			return do_leaf(n, key, index);
580
581		i = lower_bound(n, key);
582
583		/*
584		 * We know the key is present, or else
585		 * rebalance_children would have returned
586		 * -ENODATA
587		 */
588		root = value64(n, i);
589	}
590
591	return r;
592}
593
594int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
595		    uint64_t *keys, dm_block_t *new_root)
596{
597	unsigned int level, last_level = info->levels - 1;
598	int index = 0, r = 0;
599	struct shadow_spine spine;
600	struct btree_node *n;
601	struct dm_btree_value_type le64_vt;
602
603	init_le64_type(info->tm, &le64_vt);
604	init_shadow_spine(&spine, info);
605	for (level = 0; level < info->levels; level++) {
606		r = remove_raw(&spine, info,
607			       (level == last_level ?
608				&info->value_type : &le64_vt),
609			       root, keys[level], (unsigned int *)&index);
610		if (r < 0)
611			break;
612
613		n = dm_block_data(shadow_current(&spine));
614		if (level != last_level) {
615			root = value64(n, index);
616			continue;
617		}
618
619		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
620
621		if (info->value_type.dec)
622			info->value_type.dec(info->value_type.context,
623					     value_ptr(n, index), 1);
624
625		delete_at(n, index);
626	}
627
628	if (!r)
629		*new_root = shadow_root(&spine);
630	exit_shadow_spine(&spine);
631
632	return r;
633}
634EXPORT_SYMBOL_GPL(dm_btree_remove);
635
636/*----------------------------------------------------------------*/
637
638static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
639			  struct dm_btree_value_type *vt, dm_block_t root,
640			  uint64_t key, int *index)
641{
642	int i = *index, r;
643	struct btree_node *n;
644
645	for (;;) {
646		r = shadow_step(s, root, vt);
647		if (r < 0)
648			break;
649
650		/*
651		 * We have to patch up the parent node, ugly, but I don't
652		 * see a way to do this automatically as part of the spine
653		 * op.
654		 */
655		if (shadow_has_parent(s)) {
656			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
657
658			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
659			       &location, sizeof(__le64));
660		}
661
662		n = dm_block_data(shadow_current(s));
663
664		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
665			*index = lower_bound(n, key);
666			return 0;
667		}
668
669		r = rebalance_children(s, info, vt, key);
670		if (r)
671			break;
672
673		n = dm_block_data(shadow_current(s));
674		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
675			*index = lower_bound(n, key);
676			return 0;
677		}
678
679		i = lower_bound(n, key);
680
681		/*
682		 * We know the key is present, or else
683		 * rebalance_children would have returned
684		 * -ENODATA
685		 */
686		root = value64(n, i);
687	}
688
689	return r;
690}
691
692static int remove_one(struct dm_btree_info *info, dm_block_t root,
693		      uint64_t *keys, uint64_t end_key,
694		      dm_block_t *new_root, unsigned int *nr_removed)
695{
696	unsigned int level, last_level = info->levels - 1;
697	int index = 0, r = 0;
698	struct shadow_spine spine;
699	struct btree_node *n;
700	struct dm_btree_value_type le64_vt;
701	uint64_t k;
702
703	init_le64_type(info->tm, &le64_vt);
704	init_shadow_spine(&spine, info);
705	for (level = 0; level < last_level; level++) {
706		r = remove_raw(&spine, info, &le64_vt,
707			       root, keys[level], (unsigned int *) &index);
708		if (r < 0)
709			goto out;
710
711		n = dm_block_data(shadow_current(&spine));
712		root = value64(n, index);
713	}
714
715	r = remove_nearest(&spine, info, &info->value_type,
716			   root, keys[last_level], &index);
717	if (r < 0)
718		goto out;
719
720	n = dm_block_data(shadow_current(&spine));
721
722	if (index < 0)
723		index = 0;
724
725	if (index >= le32_to_cpu(n->header.nr_entries)) {
726		r = -ENODATA;
727		goto out;
728	}
729
730	k = le64_to_cpu(n->keys[index]);
731	if (k >= keys[last_level] && k < end_key) {
732		if (info->value_type.dec)
733			info->value_type.dec(info->value_type.context,
734					     value_ptr(n, index), 1);
735
736		delete_at(n, index);
737		keys[last_level] = k + 1ull;
738
739	} else
740		r = -ENODATA;
741
742out:
743	*new_root = shadow_root(&spine);
744	exit_shadow_spine(&spine);
745
746	return r;
747}
748
749int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
750			   uint64_t *first_key, uint64_t end_key,
751			   dm_block_t *new_root, unsigned int *nr_removed)
752{
753	int r;
754
755	*nr_removed = 0;
756	do {
757		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
758		if (!r)
759			(*nr_removed)++;
760	} while (!r);
761
762	*new_root = root;
763	return r == -ENODATA ? 0 : r;
764}
765EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
v5.9
 
  1/*
  2 * Copyright (C) 2011 Red Hat, Inc.
  3 *
  4 * This file is released under the GPL.
  5 */
  6
  7#include "dm-btree.h"
  8#include "dm-btree-internal.h"
  9#include "dm-transaction-manager.h"
 10
 11#include <linux/export.h>
 
 
 
 12
 13/*
 14 * Removing an entry from a btree
 15 * ==============================
 16 *
 17 * A very important constraint for our btree is that no node, except the
 18 * root, may have fewer than a certain number of entries.
 19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 20 *
 21 * Ensuring this is complicated by the way we want to only ever hold the
 22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 23 * fashion.
 24 *
 25 * Each node may have a left or right sibling.  When decending the spine,
 26 * if a node contains only MIN_ENTRIES then we try and increase this to at
 27 * least MIN_ENTRIES + 1.  We do this in the following ways:
 28 *
 29 * [A] No siblings => this can only happen if the node is the root, in which
 30 *     case we copy the childs contents over the root.
 31 *
 32 * [B] No left sibling
 33 *     ==> rebalance(node, right sibling)
 34 *
 35 * [C] No right sibling
 36 *     ==> rebalance(left sibling, node)
 37 *
 38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 39 *     ==> delete node adding it's contents to left and right
 40 *
 41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 42 *     ==> rebalance(left, node, right)
 43 *
 44 * After these operations it's possible that the our original node no
 45 * longer contains the desired sub tree.  For this reason this rebalancing
 46 * is performed on the children of the current node.  This also avoids
 47 * having a special case for the root.
 48 *
 49 * Once this rebalancing has occurred we can then step into the child node
 50 * for internal nodes.  Or delete the entry for leaf nodes.
 51 */
 52
 53/*
 54 * Some little utilities for moving node data around.
 55 */
 56static void node_shift(struct btree_node *n, int shift)
 57{
 58	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
 59	uint32_t value_size = le32_to_cpu(n->header.value_size);
 60
 61	if (shift < 0) {
 62		shift = -shift;
 63		BUG_ON(shift > nr_entries);
 64		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
 65		memmove(key_ptr(n, 0),
 66			key_ptr(n, shift),
 67			(nr_entries - shift) * sizeof(__le64));
 68		memmove(value_ptr(n, 0),
 69			value_ptr(n, shift),
 70			(nr_entries - shift) * value_size);
 71	} else {
 72		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
 73		memmove(key_ptr(n, shift),
 74			key_ptr(n, 0),
 75			nr_entries * sizeof(__le64));
 76		memmove(value_ptr(n, shift),
 77			value_ptr(n, 0),
 78			nr_entries * value_size);
 79	}
 80}
 81
 82static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
 83{
 84	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 85	uint32_t value_size = le32_to_cpu(left->header.value_size);
 86	BUG_ON(value_size != le32_to_cpu(right->header.value_size));
 
 
 
 
 87
 88	if (shift < 0) {
 89		shift = -shift;
 90		BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
 
 
 
 
 
 91		memcpy(key_ptr(left, nr_left),
 92		       key_ptr(right, 0),
 93		       shift * sizeof(__le64));
 94		memcpy(value_ptr(left, nr_left),
 95		       value_ptr(right, 0),
 96		       shift * value_size);
 97	} else {
 98		BUG_ON(shift > le32_to_cpu(right->header.max_entries));
 
 
 
 
 99		memcpy(key_ptr(right, 0),
100		       key_ptr(left, nr_left - shift),
101		       shift * sizeof(__le64));
102		memcpy(value_ptr(right, 0),
103		       value_ptr(left, nr_left - shift),
104		       shift * value_size);
105	}
 
106}
107
108/*
109 * Delete a specific entry from a leaf node.
110 */
111static void delete_at(struct btree_node *n, unsigned index)
112{
113	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
114	unsigned nr_to_copy = nr_entries - (index + 1);
115	uint32_t value_size = le32_to_cpu(n->header.value_size);
 
116	BUG_ON(index >= nr_entries);
117
118	if (nr_to_copy) {
119		memmove(key_ptr(n, index),
120			key_ptr(n, index + 1),
121			nr_to_copy * sizeof(__le64));
122
123		memmove(value_ptr(n, index),
124			value_ptr(n, index + 1),
125			nr_to_copy * value_size);
126	}
127
128	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129}
130
131static unsigned merge_threshold(struct btree_node *n)
132{
133	return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136struct child {
137	unsigned index;
138	struct dm_block *block;
139	struct btree_node *n;
140};
141
142static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
143		      struct btree_node *parent,
144		      unsigned index, struct child *result)
145{
146	int r, inc;
147	dm_block_t root;
148
149	result->index = index;
150	root = value64(parent, index);
151
152	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
153			       &result->block, &inc);
154	if (r)
155		return r;
156
157	result->n = dm_block_data(result->block);
158
159	if (inc)
160		inc_children(info->tm, result->n, vt);
161
162	*((__le64 *) value_ptr(parent, index)) =
163		cpu_to_le64(dm_block_location(result->block));
164
165	return 0;
166}
167
168static void exit_child(struct dm_btree_info *info, struct child *c)
169{
170	dm_tm_unlock(info->tm, c->block);
171}
172
173static void shift(struct btree_node *left, struct btree_node *right, int count)
174{
 
175	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
176	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
177	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
178	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
179
180	BUG_ON(max_entries != r_max_entries);
181	BUG_ON(nr_left - count > max_entries);
182	BUG_ON(nr_right + count > max_entries);
 
 
 
 
 
 
 
 
 
 
 
183
184	if (!count)
185		return;
186
187	if (count > 0) {
188		node_shift(right, count);
189		node_copy(left, right, count);
 
 
190	} else {
191		node_copy(left, right, count);
 
 
192		node_shift(right, count);
193	}
194
195	left->header.nr_entries = cpu_to_le32(nr_left - count);
196	right->header.nr_entries = cpu_to_le32(nr_right + count);
 
 
197}
198
199static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
200			 struct child *l, struct child *r)
201{
 
202	struct btree_node *left = l->n;
203	struct btree_node *right = r->n;
204	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
205	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
206	/*
207	 * Ensure the number of entries in each child will be greater
208	 * than or equal to (max_entries / 3 + 1), so no matter which
209	 * child is used for removal, the number will still be not
210	 * less than (max_entries / 3).
211	 */
212	unsigned int threshold = 2 * (merge_threshold(left) + 1);
213
214	if (nr_left + nr_right < threshold) {
215		/*
216		 * Merge
217		 */
218		node_copy(left, right, -nr_right);
219		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
220		delete_at(parent, r->index);
221
222		/*
223		 * We need to decrement the right block, but not it's
224		 * children, since they're still referenced by left.
225		 */
226		dm_tm_dec(info->tm, dm_block_location(r->block));
227	} else {
228		/*
229		 * Rebalance.
230		 */
231		unsigned target_left = (nr_left + nr_right) / 2;
232		shift(left, right, nr_left - target_left);
 
 
 
233		*key_ptr(parent, r->index) = right->keys[0];
234	}
 
235}
236
237static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
238		      struct dm_btree_value_type *vt, unsigned left_index)
239{
240	int r;
241	struct btree_node *parent;
242	struct child left, right;
243
244	parent = dm_block_data(shadow_current(s));
245
246	r = init_child(info, vt, parent, left_index, &left);
247	if (r)
248		return r;
249
250	r = init_child(info, vt, parent, left_index + 1, &right);
251	if (r) {
252		exit_child(info, &left);
253		return r;
254	}
255
256	__rebalance2(info, parent, &left, &right);
257
258	exit_child(info, &left);
259	exit_child(info, &right);
260
261	return 0;
262}
263
264/*
265 * We dump as many entries from center as possible into left, then the rest
266 * in right, then rebalance2.  This wastes some cpu, but I want something
267 * simple atm.
268 */
269static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
270			       struct child *l, struct child *c, struct child *r,
271			       struct btree_node *left, struct btree_node *center, struct btree_node *right,
272			       uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
273{
274	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
275	unsigned shift = min(max_entries - nr_left, nr_center);
 
 
 
 
 
276
277	BUG_ON(nr_left + shift > max_entries);
278	node_copy(left, center, -shift);
279	left->header.nr_entries = cpu_to_le32(nr_left + shift);
280
281	if (shift != nr_center) {
282		shift = nr_center - shift;
283		BUG_ON((nr_right + shift) > max_entries);
 
 
 
 
 
284		node_shift(right, shift);
285		node_copy(center, right, shift);
286		right->header.nr_entries = cpu_to_le32(nr_right + shift);
287	}
288	*key_ptr(parent, r->index) = right->keys[0];
289
290	delete_at(parent, c->index);
291	r->index--;
292
293	dm_tm_dec(info->tm, dm_block_location(c->block));
294	__rebalance2(info, parent, l, r);
295}
296
297/*
298 * Redistributes entries among 3 sibling nodes.
299 */
300static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
301			  struct child *l, struct child *c, struct child *r,
302			  struct btree_node *left, struct btree_node *center, struct btree_node *right,
303			  uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
304{
305	int s;
306	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
307	unsigned total = nr_left + nr_center + nr_right;
308	unsigned target_right = total / 3;
309	unsigned remainder = (target_right * 3) != total;
310	unsigned target_left = target_right + remainder;
311
312	BUG_ON(target_left > max_entries);
313	BUG_ON(target_right > max_entries);
314
315	if (nr_left < nr_right) {
316		s = nr_left - target_left;
317
318		if (s < 0 && nr_center < -s) {
319			/* not enough in central node */
320			shift(left, center, -nr_center);
 
 
 
321			s += nr_center;
322			shift(left, right, s);
 
 
 
323			nr_right += s;
324		} else
325			shift(left, center, s);
326
327		shift(center, right, target_right - nr_right);
 
328
 
 
 
329	} else {
330		s = target_right - nr_right;
331		if (s > 0 && nr_center < s) {
332			/* not enough in central node */
333			shift(center, right, nr_center);
 
 
334			s -= nr_center;
335			shift(left, right, s);
 
 
336			nr_left -= s;
337		} else
338			shift(center, right, s);
 
 
 
339
340		shift(left, center, nr_left - target_left);
 
 
341	}
342
343	*key_ptr(parent, c->index) = center->keys[0];
344	*key_ptr(parent, r->index) = right->keys[0];
 
345}
346
347static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
348			 struct child *l, struct child *c, struct child *r)
349{
350	struct btree_node *left = l->n;
351	struct btree_node *center = c->n;
352	struct btree_node *right = r->n;
353
354	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
355	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
356	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
357
358	unsigned threshold = merge_threshold(left) * 4 + 1;
 
 
 
 
 
 
359
360	BUG_ON(left->header.max_entries != center->header.max_entries);
361	BUG_ON(center->header.max_entries != right->header.max_entries);
 
 
362
363	if ((nr_left + nr_center + nr_right) < threshold)
364		delete_center_node(info, parent, l, c, r, left, center, right,
365				   nr_left, nr_center, nr_right);
366	else
367		redistribute3(info, parent, l, c, r, left, center, right,
368			      nr_left, nr_center, nr_right);
369}
370
371static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
372		      struct dm_btree_value_type *vt, unsigned left_index)
373{
374	int r;
375	struct btree_node *parent = dm_block_data(shadow_current(s));
376	struct child left, center, right;
377
378	/*
379	 * FIXME: fill out an array?
380	 */
381	r = init_child(info, vt, parent, left_index, &left);
382	if (r)
383		return r;
384
385	r = init_child(info, vt, parent, left_index + 1, &center);
386	if (r) {
387		exit_child(info, &left);
388		return r;
389	}
390
391	r = init_child(info, vt, parent, left_index + 2, &right);
392	if (r) {
393		exit_child(info, &left);
394		exit_child(info, &center);
395		return r;
396	}
397
398	__rebalance3(info, parent, &left, &center, &right);
399
400	exit_child(info, &left);
401	exit_child(info, &center);
402	exit_child(info, &right);
403
404	return 0;
405}
406
407static int rebalance_children(struct shadow_spine *s,
408			      struct dm_btree_info *info,
409			      struct dm_btree_value_type *vt, uint64_t key)
410{
411	int i, r, has_left_sibling, has_right_sibling;
412	struct btree_node *n;
413
414	n = dm_block_data(shadow_current(s));
415
416	if (le32_to_cpu(n->header.nr_entries) == 1) {
417		struct dm_block *child;
418		dm_block_t b = value64(n, 0);
419
420		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
421		if (r)
422			return r;
423
424		memcpy(n, dm_block_data(child),
425		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
426		dm_tm_unlock(info->tm, child);
427
428		dm_tm_dec(info->tm, dm_block_location(child));
 
429		return 0;
430	}
431
432	i = lower_bound(n, key);
433	if (i < 0)
434		return -ENODATA;
435
436	has_left_sibling = i > 0;
437	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
438
439	if (!has_left_sibling)
440		r = rebalance2(s, info, vt, i);
441
442	else if (!has_right_sibling)
443		r = rebalance2(s, info, vt, i - 1);
444
445	else
446		r = rebalance3(s, info, vt, i - 1);
447
448	return r;
449}
450
451static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
452{
453	int i = lower_bound(n, key);
454
455	if ((i < 0) ||
456	    (i >= le32_to_cpu(n->header.nr_entries)) ||
457	    (le64_to_cpu(n->keys[i]) != key))
458		return -ENODATA;
459
460	*index = i;
461
462	return 0;
463}
464
465/*
466 * Prepares for removal from one level of the hierarchy.  The caller must
467 * call delete_at() to remove the entry at index.
468 */
469static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
470		      struct dm_btree_value_type *vt, dm_block_t root,
471		      uint64_t key, unsigned *index)
472{
473	int i = *index, r;
474	struct btree_node *n;
475
476	for (;;) {
477		r = shadow_step(s, root, vt);
478		if (r < 0)
479			break;
480
481		/*
482		 * We have to patch up the parent node, ugly, but I don't
483		 * see a way to do this automatically as part of the spine
484		 * op.
485		 */
486		if (shadow_has_parent(s)) {
487			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 
488			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
489			       &location, sizeof(__le64));
490		}
491
492		n = dm_block_data(shadow_current(s));
493
494		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
495			return do_leaf(n, key, index);
496
497		r = rebalance_children(s, info, vt, key);
498		if (r)
499			break;
500
501		n = dm_block_data(shadow_current(s));
502		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
503			return do_leaf(n, key, index);
504
505		i = lower_bound(n, key);
506
507		/*
508		 * We know the key is present, or else
509		 * rebalance_children would have returned
510		 * -ENODATA
511		 */
512		root = value64(n, i);
513	}
514
515	return r;
516}
517
518int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
519		    uint64_t *keys, dm_block_t *new_root)
520{
521	unsigned level, last_level = info->levels - 1;
522	int index = 0, r = 0;
523	struct shadow_spine spine;
524	struct btree_node *n;
525	struct dm_btree_value_type le64_vt;
526
527	init_le64_type(info->tm, &le64_vt);
528	init_shadow_spine(&spine, info);
529	for (level = 0; level < info->levels; level++) {
530		r = remove_raw(&spine, info,
531			       (level == last_level ?
532				&info->value_type : &le64_vt),
533			       root, keys[level], (unsigned *)&index);
534		if (r < 0)
535			break;
536
537		n = dm_block_data(shadow_current(&spine));
538		if (level != last_level) {
539			root = value64(n, index);
540			continue;
541		}
542
543		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
544
545		if (info->value_type.dec)
546			info->value_type.dec(info->value_type.context,
547					     value_ptr(n, index));
548
549		delete_at(n, index);
550	}
551
552	*new_root = shadow_root(&spine);
 
553	exit_shadow_spine(&spine);
554
555	return r;
556}
557EXPORT_SYMBOL_GPL(dm_btree_remove);
558
559/*----------------------------------------------------------------*/
560
561static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
562			  struct dm_btree_value_type *vt, dm_block_t root,
563			  uint64_t key, int *index)
564{
565	int i = *index, r;
566	struct btree_node *n;
567
568	for (;;) {
569		r = shadow_step(s, root, vt);
570		if (r < 0)
571			break;
572
573		/*
574		 * We have to patch up the parent node, ugly, but I don't
575		 * see a way to do this automatically as part of the spine
576		 * op.
577		 */
578		if (shadow_has_parent(s)) {
579			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 
580			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
581			       &location, sizeof(__le64));
582		}
583
584		n = dm_block_data(shadow_current(s));
585
586		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
587			*index = lower_bound(n, key);
588			return 0;
589		}
590
591		r = rebalance_children(s, info, vt, key);
592		if (r)
593			break;
594
595		n = dm_block_data(shadow_current(s));
596		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
597			*index = lower_bound(n, key);
598			return 0;
599		}
600
601		i = lower_bound(n, key);
602
603		/*
604		 * We know the key is present, or else
605		 * rebalance_children would have returned
606		 * -ENODATA
607		 */
608		root = value64(n, i);
609	}
610
611	return r;
612}
613
614static int remove_one(struct dm_btree_info *info, dm_block_t root,
615		      uint64_t *keys, uint64_t end_key,
616		      dm_block_t *new_root, unsigned *nr_removed)
617{
618	unsigned level, last_level = info->levels - 1;
619	int index = 0, r = 0;
620	struct shadow_spine spine;
621	struct btree_node *n;
622	struct dm_btree_value_type le64_vt;
623	uint64_t k;
624
625	init_le64_type(info->tm, &le64_vt);
626	init_shadow_spine(&spine, info);
627	for (level = 0; level < last_level; level++) {
628		r = remove_raw(&spine, info, &le64_vt,
629			       root, keys[level], (unsigned *) &index);
630		if (r < 0)
631			goto out;
632
633		n = dm_block_data(shadow_current(&spine));
634		root = value64(n, index);
635	}
636
637	r = remove_nearest(&spine, info, &info->value_type,
638			   root, keys[last_level], &index);
639	if (r < 0)
640		goto out;
641
642	n = dm_block_data(shadow_current(&spine));
643
644	if (index < 0)
645		index = 0;
646
647	if (index >= le32_to_cpu(n->header.nr_entries)) {
648		r = -ENODATA;
649		goto out;
650	}
651
652	k = le64_to_cpu(n->keys[index]);
653	if (k >= keys[last_level] && k < end_key) {
654		if (info->value_type.dec)
655			info->value_type.dec(info->value_type.context,
656					     value_ptr(n, index));
657
658		delete_at(n, index);
659		keys[last_level] = k + 1ull;
660
661	} else
662		r = -ENODATA;
663
664out:
665	*new_root = shadow_root(&spine);
666	exit_shadow_spine(&spine);
667
668	return r;
669}
670
671int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
672			   uint64_t *first_key, uint64_t end_key,
673			   dm_block_t *new_root, unsigned *nr_removed)
674{
675	int r;
676
677	*nr_removed = 0;
678	do {
679		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
680		if (!r)
681			(*nr_removed)++;
682	} while (!r);
683
684	*new_root = root;
685	return r == -ENODATA ? 0 : r;
686}
687EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);