Linux Audio

Check our new training course

Loading...
v6.8
  1// SPDX-License-Identifier: GPL-2.0-only
  2/*
  3 * Copyright (C) 2011 Red Hat, Inc.
  4 *
  5 * This file is released under the GPL.
  6 */
  7
  8#include "dm-btree.h"
  9#include "dm-btree-internal.h"
 10#include "dm-transaction-manager.h"
 11
 12#include <linux/export.h>
 13#include <linux/device-mapper.h>
 14
 15#define DM_MSG_PREFIX "btree"
 16
 17/*
 18 * Removing an entry from a btree
 19 * ==============================
 20 *
 21 * A very important constraint for our btree is that no node, except the
 22 * root, may have fewer than a certain number of entries.
 23 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 24 *
 25 * Ensuring this is complicated by the way we want to only ever hold the
 26 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 27 * fashion.
 28 *
 29 * Each node may have a left or right sibling.  When decending the spine,
 30 * if a node contains only MIN_ENTRIES then we try and increase this to at
 31 * least MIN_ENTRIES + 1.  We do this in the following ways:
 32 *
 33 * [A] No siblings => this can only happen if the node is the root, in which
 34 *     case we copy the childs contents over the root.
 35 *
 36 * [B] No left sibling
 37 *     ==> rebalance(node, right sibling)
 38 *
 39 * [C] No right sibling
 40 *     ==> rebalance(left sibling, node)
 41 *
 42 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 43 *     ==> delete node adding it's contents to left and right
 44 *
 45 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 46 *     ==> rebalance(left, node, right)
 47 *
 48 * After these operations it's possible that the our original node no
 49 * longer contains the desired sub tree.  For this reason this rebalancing
 50 * is performed on the children of the current node.  This also avoids
 51 * having a special case for the root.
 52 *
 53 * Once this rebalancing has occurred we can then step into the child node
 54 * for internal nodes.  Or delete the entry for leaf nodes.
 55 */
 56
 57/*
 58 * Some little utilities for moving node data around.
 59 */
 60static void node_shift(struct btree_node *n, int shift)
 61{
 62	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
 63	uint32_t value_size = le32_to_cpu(n->header.value_size);
 64
 65	if (shift < 0) {
 66		shift = -shift;
 67		BUG_ON(shift > nr_entries);
 68		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
 69		memmove(key_ptr(n, 0),
 70			key_ptr(n, shift),
 71			(nr_entries - shift) * sizeof(__le64));
 72		memmove(value_ptr(n, 0),
 73			value_ptr(n, shift),
 74			(nr_entries - shift) * value_size);
 75	} else {
 76		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
 77		memmove(key_ptr(n, shift),
 78			key_ptr(n, 0),
 79			nr_entries * sizeof(__le64));
 80		memmove(value_ptr(n, shift),
 81			value_ptr(n, 0),
 82			nr_entries * value_size);
 83	}
 84}
 85
 86static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
 87{
 88	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 89	uint32_t value_size = le32_to_cpu(left->header.value_size);
 90
 91	if (value_size != le32_to_cpu(right->header.value_size)) {
 92		DMERR("mismatched value size");
 93		return -EILSEQ;
 94	}
 95
 96	if (shift < 0) {
 97		shift = -shift;
 98
 99		if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
100			DMERR("bad shift");
101			return -EINVAL;
102		}
103
104		memcpy(key_ptr(left, nr_left),
105		       key_ptr(right, 0),
106		       shift * sizeof(__le64));
107		memcpy(value_ptr(left, nr_left),
108		       value_ptr(right, 0),
109		       shift * value_size);
110	} else {
111		if (shift > le32_to_cpu(right->header.max_entries)) {
112			DMERR("bad shift");
113			return -EINVAL;
114		}
115
116		memcpy(key_ptr(right, 0),
117		       key_ptr(left, nr_left - shift),
118		       shift * sizeof(__le64));
119		memcpy(value_ptr(right, 0),
120		       value_ptr(left, nr_left - shift),
121		       shift * value_size);
122	}
123	return 0;
124}
125
126/*
127 * Delete a specific entry from a leaf node.
128 */
129static void delete_at(struct btree_node *n, unsigned int index)
130{
131	unsigned int nr_entries = le32_to_cpu(n->header.nr_entries);
132	unsigned int nr_to_copy = nr_entries - (index + 1);
133	uint32_t value_size = le32_to_cpu(n->header.value_size);
134
135	BUG_ON(index >= nr_entries);
136
137	if (nr_to_copy) {
138		memmove(key_ptr(n, index),
139			key_ptr(n, index + 1),
140			nr_to_copy * sizeof(__le64));
141
142		memmove(value_ptr(n, index),
143			value_ptr(n, index + 1),
144			nr_to_copy * value_size);
145	}
146
147	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
148}
149
150static unsigned int merge_threshold(struct btree_node *n)
151{
152	return le32_to_cpu(n->header.max_entries) / 3;
153}
154
155struct child {
156	unsigned int index;
157	struct dm_block *block;
158	struct btree_node *n;
159};
160
161static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
162		      struct btree_node *parent,
163		      unsigned int index, struct child *result)
164{
165	int r, inc;
166	dm_block_t root;
167
168	result->index = index;
169	root = value64(parent, index);
170
171	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
172			       &result->block, &inc);
173	if (r)
174		return r;
175
176	result->n = dm_block_data(result->block);
177
178	if (inc)
179		inc_children(info->tm, result->n, vt);
180
181	*((__le64 *) value_ptr(parent, index)) =
182		cpu_to_le64(dm_block_location(result->block));
183
184	return 0;
185}
186
187static void exit_child(struct dm_btree_info *info, struct child *c)
188{
189	dm_tm_unlock(info->tm, c->block);
190}
191
192static int shift(struct btree_node *left, struct btree_node *right, int count)
193{
194	int r;
195	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
196	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
197	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
198	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
199
200	if (max_entries != r_max_entries) {
201		DMERR("node max_entries mismatch");
202		return -EILSEQ;
203	}
204
205	if (nr_left - count > max_entries) {
206		DMERR("node shift out of bounds");
207		return -EINVAL;
208	}
209
210	if (nr_right + count > max_entries) {
211		DMERR("node shift out of bounds");
212		return -EINVAL;
213	}
214
215	if (!count)
216		return 0;
217
218	if (count > 0) {
219		node_shift(right, count);
220		r = node_copy(left, right, count);
221		if (r)
222			return r;
223	} else {
224		r = node_copy(left, right, count);
225		if (r)
226			return r;
227		node_shift(right, count);
228	}
229
230	left->header.nr_entries = cpu_to_le32(nr_left - count);
231	right->header.nr_entries = cpu_to_le32(nr_right + count);
232
233	return 0;
234}
235
236static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
237			struct child *l, struct child *r)
238{
239	int ret;
240	struct btree_node *left = l->n;
241	struct btree_node *right = r->n;
242	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
243	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
244	/*
245	 * Ensure the number of entries in each child will be greater
246	 * than or equal to (max_entries / 3 + 1), so no matter which
247	 * child is used for removal, the number will still be not
248	 * less than (max_entries / 3).
249	 */
250	unsigned int threshold = 2 * (merge_threshold(left) + 1);
251
252	if (nr_left + nr_right < threshold) {
253		/*
254		 * Merge
255		 */
256		node_copy(left, right, -nr_right);
257		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
258		delete_at(parent, r->index);
259
260		/*
261		 * We need to decrement the right block, but not it's
262		 * children, since they're still referenced by left.
263		 */
264		dm_tm_dec(info->tm, dm_block_location(r->block));
265	} else {
266		/*
267		 * Rebalance.
268		 */
269		unsigned int target_left = (nr_left + nr_right) / 2;
270
271		ret = shift(left, right, nr_left - target_left);
272		if (ret)
273			return ret;
274		*key_ptr(parent, r->index) = right->keys[0];
275	}
276	return 0;
277}
278
279static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
280		      struct dm_btree_value_type *vt, unsigned int left_index)
281{
282	int r;
283	struct btree_node *parent;
284	struct child left, right;
285
286	parent = dm_block_data(shadow_current(s));
287
288	r = init_child(info, vt, parent, left_index, &left);
289	if (r)
290		return r;
291
292	r = init_child(info, vt, parent, left_index + 1, &right);
293	if (r) {
294		exit_child(info, &left);
295		return r;
296	}
297
298	r = __rebalance2(info, parent, &left, &right);
299
300	exit_child(info, &left);
301	exit_child(info, &right);
302
303	return r;
304}
305
306/*
307 * We dump as many entries from center as possible into left, then the rest
308 * in right, then rebalance2.  This wastes some cpu, but I want something
309 * simple atm.
310 */
311static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
312			      struct child *l, struct child *c, struct child *r,
313			      struct btree_node *left, struct btree_node *center, struct btree_node *right,
314			      uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
315{
316	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
317	unsigned int shift = min(max_entries - nr_left, nr_center);
318
319	if (nr_left + shift > max_entries) {
320		DMERR("node shift out of bounds");
321		return -EINVAL;
322	}
323
 
324	node_copy(left, center, -shift);
325	left->header.nr_entries = cpu_to_le32(nr_left + shift);
326
327	if (shift != nr_center) {
328		shift = nr_center - shift;
329
330		if ((nr_right + shift) > max_entries) {
331			DMERR("node shift out of bounds");
332			return -EINVAL;
333		}
334
335		node_shift(right, shift);
336		node_copy(center, right, shift);
337		right->header.nr_entries = cpu_to_le32(nr_right + shift);
338	}
339	*key_ptr(parent, r->index) = right->keys[0];
340
341	delete_at(parent, c->index);
342	r->index--;
343
344	dm_tm_dec(info->tm, dm_block_location(c->block));
345	return __rebalance2(info, parent, l, r);
346}
347
348/*
349 * Redistributes entries among 3 sibling nodes.
350 */
351static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
352			 struct child *l, struct child *c, struct child *r,
353			 struct btree_node *left, struct btree_node *center, struct btree_node *right,
354			 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
355{
356	int s, ret;
357	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
358	unsigned int total = nr_left + nr_center + nr_right;
359	unsigned int target_right = total / 3;
360	unsigned int remainder = (target_right * 3) != total;
361	unsigned int target_left = target_right + remainder;
362
363	BUG_ON(target_left > max_entries);
364	BUG_ON(target_right > max_entries);
365
366	if (nr_left < nr_right) {
367		s = nr_left - target_left;
368
369		if (s < 0 && nr_center < -s) {
370			/* not enough in central node */
371			ret = shift(left, center, -nr_center);
372			if (ret)
373				return ret;
374
375			s += nr_center;
376			ret = shift(left, right, s);
377			if (ret)
378				return ret;
379
380			nr_right += s;
381		} else {
382			ret = shift(left, center, s);
383			if (ret)
384				return ret;
385		}
386
387		ret = shift(center, right, target_right - nr_right);
388		if (ret)
389			return ret;
390	} else {
391		s = target_right - nr_right;
392		if (s > 0 && nr_center < s) {
393			/* not enough in central node */
394			ret = shift(center, right, nr_center);
395			if (ret)
396				return ret;
397			s -= nr_center;
398			ret = shift(left, right, s);
399			if (ret)
400				return ret;
401			nr_left -= s;
402		} else {
403			ret = shift(center, right, s);
404			if (ret)
405				return ret;
406		}
407
408		ret = shift(left, center, nr_left - target_left);
409		if (ret)
410			return ret;
411	}
412
413	*key_ptr(parent, c->index) = center->keys[0];
414	*key_ptr(parent, r->index) = right->keys[0];
415	return 0;
416}
417
418static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
419			struct child *l, struct child *c, struct child *r)
420{
421	struct btree_node *left = l->n;
422	struct btree_node *center = c->n;
423	struct btree_node *right = r->n;
424
425	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
426	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
427	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
428
429	unsigned int threshold = merge_threshold(left) * 4 + 1;
430
431	if ((left->header.max_entries != center->header.max_entries) ||
432	    (center->header.max_entries != right->header.max_entries)) {
433		DMERR("bad btree metadata, max_entries differ");
434		return -EILSEQ;
435	}
436
437	if ((nr_left + nr_center + nr_right) < threshold) {
438		return delete_center_node(info, parent, l, c, r, left, center, right,
439					  nr_left, nr_center, nr_right);
440	}
441
442	return redistribute3(info, parent, l, c, r, left, center, right,
443			     nr_left, nr_center, nr_right);
 
 
 
 
444}
445
446static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
447		      struct dm_btree_value_type *vt, unsigned int left_index)
448{
449	int r;
450	struct btree_node *parent = dm_block_data(shadow_current(s));
451	struct child left, center, right;
452
453	/*
454	 * FIXME: fill out an array?
455	 */
456	r = init_child(info, vt, parent, left_index, &left);
457	if (r)
458		return r;
459
460	r = init_child(info, vt, parent, left_index + 1, &center);
461	if (r) {
462		exit_child(info, &left);
463		return r;
464	}
465
466	r = init_child(info, vt, parent, left_index + 2, &right);
467	if (r) {
468		exit_child(info, &left);
469		exit_child(info, &center);
470		return r;
471	}
472
473	r = __rebalance3(info, parent, &left, &center, &right);
474
475	exit_child(info, &left);
476	exit_child(info, &center);
477	exit_child(info, &right);
478
479	return r;
480}
481
482static int rebalance_children(struct shadow_spine *s,
483			      struct dm_btree_info *info,
484			      struct dm_btree_value_type *vt, uint64_t key)
485{
486	int i, r, has_left_sibling, has_right_sibling;
487	struct btree_node *n;
488
489	n = dm_block_data(shadow_current(s));
490
491	if (le32_to_cpu(n->header.nr_entries) == 1) {
492		struct dm_block *child;
493		dm_block_t b = value64(n, 0);
494
495		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
496		if (r)
497			return r;
498
499		memcpy(n, dm_block_data(child),
500		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
 
501
502		dm_tm_dec(info->tm, dm_block_location(child));
503		dm_tm_unlock(info->tm, child);
504		return 0;
505	}
506
507	i = lower_bound(n, key);
508	if (i < 0)
509		return -ENODATA;
510
511	has_left_sibling = i > 0;
512	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
513
514	if (!has_left_sibling)
515		r = rebalance2(s, info, vt, i);
516
517	else if (!has_right_sibling)
518		r = rebalance2(s, info, vt, i - 1);
519
520	else
521		r = rebalance3(s, info, vt, i - 1);
522
523	return r;
524}
525
526static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index)
527{
528	int i = lower_bound(n, key);
529
530	if ((i < 0) ||
531	    (i >= le32_to_cpu(n->header.nr_entries)) ||
532	    (le64_to_cpu(n->keys[i]) != key))
533		return -ENODATA;
534
535	*index = i;
536
537	return 0;
538}
539
540/*
541 * Prepares for removal from one level of the hierarchy.  The caller must
542 * call delete_at() to remove the entry at index.
543 */
544static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
545		      struct dm_btree_value_type *vt, dm_block_t root,
546		      uint64_t key, unsigned int *index)
547{
548	int i = *index, r;
549	struct btree_node *n;
550
551	for (;;) {
552		r = shadow_step(s, root, vt);
553		if (r < 0)
554			break;
555
556		/*
557		 * We have to patch up the parent node, ugly, but I don't
558		 * see a way to do this automatically as part of the spine
559		 * op.
560		 */
561		if (shadow_has_parent(s)) {
562			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
563
564			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
565			       &location, sizeof(__le64));
566		}
567
568		n = dm_block_data(shadow_current(s));
569
570		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
571			return do_leaf(n, key, index);
572
573		r = rebalance_children(s, info, vt, key);
574		if (r)
575			break;
576
577		n = dm_block_data(shadow_current(s));
578		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
579			return do_leaf(n, key, index);
580
581		i = lower_bound(n, key);
582
583		/*
584		 * We know the key is present, or else
585		 * rebalance_children would have returned
586		 * -ENODATA
587		 */
588		root = value64(n, i);
589	}
590
591	return r;
592}
593
594int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
595		    uint64_t *keys, dm_block_t *new_root)
596{
597	unsigned int level, last_level = info->levels - 1;
598	int index = 0, r = 0;
599	struct shadow_spine spine;
600	struct btree_node *n;
601	struct dm_btree_value_type le64_vt;
602
603	init_le64_type(info->tm, &le64_vt);
604	init_shadow_spine(&spine, info);
605	for (level = 0; level < info->levels; level++) {
606		r = remove_raw(&spine, info,
607			       (level == last_level ?
608				&info->value_type : &le64_vt),
609			       root, keys[level], (unsigned int *)&index);
610		if (r < 0)
611			break;
612
613		n = dm_block_data(shadow_current(&spine));
614		if (level != last_level) {
615			root = value64(n, index);
616			continue;
617		}
618
619		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
620
621		if (info->value_type.dec)
622			info->value_type.dec(info->value_type.context,
623					     value_ptr(n, index), 1);
624
625		delete_at(n, index);
626	}
627
628	if (!r)
629		*new_root = shadow_root(&spine);
630	exit_shadow_spine(&spine);
631
632	return r;
633}
634EXPORT_SYMBOL_GPL(dm_btree_remove);
635
636/*----------------------------------------------------------------*/
637
638static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
639			  struct dm_btree_value_type *vt, dm_block_t root,
640			  uint64_t key, int *index)
641{
642	int i = *index, r;
643	struct btree_node *n;
644
645	for (;;) {
646		r = shadow_step(s, root, vt);
647		if (r < 0)
648			break;
649
650		/*
651		 * We have to patch up the parent node, ugly, but I don't
652		 * see a way to do this automatically as part of the spine
653		 * op.
654		 */
655		if (shadow_has_parent(s)) {
656			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
657
658			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
659			       &location, sizeof(__le64));
660		}
661
662		n = dm_block_data(shadow_current(s));
663
664		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
665			*index = lower_bound(n, key);
666			return 0;
667		}
668
669		r = rebalance_children(s, info, vt, key);
670		if (r)
671			break;
672
673		n = dm_block_data(shadow_current(s));
674		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
675			*index = lower_bound(n, key);
676			return 0;
677		}
678
679		i = lower_bound(n, key);
680
681		/*
682		 * We know the key is present, or else
683		 * rebalance_children would have returned
684		 * -ENODATA
685		 */
686		root = value64(n, i);
687	}
688
689	return r;
690}
691
692static int remove_one(struct dm_btree_info *info, dm_block_t root,
693		      uint64_t *keys, uint64_t end_key,
694		      dm_block_t *new_root, unsigned int *nr_removed)
695{
696	unsigned int level, last_level = info->levels - 1;
697	int index = 0, r = 0;
698	struct shadow_spine spine;
699	struct btree_node *n;
700	struct dm_btree_value_type le64_vt;
701	uint64_t k;
702
703	init_le64_type(info->tm, &le64_vt);
704	init_shadow_spine(&spine, info);
705	for (level = 0; level < last_level; level++) {
706		r = remove_raw(&spine, info, &le64_vt,
707			       root, keys[level], (unsigned int *) &index);
708		if (r < 0)
709			goto out;
710
711		n = dm_block_data(shadow_current(&spine));
712		root = value64(n, index);
713	}
714
715	r = remove_nearest(&spine, info, &info->value_type,
716			   root, keys[last_level], &index);
717	if (r < 0)
718		goto out;
719
720	n = dm_block_data(shadow_current(&spine));
721
722	if (index < 0)
723		index = 0;
724
725	if (index >= le32_to_cpu(n->header.nr_entries)) {
726		r = -ENODATA;
727		goto out;
728	}
729
730	k = le64_to_cpu(n->keys[index]);
731	if (k >= keys[last_level] && k < end_key) {
732		if (info->value_type.dec)
733			info->value_type.dec(info->value_type.context,
734					     value_ptr(n, index), 1);
735
736		delete_at(n, index);
737		keys[last_level] = k + 1ull;
738
739	} else
740		r = -ENODATA;
741
742out:
743	*new_root = shadow_root(&spine);
744	exit_shadow_spine(&spine);
745
746	return r;
747}
748
749int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
750			   uint64_t *first_key, uint64_t end_key,
751			   dm_block_t *new_root, unsigned int *nr_removed)
752{
753	int r;
754
755	*nr_removed = 0;
756	do {
757		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
758		if (!r)
759			(*nr_removed)++;
760	} while (!r);
761
762	*new_root = root;
763	return r == -ENODATA ? 0 : r;
764}
765EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
v4.6
 
  1/*
  2 * Copyright (C) 2011 Red Hat, Inc.
  3 *
  4 * This file is released under the GPL.
  5 */
  6
  7#include "dm-btree.h"
  8#include "dm-btree-internal.h"
  9#include "dm-transaction-manager.h"
 10
 11#include <linux/export.h>
 
 
 
 12
 13/*
 14 * Removing an entry from a btree
 15 * ==============================
 16 *
 17 * A very important constraint for our btree is that no node, except the
 18 * root, may have fewer than a certain number of entries.
 19 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 20 *
 21 * Ensuring this is complicated by the way we want to only ever hold the
 22 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 23 * fashion.
 24 *
 25 * Each node may have a left or right sibling.  When decending the spine,
 26 * if a node contains only MIN_ENTRIES then we try and increase this to at
 27 * least MIN_ENTRIES + 1.  We do this in the following ways:
 28 *
 29 * [A] No siblings => this can only happen if the node is the root, in which
 30 *     case we copy the childs contents over the root.
 31 *
 32 * [B] No left sibling
 33 *     ==> rebalance(node, right sibling)
 34 *
 35 * [C] No right sibling
 36 *     ==> rebalance(left sibling, node)
 37 *
 38 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 39 *     ==> delete node adding it's contents to left and right
 40 *
 41 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 42 *     ==> rebalance(left, node, right)
 43 *
 44 * After these operations it's possible that the our original node no
 45 * longer contains the desired sub tree.  For this reason this rebalancing
 46 * is performed on the children of the current node.  This also avoids
 47 * having a special case for the root.
 48 *
 49 * Once this rebalancing has occurred we can then step into the child node
 50 * for internal nodes.  Or delete the entry for leaf nodes.
 51 */
 52
 53/*
 54 * Some little utilities for moving node data around.
 55 */
 56static void node_shift(struct btree_node *n, int shift)
 57{
 58	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
 59	uint32_t value_size = le32_to_cpu(n->header.value_size);
 60
 61	if (shift < 0) {
 62		shift = -shift;
 63		BUG_ON(shift > nr_entries);
 64		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
 65		memmove(key_ptr(n, 0),
 66			key_ptr(n, shift),
 67			(nr_entries - shift) * sizeof(__le64));
 68		memmove(value_ptr(n, 0),
 69			value_ptr(n, shift),
 70			(nr_entries - shift) * value_size);
 71	} else {
 72		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
 73		memmove(key_ptr(n, shift),
 74			key_ptr(n, 0),
 75			nr_entries * sizeof(__le64));
 76		memmove(value_ptr(n, shift),
 77			value_ptr(n, 0),
 78			nr_entries * value_size);
 79	}
 80}
 81
 82static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
 83{
 84	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
 85	uint32_t value_size = le32_to_cpu(left->header.value_size);
 86	BUG_ON(value_size != le32_to_cpu(right->header.value_size));
 
 
 
 
 87
 88	if (shift < 0) {
 89		shift = -shift;
 90		BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
 
 
 
 
 
 91		memcpy(key_ptr(left, nr_left),
 92		       key_ptr(right, 0),
 93		       shift * sizeof(__le64));
 94		memcpy(value_ptr(left, nr_left),
 95		       value_ptr(right, 0),
 96		       shift * value_size);
 97	} else {
 98		BUG_ON(shift > le32_to_cpu(right->header.max_entries));
 
 
 
 
 99		memcpy(key_ptr(right, 0),
100		       key_ptr(left, nr_left - shift),
101		       shift * sizeof(__le64));
102		memcpy(value_ptr(right, 0),
103		       value_ptr(left, nr_left - shift),
104		       shift * value_size);
105	}
 
106}
107
108/*
109 * Delete a specific entry from a leaf node.
110 */
111static void delete_at(struct btree_node *n, unsigned index)
112{
113	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
114	unsigned nr_to_copy = nr_entries - (index + 1);
115	uint32_t value_size = le32_to_cpu(n->header.value_size);
 
116	BUG_ON(index >= nr_entries);
117
118	if (nr_to_copy) {
119		memmove(key_ptr(n, index),
120			key_ptr(n, index + 1),
121			nr_to_copy * sizeof(__le64));
122
123		memmove(value_ptr(n, index),
124			value_ptr(n, index + 1),
125			nr_to_copy * value_size);
126	}
127
128	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129}
130
131static unsigned merge_threshold(struct btree_node *n)
132{
133	return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136struct child {
137	unsigned index;
138	struct dm_block *block;
139	struct btree_node *n;
140};
141
142static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
143		      struct btree_node *parent,
144		      unsigned index, struct child *result)
145{
146	int r, inc;
147	dm_block_t root;
148
149	result->index = index;
150	root = value64(parent, index);
151
152	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
153			       &result->block, &inc);
154	if (r)
155		return r;
156
157	result->n = dm_block_data(result->block);
158
159	if (inc)
160		inc_children(info->tm, result->n, vt);
161
162	*((__le64 *) value_ptr(parent, index)) =
163		cpu_to_le64(dm_block_location(result->block));
164
165	return 0;
166}
167
168static void exit_child(struct dm_btree_info *info, struct child *c)
169{
170	dm_tm_unlock(info->tm, c->block);
171}
172
173static void shift(struct btree_node *left, struct btree_node *right, int count)
174{
 
175	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
176	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
177	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
178	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
179
180	BUG_ON(max_entries != r_max_entries);
181	BUG_ON(nr_left - count > max_entries);
182	BUG_ON(nr_right + count > max_entries);
 
 
 
 
 
 
 
 
 
 
 
183
184	if (!count)
185		return;
186
187	if (count > 0) {
188		node_shift(right, count);
189		node_copy(left, right, count);
 
 
190	} else {
191		node_copy(left, right, count);
 
 
192		node_shift(right, count);
193	}
194
195	left->header.nr_entries = cpu_to_le32(nr_left - count);
196	right->header.nr_entries = cpu_to_le32(nr_right + count);
 
 
197}
198
199static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
200			 struct child *l, struct child *r)
201{
 
202	struct btree_node *left = l->n;
203	struct btree_node *right = r->n;
204	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
205	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
206	unsigned threshold = 2 * merge_threshold(left) + 1;
 
 
 
 
 
 
207
208	if (nr_left + nr_right < threshold) {
209		/*
210		 * Merge
211		 */
212		node_copy(left, right, -nr_right);
213		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
214		delete_at(parent, r->index);
215
216		/*
217		 * We need to decrement the right block, but not it's
218		 * children, since they're still referenced by left.
219		 */
220		dm_tm_dec(info->tm, dm_block_location(r->block));
221	} else {
222		/*
223		 * Rebalance.
224		 */
225		unsigned target_left = (nr_left + nr_right) / 2;
226		shift(left, right, nr_left - target_left);
 
 
 
227		*key_ptr(parent, r->index) = right->keys[0];
228	}
 
229}
230
231static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
232		      struct dm_btree_value_type *vt, unsigned left_index)
233{
234	int r;
235	struct btree_node *parent;
236	struct child left, right;
237
238	parent = dm_block_data(shadow_current(s));
239
240	r = init_child(info, vt, parent, left_index, &left);
241	if (r)
242		return r;
243
244	r = init_child(info, vt, parent, left_index + 1, &right);
245	if (r) {
246		exit_child(info, &left);
247		return r;
248	}
249
250	__rebalance2(info, parent, &left, &right);
251
252	exit_child(info, &left);
253	exit_child(info, &right);
254
255	return 0;
256}
257
258/*
259 * We dump as many entries from center as possible into left, then the rest
260 * in right, then rebalance2.  This wastes some cpu, but I want something
261 * simple atm.
262 */
263static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
264			       struct child *l, struct child *c, struct child *r,
265			       struct btree_node *left, struct btree_node *center, struct btree_node *right,
266			       uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
267{
268	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
269	unsigned shift = min(max_entries - nr_left, nr_center);
 
 
 
 
 
270
271	BUG_ON(nr_left + shift > max_entries);
272	node_copy(left, center, -shift);
273	left->header.nr_entries = cpu_to_le32(nr_left + shift);
274
275	if (shift != nr_center) {
276		shift = nr_center - shift;
277		BUG_ON((nr_right + shift) > max_entries);
 
 
 
 
 
278		node_shift(right, shift);
279		node_copy(center, right, shift);
280		right->header.nr_entries = cpu_to_le32(nr_right + shift);
281	}
282	*key_ptr(parent, r->index) = right->keys[0];
283
284	delete_at(parent, c->index);
285	r->index--;
286
287	dm_tm_dec(info->tm, dm_block_location(c->block));
288	__rebalance2(info, parent, l, r);
289}
290
291/*
292 * Redistributes entries among 3 sibling nodes.
293 */
294static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
295			  struct child *l, struct child *c, struct child *r,
296			  struct btree_node *left, struct btree_node *center, struct btree_node *right,
297			  uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
298{
299	int s;
300	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
301	unsigned total = nr_left + nr_center + nr_right;
302	unsigned target_right = total / 3;
303	unsigned remainder = (target_right * 3) != total;
304	unsigned target_left = target_right + remainder;
305
306	BUG_ON(target_left > max_entries);
307	BUG_ON(target_right > max_entries);
308
309	if (nr_left < nr_right) {
310		s = nr_left - target_left;
311
312		if (s < 0 && nr_center < -s) {
313			/* not enough in central node */
314			shift(left, center, -nr_center);
 
 
 
315			s += nr_center;
316			shift(left, right, s);
 
 
 
317			nr_right += s;
318		} else
319			shift(left, center, s);
320
321		shift(center, right, target_right - nr_right);
 
322
 
 
 
323	} else {
324		s = target_right - nr_right;
325		if (s > 0 && nr_center < s) {
326			/* not enough in central node */
327			shift(center, right, nr_center);
 
 
328			s -= nr_center;
329			shift(left, right, s);
 
 
330			nr_left -= s;
331		} else
332			shift(center, right, s);
 
 
 
333
334		shift(left, center, nr_left - target_left);
 
 
335	}
336
337	*key_ptr(parent, c->index) = center->keys[0];
338	*key_ptr(parent, r->index) = right->keys[0];
 
339}
340
341static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
342			 struct child *l, struct child *c, struct child *r)
343{
344	struct btree_node *left = l->n;
345	struct btree_node *center = c->n;
346	struct btree_node *right = r->n;
347
348	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
349	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
350	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
351
352	unsigned threshold = merge_threshold(left) * 4 + 1;
 
 
 
 
 
 
353
354	BUG_ON(left->header.max_entries != center->header.max_entries);
355	BUG_ON(center->header.max_entries != right->header.max_entries);
 
 
356
357	if ((nr_left + nr_center + nr_right) < threshold)
358		delete_center_node(info, parent, l, c, r, left, center, right,
359				   nr_left, nr_center, nr_right);
360	else
361		redistribute3(info, parent, l, c, r, left, center, right,
362			      nr_left, nr_center, nr_right);
363}
364
365static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
366		      struct dm_btree_value_type *vt, unsigned left_index)
367{
368	int r;
369	struct btree_node *parent = dm_block_data(shadow_current(s));
370	struct child left, center, right;
371
372	/*
373	 * FIXME: fill out an array?
374	 */
375	r = init_child(info, vt, parent, left_index, &left);
376	if (r)
377		return r;
378
379	r = init_child(info, vt, parent, left_index + 1, &center);
380	if (r) {
381		exit_child(info, &left);
382		return r;
383	}
384
385	r = init_child(info, vt, parent, left_index + 2, &right);
386	if (r) {
387		exit_child(info, &left);
388		exit_child(info, &center);
389		return r;
390	}
391
392	__rebalance3(info, parent, &left, &center, &right);
393
394	exit_child(info, &left);
395	exit_child(info, &center);
396	exit_child(info, &right);
397
398	return 0;
399}
400
401static int rebalance_children(struct shadow_spine *s,
402			      struct dm_btree_info *info,
403			      struct dm_btree_value_type *vt, uint64_t key)
404{
405	int i, r, has_left_sibling, has_right_sibling;
406	struct btree_node *n;
407
408	n = dm_block_data(shadow_current(s));
409
410	if (le32_to_cpu(n->header.nr_entries) == 1) {
411		struct dm_block *child;
412		dm_block_t b = value64(n, 0);
413
414		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
415		if (r)
416			return r;
417
418		memcpy(n, dm_block_data(child),
419		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
420		dm_tm_unlock(info->tm, child);
421
422		dm_tm_dec(info->tm, dm_block_location(child));
 
423		return 0;
424	}
425
426	i = lower_bound(n, key);
427	if (i < 0)
428		return -ENODATA;
429
430	has_left_sibling = i > 0;
431	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
432
433	if (!has_left_sibling)
434		r = rebalance2(s, info, vt, i);
435
436	else if (!has_right_sibling)
437		r = rebalance2(s, info, vt, i - 1);
438
439	else
440		r = rebalance3(s, info, vt, i - 1);
441
442	return r;
443}
444
445static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
446{
447	int i = lower_bound(n, key);
448
449	if ((i < 0) ||
450	    (i >= le32_to_cpu(n->header.nr_entries)) ||
451	    (le64_to_cpu(n->keys[i]) != key))
452		return -ENODATA;
453
454	*index = i;
455
456	return 0;
457}
458
459/*
460 * Prepares for removal from one level of the hierarchy.  The caller must
461 * call delete_at() to remove the entry at index.
462 */
463static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
464		      struct dm_btree_value_type *vt, dm_block_t root,
465		      uint64_t key, unsigned *index)
466{
467	int i = *index, r;
468	struct btree_node *n;
469
470	for (;;) {
471		r = shadow_step(s, root, vt);
472		if (r < 0)
473			break;
474
475		/*
476		 * We have to patch up the parent node, ugly, but I don't
477		 * see a way to do this automatically as part of the spine
478		 * op.
479		 */
480		if (shadow_has_parent(s)) {
481			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 
482			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
483			       &location, sizeof(__le64));
484		}
485
486		n = dm_block_data(shadow_current(s));
487
488		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
489			return do_leaf(n, key, index);
490
491		r = rebalance_children(s, info, vt, key);
492		if (r)
493			break;
494
495		n = dm_block_data(shadow_current(s));
496		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
497			return do_leaf(n, key, index);
498
499		i = lower_bound(n, key);
500
501		/*
502		 * We know the key is present, or else
503		 * rebalance_children would have returned
504		 * -ENODATA
505		 */
506		root = value64(n, i);
507	}
508
509	return r;
510}
511
512int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
513		    uint64_t *keys, dm_block_t *new_root)
514{
515	unsigned level, last_level = info->levels - 1;
516	int index = 0, r = 0;
517	struct shadow_spine spine;
518	struct btree_node *n;
519	struct dm_btree_value_type le64_vt;
520
521	init_le64_type(info->tm, &le64_vt);
522	init_shadow_spine(&spine, info);
523	for (level = 0; level < info->levels; level++) {
524		r = remove_raw(&spine, info,
525			       (level == last_level ?
526				&info->value_type : &le64_vt),
527			       root, keys[level], (unsigned *)&index);
528		if (r < 0)
529			break;
530
531		n = dm_block_data(shadow_current(&spine));
532		if (level != last_level) {
533			root = value64(n, index);
534			continue;
535		}
536
537		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
538
539		if (info->value_type.dec)
540			info->value_type.dec(info->value_type.context,
541					     value_ptr(n, index));
542
543		delete_at(n, index);
544	}
545
546	*new_root = shadow_root(&spine);
 
547	exit_shadow_spine(&spine);
548
549	return r;
550}
551EXPORT_SYMBOL_GPL(dm_btree_remove);
552
553/*----------------------------------------------------------------*/
554
555static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
556			  struct dm_btree_value_type *vt, dm_block_t root,
557			  uint64_t key, int *index)
558{
559	int i = *index, r;
560	struct btree_node *n;
561
562	for (;;) {
563		r = shadow_step(s, root, vt);
564		if (r < 0)
565			break;
566
567		/*
568		 * We have to patch up the parent node, ugly, but I don't
569		 * see a way to do this automatically as part of the spine
570		 * op.
571		 */
572		if (shadow_has_parent(s)) {
573			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
 
574			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
575			       &location, sizeof(__le64));
576		}
577
578		n = dm_block_data(shadow_current(s));
579
580		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
581			*index = lower_bound(n, key);
582			return 0;
583		}
584
585		r = rebalance_children(s, info, vt, key);
586		if (r)
587			break;
588
589		n = dm_block_data(shadow_current(s));
590		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
591			*index = lower_bound(n, key);
592			return 0;
593		}
594
595		i = lower_bound(n, key);
596
597		/*
598		 * We know the key is present, or else
599		 * rebalance_children would have returned
600		 * -ENODATA
601		 */
602		root = value64(n, i);
603	}
604
605	return r;
606}
607
608static int remove_one(struct dm_btree_info *info, dm_block_t root,
609		      uint64_t *keys, uint64_t end_key,
610		      dm_block_t *new_root, unsigned *nr_removed)
611{
612	unsigned level, last_level = info->levels - 1;
613	int index = 0, r = 0;
614	struct shadow_spine spine;
615	struct btree_node *n;
616	struct dm_btree_value_type le64_vt;
617	uint64_t k;
618
619	init_le64_type(info->tm, &le64_vt);
620	init_shadow_spine(&spine, info);
621	for (level = 0; level < last_level; level++) {
622		r = remove_raw(&spine, info, &le64_vt,
623			       root, keys[level], (unsigned *) &index);
624		if (r < 0)
625			goto out;
626
627		n = dm_block_data(shadow_current(&spine));
628		root = value64(n, index);
629	}
630
631	r = remove_nearest(&spine, info, &info->value_type,
632			   root, keys[last_level], &index);
633	if (r < 0)
634		goto out;
635
636	n = dm_block_data(shadow_current(&spine));
637
638	if (index < 0)
639		index = 0;
640
641	if (index >= le32_to_cpu(n->header.nr_entries)) {
642		r = -ENODATA;
643		goto out;
644	}
645
646	k = le64_to_cpu(n->keys[index]);
647	if (k >= keys[last_level] && k < end_key) {
648		if (info->value_type.dec)
649			info->value_type.dec(info->value_type.context,
650					     value_ptr(n, index));
651
652		delete_at(n, index);
653		keys[last_level] = k + 1ull;
654
655	} else
656		r = -ENODATA;
657
658out:
659	*new_root = shadow_root(&spine);
660	exit_shadow_spine(&spine);
661
662	return r;
663}
664
665int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
666			   uint64_t *first_key, uint64_t end_key,
667			   dm_block_t *new_root, unsigned *nr_removed)
668{
669	int r;
670
671	*nr_removed = 0;
672	do {
673		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
674		if (!r)
675			(*nr_removed)++;
676	} while (!r);
677
678	*new_root = root;
679	return r == -ENODATA ? 0 : r;
680}
681EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);