/* (web-scrape navigation chrome removed — not part of the source file; kernel v6.9.4) */
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Copyright (C) 2021-2023 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <djwong@kernel.org>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
 
  10#include "scrub/xfile.h"
  11#include "scrub/xfarray.h"
  12#include "scrub/scrub.h"
  13#include "scrub/trace.h"
  14
  15/*
  16 * Large Arrays of Fixed-Size Records
  17 * ==================================
  18 *
  19 * This memory array uses an xfile (which itself is a shmem file) to store
  20 * large numbers of fixed-size records in memory that can be paged out.  This
  21 * puts less stress on the memory reclaim algorithms during an online repair
  22 * because we don't have to pin so much memory.  However, array access is less
  23 * direct than would be in a regular memory array.  Access to the array is
  24 * performed via indexed load and store methods, and an append method is
  25 * provided for convenience.  Array elements can be unset, which sets them to
  26 * all zeroes.  Unset entries are skipped during iteration, though direct loads
  27 * will return a zeroed buffer.  Callers are responsible for concurrency
  28 * control.
  29 */
  30
  31/*
  32 * Pointer to scratch space.  Because we can't access the xfile data directly,
  33 * we allocate a small amount of memory on the end of the xfarray structure to
  34 * buffer array items when we need space to store values temporarily.
  35 */
  36static inline void *xfarray_scratch(struct xfarray *array)
  37{
  38	return (array + 1);
  39}
  40
  41/* Compute array index given an xfile offset. */
  42static xfarray_idx_t
  43xfarray_idx(
  44	struct xfarray	*array,
  45	loff_t		pos)
  46{
  47	if (array->obj_size_log >= 0)
  48		return (xfarray_idx_t)pos >> array->obj_size_log;
  49
  50	return div_u64((xfarray_idx_t)pos, array->obj_size);
  51}
  52
  53/* Compute xfile offset of array element. */
  54static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx)
  55{
  56	if (array->obj_size_log >= 0)
  57		return idx << array->obj_size_log;
  58
  59	return idx * array->obj_size;
  60}
  61
  62/*
  63 * Initialize a big memory array.  Array records cannot be larger than a
  64 * page, and the array cannot span more bytes than the page cache supports.
  65 * If @required_capacity is nonzero, the maximum array size will be set to this
  66 * quantity and the array creation will fail if the underlying storage cannot
  67 * support that many records.
  68 */
  69int
  70xfarray_create(
  71	const char		*description,
  72	unsigned long long	required_capacity,
  73	size_t			obj_size,
  74	struct xfarray		**arrayp)
  75{
  76	struct xfarray		*array;
  77	struct xfile		*xfile;
  78	int			error;
  79
  80	ASSERT(obj_size < PAGE_SIZE);
  81
  82	error = xfile_create(description, 0, &xfile);
  83	if (error)
  84		return error;
  85
  86	error = -ENOMEM;
  87	array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS);
  88	if (!array)
  89		goto out_xfile;
  90
  91	array->xfile = xfile;
  92	array->obj_size = obj_size;
  93
  94	if (is_power_of_2(obj_size))
  95		array->obj_size_log = ilog2(obj_size);
  96	else
  97		array->obj_size_log = -1;
  98
  99	array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE);
 100	trace_xfarray_create(array, required_capacity);
 101
 102	if (required_capacity > 0) {
 103		if (array->max_nr < required_capacity) {
 104			error = -ENOMEM;
 105			goto out_xfarray;
 106		}
 107		array->max_nr = required_capacity;
 108	}
 109
 110	*arrayp = array;
 111	return 0;
 112
 113out_xfarray:
 114	kfree(array);
 115out_xfile:
 116	xfile_destroy(xfile);
 117	return error;
 118}
 119
 120/* Destroy the array. */
 121void
 122xfarray_destroy(
 123	struct xfarray	*array)
 124{
 125	xfile_destroy(array->xfile);
 126	kfree(array);
 127}
 128
 129/* Load an element from the array. */
 130int
 131xfarray_load(
 132	struct xfarray	*array,
 133	xfarray_idx_t	idx,
 134	void		*ptr)
 135{
 136	if (idx >= array->nr)
 137		return -ENODATA;
 138
 139	return xfile_load(array->xfile, ptr, array->obj_size,
 140			xfarray_pos(array, idx));
 141}
 142
 143/* Is this array element potentially unset? */
 144static inline bool
 145xfarray_is_unset(
 146	struct xfarray	*array,
 147	loff_t		pos)
 148{
 149	void		*temp = xfarray_scratch(array);
 150	int		error;
 151
 152	if (array->unset_slots == 0)
 153		return false;
 154
 155	error = xfile_load(array->xfile, temp, array->obj_size, pos);
 156	if (!error && xfarray_element_is_null(array, temp))
 157		return true;
 158
 159	return false;
 160}
 161
 162/*
 163 * Unset an array element.  If @idx is the last element in the array, the
 164 * array will be truncated.  Otherwise, the entry will be zeroed.
 165 */
 166int
 167xfarray_unset(
 168	struct xfarray	*array,
 169	xfarray_idx_t	idx)
 170{
 171	void		*temp = xfarray_scratch(array);
 172	loff_t		pos = xfarray_pos(array, idx);
 173	int		error;
 174
 175	if (idx >= array->nr)
 176		return -ENODATA;
 177
 178	if (idx == array->nr - 1) {
 179		array->nr--;
 180		return 0;
 181	}
 182
 183	if (xfarray_is_unset(array, pos))
 184		return 0;
 185
 186	memset(temp, 0, array->obj_size);
 187	error = xfile_store(array->xfile, temp, array->obj_size, pos);
 188	if (error)
 189		return error;
 190
 191	array->unset_slots++;
 192	return 0;
 193}
 194
 195/*
 196 * Store an element in the array.  The element must not be completely zeroed,
 197 * because those are considered unset sparse elements.
 198 */
 199int
 200xfarray_store(
 201	struct xfarray	*array,
 202	xfarray_idx_t	idx,
 203	const void	*ptr)
 204{
 205	int		ret;
 206
 207	if (idx >= array->max_nr)
 208		return -EFBIG;
 209
 210	ASSERT(!xfarray_element_is_null(array, ptr));
 211
 212	ret = xfile_store(array->xfile, ptr, array->obj_size,
 213			xfarray_pos(array, idx));
 214	if (ret)
 215		return ret;
 216
 217	array->nr = max(array->nr, idx + 1);
 218	return 0;
 219}
 220
 221/* Is this array element NULL? */
 222bool
 223xfarray_element_is_null(
 224	struct xfarray	*array,
 225	const void	*ptr)
 226{
 227	return !memchr_inv(ptr, 0, array->obj_size);
 228}
 229
 230/*
 231 * Store an element anywhere in the array that is unset.  If there are no
 232 * unset slots, append the element to the array.
 233 */
 234int
 235xfarray_store_anywhere(
 236	struct xfarray	*array,
 237	const void	*ptr)
 238{
 239	void		*temp = xfarray_scratch(array);
 240	loff_t		endpos = xfarray_pos(array, array->nr);
 241	loff_t		pos;
 242	int		error;
 243
 244	/* Find an unset slot to put it in. */
 245	for (pos = 0;
 246	     pos < endpos && array->unset_slots > 0;
 247	     pos += array->obj_size) {
 248		error = xfile_load(array->xfile, temp, array->obj_size,
 249				pos);
 250		if (error || !xfarray_element_is_null(array, temp))
 251			continue;
 252
 253		error = xfile_store(array->xfile, ptr, array->obj_size,
 254				pos);
 255		if (error)
 256			return error;
 257
 258		array->unset_slots--;
 259		return 0;
 260	}
 261
 262	/* No unset slots found; attach it on the end. */
 263	array->unset_slots = 0;
 264	return xfarray_append(array, ptr);
 265}
 266
 267/* Return length of array. */
 268uint64_t
 269xfarray_length(
 270	struct xfarray	*array)
 271{
 272	return array->nr;
 273}
 274
 275/*
 276 * Decide which array item we're going to read as part of an _iter_get.
 277 * @cur is the array index, and @pos is the file offset of that array index in
 278 * the backing xfile.  Returns ENODATA if we reach the end of the records.
 279 *
 280 * Reading from a hole in a sparse xfile causes page instantiation, so for
 281 * iterating a (possibly sparse) array we need to figure out if the cursor is
 282 * pointing at a totally uninitialized hole and move the cursor up if
 283 * necessary.
 284 */
static inline int
xfarray_find_data(
	struct xfarray	*array,
	xfarray_idx_t	*cur,
	loff_t		*pos)
{
	unsigned int	pgoff = offset_in_page(*pos);
	loff_t		end_pos = *pos + array->obj_size - 1;
	loff_t		new_pos;

	/*
	 * If the current array record is not adjacent to a page boundary, we
	 * are in the middle of the page.  We do not need to move the cursor.
	 */
	if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
		return 0;

	/*
	 * Call SEEK_DATA on the last byte in the record we're about to read.
	 * If the record ends at (or crosses) the end of a page then we know
	 * that the first byte of the record is backed by pages and don't need
	 * to query it.  If instead the record begins at the start of the page
	 * then we know that querying the last byte is just as good as querying
	 * the first byte, since records cannot be larger than a page.
	 *
	 * If the call returns the same file offset, we know this record is
	 * backed by real pages.  We do not need to move the cursor.
	 */
	new_pos = xfile_seek_data(array->xfile, end_pos);
	/* -ENXIO from SEEK_DATA means no data past here: iteration is done. */
	if (new_pos == -ENXIO)
		return -ENODATA;
	if (new_pos < 0)
		return new_pos;
	if (new_pos == end_pos)
		return 0;

	/*
	 * Otherwise, SEEK_DATA told us how far up to move the file pointer to
	 * find more data.  Move the array index to the first record past the
	 * byte offset we were given.
	 */
	new_pos = roundup_64(new_pos, array->obj_size);
	*cur = xfarray_idx(array, new_pos);
	*pos = xfarray_pos(array, *cur);
	return 0;
}
 331
 332/*
 333 * Starting at *idx, fetch the next non-null array entry and advance the index
 334 * to set up the next _load_next call.  Returns ENODATA if we reach the end of
 335 * the array.  Callers must set @*idx to XFARRAY_CURSOR_INIT before the first
 336 * call to this function.
 337 */
 338int
 339xfarray_load_next(
 340	struct xfarray	*array,
 341	xfarray_idx_t	*idx,
 342	void		*rec)
 343{
 344	xfarray_idx_t	cur = *idx;
 345	loff_t		pos = xfarray_pos(array, cur);
 346	int		error;
 347
 348	do {
 349		if (cur >= array->nr)
 350			return -ENODATA;
 351
 352		/*
 353		 * Ask the backing store for the location of next possible
 354		 * written record, then retrieve that record.
 355		 */
 356		error = xfarray_find_data(array, &cur, &pos);
 357		if (error)
 358			return error;
 359		error = xfarray_load(array, cur, rec);
 360		if (error)
 361			return error;
 362
 363		cur++;
 364		pos += array->obj_size;
 365	} while (xfarray_element_is_null(array, rec));
 366
 367	*idx = cur;
 368	return 0;
 369}
 370
/* Sorting functions */

#ifdef DEBUG
/*
 * Statistics counters for the sort machinery.  These compile away entirely
 * on non-DEBUG builds so the hot paths pay nothing for them.
 */
# define xfarray_sort_bump_loads(si)	do { (si)->loads++; } while (0)
# define xfarray_sort_bump_stores(si)	do { (si)->stores++; } while (0)
# define xfarray_sort_bump_compares(si)	do { (si)->compares++; } while (0)
# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0)
#else
# define xfarray_sort_bump_loads(si)
# define xfarray_sort_bump_stores(si)
# define xfarray_sort_bump_compares(si)
# define xfarray_sort_bump_heapsorts(si)
#endif /* DEBUG */
 384
 385/* Load an array element for sorting. */
 386static inline int
 387xfarray_sort_load(
 388	struct xfarray_sortinfo	*si,
 389	xfarray_idx_t		idx,
 390	void			*ptr)
 391{
 392	xfarray_sort_bump_loads(si);
 393	return xfarray_load(si->array, idx, ptr);
 394}
 395
 396/* Store an array element for sorting. */
 397static inline int
 398xfarray_sort_store(
 399	struct xfarray_sortinfo	*si,
 400	xfarray_idx_t		idx,
 401	void			*ptr)
 402{
 403	xfarray_sort_bump_stores(si);
 404	return xfarray_store(si->array, idx, ptr);
 405}
 406
 407/* Compare an array element for sorting. */
 408static inline int
 409xfarray_sort_cmp(
 410	struct xfarray_sortinfo	*si,
 411	const void		*a,
 412	const void		*b)
 413{
 414	xfarray_sort_bump_compares(si);
 415	return si->cmp_fn(a, b);
 416}
 417
 418/* Return a pointer to the low index stack for quicksort partitioning. */
 419static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
 420{
 421	return (xfarray_idx_t *)(si + 1);
 422}
 423
 424/* Return a pointer to the high index stack for quicksort partitioning. */
 425static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
 426{
 427	return xfarray_sortinfo_lo(si) + si->max_stack_depth;
 428}
 429
 430/* Size of each element in the quicksort pivot array. */
 431static inline size_t
 432xfarray_pivot_rec_sz(
 433	struct xfarray		*array)
 434{
 435	return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t);
 436}
 437
 438/* Allocate memory to handle the sort. */
 439static inline int
 440xfarray_sortinfo_alloc(
 441	struct xfarray		*array,
 442	xfarray_cmp_fn		cmp_fn,
 443	unsigned int		flags,
 444	struct xfarray_sortinfo	**infop)
 445{
 446	struct xfarray_sortinfo	*si;
 447	size_t			nr_bytes = sizeof(struct xfarray_sortinfo);
 448	size_t			pivot_rec_sz = xfarray_pivot_rec_sz(array);
 449	int			max_stack_depth;
 450
 451	/*
 452	 * The median-of-nine pivot algorithm doesn't work if a subset has
 453	 * fewer than 9 items.  Make sure the in-memory sort will always take
 454	 * over for subsets where this wouldn't be the case.
 455	 */
 456	BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR);
 457
 458	/*
 459	 * Tail-call recursion during the partitioning phase means that
 460	 * quicksort will never recurse more than log2(nr) times.  We need one
 461	 * extra level of stack to hold the initial parameters.  In-memory
 462	 * sort will always take care of the last few levels of recursion for
 463	 * us, so we can reduce the stack depth by that much.
 464	 */
 465	max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1);
 466	if (max_stack_depth < 1)
 467		max_stack_depth = 1;
 468
 469	/* Each level of quicksort uses a lo and a hi index */
 470	nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;
 471
 472	/* Scratchpad for in-memory sort, or finding the pivot */
 473	nr_bytes += max_t(size_t,
 474			(XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz,
 475			XFARRAY_ISORT_NR * array->obj_size);
 476
 477	si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
 478	if (!si)
 479		return -ENOMEM;
 480
 481	si->array = array;
 482	si->cmp_fn = cmp_fn;
 483	si->flags = flags;
 484	si->max_stack_depth = max_stack_depth;
 485	si->max_stack_used = 1;
 486
 487	xfarray_sortinfo_lo(si)[0] = 0;
 488	xfarray_sortinfo_hi(si)[0] = array->nr - 1;
 
 
 
 489
 490	trace_xfarray_sort(si, nr_bytes);
 491	*infop = si;
 492	return 0;
 493}
 494
 495/* Should this sort be terminated by a fatal signal? */
 496static inline bool
 497xfarray_sort_terminated(
 498	struct xfarray_sortinfo	*si,
 499	int			*error)
 500{
 501	/*
 502	 * If preemption is disabled, we need to yield to the scheduler every
 503	 * few seconds so that we don't run afoul of the soft lockup watchdog
 504	 * or RCU stall detector.
 505	 */
 506	cond_resched();
 507
 508	if ((si->flags & XFARRAY_SORT_KILLABLE) &&
 509	    fatal_signal_pending(current)) {
 510		if (*error == 0)
 511			*error = -EINTR;
 512		return true;
 513	}
 514	return false;
 515}
 516
 517/* Do we want an in-memory sort? */
 518static inline bool
 519xfarray_want_isort(
 520	struct xfarray_sortinfo *si,
 521	xfarray_idx_t		start,
 522	xfarray_idx_t		end)
 523{
 524	/*
 525	 * For array subsets that fit in the scratchpad, it's much faster to
 526	 * use the kernel's heapsort than quicksort's stack machine.
 527	 */
 528	return (end - start) < XFARRAY_ISORT_NR;
 529}
 530
 531/* Return the scratch space within the sortinfo structure. */
 532static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
 533{
 534	return xfarray_sortinfo_hi(si) + si->max_stack_depth;
 535}
 536
 537/*
 538 * Sort a small number of array records using scratchpad memory.  The records
 539 * need not be contiguous in the xfile's memory pages.
 540 */
 541STATIC int
 542xfarray_isort(
 543	struct xfarray_sortinfo	*si,
 544	xfarray_idx_t		lo,
 545	xfarray_idx_t		hi)
 546{
 547	void			*scratch = xfarray_sortinfo_isort_scratch(si);
 548	loff_t			lo_pos = xfarray_pos(si->array, lo);
 549	loff_t			len = xfarray_pos(si->array, hi - lo + 1);
 550	int			error;
 551
 552	trace_xfarray_isort(si, lo, hi);
 553
 554	xfarray_sort_bump_loads(si);
 555	error = xfile_load(si->array->xfile, scratch, len, lo_pos);
 556	if (error)
 557		return error;
 558
 559	xfarray_sort_bump_heapsorts(si);
 560	sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
 561
 562	xfarray_sort_bump_stores(si);
 563	return xfile_store(si->array->xfile, scratch, len, lo_pos);
 564}
 565
 566/*
 567 * Sort the records from lo to hi (inclusive) if they are all backed by the
 568 * same memory folio.  Returns 1 if it sorted, 0 if it did not, or a negative
 569 * errno.
 570 */
 571STATIC int
 572xfarray_foliosort(
 573	struct xfarray_sortinfo	*si,
 574	xfarray_idx_t		lo,
 575	xfarray_idx_t		hi)
 576{
 577	struct folio		*folio;
 578	void			*startp;
 579	loff_t			lo_pos = xfarray_pos(si->array, lo);
 580	uint64_t		len = xfarray_pos(si->array, hi - lo + 1);
 581
 582	/* No single folio could back this many records. */
 583	if (len > XFILE_MAX_FOLIO_SIZE)
 584		return 0;
 585
 586	xfarray_sort_bump_loads(si);
 587	folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
 588	if (IS_ERR(folio))
 589		return PTR_ERR(folio);
 590	if (!folio)
 591		return 0;
 592
 593	trace_xfarray_foliosort(si, lo, hi);
 594
 595	xfarray_sort_bump_heapsorts(si);
 596	startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
 597	sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
 598
 599	xfarray_sort_bump_stores(si);
 600	xfile_put_folio(si->array->xfile, folio);
 601	return 1;
 602}
 603
 604/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
 605static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
 606{
 607	return xfarray_sortinfo_hi(si) + si->max_stack_depth;
 608}
 609
 610/* Return a pointer to the start of the pivot array. */
 611static inline void *
 612xfarray_sortinfo_pivot_array(
 613	struct xfarray_sortinfo	*si)
 614{
 615	return xfarray_sortinfo_pivot(si) + si->array->obj_size;
 616}
 617
 618/* The xfarray record is stored at the start of each pivot array element. */
 619static inline void *
 620xfarray_pivot_array_rec(
 621	void			*pa,
 622	size_t			pa_recsz,
 623	unsigned int		pa_idx)
 624{
 625	return pa + (pa_recsz * pa_idx);
 626}
 627
 628/* The xfarray index is stored at the end of each pivot array element. */
 629static inline xfarray_idx_t *
 630xfarray_pivot_array_idx(
 631	void			*pa,
 632	size_t			pa_recsz,
 633	unsigned int		pa_idx)
 634{
 635	return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) -
 636			sizeof(xfarray_idx_t);
 637}
 638
 639/*
 640 * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
 641 * the cached pivot record for the next step.
 642 *
 643 * Load evenly-spaced records within the given range into memory, sort them,
 644 * and choose the pivot from the median record.  Using multiple points will
 645 * improve the quality of the pivot selection, and hopefully avoid the worst
 646 * quicksort behavior, since our array values are nearly always evenly sorted.
 647 */
STATIC int
xfarray_qsort_pivot(
	struct xfarray_sortinfo	*si,
	xfarray_idx_t		lo,
	xfarray_idx_t		hi)
{
	void			*pivot = xfarray_sortinfo_pivot(si);
	void			*parray = xfarray_sortinfo_pivot_array(si);
	void			*recp;
	xfarray_idx_t		*idxp;
	xfarray_idx_t		step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1);
	size_t			pivot_rec_sz = xfarray_pivot_rec_sz(si->array);
	int			i, j;
	int			error;

	/*
	 * The sort driver only calls this for ranges too big for the
	 * in-memory sort, so the sample stride is always at least 1.
	 */
	ASSERT(step > 0);

	/*
	 * Load the xfarray indexes of the records we intend to sample into the
	 * pivot array.
	 */
	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0);
	*idxp = lo;
	for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) {
		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
		*idxp = lo + (i * step);
	}
	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
			XFARRAY_QSORT_PIVOT_NR - 1);
	*idxp = hi;

	/* Load the selected xfarray records into the pivot array. */
	for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) {
		xfarray_idx_t	idx;

		recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i);
		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);

		/* No unset records; load directly into the array. */
		if (likely(si->array->unset_slots == 0)) {
			error = xfarray_sort_load(si, *idxp, recp);
			if (error)
				return error;
			continue;
		}

		/*
		 * Load non-null records into the scratchpad without changing
		 * the xfarray_idx_t in the pivot array.
		 */
		idx = *idxp;
		xfarray_sort_bump_loads(si);
		error = xfarray_load_next(si->array, &idx, recp);
		if (error)
			return error;
	}

	xfarray_sort_bump_heapsorts(si);
	sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL);

	/*
	 * We sorted the pivot array records (which includes the xfarray
	 * indices) in xfarray record order.  The median element of the pivot
	 * array contains the xfarray record that we will use as the pivot.
	 * Copy that xfarray record to the designated space.
	 */
	recp = xfarray_pivot_array_rec(parray, pivot_rec_sz,
			XFARRAY_QSORT_PIVOT_NR / 2);
	memcpy(pivot, recp, si->array->obj_size);

	/* If the pivot record we chose was already in a[lo] then we're done. */
	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
			XFARRAY_QSORT_PIVOT_NR / 2);
	if (*idxp == lo)
		return 0;

	/*
	 * Find the cached copy of a[lo] in the pivot array so that we can swap
	 * a[lo] and a[pivot].
	 */
	for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) {
		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
		if (*idxp == lo)
			j = i;
	}
	/* a[lo] was sampled unconditionally, so it must be somewhere here. */
	if (j < 0) {
		ASSERT(j >= 0);
		return -EFSCORRUPTED;
	}

	/* Swap a[lo] and a[pivot]. */
	error = xfarray_sort_store(si, lo, pivot);
	if (error)
		return error;

	recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j);
	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
			XFARRAY_QSORT_PIVOT_NR / 2);
	return xfarray_sort_store(si, *idxp, recp);
}
 748
 749/*
 750 * Set up the pointers for the next iteration.  We push onto the stack all of
 751 * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the
 752 * current stack frame to point to the unsorted values between a[beg[i]] and
 753 * a[lo] so that those values will be sorted when we pop the stack.
 754 */
static inline int
xfarray_qsort_push(
	struct xfarray_sortinfo	*si,
	xfarray_idx_t		*si_lo,
	xfarray_idx_t		*si_hi,
	xfarray_idx_t		lo,
	xfarray_idx_t		hi)
{
	/* Check for stack overflows */
	if (si->stack_depth >= si->max_stack_depth - 1) {
		ASSERT(si->stack_depth < si->max_stack_depth - 1);
		return -EFSCORRUPTED;
	}

	/* Record the stack high-water mark for the debug statistics. */
	si->max_stack_used = max_t(uint8_t, si->max_stack_used,
					    si->stack_depth + 2);

	/*
	 * Push the right partition a[lo+1..hi] as a new frame and narrow the
	 * current frame to the left partition a[beg..lo-1].
	 */
	si_lo[si->stack_depth + 1] = lo + 1;
	si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
	si_hi[si->stack_depth++] = lo - 1;

	/*
	 * Always start with the smaller of the two partitions to keep the
	 * amount of recursion in check.
	 */
	if (si_hi[si->stack_depth]     - si_lo[si->stack_depth] >
	    si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
		swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
		swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
	}

	return 0;
}
 788
 789static inline void
 790xfarray_sort_scan_done(
 791	struct xfarray_sortinfo	*si)
 792{
 793	if (si->folio)
 794		xfile_put_folio(si->array->xfile, si->folio);
 795	si->folio = NULL;
 796}
 797
 798/*
 799 * Cache the folio backing the start of the given array element.  If the array
 800 * element is contained entirely within the folio, return a pointer to the
 801 * cached folio.  Otherwise, load the element into the scratchpad and return a
 802 * pointer to the scratchpad.
 803 */
static inline int
xfarray_sort_scan(
	struct xfarray_sortinfo	*si,
	xfarray_idx_t		idx,
	void			**ptrp)
{
	loff_t			idx_pos = xfarray_pos(si->array, idx);
	int			error = 0;

	if (xfarray_sort_terminated(si, &error))
		return error;

	trace_xfarray_sort_scan(si, idx);

	/* If the cached folio doesn't cover this index, release it. */
	if (si->folio &&
	    (idx < si->first_folio_idx || idx > si->last_folio_idx))
		xfarray_sort_scan_done(si);

	/* Grab the first folio that backs this array element. */
	if (!si->folio) {
		loff_t		next_pos;

		si->folio = xfile_get_folio(si->array->xfile, idx_pos,
				si->array->obj_size, XFILE_ALLOC);
		if (IS_ERR(si->folio))
			return PTR_ERR(si->folio);

		/*
		 * First index fully inside the folio: compute it from the
		 * last byte of the record starting at the folio head.
		 */
		si->first_folio_idx = xfarray_idx(si->array,
				folio_pos(si->folio) + si->array->obj_size - 1);

		/*
		 * Last index whose record fits entirely within the folio;
		 * back off by one if the record straddles the folio end.
		 */
		next_pos = folio_pos(si->folio) + folio_size(si->folio);
		si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
		if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
			si->last_folio_idx--;

		trace_xfarray_sort_scan(si, idx);
	}

	/*
	 * If this folio still doesn't cover the desired element, it must cross
	 * a folio boundary.  Read into the scratchpad and we're done.
	 */
	if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
		void		*temp = xfarray_scratch(si->array);

		error = xfile_load(si->array->xfile, temp, si->array->obj_size,
				idx_pos);
		if (error)
			return error;

		*ptrp = temp;
		return 0;
	}

	/* Otherwise return a pointer to the array element in the folio. */
	*ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
	return 0;
}
 863
 864/*
 865 * Sort the array elements via quicksort.  This implementation incorporates
 866 * four optimizations discussed in Sedgewick:
 867 *
 868 * 1. Use an explicit stack of array indices to store the next array partition
 869 *    to sort.  This helps us to avoid recursion in the call stack, which is
 870 *    particularly expensive in the kernel.
 871 *
 872 * 2. For arrays with records in arbitrary or user-controlled order, choose the
 873 *    pivot element using a median-of-nine decision tree.  This reduces the
 874 *    probability of selecting a bad pivot value which causes worst case
 875 *    behavior (i.e. partition sizes of 1).
 876 *
 877 * 3. The smaller of the two sub-partitions is pushed onto the stack to start
 878 *    the next level of recursion, and the larger sub-partition replaces the
 879 *    current stack frame.  This guarantees that we won't need more than
 880 *    log2(nr) stack space.
 881 *
 882 * 4. For small sets, load the records into the scratchpad and run heapsort on
 883 *    them because that is very fast.  In the author's experience, this yields
 884 *    a ~10% reduction in runtime.
 885 *
 886 *    If a small set is contained entirely within a single xfile memory page,
 887 *    map the page directly and run heap sort directly on the xfile page
 888 *    instead of using the load/store interface.  This halves the runtime.
 889 *
 890 * 5. This optimization is specific to the implementation.  When converging lo
 891 *    and hi after selecting a pivot, we will try to retain the xfile memory
 892 *    page between load calls, which reduces run time by 50%.
 893 */
 894
/*
 * Due to the use of signed indices, we can only support up to 2^63 records.
 * Files can only grow to 2^63 bytes, so this is not much of a limitation.
 */
#define QSORT_MAX_RECS		(1ULL << 63)

/*
 * Sort every record in the array with the iterative quicksort described in
 * the optimization notes above.  Returns 0 or a negative errno; -EINTR if a
 * fatal signal arrives and XFARRAY_SORT_KILLABLE was set.
 */
int
xfarray_sort(
	struct xfarray		*array,
	xfarray_cmp_fn		cmp_fn,
	unsigned int		flags)
{
	struct xfarray_sortinfo	*si;
	xfarray_idx_t		*si_lo, *si_hi;
	void			*pivot;
	void			*scratch = xfarray_scratch(array);
	xfarray_idx_t		lo, hi;
	int			error = 0;

	/* Zero or one records are trivially sorted. */
	if (array->nr < 2)
		return 0;
	if (array->nr >= QSORT_MAX_RECS)
		return -E2BIG;

	error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
	if (error)
		return error;
	si_lo = xfarray_sortinfo_lo(si);
	si_hi = xfarray_sortinfo_hi(si);
	pivot = xfarray_sortinfo_pivot(si);

	while (si->stack_depth >= 0) {
		int		ret;

		lo = si_lo[si->stack_depth];
		hi = si_hi[si->stack_depth];

		trace_xfarray_qsort(si, lo, hi);

		/* Nothing left in this partition to sort; pop stack. */
		if (lo >= hi) {
			si->stack_depth--;
			continue;
		}

		/*
		 * If directly mapping the folio and sorting can solve our
		 * problems, we're done.
		 */
		ret = xfarray_foliosort(si, lo, hi);
		if (ret < 0)
			goto out_free;
		if (ret == 1) {
			si->stack_depth--;
			continue;
		}

		/* If insertion sort can solve our problems, we're done. */
		if (xfarray_want_isort(si, lo, hi)) {
			error = xfarray_isort(si, lo, hi);
			if (error)
				goto out_free;
			si->stack_depth--;
			continue;
		}

		/* Pick a pivot, move it to a[lo] and stash it. */
		error = xfarray_qsort_pivot(si, lo, hi);
		if (error)
			goto out_free;

		/*
		 * Rearrange a[lo..hi] such that everything smaller than the
		 * pivot is on the left side of the range and everything larger
		 * than the pivot is on the right side of the range.
		 */
		while (lo < hi) {
			void	*p;

			/*
			 * Decrement hi until it finds an a[hi] less than the
			 * pivot value.
			 */
			error = xfarray_sort_scan(si, hi, &p);
			if (error)
				goto out_free;
			while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
				hi--;
				error = xfarray_sort_scan(si, hi, &p);
				if (error)
					goto out_free;
			}
			/* Stash a[hi] before releasing the cached folio. */
			if (p != scratch)
				memcpy(scratch, p, si->array->obj_size);
			xfarray_sort_scan_done(si);
			if (xfarray_sort_terminated(si, &error))
				goto out_free;

			/* Copy that item (a[hi]) to a[lo]. */
			if (lo < hi) {
				error = xfarray_sort_store(si, lo++, scratch);
				if (error)
					goto out_free;
			}

			/*
			 * Increment lo until it finds an a[lo] greater than
			 * the pivot value.
			 */
			error = xfarray_sort_scan(si, lo, &p);
			if (error)
				goto out_free;
			while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
				lo++;
				error = xfarray_sort_scan(si, lo, &p);
				if (error)
					goto out_free;
			}
			/* Stash a[lo] before releasing the cached folio. */
			if (p != scratch)
				memcpy(scratch, p, si->array->obj_size);
			xfarray_sort_scan_done(si);
			if (xfarray_sort_terminated(si, &error))
				goto out_free;

			/* Copy that item (a[lo]) to a[hi]. */
			if (lo < hi) {
				error = xfarray_sort_store(si, hi--, scratch);
				if (error)
					goto out_free;
			}

			if (xfarray_sort_terminated(si, &error))
				goto out_free;
		}

		/*
		 * Put our pivot value in the correct place at a[lo].  All
		 * values between a[beg[i]] and a[lo - 1] should be less than
		 * the pivot; and all values between a[lo + 1] and a[end[i]-1]
		 * should be greater than the pivot.
		 */
		error = xfarray_sort_store(si, lo, pivot);
		if (error)
			goto out_free;

		/* Set up the stack frame to process the two partitions. */
		error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
		if (error)
			goto out_free;

		if (xfarray_sort_terminated(si, &error))
			goto out_free;
	}

out_free:
	trace_xfarray_sort_stats(si, error);
	kvfree(si);
	return error;
}
v6.13.7
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Copyright (C) 2021-2023 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <djwong@kernel.org>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "scrub/scrub.h"
  11#include "scrub/xfile.h"
  12#include "scrub/xfarray.h"
 
  13#include "scrub/trace.h"
  14
  15/*
  16 * Large Arrays of Fixed-Size Records
  17 * ==================================
  18 *
  19 * This memory array uses an xfile (which itself is a shmem file) to store
  20 * large numbers of fixed-size records in memory that can be paged out.  This
  21 * puts less stress on the memory reclaim algorithms during an online repair
  22 * because we don't have to pin so much memory.  However, array access is less
  23 * direct than would be in a regular memory array.  Access to the array is
  24 * performed via indexed load and store methods, and an append method is
  25 * provided for convenience.  Array elements can be unset, which sets them to
  26 * all zeroes.  Unset entries are skipped during iteration, though direct loads
  27 * will return a zeroed buffer.  Callers are responsible for concurrency
  28 * control.
  29 */
  30
  31/*
  32 * Pointer to scratch space.  Because we can't access the xfile data directly,
  33 * we allocate a small amount of memory on the end of the xfarray structure to
  34 * buffer array items when we need space to store values temporarily.
  35 */
  36static inline void *xfarray_scratch(struct xfarray *array)
  37{
  38	return (array + 1);
  39}
  40
  41/* Compute array index given an xfile offset. */
  42static xfarray_idx_t
  43xfarray_idx(
  44	struct xfarray	*array,
  45	loff_t		pos)
  46{
  47	if (array->obj_size_log >= 0)
  48		return (xfarray_idx_t)pos >> array->obj_size_log;
  49
  50	return div_u64((xfarray_idx_t)pos, array->obj_size);
  51}
  52
  53/* Compute xfile offset of array element. */
  54static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx)
  55{
  56	if (array->obj_size_log >= 0)
  57		return idx << array->obj_size_log;
  58
  59	return idx * array->obj_size;
  60}
  61
  62/*
  63 * Initialize a big memory array.  Array records cannot be larger than a
  64 * page, and the array cannot span more bytes than the page cache supports.
  65 * If @required_capacity is nonzero, the maximum array size will be set to this
  66 * quantity and the array creation will fail if the underlying storage cannot
  67 * support that many records.
  68 */
  69int
  70xfarray_create(
  71	const char		*description,
  72	unsigned long long	required_capacity,
  73	size_t			obj_size,
  74	struct xfarray		**arrayp)
  75{
  76	struct xfarray		*array;
  77	struct xfile		*xfile;
  78	int			error;
  79
  80	ASSERT(obj_size < PAGE_SIZE);
  81
  82	error = xfile_create(description, 0, &xfile);
  83	if (error)
  84		return error;
  85
  86	error = -ENOMEM;
  87	array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS);
  88	if (!array)
  89		goto out_xfile;
  90
  91	array->xfile = xfile;
  92	array->obj_size = obj_size;
  93
  94	if (is_power_of_2(obj_size))
  95		array->obj_size_log = ilog2(obj_size);
  96	else
  97		array->obj_size_log = -1;
  98
  99	array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE);
 100	trace_xfarray_create(array, required_capacity);
 101
 102	if (required_capacity > 0) {
 103		if (array->max_nr < required_capacity) {
 104			error = -ENOMEM;
 105			goto out_xfarray;
 106		}
 107		array->max_nr = required_capacity;
 108	}
 109
 110	*arrayp = array;
 111	return 0;
 112
 113out_xfarray:
 114	kfree(array);
 115out_xfile:
 116	xfile_destroy(xfile);
 117	return error;
 118}
 119
 120/* Destroy the array. */
 121void
 122xfarray_destroy(
 123	struct xfarray	*array)
 124{
 125	xfile_destroy(array->xfile);
 126	kfree(array);
 127}
 128
 129/* Load an element from the array. */
 130int
 131xfarray_load(
 132	struct xfarray	*array,
 133	xfarray_idx_t	idx,
 134	void		*ptr)
 135{
 136	if (idx >= array->nr)
 137		return -ENODATA;
 138
 139	return xfile_load(array->xfile, ptr, array->obj_size,
 140			xfarray_pos(array, idx));
 141}
 142
 143/* Is this array element potentially unset? */
 144static inline bool
 145xfarray_is_unset(
 146	struct xfarray	*array,
 147	loff_t		pos)
 148{
 149	void		*temp = xfarray_scratch(array);
 150	int		error;
 151
 152	if (array->unset_slots == 0)
 153		return false;
 154
 155	error = xfile_load(array->xfile, temp, array->obj_size, pos);
 156	if (!error && xfarray_element_is_null(array, temp))
 157		return true;
 158
 159	return false;
 160}
 161
 162/*
 163 * Unset an array element.  If @idx is the last element in the array, the
 164 * array will be truncated.  Otherwise, the entry will be zeroed.
 165 */
 166int
 167xfarray_unset(
 168	struct xfarray	*array,
 169	xfarray_idx_t	idx)
 170{
 171	void		*temp = xfarray_scratch(array);
 172	loff_t		pos = xfarray_pos(array, idx);
 173	int		error;
 174
 175	if (idx >= array->nr)
 176		return -ENODATA;
 177
 178	if (idx == array->nr - 1) {
 179		array->nr--;
 180		return 0;
 181	}
 182
 183	if (xfarray_is_unset(array, pos))
 184		return 0;
 185
 186	memset(temp, 0, array->obj_size);
 187	error = xfile_store(array->xfile, temp, array->obj_size, pos);
 188	if (error)
 189		return error;
 190
 191	array->unset_slots++;
 192	return 0;
 193}
 194
 195/*
 196 * Store an element in the array.  The element must not be completely zeroed,
 197 * because those are considered unset sparse elements.
 198 */
 199int
 200xfarray_store(
 201	struct xfarray	*array,
 202	xfarray_idx_t	idx,
 203	const void	*ptr)
 204{
 205	int		ret;
 206
 207	if (idx >= array->max_nr)
 208		return -EFBIG;
 209
 210	ASSERT(!xfarray_element_is_null(array, ptr));
 211
 212	ret = xfile_store(array->xfile, ptr, array->obj_size,
 213			xfarray_pos(array, idx));
 214	if (ret)
 215		return ret;
 216
 217	array->nr = max(array->nr, idx + 1);
 218	return 0;
 219}
 220
 221/* Is this array element NULL? */
 222bool
 223xfarray_element_is_null(
 224	struct xfarray	*array,
 225	const void	*ptr)
 226{
 227	return !memchr_inv(ptr, 0, array->obj_size);
 228}
 229
 230/*
 231 * Store an element anywhere in the array that is unset.  If there are no
 232 * unset slots, append the element to the array.
 233 */
 234int
 235xfarray_store_anywhere(
 236	struct xfarray	*array,
 237	const void	*ptr)
 238{
 239	void		*temp = xfarray_scratch(array);
 240	loff_t		endpos = xfarray_pos(array, array->nr);
 241	loff_t		pos;
 242	int		error;
 243
 244	/* Find an unset slot to put it in. */
 245	for (pos = 0;
 246	     pos < endpos && array->unset_slots > 0;
 247	     pos += array->obj_size) {
 248		error = xfile_load(array->xfile, temp, array->obj_size,
 249				pos);
 250		if (error || !xfarray_element_is_null(array, temp))
 251			continue;
 252
 253		error = xfile_store(array->xfile, ptr, array->obj_size,
 254				pos);
 255		if (error)
 256			return error;
 257
 258		array->unset_slots--;
 259		return 0;
 260	}
 261
 262	/* No unset slots found; attach it on the end. */
 263	array->unset_slots = 0;
 264	return xfarray_append(array, ptr);
 265}
 266
 267/* Return length of array. */
 268uint64_t
 269xfarray_length(
 270	struct xfarray	*array)
 271{
 272	return array->nr;
 273}
 274
 275/*
 276 * Decide which array item we're going to read as part of an _iter_get.
 277 * @cur is the array index, and @pos is the file offset of that array index in
 278 * the backing xfile.  Returns ENODATA if we reach the end of the records.
 279 *
 280 * Reading from a hole in a sparse xfile causes page instantiation, so for
 281 * iterating a (possibly sparse) array we need to figure out if the cursor is
 282 * pointing at a totally uninitialized hole and move the cursor up if
 283 * necessary.
 284 */
 285static inline int
 286xfarray_find_data(
 287	struct xfarray	*array,
 288	xfarray_idx_t	*cur,
 289	loff_t		*pos)
 290{
 291	unsigned int	pgoff = offset_in_page(*pos);
 292	loff_t		end_pos = *pos + array->obj_size - 1;
 293	loff_t		new_pos;
 294
 295	/*
 296	 * If the current array record is not adjacent to a page boundary, we
 297	 * are in the middle of the page.  We do not need to move the cursor.
 298	 */
 299	if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
 300		return 0;
 301
 302	/*
 303	 * Call SEEK_DATA on the last byte in the record we're about to read.
 304	 * If the record ends at (or crosses) the end of a page then we know
 305	 * that the first byte of the record is backed by pages and don't need
 306	 * to query it.  If instead the record begins at the start of the page
 307	 * then we know that querying the last byte is just as good as querying
 308	 * the first byte, since records cannot be larger than a page.
 309	 *
 310	 * If the call returns the same file offset, we know this record is
 311	 * backed by real pages.  We do not need to move the cursor.
 312	 */
 313	new_pos = xfile_seek_data(array->xfile, end_pos);
 314	if (new_pos == -ENXIO)
 315		return -ENODATA;
 316	if (new_pos < 0)
 317		return new_pos;
 318	if (new_pos == end_pos)
 319		return 0;
 320
 321	/*
 322	 * Otherwise, SEEK_DATA told us how far up to move the file pointer to
 323	 * find more data.  Move the array index to the first record past the
 324	 * byte offset we were given.
 325	 */
 326	new_pos = roundup_64(new_pos, array->obj_size);
 327	*cur = xfarray_idx(array, new_pos);
 328	*pos = xfarray_pos(array, *cur);
 329	return 0;
 330}
 331
 332/*
 333 * Starting at *idx, fetch the next non-null array entry and advance the index
 334 * to set up the next _load_next call.  Returns ENODATA if we reach the end of
 335 * the array.  Callers must set @*idx to XFARRAY_CURSOR_INIT before the first
 336 * call to this function.
 337 */
 338int
 339xfarray_load_next(
 340	struct xfarray	*array,
 341	xfarray_idx_t	*idx,
 342	void		*rec)
 343{
 344	xfarray_idx_t	cur = *idx;
 345	loff_t		pos = xfarray_pos(array, cur);
 346	int		error;
 347
 348	do {
 349		if (cur >= array->nr)
 350			return -ENODATA;
 351
 352		/*
 353		 * Ask the backing store for the location of next possible
 354		 * written record, then retrieve that record.
 355		 */
 356		error = xfarray_find_data(array, &cur, &pos);
 357		if (error)
 358			return error;
 359		error = xfarray_load(array, cur, rec);
 360		if (error)
 361			return error;
 362
 363		cur++;
 364		pos += array->obj_size;
 365	} while (xfarray_element_is_null(array, rec));
 366
 367	*idx = cur;
 368	return 0;
 369}
 370
/* Sorting functions */

#ifdef DEBUG
/* Tally sort activity (loads/stores/compares/heapsorts) for tracing. */
# define xfarray_sort_bump_loads(si)	do { (si)->loads++; } while (0)
# define xfarray_sort_bump_stores(si)	do { (si)->stores++; } while (0)
# define xfarray_sort_bump_compares(si)	do { (si)->compares++; } while (0)
# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0)
#else
/* Non-debug builds compile the counters away entirely. */
# define xfarray_sort_bump_loads(si)
# define xfarray_sort_bump_stores(si)
# define xfarray_sort_bump_compares(si)
# define xfarray_sort_bump_heapsorts(si)
#endif /* DEBUG */
 384
 385/* Load an array element for sorting. */
 386static inline int
 387xfarray_sort_load(
 388	struct xfarray_sortinfo	*si,
 389	xfarray_idx_t		idx,
 390	void			*ptr)
 391{
 392	xfarray_sort_bump_loads(si);
 393	return xfarray_load(si->array, idx, ptr);
 394}
 395
 396/* Store an array element for sorting. */
 397static inline int
 398xfarray_sort_store(
 399	struct xfarray_sortinfo	*si,
 400	xfarray_idx_t		idx,
 401	void			*ptr)
 402{
 403	xfarray_sort_bump_stores(si);
 404	return xfarray_store(si->array, idx, ptr);
 405}
 406
 407/* Compare an array element for sorting. */
 408static inline int
 409xfarray_sort_cmp(
 410	struct xfarray_sortinfo	*si,
 411	const void		*a,
 412	const void		*b)
 413{
 414	xfarray_sort_bump_compares(si);
 415	return si->cmp_fn(a, b);
 416}
 417
 418/* Return a pointer to the low index stack for quicksort partitioning. */
 419static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
 420{
 421	return (xfarray_idx_t *)(si + 1);
 422}
 423
 424/* Return a pointer to the high index stack for quicksort partitioning. */
 425static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
 426{
 427	return xfarray_sortinfo_lo(si) + si->max_stack_depth;
 428}
 429
 430/* Size of each element in the quicksort pivot array. */
 431static inline size_t
 432xfarray_pivot_rec_sz(
 433	struct xfarray		*array)
 434{
 435	return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t);
 436}
 437
/*
 * Allocate memory to handle the sort.  The buffer holds the sortinfo header,
 * two index stacks (lo and hi), and a scratch area shared by the pivot
 * selection and the in-memory sort, which never run at the same time.
 */
static inline int
xfarray_sortinfo_alloc(
	struct xfarray		*array,
	xfarray_cmp_fn		cmp_fn,
	unsigned int		flags,
	struct xfarray_sortinfo	**infop)
{
	struct xfarray_sortinfo	*si;
	size_t			nr_bytes = sizeof(struct xfarray_sortinfo);
	size_t			pivot_rec_sz = xfarray_pivot_rec_sz(array);
	int			max_stack_depth;

	/*
	 * The median-of-nine pivot algorithm doesn't work if a subset has
	 * fewer than 9 items.  Make sure the in-memory sort will always take
	 * over for subsets where this wouldn't be the case.
	 */
	BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR);

	/*
	 * Tail-call recursion during the partitioning phase means that
	 * quicksort will never recurse more than log2(nr) times.  We need one
	 * extra level of stack to hold the initial parameters.  In-memory
	 * sort will always take care of the last few levels of recursion for
	 * us, so we can reduce the stack depth by that much.
	 */
	max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1);
	if (max_stack_depth < 1)
		max_stack_depth = 1;

	/* Each level of quicksort uses a lo and a hi index */
	nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;

	/* Scratchpad for in-memory sort, or finding the pivot */
	nr_bytes += max_t(size_t,
			(XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz,
			XFARRAY_ISORT_NR * array->obj_size);

	si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
	if (!si)
		return -ENOMEM;

	si->array = array;
	si->cmp_fn = cmp_fn;
	si->flags = flags;
	si->max_stack_depth = max_stack_depth;
	si->max_stack_used = 1;

	/* Stack frame 0 covers the entire array. */
	xfarray_sortinfo_lo(si)[0] = 0;
	xfarray_sortinfo_hi(si)[0] = array->nr - 1;
	si->relax = INIT_XCHK_RELAX;
	if (flags & XFARRAY_SORT_KILLABLE)
		si->relax.interruptible = false;

	trace_xfarray_sort(si, nr_bytes);
	*infop = si;
	return 0;
}
 497
 498/* Should this sort be terminated by a fatal signal? */
 499static inline bool
 500xfarray_sort_terminated(
 501	struct xfarray_sortinfo	*si,
 502	int			*error)
 503{
 504	/*
 505	 * If preemption is disabled, we need to yield to the scheduler every
 506	 * few seconds so that we don't run afoul of the soft lockup watchdog
 507	 * or RCU stall detector.
 508	 */
 509	if (xchk_maybe_relax(&si->relax)) {
 
 
 
 510		if (*error == 0)
 511			*error = -EINTR;
 512		return true;
 513	}
 514	return false;
 515}
 516
 517/* Do we want an in-memory sort? */
 518static inline bool
 519xfarray_want_isort(
 520	struct xfarray_sortinfo *si,
 521	xfarray_idx_t		start,
 522	xfarray_idx_t		end)
 523{
 524	/*
 525	 * For array subsets that fit in the scratchpad, it's much faster to
 526	 * use the kernel's heapsort than quicksort's stack machine.
 527	 */
 528	return (end - start) < XFARRAY_ISORT_NR;
 529}
 530
 531/* Return the scratch space within the sortinfo structure. */
 532static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
 533{
 534	return xfarray_sortinfo_hi(si) + si->max_stack_depth;
 535}
 536
 537/*
 538 * Sort a small number of array records using scratchpad memory.  The records
 539 * need not be contiguous in the xfile's memory pages.
 540 */
 541STATIC int
 542xfarray_isort(
 543	struct xfarray_sortinfo	*si,
 544	xfarray_idx_t		lo,
 545	xfarray_idx_t		hi)
 546{
 547	void			*scratch = xfarray_sortinfo_isort_scratch(si);
 548	loff_t			lo_pos = xfarray_pos(si->array, lo);
 549	loff_t			len = xfarray_pos(si->array, hi - lo + 1);
 550	int			error;
 551
 552	trace_xfarray_isort(si, lo, hi);
 553
 554	xfarray_sort_bump_loads(si);
 555	error = xfile_load(si->array->xfile, scratch, len, lo_pos);
 556	if (error)
 557		return error;
 558
 559	xfarray_sort_bump_heapsorts(si);
 560	sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
 561
 562	xfarray_sort_bump_stores(si);
 563	return xfile_store(si->array->xfile, scratch, len, lo_pos);
 564}
 565
 566/*
 567 * Sort the records from lo to hi (inclusive) if they are all backed by the
 568 * same memory folio.  Returns 1 if it sorted, 0 if it did not, or a negative
 569 * errno.
 570 */
 571STATIC int
 572xfarray_foliosort(
 573	struct xfarray_sortinfo	*si,
 574	xfarray_idx_t		lo,
 575	xfarray_idx_t		hi)
 576{
 577	struct folio		*folio;
 578	void			*startp;
 579	loff_t			lo_pos = xfarray_pos(si->array, lo);
 580	uint64_t		len = xfarray_pos(si->array, hi - lo + 1);
 581
 582	/* No single folio could back this many records. */
 583	if (len > XFILE_MAX_FOLIO_SIZE)
 584		return 0;
 585
 586	xfarray_sort_bump_loads(si);
 587	folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
 588	if (IS_ERR(folio))
 589		return PTR_ERR(folio);
 590	if (!folio)
 591		return 0;
 592
 593	trace_xfarray_foliosort(si, lo, hi);
 594
 595	xfarray_sort_bump_heapsorts(si);
 596	startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
 597	sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
 598
 599	xfarray_sort_bump_stores(si);
 600	xfile_put_folio(si->array->xfile, folio);
 601	return 1;
 602}
 603
 604/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
 605static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
 606{
 607	return xfarray_sortinfo_hi(si) + si->max_stack_depth;
 608}
 609
 610/* Return a pointer to the start of the pivot array. */
 611static inline void *
 612xfarray_sortinfo_pivot_array(
 613	struct xfarray_sortinfo	*si)
 614{
 615	return xfarray_sortinfo_pivot(si) + si->array->obj_size;
 616}
 617
 618/* The xfarray record is stored at the start of each pivot array element. */
 619static inline void *
 620xfarray_pivot_array_rec(
 621	void			*pa,
 622	size_t			pa_recsz,
 623	unsigned int		pa_idx)
 624{
 625	return pa + (pa_recsz * pa_idx);
 626}
 627
 628/* The xfarray index is stored at the end of each pivot array element. */
 629static inline xfarray_idx_t *
 630xfarray_pivot_array_idx(
 631	void			*pa,
 632	size_t			pa_recsz,
 633	unsigned int		pa_idx)
 634{
 635	return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) -
 636			sizeof(xfarray_idx_t);
 637}
 638
 639/*
 640 * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
 641 * the cached pivot record for the next step.
 642 *
 643 * Load evenly-spaced records within the given range into memory, sort them,
 644 * and choose the pivot from the median record.  Using multiple points will
 645 * improve the quality of the pivot selection, and hopefully avoid the worst
 646 * quicksort behavior, since our array values are nearly always evenly sorted.
 647 */
 648STATIC int
 649xfarray_qsort_pivot(
 650	struct xfarray_sortinfo	*si,
 651	xfarray_idx_t		lo,
 652	xfarray_idx_t		hi)
 653{
 654	void			*pivot = xfarray_sortinfo_pivot(si);
 655	void			*parray = xfarray_sortinfo_pivot_array(si);
 656	void			*recp;
 657	xfarray_idx_t		*idxp;
 658	xfarray_idx_t		step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1);
 659	size_t			pivot_rec_sz = xfarray_pivot_rec_sz(si->array);
 660	int			i, j;
 661	int			error;
 662
 663	ASSERT(step > 0);
 664
 665	/*
 666	 * Load the xfarray indexes of the records we intend to sample into the
 667	 * pivot array.
 668	 */
 669	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0);
 670	*idxp = lo;
 671	for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) {
 672		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
 673		*idxp = lo + (i * step);
 674	}
 675	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
 676			XFARRAY_QSORT_PIVOT_NR - 1);
 677	*idxp = hi;
 678
 679	/* Load the selected xfarray records into the pivot array. */
 680	for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) {
 681		xfarray_idx_t	idx;
 682
 683		recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i);
 684		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
 685
 686		/* No unset records; load directly into the array. */
 687		if (likely(si->array->unset_slots == 0)) {
 688			error = xfarray_sort_load(si, *idxp, recp);
 689			if (error)
 690				return error;
 691			continue;
 692		}
 693
 694		/*
 695		 * Load non-null records into the scratchpad without changing
 696		 * the xfarray_idx_t in the pivot array.
 697		 */
 698		idx = *idxp;
 699		xfarray_sort_bump_loads(si);
 700		error = xfarray_load_next(si->array, &idx, recp);
 701		if (error)
 702			return error;
 703	}
 704
 705	xfarray_sort_bump_heapsorts(si);
 706	sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL);
 707
 708	/*
 709	 * We sorted the pivot array records (which includes the xfarray
 710	 * indices) in xfarray record order.  The median element of the pivot
 711	 * array contains the xfarray record that we will use as the pivot.
 712	 * Copy that xfarray record to the designated space.
 713	 */
 714	recp = xfarray_pivot_array_rec(parray, pivot_rec_sz,
 715			XFARRAY_QSORT_PIVOT_NR / 2);
 716	memcpy(pivot, recp, si->array->obj_size);
 717
 718	/* If the pivot record we chose was already in a[lo] then we're done. */
 719	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
 720			XFARRAY_QSORT_PIVOT_NR / 2);
 721	if (*idxp == lo)
 722		return 0;
 723
 724	/*
 725	 * Find the cached copy of a[lo] in the pivot array so that we can swap
 726	 * a[lo] and a[pivot].
 727	 */
 728	for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) {
 729		idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
 730		if (*idxp == lo)
 731			j = i;
 732	}
 733	if (j < 0) {
 734		ASSERT(j >= 0);
 735		return -EFSCORRUPTED;
 736	}
 737
 738	/* Swap a[lo] and a[pivot]. */
 739	error = xfarray_sort_store(si, lo, pivot);
 740	if (error)
 741		return error;
 742
 743	recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j);
 744	idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
 745			XFARRAY_QSORT_PIVOT_NR / 2);
 746	return xfarray_sort_store(si, *idxp, recp);
 747}
 748
 749/*
 750 * Set up the pointers for the next iteration.  We push onto the stack all of
 751 * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the
 752 * current stack frame to point to the unsorted values between a[beg[i]] and
 753 * a[lo] so that those values will be sorted when we pop the stack.
 754 */
 755static inline int
 756xfarray_qsort_push(
 757	struct xfarray_sortinfo	*si,
 758	xfarray_idx_t		*si_lo,
 759	xfarray_idx_t		*si_hi,
 760	xfarray_idx_t		lo,
 761	xfarray_idx_t		hi)
 762{
 763	/* Check for stack overflows */
 764	if (si->stack_depth >= si->max_stack_depth - 1) {
 765		ASSERT(si->stack_depth < si->max_stack_depth - 1);
 766		return -EFSCORRUPTED;
 767	}
 768
 769	si->max_stack_used = max_t(uint8_t, si->max_stack_used,
 770					    si->stack_depth + 2);
 771
 772	si_lo[si->stack_depth + 1] = lo + 1;
 773	si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
 774	si_hi[si->stack_depth++] = lo - 1;
 775
 776	/*
 777	 * Always start with the smaller of the two partitions to keep the
 778	 * amount of recursion in check.
 779	 */
 780	if (si_hi[si->stack_depth]     - si_lo[si->stack_depth] >
 781	    si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
 782		swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
 783		swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
 784	}
 785
 786	return 0;
 787}
 788
 789static inline void
 790xfarray_sort_scan_done(
 791	struct xfarray_sortinfo	*si)
 792{
 793	if (si->folio)
 794		xfile_put_folio(si->array->xfile, si->folio);
 795	si->folio = NULL;
 796}
 797
 798/*
 799 * Cache the folio backing the start of the given array element.  If the array
 800 * element is contained entirely within the folio, return a pointer to the
 801 * cached folio.  Otherwise, load the element into the scratchpad and return a
 802 * pointer to the scratchpad.
 803 */
 804static inline int
 805xfarray_sort_scan(
 806	struct xfarray_sortinfo	*si,
 807	xfarray_idx_t		idx,
 808	void			**ptrp)
 809{
 810	loff_t			idx_pos = xfarray_pos(si->array, idx);
 811	int			error = 0;
 812
 813	if (xfarray_sort_terminated(si, &error))
 814		return error;
 815
 816	trace_xfarray_sort_scan(si, idx);
 817
 818	/* If the cached folio doesn't cover this index, release it. */
 819	if (si->folio &&
 820	    (idx < si->first_folio_idx || idx > si->last_folio_idx))
 821		xfarray_sort_scan_done(si);
 822
 823	/* Grab the first folio that backs this array element. */
 824	if (!si->folio) {
 825		struct folio	*folio;
 826		loff_t		next_pos;
 827
 828		folio = xfile_get_folio(si->array->xfile, idx_pos,
 829				si->array->obj_size, XFILE_ALLOC);
 830		if (IS_ERR(folio))
 831			return PTR_ERR(folio);
 832		si->folio = folio;
 833
 834		si->first_folio_idx = xfarray_idx(si->array,
 835				folio_pos(si->folio) + si->array->obj_size - 1);
 836
 837		next_pos = folio_pos(si->folio) + folio_size(si->folio);
 838		si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
 839		if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
 840			si->last_folio_idx--;
 841
 842		trace_xfarray_sort_scan(si, idx);
 843	}
 844
 845	/*
 846	 * If this folio still doesn't cover the desired element, it must cross
 847	 * a folio boundary.  Read into the scratchpad and we're done.
 848	 */
 849	if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
 850		void		*temp = xfarray_scratch(si->array);
 851
 852		error = xfile_load(si->array->xfile, temp, si->array->obj_size,
 853				idx_pos);
 854		if (error)
 855			return error;
 856
 857		*ptrp = temp;
 858		return 0;
 859	}
 860
 861	/* Otherwise return a pointer to the array element in the folio. */
 862	*ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
 863	return 0;
 864}
 865
 866/*
 867 * Sort the array elements via quicksort.  This implementation incorporates
 868 * four optimizations discussed in Sedgewick:
 869 *
 870 * 1. Use an explicit stack of array indices to store the next array partition
 871 *    to sort.  This helps us to avoid recursion in the call stack, which is
 872 *    particularly expensive in the kernel.
 873 *
 874 * 2. For arrays with records in arbitrary or user-controlled order, choose the
 875 *    pivot element using a median-of-nine decision tree.  This reduces the
 876 *    probability of selecting a bad pivot value which causes worst case
 877 *    behavior (i.e. partition sizes of 1).
 878 *
 879 * 3. The smaller of the two sub-partitions is pushed onto the stack to start
 880 *    the next level of recursion, and the larger sub-partition replaces the
 881 *    current stack frame.  This guarantees that we won't need more than
 882 *    log2(nr) stack space.
 883 *
 884 * 4. For small sets, load the records into the scratchpad and run heapsort on
 885 *    them because that is very fast.  In the author's experience, this yields
 886 *    a ~10% reduction in runtime.
 887 *
 888 *    If a small set is contained entirely within a single xfile memory page,
 889 *    map the page directly and run heap sort directly on the xfile page
 890 *    instead of using the load/store interface.  This halves the runtime.
 891 *
 892 * 5. This optimization is specific to the implementation.  When converging lo
 893 *    and hi after selecting a pivot, we will try to retain the xfile memory
 894 *    page between load calls, which reduces run time by 50%.
 895 */
 896
 897/*
 898 * Due to the use of signed indices, we can only support up to 2^63 records.
 899 * Files can only grow to 2^63 bytes, so this is not much of a limitation.
 900 */
 901#define QSORT_MAX_RECS		(1ULL << 63)
 902
 903int
 904xfarray_sort(
 905	struct xfarray		*array,
 906	xfarray_cmp_fn		cmp_fn,
 907	unsigned int		flags)
 908{
 909	struct xfarray_sortinfo	*si;
 910	xfarray_idx_t		*si_lo, *si_hi;
 911	void			*pivot;
 912	void			*scratch = xfarray_scratch(array);
 913	xfarray_idx_t		lo, hi;
 914	int			error = 0;
 915
 916	if (array->nr < 2)
 917		return 0;
 918	if (array->nr >= QSORT_MAX_RECS)
 919		return -E2BIG;
 920
 921	error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
 922	if (error)
 923		return error;
 924	si_lo = xfarray_sortinfo_lo(si);
 925	si_hi = xfarray_sortinfo_hi(si);
 926	pivot = xfarray_sortinfo_pivot(si);
 927
 928	while (si->stack_depth >= 0) {
 929		int		ret;
 930
 931		lo = si_lo[si->stack_depth];
 932		hi = si_hi[si->stack_depth];
 933
 934		trace_xfarray_qsort(si, lo, hi);
 935
 936		/* Nothing left in this partition to sort; pop stack. */
 937		if (lo >= hi) {
 938			si->stack_depth--;
 939			continue;
 940		}
 941
 942		/*
 943		 * If directly mapping the folio and sorting can solve our
 944		 * problems, we're done.
 945		 */
 946		ret = xfarray_foliosort(si, lo, hi);
 947		if (ret < 0)
 948			goto out_free;
 949		if (ret == 1) {
 950			si->stack_depth--;
 951			continue;
 952		}
 953
 954		/* If insertion sort can solve our problems, we're done. */
 955		if (xfarray_want_isort(si, lo, hi)) {
 956			error = xfarray_isort(si, lo, hi);
 957			if (error)
 958				goto out_free;
 959			si->stack_depth--;
 960			continue;
 961		}
 962
 963		/* Pick a pivot, move it to a[lo] and stash it. */
 964		error = xfarray_qsort_pivot(si, lo, hi);
 965		if (error)
 966			goto out_free;
 967
 968		/*
 969		 * Rearrange a[lo..hi] such that everything smaller than the
 970		 * pivot is on the left side of the range and everything larger
 971		 * than the pivot is on the right side of the range.
 972		 */
 973		while (lo < hi) {
 974			void	*p;
 975
 976			/*
 977			 * Decrement hi until it finds an a[hi] less than the
 978			 * pivot value.
 979			 */
 980			error = xfarray_sort_scan(si, hi, &p);
 981			if (error)
 982				goto out_free;
 983			while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
 984				hi--;
 985				error = xfarray_sort_scan(si, hi, &p);
 986				if (error)
 987					goto out_free;
 988			}
 989			if (p != scratch)
 990				memcpy(scratch, p, si->array->obj_size);
 991			xfarray_sort_scan_done(si);
 992			if (xfarray_sort_terminated(si, &error))
 993				goto out_free;
 994
 995			/* Copy that item (a[hi]) to a[lo]. */
 996			if (lo < hi) {
 997				error = xfarray_sort_store(si, lo++, scratch);
 998				if (error)
 999					goto out_free;
1000			}
1001
1002			/*
1003			 * Increment lo until it finds an a[lo] greater than
1004			 * the pivot value.
1005			 */
1006			error = xfarray_sort_scan(si, lo, &p);
1007			if (error)
1008				goto out_free;
1009			while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
1010				lo++;
1011				error = xfarray_sort_scan(si, lo, &p);
1012				if (error)
1013					goto out_free;
1014			}
1015			if (p != scratch)
1016				memcpy(scratch, p, si->array->obj_size);
1017			xfarray_sort_scan_done(si);
1018			if (xfarray_sort_terminated(si, &error))
1019				goto out_free;
1020
1021			/* Copy that item (a[lo]) to a[hi]. */
1022			if (lo < hi) {
1023				error = xfarray_sort_store(si, hi--, scratch);
1024				if (error)
1025					goto out_free;
1026			}
1027
1028			if (xfarray_sort_terminated(si, &error))
1029				goto out_free;
1030		}
1031
1032		/*
1033		 * Put our pivot value in the correct place at a[lo].  All
1034		 * values between a[beg[i]] and a[lo - 1] should be less than
1035		 * the pivot; and all values between a[lo + 1] and a[end[i]-1]
1036		 * should be greater than the pivot.
1037		 */
1038		error = xfarray_sort_store(si, lo, pivot);
1039		if (error)
1040			goto out_free;
1041
1042		/* Set up the stack frame to process the two partitions. */
1043		error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
1044		if (error)
1045			goto out_free;
1046
1047		if (xfarray_sort_terminated(si, &error))
1048			goto out_free;
1049	}
1050
1051out_free:
1052	trace_xfarray_sort_stats(si, error);
1053	xfarray_sort_scan_done(si);
1054	kvfree(si);
1055	return error;
1056}
1057
1058/* How many bytes is this array consuming? */
1059unsigned long long
1060xfarray_bytes(
1061	struct xfarray		*array)
1062{
1063	return xfile_bytes(array->xfile);
1064}
1065
1066/* Empty the entire array. */
1067void
1068xfarray_truncate(
1069	struct xfarray	*array)
1070{
1071	xfile_discard(array->xfile, 0, MAX_LFS_FILESIZE);
1072	array->nr = 0;
1073}