Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.15.
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Stress userfaultfd syscall.
   4 *
   5 *  Copyright (C) 2015  Red Hat, Inc.
   6 *
   7 * This test allocates two virtual areas and bounces the physical
   8 * memory across the two virtual areas (from area_src to area_dst)
   9 * using userfaultfd.
  10 *
  11 * There are three threads running per CPU:
  12 *
  13 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
  14 *    page of the area_dst (while the physical page may still be in
  15 *    area_src), and increments a per-page counter in the same page,
  16 *    and checks its value against a verification region.
  17 *
  18 * 2) another per-CPU thread handles the userfaults generated by
  19 *    thread 1 above. userfaultfd blocking reads or poll() modes are
  20 *    exercised interleaved.
  21 *
  22 * 3) one last per-CPU thread transfers the memory in the background
  23 *    at maximum bandwidth (if not already transferred by thread
   24 *    2). Each cpu thread takes care of transferring a portion of the
  25 *    area.
  26 *
  27 * When all threads of type 3 completed the transfer, one bounce is
  28 * complete. area_src and area_dst are then swapped. All threads are
  29 * respawned and so the bounce is immediately restarted in the
  30 * opposite direction.
  31 *
  32 * per-CPU threads 1 by triggering userfaults inside
  33 * pthread_mutex_lock will also verify the atomicity of the memory
  34 * transfer (UFFDIO_COPY).
  35 */
  36
  37#define _GNU_SOURCE
  38#include <stdio.h>
  39#include <errno.h>
  40#include <unistd.h>
  41#include <stdlib.h>
  42#include <sys/types.h>
  43#include <sys/stat.h>
  44#include <fcntl.h>
  45#include <time.h>
  46#include <signal.h>
  47#include <poll.h>
  48#include <string.h>
  49#include <sys/mman.h>
  50#include <sys/syscall.h>
  51#include <sys/ioctl.h>
  52#include <sys/wait.h>
  53#include <pthread.h>
  54#include <linux/userfaultfd.h>
  55#include <setjmp.h>
  56#include <stdbool.h>
  57#include <assert.h>
  58#include <inttypes.h>
  59#include <stdint.h>
  60
  61#include "../kselftest.h"
  62
  63#ifdef __NR_userfaultfd
  64
/*
 * Test geometry: each area spans nr_pages * page_size bytes, every
 * background thread transfers nr_pages_per_cpu pages, and stress()
 * spawns one set of threads for each of the nr_cpus.
 */
static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;

/* Bit flags tested against "bounces" to pick the behaviour of one bounce */
#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
static int bounces;

/* Values stored in test_type */
#define TEST_ANON	1
#define TEST_HUGETLB	2
#define TEST_SHMEM	3
static int test_type;

/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
#define ALARM_INTERVAL_SECS 10
/* Cleared by the copy/zeropage helpers once the -EEXIST retry has run */
static volatile bool test_uffdio_copy_eexist = true;
static volatile bool test_uffdio_zeropage_eexist = true;
/* Whether to test uffd write-protection */
static bool test_uffdio_wp = false;
/* Whether to test uffd minor faults */
static bool test_uffdio_minor = false;

/* Selects MAP_SHARED instead of MAP_PRIVATE for the hugetlb mappings */
static bool map_shared;
/* File descriptors backing the shmem and hugetlb mappings */
static int shm_fd;
static int huge_fd;
/* Address of the hugetlb mapping that sits at file offset 0 */
static char *huge_fd_off0;
/* Expected value of every per-page counter, indexed by page number */
static unsigned long long *count_verify;
/* The userfaultfd descriptor, -1 while none is open */
static int uffd = -1;
/*
 * uffd_flags: result of fcntl(F_GETFD) on uffd.
 * finished:   set by stress() to make the locking threads exit.
 * pipefd:     two descriptors per cpu; stress() writes to the odd ones to
 *             stop the poll threads, which poll the even ones.
 */
static int uffd_flags, finished, *pipefd;
/* The two areas memory is bounced between, plus their alias mappings */
static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
/* Page compared against released pages in faulting_process() */
static char *zeropage;
/* Attributes passed to every pthread_create() in stress() */
pthread_attr_t attr;
  97
/*
 * Userfaultfd test statistics.
 *
 * One entry per fault-handling thread; the counters are bumped by
 * uffd_handle_page_fault() and summed up by uffd_stats_report().
 */
struct uffd_stats {
	int cpu;			/* index assigned by uffd_stats_reset() */
	unsigned long missing_faults;	/* missing faults resolved by copy_page() */
	unsigned long wp_faults;	/* write-protect faults handled */
	unsigned long minor_faults;	/* minor faults handled */
};
 105
/*
 * pthread_mutex_t starts at page offset 0.
 *
 * Yields a pointer to the mutex stored at the beginning of page ___nr of
 * ___area.
 */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs.
 *
 * The address right after the mutex is rounded up to a multiple of
 * sizeof(unsigned long long) and returned as a volatile counter pointer.
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *) ((unsigned long)		\
				 ((___area) + (___nr)*page_size +	\
				  sizeof(pthread_mutex_t) +		\
				  sizeof(unsigned long long) - 1) &	\
				 ~(unsigned long)(sizeof(unsigned long long) \
						  -  1)))
 120
/* Example command lines printed verbatim by usage(). */
const char *examples =
    "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
    "./userfaultfd anon 100 99999\n\n"
    "# Run share memory test on 1GiB region with 99 bounces:\n"
    "./userfaultfd shmem 1000 99\n\n"
    "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
    "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
    "# Run the same hugetlb test but using shmem:\n"
    "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
    "# 10MiB-~6GiB 999 bounces anonymous test, "
    "continue forever unless an error triggers\n"
    "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
 133
 134static void usage(void)
 135{
 136	fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
 137		"[hugetlbfs_file]\n\n");
 138	fprintf(stderr, "Supported <test type>: anon, hugetlb, "
 139		"hugetlb_shared, shmem\n\n");
 140	fprintf(stderr, "Examples:\n\n");
 141	fprintf(stderr, "%s", examples);
 142	exit(1);
 143}
 144
/*
 * Print "ERROR: <formatted message> (errno=N, line=L)" to stderr.
 * errno is sampled first so the fprintf() calls cannot clobber it, and
 * __LINE__ expands to the line of the macro invocation.
 */
#define _err(fmt, ...)						\
	do {							\
		int ret = errno;				\
		fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__);	\
		fprintf(stderr, " (errno=%d, line=%d)\n",	\
			ret, __LINE__);				\
	} while (0)

/*
 * Same as _err(), then terminate the process with exit status 1.
 * The newline is appended by _err(), so fmt should not end with one.
 */
#define err(fmt, ...)				\
	do {					\
		_err(fmt, ##__VA_ARGS__);	\
		exit(1);			\
	} while (0)
 158
 159static void uffd_stats_reset(struct uffd_stats *uffd_stats,
 160			     unsigned long n_cpus)
 161{
 162	int i;
 163
 164	for (i = 0; i < n_cpus; i++) {
 165		uffd_stats[i].cpu = i;
 166		uffd_stats[i].missing_faults = 0;
 167		uffd_stats[i].wp_faults = 0;
 168		uffd_stats[i].minor_faults = 0;
 169	}
 170}
 171
 172static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
 173{
 174	int i;
 175	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
 176
 177	for (i = 0; i < n_cpus; i++) {
 178		miss_total += stats[i].missing_faults;
 179		wp_total += stats[i].wp_faults;
 180		minor_total += stats[i].minor_faults;
 181	}
 182
 183	printf("userfaults: ");
 184	if (miss_total) {
 185		printf("%llu missing (", miss_total);
 186		for (i = 0; i < n_cpus; i++)
 187			printf("%lu+", stats[i].missing_faults);
 188		printf("\b) ");
 189	}
 190	if (wp_total) {
 191		printf("%llu wp (", wp_total);
 192		for (i = 0; i < n_cpus; i++)
 193			printf("%lu+", stats[i].wp_faults);
 194		printf("\b) ");
 195	}
 196	if (minor_total) {
 197		printf("%llu minor (", minor_total);
 198		for (i = 0; i < n_cpus; i++)
 199			printf("%lu+", stats[i].minor_faults);
 200		printf("\b)");
 201	}
 202	printf("\n");
 203}
 204
 205static void anon_release_pages(char *rel_area)
 206{
 207	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
 208		err("madvise(MADV_DONTNEED) failed");
 209}
 210
 211static void anon_allocate_area(void **alloc_area)
 212{
 213	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 214			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 215	if (*alloc_area == MAP_FAILED)
 216		err("mmap of anonymous memory failed");
 217}
 218
 219static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 220{
 221}
 222
 223static void hugetlb_release_pages(char *rel_area)
 224{
 225	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 226		      rel_area == huge_fd_off0 ? 0 : nr_pages * page_size,
 227		      nr_pages * page_size))
 228		err("fallocate() failed");
 229}
 230
 231static void hugetlb_allocate_area(void **alloc_area)
 232{
 233	void *area_alias = NULL;
 234	char **alloc_area_alias;
 235
 236	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 237			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
 238			   MAP_HUGETLB,
 239			   huge_fd, *alloc_area == area_src ? 0 :
 240			   nr_pages * page_size);
 241	if (*alloc_area == MAP_FAILED)
 242		err("mmap of hugetlbfs file failed");
 243
 244	if (map_shared) {
 245		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 246				  MAP_SHARED | MAP_HUGETLB,
 247				  huge_fd, *alloc_area == area_src ? 0 :
 248				  nr_pages * page_size);
 249		if (area_alias == MAP_FAILED)
 250			err("mmap of hugetlb file alias failed");
 251	}
 252
 253	if (*alloc_area == area_src) {
 254		huge_fd_off0 = *alloc_area;
 255		alloc_area_alias = &area_src_alias;
 256	} else {
 257		alloc_area_alias = &area_dst_alias;
 258	}
 259	if (area_alias)
 260		*alloc_area_alias = area_alias;
 261}
 262
 263static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 264{
 265	if (!map_shared)
 266		return;
 267	/*
 268	 * We can't zap just the pagetable with hugetlbfs because
 269	 * MADV_DONTEED won't work. So exercise -EEXIST on a alias
 270	 * mapping where the pagetables are not established initially,
 271	 * this way we'll exercise the -EEXEC at the fs level.
 272	 */
 273	*start = (unsigned long) area_dst_alias + offset;
 274}
 275
 276static void shmem_release_pages(char *rel_area)
 277{
 278	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
 279		err("madvise(MADV_REMOVE) failed");
 280}
 281
 282static void shmem_allocate_area(void **alloc_area)
 283{
 284	void *area_alias = NULL;
 285	bool is_src = alloc_area == (void **)&area_src;
 286	unsigned long offset = is_src ? 0 : nr_pages * page_size;
 287
 288	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 289			   MAP_SHARED, shm_fd, offset);
 290	if (*alloc_area == MAP_FAILED)
 291		err("mmap of memfd failed");
 292
 293	area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
 294			  MAP_SHARED, shm_fd, offset);
 295	if (area_alias == MAP_FAILED)
 296		err("mmap of memfd alias failed");
 297
 298	if (is_src)
 299		area_src_alias = area_alias;
 300	else
 301		area_dst_alias = area_alias;
 302}
 303
 304static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
 305{
 306	*start = (unsigned long)area_dst_alias + offset;
 307}
 308
/* Per memory type (anon, shmem, hugetlb) hooks used by the tests. */
struct uffd_test_ops {
	/* Bitmask of (1 << _UFFDIO_*) ioctls expected for this memory type */
	unsigned long expected_ioctls;
	/* Map one area and store its address in *alloc_area */
	void (*allocate_area)(void **alloc_area);
	/* Release every page of the area starting at rel_area */
	void (*release_pages)(char *rel_area);
	/* Optionally redirect *start to the alias mapping at the given offset */
	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
};
 315
/* ioctls expected on a shmem range: wake, copy and zeropage */
#define SHMEM_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
					 (1 << _UFFDIO_COPY) | \
					 (1 << _UFFDIO_ZEROPAGE))

/* ioctls expected on an anonymous range: the shmem set plus writeprotect */
#define ANON_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
					 (1 << _UFFDIO_COPY) | \
					 (1 << _UFFDIO_ZEROPAGE) | \
					 (1 << _UFFDIO_WRITEPROTECT))

/* Hooks for anonymous memory; it has no alias mapping */
static struct uffd_test_ops anon_uffd_test_ops = {
	.expected_ioctls = ANON_EXPECTED_IOCTLS,
	.allocate_area	= anon_allocate_area,
	.release_pages	= anon_release_pages,
	.alias_mapping = noop_alias_mapping,
};

/* Hooks for shmem (memfd backed) memory */
static struct uffd_test_ops shmem_uffd_test_ops = {
	.expected_ioctls = SHMEM_EXPECTED_IOCTLS,
	.allocate_area	= shmem_allocate_area,
	.release_pages	= shmem_release_pages,
	.alias_mapping = shmem_alias_mapping,
};

/* Hooks for hugetlbfs memory; the basic ioctl set minus UFFDIO_CONTINUE */
static struct uffd_test_ops hugetlb_uffd_test_ops = {
	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
	.allocate_area	= hugetlb_allocate_area,
	.release_pages	= hugetlb_release_pages,
	.alias_mapping = hugetlb_alias_mapping,
};

/* Hooks of the memory type currently under test */
static struct uffd_test_ops *uffd_test_ops;
 347
 348static void userfaultfd_open(uint64_t *features)
 349{
 350	struct uffdio_api uffdio_api;
 351
 352	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
 353	if (uffd < 0)
 354		err("userfaultfd syscall not available in this kernel");
 355	uffd_flags = fcntl(uffd, F_GETFD, NULL);
 356
 357	uffdio_api.api = UFFD_API;
 358	uffdio_api.features = *features;
 359	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
 360		err("UFFDIO_API failed.\nPlease make sure to "
 361		    "run with either root or ptrace capability.");
 362	if (uffdio_api.api != UFFD_API)
 363		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
 364
 365	*features = uffdio_api.features;
 366}
 367
 368static inline void munmap_area(void **area)
 369{
 370	if (*area)
 371		if (munmap(*area, nr_pages * page_size))
 372			err("munmap");
 373
 374	*area = NULL;
 375}
 376
 377static void uffd_test_ctx_clear(void)
 378{
 379	size_t i;
 380
 381	if (pipefd) {
 382		for (i = 0; i < nr_cpus * 2; ++i) {
 383			if (close(pipefd[i]))
 384				err("close pipefd");
 385		}
 386		free(pipefd);
 387		pipefd = NULL;
 388	}
 389
 390	if (count_verify) {
 391		free(count_verify);
 392		count_verify = NULL;
 393	}
 394
 395	if (uffd != -1) {
 396		if (close(uffd))
 397			err("close uffd");
 398		uffd = -1;
 399	}
 400
 401	huge_fd_off0 = NULL;
 402	munmap_area((void **)&area_src);
 403	munmap_area((void **)&area_src_alias);
 404	munmap_area((void **)&area_dst);
 405	munmap_area((void **)&area_dst_alias);
 406}
 407
 408static void uffd_test_ctx_init_ext(uint64_t *features)
 409{
 410	unsigned long nr, cpu;
 411
 412	uffd_test_ctx_clear();
 413
 414	uffd_test_ops->allocate_area((void **)&area_src);
 415	uffd_test_ops->allocate_area((void **)&area_dst);
 416
 417	userfaultfd_open(features);
 418
 419	count_verify = malloc(nr_pages * sizeof(unsigned long long));
 420	if (!count_verify)
 421		err("count_verify");
 422
 423	for (nr = 0; nr < nr_pages; nr++) {
 424		*area_mutex(area_src, nr) =
 425			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
 426		count_verify[nr] = *area_count(area_src, nr) = 1;
 427		/*
 428		 * In the transition between 255 to 256, powerpc will
 429		 * read out of order in my_bcmp and see both bytes as
 430		 * zero, so leave a placeholder below always non-zero
 431		 * after the count, to avoid my_bcmp to trigger false
 432		 * positives.
 433		 */
 434		*(area_count(area_src, nr) + 1) = 1;
 435	}
 436
 437	/*
 438	 * After initialization of area_src, we must explicitly release pages
 439	 * for area_dst to make sure it's fully empty.  Otherwise we could have
 440	 * some area_dst pages be errornously initialized with zero pages,
 441	 * hence we could hit memory corruption later in the test.
 442	 *
 443	 * One example is when THP is globally enabled, above allocate_area()
 444	 * calls could have the two areas merged into a single VMA (as they
 445	 * will have the same VMA flags so they're mergeable).  When we
 446	 * initialize the area_src above, it's possible that some part of
 447	 * area_dst could have been faulted in via one huge THP that will be
 448	 * shared between area_src and area_dst.  It could cause some of the
 449	 * area_dst won't be trapped by missing userfaults.
 450	 *
 451	 * This release_pages() will guarantee even if that happened, we'll
 452	 * proactively split the thp and drop any accidentally initialized
 453	 * pages within area_dst.
 454	 */
 455	uffd_test_ops->release_pages(area_dst);
 456
 457	pipefd = malloc(sizeof(int) * nr_cpus * 2);
 458	if (!pipefd)
 459		err("pipefd");
 460	for (cpu = 0; cpu < nr_cpus; cpu++)
 461		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
 462			err("pipe");
 463}
 464
 465static inline void uffd_test_ctx_init(uint64_t features)
 466{
 467	uffd_test_ctx_init_ext(&features);
 468}
 469
 470static int my_bcmp(char *str1, char *str2, size_t n)
 471{
 472	unsigned long i;
 473	for (i = 0; i < n; i++)
 474		if (str1[i] != str2[i])
 475			return 1;
 476	return 0;
 477}
 478
 479static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
 480{
 481	struct uffdio_writeprotect prms;
 482
 483	/* Write protection page faults */
 484	prms.range.start = start;
 485	prms.range.len = len;
 486	/* Undo write-protect, do wakeup after that */
 487	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
 488
 489	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
 490		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
 491}
 492
 493static void continue_range(int ufd, __u64 start, __u64 len)
 494{
 495	struct uffdio_continue req;
 496	int ret;
 497
 498	req.range.start = start;
 499	req.range.len = len;
 500	req.mode = 0;
 501
 502	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
 503		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
 504		    (uint64_t)start);
 505
 506	/*
 507	 * Error handling within the kernel for continue is subtly different
 508	 * from copy or zeropage, so it may be a source of bugs. Trigger an
 509	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
 510	 */
 511	req.mapped = 0;
 512	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
 513	if (ret >= 0 || req.mapped != -EEXIST)
 514		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
 515		    ret, (int64_t) req.mapped);
 516}
 517
/*
 * Thread body: until "finished" is set, pick a page of area_dst, take
 * its per-page mutex, check that the page counter matches count_verify
 * and increment both.
 *
 * arg carries the cpu number as an integer cast to a pointer.  Pages are
 * chosen with random_r() when BOUNCE_RANDOM is set and sequentially
 * otherwise.  Unless BOUNCE_RACINGFAULTS is set, the seed or the start
 * page is offset per cpu.
 *
 * Exits through err() on a counter mismatch.  Always returns NULL.
 */
static void *locking_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	struct random_data rand;
	/* self-initialisation only silences a "may be uninitialized" warning */
	unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
	int32_t rand_nr;
	unsigned long long count;
	char randstate[64];
	unsigned int seed;

	if (bounces & BOUNCE_RANDOM) {
		seed = (unsigned int) time(NULL) - bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			seed += cpu;
		/* random_data and the state buffer are zeroed before initstate_r() */
		bzero(&rand, sizeof(rand));
		bzero(&randstate, sizeof(randstate));
		if (initstate_r(seed, randstate, sizeof(randstate), &rand))
			err("initstate_r failed");
	} else {
		/* sequential mode: page_nr is incremented before its first use */
		page_nr = -bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			page_nr += cpu * nr_pages_per_cpu;
	}

	while (!finished) {
		if (bounces & BOUNCE_RANDOM) {
			if (random_r(&rand, &rand_nr))
				err("random_r failed");
			page_nr = rand_nr;
			/* when long is wider than int32_t, fill the upper bits too */
			if (sizeof(page_nr) > sizeof(rand_nr)) {
				if (random_r(&rand, &rand_nr))
					err("random_r failed");
				/* two 16-bit shifts instead of one 32-bit shift */
				page_nr |= (((unsigned long) rand_nr) << 16) <<
					   16;
			}
		} else
			page_nr += 1;
		page_nr %= nr_pages;
		/* touching the mutex may itself fault on area_dst */
		pthread_mutex_lock(area_mutex(area_dst, page_nr));
		count = *area_count(area_dst, page_nr);
		if (count != count_verify[page_nr])
			err("page_nr %lu memory corruption %llu %llu",
			    page_nr, count, count_verify[page_nr]);
		count++;
		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
	}

	return NULL;
}
 568
 569static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
 570			    unsigned long offset)
 571{
 572	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
 573				     uffdio_copy->len,
 574				     offset);
 575	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
 576		/* real retval in ufdio_copy.copy */
 577		if (uffdio_copy->copy != -EEXIST)
 578			err("UFFDIO_COPY retry error: %"PRId64,
 579			    (int64_t)uffdio_copy->copy);
 580	} else {
 581		err("UFFDIO_COPY retry unexpected: %"PRId64,
 582		    (int64_t)uffdio_copy->copy);
 583	}
 584}
 585
 586static int __copy_page(int ufd, unsigned long offset, bool retry)
 587{
 588	struct uffdio_copy uffdio_copy;
 589
 590	if (offset >= nr_pages * page_size)
 591		err("unexpected offset %lu\n", offset);
 592	uffdio_copy.dst = (unsigned long) area_dst + offset;
 593	uffdio_copy.src = (unsigned long) area_src + offset;
 594	uffdio_copy.len = page_size;
 595	if (test_uffdio_wp)
 596		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
 597	else
 598		uffdio_copy.mode = 0;
 599	uffdio_copy.copy = 0;
 600	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
 601		/* real retval in ufdio_copy.copy */
 602		if (uffdio_copy.copy != -EEXIST)
 603			err("UFFDIO_COPY error: %"PRId64,
 604			    (int64_t)uffdio_copy.copy);
 605	} else if (uffdio_copy.copy != page_size) {
 606		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
 607	} else {
 608		if (test_uffdio_copy_eexist && retry) {
 609			test_uffdio_copy_eexist = false;
 610			retry_copy_page(ufd, &uffdio_copy, offset);
 611		}
 612		return 1;
 613	}
 614	return 0;
 615}
 616
 617static int copy_page_retry(int ufd, unsigned long offset)
 618{
 619	return __copy_page(ufd, offset, true);
 620}
 621
 622static int copy_page(int ufd, unsigned long offset)
 623{
 624	return __copy_page(ufd, offset, false);
 625}
 626
 627static int uffd_read_msg(int ufd, struct uffd_msg *msg)
 628{
 629	int ret = read(uffd, msg, sizeof(*msg));
 630
 631	if (ret != sizeof(*msg)) {
 632		if (ret < 0) {
 633			if (errno == EAGAIN)
 634				return 1;
 635			err("blocking read error");
 636		} else {
 637			err("short read");
 638		}
 639	}
 640
 641	return 0;
 642}
 643
 644static void uffd_handle_page_fault(struct uffd_msg *msg,
 645				   struct uffd_stats *stats)
 646{
 647	unsigned long offset;
 648
 649	if (msg->event != UFFD_EVENT_PAGEFAULT)
 650		err("unexpected msg event %u", msg->event);
 651
 652	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
 653		/* Write protect page faults */
 654		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
 655		stats->wp_faults++;
 656	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
 657		uint8_t *area;
 658		int b;
 659
 660		/*
 661		 * Minor page faults
 662		 *
 663		 * To prove we can modify the original range for testing
 664		 * purposes, we're going to bit flip this range before
 665		 * continuing.
 666		 *
 667		 * Note that this requires all minor page fault tests operate on
 668		 * area_dst (non-UFFD-registered) and area_dst_alias
 669		 * (UFFD-registered).
 670		 */
 671
 672		area = (uint8_t *)(area_dst +
 673				   ((char *)msg->arg.pagefault.address -
 674				    area_dst_alias));
 675		for (b = 0; b < page_size; ++b)
 676			area[b] = ~area[b];
 677		continue_range(uffd, msg->arg.pagefault.address, page_size);
 678		stats->minor_faults++;
 679	} else {
 680		/* Missing page faults */
 681		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
 682			err("unexpected write fault");
 683
 684		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
 685		offset &= ~(page_size-1);
 686
 687		if (copy_page(uffd, offset))
 688			stats->missing_faults++;
 689	}
 690}
 691
 692static void *uffd_poll_thread(void *arg)
 693{
 694	struct uffd_stats *stats = (struct uffd_stats *)arg;
 695	unsigned long cpu = stats->cpu;
 696	struct pollfd pollfd[2];
 697	struct uffd_msg msg;
 698	struct uffdio_register uffd_reg;
 699	int ret;
 700	char tmp_chr;
 701
 702	pollfd[0].fd = uffd;
 703	pollfd[0].events = POLLIN;
 704	pollfd[1].fd = pipefd[cpu*2];
 705	pollfd[1].events = POLLIN;
 706
 707	for (;;) {
 708		ret = poll(pollfd, 2, -1);
 709		if (ret <= 0)
 710			err("poll error: %d", ret);
 711		if (pollfd[1].revents & POLLIN) {
 712			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
 713				err("read pipefd error");
 714			break;
 715		}
 716		if (!(pollfd[0].revents & POLLIN))
 717			err("pollfd[0].revents %d", pollfd[0].revents);
 718		if (uffd_read_msg(uffd, &msg))
 719			continue;
 720		switch (msg.event) {
 721		default:
 722			err("unexpected msg event %u\n", msg.event);
 723			break;
 724		case UFFD_EVENT_PAGEFAULT:
 725			uffd_handle_page_fault(&msg, stats);
 726			break;
 727		case UFFD_EVENT_FORK:
 728			close(uffd);
 729			uffd = msg.arg.fork.ufd;
 730			pollfd[0].fd = uffd;
 731			break;
 732		case UFFD_EVENT_REMOVE:
 733			uffd_reg.range.start = msg.arg.remove.start;
 734			uffd_reg.range.len = msg.arg.remove.end -
 735				msg.arg.remove.start;
 736			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
 737				err("remove failure");
 738			break;
 739		case UFFD_EVENT_REMAP:
 740			area_dst = (char *)(unsigned long)msg.arg.remap.to;
 741			break;
 742		}
 743	}
 744
 745	return NULL;
 746}
 747
 748pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
 749
 750static void *uffd_read_thread(void *arg)
 751{
 752	struct uffd_stats *stats = (struct uffd_stats *)arg;
 753	struct uffd_msg msg;
 754
 755	pthread_mutex_unlock(&uffd_read_mutex);
 756	/* from here cancellation is ok */
 757
 758	for (;;) {
 759		if (uffd_read_msg(uffd, &msg))
 760			continue;
 761		uffd_handle_page_fault(&msg, stats);
 762	}
 763
 764	return NULL;
 765}
 766
 767static void *background_thread(void *arg)
 768{
 769	unsigned long cpu = (unsigned long) arg;
 770	unsigned long page_nr, start_nr, mid_nr, end_nr;
 771
 772	start_nr = cpu * nr_pages_per_cpu;
 773	end_nr = (cpu+1) * nr_pages_per_cpu;
 774	mid_nr = (start_nr + end_nr) / 2;
 775
 776	/* Copy the first half of the pages */
 777	for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
 778		copy_page_retry(uffd, page_nr * page_size);
 779
 780	/*
 781	 * If we need to test uffd-wp, set it up now.  Then we'll have
 782	 * at least the first half of the pages mapped already which
 783	 * can be write-protected for testing
 784	 */
 785	if (test_uffdio_wp)
 786		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
 787			nr_pages_per_cpu * page_size, true);
 788
 789	/*
 790	 * Continue the 2nd half of the page copying, handling write
 791	 * protection faults if any
 792	 */
 793	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
 794		copy_page_retry(uffd, page_nr * page_size);
 795
 796	return NULL;
 797}
 798
 799static int stress(struct uffd_stats *uffd_stats)
 800{
 801	unsigned long cpu;
 802	pthread_t locking_threads[nr_cpus];
 803	pthread_t uffd_threads[nr_cpus];
 804	pthread_t background_threads[nr_cpus];
 805
 806	finished = 0;
 807	for (cpu = 0; cpu < nr_cpus; cpu++) {
 808		if (pthread_create(&locking_threads[cpu], &attr,
 809				   locking_thread, (void *)cpu))
 810			return 1;
 811		if (bounces & BOUNCE_POLL) {
 812			if (pthread_create(&uffd_threads[cpu], &attr,
 813					   uffd_poll_thread,
 814					   (void *)&uffd_stats[cpu]))
 815				return 1;
 816		} else {
 817			if (pthread_create(&uffd_threads[cpu], &attr,
 818					   uffd_read_thread,
 819					   (void *)&uffd_stats[cpu]))
 820				return 1;
 821			pthread_mutex_lock(&uffd_read_mutex);
 822		}
 823		if (pthread_create(&background_threads[cpu], &attr,
 824				   background_thread, (void *)cpu))
 825			return 1;
 826	}
 827	for (cpu = 0; cpu < nr_cpus; cpu++)
 828		if (pthread_join(background_threads[cpu], NULL))
 829			return 1;
 830
 831	/*
 832	 * Be strict and immediately zap area_src, the whole area has
 833	 * been transferred already by the background treads. The
 834	 * area_src could then be faulted in in a racy way by still
 835	 * running uffdio_threads reading zeropages after we zapped
 836	 * area_src (but they're guaranteed to get -EEXIST from
 837	 * UFFDIO_COPY without writing zero pages into area_dst
 838	 * because the background threads already completed).
 839	 */
 840	uffd_test_ops->release_pages(area_src);
 841
 842	finished = 1;
 843	for (cpu = 0; cpu < nr_cpus; cpu++)
 844		if (pthread_join(locking_threads[cpu], NULL))
 845			return 1;
 846
 847	for (cpu = 0; cpu < nr_cpus; cpu++) {
 848		char c;
 849		if (bounces & BOUNCE_POLL) {
 850			if (write(pipefd[cpu*2+1], &c, 1) != 1)
 851				err("pipefd write error");
 852			if (pthread_join(uffd_threads[cpu],
 853					 (void *)&uffd_stats[cpu]))
 854				return 1;
 855		} else {
 856			if (pthread_cancel(uffd_threads[cpu]))
 857				return 1;
 858			if (pthread_join(uffd_threads[cpu], NULL))
 859				return 1;
 860		}
 861	}
 862
 863	return 0;
 864}
 865
 866sigjmp_buf jbuf, *sigbuf;
 867
 868static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
 869{
 870	if (sig == SIGBUS) {
 871		if (sigbuf)
 872			siglongjmp(*sigbuf, 1);
 873		abort();
 874	}
 875}
 876
 877/*
 878 * For non-cooperative userfaultfd test we fork() a process that will
 879 * generate pagefaults, will mremap the area monitored by the
 880 * userfaultfd and at last this process will release the monitored
 881 * area.
 882 * For the anonymous and shared memory the area is divided into two
 883 * parts, the first part is accessed before mremap, and the second
 884 * part is accessed after mremap. Since hugetlbfs does not support
 885 * mremap, the entire monitored area is accessed in a single pass for
 886 * HUGETLB_TEST.
 887 * The release of the pages currently generates event for shmem and
 888 * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
 889 * for hugetlb.
 890 * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
 891 * monitored area, generate pagefaults and test that signal is delivered.
 892 * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
 893 * test robustness use case - we release monitored area, fork a process
 894 * that will generate pagefaults and verify signal is generated.
 895 * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
 896 * feature. Using monitor thread, verify no userfault events are generated.
 897 */
 898static int faulting_process(int signal_test)
 899{
 900	unsigned long nr;
 901	unsigned long long count;
 902	unsigned long split_nr_pages;
 903	unsigned long lastnr;
 904	struct sigaction act;
 905	unsigned long signalled = 0;
 906
 907	if (test_type != TEST_HUGETLB)
 908		split_nr_pages = (nr_pages + 1) / 2;
 909	else
 910		split_nr_pages = nr_pages;
 911
 912	if (signal_test) {
 913		sigbuf = &jbuf;
 914		memset(&act, 0, sizeof(act));
 915		act.sa_sigaction = sighndl;
 916		act.sa_flags = SA_SIGINFO;
 917		if (sigaction(SIGBUS, &act, 0))
 918			err("sigaction");
 919		lastnr = (unsigned long)-1;
 920	}
 921
 922	for (nr = 0; nr < split_nr_pages; nr++) {
 923		int steps = 1;
 924		unsigned long offset = nr * page_size;
 925
 926		if (signal_test) {
 927			if (sigsetjmp(*sigbuf, 1) != 0) {
 928				if (steps == 1 && nr == lastnr)
 929					err("Signal repeated");
 930
 931				lastnr = nr;
 932				if (signal_test == 1) {
 933					if (steps == 1) {
 934						/* This is a MISSING request */
 935						steps++;
 936						if (copy_page(uffd, offset))
 937							signalled++;
 938					} else {
 939						/* This is a WP request */
 940						assert(steps == 2);
 941						wp_range(uffd,
 942							 (__u64)area_dst +
 943							 offset,
 944							 page_size, false);
 945					}
 946				} else {
 947					signalled++;
 948					continue;
 949				}
 950			}
 951		}
 952
 953		count = *area_count(area_dst, nr);
 954		if (count != count_verify[nr])
 955			err("nr %lu memory corruption %llu %llu\n",
 956			    nr, count, count_verify[nr]);
 957		/*
 958		 * Trigger write protection if there is by writing
 959		 * the same value back.
 960		 */
 961		*area_count(area_dst, nr) = count;
 962	}
 963
 964	if (signal_test)
 965		return signalled != split_nr_pages;
 966
 967	if (test_type == TEST_HUGETLB)
 968		return 0;
 969
 970	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
 971			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
 972	if (area_dst == MAP_FAILED)
 973		err("mremap");
 974	/* Reset area_src since we just clobbered it */
 975	area_src = NULL;
 976
 977	for (; nr < nr_pages; nr++) {
 978		count = *area_count(area_dst, nr);
 979		if (count != count_verify[nr]) {
 980			err("nr %lu memory corruption %llu %llu\n",
 981			    nr, count, count_verify[nr]);
 982		}
 983		/*
 984		 * Trigger write protection if there is by writing
 985		 * the same value back.
 986		 */
 987		*area_count(area_dst, nr) = count;
 988	}
 989
 990	uffd_test_ops->release_pages(area_dst);
 991
 992	for (nr = 0; nr < nr_pages; nr++)
 993		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
 994			err("nr %lu is not zero", nr);
 995
 996	return 0;
 997}
 998
 999static void retry_uffdio_zeropage(int ufd,
1000				  struct uffdio_zeropage *uffdio_zeropage,
1001				  unsigned long offset)
1002{
1003	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1004				     uffdio_zeropage->range.len,
1005				     offset);
1006	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1007		if (uffdio_zeropage->zeropage != -EEXIST)
1008			err("UFFDIO_ZEROPAGE error: %"PRId64,
1009			    (int64_t)uffdio_zeropage->zeropage);
1010	} else {
1011		err("UFFDIO_ZEROPAGE error: %"PRId64,
1012		    (int64_t)uffdio_zeropage->zeropage);
1013	}
1014}
1015
1016static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1017{
1018	struct uffdio_zeropage uffdio_zeropage;
1019	int ret;
1020	unsigned long has_zeropage;
1021	__s64 res;
1022
1023	has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
1024
1025	if (offset >= nr_pages * page_size)
1026		err("unexpected offset %lu", offset);
1027	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1028	uffdio_zeropage.range.len = page_size;
1029	uffdio_zeropage.mode = 0;
1030	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1031	res = uffdio_zeropage.zeropage;
1032	if (ret) {
1033		/* real retval in ufdio_zeropage.zeropage */
1034		if (has_zeropage)
1035			err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res);
1036		else if (res != -EINVAL)
1037			err("UFFDIO_ZEROPAGE not -EINVAL");
1038	} else if (has_zeropage) {
1039		if (res != page_size) {
1040			err("UFFDIO_ZEROPAGE unexpected size");
1041		} else {
1042			if (test_uffdio_zeropage_eexist && retry) {
1043				test_uffdio_zeropage_eexist = false;
1044				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1045						      offset);
1046			}
1047			return 1;
1048		}
1049	} else
1050		err("UFFDIO_ZEROPAGE succeeded");
1051
1052	return 0;
1053}
1054
/*
 * UFFDIO_ZEROPAGE on the page at @offset in area_dst, with the retry
 * step of __uffdio_zeropage() disabled. Returns what
 * __uffdio_zeropage() returns.
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	const bool retry = false;

	return __uffdio_zeropage(ufd, offset, retry);
}
1059
1060/* exercise UFFDIO_ZEROPAGE */
1061static int userfaultfd_zeropage_test(void)
1062{
1063	struct uffdio_register uffdio_register;
1064	unsigned long expected_ioctls;
1065
1066	printf("testing UFFDIO_ZEROPAGE: ");
1067	fflush(stdout);
1068
1069	uffd_test_ctx_init(0);
1070
1071	uffdio_register.range.start = (unsigned long) area_dst;
1072	uffdio_register.range.len = nr_pages * page_size;
1073	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1074	if (test_uffdio_wp)
1075		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1076	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1077		err("register failure");
1078
1079	expected_ioctls = uffd_test_ops->expected_ioctls;
1080	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1081		err("unexpected missing ioctl for anon memory");
1082
1083	if (uffdio_zeropage(uffd, 0))
1084		if (my_bcmp(area_dst, zeropage, page_size))
1085			err("zeropage is not zero");
1086
1087	printf("done.\n");
1088	return 0;
1089}
1090
1091static int userfaultfd_events_test(void)
1092{
1093	struct uffdio_register uffdio_register;
1094	unsigned long expected_ioctls;
1095	pthread_t uffd_mon;
1096	int err, features;
1097	pid_t pid;
1098	char c;
1099	struct uffd_stats stats = { 0 };
1100
1101	printf("testing events (fork, remap, remove): ");
1102	fflush(stdout);
1103
1104	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1105		UFFD_FEATURE_EVENT_REMOVE;
1106	uffd_test_ctx_init(features);
1107
1108	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1109
1110	uffdio_register.range.start = (unsigned long) area_dst;
1111	uffdio_register.range.len = nr_pages * page_size;
1112	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1113	if (test_uffdio_wp)
1114		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1115	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1116		err("register failure");
1117
1118	expected_ioctls = uffd_test_ops->expected_ioctls;
1119	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1120		err("unexpected missing ioctl for anon memory");
1121
1122	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1123		err("uffd_poll_thread create");
1124
1125	pid = fork();
1126	if (pid < 0)
1127		err("fork");
1128
1129	if (!pid)
1130		exit(faulting_process(0));
1131
1132	waitpid(pid, &err, 0);
1133	if (err)
1134		err("faulting process failed");
1135	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1136		err("pipe write");
1137	if (pthread_join(uffd_mon, NULL))
1138		return 1;
1139
1140	uffd_stats_report(&stats, 1);
1141
1142	return stats.missing_faults != nr_pages;
1143}
1144
1145static int userfaultfd_sig_test(void)
1146{
1147	struct uffdio_register uffdio_register;
1148	unsigned long expected_ioctls;
1149	unsigned long userfaults;
1150	pthread_t uffd_mon;
1151	int err, features;
1152	pid_t pid;
1153	char c;
1154	struct uffd_stats stats = { 0 };
1155
1156	printf("testing signal delivery: ");
1157	fflush(stdout);
1158
1159	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1160	uffd_test_ctx_init(features);
1161
1162	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1163
1164	uffdio_register.range.start = (unsigned long) area_dst;
1165	uffdio_register.range.len = nr_pages * page_size;
1166	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1167	if (test_uffdio_wp)
1168		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1169	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1170		err("register failure");
1171
1172	expected_ioctls = uffd_test_ops->expected_ioctls;
1173	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1174		err("unexpected missing ioctl for anon memory");
1175
1176	if (faulting_process(1))
1177		err("faulting process failed");
1178
1179	uffd_test_ops->release_pages(area_dst);
1180
1181	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1182		err("uffd_poll_thread create");
1183
1184	pid = fork();
1185	if (pid < 0)
1186		err("fork");
1187
1188	if (!pid)
1189		exit(faulting_process(2));
1190
1191	waitpid(pid, &err, 0);
1192	if (err)
1193		err("faulting process failed");
1194	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1195		err("pipe write");
1196	if (pthread_join(uffd_mon, (void **)&userfaults))
1197		return 1;
1198
1199	printf("done.\n");
1200	if (userfaults)
1201		err("Signal test failed, userfaults: %ld", userfaults);
1202
1203	return userfaults != 0;
1204}
1205
1206static int userfaultfd_minor_test(void)
1207{
1208	struct uffdio_register uffdio_register;
1209	unsigned long expected_ioctls;
1210	unsigned long p;
1211	pthread_t uffd_mon;
1212	uint8_t expected_byte;
1213	void *expected_page;
1214	char c;
1215	struct uffd_stats stats = { 0 };
1216	uint64_t req_features, features_out;
1217
1218	if (!test_uffdio_minor)
1219		return 0;
1220
1221	printf("testing minor faults: ");
1222	fflush(stdout);
1223
1224	if (test_type == TEST_HUGETLB)
1225		req_features = UFFD_FEATURE_MINOR_HUGETLBFS;
1226	else if (test_type == TEST_SHMEM)
1227		req_features = UFFD_FEATURE_MINOR_SHMEM;
1228	else
1229		return 1;
1230
1231	features_out = req_features;
1232	uffd_test_ctx_init_ext(&features_out);
1233	/* If kernel reports required features aren't supported, skip test. */
1234	if ((features_out & req_features) != req_features) {
1235		printf("skipping test due to lack of feature support\n");
1236		fflush(stdout);
1237		return 0;
1238	}
1239
1240	uffdio_register.range.start = (unsigned long)area_dst_alias;
1241	uffdio_register.range.len = nr_pages * page_size;
1242	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1243	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1244		err("register failure");
1245
1246	expected_ioctls = uffd_test_ops->expected_ioctls;
1247	expected_ioctls |= 1 << _UFFDIO_CONTINUE;
1248	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls)
1249		err("unexpected missing ioctl(s)");
1250
1251	/*
1252	 * After registering with UFFD, populate the non-UFFD-registered side of
1253	 * the shared mapping. This should *not* trigger any UFFD minor faults.
1254	 */
1255	for (p = 0; p < nr_pages; ++p) {
1256		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1257		       page_size);
1258	}
1259
1260	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats))
1261		err("uffd_poll_thread create");
1262
1263	/*
1264	 * Read each of the pages back using the UFFD-registered mapping. We
1265	 * expect that the first time we touch a page, it will result in a minor
1266	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1267	 * page's contents, and then issuing a CONTINUE ioctl.
1268	 */
1269
1270	if (posix_memalign(&expected_page, page_size, page_size))
1271		err("out of memory");
1272
1273	for (p = 0; p < nr_pages; ++p) {
1274		expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1275		memset(expected_page, expected_byte, page_size);
1276		if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1277			    page_size))
1278			err("unexpected page contents after minor fault");
1279	}
1280
1281	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
1282		err("pipe write");
1283	if (pthread_join(uffd_mon, NULL))
1284		return 1;
1285
1286	uffd_stats_report(&stats, 1);
1287
1288	return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1289}
1290
/* Single-bit masks used to decode 64-bit /proc/self/pagemap entries. */
#define BIT_ULL(bit)		(1ULL << (bit))

#define PM_SOFT_DIRTY		BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
#define PM_UFFD_WP		BIT_ULL(57)
#define PM_FILE			BIT_ULL(61)
#define PM_SWAP			BIT_ULL(62)
#define PM_PRESENT		BIT_ULL(63)
1298
/*
 * Open /proc/self/pagemap read-only and return the descriptor.
 * A failed open is reported through err().
 */
static int pagemap_open(void)
{
	int pagemap = open("/proc/self/pagemap", O_RDONLY);

	if (pagemap >= 0)
		return pagemap;

	err("open pagemap");
	return pagemap;
}
1308
/*
 * Read the pagemap entry that describes @vaddr.
 *
 * @fd:    descriptor of an open pagemap file
 * @vaddr: virtual address to look up
 *
 * The file holds one 64-bit entry per base page, so the entry index is
 * the address divided by the system page size. The size is queried at
 * run time rather than assumed to be 4 KiB.
 *
 * Returns the raw 64-bit entry. A failed page-size query or a short or
 * failed read is reported through err().
 */
static uint64_t pagemap_read_vaddr(int fd, void *vaddr)
{
	long base_pgsize = sysconf(_SC_PAGE_SIZE);
	uint64_t value;
	ssize_t ret;
	off_t pos;

	if (base_pgsize <= 0)
		err("sysconf(_SC_PAGE_SIZE) failed");

	pos = (off_t)((uintptr_t)vaddr / (uintptr_t)base_pgsize *
		      sizeof(uint64_t));

	ret = pread(fd, &value, sizeof(value), pos);
	if (ret != (ssize_t)sizeof(value))
		err("pread() on pagemap failed");

	return value;
}
1321
/*
 * Check that the uffd-wp bit of pagemap entry @value matches @wp
 * (true = bit expected to be set) and report a mismatch through err().
 *
 * This is a macro rather than a function so that __LINE__ in err()
 * refers to the caller's line. Both arguments are parenthesised and
 * @value is evaluated exactly once.
 */
#define pagemap_check_wp(value, wp) do {				\
		uint64_t pm_entry_ = (value);				\
		if (!!(pm_entry_ & PM_UFFD_WP) != (wp))			\
			err("pagemap uffd-wp bit error: 0x%"PRIx64,	\
			    pm_entry_);					\
	} while (0)
1327
1328static int pagemap_test_fork(bool present)
1329{
1330	pid_t child = fork();
1331	uint64_t value;
1332	int fd, result;
1333
1334	if (!child) {
1335		/* Open the pagemap fd of the child itself */
1336		fd = pagemap_open();
1337		value = pagemap_read_vaddr(fd, area_dst);
1338		/*
1339		 * After fork() uffd-wp bit should be gone as long as we're
1340		 * without UFFD_FEATURE_EVENT_FORK
1341		 */
1342		pagemap_check_wp(value, false);
1343		/* Succeed */
1344		exit(0);
1345	}
1346	waitpid(child, &result, 0);
1347	return result;
1348}
1349
1350static void userfaultfd_pagemap_test(unsigned int test_pgsize)
1351{
1352	struct uffdio_register uffdio_register;
1353	int pagemap_fd;
1354	uint64_t value;
1355
1356	/* Pagemap tests uffd-wp only */
1357	if (!test_uffdio_wp)
1358		return;
1359
1360	/* Not enough memory to test this page size */
1361	if (test_pgsize > nr_pages * page_size)
1362		return;
1363
1364	printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize);
1365	/* Flush so it doesn't flush twice in parent/child later */
1366	fflush(stdout);
1367
1368	uffd_test_ctx_init(0);
1369
1370	if (test_pgsize > page_size) {
1371		/* This is a thp test */
1372		if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE))
1373			err("madvise(MADV_HUGEPAGE) failed");
1374	} else if (test_pgsize == page_size) {
1375		/* This is normal page test; force no thp */
1376		if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE))
1377			err("madvise(MADV_NOHUGEPAGE) failed");
1378	}
1379
1380	uffdio_register.range.start = (unsigned long) area_dst;
1381	uffdio_register.range.len = nr_pages * page_size;
1382	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
1383	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1384		err("register failed");
1385
1386	pagemap_fd = pagemap_open();
1387
1388	/* Touch the page */
1389	*area_dst = 1;
1390	wp_range(uffd, (uint64_t)area_dst, test_pgsize, true);
1391	value = pagemap_read_vaddr(pagemap_fd, area_dst);
1392	pagemap_check_wp(value, true);
1393	/* Make sure uffd-wp bit dropped when fork */
1394	if (pagemap_test_fork(true))
1395		err("Detected stall uffd-wp bit in child");
1396
1397	/* Exclusive required or PAGEOUT won't work */
1398	if (!(value & PM_MMAP_EXCLUSIVE))
1399		err("multiple mapping detected: 0x%"PRIx64, value);
1400
1401	if (madvise(area_dst, test_pgsize, MADV_PAGEOUT))
1402		err("madvise(MADV_PAGEOUT) failed");
1403
1404	/* Uffd-wp should persist even swapped out */
1405	value = pagemap_read_vaddr(pagemap_fd, area_dst);
1406	pagemap_check_wp(value, true);
1407	/* Make sure uffd-wp bit dropped when fork */
1408	if (pagemap_test_fork(false))
1409		err("Detected stall uffd-wp bit in child");
1410
1411	/* Unprotect; this tests swap pte modifications */
1412	wp_range(uffd, (uint64_t)area_dst, page_size, false);
1413	value = pagemap_read_vaddr(pagemap_fd, area_dst);
1414	pagemap_check_wp(value, false);
1415
1416	/* Fault in the page from disk */
1417	*area_dst = 2;
1418	value = pagemap_read_vaddr(pagemap_fd, area_dst);
1419	pagemap_check_wp(value, false);
1420
1421	close(pagemap_fd);
1422	printf("done\n");
1423}
1424
1425static int userfaultfd_stress(void)
1426{
1427	void *area;
1428	char *tmp_area;
1429	unsigned long nr;
1430	struct uffdio_register uffdio_register;
1431	struct uffd_stats uffd_stats[nr_cpus];
1432
1433	uffd_test_ctx_init(0);
1434
1435	if (posix_memalign(&area, page_size, page_size))
1436		err("out of memory");
1437	zeropage = area;
1438	bzero(zeropage, page_size);
1439
1440	pthread_mutex_lock(&uffd_read_mutex);
1441
1442	pthread_attr_init(&attr);
1443	pthread_attr_setstacksize(&attr, 16*1024*1024);
1444
1445	while (bounces--) {
1446		unsigned long expected_ioctls;
1447
1448		printf("bounces: %d, mode:", bounces);
1449		if (bounces & BOUNCE_RANDOM)
1450			printf(" rnd");
1451		if (bounces & BOUNCE_RACINGFAULTS)
1452			printf(" racing");
1453		if (bounces & BOUNCE_VERIFY)
1454			printf(" ver");
1455		if (bounces & BOUNCE_POLL)
1456			printf(" poll");
1457		else
1458			printf(" read");
1459		printf(", ");
1460		fflush(stdout);
1461
1462		if (bounces & BOUNCE_POLL)
1463			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1464		else
1465			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1466
1467		/* register */
1468		uffdio_register.range.start = (unsigned long) area_dst;
1469		uffdio_register.range.len = nr_pages * page_size;
1470		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1471		if (test_uffdio_wp)
1472			uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1473		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1474			err("register failure");
1475		expected_ioctls = uffd_test_ops->expected_ioctls;
1476		if ((uffdio_register.ioctls & expected_ioctls) !=
1477		    expected_ioctls)
1478			err("unexpected missing ioctl for anon memory");
1479
1480		if (area_dst_alias) {
1481			uffdio_register.range.start = (unsigned long)
1482				area_dst_alias;
1483			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
1484				err("register failure alias");
1485		}
1486
1487		/*
1488		 * The madvise done previously isn't enough: some
1489		 * uffd_thread could have read userfaults (one of
1490		 * those already resolved by the background thread)
1491		 * and it may be in the process of calling
1492		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
1493		 * area_src and it would map a zero page in it (of
1494		 * course such a UFFDIO_COPY is perfectly safe as it'd
1495		 * return -EEXIST). The problem comes at the next
1496		 * bounce though: that racing UFFDIO_COPY would
1497		 * generate zeropages in the area_src, so invalidating
1498		 * the previous MADV_DONTNEED. Without this additional
1499		 * MADV_DONTNEED those zeropages leftovers in the
1500		 * area_src would lead to -EEXIST failure during the
1501		 * next bounce, effectively leaving a zeropage in the
1502		 * area_dst.
1503		 *
1504		 * Try to comment this out madvise to see the memory
1505		 * corruption being caught pretty quick.
1506		 *
1507		 * khugepaged is also inhibited to collapse THP after
1508		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
1509		 * required to MADV_DONTNEED here.
1510		 */
1511		uffd_test_ops->release_pages(area_dst);
1512
1513		uffd_stats_reset(uffd_stats, nr_cpus);
1514
1515		/* bounce pass */
1516		if (stress(uffd_stats))
1517			return 1;
1518
1519		/* Clear all the write protections if there is any */
1520		if (test_uffdio_wp)
1521			wp_range(uffd, (unsigned long)area_dst,
1522				 nr_pages * page_size, false);
1523
1524		/* unregister */
1525		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range))
1526			err("unregister failure");
1527		if (area_dst_alias) {
1528			uffdio_register.range.start = (unsigned long) area_dst;
1529			if (ioctl(uffd, UFFDIO_UNREGISTER,
1530				  &uffdio_register.range))
1531				err("unregister failure alias");
1532		}
1533
1534		/* verification */
1535		if (bounces & BOUNCE_VERIFY)
1536			for (nr = 0; nr < nr_pages; nr++)
1537				if (*area_count(area_dst, nr) != count_verify[nr])
1538					err("error area_count %llu %llu %lu\n",
1539					    *area_count(area_src, nr),
1540					    count_verify[nr], nr);
1541
1542		/* prepare next bounce */
1543		tmp_area = area_src;
1544		area_src = area_dst;
1545		area_dst = tmp_area;
1546
1547		tmp_area = area_src_alias;
1548		area_src_alias = area_dst_alias;
1549		area_dst_alias = tmp_area;
1550
1551		uffd_stats_report(uffd_stats, nr_cpus);
1552	}
1553
1554	if (test_type == TEST_ANON) {
1555		/*
1556		 * shmem/hugetlb won't be able to run since they have different
1557		 * behavior on fork() (file-backed memory normally drops ptes
1558		 * directly when fork), meanwhile the pagemap test will verify
1559		 * pgtable entry of fork()ed child.
1560		 */
1561		userfaultfd_pagemap_test(page_size);
1562		/*
1563		 * Hard-code for x86_64 for now for 2M THP, as x86_64 is
1564		 * currently the only one that supports uffd-wp
1565		 */
1566		userfaultfd_pagemap_test(page_size * 512);
1567	}
1568
1569	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1570		|| userfaultfd_events_test() || userfaultfd_minor_test();
1571}
1572
/*
 * Copied from mlock2-tests.c
 *
 * Look up the "Hugepagesize:" line of /proc/meminfo and return its
 * value converted from kB to bytes. Returns 0 when the file cannot be
 * opened or no such line is found.
 */
unsigned long default_huge_page_size(void)
{
	FILE *meminfo = fopen("/proc/meminfo", "r");
	unsigned long bytes = 0;
	char *buf = NULL;
	size_t cap = 0;

	if (!meminfo)
		return 0;

	while (getline(&buf, &cap, meminfo) > 0) {
		if (sscanf(buf, "Hugepagesize:       %lu kB", &bytes) != 1)
			continue;
		/* The value is given in kB. */
		bytes <<= 10;
		break;
	}

	free(buf);
	fclose(meminfo);
	return bytes;
}
1596
1597static void set_test_type(const char *type)
1598{
1599	if (!strcmp(type, "anon")) {
1600		test_type = TEST_ANON;
1601		uffd_test_ops = &anon_uffd_test_ops;
1602		/* Only enable write-protect test for anonymous test */
1603		test_uffdio_wp = true;
1604	} else if (!strcmp(type, "hugetlb")) {
1605		test_type = TEST_HUGETLB;
1606		uffd_test_ops = &hugetlb_uffd_test_ops;
1607	} else if (!strcmp(type, "hugetlb_shared")) {
1608		map_shared = true;
1609		test_type = TEST_HUGETLB;
1610		uffd_test_ops = &hugetlb_uffd_test_ops;
1611		/* Minor faults require shared hugetlb; only enable here. */
1612		test_uffdio_minor = true;
1613	} else if (!strcmp(type, "shmem")) {
1614		map_shared = true;
1615		test_type = TEST_SHMEM;
1616		uffd_test_ops = &shmem_uffd_test_ops;
1617		test_uffdio_minor = true;
1618	} else {
1619		err("Unknown test type: %s", type);
1620	}
1621
1622	if (test_type == TEST_HUGETLB)
1623		page_size = default_huge_page_size();
1624	else
1625		page_size = sysconf(_SC_PAGE_SIZE);
1626
1627	if (!page_size)
1628		err("Unable to determine page size");
1629	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1630	    > page_size)
1631		err("Impossible to run this test");
1632}
1633
1634static void sigalrm(int sig)
1635{
1636	if (sig != SIGALRM)
1637		abort();
1638	test_uffdio_copy_eexist = true;
1639	test_uffdio_zeropage_eexist = true;
1640	alarm(ALARM_INTERVAL_SECS);
1641}
1642
1643int main(int argc, char **argv)
1644{
1645	if (argc < 4)
1646		usage();
1647
1648	if (signal(SIGALRM, sigalrm) == SIG_ERR)
1649		err("failed to arm SIGALRM");
1650	alarm(ALARM_INTERVAL_SECS);
1651
1652	set_test_type(argv[1]);
1653
1654	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1655	nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1656		nr_cpus;
1657	if (!nr_pages_per_cpu) {
1658		_err("invalid MiB");
1659		usage();
1660	}
1661
1662	bounces = atoi(argv[3]);
1663	if (bounces <= 0) {
1664		_err("invalid bounces");
1665		usage();
1666	}
1667	nr_pages = nr_pages_per_cpu * nr_cpus;
1668
1669	if (test_type == TEST_HUGETLB) {
1670		if (argc < 5)
1671			usage();
1672		huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1673		if (huge_fd < 0)
1674			err("Open of %s failed", argv[4]);
1675		if (ftruncate(huge_fd, 0))
1676			err("ftruncate %s to size 0 failed", argv[4]);
1677	} else if (test_type == TEST_SHMEM) {
1678		shm_fd = memfd_create(argv[0], 0);
1679		if (shm_fd < 0)
1680			err("memfd_create");
1681		if (ftruncate(shm_fd, nr_pages * page_size * 2))
1682			err("ftruncate");
1683		if (fallocate(shm_fd,
1684			      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
1685			      nr_pages * page_size * 2))
1686			err("fallocate");
1687	}
1688	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1689	       nr_pages, nr_pages_per_cpu);
1690	return userfaultfd_stress();
1691}
1692
1693#else /* __NR_userfaultfd */
1694
1695#warning "missing __NR_userfaultfd definition"
1696
1697int main(void)
1698{
1699	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1700	return KSFT_SKIP;
1701}
1702
1703#endif /* __NR_userfaultfd */