drivers/xen/privcmd.c (Linux v6.13.7)

   1// SPDX-License-Identifier: GPL-2.0-only
   2/******************************************************************************
   3 * privcmd.c
   4 *
   5 * Interface to privileged domain-0 commands.
   6 *
   7 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
   8 */
   9
  10#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
  11
  12#include <linux/eventfd.h>
  13#include <linux/file.h>
  14#include <linux/kernel.h>
  15#include <linux/module.h>
  16#include <linux/mutex.h>
  17#include <linux/poll.h>
  18#include <linux/sched.h>
  19#include <linux/slab.h>
  20#include <linux/srcu.h>
  21#include <linux/string.h>
  22#include <linux/workqueue.h>
  23#include <linux/errno.h>
  24#include <linux/mm.h>
  25#include <linux/mman.h>
  26#include <linux/uaccess.h>
  27#include <linux/swap.h>
  28#include <linux/highmem.h>
  29#include <linux/pagemap.h>
  30#include <linux/seq_file.h>
  31#include <linux/miscdevice.h>
  32#include <linux/moduleparam.h>
  33#include <linux/virtio_mmio.h>
  34
  35#include <asm/xen/hypervisor.h>
  36#include <asm/xen/hypercall.h>
  37
  38#include <xen/xen.h>
  39#include <xen/events.h>
  40#include <xen/privcmd.h>
  41#include <xen/interface/xen.h>
  42#include <xen/interface/memory.h>
  43#include <xen/interface/hvm/dm_op.h>
  44#include <xen/interface/hvm/ioreq.h>
  45#include <xen/features.h>
  46#include <xen/page.h>
  47#include <xen/xen-ops.h>
  48#include <xen/balloon.h>
  49#ifdef CONFIG_XEN_ACPI
  50#include <xen/acpi.h>
  51#endif
  52
  53#include "privcmd.h"
  54
  55MODULE_DESCRIPTION("Xen hypercall passthrough driver");
  56MODULE_LICENSE("GPL");
  57
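     /*
      * Marker stored in vm_private_data once a privcmd VMA has been claimed
      * by a mapping ioctl and no page array needs to be tracked for it.
      */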
  58#define PRIV_VMA_LOCKED ((void *)1)
  59
  60static unsigned int privcmd_dm_op_max_num = 16;
  61module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
  62MODULE_PARM_DESC(dm_op_max_nr_bufs,
  63		 "Maximum number of buffers per dm_op hypercall");
  64
  65static unsigned int privcmd_dm_op_buf_max_size = 4096;
  66module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
  67		   0644);
  68MODULE_PARM_DESC(dm_op_buf_max_size,
  69		 "Maximum size of a dm_op hypercall buffer");
  70
  71struct privcmd_data {
  72	domid_t domid;
  73};
  74
  75static int privcmd_vma_range_is_mapped(
  76               struct vm_area_struct *vma,
  77               unsigned long addr,
  78               unsigned long nr_pages);
  79
  80static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
  81{
  82	struct privcmd_data *data = file->private_data;
  83	struct privcmd_hypercall hypercall;
  84	long ret;
  85
  86	/* Disallow arbitrary hypercalls if restricted */
  87	if (data->domid != DOMID_INVALID)
  88		return -EPERM;
  89
  90	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
  91		return -EFAULT;
  92
  93	xen_preemptible_hcall_begin();
  94	ret = privcmd_call(hypercall.op,
  95			   hypercall.arg[0], hypercall.arg[1],
  96			   hypercall.arg[2], hypercall.arg[3],
  97			   hypercall.arg[4]);
  98	xen_preemptible_hcall_end();
  99
 100	return ret;
 101}
 102
 103static void free_page_list(struct list_head *pages)
 104{
 105	struct page *p, *n;
 106
 107	list_for_each_entry_safe(p, n, pages, lru)
 108		__free_page(p);
 109
 110	INIT_LIST_HEAD(pages);
 111}
 112
 113/*
 114 * Given an array of items in userspace, return a list of pages
 115 * containing the data.  If copying fails, either because of memory
 116 * allocation failure or a problem reading user memory, return an
  117 * error code; it's up to the caller to dispose of any partial list.
 118 */
 119static int gather_array(struct list_head *pagelist,
 120			unsigned nelem, size_t size,
 121			const void __user *data)
 122{
 123	unsigned pageidx;
 124	void *pagedata;
 125	int ret;
 126
 127	if (size > PAGE_SIZE)
 128		return 0;
 129
 130	pageidx = PAGE_SIZE;
 131	pagedata = NULL;	/* quiet, gcc */
 132	while (nelem--) {
 133		if (pageidx > PAGE_SIZE-size) {
 134			struct page *page = alloc_page(GFP_KERNEL);
 135
 136			ret = -ENOMEM;
 137			if (page == NULL)
 138				goto fail;
 139
 140			pagedata = page_address(page);
 141
 142			list_add_tail(&page->lru, pagelist);
 143			pageidx = 0;
 144		}
 145
 146		ret = -EFAULT;
 147		if (copy_from_user(pagedata + pageidx, data, size))
 148			goto fail;
 149
 150		data += size;
 151		pageidx += size;
 152	}
 153
 154	ret = 0;
 155
 156fail:
 157	return ret;
 158}
 159
 160/*
 161 * Call function "fn" on each element of the array fragmented
 162 * over a list of pages.
 163 */
 164static int traverse_pages(unsigned nelem, size_t size,
 165			  struct list_head *pos,
 166			  int (*fn)(void *data, void *state),
 167			  void *state)
 168{
 169	void *pagedata;
 170	unsigned pageidx;
 171	int ret = 0;
 172
 173	BUG_ON(size > PAGE_SIZE);
 174
 175	pageidx = PAGE_SIZE;
 176	pagedata = NULL;	/* hush, gcc */
 177
 178	while (nelem--) {
 179		if (pageidx > PAGE_SIZE-size) {
 180			struct page *page;
 181			pos = pos->next;
 182			page = list_entry(pos, struct page, lru);
 183			pagedata = page_address(page);
 184			pageidx = 0;
 185		}
 186
 187		ret = (*fn)(pagedata + pageidx, state);
 188		if (ret)
 189			break;
 190		pageidx += size;
 191	}
 192
 193	return ret;
 194}
 195
 196/*
 197 * Similar to traverse_pages, but use each page as a "block" of
 198 * data to be processed as one unit.
 199 */
 200static int traverse_pages_block(unsigned nelem, size_t size,
 201				struct list_head *pos,
 202				int (*fn)(void *data, int nr, void *state),
 203				void *state)
 204{
 205	void *pagedata;
 206	int ret = 0;
 207
 208	BUG_ON(size > PAGE_SIZE);
 209
 210	while (nelem) {
 211		int nr = (PAGE_SIZE/size);
 212		struct page *page;
 213		if (nr > nelem)
 214			nr = nelem;
 215		pos = pos->next;
 216		page = list_entry(pos, struct page, lru);
 217		pagedata = page_address(page);
 218		ret = (*fn)(pagedata, nr, state);
 219		if (ret)
 220			break;
 221		nelem -= nr;
 222	}
 223
 224	return ret;
 225}
 226
 227struct mmap_gfn_state {
 228	unsigned long va;
 229	struct vm_area_struct *vma;
 230	domid_t domain;
 231};
 232
 233static int mmap_gfn_range(void *data, void *state)
 234{
 235	struct privcmd_mmap_entry *msg = data;
 236	struct mmap_gfn_state *st = state;
 237	struct vm_area_struct *vma = st->vma;
 238	int rc;
 239
 240	/* Do not allow range to wrap the address space. */
 241	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
 242	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
 243		return -EINVAL;
 244
 245	/* Range chunks must be contiguous in va space. */
 246	if ((msg->va != st->va) ||
 247	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
 248		return -EINVAL;
 249
 250	rc = xen_remap_domain_gfn_range(vma,
 251					msg->va & PAGE_MASK,
 252					msg->mfn, msg->npages,
 253					vma->vm_page_prot,
 254					st->domain, NULL);
 255	if (rc < 0)
 256		return rc;
 257
 258	st->va += msg->npages << PAGE_SHIFT;
 259
 260	return 0;
 261}
 262
 263static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
 264{
 265	struct privcmd_data *data = file->private_data;
 266	struct privcmd_mmap mmapcmd;
 267	struct mm_struct *mm = current->mm;
 268	struct vm_area_struct *vma;
 269	int rc;
 270	LIST_HEAD(pagelist);
 271	struct mmap_gfn_state state;
 272
  273	/* We only support privcmd_ioctl_mmap_batch for auto translated. */
 274	if (xen_feature(XENFEAT_auto_translated_physmap))
 275		return -ENOSYS;
 276
 277	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
 278		return -EFAULT;
 279
 280	/* If restriction is in place, check the domid matches */
 281	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
 282		return -EPERM;
 283
 284	rc = gather_array(&pagelist,
 285			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
 286			  mmapcmd.entry);
 287
 288	if (rc || list_empty(&pagelist))
 289		goto out;
 290
 291	mmap_write_lock(mm);
 292
 293	{
 294		struct page *page = list_first_entry(&pagelist,
 295						     struct page, lru);
 296		struct privcmd_mmap_entry *msg = page_address(page);
 297
 298		vma = vma_lookup(mm, msg->va);
 299		rc = -EINVAL;
 300
 301		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
 302			goto out_up;
 303		vma->vm_private_data = PRIV_VMA_LOCKED;
 304	}
 305
 306	state.va = vma->vm_start;
 307	state.vma = vma;
 308	state.domain = mmapcmd.dom;
 309
 310	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
 311			    &pagelist,
 312			    mmap_gfn_range, &state);
 313
 314
 315out_up:
 316	mmap_write_unlock(mm);
 317
 318out:
 319	free_page_list(&pagelist);
 320
 321	return rc;
 322}
 323
 324struct mmap_batch_state {
 325	domid_t domain;
 326	unsigned long va;
 327	struct vm_area_struct *vma;
 328	int index;
 329	/* A tristate:
 330	 *      0 for no errors
 331	 *      1 if at least one error has happened (and no
 332	 *          -ENOENT errors have happened)
 333	 *      -ENOENT if at least 1 -ENOENT has happened.
 334	 */
 335	int global_error;
 336	int version;
 337
 338	/* User-space gfn array to store errors in the second pass for V1. */
 339	xen_pfn_t __user *user_gfn;
 340	/* User-space int array to store errors in the second pass for V2. */
 341	int __user *user_err;
 342};
 343
 344/* auto translated dom0 note: if domU being created is PV, then gfn is
 345 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
 346 */
 347static int mmap_batch_fn(void *data, int nr, void *state)
 348{
 349	xen_pfn_t *gfnp = data;
 350	struct mmap_batch_state *st = state;
 351	struct vm_area_struct *vma = st->vma;
 352	struct page **pages = vma->vm_private_data;
 353	struct page **cur_pages = NULL;
 354	int ret;
 355
 356	if (xen_feature(XENFEAT_auto_translated_physmap))
 357		cur_pages = &pages[st->index];
 358
 359	BUG_ON(nr < 0);
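     	/*
     	 * The gfn array doubles as the per-frame error array: the remap call
     	 * writes error codes back into it, and mmap_return_errors() later
     	 * copies them out to user space.
     	 */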
 360	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
 361					 (int *)gfnp, st->vma->vm_page_prot,
 362					 st->domain, cur_pages);
 363
 364	/* Adjust the global_error? */
 365	if (ret != nr) {
 366		if (ret == -ENOENT)
 367			st->global_error = -ENOENT;
 368		else {
 369			/* Record that at least one error has happened. */
 370			if (st->global_error == 0)
 371				st->global_error = 1;
 372		}
 373	}
 374	st->va += XEN_PAGE_SIZE * nr;
 375	st->index += nr / XEN_PFN_PER_PAGE;
 376
 377	return 0;
 378}
 379
 380static int mmap_return_error(int err, struct mmap_batch_state *st)
 381{
 382	int ret;
 383
 384	if (st->version == 1) {
 385		if (err) {
 386			xen_pfn_t gfn;
 387
 388			ret = get_user(gfn, st->user_gfn);
 389			if (ret < 0)
 390				return ret;
 391			/*
 392			 * V1 encodes the error codes in the 32bit top
 393			 * nibble of the gfn (with its known
 394			 * limitations vis-a-vis 64 bit callers).
 395			 */
 396			gfn |= (err == -ENOENT) ?
 397				PRIVCMD_MMAPBATCH_PAGED_ERROR :
 398				PRIVCMD_MMAPBATCH_MFN_ERROR;
 399			return __put_user(gfn, st->user_gfn++);
 400		} else
 401			st->user_gfn++;
 402	} else { /* st->version == 2 */
 403		if (err)
 404			return __put_user(err, st->user_err++);
 405		else
 406			st->user_err++;
 407	}
 408
 409	return 0;
 410}
 411
 412static int mmap_return_errors(void *data, int nr, void *state)
 413{
 414	struct mmap_batch_state *st = state;
 415	int *errs = data;
 416	int i;
 417	int ret;
 418
 419	for (i = 0; i < nr; i++) {
 420		ret = mmap_return_error(errs[i], st);
 421		if (ret < 0)
 422			return ret;
 423	}
 424	return 0;
 425}
 426
 427/* Allocate pfns that are then mapped with gfns from foreign domid. Update
 428 * the vma with the page info to use later.
 429 * Returns: 0 if success, otherwise -errno
 430 */
 431static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
 432{
 433	int rc;
 434	struct page **pages;
 435
 436	pages = kvcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
 437	if (pages == NULL)
 438		return -ENOMEM;
 439
 440	rc = xen_alloc_unpopulated_pages(numpgs, pages);
 441	if (rc != 0) {
 442		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
 443			numpgs, rc);
 444		kvfree(pages);
 445		return -ENOMEM;
 446	}
 447	BUG_ON(vma->vm_private_data != NULL);
 448	vma->vm_private_data = pages;
 449
 450	return 0;
 451}
 452
 453static const struct vm_operations_struct privcmd_vm_ops;
 454
 455static long privcmd_ioctl_mmap_batch(
 456	struct file *file, void __user *udata, int version)
 457{
 458	struct privcmd_data *data = file->private_data;
 459	int ret;
 460	struct privcmd_mmapbatch_v2 m;
 461	struct mm_struct *mm = current->mm;
 462	struct vm_area_struct *vma;
 463	unsigned long nr_pages;
 464	LIST_HEAD(pagelist);
 465	struct mmap_batch_state state;
 466
 467	switch (version) {
 468	case 1:
 469		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
 470			return -EFAULT;
 471		/* Returns per-frame error in m.arr. */
 472		m.err = NULL;
 473		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
 474			return -EFAULT;
 475		break;
 476	case 2:
 477		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
 478			return -EFAULT;
 479		/* Returns per-frame error code in m.err. */
 480		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
 481			return -EFAULT;
 482		break;
 483	default:
 484		return -EINVAL;
 485	}
 486
 487	/* If restriction is in place, check the domid matches */
 488	if (data->domid != DOMID_INVALID && data->domid != m.dom)
 489		return -EPERM;
 490
 491	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
 492	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
 493		return -EINVAL;
 494
 495	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);
 496
 497	if (ret)
 498		goto out;
 499	if (list_empty(&pagelist)) {
 500		ret = -EINVAL;
 501		goto out;
 502	}
 503
 504	if (version == 2) {
 505		/* Zero error array now to only copy back actual errors. */
 506		if (clear_user(m.err, sizeof(int) * m.num)) {
 507			ret = -EFAULT;
 508			goto out;
 509		}
 510	}
 511
 512	mmap_write_lock(mm);
 513
 514	vma = find_vma(mm, m.addr);
 515	if (!vma ||
 516	    vma->vm_ops != &privcmd_vm_ops) {
 517		ret = -EINVAL;
 518		goto out_unlock;
 519	}
 520
 521	/*
 522	 * Caller must either:
 523	 *
 524	 * Map the whole VMA range, which will also allocate all the
 525	 * pages required for the auto_translated_physmap case.
 526	 *
 527	 * Or
 528	 *
 529	 * Map unmapped holes left from a previous map attempt (e.g.,
 530	 * because those foreign frames were previously paged out).
 531	 */
 532	if (vma->vm_private_data == NULL) {
 533		if (m.addr != vma->vm_start ||
 534		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
 535			ret = -EINVAL;
 536			goto out_unlock;
 537		}
 538		if (xen_feature(XENFEAT_auto_translated_physmap)) {
 539			ret = alloc_empty_pages(vma, nr_pages);
 540			if (ret < 0)
 541				goto out_unlock;
 542		} else
 543			vma->vm_private_data = PRIV_VMA_LOCKED;
 544	} else {
 545		if (m.addr < vma->vm_start ||
 546		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
 547			ret = -EINVAL;
 548			goto out_unlock;
 549		}
 550		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
 551			ret = -EINVAL;
 552			goto out_unlock;
 553		}
 554	}
 555
 556	state.domain        = m.dom;
 557	state.vma           = vma;
 558	state.va            = m.addr;
 559	state.index         = 0;
 560	state.global_error  = 0;
 561	state.version       = version;
 562
 563	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
 564	/* mmap_batch_fn guarantees ret == 0 */
 565	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
 566				    &pagelist, mmap_batch_fn, &state));
 567
 568	mmap_write_unlock(mm);
 569
 570	if (state.global_error) {
 571		/* Write back errors in second pass. */
 572		state.user_gfn = (xen_pfn_t *)m.arr;
 573		state.user_err = m.err;
 574		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
 575					   &pagelist, mmap_return_errors, &state);
 576	} else
 577		ret = 0;
 578
 579	/* If we have not had any EFAULT-like global errors then set the global
 580	 * error to -ENOENT if necessary. */
 581	if ((ret == 0) && (state.global_error == -ENOENT))
 582		ret = -ENOENT;
 583
 584out:
 585	free_page_list(&pagelist);
 586	return ret;
 587
 588out_unlock:
 589	mmap_write_unlock(mm);
 590	goto out;
 591}
 592
 593static int lock_pages(
 594	struct privcmd_dm_op_buf kbufs[], unsigned int num,
 595	struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
 596{
 597	unsigned int i, off = 0;
 598
 599	for (i = 0; i < num; ) {
 600		unsigned int requested;
 601		int page_count;
 602
 603		requested = DIV_ROUND_UP(
 604			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
 605			PAGE_SIZE) - off;
 606		if (requested > nr_pages)
 607			return -ENOSPC;
 608
 609		page_count = pin_user_pages_fast(
 610			(unsigned long)kbufs[i].uptr + off * PAGE_SIZE,
 611			requested, FOLL_WRITE, pages);
 612		if (page_count <= 0)
 613			return page_count ? : -EFAULT;
 614
 615		*pinned += page_count;
 616		nr_pages -= page_count;
 617		pages += page_count;
 618
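     		/*
     		 * pin_user_pages_fast() may pin fewer pages than requested;
     		 * keep the page offset into the current buffer and only move
     		 * on to the next buffer once it is fully pinned.
     		 */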
 619		off = (requested == page_count) ? 0 : off + page_count;
 620		i += !off;
 621	}
 622
 623	return 0;
 624}
 625
 626static void unlock_pages(struct page *pages[], unsigned int nr_pages)
 627{
 628	unpin_user_pages_dirty_lock(pages, nr_pages, true);
 629}
 630
 631static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
 632{
 633	struct privcmd_data *data = file->private_data;
 634	struct privcmd_dm_op kdata;
 635	struct privcmd_dm_op_buf *kbufs;
 636	unsigned int nr_pages = 0;
 637	struct page **pages = NULL;
 638	struct xen_dm_op_buf *xbufs = NULL;
 639	unsigned int i;
 640	long rc;
 641	unsigned int pinned = 0;
 642
 643	if (copy_from_user(&kdata, udata, sizeof(kdata)))
 644		return -EFAULT;
 645
 646	/* If restriction is in place, check the domid matches */
 647	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
 648		return -EPERM;
 649
 650	if (kdata.num == 0)
 651		return 0;
 652
 653	if (kdata.num > privcmd_dm_op_max_num)
 654		return -E2BIG;
 655
 656	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
 657	if (!kbufs)
 658		return -ENOMEM;
 659
 660	if (copy_from_user(kbufs, kdata.ubufs,
 661			   sizeof(*kbufs) * kdata.num)) {
 662		rc = -EFAULT;
 663		goto out;
 664	}
 665
 666	for (i = 0; i < kdata.num; i++) {
 667		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
 668			rc = -E2BIG;
 669			goto out;
 670		}
 671
 672		if (!access_ok(kbufs[i].uptr,
 673			       kbufs[i].size)) {
 674			rc = -EFAULT;
 675			goto out;
 676		}
 677
 678		nr_pages += DIV_ROUND_UP(
 679			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
 680			PAGE_SIZE);
 681	}
 682
 683	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
 684	if (!pages) {
 685		rc = -ENOMEM;
 686		goto out;
 687	}
 688
 689	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
 690	if (!xbufs) {
 691		rc = -ENOMEM;
 692		goto out;
 693	}
 694
 695	rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
  696	if (rc < 0)
  697		goto out;
  698
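     	/*
     	 * The hypercall still operates on the original user-space addresses;
     	 * the buffers were pinned above only to keep them resident while Xen
     	 * accesses them.
     	 */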
 699	for (i = 0; i < kdata.num; i++) {
 700		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
 701		xbufs[i].size = kbufs[i].size;
 702	}
 703
 704	xen_preemptible_hcall_begin();
 705	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
 706	xen_preemptible_hcall_end();
 707
 708out:
 709	unlock_pages(pages, pinned);
 710	kfree(xbufs);
 711	kfree(pages);
 712	kfree(kbufs);
 713
 714	return rc;
 715}
 716
 717static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
 718{
 719	struct privcmd_data *data = file->private_data;
 720	domid_t dom;
 721
 722	if (copy_from_user(&dom, udata, sizeof(dom)))
 723		return -EFAULT;
 724
 725	/* Set restriction to the specified domain, or check it matches */
 726	if (data->domid == DOMID_INVALID)
 727		data->domid = dom;
 728	else if (data->domid != dom)
 729		return -EINVAL;
 730
 731	return 0;
 732}
 733
 734static long privcmd_ioctl_mmap_resource(struct file *file,
 735				struct privcmd_mmap_resource __user *udata)
 736{
 737	struct privcmd_data *data = file->private_data;
 738	struct mm_struct *mm = current->mm;
 739	struct vm_area_struct *vma;
 740	struct privcmd_mmap_resource kdata;
 741	xen_pfn_t *pfns = NULL;
 742	struct xen_mem_acquire_resource xdata = { };
 743	int rc;
 744
 745	if (copy_from_user(&kdata, udata, sizeof(kdata)))
 746		return -EFAULT;
 747
 748	/* If restriction is in place, check the domid matches */
 749	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
 750		return -EPERM;
 751
 752	/* Both fields must be set or unset */
 753	if (!!kdata.addr != !!kdata.num)
 754		return -EINVAL;
 755
 756	xdata.domid = kdata.dom;
 757	xdata.type = kdata.type;
 758	xdata.id = kdata.id;
 759
 760	if (!kdata.addr && !kdata.num) {
 761		/* Query the size of the resource. */
 762		rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
 763		if (rc)
 764			return rc;
 765		return __put_user(xdata.nr_frames, &udata->num);
 766	}
 767
 768	mmap_write_lock(mm);
 769
 770	vma = find_vma(mm, kdata.addr);
 771	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
 772		rc = -EINVAL;
 773		goto out;
 774	}
 775
 776	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL | __GFP_NOWARN);
 777	if (!pfns) {
 778		rc = -ENOMEM;
 779		goto out;
 780	}
 781
 782	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
 783	    xen_feature(XENFEAT_auto_translated_physmap)) {
 784		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
 785		struct page **pages;
 786		unsigned int i;
 787
 788		rc = alloc_empty_pages(vma, nr);
 789		if (rc < 0)
 790			goto out;
 791
 792		pages = vma->vm_private_data;
 793
 794		for (i = 0; i < kdata.num; i++) {
 795			xen_pfn_t pfn =
 796				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
 797
 798			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
 799		}
 800	} else
 801		vma->vm_private_data = PRIV_VMA_LOCKED;
  802
  803	xdata.frame = kdata.idx;
 804	xdata.nr_frames = kdata.num;
 805	set_xen_guest_handle(xdata.frame_list, pfns);
 806
 807	xen_preemptible_hcall_begin();
 808	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
 809	xen_preemptible_hcall_end();
 810
 811	if (rc)
 812		goto out;
 813
 814	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
 815	    xen_feature(XENFEAT_auto_translated_physmap)) {
 816		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
 817	} else {
 818		unsigned int domid =
 819			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
 820			DOMID_SELF : kdata.dom;
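     		/* Reuse the pfn array to hold the per-frame remap error codes. */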
 821		int num, *errs = (int *)pfns;
 822
 823		BUILD_BUG_ON(sizeof(*errs) > sizeof(*pfns));
 824		num = xen_remap_domain_mfn_array(vma,
 825						 kdata.addr & PAGE_MASK,
 826						 pfns, kdata.num, errs,
 827						 vma->vm_page_prot,
  828						 domid);
  829		if (num < 0)
 830			rc = num;
 831		else if (num != kdata.num) {
 832			unsigned int i;
 833
 834			for (i = 0; i < num; i++) {
 835				rc = errs[i];
 836				if (rc < 0)
 837					break;
 838			}
 839		} else
 840			rc = 0;
 841	}
 842
 843out:
 844	mmap_write_unlock(mm);
 845	kfree(pfns);
 846
 847	return rc;
 848}
 849
 850static long privcmd_ioctl_pcidev_get_gsi(struct file *file, void __user *udata)
 851{
 852#if defined(CONFIG_XEN_ACPI)
 853	int rc;
 854	struct privcmd_pcidev_get_gsi kdata;
 855
 856	if (copy_from_user(&kdata, udata, sizeof(kdata)))
 857		return -EFAULT;
 858
 859	rc = xen_acpi_get_gsi_from_sbdf(kdata.sbdf);
 860	if (rc < 0)
 861		return rc;
 862
 863	kdata.gsi = rc;
 864	if (copy_to_user(udata, &kdata, sizeof(kdata)))
 865		return -EFAULT;
 866
 867	return 0;
 868#else
 869	return -EINVAL;
 870#endif
 871}
 872
 873#ifdef CONFIG_XEN_PRIVCMD_EVENTFD
 874/* Irqfd support */
 875static struct workqueue_struct *irqfd_cleanup_wq;
 876static DEFINE_SPINLOCK(irqfds_lock);
 877DEFINE_STATIC_SRCU(irqfds_srcu);
 878static LIST_HEAD(irqfds_list);
 879
 880struct privcmd_kernel_irqfd {
 881	struct xen_dm_op_buf xbufs;
 882	domid_t dom;
 883	bool error;
 884	struct eventfd_ctx *eventfd;
 885	struct work_struct shutdown;
 886	wait_queue_entry_t wait;
 887	struct list_head list;
 888	poll_table pt;
 889};
 890
 891static void irqfd_deactivate(struct privcmd_kernel_irqfd *kirqfd)
 892{
 893	lockdep_assert_held(&irqfds_lock);
 894
 895	list_del_init(&kirqfd->list);
 896	queue_work(irqfd_cleanup_wq, &kirqfd->shutdown);
 897}
 898
 899static void irqfd_shutdown(struct work_struct *work)
 900{
 901	struct privcmd_kernel_irqfd *kirqfd =
 902		container_of(work, struct privcmd_kernel_irqfd, shutdown);
 903	u64 cnt;
 904
 905	/* Make sure irqfd has been initialized in assign path */
 906	synchronize_srcu(&irqfds_srcu);
 907
 908	eventfd_ctx_remove_wait_queue(kirqfd->eventfd, &kirqfd->wait, &cnt);
 909	eventfd_ctx_put(kirqfd->eventfd);
 910	kfree(kirqfd);
 911}
 912
 913static void irqfd_inject(struct privcmd_kernel_irqfd *kirqfd)
 914{
 915	u64 cnt;
 916	long rc;
 917
 918	eventfd_ctx_do_read(kirqfd->eventfd, &cnt);
 919
 920	xen_preemptible_hcall_begin();
 921	rc = HYPERVISOR_dm_op(kirqfd->dom, 1, &kirqfd->xbufs);
 922	xen_preemptible_hcall_end();
 923
 924	/* Don't repeat the error message for consecutive failures */
 925	if (rc && !kirqfd->error) {
 926		pr_err("Failed to configure irq for guest domain: %d\n",
 927		       kirqfd->dom);
 928	}
 929
 930	kirqfd->error = rc;
 931}
 932
 933static int
 934irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key)
 935{
 936	struct privcmd_kernel_irqfd *kirqfd =
 937		container_of(wait, struct privcmd_kernel_irqfd, wait);
 938	__poll_t flags = key_to_poll(key);
 939
 940	if (flags & EPOLLIN)
 941		irqfd_inject(kirqfd);
 942
 943	if (flags & EPOLLHUP) {
 944		unsigned long flags;
 945
 946		spin_lock_irqsave(&irqfds_lock, flags);
 947		irqfd_deactivate(kirqfd);
 948		spin_unlock_irqrestore(&irqfds_lock, flags);
 949	}
 950
 951	return 0;
 952}
 953
 954static void
 955irqfd_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt)
 956{
 957	struct privcmd_kernel_irqfd *kirqfd =
 958		container_of(pt, struct privcmd_kernel_irqfd, pt);
 959
 960	add_wait_queue_priority(wqh, &kirqfd->wait);
 961}
 962
 963static int privcmd_irqfd_assign(struct privcmd_irqfd *irqfd)
 964{
 965	struct privcmd_kernel_irqfd *kirqfd, *tmp;
 966	unsigned long flags;
 967	__poll_t events;
 968	void *dm_op;
 969	int ret, idx;
 970
 971	CLASS(fd, f)(irqfd->fd);
 972
 973	kirqfd = kzalloc(sizeof(*kirqfd) + irqfd->size, GFP_KERNEL);
 974	if (!kirqfd)
 975		return -ENOMEM;
 976	dm_op = kirqfd + 1;
 977
 978	if (copy_from_user(dm_op, u64_to_user_ptr(irqfd->dm_op), irqfd->size)) {
 979		ret = -EFAULT;
 980		goto error_kfree;
 981	}
 982
 983	kirqfd->xbufs.size = irqfd->size;
 984	set_xen_guest_handle(kirqfd->xbufs.h, dm_op);
 985	kirqfd->dom = irqfd->dom;
 986	INIT_WORK(&kirqfd->shutdown, irqfd_shutdown);
 987
 988	if (fd_empty(f)) {
 989		ret = -EBADF;
 990		goto error_kfree;
 991	}
 992
 993	kirqfd->eventfd = eventfd_ctx_fileget(fd_file(f));
 994	if (IS_ERR(kirqfd->eventfd)) {
 995		ret = PTR_ERR(kirqfd->eventfd);
 996		goto error_kfree;
 997	}
 998
 999	/*
1000	 * Install our own custom wake-up handling so we are notified via a
1001	 * callback whenever someone signals the underlying eventfd.
1002	 */
1003	init_waitqueue_func_entry(&kirqfd->wait, irqfd_wakeup);
1004	init_poll_funcptr(&kirqfd->pt, irqfd_poll_func);
1005
1006	spin_lock_irqsave(&irqfds_lock, flags);
1007
1008	list_for_each_entry(tmp, &irqfds_list, list) {
1009		if (kirqfd->eventfd == tmp->eventfd) {
1010			ret = -EBUSY;
1011			spin_unlock_irqrestore(&irqfds_lock, flags);
1012			goto error_eventfd;
1013		}
1014	}
1015
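     	/*
     	 * Hold the SRCU read lock until the initial poll below completes, so
     	 * that a concurrent shutdown (which calls synchronize_srcu()) cannot
     	 * free kirqfd while it is still being registered.
     	 */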
1016	idx = srcu_read_lock(&irqfds_srcu);
1017	list_add_tail(&kirqfd->list, &irqfds_list);
1018	spin_unlock_irqrestore(&irqfds_lock, flags);
1019
1020	/*
1021	 * Check if there was an event already pending on the eventfd before we
1022	 * registered, and trigger it as if we didn't miss it.
1023	 */
1024	events = vfs_poll(fd_file(f), &kirqfd->pt);
1025	if (events & EPOLLIN)
1026		irqfd_inject(kirqfd);
1027
1028	srcu_read_unlock(&irqfds_srcu, idx);
1029	return 0;
1030
1031error_eventfd:
1032	eventfd_ctx_put(kirqfd->eventfd);
1033
1034error_kfree:
1035	kfree(kirqfd);
1036	return ret;
1037}
1038
1039static int privcmd_irqfd_deassign(struct privcmd_irqfd *irqfd)
1040{
1041	struct privcmd_kernel_irqfd *kirqfd;
1042	struct eventfd_ctx *eventfd;
1043	unsigned long flags;
1044
1045	eventfd = eventfd_ctx_fdget(irqfd->fd);
1046	if (IS_ERR(eventfd))
1047		return PTR_ERR(eventfd);
1048
1049	spin_lock_irqsave(&irqfds_lock, flags);
1050
1051	list_for_each_entry(kirqfd, &irqfds_list, list) {
1052		if (kirqfd->eventfd == eventfd) {
1053			irqfd_deactivate(kirqfd);
1054			break;
1055		}
1056	}
1057
1058	spin_unlock_irqrestore(&irqfds_lock, flags);
1059
1060	eventfd_ctx_put(eventfd);
1061
1062	/*
1063	 * Block until we know all outstanding shutdown jobs have completed so
1064	 * that we guarantee there will not be any more interrupts once this
1065	 * deassign function returns.
1066	 */
1067	flush_workqueue(irqfd_cleanup_wq);
1068
1069	return 0;
1070}
1071
1072static long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
1073{
1074	struct privcmd_data *data = file->private_data;
1075	struct privcmd_irqfd irqfd;
1076
1077	if (copy_from_user(&irqfd, udata, sizeof(irqfd)))
1078		return -EFAULT;
1079
1080	/* No other flags should be set */
1081	if (irqfd.flags & ~PRIVCMD_IRQFD_FLAG_DEASSIGN)
1082		return -EINVAL;
1083
1084	/* If restriction is in place, check the domid matches */
1085	if (data->domid != DOMID_INVALID && data->domid != irqfd.dom)
1086		return -EPERM;
1087
1088	if (irqfd.flags & PRIVCMD_IRQFD_FLAG_DEASSIGN)
1089		return privcmd_irqfd_deassign(&irqfd);
1090
1091	return privcmd_irqfd_assign(&irqfd);
1092}
1093
1094static int privcmd_irqfd_init(void)
1095{
1096	irqfd_cleanup_wq = alloc_workqueue("privcmd-irqfd-cleanup", 0, 0);
1097	if (!irqfd_cleanup_wq)
1098		return -ENOMEM;
1099
1100	return 0;
1101}
1102
1103static void privcmd_irqfd_exit(void)
1104{
1105	struct privcmd_kernel_irqfd *kirqfd, *tmp;
1106	unsigned long flags;
1107
1108	spin_lock_irqsave(&irqfds_lock, flags);
1109
1110	list_for_each_entry_safe(kirqfd, tmp, &irqfds_list, list)
1111		irqfd_deactivate(kirqfd);
1112
1113	spin_unlock_irqrestore(&irqfds_lock, flags);
1114
1115	destroy_workqueue(irqfd_cleanup_wq);
1116}
1117
1118/* Ioeventfd Support */
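     /*
      * The virtqueue index is carried in the low 16 bits of the value
      * written to VIRTIO_MMIO_QUEUE_NOTIFY.
      */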
1119#define QUEUE_NOTIFY_VQ_MASK 0xFFFF
1120
1121static DEFINE_MUTEX(ioreq_lock);
1122static LIST_HEAD(ioreq_list);
1123
1124/* per-eventfd structure */
1125struct privcmd_kernel_ioeventfd {
1126	struct eventfd_ctx *eventfd;
1127	struct list_head list;
1128	u64 addr;
1129	unsigned int addr_len;
1130	unsigned int vq;
1131};
1132
1133/* per-guest CPU / port structure */
1134struct ioreq_port {
1135	int vcpu;
1136	unsigned int port;
1137	struct privcmd_kernel_ioreq *kioreq;
1138};
1139
1140/* per-guest structure */
1141struct privcmd_kernel_ioreq {
1142	domid_t dom;
1143	unsigned int vcpus;
1144	u64 uioreq;
1145	struct ioreq *ioreq;
1146	spinlock_t lock; /* Protects ioeventfds list */
1147	struct list_head ioeventfds;
1148	struct list_head list;
1149	struct ioreq_port ports[] __counted_by(vcpus);
1150};
1151
1152static irqreturn_t ioeventfd_interrupt(int irq, void *dev_id)
1153{
1154	struct ioreq_port *port = dev_id;
1155	struct privcmd_kernel_ioreq *kioreq = port->kioreq;
1156	struct ioreq *ioreq = &kioreq->ioreq[port->vcpu];
1157	struct privcmd_kernel_ioeventfd *kioeventfd;
1158	unsigned int state = STATE_IOREQ_READY;
1159
1160	if (ioreq->state != STATE_IOREQ_READY ||
1161	    ioreq->type != IOREQ_TYPE_COPY || ioreq->dir != IOREQ_WRITE)
1162		return IRQ_NONE;
1163
1164	/*
1165	 * We need a barrier, smp_mb(), here to ensure reads are finished before
1166	 * `state` is updated. Since the lock implementation ensures that
1167	 * appropriate barrier will be added anyway, we can avoid adding
1168	 * explicit barrier here.
1169	 *
1170	 * Ideally we don't need to update `state` within the locks, but we do
1171	 * that here to avoid adding explicit barrier.
1172	 */
1173
1174	spin_lock(&kioreq->lock);
1175	ioreq->state = STATE_IOREQ_INPROCESS;
1176
1177	list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
1178		if (ioreq->addr == kioeventfd->addr + VIRTIO_MMIO_QUEUE_NOTIFY &&
1179		    ioreq->size == kioeventfd->addr_len &&
1180		    (ioreq->data & QUEUE_NOTIFY_VQ_MASK) == kioeventfd->vq) {
1181			eventfd_signal(kioeventfd->eventfd);
1182			state = STATE_IORESP_READY;
1183			break;
1184		}
1185	}
1186	spin_unlock(&kioreq->lock);
1187
1188	/*
1189	 * We need a barrier, smp_mb(), here to ensure writes are finished
1190	 * before `state` is updated. Since the lock implementation ensures that
1191	 * appropriate barrier will be added anyway, we can avoid adding
1192	 * explicit barrier here.
1193	 */
1194
1195	ioreq->state = state;
1196
1197	if (state == STATE_IORESP_READY) {
1198		notify_remote_via_evtchn(port->port);
1199		return IRQ_HANDLED;
1200	}
1201
1202	return IRQ_NONE;
1203}
1204
1205static void ioreq_free(struct privcmd_kernel_ioreq *kioreq)
1206{
1207	struct ioreq_port *ports = kioreq->ports;
1208	int i;
1209
1210	lockdep_assert_held(&ioreq_lock);
1211
1212	list_del(&kioreq->list);
1213
1214	for (i = kioreq->vcpus - 1; i >= 0; i--)
1215		unbind_from_irqhandler(irq_from_evtchn(ports[i].port), &ports[i]);
1216
1217	kfree(kioreq);
1218}
1219
1220static
1221struct privcmd_kernel_ioreq *alloc_ioreq(struct privcmd_ioeventfd *ioeventfd)
1222{
1223	struct privcmd_kernel_ioreq *kioreq;
1224	struct mm_struct *mm = current->mm;
1225	struct vm_area_struct *vma;
1226	struct page **pages;
1227	unsigned int *ports;
1228	int ret, size, i;
1229
1230	lockdep_assert_held(&ioreq_lock);
1231
1232	size = struct_size(kioreq, ports, ioeventfd->vcpus);
1233	kioreq = kzalloc(size, GFP_KERNEL);
1234	if (!kioreq)
1235		return ERR_PTR(-ENOMEM);
1236
1237	kioreq->dom = ioeventfd->dom;
1238	kioreq->vcpus = ioeventfd->vcpus;
1239	kioreq->uioreq = ioeventfd->ioreq;
1240	spin_lock_init(&kioreq->lock);
1241	INIT_LIST_HEAD(&kioreq->ioeventfds);
1242
1243	/* The memory for ioreq server must have been mapped earlier */
1244	mmap_write_lock(mm);
1245	vma = find_vma(mm, (unsigned long)ioeventfd->ioreq);
1246	if (!vma) {
1247		pr_err("Failed to find vma for ioreq page!\n");
1248		mmap_write_unlock(mm);
1249		ret = -EFAULT;
1250		goto error_kfree;
1251	}
1252
1253	pages = vma->vm_private_data;
1254	kioreq->ioreq = (struct ioreq *)(page_to_virt(pages[0]));
1255	mmap_write_unlock(mm);
1256
1257	ports = memdup_array_user(u64_to_user_ptr(ioeventfd->ports),
1258				  kioreq->vcpus, sizeof(*ports));
1259	if (IS_ERR(ports)) {
1260		ret = PTR_ERR(ports);
1261		goto error_kfree;
1262	}
1263
1264	for (i = 0; i < kioreq->vcpus; i++) {
1265		kioreq->ports[i].vcpu = i;
1266		kioreq->ports[i].port = ports[i];
1267		kioreq->ports[i].kioreq = kioreq;
1268
1269		ret = bind_evtchn_to_irqhandler_lateeoi(ports[i],
1270				ioeventfd_interrupt, IRQF_SHARED, "ioeventfd",
1271				&kioreq->ports[i]);
1272		if (ret < 0)
1273			goto error_unbind;
1274	}
1275
1276	kfree(ports);
1277
1278	list_add_tail(&kioreq->list, &ioreq_list);
1279
1280	return kioreq;
1281
1282error_unbind:
1283	while (--i >= 0)
1284		unbind_from_irqhandler(irq_from_evtchn(ports[i]), &kioreq->ports[i]);
1285
1286	kfree(ports);
1287error_kfree:
1288	kfree(kioreq);
1289	return ERR_PTR(ret);
1290}
1291
1292static struct privcmd_kernel_ioreq *
1293get_ioreq(struct privcmd_ioeventfd *ioeventfd, struct eventfd_ctx *eventfd)
1294{
1295	struct privcmd_kernel_ioreq *kioreq;
1296	unsigned long flags;
1297
1298	list_for_each_entry(kioreq, &ioreq_list, list) {
1299		struct privcmd_kernel_ioeventfd *kioeventfd;
1300
1301		/*
1302		 * kioreq fields can be accessed here without a lock as they are
1303		 * never updated after being added to the ioreq_list.
1304		 */
1305		if (kioreq->uioreq != ioeventfd->ioreq) {
1306			continue;
1307		} else if (kioreq->dom != ioeventfd->dom ||
1308			   kioreq->vcpus != ioeventfd->vcpus) {
1309			pr_err("Invalid ioeventfd configuration mismatch, dom (%u vs %u), vcpus (%u vs %u)\n",
1310			       kioreq->dom, ioeventfd->dom, kioreq->vcpus,
1311			       ioeventfd->vcpus);
1312			return ERR_PTR(-EINVAL);
1313		}
1314
1315		/* Look for a duplicate eventfd for the same guest */
1316		spin_lock_irqsave(&kioreq->lock, flags);
1317		list_for_each_entry(kioeventfd, &kioreq->ioeventfds, list) {
1318			if (eventfd == kioeventfd->eventfd) {
1319				spin_unlock_irqrestore(&kioreq->lock, flags);
1320				return ERR_PTR(-EBUSY);
1321			}
1322		}
1323		spin_unlock_irqrestore(&kioreq->lock, flags);
1324
1325		return kioreq;
1326	}
1327
1328	/* Matching kioreq isn't found, allocate a new one */
1329	return alloc_ioreq(ioeventfd);
1330}
1331
1332static void ioeventfd_free(struct privcmd_kernel_ioeventfd *kioeventfd)
1333{
1334	list_del(&kioeventfd->list);
1335	eventfd_ctx_put(kioeventfd->eventfd);
1336	kfree(kioeventfd);
1337}
1338
1339static int privcmd_ioeventfd_assign(struct privcmd_ioeventfd *ioeventfd)
1340{
1341	struct privcmd_kernel_ioeventfd *kioeventfd;
1342	struct privcmd_kernel_ioreq *kioreq;
1343	unsigned long flags;
1344	int ret;
1345
1346	/* Check for range overflow */
1347	if (ioeventfd->addr + ioeventfd->addr_len < ioeventfd->addr)
1348		return -EINVAL;
1349
1350	/* Vhost requires us to support length 1, 2, 4, and 8 */
1351	if (!(ioeventfd->addr_len == 1 || ioeventfd->addr_len == 2 ||
1352	      ioeventfd->addr_len == 4 || ioeventfd->addr_len == 8))
1353		return -EINVAL;
1354
1355	/* 4096 vcpus limit enough ? */
1356	if (!ioeventfd->vcpus || ioeventfd->vcpus > 4096)
1357		return -EINVAL;
1358
1359	kioeventfd = kzalloc(sizeof(*kioeventfd), GFP_KERNEL);
1360	if (!kioeventfd)
1361		return -ENOMEM;
1362
1363	kioeventfd->eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
1364	if (IS_ERR(kioeventfd->eventfd)) {
1365		ret = PTR_ERR(kioeventfd->eventfd);
1366		goto error_kfree;
1367	}
1368
1369	kioeventfd->addr = ioeventfd->addr;
1370	kioeventfd->addr_len = ioeventfd->addr_len;
1371	kioeventfd->vq = ioeventfd->vq;
1372
1373	mutex_lock(&ioreq_lock);
1374	kioreq = get_ioreq(ioeventfd, kioeventfd->eventfd);
1375	if (IS_ERR(kioreq)) {
1376		mutex_unlock(&ioreq_lock);
1377		ret = PTR_ERR(kioreq);
1378		goto error_eventfd;
1379	}
1380
1381	spin_lock_irqsave(&kioreq->lock, flags);
1382	list_add_tail(&kioeventfd->list, &kioreq->ioeventfds);
1383	spin_unlock_irqrestore(&kioreq->lock, flags);
1384
1385	mutex_unlock(&ioreq_lock);
1386
1387	return 0;
1388
1389error_eventfd:
1390	eventfd_ctx_put(kioeventfd->eventfd);
1391
1392error_kfree:
1393	kfree(kioeventfd);
1394	return ret;
1395}
1396
1397static int privcmd_ioeventfd_deassign(struct privcmd_ioeventfd *ioeventfd)
1398{
1399	struct privcmd_kernel_ioreq *kioreq, *tkioreq;
1400	struct eventfd_ctx *eventfd;
1401	unsigned long flags;
1402	int ret = 0;
1403
1404	eventfd = eventfd_ctx_fdget(ioeventfd->event_fd);
1405	if (IS_ERR(eventfd))
1406		return PTR_ERR(eventfd);
1407
1408	mutex_lock(&ioreq_lock);
1409	list_for_each_entry_safe(kioreq, tkioreq, &ioreq_list, list) {
1410		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
1411		/*
1412		 * kioreq fields can be accessed here without a lock as they are
1413		 * never updated after being added to the ioreq_list.
1414		 */
1415		if (kioreq->dom != ioeventfd->dom ||
1416		    kioreq->uioreq != ioeventfd->ioreq ||
1417		    kioreq->vcpus != ioeventfd->vcpus)
1418			continue;
1419
1420		spin_lock_irqsave(&kioreq->lock, flags);
1421		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list) {
1422			if (eventfd == kioeventfd->eventfd) {
1423				ioeventfd_free(kioeventfd);
1424				spin_unlock_irqrestore(&kioreq->lock, flags);
1425
1426				if (list_empty(&kioreq->ioeventfds))
1427					ioreq_free(kioreq);
1428				goto unlock;
1429			}
1430		}
1431		spin_unlock_irqrestore(&kioreq->lock, flags);
1432		break;
1433	}
1434
1435	pr_err("Ioeventfd isn't already assigned, dom: %u, addr: %llu\n",
1436	       ioeventfd->dom, ioeventfd->addr);
1437	ret = -ENODEV;
1438
1439unlock:
1440	mutex_unlock(&ioreq_lock);
1441	eventfd_ctx_put(eventfd);
1442
1443	return ret;
1444}
1445
1446static long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
1447{
1448	struct privcmd_data *data = file->private_data;
1449	struct privcmd_ioeventfd ioeventfd;
1450
1451	if (copy_from_user(&ioeventfd, udata, sizeof(ioeventfd)))
1452		return -EFAULT;
1453
1454	/* No other flags should be set */
1455	if (ioeventfd.flags & ~PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
1456		return -EINVAL;
1457
1458	/* If restriction is in place, check the domid matches */
1459	if (data->domid != DOMID_INVALID && data->domid != ioeventfd.dom)
1460		return -EPERM;
1461
1462	if (ioeventfd.flags & PRIVCMD_IOEVENTFD_FLAG_DEASSIGN)
1463		return privcmd_ioeventfd_deassign(&ioeventfd);
1464
1465	return privcmd_ioeventfd_assign(&ioeventfd);
1466}
1467
1468static void privcmd_ioeventfd_exit(void)
1469{
1470	struct privcmd_kernel_ioreq *kioreq, *tmp;
1471	unsigned long flags;
1472
1473	mutex_lock(&ioreq_lock);
1474	list_for_each_entry_safe(kioreq, tmp, &ioreq_list, list) {
1475		struct privcmd_kernel_ioeventfd *kioeventfd, *tmp;
1476
1477		spin_lock_irqsave(&kioreq->lock, flags);
1478		list_for_each_entry_safe(kioeventfd, tmp, &kioreq->ioeventfds, list)
1479			ioeventfd_free(kioeventfd);
1480		spin_unlock_irqrestore(&kioreq->lock, flags);
1481
1482		ioreq_free(kioreq);
1483	}
1484	mutex_unlock(&ioreq_lock);
1485}
1486#else
1487static inline long privcmd_ioctl_irqfd(struct file *file, void __user *udata)
1488{
1489	return -EOPNOTSUPP;
1490}
1491
1492static inline int privcmd_irqfd_init(void)
1493{
1494	return 0;
1495}
1496
1497static inline void privcmd_irqfd_exit(void)
1498{
1499}
1500
1501static inline long privcmd_ioctl_ioeventfd(struct file *file, void __user *udata)
1502{
1503	return -EOPNOTSUPP;
1504}
1505
1506static inline void privcmd_ioeventfd_exit(void)
1507{
1508}
1509#endif /* CONFIG_XEN_PRIVCMD_EVENTFD */
1510
1511static long privcmd_ioctl(struct file *file,
1512			  unsigned int cmd, unsigned long data)
1513{
1514	int ret = -ENOTTY;
1515	void __user *udata = (void __user *) data;
1516
1517	switch (cmd) {
1518	case IOCTL_PRIVCMD_HYPERCALL:
1519		ret = privcmd_ioctl_hypercall(file, udata);
1520		break;
1521
1522	case IOCTL_PRIVCMD_MMAP:
1523		ret = privcmd_ioctl_mmap(file, udata);
1524		break;
1525
1526	case IOCTL_PRIVCMD_MMAPBATCH:
1527		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
1528		break;
1529
1530	case IOCTL_PRIVCMD_MMAPBATCH_V2:
1531		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
1532		break;
1533
1534	case IOCTL_PRIVCMD_DM_OP:
1535		ret = privcmd_ioctl_dm_op(file, udata);
1536		break;
1537
1538	case IOCTL_PRIVCMD_RESTRICT:
1539		ret = privcmd_ioctl_restrict(file, udata);
1540		break;
1541
1542	case IOCTL_PRIVCMD_MMAP_RESOURCE:
1543		ret = privcmd_ioctl_mmap_resource(file, udata);
1544		break;
1545
1546	case IOCTL_PRIVCMD_IRQFD:
1547		ret = privcmd_ioctl_irqfd(file, udata);
1548		break;
1549
1550	case IOCTL_PRIVCMD_IOEVENTFD:
1551		ret = privcmd_ioctl_ioeventfd(file, udata);
1552		break;
1553
1554	case IOCTL_PRIVCMD_PCIDEV_GET_GSI:
1555		ret = privcmd_ioctl_pcidev_get_gsi(file, udata);
1556		break;
1557
1558	default:
1559		break;
1560	}
1561
1562	return ret;
1563}
1564
1565static int privcmd_open(struct inode *ino, struct file *file)
1566{
1567	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
1568
1569	if (!data)
1570		return -ENOMEM;
1571
1572	/* DOMID_INVALID implies no restriction */
1573	data->domid = DOMID_INVALID;
1574
1575	file->private_data = data;
1576	return 0;
1577}
1578
1579static int privcmd_release(struct inode *ino, struct file *file)
1580{
1581	struct privcmd_data *data = file->private_data;
1582
1583	kfree(data);
1584	return 0;
1585}
1586
1587static void privcmd_close(struct vm_area_struct *vma)
1588{
1589	struct page **pages = vma->vm_private_data;
1590	int numpgs = vma_pages(vma);
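     	/*
     	 * Xen frames are always 4 KiB, so the gfn count can exceed the
     	 * kernel page count when the kernel uses a larger page size.
     	 */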
1591	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
1592	int rc;
1593
1594	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
1595		return;
1596
1597	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
1598	if (rc == 0)
1599		xen_free_unpopulated_pages(numpgs, pages);
1600	else
1601		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
1602			numpgs, rc);
1603	kvfree(pages);
1604}
1605
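     /*
      * privcmd VMAs are populated only by the mapping ioctls, so a fault here
      * means user space touched a part of the range that was not successfully
      * mapped.
      */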
1606static vm_fault_t privcmd_fault(struct vm_fault *vmf)
1607{
1608	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
1609	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
1610	       vmf->pgoff, (void *)vmf->address);
1611
1612	return VM_FAULT_SIGBUS;
1613}
1614
1615static const struct vm_operations_struct privcmd_vm_ops = {
1616	.close = privcmd_close,
1617	.fault = privcmd_fault
1618};
1619
1620static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
1621{
1622	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
1623	 * how to recreate these mappings */
1624	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY |
1625			 VM_DONTEXPAND | VM_DONTDUMP);
1626	vma->vm_ops = &privcmd_vm_ops;
1627	vma->vm_private_data = NULL;
1628
1629	return 0;
1630}
1631
1632/*
1633 * For MMAPBATCH*. This allows asserting the singleshot mapping
1634 * on a per pfn/pte basis. Mapping calls that fail with ENOENT
1635 * can be then retried until success.
1636 */
1637static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
1638{
1639	return pte_none(ptep_get(pte)) ? 0 : -EBUSY;
1640}
1641
1642static int privcmd_vma_range_is_mapped(
1643	           struct vm_area_struct *vma,
1644	           unsigned long addr,
1645	           unsigned long nr_pages)
1646{
1647	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
1648				   is_mapped_fn, NULL) != 0;
1649}
1650
1651const struct file_operations xen_privcmd_fops = {
1652	.owner = THIS_MODULE,
1653	.unlocked_ioctl = privcmd_ioctl,
1654	.open = privcmd_open,
1655	.release = privcmd_release,
1656	.mmap = privcmd_mmap,
1657};
1658EXPORT_SYMBOL_GPL(xen_privcmd_fops);
1659
1660static struct miscdevice privcmd_dev = {
1661	.minor = MISC_DYNAMIC_MINOR,
1662	.name = "xen/privcmd",
1663	.fops = &xen_privcmd_fops,
1664};
1665
1666static int __init privcmd_init(void)
1667{
1668	int err;
1669
1670	if (!xen_domain())
1671		return -ENODEV;
1672
1673	err = misc_register(&privcmd_dev);
1674	if (err != 0) {
1675		pr_err("Could not register Xen privcmd device\n");
1676		return err;
1677	}
1678
1679	err = misc_register(&xen_privcmdbuf_dev);
1680	if (err != 0) {
1681		pr_err("Could not register Xen hypercall-buf device\n");
1682		goto err_privcmdbuf;
1683	}
1684
1685	err = privcmd_irqfd_init();
1686	if (err != 0) {
1687		pr_err("irqfd init failed\n");
1688		goto err_irqfd;
1689	}
1690
1691	return 0;
1692
1693err_irqfd:
1694	misc_deregister(&xen_privcmdbuf_dev);
1695err_privcmdbuf:
1696	misc_deregister(&privcmd_dev);
1697	return err;
1698}
1699
1700static void __exit privcmd_exit(void)
1701{
1702	privcmd_ioeventfd_exit();
1703	privcmd_irqfd_exit();
1704	misc_deregister(&privcmd_dev);
1705	misc_deregister(&xen_privcmdbuf_dev);
1706}
1707
1708module_init(privcmd_init);
1709module_exit(privcmd_exit);
v5.9
  1// SPDX-License-Identifier: GPL-2.0-only
  2/******************************************************************************
  3 * privcmd.c
  4 *
  5 * Interface to privileged domain-0 commands.
  6 *
  7 * Copyright (c) 2002-2004, K A Fraser, B Dragovic
  8 */
  9
 10#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt
 11
 
 
 12#include <linux/kernel.h>
 13#include <linux/module.h>
 
 
 14#include <linux/sched.h>
 15#include <linux/slab.h>
 
 16#include <linux/string.h>
 
 17#include <linux/errno.h>
 18#include <linux/mm.h>
 19#include <linux/mman.h>
 20#include <linux/uaccess.h>
 21#include <linux/swap.h>
 22#include <linux/highmem.h>
 23#include <linux/pagemap.h>
 24#include <linux/seq_file.h>
 25#include <linux/miscdevice.h>
 26#include <linux/moduleparam.h>
 
 27
 28#include <asm/xen/hypervisor.h>
 29#include <asm/xen/hypercall.h>
 30
 31#include <xen/xen.h>
 
 32#include <xen/privcmd.h>
 33#include <xen/interface/xen.h>
 34#include <xen/interface/memory.h>
 35#include <xen/interface/hvm/dm_op.h>
 
 36#include <xen/features.h>
 37#include <xen/page.h>
 38#include <xen/xen-ops.h>
 39#include <xen/balloon.h>
 
 
 
 40
 41#include "privcmd.h"
 42
 
 43MODULE_LICENSE("GPL");
 44
 45#define PRIV_VMA_LOCKED ((void *)1)
 46
 47static unsigned int privcmd_dm_op_max_num = 16;
 48module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644);
 49MODULE_PARM_DESC(dm_op_max_nr_bufs,
 50		 "Maximum number of buffers per dm_op hypercall");
 51
 52static unsigned int privcmd_dm_op_buf_max_size = 4096;
 53module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint,
 54		   0644);
 55MODULE_PARM_DESC(dm_op_buf_max_size,
 56		 "Maximum size of a dm_op hypercall buffer");
 57
 58struct privcmd_data {
 59	domid_t domid;
 60};
 61
 62static int privcmd_vma_range_is_mapped(
 63               struct vm_area_struct *vma,
 64               unsigned long addr,
 65               unsigned long nr_pages);
 66
 67static long privcmd_ioctl_hypercall(struct file *file, void __user *udata)
 68{
 69	struct privcmd_data *data = file->private_data;
 70	struct privcmd_hypercall hypercall;
 71	long ret;
 72
 73	/* Disallow arbitrary hypercalls if restricted */
 74	if (data->domid != DOMID_INVALID)
 75		return -EPERM;
 76
 77	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
 78		return -EFAULT;
 79
 80	xen_preemptible_hcall_begin();
 81	ret = privcmd_call(hypercall.op,
 82			   hypercall.arg[0], hypercall.arg[1],
 83			   hypercall.arg[2], hypercall.arg[3],
 84			   hypercall.arg[4]);
 85	xen_preemptible_hcall_end();
 86
 87	return ret;
 88}
 89
 90static void free_page_list(struct list_head *pages)
 91{
 92	struct page *p, *n;
 93
 94	list_for_each_entry_safe(p, n, pages, lru)
 95		__free_page(p);
 96
 97	INIT_LIST_HEAD(pages);
 98}
 99
100/*
101 * Given an array of items in userspace, return a list of pages
102 * containing the data.  If copying fails, either because of memory
103 * allocation failure or a problem reading user memory, return an
104 * error code; its up to the caller to dispose of any partial list.
105 */
106static int gather_array(struct list_head *pagelist,
107			unsigned nelem, size_t size,
108			const void __user *data)
109{
110	unsigned pageidx;
111	void *pagedata;
112	int ret;
113
114	if (size > PAGE_SIZE)
115		return 0;
116
117	pageidx = PAGE_SIZE;
118	pagedata = NULL;	/* quiet, gcc */
119	while (nelem--) {
120		if (pageidx > PAGE_SIZE-size) {
121			struct page *page = alloc_page(GFP_KERNEL);
122
123			ret = -ENOMEM;
124			if (page == NULL)
125				goto fail;
126
127			pagedata = page_address(page);
128
129			list_add_tail(&page->lru, pagelist);
130			pageidx = 0;
131		}
132
133		ret = -EFAULT;
134		if (copy_from_user(pagedata + pageidx, data, size))
135			goto fail;
136
137		data += size;
138		pageidx += size;
139	}
140
141	ret = 0;
142
143fail:
144	return ret;
145}
146
147/*
148 * Call function "fn" on each element of the array fragmented
149 * over a list of pages.
150 */
151static int traverse_pages(unsigned nelem, size_t size,
152			  struct list_head *pos,
153			  int (*fn)(void *data, void *state),
154			  void *state)
155{
156	void *pagedata;
157	unsigned pageidx;
158	int ret = 0;
159
160	BUG_ON(size > PAGE_SIZE);
161
162	pageidx = PAGE_SIZE;
163	pagedata = NULL;	/* hush, gcc */
164
165	while (nelem--) {
166		if (pageidx > PAGE_SIZE-size) {
167			struct page *page;
168			pos = pos->next;
169			page = list_entry(pos, struct page, lru);
170			pagedata = page_address(page);
171			pageidx = 0;
172		}
173
174		ret = (*fn)(pagedata + pageidx, state);
175		if (ret)
176			break;
177		pageidx += size;
178	}
179
180	return ret;
181}
182
183/*
184 * Similar to traverse_pages, but use each page as a "block" of
185 * data to be processed as one unit.
186 */
187static int traverse_pages_block(unsigned nelem, size_t size,
188				struct list_head *pos,
189				int (*fn)(void *data, int nr, void *state),
190				void *state)
191{
192	void *pagedata;
193	int ret = 0;
194
195	BUG_ON(size > PAGE_SIZE);
196
197	while (nelem) {
198		int nr = (PAGE_SIZE/size);
199		struct page *page;
200		if (nr > nelem)
201			nr = nelem;
202		pos = pos->next;
203		page = list_entry(pos, struct page, lru);
204		pagedata = page_address(page);
205		ret = (*fn)(pagedata, nr, state);
206		if (ret)
207			break;
208		nelem -= nr;
209	}
210
211	return ret;
212}
213
214struct mmap_gfn_state {
215	unsigned long va;
216	struct vm_area_struct *vma;
217	domid_t domain;
218};
219
220static int mmap_gfn_range(void *data, void *state)
221{
222	struct privcmd_mmap_entry *msg = data;
223	struct mmap_gfn_state *st = state;
224	struct vm_area_struct *vma = st->vma;
225	int rc;
226
227	/* Do not allow range to wrap the address space. */
228	if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
229	    ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
230		return -EINVAL;
231
232	/* Range chunks must be contiguous in va space. */
233	if ((msg->va != st->va) ||
234	    ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
235		return -EINVAL;
236
237	rc = xen_remap_domain_gfn_range(vma,
238					msg->va & PAGE_MASK,
239					msg->mfn, msg->npages,
240					vma->vm_page_prot,
241					st->domain, NULL);
242	if (rc < 0)
243		return rc;
244
245	st->va += msg->npages << PAGE_SHIFT;
246
247	return 0;
248}
249
250static long privcmd_ioctl_mmap(struct file *file, void __user *udata)
251{
252	struct privcmd_data *data = file->private_data;
253	struct privcmd_mmap mmapcmd;
254	struct mm_struct *mm = current->mm;
255	struct vm_area_struct *vma;
256	int rc;
257	LIST_HEAD(pagelist);
258	struct mmap_gfn_state state;
259
260	/* We only support privcmd_ioctl_mmap_batch for auto translated. */
261	if (xen_feature(XENFEAT_auto_translated_physmap))
262		return -ENOSYS;
263
264	if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
265		return -EFAULT;
266
267	/* If restriction is in place, check the domid matches */
268	if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom)
269		return -EPERM;
270
271	rc = gather_array(&pagelist,
272			  mmapcmd.num, sizeof(struct privcmd_mmap_entry),
273			  mmapcmd.entry);
274
275	if (rc || list_empty(&pagelist))
276		goto out;
277
278	mmap_write_lock(mm);
279
280	{
281		struct page *page = list_first_entry(&pagelist,
282						     struct page, lru);
283		struct privcmd_mmap_entry *msg = page_address(page);
284
285		vma = find_vma(mm, msg->va);
286		rc = -EINVAL;
287
288		if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data)
289			goto out_up;
290		vma->vm_private_data = PRIV_VMA_LOCKED;
291	}
292
293	state.va = vma->vm_start;
294	state.vma = vma;
295	state.domain = mmapcmd.dom;
296
297	rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
298			    &pagelist,
299			    mmap_gfn_range, &state);
300
301
302out_up:
303	mmap_write_unlock(mm);
304
305out:
306	free_page_list(&pagelist);
307
308	return rc;
309}
310
311struct mmap_batch_state {
312	domid_t domain;
313	unsigned long va;
314	struct vm_area_struct *vma;
315	int index;
316	/* A tristate:
317	 *      0 for no errors
318	 *      1 if at least one error has happened (and no
319	 *          -ENOENT errors have happened)
320	 *      -ENOENT if at least 1 -ENOENT has happened.
321	 */
322	int global_error;
323	int version;
324
325	/* User-space gfn array to store errors in the second pass for V1. */
326	xen_pfn_t __user *user_gfn;
327	/* User-space int array to store errors in the second pass for V2. */
328	int __user *user_err;
329};
330
331/* auto translated dom0 note: if domU being created is PV, then gfn is
332 * mfn(addr on bus). If it's auto xlated, then gfn is pfn (input to HAP).
333 */
334static int mmap_batch_fn(void *data, int nr, void *state)
335{
336	xen_pfn_t *gfnp = data;
337	struct mmap_batch_state *st = state;
338	struct vm_area_struct *vma = st->vma;
339	struct page **pages = vma->vm_private_data;
340	struct page **cur_pages = NULL;
341	int ret;
342
343	if (xen_feature(XENFEAT_auto_translated_physmap))
344		cur_pages = &pages[st->index];
345
346	BUG_ON(nr < 0);
347	ret = xen_remap_domain_gfn_array(st->vma, st->va & PAGE_MASK, gfnp, nr,
348					 (int *)gfnp, st->vma->vm_page_prot,
349					 st->domain, cur_pages);
350
351	/* Adjust the global_error? */
352	if (ret != nr) {
353		if (ret == -ENOENT)
354			st->global_error = -ENOENT;
355		else {
356			/* Record that at least one error has happened. */
357			if (st->global_error == 0)
358				st->global_error = 1;
359		}
360	}
361	st->va += XEN_PAGE_SIZE * nr;
362	st->index += nr / XEN_PFN_PER_PAGE;
363
364	return 0;
365}
366
367static int mmap_return_error(int err, struct mmap_batch_state *st)
368{
369	int ret;
370
371	if (st->version == 1) {
372		if (err) {
373			xen_pfn_t gfn;
374
375			ret = get_user(gfn, st->user_gfn);
376			if (ret < 0)
377				return ret;
378			/*
379			 * V1 encodes the error codes in the 32bit top
380			 * nibble of the gfn (with its known
381			 * limitations vis-a-vis 64 bit callers).
382			 */
383			gfn |= (err == -ENOENT) ?
384				PRIVCMD_MMAPBATCH_PAGED_ERROR :
385				PRIVCMD_MMAPBATCH_MFN_ERROR;
386			return __put_user(gfn, st->user_gfn++);
387		} else
388			st->user_gfn++;
389	} else { /* st->version == 2 */
390		if (err)
391			return __put_user(err, st->user_err++);
392		else
393			st->user_err++;
394	}
395
396	return 0;
397}
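/*
 * Sketch of how a V1 caller might decode the status written back into its
 * gfn array (illustrative only, subject to the 32-bit limitation noted in
 * the comment above):
 *
 *	if ((arr[i] & PRIVCMD_MMAPBATCH_MFN_ERROR) ==
 *	    PRIVCMD_MMAPBATCH_PAGED_ERROR)
 *		... frame i was paged out, retry the batch later ...
 *	else if (arr[i] & PRIVCMD_MMAPBATCH_MFN_ERROR)
 *		... frame i failed to map ...
 *	else
 *		... frame i mapped successfully ...
 */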
398
399static int mmap_return_errors(void *data, int nr, void *state)
400{
401	struct mmap_batch_state *st = state;
402	int *errs = data;
403	int i;
404	int ret;
405
406	for (i = 0; i < nr; i++) {
407		ret = mmap_return_error(errs[i], st);
408		if (ret < 0)
409			return ret;
410	}
411	return 0;
412}
413
414/* Allocate pfns that are then mapped with gfns from foreign domid. Update
415 * the vma with the page info to use later.
416 * Returns: 0 if success, otherwise -errno
417 */
418static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
419{
420	int rc;
421	struct page **pages;
422
423	pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL);
424	if (pages == NULL)
425		return -ENOMEM;
426
427	rc = xen_alloc_unpopulated_pages(numpgs, pages);
428	if (rc != 0) {
429		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
430			numpgs, rc);
431		kfree(pages);
432		return -ENOMEM;
433	}
434	BUG_ON(vma->vm_private_data != NULL);
435	vma->vm_private_data = pages;
436
437	return 0;
438}
439
440static const struct vm_operations_struct privcmd_vm_ops;
441
442static long privcmd_ioctl_mmap_batch(
443	struct file *file, void __user *udata, int version)
444{
445	struct privcmd_data *data = file->private_data;
446	int ret;
447	struct privcmd_mmapbatch_v2 m;
448	struct mm_struct *mm = current->mm;
449	struct vm_area_struct *vma;
450	unsigned long nr_pages;
451	LIST_HEAD(pagelist);
452	struct mmap_batch_state state;
453
454	switch (version) {
455	case 1:
456		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
457			return -EFAULT;
458		/* Returns per-frame error in m.arr. */
459		m.err = NULL;
460		if (!access_ok(m.arr, m.num * sizeof(*m.arr)))
461			return -EFAULT;
462		break;
463	case 2:
464		if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2)))
465			return -EFAULT;
466		/* Returns per-frame error code in m.err. */
467		if (!access_ok(m.err, m.num * (sizeof(*m.err))))
468			return -EFAULT;
469		break;
470	default:
471		return -EINVAL;
472	}
473
474	/* If restriction is in place, check the domid matches */
475	if (data->domid != DOMID_INVALID && data->domid != m.dom)
476		return -EPERM;
477
478	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
479	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
480		return -EINVAL;
481
482	ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr);
483
484	if (ret)
485		goto out;
486	if (list_empty(&pagelist)) {
487		ret = -EINVAL;
488		goto out;
489	}
490
491	if (version == 2) {
492		/* Zero error array now to only copy back actual errors. */
493		if (clear_user(m.err, sizeof(int) * m.num)) {
494			ret = -EFAULT;
495			goto out;
496		}
497	}
498
499	mmap_write_lock(mm);
500
501	vma = find_vma(mm, m.addr);
502	if (!vma ||
503	    vma->vm_ops != &privcmd_vm_ops) {
504		ret = -EINVAL;
505		goto out_unlock;
506	}
507
508	/*
509	 * Caller must either:
510	 *
511	 * Map the whole VMA range, which will also allocate all the
512	 * pages required for the auto_translated_physmap case.
513	 *
514	 * Or
515	 *
516	 * Map unmapped holes left from a previous map attempt (e.g.,
517	 * because those foreign frames were previously paged out).
518	 */
519	if (vma->vm_private_data == NULL) {
520		if (m.addr != vma->vm_start ||
521		    m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) {
522			ret = -EINVAL;
523			goto out_unlock;
524		}
525		if (xen_feature(XENFEAT_auto_translated_physmap)) {
526			ret = alloc_empty_pages(vma, nr_pages);
527			if (ret < 0)
528				goto out_unlock;
529		} else
530			vma->vm_private_data = PRIV_VMA_LOCKED;
531	} else {
532		if (m.addr < vma->vm_start ||
533		    m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) {
534			ret = -EINVAL;
535			goto out_unlock;
536		}
537		if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) {
538			ret = -EINVAL;
539			goto out_unlock;
540		}
541	}
542
543	state.domain        = m.dom;
544	state.vma           = vma;
545	state.va            = m.addr;
546	state.index         = 0;
547	state.global_error  = 0;
548	state.version       = version;
549
550	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
551	/* mmap_batch_fn guarantees ret == 0 */
552	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
553				    &pagelist, mmap_batch_fn, &state));
554
555	mmap_write_unlock(mm);
556
557	if (state.global_error) {
558		/* Write back errors in second pass. */
559		state.user_gfn = (xen_pfn_t *)m.arr;
560		state.user_err = m.err;
561		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
562					   &pagelist, mmap_return_errors, &state);
563	} else
564		ret = 0;
565
566	/* If we have not had any EFAULT-like global errors then set the global
567	 * error to -ENOENT if necessary. */
568	if ((ret == 0) && (state.global_error == -ENOENT))
569		ret = -ENOENT;
570
571out:
572	free_page_list(&pagelist);
573	return ret;
574
575out_unlock:
576	mmap_write_unlock(mm);
577	goto out;
578}
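/*
 * Illustrative userspace sketch (not part of this driver): mapping a batch
 * of foreign frames with IOCTL_PRIVCMD_MMAPBATCH_V2 and retrying while the
 * hypervisor reports paged-out frames (-ENOENT), per the uapi layout in
 * xen/privcmd.h.  "fd", "domid", "va", "gfns", "errs" and "nr" are
 * hypothetical placeholders.
 *
 *	struct privcmd_mmapbatch_v2 batch = {
 *		.num  = nr,
 *		.dom  = domid,
 *		.addr = (__u64)(unsigned long)va,
 *		.arr  = gfns,
 *		.err  = errs,
 *	};
 *	do {
 *		rc = ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH_V2, &batch);
 *	} while (rc < 0 && errno == ENOENT);
 *
 * Per-frame failures other than paging are reported back through errs[].
 */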
579
580static int lock_pages(
581	struct privcmd_dm_op_buf kbufs[], unsigned int num,
582	struct page *pages[], unsigned int nr_pages, unsigned int *pinned)
583{
584	unsigned int i;
585
586	for (i = 0; i < num; i++) {
587		unsigned int requested;
588		int page_count;
589
590		requested = DIV_ROUND_UP(
591			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
592			PAGE_SIZE);
593		if (requested > nr_pages)
594			return -ENOSPC;
595
596		page_count = pin_user_pages_fast(
597			(unsigned long) kbufs[i].uptr,
598			requested, FOLL_WRITE, pages);
599		if (page_count < 0)
600			return page_count;
601
602		*pinned += page_count;
603		nr_pages -= page_count;
604		pages += page_count;
605	}
606
607	return 0;
608}
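/*
 * Worked example for the accounting in lock_pages(): a 0x1800-byte buffer
 * whose user pointer starts 0xc00 bytes into a 4 KiB page spans
 * DIV_ROUND_UP(0xc00 + 0x1800, 0x1000) = 3 pages, so three slots of
 * pages[] are consumed and *pinned grows by three.
 */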
609
610static void unlock_pages(struct page *pages[], unsigned int nr_pages)
611{
612	unpin_user_pages_dirty_lock(pages, nr_pages, true);
613}
614
615static long privcmd_ioctl_dm_op(struct file *file, void __user *udata)
616{
617	struct privcmd_data *data = file->private_data;
618	struct privcmd_dm_op kdata;
619	struct privcmd_dm_op_buf *kbufs;
620	unsigned int nr_pages = 0;
621	struct page **pages = NULL;
622	struct xen_dm_op_buf *xbufs = NULL;
623	unsigned int i;
624	long rc;
625	unsigned int pinned = 0;
626
627	if (copy_from_user(&kdata, udata, sizeof(kdata)))
628		return -EFAULT;
629
630	/* If restriction is in place, check the domid matches */
631	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
632		return -EPERM;
633
634	if (kdata.num == 0)
635		return 0;
636
637	if (kdata.num > privcmd_dm_op_max_num)
638		return -E2BIG;
639
640	kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL);
641	if (!kbufs)
642		return -ENOMEM;
643
644	if (copy_from_user(kbufs, kdata.ubufs,
645			   sizeof(*kbufs) * kdata.num)) {
646		rc = -EFAULT;
647		goto out;
648	}
649
650	for (i = 0; i < kdata.num; i++) {
651		if (kbufs[i].size > privcmd_dm_op_buf_max_size) {
652			rc = -E2BIG;
653			goto out;
654		}
655
656		if (!access_ok(kbufs[i].uptr,
657			       kbufs[i].size)) {
658			rc = -EFAULT;
659			goto out;
660		}
661
662		nr_pages += DIV_ROUND_UP(
663			offset_in_page(kbufs[i].uptr) + kbufs[i].size,
664			PAGE_SIZE);
665	}
666
667	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
668	if (!pages) {
669		rc = -ENOMEM;
670		goto out;
671	}
672
673	xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL);
674	if (!xbufs) {
675		rc = -ENOMEM;
676		goto out;
677	}
678
679	rc = lock_pages(kbufs, kdata.num, pages, nr_pages, &pinned);
680	if (rc < 0) {
681		nr_pages = pinned;
682		goto out;
683	}
684
685	for (i = 0; i < kdata.num; i++) {
686		set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr);
687		xbufs[i].size = kbufs[i].size;
688	}
689
690	xen_preemptible_hcall_begin();
691	rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs);
692	xen_preemptible_hcall_end();
693
694out:
695	unlock_pages(pages, nr_pages);
696	kfree(xbufs);
697	kfree(pages);
698	kfree(kbufs);
699
700	return rc;
701}
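/*
 * Illustrative userspace sketch (not part of this driver): a device model
 * might issue a single-buffer dm_op as below, per the uapi layout in
 * xen/privcmd.h.  "fd", "domid" and "op" (a caller-built struct xen_dm_op)
 * are hypothetical placeholders.
 *
 *	struct privcmd_dm_op_buf buf = {
 *		.uptr = &op,
 *		.size = sizeof(op),
 *	};
 *	struct privcmd_dm_op dm_op = {
 *		.dom   = domid,
 *		.num   = 1,
 *		.ubufs = &buf,
 *	};
 *	rc = ioctl(fd, IOCTL_PRIVCMD_DM_OP, &dm_op);
 */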
702
703static long privcmd_ioctl_restrict(struct file *file, void __user *udata)
704{
705	struct privcmd_data *data = file->private_data;
706	domid_t dom;
707
708	if (copy_from_user(&dom, udata, sizeof(dom)))
709		return -EFAULT;
710
711	/* Set restriction to the specified domain, or check it matches */
712	if (data->domid == DOMID_INVALID)
713		data->domid = dom;
714	else if (data->domid != dom)
715		return -EINVAL;
716
717	return 0;
718}
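/*
 * Illustrative userspace sketch (not part of this driver): a device model
 * typically restricts its privcmd handle to its target domain right after
 * opening it, so later operations on this fd cannot touch other domains.
 * "fd" and "target_domid" are hypothetical placeholders.
 *
 *	domid_t domid = target_domid;
 *	rc = ioctl(fd, IOCTL_PRIVCMD_RESTRICT, &domid);
 */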
719
720static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata)
721{
722	struct privcmd_data *data = file->private_data;
723	struct mm_struct *mm = current->mm;
724	struct vm_area_struct *vma;
725	struct privcmd_mmap_resource kdata;
726	xen_pfn_t *pfns = NULL;
727	struct xen_mem_acquire_resource xdata;
728	int rc;
729
730	if (copy_from_user(&kdata, udata, sizeof(kdata)))
731		return -EFAULT;
732
733	/* If restriction is in place, check the domid matches */
734	if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
735		return -EPERM;
736
737	mmap_write_lock(mm);
738
739	vma = find_vma(mm, kdata.addr);
740	if (!vma || vma->vm_ops != &privcmd_vm_ops) {
741		rc = -EINVAL;
742		goto out;
743	}
744
745	pfns = kcalloc(kdata.num, sizeof(*pfns), GFP_KERNEL);
746	if (!pfns) {
747		rc = -ENOMEM;
748		goto out;
749	}
750
751	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
752	    xen_feature(XENFEAT_auto_translated_physmap)) {
753		unsigned int nr = DIV_ROUND_UP(kdata.num, XEN_PFN_PER_PAGE);
754		struct page **pages;
755		unsigned int i;
756
757		rc = alloc_empty_pages(vma, nr);
758		if (rc < 0)
759			goto out;
760
761		pages = vma->vm_private_data;
762		for (i = 0; i < kdata.num; i++) {
763			xen_pfn_t pfn =
764				page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
765
766			pfns[i] = pfn + (i % XEN_PFN_PER_PAGE);
767		}
768	} else
769		vma->vm_private_data = PRIV_VMA_LOCKED;
770
771	memset(&xdata, 0, sizeof(xdata));
772	xdata.domid = kdata.dom;
773	xdata.type = kdata.type;
774	xdata.id = kdata.id;
775	xdata.frame = kdata.idx;
776	xdata.nr_frames = kdata.num;
777	set_xen_guest_handle(xdata.frame_list, pfns);
778
779	xen_preemptible_hcall_begin();
780	rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
781	xen_preemptible_hcall_end();
782
783	if (rc)
784		goto out;
785
786	if (IS_ENABLED(CONFIG_XEN_AUTO_XLATE) &&
787	    xen_feature(XENFEAT_auto_translated_physmap)) {
788		rc = xen_remap_vma_range(vma, kdata.addr, kdata.num << PAGE_SHIFT);
789	} else {
790		unsigned int domid =
791			(xdata.flags & XENMEM_rsrc_acq_caller_owned) ?
792			DOMID_SELF : kdata.dom;
793		int num;
794
795		num = xen_remap_domain_mfn_array(vma,
796						 kdata.addr & PAGE_MASK,
797						 pfns, kdata.num, (int *)pfns,
798						 vma->vm_page_prot,
799						 domid,
800						 vma->vm_private_data);
801		if (num < 0)
802			rc = num;
803		else if (num != kdata.num) {
804			unsigned int i;
805
806			for (i = 0; i < num; i++) {
807				rc = pfns[i];
808				if (rc < 0)
809					break;
810			}
811		} else
812			rc = 0;
813	}
814
815out:
816	mmap_write_unlock(mm);
817	kfree(pfns);
818
819	return rc;
820}
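/*
 * Illustrative userspace sketch (not part of this driver): mapping a guest's
 * ioreq server pages through IOCTL_PRIVCMD_MMAP_RESOURCE, per the uapi
 * layout in xen/privcmd.h.  "fd", "domid", "ioreq_server_id" and "nr" are
 * hypothetical placeholders; the resource type shown is one example of
 * several.
 *
 *	void *va = mmap(NULL, nr * 4096UL, PROT_READ | PROT_WRITE,
 *			MAP_SHARED, fd, 0);
 *	struct privcmd_mmap_resource res = {
 *		.dom  = domid,
 *		.type = XENMEM_resource_ioreq_server,
 *		.id   = ioreq_server_id,
 *		.idx  = 0,
 *		.num  = nr,
 *		.addr = (__u64)(unsigned long)va,
 *	};
 *	rc = ioctl(fd, IOCTL_PRIVCMD_MMAP_RESOURCE, &res);
 */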
821
822static long privcmd_ioctl(struct file *file,
823			  unsigned int cmd, unsigned long data)
824{
825	int ret = -ENOTTY;
826	void __user *udata = (void __user *) data;
827
828	switch (cmd) {
829	case IOCTL_PRIVCMD_HYPERCALL:
830		ret = privcmd_ioctl_hypercall(file, udata);
831		break;
832
833	case IOCTL_PRIVCMD_MMAP:
834		ret = privcmd_ioctl_mmap(file, udata);
835		break;
836
837	case IOCTL_PRIVCMD_MMAPBATCH:
838		ret = privcmd_ioctl_mmap_batch(file, udata, 1);
839		break;
840
841	case IOCTL_PRIVCMD_MMAPBATCH_V2:
842		ret = privcmd_ioctl_mmap_batch(file, udata, 2);
843		break;
844
845	case IOCTL_PRIVCMD_DM_OP:
846		ret = privcmd_ioctl_dm_op(file, udata);
847		break;
848
849	case IOCTL_PRIVCMD_RESTRICT:
850		ret = privcmd_ioctl_restrict(file, udata);
851		break;
852
853	case IOCTL_PRIVCMD_MMAP_RESOURCE:
854		ret = privcmd_ioctl_mmap_resource(file, udata);
855		break;
856
857	default:
858		break;
859	}
860
861	return ret;
862}
863
864static int privcmd_open(struct inode *ino, struct file *file)
865{
866	struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
867
868	if (!data)
869		return -ENOMEM;
870
871	/* DOMID_INVALID implies no restriction */
872	data->domid = DOMID_INVALID;
873
874	file->private_data = data;
875	return 0;
876}
877
878static int privcmd_release(struct inode *ino, struct file *file)
879{
880	struct privcmd_data *data = file->private_data;
881
882	kfree(data);
883	return 0;
884}
885
886static void privcmd_close(struct vm_area_struct *vma)
887{
888	struct page **pages = vma->vm_private_data;
889	int numpgs = vma_pages(vma);
890	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
891	int rc;
892
893	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
894		return;
895
896	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
897	if (rc == 0)
898		xen_free_unpopulated_pages(numpgs, pages);
899	else
900		pr_crit("unable to unmap MFN range: leaking %d pages. rc=%d\n",
901			numpgs, rc);
902	kfree(pages);
903}
904
905static vm_fault_t privcmd_fault(struct vm_fault *vmf)
906{
907	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
908	       vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end,
909	       vmf->pgoff, (void *)vmf->address);
910
911	return VM_FAULT_SIGBUS;
912}
913
914static const struct vm_operations_struct privcmd_vm_ops = {
915	.close = privcmd_close,
916	.fault = privcmd_fault
917};
918
919static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
920{
921	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
922	 * how to recreate these mappings */
923	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTCOPY |
924		     VM_DONTEXPAND | VM_DONTDUMP);
925	vma->vm_ops = &privcmd_vm_ops;
926	vma->vm_private_data = NULL;
927
928	return 0;
929}
930
931/*
932 * For MMAPBATCH*. This allows asserting the singleshot mapping
933 * on a per pfn/pte basis. Mapping calls that fail with ENOENT
934 * can be then retried until success.
935 */
936static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
937{
938	return pte_none(ptep_get(pte)) ? 0 : -EBUSY;
939}
940
941static int privcmd_vma_range_is_mapped(
942	           struct vm_area_struct *vma,
943	           unsigned long addr,
944	           unsigned long nr_pages)
945{
946	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
947				   is_mapped_fn, NULL) != 0;
948}
949
950const struct file_operations xen_privcmd_fops = {
951	.owner = THIS_MODULE,
952	.unlocked_ioctl = privcmd_ioctl,
953	.open = privcmd_open,
954	.release = privcmd_release,
955	.mmap = privcmd_mmap,
956};
957EXPORT_SYMBOL_GPL(xen_privcmd_fops);
958
959static struct miscdevice privcmd_dev = {
960	.minor = MISC_DYNAMIC_MINOR,
961	.name = "xen/privcmd",
962	.fops = &xen_privcmd_fops,
963};
964
965static int __init privcmd_init(void)
966{
967	int err;
968
969	if (!xen_domain())
970		return -ENODEV;
971
972	err = misc_register(&privcmd_dev);
973	if (err != 0) {
974		pr_err("Could not register Xen privcmd device\n");
975		return err;
976	}
977
978	err = misc_register(&xen_privcmdbuf_dev);
979	if (err != 0) {
980		pr_err("Could not register Xen hypercall-buf device\n");
981		misc_deregister(&privcmd_dev);
982		return err;
983	}
984
985	return 0;
986}
987
988static void __exit privcmd_exit(void)
989{
990	misc_deregister(&privcmd_dev);
991	misc_deregister(&xen_privcmdbuf_dev);
992}
993
994module_init(privcmd_init);
995module_exit(privcmd_exit);