   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO: IOMMU DMA mapping support for TCE on POWER
   4 *
   5 * Copyright (C) 2013 IBM Corp.  All rights reserved.
   6 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
   7 *
   8 * Derived from original vfio_iommu_type1.c:
   9 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  10 *     Author: Alex Williamson <alex.williamson@redhat.com>
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/pci.h>
  15#include <linux/slab.h>
  16#include <linux/uaccess.h>
  17#include <linux/err.h>
  18#include <linux/vfio.h>
  19#include <linux/vmalloc.h>
  20#include <linux/sched/mm.h>
  21#include <linux/sched/signal.h>
  22#include <linux/mm.h>
  23
  24#include <asm/iommu.h>
  25#include <asm/tce.h>
  26#include <asm/mmu_context.h>
  27
  28#define DRIVER_VERSION  "0.1"
  29#define DRIVER_AUTHOR   "aik@ozlabs.ru"
  30#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
  31
  32static void tce_iommu_detach_group(void *iommu_data,
  33		struct iommu_group *iommu_group);
  34
  35/*
  36 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  37 *
  38 * This code handles mapping and unmapping of user data buffers
  39 * into DMA'ble space using the IOMMU
  40 */
  41
  42struct tce_iommu_group {
  43	struct list_head next;
  44	struct iommu_group *grp;
  45};
  46
  47/*
   48 * A container needs to remember which preregistered region it has
   49 * referenced to do proper cleanup when the userspace process exits.
  50 */
  51struct tce_iommu_prereg {
  52	struct list_head next;
  53	struct mm_iommu_table_group_mem_t *mem;
  54};
  55
  56/*
   57 * The container descriptor supports only a single group per container.
   58 * This is required by the API as the container is not supplied with the
   59 * IOMMU group at the moment of initialization.
  60 */
  61struct tce_container {
  62	struct mutex lock;
  63	bool enabled;
  64	bool v2;
  65	bool def_window_pending;
  66	unsigned long locked_pages;
  67	struct mm_struct *mm;
  68	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
  69	struct list_head group_list;
  70	struct list_head prereg_list;
  71};
  72
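/*
 * Bind the container to the calling process's mm on first use and grab a
 * reference on it; any later call from a different mm fails with -EPERM.
 */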
  73static long tce_iommu_mm_set(struct tce_container *container)
  74{
  75	if (container->mm) {
  76		if (container->mm == current->mm)
  77			return 0;
  78		return -EPERM;
  79	}
  80	BUG_ON(!current->mm);
  81	container->mm = current->mm;
  82	atomic_inc(&container->mm->mm_count);
  83
  84	return 0;
  85}
  86
  87static long tce_iommu_prereg_free(struct tce_container *container,
  88		struct tce_iommu_prereg *tcemem)
  89{
  90	long ret;
  91
  92	ret = mm_iommu_put(container->mm, tcemem->mem);
  93	if (ret)
  94		return ret;
  95
  96	list_del(&tcemem->next);
  97	kfree(tcemem);
  98
  99	return 0;
 100}
 101
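/*
 * Memory preregistration helpers backing VFIO_IOMMU_SPAPR_REGISTER_MEMORY
 * and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: regions are pinned once through
 * the mm_iommu_* API and tracked on prereg_list until they are unregistered
 * or the container is released.
 */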
 102static long tce_iommu_unregister_pages(struct tce_container *container,
 103		__u64 vaddr, __u64 size)
 104{
 105	struct mm_iommu_table_group_mem_t *mem;
 106	struct tce_iommu_prereg *tcemem;
 107	bool found = false;
 108	long ret;
 109
 110	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 111		return -EINVAL;
 112
 113	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 114	if (!mem)
 115		return -ENOENT;
 116
 117	list_for_each_entry(tcemem, &container->prereg_list, next) {
 118		if (tcemem->mem == mem) {
 119			found = true;
 120			break;
 121		}
 122	}
 123
 124	if (!found)
 125		ret = -ENOENT;
 126	else
 127		ret = tce_iommu_prereg_free(container, tcemem);
 128
 129	mm_iommu_put(container->mm, mem);
 130
 131	return ret;
 132}
 133
 134static long tce_iommu_register_pages(struct tce_container *container,
 135		__u64 vaddr, __u64 size)
 136{
 137	long ret = 0;
 138	struct mm_iommu_table_group_mem_t *mem = NULL;
 139	struct tce_iommu_prereg *tcemem;
 140	unsigned long entries = size >> PAGE_SHIFT;
 141
 142	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 143			((vaddr + size) < vaddr))
 144		return -EINVAL;
 145
 146	mem = mm_iommu_get(container->mm, vaddr, entries);
 147	if (mem) {
 148		list_for_each_entry(tcemem, &container->prereg_list, next) {
 149			if (tcemem->mem == mem) {
 150				ret = -EBUSY;
 151				goto put_exit;
 152			}
 153		}
 154	} else {
 155		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
 156		if (ret)
 157			return ret;
 158	}
 159
 160	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 161	if (!tcemem) {
 162		ret = -ENOMEM;
 163		goto put_exit;
 164	}
 165
 166	tcemem->mem = mem;
 167	list_add(&tcemem->next, &container->prereg_list);
 168
 169	container->enabled = true;
 170
 171	return 0;
 172
 173put_exit:
 174	mm_iommu_put(container->mm, mem);
 175	return ret;
 176}
 177
 178static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
 179		unsigned int it_page_shift)
 180{
 181	struct page *page;
 182	unsigned long size = 0;
 183
 184	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
 185		return size == (1UL << it_page_shift);
 186
 187	page = pfn_to_page(hpa >> PAGE_SHIFT);
 188	/*
 189	 * Check that the TCE table granularity is not bigger than the size of
 190	 * a page we just found. Otherwise the hardware can get access to
  191	 * a bigger memory chunk than it should.
 192	 */
 193	return page_shift(compound_head(page)) >= it_page_shift;
 194}
 195
 196static inline bool tce_groups_attached(struct tce_container *container)
 197{
 198	return !list_empty(&container->group_list);
 199}
 200
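/*
 * Find the table whose DMA window covers @ioba; return its index in
 * container->tables[] and store the table in @ptbl, or -1 if no window
 * contains the address.
 */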
 201static long tce_iommu_find_table(struct tce_container *container,
 202		phys_addr_t ioba, struct iommu_table **ptbl)
 203{
 204	long i;
 205
 206	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 207		struct iommu_table *tbl = container->tables[i];
 208
 209		if (tbl) {
 210			unsigned long entry = ioba >> tbl->it_page_shift;
 211			unsigned long start = tbl->it_offset;
 212			unsigned long end = start + tbl->it_size;
 213
 214			if ((start <= entry) && (entry < end)) {
 215				*ptbl = tbl;
 216				return i;
 217			}
 218		}
 219	}
 220
 221	return -1;
 222}
 223
 224static int tce_iommu_find_free_table(struct tce_container *container)
 225{
 226	int i;
 227
 228	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 229		if (!container->tables[i])
 230			return i;
 231	}
 232
 233	return -ENOSPC;
 234}
 235
 236static int tce_iommu_enable(struct tce_container *container)
 237{
 238	int ret = 0;
 239	unsigned long locked;
 240	struct iommu_table_group *table_group;
 241	struct tce_iommu_group *tcegrp;
 242
 243	if (container->enabled)
 244		return -EBUSY;
 245
 246	/*
 247	 * When userspace pages are mapped into the IOMMU, they are effectively
 248	 * locked memory, so, theoretically, we need to update the accounting
  249	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
 250	 * paths can be very hot, though, and the accounting would kill
 251	 * performance, especially since it would be difficult to impossible
 252	 * to handle the accounting in real mode only.
 253	 *
 254	 * To address that, rather than precisely accounting every page, we
 255	 * instead account for a worst case on locked memory when the iommu is
 256	 * enabled and disabled.  The worst case upper bound on locked memory
 257	 * is the size of the whole iommu window, which is usually relatively
 258	 * small (compared to total memory sizes) on POWER hardware.
 259	 *
  260	 * Also, we don't have a nice way to fail on H_PUT_TCE due to ulimits;
  261	 * that would effectively kill the guest at random points. It is much
  262	 * better to enforce the limit based on the max that the guest can map.
  263	 *
  264	 * Unfortunately, at the moment this counts whole tables, no matter how
  265	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
  266	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
 267	 * this is that we cannot tell here the amount of RAM used by the guest
 268	 * as this information is only available from KVM and VFIO is
 269	 * KVM agnostic.
 270	 *
 271	 * So we do not allow enabling a container without a group attached
 272	 * as there is no way to know how much we should increment
 273	 * the locked_vm counter.
 274	 */
 275	if (!tce_groups_attached(container))
 276		return -ENODEV;
 277
 278	tcegrp = list_first_entry(&container->group_list,
 279			struct tce_iommu_group, next);
 280	table_group = iommu_group_get_iommudata(tcegrp->grp);
 281	if (!table_group)
 282		return -ENODEV;
 283
 284	if (!table_group->tce32_size)
 285		return -EPERM;
 286
 287	ret = tce_iommu_mm_set(container);
 288	if (ret)
 289		return ret;
 290
 291	locked = table_group->tce32_size >> PAGE_SHIFT;
 292	ret = account_locked_vm(container->mm, locked, true);
 293	if (ret)
 294		return ret;
 295
 296	container->locked_pages = locked;
 297
 298	container->enabled = true;
 299
 300	return ret;
 301}
 302
 303static void tce_iommu_disable(struct tce_container *container)
 304{
 305	if (!container->enabled)
 306		return;
 307
 308	container->enabled = false;
 309
 310	BUG_ON(!container->mm);
 311	account_locked_vm(container->mm, container->locked_pages, false);
 312}
 313
 314static void *tce_iommu_open(unsigned long arg)
 315{
 316	struct tce_container *container;
 317
 318	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 319		pr_err("tce_vfio: Wrong IOMMU type\n");
 320		return ERR_PTR(-EINVAL);
 321	}
 322
 323	container = kzalloc(sizeof(*container), GFP_KERNEL);
 324	if (!container)
 325		return ERR_PTR(-ENOMEM);
 326
 327	mutex_init(&container->lock);
 328	INIT_LIST_HEAD_RCU(&container->group_list);
 329	INIT_LIST_HEAD_RCU(&container->prereg_list);
 330
 331	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 332
 333	return container;
 334}
 335
 336static int tce_iommu_clear(struct tce_container *container,
 337		struct iommu_table *tbl,
 338		unsigned long entry, unsigned long pages);
 339static void tce_iommu_free_table(struct tce_container *container,
 340		struct iommu_table *tbl);
 341
 342static void tce_iommu_release(void *iommu_data)
 343{
 344	struct tce_container *container = iommu_data;
 345	struct tce_iommu_group *tcegrp;
 346	struct tce_iommu_prereg *tcemem, *tmtmp;
 347	long i;
 348
 349	while (tce_groups_attached(container)) {
 350		tcegrp = list_first_entry(&container->group_list,
 351				struct tce_iommu_group, next);
 352		tce_iommu_detach_group(iommu_data, tcegrp->grp);
 353	}
 354
 355	/*
  356	 * If VFIO created a table, it was not disposed of
  357	 * by tce_iommu_detach_group(), so do it now.
 358	 */
 359	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 360		struct iommu_table *tbl = container->tables[i];
 361
 362		if (!tbl)
 363			continue;
 364
 365		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 366		tce_iommu_free_table(container, tbl);
 367	}
 368
 369	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
 370		WARN_ON(tce_iommu_prereg_free(container, tcemem));
 371
 372	tce_iommu_disable(container);
 373	if (container->mm)
 374		mmdrop(container->mm);
 375	mutex_destroy(&container->lock);
 376
 377	kfree(container);
 378}
 379
 380static void tce_iommu_unuse_page(struct tce_container *container,
 381		unsigned long hpa)
 382{
 383	struct page *page;
 384
 385	page = pfn_to_page(hpa >> PAGE_SHIFT);
 386	put_page(page);
 387}
 388
 389static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
 390		unsigned long tce, unsigned long shift,
 391		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 392{
 393	long ret = 0;
 394	struct mm_iommu_table_group_mem_t *mem;
 395
 396	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
 397	if (!mem)
 398		return -EINVAL;
 399
 400	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
 401	if (ret)
 402		return -EINVAL;
 403
 404	*pmem = mem;
 405
 406	return 0;
 407}
 408
 409static void tce_iommu_unuse_page_v2(struct tce_container *container,
 410		struct iommu_table *tbl, unsigned long entry)
 411{
 412	struct mm_iommu_table_group_mem_t *mem = NULL;
 413	int ret;
 414	unsigned long hpa = 0;
 415	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 416
 417	if (!pua)
 418		return;
 419
 420	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
 421			tbl->it_page_shift, &hpa, &mem);
 422	if (ret)
 423		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
 424				__func__, be64_to_cpu(*pua), entry, ret);
 425	if (mem)
 426		mm_iommu_mapped_dec(mem);
 427
 428	*pua = cpu_to_be64(0);
 429}
 430
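/*
 * Clear @pages TCE entries starting at @entry: exchange each entry with an
 * empty one, drop the reference on the page (v1) or the preregistered
 * region (v2) it pointed to, then invalidate the whole range once.
 */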
 431static int tce_iommu_clear(struct tce_container *container,
 432		struct iommu_table *tbl,
 433		unsigned long entry, unsigned long pages)
 434{
 435	unsigned long oldhpa;
 436	long ret;
 437	enum dma_data_direction direction;
 438	unsigned long lastentry = entry + pages, firstentry = entry;
 439
 440	for ( ; entry < lastentry; ++entry) {
 441		if (tbl->it_indirect_levels && tbl->it_userspace) {
 442			/*
 443			 * For multilevel tables, we can take a shortcut here
 444			 * and skip some TCEs as we know that the userspace
  445			 * address cache is a mirror of the real TCE table
 446			 * and if it is missing some indirect levels, then
 447			 * the hardware table does not have them allocated
 448			 * either and therefore does not require updating.
 449			 */
 450			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
 451					entry);
 452			if (!pua) {
  453				/* align to level_size which is a power of two */
 454				entry |= tbl->it_level_size - 1;
 455				continue;
 456			}
 457		}
 458
 459		cond_resched();
 460
 461		direction = DMA_NONE;
 462		oldhpa = 0;
 463		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
 464				&direction);
 465		if (ret)
 466			continue;
 467
 468		if (direction == DMA_NONE)
 469			continue;
 470
 471		if (container->v2) {
 472			tce_iommu_unuse_page_v2(container, tbl, entry);
 473			continue;
 474		}
 475
 476		tce_iommu_unuse_page(container, oldhpa);
 477	}
 478
 479	iommu_tce_kill(tbl, firstentry, pages);
 480
 481	return 0;
 482}
 483
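/* Pin the userspace page backing @tce and return its host physical address. */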
 484static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 485{
 486	struct page *page = NULL;
 487	enum dma_data_direction direction = iommu_tce_direction(tce);
 488
 489	if (get_user_pages_fast(tce & PAGE_MASK, 1,
 490			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
 491			&page) != 1)
 492		return -EFAULT;
 493
 494	*hpa = __pa((unsigned long) page_address(page));
 495
 496	return 0;
 497}
 498
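/*
 * v1 mapping path: pin each userspace page with tce_iommu_use_page() and
 * program the corresponding TCE; on error, entries programmed so far are
 * cleared again.
 */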
 499static long tce_iommu_build(struct tce_container *container,
 500		struct iommu_table *tbl,
 501		unsigned long entry, unsigned long tce, unsigned long pages,
 502		enum dma_data_direction direction)
 503{
 504	long i, ret = 0;
 505	unsigned long hpa;
 506	enum dma_data_direction dirtmp;
 507
 508	for (i = 0; i < pages; ++i) {
 509		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 510
 511		ret = tce_iommu_use_page(tce, &hpa);
 512		if (ret)
 513			break;
 514
 515		if (!tce_page_is_contained(container->mm, hpa,
 516				tbl->it_page_shift)) {
 517			ret = -EPERM;
 518			break;
 519		}
 520
 521		hpa |= offset;
 522		dirtmp = direction;
 523		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 524				&hpa, &dirtmp);
 525		if (ret) {
 526			tce_iommu_unuse_page(container, hpa);
 527			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 528					__func__, entry << tbl->it_page_shift,
 529					tce, ret);
 530			break;
 531		}
 532
 533		if (dirtmp != DMA_NONE)
 534			tce_iommu_unuse_page(container, hpa);
 535
 536		tce += IOMMU_PAGE_SIZE(tbl);
 537	}
 538
 539	if (ret)
 540		tce_iommu_clear(container, tbl, entry, i);
 541	else
 542		iommu_tce_kill(tbl, entry, pages);
 543
 544	return ret;
 545}
 546
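/*
 * v2 mapping path: translate each userspace address through the
 * preregistered regions (no pinning here), program the TCE and record the
 * userspace address in the table's userspace view for later unmapping.
 */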
 547static long tce_iommu_build_v2(struct tce_container *container,
 548		struct iommu_table *tbl,
 549		unsigned long entry, unsigned long tce, unsigned long pages,
 550		enum dma_data_direction direction)
 551{
 552	long i, ret = 0;
 553	unsigned long hpa;
 554	enum dma_data_direction dirtmp;
 555
 556	for (i = 0; i < pages; ++i) {
 557		struct mm_iommu_table_group_mem_t *mem = NULL;
 558		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
 559
 560		ret = tce_iommu_prereg_ua_to_hpa(container,
 561				tce, tbl->it_page_shift, &hpa, &mem);
 562		if (ret)
 563			break;
 564
 565		if (!tce_page_is_contained(container->mm, hpa,
 566				tbl->it_page_shift)) {
 567			ret = -EPERM;
 568			break;
 569		}
 570
 571		/* Preserve offset within IOMMU page */
 572		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 573		dirtmp = direction;
 574
 575		/* The registered region is being unregistered */
 576		if (mm_iommu_mapped_inc(mem))
 577			break;
 578
 579		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 580				&hpa, &dirtmp);
 581		if (ret) {
 582			/* dirtmp cannot be DMA_NONE here */
 583			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 584			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 585					__func__, entry << tbl->it_page_shift,
 586					tce, ret);
 587			break;
 588		}
 589
 590		if (dirtmp != DMA_NONE)
 591			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 592
 593		*pua = cpu_to_be64(tce);
 594
 595		tce += IOMMU_PAGE_SIZE(tbl);
 596	}
 597
 598	if (ret)
 599		tce_iommu_clear(container, tbl, entry, i);
 600	else
 601		iommu_tce_kill(tbl, entry, pages);
 602
 603	return ret;
 604}
 605
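/*
 * Allocate a TCE table through the platform ops, charging its backing
 * memory against the owning mm's locked_vm limit.
 */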
 606static long tce_iommu_create_table(struct tce_container *container,
 607			struct iommu_table_group *table_group,
 608			int num,
 609			__u32 page_shift,
 610			__u64 window_size,
 611			__u32 levels,
 612			struct iommu_table **ptbl)
 613{
 614	long ret, table_size;
 615
 616	table_size = table_group->ops->get_table_size(page_shift, window_size,
 617			levels);
 618	if (!table_size)
 619		return -EINVAL;
 620
 621	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 622	if (ret)
 623		return ret;
 624
 625	ret = table_group->ops->create_table(table_group, num,
 626			page_shift, window_size, levels, ptbl);
 627
 628	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 629	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
 630
 631	return ret;
 632}
 633
 634static void tce_iommu_free_table(struct tce_container *container,
 635		struct iommu_table *tbl)
 636{
 637	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 638
 639	iommu_tce_table_put(tbl);
 640	account_locked_vm(container->mm, pages, false);
 641}
 642
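/*
 * Create a DMA window (VFIO_IOMMU_SPAPR_TCE_CREATE): allocate a table in a
 * free slot and program it into every attached group; the start address
 * chosen by the platform is returned to userspace.
 */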
 643static long tce_iommu_create_window(struct tce_container *container,
 644		__u32 page_shift, __u64 window_size, __u32 levels,
 645		__u64 *start_addr)
 646{
 647	struct tce_iommu_group *tcegrp;
 648	struct iommu_table_group *table_group;
 649	struct iommu_table *tbl = NULL;
 650	long ret, num;
 651
 652	num = tce_iommu_find_free_table(container);
 653	if (num < 0)
 654		return num;
 655
 656	/* Get the first group for ops::create_table */
 657	tcegrp = list_first_entry(&container->group_list,
 658			struct tce_iommu_group, next);
 659	table_group = iommu_group_get_iommudata(tcegrp->grp);
 660	if (!table_group)
 661		return -EFAULT;
 662
 663	if (!(table_group->pgsizes & (1ULL << page_shift)))
 664		return -EINVAL;
 665
 666	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
 667			!table_group->ops->get_table_size ||
 668			!table_group->ops->create_table)
 669		return -EPERM;
 670
 671	/* Create TCE table */
 672	ret = tce_iommu_create_table(container, table_group, num,
 673			page_shift, window_size, levels, &tbl);
 674	if (ret)
 675		return ret;
 676
 677	BUG_ON(!tbl->it_ops->free);
 678
 679	/*
 680	 * Program the table to every group.
  681	 * Groups have been tested for compatibility at attach time.
 682	 */
 683	list_for_each_entry(tcegrp, &container->group_list, next) {
 684		table_group = iommu_group_get_iommudata(tcegrp->grp);
 685
 686		ret = table_group->ops->set_window(table_group, num, tbl);
 687		if (ret)
 688			goto unset_exit;
 689	}
 690
 691	container->tables[num] = tbl;
 692
 693	/* Return start address assigned by platform in create_table() */
 694	*start_addr = tbl->it_offset << tbl->it_page_shift;
 695
 696	return 0;
 697
 698unset_exit:
 699	list_for_each_entry(tcegrp, &container->group_list, next) {
 700		table_group = iommu_group_get_iommudata(tcegrp->grp);
 701		table_group->ops->unset_window(table_group, num);
 702	}
 703	tce_iommu_free_table(container, tbl);
 704
 705	return ret;
 706}
 707
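/*
 * Remove the DMA window starting at @start_addr (VFIO_IOMMU_SPAPR_TCE_REMOVE):
 * unset it from every attached group, clear its entries and free the table.
 */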
 708static long tce_iommu_remove_window(struct tce_container *container,
 709		__u64 start_addr)
 710{
 711	struct iommu_table_group *table_group = NULL;
 712	struct iommu_table *tbl;
 713	struct tce_iommu_group *tcegrp;
 714	int num;
 715
 716	num = tce_iommu_find_table(container, start_addr, &tbl);
 717	if (num < 0)
 718		return -EINVAL;
 719
 720	BUG_ON(!tbl->it_size);
 721
 722	/* Detach groups from IOMMUs */
 723	list_for_each_entry(tcegrp, &container->group_list, next) {
 724		table_group = iommu_group_get_iommudata(tcegrp->grp);
 725
 726		/*
 727		 * SPAPR TCE IOMMU exposes the default DMA window to
 728		 * the guest via dma32_window_start/size of
 729		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
  730		 * the userspace to remove this window, some do not, so
 731		 * here we check for the platform capability.
 732		 */
 733		if (!table_group->ops || !table_group->ops->unset_window)
 734			return -EPERM;
 735
 736		table_group->ops->unset_window(table_group, num);
 737	}
 738
 739	/* Free table */
 740	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 741	tce_iommu_free_table(container, tbl);
 742	container->tables[num] = NULL;
 743
 744	return 0;
 745}
 746
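/*
 * Create the default 32-bit DMA window if one was marked as pending at
 * group attach time. Creation is deferred so that userspace which removes
 * the default window straight away (see VFIO_IOMMU_SPAPR_TCE_REMOVE) never
 * pays for creating it.
 */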
 747static long tce_iommu_create_default_window(struct tce_container *container)
 748{
 749	long ret;
 750	__u64 start_addr = 0;
 751	struct tce_iommu_group *tcegrp;
 752	struct iommu_table_group *table_group;
 753
 754	if (!container->def_window_pending)
 755		return 0;
 756
 757	if (!tce_groups_attached(container))
 758		return -ENODEV;
 759
 760	tcegrp = list_first_entry(&container->group_list,
 761			struct tce_iommu_group, next);
 762	table_group = iommu_group_get_iommudata(tcegrp->grp);
 763	if (!table_group)
 764		return -ENODEV;
 765
 766	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
 767			table_group->tce32_size, 1, &start_addr);
 768	WARN_ON_ONCE(!ret && start_addr);
 769
 770	if (!ret)
 771		container->def_window_pending = false;
 772
 773	return ret;
 774}
 775
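/*
 * Main ioctl handler. A rough sketch of a typical v1 userspace sequence
 * (illustrative only, error handling and device setup omitted):
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	group = open("/dev/vfio/<group>", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_ENABLE);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
 */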
 776static long tce_iommu_ioctl(void *iommu_data,
 777				 unsigned int cmd, unsigned long arg)
 778{
 779	struct tce_container *container = iommu_data;
 780	unsigned long minsz, ddwsz;
 781	long ret;
 782
 783	switch (cmd) {
 784	case VFIO_CHECK_EXTENSION:
 785		switch (arg) {
 786		case VFIO_SPAPR_TCE_IOMMU:
 787		case VFIO_SPAPR_TCE_v2_IOMMU:
 788			ret = 1;
 789			break;
 790		default:
 791			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
 792			break;
 793		}
 794
 795		return (ret < 0) ? 0 : ret;
 796	}
 797
 798	/*
 799	 * Sanity check to prevent one userspace from manipulating
 800	 * another userspace mm.
 801	 */
 802	BUG_ON(!container);
 803	if (container->mm && container->mm != current->mm)
 804		return -EPERM;
 805
 806	switch (cmd) {
 807	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 808		struct vfio_iommu_spapr_tce_info info;
 809		struct tce_iommu_group *tcegrp;
 810		struct iommu_table_group *table_group;
 811
 812		if (!tce_groups_attached(container))
 813			return -ENXIO;
 814
 815		tcegrp = list_first_entry(&container->group_list,
 816				struct tce_iommu_group, next);
 817		table_group = iommu_group_get_iommudata(tcegrp->grp);
 818
 819		if (!table_group)
 820			return -ENXIO;
 821
 822		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
 823				dma32_window_size);
 824
 825		if (copy_from_user(&info, (void __user *)arg, minsz))
 826			return -EFAULT;
 827
 828		if (info.argsz < minsz)
 829			return -EINVAL;
 830
 831		info.dma32_window_start = table_group->tce32_start;
 832		info.dma32_window_size = table_group->tce32_size;
 833		info.flags = 0;
 834		memset(&info.ddw, 0, sizeof(info.ddw));
 835
 836		if (table_group->max_dynamic_windows_supported &&
 837				container->v2) {
 838			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
 839			info.ddw.pgsizes = table_group->pgsizes;
 840			info.ddw.max_dynamic_windows_supported =
 841				table_group->max_dynamic_windows_supported;
 842			info.ddw.levels = table_group->max_levels;
 843		}
 844
 845		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
 846
 847		if (info.argsz >= ddwsz)
 848			minsz = ddwsz;
 849
 850		if (copy_to_user((void __user *)arg, &info, minsz))
 851			return -EFAULT;
 852
 853		return 0;
 854	}
 855	case VFIO_IOMMU_MAP_DMA: {
 856		struct vfio_iommu_type1_dma_map param;
 857		struct iommu_table *tbl = NULL;
 858		long num;
 859		enum dma_data_direction direction;
 860
 861		if (!container->enabled)
 862			return -EPERM;
 863
 864		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 865
 866		if (copy_from_user(&param, (void __user *)arg, minsz))
 867			return -EFAULT;
 868
 869		if (param.argsz < minsz)
 870			return -EINVAL;
 871
 872		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
 873				VFIO_DMA_MAP_FLAG_WRITE))
 874			return -EINVAL;
 875
 876		ret = tce_iommu_create_default_window(container);
 877		if (ret)
 878			return ret;
 879
 880		num = tce_iommu_find_table(container, param.iova, &tbl);
 881		if (num < 0)
 882			return -ENXIO;
 883
 884		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 885				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 886			return -EINVAL;
 887
 888		/* iova is checked by the IOMMU API */
 889		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
 890			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 891				direction = DMA_BIDIRECTIONAL;
 892			else
 893				direction = DMA_TO_DEVICE;
 894		} else {
 895			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 896				direction = DMA_FROM_DEVICE;
 897			else
 898				return -EINVAL;
 899		}
 900
 901		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 902		if (ret)
 903			return ret;
 904
 905		if (container->v2)
 906			ret = tce_iommu_build_v2(container, tbl,
 907					param.iova >> tbl->it_page_shift,
 908					param.vaddr,
 909					param.size >> tbl->it_page_shift,
 910					direction);
 911		else
 912			ret = tce_iommu_build(container, tbl,
 913					param.iova >> tbl->it_page_shift,
 914					param.vaddr,
 915					param.size >> tbl->it_page_shift,
 916					direction);
 917
 918		iommu_flush_tce(tbl);
 919
 920		return ret;
 921	}
 922	case VFIO_IOMMU_UNMAP_DMA: {
 923		struct vfio_iommu_type1_dma_unmap param;
 924		struct iommu_table *tbl = NULL;
 925		long num;
 926
 927		if (!container->enabled)
 928			return -EPERM;
 929
 930		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 931				size);
 932
 933		if (copy_from_user(&param, (void __user *)arg, minsz))
 934			return -EFAULT;
 935
 936		if (param.argsz < minsz)
 937			return -EINVAL;
 938
 939		/* No flag is supported now */
 940		if (param.flags)
 941			return -EINVAL;
 942
 943		ret = tce_iommu_create_default_window(container);
 944		if (ret)
 945			return ret;
 946
 947		num = tce_iommu_find_table(container, param.iova, &tbl);
 948		if (num < 0)
 949			return -ENXIO;
 950
 951		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 952			return -EINVAL;
 953
 954		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
 955				param.size >> tbl->it_page_shift);
 956		if (ret)
 957			return ret;
 958
 959		ret = tce_iommu_clear(container, tbl,
 960				param.iova >> tbl->it_page_shift,
 961				param.size >> tbl->it_page_shift);
 962		iommu_flush_tce(tbl);
 963
 964		return ret;
 965	}
 966	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
 967		struct vfio_iommu_spapr_register_memory param;
 968
 969		if (!container->v2)
 970			break;
 971
 972		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
 973				size);
 974
 975		ret = tce_iommu_mm_set(container);
 976		if (ret)
 977			return ret;
 978
 979		if (copy_from_user(&param, (void __user *)arg, minsz))
 980			return -EFAULT;
 981
 982		if (param.argsz < minsz)
 983			return -EINVAL;
 984
 985		/* No flag is supported now */
 986		if (param.flags)
 987			return -EINVAL;
 988
 989		mutex_lock(&container->lock);
 990		ret = tce_iommu_register_pages(container, param.vaddr,
 991				param.size);
 992		mutex_unlock(&container->lock);
 993
 994		return ret;
 995	}
 996	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
 997		struct vfio_iommu_spapr_register_memory param;
 998
 999		if (!container->v2)
1000			break;
1001
1002		if (!container->mm)
1003			return -EPERM;
1004
1005		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1006				size);
1007
1008		if (copy_from_user(&param, (void __user *)arg, minsz))
1009			return -EFAULT;
1010
1011		if (param.argsz < minsz)
1012			return -EINVAL;
1013
1014		/* No flag is supported now */
1015		if (param.flags)
1016			return -EINVAL;
1017
1018		mutex_lock(&container->lock);
1019		ret = tce_iommu_unregister_pages(container, param.vaddr,
1020				param.size);
1021		mutex_unlock(&container->lock);
1022
1023		return ret;
1024	}
1025	case VFIO_IOMMU_ENABLE:
1026		if (container->v2)
1027			break;
1028
1029		mutex_lock(&container->lock);
1030		ret = tce_iommu_enable(container);
1031		mutex_unlock(&container->lock);
1032		return ret;
1033
1034
1035	case VFIO_IOMMU_DISABLE:
1036		if (container->v2)
1037			break;
1038
1039		mutex_lock(&container->lock);
1040		tce_iommu_disable(container);
1041		mutex_unlock(&container->lock);
1042		return 0;
1043
1044	case VFIO_EEH_PE_OP: {
1045		struct tce_iommu_group *tcegrp;
1046
1047		ret = 0;
1048		list_for_each_entry(tcegrp, &container->group_list, next) {
1049			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
1050					cmd, arg);
1051			if (ret)
1052				return ret;
1053		}
1054		return ret;
1055	}
1056
1057	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1058		struct vfio_iommu_spapr_tce_create create;
1059
1060		if (!container->v2)
1061			break;
1062
1063		ret = tce_iommu_mm_set(container);
1064		if (ret)
1065			return ret;
1066
1067		if (!tce_groups_attached(container))
1068			return -ENXIO;
1069
1070		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1071				start_addr);
1072
1073		if (copy_from_user(&create, (void __user *)arg, minsz))
1074			return -EFAULT;
1075
1076		if (create.argsz < minsz)
1077			return -EINVAL;
1078
1079		if (create.flags)
1080			return -EINVAL;
1081
1082		mutex_lock(&container->lock);
1083
1084		ret = tce_iommu_create_default_window(container);
1085		if (!ret)
1086			ret = tce_iommu_create_window(container,
1087					create.page_shift,
1088					create.window_size, create.levels,
1089					&create.start_addr);
1090
1091		mutex_unlock(&container->lock);
1092
1093		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1094			ret = -EFAULT;
1095
1096		return ret;
1097	}
1098	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1099		struct vfio_iommu_spapr_tce_remove remove;
1100
1101		if (!container->v2)
1102			break;
1103
1104		ret = tce_iommu_mm_set(container);
1105		if (ret)
1106			return ret;
1107
1108		if (!tce_groups_attached(container))
1109			return -ENXIO;
1110
1111		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1112				start_addr);
1113
1114		if (copy_from_user(&remove, (void __user *)arg, minsz))
1115			return -EFAULT;
1116
1117		if (remove.argsz < minsz)
1118			return -EINVAL;
1119
1120		if (remove.flags)
1121			return -EINVAL;
1122
1123		if (container->def_window_pending && !remove.start_addr) {
1124			container->def_window_pending = false;
1125			return 0;
1126		}
1127
1128		mutex_lock(&container->lock);
1129
1130		ret = tce_iommu_remove_window(container, remove.start_addr);
1131
1132		mutex_unlock(&container->lock);
1133
1134		return ret;
1135	}
1136	}
1137
1138	return -ENOTTY;
1139}
1140
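/*
 * Ownership helpers for groups without dynamic DMA window support (v1):
 * exclusive control of the group's preexisting tables is taken from, or
 * handed back to, the platform code.
 */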
1141static void tce_iommu_release_ownership(struct tce_container *container,
1142		struct iommu_table_group *table_group)
1143{
1144	int i;
1145
1146	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1147		struct iommu_table *tbl = container->tables[i];
1148
1149		if (!tbl)
1150			continue;
1151
1152		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
1153		if (tbl->it_map)
1154			iommu_release_ownership(tbl);
1155
1156		container->tables[i] = NULL;
1157	}
1158}
1159
1160static int tce_iommu_take_ownership(struct tce_container *container,
1161		struct iommu_table_group *table_group)
1162{
1163	int i, j, rc = 0;
1164
1165	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1166		struct iommu_table *tbl = table_group->tables[i];
1167
1168		if (!tbl || !tbl->it_map)
1169			continue;
1170
1171		rc = iommu_take_ownership(tbl);
1172		if (rc) {
1173			for (j = 0; j < i; ++j)
1174				iommu_release_ownership(
1175						table_group->tables[j]);
1176
1177			return rc;
1178		}
1179	}
1180
1181	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1182		container->tables[i] = table_group->tables[i];
1183
1184	return 0;
1185}
1186
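/*
 * Ownership helpers for groups with dynamic DMA window support (v2): the
 * platform gives up its tables and the container's own windows are
 * programmed into (or removed from) the group instead.
 */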
1187static void tce_iommu_release_ownership_ddw(struct tce_container *container,
1188		struct iommu_table_group *table_group)
1189{
1190	long i;
1191
1192	if (!table_group->ops->unset_window) {
1193		WARN_ON_ONCE(1);
1194		return;
1195	}
1196
1197	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1198		if (container->tables[i])
1199			table_group->ops->unset_window(table_group, i);
1200
1201	table_group->ops->release_ownership(table_group);
1202}
1203
1204static long tce_iommu_take_ownership_ddw(struct tce_container *container,
1205		struct iommu_table_group *table_group)
1206{
1207	long i, ret = 0;
1208
1209	if (!table_group->ops->create_table || !table_group->ops->set_window ||
1210			!table_group->ops->release_ownership) {
1211		WARN_ON_ONCE(1);
1212		return -EFAULT;
1213	}
1214
1215	table_group->ops->take_ownership(table_group);
1216
 1217	/* Program all container windows into the new group */
1218	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1219		struct iommu_table *tbl = container->tables[i];
1220
1221		if (!tbl)
1222			continue;
1223
1224		ret = table_group->ops->set_window(table_group, i, tbl);
1225		if (ret)
1226			goto release_exit;
1227	}
1228
1229	return 0;
1230
1231release_exit:
1232	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1233		table_group->ops->unset_window(table_group, i);
1234
1235	table_group->ops->release_ownership(table_group);
1236
1237	return ret;
1238}
1239
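/*
 * Attach an IOMMU group to the container. Only groups sharing the same
 * create_table implementation may coexist; groups without the dynamic DMA
 * window ops are accepted by v1 containers only, groups with them by v2
 * containers only.
 */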
1240static int tce_iommu_attach_group(void *iommu_data,
1241		struct iommu_group *iommu_group)
1242{
1243	int ret = 0;
1244	struct tce_container *container = iommu_data;
1245	struct iommu_table_group *table_group;
1246	struct tce_iommu_group *tcegrp = NULL;
1247
1248	mutex_lock(&container->lock);
1249
1250	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1251			iommu_group_id(iommu_group), iommu_group); */
1252	table_group = iommu_group_get_iommudata(iommu_group);
1253	if (!table_group) {
1254		ret = -ENODEV;
1255		goto unlock_exit;
1256	}
1257
1258	if (tce_groups_attached(container) && (!table_group->ops ||
1259			!table_group->ops->take_ownership ||
1260			!table_group->ops->release_ownership)) {
1261		ret = -EBUSY;
1262		goto unlock_exit;
1263	}
1264
1265	/* Check if new group has the same iommu_ops (i.e. compatible) */
1266	list_for_each_entry(tcegrp, &container->group_list, next) {
1267		struct iommu_table_group *table_group_tmp;
1268
1269		if (tcegrp->grp == iommu_group) {
1270			pr_warn("tce_vfio: Group %d is already attached\n",
1271					iommu_group_id(iommu_group));
1272			ret = -EBUSY;
1273			goto unlock_exit;
1274		}
1275		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1276		if (table_group_tmp->ops->create_table !=
1277				table_group->ops->create_table) {
1278			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1279					iommu_group_id(iommu_group),
1280					iommu_group_id(tcegrp->grp));
1281			ret = -EPERM;
1282			goto unlock_exit;
1283		}
1284	}
1285
1286	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1287	if (!tcegrp) {
1288		ret = -ENOMEM;
1289		goto unlock_exit;
1290	}
1291
1292	if (!table_group->ops || !table_group->ops->take_ownership ||
1293			!table_group->ops->release_ownership) {
1294		if (container->v2) {
1295			ret = -EPERM;
1296			goto free_exit;
1297		}
1298		ret = tce_iommu_take_ownership(container, table_group);
1299	} else {
1300		if (!container->v2) {
1301			ret = -EPERM;
1302			goto free_exit;
1303		}
1304		ret = tce_iommu_take_ownership_ddw(container, table_group);
1305		if (!tce_groups_attached(container) && !container->tables[0])
1306			container->def_window_pending = true;
1307	}
1308
1309	if (!ret) {
1310		tcegrp->grp = iommu_group;
1311		list_add(&tcegrp->next, &container->group_list);
1312	}
1313
1314free_exit:
1315	if (ret && tcegrp)
1316		kfree(tcegrp);
1317
1318unlock_exit:
1319	mutex_unlock(&container->lock);
1320
1321	return ret;
1322}
1323
1324static void tce_iommu_detach_group(void *iommu_data,
1325		struct iommu_group *iommu_group)
1326{
1327	struct tce_container *container = iommu_data;
1328	struct iommu_table_group *table_group;
1329	bool found = false;
1330	struct tce_iommu_group *tcegrp;
1331
1332	mutex_lock(&container->lock);
1333
1334	list_for_each_entry(tcegrp, &container->group_list, next) {
1335		if (tcegrp->grp == iommu_group) {
1336			found = true;
1337			break;
1338		}
1339	}
1340
1341	if (!found) {
1342		pr_warn("tce_vfio: detaching unattached group #%u\n",
1343				iommu_group_id(iommu_group));
1344		goto unlock_exit;
1345	}
1346
1347	list_del(&tcegrp->next);
1348	kfree(tcegrp);
1349
1350	table_group = iommu_group_get_iommudata(iommu_group);
1351	BUG_ON(!table_group);
1352
1353	if (!table_group->ops || !table_group->ops->release_ownership)
1354		tce_iommu_release_ownership(container, table_group);
1355	else
1356		tce_iommu_release_ownership_ddw(container, table_group);
1357
1358unlock_exit:
1359	mutex_unlock(&container->lock);
1360}
1361
1362static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1363	.name		= "iommu-vfio-powerpc",
1364	.owner		= THIS_MODULE,
1365	.open		= tce_iommu_open,
1366	.release	= tce_iommu_release,
1367	.ioctl		= tce_iommu_ioctl,
1368	.attach_group	= tce_iommu_attach_group,
1369	.detach_group	= tce_iommu_detach_group,
1370};
1371
1372static int __init tce_iommu_init(void)
1373{
1374	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1375}
1376
1377static void __exit tce_iommu_cleanup(void)
1378{
1379	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1380}
1381
1382module_init(tce_iommu_init);
1383module_exit(tce_iommu_cleanup);
1384
1385MODULE_VERSION(DRIVER_VERSION);
1386MODULE_LICENSE("GPL v2");
1387MODULE_AUTHOR(DRIVER_AUTHOR);
1388MODULE_DESCRIPTION(DRIVER_DESC);
1389
v5.14.15
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO: IOMMU DMA mapping support for TCE on POWER
   4 *
   5 * Copyright (C) 2013 IBM Corp.  All rights reserved.
   6 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
   7 *
   8 * Derived from original vfio_iommu_type1.c:
   9 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  10 *     Author: Alex Williamson <alex.williamson@redhat.com>
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/pci.h>
  15#include <linux/slab.h>
  16#include <linux/uaccess.h>
  17#include <linux/err.h>
  18#include <linux/vfio.h>
  19#include <linux/vmalloc.h>
  20#include <linux/sched/mm.h>
  21#include <linux/sched/signal.h>
  22#include <linux/mm.h>
  23
  24#include <asm/iommu.h>
  25#include <asm/tce.h>
  26#include <asm/mmu_context.h>
  27
  28#define DRIVER_VERSION  "0.1"
  29#define DRIVER_AUTHOR   "aik@ozlabs.ru"
  30#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
  31
  32static void tce_iommu_detach_group(void *iommu_data,
  33		struct iommu_group *iommu_group);
  34
  35/*
  36 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  37 *
  38 * This code handles mapping and unmapping of user data buffers
  39 * into DMA'ble space using the IOMMU
  40 */
  41
  42struct tce_iommu_group {
  43	struct list_head next;
  44	struct iommu_group *grp;
  45};
  46
  47/*
  48 * A container needs to remember which preregistered region  it has
  49 * referenced to do proper cleanup at the userspace process exit.
  50 */
  51struct tce_iommu_prereg {
  52	struct list_head next;
  53	struct mm_iommu_table_group_mem_t *mem;
  54};
  55
  56/*
  57 * The container descriptor supports only a single group per container.
  58 * Required by the API as the container is not supplied with the IOMMU group
  59 * at the moment of initialization.
  60 */
  61struct tce_container {
  62	struct mutex lock;
  63	bool enabled;
  64	bool v2;
  65	bool def_window_pending;
  66	unsigned long locked_pages;
  67	struct mm_struct *mm;
  68	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
  69	struct list_head group_list;
  70	struct list_head prereg_list;
  71};
  72
  73static long tce_iommu_mm_set(struct tce_container *container)
  74{
  75	if (container->mm) {
  76		if (container->mm == current->mm)
  77			return 0;
  78		return -EPERM;
  79	}
  80	BUG_ON(!current->mm);
  81	container->mm = current->mm;
  82	mmgrab(container->mm);
  83
  84	return 0;
  85}
  86
  87static long tce_iommu_prereg_free(struct tce_container *container,
  88		struct tce_iommu_prereg *tcemem)
  89{
  90	long ret;
  91
  92	ret = mm_iommu_put(container->mm, tcemem->mem);
  93	if (ret)
  94		return ret;
  95
  96	list_del(&tcemem->next);
  97	kfree(tcemem);
  98
  99	return 0;
 100}
 101
 102static long tce_iommu_unregister_pages(struct tce_container *container,
 103		__u64 vaddr, __u64 size)
 104{
 105	struct mm_iommu_table_group_mem_t *mem;
 106	struct tce_iommu_prereg *tcemem;
 107	bool found = false;
 108	long ret;
 109
 110	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 111		return -EINVAL;
 112
 113	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 114	if (!mem)
 115		return -ENOENT;
 116
 117	list_for_each_entry(tcemem, &container->prereg_list, next) {
 118		if (tcemem->mem == mem) {
 119			found = true;
 120			break;
 121		}
 122	}
 123
 124	if (!found)
 125		ret = -ENOENT;
 126	else
 127		ret = tce_iommu_prereg_free(container, tcemem);
 128
 129	mm_iommu_put(container->mm, mem);
 130
 131	return ret;
 132}
 133
 134static long tce_iommu_register_pages(struct tce_container *container,
 135		__u64 vaddr, __u64 size)
 136{
 137	long ret = 0;
 138	struct mm_iommu_table_group_mem_t *mem = NULL;
 139	struct tce_iommu_prereg *tcemem;
 140	unsigned long entries = size >> PAGE_SHIFT;
 141
 142	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 143			((vaddr + size) < vaddr))
 144		return -EINVAL;
 145
 146	mem = mm_iommu_get(container->mm, vaddr, entries);
 147	if (mem) {
 148		list_for_each_entry(tcemem, &container->prereg_list, next) {
 149			if (tcemem->mem == mem) {
 150				ret = -EBUSY;
 151				goto put_exit;
 152			}
 153		}
 154	} else {
 155		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
 156		if (ret)
 157			return ret;
 158	}
 159
 160	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 161	if (!tcemem) {
 162		ret = -ENOMEM;
 163		goto put_exit;
 164	}
 165
 166	tcemem->mem = mem;
 167	list_add(&tcemem->next, &container->prereg_list);
 168
 169	container->enabled = true;
 170
 171	return 0;
 172
 173put_exit:
 174	mm_iommu_put(container->mm, mem);
 175	return ret;
 176}
 177
 178static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
 179		unsigned int it_page_shift)
 180{
 181	struct page *page;
 182	unsigned long size = 0;
 183
 184	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
 185		return size == (1UL << it_page_shift);
 186
 187	page = pfn_to_page(hpa >> PAGE_SHIFT);
 188	/*
 189	 * Check that the TCE table granularity is not bigger than the size of
 190	 * a page we just found. Otherwise the hardware can get access to
 191	 * a bigger memory chunk that it should.
 192	 */
 193	return page_shift(compound_head(page)) >= it_page_shift;
 194}
 195
 196static inline bool tce_groups_attached(struct tce_container *container)
 197{
 198	return !list_empty(&container->group_list);
 199}
 200
 201static long tce_iommu_find_table(struct tce_container *container,
 202		phys_addr_t ioba, struct iommu_table **ptbl)
 203{
 204	long i;
 205
 206	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 207		struct iommu_table *tbl = container->tables[i];
 208
 209		if (tbl) {
 210			unsigned long entry = ioba >> tbl->it_page_shift;
 211			unsigned long start = tbl->it_offset;
 212			unsigned long end = start + tbl->it_size;
 213
 214			if ((start <= entry) && (entry < end)) {
 215				*ptbl = tbl;
 216				return i;
 217			}
 218		}
 219	}
 220
 221	return -1;
 222}
 223
 224static int tce_iommu_find_free_table(struct tce_container *container)
 225{
 226	int i;
 227
 228	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 229		if (!container->tables[i])
 230			return i;
 231	}
 232
 233	return -ENOSPC;
 234}
 235
 236static int tce_iommu_enable(struct tce_container *container)
 237{
 238	int ret = 0;
 239	unsigned long locked;
 240	struct iommu_table_group *table_group;
 241	struct tce_iommu_group *tcegrp;
 242
 243	if (container->enabled)
 244		return -EBUSY;
 245
 246	/*
 247	 * When userspace pages are mapped into the IOMMU, they are effectively
 248	 * locked memory, so, theoretically, we need to update the accounting
 249	 * of locked pages on each map and unmap.  For powerpc, the map unmap
 250	 * paths can be very hot, though, and the accounting would kill
 251	 * performance, especially since it would be difficult to impossible
 252	 * to handle the accounting in real mode only.
 253	 *
 254	 * To address that, rather than precisely accounting every page, we
 255	 * instead account for a worst case on locked memory when the iommu is
 256	 * enabled and disabled.  The worst case upper bound on locked memory
 257	 * is the size of the whole iommu window, which is usually relatively
 258	 * small (compared to total memory sizes) on POWER hardware.
 259	 *
 260	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 261	 * that would effectively kill the guest at random points, much better
 262	 * enforcing the limit based on the max that the guest can map.
 263	 *
 264	 * Unfortunately at the moment it counts whole tables, no matter how
 265	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
 266	 * each with 2GB DMA window, 8GB will be counted here. The reason for
 267	 * this is that we cannot tell here the amount of RAM used by the guest
 268	 * as this information is only available from KVM and VFIO is
 269	 * KVM agnostic.
 270	 *
 271	 * So we do not allow enabling a container without a group attached
 272	 * as there is no way to know how much we should increment
 273	 * the locked_vm counter.
 274	 */
 275	if (!tce_groups_attached(container))
 276		return -ENODEV;
 277
 278	tcegrp = list_first_entry(&container->group_list,
 279			struct tce_iommu_group, next);
 280	table_group = iommu_group_get_iommudata(tcegrp->grp);
 281	if (!table_group)
 282		return -ENODEV;
 283
 284	if (!table_group->tce32_size)
 285		return -EPERM;
 286
 287	ret = tce_iommu_mm_set(container);
 288	if (ret)
 289		return ret;
 290
 291	locked = table_group->tce32_size >> PAGE_SHIFT;
 292	ret = account_locked_vm(container->mm, locked, true);
 293	if (ret)
 294		return ret;
 295
 296	container->locked_pages = locked;
 297
 298	container->enabled = true;
 299
 300	return ret;
 301}
 302
 303static void tce_iommu_disable(struct tce_container *container)
 304{
 305	if (!container->enabled)
 306		return;
 307
 308	container->enabled = false;
 309
 310	BUG_ON(!container->mm);
 311	account_locked_vm(container->mm, container->locked_pages, false);
 312}
 313
 314static void *tce_iommu_open(unsigned long arg)
 315{
 316	struct tce_container *container;
 317
 318	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 319		pr_err("tce_vfio: Wrong IOMMU type\n");
 320		return ERR_PTR(-EINVAL);
 321	}
 322
 323	container = kzalloc(sizeof(*container), GFP_KERNEL);
 324	if (!container)
 325		return ERR_PTR(-ENOMEM);
 326
 327	mutex_init(&container->lock);
 328	INIT_LIST_HEAD_RCU(&container->group_list);
 329	INIT_LIST_HEAD_RCU(&container->prereg_list);
 330
 331	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 332
 333	return container;
 334}
 335
 336static int tce_iommu_clear(struct tce_container *container,
 337		struct iommu_table *tbl,
 338		unsigned long entry, unsigned long pages);
 339static void tce_iommu_free_table(struct tce_container *container,
 340		struct iommu_table *tbl);
 341
 342static void tce_iommu_release(void *iommu_data)
 343{
 344	struct tce_container *container = iommu_data;
 345	struct tce_iommu_group *tcegrp;
 346	struct tce_iommu_prereg *tcemem, *tmtmp;
 347	long i;
 348
 349	while (tce_groups_attached(container)) {
 350		tcegrp = list_first_entry(&container->group_list,
 351				struct tce_iommu_group, next);
 352		tce_iommu_detach_group(iommu_data, tcegrp->grp);
 353	}
 354
 355	/*
 356	 * If VFIO created a table, it was not disposed
 357	 * by tce_iommu_detach_group() so do it now.
 358	 */
 359	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 360		struct iommu_table *tbl = container->tables[i];
 361
 362		if (!tbl)
 363			continue;
 364
 365		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 366		tce_iommu_free_table(container, tbl);
 367	}
 368
 369	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
 370		WARN_ON(tce_iommu_prereg_free(container, tcemem));
 371
 372	tce_iommu_disable(container);
 373	if (container->mm)
 374		mmdrop(container->mm);
 375	mutex_destroy(&container->lock);
 376
 377	kfree(container);
 378}
 379
 380static void tce_iommu_unuse_page(struct tce_container *container,
 381		unsigned long hpa)
 382{
 383	struct page *page;
 384
 385	page = pfn_to_page(hpa >> PAGE_SHIFT);
 386	unpin_user_page(page);
 387}
 388
 389static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
 390		unsigned long tce, unsigned long shift,
 391		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 392{
 393	long ret = 0;
 394	struct mm_iommu_table_group_mem_t *mem;
 395
 396	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
 397	if (!mem)
 398		return -EINVAL;
 399
 400	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
 401	if (ret)
 402		return -EINVAL;
 403
 404	*pmem = mem;
 405
 406	return 0;
 407}
 408
 409static void tce_iommu_unuse_page_v2(struct tce_container *container,
 410		struct iommu_table *tbl, unsigned long entry)
 411{
 412	struct mm_iommu_table_group_mem_t *mem = NULL;
 413	int ret;
 414	unsigned long hpa = 0;
 415	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 416
 417	if (!pua)
 418		return;
 419
 420	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
 421			tbl->it_page_shift, &hpa, &mem);
 422	if (ret)
 423		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
 424				__func__, be64_to_cpu(*pua), entry, ret);
 425	if (mem)
 426		mm_iommu_mapped_dec(mem);
 427
 428	*pua = cpu_to_be64(0);
 429}
 430
 431static int tce_iommu_clear(struct tce_container *container,
 432		struct iommu_table *tbl,
 433		unsigned long entry, unsigned long pages)
 434{
 435	unsigned long oldhpa;
 436	long ret;
 437	enum dma_data_direction direction;
 438	unsigned long lastentry = entry + pages, firstentry = entry;
 439
 440	for ( ; entry < lastentry; ++entry) {
 441		if (tbl->it_indirect_levels && tbl->it_userspace) {
 442			/*
 443			 * For multilevel tables, we can take a shortcut here
 444			 * and skip some TCEs as we know that the userspace
 445			 * addresses cache is a mirror of the real TCE table
 446			 * and if it is missing some indirect levels, then
 447			 * the hardware table does not have them allocated
 448			 * either and therefore does not require updating.
 449			 */
 450			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
 451					entry);
 452			if (!pua) {
 453				/* align to level_size which is power of two */
 454				entry |= tbl->it_level_size - 1;
 455				continue;
 456			}
 457		}
 458
 459		cond_resched();
 460
 461		direction = DMA_NONE;
 462		oldhpa = 0;
 463		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
 464				&direction);
 465		if (ret)
 466			continue;
 467
 468		if (direction == DMA_NONE)
 469			continue;
 470
 471		if (container->v2) {
 472			tce_iommu_unuse_page_v2(container, tbl, entry);
 473			continue;
 474		}
 475
 476		tce_iommu_unuse_page(container, oldhpa);
 477	}
 478
 479	iommu_tce_kill(tbl, firstentry, pages);
 480
 481	return 0;
 482}
 483
 484static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 485{
 486	struct page *page = NULL;
 487	enum dma_data_direction direction = iommu_tce_direction(tce);
 488
 489	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
 490			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
 491			&page) != 1)
 492		return -EFAULT;
 493
 494	*hpa = __pa((unsigned long) page_address(page));
 495
 496	return 0;
 497}
 498
 499static long tce_iommu_build(struct tce_container *container,
 500		struct iommu_table *tbl,
 501		unsigned long entry, unsigned long tce, unsigned long pages,
 502		enum dma_data_direction direction)
 503{
 504	long i, ret = 0;
 505	unsigned long hpa;
 506	enum dma_data_direction dirtmp;
 507
 508	for (i = 0; i < pages; ++i) {
 509		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 510
 511		ret = tce_iommu_use_page(tce, &hpa);
 512		if (ret)
 513			break;
 514
 515		if (!tce_page_is_contained(container->mm, hpa,
 516				tbl->it_page_shift)) {
 517			ret = -EPERM;
 518			break;
 519		}
 520
 521		hpa |= offset;
 522		dirtmp = direction;
 523		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 524				&hpa, &dirtmp);
 525		if (ret) {
 526			tce_iommu_unuse_page(container, hpa);
 527			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 528					__func__, entry << tbl->it_page_shift,
 529					tce, ret);
 530			break;
 531		}
 532
 533		if (dirtmp != DMA_NONE)
 534			tce_iommu_unuse_page(container, hpa);
 535
 536		tce += IOMMU_PAGE_SIZE(tbl);
 537	}
 538
 539	if (ret)
 540		tce_iommu_clear(container, tbl, entry, i);
 541	else
 542		iommu_tce_kill(tbl, entry, pages);
 543
 544	return ret;
 545}
 546
 547static long tce_iommu_build_v2(struct tce_container *container,
 548		struct iommu_table *tbl,
 549		unsigned long entry, unsigned long tce, unsigned long pages,
 550		enum dma_data_direction direction)
 551{
 552	long i, ret = 0;
 553	unsigned long hpa;
 554	enum dma_data_direction dirtmp;
 555
 556	for (i = 0; i < pages; ++i) {
 557		struct mm_iommu_table_group_mem_t *mem = NULL;
 558		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
 559
 560		ret = tce_iommu_prereg_ua_to_hpa(container,
 561				tce, tbl->it_page_shift, &hpa, &mem);
 562		if (ret)
 563			break;
 564
 565		if (!tce_page_is_contained(container->mm, hpa,
 566				tbl->it_page_shift)) {
 567			ret = -EPERM;
 568			break;
 569		}
 570
 571		/* Preserve offset within IOMMU page */
 572		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 573		dirtmp = direction;
 574
 575		/* The registered region is being unregistered */
 576		if (mm_iommu_mapped_inc(mem))
 577			break;
 578
 579		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 580				&hpa, &dirtmp);
 581		if (ret) {
 582			/* dirtmp cannot be DMA_NONE here */
 583			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 584			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 585					__func__, entry << tbl->it_page_shift,
 586					tce, ret);
 587			break;
 588		}
 589
 590		if (dirtmp != DMA_NONE)
 591			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 592
 593		*pua = cpu_to_be64(tce);
 594
 595		tce += IOMMU_PAGE_SIZE(tbl);
 596	}
 597
 598	if (ret)
 599		tce_iommu_clear(container, tbl, entry, i);
 600	else
 601		iommu_tce_kill(tbl, entry, pages);
 602
 603	return ret;
 604}
 605
 606static long tce_iommu_create_table(struct tce_container *container,
 607			struct iommu_table_group *table_group,
 608			int num,
 609			__u32 page_shift,
 610			__u64 window_size,
 611			__u32 levels,
 612			struct iommu_table **ptbl)
 613{
 614	long ret, table_size;
 615
 616	table_size = table_group->ops->get_table_size(page_shift, window_size,
 617			levels);
 618	if (!table_size)
 619		return -EINVAL;
 620
 621	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 622	if (ret)
 623		return ret;
 624
 625	ret = table_group->ops->create_table(table_group, num,
 626			page_shift, window_size, levels, ptbl);
 627
 628	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 629	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
 630
 631	return ret;
 632}
 633
 634static void tce_iommu_free_table(struct tce_container *container,
 635		struct iommu_table *tbl)
 636{
 637	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 638
 639	iommu_tce_table_put(tbl);
 640	account_locked_vm(container->mm, pages, false);
 641}
 642
 643static long tce_iommu_create_window(struct tce_container *container,
 644		__u32 page_shift, __u64 window_size, __u32 levels,
 645		__u64 *start_addr)
 646{
 647	struct tce_iommu_group *tcegrp;
 648	struct iommu_table_group *table_group;
 649	struct iommu_table *tbl = NULL;
 650	long ret, num;
 651
 652	num = tce_iommu_find_free_table(container);
 653	if (num < 0)
 654		return num;
 655
 656	/* Get the first group for ops::create_table */
 657	tcegrp = list_first_entry(&container->group_list,
 658			struct tce_iommu_group, next);
 659	table_group = iommu_group_get_iommudata(tcegrp->grp);
 660	if (!table_group)
 661		return -EFAULT;
 662
 663	if (!(table_group->pgsizes & (1ULL << page_shift)))
 664		return -EINVAL;
 665
 666	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
 667			!table_group->ops->get_table_size ||
 668			!table_group->ops->create_table)
 669		return -EPERM;
 670
 671	/* Create TCE table */
 672	ret = tce_iommu_create_table(container, table_group, num,
 673			page_shift, window_size, levels, &tbl);
 674	if (ret)
 675		return ret;
 676
 677	BUG_ON(!tbl->it_ops->free);
 678
  679	/*
  680	 * Program the table into every group.
  681	 * Groups were tested for compatibility at attach time.
  682	 */
 683	list_for_each_entry(tcegrp, &container->group_list, next) {
 684		table_group = iommu_group_get_iommudata(tcegrp->grp);
 685
 686		ret = table_group->ops->set_window(table_group, num, tbl);
 687		if (ret)
 688			goto unset_exit;
 689	}
 690
 691	container->tables[num] = tbl;
 692
 693	/* Return start address assigned by platform in create_table() */
 694	*start_addr = tbl->it_offset << tbl->it_page_shift;
 695
 696	return 0;
 697
 698unset_exit:
 699	list_for_each_entry(tcegrp, &container->group_list, next) {
 700		table_group = iommu_group_get_iommudata(tcegrp->grp);
 701		table_group->ops->unset_window(table_group, num);
 702	}
 703	tce_iommu_free_table(container, tbl);
 704
 705	return ret;
 706}
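
The window create/remove helpers above back the VFIO_IOMMU_SPAPR_TCE_CREATE
and VFIO_IOMMU_SPAPR_TCE_REMOVE ioctls handled further down. A minimal
userspace sketch, assuming a v2 container fd ("container") with a group
already attached and a platform whose pgsizes include 64 KiB; error handling
omitted:

	struct vfio_iommu_spapr_tce_create create = {
		.argsz = sizeof(create),
		.page_shift = 16,		/* 64 KiB IOMMU pages */
		.window_size = 1ULL << 30,	/* 1 GiB window */
		.levels = 1,
	};
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
	/* create.start_addr now holds the bus address chosen by the platform */

	struct vfio_iommu_spapr_tce_remove remove = {
		.argsz = sizeof(remove),
		.start_addr = create.start_addr,
	};
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);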
 707
 708static long tce_iommu_remove_window(struct tce_container *container,
 709		__u64 start_addr)
 710{
 711	struct iommu_table_group *table_group = NULL;
 712	struct iommu_table *tbl;
 713	struct tce_iommu_group *tcegrp;
 714	int num;
 715
 716	num = tce_iommu_find_table(container, start_addr, &tbl);
 717	if (num < 0)
 718		return -EINVAL;
 719
 720	BUG_ON(!tbl->it_size);
 721
 722	/* Detach groups from IOMMUs */
 723	list_for_each_entry(tcegrp, &container->group_list, next) {
 724		table_group = iommu_group_get_iommudata(tcegrp->grp);
 725
  726		/*
  727		 * SPAPR TCE IOMMU exposes the default DMA window to
  728		 * the guest via dma32_window_start/size of
  729		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
  730		 * userspace to remove this window and some do not, so
  731		 * check for the platform capability here.
  732		 */
 733		if (!table_group->ops || !table_group->ops->unset_window)
 734			return -EPERM;
 735
 736		table_group->ops->unset_window(table_group, num);
 737	}
 738
 739	/* Free table */
 740	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 741	tce_iommu_free_table(container, tbl);
 742	container->tables[num] = NULL;
 743
 744	return 0;
 745}
 746
 747static long tce_iommu_create_default_window(struct tce_container *container)
 748{
 749	long ret;
 750	__u64 start_addr = 0;
 751	struct tce_iommu_group *tcegrp;
 752	struct iommu_table_group *table_group;
 753
 754	if (!container->def_window_pending)
 755		return 0;
 756
 757	if (!tce_groups_attached(container))
 758		return -ENODEV;
 759
 760	tcegrp = list_first_entry(&container->group_list,
 761			struct tce_iommu_group, next);
 762	table_group = iommu_group_get_iommudata(tcegrp->grp);
 763	if (!table_group)
 764		return -ENODEV;
 765
 766	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
 767			table_group->tce32_size, 1, &start_addr);
 768	WARN_ON_ONCE(!ret && start_addr);
 769
 770	if (!ret)
 771		container->def_window_pending = false;
 772
 773	return ret;
 774}
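
/*
 * Lifecycle of def_window_pending: it is set when a DDW-capable group is
 * attached to an empty v2 container (see tce_iommu_attach_group() below),
 * resolved lazily here on the first MAP_DMA, UNMAP_DMA or TCE_CREATE, and
 * simply cancelled if userspace removes the not-yet-created default window
 * with VFIO_IOMMU_SPAPR_TCE_REMOVE and start_addr == 0.
 */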
 775
 776static long tce_iommu_ioctl(void *iommu_data,
 777				 unsigned int cmd, unsigned long arg)
 778{
 779	struct tce_container *container = iommu_data;
 780	unsigned long minsz, ddwsz;
 781	long ret;
 782
 783	switch (cmd) {
 784	case VFIO_CHECK_EXTENSION:
 785		switch (arg) {
 786		case VFIO_SPAPR_TCE_IOMMU:
 787		case VFIO_SPAPR_TCE_v2_IOMMU:
 788			ret = 1;
 789			break;
 790		default:
 791			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
 792			break;
 793		}
 794
 795		return (ret < 0) ? 0 : ret;
 796	}
 797
  798	/*
  799	 * Sanity check to prevent one userspace process from
  800	 * manipulating another process's mm.
  801	 */
 802	BUG_ON(!container);
 803	if (container->mm && container->mm != current->mm)
 804		return -EPERM;
 805
 806	switch (cmd) {
 807	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 808		struct vfio_iommu_spapr_tce_info info;
 809		struct tce_iommu_group *tcegrp;
 810		struct iommu_table_group *table_group;
 811
 812		if (!tce_groups_attached(container))
 813			return -ENXIO;
 814
 815		tcegrp = list_first_entry(&container->group_list,
 816				struct tce_iommu_group, next);
 817		table_group = iommu_group_get_iommudata(tcegrp->grp);
 818
 819		if (!table_group)
 820			return -ENXIO;
 821
 822		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
 823				dma32_window_size);
 824
 825		if (copy_from_user(&info, (void __user *)arg, minsz))
 826			return -EFAULT;
 827
 828		if (info.argsz < minsz)
 829			return -EINVAL;
 830
 831		info.dma32_window_start = table_group->tce32_start;
 832		info.dma32_window_size = table_group->tce32_size;
 833		info.flags = 0;
 834		memset(&info.ddw, 0, sizeof(info.ddw));
 835
 836		if (table_group->max_dynamic_windows_supported &&
 837				container->v2) {
 838			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
 839			info.ddw.pgsizes = table_group->pgsizes;
 840			info.ddw.max_dynamic_windows_supported =
 841				table_group->max_dynamic_windows_supported;
 842			info.ddw.levels = table_group->max_levels;
 843		}
 844
 845		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
 846
 847		if (info.argsz >= ddwsz)
 848			minsz = ddwsz;
 849
 850		if (copy_to_user((void __user *)arg, &info, minsz))
 851			return -EFAULT;
 852
 853		return 0;
 854	}
 855	case VFIO_IOMMU_MAP_DMA: {
 856		struct vfio_iommu_type1_dma_map param;
 857		struct iommu_table *tbl = NULL;
 858		long num;
 859		enum dma_data_direction direction;
 860
 861		if (!container->enabled)
 862			return -EPERM;
 863
 864		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 865
 866		if (copy_from_user(&param, (void __user *)arg, minsz))
 867			return -EFAULT;
 868
 869		if (param.argsz < minsz)
 870			return -EINVAL;
 871
 872		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
 873				VFIO_DMA_MAP_FLAG_WRITE))
 874			return -EINVAL;
 875
 876		ret = tce_iommu_create_default_window(container);
 877		if (ret)
 878			return ret;
 879
 880		num = tce_iommu_find_table(container, param.iova, &tbl);
 881		if (num < 0)
 882			return -ENXIO;
 883
 884		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 885				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 886			return -EINVAL;
 887
 888		/* iova is checked by the IOMMU API */
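		/*
		 * The flags describe the device's access: READ lets the
		 * device read from the buffer (DMA_TO_DEVICE), WRITE lets
		 * it write to the buffer (DMA_FROM_DEVICE), and both
		 * together give DMA_BIDIRECTIONAL.
		 */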
 889		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
 890			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 891				direction = DMA_BIDIRECTIONAL;
 892			else
 893				direction = DMA_TO_DEVICE;
 894		} else {
 895			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 896				direction = DMA_FROM_DEVICE;
 897			else
 898				return -EINVAL;
 899		}
 900
 901		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 902		if (ret)
 903			return ret;
 904
 905		if (container->v2)
 906			ret = tce_iommu_build_v2(container, tbl,
 907					param.iova >> tbl->it_page_shift,
 908					param.vaddr,
 909					param.size >> tbl->it_page_shift,
 910					direction);
 911		else
 912			ret = tce_iommu_build(container, tbl,
 913					param.iova >> tbl->it_page_shift,
 914					param.vaddr,
 915					param.size >> tbl->it_page_shift,
 916					direction);
 917
 918		iommu_flush_tce(tbl);
 919
 920		return ret;
 921	}
 922	case VFIO_IOMMU_UNMAP_DMA: {
 923		struct vfio_iommu_type1_dma_unmap param;
 924		struct iommu_table *tbl = NULL;
 925		long num;
 926
 927		if (!container->enabled)
 928			return -EPERM;
 929
 930		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 931				size);
 932
 933		if (copy_from_user(&param, (void __user *)arg, minsz))
 934			return -EFAULT;
 935
 936		if (param.argsz < minsz)
 937			return -EINVAL;
 938
  939		/* No flags are currently supported */
 940		if (param.flags)
 941			return -EINVAL;
 942
 943		ret = tce_iommu_create_default_window(container);
 944		if (ret)
 945			return ret;
 946
 947		num = tce_iommu_find_table(container, param.iova, &tbl);
 948		if (num < 0)
 949			return -ENXIO;
 950
 951		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 952			return -EINVAL;
 953
 954		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
 955				param.size >> tbl->it_page_shift);
 956		if (ret)
 957			return ret;
 958
 959		ret = tce_iommu_clear(container, tbl,
 960				param.iova >> tbl->it_page_shift,
 961				param.size >> tbl->it_page_shift);
 962		iommu_flush_tce(tbl);
 963
 964		return ret;
 965	}
 966	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
 967		struct vfio_iommu_spapr_register_memory param;
 968
 969		if (!container->v2)
 970			break;
 971
 972		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
 973				size);
 974
 975		ret = tce_iommu_mm_set(container);
 976		if (ret)
 977			return ret;
 978
 979		if (copy_from_user(&param, (void __user *)arg, minsz))
 980			return -EFAULT;
 981
 982		if (param.argsz < minsz)
 983			return -EINVAL;
 984
  985		/* No flags are currently supported */
 986		if (param.flags)
 987			return -EINVAL;
 988
 989		mutex_lock(&container->lock);
 990		ret = tce_iommu_register_pages(container, param.vaddr,
 991				param.size);
 992		mutex_unlock(&container->lock);
 993
 994		return ret;
 995	}
 996	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
 997		struct vfio_iommu_spapr_register_memory param;
 998
 999		if (!container->v2)
1000			break;
1001
1002		if (!container->mm)
1003			return -EPERM;
1004
1005		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1006				size);
1007
1008		if (copy_from_user(&param, (void __user *)arg, minsz))
1009			return -EFAULT;
1010
1011		if (param.argsz < minsz)
1012			return -EINVAL;
1013
 1014		/* No flags are currently supported */
1015		if (param.flags)
1016			return -EINVAL;
1017
1018		mutex_lock(&container->lock);
1019		ret = tce_iommu_unregister_pages(container, param.vaddr,
1020				param.size);
1021		mutex_unlock(&container->lock);
1022
1023		return ret;
1024	}
1025	case VFIO_IOMMU_ENABLE:
1026		if (container->v2)
1027			break;
1028
1029		mutex_lock(&container->lock);
1030		ret = tce_iommu_enable(container);
1031		mutex_unlock(&container->lock);
1032		return ret;
1033
1034
1035	case VFIO_IOMMU_DISABLE:
1036		if (container->v2)
1037			break;
1038
1039		mutex_lock(&container->lock);
1040		tce_iommu_disable(container);
1041		mutex_unlock(&container->lock);
1042		return 0;
1043
1044	case VFIO_EEH_PE_OP: {
1045		struct tce_iommu_group *tcegrp;
1046
1047		ret = 0;
1048		list_for_each_entry(tcegrp, &container->group_list, next) {
1049			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
1050					cmd, arg);
1051			if (ret)
1052				return ret;
1053		}
1054		return ret;
1055	}
1056
1057	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1058		struct vfio_iommu_spapr_tce_create create;
1059
1060		if (!container->v2)
1061			break;
1062
1063		ret = tce_iommu_mm_set(container);
1064		if (ret)
1065			return ret;
1066
1067		if (!tce_groups_attached(container))
1068			return -ENXIO;
1069
1070		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1071				start_addr);
1072
1073		if (copy_from_user(&create, (void __user *)arg, minsz))
1074			return -EFAULT;
1075
1076		if (create.argsz < minsz)
1077			return -EINVAL;
1078
1079		if (create.flags)
1080			return -EINVAL;
1081
1082		mutex_lock(&container->lock);
1083
1084		ret = tce_iommu_create_default_window(container);
1085		if (!ret)
1086			ret = tce_iommu_create_window(container,
1087					create.page_shift,
1088					create.window_size, create.levels,
1089					&create.start_addr);
1090
1091		mutex_unlock(&container->lock);
1092
1093		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1094			ret = -EFAULT;
1095
1096		return ret;
1097	}
1098	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1099		struct vfio_iommu_spapr_tce_remove remove;
1100
1101		if (!container->v2)
1102			break;
1103
1104		ret = tce_iommu_mm_set(container);
1105		if (ret)
1106			return ret;
1107
1108		if (!tce_groups_attached(container))
1109			return -ENXIO;
1110
1111		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1112				start_addr);
1113
1114		if (copy_from_user(&remove, (void __user *)arg, minsz))
1115			return -EFAULT;
1116
1117		if (remove.argsz < minsz)
1118			return -EINVAL;
1119
1120		if (remove.flags)
1121			return -EINVAL;
1122
1123		if (container->def_window_pending && !remove.start_addr) {
1124			container->def_window_pending = false;
1125			return 0;
1126		}
1127
1128		mutex_lock(&container->lock);
1129
1130		ret = tce_iommu_remove_window(container, remove.start_addr);
1131
1132		mutex_unlock(&container->lock);
1133
1134		return ret;
1135	}
1136	}
1137
1138	return -ENOTTY;
1139}
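
For reference, a minimal userspace sketch of the v2 flow implemented by the
ioctl handler above. The group path and buffer size are placeholders and
error handling is omitted; this is an illustrative sketch, not part of the
driver:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/vfio.h>

	static void spapr_tce_v2_example(void)
	{
		int container = open("/dev/vfio/vfio", O_RDWR);
		int group = open("/dev/vfio/26", O_RDWR);	/* placeholder group */
		size_t len = 2 << 20;				/* 2 MiB buffer */

		ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
		ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);

		struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
		ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		/* v2: preregister (and pin) the memory before mapping it */
		struct vfio_iommu_spapr_register_memory reg = {
			.argsz = sizeof(reg),
			.vaddr = (__u64)(unsigned long)buf,
			.size = len,
		};
		ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);

		/* Map it at the start of the default 32-bit window */
		struct vfio_iommu_type1_dma_map map = {
			.argsz = sizeof(map),
			.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
			.vaddr = (__u64)(unsigned long)buf,
			.iova = info.dma32_window_start,
			.size = len,
		};
		ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
	}

A v1 container (VFIO_SPAPR_TCE_IOMMU) would skip the register step and call
VFIO_IOMMU_ENABLE before mapping instead.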
1140
1141static void tce_iommu_release_ownership(struct tce_container *container,
1142		struct iommu_table_group *table_group)
1143{
1144	int i;
1145
1146	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1147		struct iommu_table *tbl = container->tables[i];
1148
1149		if (!tbl)
1150			continue;
1151
1152		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
1153		if (tbl->it_map)
1154			iommu_release_ownership(tbl);
1155
1156		container->tables[i] = NULL;
1157	}
1158}
1159
1160static int tce_iommu_take_ownership(struct tce_container *container,
1161		struct iommu_table_group *table_group)
1162{
1163	int i, j, rc = 0;
1164
1165	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1166		struct iommu_table *tbl = table_group->tables[i];
1167
1168		if (!tbl || !tbl->it_map)
1169			continue;
1170
1171		rc = iommu_take_ownership(tbl);
1172		if (rc) {
1173			for (j = 0; j < i; ++j)
1174				iommu_release_ownership(
1175						table_group->tables[j]);
1176
1177			return rc;
1178		}
1179	}
1180
1181	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1182		container->tables[i] = table_group->tables[i];
1183
1184	return 0;
1185}
1186
1187static void tce_iommu_release_ownership_ddw(struct tce_container *container,
1188		struct iommu_table_group *table_group)
1189{
1190	long i;
1191
1192	if (!table_group->ops->unset_window) {
1193		WARN_ON_ONCE(1);
1194		return;
1195	}
1196
1197	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1198		if (container->tables[i])
1199			table_group->ops->unset_window(table_group, i);
1200
1201	table_group->ops->release_ownership(table_group);
1202}
1203
1204static long tce_iommu_take_ownership_ddw(struct tce_container *container,
1205		struct iommu_table_group *table_group)
1206{
1207	long i, ret = 0;
1208
1209	if (!table_group->ops->create_table || !table_group->ops->set_window ||
1210			!table_group->ops->release_ownership) {
1211		WARN_ON_ONCE(1);
1212		return -EFAULT;
1213	}
1214
1215	table_group->ops->take_ownership(table_group);
1216
 1217	/* Program all of the container's windows into the new group */
1218	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1219		struct iommu_table *tbl = container->tables[i];
1220
1221		if (!tbl)
1222			continue;
1223
1224		ret = table_group->ops->set_window(table_group, i, tbl);
1225		if (ret)
1226			goto release_exit;
1227	}
1228
1229	return 0;
1230
1231release_exit:
1232	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1233		table_group->ops->unset_window(table_group, i);
1234
1235	table_group->ops->release_ownership(table_group);
1236
1237	return ret;
1238}
1239
1240static int tce_iommu_attach_group(void *iommu_data,
1241		struct iommu_group *iommu_group)
1242{
1243	int ret = 0;
1244	struct tce_container *container = iommu_data;
1245	struct iommu_table_group *table_group;
1246	struct tce_iommu_group *tcegrp = NULL;
1247
1248	mutex_lock(&container->lock);
1249
1250	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1251			iommu_group_id(iommu_group), iommu_group); */
1252	table_group = iommu_group_get_iommudata(iommu_group);
1253	if (!table_group) {
1254		ret = -ENODEV;
1255		goto unlock_exit;
1256	}
1257
1258	if (tce_groups_attached(container) && (!table_group->ops ||
1259			!table_group->ops->take_ownership ||
1260			!table_group->ops->release_ownership)) {
1261		ret = -EBUSY;
1262		goto unlock_exit;
1263	}
1264
 1265	/* Check if the new group has the same iommu_ops (i.e. is compatible) */
1266	list_for_each_entry(tcegrp, &container->group_list, next) {
1267		struct iommu_table_group *table_group_tmp;
1268
1269		if (tcegrp->grp == iommu_group) {
1270			pr_warn("tce_vfio: Group %d is already attached\n",
1271					iommu_group_id(iommu_group));
1272			ret = -EBUSY;
1273			goto unlock_exit;
1274		}
1275		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1276		if (table_group_tmp->ops->create_table !=
1277				table_group->ops->create_table) {
1278			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1279					iommu_group_id(iommu_group),
1280					iommu_group_id(tcegrp->grp));
1281			ret = -EPERM;
1282			goto unlock_exit;
1283		}
1284	}
1285
1286	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1287	if (!tcegrp) {
1288		ret = -ENOMEM;
1289		goto unlock_exit;
1290	}
1291
1292	if (!table_group->ops || !table_group->ops->take_ownership ||
1293			!table_group->ops->release_ownership) {
1294		if (container->v2) {
1295			ret = -EPERM;
1296			goto free_exit;
1297		}
1298		ret = tce_iommu_take_ownership(container, table_group);
1299	} else {
1300		if (!container->v2) {
1301			ret = -EPERM;
1302			goto free_exit;
1303		}
1304		ret = tce_iommu_take_ownership_ddw(container, table_group);
1305		if (!tce_groups_attached(container) && !container->tables[0])
1306			container->def_window_pending = true;
1307	}
1308
1309	if (!ret) {
1310		tcegrp->grp = iommu_group;
1311		list_add(&tcegrp->next, &container->group_list);
1312	}
1313
1314free_exit:
1315	if (ret && tcegrp)
1316		kfree(tcegrp);
1317
1318unlock_exit:
1319	mutex_unlock(&container->lock);
1320
1321	return ret;
1322}
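
/*
 * Attach rules implemented above: a group whose iommu_table_group has no
 * take_ownership/release_ownership ops (no dynamic DMA windows) can only
 * be attached to a v1 container, and its preexisting tables are taken
 * over directly; a DDW-capable group can only be attached to a v2
 * container, and attaching such a group to an empty container arms the
 * deferred creation of the default 32-bit window (def_window_pending).
 */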
1323
1324static void tce_iommu_detach_group(void *iommu_data,
1325		struct iommu_group *iommu_group)
1326{
1327	struct tce_container *container = iommu_data;
1328	struct iommu_table_group *table_group;
1329	bool found = false;
1330	struct tce_iommu_group *tcegrp;
1331
1332	mutex_lock(&container->lock);
1333
1334	list_for_each_entry(tcegrp, &container->group_list, next) {
1335		if (tcegrp->grp == iommu_group) {
1336			found = true;
1337			break;
1338		}
1339	}
1340
1341	if (!found) {
1342		pr_warn("tce_vfio: detaching unattached group #%u\n",
1343				iommu_group_id(iommu_group));
1344		goto unlock_exit;
1345	}
1346
1347	list_del(&tcegrp->next);
1348	kfree(tcegrp);
1349
1350	table_group = iommu_group_get_iommudata(iommu_group);
1351	BUG_ON(!table_group);
1352
1353	if (!table_group->ops || !table_group->ops->release_ownership)
1354		tce_iommu_release_ownership(container, table_group);
1355	else
1356		tce_iommu_release_ownership_ddw(container, table_group);
1357
1358unlock_exit:
1359	mutex_unlock(&container->lock);
1360}
1361
1362static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1363	.name		= "iommu-vfio-powerpc",
1364	.owner		= THIS_MODULE,
1365	.open		= tce_iommu_open,
1366	.release	= tce_iommu_release,
1367	.ioctl		= tce_iommu_ioctl,
1368	.attach_group	= tce_iommu_attach_group,
1369	.detach_group	= tce_iommu_detach_group,
1370};
1371
1372static int __init tce_iommu_init(void)
1373{
1374	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1375}
1376
1377static void __exit tce_iommu_cleanup(void)
1378{
1379	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1380}
1381
1382module_init(tce_iommu_init);
1383module_exit(tce_iommu_cleanup);
1384
1385MODULE_VERSION(DRIVER_VERSION);
1386MODULE_LICENSE("GPL v2");
1387MODULE_AUTHOR(DRIVER_AUTHOR);
1388MODULE_DESCRIPTION(DRIVER_DESC);
1389