/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

static long try_increment_locked_vm(long npages)
{
	long ret = 0, locked, lock_limit;

	if (!current || !current->mm)
		return -ESRCH; /* process exited */

	if (!npages)
		return 0;

	down_write(&current->mm->mmap_sem);
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		current->mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&current->mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(long npages)
{
	if (!current || !current->mm || !npages)
		return; /* process exited */

	down_write(&current->mm->mmap_sem);
	if (WARN_ON_ONCE(npages > current->mm->locked_vm))
		npages = current->mm->locked_vm;
	current->mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&current->mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
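
/*
 * Illustrative sketch (not part of the driver): one possible userspace
 * sequence for the v1 (VFIO_SPAPR_TCE_IOMMU) flow handled below. The group
 * node /dev/vfio/26, the buffer and the mapping size are placeholders and
 * error handling is omitted; the ioctls themselves follow the generic VFIO
 * container/group model:
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	int group = open("/dev/vfio/26", O_RDWR);
 *
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *
 *	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_ENABLE);
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(uintptr_t)buffer,
 *		.iova = info.dma32_window_start,
 *		.size = 2UL << 20,
 *	};
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */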

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * The container descriptor supports only a single group per container.
 * Required by the API as the container is not supplied with the IOMMU group
 * at the moment of initialization.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	unsigned long locked_pages;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
};

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	return mm_iommu_put(mem);
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	ret = mm_iommu_get(vaddr, entries, &mem);
	if (ret)
		return ret;

	container->enabled = true;

	return 0;
}

static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);
	unsigned long *uas;
	long ret;

	BUG_ON(tbl->it_userspace);

	ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
	if (ret)
		return ret;

	uas = vzalloc(cb);
	if (!uas) {
		decrement_locked_vm(cb >> PAGE_SHIFT);
		return -ENOMEM;
	}
	tbl->it_userspace = uas;

	return 0;
}

static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);

	if (!tbl->it_userspace)
		return;

	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
	decrement_locked_vm(cb >> PAGE_SHIFT);
}

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (!current->mm)
		return -ESRCH; /* process exited */

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult or impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	if (!current->mm)
		return;

	decrement_locked_vm(container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed of
	 * by tce_iommu_detach_group(), so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(tbl);
	}

	tce_iommu_disable(container);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(tce, size);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
		unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

	if (!pua || !current || !current->mm)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
			&hpa, &mem);
	if (ret)
		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
				__func__, *pua, entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = 0;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;

	for ( ; pages; --pages, ++entry) {
		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
				entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
				&hpa, &mem);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(tbl, entry + i);

		*pua = tce;

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

	if (!ret && container->v2) {
		ret = tce_iommu_userspace_view_alloc(*ptbl);
		if (ret)
			(*ptbl)->it_ops->free(*ptbl);
	}

	if (ret)
		decrement_locked_vm(table_size >> PAGE_SHIFT);

	return ret;
}

static void tce_iommu_free_table(struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	tce_iommu_userspace_view_free(tbl);
	tbl->it_ops->free(tbl);
	decrement_locked_vm(pages);
}

static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return the start address assigned by the platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(tbl);

	return ret;
}

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * userspace to remove this window and some do not, so
		 * here we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;

	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flags are supported for now */
		if (param.flags)
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flags are supported for now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flags are supported for now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_window(container, create.page_shift,
				create.window_size, create.levels,
				&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}
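
/*
 * Illustrative sketch (not part of the driver): a possible userspace
 * sequence for the v2 (VFIO_SPAPR_TCE_v2_IOMMU) path, where memory is
 * preregistered and DMA windows are created explicitly. It reuses the
 * container fd from the sketch near the top of this file; the buffer,
 * buffer_size, page shift and window size are placeholders and error
 * handling is omitted:
 *
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *
 *	struct vfio_iommu_spapr_register_memory reg = {
 *		.argsz = sizeof(reg),
 *		.vaddr = (__u64)(uintptr_t)buffer,
 *		.size = buffer_size,		// page aligned
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,		// 64K IOMMU pages
 *		.window_size = 1ULL << 30,	// 1GB window
 *		.levels = 1,
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *
 *	// map/unmap via VFIO_IOMMU_MAP_DMA / VFIO_IOMMU_UNMAP_DMA as in v1,
 *	// using create.start_addr as the IOVA base of the new window
 *
 *	struct vfio_iommu_spapr_tce_remove remove = {
 *		.argsz = sizeof(remove),
 *		.start_addr = create.start_addr,
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
 *	ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
 */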

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_userspace_view_free(tbl);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = tce_iommu_userspace_view_alloc(tbl);
		if (!rc)
			rc = iommu_take_ownership(tbl);

		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;
	struct iommu_table *tbl = NULL;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/*
	 * If this is the first group attached, check if there is
	 * a default DMA window and create one if there is none, as
	 * userspace expects it to exist.
	 */
	if (!tce_groups_attached(container) && !container->tables[0]) {
		ret = tce_iommu_create_table(container,
				table_group,
				0, /* window number */
				IOMMU_PAGE_SHIFT_4K,
				table_group->tce32_size,
				1, /* default levels */
				&tbl);
		if (ret)
			goto release_exit;
		else
			container->tables[0] = tbl;
	}

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		tbl = container->tables[i];

		if (!tbl)
			continue;

		/* Set the default window to a new group */
		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/* Check if new group has the same iommu_ops (i.e. compatible) */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
		if (table_group_tmp->ops != table_group->ops) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)
		ret = tce_iommu_take_ownership(container, table_group);
	else
		ret = tce_iommu_take_ownership_ddw(container, table_group);

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);