v5.4
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO: IOMMU DMA mapping support for TCE on POWER
   4 *
   5 * Copyright (C) 2013 IBM Corp.  All rights reserved.
   6 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
   7 *
   8 * Derived from original vfio_iommu_type1.c:
   9 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  10 *     Author: Alex Williamson <alex.williamson@redhat.com>
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/pci.h>
  15#include <linux/slab.h>
  16#include <linux/uaccess.h>
  17#include <linux/err.h>
  18#include <linux/vfio.h>
  19#include <linux/vmalloc.h>
  20#include <linux/sched/mm.h>
  21#include <linux/sched/signal.h>
  22#include <linux/mm.h>
  23
  24#include <asm/iommu.h>
  25#include <asm/tce.h>
  26#include <asm/mmu_context.h>
  27
  28#define DRIVER_VERSION  "0.1"
  29#define DRIVER_AUTHOR   "aik@ozlabs.ru"
  30#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
  31
  32static void tce_iommu_detach_group(void *iommu_data,
  33		struct iommu_group *iommu_group);
  34
  35/*
  36 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  37 *
  38 * This code handles mapping and unmapping of user data buffers
  39 * into DMA'ble space using the IOMMU
  40 */
  41
  42struct tce_iommu_group {
  43	struct list_head next;
  44	struct iommu_group *grp;
  45};
  46
  47/*
  48 * A container needs to remember which preregistered region  it has
  49 * referenced to do proper cleanup at the userspace process exit.
  50 */
  51struct tce_iommu_prereg {
  52	struct list_head next;
  53	struct mm_iommu_table_group_mem_t *mem;
  54};
  55
  56/*
  57 * The container descriptor supports only a single group per container.
  58 * Required by the API as the container is not supplied with the IOMMU group
  59 * at the moment of initialization.
  60 */
  61struct tce_container {
  62	struct mutex lock;
  63	bool enabled;
  64	bool v2;
  65	bool def_window_pending;
  66	unsigned long locked_pages;
  67	struct mm_struct *mm;
  68	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
  69	struct list_head group_list;
  70	struct list_head prereg_list;
  71};
  72
  73static long tce_iommu_mm_set(struct tce_container *container)
  74{
  75	if (container->mm) {
  76		if (container->mm == current->mm)
  77			return 0;
  78		return -EPERM;
  79	}
  80	BUG_ON(!current->mm);
  81	container->mm = current->mm;
  82	atomic_inc(&container->mm->mm_count);
  83
  84	return 0;
  85}
  86
  87static long tce_iommu_prereg_free(struct tce_container *container,
  88		struct tce_iommu_prereg *tcemem)
  89{
  90	long ret;
  91
  92	ret = mm_iommu_put(container->mm, tcemem->mem);
  93	if (ret)
  94		return ret;
  95
  96	list_del(&tcemem->next);
  97	kfree(tcemem);
  98
  99	return 0;
 100}
 101
 102static long tce_iommu_unregister_pages(struct tce_container *container,
 103		__u64 vaddr, __u64 size)
 104{
 105	struct mm_iommu_table_group_mem_t *mem;
 106	struct tce_iommu_prereg *tcemem;
 107	bool found = false;
 108	long ret;
 109
 110	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 111		return -EINVAL;
 112
 113	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 114	if (!mem)
 115		return -ENOENT;
 116
 117	list_for_each_entry(tcemem, &container->prereg_list, next) {
 118		if (tcemem->mem == mem) {
 119			found = true;
 120			break;
 121		}
 122	}
 123
 124	if (!found)
 125		ret = -ENOENT;
 126	else
 127		ret = tce_iommu_prereg_free(container, tcemem);
 128
 129	mm_iommu_put(container->mm, mem);
 130
 131	return ret;
 132}
 133
 134static long tce_iommu_register_pages(struct tce_container *container,
 135		__u64 vaddr, __u64 size)
 136{
 137	long ret = 0;
 138	struct mm_iommu_table_group_mem_t *mem = NULL;
 139	struct tce_iommu_prereg *tcemem;
 140	unsigned long entries = size >> PAGE_SHIFT;
 141
 142	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 143			((vaddr + size) < vaddr))
 144		return -EINVAL;
 145
 146	mem = mm_iommu_get(container->mm, vaddr, entries);
 147	if (mem) {
 148		list_for_each_entry(tcemem, &container->prereg_list, next) {
 149			if (tcemem->mem == mem) {
 150				ret = -EBUSY;
 151				goto put_exit;
 152			}
 153		}
 154	} else {
 155		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
 156		if (ret)
 157			return ret;
 158	}
 159
 160	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 161	if (!tcemem) {
 162		ret = -ENOMEM;
 163		goto put_exit;
 164	}
 165
 166	tcemem->mem = mem;
 167	list_add(&tcemem->next, &container->prereg_list);
 168
 169	container->enabled = true;
 170
 171	return 0;
 172
 173put_exit:
 174	mm_iommu_put(container->mm, mem);
 175	return ret;
 176}
 177
 178static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
 179		unsigned int it_page_shift)
 180{
 181	struct page *page;
 182	unsigned long size = 0;
 183
 184	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
 185		return size == (1UL << it_page_shift);
 186
 187	page = pfn_to_page(hpa >> PAGE_SHIFT);
 188	/*
 189	 * Check that the TCE table granularity is not bigger than the size of
 190	 * a page we just found. Otherwise the hardware can get access to
  191	 * a bigger memory chunk than it should.
 192	 */
 193	return page_shift(compound_head(page)) >= it_page_shift;
 194}
 195
 196static inline bool tce_groups_attached(struct tce_container *container)
 197{
 198	return !list_empty(&container->group_list);
 199}
 200
 201static long tce_iommu_find_table(struct tce_container *container,
 202		phys_addr_t ioba, struct iommu_table **ptbl)
 203{
 204	long i;
 205
 206	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 207		struct iommu_table *tbl = container->tables[i];
 208
 209		if (tbl) {
 210			unsigned long entry = ioba >> tbl->it_page_shift;
 211			unsigned long start = tbl->it_offset;
 212			unsigned long end = start + tbl->it_size;
 213
 214			if ((start <= entry) && (entry < end)) {
 215				*ptbl = tbl;
 216				return i;
 217			}
 218		}
 219	}
 220
 221	return -1;
 222}
 223
 224static int tce_iommu_find_free_table(struct tce_container *container)
 225{
 226	int i;
 227
 228	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 229		if (!container->tables[i])
 230			return i;
 231	}
 232
 233	return -ENOSPC;
 234}
 235
 236static int tce_iommu_enable(struct tce_container *container)
 237{
 238	int ret = 0;
 239	unsigned long locked;
 240	struct iommu_table_group *table_group;
 241	struct tce_iommu_group *tcegrp;
 242
 243	if (container->enabled)
 244		return -EBUSY;
 245
 246	/*
 247	 * When userspace pages are mapped into the IOMMU, they are effectively
 248	 * locked memory, so, theoretically, we need to update the accounting
 249	 * of locked pages on each map and unmap.  For powerpc, the map unmap
 250	 * paths can be very hot, though, and the accounting would kill
 251	 * performance, especially since it would be difficult to impossible
 252	 * to handle the accounting in real mode only.
 253	 *
 254	 * To address that, rather than precisely accounting every page, we
 255	 * instead account for a worst case on locked memory when the iommu is
 256	 * enabled and disabled.  The worst case upper bound on locked memory
 257	 * is the size of the whole iommu window, which is usually relatively
 258	 * small (compared to total memory sizes) on POWER hardware.
 259	 *
 260	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 261	 * that would effectively kill the guest at random points, much better
 262	 * enforcing the limit based on the max that the guest can map.
 263	 *
 264	 * Unfortunately at the moment it counts whole tables, no matter how
 265	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
 266	 * each with 2GB DMA window, 8GB will be counted here. The reason for
 267	 * this is that we cannot tell here the amount of RAM used by the guest
 268	 * as this information is only available from KVM and VFIO is
 269	 * KVM agnostic.
 270	 *
 271	 * So we do not allow enabling a container without a group attached
 272	 * as there is no way to know how much we should increment
 273	 * the locked_vm counter.
 274	 */
 275	if (!tce_groups_attached(container))
 276		return -ENODEV;
 277
 278	tcegrp = list_first_entry(&container->group_list,
 279			struct tce_iommu_group, next);
 280	table_group = iommu_group_get_iommudata(tcegrp->grp);
 281	if (!table_group)
 282		return -ENODEV;
 283
 284	if (!table_group->tce32_size)
 285		return -EPERM;
 286
 287	ret = tce_iommu_mm_set(container);
 288	if (ret)
 289		return ret;
 290
 291	locked = table_group->tce32_size >> PAGE_SHIFT;
 292	ret = account_locked_vm(container->mm, locked, true);
 293	if (ret)
 294		return ret;
 295
 296	container->locked_pages = locked;
 297
 298	container->enabled = true;
 299
 300	return ret;
 301}
 302
 303static void tce_iommu_disable(struct tce_container *container)
 304{
 305	if (!container->enabled)
 306		return;
 307
 308	container->enabled = false;
 309
 310	BUG_ON(!container->mm);
 311	account_locked_vm(container->mm, container->locked_pages, false);
 312}
 313
 314static void *tce_iommu_open(unsigned long arg)
 315{
 316	struct tce_container *container;
 317
 318	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 319		pr_err("tce_vfio: Wrong IOMMU type\n");
 320		return ERR_PTR(-EINVAL);
 321	}
 322
 323	container = kzalloc(sizeof(*container), GFP_KERNEL);
 324	if (!container)
 325		return ERR_PTR(-ENOMEM);
 326
 327	mutex_init(&container->lock);
 328	INIT_LIST_HEAD_RCU(&container->group_list);
 329	INIT_LIST_HEAD_RCU(&container->prereg_list);
 330
 331	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 332
 333	return container;
 334}
 335
 336static int tce_iommu_clear(struct tce_container *container,
 337		struct iommu_table *tbl,
 338		unsigned long entry, unsigned long pages);
 339static void tce_iommu_free_table(struct tce_container *container,
 340		struct iommu_table *tbl);
 341
 342static void tce_iommu_release(void *iommu_data)
 343{
 344	struct tce_container *container = iommu_data;
 345	struct tce_iommu_group *tcegrp;
 346	struct tce_iommu_prereg *tcemem, *tmtmp;
 347	long i;
 348
 349	while (tce_groups_attached(container)) {
 350		tcegrp = list_first_entry(&container->group_list,
 351				struct tce_iommu_group, next);
 352		tce_iommu_detach_group(iommu_data, tcegrp->grp);
 353	}
 354
 355	/*
 356	 * If VFIO created a table, it was not disposed
 357	 * by tce_iommu_detach_group() so do it now.
 358	 */
 359	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 360		struct iommu_table *tbl = container->tables[i];
 361
 362		if (!tbl)
 363			continue;
 364
 365		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 366		tce_iommu_free_table(container, tbl);
 367	}
 368
 369	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
 370		WARN_ON(tce_iommu_prereg_free(container, tcemem));
 371
 372	tce_iommu_disable(container);
 373	if (container->mm)
 374		mmdrop(container->mm);
 375	mutex_destroy(&container->lock);
 376
 377	kfree(container);
 378}
 379
 380static void tce_iommu_unuse_page(struct tce_container *container,
 381		unsigned long hpa)
 382{
 383	struct page *page;
 384
 385	page = pfn_to_page(hpa >> PAGE_SHIFT);
 386	put_page(page);
 387}
 388
 389static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
 390		unsigned long tce, unsigned long shift,
 391		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 392{
 393	long ret = 0;
 394	struct mm_iommu_table_group_mem_t *mem;
 395
 396	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
 397	if (!mem)
 398		return -EINVAL;
 399
 400	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
 401	if (ret)
 402		return -EINVAL;
 403
 404	*pmem = mem;
 405
 406	return 0;
 407}
 408
 409static void tce_iommu_unuse_page_v2(struct tce_container *container,
 410		struct iommu_table *tbl, unsigned long entry)
 411{
 412	struct mm_iommu_table_group_mem_t *mem = NULL;
 413	int ret;
 414	unsigned long hpa = 0;
 415	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 416
 417	if (!pua)
 418		return;
 419
 420	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
 421			tbl->it_page_shift, &hpa, &mem);
 422	if (ret)
 423		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
 424				__func__, be64_to_cpu(*pua), entry, ret);
 425	if (mem)
 426		mm_iommu_mapped_dec(mem);
 427
 428	*pua = cpu_to_be64(0);
 429}
 430
 431static int tce_iommu_clear(struct tce_container *container,
 432		struct iommu_table *tbl,
 433		unsigned long entry, unsigned long pages)
 434{
 435	unsigned long oldhpa;
 436	long ret;
 437	enum dma_data_direction direction;
 438	unsigned long lastentry = entry + pages, firstentry = entry;
 439
 440	for ( ; entry < lastentry; ++entry) {
 441		if (tbl->it_indirect_levels && tbl->it_userspace) {
 442			/*
 443			 * For multilevel tables, we can take a shortcut here
 444			 * and skip some TCEs as we know that the userspace
 445			 * addresses cache is a mirror of the real TCE table
 446			 * and if it is missing some indirect levels, then
 447			 * the hardware table does not have them allocated
 448			 * either and therefore does not require updating.
 449			 */
 450			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
 451					entry);
 452			if (!pua) {
 453				/* align to level_size which is power of two */
 454				entry |= tbl->it_level_size - 1;
 455				continue;
 456			}
 457		}
 458
 459		cond_resched();
 460
 461		direction = DMA_NONE;
 462		oldhpa = 0;
 463		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
 464				&direction);
 465		if (ret)
 466			continue;
 467
 468		if (direction == DMA_NONE)
 469			continue;
 470
 471		if (container->v2) {
 472			tce_iommu_unuse_page_v2(container, tbl, entry);
 473			continue;
 474		}
 475
 476		tce_iommu_unuse_page(container, oldhpa);
 477	}
 478
 479	iommu_tce_kill(tbl, firstentry, pages);
 480
 481	return 0;
 482}
 483
 484static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 485{
 486	struct page *page = NULL;
 487	enum dma_data_direction direction = iommu_tce_direction(tce);
 488
 489	if (get_user_pages_fast(tce & PAGE_MASK, 1,
 490			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
 491			&page) != 1)
 492		return -EFAULT;
 493
 494	*hpa = __pa((unsigned long) page_address(page));
 495
 496	return 0;
 497}
 498
 499static long tce_iommu_build(struct tce_container *container,
 500		struct iommu_table *tbl,
 501		unsigned long entry, unsigned long tce, unsigned long pages,
 502		enum dma_data_direction direction)
 503{
 504	long i, ret = 0;
 505	unsigned long hpa;
 506	enum dma_data_direction dirtmp;
 507
 508	for (i = 0; i < pages; ++i) {
 509		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 510
 511		ret = tce_iommu_use_page(tce, &hpa);
 512		if (ret)
 513			break;
 514
 515		if (!tce_page_is_contained(container->mm, hpa,
 516				tbl->it_page_shift)) {
 517			ret = -EPERM;
 518			break;
 519		}
 520
 521		hpa |= offset;
 522		dirtmp = direction;
 523		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 524				&hpa, &dirtmp);
 525		if (ret) {
 526			tce_iommu_unuse_page(container, hpa);
 527			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 528					__func__, entry << tbl->it_page_shift,
 529					tce, ret);
 530			break;
 531		}
 532
 533		if (dirtmp != DMA_NONE)
 534			tce_iommu_unuse_page(container, hpa);
 535
 536		tce += IOMMU_PAGE_SIZE(tbl);
 537	}
 538
 539	if (ret)
 540		tce_iommu_clear(container, tbl, entry, i);
 541	else
 542		iommu_tce_kill(tbl, entry, pages);
 543
 544	return ret;
 545}
 546
 547static long tce_iommu_build_v2(struct tce_container *container,
 548		struct iommu_table *tbl,
 549		unsigned long entry, unsigned long tce, unsigned long pages,
 550		enum dma_data_direction direction)
 551{
 552	long i, ret = 0;
 553	unsigned long hpa;
 554	enum dma_data_direction dirtmp;
 555
 556	for (i = 0; i < pages; ++i) {
 557		struct mm_iommu_table_group_mem_t *mem = NULL;
 558		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
 559
 560		ret = tce_iommu_prereg_ua_to_hpa(container,
 561				tce, tbl->it_page_shift, &hpa, &mem);
 562		if (ret)
 563			break;
 564
 565		if (!tce_page_is_contained(container->mm, hpa,
 566				tbl->it_page_shift)) {
 567			ret = -EPERM;
 568			break;
 569		}
 570
 571		/* Preserve offset within IOMMU page */
 572		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 573		dirtmp = direction;
 574
 575		/* The registered region is being unregistered */
 576		if (mm_iommu_mapped_inc(mem))
 577			break;
 578
 579		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 580				&hpa, &dirtmp);
 581		if (ret) {
 582			/* dirtmp cannot be DMA_NONE here */
 583			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 584			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 585					__func__, entry << tbl->it_page_shift,
 586					tce, ret);
 587			break;
 588		}
 589
 590		if (dirtmp != DMA_NONE)
 591			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 592
 593		*pua = cpu_to_be64(tce);
 594
 595		tce += IOMMU_PAGE_SIZE(tbl);
 596	}
 597
 598	if (ret)
 599		tce_iommu_clear(container, tbl, entry, i);
 600	else
 601		iommu_tce_kill(tbl, entry, pages);
 602
 603	return ret;
 604}
 605
 606static long tce_iommu_create_table(struct tce_container *container,
 607			struct iommu_table_group *table_group,
 608			int num,
 609			__u32 page_shift,
 610			__u64 window_size,
 611			__u32 levels,
 612			struct iommu_table **ptbl)
 613{
 614	long ret, table_size;
 615
 616	table_size = table_group->ops->get_table_size(page_shift, window_size,
 617			levels);
 618	if (!table_size)
 619		return -EINVAL;
 620
 621	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 622	if (ret)
 623		return ret;
 624
 625	ret = table_group->ops->create_table(table_group, num,
 626			page_shift, window_size, levels, ptbl);
 627
 628	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 629	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
 630
 631	return ret;
 632}
 633
 634static void tce_iommu_free_table(struct tce_container *container,
 635		struct iommu_table *tbl)
 636{
 637	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 638
 639	iommu_tce_table_put(tbl);
 640	account_locked_vm(container->mm, pages, false);
 641}
 642
 643static long tce_iommu_create_window(struct tce_container *container,
 644		__u32 page_shift, __u64 window_size, __u32 levels,
 645		__u64 *start_addr)
 646{
 647	struct tce_iommu_group *tcegrp;
 648	struct iommu_table_group *table_group;
 649	struct iommu_table *tbl = NULL;
 650	long ret, num;
 651
 652	num = tce_iommu_find_free_table(container);
 653	if (num < 0)
 654		return num;
 655
 656	/* Get the first group for ops::create_table */
 657	tcegrp = list_first_entry(&container->group_list,
 658			struct tce_iommu_group, next);
 659	table_group = iommu_group_get_iommudata(tcegrp->grp);
 660	if (!table_group)
 661		return -EFAULT;
 662
 663	if (!(table_group->pgsizes & (1ULL << page_shift)))
 664		return -EINVAL;
 665
 666	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
 667			!table_group->ops->get_table_size ||
 668			!table_group->ops->create_table)
 669		return -EPERM;
 670
 671	/* Create TCE table */
 672	ret = tce_iommu_create_table(container, table_group, num,
 673			page_shift, window_size, levels, &tbl);
 674	if (ret)
 675		return ret;
 676
 677	BUG_ON(!tbl->it_ops->free);
 678
 679	/*
 680	 * Program the table to every group.
 681	 * Groups have been tested for compatibility at the attach time.
 682	 */
 683	list_for_each_entry(tcegrp, &container->group_list, next) {
 684		table_group = iommu_group_get_iommudata(tcegrp->grp);
 685
 686		ret = table_group->ops->set_window(table_group, num, tbl);
 687		if (ret)
 688			goto unset_exit;
 689	}
 690
 691	container->tables[num] = tbl;
 692
 693	/* Return start address assigned by platform in create_table() */
 694	*start_addr = tbl->it_offset << tbl->it_page_shift;
 695
 696	return 0;
 697
 698unset_exit:
 699	list_for_each_entry(tcegrp, &container->group_list, next) {
 700		table_group = iommu_group_get_iommudata(tcegrp->grp);
 701		table_group->ops->unset_window(table_group, num);
 702	}
 703	tce_iommu_free_table(container, tbl);
 704
 705	return ret;
 706}
 707
 708static long tce_iommu_remove_window(struct tce_container *container,
 709		__u64 start_addr)
 710{
 711	struct iommu_table_group *table_group = NULL;
 712	struct iommu_table *tbl;
 713	struct tce_iommu_group *tcegrp;
 714	int num;
 715
 716	num = tce_iommu_find_table(container, start_addr, &tbl);
 717	if (num < 0)
 718		return -EINVAL;
 719
 720	BUG_ON(!tbl->it_size);
 721
 722	/* Detach groups from IOMMUs */
 723	list_for_each_entry(tcegrp, &container->group_list, next) {
 724		table_group = iommu_group_get_iommudata(tcegrp->grp);
 725
 726		/*
 727		 * SPAPR TCE IOMMU exposes the default DMA window to
 728		 * the guest via dma32_window_start/size of
 729		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
 730		 * the userspace to remove this window, some do not so
 731		 * here we check for the platform capability.
 732		 */
 733		if (!table_group->ops || !table_group->ops->unset_window)
 734			return -EPERM;
 735
 736		table_group->ops->unset_window(table_group, num);
 737	}
 738
 739	/* Free table */
 740	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 741	tce_iommu_free_table(container, tbl);
 742	container->tables[num] = NULL;
 743
 744	return 0;
 745}
 746
 747static long tce_iommu_create_default_window(struct tce_container *container)
 748{
 749	long ret;
 750	__u64 start_addr = 0;
 751	struct tce_iommu_group *tcegrp;
 752	struct iommu_table_group *table_group;
 753
 754	if (!container->def_window_pending)
 755		return 0;
 756
 757	if (!tce_groups_attached(container))
 758		return -ENODEV;
 759
 760	tcegrp = list_first_entry(&container->group_list,
 761			struct tce_iommu_group, next);
 762	table_group = iommu_group_get_iommudata(tcegrp->grp);
 763	if (!table_group)
 764		return -ENODEV;
 765
 766	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
 767			table_group->tce32_size, 1, &start_addr);
 768	WARN_ON_ONCE(!ret && start_addr);
 769
 770	if (!ret)
 771		container->def_window_pending = false;
 772
 773	return ret;
 774}
 775
 776static long tce_iommu_ioctl(void *iommu_data,
 777				 unsigned int cmd, unsigned long arg)
 778{
 779	struct tce_container *container = iommu_data;
 780	unsigned long minsz, ddwsz;
 781	long ret;
 782
 783	switch (cmd) {
 784	case VFIO_CHECK_EXTENSION:
 785		switch (arg) {
 786		case VFIO_SPAPR_TCE_IOMMU:
 787		case VFIO_SPAPR_TCE_v2_IOMMU:
 788			ret = 1;
 789			break;
 790		default:
 791			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
 792			break;
 793		}
 794
 795		return (ret < 0) ? 0 : ret;
 796	}
 797
 798	/*
 799	 * Sanity check to prevent one userspace from manipulating
 800	 * another userspace mm.
 801	 */
 802	BUG_ON(!container);
 803	if (container->mm && container->mm != current->mm)
 804		return -EPERM;
 805
 806	switch (cmd) {
 807	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 808		struct vfio_iommu_spapr_tce_info info;
 809		struct tce_iommu_group *tcegrp;
 810		struct iommu_table_group *table_group;
 811
 812		if (!tce_groups_attached(container))
 813			return -ENXIO;
 814
 815		tcegrp = list_first_entry(&container->group_list,
 816				struct tce_iommu_group, next);
 817		table_group = iommu_group_get_iommudata(tcegrp->grp);
 818
 819		if (!table_group)
 820			return -ENXIO;
 821
 822		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
 823				dma32_window_size);
 824
 825		if (copy_from_user(&info, (void __user *)arg, minsz))
 826			return -EFAULT;
 827
 828		if (info.argsz < minsz)
 829			return -EINVAL;
 830
 831		info.dma32_window_start = table_group->tce32_start;
 832		info.dma32_window_size = table_group->tce32_size;
 833		info.flags = 0;
 834		memset(&info.ddw, 0, sizeof(info.ddw));
 835
 836		if (table_group->max_dynamic_windows_supported &&
 837				container->v2) {
 838			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
 839			info.ddw.pgsizes = table_group->pgsizes;
 840			info.ddw.max_dynamic_windows_supported =
 841				table_group->max_dynamic_windows_supported;
 842			info.ddw.levels = table_group->max_levels;
 843		}
 844
 845		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
 846
 847		if (info.argsz >= ddwsz)
 848			minsz = ddwsz;
 849
 850		if (copy_to_user((void __user *)arg, &info, minsz))
 851			return -EFAULT;
 852
 853		return 0;
 854	}
 855	case VFIO_IOMMU_MAP_DMA: {
 856		struct vfio_iommu_type1_dma_map param;
 857		struct iommu_table *tbl = NULL;
 858		long num;
 859		enum dma_data_direction direction;
 860
 861		if (!container->enabled)
 862			return -EPERM;
 863
 864		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 865
 866		if (copy_from_user(&param, (void __user *)arg, minsz))
 867			return -EFAULT;
 868
 869		if (param.argsz < minsz)
 870			return -EINVAL;
 871
 872		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
 873				VFIO_DMA_MAP_FLAG_WRITE))
 874			return -EINVAL;
 875
 876		ret = tce_iommu_create_default_window(container);
 877		if (ret)
 878			return ret;
 879
 880		num = tce_iommu_find_table(container, param.iova, &tbl);
 881		if (num < 0)
 882			return -ENXIO;
 883
 884		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 885				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 886			return -EINVAL;
 887
 888		/* iova is checked by the IOMMU API */
 889		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
 890			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 891				direction = DMA_BIDIRECTIONAL;
 892			else
 893				direction = DMA_TO_DEVICE;
 894		} else {
 895			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 896				direction = DMA_FROM_DEVICE;
 897			else
 898				return -EINVAL;
 899		}
 900
 901		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 902		if (ret)
 903			return ret;
 904
 905		if (container->v2)
 906			ret = tce_iommu_build_v2(container, tbl,
 907					param.iova >> tbl->it_page_shift,
 908					param.vaddr,
 909					param.size >> tbl->it_page_shift,
 910					direction);
 911		else
 912			ret = tce_iommu_build(container, tbl,
 913					param.iova >> tbl->it_page_shift,
 914					param.vaddr,
 915					param.size >> tbl->it_page_shift,
 916					direction);
 917
 918		iommu_flush_tce(tbl);
 919
 920		return ret;
 921	}
 922	case VFIO_IOMMU_UNMAP_DMA: {
 923		struct vfio_iommu_type1_dma_unmap param;
 924		struct iommu_table *tbl = NULL;
 925		long num;
 926
 927		if (!container->enabled)
 928			return -EPERM;
 929
 930		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 931				size);
 932
 933		if (copy_from_user(&param, (void __user *)arg, minsz))
 934			return -EFAULT;
 935
 936		if (param.argsz < minsz)
 937			return -EINVAL;
 938
 939		/* No flag is supported now */
 940		if (param.flags)
 941			return -EINVAL;
 942
 943		ret = tce_iommu_create_default_window(container);
 944		if (ret)
 945			return ret;
 946
 947		num = tce_iommu_find_table(container, param.iova, &tbl);
 948		if (num < 0)
 949			return -ENXIO;
 950
 951		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 952			return -EINVAL;
 953
 954		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
 955				param.size >> tbl->it_page_shift);
 956		if (ret)
 957			return ret;
 958
 959		ret = tce_iommu_clear(container, tbl,
 960				param.iova >> tbl->it_page_shift,
 961				param.size >> tbl->it_page_shift);
 962		iommu_flush_tce(tbl);
 963
 964		return ret;
 965	}
 966	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
 967		struct vfio_iommu_spapr_register_memory param;
 968
 969		if (!container->v2)
 970			break;
 971
 972		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
 973				size);
 974
 975		ret = tce_iommu_mm_set(container);
 976		if (ret)
 977			return ret;
 978
 979		if (copy_from_user(&param, (void __user *)arg, minsz))
 980			return -EFAULT;
 981
 982		if (param.argsz < minsz)
 983			return -EINVAL;
 984
 985		/* No flag is supported now */
 986		if (param.flags)
 987			return -EINVAL;
 988
 989		mutex_lock(&container->lock);
 990		ret = tce_iommu_register_pages(container, param.vaddr,
 991				param.size);
 992		mutex_unlock(&container->lock);
 993
 994		return ret;
 995	}
 996	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
 997		struct vfio_iommu_spapr_register_memory param;
 998
 999		if (!container->v2)
1000			break;
1001
1002		if (!container->mm)
1003			return -EPERM;
1004
1005		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1006				size);
1007
1008		if (copy_from_user(&param, (void __user *)arg, minsz))
1009			return -EFAULT;
1010
1011		if (param.argsz < minsz)
1012			return -EINVAL;
1013
1014		/* No flag is supported now */
1015		if (param.flags)
1016			return -EINVAL;
1017
1018		mutex_lock(&container->lock);
1019		ret = tce_iommu_unregister_pages(container, param.vaddr,
1020				param.size);
1021		mutex_unlock(&container->lock);
1022
1023		return ret;
1024	}
1025	case VFIO_IOMMU_ENABLE:
1026		if (container->v2)
1027			break;
1028
1029		mutex_lock(&container->lock);
1030		ret = tce_iommu_enable(container);
1031		mutex_unlock(&container->lock);
1032		return ret;
1033
1034
1035	case VFIO_IOMMU_DISABLE:
1036		if (container->v2)
1037			break;
1038
1039		mutex_lock(&container->lock);
1040		tce_iommu_disable(container);
1041		mutex_unlock(&container->lock);
1042		return 0;
1043
1044	case VFIO_EEH_PE_OP: {
1045		struct tce_iommu_group *tcegrp;
1046
1047		ret = 0;
1048		list_for_each_entry(tcegrp, &container->group_list, next) {
1049			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
1050					cmd, arg);
1051			if (ret)
1052				return ret;
1053		}
1054		return ret;
1055	}
1056
1057	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1058		struct vfio_iommu_spapr_tce_create create;
1059
1060		if (!container->v2)
1061			break;
1062
1063		ret = tce_iommu_mm_set(container);
1064		if (ret)
1065			return ret;
1066
1067		if (!tce_groups_attached(container))
1068			return -ENXIO;
1069
1070		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1071				start_addr);
1072
1073		if (copy_from_user(&create, (void __user *)arg, minsz))
1074			return -EFAULT;
1075
1076		if (create.argsz < minsz)
1077			return -EINVAL;
1078
1079		if (create.flags)
1080			return -EINVAL;
1081
1082		mutex_lock(&container->lock);
1083
1084		ret = tce_iommu_create_default_window(container);
1085		if (!ret)
1086			ret = tce_iommu_create_window(container,
1087					create.page_shift,
1088					create.window_size, create.levels,
1089					&create.start_addr);
1090
1091		mutex_unlock(&container->lock);
1092
1093		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1094			ret = -EFAULT;
1095
1096		return ret;
1097	}
1098	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1099		struct vfio_iommu_spapr_tce_remove remove;
1100
1101		if (!container->v2)
1102			break;
1103
1104		ret = tce_iommu_mm_set(container);
1105		if (ret)
1106			return ret;
1107
1108		if (!tce_groups_attached(container))
1109			return -ENXIO;
1110
1111		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1112				start_addr);
1113
1114		if (copy_from_user(&remove, (void __user *)arg, minsz))
1115			return -EFAULT;
1116
1117		if (remove.argsz < minsz)
1118			return -EINVAL;
1119
1120		if (remove.flags)
1121			return -EINVAL;
1122
1123		if (container->def_window_pending && !remove.start_addr) {
1124			container->def_window_pending = false;
1125			return 0;
1126		}
1127
1128		mutex_lock(&container->lock);
1129
1130		ret = tce_iommu_remove_window(container, remove.start_addr);
1131
1132		mutex_unlock(&container->lock);
1133
1134		return ret;
1135	}
1136	}
1137
1138	return -ENOTTY;
1139}
1140
1141static void tce_iommu_release_ownership(struct tce_container *container,
1142		struct iommu_table_group *table_group)
1143{
1144	int i;
1145
1146	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1147		struct iommu_table *tbl = container->tables[i];
1148
1149		if (!tbl)
1150			continue;
1151
1152		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
1153		if (tbl->it_map)
1154			iommu_release_ownership(tbl);
1155
1156		container->tables[i] = NULL;
1157	}
1158}
1159
1160static int tce_iommu_take_ownership(struct tce_container *container,
1161		struct iommu_table_group *table_group)
1162{
1163	int i, j, rc = 0;
1164
1165	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1166		struct iommu_table *tbl = table_group->tables[i];
1167
1168		if (!tbl || !tbl->it_map)
1169			continue;
1170
1171		rc = iommu_take_ownership(tbl);
1172		if (rc) {
1173			for (j = 0; j < i; ++j)
1174				iommu_release_ownership(
1175						table_group->tables[j]);
1176
1177			return rc;
1178		}
1179	}
1180
1181	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1182		container->tables[i] = table_group->tables[i];
1183
1184	return 0;
1185}
1186
1187static void tce_iommu_release_ownership_ddw(struct tce_container *container,
1188		struct iommu_table_group *table_group)
1189{
1190	long i;
1191
1192	if (!table_group->ops->unset_window) {
1193		WARN_ON_ONCE(1);
1194		return;
1195	}
1196
1197	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1198		if (container->tables[i])
1199			table_group->ops->unset_window(table_group, i);
1200
1201	table_group->ops->release_ownership(table_group);
1202}
1203
1204static long tce_iommu_take_ownership_ddw(struct tce_container *container,
1205		struct iommu_table_group *table_group)
1206{
1207	long i, ret = 0;
1208
1209	if (!table_group->ops->create_table || !table_group->ops->set_window ||
1210			!table_group->ops->release_ownership) {
1211		WARN_ON_ONCE(1);
1212		return -EFAULT;
1213	}
1214
1215	table_group->ops->take_ownership(table_group);
1216
1217	/* Set all windows to the new group */
1218	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1219		struct iommu_table *tbl = container->tables[i];
1220
1221		if (!tbl)
1222			continue;
1223
1224		ret = table_group->ops->set_window(table_group, i, tbl);
1225		if (ret)
1226			goto release_exit;
1227	}
1228
1229	return 0;
1230
1231release_exit:
1232	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1233		table_group->ops->unset_window(table_group, i);
1234
1235	table_group->ops->release_ownership(table_group);
1236
1237	return ret;
1238}
1239
1240static int tce_iommu_attach_group(void *iommu_data,
1241		struct iommu_group *iommu_group)
1242{
1243	int ret = 0;
1244	struct tce_container *container = iommu_data;
1245	struct iommu_table_group *table_group;
1246	struct tce_iommu_group *tcegrp = NULL;
1247
1248	mutex_lock(&container->lock);
1249
1250	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1251			iommu_group_id(iommu_group), iommu_group); */
1252	table_group = iommu_group_get_iommudata(iommu_group);
1253	if (!table_group) {
1254		ret = -ENODEV;
1255		goto unlock_exit;
1256	}
1257
1258	if (tce_groups_attached(container) && (!table_group->ops ||
1259			!table_group->ops->take_ownership ||
1260			!table_group->ops->release_ownership)) {
1261		ret = -EBUSY;
1262		goto unlock_exit;
1263	}
1264
1265	/* Check if new group has the same iommu_ops (i.e. compatible) */
1266	list_for_each_entry(tcegrp, &container->group_list, next) {
1267		struct iommu_table_group *table_group_tmp;
1268
1269		if (tcegrp->grp == iommu_group) {
1270			pr_warn("tce_vfio: Group %d is already attached\n",
1271					iommu_group_id(iommu_group));
1272			ret = -EBUSY;
1273			goto unlock_exit;
1274		}
1275		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1276		if (table_group_tmp->ops->create_table !=
1277				table_group->ops->create_table) {
1278			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1279					iommu_group_id(iommu_group),
1280					iommu_group_id(tcegrp->grp));
1281			ret = -EPERM;
1282			goto unlock_exit;
1283		}
1284	}
1285
1286	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1287	if (!tcegrp) {
1288		ret = -ENOMEM;
1289		goto unlock_exit;
1290	}
1291
1292	if (!table_group->ops || !table_group->ops->take_ownership ||
1293			!table_group->ops->release_ownership) {
1294		if (container->v2) {
1295			ret = -EPERM;
1296			goto free_exit;
1297		}
1298		ret = tce_iommu_take_ownership(container, table_group);
1299	} else {
1300		if (!container->v2) {
1301			ret = -EPERM;
1302			goto free_exit;
1303		}
1304		ret = tce_iommu_take_ownership_ddw(container, table_group);
1305		if (!tce_groups_attached(container) && !container->tables[0])
1306			container->def_window_pending = true;
1307	}
1308
1309	if (!ret) {
1310		tcegrp->grp = iommu_group;
1311		list_add(&tcegrp->next, &container->group_list);
1312	}
1313
1314free_exit:
1315	if (ret && tcegrp)
1316		kfree(tcegrp);
1317
1318unlock_exit:
1319	mutex_unlock(&container->lock);
1320
1321	return ret;
1322}
1323
1324static void tce_iommu_detach_group(void *iommu_data,
1325		struct iommu_group *iommu_group)
1326{
1327	struct tce_container *container = iommu_data;
1328	struct iommu_table_group *table_group;
1329	bool found = false;
1330	struct tce_iommu_group *tcegrp;
1331
1332	mutex_lock(&container->lock);
1333
1334	list_for_each_entry(tcegrp, &container->group_list, next) {
1335		if (tcegrp->grp == iommu_group) {
1336			found = true;
1337			break;
1338		}
1339	}
1340
1341	if (!found) {
1342		pr_warn("tce_vfio: detaching unattached group #%u\n",
1343				iommu_group_id(iommu_group));
1344		goto unlock_exit;
1345	}
1346
1347	list_del(&tcegrp->next);
1348	kfree(tcegrp);
1349
1350	table_group = iommu_group_get_iommudata(iommu_group);
1351	BUG_ON(!table_group);
1352
1353	if (!table_group->ops || !table_group->ops->release_ownership)
1354		tce_iommu_release_ownership(container, table_group);
1355	else
1356		tce_iommu_release_ownership_ddw(container, table_group);
1357
1358unlock_exit:
1359	mutex_unlock(&container->lock);
1360}
1361
1362static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1363	.name		= "iommu-vfio-powerpc",
1364	.owner		= THIS_MODULE,
1365	.open		= tce_iommu_open,
1366	.release	= tce_iommu_release,
1367	.ioctl		= tce_iommu_ioctl,
1368	.attach_group	= tce_iommu_attach_group,
1369	.detach_group	= tce_iommu_detach_group,
1370};
1371
1372static int __init tce_iommu_init(void)
1373{
1374	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1375}
1376
1377static void __exit tce_iommu_cleanup(void)
1378{
1379	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1380}
1381
1382module_init(tce_iommu_init);
1383module_exit(tce_iommu_cleanup);
1384
1385MODULE_VERSION(DRIVER_VERSION);
1386MODULE_LICENSE("GPL v2");
1387MODULE_AUTHOR(DRIVER_AUTHOR);
1388MODULE_DESCRIPTION(DRIVER_DESC);
1389
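For reference, here is a minimal userspace sketch of how the ioctls handled by tce_iommu_ioctl() in the v5.4 listing above are typically exercised with the v2 backend. The group path /dev/vfio/26, the 1 MiB buffer size and the omission of error handling and group-viability checks are illustrative assumptions, not taken from the kernel tree; only the ioctl names and structures come from this driver and the VFIO uAPI headers.

/*
 * Minimal userspace sketch (not kernel code): drives the SPAPR TCE v2
 * container interface implemented above. Error handling, VFIO_GROUP_GET_STATUS
 * checks and the group number are omitted or assumed for brevity.
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

int main(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR);	/* hypothetical group */
	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	struct vfio_iommu_spapr_register_memory reg = { .argsz = sizeof(reg) };
	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
	void *buf;

	/* Attach the group, then select the backend; this ends up in
	 * tce_iommu_attach_group() and tce_iommu_open(). */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);

	/* Query the default 32-bit DMA window (VFIO_IOMMU_SPAPR_TCE_GET_INFO). */
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);

	/* The v2 flow preregisters the backing memory
	 * (tce_iommu_register_pages()), which also enables the container. */
	buf = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
	reg.vaddr = (__u64)(unsigned long)buf;
	reg.size = 1 << 20;
	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);

	/* Map the buffer at the start of the default window; the first
	 * VFIO_IOMMU_MAP_DMA also triggers tce_iommu_create_default_window(). */
	map.vaddr = reg.vaddr;
	map.size = reg.size;
	map.iova = info.dma32_window_start;
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

	return 0;
}

With the v1 backend (VFIO_SPAPR_TCE_IOMMU) the preregistration step is not available; instead VFIO_IOMMU_ENABLE accounts the whole 32-bit window as locked memory (tce_iommu_enable()) before any VFIO_IOMMU_MAP_DMA.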
v4.10.11
 
   1/*
   2 * VFIO: IOMMU DMA mapping support for TCE on POWER
   3 *
   4 * Copyright (C) 2013 IBM Corp.  All rights reserved.
   5 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
   6 *
   7 * This program is free software; you can redistribute it and/or modify
   8 * it under the terms of the GNU General Public License version 2 as
   9 * published by the Free Software Foundation.
  10 *
  11 * Derived from original vfio_iommu_type1.c:
  12 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  13 *     Author: Alex Williamson <alex.williamson@redhat.com>
  14 */
  15
  16#include <linux/module.h>
  17#include <linux/pci.h>
  18#include <linux/slab.h>
  19#include <linux/uaccess.h>
  20#include <linux/err.h>
  21#include <linux/vfio.h>
  22#include <linux/vmalloc.h>
  23#include <asm/iommu.h>
  24#include <asm/tce.h>
  25#include <asm/mmu_context.h>
  26
  27#define DRIVER_VERSION  "0.1"
  28#define DRIVER_AUTHOR   "aik@ozlabs.ru"
  29#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
  30
  31static void tce_iommu_detach_group(void *iommu_data,
  32		struct iommu_group *iommu_group);
  33
  34static long try_increment_locked_vm(struct mm_struct *mm, long npages)
  35{
  36	long ret = 0, locked, lock_limit;
  37
  38	if (WARN_ON_ONCE(!mm))
  39		return -EPERM;
  40
  41	if (!npages)
  42		return 0;
  43
  44	down_write(&mm->mmap_sem);
  45	locked = mm->locked_vm + npages;
  46	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
  47	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
  48		ret = -ENOMEM;
  49	else
  50		mm->locked_vm += npages;
  51
  52	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
  53			npages << PAGE_SHIFT,
  54			mm->locked_vm << PAGE_SHIFT,
  55			rlimit(RLIMIT_MEMLOCK),
  56			ret ? " - exceeded" : "");
  57
  58	up_write(&mm->mmap_sem);
  59
  60	return ret;
  61}
  62
  63static void decrement_locked_vm(struct mm_struct *mm, long npages)
  64{
  65	if (!mm || !npages)
  66		return;
  67
  68	down_write(&mm->mmap_sem);
  69	if (WARN_ON_ONCE(npages > mm->locked_vm))
  70		npages = mm->locked_vm;
  71	mm->locked_vm -= npages;
  72	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
  73			npages << PAGE_SHIFT,
  74			mm->locked_vm << PAGE_SHIFT,
  75			rlimit(RLIMIT_MEMLOCK));
  76	up_write(&mm->mmap_sem);
  77}
  78
  79/*
  80 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  81 *
  82 * This code handles mapping and unmapping of user data buffers
  83 * into DMA'ble space using the IOMMU
  84 */
  85
  86struct tce_iommu_group {
  87	struct list_head next;
  88	struct iommu_group *grp;
  89};
  90
  91/*
  92 * A container needs to remember which preregistered region  it has
  93 * referenced to do proper cleanup at the userspace process exit.
  94 */
  95struct tce_iommu_prereg {
  96	struct list_head next;
  97	struct mm_iommu_table_group_mem_t *mem;
  98};
  99
 100/*
 101 * The container descriptor supports only a single group per container.
 102 * Required by the API as the container is not supplied with the IOMMU group
 103 * at the moment of initialization.
 104 */
 105struct tce_container {
 106	struct mutex lock;
 107	bool enabled;
 108	bool v2;
 109	bool def_window_pending;
 110	unsigned long locked_pages;
 111	struct mm_struct *mm;
 112	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
 113	struct list_head group_list;
 114	struct list_head prereg_list;
 115};
 116
 117static long tce_iommu_mm_set(struct tce_container *container)
 118{
 119	if (container->mm) {
 120		if (container->mm == current->mm)
 121			return 0;
 122		return -EPERM;
 123	}
 124	BUG_ON(!current->mm);
 125	container->mm = current->mm;
 126	atomic_inc(&container->mm->mm_count);
 127
 128	return 0;
 129}
 130
 131static long tce_iommu_prereg_free(struct tce_container *container,
 132		struct tce_iommu_prereg *tcemem)
 133{
 134	long ret;
 135
 136	ret = mm_iommu_put(container->mm, tcemem->mem);
 137	if (ret)
 138		return ret;
 139
 140	list_del(&tcemem->next);
 141	kfree(tcemem);
 142
 143	return 0;
 144}
 145
 146static long tce_iommu_unregister_pages(struct tce_container *container,
 147		__u64 vaddr, __u64 size)
 148{
 149	struct mm_iommu_table_group_mem_t *mem;
 150	struct tce_iommu_prereg *tcemem;
 151	bool found = false;
 152
 153	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 154		return -EINVAL;
 155
 156	mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
 157	if (!mem)
 158		return -ENOENT;
 159
 160	list_for_each_entry(tcemem, &container->prereg_list, next) {
 161		if (tcemem->mem == mem) {
 162			found = true;
 163			break;
 164		}
 165	}
 166
 167	if (!found)
 168		return -ENOENT;
 169
 170	return tce_iommu_prereg_free(container, tcemem);
 171}
 172
 173static long tce_iommu_register_pages(struct tce_container *container,
 174		__u64 vaddr, __u64 size)
 175{
 176	long ret = 0;
 177	struct mm_iommu_table_group_mem_t *mem = NULL;
 178	struct tce_iommu_prereg *tcemem;
 179	unsigned long entries = size >> PAGE_SHIFT;
 180
 181	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 182			((vaddr + size) < vaddr))
 183		return -EINVAL;
 184
 185	mem = mm_iommu_find(container->mm, vaddr, entries);
 186	if (mem) {
 187		list_for_each_entry(tcemem, &container->prereg_list, next) {
 188			if (tcemem->mem == mem)
 189				return -EBUSY;
 190		}
 191	}
 192
 193	ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
 194	if (ret)
 195		return ret;
 196
 197	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 198	tcemem->mem = mem;
 199	list_add(&tcemem->next, &container->prereg_list);
 200
 201	container->enabled = true;
 202
 203	return 0;
 204}
 205
 206static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
 207		struct mm_struct *mm)
 208{
 209	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
 210			tbl->it_size, PAGE_SIZE);
 211	unsigned long *uas;
 212	long ret;
 213
 214	BUG_ON(tbl->it_userspace);
 215
 216	ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT);
 217	if (ret)
 218		return ret;
 219
 220	uas = vzalloc(cb);
 221	if (!uas) {
 222		decrement_locked_vm(mm, cb >> PAGE_SHIFT);
 223		return -ENOMEM;
 224	}
 225	tbl->it_userspace = uas;
 226
 227	return 0;
 228}
 229
 230static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
 231		struct mm_struct *mm)
 232{
 233	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
 234			tbl->it_size, PAGE_SIZE);
 235
 236	if (!tbl->it_userspace)
 237		return;
 238
 239	vfree(tbl->it_userspace);
 240	tbl->it_userspace = NULL;
 241	decrement_locked_vm(mm, cb >> PAGE_SHIFT);
 242}
 243
 244static bool tce_page_is_contained(struct page *page, unsigned page_shift)
 245{
 246	/*
 247	 * Check that the TCE table granularity is not bigger than the size of
 248	 * a page we just found. Otherwise the hardware can get access to
  249	 * a bigger memory chunk than it should.
 250	 */
 251	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
 252}
 253
 254static inline bool tce_groups_attached(struct tce_container *container)
 255{
 256	return !list_empty(&container->group_list);
 257}
 258
 259static long tce_iommu_find_table(struct tce_container *container,
 260		phys_addr_t ioba, struct iommu_table **ptbl)
 261{
 262	long i;
 263
 264	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 265		struct iommu_table *tbl = container->tables[i];
 266
 267		if (tbl) {
 268			unsigned long entry = ioba >> tbl->it_page_shift;
 269			unsigned long start = tbl->it_offset;
 270			unsigned long end = start + tbl->it_size;
 271
 272			if ((start <= entry) && (entry < end)) {
 273				*ptbl = tbl;
 274				return i;
 275			}
 276		}
 277	}
 278
 279	return -1;
 280}
 281
 282static int tce_iommu_find_free_table(struct tce_container *container)
 283{
 284	int i;
 285
 286	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 287		if (!container->tables[i])
 288			return i;
 289	}
 290
 291	return -ENOSPC;
 292}
 293
 294static int tce_iommu_enable(struct tce_container *container)
 295{
 296	int ret = 0;
 297	unsigned long locked;
 298	struct iommu_table_group *table_group;
 299	struct tce_iommu_group *tcegrp;
 300
 301	if (container->enabled)
 302		return -EBUSY;
 303
 304	/*
 305	 * When userspace pages are mapped into the IOMMU, they are effectively
 306	 * locked memory, so, theoretically, we need to update the accounting
 307	 * of locked pages on each map and unmap.  For powerpc, the map unmap
 308	 * paths can be very hot, though, and the accounting would kill
 309	 * performance, especially since it would be difficult to impossible
 310	 * to handle the accounting in real mode only.
 311	 *
 312	 * To address that, rather than precisely accounting every page, we
 313	 * instead account for a worst case on locked memory when the iommu is
 314	 * enabled and disabled.  The worst case upper bound on locked memory
 315	 * is the size of the whole iommu window, which is usually relatively
 316	 * small (compared to total memory sizes) on POWER hardware.
 317	 *
 318	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 319	 * that would effectively kill the guest at random points, much better
 320	 * enforcing the limit based on the max that the guest can map.
 321	 *
 322	 * Unfortunately at the moment it counts whole tables, no matter how
 323	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
 324	 * each with 2GB DMA window, 8GB will be counted here. The reason for
 325	 * this is that we cannot tell here the amount of RAM used by the guest
 326	 * as this information is only available from KVM and VFIO is
 327	 * KVM agnostic.
 328	 *
 329	 * So we do not allow enabling a container without a group attached
 330	 * as there is no way to know how much we should increment
 331	 * the locked_vm counter.
 332	 */
 333	if (!tce_groups_attached(container))
 334		return -ENODEV;
 335
 336	tcegrp = list_first_entry(&container->group_list,
 337			struct tce_iommu_group, next);
 338	table_group = iommu_group_get_iommudata(tcegrp->grp);
 339	if (!table_group)
 340		return -ENODEV;
 341
 342	if (!table_group->tce32_size)
 343		return -EPERM;
 344
 345	ret = tce_iommu_mm_set(container);
 346	if (ret)
 347		return ret;
 348
 349	locked = table_group->tce32_size >> PAGE_SHIFT;
 350	ret = try_increment_locked_vm(container->mm, locked);
 351	if (ret)
 352		return ret;
 353
 354	container->locked_pages = locked;
 355
 356	container->enabled = true;
 357
 358	return ret;
 359}
 360
 361static void tce_iommu_disable(struct tce_container *container)
 362{
 363	if (!container->enabled)
 364		return;
 365
 366	container->enabled = false;
 367
 368	BUG_ON(!container->mm);
 369	decrement_locked_vm(container->mm, container->locked_pages);
 370}
 371
 372static void *tce_iommu_open(unsigned long arg)
 373{
 374	struct tce_container *container;
 375
 376	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 377		pr_err("tce_vfio: Wrong IOMMU type\n");
 378		return ERR_PTR(-EINVAL);
 379	}
 380
 381	container = kzalloc(sizeof(*container), GFP_KERNEL);
 382	if (!container)
 383		return ERR_PTR(-ENOMEM);
 384
 385	mutex_init(&container->lock);
 386	INIT_LIST_HEAD_RCU(&container->group_list);
 387	INIT_LIST_HEAD_RCU(&container->prereg_list);
 388
 389	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 390
 391	return container;
 392}
 393
 394static int tce_iommu_clear(struct tce_container *container,
 395		struct iommu_table *tbl,
 396		unsigned long entry, unsigned long pages);
 397static void tce_iommu_free_table(struct tce_container *container,
 398		struct iommu_table *tbl);
 399
 400static void tce_iommu_release(void *iommu_data)
 401{
 402	struct tce_container *container = iommu_data;
 403	struct tce_iommu_group *tcegrp;
 404	long i;
 405
 406	while (tce_groups_attached(container)) {
 407		tcegrp = list_first_entry(&container->group_list,
 408				struct tce_iommu_group, next);
 409		tce_iommu_detach_group(iommu_data, tcegrp->grp);
 410	}
 411
 412	/*
 413	 * If VFIO created a table, it was not disposed
 414	 * by tce_iommu_detach_group() so do it now.
 415	 */
 416	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 417		struct iommu_table *tbl = container->tables[i];
 418
 419		if (!tbl)
 420			continue;
 421
 422		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 423		tce_iommu_free_table(container, tbl);
 424	}
 425
 426	while (!list_empty(&container->prereg_list)) {
 427		struct tce_iommu_prereg *tcemem;
 428
 429		tcemem = list_first_entry(&container->prereg_list,
 430				struct tce_iommu_prereg, next);
 431		WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
 432	}
 433
 434	tce_iommu_disable(container);
 435	if (container->mm)
 436		mmdrop(container->mm);
 437	mutex_destroy(&container->lock);
 438
 439	kfree(container);
 440}
 441
 442static void tce_iommu_unuse_page(struct tce_container *container,
 443		unsigned long hpa)
 444{
 445	struct page *page;
 446
 447	page = pfn_to_page(hpa >> PAGE_SHIFT);
 448	put_page(page);
 449}
 450
 451static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
 452		unsigned long tce, unsigned long size,
 453		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 454{
 455	long ret = 0;
 456	struct mm_iommu_table_group_mem_t *mem;
 457
 458	mem = mm_iommu_lookup(container->mm, tce, size);
 459	if (!mem)
 460		return -EINVAL;
 461
 462	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
 463	if (ret)
 464		return -EINVAL;
 465
 466	*pmem = mem;
 467
 468	return 0;
 469}
 470
 471static void tce_iommu_unuse_page_v2(struct tce_container *container,
 472		struct iommu_table *tbl, unsigned long entry)
 473{
 474	struct mm_iommu_table_group_mem_t *mem = NULL;
 475	int ret;
 476	unsigned long hpa = 0;
 477	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
 478
 479	if (!pua)
 480		return;
 481
 482	ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl),
 483			&hpa, &mem);
 484	if (ret)
 485		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
 486				__func__, *pua, entry, ret);
 487	if (mem)
 488		mm_iommu_mapped_dec(mem);
 489
 490	*pua = 0;
 491}
 492
 493static int tce_iommu_clear(struct tce_container *container,
 494		struct iommu_table *tbl,
 495		unsigned long entry, unsigned long pages)
 496{
 497	unsigned long oldhpa;
 498	long ret;
 499	enum dma_data_direction direction;
 500
 501	for ( ; pages; --pages, ++entry) {
 502		direction = DMA_NONE;
 503		oldhpa = 0;
 504		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
 505		if (ret)
 506			continue;
 507
 508		if (direction == DMA_NONE)
 509			continue;
 510
 511		if (container->v2) {
 512			tce_iommu_unuse_page_v2(container, tbl, entry);
 513			continue;
 514		}
 515
 516		tce_iommu_unuse_page(container, oldhpa);
 517	}
 518
 519	return 0;
 520}
 521
 522static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 523{
 524	struct page *page = NULL;
 525	enum dma_data_direction direction = iommu_tce_direction(tce);
 526
 527	if (get_user_pages_fast(tce & PAGE_MASK, 1,
 528			direction != DMA_TO_DEVICE, &page) != 1)
 529		return -EFAULT;
 530
 531	*hpa = __pa((unsigned long) page_address(page));
 532
 533	return 0;
 534}
 535
 536static long tce_iommu_build(struct tce_container *container,
 537		struct iommu_table *tbl,
 538		unsigned long entry, unsigned long tce, unsigned long pages,
 539		enum dma_data_direction direction)
 540{
 541	long i, ret = 0;
 542	struct page *page;
 543	unsigned long hpa;
 544	enum dma_data_direction dirtmp;
 545
 546	for (i = 0; i < pages; ++i) {
 547		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 548
 549		ret = tce_iommu_use_page(tce, &hpa);
 550		if (ret)
 551			break;
 552
 553		page = pfn_to_page(hpa >> PAGE_SHIFT);
 554		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
 555			ret = -EPERM;
 556			break;
 557		}
 558
 559		hpa |= offset;
 560		dirtmp = direction;
 561		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
 562		if (ret) {
 563			tce_iommu_unuse_page(container, hpa);
 564			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 565					__func__, entry << tbl->it_page_shift,
 566					tce, ret);
 567			break;
 568		}
 569
 570		if (dirtmp != DMA_NONE)
 571			tce_iommu_unuse_page(container, hpa);
 572
 573		tce += IOMMU_PAGE_SIZE(tbl);
 574	}
 575
 576	if (ret)
 577		tce_iommu_clear(container, tbl, entry, i);
 578
 579	return ret;
 580}
 581
 582static long tce_iommu_build_v2(struct tce_container *container,
 583		struct iommu_table *tbl,
 584		unsigned long entry, unsigned long tce, unsigned long pages,
 585		enum dma_data_direction direction)
 586{
 587	long i, ret = 0;
 588	struct page *page;
 589	unsigned long hpa;
 590	enum dma_data_direction dirtmp;
 591
 592	if (!tbl->it_userspace) {
 593		ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
 594		if (ret)
 595			return ret;
 596	}
 597
 598	for (i = 0; i < pages; ++i) {
 599		struct mm_iommu_table_group_mem_t *mem = NULL;
 600		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
 601				entry + i);
 602
 603		ret = tce_iommu_prereg_ua_to_hpa(container,
 604				tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
 605		if (ret)
 606			break;
 607
 608		page = pfn_to_page(hpa >> PAGE_SHIFT);
 609		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
 610			ret = -EPERM;
 611			break;
 612		}
 613
 614		/* Preserve offset within IOMMU page */
 615		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 616		dirtmp = direction;
 617
 618		/* The registered region is being unregistered */
 619		if (mm_iommu_mapped_inc(mem))
 620			break;
 621
 622		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
 623		if (ret) {
 624			/* dirtmp cannot be DMA_NONE here */
 625			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 626			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 627					__func__, entry << tbl->it_page_shift,
 628					tce, ret);
 629			break;
 630		}
 631
 632		if (dirtmp != DMA_NONE)
 633			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 634
 635		*pua = tce;
 636
 637		tce += IOMMU_PAGE_SIZE(tbl);
 638	}
 639
 640	if (ret)
 641		tce_iommu_clear(container, tbl, entry, i);
 642
 643	return ret;
 644}
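
/*
 * Note on the v2 path above: every successfully programmed entry
 * caches the userspace address in tbl->it_userspace (*pua = tce) and
 * holds a reference on the preregistered region taken via
 * mm_iommu_mapped_inc(), so tce_iommu_unuse_page_v2() can look the
 * region up again and drop that reference when the entry is cleared
 * or replaced.
 */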
 645
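/*
 * Allocate a hardware TCE table through the platform callbacks. The
 * memory backing the table itself is charged against the owning mm's
 * locked_vm before the allocation and is given back by
 * tce_iommu_free_table().
 */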
 646static long tce_iommu_create_table(struct tce_container *container,
 647			struct iommu_table_group *table_group,
 648			int num,
 649			__u32 page_shift,
 650			__u64 window_size,
 651			__u32 levels,
 652			struct iommu_table **ptbl)
 653{
 654	long ret, table_size;
 655
 656	table_size = table_group->ops->get_table_size(page_shift, window_size,
 657			levels);
 658	if (!table_size)
 659		return -EINVAL;
 660
 661	ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
 662	if (ret)
 663		return ret;
 664
 665	ret = table_group->ops->create_table(table_group, num,
 666			page_shift, window_size, levels, ptbl);
 667
 668	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 669	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
 670
 671	return ret;
 672}
 673
 674static void tce_iommu_free_table(struct tce_container *container,
 675		struct iommu_table *tbl)
 676{
 677	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 678
 679	tce_iommu_userspace_view_free(tbl, container->mm);
 680	tbl->it_ops->free(tbl);
 681	decrement_locked_vm(container->mm, pages);
 682}
 683
 684static long tce_iommu_create_window(struct tce_container *container,
 685		__u32 page_shift, __u64 window_size, __u32 levels,
 686		__u64 *start_addr)
 687{
 688	struct tce_iommu_group *tcegrp;
 689	struct iommu_table_group *table_group;
 690	struct iommu_table *tbl = NULL;
 691	long ret, num;
 692
 693	num = tce_iommu_find_free_table(container);
 694	if (num < 0)
 695		return num;
 696
 697	/* Get the first group for ops::create_table */
 698	tcegrp = list_first_entry(&container->group_list,
 699			struct tce_iommu_group, next);
 700	table_group = iommu_group_get_iommudata(tcegrp->grp);
 701	if (!table_group)
 702		return -EFAULT;
 703
 704	if (!(table_group->pgsizes & (1ULL << page_shift)))
 705		return -EINVAL;
 706
 707	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
 708			!table_group->ops->get_table_size ||
 709			!table_group->ops->create_table)
 710		return -EPERM;
 711
 712	/* Create TCE table */
 713	ret = tce_iommu_create_table(container, table_group, num,
 714			page_shift, window_size, levels, &tbl);
 715	if (ret)
 716		return ret;
 717
 718	BUG_ON(!tbl->it_ops->free);
 719
 720	/*
  721	 * Program the table into every group.
  722	 * Groups were tested for compatibility at attach time.
 723	 */
 724	list_for_each_entry(tcegrp, &container->group_list, next) {
 725		table_group = iommu_group_get_iommudata(tcegrp->grp);
 726
 727		ret = table_group->ops->set_window(table_group, num, tbl);
 728		if (ret)
 729			goto unset_exit;
 730	}
 731
 732	container->tables[num] = tbl;
 733
 734	/* Return start address assigned by platform in create_table() */
 735	*start_addr = tbl->it_offset << tbl->it_page_shift;
 736
 737	return 0;
 738
 739unset_exit:
 740	list_for_each_entry(tcegrp, &container->group_list, next) {
 741		table_group = iommu_group_get_iommudata(tcegrp->grp);
 742		table_group->ops->unset_window(table_group, num);
 743	}
 744	tce_iommu_free_table(container, tbl);
 745
 746	return ret;
 747}
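
/*
 * Illustrative userspace sketch (not part of this driver), assuming a
 * container fd already set to VFIO_SPAPR_TCE_v2_IOMMU with a group
 * attached: request a 1GB window backed by 64K IOMMU pages and read
 * back the bus address chosen by the platform.
 *
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,
 *		.window_size = 1ULL << 30,
 *		.levels = 1,
 *	};
 *
 *	if (!ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
 *		printf("new window at 0x%llx\n", create.start_addr);
 */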
 748
 749static long tce_iommu_remove_window(struct tce_container *container,
 750		__u64 start_addr)
 751{
 752	struct iommu_table_group *table_group = NULL;
 753	struct iommu_table *tbl;
 754	struct tce_iommu_group *tcegrp;
 755	int num;
 756
 757	num = tce_iommu_find_table(container, start_addr, &tbl);
 758	if (num < 0)
 759		return -EINVAL;
 760
 761	BUG_ON(!tbl->it_size);
 762
 763	/* Detach groups from IOMMUs */
 764	list_for_each_entry(tcegrp, &container->group_list, next) {
 765		table_group = iommu_group_get_iommudata(tcegrp->grp);
 766
 767		/*
 768		 * SPAPR TCE IOMMU exposes the default DMA window to
 769		 * the guest via dma32_window_start/size of
 770		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
  771		 * userspace to remove this window, some do not, so
  772		 * check for the platform capability here.
 773		 */
 774		if (!table_group->ops || !table_group->ops->unset_window)
 775			return -EPERM;
 776
 777		table_group->ops->unset_window(table_group, num);
 778	}
 779
 780	/* Free table */
 781	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 782	tce_iommu_free_table(container, tbl);
 783	container->tables[num] = NULL;
 784
 785	return 0;
 786}
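
/*
 * Illustrative userspace sketch (not part of this driver): removing
 * the default 32-bit window, e.g. to make room for a larger dynamic
 * window. The bus address comes from the dma32_window_start reported
 * by VFIO_IOMMU_SPAPR_TCE_GET_INFO (the "info" below is assumed):
 *
 *	struct vfio_iommu_spapr_tce_remove remove = {
 *		.argsz = sizeof(remove),
 *		.start_addr = info.dma32_window_start,
 *	};
 *
 *	ioctl(container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
 */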
 787
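/*
 * The default 32-bit window is not built at group attach time but
 * lazily, on the first mapping or window creation: def_window_pending
 * records that it is still owed. This lets userspace that removes the
 * default window straight away (see VFIO_IOMMU_SPAPR_TCE_REMOVE with
 * start_addr == 0 below) avoid the cost of creating it at all.
 */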
 788static long tce_iommu_create_default_window(struct tce_container *container)
 789{
 790	long ret;
 791	__u64 start_addr = 0;
 792	struct tce_iommu_group *tcegrp;
 793	struct iommu_table_group *table_group;
 794
 795	if (!container->def_window_pending)
 796		return 0;
 797
 798	if (!tce_groups_attached(container))
 799		return -ENODEV;
 800
 801	tcegrp = list_first_entry(&container->group_list,
 802			struct tce_iommu_group, next);
 803	table_group = iommu_group_get_iommudata(tcegrp->grp);
 804	if (!table_group)
 805		return -ENODEV;
 806
 807	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
 808			table_group->tce32_size, 1, &start_addr);
 809	WARN_ON_ONCE(!ret && start_addr);
 810
 811	if (!ret)
 812		container->def_window_pending = false;
 813
 814	return ret;
 815}
 816
 817static long tce_iommu_ioctl(void *iommu_data,
 818				 unsigned int cmd, unsigned long arg)
 819{
 820	struct tce_container *container = iommu_data;
 821	unsigned long minsz, ddwsz;
 822	long ret;
 823
 824	switch (cmd) {
 825	case VFIO_CHECK_EXTENSION:
 826		switch (arg) {
 827		case VFIO_SPAPR_TCE_IOMMU:
 828		case VFIO_SPAPR_TCE_v2_IOMMU:
 829			ret = 1;
 830			break;
 831		default:
 832			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
 833			break;
 834		}
 835
 836		return (ret < 0) ? 0 : ret;
 837	}
 838
 839	/*
  840	 * Sanity check to prevent one userspace process from
  841	 * manipulating another process's mm.
 842	 */
 843	BUG_ON(!container);
 844	if (container->mm && container->mm != current->mm)
 845		return -EPERM;
 846
 847	switch (cmd) {
 848	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 849		struct vfio_iommu_spapr_tce_info info;
 850		struct tce_iommu_group *tcegrp;
 851		struct iommu_table_group *table_group;
 852
 853		if (!tce_groups_attached(container))
 854			return -ENXIO;
 855
 856		tcegrp = list_first_entry(&container->group_list,
 857				struct tce_iommu_group, next);
 858		table_group = iommu_group_get_iommudata(tcegrp->grp);
 859
 860		if (!table_group)
 861			return -ENXIO;
 862
 863		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
 864				dma32_window_size);
 865
 866		if (copy_from_user(&info, (void __user *)arg, minsz))
 867			return -EFAULT;
 868
 869		if (info.argsz < minsz)
 870			return -EINVAL;
 871
 872		info.dma32_window_start = table_group->tce32_start;
 873		info.dma32_window_size = table_group->tce32_size;
 874		info.flags = 0;
 875		memset(&info.ddw, 0, sizeof(info.ddw));
 876
 877		if (table_group->max_dynamic_windows_supported &&
 878				container->v2) {
 879			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
 880			info.ddw.pgsizes = table_group->pgsizes;
 881			info.ddw.max_dynamic_windows_supported =
 882				table_group->max_dynamic_windows_supported;
 883			info.ddw.levels = table_group->max_levels;
 884		}
 885
 886		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
 887
 888		if (info.argsz >= ddwsz)
 889			minsz = ddwsz;
 890
 891		if (copy_to_user((void __user *)arg, &info, minsz))
 892			return -EFAULT;
 893
 894		return 0;
 895	}
 896	case VFIO_IOMMU_MAP_DMA: {
 897		struct vfio_iommu_type1_dma_map param;
 898		struct iommu_table *tbl = NULL;
 899		long num;
 900		enum dma_data_direction direction;
 901
 902		if (!container->enabled)
 903			return -EPERM;
 904
 905		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 906
 907		if (copy_from_user(&param, (void __user *)arg, minsz))
 908			return -EFAULT;
 909
 910		if (param.argsz < minsz)
 911			return -EINVAL;
 912
 913		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
 914				VFIO_DMA_MAP_FLAG_WRITE))
 915			return -EINVAL;
 916
 917		ret = tce_iommu_create_default_window(container);
 918		if (ret)
 919			return ret;
 920
 921		num = tce_iommu_find_table(container, param.iova, &tbl);
 922		if (num < 0)
 923			return -ENXIO;
 924
 925		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 926				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 927			return -EINVAL;
 928
 929		/* iova is checked by the IOMMU API */
 930		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
 931			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 932				direction = DMA_BIDIRECTIONAL;
 933			else
 934				direction = DMA_TO_DEVICE;
 935		} else {
 936			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 937				direction = DMA_FROM_DEVICE;
 938			else
 939				return -EINVAL;
 940		}
 941
 942		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 943		if (ret)
 944			return ret;
 945
 946		if (container->v2)
 947			ret = tce_iommu_build_v2(container, tbl,
 948					param.iova >> tbl->it_page_shift,
 949					param.vaddr,
 950					param.size >> tbl->it_page_shift,
 951					direction);
 952		else
 953			ret = tce_iommu_build(container, tbl,
 954					param.iova >> tbl->it_page_shift,
 955					param.vaddr,
 956					param.size >> tbl->it_page_shift,
 957					direction);
 958
 959		iommu_flush_tce(tbl);
 960
 961		return ret;
 962	}
 963	case VFIO_IOMMU_UNMAP_DMA: {
 964		struct vfio_iommu_type1_dma_unmap param;
 965		struct iommu_table *tbl = NULL;
 966		long num;
 967
 968		if (!container->enabled)
 969			return -EPERM;
 970
 971		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 972				size);
 973
 974		if (copy_from_user(&param, (void __user *)arg, minsz))
 975			return -EFAULT;
 976
 977		if (param.argsz < minsz)
 978			return -EINVAL;
 979
  980		/* No flags are currently supported */
 981		if (param.flags)
 982			return -EINVAL;
 983
 984		ret = tce_iommu_create_default_window(container);
 985		if (ret)
 986			return ret;
 987
 988		num = tce_iommu_find_table(container, param.iova, &tbl);
 989		if (num < 0)
 990			return -ENXIO;
 991
 992		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 993			return -EINVAL;
 994
 995		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
 996				param.size >> tbl->it_page_shift);
 997		if (ret)
 998			return ret;
 999
1000		ret = tce_iommu_clear(container, tbl,
1001				param.iova >> tbl->it_page_shift,
1002				param.size >> tbl->it_page_shift);
1003		iommu_flush_tce(tbl);
1004
1005		return ret;
1006	}
1007	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
1008		struct vfio_iommu_spapr_register_memory param;
1009
1010		if (!container->v2)
1011			break;
1012
1013		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1014				size);
1015
1016		ret = tce_iommu_mm_set(container);
1017		if (ret)
1018			return ret;
1019
1020		if (copy_from_user(&param, (void __user *)arg, minsz))
1021			return -EFAULT;
1022
1023		if (param.argsz < minsz)
1024			return -EINVAL;
1025
 1026		/* No flags are currently supported */
1027		if (param.flags)
1028			return -EINVAL;
1029
1030		mutex_lock(&container->lock);
1031		ret = tce_iommu_register_pages(container, param.vaddr,
1032				param.size);
1033		mutex_unlock(&container->lock);
1034
1035		return ret;
1036	}
1037	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
1038		struct vfio_iommu_spapr_register_memory param;
1039
1040		if (!container->v2)
1041			break;
1042
1043		if (!container->mm)
1044			return -EPERM;
1045
1046		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1047				size);
1048
1049		if (copy_from_user(&param, (void __user *)arg, minsz))
1050			return -EFAULT;
1051
1052		if (param.argsz < minsz)
1053			return -EINVAL;
1054
1055		/* No flag is supported now */
1056		if (param.flags)
1057			return -EINVAL;
1058
1059		mutex_lock(&container->lock);
1060		ret = tce_iommu_unregister_pages(container, param.vaddr,
1061				param.size);
1062		mutex_unlock(&container->lock);
1063
1064		return ret;
1065	}
1066	case VFIO_IOMMU_ENABLE:
1067		if (container->v2)
1068			break;
1069
1070		mutex_lock(&container->lock);
1071		ret = tce_iommu_enable(container);
1072		mutex_unlock(&container->lock);
1073		return ret;
1074
1075
1076	case VFIO_IOMMU_DISABLE:
1077		if (container->v2)
1078			break;
1079
1080		mutex_lock(&container->lock);
1081		tce_iommu_disable(container);
1082		mutex_unlock(&container->lock);
1083		return 0;
1084
1085	case VFIO_EEH_PE_OP: {
1086		struct tce_iommu_group *tcegrp;
1087
1088		ret = 0;
1089		list_for_each_entry(tcegrp, &container->group_list, next) {
1090			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
1091					cmd, arg);
1092			if (ret)
1093				return ret;
1094		}
1095		return ret;
1096	}
1097
1098	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1099		struct vfio_iommu_spapr_tce_create create;
1100
1101		if (!container->v2)
1102			break;
1103
1104		ret = tce_iommu_mm_set(container);
1105		if (ret)
1106			return ret;
1107
1108		if (!tce_groups_attached(container))
1109			return -ENXIO;
1110
1111		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1112				start_addr);
1113
1114		if (copy_from_user(&create, (void __user *)arg, minsz))
1115			return -EFAULT;
1116
1117		if (create.argsz < minsz)
1118			return -EINVAL;
1119
1120		if (create.flags)
1121			return -EINVAL;
1122
1123		mutex_lock(&container->lock);
1124
1125		ret = tce_iommu_create_default_window(container);
1126		if (!ret)
1127			ret = tce_iommu_create_window(container,
1128					create.page_shift,
1129					create.window_size, create.levels,
1130					&create.start_addr);
1131
1132		mutex_unlock(&container->lock);
1133
1134		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1135			ret = -EFAULT;
1136
1137		return ret;
1138	}
1139	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1140		struct vfio_iommu_spapr_tce_remove remove;
1141
1142		if (!container->v2)
1143			break;
1144
1145		ret = tce_iommu_mm_set(container);
1146		if (ret)
1147			return ret;
1148
1149		if (!tce_groups_attached(container))
1150			return -ENXIO;
1151
1152		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1153				start_addr);
1154
1155		if (copy_from_user(&remove, (void __user *)arg, minsz))
1156			return -EFAULT;
1157
1158		if (remove.argsz < minsz)
1159			return -EINVAL;
1160
1161		if (remove.flags)
1162			return -EINVAL;
1163
1164		if (container->def_window_pending && !remove.start_addr) {
1165			container->def_window_pending = false;
1166			return 0;
1167		}
1168
1169		mutex_lock(&container->lock);
1170
1171		ret = tce_iommu_remove_window(container, remove.start_addr);
1172
1173		mutex_unlock(&container->lock);
1174
1175		return ret;
1176	}
1177	}
1178
1179	return -ENOTTY;
1180}
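
/*
 * Illustrative end-to-end userspace sketch for the v2 interface (not
 * part of this driver); error handling is omitted and group_fd, buf
 * and buf_size are assumptions of the example:
 *
 *	int container_fd = open("/dev/vfio/vfio", O_RDWR);
 *
 *	ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container_fd);
 *	ioctl(container_fd, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *
 *	struct vfio_iommu_spapr_register_memory reg = {
 *		.argsz = sizeof(reg),
 *		.vaddr = (__u64)(unsigned long)buf,
 *		.size = buf_size,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = reg.vaddr,
 *		.iova = 0,
 *		.size = buf_size,
 *	};
 *	ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
 */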
1181
1182static void tce_iommu_release_ownership(struct tce_container *container,
1183		struct iommu_table_group *table_group)
1184{
1185	int i;
1186
1187	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1188		struct iommu_table *tbl = container->tables[i];
1189
1190		if (!tbl)
1191			continue;
1192
1193		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
1194		tce_iommu_userspace_view_free(tbl, container->mm);
1195		if (tbl->it_map)
1196			iommu_release_ownership(tbl);
1197
1198		container->tables[i] = NULL;
1199	}
1200}
1201
1202static int tce_iommu_take_ownership(struct tce_container *container,
1203		struct iommu_table_group *table_group)
1204{
1205	int i, j, rc = 0;
1206
1207	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1208		struct iommu_table *tbl = table_group->tables[i];
1209
1210		if (!tbl || !tbl->it_map)
1211			continue;
1212
1213		rc = iommu_take_ownership(tbl);
1214		if (rc) {
1215			for (j = 0; j < i; ++j)
1216				iommu_release_ownership(
1217						table_group->tables[j]);
1218
1219			return rc;
1220		}
1221	}
1222
1223	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1224		container->tables[i] = table_group->tables[i];
1225
1226	return 0;
1227}
1228
1229static void tce_iommu_release_ownership_ddw(struct tce_container *container,
1230		struct iommu_table_group *table_group)
1231{
1232	long i;
1233
1234	if (!table_group->ops->unset_window) {
1235		WARN_ON_ONCE(1);
1236		return;
1237	}
1238
1239	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1240		table_group->ops->unset_window(table_group, i);
1241
1242	table_group->ops->release_ownership(table_group);
1243}
1244
1245static long tce_iommu_take_ownership_ddw(struct tce_container *container,
1246		struct iommu_table_group *table_group)
1247{
1248	long i, ret = 0;
1249
1250	if (!table_group->ops->create_table || !table_group->ops->set_window ||
1251			!table_group->ops->release_ownership) {
1252		WARN_ON_ONCE(1);
1253		return -EFAULT;
1254	}
1255
1256	table_group->ops->take_ownership(table_group);
1257
 1258	/* Program all the container's windows into the new group */
1259	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1260		struct iommu_table *tbl = container->tables[i];
1261
1262		if (!tbl)
1263			continue;
1264
1265		ret = table_group->ops->set_window(table_group, i, tbl);
1266		if (ret)
1267			goto release_exit;
1268	}
1269
1270	return 0;
1271
1272release_exit:
1273	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1274		table_group->ops->unset_window(table_group, i);
1275
1276	table_group->ops->release_ownership(table_group);
1277
1278	return ret;
1279}
1280
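/*
 * A group is claimed through one of the two ownership models above:
 * if its table_group does not provide take/release_ownership callbacks,
 * the container takes ownership of the pre-existing kernel tables
 * directly (tce_iommu_take_ownership); otherwise table management is
 * handed over to the container, which then programs its own windows
 * into the group (tce_iommu_take_ownership_ddw).
 */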
1281static int tce_iommu_attach_group(void *iommu_data,
1282		struct iommu_group *iommu_group)
1283{
1284	int ret;
1285	struct tce_container *container = iommu_data;
1286	struct iommu_table_group *table_group;
1287	struct tce_iommu_group *tcegrp = NULL;
1288
1289	mutex_lock(&container->lock);
1290
1291	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1292			iommu_group_id(iommu_group), iommu_group); */
1293	table_group = iommu_group_get_iommudata(iommu_group);
1294	if (!table_group) {
1295		ret = -ENODEV;
1296		goto unlock_exit;
1297	}
1298
1299	if (tce_groups_attached(container) && (!table_group->ops ||
1300			!table_group->ops->take_ownership ||
1301			!table_group->ops->release_ownership)) {
1302		ret = -EBUSY;
1303		goto unlock_exit;
1304	}
1305
 1306	/* Check if the new group uses the same iommu_ops (i.e. is compatible) */
1307	list_for_each_entry(tcegrp, &container->group_list, next) {
1308		struct iommu_table_group *table_group_tmp;
1309
1310		if (tcegrp->grp == iommu_group) {
1311			pr_warn("tce_vfio: Group %d is already attached\n",
1312					iommu_group_id(iommu_group));
1313			ret = -EBUSY;
1314			goto unlock_exit;
1315		}
1316		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1317		if (table_group_tmp->ops->create_table !=
1318				table_group->ops->create_table) {
1319			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1320					iommu_group_id(iommu_group),
1321					iommu_group_id(tcegrp->grp));
1322			ret = -EPERM;
1323			goto unlock_exit;
1324		}
1325	}
1326
1327	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1328	if (!tcegrp) {
1329		ret = -ENOMEM;
1330		goto unlock_exit;
1331	}
1332
1333	if (!table_group->ops || !table_group->ops->take_ownership ||
1334			!table_group->ops->release_ownership) {
1335		ret = tce_iommu_take_ownership(container, table_group);
1336	} else {
1337		ret = tce_iommu_take_ownership_ddw(container, table_group);
1338		if (!tce_groups_attached(container) && !container->tables[0])
1339			container->def_window_pending = true;
1340	}
1341
1342	if (!ret) {
1343		tcegrp->grp = iommu_group;
1344		list_add(&tcegrp->next, &container->group_list);
1345	}
1346
1347unlock_exit:
1348	if (ret && tcegrp)
1349		kfree(tcegrp);
1350
1351	mutex_unlock(&container->lock);
1352
1353	return ret;
1354}
1355
1356static void tce_iommu_detach_group(void *iommu_data,
1357		struct iommu_group *iommu_group)
1358{
1359	struct tce_container *container = iommu_data;
1360	struct iommu_table_group *table_group;
1361	bool found = false;
1362	struct tce_iommu_group *tcegrp;
1363
1364	mutex_lock(&container->lock);
1365
1366	list_for_each_entry(tcegrp, &container->group_list, next) {
1367		if (tcegrp->grp == iommu_group) {
1368			found = true;
1369			break;
1370		}
1371	}
1372
1373	if (!found) {
1374		pr_warn("tce_vfio: detaching unattached group #%u\n",
1375				iommu_group_id(iommu_group));
1376		goto unlock_exit;
1377	}
1378
1379	list_del(&tcegrp->next);
1380	kfree(tcegrp);
1381
1382	table_group = iommu_group_get_iommudata(iommu_group);
1383	BUG_ON(!table_group);
1384
1385	if (!table_group->ops || !table_group->ops->release_ownership)
1386		tce_iommu_release_ownership(container, table_group);
1387	else
1388		tce_iommu_release_ownership_ddw(container, table_group);
1389
1390unlock_exit:
1391	mutex_unlock(&container->lock);
1392}
1393
1394const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1395	.name		= "iommu-vfio-powerpc",
1396	.owner		= THIS_MODULE,
1397	.open		= tce_iommu_open,
1398	.release	= tce_iommu_release,
1399	.ioctl		= tce_iommu_ioctl,
1400	.attach_group	= tce_iommu_attach_group,
1401	.detach_group	= tce_iommu_detach_group,
1402};
1403
1404static int __init tce_iommu_init(void)
1405{
1406	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1407}
1408
1409static void __exit tce_iommu_cleanup(void)
1410{
1411	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1412}
1413
1414module_init(tce_iommu_init);
1415module_exit(tce_iommu_cleanup);
1416
1417MODULE_VERSION(DRIVER_VERSION);
1418MODULE_LICENSE("GPL v2");
1419MODULE_AUTHOR(DRIVER_AUTHOR);
1420MODULE_DESCRIPTION(DRIVER_DESC);
1421