v6.8
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO: IOMMU DMA mapping support for TCE on POWER
   4 *
   5 * Copyright (C) 2013 IBM Corp.  All rights reserved.
   6 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
   7 * Copyright Gavin Shan, IBM Corporation 2014.
   8 *
   9 * Derived from original vfio_iommu_type1.c:
  10 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  11 *     Author: Alex Williamson <alex.williamson@redhat.com>
  12 */
  13
  14#include <linux/module.h>
  15#include <linux/pci.h>
  16#include <linux/slab.h>
  17#include <linux/uaccess.h>
  18#include <linux/err.h>
  19#include <linux/vfio.h>
  20#include <linux/vmalloc.h>
  21#include <linux/sched/mm.h>
  22#include <linux/sched/signal.h>
  23#include <linux/mm.h>
  24#include "vfio.h"
  25
  26#include <asm/iommu.h>
  27#include <asm/tce.h>
  28#include <asm/mmu_context.h>
  29
  30#define DRIVER_VERSION  "0.1"
  31#define DRIVER_AUTHOR   "aik@ozlabs.ru"
  32#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
  33
  34static void tce_iommu_detach_group(void *iommu_data,
  35		struct iommu_group *iommu_group);
  36
  37/*
  38 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  39 *
  40 * This code handles mapping and unmapping of user data buffers
  41 * into DMA'ble space using the IOMMU
  42 */
  43
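/*
 * A minimal illustrative sketch of the v1 (VFIO_SPAPR_TCE_IOMMU) userspace
 * flow driven through this driver, assuming "container" is a /dev/vfio/vfio
 * fd with a group already attached and "buf"/"bufsize" name a page-aligned
 * user buffer whose size is a multiple of the IOMMU page size (hypothetical
 * names, error handling omitted):
 *
 *	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
 *	struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map) };
 *
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_ENABLE);
 *
 *	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 *	map.vaddr = (__u64)(unsigned long)buf;
 *	map.iova = info.dma32_window_start;
 *	map.size = bufsize;
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */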
  44struct tce_iommu_group {
  45	struct list_head next;
  46	struct iommu_group *grp;
  47};
  48
  49/*
   50 * A container needs to remember which preregistered region it has
  51 * referenced to do proper cleanup at the userspace process exit.
  52 */
  53struct tce_iommu_prereg {
  54	struct list_head next;
  55	struct mm_iommu_table_group_mem_t *mem;
  56};
  57
  58/*
  59 * The container descriptor supports only a single group per container.
  60 * Required by the API as the container is not supplied with the IOMMU group
  61 * at the moment of initialization.
  62 */
  63struct tce_container {
  64	struct mutex lock;
  65	bool enabled;
  66	bool v2;
  67	bool def_window_pending;
  68	unsigned long locked_pages;
  69	struct mm_struct *mm;
  70	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
  71	struct list_head group_list;
  72	struct list_head prereg_list;
  73};
  74
  75static long tce_iommu_mm_set(struct tce_container *container)
  76{
  77	if (container->mm) {
  78		if (container->mm == current->mm)
  79			return 0;
  80		return -EPERM;
  81	}
  82	BUG_ON(!current->mm);
  83	container->mm = current->mm;
  84	mmgrab(container->mm);
  85
  86	return 0;
  87}
  88
  89static long tce_iommu_prereg_free(struct tce_container *container,
  90		struct tce_iommu_prereg *tcemem)
  91{
  92	long ret;
  93
  94	ret = mm_iommu_put(container->mm, tcemem->mem);
  95	if (ret)
  96		return ret;
  97
  98	list_del(&tcemem->next);
  99	kfree(tcemem);
 100
 101	return 0;
 102}
 103
 104static long tce_iommu_unregister_pages(struct tce_container *container,
 105		__u64 vaddr, __u64 size)
 106{
 107	struct mm_iommu_table_group_mem_t *mem;
 108	struct tce_iommu_prereg *tcemem;
 109	bool found = false;
 110	long ret;
 111
 112	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 113		return -EINVAL;
 114
 115	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 116	if (!mem)
 117		return -ENOENT;
 118
 119	list_for_each_entry(tcemem, &container->prereg_list, next) {
 120		if (tcemem->mem == mem) {
 121			found = true;
 122			break;
 123		}
 124	}
 125
 126	if (!found)
 127		ret = -ENOENT;
 128	else
 129		ret = tce_iommu_prereg_free(container, tcemem);
 130
 131	mm_iommu_put(container->mm, mem);
 132
 133	return ret;
 134}
 135
 136static long tce_iommu_register_pages(struct tce_container *container,
 137		__u64 vaddr, __u64 size)
 138{
 139	long ret = 0;
 140	struct mm_iommu_table_group_mem_t *mem = NULL;
 141	struct tce_iommu_prereg *tcemem;
 142	unsigned long entries = size >> PAGE_SHIFT;
 143
 144	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 145			((vaddr + size) < vaddr))
 146		return -EINVAL;
 147
 148	mem = mm_iommu_get(container->mm, vaddr, entries);
 149	if (mem) {
 150		list_for_each_entry(tcemem, &container->prereg_list, next) {
 151			if (tcemem->mem == mem) {
 152				ret = -EBUSY;
 153				goto put_exit;
 154			}
 155		}
 156	} else {
 157		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
 158		if (ret)
 159			return ret;
 160	}
 161
 162	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 163	if (!tcemem) {
 164		ret = -ENOMEM;
 165		goto put_exit;
 166	}
 167
 168	tcemem->mem = mem;
 169	list_add(&tcemem->next, &container->prereg_list);
 170
 171	container->enabled = true;
 172
 173	return 0;
 174
 175put_exit:
 176	mm_iommu_put(container->mm, mem);
 177	return ret;
 178}
 179
 180static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
 181		unsigned int it_page_shift)
 182{
 183	struct page *page;
 184	unsigned long size = 0;
 185
 186	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
 187		return size == (1UL << it_page_shift);
 188
 189	page = pfn_to_page(hpa >> PAGE_SHIFT);
 190	/*
 191	 * Check that the TCE table granularity is not bigger than the size of
 192	 * a page we just found. Otherwise the hardware can get access to
  193	 * a bigger memory chunk than it should.
 194	 */
 195	return page_shift(compound_head(page)) >= it_page_shift;
 196}
 197
 198static inline bool tce_groups_attached(struct tce_container *container)
 199{
 200	return !list_empty(&container->group_list);
 201}
 202
 203static long tce_iommu_find_table(struct tce_container *container,
 204		phys_addr_t ioba, struct iommu_table **ptbl)
 205{
 206	long i;
 207
 208	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 209		struct iommu_table *tbl = container->tables[i];
 210
 211		if (tbl) {
 212			unsigned long entry = ioba >> tbl->it_page_shift;
 213			unsigned long start = tbl->it_offset;
 214			unsigned long end = start + tbl->it_size;
 215
 216			if ((start <= entry) && (entry < end)) {
 217				*ptbl = tbl;
 218				return i;
 219			}
 220		}
 221	}
 222
 223	return -1;
 224}
 225
 226static int tce_iommu_find_free_table(struct tce_container *container)
 227{
 228	int i;
 229
 230	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 231		if (!container->tables[i])
 232			return i;
 233	}
 234
 235	return -ENOSPC;
 236}
 237
 238static int tce_iommu_enable(struct tce_container *container)
 239{
 240	int ret = 0;
 241	unsigned long locked;
 242	struct iommu_table_group *table_group;
 243	struct tce_iommu_group *tcegrp;
 244
 245	if (container->enabled)
 246		return -EBUSY;
 247
 248	/*
 249	 * When userspace pages are mapped into the IOMMU, they are effectively
 250	 * locked memory, so, theoretically, we need to update the accounting
 251	 * of locked pages on each map and unmap.  For powerpc, the map unmap
 252	 * paths can be very hot, though, and the accounting would kill
 253	 * performance, especially since it would be difficult to impossible
 254	 * to handle the accounting in real mode only.
 255	 *
 256	 * To address that, rather than precisely accounting every page, we
 257	 * instead account for a worst case on locked memory when the iommu is
 258	 * enabled and disabled.  The worst case upper bound on locked memory
 259	 * is the size of the whole iommu window, which is usually relatively
 260	 * small (compared to total memory sizes) on POWER hardware.
 261	 *
 262	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 263	 * that would effectively kill the guest at random points, much better
 264	 * enforcing the limit based on the max that the guest can map.
 265	 *
 266	 * Unfortunately at the moment it counts whole tables, no matter how
 267	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
 268	 * each with 2GB DMA window, 8GB will be counted here. The reason for
 269	 * this is that we cannot tell here the amount of RAM used by the guest
 270	 * as this information is only available from KVM and VFIO is
 271	 * KVM agnostic.
 272	 *
 273	 * So we do not allow enabling a container without a group attached
 274	 * as there is no way to know how much we should increment
 275	 * the locked_vm counter.
 276	 */
 277	if (!tce_groups_attached(container))
 278		return -ENODEV;
 279
 280	tcegrp = list_first_entry(&container->group_list,
 281			struct tce_iommu_group, next);
 282	table_group = iommu_group_get_iommudata(tcegrp->grp);
 283	if (!table_group)
 284		return -ENODEV;
 285
 286	if (!table_group->tce32_size)
 287		return -EPERM;
 288
 289	ret = tce_iommu_mm_set(container);
 290	if (ret)
 291		return ret;
 292
 293	locked = table_group->tce32_size >> PAGE_SHIFT;
 294	ret = account_locked_vm(container->mm, locked, true);
 295	if (ret)
 296		return ret;
 297
 298	container->locked_pages = locked;
 299
 300	container->enabled = true;
 301
 302	return ret;
 303}
 304
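/*
 * Worked example of the accounting above (illustrative): for a 2GB 32-bit
 * DMA window on a 64K-page kernel, tce32_size >> PAGE_SHIFT is
 * 0x80000000 >> 16 = 32768 pages, charged against locked_vm up front no
 * matter how much of the window the guest ends up mapping.
 */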
 305static void tce_iommu_disable(struct tce_container *container)
 306{
 307	if (!container->enabled)
 308		return;
 309
 310	container->enabled = false;
 311
 312	BUG_ON(!container->mm);
 313	account_locked_vm(container->mm, container->locked_pages, false);
 314}
 315
 316static void *tce_iommu_open(unsigned long arg)
 317{
 318	struct tce_container *container;
 319
 320	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 321		pr_err("tce_vfio: Wrong IOMMU type\n");
 322		return ERR_PTR(-EINVAL);
 323	}
 324
 325	container = kzalloc(sizeof(*container), GFP_KERNEL);
 326	if (!container)
 327		return ERR_PTR(-ENOMEM);
 328
 329	mutex_init(&container->lock);
 330	INIT_LIST_HEAD_RCU(&container->group_list);
 331	INIT_LIST_HEAD_RCU(&container->prereg_list);
 332
 333	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 334
 335	return container;
 336}
 337
 338static int tce_iommu_clear(struct tce_container *container,
 339		struct iommu_table *tbl,
 340		unsigned long entry, unsigned long pages);
 341static void tce_iommu_free_table(struct tce_container *container,
 342		struct iommu_table *tbl);
 343
 344static void tce_iommu_release(void *iommu_data)
 345{
 346	struct tce_container *container = iommu_data;
 347	struct tce_iommu_group *tcegrp;
 348	struct tce_iommu_prereg *tcemem, *tmtmp;
 349	long i;
 350
 351	while (tce_groups_attached(container)) {
 352		tcegrp = list_first_entry(&container->group_list,
 353				struct tce_iommu_group, next);
 354		tce_iommu_detach_group(iommu_data, tcegrp->grp);
 355	}
 356
 357	/*
 358	 * If VFIO created a table, it was not disposed
 359	 * by tce_iommu_detach_group() so do it now.
 360	 */
 361	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 362		struct iommu_table *tbl = container->tables[i];
 363
 364		if (!tbl)
 365			continue;
 366
 367		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 368		tce_iommu_free_table(container, tbl);
 369	}
 370
 371	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
 372		WARN_ON(tce_iommu_prereg_free(container, tcemem));
 373
 374	tce_iommu_disable(container);
 375	if (container->mm)
 376		mmdrop(container->mm);
 377	mutex_destroy(&container->lock);
 378
 379	kfree(container);
 380}
 381
  382static void tce_iommu_unuse_page(unsigned long hpa)
  383{
 384	struct page *page;
 385
 386	page = pfn_to_page(hpa >> PAGE_SHIFT);
 387	unpin_user_page(page);
 388}
 389
 390static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
 391		unsigned long tce, unsigned long shift,
 392		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 393{
 394	long ret = 0;
 395	struct mm_iommu_table_group_mem_t *mem;
 396
 397	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
 398	if (!mem)
 399		return -EINVAL;
 400
 401	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
 402	if (ret)
 403		return -EINVAL;
 404
 405	*pmem = mem;
 406
 407	return 0;
 408}
 409
 410static void tce_iommu_unuse_page_v2(struct tce_container *container,
 411		struct iommu_table *tbl, unsigned long entry)
 412{
 413	struct mm_iommu_table_group_mem_t *mem = NULL;
 414	int ret;
 415	unsigned long hpa = 0;
 416	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 417
 418	if (!pua)
 419		return;
 420
 421	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
 422			tbl->it_page_shift, &hpa, &mem);
 423	if (ret)
 424		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
 425				__func__, be64_to_cpu(*pua), entry, ret);
 426	if (mem)
 427		mm_iommu_mapped_dec(mem);
 428
 429	*pua = cpu_to_be64(0);
 430}
 431
 432static int tce_iommu_clear(struct tce_container *container,
 433		struct iommu_table *tbl,
 434		unsigned long entry, unsigned long pages)
 435{
 436	unsigned long oldhpa;
 437	long ret;
 438	enum dma_data_direction direction;
 439	unsigned long lastentry = entry + pages, firstentry = entry;
 440
 441	for ( ; entry < lastentry; ++entry) {
 442		if (tbl->it_indirect_levels && tbl->it_userspace) {
 443			/*
 444			 * For multilevel tables, we can take a shortcut here
 445			 * and skip some TCEs as we know that the userspace
 446			 * addresses cache is a mirror of the real TCE table
 447			 * and if it is missing some indirect levels, then
 448			 * the hardware table does not have them allocated
 449			 * either and therefore does not require updating.
 450			 */
 451			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
 452					entry);
 453			if (!pua) {
 454				/* align to level_size which is power of two */
 455				entry |= tbl->it_level_size - 1;
 456				continue;
 457			}
 458		}
 459
 460		cond_resched();
 461
 462		direction = DMA_NONE;
 463		oldhpa = 0;
 464		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
 465				&direction);
 466		if (ret)
 467			continue;
 468
 469		if (direction == DMA_NONE)
 470			continue;
 471
 472		if (container->v2) {
 473			tce_iommu_unuse_page_v2(container, tbl, entry);
 474			continue;
 475		}
 476
 477		tce_iommu_unuse_page(oldhpa);
 478	}
 479
 480	iommu_tce_kill(tbl, firstentry, pages);
 481
 482	return 0;
 483}
 484
 485static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 486{
 487	struct page *page = NULL;
 488	enum dma_data_direction direction = iommu_tce_direction(tce);
 489
 490	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
 491			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
 492			&page) != 1)
 493		return -EFAULT;
 494
 495	*hpa = __pa((unsigned long) page_address(page));
 496
 497	return 0;
 498}
 499
 500static long tce_iommu_build(struct tce_container *container,
 501		struct iommu_table *tbl,
 502		unsigned long entry, unsigned long tce, unsigned long pages,
 503		enum dma_data_direction direction)
 504{
 505	long i, ret = 0;
 506	unsigned long hpa;
 507	enum dma_data_direction dirtmp;
 508
 509	for (i = 0; i < pages; ++i) {
 510		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 511
 512		ret = tce_iommu_use_page(tce, &hpa);
 513		if (ret)
 514			break;
 515
 516		if (!tce_page_is_contained(container->mm, hpa,
 517				tbl->it_page_shift)) {
 518			ret = -EPERM;
 519			break;
 520		}
 521
 522		hpa |= offset;
 523		dirtmp = direction;
 524		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 525				&hpa, &dirtmp);
 526		if (ret) {
 527			tce_iommu_unuse_page(hpa);
 528			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 529					__func__, entry << tbl->it_page_shift,
 530					tce, ret);
 531			break;
 532		}
 533
 534		if (dirtmp != DMA_NONE)
 535			tce_iommu_unuse_page(hpa);
 536
 537		tce += IOMMU_PAGE_SIZE(tbl);
 538	}
 539
 540	if (ret)
 541		tce_iommu_clear(container, tbl, entry, i);
 542	else
 543		iommu_tce_kill(tbl, entry, pages);
 544
 545	return ret;
 546}
 547
 548static long tce_iommu_build_v2(struct tce_container *container,
 549		struct iommu_table *tbl,
 550		unsigned long entry, unsigned long tce, unsigned long pages,
 551		enum dma_data_direction direction)
 552{
 553	long i, ret = 0;
 554	unsigned long hpa;
 555	enum dma_data_direction dirtmp;
 556
 557	for (i = 0; i < pages; ++i) {
 558		struct mm_iommu_table_group_mem_t *mem = NULL;
 559		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
 560
 561		ret = tce_iommu_prereg_ua_to_hpa(container,
 562				tce, tbl->it_page_shift, &hpa, &mem);
 563		if (ret)
 564			break;
 565
 566		if (!tce_page_is_contained(container->mm, hpa,
 567				tbl->it_page_shift)) {
 568			ret = -EPERM;
 569			break;
 570		}
 571
 572		/* Preserve offset within IOMMU page */
 573		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 574		dirtmp = direction;
 575
 576		/* The registered region is being unregistered */
 577		if (mm_iommu_mapped_inc(mem))
 578			break;
 579
 580		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 581				&hpa, &dirtmp);
 582		if (ret) {
 583			/* dirtmp cannot be DMA_NONE here */
 584			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 585			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 586					__func__, entry << tbl->it_page_shift,
 587					tce, ret);
 588			break;
 589		}
 590
 591		if (dirtmp != DMA_NONE)
 592			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 593
 594		*pua = cpu_to_be64(tce);
 595
 596		tce += IOMMU_PAGE_SIZE(tbl);
 597	}
 598
 599	if (ret)
 600		tce_iommu_clear(container, tbl, entry, i);
 601	else
 602		iommu_tce_kill(tbl, entry, pages);
 603
 604	return ret;
 605}
 606
 607static long tce_iommu_create_table(struct tce_container *container,
 608			struct iommu_table_group *table_group,
 609			int num,
 610			__u32 page_shift,
 611			__u64 window_size,
 612			__u32 levels,
 613			struct iommu_table **ptbl)
 614{
 615	long ret, table_size;
 616
 617	table_size = table_group->ops->get_table_size(page_shift, window_size,
 618			levels);
 619	if (!table_size)
 620		return -EINVAL;
 621
 622	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 623	if (ret)
 624		return ret;
 625
 626	ret = table_group->ops->create_table(table_group, num,
 627			page_shift, window_size, levels, ptbl);
 628
 629	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 630	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
 631
 632	return ret;
 633}
 634
 635static void tce_iommu_free_table(struct tce_container *container,
 636		struct iommu_table *tbl)
 637{
 638	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 639
 640	iommu_tce_table_put(tbl);
 641	account_locked_vm(container->mm, pages, false);
 642}
 643
 644static long tce_iommu_create_window(struct tce_container *container,
 645		__u32 page_shift, __u64 window_size, __u32 levels,
 646		__u64 *start_addr)
 647{
 648	struct tce_iommu_group *tcegrp;
 649	struct iommu_table_group *table_group;
 650	struct iommu_table *tbl = NULL;
 651	long ret, num;
 652
 653	num = tce_iommu_find_free_table(container);
 654	if (num < 0)
 655		return num;
 656
 657	/* Get the first group for ops::create_table */
 658	tcegrp = list_first_entry(&container->group_list,
 659			struct tce_iommu_group, next);
 660	table_group = iommu_group_get_iommudata(tcegrp->grp);
 661	if (!table_group)
 662		return -EFAULT;
 663
 664	if (!(table_group->pgsizes & (1ULL << page_shift)))
 665		return -EINVAL;
 666
 667	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
 668			!table_group->ops->get_table_size ||
 669			!table_group->ops->create_table)
 670		return -EPERM;
 671
 672	/* Create TCE table */
 673	ret = tce_iommu_create_table(container, table_group, num,
 674			page_shift, window_size, levels, &tbl);
 675	if (ret)
 676		return ret;
 677
 678	BUG_ON(!tbl->it_ops->free);
 679
 680	/*
 681	 * Program the table to every group.
 682	 * Groups have been tested for compatibility at the attach time.
 683	 */
 684	list_for_each_entry(tcegrp, &container->group_list, next) {
 685		table_group = iommu_group_get_iommudata(tcegrp->grp);
 686
 687		ret = table_group->ops->set_window(table_group, num, tbl);
 688		if (ret)
 689			goto unset_exit;
 690	}
 691
 692	container->tables[num] = tbl;
 693
 694	/* Return start address assigned by platform in create_table() */
 695	*start_addr = tbl->it_offset << tbl->it_page_shift;
 696
 697	return 0;
 698
 699unset_exit:
 700	list_for_each_entry(tcegrp, &container->group_list, next) {
 701		table_group = iommu_group_get_iommudata(tcegrp->grp);
 702		table_group->ops->unset_window(table_group, num);
 703	}
 704	tce_iommu_free_table(container, tbl);
 705
 706	return ret;
 707}
 708
 709static long tce_iommu_remove_window(struct tce_container *container,
 710		__u64 start_addr)
 711{
 712	struct iommu_table_group *table_group = NULL;
 713	struct iommu_table *tbl;
 714	struct tce_iommu_group *tcegrp;
 715	int num;
 716
 717	num = tce_iommu_find_table(container, start_addr, &tbl);
 718	if (num < 0)
 719		return -EINVAL;
 720
 721	BUG_ON(!tbl->it_size);
 722
 723	/* Detach groups from IOMMUs */
 724	list_for_each_entry(tcegrp, &container->group_list, next) {
 725		table_group = iommu_group_get_iommudata(tcegrp->grp);
 726
 727		/*
 728		 * SPAPR TCE IOMMU exposes the default DMA window to
 729		 * the guest via dma32_window_start/size of
 730		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
 731		 * the userspace to remove this window, some do not so
 732		 * here we check for the platform capability.
 733		 */
 734		if (!table_group->ops || !table_group->ops->unset_window)
 735			return -EPERM;
 736
 737		table_group->ops->unset_window(table_group, num);
 738	}
 739
 740	/* Free table */
 741	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 742	tce_iommu_free_table(container, tbl);
 743	container->tables[num] = NULL;
 744
 745	return 0;
 746}
 747
 748static long tce_iommu_create_default_window(struct tce_container *container)
 749{
 750	long ret;
 751	__u64 start_addr = 0;
 752	struct tce_iommu_group *tcegrp;
 753	struct iommu_table_group *table_group;
 754
 755	if (!container->def_window_pending)
 756		return 0;
 757
 758	if (!tce_groups_attached(container))
 759		return -ENODEV;
 760
 761	tcegrp = list_first_entry(&container->group_list,
 762			struct tce_iommu_group, next);
 763	table_group = iommu_group_get_iommudata(tcegrp->grp);
 764	if (!table_group)
 765		return -ENODEV;
 766
 767	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
 768			table_group->tce32_size, 1, &start_addr);
 769	WARN_ON_ONCE(!ret && start_addr);
 770
 771	if (!ret)
 772		container->def_window_pending = false;
 773
 774	return ret;
 775}
 776
 777static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group,
 778				       unsigned long arg)
 779{
 780	struct eeh_pe *pe;
 781	struct vfio_eeh_pe_op op;
 782	unsigned long minsz;
 783
 784	pe = eeh_iommu_group_to_pe(group);
 785	if (!pe)
 786		return -ENODEV;
 787
 788	minsz = offsetofend(struct vfio_eeh_pe_op, op);
 789	if (copy_from_user(&op, (void __user *)arg, minsz))
 790		return -EFAULT;
 791	if (op.argsz < minsz || op.flags)
 792		return -EINVAL;
 793
 794	switch (op.op) {
 795	case VFIO_EEH_PE_DISABLE:
 796		return eeh_pe_set_option(pe, EEH_OPT_DISABLE);
 797	case VFIO_EEH_PE_ENABLE:
 798		return eeh_pe_set_option(pe, EEH_OPT_ENABLE);
 799	case VFIO_EEH_PE_UNFREEZE_IO:
 800		return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
 801	case VFIO_EEH_PE_UNFREEZE_DMA:
 802		return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
 803	case VFIO_EEH_PE_GET_STATE:
 804		return eeh_pe_get_state(pe);
 805		break;
 806	case VFIO_EEH_PE_RESET_DEACTIVATE:
 807		return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
 808	case VFIO_EEH_PE_RESET_HOT:
 809		return eeh_pe_reset(pe, EEH_RESET_HOT, true);
 810	case VFIO_EEH_PE_RESET_FUNDAMENTAL:
 811		return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
 812	case VFIO_EEH_PE_CONFIGURE:
 813		return eeh_pe_configure(pe);
 814	case VFIO_EEH_PE_INJECT_ERR:
 815		minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
 816		if (op.argsz < minsz)
 817			return -EINVAL;
 818		if (copy_from_user(&op, (void __user *)arg, minsz))
 819			return -EFAULT;
 820
 821		return eeh_pe_inject_err(pe, op.err.type, op.err.func,
 822					 op.err.addr, op.err.mask);
 823	default:
 824		return -EINVAL;
 825	}
 826}
 827
 828static long tce_iommu_ioctl(void *iommu_data,
 829				 unsigned int cmd, unsigned long arg)
 830{
 831	struct tce_container *container = iommu_data;
 832	unsigned long minsz, ddwsz;
 833	long ret;
 834
 835	switch (cmd) {
 836	case VFIO_CHECK_EXTENSION:
 837		switch (arg) {
 838		case VFIO_SPAPR_TCE_IOMMU:
 839		case VFIO_SPAPR_TCE_v2_IOMMU:
 840			return 1;
 841		case VFIO_EEH:
 842			return eeh_enabled();
 843		default:
  844			return 0;
  845		}
  846	}
 847
 848	/*
 849	 * Sanity check to prevent one userspace from manipulating
 850	 * another userspace mm.
 851	 */
 852	BUG_ON(!container);
 853	if (container->mm && container->mm != current->mm)
 854		return -EPERM;
 855
 856	switch (cmd) {
 857	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 858		struct vfio_iommu_spapr_tce_info info;
 859		struct tce_iommu_group *tcegrp;
 860		struct iommu_table_group *table_group;
 861
 862		if (!tce_groups_attached(container))
 863			return -ENXIO;
 864
 865		tcegrp = list_first_entry(&container->group_list,
 866				struct tce_iommu_group, next);
 867		table_group = iommu_group_get_iommudata(tcegrp->grp);
 868
 869		if (!table_group)
 870			return -ENXIO;
 871
 872		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
 873				dma32_window_size);
 874
 875		if (copy_from_user(&info, (void __user *)arg, minsz))
 876			return -EFAULT;
 877
 878		if (info.argsz < minsz)
 879			return -EINVAL;
 880
 881		info.dma32_window_start = table_group->tce32_start;
 882		info.dma32_window_size = table_group->tce32_size;
 883		info.flags = 0;
 884		memset(&info.ddw, 0, sizeof(info.ddw));
 885
 886		if (table_group->max_dynamic_windows_supported &&
 887				container->v2) {
 888			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
 889			info.ddw.pgsizes = table_group->pgsizes;
 890			info.ddw.max_dynamic_windows_supported =
 891				table_group->max_dynamic_windows_supported;
 892			info.ddw.levels = table_group->max_levels;
 893		}
 894
 895		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
 896
 897		if (info.argsz >= ddwsz)
 898			minsz = ddwsz;
 899
 900		if (copy_to_user((void __user *)arg, &info, minsz))
 901			return -EFAULT;
 902
 903		return 0;
 904	}
 905	case VFIO_IOMMU_MAP_DMA: {
 906		struct vfio_iommu_type1_dma_map param;
 907		struct iommu_table *tbl = NULL;
 908		long num;
 909		enum dma_data_direction direction;
 910
 911		if (!container->enabled)
 912			return -EPERM;
 913
 914		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 915
 916		if (copy_from_user(&param, (void __user *)arg, minsz))
 917			return -EFAULT;
 918
 919		if (param.argsz < minsz)
 920			return -EINVAL;
 921
 922		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
 923				VFIO_DMA_MAP_FLAG_WRITE))
 924			return -EINVAL;
 925
 926		ret = tce_iommu_create_default_window(container);
 927		if (ret)
 928			return ret;
 929
 930		num = tce_iommu_find_table(container, param.iova, &tbl);
 931		if (num < 0)
 932			return -ENXIO;
 933
 934		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 935				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 936			return -EINVAL;
 937
 938		/* iova is checked by the IOMMU API */
 939		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
 940			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 941				direction = DMA_BIDIRECTIONAL;
 942			else
 943				direction = DMA_TO_DEVICE;
 944		} else {
 945			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 946				direction = DMA_FROM_DEVICE;
 947			else
 948				return -EINVAL;
 949		}
 950
 951		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 952		if (ret)
 953			return ret;
 954
 955		if (container->v2)
 956			ret = tce_iommu_build_v2(container, tbl,
 957					param.iova >> tbl->it_page_shift,
 958					param.vaddr,
 959					param.size >> tbl->it_page_shift,
 960					direction);
 961		else
 962			ret = tce_iommu_build(container, tbl,
 963					param.iova >> tbl->it_page_shift,
 964					param.vaddr,
 965					param.size >> tbl->it_page_shift,
 966					direction);
 967
 968		iommu_flush_tce(tbl);
 969
 970		return ret;
 971	}
 972	case VFIO_IOMMU_UNMAP_DMA: {
 973		struct vfio_iommu_type1_dma_unmap param;
 974		struct iommu_table *tbl = NULL;
 975		long num;
 976
 977		if (!container->enabled)
 978			return -EPERM;
 979
 980		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 981				size);
 982
 983		if (copy_from_user(&param, (void __user *)arg, minsz))
 984			return -EFAULT;
 985
 986		if (param.argsz < minsz)
 987			return -EINVAL;
 988
 989		/* No flag is supported now */
 990		if (param.flags)
 991			return -EINVAL;
 992
 993		ret = tce_iommu_create_default_window(container);
 994		if (ret)
 995			return ret;
 996
 997		num = tce_iommu_find_table(container, param.iova, &tbl);
 998		if (num < 0)
 999			return -ENXIO;
1000
1001		if (param.size & ~IOMMU_PAGE_MASK(tbl))
1002			return -EINVAL;
1003
1004		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
1005				param.size >> tbl->it_page_shift);
1006		if (ret)
1007			return ret;
1008
1009		ret = tce_iommu_clear(container, tbl,
1010				param.iova >> tbl->it_page_shift,
1011				param.size >> tbl->it_page_shift);
1012		iommu_flush_tce(tbl);
1013
1014		return ret;
1015	}
1016	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
1017		struct vfio_iommu_spapr_register_memory param;
1018
1019		if (!container->v2)
1020			break;
1021
1022		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1023				size);
1024
1025		ret = tce_iommu_mm_set(container);
1026		if (ret)
1027			return ret;
1028
1029		if (copy_from_user(&param, (void __user *)arg, minsz))
1030			return -EFAULT;
1031
1032		if (param.argsz < minsz)
1033			return -EINVAL;
1034
1035		/* No flag is supported now */
1036		if (param.flags)
1037			return -EINVAL;
1038
1039		mutex_lock(&container->lock);
1040		ret = tce_iommu_register_pages(container, param.vaddr,
1041				param.size);
1042		mutex_unlock(&container->lock);
1043
1044		return ret;
1045	}
1046	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
1047		struct vfio_iommu_spapr_register_memory param;
1048
1049		if (!container->v2)
1050			break;
1051
1052		if (!container->mm)
1053			return -EPERM;
1054
1055		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1056				size);
1057
1058		if (copy_from_user(&param, (void __user *)arg, minsz))
1059			return -EFAULT;
1060
1061		if (param.argsz < minsz)
1062			return -EINVAL;
1063
1064		/* No flag is supported now */
1065		if (param.flags)
1066			return -EINVAL;
1067
1068		mutex_lock(&container->lock);
1069		ret = tce_iommu_unregister_pages(container, param.vaddr,
1070				param.size);
1071		mutex_unlock(&container->lock);
1072
1073		return ret;
1074	}
1075	case VFIO_IOMMU_ENABLE:
1076		if (container->v2)
1077			break;
1078
1079		mutex_lock(&container->lock);
1080		ret = tce_iommu_enable(container);
1081		mutex_unlock(&container->lock);
1082		return ret;
1083
1084
1085	case VFIO_IOMMU_DISABLE:
1086		if (container->v2)
1087			break;
1088
1089		mutex_lock(&container->lock);
1090		tce_iommu_disable(container);
1091		mutex_unlock(&container->lock);
1092		return 0;
1093
1094	case VFIO_EEH_PE_OP: {
1095		struct tce_iommu_group *tcegrp;
1096
1097		ret = 0;
1098		list_for_each_entry(tcegrp, &container->group_list, next) {
 1099			ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg);
 1100			if (ret)
1101				return ret;
1102		}
1103		return ret;
1104	}
1105
1106	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1107		struct vfio_iommu_spapr_tce_create create;
1108
1109		if (!container->v2)
1110			break;
1111
1112		ret = tce_iommu_mm_set(container);
1113		if (ret)
1114			return ret;
1115
1116		if (!tce_groups_attached(container))
1117			return -ENXIO;
1118
1119		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1120				start_addr);
1121
1122		if (copy_from_user(&create, (void __user *)arg, minsz))
1123			return -EFAULT;
1124
1125		if (create.argsz < minsz)
1126			return -EINVAL;
1127
1128		if (create.flags)
1129			return -EINVAL;
1130
1131		mutex_lock(&container->lock);
1132
1133		ret = tce_iommu_create_default_window(container);
1134		if (!ret)
1135			ret = tce_iommu_create_window(container,
1136					create.page_shift,
1137					create.window_size, create.levels,
1138					&create.start_addr);
1139
1140		mutex_unlock(&container->lock);
1141
1142		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1143			ret = -EFAULT;
1144
1145		return ret;
1146	}
1147	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1148		struct vfio_iommu_spapr_tce_remove remove;
1149
1150		if (!container->v2)
1151			break;
1152
1153		ret = tce_iommu_mm_set(container);
1154		if (ret)
1155			return ret;
1156
1157		if (!tce_groups_attached(container))
1158			return -ENXIO;
1159
1160		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1161				start_addr);
1162
1163		if (copy_from_user(&remove, (void __user *)arg, minsz))
1164			return -EFAULT;
1165
1166		if (remove.argsz < minsz)
1167			return -EINVAL;
1168
1169		if (remove.flags)
1170			return -EINVAL;
1171
1172		if (container->def_window_pending && !remove.start_addr) {
1173			container->def_window_pending = false;
1174			return 0;
1175		}
1176
1177		mutex_lock(&container->lock);
1178
1179		ret = tce_iommu_remove_window(container, remove.start_addr);
1180
1181		mutex_unlock(&container->lock);
1182
1183		return ret;
1184	}
1185	}
1186
1187	return -ENOTTY;
1188}
1189
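/*
 * A minimal illustrative sketch of the v2 (VFIO_SPAPR_TCE_v2_IOMMU) flow
 * through the ioctls above, assuming "container" is the container fd and
 * "mem"/"memsz" describe the page-aligned guest memory to be preregistered
 * (hypothetical names, 64K IOMMU pages assumed to be in ddw.pgsizes, error
 * handling omitted):
 *
 *	struct vfio_iommu_spapr_register_memory reg = {
 *		.argsz = sizeof(reg),
 *		.vaddr = (__u64)(unsigned long)mem,
 *		.size = memsz,
 *	};
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,
 *		.window_size = memsz,
 *		.levels = 1,
 *	};
 *
 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *
 * The new window starts at create.start_addr; mappings are then done with
 * VFIO_IOMMU_MAP_DMA as in v1, and torn down with VFIO_IOMMU_SPAPR_TCE_REMOVE
 * and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
 */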
1190static void tce_iommu_release_ownership(struct tce_container *container,
1191		struct iommu_table_group *table_group)
 1192{
1193	long i;
1194
1195	if (!table_group->ops->unset_window) {
1196		WARN_ON_ONCE(1);
1197		return;
1198	}
1199
1200	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1201		if (container->tables[i])
 1202			table_group->ops->unset_window(table_group, i);
1203}
1204
1205static long tce_iommu_take_ownership(struct tce_container *container,
1206		struct iommu_table_group *table_group)
1207{
1208	long i, ret = 0;
 1209
1210	/* Set all windows to the new group */
1211	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1212		struct iommu_table *tbl = container->tables[i];
1213
1214		if (!tbl)
1215			continue;
1216
1217		ret = table_group->ops->set_window(table_group, i, tbl);
1218		if (ret)
1219			goto release_exit;
1220	}
1221
1222	return 0;
1223
1224release_exit:
1225	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1226		table_group->ops->unset_window(table_group, i);
 1227
1228	return ret;
1229}
1230
1231static int tce_iommu_attach_group(void *iommu_data,
1232		struct iommu_group *iommu_group, enum vfio_group_type type)
1233{
1234	int ret = 0;
1235	struct tce_container *container = iommu_data;
1236	struct iommu_table_group *table_group;
1237	struct tce_iommu_group *tcegrp = NULL;
1238
1239	if (type == VFIO_EMULATED_IOMMU)
1240		return -EINVAL;
1241
1242	mutex_lock(&container->lock);
1243
1244	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1245			iommu_group_id(iommu_group), iommu_group); */
1246	table_group = iommu_group_get_iommudata(iommu_group);
1247	if (!table_group) {
1248		ret = -ENODEV;
1249		goto unlock_exit;
1250	}
1251
1252	/* v2 requires full support of dynamic DMA windows */
1253	if (container->v2 && table_group->max_dynamic_windows_supported == 0) {
1254		ret = -EINVAL;
1255		goto unlock_exit;
1256	}
1257
1258	/* v1 reuses TCE tables and does not share them among PEs */
1259	if (!container->v2 && tce_groups_attached(container)) {
1260		ret = -EBUSY;
1261		goto unlock_exit;
1262	}
1263
1264	/*
1265	 * Check if new group has the same iommu_table_group_ops
1266	 * (i.e. compatible)
1267	 */
1268	list_for_each_entry(tcegrp, &container->group_list, next) {
1269		struct iommu_table_group *table_group_tmp;
1270
1271		if (tcegrp->grp == iommu_group) {
1272			pr_warn("tce_vfio: Group %d is already attached\n",
1273					iommu_group_id(iommu_group));
1274			ret = -EBUSY;
1275			goto unlock_exit;
1276		}
1277		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1278		if (table_group_tmp->ops->create_table !=
1279				table_group->ops->create_table) {
1280			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1281					iommu_group_id(iommu_group),
1282					iommu_group_id(tcegrp->grp));
1283			ret = -EPERM;
1284			goto unlock_exit;
1285		}
1286	}
1287
1288	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1289	if (!tcegrp) {
1290		ret = -ENOMEM;
1291		goto unlock_exit;
1292	}
1293
1294	ret = tce_iommu_take_ownership(container, table_group);
1295	if (!tce_groups_attached(container) && !container->tables[0])
 1296		container->def_window_pending = true;
1297
1298	if (!ret) {
1299		tcegrp->grp = iommu_group;
1300		list_add(&tcegrp->next, &container->group_list);
1301	}
 1302
1303	if (ret && tcegrp)
1304		kfree(tcegrp);
1305
1306unlock_exit:
1307	mutex_unlock(&container->lock);
1308
1309	return ret;
1310}
1311
1312static void tce_iommu_detach_group(void *iommu_data,
1313		struct iommu_group *iommu_group)
1314{
1315	struct tce_container *container = iommu_data;
1316	struct iommu_table_group *table_group;
1317	bool found = false;
1318	struct tce_iommu_group *tcegrp;
1319
1320	mutex_lock(&container->lock);
1321
1322	list_for_each_entry(tcegrp, &container->group_list, next) {
1323		if (tcegrp->grp == iommu_group) {
1324			found = true;
1325			break;
1326		}
1327	}
1328
1329	if (!found) {
1330		pr_warn("tce_vfio: detaching unattached group #%u\n",
1331				iommu_group_id(iommu_group));
1332		goto unlock_exit;
1333	}
1334
1335	list_del(&tcegrp->next);
1336	kfree(tcegrp);
1337
1338	table_group = iommu_group_get_iommudata(iommu_group);
1339	BUG_ON(!table_group);
1340
 1341	tce_iommu_release_ownership(container, table_group);
1342
1343unlock_exit:
1344	mutex_unlock(&container->lock);
1345}
1346
1347static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1348	.name		= "iommu-vfio-powerpc",
1349	.owner		= THIS_MODULE,
1350	.open		= tce_iommu_open,
1351	.release	= tce_iommu_release,
1352	.ioctl		= tce_iommu_ioctl,
1353	.attach_group	= tce_iommu_attach_group,
1354	.detach_group	= tce_iommu_detach_group,
1355};
1356
1357static int __init tce_iommu_init(void)
1358{
1359	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1360}
1361
1362static void __exit tce_iommu_cleanup(void)
1363{
1364	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1365}
1366
1367module_init(tce_iommu_init);
1368module_exit(tce_iommu_cleanup);
1369
1370MODULE_VERSION(DRIVER_VERSION);
1371MODULE_LICENSE("GPL v2");
1372MODULE_AUTHOR(DRIVER_AUTHOR);
1373MODULE_DESCRIPTION(DRIVER_DESC);
1374
v5.14.15
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO: IOMMU DMA mapping support for TCE on POWER
   4 *
   5 * Copyright (C) 2013 IBM Corp.  All rights reserved.
   6 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 
   7 *
   8 * Derived from original vfio_iommu_type1.c:
   9 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  10 *     Author: Alex Williamson <alex.williamson@redhat.com>
  11 */
  12
  13#include <linux/module.h>
  14#include <linux/pci.h>
  15#include <linux/slab.h>
  16#include <linux/uaccess.h>
  17#include <linux/err.h>
  18#include <linux/vfio.h>
  19#include <linux/vmalloc.h>
  20#include <linux/sched/mm.h>
  21#include <linux/sched/signal.h>
  22#include <linux/mm.h>
 
  23
  24#include <asm/iommu.h>
  25#include <asm/tce.h>
  26#include <asm/mmu_context.h>
  27
  28#define DRIVER_VERSION  "0.1"
  29#define DRIVER_AUTHOR   "aik@ozlabs.ru"
  30#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
  31
  32static void tce_iommu_detach_group(void *iommu_data,
  33		struct iommu_group *iommu_group);
  34
  35/*
  36 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  37 *
  38 * This code handles mapping and unmapping of user data buffers
  39 * into DMA'ble space using the IOMMU
  40 */
  41
  42struct tce_iommu_group {
  43	struct list_head next;
  44	struct iommu_group *grp;
  45};
  46
  47/*
  48 * A container needs to remember which preregistered region  it has
  49 * referenced to do proper cleanup at the userspace process exit.
  50 */
  51struct tce_iommu_prereg {
  52	struct list_head next;
  53	struct mm_iommu_table_group_mem_t *mem;
  54};
  55
  56/*
  57 * The container descriptor supports only a single group per container.
  58 * Required by the API as the container is not supplied with the IOMMU group
  59 * at the moment of initialization.
  60 */
  61struct tce_container {
  62	struct mutex lock;
  63	bool enabled;
  64	bool v2;
  65	bool def_window_pending;
  66	unsigned long locked_pages;
  67	struct mm_struct *mm;
  68	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
  69	struct list_head group_list;
  70	struct list_head prereg_list;
  71};
  72
  73static long tce_iommu_mm_set(struct tce_container *container)
  74{
  75	if (container->mm) {
  76		if (container->mm == current->mm)
  77			return 0;
  78		return -EPERM;
  79	}
  80	BUG_ON(!current->mm);
  81	container->mm = current->mm;
  82	mmgrab(container->mm);
  83
  84	return 0;
  85}
  86
  87static long tce_iommu_prereg_free(struct tce_container *container,
  88		struct tce_iommu_prereg *tcemem)
  89{
  90	long ret;
  91
  92	ret = mm_iommu_put(container->mm, tcemem->mem);
  93	if (ret)
  94		return ret;
  95
  96	list_del(&tcemem->next);
  97	kfree(tcemem);
  98
  99	return 0;
 100}
 101
 102static long tce_iommu_unregister_pages(struct tce_container *container,
 103		__u64 vaddr, __u64 size)
 104{
 105	struct mm_iommu_table_group_mem_t *mem;
 106	struct tce_iommu_prereg *tcemem;
 107	bool found = false;
 108	long ret;
 109
 110	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 111		return -EINVAL;
 112
 113	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 114	if (!mem)
 115		return -ENOENT;
 116
 117	list_for_each_entry(tcemem, &container->prereg_list, next) {
 118		if (tcemem->mem == mem) {
 119			found = true;
 120			break;
 121		}
 122	}
 123
 124	if (!found)
 125		ret = -ENOENT;
 126	else
 127		ret = tce_iommu_prereg_free(container, tcemem);
 128
 129	mm_iommu_put(container->mm, mem);
 130
 131	return ret;
 132}
 133
 134static long tce_iommu_register_pages(struct tce_container *container,
 135		__u64 vaddr, __u64 size)
 136{
 137	long ret = 0;
 138	struct mm_iommu_table_group_mem_t *mem = NULL;
 139	struct tce_iommu_prereg *tcemem;
 140	unsigned long entries = size >> PAGE_SHIFT;
 141
 142	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 143			((vaddr + size) < vaddr))
 144		return -EINVAL;
 145
 146	mem = mm_iommu_get(container->mm, vaddr, entries);
 147	if (mem) {
 148		list_for_each_entry(tcemem, &container->prereg_list, next) {
 149			if (tcemem->mem == mem) {
 150				ret = -EBUSY;
 151				goto put_exit;
 152			}
 153		}
 154	} else {
 155		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
 156		if (ret)
 157			return ret;
 158	}
 159
 160	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 161	if (!tcemem) {
 162		ret = -ENOMEM;
 163		goto put_exit;
 164	}
 165
 166	tcemem->mem = mem;
 167	list_add(&tcemem->next, &container->prereg_list);
 168
 169	container->enabled = true;
 170
 171	return 0;
 172
 173put_exit:
 174	mm_iommu_put(container->mm, mem);
 175	return ret;
 176}
 177
 178static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
 179		unsigned int it_page_shift)
 180{
 181	struct page *page;
 182	unsigned long size = 0;
 183
 184	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
 185		return size == (1UL << it_page_shift);
 186
 187	page = pfn_to_page(hpa >> PAGE_SHIFT);
 188	/*
 189	 * Check that the TCE table granularity is not bigger than the size of
 190	 * a page we just found. Otherwise the hardware can get access to
 191	 * a bigger memory chunk that it should.
 192	 */
 193	return page_shift(compound_head(page)) >= it_page_shift;
 194}
 195
 196static inline bool tce_groups_attached(struct tce_container *container)
 197{
 198	return !list_empty(&container->group_list);
 199}
 200
 201static long tce_iommu_find_table(struct tce_container *container,
 202		phys_addr_t ioba, struct iommu_table **ptbl)
 203{
 204	long i;
 205
 206	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 207		struct iommu_table *tbl = container->tables[i];
 208
 209		if (tbl) {
 210			unsigned long entry = ioba >> tbl->it_page_shift;
 211			unsigned long start = tbl->it_offset;
 212			unsigned long end = start + tbl->it_size;
 213
 214			if ((start <= entry) && (entry < end)) {
 215				*ptbl = tbl;
 216				return i;
 217			}
 218		}
 219	}
 220
 221	return -1;
 222}
 223
 224static int tce_iommu_find_free_table(struct tce_container *container)
 225{
 226	int i;
 227
 228	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 229		if (!container->tables[i])
 230			return i;
 231	}
 232
 233	return -ENOSPC;
 234}
 235
 236static int tce_iommu_enable(struct tce_container *container)
 237{
 238	int ret = 0;
 239	unsigned long locked;
 240	struct iommu_table_group *table_group;
 241	struct tce_iommu_group *tcegrp;
 242
 243	if (container->enabled)
 244		return -EBUSY;
 245
 246	/*
 247	 * When userspace pages are mapped into the IOMMU, they are effectively
 248	 * locked memory, so, theoretically, we need to update the accounting
 249	 * of locked pages on each map and unmap.  For powerpc, the map unmap
 250	 * paths can be very hot, though, and the accounting would kill
 251	 * performance, especially since it would be difficult to impossible
 252	 * to handle the accounting in real mode only.
 253	 *
 254	 * To address that, rather than precisely accounting every page, we
 255	 * instead account for a worst case on locked memory when the iommu is
 256	 * enabled and disabled.  The worst case upper bound on locked memory
 257	 * is the size of the whole iommu window, which is usually relatively
 258	 * small (compared to total memory sizes) on POWER hardware.
 259	 *
 260	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
 261	 * that would effectively kill the guest at random points, much better
 262	 * enforcing the limit based on the max that the guest can map.
 263	 *
 264	 * Unfortunately at the moment it counts whole tables, no matter how
 265	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
 266	 * each with 2GB DMA window, 8GB will be counted here. The reason for
 267	 * this is that we cannot tell here the amount of RAM used by the guest
 268	 * as this information is only available from KVM and VFIO is
 269	 * KVM agnostic.
 270	 *
 271	 * So we do not allow enabling a container without a group attached
 272	 * as there is no way to know how much we should increment
 273	 * the locked_vm counter.
 274	 */
 275	if (!tce_groups_attached(container))
 276		return -ENODEV;
 277
 278	tcegrp = list_first_entry(&container->group_list,
 279			struct tce_iommu_group, next);
 280	table_group = iommu_group_get_iommudata(tcegrp->grp);
 281	if (!table_group)
 282		return -ENODEV;
 283
 284	if (!table_group->tce32_size)
 285		return -EPERM;
 286
 287	ret = tce_iommu_mm_set(container);
 288	if (ret)
 289		return ret;
 290
 291	locked = table_group->tce32_size >> PAGE_SHIFT;
 292	ret = account_locked_vm(container->mm, locked, true);
 293	if (ret)
 294		return ret;
 295
 296	container->locked_pages = locked;
 297
 298	container->enabled = true;
 299
 300	return ret;
 301}
 302
 303static void tce_iommu_disable(struct tce_container *container)
 304{
 305	if (!container->enabled)
 306		return;
 307
 308	container->enabled = false;
 309
 310	BUG_ON(!container->mm);
 311	account_locked_vm(container->mm, container->locked_pages, false);
 312}
 313
 314static void *tce_iommu_open(unsigned long arg)
 315{
 316	struct tce_container *container;
 317
 318	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 319		pr_err("tce_vfio: Wrong IOMMU type\n");
 320		return ERR_PTR(-EINVAL);
 321	}
 322
 323	container = kzalloc(sizeof(*container), GFP_KERNEL);
 324	if (!container)
 325		return ERR_PTR(-ENOMEM);
 326
 327	mutex_init(&container->lock);
 328	INIT_LIST_HEAD_RCU(&container->group_list);
 329	INIT_LIST_HEAD_RCU(&container->prereg_list);
 330
 331	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 332
 333	return container;
 334}
 335
 336static int tce_iommu_clear(struct tce_container *container,
 337		struct iommu_table *tbl,
 338		unsigned long entry, unsigned long pages);
 339static void tce_iommu_free_table(struct tce_container *container,
 340		struct iommu_table *tbl);
 341
 342static void tce_iommu_release(void *iommu_data)
 343{
 344	struct tce_container *container = iommu_data;
 345	struct tce_iommu_group *tcegrp;
 346	struct tce_iommu_prereg *tcemem, *tmtmp;
 347	long i;
 348
 349	while (tce_groups_attached(container)) {
 350		tcegrp = list_first_entry(&container->group_list,
 351				struct tce_iommu_group, next);
 352		tce_iommu_detach_group(iommu_data, tcegrp->grp);
 353	}
 354
 355	/*
 356	 * If VFIO created a table, it was not disposed
 357	 * by tce_iommu_detach_group() so do it now.
 358	 */
 359	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 360		struct iommu_table *tbl = container->tables[i];
 361
 362		if (!tbl)
 363			continue;
 364
 365		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 366		tce_iommu_free_table(container, tbl);
 367	}
 368
 369	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
 370		WARN_ON(tce_iommu_prereg_free(container, tcemem));
 371
 372	tce_iommu_disable(container);
 373	if (container->mm)
 374		mmdrop(container->mm);
 375	mutex_destroy(&container->lock);
 376
 377	kfree(container);
 378}
 379
 380static void tce_iommu_unuse_page(struct tce_container *container,
 381		unsigned long hpa)
 382{
 383	struct page *page;
 384
 385	page = pfn_to_page(hpa >> PAGE_SHIFT);
 386	unpin_user_page(page);
 387}
 388
 389static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
 390		unsigned long tce, unsigned long shift,
 391		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 392{
 393	long ret = 0;
 394	struct mm_iommu_table_group_mem_t *mem;
 395
 396	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
 397	if (!mem)
 398		return -EINVAL;
 399
 400	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
 401	if (ret)
 402		return -EINVAL;
 403
 404	*pmem = mem;
 405
 406	return 0;
 407}
 408
 409static void tce_iommu_unuse_page_v2(struct tce_container *container,
 410		struct iommu_table *tbl, unsigned long entry)
 411{
 412	struct mm_iommu_table_group_mem_t *mem = NULL;
 413	int ret;
 414	unsigned long hpa = 0;
 415	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 416
 417	if (!pua)
 418		return;
 419
 420	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
 421			tbl->it_page_shift, &hpa, &mem);
 422	if (ret)
 423		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
 424				__func__, be64_to_cpu(*pua), entry, ret);
 425	if (mem)
 426		mm_iommu_mapped_dec(mem);
 427
 428	*pua = cpu_to_be64(0);
 429}
 430
 431static int tce_iommu_clear(struct tce_container *container,
 432		struct iommu_table *tbl,
 433		unsigned long entry, unsigned long pages)
 434{
 435	unsigned long oldhpa;
 436	long ret;
 437	enum dma_data_direction direction;
 438	unsigned long lastentry = entry + pages, firstentry = entry;
 439
 440	for ( ; entry < lastentry; ++entry) {
 441		if (tbl->it_indirect_levels && tbl->it_userspace) {
 442			/*
 443			 * For multilevel tables, we can take a shortcut here
 444			 * and skip some TCEs as we know that the userspace
 445			 * addresses cache is a mirror of the real TCE table
 446			 * and if it is missing some indirect levels, then
 447			 * the hardware table does not have them allocated
 448			 * either and therefore does not require updating.
 449			 */
 450			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
 451					entry);
 452			if (!pua) {
 453				/* align to level_size which is power of two */
 454				entry |= tbl->it_level_size - 1;
 455				continue;
 456			}
 457		}
 458
 459		cond_resched();
 460
 461		direction = DMA_NONE;
 462		oldhpa = 0;
 463		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
 464				&direction);
 465		if (ret)
 466			continue;
 467
 468		if (direction == DMA_NONE)
 469			continue;
 470
 471		if (container->v2) {
 472			tce_iommu_unuse_page_v2(container, tbl, entry);
 473			continue;
 474		}
 475
 476		tce_iommu_unuse_page(container, oldhpa);
 477	}
 478
 479	iommu_tce_kill(tbl, firstentry, pages);
 480
 481	return 0;
 482}
 483
 484static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 485{
 486	struct page *page = NULL;
 487	enum dma_data_direction direction = iommu_tce_direction(tce);
 488
 489	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
 490			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
 491			&page) != 1)
 492		return -EFAULT;
 493
 494	*hpa = __pa((unsigned long) page_address(page));
 495
 496	return 0;
 497}
 498
 499static long tce_iommu_build(struct tce_container *container,
 500		struct iommu_table *tbl,
 501		unsigned long entry, unsigned long tce, unsigned long pages,
 502		enum dma_data_direction direction)
 503{
 504	long i, ret = 0;
 505	unsigned long hpa;
 506	enum dma_data_direction dirtmp;
 507
 508	for (i = 0; i < pages; ++i) {
 509		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 510
 511		ret = tce_iommu_use_page(tce, &hpa);
 512		if (ret)
 513			break;
 514
 515		if (!tce_page_is_contained(container->mm, hpa,
 516				tbl->it_page_shift)) {
 517			ret = -EPERM;
 518			break;
 519		}
 520
 521		hpa |= offset;
 522		dirtmp = direction;
 523		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 524				&hpa, &dirtmp);
 525		if (ret) {
 526			tce_iommu_unuse_page(container, hpa);
 527			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 528					__func__, entry << tbl->it_page_shift,
 529					tce, ret);
 530			break;
 531		}
 532
 533		if (dirtmp != DMA_NONE)
 534			tce_iommu_unuse_page(container, hpa);
 535
 536		tce += IOMMU_PAGE_SIZE(tbl);
 537	}
 538
 539	if (ret)
 540		tce_iommu_clear(container, tbl, entry, i);
 541	else
 542		iommu_tce_kill(tbl, entry, pages);
 543
 544	return ret;
 545}
 546
 547static long tce_iommu_build_v2(struct tce_container *container,
 548		struct iommu_table *tbl,
 549		unsigned long entry, unsigned long tce, unsigned long pages,
 550		enum dma_data_direction direction)
 551{
 552	long i, ret = 0;
 553	unsigned long hpa;
 554	enum dma_data_direction dirtmp;
 555
 556	for (i = 0; i < pages; ++i) {
 557		struct mm_iommu_table_group_mem_t *mem = NULL;
 558		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
 559
 560		ret = tce_iommu_prereg_ua_to_hpa(container,
 561				tce, tbl->it_page_shift, &hpa, &mem);
 562		if (ret)
 563			break;
 564
 565		if (!tce_page_is_contained(container->mm, hpa,
 566				tbl->it_page_shift)) {
 567			ret = -EPERM;
 568			break;
 569		}
 570
 571		/* Preserve offset within IOMMU page */
 572		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 573		dirtmp = direction;
 574
 575		/* The registered region is being unregistered */
 576		if (mm_iommu_mapped_inc(mem))
 577			break;
 578
 579		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 580				&hpa, &dirtmp);
 581		if (ret) {
 582			/* dirtmp cannot be DMA_NONE here */
 583			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 584			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 585					__func__, entry << tbl->it_page_shift,
 586					tce, ret);
 587			break;
 588		}
 589
 590		if (dirtmp != DMA_NONE)
 591			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 592
 593		*pua = cpu_to_be64(tce);
 594
 595		tce += IOMMU_PAGE_SIZE(tbl);
 596	}
 597
 598	if (ret)
 599		tce_iommu_clear(container, tbl, entry, i);
 600	else
 601		iommu_tce_kill(tbl, entry, pages);
 602
 603	return ret;
 604}
 605
 606static long tce_iommu_create_table(struct tce_container *container,
 607			struct iommu_table_group *table_group,
 608			int num,
 609			__u32 page_shift,
 610			__u64 window_size,
 611			__u32 levels,
 612			struct iommu_table **ptbl)
 613{
 614	long ret, table_size;
 615
 616	table_size = table_group->ops->get_table_size(page_shift, window_size,
 617			levels);
 618	if (!table_size)
 619		return -EINVAL;
 620
 621	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 622	if (ret)
 623		return ret;
 624
 625	ret = table_group->ops->create_table(table_group, num,
 626			page_shift, window_size, levels, ptbl);
 627
 628	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 629	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
 630
 631	return ret;
 632}
 633
 634static void tce_iommu_free_table(struct tce_container *container,
 635		struct iommu_table *tbl)
 636{
 637	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 638
 639	iommu_tce_table_put(tbl);
 640	account_locked_vm(container->mm, pages, false);
 641}
 642
 643static long tce_iommu_create_window(struct tce_container *container,
 644		__u32 page_shift, __u64 window_size, __u32 levels,
 645		__u64 *start_addr)
 646{
 647	struct tce_iommu_group *tcegrp;
 648	struct iommu_table_group *table_group;
 649	struct iommu_table *tbl = NULL;
 650	long ret, num;
 651
 652	num = tce_iommu_find_free_table(container);
 653	if (num < 0)
 654		return num;
 655
 656	/* Get the first group for ops::create_table */
 657	tcegrp = list_first_entry(&container->group_list,
 658			struct tce_iommu_group, next);
 659	table_group = iommu_group_get_iommudata(tcegrp->grp);
 660	if (!table_group)
 661		return -EFAULT;
 662
 663	if (!(table_group->pgsizes & (1ULL << page_shift)))
 664		return -EINVAL;
 665
 666	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
 667			!table_group->ops->get_table_size ||
 668			!table_group->ops->create_table)
 669		return -EPERM;
 670
 671	/* Create TCE table */
 672	ret = tce_iommu_create_table(container, table_group, num,
 673			page_shift, window_size, levels, &tbl);
 674	if (ret)
 675		return ret;
 676
 677	BUG_ON(!tbl->it_ops->free);
 678
 679	/*
 680	 * Program the table to every group.
  681	 * Groups have been tested for compatibility at attach time.
 682	 */
 683	list_for_each_entry(tcegrp, &container->group_list, next) {
 684		table_group = iommu_group_get_iommudata(tcegrp->grp);
 685
 686		ret = table_group->ops->set_window(table_group, num, tbl);
 687		if (ret)
 688			goto unset_exit;
 689	}
 690
 691	container->tables[num] = tbl;
 692
 693	/* Return start address assigned by platform in create_table() */
 694	*start_addr = tbl->it_offset << tbl->it_page_shift;
 695
 696	return 0;
 697
 698unset_exit:
 699	list_for_each_entry(tcegrp, &container->group_list, next) {
 700		table_group = iommu_group_get_iommudata(tcegrp->grp);
 701		table_group->ops->unset_window(table_group, num);
 702	}
 703	tce_iommu_free_table(container, tbl);
 704
 705	return ret;
 706}
 707
 708static long tce_iommu_remove_window(struct tce_container *container,
 709		__u64 start_addr)
 710{
 711	struct iommu_table_group *table_group = NULL;
 712	struct iommu_table *tbl;
 713	struct tce_iommu_group *tcegrp;
 714	int num;
 715
 716	num = tce_iommu_find_table(container, start_addr, &tbl);
 717	if (num < 0)
 718		return -EINVAL;
 719
 720	BUG_ON(!tbl->it_size);
 721
 722	/* Detach groups from IOMMUs */
 723	list_for_each_entry(tcegrp, &container->group_list, next) {
 724		table_group = iommu_group_get_iommudata(tcegrp->grp);
 725
 726		/*
 727		 * SPAPR TCE IOMMU exposes the default DMA window to
 728		 * the guest via dma32_window_start/size of
 729		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
  730		 * userspace to remove this window, and some do not, so
  731		 * check for the platform capability here.
 732		 */
 733		if (!table_group->ops || !table_group->ops->unset_window)
 734			return -EPERM;
 735
 736		table_group->ops->unset_window(table_group, num);
 737	}
 738
 739	/* Free table */
 740	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 741	tce_iommu_free_table(container, tbl);
 742	container->tables[num] = NULL;
 743
 744	return 0;
 745}
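tce_iommu_create_window() and tce_iommu_remove_window() back the dynamic DMA window ioctls handled further below. A short sketch of a create/remove round trip on a v2 container follows; setup and error handling are omitted, the helper name is illustrative, and the page shift, window size and level count are examples that must match what VFIO_IOMMU_SPAPR_TCE_GET_INFO advertises in its ddw field.

	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Illustrative userspace helper: create a dynamic window, then remove it */
	static int spapr_ddw_roundtrip(int container)
	{
		struct vfio_iommu_spapr_tce_create create = {
			.argsz = sizeof(create),
			.page_shift = 16,		/* 64K IOMMU pages (example) */
			.window_size = 1ULL << 30,	/* 1GB window (example) */
			.levels = 1,
		};
		struct vfio_iommu_spapr_tce_remove remove = {
			.argsz = sizeof(remove),
		};

		/* Serviced by tce_iommu_create_window() */
		if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
			return -1;

		/* start_addr is the bus address chosen by the platform */
		remove.start_addr = create.start_addr;

		/* Serviced by tce_iommu_remove_window() */
		return ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
	}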
 746
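	/*
	 * The default 32-bit window is not created when a group is attached;
	 * attaching only sets def_window_pending.  The first MAP_DMA/UNMAP_DMA
	 * or an explicit VFIO_IOMMU_SPAPR_TCE_CREATE calls this helper to
	 * actually create the window, and VFIO_IOMMU_SPAPR_TCE_REMOVE with
	 * start_addr == 0 merely clears the pending flag (see the ioctl
	 * handler below).
	 */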
 747static long tce_iommu_create_default_window(struct tce_container *container)
 748{
 749	long ret;
 750	__u64 start_addr = 0;
 751	struct tce_iommu_group *tcegrp;
 752	struct iommu_table_group *table_group;
 753
 754	if (!container->def_window_pending)
 755		return 0;
 756
 757	if (!tce_groups_attached(container))
 758		return -ENODEV;
 759
 760	tcegrp = list_first_entry(&container->group_list,
 761			struct tce_iommu_group, next);
 762	table_group = iommu_group_get_iommudata(tcegrp->grp);
 763	if (!table_group)
 764		return -ENODEV;
 765
 766	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
 767			table_group->tce32_size, 1, &start_addr);
 768	WARN_ON_ONCE(!ret && start_addr);
 769
 770	if (!ret)
 771		container->def_window_pending = false;
 772
 773	return ret;
 774}
 775
 776static long tce_iommu_ioctl(void *iommu_data,
 777				 unsigned int cmd, unsigned long arg)
 778{
 779	struct tce_container *container = iommu_data;
 780	unsigned long minsz, ddwsz;
 781	long ret;
 782
 783	switch (cmd) {
 784	case VFIO_CHECK_EXTENSION:
 785		switch (arg) {
 786		case VFIO_SPAPR_TCE_IOMMU:
 787		case VFIO_SPAPR_TCE_v2_IOMMU:
 788			ret = 1;
 789			break;
 790		default:
 791			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
 792			break;
 793		}
 794
 795		return (ret < 0) ? 0 : ret;
 796	}
 797
 798	/*
 799	 * Sanity check to prevent one userspace from manipulating
  800	 * another userspace's mm.
 801	 */
 802	BUG_ON(!container);
 803	if (container->mm && container->mm != current->mm)
 804		return -EPERM;
 805
 806	switch (cmd) {
 807	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 808		struct vfio_iommu_spapr_tce_info info;
 809		struct tce_iommu_group *tcegrp;
 810		struct iommu_table_group *table_group;
 811
 812		if (!tce_groups_attached(container))
 813			return -ENXIO;
 814
 815		tcegrp = list_first_entry(&container->group_list,
 816				struct tce_iommu_group, next);
 817		table_group = iommu_group_get_iommudata(tcegrp->grp);
 818
 819		if (!table_group)
 820			return -ENXIO;
 821
 822		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
 823				dma32_window_size);
 824
 825		if (copy_from_user(&info, (void __user *)arg, minsz))
 826			return -EFAULT;
 827
 828		if (info.argsz < minsz)
 829			return -EINVAL;
 830
 831		info.dma32_window_start = table_group->tce32_start;
 832		info.dma32_window_size = table_group->tce32_size;
 833		info.flags = 0;
 834		memset(&info.ddw, 0, sizeof(info.ddw));
 835
 836		if (table_group->max_dynamic_windows_supported &&
 837				container->v2) {
 838			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
 839			info.ddw.pgsizes = table_group->pgsizes;
 840			info.ddw.max_dynamic_windows_supported =
 841				table_group->max_dynamic_windows_supported;
 842			info.ddw.levels = table_group->max_levels;
 843		}
 844
 845		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
 846
 847		if (info.argsz >= ddwsz)
 848			minsz = ddwsz;
 849
 850		if (copy_to_user((void __user *)arg, &info, minsz))
 851			return -EFAULT;
 852
 853		return 0;
 854	}
 855	case VFIO_IOMMU_MAP_DMA: {
 856		struct vfio_iommu_type1_dma_map param;
 857		struct iommu_table *tbl = NULL;
 858		long num;
 859		enum dma_data_direction direction;
 860
 861		if (!container->enabled)
 862			return -EPERM;
 863
 864		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 865
 866		if (copy_from_user(&param, (void __user *)arg, minsz))
 867			return -EFAULT;
 868
 869		if (param.argsz < minsz)
 870			return -EINVAL;
 871
 872		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
 873				VFIO_DMA_MAP_FLAG_WRITE))
 874			return -EINVAL;
 875
 876		ret = tce_iommu_create_default_window(container);
 877		if (ret)
 878			return ret;
 879
 880		num = tce_iommu_find_table(container, param.iova, &tbl);
 881		if (num < 0)
 882			return -ENXIO;
 883
 884		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 885				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 886			return -EINVAL;
 887
 888		/* iova is checked by the IOMMU API */
 889		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
 890			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 891				direction = DMA_BIDIRECTIONAL;
 892			else
 893				direction = DMA_TO_DEVICE;
 894		} else {
 895			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 896				direction = DMA_FROM_DEVICE;
 897			else
 898				return -EINVAL;
 899		}
 900
 901		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 902		if (ret)
 903			return ret;
 904
 905		if (container->v2)
 906			ret = tce_iommu_build_v2(container, tbl,
 907					param.iova >> tbl->it_page_shift,
 908					param.vaddr,
 909					param.size >> tbl->it_page_shift,
 910					direction);
 911		else
 912			ret = tce_iommu_build(container, tbl,
 913					param.iova >> tbl->it_page_shift,
 914					param.vaddr,
 915					param.size >> tbl->it_page_shift,
 916					direction);
 917
 918		iommu_flush_tce(tbl);
 919
 920		return ret;
 921	}
 922	case VFIO_IOMMU_UNMAP_DMA: {
 923		struct vfio_iommu_type1_dma_unmap param;
 924		struct iommu_table *tbl = NULL;
 925		long num;
 926
 927		if (!container->enabled)
 928			return -EPERM;
 929
 930		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 931				size);
 932
 933		if (copy_from_user(&param, (void __user *)arg, minsz))
 934			return -EFAULT;
 935
 936		if (param.argsz < minsz)
 937			return -EINVAL;
 938
  939		/* No flags are currently supported */
 940		if (param.flags)
 941			return -EINVAL;
 942
 943		ret = tce_iommu_create_default_window(container);
 944		if (ret)
 945			return ret;
 946
 947		num = tce_iommu_find_table(container, param.iova, &tbl);
 948		if (num < 0)
 949			return -ENXIO;
 950
 951		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 952			return -EINVAL;
 953
 954		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
 955				param.size >> tbl->it_page_shift);
 956		if (ret)
 957			return ret;
 958
 959		ret = tce_iommu_clear(container, tbl,
 960				param.iova >> tbl->it_page_shift,
 961				param.size >> tbl->it_page_shift);
 962		iommu_flush_tce(tbl);
 963
 964		return ret;
 965	}
 966	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
 967		struct vfio_iommu_spapr_register_memory param;
 968
 969		if (!container->v2)
 970			break;
 971
 972		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
 973				size);
 974
 975		ret = tce_iommu_mm_set(container);
 976		if (ret)
 977			return ret;
 978
 979		if (copy_from_user(&param, (void __user *)arg, minsz))
 980			return -EFAULT;
 981
 982		if (param.argsz < minsz)
 983			return -EINVAL;
 984
  985		/* No flags are currently supported */
 986		if (param.flags)
 987			return -EINVAL;
 988
 989		mutex_lock(&container->lock);
 990		ret = tce_iommu_register_pages(container, param.vaddr,
 991				param.size);
 992		mutex_unlock(&container->lock);
 993
 994		return ret;
 995	}
 996	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
 997		struct vfio_iommu_spapr_register_memory param;
 998
 999		if (!container->v2)
1000			break;
1001
1002		if (!container->mm)
1003			return -EPERM;
1004
1005		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1006				size);
1007
1008		if (copy_from_user(&param, (void __user *)arg, minsz))
1009			return -EFAULT;
1010
1011		if (param.argsz < minsz)
1012			return -EINVAL;
1013
 1014		/* No flags are currently supported */
1015		if (param.flags)
1016			return -EINVAL;
1017
1018		mutex_lock(&container->lock);
1019		ret = tce_iommu_unregister_pages(container, param.vaddr,
1020				param.size);
1021		mutex_unlock(&container->lock);
1022
1023		return ret;
1024	}
1025	case VFIO_IOMMU_ENABLE:
1026		if (container->v2)
1027			break;
1028
1029		mutex_lock(&container->lock);
1030		ret = tce_iommu_enable(container);
1031		mutex_unlock(&container->lock);
1032		return ret;
1033
1034
1035	case VFIO_IOMMU_DISABLE:
1036		if (container->v2)
1037			break;
1038
1039		mutex_lock(&container->lock);
1040		tce_iommu_disable(container);
1041		mutex_unlock(&container->lock);
1042		return 0;
1043
1044	case VFIO_EEH_PE_OP: {
1045		struct tce_iommu_group *tcegrp;
1046
1047		ret = 0;
1048		list_for_each_entry(tcegrp, &container->group_list, next) {
1049			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
1050					cmd, arg);
1051			if (ret)
1052				return ret;
1053		}
1054		return ret;
1055	}
1056
1057	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1058		struct vfio_iommu_spapr_tce_create create;
1059
1060		if (!container->v2)
1061			break;
1062
1063		ret = tce_iommu_mm_set(container);
1064		if (ret)
1065			return ret;
1066
1067		if (!tce_groups_attached(container))
1068			return -ENXIO;
1069
1070		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1071				start_addr);
1072
1073		if (copy_from_user(&create, (void __user *)arg, minsz))
1074			return -EFAULT;
1075
1076		if (create.argsz < minsz)
1077			return -EINVAL;
1078
1079		if (create.flags)
1080			return -EINVAL;
1081
1082		mutex_lock(&container->lock);
1083
1084		ret = tce_iommu_create_default_window(container);
1085		if (!ret)
1086			ret = tce_iommu_create_window(container,
1087					create.page_shift,
1088					create.window_size, create.levels,
1089					&create.start_addr);
1090
1091		mutex_unlock(&container->lock);
1092
1093		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1094			ret = -EFAULT;
1095
1096		return ret;
1097	}
1098	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1099		struct vfio_iommu_spapr_tce_remove remove;
1100
1101		if (!container->v2)
1102			break;
1103
1104		ret = tce_iommu_mm_set(container);
1105		if (ret)
1106			return ret;
1107
1108		if (!tce_groups_attached(container))
1109			return -ENXIO;
1110
1111		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1112				start_addr);
1113
1114		if (copy_from_user(&remove, (void __user *)arg, minsz))
1115			return -EFAULT;
1116
1117		if (remove.argsz < minsz)
1118			return -EINVAL;
1119
1120		if (remove.flags)
1121			return -EINVAL;
1122
1123		if (container->def_window_pending && !remove.start_addr) {
1124			container->def_window_pending = false;
1125			return 0;
1126		}
1127
1128		mutex_lock(&container->lock);
1129
1130		ret = tce_iommu_remove_window(container, remove.start_addr);
1131
1132		mutex_unlock(&container->lock);
1133
1134		return ret;
1135	}
1136	}
1137
1138	return -ENOTTY;
1139}
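Putting the handler above together, the classic (v1) flow from userspace is: confirm the extension, read the default 32-bit window geometry, enable the container, then map. A minimal sketch follows; it assumes the container fd already has a group attached and VFIO_SET_IOMMU has selected the SPAPR TCE backend, that buf and size are aligned to the IOMMU page size, and omits error handling. The helper name is illustrative only.

	#include <stddef.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Illustrative userspace helper: map a buffer at the start of the default window */
	static int spapr_v1_map(int container, void *buf, size_t size)
	{
		struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
		struct vfio_iommu_type1_dma_map map = {
			.argsz = sizeof(map),
			.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
			.vaddr = (__u64)(unsigned long)buf,
			.size  = size,
		};

		if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_IOMMU) != 1)
			return -1;

		/* Returns table_group->tce32_start/tce32_size */
		if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info))
			return -1;

		/* v1 containers must be enabled before MAP_DMA/UNMAP_DMA */
		if (ioctl(container, VFIO_IOMMU_ENABLE))
			return -1;

		map.iova = info.dma32_window_start;
		return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
	}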
1140
1141static void tce_iommu_release_ownership(struct tce_container *container,
1142		struct iommu_table_group *table_group)
1143{
1144	int i;
1145
1146	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1147		struct iommu_table *tbl = container->tables[i];
1148
1149		if (!tbl)
1150			continue;
1151
1152		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
1153		if (tbl->it_map)
1154			iommu_release_ownership(tbl);
1155
1156		container->tables[i] = NULL;
1157	}
1158}
1159
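/*
 * Non-dynamic ownership path: the platform tables already exist, so the
 * container takes per-table ownership with iommu_take_ownership() and
 * mirrors table_group->tables[] into container->tables[].  The _ddw
 * variants below are used instead when the platform provides
 * take_ownership/release_ownership ops (the v2 case in
 * tce_iommu_attach_group()).
 */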
1160static int tce_iommu_take_ownership(struct tce_container *container,
1161		struct iommu_table_group *table_group)
1162{
1163	int i, j, rc = 0;
1164
1165	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1166		struct iommu_table *tbl = table_group->tables[i];
1167
1168		if (!tbl || !tbl->it_map)
1169			continue;
1170
1171		rc = iommu_take_ownership(tbl);
1172		if (rc) {
1173			for (j = 0; j < i; ++j)
1174				iommu_release_ownership(
1175						table_group->tables[j]);
1176
1177			return rc;
1178		}
1179	}
1180
1181	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1182		container->tables[i] = table_group->tables[i];
1183
1184	return 0;
1185}
1186
1187static void tce_iommu_release_ownership_ddw(struct tce_container *container,
1188		struct iommu_table_group *table_group)
1189{
1190	long i;
1191
1192	if (!table_group->ops->unset_window) {
1193		WARN_ON_ONCE(1);
1194		return;
1195	}
1196
1197	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1198		if (container->tables[i])
1199			table_group->ops->unset_window(table_group, i);
1200
1201	table_group->ops->release_ownership(table_group);
1202}
1203
1204static long tce_iommu_take_ownership_ddw(struct tce_container *container,
1205		struct iommu_table_group *table_group)
1206{
1207	long i, ret = 0;
1208
1209	if (!table_group->ops->create_table || !table_group->ops->set_window ||
1210			!table_group->ops->release_ownership) {
1211		WARN_ON_ONCE(1);
1212		return -EFAULT;
1213	}
1214
1215	table_group->ops->take_ownership(table_group);
1216
1217	/* Set all windows to the new group */
1218	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1219		struct iommu_table *tbl = container->tables[i];
1220
1221		if (!tbl)
1222			continue;
1223
1224		ret = table_group->ops->set_window(table_group, i, tbl);
1225		if (ret)
1226			goto release_exit;
1227	}
1228
1229	return 0;
1230
1231release_exit:
1232	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1233		table_group->ops->unset_window(table_group, i);
1234
1235	table_group->ops->release_ownership(table_group);
1236
1237	return ret;
1238}
1239
1240static int tce_iommu_attach_group(void *iommu_data,
1241		struct iommu_group *iommu_group)
1242{
1243	int ret = 0;
1244	struct tce_container *container = iommu_data;
1245	struct iommu_table_group *table_group;
1246	struct tce_iommu_group *tcegrp = NULL;
1247
1248	mutex_lock(&container->lock);
1249
1250	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1251			iommu_group_id(iommu_group), iommu_group); */
1252	table_group = iommu_group_get_iommudata(iommu_group);
1253	if (!table_group) {
1254		ret = -ENODEV;
1255		goto unlock_exit;
1256	}
1257
1258	if (tce_groups_attached(container) && (!table_group->ops ||
1259			!table_group->ops->take_ownership ||
1260			!table_group->ops->release_ownership)) {
1261		ret = -EBUSY;
1262		goto unlock_exit;
1263	}
1264
 1265	/* Check if the new group has the same iommu_ops (i.e. is compatible) */
1266	list_for_each_entry(tcegrp, &container->group_list, next) {
1267		struct iommu_table_group *table_group_tmp;
1268
1269		if (tcegrp->grp == iommu_group) {
1270			pr_warn("tce_vfio: Group %d is already attached\n",
1271					iommu_group_id(iommu_group));
1272			ret = -EBUSY;
1273			goto unlock_exit;
1274		}
1275		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1276		if (table_group_tmp->ops->create_table !=
1277				table_group->ops->create_table) {
1278			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1279					iommu_group_id(iommu_group),
1280					iommu_group_id(tcegrp->grp));
1281			ret = -EPERM;
1282			goto unlock_exit;
1283		}
1284	}
1285
1286	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1287	if (!tcegrp) {
1288		ret = -ENOMEM;
1289		goto unlock_exit;
1290	}
1291
1292	if (!table_group->ops || !table_group->ops->take_ownership ||
1293			!table_group->ops->release_ownership) {
1294		if (container->v2) {
1295			ret = -EPERM;
1296			goto free_exit;
1297		}
1298		ret = tce_iommu_take_ownership(container, table_group);
1299	} else {
1300		if (!container->v2) {
1301			ret = -EPERM;
1302			goto free_exit;
1303		}
1304		ret = tce_iommu_take_ownership_ddw(container, table_group);
1305		if (!tce_groups_attached(container) && !container->tables[0])
1306			container->def_window_pending = true;
1307	}
1308
1309	if (!ret) {
1310		tcegrp->grp = iommu_group;
1311		list_add(&tcegrp->next, &container->group_list);
1312	}
1313
1314free_exit:
1315	if (ret && tcegrp)
1316		kfree(tcegrp);
1317
1318unlock_exit:
1319	mutex_unlock(&container->lock);
1320
1321	return ret;
1322}
1323
1324static void tce_iommu_detach_group(void *iommu_data,
1325		struct iommu_group *iommu_group)
1326{
1327	struct tce_container *container = iommu_data;
1328	struct iommu_table_group *table_group;
1329	bool found = false;
1330	struct tce_iommu_group *tcegrp;
1331
1332	mutex_lock(&container->lock);
1333
1334	list_for_each_entry(tcegrp, &container->group_list, next) {
1335		if (tcegrp->grp == iommu_group) {
1336			found = true;
1337			break;
1338		}
1339	}
1340
1341	if (!found) {
1342		pr_warn("tce_vfio: detaching unattached group #%u\n",
1343				iommu_group_id(iommu_group));
1344		goto unlock_exit;
1345	}
1346
1347	list_del(&tcegrp->next);
1348	kfree(tcegrp);
1349
1350	table_group = iommu_group_get_iommudata(iommu_group);
1351	BUG_ON(!table_group);
1352
1353	if (!table_group->ops || !table_group->ops->release_ownership)
1354		tce_iommu_release_ownership(container, table_group);
1355	else
1356		tce_iommu_release_ownership_ddw(container, table_group);
1357
1358unlock_exit:
1359	mutex_unlock(&container->lock);
1360}
1361
1362static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1363	.name		= "iommu-vfio-powerpc",
1364	.owner		= THIS_MODULE,
1365	.open		= tce_iommu_open,
1366	.release	= tce_iommu_release,
1367	.ioctl		= tce_iommu_ioctl,
1368	.attach_group	= tce_iommu_attach_group,
1369	.detach_group	= tce_iommu_detach_group,
1370};
1371
1372static int __init tce_iommu_init(void)
1373{
1374	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1375}
1376
1377static void __exit tce_iommu_cleanup(void)
1378{
1379	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1380}
1381
1382module_init(tce_iommu_init);
1383module_exit(tce_iommu_cleanup);
1384
1385MODULE_VERSION(DRIVER_VERSION);
1386MODULE_LICENSE("GPL v2");
1387MODULE_AUTHOR(DRIVER_AUTHOR);
1388MODULE_DESCRIPTION(DRIVER_DESC);
1389