v6.8
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * VFIO: IOMMU DMA mapping support for TCE on POWER
   4 *
   5 * Copyright (C) 2013 IBM Corp.  All rights reserved.
   6 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
   7 * Copyright Gavin Shan, IBM Corporation 2014.
   8 *
   9 * Derived from original vfio_iommu_type1.c:
  10 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
  11 *     Author: Alex Williamson <alex.williamson@redhat.com>
  12 */
  13
  14#include <linux/module.h>
  15#include <linux/pci.h>
  16#include <linux/slab.h>
  17#include <linux/uaccess.h>
  18#include <linux/err.h>
  19#include <linux/vfio.h>
  20#include <linux/vmalloc.h>
  21#include <linux/sched/mm.h>
  22#include <linux/sched/signal.h>
  23#include <linux/mm.h>
  24#include "vfio.h"
  25
  26#include <asm/iommu.h>
  27#include <asm/tce.h>
  28#include <asm/mmu_context.h>
  29
  30#define DRIVER_VERSION  "0.1"
  31#define DRIVER_AUTHOR   "aik@ozlabs.ru"
  32#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
  33
  34static void tce_iommu_detach_group(void *iommu_data,
  35		struct iommu_group *iommu_group);
  36
  37/*
  38 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
  39 *
  40 * This code handles mapping and unmapping of user data buffers
  41 * into DMA'ble space using the IOMMU
  42 */
  43
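For orientation, here is a minimal userspace sketch of how a container using this driver is typically set up. The group number "42" and the helper name spapr_container_setup() are made up for illustration, and all error handling is omitted.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Illustrative sketch only: group "42" is hypothetical, errors are ignored. */
int spapr_container_setup(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/42", O_RDWR);

	/* Bind the group to the container, then select the sPAPR TCE backend */
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);

	/* v1 containers must be enabled before DMA mappings are accepted */
	ioctl(container, VFIO_IOMMU_ENABLE);

	return container;
}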
  44struct tce_iommu_group {
  45	struct list_head next;
  46	struct iommu_group *grp;
  47};
  48
  49/*
   50 * A container needs to remember which preregistered regions it has
   51 * referenced in order to do proper cleanup at userspace process exit.
  52 */
  53struct tce_iommu_prereg {
  54	struct list_head next;
  55	struct mm_iommu_table_group_mem_t *mem;
  56};
  57
  58/*
   59 * The container is not supplied with the IOMMU group at the moment of
   60 * initialization, so it keeps a list of attached groups: v1 allows a
   61 * single group per container, v2 allows several compatible groups.
  62 */
  63struct tce_container {
  64	struct mutex lock;
  65	bool enabled;
  66	bool v2;
  67	bool def_window_pending;
  68	unsigned long locked_pages;
  69	struct mm_struct *mm;
  70	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
  71	struct list_head group_list;
  72	struct list_head prereg_list;
  73};
  74
  75static long tce_iommu_mm_set(struct tce_container *container)
  76{
  77	if (container->mm) {
  78		if (container->mm == current->mm)
  79			return 0;
  80		return -EPERM;
  81	}
  82	BUG_ON(!current->mm);
  83	container->mm = current->mm;
  84	mmgrab(container->mm);
  85
  86	return 0;
  87}
  88
  89static long tce_iommu_prereg_free(struct tce_container *container,
  90		struct tce_iommu_prereg *tcemem)
  91{
  92	long ret;
  93
  94	ret = mm_iommu_put(container->mm, tcemem->mem);
  95	if (ret)
  96		return ret;
  97
  98	list_del(&tcemem->next);
  99	kfree(tcemem);
 100
 101	return 0;
 102}
 103
 104static long tce_iommu_unregister_pages(struct tce_container *container,
 105		__u64 vaddr, __u64 size)
 106{
 107	struct mm_iommu_table_group_mem_t *mem;
 108	struct tce_iommu_prereg *tcemem;
 109	bool found = false;
 110	long ret;
 111
 112	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
 113		return -EINVAL;
 114
 115	mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
 116	if (!mem)
 117		return -ENOENT;
 118
 119	list_for_each_entry(tcemem, &container->prereg_list, next) {
 120		if (tcemem->mem == mem) {
 121			found = true;
 122			break;
 123		}
 124	}
 125
 126	if (!found)
 127		ret = -ENOENT;
 128	else
 129		ret = tce_iommu_prereg_free(container, tcemem);
 130
 131	mm_iommu_put(container->mm, mem);
 132
 133	return ret;
 134}
 135
 136static long tce_iommu_register_pages(struct tce_container *container,
 137		__u64 vaddr, __u64 size)
 138{
 139	long ret = 0;
 140	struct mm_iommu_table_group_mem_t *mem = NULL;
 141	struct tce_iommu_prereg *tcemem;
 142	unsigned long entries = size >> PAGE_SHIFT;
 143
 144	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
 145			((vaddr + size) < vaddr))
 146		return -EINVAL;
 147
 148	mem = mm_iommu_get(container->mm, vaddr, entries);
 149	if (mem) {
 150		list_for_each_entry(tcemem, &container->prereg_list, next) {
 151			if (tcemem->mem == mem) {
 152				ret = -EBUSY;
 153				goto put_exit;
 154			}
 155		}
 156	} else {
 157		ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
 158		if (ret)
 159			return ret;
 160	}
 161
 162	tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
 163	if (!tcemem) {
 164		ret = -ENOMEM;
 165		goto put_exit;
 166	}
 167
 168	tcemem->mem = mem;
 169	list_add(&tcemem->next, &container->prereg_list);
 170
 171	container->enabled = true;
 172
 173	return 0;
 174
 175put_exit:
 176	mm_iommu_put(container->mm, mem);
 177	return ret;
 178}
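As a usage sketch (not part of this file), a v2 container would preregister guest memory with VFIO_IOMMU_SPAPR_REGISTER_MEMORY roughly as below; the helper name and buffer handling are assumptions.

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Sketch: preregister an anonymous, page-aligned buffer with a
 * VFIO_SPAPR_TCE_v2_IOMMU container. Error handling is omitted. */
int spapr_prereg_buffer(int container, size_t size, void **bufp)
{
	void *buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct vfio_iommu_spapr_register_memory reg = {
		.argsz = sizeof(reg),
		.flags = 0,
		.vaddr = (__u64)(unsigned long)buf,
		.size  = size,	/* both vaddr and size must be page aligned */
	};

	*bufp = buf;
	return ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
}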
 179
 180static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
 181		unsigned int it_page_shift)
 182{
 183	struct page *page;
 184	unsigned long size = 0;
 185
 186	if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
 187		return size == (1UL << it_page_shift);
 188
 189	page = pfn_to_page(hpa >> PAGE_SHIFT);
 190	/*
 191	 * Check that the TCE table granularity is not bigger than the size of
 192	 * a page we just found. Otherwise the hardware can get access to
  193	 * a bigger memory chunk than it should.
 194	 */
 195	return page_shift(compound_head(page)) >= it_page_shift;
 196}
 197
 198static inline bool tce_groups_attached(struct tce_container *container)
 199{
 200	return !list_empty(&container->group_list);
 201}
 202
 203static long tce_iommu_find_table(struct tce_container *container,
 204		phys_addr_t ioba, struct iommu_table **ptbl)
 205{
 206	long i;
 207
 208	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 209		struct iommu_table *tbl = container->tables[i];
 210
 211		if (tbl) {
 212			unsigned long entry = ioba >> tbl->it_page_shift;
 213			unsigned long start = tbl->it_offset;
 214			unsigned long end = start + tbl->it_size;
 215
 216			if ((start <= entry) && (entry < end)) {
 217				*ptbl = tbl;
 218				return i;
 219			}
 220		}
 221	}
 222
 223	return -1;
 224}
 225
 226static int tce_iommu_find_free_table(struct tce_container *container)
 227{
 228	int i;
 229
 230	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 231		if (!container->tables[i])
 232			return i;
 233	}
 234
 235	return -ENOSPC;
 236}
 237
 238static int tce_iommu_enable(struct tce_container *container)
 239{
 240	int ret = 0;
 241	unsigned long locked;
 242	struct iommu_table_group *table_group;
 243	struct tce_iommu_group *tcegrp;
 244
 245	if (container->enabled)
 246		return -EBUSY;
 247
 248	/*
 249	 * When userspace pages are mapped into the IOMMU, they are effectively
 250	 * locked memory, so, theoretically, we need to update the accounting
  251	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
  252	 * paths can be very hot, though, and the accounting would kill
  253	 * performance, especially since it would be difficult or impossible
 254	 * to handle the accounting in real mode only.
 255	 *
 256	 * To address that, rather than precisely accounting every page, we
 257	 * instead account for a worst case on locked memory when the iommu is
 258	 * enabled and disabled.  The worst case upper bound on locked memory
 259	 * is the size of the whole iommu window, which is usually relatively
 260	 * small (compared to total memory sizes) on POWER hardware.
 261	 *
  262	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
  263	 * that would effectively kill the guest at random points, so it is much
  264	 * better to enforce the limit based on the max that the guest can map.
 265	 *
 266	 * Unfortunately at the moment it counts whole tables, no matter how
  267	 * much memory the guest has, i.e. for a 4GB guest and 4 IOMMU groups,
  268	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
 269	 * this is that we cannot tell here the amount of RAM used by the guest
 270	 * as this information is only available from KVM and VFIO is
 271	 * KVM agnostic.
 272	 *
 273	 * So we do not allow enabling a container without a group attached
 274	 * as there is no way to know how much we should increment
 275	 * the locked_vm counter.
 276	 */
 277	if (!tce_groups_attached(container))
 278		return -ENODEV;
 279
 280	tcegrp = list_first_entry(&container->group_list,
 281			struct tce_iommu_group, next);
 282	table_group = iommu_group_get_iommudata(tcegrp->grp);
 283	if (!table_group)
 284		return -ENODEV;
 285
 286	if (!table_group->tce32_size)
 287		return -EPERM;
 288
 289	ret = tce_iommu_mm_set(container);
 290	if (ret)
 291		return ret;
 292
 293	locked = table_group->tce32_size >> PAGE_SHIFT;
 294	ret = account_locked_vm(container->mm, locked, true);
 295	if (ret)
 296		return ret;
 297
 298	container->locked_pages = locked;
 299
 300	container->enabled = true;
 301
 302	return ret;
 303}
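In practice this means a v1 user needs enough RLIMIT_MEMLOCK headroom for the whole 32-bit window before calling VFIO_IOMMU_ENABLE. A rough sketch follows; the 256MB figure and the helper name are arbitrary illustrations, not requirements.

#include <sys/ioctl.h>
#include <sys/resource.h>
#include <linux/vfio.h>

/* Sketch: raise the memlock limit (or run with CAP_IPC_LOCK) so the
 * whole tce32 window can be accounted, then enable the container. */
int spapr_enable(int container)
{
	struct rlimit rl = {
		.rlim_cur = 256 << 20,	/* example value only */
		.rlim_max = 256 << 20,
	};

	setrlimit(RLIMIT_MEMLOCK, &rl);
	return ioctl(container, VFIO_IOMMU_ENABLE);
}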
 304
 305static void tce_iommu_disable(struct tce_container *container)
 306{
 307	if (!container->enabled)
 308		return;
 309
 310	container->enabled = false;
 311
 312	BUG_ON(!container->mm);
 313	account_locked_vm(container->mm, container->locked_pages, false);
 314}
 315
 316static void *tce_iommu_open(unsigned long arg)
 317{
 318	struct tce_container *container;
 319
 320	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
 321		pr_err("tce_vfio: Wrong IOMMU type\n");
 322		return ERR_PTR(-EINVAL);
 323	}
 324
 325	container = kzalloc(sizeof(*container), GFP_KERNEL);
 326	if (!container)
 327		return ERR_PTR(-ENOMEM);
 328
 329	mutex_init(&container->lock);
 330	INIT_LIST_HEAD_RCU(&container->group_list);
 331	INIT_LIST_HEAD_RCU(&container->prereg_list);
 332
 333	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 334
 335	return container;
 336}
 337
 338static int tce_iommu_clear(struct tce_container *container,
 339		struct iommu_table *tbl,
 340		unsigned long entry, unsigned long pages);
 341static void tce_iommu_free_table(struct tce_container *container,
 342		struct iommu_table *tbl);
 343
 344static void tce_iommu_release(void *iommu_data)
 345{
 346	struct tce_container *container = iommu_data;
 347	struct tce_iommu_group *tcegrp;
 348	struct tce_iommu_prereg *tcemem, *tmtmp;
 349	long i;
 350
 351	while (tce_groups_attached(container)) {
 352		tcegrp = list_first_entry(&container->group_list,
 353				struct tce_iommu_group, next);
 354		tce_iommu_detach_group(iommu_data, tcegrp->grp);
 355	}
 356
 357	/*
 358	 * If VFIO created a table, it was not disposed
 359	 * by tce_iommu_detach_group() so do it now.
 360	 */
 361	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
 362		struct iommu_table *tbl = container->tables[i];
 363
 364		if (!tbl)
 365			continue;
 366
 367		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 368		tce_iommu_free_table(container, tbl);
 369	}
 370
 371	list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
 372		WARN_ON(tce_iommu_prereg_free(container, tcemem));
 373
 374	tce_iommu_disable(container);
 375	if (container->mm)
 376		mmdrop(container->mm);
 377	mutex_destroy(&container->lock);
 378
 379	kfree(container);
 380}
 381
 382static void tce_iommu_unuse_page(unsigned long hpa)
 383{
 384	struct page *page;
 385
 386	page = pfn_to_page(hpa >> PAGE_SHIFT);
 387	unpin_user_page(page);
 388}
 389
 390static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
 391		unsigned long tce, unsigned long shift,
 392		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 393{
 394	long ret = 0;
 395	struct mm_iommu_table_group_mem_t *mem;
 396
 397	mem = mm_iommu_lookup(container->mm, tce, 1ULL << shift);
 398	if (!mem)
 399		return -EINVAL;
 400
 401	ret = mm_iommu_ua_to_hpa(mem, tce, shift, phpa);
 402	if (ret)
 403		return -EINVAL;
 404
 405	*pmem = mem;
 406
 407	return 0;
 408}
 409
 410static void tce_iommu_unuse_page_v2(struct tce_container *container,
 411		struct iommu_table *tbl, unsigned long entry)
 412{
 413	struct mm_iommu_table_group_mem_t *mem = NULL;
 414	int ret;
 415	unsigned long hpa = 0;
 416	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 417
 418	if (!pua)
 419		return;
 420
 421	ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua),
 422			tbl->it_page_shift, &hpa, &mem);
 423	if (ret)
 424		pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n",
 425				__func__, be64_to_cpu(*pua), entry, ret);
 426	if (mem)
 427		mm_iommu_mapped_dec(mem);
 428
 429	*pua = cpu_to_be64(0);
 430}
 431
 432static int tce_iommu_clear(struct tce_container *container,
 433		struct iommu_table *tbl,
 434		unsigned long entry, unsigned long pages)
 435{
 436	unsigned long oldhpa;
 437	long ret;
 438	enum dma_data_direction direction;
 439	unsigned long lastentry = entry + pages, firstentry = entry;
 440
 441	for ( ; entry < lastentry; ++entry) {
 442		if (tbl->it_indirect_levels && tbl->it_userspace) {
 443			/*
 444			 * For multilevel tables, we can take a shortcut here
 445			 * and skip some TCEs as we know that the userspace
 446			 * addresses cache is a mirror of the real TCE table
 447			 * and if it is missing some indirect levels, then
 448			 * the hardware table does not have them allocated
 449			 * either and therefore does not require updating.
 450			 */
 451			__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
 452					entry);
 453			if (!pua) {
 454				/* align to level_size which is power of two */
 455				entry |= tbl->it_level_size - 1;
 456				continue;
 457			}
 458		}
 459
 460		cond_resched();
 461
 462		direction = DMA_NONE;
 463		oldhpa = 0;
 464		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
 465				&direction);
 466		if (ret)
 467			continue;
 468
 469		if (direction == DMA_NONE)
 470			continue;
 471
 472		if (container->v2) {
 473			tce_iommu_unuse_page_v2(container, tbl, entry);
 474			continue;
 475		}
 476
 477		tce_iommu_unuse_page(oldhpa);
 478	}
 479
 480	iommu_tce_kill(tbl, firstentry, pages);
 481
 482	return 0;
 483}
 484
 485static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
 486{
 487	struct page *page = NULL;
 488	enum dma_data_direction direction = iommu_tce_direction(tce);
 489
 490	if (pin_user_pages_fast(tce & PAGE_MASK, 1,
 491			direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
 492			&page) != 1)
 493		return -EFAULT;
 494
 495	*hpa = __pa((unsigned long) page_address(page));
 496
 497	return 0;
 498}
 499
 500static long tce_iommu_build(struct tce_container *container,
 501		struct iommu_table *tbl,
 502		unsigned long entry, unsigned long tce, unsigned long pages,
 503		enum dma_data_direction direction)
 504{
 505	long i, ret = 0;
 506	unsigned long hpa;
 507	enum dma_data_direction dirtmp;
 508
 509	for (i = 0; i < pages; ++i) {
 510		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 511
 512		ret = tce_iommu_use_page(tce, &hpa);
 513		if (ret)
 514			break;
 515
 516		if (!tce_page_is_contained(container->mm, hpa,
 517				tbl->it_page_shift)) {
 518			ret = -EPERM;
 519			break;
 520		}
 521
 522		hpa |= offset;
 523		dirtmp = direction;
 524		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 525				&hpa, &dirtmp);
 526		if (ret) {
 527			tce_iommu_unuse_page(hpa);
 528			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 529					__func__, entry << tbl->it_page_shift,
 530					tce, ret);
 531			break;
 532		}
 533
 534		if (dirtmp != DMA_NONE)
 535			tce_iommu_unuse_page(hpa);
 536
 537		tce += IOMMU_PAGE_SIZE(tbl);
 538	}
 539
 540	if (ret)
 541		tce_iommu_clear(container, tbl, entry, i);
 542	else
 543		iommu_tce_kill(tbl, entry, pages);
 544
 545	return ret;
 546}
 547
 548static long tce_iommu_build_v2(struct tce_container *container,
 549		struct iommu_table *tbl,
 550		unsigned long entry, unsigned long tce, unsigned long pages,
 551		enum dma_data_direction direction)
 552{
 553	long i, ret = 0;
 554	unsigned long hpa;
 555	enum dma_data_direction dirtmp;
 556
 557	for (i = 0; i < pages; ++i) {
 558		struct mm_iommu_table_group_mem_t *mem = NULL;
 559		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i);
 560
 561		ret = tce_iommu_prereg_ua_to_hpa(container,
 562				tce, tbl->it_page_shift, &hpa, &mem);
 563		if (ret)
 564			break;
 565
 566		if (!tce_page_is_contained(container->mm, hpa,
 567				tbl->it_page_shift)) {
 568			ret = -EPERM;
 569			break;
 570		}
 571
 572		/* Preserve offset within IOMMU page */
 573		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 574		dirtmp = direction;
 575
 576		/* The registered region is being unregistered */
 577		if (mm_iommu_mapped_inc(mem))
 578			break;
 579
 580		ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
 581				&hpa, &dirtmp);
 582		if (ret) {
 583			/* dirtmp cannot be DMA_NONE here */
 584			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 585			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
 586					__func__, entry << tbl->it_page_shift,
 587					tce, ret);
 588			break;
 589		}
 590
 591		if (dirtmp != DMA_NONE)
 592			tce_iommu_unuse_page_v2(container, tbl, entry + i);
 593
 594		*pua = cpu_to_be64(tce);
 595
 596		tce += IOMMU_PAGE_SIZE(tbl);
 597	}
 598
 599	if (ret)
 600		tce_iommu_clear(container, tbl, entry, i);
 601	else
 602		iommu_tce_kill(tbl, entry, pages);
 603
 604	return ret;
 605}
 606
 607static long tce_iommu_create_table(struct tce_container *container,
 608			struct iommu_table_group *table_group,
 609			int num,
 610			__u32 page_shift,
 611			__u64 window_size,
 612			__u32 levels,
 613			struct iommu_table **ptbl)
 614{
 615	long ret, table_size;
 616
 617	table_size = table_group->ops->get_table_size(page_shift, window_size,
 618			levels);
 619	if (!table_size)
 620		return -EINVAL;
 621
 622	ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
 623	if (ret)
 624		return ret;
 625
 626	ret = table_group->ops->create_table(table_group, num,
 627			page_shift, window_size, levels, ptbl);
 628
 629	WARN_ON(!ret && !(*ptbl)->it_ops->free);
 630	WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size));
 631
 632	return ret;
 633}
 634
 635static void tce_iommu_free_table(struct tce_container *container,
 636		struct iommu_table *tbl)
 637{
 638	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 639
 640	iommu_tce_table_put(tbl);
 641	account_locked_vm(container->mm, pages, false);
 642}
 643
 644static long tce_iommu_create_window(struct tce_container *container,
 645		__u32 page_shift, __u64 window_size, __u32 levels,
 646		__u64 *start_addr)
 647{
 648	struct tce_iommu_group *tcegrp;
 649	struct iommu_table_group *table_group;
 650	struct iommu_table *tbl = NULL;
 651	long ret, num;
 652
 653	num = tce_iommu_find_free_table(container);
 654	if (num < 0)
 655		return num;
 656
 657	/* Get the first group for ops::create_table */
 658	tcegrp = list_first_entry(&container->group_list,
 659			struct tce_iommu_group, next);
 660	table_group = iommu_group_get_iommudata(tcegrp->grp);
 661	if (!table_group)
 662		return -EFAULT;
 663
 664	if (!(table_group->pgsizes & (1ULL << page_shift)))
 665		return -EINVAL;
 666
 667	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
 668			!table_group->ops->get_table_size ||
 669			!table_group->ops->create_table)
 670		return -EPERM;
 671
 672	/* Create TCE table */
 673	ret = tce_iommu_create_table(container, table_group, num,
 674			page_shift, window_size, levels, &tbl);
 675	if (ret)
 676		return ret;
 677
 678	BUG_ON(!tbl->it_ops->free);
 679
 680	/*
 681	 * Program the table to every group.
 682	 * Groups have been tested for compatibility at the attach time.
 683	 */
 684	list_for_each_entry(tcegrp, &container->group_list, next) {
 685		table_group = iommu_group_get_iommudata(tcegrp->grp);
 686
 687		ret = table_group->ops->set_window(table_group, num, tbl);
 688		if (ret)
 689			goto unset_exit;
 690	}
 691
 692	container->tables[num] = tbl;
 693
 694	/* Return start address assigned by platform in create_table() */
 695	*start_addr = tbl->it_offset << tbl->it_page_shift;
 696
 697	return 0;
 698
 699unset_exit:
 700	list_for_each_entry(tcegrp, &container->group_list, next) {
 701		table_group = iommu_group_get_iommudata(tcegrp->grp);
 702		table_group->ops->unset_window(table_group, num);
 703	}
 704	tce_iommu_free_table(container, tbl);
 705
 706	return ret;
 707}
 708
 709static long tce_iommu_remove_window(struct tce_container *container,
 710		__u64 start_addr)
 711{
 712	struct iommu_table_group *table_group = NULL;
 713	struct iommu_table *tbl;
 714	struct tce_iommu_group *tcegrp;
 715	int num;
 716
 717	num = tce_iommu_find_table(container, start_addr, &tbl);
 718	if (num < 0)
 719		return -EINVAL;
 720
 721	BUG_ON(!tbl->it_size);
 722
 723	/* Detach groups from IOMMUs */
 724	list_for_each_entry(tcegrp, &container->group_list, next) {
 725		table_group = iommu_group_get_iommudata(tcegrp->grp);
 726
 727		/*
 728		 * SPAPR TCE IOMMU exposes the default DMA window to
 729		 * the guest via dma32_window_start/size of
 730		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
  731		 * userspace to remove this window and some do not, so
 732		 * here we check for the platform capability.
 733		 */
 734		if (!table_group->ops || !table_group->ops->unset_window)
 735			return -EPERM;
 736
 737		table_group->ops->unset_window(table_group, num);
 738	}
 739
 740	/* Free table */
 741	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
 742	tce_iommu_free_table(container, tbl);
 743	container->tables[num] = NULL;
 744
 745	return 0;
 746}
 747
 748static long tce_iommu_create_default_window(struct tce_container *container)
 749{
 750	long ret;
 751	__u64 start_addr = 0;
 752	struct tce_iommu_group *tcegrp;
 753	struct iommu_table_group *table_group;
 754
 755	if (!container->def_window_pending)
 756		return 0;
 757
 758	if (!tce_groups_attached(container))
 759		return -ENODEV;
 760
 761	tcegrp = list_first_entry(&container->group_list,
 762			struct tce_iommu_group, next);
 763	table_group = iommu_group_get_iommudata(tcegrp->grp);
 764	if (!table_group)
 765		return -ENODEV;
 766
 767	ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
 768			table_group->tce32_size, 1, &start_addr);
 769	WARN_ON_ONCE(!ret && start_addr);
 770
 771	if (!ret)
 772		container->def_window_pending = false;
 773
 774	return ret;
 775}
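For reference, creating and removing an additional (DDW) window from userspace on a v2 container looks roughly like the sketch below; the 64K page shift and 1GB window size are arbitrary examples and the helper names are made up.

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: ask for a 1GB, 64K-page, single-level window; the kernel
 * returns the bus address it picked in start_addr. */
int spapr_add_window(int container, __u64 *start_addr)
{
	struct vfio_iommu_spapr_tce_create create = {
		.argsz = sizeof(create),
		.page_shift = 16,		/* 64K IOMMU pages */
		.window_size = 1ULL << 30,
		.levels = 1,
	};
	int ret = ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);

	if (!ret)
		*start_addr = create.start_addr;
	return ret;
}

int spapr_del_window(int container, __u64 start_addr)
{
	struct vfio_iommu_spapr_tce_remove remove = {
		.argsz = sizeof(remove),
		.start_addr = start_addr,
	};

	return ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
}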
 776
 777static long vfio_spapr_ioctl_eeh_pe_op(struct iommu_group *group,
 778				       unsigned long arg)
 779{
 780	struct eeh_pe *pe;
 781	struct vfio_eeh_pe_op op;
 782	unsigned long minsz;
 783
 784	pe = eeh_iommu_group_to_pe(group);
 785	if (!pe)
 786		return -ENODEV;
 787
 788	minsz = offsetofend(struct vfio_eeh_pe_op, op);
 789	if (copy_from_user(&op, (void __user *)arg, minsz))
 790		return -EFAULT;
 791	if (op.argsz < minsz || op.flags)
 792		return -EINVAL;
 793
 794	switch (op.op) {
 795	case VFIO_EEH_PE_DISABLE:
 796		return eeh_pe_set_option(pe, EEH_OPT_DISABLE);
 797	case VFIO_EEH_PE_ENABLE:
 798		return eeh_pe_set_option(pe, EEH_OPT_ENABLE);
 799	case VFIO_EEH_PE_UNFREEZE_IO:
 800		return eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
 801	case VFIO_EEH_PE_UNFREEZE_DMA:
 802		return eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
 803	case VFIO_EEH_PE_GET_STATE:
 804		return eeh_pe_get_state(pe);
 805		break;
 806	case VFIO_EEH_PE_RESET_DEACTIVATE:
 807		return eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
 808	case VFIO_EEH_PE_RESET_HOT:
 809		return eeh_pe_reset(pe, EEH_RESET_HOT, true);
 810	case VFIO_EEH_PE_RESET_FUNDAMENTAL:
 811		return eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
 812	case VFIO_EEH_PE_CONFIGURE:
 813		return eeh_pe_configure(pe);
 814	case VFIO_EEH_PE_INJECT_ERR:
 815		minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
 816		if (op.argsz < minsz)
 817			return -EINVAL;
 818		if (copy_from_user(&op, (void __user *)arg, minsz))
 819			return -EFAULT;
 820
 821		return eeh_pe_inject_err(pe, op.err.type, op.err.func,
 822					 op.err.addr, op.err.mask);
 823	default:
 824		return -EINVAL;
 825	}
 826}
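A userspace caller drives these EEH operations through VFIO_EEH_PE_OP on the container fd; a minimal sketch, with an assumed helper name and no error handling:

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: query the EEH state of the PE behind this container's group(s). */
int spapr_eeh_pe_state(int container)
{
	struct vfio_eeh_pe_op op = {
		.argsz = sizeof(op),
		.flags = 0,
		.op = VFIO_EEH_PE_GET_STATE,
	};

	return ioctl(container, VFIO_EEH_PE_OP, &op);
}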
 827
 828static long tce_iommu_ioctl(void *iommu_data,
 829				 unsigned int cmd, unsigned long arg)
 830{
 831	struct tce_container *container = iommu_data;
 832	unsigned long minsz, ddwsz;
 833	long ret;
 834
 835	switch (cmd) {
 836	case VFIO_CHECK_EXTENSION:
 837		switch (arg) {
 838		case VFIO_SPAPR_TCE_IOMMU:
 839		case VFIO_SPAPR_TCE_v2_IOMMU:
 840			return 1;
 841		case VFIO_EEH:
 842			return eeh_enabled();
 843		default:
 844			return 0;
 845		}
 846	}
 847
 848	/*
  849	 * Sanity check to prevent one userspace process from manipulating
  850	 * another process's mm.
 851	 */
 852	BUG_ON(!container);
 853	if (container->mm && container->mm != current->mm)
 854		return -EPERM;
 855
 856	switch (cmd) {
 857	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
 858		struct vfio_iommu_spapr_tce_info info;
 859		struct tce_iommu_group *tcegrp;
 860		struct iommu_table_group *table_group;
 861
 862		if (!tce_groups_attached(container))
 863			return -ENXIO;
 864
 865		tcegrp = list_first_entry(&container->group_list,
 866				struct tce_iommu_group, next);
 867		table_group = iommu_group_get_iommudata(tcegrp->grp);
 868
 869		if (!table_group)
 870			return -ENXIO;
 871
 872		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
 873				dma32_window_size);
 874
 875		if (copy_from_user(&info, (void __user *)arg, minsz))
 876			return -EFAULT;
 877
 878		if (info.argsz < minsz)
 879			return -EINVAL;
 880
 881		info.dma32_window_start = table_group->tce32_start;
 882		info.dma32_window_size = table_group->tce32_size;
 883		info.flags = 0;
 884		memset(&info.ddw, 0, sizeof(info.ddw));
 885
 886		if (table_group->max_dynamic_windows_supported &&
 887				container->v2) {
 888			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
 889			info.ddw.pgsizes = table_group->pgsizes;
 890			info.ddw.max_dynamic_windows_supported =
 891				table_group->max_dynamic_windows_supported;
 892			info.ddw.levels = table_group->max_levels;
 893		}
 894
 895		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
 896
 897		if (info.argsz >= ddwsz)
 898			minsz = ddwsz;
 899
 900		if (copy_to_user((void __user *)arg, &info, minsz))
 901			return -EFAULT;
 902
 903		return 0;
 904	}
 905	case VFIO_IOMMU_MAP_DMA: {
 906		struct vfio_iommu_type1_dma_map param;
 907		struct iommu_table *tbl = NULL;
 908		long num;
 909		enum dma_data_direction direction;
 910
 911		if (!container->enabled)
 912			return -EPERM;
 913
 914		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 915
 916		if (copy_from_user(&param, (void __user *)arg, minsz))
 917			return -EFAULT;
 918
 919		if (param.argsz < minsz)
 920			return -EINVAL;
 921
 922		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
 923				VFIO_DMA_MAP_FLAG_WRITE))
 924			return -EINVAL;
 925
 926		ret = tce_iommu_create_default_window(container);
 927		if (ret)
 928			return ret;
 929
 930		num = tce_iommu_find_table(container, param.iova, &tbl);
 931		if (num < 0)
 932			return -ENXIO;
 933
 934		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
 935				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 936			return -EINVAL;
 937
 938		/* iova is checked by the IOMMU API */
 939		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
 940			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 941				direction = DMA_BIDIRECTIONAL;
 942			else
 943				direction = DMA_TO_DEVICE;
 944		} else {
 945			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
 946				direction = DMA_FROM_DEVICE;
 947			else
 948				return -EINVAL;
 949		}
 950
 951		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 952		if (ret)
 953			return ret;
 954
 955		if (container->v2)
 956			ret = tce_iommu_build_v2(container, tbl,
 957					param.iova >> tbl->it_page_shift,
 958					param.vaddr,
 959					param.size >> tbl->it_page_shift,
 960					direction);
 961		else
 962			ret = tce_iommu_build(container, tbl,
 963					param.iova >> tbl->it_page_shift,
 964					param.vaddr,
 965					param.size >> tbl->it_page_shift,
 966					direction);
 967
 968		iommu_flush_tce(tbl);
 969
 970		return ret;
 971	}
 972	case VFIO_IOMMU_UNMAP_DMA: {
 973		struct vfio_iommu_type1_dma_unmap param;
 974		struct iommu_table *tbl = NULL;
 975		long num;
 976
 977		if (!container->enabled)
 978			return -EPERM;
 979
 980		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 981				size);
 982
 983		if (copy_from_user(&param, (void __user *)arg, minsz))
 984			return -EFAULT;
 985
 986		if (param.argsz < minsz)
 987			return -EINVAL;
 988
 989		/* No flag is supported now */
 990		if (param.flags)
 991			return -EINVAL;
 992
 993		ret = tce_iommu_create_default_window(container);
 994		if (ret)
 995			return ret;
 996
 997		num = tce_iommu_find_table(container, param.iova, &tbl);
 998		if (num < 0)
 999			return -ENXIO;
1000
1001		if (param.size & ~IOMMU_PAGE_MASK(tbl))
1002			return -EINVAL;
1003
1004		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
1005				param.size >> tbl->it_page_shift);
1006		if (ret)
1007			return ret;
1008
1009		ret = tce_iommu_clear(container, tbl,
1010				param.iova >> tbl->it_page_shift,
1011				param.size >> tbl->it_page_shift);
1012		iommu_flush_tce(tbl);
1013
1014		return ret;
1015	}
1016	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
1017		struct vfio_iommu_spapr_register_memory param;
1018
1019		if (!container->v2)
1020			break;
1021
1022		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1023				size);
1024
1025		ret = tce_iommu_mm_set(container);
1026		if (ret)
1027			return ret;
1028
1029		if (copy_from_user(&param, (void __user *)arg, minsz))
1030			return -EFAULT;
1031
1032		if (param.argsz < minsz)
1033			return -EINVAL;
1034
1035		/* No flag is supported now */
1036		if (param.flags)
1037			return -EINVAL;
1038
1039		mutex_lock(&container->lock);
1040		ret = tce_iommu_register_pages(container, param.vaddr,
1041				param.size);
1042		mutex_unlock(&container->lock);
1043
1044		return ret;
1045	}
1046	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
1047		struct vfio_iommu_spapr_register_memory param;
1048
1049		if (!container->v2)
1050			break;
1051
1052		if (!container->mm)
1053			return -EPERM;
1054
1055		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
1056				size);
1057
1058		if (copy_from_user(&param, (void __user *)arg, minsz))
1059			return -EFAULT;
1060
1061		if (param.argsz < minsz)
1062			return -EINVAL;
1063
1064		/* No flag is supported now */
1065		if (param.flags)
1066			return -EINVAL;
1067
1068		mutex_lock(&container->lock);
1069		ret = tce_iommu_unregister_pages(container, param.vaddr,
1070				param.size);
1071		mutex_unlock(&container->lock);
1072
1073		return ret;
1074	}
1075	case VFIO_IOMMU_ENABLE:
1076		if (container->v2)
1077			break;
1078
1079		mutex_lock(&container->lock);
1080		ret = tce_iommu_enable(container);
1081		mutex_unlock(&container->lock);
1082		return ret;
1083
1084
1085	case VFIO_IOMMU_DISABLE:
1086		if (container->v2)
1087			break;
1088
1089		mutex_lock(&container->lock);
1090		tce_iommu_disable(container);
1091		mutex_unlock(&container->lock);
1092		return 0;
1093
1094	case VFIO_EEH_PE_OP: {
1095		struct tce_iommu_group *tcegrp;
1096
1097		ret = 0;
1098		list_for_each_entry(tcegrp, &container->group_list, next) {
1099			ret = vfio_spapr_ioctl_eeh_pe_op(tcegrp->grp, arg);
1100			if (ret)
1101				return ret;
1102		}
1103		return ret;
1104	}
1105
1106	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
1107		struct vfio_iommu_spapr_tce_create create;
1108
1109		if (!container->v2)
1110			break;
1111
1112		ret = tce_iommu_mm_set(container);
1113		if (ret)
1114			return ret;
1115
1116		if (!tce_groups_attached(container))
1117			return -ENXIO;
1118
1119		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
1120				start_addr);
1121
1122		if (copy_from_user(&create, (void __user *)arg, minsz))
1123			return -EFAULT;
1124
1125		if (create.argsz < minsz)
1126			return -EINVAL;
1127
1128		if (create.flags)
1129			return -EINVAL;
1130
1131		mutex_lock(&container->lock);
1132
1133		ret = tce_iommu_create_default_window(container);
1134		if (!ret)
1135			ret = tce_iommu_create_window(container,
1136					create.page_shift,
1137					create.window_size, create.levels,
1138					&create.start_addr);
1139
1140		mutex_unlock(&container->lock);
1141
1142		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
1143			ret = -EFAULT;
1144
1145		return ret;
1146	}
1147	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
1148		struct vfio_iommu_spapr_tce_remove remove;
1149
1150		if (!container->v2)
1151			break;
1152
1153		ret = tce_iommu_mm_set(container);
1154		if (ret)
1155			return ret;
1156
1157		if (!tce_groups_attached(container))
1158			return -ENXIO;
1159
1160		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
1161				start_addr);
1162
1163		if (copy_from_user(&remove, (void __user *)arg, minsz))
1164			return -EFAULT;
1165
1166		if (remove.argsz < minsz)
1167			return -EINVAL;
1168
1169		if (remove.flags)
1170			return -EINVAL;
1171
1172		if (container->def_window_pending && !remove.start_addr) {
1173			container->def_window_pending = false;
1174			return 0;
1175		}
1176
1177		mutex_lock(&container->lock);
1178
1179		ret = tce_iommu_remove_window(container, remove.start_addr);
1180
1181		mutex_unlock(&container->lock);
1182
1183		return ret;
1184	}
1185	}
1186
1187	return -ENOTTY;
1188}
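Tying the ioctl paths together, a caller would typically query the default window with VFIO_IOMMU_SPAPR_TCE_GET_INFO and then map a buffer into it. A sketch under the usual assumptions (page-aligned buffer, error handling omitted, helper name made up):

#include <stddef.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: map a page-aligned buffer at the start of the 32-bit window. */
int spapr_map_buffer(int container, void *buf, size_t size)
{
	struct vfio_iommu_spapr_tce_info info = { .argsz = sizeof(info) };
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (__u64)(unsigned long)buf,
		.size  = size,
	};

	if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info))
		return -1;

	map.iova = info.dma32_window_start;
	return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}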
1189
1190static void tce_iommu_release_ownership(struct tce_container *container,
1191		struct iommu_table_group *table_group)
1192{
1193	long i;
1194
1195	if (!table_group->ops->unset_window) {
1196		WARN_ON_ONCE(1);
1197		return;
1198	}
1199
1200	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1201		if (container->tables[i])
1202			table_group->ops->unset_window(table_group, i);
1203}
1204
1205static long tce_iommu_take_ownership(struct tce_container *container,
1206		struct iommu_table_group *table_group)
1207{
1208	long i, ret = 0;
1209
1210	/* Set all windows to the new group */
1211	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
1212		struct iommu_table *tbl = container->tables[i];
1213
1214		if (!tbl)
1215			continue;
1216
1217		ret = table_group->ops->set_window(table_group, i, tbl);
1218		if (ret)
1219			goto release_exit;
1220	}
1221
1222	return 0;
1223
1224release_exit:
1225	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
1226		table_group->ops->unset_window(table_group, i);
1227
1228	return ret;
1229}
1230
1231static int tce_iommu_attach_group(void *iommu_data,
1232		struct iommu_group *iommu_group, enum vfio_group_type type)
1233{
1234	int ret = 0;
1235	struct tce_container *container = iommu_data;
1236	struct iommu_table_group *table_group;
1237	struct tce_iommu_group *tcegrp = NULL;
1238
1239	if (type == VFIO_EMULATED_IOMMU)
1240		return -EINVAL;
1241
1242	mutex_lock(&container->lock);
1243
1244	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
1245			iommu_group_id(iommu_group), iommu_group); */
1246	table_group = iommu_group_get_iommudata(iommu_group);
1247	if (!table_group) {
1248		ret = -ENODEV;
1249		goto unlock_exit;
1250	}
1251
1252	/* v2 requires full support of dynamic DMA windows */
1253	if (container->v2 && table_group->max_dynamic_windows_supported == 0) {
1254		ret = -EINVAL;
1255		goto unlock_exit;
1256	}
1257
1258	/* v1 reuses TCE tables and does not share them among PEs */
1259	if (!container->v2 && tce_groups_attached(container)) {
1260		ret = -EBUSY;
1261		goto unlock_exit;
1262	}
1263
1264	/*
1265	 * Check if new group has the same iommu_table_group_ops
1266	 * (i.e. compatible)
1267	 */
1268	list_for_each_entry(tcegrp, &container->group_list, next) {
1269		struct iommu_table_group *table_group_tmp;
1270
1271		if (tcegrp->grp == iommu_group) {
1272			pr_warn("tce_vfio: Group %d is already attached\n",
1273					iommu_group_id(iommu_group));
1274			ret = -EBUSY;
1275			goto unlock_exit;
1276		}
1277		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
1278		if (table_group_tmp->ops->create_table !=
1279				table_group->ops->create_table) {
1280			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
1281					iommu_group_id(iommu_group),
1282					iommu_group_id(tcegrp->grp));
1283			ret = -EPERM;
1284			goto unlock_exit;
1285		}
1286	}
1287
1288	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
1289	if (!tcegrp) {
1290		ret = -ENOMEM;
1291		goto unlock_exit;
1292	}
1293
1294	ret = tce_iommu_take_ownership(container, table_group);
1295	if (!tce_groups_attached(container) && !container->tables[0])
1296		container->def_window_pending = true;
1297
1298	if (!ret) {
1299		tcegrp->grp = iommu_group;
1300		list_add(&tcegrp->next, &container->group_list);
1301	}
1302
1303	if (ret && tcegrp)
1304		kfree(tcegrp);
1305
1306unlock_exit:
1307	mutex_unlock(&container->lock);
1308
1309	return ret;
1310}
1311
1312static void tce_iommu_detach_group(void *iommu_data,
1313		struct iommu_group *iommu_group)
1314{
1315	struct tce_container *container = iommu_data;
1316	struct iommu_table_group *table_group;
1317	bool found = false;
1318	struct tce_iommu_group *tcegrp;
1319
1320	mutex_lock(&container->lock);
1321
1322	list_for_each_entry(tcegrp, &container->group_list, next) {
1323		if (tcegrp->grp == iommu_group) {
1324			found = true;
1325			break;
1326		}
1327	}
1328
1329	if (!found) {
1330		pr_warn("tce_vfio: detaching unattached group #%u\n",
1331				iommu_group_id(iommu_group));
1332		goto unlock_exit;
1333	}
1334
1335	list_del(&tcegrp->next);
1336	kfree(tcegrp);
1337
1338	table_group = iommu_group_get_iommudata(iommu_group);
1339	BUG_ON(!table_group);
1340
1341	tce_iommu_release_ownership(container, table_group);
1342
1343unlock_exit:
1344	mutex_unlock(&container->lock);
1345}
1346
1347static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
1348	.name		= "iommu-vfio-powerpc",
1349	.owner		= THIS_MODULE,
1350	.open		= tce_iommu_open,
1351	.release	= tce_iommu_release,
1352	.ioctl		= tce_iommu_ioctl,
1353	.attach_group	= tce_iommu_attach_group,
1354	.detach_group	= tce_iommu_detach_group,
1355};
1356
1357static int __init tce_iommu_init(void)
1358{
1359	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
1360}
1361
1362static void __exit tce_iommu_cleanup(void)
1363{
1364	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
1365}
1366
1367module_init(tce_iommu_init);
1368module_exit(tce_iommu_cleanup);
1369
1370MODULE_VERSION(DRIVER_VERSION);
1371MODULE_LICENSE("GPL v2");
1372MODULE_AUTHOR(DRIVER_AUTHOR);
1373MODULE_DESCRIPTION(DRIVER_DESC);
1374
v3.15
  1/*
  2 * VFIO: IOMMU DMA mapping support for TCE on POWER
  3 *
  4 * Copyright (C) 2013 IBM Corp.  All rights reserved.
  5 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
  6 *
  7 * This program is free software; you can redistribute it and/or modify
  8 * it under the terms of the GNU General Public License version 2 as
  9 * published by the Free Software Foundation.
 10 *
 11 * Derived from original vfio_iommu_type1.c:
 12 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 13 *     Author: Alex Williamson <alex.williamson@redhat.com>
 14 */
 15
 16#include <linux/module.h>
 17#include <linux/pci.h>
 18#include <linux/slab.h>
 19#include <linux/uaccess.h>
 20#include <linux/err.h>
 21#include <linux/vfio.h>
 22#include <asm/iommu.h>
 23#include <asm/tce.h>
 24
 25#define DRIVER_VERSION  "0.1"
 26#define DRIVER_AUTHOR   "aik@ozlabs.ru"
 27#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"
 28
 29static void tce_iommu_detach_group(void *iommu_data,
 30		struct iommu_group *iommu_group);
 31
 32/*
 33 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 34 *
 35 * This code handles mapping and unmapping of user data buffers
 36 * into DMA'ble space using the IOMMU
 37 */
 38
 39/*
 40 * The container descriptor supports only a single group per container.
 41 * Required by the API as the container is not supplied with the IOMMU group
 42 * at the moment of initialization.
 43 */
 44struct tce_container {
 45	struct mutex lock;
 46	struct iommu_table *tbl;
 47	bool enabled;
 48};
 49
 50static int tce_iommu_enable(struct tce_container *container)
 51{
 52	int ret = 0;
 53	unsigned long locked, lock_limit, npages;
 54	struct iommu_table *tbl = container->tbl;
 55
 56	if (!container->tbl)
 57		return -ENXIO;
 58
 59	if (!current->mm)
 60		return -ESRCH; /* process exited */
 61
 62	if (container->enabled)
 63		return -EBUSY;
 64
 65	/*
 66	 * When userspace pages are mapped into the IOMMU, they are effectively
 67	 * locked memory, so, theoretically, we need to update the accounting
  68	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
  69	 * paths can be very hot, though, and the accounting would kill
  70	 * performance, especially since it would be difficult or impossible
 71	 * to handle the accounting in real mode only.
 72	 *
 73	 * To address that, rather than precisely accounting every page, we
 74	 * instead account for a worst case on locked memory when the iommu is
 75	 * enabled and disabled.  The worst case upper bound on locked memory
 76	 * is the size of the whole iommu window, which is usually relatively
 77	 * small (compared to total memory sizes) on POWER hardware.
 78	 *
  79	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits;
  80	 * that would effectively kill the guest at random points, so it is much
  81	 * better to enforce the limit based on the max that the guest can map.
 82	 */
 83	down_write(&current->mm->mmap_sem);
 84	npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
 85	locked = current->mm->locked_vm + npages;
 86	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 87	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
 88		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
 89				rlimit(RLIMIT_MEMLOCK));
 90		ret = -ENOMEM;
 91	} else {
 92
 93		current->mm->locked_vm += npages;
 94		container->enabled = true;
 95	}
 96	up_write(&current->mm->mmap_sem);
 97
 98	return ret;
 99}
100
101static void tce_iommu_disable(struct tce_container *container)
102{
103	if (!container->enabled)
104		return;
105
106	container->enabled = false;
107
108	if (!container->tbl || !current->mm)
109		return;
110
111	down_write(&current->mm->mmap_sem);
112	current->mm->locked_vm -= (container->tbl->it_size <<
113			IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
114	up_write(&current->mm->mmap_sem);
115}
116
117static void *tce_iommu_open(unsigned long arg)
118{
119	struct tce_container *container;
120
121	if (arg != VFIO_SPAPR_TCE_IOMMU) {
122		pr_err("tce_vfio: Wrong IOMMU type\n");
123		return ERR_PTR(-EINVAL);
124	}
125
126	container = kzalloc(sizeof(*container), GFP_KERNEL);
127	if (!container)
128		return ERR_PTR(-ENOMEM);
129
130	mutex_init(&container->lock);
131
132	return container;
133}
134
135static void tce_iommu_release(void *iommu_data)
136{
137	struct tce_container *container = iommu_data;
138
139	WARN_ON(container->tbl && !container->tbl->it_group);
140	tce_iommu_disable(container);
141
142	if (container->tbl && container->tbl->it_group)
143		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
144
145	mutex_destroy(&container->lock);
146
147	kfree(container);
148}
149
150static long tce_iommu_ioctl(void *iommu_data,
151				 unsigned int cmd, unsigned long arg)
152{
153	struct tce_container *container = iommu_data;
154	unsigned long minsz;
155	long ret;
156
157	switch (cmd) {
158	case VFIO_CHECK_EXTENSION:
159		return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
160
161	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
162		struct vfio_iommu_spapr_tce_info info;
163		struct iommu_table *tbl = container->tbl;
164
165		if (WARN_ON(!tbl))
166			return -ENXIO;
167
168		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
169				dma32_window_size);
170
171		if (copy_from_user(&info, (void __user *)arg, minsz))
172			return -EFAULT;
173
174		if (info.argsz < minsz)
175			return -EINVAL;
176
177		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
178		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
179		info.flags = 0;
180
181		if (copy_to_user((void __user *)arg, &info, minsz))
182			return -EFAULT;
183
184		return 0;
185	}
186	case VFIO_IOMMU_MAP_DMA: {
187		struct vfio_iommu_type1_dma_map param;
188		struct iommu_table *tbl = container->tbl;
189		unsigned long tce, i;
190
191		if (!tbl)
192			return -ENXIO;
193
194		BUG_ON(!tbl->it_group);
195
196		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
197
198		if (copy_from_user(&param, (void __user *)arg, minsz))
199			return -EFAULT;
200
201		if (param.argsz < minsz)
202			return -EINVAL;
203
204		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
205				VFIO_DMA_MAP_FLAG_WRITE))
206			return -EINVAL;
207
208		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
209				(param.vaddr & ~IOMMU_PAGE_MASK_4K))
210			return -EINVAL;
211
212		/* iova is checked by the IOMMU API */
213		tce = param.vaddr;
214		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
215			tce |= TCE_PCI_READ;
216		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
217			tce |= TCE_PCI_WRITE;
218
219		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
220		if (ret)
221			return ret;
222
223		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
224			ret = iommu_put_tce_user_mode(tbl,
225					(param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
226					tce);
227			if (ret)
228				break;
229			tce += IOMMU_PAGE_SIZE_4K;
230		}
231		if (ret)
232			iommu_clear_tces_and_put_pages(tbl,
233					param.iova >> IOMMU_PAGE_SHIFT_4K, i);
234
235		iommu_flush_tce(tbl);
236
237		return ret;
238	}
239	case VFIO_IOMMU_UNMAP_DMA: {
240		struct vfio_iommu_type1_dma_unmap param;
241		struct iommu_table *tbl = container->tbl;
242
243		if (WARN_ON(!tbl))
244			return -ENXIO;
245
246		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
247				size);
248
249		if (copy_from_user(&param, (void __user *)arg, minsz))
250			return -EFAULT;
251
252		if (param.argsz < minsz)
253			return -EINVAL;
254
255		/* No flag is supported now */
256		if (param.flags)
257			return -EINVAL;
258
259		if (param.size & ~IOMMU_PAGE_MASK_4K)
260			return -EINVAL;
261
262		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
263				param.size >> IOMMU_PAGE_SHIFT_4K);
264		if (ret)
265			return ret;
266
267		ret = iommu_clear_tces_and_put_pages(tbl,
268				param.iova >> IOMMU_PAGE_SHIFT_4K,
269				param.size >> IOMMU_PAGE_SHIFT_4K);
270		iommu_flush_tce(tbl);
271
272		return ret;
273	}
274	case VFIO_IOMMU_ENABLE:
275		mutex_lock(&container->lock);
276		ret = tce_iommu_enable(container);
277		mutex_unlock(&container->lock);
278		return ret;
279
280
281	case VFIO_IOMMU_DISABLE:
282		mutex_lock(&container->lock);
283		tce_iommu_disable(container);
284		mutex_unlock(&container->lock);
285		return 0;
286	}
287
288	return -ENOTTY;
289}
290
291static int tce_iommu_attach_group(void *iommu_data,
292		struct iommu_group *iommu_group)
293{
294	int ret;
295	struct tce_container *container = iommu_data;
296	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
297
298	BUG_ON(!tbl);
299	mutex_lock(&container->lock);
300
301	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
302			iommu_group_id(iommu_group), iommu_group); */
303	if (container->tbl) {
304		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
305				iommu_group_id(container->tbl->it_group),
306				iommu_group_id(iommu_group));
307		ret = -EBUSY;
308	} else if (container->enabled) {
309		pr_err("tce_vfio: attaching group #%u to enabled container\n",
310				iommu_group_id(iommu_group));
311		ret = -EBUSY;
312	} else {
313		ret = iommu_take_ownership(tbl);
314		if (!ret)
315			container->tbl = tbl;
316	}
317
318	mutex_unlock(&container->lock);
319
320	return ret;
321}
322
323static void tce_iommu_detach_group(void *iommu_data,
324		struct iommu_group *iommu_group)
325{
326	struct tce_container *container = iommu_data;
327	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
328
329	BUG_ON(!tbl);
330	mutex_lock(&container->lock);
331	if (tbl != container->tbl) {
332		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
333				iommu_group_id(iommu_group),
334				iommu_group_id(tbl->it_group));
335	} else {
336		if (container->enabled) {
337			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
338					iommu_group_id(tbl->it_group));
339			tce_iommu_disable(container);
340		}
341
342		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
343				iommu_group_id(iommu_group), iommu_group); */
344		container->tbl = NULL;
345		iommu_release_ownership(tbl);
346	}
347	mutex_unlock(&container->lock);
348}
349
350const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
351	.name		= "iommu-vfio-powerpc",
352	.owner		= THIS_MODULE,
353	.open		= tce_iommu_open,
354	.release	= tce_iommu_release,
355	.ioctl		= tce_iommu_ioctl,
356	.attach_group	= tce_iommu_attach_group,
357	.detach_group	= tce_iommu_detach_group,
358};
359
360static int __init tce_iommu_init(void)
361{
362	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
363}
364
365static void __exit tce_iommu_cleanup(void)
366{
367	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
368}
369
370module_init(tce_iommu_init);
371module_exit(tce_iommu_cleanup);
372
373MODULE_VERSION(DRIVER_VERSION);
374MODULE_LICENSE("GPL v2");
375MODULE_AUTHOR(DRIVER_AUTHOR);
376MODULE_DESCRIPTION(DRIVER_DESC);
377