1// SPDX-License-Identifier: GPL-2.0 OR MIT
2/*
3 * Copyright 2020-2021 Advanced Micro Devices, Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24#include <linux/types.h>
25#include <linux/sched/task.h>
26#include "amdgpu_sync.h"
27#include "amdgpu_object.h"
28#include "amdgpu_vm.h"
29#include "amdgpu_mn.h"
30#include "amdgpu.h"
31#include "amdgpu_xgmi.h"
32#include "kfd_priv.h"
33#include "kfd_svm.h"
34#include "kfd_migrate.h"
35
36#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1
37
38/* Long enough to ensure no retry fault comes after svm range is restored and
39 * page table is updated.
40 */
41#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000
42
43static void svm_range_evict_svm_bo_worker(struct work_struct *work);
44static bool
45svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
46 const struct mmu_notifier_range *range,
47 unsigned long cur_seq);
48
49static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
50 .invalidate = svm_range_cpu_invalidate_pagetables,
51};
52
53/**
54 * svm_range_unlink - unlink svm_range from lists and interval tree
55 * @prange: svm range structure to be removed
56 *
57 * Remove the svm_range from the svms and svm_bo lists and the svms
58 * interval tree.
59 *
60 * Context: The caller must hold svms->lock
61 */
62static void svm_range_unlink(struct svm_range *prange)
63{
64 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
65 prange, prange->start, prange->last);
66
67 if (prange->svm_bo) {
68 spin_lock(&prange->svm_bo->list_lock);
69 list_del(&prange->svm_bo_list);
70 spin_unlock(&prange->svm_bo->list_lock);
71 }
72
73 list_del(&prange->list);
74 if (prange->it_node.start != 0 && prange->it_node.last != 0)
75 interval_tree_remove(&prange->it_node, &prange->svms->objects);
76}
77
78static void
79svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
80{
81 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
82 prange, prange->start, prange->last);
83
84 mmu_interval_notifier_insert_locked(&prange->notifier, mm,
85 prange->start << PAGE_SHIFT,
86 prange->npages << PAGE_SHIFT,
87 &svm_range_mn_ops);
88}
89
90/**
91 * svm_range_add_to_svms - add svm range to svms
92 * @prange: svm range structure to be added
93 *
94 * Add the svm range to the svms interval tree and linked list
95 *
96 * Context: The caller must hold svms->lock
97 */
98static void svm_range_add_to_svms(struct svm_range *prange)
99{
100 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
101 prange, prange->start, prange->last);
102
103 list_add_tail(&prange->list, &prange->svms->list);
104 prange->it_node.start = prange->start;
105 prange->it_node.last = prange->last;
106 interval_tree_insert(&prange->it_node, &prange->svms->objects);
107}
108
109static void svm_range_remove_notifier(struct svm_range *prange)
110{
111 pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
112 prange->svms, prange,
113 prange->notifier.interval_tree.start >> PAGE_SHIFT,
114 prange->notifier.interval_tree.last >> PAGE_SHIFT);
115
116 if (prange->notifier.interval_tree.start != 0 &&
117 prange->notifier.interval_tree.last != 0)
118 mmu_interval_notifier_remove(&prange->notifier);
119}
120
121static bool
122svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
123{
124 return dma_addr && !dma_mapping_error(dev, dma_addr) &&
125 !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
126}
127
128static int
129svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
130 unsigned long offset, unsigned long npages,
131 unsigned long *hmm_pfns, uint32_t gpuidx)
132{
133 enum dma_data_direction dir = DMA_BIDIRECTIONAL;
134 dma_addr_t *addr = prange->dma_addr[gpuidx];
135 struct device *dev = adev->dev;
136 struct page *page;
137 int i, r;
138
139 if (!addr) {
140 addr = kvmalloc_array(prange->npages, sizeof(*addr),
141 GFP_KERNEL | __GFP_ZERO);
142 if (!addr)
143 return -ENOMEM;
144 prange->dma_addr[gpuidx] = addr;
145 }
146
147 addr += offset;
148 for (i = 0; i < npages; i++) {
149 if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
150 dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);
151
152 page = hmm_pfn_to_page(hmm_pfns[i]);
153 if (is_zone_device_page(page)) {
154 struct amdgpu_device *bo_adev =
155 amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
156
157 addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
158 bo_adev->vm_manager.vram_base_offset -
159 bo_adev->kfd.dev->pgmap.range.start;
160 addr[i] |= SVM_RANGE_VRAM_DOMAIN;
161 pr_debug("vram address detected: 0x%llx\n", addr[i]);
162 continue;
163 }
164 addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
165 r = dma_mapping_error(dev, addr[i]);
166 if (r) {
167 pr_debug("failed %d dma_map_page\n", r);
168 return r;
169 }
170 pr_debug("dma mapping 0x%llx for page addr 0x%lx\n",
171 addr[i] >> PAGE_SHIFT, page_to_pfn(page));
172 }
173 return 0;
174}
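
/*
 * Illustrative sketch only (not built, hypothetical helper name): how a
 * consumer of the prange->dma_addr[] array built above can tell tagged VRAM
 * entries apart from system-memory DMA addresses.  The real consumers are
 * svm_range_dma_unmap() and svm_range_map_to_gpu() further down.
 */
#if 0
static bool svm_range_addr_is_vram(dma_addr_t addr)
{
	/* svm_range_dma_map_dev() ORs SVM_RANGE_VRAM_DOMAIN into entries
	 * that refer to zone-device (VRAM) pages and leaves the flag clear
	 * for addresses returned by dma_map_page() for system pages.
	 */
	return !!(addr & SVM_RANGE_VRAM_DOMAIN);
}
#endif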
175
176static int
177svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
178 unsigned long offset, unsigned long npages,
179 unsigned long *hmm_pfns)
180{
181 struct kfd_process *p;
182 uint32_t gpuidx;
183 int r;
184
185 p = container_of(prange->svms, struct kfd_process, svms);
186
187 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
188 struct kfd_process_device *pdd;
189 struct amdgpu_device *adev;
190
191 pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
192 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
193 if (!pdd) {
194 pr_debug("failed to find device idx %d\n", gpuidx);
195 return -EINVAL;
196 }
197 adev = (struct amdgpu_device *)pdd->dev->kgd;
198
199 r = svm_range_dma_map_dev(adev, prange, offset, npages,
200 hmm_pfns, gpuidx);
201 if (r)
202 break;
203 }
204
205 return r;
206}
207
208void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr,
209 unsigned long offset, unsigned long npages)
210{
211 enum dma_data_direction dir = DMA_BIDIRECTIONAL;
212 int i;
213
214 if (!dma_addr)
215 return;
216
217 for (i = offset; i < offset + npages; i++) {
218 if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
219 continue;
220 pr_debug("dma unmapping 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
221 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
222 dma_addr[i] = 0;
223 }
224}
225
226void svm_range_free_dma_mappings(struct svm_range *prange)
227{
228 struct kfd_process_device *pdd;
229 dma_addr_t *dma_addr;
230 struct device *dev;
231 struct kfd_process *p;
232 uint32_t gpuidx;
233
234 p = container_of(prange->svms, struct kfd_process, svms);
235
236 for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
237 dma_addr = prange->dma_addr[gpuidx];
238 if (!dma_addr)
239 continue;
240
241 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
242 if (!pdd) {
243 pr_debug("failed to find device idx %d\n", gpuidx);
244 continue;
245 }
246 dev = &pdd->dev->pdev->dev;
247 svm_range_dma_unmap(dev, dma_addr, 0, prange->npages);
248 kvfree(dma_addr);
249 prange->dma_addr[gpuidx] = NULL;
250 }
251}
252
253static void svm_range_free(struct svm_range *prange)
254{
255 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
256 prange->start, prange->last);
257
258 svm_range_vram_node_free(prange);
259 svm_range_free_dma_mappings(prange);
260 mutex_destroy(&prange->lock);
261 mutex_destroy(&prange->migrate_mutex);
262 kfree(prange);
263}
264
265static void
266svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
267 uint8_t *granularity, uint32_t *flags)
268{
269 *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
270 *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
271 *granularity = 9;
272 *flags =
273 KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
274}
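
/*
 * Illustrative sketch only (not built, hypothetical helper name): the default
 * granularity of 9 set above describes a 2^9 = 512 page block, i.e. 2 MiB
 * with 4 KiB pages, which is the unit svm_range_split_by_granularity() below
 * aligns fault addresses to.
 */
#if 0
static unsigned long svm_range_granularity_bytes(uint8_t granularity)
{
	/* size in bytes covered by one granularity block */
	return (1UL << granularity) << PAGE_SHIFT;
}
#endif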
275
276static struct
277svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
278 uint64_t last)
279{
280 uint64_t size = last - start + 1;
281 struct svm_range *prange;
282 struct kfd_process *p;
283
284 prange = kzalloc(sizeof(*prange), GFP_KERNEL);
285 if (!prange)
286 return NULL;
287 prange->npages = size;
288 prange->svms = svms;
289 prange->start = start;
290 prange->last = last;
291 INIT_LIST_HEAD(&prange->list);
292 INIT_LIST_HEAD(&prange->update_list);
293 INIT_LIST_HEAD(&prange->remove_list);
294 INIT_LIST_HEAD(&prange->insert_list);
295 INIT_LIST_HEAD(&prange->svm_bo_list);
296 INIT_LIST_HEAD(&prange->deferred_list);
297 INIT_LIST_HEAD(&prange->child_list);
298 atomic_set(&prange->invalid, 0);
299 prange->validate_timestamp = 0;
300 mutex_init(&prange->migrate_mutex);
301 mutex_init(&prange->lock);
302
303 p = container_of(svms, struct kfd_process, svms);
304 if (p->xnack_enabled)
305 bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
306 MAX_GPU_INSTANCE);
307
308 svm_range_set_default_attributes(&prange->preferred_loc,
309 &prange->prefetch_loc,
310 &prange->granularity, &prange->flags);
311
312 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);
313
314 return prange;
315}
316
317static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
318{
319 if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
320 return false;
321
322 return true;
323}
324
325static void svm_range_bo_release(struct kref *kref)
326{
327 struct svm_range_bo *svm_bo;
328
329 svm_bo = container_of(kref, struct svm_range_bo, kref);
330 spin_lock(&svm_bo->list_lock);
331 while (!list_empty(&svm_bo->range_list)) {
332 struct svm_range *prange =
333 list_first_entry(&svm_bo->range_list,
334 struct svm_range, svm_bo_list);
335 /* list_del_init tells a concurrent svm_range_vram_node_new when
336 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
337 */
338 list_del_init(&prange->svm_bo_list);
339 spin_unlock(&svm_bo->list_lock);
340
341 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
342 prange->start, prange->last);
343 mutex_lock(&prange->lock);
344 prange->svm_bo = NULL;
345 mutex_unlock(&prange->lock);
346
347 spin_lock(&svm_bo->list_lock);
348 }
349 spin_unlock(&svm_bo->list_lock);
350 if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base)) {
351 /* We're not in the eviction worker.
352 * Signal the fence and synchronize with any
353 * pending eviction work.
354 */
355 dma_fence_signal(&svm_bo->eviction_fence->base);
356 cancel_work_sync(&svm_bo->eviction_work);
357 }
358 dma_fence_put(&svm_bo->eviction_fence->base);
359 amdgpu_bo_unref(&svm_bo->bo);
360 kfree(svm_bo);
361}
362
363void svm_range_bo_unref(struct svm_range_bo *svm_bo)
364{
365 if (!svm_bo)
366 return;
367
368 kref_put(&svm_bo->kref, svm_range_bo_release);
369}
370
371static bool
372svm_range_validate_svm_bo(struct amdgpu_device *adev, struct svm_range *prange)
373{
374 struct amdgpu_device *bo_adev;
375
376 mutex_lock(&prange->lock);
377 if (!prange->svm_bo) {
378 mutex_unlock(&prange->lock);
379 return false;
380 }
381 if (prange->ttm_res) {
382 /* We still have a reference, all is well */
383 mutex_unlock(&prange->lock);
384 return true;
385 }
386 if (svm_bo_ref_unless_zero(prange->svm_bo)) {
387 /*
388 * Migrate from GPU to GPU, remove range from source bo_adev
389 * svm_bo range list, and return false to allocate svm_bo from
390 * destination adev.
391 */
392 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
393 if (bo_adev != adev) {
394 mutex_unlock(&prange->lock);
395
396 spin_lock(&prange->svm_bo->list_lock);
397 list_del_init(&prange->svm_bo_list);
398 spin_unlock(&prange->svm_bo->list_lock);
399
400 svm_range_bo_unref(prange->svm_bo);
401 return false;
402 }
403 if (READ_ONCE(prange->svm_bo->evicting)) {
404 struct dma_fence *f;
405 struct svm_range_bo *svm_bo;
406 /* The BO is getting evicted,
407 * we need to get a new one
408 */
409 mutex_unlock(&prange->lock);
410 svm_bo = prange->svm_bo;
411 f = dma_fence_get(&svm_bo->eviction_fence->base);
412 svm_range_bo_unref(prange->svm_bo);
413 /* wait for the fence to avoid long spin-loop
414 * at list_empty_careful
415 */
416 dma_fence_wait(f, false);
417 dma_fence_put(f);
418 } else {
419 /* The BO was still around and we got
420 * a new reference to it
421 */
422 mutex_unlock(&prange->lock);
423 pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
424 prange->svms, prange->start, prange->last);
425
426 prange->ttm_res = prange->svm_bo->bo->tbo.resource;
427 return true;
428 }
429
430 } else {
431 mutex_unlock(&prange->lock);
432 }
433
434 /* We need a new svm_bo. Spin-loop to wait for concurrent
435 * svm_range_bo_release to finish removing this range from
436 * its range list. After this, it is safe to reuse the
437 * svm_bo pointer and svm_bo_list head.
438 */
439 while (!list_empty_careful(&prange->svm_bo_list))
440 ;
441
442 return false;
443}
444
445static struct svm_range_bo *svm_range_bo_new(void)
446{
447 struct svm_range_bo *svm_bo;
448
449 svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
450 if (!svm_bo)
451 return NULL;
452
453 kref_init(&svm_bo->kref);
454 INIT_LIST_HEAD(&svm_bo->range_list);
455 spin_lock_init(&svm_bo->list_lock);
456
457 return svm_bo;
458}
459
460int
461svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
462 bool clear)
463{
464 struct amdgpu_bo_param bp;
465 struct svm_range_bo *svm_bo;
466 struct amdgpu_bo_user *ubo;
467 struct amdgpu_bo *bo;
468 struct kfd_process *p;
469 struct mm_struct *mm;
470 int r;
471
472 p = container_of(prange->svms, struct kfd_process, svms);
473 pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
474 prange->start, prange->last);
475
476 if (svm_range_validate_svm_bo(adev, prange))
477 return 0;
478
479 svm_bo = svm_range_bo_new();
480 if (!svm_bo) {
481 pr_debug("failed to alloc svm bo\n");
482 return -ENOMEM;
483 }
484 mm = get_task_mm(p->lead_thread);
485 if (!mm) {
486 pr_debug("failed to get mm\n");
487 kfree(svm_bo);
488 return -ESRCH;
489 }
490 svm_bo->svms = prange->svms;
491 svm_bo->eviction_fence =
492 amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
493 mm,
494 svm_bo);
495 mmput(mm);
496 INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
497 svm_bo->evicting = 0;
498 memset(&bp, 0, sizeof(bp));
499 bp.size = prange->npages * PAGE_SIZE;
500 bp.byte_align = PAGE_SIZE;
501 bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
502 bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
503 bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
504 bp.flags |= AMDGPU_AMDKFD_CREATE_SVM_BO;
505 bp.type = ttm_bo_type_device;
506 bp.resv = NULL;
507
508 r = amdgpu_bo_create_user(adev, &bp, &ubo);
509 if (r) {
510 pr_debug("failed %d to create bo\n", r);
511 goto create_bo_failed;
512 }
513 bo = &ubo->bo;
514 r = amdgpu_bo_reserve(bo, true);
515 if (r) {
516 pr_debug("failed %d to reserve bo\n", r);
517 goto reserve_bo_failed;
518 }
519
520 r = dma_resv_reserve_shared(bo->tbo.base.resv, 1);
521 if (r) {
522 pr_debug("failed %d to reserve bo\n", r);
523 amdgpu_bo_unreserve(bo);
524 goto reserve_bo_failed;
525 }
526 amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);
527
528 amdgpu_bo_unreserve(bo);
529
530 svm_bo->bo = bo;
531 prange->svm_bo = svm_bo;
532 prange->ttm_res = bo->tbo.resource;
533 prange->offset = 0;
534
535 spin_lock(&svm_bo->list_lock);
536 list_add(&prange->svm_bo_list, &svm_bo->range_list);
537 spin_unlock(&svm_bo->list_lock);
538
539 return 0;
540
541reserve_bo_failed:
542 amdgpu_bo_unref(&bo);
543create_bo_failed:
544 dma_fence_put(&svm_bo->eviction_fence->base);
545 kfree(svm_bo);
546 prange->ttm_res = NULL;
547
548 return r;
549}
550
551void svm_range_vram_node_free(struct svm_range *prange)
552{
553 svm_range_bo_unref(prange->svm_bo);
554 prange->ttm_res = NULL;
555}
556
557struct amdgpu_device *
558svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
559{
560 struct kfd_process_device *pdd;
561 struct kfd_process *p;
562 int32_t gpu_idx;
563
564 p = container_of(prange->svms, struct kfd_process, svms);
565
566 gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id);
567 if (gpu_idx < 0) {
568 pr_debug("failed to get device by id 0x%x\n", gpu_id);
569 return NULL;
570 }
571 pdd = kfd_process_device_from_gpuidx(p, gpu_idx);
572 if (!pdd) {
573 pr_debug("failed to get device by idx 0x%x\n", gpu_idx);
574 return NULL;
575 }
576
577 return (struct amdgpu_device *)pdd->dev->kgd;
578}
579
580struct kfd_process_device *
581svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device *adev)
582{
583 struct kfd_process *p;
584 int32_t gpu_idx, gpuid;
585 int r;
586
587 p = container_of(prange->svms, struct kfd_process, svms);
588
589 r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpu_idx);
590 if (r) {
591 pr_debug("failed to get device id by adev %p\n", adev);
592 return NULL;
593 }
594
595 return kfd_process_device_from_gpuidx(p, gpu_idx);
596}
597
598static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
599{
600 struct ttm_operation_ctx ctx = { false, false };
601
602 amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);
603
604 return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
605}
606
607static int
608svm_range_check_attr(struct kfd_process *p,
609 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
610{
611 uint32_t i;
612
613 for (i = 0; i < nattr; i++) {
614 uint32_t val = attrs[i].value;
615 int gpuidx = MAX_GPU_INSTANCE;
616
617 switch (attrs[i].type) {
618 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
619 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
620 val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
621 gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
622 break;
623 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
624 if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
625 gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
626 break;
627 case KFD_IOCTL_SVM_ATTR_ACCESS:
628 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
629 case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
630 gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
631 break;
632 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
633 break;
634 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
635 break;
636 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
637 break;
638 default:
639 pr_debug("unknown attr type 0x%x\n", attrs[i].type);
640 return -EINVAL;
641 }
642
643 if (gpuidx < 0) {
644 pr_debug("no GPU 0x%x found\n", val);
645 return -EINVAL;
646 } else if (gpuidx < MAX_GPU_INSTANCE &&
647 !test_bit(gpuidx, p->svms.bitmap_supported)) {
648 pr_debug("GPU 0x%x not supported\n", val);
649 return -EINVAL;
650 }
651 }
652
653 return 0;
654}
655
656static void
657svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
658 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
659{
660 uint32_t i;
661 int gpuidx;
662
663 for (i = 0; i < nattr; i++) {
664 switch (attrs[i].type) {
665 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
666 prange->preferred_loc = attrs[i].value;
667 break;
668 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
669 prange->prefetch_loc = attrs[i].value;
670 break;
671 case KFD_IOCTL_SVM_ATTR_ACCESS:
672 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
673 case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
674 gpuidx = kfd_process_gpuidx_from_gpuid(p,
675 attrs[i].value);
676 if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
677 bitmap_clear(prange->bitmap_access, gpuidx, 1);
678 bitmap_clear(prange->bitmap_aip, gpuidx, 1);
679 } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
680 bitmap_set(prange->bitmap_access, gpuidx, 1);
681 bitmap_clear(prange->bitmap_aip, gpuidx, 1);
682 } else {
683 bitmap_clear(prange->bitmap_access, gpuidx, 1);
684 bitmap_set(prange->bitmap_aip, gpuidx, 1);
685 }
686 break;
687 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
688 prange->flags |= attrs[i].value;
689 break;
690 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
691 prange->flags &= ~attrs[i].value;
692 break;
693 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
694 prange->granularity = attrs[i].value;
695 break;
696 default:
697 WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
698 }
699 }
700}
701
702/**
703 * svm_range_debug_dump - print all range information from svms
704 * @svms: svm range list header
705 *
706 * Debug output of svm range start, end and location from the svms
707 * interval tree and linked list
708 *
709 * Context: The caller must hold svms->lock
710 */
711static void svm_range_debug_dump(struct svm_range_list *svms)
712{
713 struct interval_tree_node *node;
714 struct svm_range *prange;
715
716 pr_debug("dump svms 0x%p list\n", svms);
717 pr_debug("range\tstart\tpage\tend\t\tlocation\n");
718
719 list_for_each_entry(prange, &svms->list, list) {
720 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
721 prange, prange->start, prange->npages,
722 prange->start + prange->npages - 1,
723 prange->actual_loc);
724 }
725
726 pr_debug("dump svms 0x%p interval tree\n", svms);
727 pr_debug("range\tstart\tpage\tend\t\tlocation\n");
728 node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
729 while (node) {
730 prange = container_of(node, struct svm_range, it_node);
731 pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
732 prange, prange->start, prange->npages,
733 prange->start + prange->npages - 1,
734 prange->actual_loc);
735 node = interval_tree_iter_next(node, 0, ~0ULL);
736 }
737}
738
739static bool
740svm_range_is_same_attrs(struct svm_range *old, struct svm_range *new)
741{
742 return (old->prefetch_loc == new->prefetch_loc &&
743 old->flags == new->flags &&
744 old->granularity == new->granularity);
745}
746
747static int
748svm_range_split_array(void *ppnew, void *ppold, size_t size,
749 uint64_t old_start, uint64_t old_n,
750 uint64_t new_start, uint64_t new_n)
751{
752 unsigned char *new, *old, *pold;
753 uint64_t d;
754
755 if (!ppold)
756 return 0;
757 pold = *(unsigned char **)ppold;
758 if (!pold)
759 return 0;
760
761 new = kvmalloc_array(new_n, size, GFP_KERNEL);
762 if (!new)
763 return -ENOMEM;
764
765 d = (new_start - old_start) * size;
766 memcpy(new, pold + d, new_n * size);
767
768 old = kvmalloc_array(old_n, size, GFP_KERNEL);
769 if (!old) {
770 kvfree(new);
771 return -ENOMEM;
772 }
773
774 d = (new_start == old_start) ? new_n * size : 0;
775 memcpy(old, pold + d, old_n * size);
776
777 kvfree(pold);
778 *(void **)ppold = old;
779 *(void **)ppnew = new;
780
781 return 0;
782}
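
/*
 * Worked example for svm_range_split_array() (illustrative, hypothetical
 * numbers): splitting a 16-element per-page array for a range starting at
 * page old_start = 0x1000, where the old range keeps 8 pages (old_n = 8) and
 * the new range takes [0x1008 0x100f] (new_start = 0x1008, new_n = 8).  The
 * new array is copied from element (new_start - old_start) = 8 of the old
 * array; because new_start != old_start, the old array keeps its first
 * old_n elements.  In the head-split case (new_start == old_start) the new
 * array is copied from element 0 and the old array skips the first new_n
 * elements instead.
 */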
783
784static int
785svm_range_split_pages(struct svm_range *new, struct svm_range *old,
786 uint64_t start, uint64_t last)
787{
788 uint64_t npages = last - start + 1;
789 int i, r;
790
791 for (i = 0; i < MAX_GPU_INSTANCE; i++) {
792 r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
793 sizeof(*old->dma_addr[i]), old->start,
794 npages, new->start, new->npages);
795 if (r)
796 return r;
797 }
798
799 return 0;
800}
801
802static int
803svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
804 uint64_t start, uint64_t last)
805{
806 uint64_t npages = last - start + 1;
807
808 pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
809 new->svms, new, new->start, start, last);
810
811 if (new->start == old->start) {
812 new->offset = old->offset;
813 old->offset += new->npages;
814 } else {
815 new->offset = old->offset + npages;
816 }
817
818 new->svm_bo = svm_range_bo_ref(old->svm_bo);
819 new->ttm_res = old->ttm_res;
820
821 spin_lock(&new->svm_bo->list_lock);
822 list_add(&new->svm_bo_list, &new->svm_bo->range_list);
823 spin_unlock(&new->svm_bo->list_lock);
824
825 return 0;
826}
827
828/**
829 * svm_range_split_adjust - split range and adjust
830 *
831 * @new: new range
832 * @old: the old range
833 * @start: start address in pages that the old range is adjusted to
834 * @last: last address in pages that the old range is adjusted to
835 *
836 * Copy system memory dma_addr or vram ttm_res from the old range to the new
837 * range, from new_start up to size new->npages. The remaining old range is
838 * from start to last
839 *
840 * Return:
841 * 0 - OK, -ENOMEM - out of memory
842 */
843static int
844svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
845 uint64_t start, uint64_t last)
846{
847 int r;
848
849 pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
850 new->svms, new->start, old->start, old->last, start, last);
851
852 if (new->start < old->start ||
853 new->last > old->last) {
854 WARN_ONCE(1, "invalid new range start or last\n");
855 return -EINVAL;
856 }
857
858 r = svm_range_split_pages(new, old, start, last);
859 if (r)
860 return r;
861
862 if (old->actual_loc && old->ttm_res) {
863 r = svm_range_split_nodes(new, old, start, last);
864 if (r)
865 return r;
866 }
867
868 old->npages = last - start + 1;
869 old->start = start;
870 old->last = last;
871 new->flags = old->flags;
872 new->preferred_loc = old->preferred_loc;
873 new->prefetch_loc = old->prefetch_loc;
874 new->actual_loc = old->actual_loc;
875 new->granularity = old->granularity;
876 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
877 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
878
879 return 0;
880}
881
882/**
883 * svm_range_split - split a range in 2 ranges
884 *
885 * @prange: the svm range to split
886 * @start: the remaining range start address in pages
887 * @last: the remaining range last address in pages
888 * @new: the result new range generated
889 *
890 * Two cases only:
891 * case 1: if start == prange->start
892 * prange ==> prange[start, last]
893 * new range [last + 1, prange->last]
894 *
895 * case 2: if last == prange->last
896 * prange ==> prange[start, last]
897 * new range [prange->start, start - 1]
898 *
899 * Return:
900 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
901 */
902static int
903svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
904 struct svm_range **new)
905{
906 uint64_t old_start = prange->start;
907 uint64_t old_last = prange->last;
908 struct svm_range_list *svms;
909 int r = 0;
910
911 pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
912 old_start, old_last, start, last);
913
914 if (old_start != start && old_last != last)
915 return -EINVAL;
916 if (start < old_start || last > old_last)
917 return -EINVAL;
918
919 svms = prange->svms;
920 if (old_start == start)
921 *new = svm_range_new(svms, last + 1, old_last);
922 else
923 *new = svm_range_new(svms, old_start, start - 1);
924 if (!*new)
925 return -ENOMEM;
926
927 r = svm_range_split_adjust(*new, prange, start, last);
928 if (r) {
929 pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
930 r, old_start, old_last, start, last);
931 svm_range_free(*new);
932 *new = NULL;
933 }
934
935 return r;
936}
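
/*
 * Worked example for svm_range_split() (illustrative, hypothetical page
 * numbers): for a prange covering [0x1000 0x1fff],
 *   svm_range_split(prange, 0x1000, 0x17ff, &new)  - case 1 - shrinks
 *   prange to [0x1000 0x17ff] and returns new = [0x1800 0x1fff];
 *   svm_range_split(prange, 0x1800, 0x1fff, &new)  - case 2 - shrinks
 *   prange to [0x1800 0x1fff] and returns new = [0x1000 0x17ff].
 * The caller is still responsible for linking @new into the svms lists, as
 * svm_range_split_head()/svm_range_split_tail() below do via insert_list.
 */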
937
938static int
939svm_range_split_tail(struct svm_range *prange, struct svm_range *new,
940 uint64_t new_last, struct list_head *insert_list)
941{
942 struct svm_range *tail;
943 int r = svm_range_split(prange, prange->start, new_last, &tail);
944
945 if (!r)
946 list_add(&tail->insert_list, insert_list);
947 return r;
948}
949
950static int
951svm_range_split_head(struct svm_range *prange, struct svm_range *new,
952 uint64_t new_start, struct list_head *insert_list)
953{
954 struct svm_range *head;
955 int r = svm_range_split(prange, new_start, prange->last, &head);
956
957 if (!r)
958 list_add(&head->insert_list, insert_list);
959 return r;
960}
961
962static void
963svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
964 struct svm_range *pchild, enum svm_work_list_ops op)
965{
966 pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
967 pchild, pchild->start, pchild->last, prange, op);
968
969 pchild->work_item.mm = mm;
970 pchild->work_item.op = op;
971 list_add_tail(&pchild->child_list, &prange->child_list);
972}
973
974/**
975 * svm_range_split_by_granularity - collect ranges within granularity boundary
976 *
977 * @p: the process with svms list
978 * @mm: mm structure
979 * @addr: the vm fault address in pages, to split the prange
980 * @parent: parent range if prange is from child list
981 * @prange: prange to split
982 *
983 * Trims @prange to be a single aligned block of prange->granularity if
984 * possible. The head and tail are added to the child_list in @parent.
985 *
986 * Context: caller must hold mmap_read_lock and prange->lock
987 *
988 * Return:
989 * 0 - OK, otherwise error code
990 */
991int
992svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
993 unsigned long addr, struct svm_range *parent,
994 struct svm_range *prange)
995{
996 struct svm_range *head, *tail;
997 unsigned long start, last, size;
998 int r;
999
1000 /* Align the split range start and size to the granularity size, then a single
1001 * PTE will be used for the whole range. This reduces the number of PTEs
1002 * updated and the L1 TLB space used for translation.
1003 */
1004 size = 1UL << prange->granularity;
1005 start = ALIGN_DOWN(addr, size);
1006 last = ALIGN(addr + 1, size) - 1;
1007
1008 pr_debug("svms 0x%p split [0x%lx 0x%lx] to [0x%lx 0x%lx] size 0x%lx\n",
1009 prange->svms, prange->start, prange->last, start, last, size);
1010
1011 if (start > prange->start) {
1012 r = svm_range_split(prange, start, prange->last, &head);
1013 if (r)
1014 return r;
1015 svm_range_add_child(parent, mm, head, SVM_OP_ADD_RANGE);
1016 }
1017
1018 if (last < prange->last) {
1019 r = svm_range_split(prange, prange->start, last, &tail);
1020 if (r)
1021 return r;
1022 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
1023 }
1024
1025 /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */
1026 if (p->xnack_enabled && prange->work_item.op == SVM_OP_ADD_RANGE) {
1027 prange->work_item.op = SVM_OP_ADD_RANGE_AND_MAP;
1028 pr_debug("change prange 0x%p [0x%lx 0x%lx] op %d\n",
1029 prange, prange->start, prange->last,
1030 SVM_OP_ADD_RANGE_AND_MAP);
1031 }
1032 return 0;
1033}
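
/*
 * Worked example for svm_range_split_by_granularity() (illustrative,
 * hypothetical numbers): with the default granularity of 9 the block size is
 * 0x200 pages.  A fault at page address addr = 0x12345 gives
 *   start = ALIGN_DOWN(0x12345, 0x200)    = 0x12200
 *   last  = ALIGN(0x12345 + 1, 0x200) - 1 = 0x123ff
 * so prange is trimmed to the single 512-page block [0x12200 0x123ff] that
 * contains the faulting page, and any head/tail pieces become children of
 * @parent.
 */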
1034
1035static uint64_t
1036svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange,
1037 int domain)
1038{
1039 struct amdgpu_device *bo_adev;
1040 uint32_t flags = prange->flags;
1041 uint32_t mapping_flags = 0;
1042 uint64_t pte_flags;
1043 bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
1044 bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
1045
1046 if (domain == SVM_RANGE_VRAM_DOMAIN)
1047 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
1048
1049 switch (adev->asic_type) {
1050 case CHIP_ARCTURUS:
1051 if (domain == SVM_RANGE_VRAM_DOMAIN) {
1052 if (bo_adev == adev) {
1053 mapping_flags |= coherent ?
1054 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1055 } else {
1056 mapping_flags |= coherent ?
1057 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1058 if (amdgpu_xgmi_same_hive(adev, bo_adev))
1059 snoop = true;
1060 }
1061 } else {
1062 mapping_flags |= coherent ?
1063 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1064 }
1065 break;
1066 case CHIP_ALDEBARAN:
1067 if (domain == SVM_RANGE_VRAM_DOMAIN) {
1068 if (bo_adev == adev) {
1069 mapping_flags |= coherent ?
1070 AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
1071 if (adev->gmc.xgmi.connected_to_cpu)
1072 snoop = true;
1073 } else {
1074 mapping_flags |= coherent ?
1075 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1076 if (amdgpu_xgmi_same_hive(adev, bo_adev))
1077 snoop = true;
1078 }
1079 } else {
1080 mapping_flags |= coherent ?
1081 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1082 }
1083 break;
1084 default:
1085 mapping_flags |= coherent ?
1086 AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
1087 }
1088
1089 mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
1090
1091 if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
1092 mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
1093 if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
1094 mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
1095
1096 pte_flags = AMDGPU_PTE_VALID;
1097 pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
1098 pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;
1099
1100 pte_flags |= amdgpu_gem_va_map_flags(adev, mapping_flags);
1101 return pte_flags;
1102}
1103
1104static int
1105svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
1106 uint64_t start, uint64_t last,
1107 struct dma_fence **fence)
1108{
1109 uint64_t init_pte_value = 0;
1110
1111 pr_debug("[0x%llx 0x%llx]\n", start, last);
1112
1113 return amdgpu_vm_bo_update_mapping(adev, adev, vm, false, true, NULL,
1114 start, last, init_pte_value, 0,
1115 NULL, NULL, fence, NULL);
1116}
1117
1118static int
1119svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
1120 unsigned long last)
1121{
1122 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
1123 struct kfd_process_device *pdd;
1124 struct dma_fence *fence = NULL;
1125 struct amdgpu_device *adev;
1126 struct kfd_process *p;
1127 uint32_t gpuidx;
1128 int r = 0;
1129
1130 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
1131 MAX_GPU_INSTANCE);
1132 p = container_of(prange->svms, struct kfd_process, svms);
1133
1134 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1135 pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
1136 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1137 if (!pdd) {
1138 pr_debug("failed to find device idx %d\n", gpuidx);
1139 return -EINVAL;
1140 }
1141 adev = (struct amdgpu_device *)pdd->dev->kgd;
1142
1143 r = svm_range_unmap_from_gpu(adev, drm_priv_to_vm(pdd->drm_priv),
1144 start, last, &fence);
1145 if (r)
1146 break;
1147
1148 if (fence) {
1149 r = dma_fence_wait(fence, false);
1150 dma_fence_put(fence);
1151 fence = NULL;
1152 if (r)
1153 break;
1154 }
1155 amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev,
1156 p->pasid, TLB_FLUSH_HEAVYWEIGHT);
1157 }
1158
1159 return r;
1160}
1161
1162static int
1163svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
1164 struct svm_range *prange, unsigned long offset,
1165 unsigned long npages, bool readonly, dma_addr_t *dma_addr,
1166 struct amdgpu_device *bo_adev, struct dma_fence **fence)
1167{
1168 struct amdgpu_bo_va bo_va;
1169 bool table_freed = false;
1170 uint64_t pte_flags;
1171 unsigned long last_start;
1172 int last_domain;
1173 int r = 0;
1174 int64_t i, j;
1175
1176 last_start = prange->start + offset;
1177
1178 pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
1179 last_start, last_start + npages - 1, readonly);
1180
1181 if (prange->svm_bo && prange->ttm_res)
1182 bo_va.is_xgmi = amdgpu_xgmi_same_hive(adev, bo_adev);
1183
1184 for (i = offset; i < offset + npages; i++) {
1185 last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
1186 dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;
1187 if ((prange->start + i) < prange->last &&
1188 last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
1189 continue;
1190
1191 pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
1192 last_start, prange->start + i, last_domain ? "GPU" : "CPU");
1193
1194 pte_flags = svm_range_get_pte_flags(adev, prange, last_domain);
1195 if (readonly)
1196 pte_flags &= ~AMDGPU_PTE_WRITEABLE;
1197
1198 pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
1199 prange->svms, last_start, prange->start + i,
1200 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
1201 pte_flags);
1202
1203 r = amdgpu_vm_bo_update_mapping(adev, bo_adev, vm, false, false,
1204 NULL, last_start,
1205 prange->start + i, pte_flags,
1206 last_start - prange->start,
1207 NULL, dma_addr,
1208 &vm->last_update,
1209 &table_freed);
1210
1211 for (j = last_start - prange->start; j <= i; j++)
1212 dma_addr[j] |= last_domain;
1213
1214 if (r) {
1215 pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
1216 goto out;
1217 }
1218 last_start = prange->start + i + 1;
1219 }
1220
1221 r = amdgpu_vm_update_pdes(adev, vm, false);
1222 if (r) {
1223 pr_debug("failed %d to update directories 0x%lx\n", r,
1224 prange->start);
1225 goto out;
1226 }
1227
1228 if (fence)
1229 *fence = dma_fence_get(vm->last_update);
1230
1231 if (table_freed) {
1232 struct kfd_process *p;
1233
1234 p = container_of(prange->svms, struct kfd_process, svms);
1235 amdgpu_amdkfd_flush_gpu_tlb_pasid((struct kgd_dev *)adev,
1236 p->pasid, TLB_FLUSH_LEGACY);
1237 }
1238out:
1239 return r;
1240}
1241
1242static int
1243svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
1244 unsigned long npages, bool readonly,
1245 unsigned long *bitmap, bool wait)
1246{
1247 struct kfd_process_device *pdd;
1248 struct amdgpu_device *bo_adev;
1249 struct amdgpu_device *adev;
1250 struct kfd_process *p;
1251 struct dma_fence *fence = NULL;
1252 uint32_t gpuidx;
1253 int r = 0;
1254
1255 if (prange->svm_bo && prange->ttm_res)
1256 bo_adev = amdgpu_ttm_adev(prange->svm_bo->bo->tbo.bdev);
1257 else
1258 bo_adev = NULL;
1259
1260 p = container_of(prange->svms, struct kfd_process, svms);
1261 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
1262 pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
1263 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1264 if (!pdd) {
1265 pr_debug("failed to find device idx %d\n", gpuidx);
1266 return -EINVAL;
1267 }
1268 adev = (struct amdgpu_device *)pdd->dev->kgd;
1269
1270 pdd = kfd_bind_process_to_device(pdd->dev, p);
1271 if (IS_ERR(pdd))
1272 return -EINVAL;
1273
1274 if (bo_adev && adev != bo_adev &&
1275 !amdgpu_xgmi_same_hive(adev, bo_adev)) {
1276 pr_debug("cannot map to device idx %d\n", gpuidx);
1277 continue;
1278 }
1279
1280 r = svm_range_map_to_gpu(adev, drm_priv_to_vm(pdd->drm_priv),
1281 prange, offset, npages, readonly,
1282 prange->dma_addr[gpuidx],
1283 bo_adev, wait ? &fence : NULL);
1284 if (r)
1285 break;
1286
1287 if (fence) {
1288 r = dma_fence_wait(fence, false);
1289 dma_fence_put(fence);
1290 fence = NULL;
1291 if (r) {
1292 pr_debug("failed %d to dma fence wait\n", r);
1293 break;
1294 }
1295 }
1296 }
1297
1298 return r;
1299}
1300
1301struct svm_validate_context {
1302 struct kfd_process *process;
1303 struct svm_range *prange;
1304 bool intr;
1305 unsigned long bitmap[MAX_GPU_INSTANCE];
1306 struct ttm_validate_buffer tv[MAX_GPU_INSTANCE+1];
1307 struct list_head validate_list;
1308 struct ww_acquire_ctx ticket;
1309};
1310
1311static int svm_range_reserve_bos(struct svm_validate_context *ctx)
1312{
1313 struct kfd_process_device *pdd;
1314 struct amdgpu_device *adev;
1315 struct amdgpu_vm *vm;
1316 uint32_t gpuidx;
1317 int r;
1318
1319 INIT_LIST_HEAD(&ctx->validate_list);
1320 for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1321 pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1322 if (!pdd) {
1323 pr_debug("failed to find device idx %d\n", gpuidx);
1324 return -EINVAL;
1325 }
1326 adev = (struct amdgpu_device *)pdd->dev->kgd;
1327 vm = drm_priv_to_vm(pdd->drm_priv);
1328
1329 ctx->tv[gpuidx].bo = &vm->root.bo->tbo;
1330 ctx->tv[gpuidx].num_shared = 4;
1331 list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
1332 }
1333 if (ctx->prange->svm_bo && ctx->prange->ttm_res) {
1334 ctx->tv[MAX_GPU_INSTANCE].bo = &ctx->prange->svm_bo->bo->tbo;
1335 ctx->tv[MAX_GPU_INSTANCE].num_shared = 1;
1336 list_add(&ctx->tv[MAX_GPU_INSTANCE].head, &ctx->validate_list);
1337 }
1338
1339 r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
1340 ctx->intr, NULL);
1341 if (r) {
1342 pr_debug("failed %d to reserve bo\n", r);
1343 return r;
1344 }
1345
1346 for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
1347 pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
1348 if (!pdd) {
1349 pr_debug("failed to find device idx %d\n", gpuidx);
1350 r = -EINVAL;
1351 goto unreserve_out;
1352 }
1353 adev = (struct amdgpu_device *)pdd->dev->kgd;
1354
1355 r = amdgpu_vm_validate_pt_bos(adev, drm_priv_to_vm(pdd->drm_priv),
1356 svm_range_bo_validate, NULL);
1357 if (r) {
1358 pr_debug("failed %d validate pt bos\n", r);
1359 goto unreserve_out;
1360 }
1361 }
1362
1363 return 0;
1364
1365unreserve_out:
1366 ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
1367 return r;
1368}
1369
1370static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
1371{
1372 ttm_eu_backoff_reservation(&ctx->ticket, &ctx->validate_list);
1373}
1374
1375static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
1376{
1377 struct kfd_process_device *pdd;
1378 struct amdgpu_device *adev;
1379
1380 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
1381 adev = (struct amdgpu_device *)pdd->dev->kgd;
1382
1383 return SVM_ADEV_PGMAP_OWNER(adev);
1384}
1385
1386/*
1387 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
1388 *
1389 * To prevent concurrent destruction or change of range attributes, the
1390 * svm_read_lock must be held. The caller must not hold the svm_write_lock
1391 * because that would block concurrent evictions and lead to deadlocks. To
1392 * serialize concurrent migrations or validations of the same range, the
1393 * prange->migrate_mutex must be held.
1394 *
1395 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
1396 * eviction fence).
1397 *
1398 * The following sequence ensures race-free validation and GPU mapping:
1399 *
1400 * 1. Reserve page table (and SVM BO if range is in VRAM)
1401 * 2. hmm_range_fault to get page addresses (if system memory)
1402 * 3. DMA-map pages (if system memory)
1403 * 4-a. Take notifier lock
1404 * 4-b. Check that the pages are still valid (mmu_interval_read_retry)
1405 * 4-c. Check that the range was not split or otherwise invalidated
1406 * 4-d. Update GPU page table
1407 * 4-e. Release notifier lock
1408 * 5. Release page table (and SVM BO) reservation
1409 */
1410static int svm_range_validate_and_map(struct mm_struct *mm,
1411 struct svm_range *prange,
1412 int32_t gpuidx, bool intr, bool wait)
1413{
1414 struct svm_validate_context ctx;
1415 unsigned long start, end, addr;
1416 struct kfd_process *p;
1417 void *owner;
1418 int32_t idx;
1419 int r = 0;
1420
1421 ctx.process = container_of(prange->svms, struct kfd_process, svms);
1422 ctx.prange = prange;
1423 ctx.intr = intr;
1424
1425 if (gpuidx < MAX_GPU_INSTANCE) {
1426 bitmap_zero(ctx.bitmap, MAX_GPU_INSTANCE);
1427 bitmap_set(ctx.bitmap, gpuidx, 1);
1428 } else if (ctx.process->xnack_enabled) {
1429 bitmap_copy(ctx.bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
1430
1431 /* If the range is prefetched to a GPU, or migrated to a GPU by a
1432 * retry fault, and that GPU has the ACCESS attribute for the range,
1433 * create the mapping on that GPU.
1434 */
1435 if (prange->actual_loc) {
1436 gpuidx = kfd_process_gpuidx_from_gpuid(ctx.process,
1437 prange->actual_loc);
1438 if (gpuidx < 0) {
1439 WARN_ONCE(1, "failed get device by id 0x%x\n",
1440 prange->actual_loc);
1441 return -EINVAL;
1442 }
1443 if (test_bit(gpuidx, prange->bitmap_access))
1444 bitmap_set(ctx.bitmap, gpuidx, 1);
1445 }
1446 } else {
1447 bitmap_or(ctx.bitmap, prange->bitmap_access,
1448 prange->bitmap_aip, MAX_GPU_INSTANCE);
1449 }
1450
1451 if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE))
1452 return 0;
1453
1454 if (prange->actual_loc && !prange->ttm_res) {
1455 /* This should never happen. actual_loc gets set by
1456 * svm_migrate_ram_to_vram after allocating a BO.
1457 */
1458 WARN(1, "VRAM BO missing during validation\n");
1459 return -EINVAL;
1460 }
1461
1462 svm_range_reserve_bos(&ctx);
1463
1464 p = container_of(prange->svms, struct kfd_process, svms);
1465 owner = kfd_svm_page_owner(p, find_first_bit(ctx.bitmap,
1466 MAX_GPU_INSTANCE));
1467 for_each_set_bit(idx, ctx.bitmap, MAX_GPU_INSTANCE) {
1468 if (kfd_svm_page_owner(p, idx) != owner) {
1469 owner = NULL;
1470 break;
1471 }
1472 }
1473
1474 start = prange->start << PAGE_SHIFT;
1475 end = (prange->last + 1) << PAGE_SHIFT;
1476 for (addr = start; addr < end && !r; ) {
1477 struct hmm_range *hmm_range;
1478 struct vm_area_struct *vma;
1479 unsigned long next;
1480 unsigned long offset;
1481 unsigned long npages;
1482 bool readonly;
1483
1484 vma = find_vma(mm, addr);
1485 if (!vma || addr < vma->vm_start) {
1486 r = -EFAULT;
1487 goto unreserve_out;
1488 }
1489 readonly = !(vma->vm_flags & VM_WRITE);
1490
1491 next = min(vma->vm_end, end);
1492 npages = (next - addr) >> PAGE_SHIFT;
1493 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
1494 addr, npages, &hmm_range,
1495 readonly, true, owner);
1496 if (r) {
1497 pr_debug("failed %d to get svm range pages\n", r);
1498 goto unreserve_out;
1499 }
1500
1501 offset = (addr - start) >> PAGE_SHIFT;
1502 r = svm_range_dma_map(prange, ctx.bitmap, offset, npages,
1503 hmm_range->hmm_pfns);
1504 if (r) {
1505 pr_debug("failed %d to dma map range\n", r);
1506 goto unreserve_out;
1507 }
1508
1509 svm_range_lock(prange);
1510 if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
1511 pr_debug("hmm update the range, need validate again\n");
1512 r = -EAGAIN;
1513 goto unlock_out;
1514 }
1515 if (!list_empty(&prange->child_list)) {
1516 pr_debug("range split by unmap in parallel, validate again\n");
1517 r = -EAGAIN;
1518 goto unlock_out;
1519 }
1520
1521 r = svm_range_map_to_gpus(prange, offset, npages, readonly,
1522 ctx.bitmap, wait);
1523
1524unlock_out:
1525 svm_range_unlock(prange);
1526
1527 addr = next;
1528 }
1529
1530 if (addr == end)
1531 prange->validated_once = true;
1532
1533unreserve_out:
1534 svm_range_unreserve_bos(&ctx);
1535
1536 if (!r)
1537 prange->validate_timestamp = ktime_to_us(ktime_get());
1538
1539 return r;
1540}
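
/*
 * Illustrative caller sketch only (not built, hypothetical function name):
 * the sequence documented above can race with MMU notifier invalidations, so
 * svm_range_validate_and_map() returns -EAGAIN when the pages changed between
 * hmm_range_fault and taking the notifier lock.  A caller that must succeed
 * typically holds prange->migrate_mutex and simply retries, e.g.:
 */
#if 0
static int svm_range_validate_retry_sketch(struct mm_struct *mm,
					   struct svm_range *prange)
{
	int r;

	do {
		r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
					       false, true);
	} while (r == -EAGAIN);

	return r;
}
#endif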
1541
1542/**
1543 * svm_range_list_lock_and_flush_work - flush pending deferred work
1544 *
1545 * @svms: the svm range list
1546 * @mm: the mm structure
1547 *
1548 * Context: Returns with mmap write lock held, pending deferred work flushed
1549 *
1550 */
1551static void
1552svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
1553 struct mm_struct *mm)
1554{
1555retry_flush_work:
1556 flush_work(&svms->deferred_list_work);
1557 mmap_write_lock(mm);
1558
1559 if (list_empty(&svms->deferred_range_list))
1560 return;
1561 mmap_write_unlock(mm);
1562 pr_debug("retry flush\n");
1563 goto retry_flush_work;
1564}
1565
1566static void svm_range_restore_work(struct work_struct *work)
1567{
1568 struct delayed_work *dwork = to_delayed_work(work);
1569 struct amdkfd_process_info *process_info;
1570 struct svm_range_list *svms;
1571 struct svm_range *prange;
1572 struct kfd_process *p;
1573 struct mm_struct *mm;
1574 int evicted_ranges;
1575 int invalid;
1576 int r;
1577
1578 svms = container_of(dwork, struct svm_range_list, restore_work);
1579 evicted_ranges = atomic_read(&svms->evicted_ranges);
1580 if (!evicted_ranges)
1581 return;
1582
1583 pr_debug("restore svm ranges\n");
1584
1585 /* kfd_process_notifier_release destroys this worker thread. So during
1586 * the lifetime of this thread, kfd_process and mm will be valid.
1587 */
1588 p = container_of(svms, struct kfd_process, svms);
1589 process_info = p->kgd_process_info;
1590 mm = p->mm;
1591 if (!mm)
1592 return;
1593
1594 mutex_lock(&process_info->lock);
1595 svm_range_list_lock_and_flush_work(svms, mm);
1596 mutex_lock(&svms->lock);
1597
1598 evicted_ranges = atomic_read(&svms->evicted_ranges);
1599
1600 list_for_each_entry(prange, &svms->list, list) {
1601 invalid = atomic_read(&prange->invalid);
1602 if (!invalid)
1603 continue;
1604
1605 pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
1606 prange->svms, prange, prange->start, prange->last,
1607 invalid);
1608
1609 /*
1610 * If the range is migrating, wait until the migration is done.
1611 */
1612 mutex_lock(&prange->migrate_mutex);
1613
1614 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
1615 false, true);
1616 if (r)
1617 pr_debug("failed %d to map 0x%lx to gpus\n", r,
1618 prange->start);
1619
1620 mutex_unlock(&prange->migrate_mutex);
1621 if (r)
1622 goto out_reschedule;
1623
1624 if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
1625 goto out_reschedule;
1626 }
1627
1628 if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
1629 evicted_ranges)
1630 goto out_reschedule;
1631
1632 evicted_ranges = 0;
1633
1634 r = kgd2kfd_resume_mm(mm);
1635 if (r) {
1636 /* No recovery from this failure. Probably the CP is
1637 * hanging. No point trying again.
1638 */
1639 pr_debug("failed %d to resume KFD\n", r);
1640 }
1641
1642 pr_debug("restore svm ranges successfully\n");
1643
1644out_reschedule:
1645 mutex_unlock(&svms->lock);
1646 mmap_write_unlock(mm);
1647 mutex_unlock(&process_info->lock);
1648
1649 /* If validation failed, reschedule another attempt */
1650 if (evicted_ranges) {
1651 pr_debug("reschedule to restore svm range\n");
1652 schedule_delayed_work(&svms->restore_work,
1653 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1654 }
1655}
1656
1657/**
1658 * svm_range_evict - evict svm range
1659 *
1660 * Stop all queues of the process to ensure the GPU doesn't access the memory,
1661 * then return to let the CPU evict the buffer and proceed with the CPU page
1662 * table update.
1663 *
1664 * No lock is needed to sync the CPU page table invalidation with GPU execution.
1665 * If an invalidation happens while the restore work is running, the restore work
1666 * restarts to pick up the latest CPU page mapping for the GPU, then starts the queues.
1667 */
1668static int
1669svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
1670 unsigned long start, unsigned long last)
1671{
1672 struct svm_range_list *svms = prange->svms;
1673 struct svm_range *pchild;
1674 struct kfd_process *p;
1675 int r = 0;
1676
1677 p = container_of(svms, struct kfd_process, svms);
1678
1679 pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
1680 svms, prange->start, prange->last, start, last);
1681
1682 if (!p->xnack_enabled) {
1683 int evicted_ranges;
1684
1685 list_for_each_entry(pchild, &prange->child_list, child_list) {
1686 mutex_lock_nested(&pchild->lock, 1);
1687 if (pchild->start <= last && pchild->last >= start) {
1688 pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
1689 pchild->start, pchild->last);
1690 atomic_inc(&pchild->invalid);
1691 }
1692 mutex_unlock(&pchild->lock);
1693 }
1694
1695 if (prange->start <= last && prange->last >= start)
1696 atomic_inc(&prange->invalid);
1697
1698 evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
1699 if (evicted_ranges != 1)
1700 return r;
1701
1702 pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
1703 prange->svms, prange->start, prange->last);
1704
1705 /* First eviction, stop the queues */
1706 r = kgd2kfd_quiesce_mm(mm);
1707 if (r)
1708 pr_debug("failed to quiesce KFD\n");
1709
1710 pr_debug("schedule to restore svm %p ranges\n", svms);
1711 schedule_delayed_work(&svms->restore_work,
1712 msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
1713 } else {
1714 unsigned long s, l;
1715
1716 pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
1717 prange->svms, start, last);
1718 list_for_each_entry(pchild, &prange->child_list, child_list) {
1719 mutex_lock_nested(&pchild->lock, 1);
1720 s = max(start, pchild->start);
1721 l = min(last, pchild->last);
1722 if (l >= s)
1723 svm_range_unmap_from_gpus(pchild, s, l);
1724 mutex_unlock(&pchild->lock);
1725 }
1726 s = max(start, prange->start);
1727 l = min(last, prange->last);
1728 if (l >= s)
1729 svm_range_unmap_from_gpus(prange, s, l);
1730 }
1731
1732 return r;
1733}
1734
1735static struct svm_range *svm_range_clone(struct svm_range *old)
1736{
1737 struct svm_range *new;
1738
1739 new = svm_range_new(old->svms, old->start, old->last);
1740 if (!new)
1741 return NULL;
1742
1743 if (old->svm_bo) {
1744 new->ttm_res = old->ttm_res;
1745 new->offset = old->offset;
1746 new->svm_bo = svm_range_bo_ref(old->svm_bo);
1747 spin_lock(&new->svm_bo->list_lock);
1748 list_add(&new->svm_bo_list, &new->svm_bo->range_list);
1749 spin_unlock(&new->svm_bo->list_lock);
1750 }
1751 new->flags = old->flags;
1752 new->preferred_loc = old->preferred_loc;
1753 new->prefetch_loc = old->prefetch_loc;
1754 new->actual_loc = old->actual_loc;
1755 new->granularity = old->granularity;
1756 bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
1757 bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);
1758
1759 return new;
1760}
1761
1762/**
1763 * svm_range_handle_overlap - split overlap ranges
1764 * @svms: svm range list header
1765 * @new: the range being added, carrying the new attributes
1766 * @start: start address of the added range, in pages
1767 * @last: last address of the added range, in pages
1768 * @update_list: output, ranges whose attributes are updated. For set_attr,
1769 * these will be validated and mapped to GPUs. For unmap, these
1770 * will be removed and unmapped from GPUs
1771 * @insert_list: output, ranges to be inserted into svms, attributes are
1772 * not changed. For set_attr, these will be added into svms.
1773 * @remove_list: output, the ranges to be removed from svms
1774 * @left: the remaining range after overlap. For set_attr, this will be added
1775 * as a new range.
1776 *
1777 * There are 5 overlap cases in total.
1778 *
1779 * This function handles overlap of an address interval with existing
1780 * struct svm_ranges for applying new attributes. This may require
1781 * splitting existing struct svm_ranges. All changes should be applied to
1782 * the range_list and interval tree transactionally. If any split operation
1783 * fails, the entire update fails. Therefore the existing overlapping
1784 * svm_ranges are cloned and the original svm_ranges left unchanged. If the
1785 * transaction succeeds, the modified clones are added and the originals
1786 * freed. Otherwise the clones are removed and the old svm_ranges remain.
1787 *
1788 * Context: The caller must hold svms->lock
1789 */
1790static int
1791svm_range_handle_overlap(struct svm_range_list *svms, struct svm_range *new,
1792 unsigned long start, unsigned long last,
1793 struct list_head *update_list,
1794 struct list_head *insert_list,
1795 struct list_head *remove_list,
1796 unsigned long *left)
1797{
1798 struct interval_tree_node *node;
1799 struct svm_range *prange;
1800 struct svm_range *tmp;
1801 int r = 0;
1802
1803 INIT_LIST_HEAD(update_list);
1804 INIT_LIST_HEAD(insert_list);
1805 INIT_LIST_HEAD(remove_list);
1806
1807 node = interval_tree_iter_first(&svms->objects, start, last);
1808 while (node) {
1809 struct interval_tree_node *next;
1810 struct svm_range *old;
1811 unsigned long next_start;
1812
1813 pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
1814 node->last);
1815
1816 old = container_of(node, struct svm_range, it_node);
1817 next = interval_tree_iter_next(node, start, last);
1818 next_start = min(node->last, last) + 1;
1819
1820 if (node->start < start || node->last > last) {
1821 /* node intersects the updated range, clone+split it */
1822 prange = svm_range_clone(old);
1823 if (!prange) {
1824 r = -ENOMEM;
1825 goto out;
1826 }
1827
1828 list_add(&old->remove_list, remove_list);
1829 list_add(&prange->insert_list, insert_list);
1830
1831 if (node->start < start) {
1832 pr_debug("change old range start\n");
1833 r = svm_range_split_head(prange, new, start,
1834 insert_list);
1835 if (r)
1836 goto out;
1837 }
1838 if (node->last > last) {
1839 pr_debug("change old range last\n");
1840 r = svm_range_split_tail(prange, new, last,
1841 insert_list);
1842 if (r)
1843 goto out;
1844 }
1845 } else {
1846 /* The node is contained within start..last,
1847 * just update it
1848 */
1849 prange = old;
1850 }
1851
1852 if (!svm_range_is_same_attrs(prange, new))
1853 list_add(&prange->update_list, update_list);
1854
1855 /* insert a new node if needed */
1856 if (node->start > start) {
1857 prange = svm_range_new(prange->svms, start,
1858 node->start - 1);
1859 if (!prange) {
1860 r = -ENOMEM;
1861 goto out;
1862 }
1863
1864 list_add(&prange->insert_list, insert_list);
1865 list_add(&prange->update_list, update_list);
1866 }
1867
1868 node = next;
1869 start = next_start;
1870 }
1871
1872 if (left && start <= last)
1873 *left = last - start + 1;
1874
1875out:
1876 if (r)
1877 list_for_each_entry_safe(prange, tmp, insert_list, insert_list)
1878 svm_range_free(prange);
1879
1880 return r;
1881}
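
/*
 * Worked example for svm_range_handle_overlap() (illustrative, hypothetical
 * page numbers): applying new attributes to [0x200 0x4ff] when svms only
 * contains the range [0x100 0x3ff]:
 *   - the existing range is cloned and the original put on @remove_list,
 *   - the clone is split so [0x100 0x1ff] keeps the old attributes while
 *     [0x200 0x3ff] goes on @update_list, both ending up on @insert_list,
 *   - @left is set to 0x100 pages for the uncovered tail [0x400 0x4ff],
 *     which the caller then creates as a brand new range.
 */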
1882
1883static void
1884svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
1885 struct svm_range *prange)
1886{
1887 unsigned long start;
1888 unsigned long last;
1889
1890 start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
1891 last = prange->notifier.interval_tree.last >> PAGE_SHIFT;
1892
1893 if (prange->start == start && prange->last == last)
1894 return;
1895
1896 pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
1897 prange->svms, prange, start, last, prange->start,
1898 prange->last);
1899
1900 if (start != 0 && last != 0) {
1901 interval_tree_remove(&prange->it_node, &prange->svms->objects);
1902 svm_range_remove_notifier(prange);
1903 }
1904 prange->it_node.start = prange->start;
1905 prange->it_node.last = prange->last;
1906
1907 interval_tree_insert(&prange->it_node, &prange->svms->objects);
1908 svm_range_add_notifier_locked(mm, prange);
1909}
1910
1911static void
1912svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange)
1913{
1914 struct mm_struct *mm = prange->work_item.mm;
1915
1916 switch (prange->work_item.op) {
1917 case SVM_OP_NULL:
1918 pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
1919 svms, prange, prange->start, prange->last);
1920 break;
1921 case SVM_OP_UNMAP_RANGE:
1922 pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
1923 svms, prange, prange->start, prange->last);
1924 svm_range_unlink(prange);
1925 svm_range_remove_notifier(prange);
1926 svm_range_free(prange);
1927 break;
1928 case SVM_OP_UPDATE_RANGE_NOTIFIER:
1929 pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
1930 svms, prange, prange->start, prange->last);
1931 svm_range_update_notifier_and_interval_tree(mm, prange);
1932 break;
1933 case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
1934 pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
1935 svms, prange, prange->start, prange->last);
1936 svm_range_update_notifier_and_interval_tree(mm, prange);
1937 /* TODO: implement deferred validation and mapping */
1938 break;
1939 case SVM_OP_ADD_RANGE:
1940 pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
1941 prange->start, prange->last);
1942 svm_range_add_to_svms(prange);
1943 svm_range_add_notifier_locked(mm, prange);
1944 break;
1945 case SVM_OP_ADD_RANGE_AND_MAP:
1946 pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
1947 prange, prange->start, prange->last);
1948 svm_range_add_to_svms(prange);
1949 svm_range_add_notifier_locked(mm, prange);
1950 /* TODO: implement deferred validation and mapping */
1951 break;
1952 default:
1953 WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
1954 prange->work_item.op);
1955 }
1956}
1957
1958static void svm_range_drain_retry_fault(struct svm_range_list *svms)
1959{
1960 struct kfd_process_device *pdd;
1961 struct amdgpu_device *adev;
1962 struct kfd_process *p;
1963 uint32_t i;
1964
1965 p = container_of(svms, struct kfd_process, svms);
1966
1967 for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
1968 pdd = p->pdds[i];
1969 if (!pdd)
1970 continue;
1971
1972 pr_debug("drain retry fault gpu %d svms %p\n", i, svms);
1973 adev = (struct amdgpu_device *)pdd->dev->kgd;
1974
1975 amdgpu_ih_wait_on_checkpoint_process(adev, &adev->irq.ih1);
1976 pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
1977 }
1978}
1979
1980static void svm_range_deferred_list_work(struct work_struct *work)
1981{
1982 struct svm_range_list *svms;
1983 struct svm_range *prange;
1984 struct mm_struct *mm;
1985
1986 svms = container_of(work, struct svm_range_list, deferred_list_work);
1987 pr_debug("enter svms 0x%p\n", svms);
1988
1989 spin_lock(&svms->deferred_list_lock);
1990 while (!list_empty(&svms->deferred_range_list)) {
1991 prange = list_first_entry(&svms->deferred_range_list,
1992 struct svm_range, deferred_list);
1993 spin_unlock(&svms->deferred_list_lock);
1994 pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
1995 prange->start, prange->last, prange->work_item.op);
1996
1997 /* Make sure no stale retry fault comes in after the range is freed */
1998 if (prange->work_item.op == SVM_OP_UNMAP_RANGE)
1999 svm_range_drain_retry_fault(prange->svms);
2000
2001 mm = prange->work_item.mm;
2002 mmap_write_lock(mm);
2003 mutex_lock(&svms->lock);
2004
2005 /* Removal from deferred_list must happen inside the mmap write lock.
2006 * Otherwise svm_range_list_lock_and_flush_work may take the mmap write
2007 * lock, see an empty deferred_list and continue, while the deferred_list
2008 * handling here is still blocked waiting for that same lock.
2009 */
2010 spin_lock(&svms->deferred_list_lock);
2011 list_del_init(&prange->deferred_list);
2012 spin_unlock(&svms->deferred_list_lock);
2013
2014 mutex_lock(&prange->migrate_mutex);
2015 while (!list_empty(&prange->child_list)) {
2016 struct svm_range *pchild;
2017
2018 pchild = list_first_entry(&prange->child_list,
2019 struct svm_range, child_list);
2020 pr_debug("child prange 0x%p op %d\n", pchild,
2021 pchild->work_item.op);
2022 list_del_init(&pchild->child_list);
2023 svm_range_handle_list_op(svms, pchild);
2024 }
2025 mutex_unlock(&prange->migrate_mutex);
2026
2027 svm_range_handle_list_op(svms, prange);
2028 mutex_unlock(&svms->lock);
2029 mmap_write_unlock(mm);
2030
2031 spin_lock(&svms->deferred_list_lock);
2032 }
2033 spin_unlock(&svms->deferred_list_lock);
2034
2035 pr_debug("exit svms 0x%p\n", svms);
2036}
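
/*
 * Illustrative sketch (not part of the driver): the drain pattern used by
 * svm_range_deferred_list_work() above. Entries are peeked at under the
 * spinlock, the lock is dropped while the entry is handled (handling may
 * sleep), and the lock is re-taken to look for more work. The item type and
 * handler below are hypothetical; the driver defers the list_del_init until
 * it also holds the mmap write lock, for the reason documented in the
 * function above.
 */
struct example_deferred_item {
	struct list_head node;
};

static void __maybe_unused
example_drain_deferred_list(spinlock_t *lock, struct list_head *head,
			    void (*handle)(struct example_deferred_item *item))
{
	struct example_deferred_item *item;

	spin_lock(lock);
	while (!list_empty(head)) {
		item = list_first_entry(head, struct example_deferred_item,
					node);
		list_del_init(&item->node);
		spin_unlock(lock);

		handle(item);	/* may sleep; the spinlock is not held here */

		spin_lock(lock);
	}
	spin_unlock(lock);
}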
2037
2038void
2039svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
2040 struct mm_struct *mm, enum svm_work_list_ops op)
2041{
2042 spin_lock(&svms->deferred_list_lock);
2043 /* if prange is on the deferred list */
2044 if (!list_empty(&prange->deferred_list)) {
2045 pr_debug("update exist prange 0x%p work op %d\n", prange, op);
2046 WARN_ONCE(prange->work_item.mm != mm, "mm mismatch\n");
2047 if (op != SVM_OP_NULL &&
2048 prange->work_item.op != SVM_OP_UNMAP_RANGE)
2049 prange->work_item.op = op;
2050 } else {
2051 prange->work_item.op = op;
2052 prange->work_item.mm = mm;
2053 list_add_tail(&prange->deferred_list,
2054 &prange->svms->deferred_range_list);
2055 pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
2056 prange, prange->start, prange->last, op);
2057 }
2058 spin_unlock(&svms->deferred_list_lock);
2059}
2060
2061void schedule_deferred_list_work(struct svm_range_list *svms)
2062{
2063 spin_lock(&svms->deferred_list_lock);
2064 if (!list_empty(&svms->deferred_range_list))
2065 schedule_work(&svms->deferred_list_work);
2066 spin_unlock(&svms->deferred_list_lock);
2067}
2068
2069static void
2070svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
2071 struct svm_range *prange, unsigned long start,
2072 unsigned long last)
2073{
2074 struct svm_range *head;
2075 struct svm_range *tail;
2076
2077 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2078 pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
2079 prange->start, prange->last);
2080 return;
2081 }
2082 if (start > prange->last || last < prange->start)
2083 return;
2084
2085 head = tail = prange;
2086 if (start > prange->start)
2087 svm_range_split(prange, prange->start, start - 1, &tail);
2088 if (last < tail->last)
2089 svm_range_split(tail, last + 1, tail->last, &head);
2090
2091 if (head != prange && tail != prange) {
2092 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2093 svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
2094 } else if (tail != prange) {
2095 svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
2096 } else if (head != prange) {
2097 svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
2098 } else if (parent != prange) {
2099 prange->work_item.op = SVM_OP_UNMAP_RANGE;
2100 }
2101}
2102
2103static void
2104svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
2105 unsigned long start, unsigned long last)
2106{
2107 struct svm_range_list *svms;
2108 struct svm_range *pchild;
2109 struct kfd_process *p;
2110 unsigned long s, l;
2111 bool unmap_parent;
2112
2113 p = kfd_lookup_process_by_mm(mm);
2114 if (!p)
2115 return;
2116 svms = &p->svms;
2117
2118 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
2119 prange, prange->start, prange->last, start, last);
2120
2121 unmap_parent = start <= prange->start && last >= prange->last;
2122
2123 list_for_each_entry(pchild, &prange->child_list, child_list) {
2124 mutex_lock_nested(&pchild->lock, 1);
2125 s = max(start, pchild->start);
2126 l = min(last, pchild->last);
2127 if (l >= s)
2128 svm_range_unmap_from_gpus(pchild, s, l);
2129 svm_range_unmap_split(mm, prange, pchild, start, last);
2130 mutex_unlock(&pchild->lock);
2131 }
2132 s = max(start, prange->start);
2133 l = min(last, prange->last);
2134 if (l >= s)
2135 svm_range_unmap_from_gpus(prange, s, l);
2136 svm_range_unmap_split(mm, prange, prange, start, last);
2137
2138 if (unmap_parent)
2139 svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
2140 else
2141 svm_range_add_list_work(svms, prange, mm,
2142 SVM_OP_UPDATE_RANGE_NOTIFIER);
2143 schedule_deferred_list_work(svms);
2144
2145 kfd_unref_process(p);
2146}
2147
2148/**
2149 * svm_range_cpu_invalidate_pagetables - interval notifier callback
2150 *
2151 * If the event is MMU_NOTIFY_UNMAP, this comes from a CPU unmap of the range;
2152 * otherwise it comes from migration or a CPU page invalidation callback.
2153 *
2154 * For an unmap event, unmap the range from the GPUs, remove the prange from svms
2155 * in a delayed work thread, and split the prange if only part of it is unmapped.
2156 *
2157 * For an invalidation event, if GPU retry faults are not enabled, evict the
2158 * queues, then schedule svm_range_restore_work to update the GPU mapping and
2159 * resume the queues. If GPU retry faults are enabled, unmap the svm range from
2160 * the GPU; the retry fault will update the GPU mapping to recover.
2161 *
2162 * Context: mmap lock, notifier_invalidate_start lock are held
2163 * for invalidate event, prange lock is held if this is from migration
2164 */
2165static bool
2166svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
2167 const struct mmu_notifier_range *range,
2168 unsigned long cur_seq)
2169{
2170 struct svm_range *prange;
2171 unsigned long start;
2172 unsigned long last;
2173
2174 if (range->event == MMU_NOTIFY_RELEASE)
2175 return true;
2176
2177 start = mni->interval_tree.start;
2178 last = mni->interval_tree.last;
2179 start = (start > range->start ? start : range->start) >> PAGE_SHIFT;
2180 last = (last < (range->end - 1) ? last : range->end - 1) >> PAGE_SHIFT;
2181 pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
2182 start, last, range->start >> PAGE_SHIFT,
2183 (range->end - 1) >> PAGE_SHIFT,
2184 mni->interval_tree.start >> PAGE_SHIFT,
2185 mni->interval_tree.last >> PAGE_SHIFT, range->event);
2186
2187 prange = container_of(mni, struct svm_range, notifier);
2188
2189 svm_range_lock(prange);
2190 mmu_interval_set_seq(mni, cur_seq);
2191
2192 switch (range->event) {
2193 case MMU_NOTIFY_UNMAP:
2194 svm_range_unmap_from_cpu(mni->mm, prange, start, last);
2195 break;
2196 default:
2197 svm_range_evict(prange, mni->mm, start, last);
2198 break;
2199 }
2200
2201 svm_range_unlock(prange);
2202
2203 return true;
2204}
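
/*
 * Illustrative sketch (not part of the driver): the read side that pairs with
 * the invalidate callback above. A consumer samples the notifier sequence,
 * collects pages outside any lock, then rechecks the sequence under the
 * prange lock and retries if an invalidation raced with it. collect_pages()
 * is hypothetical; the driver's validate path uses
 * amdgpu_hmm_range_get_pages() for this step.
 */
static int __maybe_unused
example_read_with_retry(struct svm_range *prange,
			int (*collect_pages)(struct svm_range *prange))
{
	unsigned long seq;
	int r;

retry:
	seq = mmu_interval_read_begin(&prange->notifier);

	r = collect_pages(prange);	/* may fault and sleep */
	if (r)
		return r;

	svm_range_lock(prange);
	if (mmu_interval_read_retry(&prange->notifier, seq)) {
		/* invalidated while collecting, start over */
		svm_range_unlock(prange);
		goto retry;
	}
	/* pages are known to be stable until svm_range_unlock() */
	svm_range_unlock(prange);

	return 0;
}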
2205
2206/**
2207 * svm_range_from_addr - find svm range from fault address
2208 * @svms: svm range list header
2209 * @addr: address to search range interval tree, in pages
2210 * @parent: parent range if range is on child list
2211 *
2212 * Context: The caller must hold svms->lock
2213 *
2214 * Return: the svm_range found or NULL
2215 */
2216struct svm_range *
2217svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
2218 struct svm_range **parent)
2219{
2220 struct interval_tree_node *node;
2221 struct svm_range *prange;
2222 struct svm_range *pchild;
2223
2224 node = interval_tree_iter_first(&svms->objects, addr, addr);
2225 if (!node)
2226 return NULL;
2227
2228 prange = container_of(node, struct svm_range, it_node);
2229 pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n",
2230 addr, prange->start, prange->last, node->start, node->last);
2231
2232 if (addr >= prange->start && addr <= prange->last) {
2233 if (parent)
2234 *parent = prange;
2235 return prange;
2236 }
2237 list_for_each_entry(pchild, &prange->child_list, child_list)
2238 if (addr >= pchild->start && addr <= pchild->last) {
2239 pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n",
2240 addr, pchild->start, pchild->last);
2241 if (parent)
2242 *parent = prange;
2243 return pchild;
2244 }
2245
2246 return NULL;
2247}
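
/*
 * Illustrative sketch (not part of the driver): looking up the range that
 * covers a faulting page with svm_range_from_addr(). addr is in pages and
 * svms->lock must be held, as documented above; the helper name is
 * hypothetical.
 */
static void __maybe_unused
example_lookup_fault_range(struct svm_range_list *svms, unsigned long addr)
{
	struct svm_range *parent = NULL;
	struct svm_range *prange;

	mutex_lock(&svms->lock);
	prange = svm_range_from_addr(svms, addr, &parent);
	if (prange)
		pr_debug("page 0x%lx hit prange [0x%lx 0x%lx] parent 0x%p\n",
			 addr, prange->start, prange->last, parent);
	else
		pr_debug("page 0x%lx is not registered\n", addr);
	mutex_unlock(&svms->lock);
}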
2248
2249/* svm_range_best_restore_location - decide the best fault restore location
2250 * @prange: svm range structure
2251 * @adev: the GPU on which vm fault happened
2252 *
2253 * This is only called when xnack is on, to decide the best location to restore
2254 * the range mapping after a GPU vm fault. The caller uses the best location to
2255 * migrate the range if its actual loc is not the best location, then updates
2256 * the GPU page table mapping to the best location.
2257 *
2258 * If the vm fault GPU is the range preferred loc, best_loc is the preferred loc.
2259 * If the vm fault GPU idx is in the range ACCESSIBLE bitmap, best_loc is that GPU.
2260 * If the vm fault GPU idx is in the range ACCESSIBLE_IN_PLACE bitmap, then:
2261 * if the range actual loc is CPU, best_loc is CPU;
2262 * if the vm fault GPU is in the same XGMI hive as the range actual loc GPU,
2263 * best_loc is the range actual loc.
2264 * Otherwise the GPU has no access and best_loc is -1.
2265 *
2266 * Return:
2267 * -1 if the vm fault GPU has no access to the range
2268 * 0 for CPU, or a GPU id
2269 */
2270static int32_t
2271svm_range_best_restore_location(struct svm_range *prange,
2272 struct amdgpu_device *adev,
2273 int32_t *gpuidx)
2274{
2275 struct amdgpu_device *bo_adev;
2276 struct kfd_process *p;
2277 uint32_t gpuid;
2278 int r;
2279
2280 p = container_of(prange->svms, struct kfd_process, svms);
2281
2282 r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, gpuidx);
2283 if (r < 0) {
2284 pr_debug("failed to get gpuid from kgd\n");
2285 return -1;
2286 }
2287
2288 if (prange->preferred_loc == gpuid)
2289 return prange->preferred_loc;
2290
2291 if (test_bit(*gpuidx, prange->bitmap_access))
2292 return gpuid;
2293
2294 if (test_bit(*gpuidx, prange->bitmap_aip)) {
2295 if (!prange->actual_loc)
2296 return 0;
2297
2298 bo_adev = svm_range_get_adev_by_id(prange, prange->actual_loc);
2299 if (amdgpu_xgmi_same_hive(adev, bo_adev))
2300 return prange->actual_loc;
2301 else
2302 return 0;
2303 }
2304
2305 return -1;
2306}
2307static int
2308svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
2309 unsigned long *start, unsigned long *last)
2310{
2311 struct vm_area_struct *vma;
2312 struct interval_tree_node *node;
2313 unsigned long start_limit, end_limit;
2314
2315 vma = find_vma(p->mm, addr << PAGE_SHIFT);
2316 if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
2317 pr_debug("VMA does not exist in address [0x%llx]\n", addr);
2318 return -EFAULT;
2319 }
2320 start_limit = max(vma->vm_start >> PAGE_SHIFT,
2321 (unsigned long)ALIGN_DOWN(addr, 2UL << 8));
2322 end_limit = min(vma->vm_end >> PAGE_SHIFT,
2323 (unsigned long)ALIGN(addr + 1, 2UL << 8));
2324 /* First range that starts after the fault address */
2325 node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX);
2326 if (node) {
2327 end_limit = min(end_limit, node->start);
2328 /* Last range that ends before the fault address */
2329 node = container_of(rb_prev(&node->rb),
2330 struct interval_tree_node, rb);
2331 } else {
2332 /* Last range must end before addr because
2333 * there was no range after addr
2334 */
2335 node = container_of(rb_last(&p->svms.objects.rb_root),
2336 struct interval_tree_node, rb);
2337 }
2338 if (node) {
2339 if (node->last >= addr) {
2340 WARN(1, "Overlap with prev node and page fault addr\n");
2341 return -EFAULT;
2342 }
2343 start_limit = max(start_limit, node->last + 1);
2344 }
2345
2346 *start = start_limit;
2347 *last = end_limit - 1;
2348
2349 pr_debug("vma start: 0x%lx start: 0x%lx vma end: 0x%lx last: 0x%lx\n",
2350 vma->vm_start >> PAGE_SHIFT, *start,
2351 vma->vm_end >> PAGE_SHIFT, *last);
2352
2353 return 0;
2354
2355}
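
/*
 * Illustrative sketch (not part of the driver): the alignment arithmetic used
 * above. 2UL << 8 is 512 pages, i.e. 2MB with 4KB pages, so an unregistered
 * fault at page addr is grown to at most one 512-page granule, then clamped
 * to the VMA and to neighbouring registered ranges. The numbers below are
 * made up.
 */
static void __maybe_unused example_boundary_math(void)
{
	unsigned long addr = 0x12345;			/* faulting page */
	unsigned long lo = ALIGN_DOWN(addr, 2UL << 8);	/* 0x12200 */
	unsigned long hi = ALIGN(addr + 1, 2UL << 8);	/* 0x12400 */

	pr_debug("fault page 0x%lx -> granule [0x%lx 0x%lx]\n",
		 addr, lo, hi - 1);
}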
2356static struct
2357svm_range *svm_range_create_unregistered_range(struct amdgpu_device *adev,
2358 struct kfd_process *p,
2359 struct mm_struct *mm,
2360 int64_t addr)
2361{
2362 struct svm_range *prange = NULL;
2363 unsigned long start, last;
2364 uint32_t gpuid, gpuidx;
2365
2366 if (svm_range_get_range_boundaries(p, addr, &start, &last))
2367 return NULL;
2368
2369 prange = svm_range_new(&p->svms, start, last);
2370 if (!prange) {
2371 pr_debug("Failed to create prange in address [0x%llx]\n", addr);
2372 return NULL;
2373 }
2374 if (kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx)) {
2375 pr_debug("failed to get gpuid from kgd\n");
2376 svm_range_free(prange);
2377 return NULL;
2378 }
2379
2380 svm_range_add_to_svms(prange);
2381 svm_range_add_notifier_locked(mm, prange);
2382
2383 return prange;
2384}
2385
2386/* svm_range_skip_recover - decide if prange can be recovered
2387 * @prange: svm range structure
2388 *
2389 * The GPU vm retry fault handler skips recovering the range in these cases:
2390 * 1. prange is on the deferred list to be removed after unmap; this is a stale
2391 * fault, and the deferred list work drains it before freeing the prange.
2392 * 2. prange is on the deferred list to add the interval notifier after split, or
2393 * 3. prange is a child range split from a parent prange; recover it later, after
2394 * the interval notifier is added.
2395 *
2396 * Return: true to skip recover, false to recover
2397 */
2398static bool svm_range_skip_recover(struct svm_range *prange)
2399{
2400 struct svm_range_list *svms = prange->svms;
2401
2402 spin_lock(&svms->deferred_list_lock);
2403 if (list_empty(&prange->deferred_list) &&
2404 list_empty(&prange->child_list)) {
2405 spin_unlock(&svms->deferred_list_lock);
2406 return false;
2407 }
2408 spin_unlock(&svms->deferred_list_lock);
2409
2410 if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
2411 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
2412 svms, prange, prange->start, prange->last);
2413 return true;
2414 }
2415 if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
2416 prange->work_item.op == SVM_OP_ADD_RANGE) {
2417 pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
2418 svms, prange, prange->start, prange->last);
2419 return true;
2420 }
2421 return false;
2422}
2423
2424static void
2425svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
2426 int32_t gpuidx)
2427{
2428 struct kfd_process_device *pdd;
2429
2430 /* fault is on a different page of the same range,
2431 * or recovery of the fault is skipped until later,
2432 * or the fault is on an invalid virtual address
2433 */
2434 if (gpuidx == MAX_GPU_INSTANCE) {
2435 uint32_t gpuid;
2436 int r;
2437
2438 r = kfd_process_gpuid_from_kgd(p, adev, &gpuid, &gpuidx);
2439 if (r < 0)
2440 return;
2441 }
2442
2443 /* fault is recovered,
2444 * or fault cannot be recovered because the GPU has no access to the range
2445 */
2446 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2447 if (pdd)
2448 WRITE_ONCE(pdd->faults, pdd->faults + 1);
2449}
2450
2451int
2452svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
2453 uint64_t addr)
2454{
2455 struct mm_struct *mm = NULL;
2456 struct svm_range_list *svms;
2457 struct svm_range *prange;
2458 struct kfd_process *p;
2459 uint64_t timestamp;
2460 int32_t best_loc;
2461 int32_t gpuidx = MAX_GPU_INSTANCE;
2462 bool write_locked = false;
2463 int r = 0;
2464
2465 if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
2466 pr_debug("device does not support SVM\n");
2467 return -EFAULT;
2468 }
2469
2470 p = kfd_lookup_process_by_pasid(pasid);
2471 if (!p) {
2472 pr_debug("kfd process not founded pasid 0x%x\n", pasid);
2473 return -ESRCH;
2474 }
2475 if (!p->xnack_enabled) {
2476 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
2477 r = -EFAULT;
2478 goto out;
2479 }
2480 svms = &p->svms;
2481
2482 pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
2483
2484 mm = get_task_mm(p->lead_thread);
2485 if (!mm) {
2486 pr_debug("svms 0x%p failed to get mm\n", svms);
2487 r = -ESRCH;
2488 goto out;
2489 }
2490
2491 mmap_read_lock(mm);
2492retry_write_locked:
2493 mutex_lock(&svms->lock);
2494 prange = svm_range_from_addr(svms, addr, NULL);
2495 if (!prange) {
2496 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
2497 svms, addr);
2498 if (!write_locked) {
2499 /* Need the write lock to create new range with MMU notifier.
2500 * Also flush pending deferred work to make sure the interval
2501 * tree is up to date before we add a new range
2502 */
2503 mutex_unlock(&svms->lock);
2504 mmap_read_unlock(mm);
2505 mmap_write_lock(mm);
2506 write_locked = true;
2507 goto retry_write_locked;
2508 }
2509 prange = svm_range_create_unregistered_range(adev, p, mm, addr);
2510 if (!prange) {
2511 pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
2512 svms, addr);
2513 mmap_write_downgrade(mm);
2514 r = -EFAULT;
2515 goto out_unlock_svms;
2516 }
2517 }
2518 if (write_locked)
2519 mmap_write_downgrade(mm);
2520
2521 mutex_lock(&prange->migrate_mutex);
2522
2523 if (svm_range_skip_recover(prange)) {
2524 amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2525 goto out_unlock_range;
2526 }
2527
2528 timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
2529 /* skip duplicate vm fault on different pages of same range */
2530 if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) {
2531 pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
2532 svms, prange->start, prange->last);
2533 goto out_unlock_range;
2534 }
2535
2536 best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
2537 if (best_loc == -1) {
2538 pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
2539 svms, prange->start, prange->last);
2540 r = -EACCES;
2541 goto out_unlock_range;
2542 }
2543
2544 pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
2545 svms, prange->start, prange->last, best_loc,
2546 prange->actual_loc);
2547
2548 if (prange->actual_loc != best_loc) {
2549 if (best_loc) {
2550 r = svm_migrate_to_vram(prange, best_loc, mm);
2551 if (r) {
2552 pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
2553 r, addr);
2554 /* Fallback to system memory if migration to
2555 * VRAM failed
2556 */
2557 if (prange->actual_loc)
2558 r = svm_migrate_vram_to_ram(prange, mm);
2559 else
2560 r = 0;
2561 }
2562 } else {
2563 r = svm_migrate_vram_to_ram(prange, mm);
2564 }
2565 if (r) {
2566 pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
2567 r, svms, prange->start, prange->last);
2568 goto out_unlock_range;
2569 }
2570 }
2571
2572 r = svm_range_validate_and_map(mm, prange, gpuidx, false, false);
2573 if (r)
2574 pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
2575 r, svms, prange->start, prange->last);
2576
2577out_unlock_range:
2578 mutex_unlock(&prange->migrate_mutex);
2579out_unlock_svms:
2580 mutex_unlock(&svms->lock);
2581 mmap_read_unlock(mm);
2582
2583 svm_range_count_fault(adev, p, gpuidx);
2584
2585 mmput(mm);
2586out:
2587 kfd_unref_process(p);
2588
2589 if (r == -EAGAIN) {
2590 pr_debug("recover vm fault later\n");
2591 amdgpu_gmc_filter_faults_remove(adev, addr, pasid);
2592 r = 0;
2593 }
2594 return r;
2595}
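
/*
 * Illustrative sketch (not part of the driver): the mmap lock dance used in
 * svm_range_restore_pages() above. The common path only needs the read lock;
 * if a new range must be created, the read lock is dropped, the write lock
 * taken (and any lookup redone, since the address space may have changed in
 * between), then the lock is downgraded again. need_write() and
 * do_write_work() are hypothetical.
 */
static void __maybe_unused
example_mmap_lock_upgrade(struct mm_struct *mm,
			  bool (*need_write)(struct mm_struct *mm),
			  void (*do_write_work)(struct mm_struct *mm))
{
	bool write_locked = false;

	mmap_read_lock(mm);
retry:
	if (need_write(mm) && !write_locked) {
		/* cannot upgrade in place: drop read, take write, recheck */
		mmap_read_unlock(mm);
		mmap_write_lock(mm);
		write_locked = true;
		goto retry;
	}
	if (write_locked) {
		do_write_work(mm);
		/* keep the lock, but let readers in again */
		mmap_write_downgrade(mm);
	}

	/* ... read-locked work ... */

	mmap_read_unlock(mm);
}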
2596
2597void svm_range_list_fini(struct kfd_process *p)
2598{
2599 struct svm_range *prange;
2600 struct svm_range *next;
2601
2602 pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
2603
2604 /* Ensure list work is finished before process is destroyed */
2605 flush_work(&p->svms.deferred_list_work);
2606
2607 list_for_each_entry_safe(prange, next, &p->svms.list, list) {
2608 svm_range_unlink(prange);
2609 svm_range_remove_notifier(prange);
2610 svm_range_free(prange);
2611 }
2612
2613 mutex_destroy(&p->svms.lock);
2614
2615 pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
2616}
2617
2618int svm_range_list_init(struct kfd_process *p)
2619{
2620 struct svm_range_list *svms = &p->svms;
2621 int i;
2622
2623 svms->objects = RB_ROOT_CACHED;
2624 mutex_init(&svms->lock);
2625 INIT_LIST_HEAD(&svms->list);
2626 atomic_set(&svms->evicted_ranges, 0);
2627 INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
2628 INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
2629 INIT_LIST_HEAD(&svms->deferred_range_list);
2630 spin_lock_init(&svms->deferred_list_lock);
2631
2632 for (i = 0; i < p->n_pdds; i++)
2633 if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev))
2634 bitmap_set(svms->bitmap_supported, i, 1);
2635
2636 return 0;
2637}
2638
2639/**
2640 * svm_range_is_valid - check if virtual address range is valid
2641 * @mm: current process mm_struct
2642 * @start: range start address, in pages
2643 * @size: range size, in pages
2644 *
2645 * Valid virtual address range means it belongs to one or more VMAs
2646 *
2647 * Context: Process context
2648 *
2649 * Return:
2650 * true - valid svm range
2651 * false - invalid svm range
2652 */
2653static bool
2654svm_range_is_valid(struct mm_struct *mm, uint64_t start, uint64_t size)
2655{
2656 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
2657 struct vm_area_struct *vma;
2658 unsigned long end;
2659
2660 start <<= PAGE_SHIFT;
2661 end = start + (size << PAGE_SHIFT);
2662
2663 do {
2664 vma = find_vma(mm, start);
2665 if (!vma || start < vma->vm_start ||
2666 (vma->vm_flags & device_vma))
2667 return false;
2668 start = min(end, vma->vm_end);
2669 } while (start < end);
2670
2671 return true;
2672}
2673
2674/**
2675 * svm_range_add - add svm range and handle overlap
2676 * @p: the range add to this process svms
2677 * @start: range start address, in pages
2678 * @size: range size, in pages
2679 * @nattr: number of attributes
2680 * @attrs: array of attributes
2681 * @update_list: output, the ranges need validate and update GPU mapping
2682 * @insert_list: output, the ranges need insert to svms
2683 * @remove_list: output, the ranges are replaced and need remove from svms
2684 *
2685 * Check if the virtual address range overlaps the registered ranges; split the
2686 * overlapped ranges and copy and adjust the page addresses and vram nodes in the
2687 * old and new ranges.
2688 *
2689 * Context: Process context, caller must hold svms->lock
2690 *
2691 * Return:
2692 * 0 - OK, otherwise error code
2693 */
2694static int
2695svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
2696 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
2697 struct list_head *update_list, struct list_head *insert_list,
2698 struct list_head *remove_list)
2699{
2700 uint64_t last = start + size - 1UL;
2701 struct svm_range_list *svms;
2702 struct svm_range new = {0};
2703 struct svm_range *prange;
2704 unsigned long left = 0;
2705 int r = 0;
2706
2707 pr_debug("svms 0x%p [0x%llx 0x%llx]\n", &p->svms, start, last);
2708
2709 svm_range_apply_attrs(p, &new, nattr, attrs);
2710
2711 svms = &p->svms;
2712
2713 r = svm_range_handle_overlap(svms, &new, start, last, update_list,
2714 insert_list, remove_list, &left);
2715 if (r)
2716 return r;
2717
2718 if (left) {
2719 prange = svm_range_new(svms, last - left + 1, last);
 if (!prange)
 return -ENOMEM;
2720 list_add(&prange->insert_list, insert_list);
2721 list_add(&prange->update_list, update_list);
2722 }
2723
2724 return 0;
2725}
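
/*
 * Illustrative sketch (not part of the driver): how the lists produced by
 * svm_range_add() are meant to be consumed; svm_range_set_attr() below is the
 * real consumer. The clones on insert_list replace the originals on
 * remove_list only after every split has succeeded, which is what makes the
 * update transactional. The caller must hold svms->lock and the mmap write
 * lock; the helper name is hypothetical.
 */
static void __maybe_unused
example_commit_range_update(struct mm_struct *mm,
			    struct list_head *insert_list,
			    struct list_head *remove_list)
{
	struct svm_range *prange;
	struct svm_range *next;

	list_for_each_entry_safe(prange, next, insert_list, insert_list) {
		svm_range_add_to_svms(prange);
		svm_range_add_notifier_locked(mm, prange);
	}

	list_for_each_entry_safe(prange, next, remove_list, remove_list) {
		svm_range_unlink(prange);
		svm_range_remove_notifier(prange);
		svm_range_free(prange);
	}
}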
2726
2727/* svm_range_best_prefetch_location - decide the best prefetch location
2728 * @prange: svm range structure
2729 *
2730 * For xnack off:
2731 * If the range maps to a single GPU, the best actual location is the prefetch
2732 * loc, which can be CPU or GPU.
2733 *
2734 * If the range maps to multiple GPUs, the best actual location can be the
2735 * prefetch_loc GPU only if the mGPU connection is XGMI within the same hive.
2736 * Over PCIe the best actual location is always CPU, because a GPU cannot access
2737 * other GPUs' vram, assuming PCIe small BAR (large BAR support is not upstream).
2738 *
2739 * For xnack on:
2740 * The best actual location is the prefetch location. Within an XGMI hive the
2741 * range maps to multiple GPUs; otherwise it maps only to the actual location
2742 * GPU, and vm faults from other GPUs' accesses trigger migration.
2743 *
2744 * Context: Process context
2745 *
2746 * Return:
2747 * 0 for CPU, or a GPU id
2748 */
2749static uint32_t
2750svm_range_best_prefetch_location(struct svm_range *prange)
2751{
2752 DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
2753 uint32_t best_loc = prange->prefetch_loc;
2754 struct kfd_process_device *pdd;
2755 struct amdgpu_device *bo_adev;
2756 struct amdgpu_device *adev;
2757 struct kfd_process *p;
2758 uint32_t gpuidx;
2759
2760 p = container_of(prange->svms, struct kfd_process, svms);
2761
2762 /* xnack on */
2763 if (p->xnack_enabled)
2764 goto out;
2765
2766 /* xnack off */
2767 if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
2768 goto out;
2769
2770 bo_adev = svm_range_get_adev_by_id(prange, best_loc);
2771 if (!bo_adev) {
2772 WARN_ONCE(1, "failed to get device by id 0x%x\n", best_loc);
2773 best_loc = 0;
2774 goto out;
2775 }
2776 bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
2777 MAX_GPU_INSTANCE);
2778
2779 for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
2780 pdd = kfd_process_device_from_gpuidx(p, gpuidx);
2781 if (!pdd) {
2782 pr_debug("failed to get device by idx 0x%x\n", gpuidx);
2783 continue;
2784 }
2785 adev = (struct amdgpu_device *)pdd->dev->kgd;
2786
2787 if (adev == bo_adev)
2788 continue;
2789
2790 if (!amdgpu_xgmi_same_hive(adev, bo_adev)) {
2791 best_loc = 0;
2792 break;
2793 }
2794 }
2795
2796out:
2797 pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
2798 p->xnack_enabled, &p->svms, prange->start, prange->last,
2799 best_loc);
2800
2801 return best_loc;
2802}
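
/*
 * Illustrative sketch (not part of the driver): walking every GPU that can
 * access a range, using the same bitmap_or + for_each_set_bit pattern as the
 * function above. The visit() callback and helper name are hypothetical.
 */
static void __maybe_unused
example_for_each_accessible_gpu(struct svm_range *prange,
				void (*visit)(unsigned int gpuidx))
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	unsigned int gpuidx;

	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
		  MAX_GPU_INSTANCE);
	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE)
		visit(gpuidx);
}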
2803
2804/* FIXME: This is a workaround for page locking bug when some pages are
2805 * invalid during migration to VRAM
2806 */
2807void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm,
2808 void *owner)
2809{
2810 struct hmm_range *hmm_range;
2811 int r;
2812
2813 if (prange->validated_once)
2814 return;
2815
2816 r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL,
2817 prange->start << PAGE_SHIFT,
2818 prange->npages, &hmm_range,
2819 false, true, owner);
2820 if (!r) {
2821 amdgpu_hmm_range_get_pages_done(hmm_range);
2822 prange->validated_once = true;
2823 }
2824}
2825
2826/* svm_range_trigger_migration - start page migration if prefetch loc changed
2827 * @mm: current process mm_struct
2828 * @prange: svm range structure
2829 * @migrated: output, true if migration is triggered
2830 *
2831 * If the range prefetch_loc is a GPU and the actual loc is cpu 0, migrate the
2832 * range from ram to vram.
2833 * If the range prefetch_loc is cpu 0 and the actual loc is a GPU, migrate the
2834 * range from vram to ram.
2835 *
2836 * If GPU vm fault retry is not enabled, migration interacts with the MMU
2837 * notifier and the restore work:
2838 * 1. migrate_vma_setup invalidates pages; the MMU notifier callback
2839 * svm_range_evict stops all queues and schedules the restore work
2840 * 2. svm_range_restore_work waits for the migration to finish because
2841 * a. svm_range_validate_vram takes prange->migrate_mutex
2842 * b. svm_range_validate_ram HMM get pages waits for the CPU fault handler to return
2843 * 3. the restore work updates the GPU mappings and resumes all queues.
2844 *
2845 * Context: Process context
2846 *
2847 * Return:
2848 * 0 - OK, otherwise - error code of migration
2849 */
2850static int
2851svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
2852 bool *migrated)
2853{
2854 uint32_t best_loc;
2855 int r = 0;
2856
2857 *migrated = false;
2858 best_loc = svm_range_best_prefetch_location(prange);
2859
2860 if (best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
2861 best_loc == prange->actual_loc)
2862 return 0;
2863
2864 if (!best_loc) {
2865 r = svm_migrate_vram_to_ram(prange, mm);
2866 *migrated = !r;
2867 return r;
2868 }
2869
2870 r = svm_migrate_to_vram(prange, best_loc, mm);
2871 *migrated = !r;
2872
2873 return r;
2874}
2875
2876int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
2877{
2878 if (!fence)
2879 return -EINVAL;
2880
2881 if (dma_fence_is_signaled(&fence->base))
2882 return 0;
2883
2884 if (fence->svm_bo) {
2885 WRITE_ONCE(fence->svm_bo->evicting, 1);
2886 schedule_work(&fence->svm_bo->eviction_work);
2887 }
2888
2889 return 0;
2890}
2891
2892static void svm_range_evict_svm_bo_worker(struct work_struct *work)
2893{
2894 struct svm_range_bo *svm_bo;
2895 struct kfd_process *p;
2896 struct mm_struct *mm;
2897
2898 svm_bo = container_of(work, struct svm_range_bo, eviction_work);
2899 if (!svm_bo_ref_unless_zero(svm_bo))
2900 return; /* svm_bo was freed while eviction was pending */
2901
2902 /* svm_range_bo_release destroys this worker thread. So during
2903 * the lifetime of this thread, kfd_process and mm will be valid.
2904 */
2905 p = container_of(svm_bo->svms, struct kfd_process, svms);
2906 mm = p->mm;
2907 if (!mm)
2908 return;
2909
2910 mmap_read_lock(mm);
2911 spin_lock(&svm_bo->list_lock);
2912 while (!list_empty(&svm_bo->range_list)) {
2913 struct svm_range *prange =
2914 list_first_entry(&svm_bo->range_list,
2915 struct svm_range, svm_bo_list);
2916 list_del_init(&prange->svm_bo_list);
2917 spin_unlock(&svm_bo->list_lock);
2918
2919 pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
2920 prange->start, prange->last);
2921
2922 mutex_lock(&prange->migrate_mutex);
2923 svm_migrate_vram_to_ram(prange, svm_bo->eviction_fence->mm);
2924
2925 mutex_lock(&prange->lock);
2926 prange->svm_bo = NULL;
2927 mutex_unlock(&prange->lock);
2928
2929 mutex_unlock(&prange->migrate_mutex);
2930
2931 spin_lock(&svm_bo->list_lock);
2932 }
2933 spin_unlock(&svm_bo->list_lock);
2934 mmap_read_unlock(mm);
2935
2936 dma_fence_signal(&svm_bo->eviction_fence->base);
2937 /* This is the last reference to svm_bo, after svm_range_vram_node_free
2938 * has been called in svm_migrate_vram_to_ram
2939 */
2940 WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
2941 svm_range_bo_unref(svm_bo);
2942}
2943
2944static int
2945svm_range_set_attr(struct kfd_process *p, uint64_t start, uint64_t size,
2946 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
2947{
2948 struct amdkfd_process_info *process_info = p->kgd_process_info;
2949 struct mm_struct *mm = current->mm;
2950 struct list_head update_list;
2951 struct list_head insert_list;
2952 struct list_head remove_list;
2953 struct svm_range_list *svms;
2954 struct svm_range *prange;
2955 struct svm_range *next;
2956 int r = 0;
2957
2958 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
2959 p->pasid, &p->svms, start, start + size - 1, size);
2960
2961 r = svm_range_check_attr(p, nattr, attrs);
2962 if (r)
2963 return r;
2964
2965 svms = &p->svms;
2966
2967 mutex_lock(&process_info->lock);
2968
2969 svm_range_list_lock_and_flush_work(svms, mm);
2970
2971 if (!svm_range_is_valid(mm, start, size)) {
2972 pr_debug("invalid range\n");
2973 r = -EFAULT;
2974 mmap_write_unlock(mm);
2975 goto out;
2976 }
2977
2978 mutex_lock(&svms->lock);
2979
2980 /* Add new range and split existing ranges as needed */
2981 r = svm_range_add(p, start, size, nattr, attrs, &update_list,
2982 &insert_list, &remove_list);
2983 if (r) {
2984 mutex_unlock(&svms->lock);
2985 mmap_write_unlock(mm);
2986 goto out;
2987 }
2988 /* Apply changes as a transaction */
2989 list_for_each_entry_safe(prange, next, &insert_list, insert_list) {
2990 svm_range_add_to_svms(prange);
2991 svm_range_add_notifier_locked(mm, prange);
2992 }
2993 list_for_each_entry(prange, &update_list, update_list) {
2994 svm_range_apply_attrs(p, prange, nattr, attrs);
2995 /* TODO: unmap ranges from GPU that lost access */
2996 }
2997 list_for_each_entry_safe(prange, next, &remove_list,
2998 remove_list) {
2999 pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
3000 prange->svms, prange, prange->start,
3001 prange->last);
3002 svm_range_unlink(prange);
3003 svm_range_remove_notifier(prange);
3004 svm_range_free(prange);
3005 }
3006
3007 mmap_write_downgrade(mm);
3008 /* Trigger migrations and revalidate and map to GPUs as needed. If
3009 * this fails we may be left with partially completed actions. There
3010 * is no clean way of rolling back to the previous state in such a
3011 * case because the rollback wouldn't be guaranteed to work either.
3012 */
3013 list_for_each_entry(prange, &update_list, update_list) {
3014 bool migrated;
3015
3016 mutex_lock(&prange->migrate_mutex);
3017
3018 r = svm_range_trigger_migration(mm, prange, &migrated);
3019 if (r)
3020 goto out_unlock_range;
3021
3022 if (migrated && !p->xnack_enabled) {
3023 pr_debug("restore_work will update mappings of GPUs\n");
3024 mutex_unlock(&prange->migrate_mutex);
3025 continue;
3026 }
3027
3028 r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
3029 true, true);
3030 if (r)
3031 pr_debug("failed %d to map svm range\n", r);
3032
3033out_unlock_range:
3034 mutex_unlock(&prange->migrate_mutex);
3035 if (r)
3036 break;
3037 }
3038
3039 svm_range_debug_dump(svms);
3040
3041 mutex_unlock(&svms->lock);
3042 mmap_read_unlock(mm);
3043out:
3044 mutex_unlock(&process_info->lock);
3045
3046 pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
3047 &p->svms, start, start + size - 1, r);
3048
3049 return r;
3050}
3051
3052static int
3053svm_range_get_attr(struct kfd_process *p, uint64_t start, uint64_t size,
3054 uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
3055{
3056 DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
3057 DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
3058 bool get_preferred_loc = false;
3059 bool get_prefetch_loc = false;
3060 bool get_granularity = false;
3061 bool get_accessible = false;
3062 bool get_flags = false;
3063 uint64_t last = start + size - 1UL;
3064 struct mm_struct *mm = current->mm;
3065 uint8_t granularity = 0xff;
3066 struct interval_tree_node *node;
3067 struct svm_range_list *svms;
3068 struct svm_range *prange;
3069 uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3070 uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3071 uint32_t flags = 0xffffffff;
3072 int gpuidx;
3073 uint32_t i;
3074
3075 pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start,
3076 start + size - 1, nattr);
3077
3078 /* Flush pending deferred work to avoid racing with deferred actions from
3079 * previous memory map changes (e.g. munmap). Concurrent memory map changes
3080 * can still race with get_attr because we don't hold the mmap lock. But that
3081 * would be a race condition in the application anyway, and undefined
3082 * behaviour is acceptable in that case.
3083 */
3084 flush_work(&p->svms.deferred_list_work);
3085
3086 mmap_read_lock(mm);
3087 if (!svm_range_is_valid(mm, start, size)) {
3088 pr_debug("invalid range\n");
3089 mmap_read_unlock(mm);
3090 return -EINVAL;
3091 }
3092 mmap_read_unlock(mm);
3093
3094 for (i = 0; i < nattr; i++) {
3095 switch (attrs[i].type) {
3096 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3097 get_preferred_loc = true;
3098 break;
3099 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3100 get_prefetch_loc = true;
3101 break;
3102 case KFD_IOCTL_SVM_ATTR_ACCESS:
3103 get_accessible = true;
3104 break;
3105 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3106 get_flags = true;
3107 break;
3108 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3109 get_granularity = true;
3110 break;
3111 case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
3112 case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
3113 case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
3114 fallthrough;
3115 default:
3116 pr_debug("get invalid attr type 0x%x\n", attrs[i].type);
3117 return -EINVAL;
3118 }
3119 }
3120
3121 svms = &p->svms;
3122
3123 mutex_lock(&svms->lock);
3124
3125 node = interval_tree_iter_first(&svms->objects, start, last);
3126 if (!node) {
3127 pr_debug("range attrs not found return default values\n");
3128 svm_range_set_default_attributes(&location, &prefetch_loc,
3129 &granularity, &flags);
3130 if (p->xnack_enabled)
3131 bitmap_copy(bitmap_access, svms->bitmap_supported,
3132 MAX_GPU_INSTANCE);
3133 else
3134 bitmap_zero(bitmap_access, MAX_GPU_INSTANCE);
3135 bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE);
3136 goto fill_values;
3137 }
3138 bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE);
3139 bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE);
3140
3141 while (node) {
3142 struct interval_tree_node *next;
3143
3144 prange = container_of(node, struct svm_range, it_node);
3145 next = interval_tree_iter_next(node, start, last);
3146
3147 if (get_preferred_loc) {
3148 if (prange->preferred_loc ==
3149 KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3150 (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3151 location != prange->preferred_loc)) {
3152 location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3153 get_preferred_loc = false;
3154 } else {
3155 location = prange->preferred_loc;
3156 }
3157 }
3158 if (get_prefetch_loc) {
3159 if (prange->prefetch_loc ==
3160 KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
3161 (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
3162 prefetch_loc != prange->prefetch_loc)) {
3163 prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
3164 get_prefetch_loc = false;
3165 } else {
3166 prefetch_loc = prange->prefetch_loc;
3167 }
3168 }
3169 if (get_accessible) {
3170 bitmap_and(bitmap_access, bitmap_access,
3171 prange->bitmap_access, MAX_GPU_INSTANCE);
3172 bitmap_and(bitmap_aip, bitmap_aip,
3173 prange->bitmap_aip, MAX_GPU_INSTANCE);
3174 }
3175 if (get_flags)
3176 flags &= prange->flags;
3177
3178 if (get_granularity && prange->granularity < granularity)
3179 granularity = prange->granularity;
3180
3181 node = next;
3182 }
3183fill_values:
3184 mutex_unlock(&svms->lock);
3185
3186 for (i = 0; i < nattr; i++) {
3187 switch (attrs[i].type) {
3188 case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
3189 attrs[i].value = location;
3190 break;
3191 case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
3192 attrs[i].value = prefetch_loc;
3193 break;
3194 case KFD_IOCTL_SVM_ATTR_ACCESS:
3195 gpuidx = kfd_process_gpuidx_from_gpuid(p,
3196 attrs[i].value);
3197 if (gpuidx < 0) {
3198 pr_debug("invalid gpuid %x\n", attrs[i].value);
3199 return -EINVAL;
3200 }
3201 if (test_bit(gpuidx, bitmap_access))
3202 attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
3203 else if (test_bit(gpuidx, bitmap_aip))
3204 attrs[i].type =
3205 KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
3206 else
3207 attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
3208 break;
3209 case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
3210 attrs[i].value = flags;
3211 break;
3212 case KFD_IOCTL_SVM_ATTR_GRANULARITY:
3213 attrs[i].value = (uint32_t)granularity;
3214 break;
3215 }
3216 }
3217
3218 return 0;
3219}
3220
3221int
3222svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
3223 uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
3224{
3225 int r;
3226
3227 start >>= PAGE_SHIFT;
3228 size >>= PAGE_SHIFT;
3229
3230 switch (op) {
3231 case KFD_IOCTL_SVM_OP_SET_ATTR:
3232 r = svm_range_set_attr(p, start, size, nattrs, attrs);
3233 break;
3234 case KFD_IOCTL_SVM_OP_GET_ATTR:
3235 r = svm_range_get_attr(p, start, size, nattrs, attrs);
3236 break;
3237 default:
3238 r = -EINVAL;
3239 break;
3240 }
3241
3242 return r;
3243}
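
/*
 * Illustrative sketch (not part of the driver, userspace code): how an
 * application reaches svm_ioctl() above through /dev/kfd. It assumes the
 * kfd_ioctl.h UAPI of this driver generation (AMDKFD_IOC_SVM and struct
 * kfd_ioctl_svm_args with start_addr/size/op/nattr followed by the attribute
 * array); treat those names as assumptions, not a reference. Addresses and
 * size are in bytes here; the kernel converts them to pages.
 */
#if 0	/* userspace example, not compiled as part of the kernel */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static int svm_prefetch_to_sysmem(int kfd_fd, void *buf, size_t size)
{
	struct kfd_ioctl_svm_args *args;
	int r;

	args = calloc(1, sizeof(*args) + sizeof(args->attrs[0]));
	if (!args)
		return -1;

	args->start_addr = (unsigned long)buf;
	args->size = size;
	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
	args->nattr = 1;
	args->attrs[0].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
	args->attrs[0].value = 0;	/* 0 means CPU / system memory */

	r = ioctl(kfd_fd, AMDKFD_IOC_SVM, args);
	free(args);
	return r;
}
#endif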