1// SPDX-License-Identifier: GPL-2.0
2#include <linux/backing-dev.h>
3#include <linux/falloc.h>
4#include <linux/kvm_host.h>
5#include <linux/pagemap.h>
6#include <linux/anon_inodes.h>
7
8#include "kvm_mm.h"
9
10struct kvm_gmem {
11 struct kvm *kvm;
12 struct xarray bindings;
13 struct list_head entry;
14};
15
16static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
17{
18 struct folio *folio;
19
20 /* TODO: Support huge pages. */
21 folio = filemap_grab_folio(inode->i_mapping, index);
22 if (IS_ERR_OR_NULL(folio))
23 return NULL;
24
25 /*
26 * Use the up-to-date flag to track whether or not the memory has been
27 * zeroed before being handed off to the guest. There is no backing
28 * storage for the memory, so the folio will remain up-to-date until
29 * it's removed.
30 *
31 * TODO: Skip clearing pages when trusted firmware will do it when
32 * assigning memory to the guest.
33 */
34 if (!folio_test_uptodate(folio)) {
35 unsigned long nr_pages = folio_nr_pages(folio);
36 unsigned long i;
37
38 for (i = 0; i < nr_pages; i++)
39 clear_highpage(folio_page(folio, i));
40
41 folio_mark_uptodate(folio);
42 }
43
44 /*
45 * Ignore accessed, referenced, and dirty flags. The memory is
46 * unevictable and there is no storage to write back to.
47 */
48 return folio;
49}
50
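/*
 * Zap any mappings of the affected range: for every memslot bound to
 * [start, end) in this guest_memfd instance, begin an MMU invalidation and
 * unmap the overlapping gfn range, flushing remote TLBs if anything was
 * zapped. Paired with kvm_gmem_invalidate_end().
 */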
51static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
52 pgoff_t end)
53{
54 bool flush = false, found_memslot = false;
55 struct kvm_memory_slot *slot;
56 struct kvm *kvm = gmem->kvm;
57 unsigned long index;
58
59 xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
60 pgoff_t pgoff = slot->gmem.pgoff;
61
62 struct kvm_gfn_range gfn_range = {
63 .start = slot->base_gfn + max(pgoff, start) - pgoff,
64 .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
65 .slot = slot,
66 .may_block = true,
67 };
68
69 if (!found_memslot) {
70 found_memslot = true;
71
72 KVM_MMU_LOCK(kvm);
73 kvm_mmu_invalidate_begin(kvm);
74 }
75
76 flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
77 }
78
79 if (flush)
80 kvm_flush_remote_tlbs(kvm);
81
82 if (found_memslot)
83 KVM_MMU_UNLOCK(kvm);
84}
85
86static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
87 pgoff_t end)
88{
89 struct kvm *kvm = gmem->kvm;
90
91 if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
92 KVM_MMU_LOCK(kvm);
93 kvm_mmu_invalidate_end(kvm);
94 KVM_MMU_UNLOCK(kvm);
95 }
96}
97
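/*
 * Handle FALLOC_FL_PUNCH_HOLE: zap mappings for the affected range in every
 * guest_memfd instance bound to this inode, then truncate the backing folios
 * out of the page cache.
 */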
98static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
99{
100 struct list_head *gmem_list = &inode->i_mapping->i_private_list;
101 pgoff_t start = offset >> PAGE_SHIFT;
102 pgoff_t end = (offset + len) >> PAGE_SHIFT;
103 struct kvm_gmem *gmem;
104
105 /*
106 * Bindings must be stable across invalidation to ensure the start+end
107 * are balanced.
108 */
109 filemap_invalidate_lock(inode->i_mapping);
110
111 list_for_each_entry(gmem, gmem_list, entry)
112 kvm_gmem_invalidate_begin(gmem, start, end);
113
114 truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
115
116 list_for_each_entry(gmem, gmem_list, entry)
117 kvm_gmem_invalidate_end(gmem, start, end);
118
119 filemap_invalidate_unlock(inode->i_mapping);
120
121 return 0;
122}
123
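/*
 * Handle default fallocate(): populate folios for [offset, offset + len).
 * Allocation cannot extend the file; the size of a guest_memfd is fixed when
 * it is created.
 */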
124static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
125{
126 struct address_space *mapping = inode->i_mapping;
127 pgoff_t start, index, end;
128 int r;
129
130 /* Dedicated guest is immutable by default. */
131 if (offset + len > i_size_read(inode))
132 return -EINVAL;
133
134 filemap_invalidate_lock_shared(mapping);
135
136 start = offset >> PAGE_SHIFT;
137 end = (offset + len) >> PAGE_SHIFT;
138
139 r = 0;
140 for (index = start; index < end; ) {
141 struct folio *folio;
142
143 if (signal_pending(current)) {
144 r = -EINTR;
145 break;
146 }
147
148 folio = kvm_gmem_get_folio(inode, index);
149 if (!folio) {
150 r = -ENOMEM;
151 break;
152 }
153
154 index = folio_next_index(folio);
155
156 folio_unlock(folio);
157 folio_put(folio);
158
159 /* 64-bit only, wrapping the index should be impossible. */
160 if (WARN_ON_ONCE(!index))
161 break;
162
163 cond_resched();
164 }
165
166 filemap_invalidate_unlock_shared(mapping);
167
168 return r;
169}
170
171static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
172 loff_t len)
173{
174 int ret;
175
176 if (!(mode & FALLOC_FL_KEEP_SIZE))
177 return -EOPNOTSUPP;
178
179 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
180 return -EOPNOTSUPP;
181
182 if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
183 return -EINVAL;
184
185 if (mode & FALLOC_FL_PUNCH_HOLE)
186 ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
187 else
188 ret = kvm_gmem_allocate(file_inode(file), offset, len);
189
190 if (!ret)
191 file_modified(file);
192 return ret;
193}
194
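/*
 * Called when the last reference to the guest_memfd file is dropped: sever
 * all memslot bindings, zap any remaining SPTEs, and release the reference
 * this file holds on its VM.
 */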
195static int kvm_gmem_release(struct inode *inode, struct file *file)
196{
197 struct kvm_gmem *gmem = file->private_data;
198 struct kvm_memory_slot *slot;
199 struct kvm *kvm = gmem->kvm;
200 unsigned long index;
201
202 /*
203 * Prevent concurrent attempts to *unbind* a memslot. This is the last
204 * reference to the file and thus no new bindings can be created, but
205 * dereferencing the slot for existing bindings needs to be protected
206 * against memslot updates, specifically so that unbind doesn't race
207 * and free the memslot (kvm_gmem_get_file() will return NULL).
208 */
209 mutex_lock(&kvm->slots_lock);
210
211 filemap_invalidate_lock(inode->i_mapping);
212
213 xa_for_each(&gmem->bindings, index, slot)
214 rcu_assign_pointer(slot->gmem.file, NULL);
215
216 synchronize_rcu();
217
218 /*
219 * All in-flight operations are gone and new bindings can be created.
220 * Zap all SPTEs pointed at by this file. Do not free the backing
221 * memory, as its lifetime is associated with the inode, not the file.
222 */
223 kvm_gmem_invalidate_begin(gmem, 0, -1ul);
224 kvm_gmem_invalidate_end(gmem, 0, -1ul);
225
226 list_del(&gmem->entry);
227
228 filemap_invalidate_unlock(inode->i_mapping);
229
230 mutex_unlock(&kvm->slots_lock);
231
232 xa_destroy(&gmem->bindings);
233 kfree(gmem);
234
235 kvm_put_kvm(kvm);
236
237 return 0;
238}
239
240static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
241{
242 /*
243 * Do not return slot->gmem.file if it has already been closed;
244 * there might be some time between the last fput() and when
245 * kvm_gmem_release() clears slot->gmem.file, and you do not
246 * want to spin in the meantime.
247 */
248 return get_file_active(&slot->gmem.file);
249}
250
251static struct file_operations kvm_gmem_fops = {
252 .open = generic_file_open,
253 .release = kvm_gmem_release,
254 .fallocate = kvm_gmem_fallocate,
255};
256
257void kvm_gmem_init(struct module *module)
258{
259 kvm_gmem_fops.owner = module;
260}
261
262static int kvm_gmem_migrate_folio(struct address_space *mapping,
263 struct folio *dst, struct folio *src,
264 enum migrate_mode mode)
265{
266 WARN_ON_ONCE(1);
267 return -EINVAL;
268}
269
270static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
271{
272 struct list_head *gmem_list = &mapping->i_private_list;
273 struct kvm_gmem *gmem;
274 pgoff_t start, end;
275
276 filemap_invalidate_lock_shared(mapping);
277
278 start = folio->index;
279 end = start + folio_nr_pages(folio);
280
281 list_for_each_entry(gmem, gmem_list, entry)
282 kvm_gmem_invalidate_begin(gmem, start, end);
283
284 /*
285 * Do not truncate the range; what action is taken in response to the
286 * error is userspace's decision (assuming the architecture supports
287 * gracefully handling memory errors). If/when the guest attempts to
288 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
289 * at which point KVM can either terminate the VM or propagate the
290 * error to userspace.
291 */
292
293 list_for_each_entry(gmem, gmem_list, entry)
294 kvm_gmem_invalidate_end(gmem, start, end);
295
296 filemap_invalidate_unlock_shared(mapping);
297
298 return MF_DELAYED;
299}
300
301static const struct address_space_operations kvm_gmem_aops = {
302 .dirty_folio = noop_dirty_folio,
303 .migrate_folio = kvm_gmem_migrate_folio,
304 .error_remove_folio = kvm_gmem_error_folio,
305};
306
307static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
308 struct kstat *stat, u32 request_mask,
309 unsigned int query_flags)
310{
311 struct inode *inode = path->dentry->d_inode;
312
313 generic_fillattr(idmap, request_mask, inode, stat);
314 return 0;
315}
316
317static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
318 struct iattr *attr)
319{
320 return -EINVAL;
321}
322static const struct inode_operations kvm_gmem_iops = {
323 .getattr = kvm_gmem_getattr,
324 .setattr = kvm_gmem_setattr,
325};
326
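/*
 * Create the anonymous inode and file backing a guest_memfd instance and
 * install it in an unused fd. The mapping is marked unmovable and
 * unevictable so its folios are never migrated or reclaimed.
 */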
327static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
328{
329 const char *anon_name = "[kvm-gmem]";
330 struct kvm_gmem *gmem;
331 struct inode *inode;
332 struct file *file;
333 int fd, err;
334
335 fd = get_unused_fd_flags(0);
336 if (fd < 0)
337 return fd;
338
339 gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
340 if (!gmem) {
341 err = -ENOMEM;
342 goto err_fd;
343 }
344
345 file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
346 O_RDWR, NULL);
347 if (IS_ERR(file)) {
348 err = PTR_ERR(file);
349 goto err_gmem;
350 }
351
352 file->f_flags |= O_LARGEFILE;
353
354 inode = file->f_inode;
355 WARN_ON(file->f_mapping != inode->i_mapping);
356
357 inode->i_private = (void *)(unsigned long)flags;
358 inode->i_op = &kvm_gmem_iops;
359 inode->i_mapping->a_ops = &kvm_gmem_aops;
360 inode->i_mode |= S_IFREG;
361 inode->i_size = size;
362 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
363 mapping_set_unmovable(inode->i_mapping);
364 /* Unmovable mappings are supposed to be marked unevictable as well. */
365 WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
366
367 kvm_get_kvm(kvm);
368 gmem->kvm = kvm;
369 xa_init(&gmem->bindings);
370 list_add(&gmem->entry, &inode->i_mapping->i_private_list);
371
372 fd_install(fd, file);
373 return fd;
374
375err_gmem:
376 kfree(gmem);
377err_fd:
378 put_unused_fd(fd);
379 return err;
380}
381
382int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
383{
384 loff_t size = args->size;
385 u64 flags = args->flags;
386 u64 valid_flags = 0;
387
388 if (flags & ~valid_flags)
389 return -EINVAL;
390
391 if (size <= 0 || !PAGE_ALIGNED(size))
392 return -EINVAL;
393
394 return __kvm_gmem_create(kvm, size, flags);
395}
396
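/*
 * Bind a memslot to the guest_memfd range starting at @offset. Fails if the
 * fd is not a guest_memfd, the file belongs to a different VM, the range
 * falls outside the file, or part of the range is already bound to another
 * memslot.
 */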
397int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
398 unsigned int fd, loff_t offset)
399{
400 loff_t size = slot->npages << PAGE_SHIFT;
401 unsigned long start, end;
402 struct kvm_gmem *gmem;
403 struct inode *inode;
404 struct file *file;
405 int r = -EINVAL;
406
407 BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
408
409 file = fget(fd);
410 if (!file)
411 return -EBADF;
412
413 if (file->f_op != &kvm_gmem_fops)
414 goto err;
415
416 gmem = file->private_data;
417 if (gmem->kvm != kvm)
418 goto err;
419
420 inode = file_inode(file);
421
422 if (offset < 0 || !PAGE_ALIGNED(offset) ||
423 offset + size > i_size_read(inode))
424 goto err;
425
426 filemap_invalidate_lock(inode->i_mapping);
427
428 start = offset >> PAGE_SHIFT;
429 end = start + slot->npages;
430
431 if (!xa_empty(&gmem->bindings) &&
432 xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
433 filemap_invalidate_unlock(inode->i_mapping);
434 goto err;
435 }
436
437 /*
438 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
439 * see either a NULL file or this new file, no need for them to go
440 * away.
441 */
442 rcu_assign_pointer(slot->gmem.file, file);
443 slot->gmem.pgoff = start;
444
445 xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
446 filemap_invalidate_unlock(inode->i_mapping);
447
448 /*
449 * Drop the reference to the file, even on success. The file pins KVM,
450 * not the other way 'round. Active bindings are invalidated if the
451 * file is closed before memslots are destroyed.
452 */
453 r = 0;
454err:
455 fput(file);
456 return r;
457}
458
459void kvm_gmem_unbind(struct kvm_memory_slot *slot)
460{
461 unsigned long start = slot->gmem.pgoff;
462 unsigned long end = start + slot->npages;
463 struct kvm_gmem *gmem;
464 struct file *file;
465
466 /*
467 * Nothing to do if the underlying file was already closed (or is being
468 * closed right now), kvm_gmem_release() invalidates all bindings.
469 */
470 file = kvm_gmem_get_file(slot);
471 if (!file)
472 return;
473
474 gmem = file->private_data;
475
476 filemap_invalidate_lock(file->f_mapping);
477 xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
478 rcu_assign_pointer(slot->gmem.file, NULL);
479 synchronize_rcu();
480 filemap_invalidate_unlock(file->f_mapping);
481
482 fput(file);
483}
484
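/*
 * Look up (allocating and zeroing on first use) the folio that backs @gfn and
 * return its pfn. Returns -EHWPOISON if the folio has been poisoned, -EFAULT
 * if the memslot is no longer backed by a guest_memfd file.
 */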
485int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
486 gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
487{
488 pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
489 struct kvm_gmem *gmem;
490 struct folio *folio;
491 struct page *page;
492 struct file *file;
493 int r;
494
495 file = kvm_gmem_get_file(slot);
496 if (!file)
497 return -EFAULT;
498
499 gmem = file->private_data;
500
501 if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
502 r = -EIO;
503 goto out_fput;
504 }
505
506 folio = kvm_gmem_get_folio(file_inode(file), index);
507 if (!folio) {
508 r = -ENOMEM;
509 goto out_fput;
510 }
511
512 if (folio_test_hwpoison(folio)) {
513 r = -EHWPOISON;
514 goto out_unlock;
515 }
516
517 page = folio_file_page(folio, index);
518
519 *pfn = page_to_pfn(page);
520 if (max_order)
521 *max_order = 0;
522
523 r = 0;
524
525out_unlock:
526 folio_unlock(folio);
527out_fput:
528 fput(file);
529
530 return r;
531}
532EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
1// SPDX-License-Identifier: GPL-2.0
2#include <linux/backing-dev.h>
3#include <linux/falloc.h>
4#include <linux/kvm_host.h>
5#include <linux/pagemap.h>
6#include <linux/anon_inodes.h>
7
8#include "kvm_mm.h"
9
10struct kvm_gmem {
11 struct kvm *kvm;
12 struct xarray bindings;
13 struct list_head entry;
14};
15
16/**
17 * folio_file_pfn - like folio_file_page, but return a pfn.
18 * @folio: The folio which contains this index.
19 * @index: The index we want to look up.
20 *
21 * Return: The pfn for this index.
22 */
23static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
24{
25 return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
26}
27
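/*
 * Give the architecture an opportunity to prepare the folio (identified by
 * @index within @slot) before it is mapped into the guest. A no-op unless
 * CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE is enabled.
 */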
28static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
29 pgoff_t index, struct folio *folio)
30{
31#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
32 kvm_pfn_t pfn = folio_file_pfn(folio, index);
33 gfn_t gfn = slot->base_gfn + index - slot->gmem.pgoff;
34 int rc = kvm_arch_gmem_prepare(kvm, gfn, pfn, folio_order(folio));
35 if (rc) {
36 pr_warn_ratelimited("gmem: Failed to prepare folio for index %lx GFN %llx PFN %llx error %d.\n",
37 index, gfn, pfn, rc);
38 return rc;
39 }
40#endif
41
42 return 0;
43}
44
45static inline void kvm_gmem_mark_prepared(struct folio *folio)
46{
47 folio_mark_uptodate(folio);
48}
49
50/*
51 * Process @folio, which contains @gfn, so that the guest can use it.
52 * The folio must be locked and the gfn must be contained in @slot.
53 * On successful return the folio has been zeroed (so no host data is
54 * leaked to the guest) and the up-to-date flag is set.
55 */
56static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
57 gfn_t gfn, struct folio *folio)
58{
59 unsigned long nr_pages, i;
60 pgoff_t index;
61 int r;
62
63 nr_pages = folio_nr_pages(folio);
64 for (i = 0; i < nr_pages; i++)
65 clear_highpage(folio_page(folio, i));
66
67 /*
68 * Preparing huge folios should always be safe, since it should
69 * be possible to split them later if needed.
70 *
71 * Right now the folio order is always going to be zero, but the
72 * code is ready for huge folios. The only assumption is that
73 * the base pgoff of memslots is naturally aligned with the
74 * requested page order, ensuring that huge folios can also use
75 * huge page table entries for GPA->HPA mapping.
76 *
77 * The order will be passed when creating the guest_memfd, and
78 * checked when creating memslots.
79 */
80 WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
81 index = gfn - slot->base_gfn + slot->gmem.pgoff;
82 index = ALIGN_DOWN(index, 1 << folio_order(folio));
83 r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
84 if (!r)
85 kvm_gmem_mark_prepared(folio);
86
87 return r;
88}
89
90/*
91 * Returns a locked folio on success. The caller is responsible for
92 * setting the up-to-date flag before the memory is mapped into the guest.
93 * There is no backing storage for the memory, so the folio will remain
94 * up-to-date until it's removed.
95 *
96 * Ignore accessed, referenced, and dirty flags. The memory is
97 * unevictable and there is no storage to write back to.
98 */
99static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
100{
101 /* TODO: Support huge pages. */
102 return filemap_grab_folio(inode->i_mapping, index);
103}
104
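/*
 * Zap any mappings of the affected range: for every memslot bound to
 * [start, end) in this guest_memfd instance, begin an MMU invalidation and
 * unmap the overlapping gfn range, flushing remote TLBs if anything was
 * zapped. Paired with kvm_gmem_invalidate_end().
 */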
105static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
106 pgoff_t end)
107{
108 bool flush = false, found_memslot = false;
109 struct kvm_memory_slot *slot;
110 struct kvm *kvm = gmem->kvm;
111 unsigned long index;
112
113 xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
114 pgoff_t pgoff = slot->gmem.pgoff;
115
116 struct kvm_gfn_range gfn_range = {
117 .start = slot->base_gfn + max(pgoff, start) - pgoff,
118 .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
119 .slot = slot,
120 .may_block = true,
121 };
122
123 if (!found_memslot) {
124 found_memslot = true;
125
126 KVM_MMU_LOCK(kvm);
127 kvm_mmu_invalidate_begin(kvm);
128 }
129
130 flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
131 }
132
133 if (flush)
134 kvm_flush_remote_tlbs(kvm);
135
136 if (found_memslot)
137 KVM_MMU_UNLOCK(kvm);
138}
139
140static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
141 pgoff_t end)
142{
143 struct kvm *kvm = gmem->kvm;
144
145 if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
146 KVM_MMU_LOCK(kvm);
147 kvm_mmu_invalidate_end(kvm);
148 KVM_MMU_UNLOCK(kvm);
149 }
150}
151
152static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
153{
154 struct list_head *gmem_list = &inode->i_mapping->i_private_list;
155 pgoff_t start = offset >> PAGE_SHIFT;
156 pgoff_t end = (offset + len) >> PAGE_SHIFT;
157 struct kvm_gmem *gmem;
158
159 /*
160 * Bindings must be stable across invalidation to ensure the start+end
161 * are balanced.
162 */
163 filemap_invalidate_lock(inode->i_mapping);
164
165 list_for_each_entry(gmem, gmem_list, entry)
166 kvm_gmem_invalidate_begin(gmem, start, end);
167
168 truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
169
170 list_for_each_entry(gmem, gmem_list, entry)
171 kvm_gmem_invalidate_end(gmem, start, end);
172
173 filemap_invalidate_unlock(inode->i_mapping);
174
175 return 0;
176}
177
178static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
179{
180 struct address_space *mapping = inode->i_mapping;
181 pgoff_t start, index, end;
182 int r;
183
184 /* Dedicated guest is immutable by default. */
185 if (offset + len > i_size_read(inode))
186 return -EINVAL;
187
188 filemap_invalidate_lock_shared(mapping);
189
190 start = offset >> PAGE_SHIFT;
191 end = (offset + len) >> PAGE_SHIFT;
192
193 r = 0;
194 for (index = start; index < end; ) {
195 struct folio *folio;
196
197 if (signal_pending(current)) {
198 r = -EINTR;
199 break;
200 }
201
202 folio = kvm_gmem_get_folio(inode, index);
203 if (IS_ERR(folio)) {
204 r = PTR_ERR(folio);
205 break;
206 }
207
208 index = folio_next_index(folio);
209
210 folio_unlock(folio);
211 folio_put(folio);
212
213 /* 64-bit only, wrapping the index should be impossible. */
214 if (WARN_ON_ONCE(!index))
215 break;
216
217 cond_resched();
218 }
219
220 filemap_invalidate_unlock_shared(mapping);
221
222 return r;
223}
224
225static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
226 loff_t len)
227{
228 int ret;
229
230 if (!(mode & FALLOC_FL_KEEP_SIZE))
231 return -EOPNOTSUPP;
232
233 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
234 return -EOPNOTSUPP;
235
236 if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
237 return -EINVAL;
238
239 if (mode & FALLOC_FL_PUNCH_HOLE)
240 ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
241 else
242 ret = kvm_gmem_allocate(file_inode(file), offset, len);
243
244 if (!ret)
245 file_modified(file);
246 return ret;
247}
248
249static int kvm_gmem_release(struct inode *inode, struct file *file)
250{
251 struct kvm_gmem *gmem = file->private_data;
252 struct kvm_memory_slot *slot;
253 struct kvm *kvm = gmem->kvm;
254 unsigned long index;
255
256 /*
257 * Prevent concurrent attempts to *unbind* a memslot. This is the last
258 * reference to the file and thus no new bindings can be created, but
259 * dereferencing the slot for existing bindings needs to be protected
260 * against memslot updates, specifically so that unbind doesn't race
261 * and free the memslot (kvm_gmem_get_file() will return NULL).
262 */
263 mutex_lock(&kvm->slots_lock);
264
265 filemap_invalidate_lock(inode->i_mapping);
266
267 xa_for_each(&gmem->bindings, index, slot)
268 rcu_assign_pointer(slot->gmem.file, NULL);
269
270 synchronize_rcu();
271
272 /*
273 * All in-flight operations are gone and new bindings can be created.
274 * Zap all SPTEs pointed at by this file. Do not free the backing
275 * memory, as its lifetime is associated with the inode, not the file.
276 */
277 kvm_gmem_invalidate_begin(gmem, 0, -1ul);
278 kvm_gmem_invalidate_end(gmem, 0, -1ul);
279
280 list_del(&gmem->entry);
281
282 filemap_invalidate_unlock(inode->i_mapping);
283
284 mutex_unlock(&kvm->slots_lock);
285
286 xa_destroy(&gmem->bindings);
287 kfree(gmem);
288
289 kvm_put_kvm(kvm);
290
291 return 0;
292}
293
294static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
295{
296 /*
297 * Do not return slot->gmem.file if it has already been closed;
298 * there might be some time between the last fput() and when
299 * kvm_gmem_release() clears slot->gmem.file, and you do not
300 * want to spin in the meantime.
301 */
302 return get_file_active(&slot->gmem.file);
303}
304
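/* Translate @gfn in @slot to the corresponding page index in the guest_memfd. */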
305static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
306{
307 return gfn - slot->base_gfn + slot->gmem.pgoff;
308}
309
310static struct file_operations kvm_gmem_fops = {
311 .open = generic_file_open,
312 .release = kvm_gmem_release,
313 .fallocate = kvm_gmem_fallocate,
314};
315
316void kvm_gmem_init(struct module *module)
317{
318 kvm_gmem_fops.owner = module;
319}
320
321static int kvm_gmem_migrate_folio(struct address_space *mapping,
322 struct folio *dst, struct folio *src,
323 enum migrate_mode mode)
324{
325 WARN_ON_ONCE(1);
326 return -EINVAL;
327}
328
329static int kvm_gmem_error_folio(struct address_space *mapping, struct folio *folio)
330{
331 struct list_head *gmem_list = &mapping->i_private_list;
332 struct kvm_gmem *gmem;
333 pgoff_t start, end;
334
335 filemap_invalidate_lock_shared(mapping);
336
337 start = folio->index;
338 end = start + folio_nr_pages(folio);
339
340 list_for_each_entry(gmem, gmem_list, entry)
341 kvm_gmem_invalidate_begin(gmem, start, end);
342
343 /*
344 * Do not truncate the range; what action is taken in response to the
345 * error is userspace's decision (assuming the architecture supports
346 * gracefully handling memory errors). If/when the guest attempts to
347 * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON,
348 * at which point KVM can either terminate the VM or propagate the
349 * error to userspace.
350 */
351
352 list_for_each_entry(gmem, gmem_list, entry)
353 kvm_gmem_invalidate_end(gmem, start, end);
354
355 filemap_invalidate_unlock_shared(mapping);
356
357 return MF_DELAYED;
358}
359
360#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
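/*
 * Let the architecture invalidate its state for the pfn range covered by a
 * folio when that folio is freed back to the kernel.
 */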
361static void kvm_gmem_free_folio(struct folio *folio)
362{
363 struct page *page = folio_page(folio, 0);
364 kvm_pfn_t pfn = page_to_pfn(page);
365 int order = folio_order(folio);
366
367 kvm_arch_gmem_invalidate(pfn, pfn + (1ul << order));
368}
369#endif
370
371static const struct address_space_operations kvm_gmem_aops = {
372 .dirty_folio = noop_dirty_folio,
373 .migrate_folio = kvm_gmem_migrate_folio,
374 .error_remove_folio = kvm_gmem_error_folio,
375#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
376 .free_folio = kvm_gmem_free_folio,
377#endif
378};
379
380static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path,
381 struct kstat *stat, u32 request_mask,
382 unsigned int query_flags)
383{
384 struct inode *inode = path->dentry->d_inode;
385
386 generic_fillattr(idmap, request_mask, inode, stat);
387 return 0;
388}
389
390static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
391 struct iattr *attr)
392{
393 return -EINVAL;
394}
395static const struct inode_operations kvm_gmem_iops = {
396 .getattr = kvm_gmem_getattr,
397 .setattr = kvm_gmem_setattr,
398};
399
400static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
401{
402 const char *anon_name = "[kvm-gmem]";
403 struct kvm_gmem *gmem;
404 struct inode *inode;
405 struct file *file;
406 int fd, err;
407
408 fd = get_unused_fd_flags(0);
409 if (fd < 0)
410 return fd;
411
412 gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
413 if (!gmem) {
414 err = -ENOMEM;
415 goto err_fd;
416 }
417
418 file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
419 O_RDWR, NULL);
420 if (IS_ERR(file)) {
421 err = PTR_ERR(file);
422 goto err_gmem;
423 }
424
425 file->f_flags |= O_LARGEFILE;
426
427 inode = file->f_inode;
428 WARN_ON(file->f_mapping != inode->i_mapping);
429
430 inode->i_private = (void *)(unsigned long)flags;
431 inode->i_op = &kvm_gmem_iops;
432 inode->i_mapping->a_ops = &kvm_gmem_aops;
433 inode->i_mode |= S_IFREG;
434 inode->i_size = size;
435 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
436 mapping_set_inaccessible(inode->i_mapping);
437 /* Unmovable mappings are supposed to be marked unevictable as well. */
438 WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
439
440 kvm_get_kvm(kvm);
441 gmem->kvm = kvm;
442 xa_init(&gmem->bindings);
443 list_add(&gmem->entry, &inode->i_mapping->i_private_list);
444
445 fd_install(fd, file);
446 return fd;
447
448err_gmem:
449 kfree(gmem);
450err_fd:
451 put_unused_fd(fd);
452 return err;
453}
454
455int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
456{
457 loff_t size = args->size;
458 u64 flags = args->flags;
459 u64 valid_flags = 0;
460
461 if (flags & ~valid_flags)
462 return -EINVAL;
463
464 if (size <= 0 || !PAGE_ALIGNED(size))
465 return -EINVAL;
466
467 return __kvm_gmem_create(kvm, size, flags);
468}
469
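/*
 * Bind a memslot to the guest_memfd range starting at @offset. Fails if the
 * fd is not a guest_memfd, the file belongs to a different VM, the range
 * falls outside the file, or part of the range is already bound to another
 * memslot.
 */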
470int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
471 unsigned int fd, loff_t offset)
472{
473 loff_t size = slot->npages << PAGE_SHIFT;
474 unsigned long start, end;
475 struct kvm_gmem *gmem;
476 struct inode *inode;
477 struct file *file;
478 int r = -EINVAL;
479
480 BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
481
482 file = fget(fd);
483 if (!file)
484 return -EBADF;
485
486 if (file->f_op != &kvm_gmem_fops)
487 goto err;
488
489 gmem = file->private_data;
490 if (gmem->kvm != kvm)
491 goto err;
492
493 inode = file_inode(file);
494
495 if (offset < 0 || !PAGE_ALIGNED(offset) ||
496 offset + size > i_size_read(inode))
497 goto err;
498
499 filemap_invalidate_lock(inode->i_mapping);
500
501 start = offset >> PAGE_SHIFT;
502 end = start + slot->npages;
503
504 if (!xa_empty(&gmem->bindings) &&
505 xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
506 filemap_invalidate_unlock(inode->i_mapping);
507 goto err;
508 }
509
510 /*
511 * No synchronize_rcu() needed, any in-flight readers are guaranteed to
512 * see either a NULL file or this new file, no need for them to go
513 * away.
514 */
515 rcu_assign_pointer(slot->gmem.file, file);
516 slot->gmem.pgoff = start;
517
518 xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
519 filemap_invalidate_unlock(inode->i_mapping);
520
521 /*
522 * Drop the reference to the file, even on success. The file pins KVM,
523 * not the other way 'round. Active bindings are invalidated if the
524 * file is closed before memslots are destroyed.
525 */
526 r = 0;
527err:
528 fput(file);
529 return r;
530}
531
532void kvm_gmem_unbind(struct kvm_memory_slot *slot)
533{
534 unsigned long start = slot->gmem.pgoff;
535 unsigned long end = start + slot->npages;
536 struct kvm_gmem *gmem;
537 struct file *file;
538
539 /*
540 * Nothing to do if the underlying file was already closed (or is being
541 * closed right now), kvm_gmem_release() invalidates all bindings.
542 */
543 file = kvm_gmem_get_file(slot);
544 if (!file)
545 return;
546
547 gmem = file->private_data;
548
549 filemap_invalidate_lock(file->f_mapping);
550 xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
551 rcu_assign_pointer(slot->gmem.file, NULL);
552 synchronize_rcu();
553 filemap_invalidate_unlock(file->f_mapping);
554
555 fput(file);
556}
557
558/* Returns a locked folio on success. */
559static struct folio *__kvm_gmem_get_pfn(struct file *file,
560 struct kvm_memory_slot *slot,
561 pgoff_t index, kvm_pfn_t *pfn,
562 bool *is_prepared, int *max_order)
563{
564 struct kvm_gmem *gmem = file->private_data;
565 struct folio *folio;
566
567 if (file != slot->gmem.file) {
568 WARN_ON_ONCE(slot->gmem.file);
569 return ERR_PTR(-EFAULT);
570 }
571
572 gmem = file->private_data;
573 if (xa_load(&gmem->bindings, index) != slot) {
574 WARN_ON_ONCE(xa_load(&gmem->bindings, index));
575 return ERR_PTR(-EIO);
576 }
577
578 folio = kvm_gmem_get_folio(file_inode(file), index);
579 if (IS_ERR(folio))
580 return folio;
581
582 if (folio_test_hwpoison(folio)) {
583 folio_unlock(folio);
584 folio_put(folio);
585 return ERR_PTR(-EHWPOISON);
586 }
587
588 *pfn = folio_file_pfn(folio, index);
589 if (max_order)
590 *max_order = 0;
591
592 *is_prepared = folio_test_uptodate(folio);
593 return folio;
594}
595
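/*
 * Resolve @gfn to a pfn and page backed by the memslot's guest_memfd,
 * preparing (zeroing plus arch preparation) the folio if it has not been
 * prepared yet. On success the returned page carries a folio reference that
 * the caller must drop.
 */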
596int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
597 gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
598 int *max_order)
599{
600 pgoff_t index = kvm_gmem_get_index(slot, gfn);
601 struct file *file = kvm_gmem_get_file(slot);
602 struct folio *folio;
603 bool is_prepared = false;
604 int r = 0;
605
606 if (!file)
607 return -EFAULT;
608
609 folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
610 if (IS_ERR(folio)) {
611 r = PTR_ERR(folio);
612 goto out;
613 }
614
615 if (!is_prepared)
616 r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
617
618 folio_unlock(folio);
619
620 if (!r)
621 *page = folio_file_page(folio, index);
622 else
623 folio_put(folio);
624
625out:
626 fput(file);
627 return r;
628}
629EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
630
631#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
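/*
 * Populate guest_memfd pages for [start_gfn, start_gfn + npages) and invoke
 * @post_populate on each chunk, passing the matching offset into @src (if
 * any). Already-prepared pages are rejected with -EEXIST. Returns the number
 * of pages processed, or a negative error if nothing was processed.
 */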
632long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
633 kvm_gmem_populate_cb post_populate, void *opaque)
634{
635 struct file *file;
636 struct kvm_memory_slot *slot;
637 void __user *p;
638
639 int ret = 0, max_order;
640 long i;
641
642 lockdep_assert_held(&kvm->slots_lock);
643 if (npages < 0)
644 return -EINVAL;
645
646 slot = gfn_to_memslot(kvm, start_gfn);
647 if (!kvm_slot_can_be_private(slot))
648 return -EINVAL;
649
650 file = kvm_gmem_get_file(slot);
651 if (!file)
652 return -EFAULT;
653
654 filemap_invalidate_lock(file->f_mapping);
655
656 npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
657 for (i = 0; i < npages; i += (1 << max_order)) {
658 struct folio *folio;
659 gfn_t gfn = start_gfn + i;
660 pgoff_t index = kvm_gmem_get_index(slot, gfn);
661 bool is_prepared = false;
662 kvm_pfn_t pfn;
663
664 if (signal_pending(current)) {
665 ret = -EINTR;
666 break;
667 }
668
669 folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order);
670 if (IS_ERR(folio)) {
671 ret = PTR_ERR(folio);
672 break;
673 }
674
675 if (is_prepared) {
676 folio_unlock(folio);
677 folio_put(folio);
678 ret = -EEXIST;
679 break;
680 }
681
682 folio_unlock(folio);
683 WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
684 (npages - i) < (1 << max_order));
685
686 ret = -EINVAL;
687 while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
688 KVM_MEMORY_ATTRIBUTE_PRIVATE,
689 KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
690 if (!max_order)
691 goto put_folio_and_exit;
692 max_order--;
693 }
694
695 p = src ? src + i * PAGE_SIZE : NULL;
696 ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
697 if (!ret)
698 kvm_gmem_mark_prepared(folio);
699
700put_folio_and_exit:
701 folio_put(folio);
702 if (ret)
703 break;
704 }
705
706 filemap_invalidate_unlock(file->f_mapping);
707
708 fput(file);
709 return ret && !i ? ret : i;
710}
711EXPORT_SYMBOL_GPL(kvm_gmem_populate);
712#endif