/*
 * Lockless get_user_pages_fast for x86
 *
 * Copyright (C) 2008 Nick Piggin
 * Copyright (C) 2008 Novell Inc.
 */
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/memremap.h>

#include <asm/mmu_context.h>
#include <asm/pgtable.h>

static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
	return READ_ONCE(*ptep);
#else
	/*
	 * With get_user_pages_fast, we walk down the pagetables without taking
	 * any locks.  For this we would like to load the pointers atomically,
	 * but that is not possible (without expensive cmpxchg8b) on PAE.  What
	 * we do have is the guarantee that a pte will only either go from not
	 * present to present, or present to not present or both -- it will not
	 * switch to a completely different present page without a TLB flush in
	 * between; something that we are blocking by holding interrupts off.
	 *
	 * Setting ptes from not present to present goes:
	 * ptep->pte_high = h;
	 * smp_wmb();
	 * ptep->pte_low = l;
	 *
	 * And present to not present goes:
	 * ptep->pte_low = 0;
	 * smp_wmb();
	 * ptep->pte_high = 0;
	 *
	 * We must ensure here that the load of pte_low sees l iff pte_high
	 * sees h. We load pte_high *after* loading pte_low, which ensures we
	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
	 * which ensures that we haven't picked up a changed pte high. We might
	 * have got rubbish values from pte_low and pte_high, but we are
	 * guaranteed that pte_low will not have the present bit set *unless*
	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
	 * we're safe.
	 *
	 * gup_get_pte should not be used or copied outside gup.c without being
	 * very careful -- it does not atomically load the pte or anything that
	 * is likely to be useful for you.
	 */
	pte_t pte;

retry:
	pte.pte_low = ptep->pte_low;
	smp_rmb();
	pte.pte_high = ptep->pte_high;
	smp_rmb();
	if (unlikely(pte.pte_low != ptep->pte_low))
		goto retry;

	return pte;
#endif
}
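
The low/high/low sequence above is easier to see in isolation. The following is a minimal user-space sketch of the same protocol, not kernel code: the fake_pte halves, the PRESENT bit and the thread setup are invented for illustration, and C11 release stores and acquire loads stand in for smp_wmb()/smp_rmb(). The writer publishes a present entry low-half-last and tears it down low-half-first, so whenever the reader's recheck of the low half succeeds with the present bit set, the high half it read must be the matching one.

/* split_read_demo.c -- illustrative sketch, not part of gup.c. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PRESENT 0x1u

/* Two 32-bit halves standing in for pte_low/pte_high on PAE. */
static _Atomic uint32_t fake_pte_low;
static _Atomic uint32_t fake_pte_high;

static void *writer(void *arg)
{
	(void)arg;
	for (uint32_t i = 1; i < 1000000; i++) {
		/* not present -> present: high half first, then low half */
		atomic_store_explicit(&fake_pte_high, i, memory_order_release);
		atomic_store_explicit(&fake_pte_low, (i << 1) | PRESENT,
				      memory_order_release);

		/* present -> not present: low half first, then high half */
		atomic_store_explicit(&fake_pte_low, 0, memory_order_release);
		atomic_store_explicit(&fake_pte_high, 0, memory_order_release);
	}
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	for (int n = 0; n < 1000000; n++) {
		uint32_t lo, hi;
retry:
		/* the gup_get_pte() pattern: low, high, then recheck low */
		lo = atomic_load_explicit(&fake_pte_low, memory_order_acquire);
		hi = atomic_load_explicit(&fake_pte_high, memory_order_acquire);
		if (lo != atomic_load_explicit(&fake_pte_low, memory_order_acquire))
			goto retry;

		/* a present low half must pair with its matching high half */
		if ((lo & PRESENT) && hi != (lo >> 1))
			fprintf(stderr, "torn read: low=%u high=%u\n", lo, hi);
	}
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}

Compile with something like gcc -std=c11 -pthread; removing the recheck of the low half makes the torn-read message appear almost immediately.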

static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
	while ((*nr) - nr_start) {
		struct page *page = pages[--(*nr)];

		ClearPageReferenced(page);
		put_page(page);
	}
}

/*
 * 'pteval' can come from a pte, pmd or pud.  We only check
 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
 * same value on all 3 types.
 */
static inline int pte_allows_gup(unsigned long pteval, int write)
{
	unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;

	if (write)
		need_pte_bits |= _PAGE_RW;

	if ((pteval & need_pte_bits) != need_pte_bits)
		return 0;

	/* Check memory protection keys permissions. */
	if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
		return 0;

	return 1;
}

/*
 * The performance critical leaf functions are made noinline otherwise gcc
 * inlines everything into a single function which results in too much
 * register pressure.
 */
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	struct dev_pagemap *pgmap = NULL;
	int nr_start = *nr;
	pte_t *ptep;

	ptep = pte_offset_map(&pmd, addr);
	do {
		pte_t pte = gup_get_pte(ptep);
		struct page *page;

		/* Similar to the PMD case, NUMA hinting must take slow path */
		if (pte_protnone(pte)) {
			pte_unmap(ptep);
			return 0;
		}

		if (pte_devmap(pte)) {
			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
			if (unlikely(!pgmap)) {
				undo_dev_pagemap(nr, nr_start, pages);
				pte_unmap(ptep);
				return 0;
			}
		} else if (!pte_allows_gup(pte_val(pte), write) ||
			   pte_special(pte)) {
			pte_unmap(ptep);
			return 0;
		}
		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
		page = pte_page(pte);
		get_page(page);
		put_dev_pagemap(pgmap);
		SetPageReferenced(page);
		pages[*nr] = page;
		(*nr)++;

	} while (ptep++, addr += PAGE_SIZE, addr != end);
	pte_unmap(ptep - 1);

	return 1;
}

static inline void get_head_page_multiple(struct page *page, int nr)
{
	VM_BUG_ON_PAGE(page != compound_head(page), page);
	VM_BUG_ON_PAGE(page_count(page) == 0, page);
	page_ref_add(page, nr);
	SetPageReferenced(page);
}

static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
		unsigned long end, struct page **pages, int *nr)
{
	int nr_start = *nr;
	unsigned long pfn = pmd_pfn(pmd);
	struct dev_pagemap *pgmap = NULL;

	pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
	do {
		struct page *page = pfn_to_page(pfn);

		pgmap = get_dev_pagemap(pfn, pgmap);
		if (unlikely(!pgmap)) {
			undo_dev_pagemap(nr, nr_start, pages);
			return 0;
		}
		SetPageReferenced(page);
		pages[*nr] = page;
		get_page(page);
		put_dev_pagemap(pgmap);
		(*nr)++;
		pfn++;
	} while (addr += PAGE_SIZE, addr != end);
	return 1;
}

static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	struct page *head, *page;
	int refs;

	if (!pte_allows_gup(pmd_val(pmd), write))
		return 0;

	VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
	if (pmd_devmap(pmd))
		return __gup_device_huge_pmd(pmd, addr, end, pages, nr);

	/* hugepages are never "special" */
	VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);

	refs = 0;
	head = pmd_page(pmd);
	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON_PAGE(compound_head(page) != head, page);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);
	get_head_page_multiple(head, refs);

	return 1;
}

static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
		int write, struct page **pages, int *nr)
{
	unsigned long next;
	pmd_t *pmdp;

	pmdp = pmd_offset(&pud, addr);
	do {
		pmd_t pmd = *pmdp;

		next = pmd_addr_end(addr, end);
		if (pmd_none(pmd))
			return 0;
		if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
			/*
			 * NUMA hinting faults need to be handled in the GUP
			 * slowpath for accounting purposes and so that they
			 * can be serialised against THP migration.
			 */
			if (pmd_protnone(pmd))
				return 0;
			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
				return 0;
		} else {
			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
				return 0;
		}
	} while (pmdp++, addr = next, addr != end);

	return 1;
}

static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
		unsigned long end, int write, struct page **pages, int *nr)
{
	struct page *head, *page;
	int refs;

	if (!pte_allows_gup(pud_val(pud), write))
		return 0;
	/* hugepages are never "special" */
	VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
	VM_BUG_ON(!pfn_valid(pud_pfn(pud)));

	refs = 0;
	head = pud_page(pud);
	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	do {
		VM_BUG_ON_PAGE(compound_head(page) != head, page);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);
	get_head_page_multiple(head, refs);

	return 1;
}

static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
			int write, struct page **pages, int *nr)
{
	unsigned long next;
	pud_t *pudp;

	pudp = pud_offset(&pgd, addr);
	do {
		pud_t pud = *pudp;

		next = pud_addr_end(addr, end);
		if (pud_none(pud))
			return 0;
		if (unlikely(pud_large(pud))) {
			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
				return 0;
		} else {
			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
				return 0;
		}
	} while (pudp++, addr = next, addr != end);

	return 1;
}

/*
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
 * back to the regular GUP.
 */
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
			  struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	unsigned long flags;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;
	end = start + len;
	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
					(void __user *)start, len)))
		return 0;

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency
	 * needs some instrumenting to determine the common sizes used by
	 * important workloads (eg. DB2), and whether limiting the batch size
	 * will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_save(flags);
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			break;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			break;
	} while (pgdp++, addr = next, addr != end);
	local_irq_restore(flags);

	return nr;
}
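
Because this variant never falls back to the slow path, a caller that cannot sleep has to deal with a short count itself, typically by dropping the partial pin and retrying later from a sleepable context. A rough sketch of that pattern follows; the helper name and its retry policy are invented for illustration and are not part of this file.

/* Illustrative sketch, not part of gup.c. */
static int pin_user_buf_atomic(unsigned long uaddr, int nr_pages,
			       struct page **pages)
{
	int pinned = __get_user_pages_fast(uaddr, nr_pages, 1, pages);

	if (pinned < nr_pages) {
		/*
		 * Partial pin: drop what we got and let the caller retry
		 * later from a context that can sleep, e.g. via
		 * get_user_pages_fast().
		 */
		while (pinned > 0)
			put_page(pages[--pinned]);
		return -EAGAIN;
	}
	return nr_pages;
}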

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @write:	whether pages will be written to
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_sem.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number
 * requested. If nr_pages is 0 or negative, returns 0. If no pages
 * were pinned, returns -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			struct page **pages)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr, len, end;
	unsigned long next;
	pgd_t *pgdp;
	int nr = 0;

	start &= PAGE_MASK;
	addr = start;
	len = (unsigned long) nr_pages << PAGE_SHIFT;

	end = start + len;
	if (end < start)
		goto slow_irqon;

#ifdef CONFIG_X86_64
	if (end >> __VIRTUAL_MASK_SHIFT)
		goto slow_irqon;
#endif

	/*
	 * XXX: batch / limit 'nr', to avoid large irq off latency
	 * needs some instrumenting to determine the common sizes used by
	 * important workloads (eg. DB2), and whether limiting the batch size
	 * will decrease performance.
	 *
	 * It seems like we're in the clear for the moment. Direct-IO is
	 * the main guy that batches up lots of get_user_pages, and even
	 * they are limited to 64-at-a-time which is not so many.
	 */
	/*
	 * This doesn't prevent pagetable teardown, but does prevent
	 * the pagetables and pages from being freed on x86.
	 *
	 * So long as we atomically load page table pointers versus teardown
	 * (which we do on x86, with the above PAE exception), we can follow the
	 * address down to the page and take a ref on it.
	 */
	local_irq_disable();
	pgdp = pgd_offset(mm, addr);
	do {
		pgd_t pgd = *pgdp;

		next = pgd_addr_end(addr, end);
		if (pgd_none(pgd))
			goto slow;
		if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
			goto slow;
	} while (pgdp++, addr = next, addr != end);
	local_irq_enable();

	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
	return nr;

	{
		int ret;

slow:
		local_irq_enable();
slow_irqon:
		/* Try to get the remaining pages with get_user_pages */
		start += nr << PAGE_SHIFT;
		pages += nr;

		ret = get_user_pages_unlocked(start,
					      (end - start) >> PAGE_SHIFT,
					      write, 0, pages);

		/* Have to be a bit careful with return values */
		if (nr > 0) {
			if (ret < 0)
				ret = nr;
			else
				ret += nr;
		}

		return ret;
	}
}
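
A typical caller pins the pages backing a user buffer, accesses them through the returned struct page array, marks them dirty if it wrote to them, and then drops its references. The sketch below only illustrates that calling convention; the helper name and the elided page access are invented and not part of this file.

/* Illustrative sketch, not part of gup.c. */
static int with_pinned_user_buf(unsigned long uaddr, int nr_pages, int write)
{
	struct page **pages;
	int i, pinned;

	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	pinned = get_user_pages_fast(uaddr, nr_pages, write, pages);
	if (pinned < 0) {
		/* nothing was pinned at all */
		kfree(pages);
		return pinned;
	}

	for (i = 0; i < pinned; i++) {
		void *kaddr = kmap(pages[i]);

		/* ... access the page through kaddr ... */
		kunmap(pages[i]);

		if (write)
			set_page_dirty_lock(pages[i]);
		put_page(pages[i]);
	}

	kfree(pages);
	return pinned;	/* may be fewer than nr_pages */
}

Note that a short count is normal: as the comment above get_user_pages_fast() says, it may pin fewer pages than requested, and the caller decides whether to retry the remainder.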