hugetlb_vmemmap.c - mm/hugetlb_vmemmap.c - Linux diff v5.14.15

  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * Free some vmemmap pages of HugeTLB
  4 *
  5 * Copyright (c) 2020, Bytedance. All rights reserved.
  6 *
  7 *     Author: Muchun Song <songmuchun@bytedance.com>
  8 *
  9 * The struct page structures (page structs) are used to describe a physical
 10 * page frame. By default, there is a one-to-one mapping from a page frame to
 * its corresponding page struct.
 12 *
 * HugeTLB pages consist of multiple base page size pages and are supported by
 * many architectures. See hugetlbpage.rst in the Documentation directory for
 * more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB
 * are currently supported. Since the base page size on x86 is 4KB, a 2MB
 * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
 * 262144 base pages. For each base page, there is a corresponding page struct.
 19 *
 20 * Within the HugeTLB subsystem, only the first 4 page structs are used to
 21 * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
 22 * this upper limit. The only 'useful' information in the remaining page structs
 23 * is the compound_head field, and this field is the same for all tail pages.
 24 *
 25 * By removing redundant page structs for HugeTLB pages, memory can be returned
 26 * to the buddy allocator for other uses.
 27 *
 28 * Different architectures support different HugeTLB pages. For example, the
 29 * following table is the HugeTLB page size supported by x86 and arm64
 30 * architectures. Because arm64 supports 4k, 16k, and 64k base pages and
 31 * supports contiguous entries, so it supports many kinds of sizes of HugeTLB
 32 * page.
 33 *
 34 * +--------------+-----------+-----------------------------------------------+
 35 * | Architecture | Page Size |                HugeTLB Page Size              |
 36 * +--------------+-----------+-----------+-----------+-----------+-----------+
 37 * |    x86-64    |    4KB    |    2MB    |    1GB    |           |           |
 38 * +--------------+-----------+-----------+-----------+-----------+-----------+
 39 * |              |    4KB    |   64KB    |    2MB    |    32MB   |    1GB    |
 40 * |              +-----------+-----------+-----------+-----------+-----------+
 41 * |    arm64     |   16KB    |    2MB    |   32MB    |     1GB   |           |
 42 * |              +-----------+-----------+-----------+-----------+-----------+
 43 * |              |   64KB    |    2MB    |  512MB    |    16GB   |           |
 44 * +--------------+-----------+-----------+-----------+-----------+-----------+
 45 *
 * When the system boots up, every HugeTLB page has more than one struct page
 * struct, whose total size is (unit: pages):
 48 *
 49 *    struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
 50 *
 51 * Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
 52 * of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
 53 * relationship.
 54 *
 55 *    HugeTLB_Size = n * PAGE_SIZE
 56 *
 57 * Then,
 58 *
 59 *    struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
 60 *                = n * sizeof(struct page) / PAGE_SIZE
 61 *
 62 * We can use huge mapping at the pud/pmd level for the HugeTLB page.
 63 *
 64 * For the HugeTLB page of the pmd level mapping, then
 65 *
 66 *    struct_size = n * sizeof(struct page) / PAGE_SIZE
 67 *                = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
 68 *                = sizeof(struct page) / sizeof(pte_t)
 69 *                = 64 / 8
 70 *                = 8 (pages)
 71 *
 * Where n is the number of pte entries that one page can contain. So the
 * value of n is (PAGE_SIZE / sizeof(pte_t)).
 74 *
 75 * This optimization only supports 64-bit system, so the value of sizeof(pte_t)
 76 * is 8. And this optimization also applicable only when the size of struct page
 77 * is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
 78 * x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
 79 * size of struct page structs of it is 8 page frames which size depends on the
 80 * size of the base page.
 81 *
 82 * For the HugeTLB page of the pud level mapping, then
 83 *
 84 *    struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
 85 *                = PAGE_SIZE / 8 * 8 (pages)
 86 *                = PAGE_SIZE (pages)
 87 *
 88 * Where the struct_size(pmd) is the size of the struct page structs of a
 89 * HugeTLB page of the pmd level mapping.
 90 *
 * E.g.: the struct page structs of a 2MB HugeTLB page on x86_64 occupy 8 page
 * frames, while those of a 1GB HugeTLB page occupy 4096 page frames.
 93 *
 94 * Next, we take the pmd level mapping of the HugeTLB page as an example to
 95 * show the internal implementation of this optimization. There are 8 pages
 96 * struct page structs associated with a HugeTLB page which is pmd mapped.
 97 *
 98 * Here is how things look before optimization.
 99 *
100 *    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
101 * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
102 * |           |                     |     0     | -------------> |     0     |
103 * |           |                     +-----------+                +-----------+
104 * |           |                     |     1     | -------------> |     1     |
105 * |           |                     +-----------+                +-----------+
106 * |           |                     |     2     | -------------> |     2     |
107 * |           |                     +-----------+                +-----------+
108 * |           |                     |     3     | -------------> |     3     |
109 * |           |                     +-----------+                +-----------+
110 * |           |                     |     4     | -------------> |     4     |
111 * |    PMD    |                     +-----------+                +-----------+
112 * |   level   |                     |     5     | -------------> |     5     |
113 * |  mapping  |                     +-----------+                +-----------+
114 * |           |                     |     6     | -------------> |     6     |
115 * |           |                     +-----------+                +-----------+
116 * |           |                     |     7     | -------------> |     7     |
117 * |           |                     +-----------+                +-----------+
118 * |           |
119 * |           |
120 * |           |
121 * +-----------+
122 *
123 * The value of page->compound_head is the same for all tail pages. The first
124 * page of page structs (page 0) associated with the HugeTLB page contains the 4
125 * page structs necessary to describe the HugeTLB. The only use of the remaining
126 * pages of page structs (page 1 to page 7) is to point to page->compound_head.
127 * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
128 * will be used for each HugeTLB page. This will allow us to free the remaining
129 * 6 pages to the buddy allocator.
130 *
131 * Here is how things look after remapping.
132 *
133 *    HugeTLB                  struct pages(8 pages)         page frame(8 pages)
134 * +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
135 * |           |                     |     0     | -------------> |     0     |
136 * |           |                     +-----------+                +-----------+
137 * |           |                     |     1     | -------------> |     1     |
138 * |           |                     +-----------+                +-----------+
139 * |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
140 * |           |                     +-----------+                   | | | | |
141 * |           |                     |     3     | ------------------+ | | | |
142 * |           |                     +-----------+                     | | | |
143 * |           |                     |     4     | --------------------+ | | |
144 * |    PMD    |                     +-----------+                       | | |
145 * |   level   |                     |     5     | ----------------------+ | |
146 * |  mapping  |                     +-----------+                         | |
147 * |           |                     |     6     | ------------------------+ |
148 * |           |                     +-----------+                           |
149 * |           |                     |     7     | --------------------------+
150 * |           |                     +-----------+
151 * |           |
152 * |           |
153 * |           |
154 * +-----------+
155 *
156 * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
157 * vmemmap pages and restore the previous mapping relationship.
158 *
159 * For the HugeTLB page of the pud level mapping. It is similar to the former.
160 * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages.
161 *
162 * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
163 * (e.g. aarch64) provides a contiguous bit in the translation table entries
164 * that hints to the MMU to indicate that it is one of a contiguous set of
165 * entries that can be cached in a single TLB entry.
166 *
167 * The contiguous bit is used to increase the mapping size at the pmd and pte
168 * (last) level. So this type of HugeTLB page can be optimized only when its
169 * size of the struct page structs is greater than 2 pages.
170 */
171#define pr_fmt(fmt)	"HugeTLB: " fmt
172
 
 
 
 
 
 
 
173#include "hugetlb_vmemmap.h"
174
/*
 * There are a lot of struct page structures associated with each HugeTLB page.
 * For tail pages, the value of compound_head is the same, so the first page of
 * tail page structures can be reused: the virtual addresses of the remaining
 * pages of tail page structures are mapped to that first tail page, and the
 * page frames that used to back them are freed. Therefore, two pages must be
 * reserved (kept) as vmemmap areas for each HugeTLB page.
 */
/* Number of vmemmap pages that are kept for each HugeTLB page. */
#define RESERVE_VMEMMAP_NR		2U
/* Size, in bytes, of the vmemmap area that is kept for each HugeTLB page. */
#define RESERVE_VMEMMAP_SIZE		(RESERVE_VMEMMAP_NR << PAGE_SHIFT)
 
 
 
 
 
 
 
 
 
 
 
 
184
/*
 * Whether freeing of HugeTLB vmemmap pages is enabled. The initial value comes
 * from CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON; the "hugetlb_free_vmemmap"
 * early parameter handler in this file may overwrite it.
 */
bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
187static int __init early_hugetlb_free_vmemmap_param(char *buf)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188{
189	/* We cannot optimize if a "struct page" crosses page boundaries. */
190	if ((!is_power_of_2(sizeof(struct page)))) {
191		pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
192		return 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193	}
 
 
 
 
 
 
194
195	if (!buf)
196		return -EINVAL;
 
 
197
198	if (!strcmp(buf, "on"))
199		hugetlb_free_vmemmap_enabled = true;
200	else if (!strcmp(buf, "off"))
201		hugetlb_free_vmemmap_enabled = false;
 
 
202	else
203		return -EINVAL;
 
204
205	return 0;
206}
207early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param);
208
209static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210{
211	return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212}
213
214/*
215 * Previously discarded vmemmap pages will be allocated and remapping
216 * after this function returns zero.
 
 
 
 
 
217 */
218int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219{
220	int ret;
221	unsigned long vmemmap_addr = (unsigned long)head;
222	unsigned long vmemmap_end, vmemmap_reuse;
223
224	if (!HPageVmemmapOptimized(head))
 
225		return 0;
226
227	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
228	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
229	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
 
230	/*
231	 * The pages which the vmemmap virtual address range [@vmemmap_addr,
232	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
233	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
234	 * When a HugeTLB page is freed to the buddy allocator, previously
235	 * discarded vmemmap pages must be allocated and remapping.
236	 */
237	ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
238				  GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
 
 
 
239
240	if (!ret)
241		ClearHPageVmemmapOptimized(head);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243	return ret;
244}
245
246void free_huge_page_vmemmap(struct hstate *h, struct page *head)
 
247{
248	unsigned long vmemmap_addr = (unsigned long)head;
249	unsigned long vmemmap_end, vmemmap_reuse;
250
251	if (!free_vmemmap_pages_per_hpage(h))
252		return;
253
254	vmemmap_addr += RESERVE_VMEMMAP_SIZE;
255	vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
256	vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
257
258	/*
259	 * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
260	 * to the page which @vmemmap_reuse is mapped to, then free the pages
261	 * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
262	 */
263	if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
264		SetHPageVmemmapOptimized(head);
265}
266
267void __init hugetlb_vmemmap_init(struct hstate *h)
 
 
 
268{
269	unsigned int nr_pages = pages_per_huge_page(h);
270	unsigned int vmemmap_pages;
 
 
 
 
 
271
 
272	/*
273	 * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
274	 * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP,
275	 * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
 
 
 
 
 
 
276	 */
277	BUILD_BUG_ON(__NR_USED_SUBPAGE >=
278		     RESERVE_VMEMMAP_SIZE / sizeof(struct page));
279
280	if (!hugetlb_free_vmemmap_enabled)
281		return;
 
282
283	vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
284	/*
285	 * The head page and the first tail page are not to be freed to buddy
286	 * allocator, the other pages will map to the first tail page, so they
287	 * can be freed.
288	 *
289	 * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
290	 * on some architectures (e.g. aarch64). See Documentation/arm64/
291	 * hugetlbpage.rst for more details.
292	 */
293	if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
294		h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
296	pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
297		h->name);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298}

  1// SPDX-License-Identifier: GPL-2.0
  2/*
  3 * HugeTLB Vmemmap Optimization (HVO)
  4 *
  5 * Copyright (c) 2020, ByteDance. All rights reserved.
  6 *
  7 *     Author: Muchun Song <songmuchun@bytedance.com>
  8 *
  9 * See Documentation/mm/vmemmap_dedup.rst
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 10 */
 11#define pr_fmt(fmt)	"HugeTLB: " fmt
 12
 13#include <linux/pgtable.h>
 14#include <linux/moduleparam.h>
 15#include <linux/bootmem_info.h>
 16#include <linux/mmdebug.h>
 17#include <linux/pagewalk.h>
 18#include <asm/pgalloc.h>
 19#include <asm/tlbflush.h>
 20#include "hugetlb_vmemmap.h"
 21
/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE). It is NULL
 *			when the walk only splits PMDs and remaps nothing.
 * @nr_walked:		the number of walked pte; incremented once for every
 *			PTE visited by the walk.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 *			When it is NULL at the start of a walk, it is taken
 *			from the first PTE visited.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or is mapped from.
 * @flags:		used to modify behavior in vmemmap page table walking
 *			operations (VMEMMAP_*_NO_TLB_FLUSH bits below).
 */
struct vmemmap_remap_walk {
	void			(*remap_pte)(pte_t *pte, unsigned long addr,
					     struct vmemmap_remap_walk *walk);
	unsigned long		nr_walked;
	struct page		*reuse_page;
	unsigned long		reuse_addr;
	struct list_head	*vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH	BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH	BIT(1)
	unsigned long		flags;
};
 48
/*
 * Replace the leaf (huge) mapping in @pmd with a newly allocated PTE table
 * that maps the same pages one by one.
 *
 * @pmd:	the PMD entry to split.
 * @head:	the first page mapped by the huge PMD.
 * @start:	the PMD-aligned virtual address covered by @pmd.
 * @walk:	only @walk->flags is read, to decide whether to flush the TLB.
 *
 * The PTE table is prepared outside the lock through a local, temporary PMD
 * value and is only installed, under init_mm.page_table_lock, if @pmd is still
 * a leaf at that point. Otherwise the new table is freed again.
 *
 * Return: -ENOMEM if the PTE table cannot be allocated, 0 otherwise.
 */
static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	pte_t *pgtable;

	pgtable = pte_alloc_one_kernel(&init_mm);
	if (!pgtable)
		return -ENOMEM;

	/* Populate a local PMD so that pte_offset_kernel() can index @pgtable. */
	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	/* Fill every PTE so that page @head + i is mapped at @start + i pages. */
	for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(head + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from buddy allocator must be able to
		 * be treated as independent small pages (as they can be freed
		 * individually).
		 */
		if (!PageReserved(head))
			split_page(head, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
			flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		/* @pmd is no longer a leaf; the new table is not needed. */
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}
 94
/*
 * PMD-level callback of the vmemmap page table walk (see vmemmap_remap_ops).
 *
 * If @pmd is a leaf (huge) mapping it is split into PTEs through
 * vmemmap_split_pmd(). On the very first entry of a walk (nr_walked == 0) and
 * with CONFIG_MEMORY_HOTPLUG, the walk is refused with -ENOTSUPP when the
 * vmemmap page backing @addr is self-hosted.
 *
 * Return: 0 on success or when there is nothing to split, -ENOTSUPP for
 * self-hosted vmemmap, or the error returned by vmemmap_split_pmd().
 */
static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	int ret = 0;
	struct page *head;
	struct vmemmap_remap_walk *vmemmap_walk = walk->private;

	/*
	 * Only splitting, not remapping the vmemmap pages.
	 * NOTE(review): ACTION_CONTINUE presumably tells the page walker not
	 * to descend to the PTE level - confirm against linux/pagewalk.h.
	 */
	if (!vmemmap_walk->remap_pte)
		walk->action = ACTION_CONTINUE;

	spin_lock(&init_mm.page_table_lock);
	/* @head is NULL when the PMD has already been split into PTEs. */
	head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
	/*
	 * Due to HugeTLB alignment requirements and the vmemmap
	 * pages being at the start of the hotplugged memory
	 * region in memory_hotplug.memmap_on_memory case. Checking
	 * the vmemmap page associated with the first vmemmap page
	 * if it is self-hosted is sufficient.
	 *
	 * [                  hotplugged memory                  ]
	 * [        section        ][...][        section        ]
	 * [ vmemmap ][              usable memory               ]
	 *   ^  | ^                        |
	 *   +--+ |                        |
	 *        +------------------------+
	 */
	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
		/* The page backing @addr, via the huge PMD or via its PTE. */
		struct page *page = head ? head + pte_index(addr) :
				    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

		if (PageVmemmapSelfHosted(page))
			ret = -ENOTSUPP;
	}
	spin_unlock(&init_mm.page_table_lock);
	if (!head || ret)
		return ret;

	return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}
135
136static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
137			     unsigned long next, struct mm_walk *walk)
138{
139	struct vmemmap_remap_walk *vmemmap_walk = walk->private;
140
141	/*
142	 * The reuse_page is found 'first' in page table walking before
143	 * starting remapping.
144	 */
145	if (!vmemmap_walk->reuse_page)
146		vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
147	else
148		vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
149	vmemmap_walk->nr_walked++;
150
151	return 0;
152}
 
153
/*
 * Page table walk callbacks used by vmemmap_remap_range(): the PMD callback
 * splits huge mappings, the PTE callback picks the reuse page and remaps.
 */
static const struct mm_walk_ops vmemmap_remap_ops = {
	.pmd_entry	= vmemmap_pmd_entry,
	.pte_entry	= vmemmap_pte_entry,
};
158
159static int vmemmap_remap_range(unsigned long start, unsigned long end,
160			       struct vmemmap_remap_walk *walk)
161{
162	int ret;
163
164	VM_BUG_ON(!PAGE_ALIGNED(start | end));
165
166	mmap_read_lock(&init_mm);
167	ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
168				    NULL, walk);
169	mmap_read_unlock(&init_mm);
170	if (ret)
171		return ret;
172
173	if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
174		flush_tlb_kernel_range(start, end);
175
176	return 0;
177}
178
/*
 * Free one vmemmap page. A vmemmap page comes either from the memblock
 * allocator or from the buddy allocator. PG_reserved being set means it came
 * from the memblock allocator and must go back through free_bootmem_page();
 * any other page is released with __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (!PageReserved(page)) {
		__free_page(page);
		return;
	}

	free_bootmem_page(page);
}
192
193/* Free a list of the vmemmap pages */
194static void free_vmemmap_page_list(struct list_head *list)
195{
196	struct page *page, *next;
197
198	list_for_each_entry_safe(page, next, list, lru)
199		free_vmemmap_page(page);
200}
201
/*
 * remap_pte() callback used by vmemmap_remap_free(): point @pte at the walk's
 * reuse page and queue the page it used to map on @walk->vmemmap_pages.
 *
 * The PTE at @walk->reuse_addr is mapped read-write; for that entry the reuse
 * page is first taken off whatever list it is on. All other PTEs are mapped
 * read-only.
 */
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operation
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	/* The page currently mapped by @pte; it is queued for freeing below. */
	struct page *page = pte_page(ptep_get(pte));
	pte_t entry;

	/* Remapping the head page requires r/w */
	if (unlikely(addr == walk->reuse_addr)) {
		pgprot = PAGE_KERNEL;
		list_del(&walk->reuse_page->lru);

		/*
		 * Makes sure that preceding stores to the page contents from
		 * vmemmap_remap_free() become visible before the set_pte_at()
		 * write.
		 */
		smp_wmb();
	}

	entry = mk_pte(walk->reuse_page, pgprot);
	list_add(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}
230
231/*
232 * How many struct page structs need to be reset. When we reuse the head
233 * struct page, the special metadata (e.g. page->flags or page->mapping)
234 * cannot copy to the tail struct page structs. The invalid value will be
235 * checked in the free_tail_page_prepare(). In order to avoid the message
236 * of "corrupted mapping in tail page". We need to reset at least 3 (one
237 * head struct page struct and two tail struct page structs) struct page
238 * structs.
239 */
240#define NR_RESET_STRUCT_PAGE		3
241
242static inline void reset_struct_pages(struct page *start)
243{
244	struct page *from = start + NR_RESET_STRUCT_PAGE;
245
246	BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
247	memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
248}
249
/*
 * remap_pte() callback used when restoring vmemmap: take one page from
 * @walk->vmemmap_pages, fill it with a copy of the page at @walk->reuse_addr,
 * reset its leading struct pages and map it read-write at @addr.
 *
 * It is a BUG if @pte does not currently map the walk's reuse page.
 */
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

	/* Consume the next page from the list. */
	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	/* Drop head-page metadata copied from the reuse page. */
	reset_struct_pages(to);

	/*
	 * Makes sure that preceding stores to the page contents become visible
	 * before the set_pte_at() write.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
272
273/**
274 * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
275 *                      backing PMDs of the directmap into PTEs
276 * @start:     start address of the vmemmap virtual address range that we want
277 *             to remap.
278 * @end:       end address of the vmemmap virtual address range that we want to
279 *             remap.
280 * @reuse:     reuse address.
281 *
282 * Return: %0 on success, negative error code otherwise.
283 */
284static int vmemmap_remap_split(unsigned long start, unsigned long end,
285			       unsigned long reuse)
286{
287	struct vmemmap_remap_walk walk = {
288		.remap_pte	= NULL,
289		.flags		= VMEMMAP_SPLIT_NO_TLB_FLUSH,
290	};
291
292	/* See the comment in the vmemmap_remap_free(). */
293	BUG_ON(start - reuse != PAGE_SIZE);
294
295	return vmemmap_remap_range(reuse, end, &walk);
296}
297
/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free vmemmap
 *			which the range are mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address; must be exactly one page below @start.
 * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is callers
 *		responsibility to free pages.
 * @flags:	modifications to vmemmap_remap_walk flags
 *
 * If the remapping walk fails after some PTEs were already walked, the range
 * walked so far is walked again with vmemmap_restore_pte() to put the pages
 * deposited on @vmemmap_pages back, and the original error is returned.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
			      unsigned long reuse,
			      struct list_head *vmemmap_pages,
			      unsigned long flags)
{
	int ret;
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= vmemmap_pages,
		.flags		= flags,
	};
	int nid = page_to_nid((struct page *)reuse);
	gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

	/*
	 * Allocate a new head vmemmap page to avoid breaking a contiguous
	 * block of struct page memory when freeing it back to page allocator
	 * in free_vmemmap_page_list(). This will allow the likely contiguous
	 * struct page backing memory to be kept contiguous and allowing for
	 * more allocations of hugepages. Fallback to the currently
	 * mapped head page in case should it fail to allocate.
	 */
	walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
	if (walk.reuse_page) {
		copy_page(page_to_virt(walk.reuse_page),
			  (void *)walk.reuse_addr);
		/*
		 * Park the new page on the list; vmemmap_remap_pte() removes
		 * it again when it remaps the PTE at @reuse.
		 */
		list_add(&walk.reuse_page->lru, vmemmap_pages);
	}

	/*
	 * In order to make remapping routine most efficient for the huge pages,
	 * the routine of vmemmap page table walking has the following rules
	 * (see more details from the vmemmap_pte_entry()):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be continuous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		/* Only the part of the range that was actually walked. */
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed.  These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= vmemmap_pages,
			.flags		= 0,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}

	return ret;
}
379
380static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
381				   struct list_head *list)
382{
383	gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
384	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
385	int nid = page_to_nid((struct page *)start);
386	struct page *page, *next;
387
388	while (nr_pages--) {
389		page = alloc_pages_node(nid, gfp_mask, 0);
390		if (!page)
391			goto out;
392		list_add(&page->lru, list);
393	}
394
395	return 0;
396out:
397	list_for_each_entry_safe(page, next, list, lru)
398		__free_page(page);
399	return -ENOMEM;
400}
401
402/**
403 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
404 *			 to the page which is from the @vmemmap_pages
405 *			 respectively.
406 * @start:	start address of the vmemmap virtual address range that we want
407 *		to remap.
408 * @end:	end address of the vmemmap virtual address range that we want to
409 *		remap.
410 * @reuse:	reuse address.
411 * @flags:	modifications to vmemmap_remap_walk flags
412 *
413 * Return: %0 on success, negative error code otherwise.
414 */
415static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
416			       unsigned long reuse, unsigned long flags)
417{
418	LIST_HEAD(vmemmap_pages);
419	struct vmemmap_remap_walk walk = {
420		.remap_pte	= vmemmap_restore_pte,
421		.reuse_addr	= reuse,
422		.vmemmap_pages	= &vmemmap_pages,
423		.flags		= flags,
424	};
425
426	/* See the comment in the vmemmap_remap_free(). */
427	BUG_ON(start - reuse != PAGE_SIZE);
428
429	if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
430		return -ENOMEM;
431
432	return vmemmap_remap_range(reuse, end, &walk);
433}
434
/*
 * Static key, initially false and exported to modules. In this file it is
 * decremented each time a folio's vmemmap is successfully restored.
 */
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

/*
 * Whether HugeTLB vmemmap optimization is requested. The default comes from
 * CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON and the variable is
 * registered as the "hugetlb_free_vmemmap" core parameter.
 */
static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
440
441static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
442					   struct folio *folio, unsigned long flags)
443{
444	int ret;
445	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
446	unsigned long vmemmap_reuse;
447
448	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
449	if (!folio_test_hugetlb_vmemmap_optimized(folio))
450		return 0;
451
452	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
453	vmemmap_reuse	= vmemmap_start;
454	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
455
456	/*
457	 * The pages which the vmemmap virtual address range [@vmemmap_start,
458	 * @vmemmap_end) are mapped to are freed to the buddy allocator, and
459	 * the range is mapped to the page which @vmemmap_reuse is mapped to.
460	 * When a HugeTLB page is freed to the buddy allocator, previously
461	 * discarded vmemmap pages must be allocated and remapping.
462	 */
463	ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
464	if (!ret) {
465		folio_clear_hugetlb_vmemmap_optimized(folio);
466		static_branch_dec(&hugetlb_optimize_vmemmap_key);
467	}
468
469	return ret;
470}
471
/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *				hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *				will be reallocated and remapped.
 * @h:		struct hstate.
 * @folio:     the folio whose vmemmap pages will be restored.
 *
 * Calls __hugetlb_vmemmap_restore_folio() with no walk flags set, so the
 * TLB flush after remapping is not skipped.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped
 * (or @folio was not optimized in the first place), negative error code
 * otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return __hugetlb_vmemmap_restore_folio(h, folio, 0);
}
486
487/**
488 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
489 * @h:			hstate.
490 * @folio_list:		list of folios.
491 * @non_hvo_folios:	Output list of folios for which vmemmap exists.
492 *
493 * Return: number of folios for which vmemmap was restored, or an error code
494 *		if an error was encountered restoring vmemmap for a folio.
495 *		Folios that have vmemmap are moved to the non_hvo_folios
496 *		list.  Processing of entries stops when the first error is
497 *		encountered. The folio that experienced the error and all
498 *		non-processed folios will remain on folio_list.
499 */
500long hugetlb_vmemmap_restore_folios(const struct hstate *h,
501					struct list_head *folio_list,
502					struct list_head *non_hvo_folios)
503{
504	struct folio *folio, *t_folio;
505	long restored = 0;
506	long ret = 0;
507
508	list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
509		if (folio_test_hugetlb_vmemmap_optimized(folio)) {
510			ret = __hugetlb_vmemmap_restore_folio(h, folio,
511							      VMEMMAP_REMAP_NO_TLB_FLUSH);
512			if (ret)
513				break;
514			restored++;
515		}
516
517		/* Add non-optimized folios to output list */
518		list_move(&folio->lru, non_hvo_folios);
519	}
520
521	if (restored)
522		flush_tlb_all();
523	if (!ret)
524		ret = restored;
525	return ret;
526}
527
528/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
529static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
530{
531	if (folio_test_hugetlb_vmemmap_optimized(folio))
532		return false;
533
534	if (!READ_ONCE(vmemmap_optimize_enabled))
535		return false;
536
537	if (!hugetlb_vmemmap_optimizable(h))
538		return false;
 
539
540	return true;
 
 
 
 
 
 
541}
542
543static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
544					    struct folio *folio,
545					    struct list_head *vmemmap_pages,
546					    unsigned long flags)
547{
548	int ret = 0;
549	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
550	unsigned long vmemmap_reuse;
551
552	VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
553	if (!vmemmap_should_optimize_folio(h, folio))
554		return ret;
555
556	static_branch_inc(&hugetlb_optimize_vmemmap_key);
557	/*
558	 * Very Subtle
559	 * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
560	 * immediately after remapping.  As a result, subsequent accesses
561	 * and modifications to struct pages associated with the hugetlb
562	 * page could be to the OLD struct pages.  Set the vmemmap optimized
563	 * flag here so that it is copied to the new head page.  This keeps
564	 * the old and new struct pages in sync.
565	 * If there is an error during optimization, we will immediately FLUSH
566	 * the TLB and clear the flag below.
567	 */
568	folio_set_hugetlb_vmemmap_optimized(folio);
 
569
570	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
571	vmemmap_reuse	= vmemmap_start;
572	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
573
 
574	/*
575	 * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
576	 * to the page which @vmemmap_reuse is mapped to.  Add pages previously
577	 * mapping the range to vmemmap_pages list so that they can be freed by
578	 * the caller.
 
 
 
579	 */
580	ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
581				 vmemmap_pages, flags);
582	if (ret) {
583		static_branch_dec(&hugetlb_optimize_vmemmap_key);
584		folio_clear_hugetlb_vmemmap_optimized(folio);
585	}
586
587	return ret;
588}
589
590/**
591 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
592 * @h:		struct hstate.
593 * @folio:     the folio whose vmemmap pages will be optimized.
594 *
595 * This function only tries to optimize @folio's vmemmap pages and does not
596 * guarantee that the optimization will succeed after it returns. The caller
597 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
598 * vmemmap pages have been optimized.
599 */
600void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
601{
602	LIST_HEAD(vmemmap_pages);
603
604	__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
605	free_vmemmap_page_list(&vmemmap_pages);
606}
607
608static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
609{
610	unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
611	unsigned long vmemmap_reuse;
612
613	if (!vmemmap_should_optimize_folio(h, folio))
614		return 0;
615
616	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
617	vmemmap_reuse	= vmemmap_start;
618	vmemmap_start	+= HUGETLB_VMEMMAP_RESERVE_SIZE;
619
620	/*
621	 * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
622	 * @vmemmap_end]
623	 */
624	return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
625}
626
627void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
628{
629	struct folio *folio;
630	LIST_HEAD(vmemmap_pages);
631
632	list_for_each_entry(folio, folio_list, lru) {
633		int ret = hugetlb_vmemmap_split_folio(h, folio);
634
635		/*
636		 * Spliting the PMD requires allocating a page, thus lets fail
637		 * early once we encounter the first OOM. No point in retrying
638		 * as it can be dynamically done on remap with the memory
639		 * we get back from the vmemmap deduplication.
640		 */
641		if (ret == -ENOMEM)
642			break;
643	}
644
645	flush_tlb_all();
646
647	list_for_each_entry(folio, folio_list, lru) {
648		int ret;
649
650		ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
651						       VMEMMAP_REMAP_NO_TLB_FLUSH);
652
653		/*
654		 * Pages to be freed may have been accumulated.  If we
655		 * encounter an ENOMEM,  free what we have and try again.
656		 * This can occur in the case that both spliting fails
657		 * halfway and head page allocation also failed. In this
658		 * case __hugetlb_vmemmap_optimize_folio() would free memory
659		 * allowing more vmemmap remaps to occur.
660		 */
661		if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
662			flush_tlb_all();
663			free_vmemmap_page_list(&vmemmap_pages);
664			INIT_LIST_HEAD(&vmemmap_pages);
665			__hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
666							 VMEMMAP_REMAP_NO_TLB_FLUSH);
667		}
668	}
669
670	flush_tlb_all();
671	free_vmemmap_page_list(&vmemmap_pages);
672}
673
674static struct ctl_table hugetlb_vmemmap_sysctls[] = {
675	{
676		.procname	= "hugetlb_optimize_vmemmap",
677		.data		= &vmemmap_optimize_enabled,
678		.maxlen		= sizeof(vmemmap_optimize_enabled),
679		.mode		= 0644,
680		.proc_handler	= proc_dobool,
681	},
682	{ }
683};
684
685static int __init hugetlb_vmemmap_init(void)
686{
687	const struct hstate *h;
688
689	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
690	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
691
692	for_each_hstate(h) {
693		if (hugetlb_vmemmap_optimizable(h)) {
694			register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
695			break;
696		}
697	}
698	return 0;
699}
700late_initcall(hugetlb_vmemmap_init);