   1/*
   2 * Copyright 2002 Andi Kleen, SuSE Labs.
   3 * Thanks to Ben LaHaise for precious feedback.
   4 */
   5#include <linux/highmem.h>
   6#include <linux/bootmem.h>
   7#include <linux/module.h>
   8#include <linux/sched.h>
   9#include <linux/mm.h>
  10#include <linux/interrupt.h>
  11#include <linux/seq_file.h>
  12#include <linux/debugfs.h>
  13#include <linux/pfn.h>
  14#include <linux/percpu.h>
  15#include <linux/gfp.h>
  16#include <linux/pci.h>
  17
  18#include <asm/e820.h>
  19#include <asm/processor.h>
  20#include <asm/tlbflush.h>
  21#include <asm/sections.h>
  22#include <asm/setup.h>
  23#include <asm/uaccess.h>
  24#include <asm/pgalloc.h>
  25#include <asm/proto.h>
  26#include <asm/pat.h>
  27
  28/*
  29 * The current flushing context - we pass it instead of 5 arguments:
  30 */
  31struct cpa_data {
  32	unsigned long	*vaddr;
  33	pgd_t		*pgd;
  34	pgprot_t	mask_set;
  35	pgprot_t	mask_clr;
  36	int		numpages;
  37	int		flags;
  38	unsigned long	pfn;
  39	unsigned	force_split : 1;
  40	int		curpage;
  41	struct page	**pages;
  42};
  43
  44/*
   45 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
   46 * using cpa_lock, so that no other CPU with stale large TLB entries can
   47 * change a page attribute in parallel while another CPU is splitting a
   48 * large page entry and changing the attribute.
  49 */
  50static DEFINE_SPINLOCK(cpa_lock);
  51
  52#define CPA_FLUSHTLB 1
  53#define CPA_ARRAY 2
  54#define CPA_PAGES_ARRAY 4
  55
  56#ifdef CONFIG_PROC_FS
  57static unsigned long direct_pages_count[PG_LEVEL_NUM];
  58
  59void update_page_count(int level, unsigned long pages)
  60{
  61	/* Protect against CPA */
  62	spin_lock(&pgd_lock);
  63	direct_pages_count[level] += pages;
  64	spin_unlock(&pgd_lock);
  65}
  66
  67static void split_page_count(int level)
  68{
  69	direct_pages_count[level]--;
  70	direct_pages_count[level - 1] += PTRS_PER_PTE;
  71}
  72
  73void arch_report_meminfo(struct seq_file *m)
  74{
  75	seq_printf(m, "DirectMap4k:    %8lu kB\n",
  76			direct_pages_count[PG_LEVEL_4K] << 2);
  77#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
  78	seq_printf(m, "DirectMap2M:    %8lu kB\n",
  79			direct_pages_count[PG_LEVEL_2M] << 11);
  80#else
  81	seq_printf(m, "DirectMap4M:    %8lu kB\n",
  82			direct_pages_count[PG_LEVEL_2M] << 12);
  83#endif
  84#ifdef CONFIG_X86_64
  85	if (direct_gbpages)
  86		seq_printf(m, "DirectMap1G:    %8lu kB\n",
  87			direct_pages_count[PG_LEVEL_1G] << 20);
  88#endif
  89}
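/*
 * Example of the resulting /proc/meminfo fragment on a 64-bit machine with
 * gbpages enabled (values are illustrative only):
 *
 *	DirectMap4k:      247816 kB
 *	DirectMap2M:     5984256 kB
 *	DirectMap1G:     2097152 kB
 */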
  90#else
  91static inline void split_page_count(int level) { }
  92#endif
  93
  94#ifdef CONFIG_X86_64
  95
  96static inline unsigned long highmap_start_pfn(void)
  97{
  98	return __pa_symbol(_text) >> PAGE_SHIFT;
  99}
 100
 101static inline unsigned long highmap_end_pfn(void)
 102{
 103	return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
 104}
 105
 106#endif
 107
 108#ifdef CONFIG_DEBUG_PAGEALLOC
 109# define debug_pagealloc 1
 110#else
 111# define debug_pagealloc 0
 112#endif
 113
 114static inline int
 115within(unsigned long addr, unsigned long start, unsigned long end)
 116{
 117	return addr >= start && addr < end;
 118}
 119
 120/*
 121 * Flushing functions
 122 */
 123
 124/**
 125 * clflush_cache_range - flush a cache range with clflush
 126 * @vaddr:	virtual start address
 127 * @size:	number of bytes to flush
 128 *
 129 * clflushopt is an unordered instruction which needs fencing with mfence or
 130 * sfence to avoid ordering issues.
 131 */
 132void clflush_cache_range(void *vaddr, unsigned int size)
 133{
 134	void *vend = vaddr + size - 1;
 135
 136	mb();
 137
 138	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
 139		clflushopt(vaddr);
 140	/*
 141	 * Flush any possible final partial cacheline:
 142	 */
 143	clflushopt(vend);
 144
 145	mb();
 146}
 147EXPORT_SYMBOL_GPL(clflush_cache_range);
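/*
 * Usage sketch (illustrative only, not part of the original file): write a
 * hypothetical descriptor back to memory before a device reads it directly
 * from memory. Only clflush_cache_range() is taken from this file; the
 * descriptor and its size are assumptions.
 */
#if 0
static void example_flush_descriptor(void *desc, unsigned int len)
{
	/* ... CPU fills in *desc ... */

	/* Write back and invalidate the affected cachelines, fenced. */
	clflush_cache_range(desc, len);
}
#endif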
 148
 149static void __cpa_flush_all(void *arg)
 150{
 151	unsigned long cache = (unsigned long)arg;
 152
 153	/*
  154	 * Flush all to work around errata in early Athlons regarding
 155	 * large page flushing.
 156	 */
 157	__flush_tlb_all();
 158
 159	if (cache && boot_cpu_data.x86 >= 4)
 160		wbinvd();
 161}
 162
 163static void cpa_flush_all(unsigned long cache)
 164{
 165	BUG_ON(irqs_disabled());
 166
 167	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 168}
 169
 170static void __cpa_flush_range(void *arg)
 171{
 172	/*
 173	 * We could optimize that further and do individual per page
 174	 * tlb invalidates for a low number of pages. Caveat: we must
 175	 * flush the high aliases on 64bit as well.
 176	 */
 177	__flush_tlb_all();
 178}
 179
 180static void cpa_flush_range(unsigned long start, int numpages, int cache)
 181{
 182	unsigned int i, level;
 183	unsigned long addr;
 184
 185	BUG_ON(irqs_disabled());
 186	WARN_ON(PAGE_ALIGN(start) != start);
 187
 188	on_each_cpu(__cpa_flush_range, NULL, 1);
 189
 190	if (!cache)
 191		return;
 192
 193	/*
  194	 * We only need to flush on one CPU;
 195	 * clflush is a MESI-coherent instruction that
 196	 * will cause all other CPUs to flush the same
 197	 * cachelines:
 198	 */
 199	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
 200		pte_t *pte = lookup_address(addr, &level);
 201
 202		/*
 203		 * Only flush present addresses:
 204		 */
 205		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 206			clflush_cache_range((void *) addr, PAGE_SIZE);
 207	}
 208}
 209
 210static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 211			    int in_flags, struct page **pages)
 212{
 213	unsigned int i, level;
 214	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
 215
 216	BUG_ON(irqs_disabled());
 217
 218	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
 219
 220	if (!cache || do_wbinvd)
 221		return;
 222
 223	/*
  224	 * We only need to flush on one CPU;
 225	 * clflush is a MESI-coherent instruction that
 226	 * will cause all other CPUs to flush the same
 227	 * cachelines:
 228	 */
 229	for (i = 0; i < numpages; i++) {
 230		unsigned long addr;
 231		pte_t *pte;
 232
 233		if (in_flags & CPA_PAGES_ARRAY)
 234			addr = (unsigned long)page_address(pages[i]);
 235		else
 236			addr = start[i];
 237
 238		pte = lookup_address(addr, &level);
 239
 240		/*
 241		 * Only flush present addresses:
 242		 */
 243		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 244			clflush_cache_range((void *)addr, PAGE_SIZE);
 245	}
 246}
 247
 248/*
 249 * Certain areas of memory on x86 require very specific protection flags,
 250 * for example the BIOS area or kernel text. Callers don't always get this
 251 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 252 * checks and fixes these known static required protection bits.
 253 */
 254static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 255				   unsigned long pfn)
 256{
 257	pgprot_t forbidden = __pgprot(0);
 258
 259	/*
 260	 * The BIOS area between 640k and 1Mb needs to be executable for
 261	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 262	 */
 263#ifdef CONFIG_PCI_BIOS
 264	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 265		pgprot_val(forbidden) |= _PAGE_NX;
 266#endif
 267
 268	/*
  269	 * The kernel text needs to be executable for obvious reasons.
  270	 * This does not cover __inittext since that is gone later on. On
  271	 * 64-bit we do not enforce !NX on the low mapping.
 272	 */
 273	if (within(address, (unsigned long)_text, (unsigned long)_etext))
 274		pgprot_val(forbidden) |= _PAGE_NX;
 275
 276	/*
 277	 * The .rodata section needs to be read-only. Using the pfn
 278	 * catches all aliases.
 279	 */
 280	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
 281		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
 282		pgprot_val(forbidden) |= _PAGE_RW;
 283
 284#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
 285	/*
  286	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
  287	 * the kernel text mappings for the large-page-aligned text and rodata
  288	 * sections will always be read-only. The kernel identity mappings covering
  289	 * the holes caused by this alignment can be whatever the caller asks for.
  290	 *
  291	 * This preserves the large page mappings for kernel text/data
  292	 * at no extra cost.
 293	 */
 294	if (kernel_set_to_readonly &&
 295	    within(address, (unsigned long)_text,
 296		   (unsigned long)__end_rodata_hpage_align)) {
 297		unsigned int level;
 298
 299		/*
  300		 * Don't enforce the !RW mapping for the kernel text mapping
  301		 * if the current mapping is already using small pages. There
  302		 * is no need to work hard to preserve large page mappings in
  303		 * that case.
  304		 *
  305		 * This also fixes the Linux Xen paravirt guest boot failure
  306		 * caused by unexpected read-only mappings for the kernel
  307		 * identity mapping. In the paravirt guest case, the kernel
  308		 * text mapping and the kernel identity mapping share the same
  309		 * page-table pages, so we can't really use different
  310		 * protections for the kernel text and identity mappings. Also,
  311		 * these shared mappings consist of small page mappings.
  312		 * Therefore, not enforcing the !RW mapping for the small-page
  313		 * kernel text mapping also helps the Linux Xen paravirt guest
  314		 * boot.
 315		 */
 316		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
 317			pgprot_val(forbidden) |= _PAGE_RW;
 318	}
 319#endif
 320
 321	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
 322
 323	return prot;
 324}
 325
 326/*
 327 * Lookup the page table entry for a virtual address in a specific pgd.
 328 * Return a pointer to the entry and the level of the mapping.
 329 */
 330pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
 331			     unsigned int *level)
 332{
 333	pud_t *pud;
 334	pmd_t *pmd;
 335
 336	*level = PG_LEVEL_NONE;
 337
 338	if (pgd_none(*pgd))
 339		return NULL;
 340
 341	pud = pud_offset(pgd, address);
 342	if (pud_none(*pud))
 343		return NULL;
 344
 345	*level = PG_LEVEL_1G;
 346	if (pud_large(*pud) || !pud_present(*pud))
 347		return (pte_t *)pud;
 348
 349	pmd = pmd_offset(pud, address);
 350	if (pmd_none(*pmd))
 351		return NULL;
 352
 353	*level = PG_LEVEL_2M;
 354	if (pmd_large(*pmd) || !pmd_present(*pmd))
 355		return (pte_t *)pmd;
 356
 357	*level = PG_LEVEL_4K;
 358
 359	return pte_offset_kernel(pmd, address);
 360}
 361
 362/*
 363 * Lookup the page table entry for a virtual address. Return a pointer
 364 * to the entry and the level of the mapping.
 365 *
 366 * Note: We return pud and pmd either when the entry is marked large
 367 * or when the present bit is not set. Otherwise we would return a
 368 * pointer to a nonexisting mapping.
 369 */
 370pte_t *lookup_address(unsigned long address, unsigned int *level)
 371{
  372	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
 373}
 374EXPORT_SYMBOL_GPL(lookup_address);
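/*
 * Usage sketch (illustrative only): check whether a kernel virtual address
 * is currently backed by a large (2M/1G) mapping. The helper name is an
 * assumption; lookup_address() and PG_LEVEL_* are defined above.
 */
#if 0
static bool example_is_large_mapping(unsigned long addr)
{
	unsigned int level;
	pte_t *pte = lookup_address(addr, &level);

	return pte && (pte_val(*pte) & _PAGE_PRESENT) && level > PG_LEVEL_4K;
}
#endif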
 375
 376static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
 377				  unsigned int *level)
 378{
  379	if (cpa->pgd)
 380		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
 381					       address, level);
 382
  383	return lookup_address(address, level);
 384}
 385
 386/*
 387 * This is necessary because __pa() does not work on some
 388 * kinds of memory, like vmalloc() or the alloc_remap()
 389 * areas on 32-bit NUMA systems.  The percpu areas can
 390 * end up in this kind of memory, for instance.
 391 *
 392 * This could be optimized, but it is only intended to be
  393 * used at initialization time, and keeping it
 394 * unoptimized should increase the testing coverage for
 395 * the more obscure platforms.
 396 */
 397phys_addr_t slow_virt_to_phys(void *__virt_addr)
 398{
 399	unsigned long virt_addr = (unsigned long)__virt_addr;
 400	phys_addr_t phys_addr;
 401	unsigned long offset;
 402	enum pg_level level;
 403	unsigned long psize;
 404	unsigned long pmask;
 405	pte_t *pte;
 406
 407	pte = lookup_address(virt_addr, &level);
 408	BUG_ON(!pte);
 409	psize = page_level_size(level);
 410	pmask = page_level_mask(level);
 411	offset = virt_addr & ~pmask;
 412	phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
 413	return (phys_addr | offset);
 414}
 415EXPORT_SYMBOL_GPL(slow_virt_to_phys);
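/*
 * Usage sketch (illustrative only): unlike __pa(), slow_virt_to_phys() also
 * works for vmalloc() and percpu addresses because it walks the page tables.
 * The vmalloc() buffer is an assumption and would need <linux/vmalloc.h>.
 */
#if 0
static phys_addr_t example_vmalloc_to_phys(void)
{
	void *buf = vmalloc(PAGE_SIZE);

	return buf ? slow_virt_to_phys(buf) : 0;
}
#endif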
 416
 417/*
 418 * Set the new pmd in all the pgds we know about:
 419 */
 420static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 421{
 422	/* change init_mm */
 423	set_pte_atomic(kpte, pte);
 424#ifdef CONFIG_X86_32
 425	if (!SHARED_KERNEL_PMD) {
 426		struct page *page;
 427
 428		list_for_each_entry(page, &pgd_list, lru) {
 429			pgd_t *pgd;
 430			pud_t *pud;
 431			pmd_t *pmd;
 432
 433			pgd = (pgd_t *)page_address(page) + pgd_index(address);
 434			pud = pud_offset(pgd, address);
 435			pmd = pmd_offset(pud, address);
 436			set_pte_atomic((pte_t *)pmd, pte);
 437		}
 438	}
 439#endif
 440}
 441
 442static int
 443try_preserve_large_page(pte_t *kpte, unsigned long address,
 444			struct cpa_data *cpa)
 445{
 446	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
 447	pte_t new_pte, old_pte, *tmp;
 448	pgprot_t old_prot, new_prot, req_prot;
 449	int i, do_split = 1;
 450	enum pg_level level;
 451
 452	if (cpa->force_split)
 453		return 1;
 454
 455	spin_lock(&pgd_lock);
 456	/*
 457	 * Check for races, another CPU might have split this page
 458	 * up already:
 459	 */
 460	tmp = _lookup_address_cpa(cpa, address, &level);
 461	if (tmp != kpte)
 462		goto out_unlock;
 463
 464	switch (level) {
 465	case PG_LEVEL_2M:
 466#ifdef CONFIG_X86_64
 467	case PG_LEVEL_1G:
 468#endif
 469		psize = page_level_size(level);
 470		pmask = page_level_mask(level);
 471		break;
 472	default:
 473		do_split = -EINVAL;
 474		goto out_unlock;
 475	}
 476
 477	/*
 478	 * Calculate the number of pages, which fit into this large
 479	 * page starting at address:
 480	 */
 481	nextpage_addr = (address + psize) & pmask;
 482	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
 483	if (numpages < cpa->numpages)
 484		cpa->numpages = numpages;
 485
 486	/*
 487	 * We are safe now. Check whether the new pgprot is the same:
 488	 */
 489	old_pte = *kpte;
 490	old_prot = req_prot = pte_pgprot(old_pte);
 491
 492	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
 493	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
 494
 495	/*
 496	 * Set the PSE and GLOBAL flags only if the PRESENT flag is
  497	 * set; otherwise pmd_present/pmd_huge will return true even on
  498	 * a non-present pmd. canon_pgprot() will clear _PAGE_GLOBAL
 499	 * for the ancient hardware that doesn't support it.
 500	 */
 501	if (pgprot_val(req_prot) & _PAGE_PRESENT)
 502		pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
 503	else
 504		pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
 505
 506	req_prot = canon_pgprot(req_prot);
 507
 508	/*
 509	 * old_pte points to the large page base address. So we need
 510	 * to add the offset of the virtual address:
 511	 */
 512	pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
 513	cpa->pfn = pfn;
 514
 515	new_prot = static_protections(req_prot, address, pfn);
 516
 517	/*
  518	 * We need to check the full range to see whether
  519	 * static_protections() requires a different pgprot for one of
  520	 * the pages in the range we try to preserve:
 521	 */
 522	addr = address & pmask;
 523	pfn = pte_pfn(old_pte);
 524	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
 525		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
 526
 527		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
 528			goto out_unlock;
 529	}
 530
 531	/*
  532	 * If there are no changes, return. cpa->numpages has been
  533	 * updated above:
 534	 */
 535	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
 536		do_split = 0;
 537		goto out_unlock;
 538	}
 539
 540	/*
  541	 * We need to change the attributes. Check whether we can
  542	 * change the large page in one go. We request a split when
  543	 * the address is not aligned or the number of pages is
  544	 * smaller than the number of pages in the large page. Note
  545	 * that we already limited the number of possible pages to
  546	 * the number of pages in the large page.
 547	 */
 548	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
 549		/*
 550		 * The address is aligned and the number of pages
 551		 * covers the full page.
 552		 */
 553		new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
 554		__set_pmd_pte(kpte, address, new_pte);
 555		cpa->flags |= CPA_FLUSHTLB;
 556		do_split = 0;
 557	}
 558
 559out_unlock:
 560	spin_unlock(&pgd_lock);
 561
 562	return do_split;
 563}
 564
 565static int
 566__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 567		   struct page *base)
 568{
 569	pte_t *pbase = (pte_t *)page_address(base);
 570	unsigned long pfn, pfninc = 1;
 571	unsigned int i, level;
 572	pte_t *tmp;
 573	pgprot_t ref_prot;
 574
 575	spin_lock(&pgd_lock);
 576	/*
 577	 * Check for races, another CPU might have split this page
 578	 * up for us already:
 579	 */
 580	tmp = _lookup_address_cpa(cpa, address, &level);
 581	if (tmp != kpte) {
 582		spin_unlock(&pgd_lock);
 583		return 1;
 584	}
 585
 586	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 587	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
 588	/*
 589	 * If we ever want to utilize the PAT bit, we need to
 590	 * update this function to make sure it's converted from
 591	 * bit 12 to bit 7 when we cross from the 2MB level to
 592	 * the 4K level:
 593	 */
 594	WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);
 595
 596#ifdef CONFIG_X86_64
 597	if (level == PG_LEVEL_1G) {
 598		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
 599		/*
  600		 * Set the PSE flag only if the PRESENT flag is set;
  601		 * otherwise pmd_present/pmd_huge will return true
  602		 * even on a non-present pmd.
 603		 */
 604		if (pgprot_val(ref_prot) & _PAGE_PRESENT)
 605			pgprot_val(ref_prot) |= _PAGE_PSE;
 606		else
 607			pgprot_val(ref_prot) &= ~_PAGE_PSE;
 608	}
 609#endif
 610
 611	/*
  612	 * Set the GLOBAL flag only if the PRESENT flag is set;
  613	 * otherwise pmd/pte_present will return true even on a
  614	 * non-present pmd/pte. canon_pgprot() will clear _PAGE_GLOBAL
 615	 * for the ancient hardware that doesn't support it.
 616	 */
 617	if (pgprot_val(ref_prot) & _PAGE_PRESENT)
 618		pgprot_val(ref_prot) |= _PAGE_GLOBAL;
 619	else
 620		pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
 621
 622	/*
 623	 * Get the target pfn from the original entry:
 624	 */
 625	pfn = pte_pfn(*kpte);
 626	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
 627		set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
 628
 629	if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
 630				PFN_DOWN(__pa(address)) + 1))
 631		split_page_count(level);
 632
 633	/*
 634	 * Install the new, split up pagetable.
 635	 *
 636	 * We use the standard kernel pagetable protections for the new
 637	 * pagetable protections, the actual ptes set above control the
 638	 * primary protection behavior:
 639	 */
 640	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
 641
 642	/*
  643	 * Intel Atom erratum AAH41 workaround.
 644	 *
 645	 * The real fix should be in hw or in a microcode update, but
 646	 * we also probabilistically try to reduce the window of having
 647	 * a large TLB mixed with 4K TLBs while instruction fetches are
 648	 * going on.
 649	 */
 650	__flush_tlb_all();
 651	spin_unlock(&pgd_lock);
 652
 653	return 0;
 654}
 655
 656static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 657			    unsigned long address)
 658{
 659	struct page *base;
 660
 661	if (!debug_pagealloc)
 662		spin_unlock(&cpa_lock);
 663	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
 664	if (!debug_pagealloc)
 665		spin_lock(&cpa_lock);
 666	if (!base)
 667		return -ENOMEM;
 668
 669	if (__split_large_page(cpa, kpte, address, base))
 670		__free_page(base);
 671
 672	return 0;
 673}
 674
 675static bool try_to_free_pte_page(pte_t *pte)
 676{
 677	int i;
 678
 679	for (i = 0; i < PTRS_PER_PTE; i++)
 680		if (!pte_none(pte[i]))
 681			return false;
 682
 683	free_page((unsigned long)pte);
 684	return true;
 685}
 686
 687static bool try_to_free_pmd_page(pmd_t *pmd)
 688{
 689	int i;
 690
 691	for (i = 0; i < PTRS_PER_PMD; i++)
 692		if (!pmd_none(pmd[i]))
 693			return false;
 694
 695	free_page((unsigned long)pmd);
 696	return true;
 697}
 698
 699static bool try_to_free_pud_page(pud_t *pud)
 700{
 701	int i;
 702
 703	for (i = 0; i < PTRS_PER_PUD; i++)
 704		if (!pud_none(pud[i]))
 705			return false;
 706
 707	free_page((unsigned long)pud);
 708	return true;
 709}
 710
 711static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
 712{
 713	pte_t *pte = pte_offset_kernel(pmd, start);
 714
 715	while (start < end) {
 716		set_pte(pte, __pte(0));
 717
 718		start += PAGE_SIZE;
 719		pte++;
 720	}
 721
 722	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
 723		pmd_clear(pmd);
 724		return true;
 725	}
 726	return false;
 727}
 728
 729static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
 730			      unsigned long start, unsigned long end)
 731{
 732	if (unmap_pte_range(pmd, start, end))
 733		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
 734			pud_clear(pud);
 735}
 736
 737static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 738{
 739	pmd_t *pmd = pmd_offset(pud, start);
 740
 741	/*
 742	 * Not on a 2MB page boundary?
 743	 */
 744	if (start & (PMD_SIZE - 1)) {
 745		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 746		unsigned long pre_end = min_t(unsigned long, end, next_page);
 747
 748		__unmap_pmd_range(pud, pmd, start, pre_end);
 749
 750		start = pre_end;
 751		pmd++;
 752	}
 753
 754	/*
 755	 * Try to unmap in 2M chunks.
 756	 */
 757	while (end - start >= PMD_SIZE) {
 758		if (pmd_large(*pmd))
 759			pmd_clear(pmd);
 760		else
 761			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
 762
 763		start += PMD_SIZE;
 764		pmd++;
 765	}
 766
 767	/*
 768	 * 4K leftovers?
 769	 */
 770	if (start < end)
 771		return __unmap_pmd_range(pud, pmd, start, end);
 772
 773	/*
  774	 * Try again to free the PMD page if we haven't succeeded above.
 775	 */
 776	if (!pud_none(*pud))
 777		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
 778			pud_clear(pud);
 779}
 780
 781static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 782{
 783	pud_t *pud = pud_offset(pgd, start);
 784
 785	/*
 786	 * Not on a GB page boundary?
 787	 */
 788	if (start & (PUD_SIZE - 1)) {
 789		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 790		unsigned long pre_end	= min_t(unsigned long, end, next_page);
 791
 792		unmap_pmd_range(pud, start, pre_end);
 793
 794		start = pre_end;
 795		pud++;
 796	}
 797
 798	/*
  799	 * Try to unmap in 1G chunks.
 800	 */
 801	while (end - start >= PUD_SIZE) {
 802
 803		if (pud_large(*pud))
 804			pud_clear(pud);
 805		else
 806			unmap_pmd_range(pud, start, start + PUD_SIZE);
 807
 808		start += PUD_SIZE;
 809		pud++;
 810	}
 811
 812	/*
 813	 * 2M leftovers?
 814	 */
 815	if (start < end)
 816		unmap_pmd_range(pud, start, end);
 817
 818	/*
 819	 * No need to try to free the PUD page because we'll free it in
  820	 * populate_pgd()'s error path.
 821	 */
 822}
 823
 824static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
 825{
 826	pgd_t *pgd_entry = root + pgd_index(addr);
 827
 828	unmap_pud_range(pgd_entry, addr, end);
 829
 830	if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
 831		pgd_clear(pgd_entry);
 832}
 833
 834static int alloc_pte_page(pmd_t *pmd)
 835{
 836	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 837	if (!pte)
 838		return -1;
 839
 840	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 841	return 0;
 842}
 843
 844static int alloc_pmd_page(pud_t *pud)
 845{
 846	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 847	if (!pmd)
 848		return -1;
 849
 850	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 851	return 0;
 852}
 853
 854static void populate_pte(struct cpa_data *cpa,
 855			 unsigned long start, unsigned long end,
 856			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
 857{
 858	pte_t *pte;
 859
 860	pte = pte_offset_kernel(pmd, start);
 861
 862	while (num_pages-- && start < end) {
 863
 864		/* deal with the NX bit */
 865		if (!(pgprot_val(pgprot) & _PAGE_NX))
 866			cpa->pfn &= ~_PAGE_NX;
 867
 868		set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
 869
 870		start	 += PAGE_SIZE;
 871		cpa->pfn += PAGE_SIZE;
 872		pte++;
 873	}
 874}
 875
 876static int populate_pmd(struct cpa_data *cpa,
 877			unsigned long start, unsigned long end,
 878			unsigned num_pages, pud_t *pud, pgprot_t pgprot)
 879{
 880	unsigned int cur_pages = 0;
 881	pmd_t *pmd;
 882
 883	/*
 884	 * Not on a 2M boundary?
 885	 */
 886	if (start & (PMD_SIZE - 1)) {
 887		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
 888		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 889
 890		pre_end   = min_t(unsigned long, pre_end, next_page);
 891		cur_pages = (pre_end - start) >> PAGE_SHIFT;
 892		cur_pages = min_t(unsigned int, num_pages, cur_pages);
 893
 894		/*
 895		 * Need a PTE page?
 896		 */
 897		pmd = pmd_offset(pud, start);
 898		if (pmd_none(*pmd))
 899			if (alloc_pte_page(pmd))
 900				return -1;
 901
 902		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
 903
 904		start = pre_end;
 905	}
 906
 907	/*
 908	 * We mapped them all?
 909	 */
 910	if (num_pages == cur_pages)
 911		return cur_pages;
 912
 913	while (end - start >= PMD_SIZE) {
 914
 915		/*
 916		 * We cannot use a 1G page so allocate a PMD page if needed.
 917		 */
 918		if (pud_none(*pud))
 919			if (alloc_pmd_page(pud))
 920				return -1;
 921
 922		pmd = pmd_offset(pud, start);
 923
 924		set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
 925
 926		start	  += PMD_SIZE;
 927		cpa->pfn  += PMD_SIZE;
 928		cur_pages += PMD_SIZE >> PAGE_SHIFT;
 929	}
 930
 931	/*
 932	 * Map trailing 4K pages.
 933	 */
 934	if (start < end) {
 935		pmd = pmd_offset(pud, start);
 936		if (pmd_none(*pmd))
 937			if (alloc_pte_page(pmd))
 938				return -1;
 939
 940		populate_pte(cpa, start, end, num_pages - cur_pages,
 941			     pmd, pgprot);
 942	}
 943	return num_pages;
 944}
 945
 946static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 947			pgprot_t pgprot)
 948{
 949	pud_t *pud;
 950	unsigned long end;
 951	int cur_pages = 0;
 952
 953	end = start + (cpa->numpages << PAGE_SHIFT);
 954
 955	/*
 956	 * Not on a Gb page boundary? => map everything up to it with
 957	 * smaller pages.
 958	 */
 959	if (start & (PUD_SIZE - 1)) {
 960		unsigned long pre_end;
 961		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 962
 963		pre_end   = min_t(unsigned long, end, next_page);
 964		cur_pages = (pre_end - start) >> PAGE_SHIFT;
 965		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
 966
 967		pud = pud_offset(pgd, start);
 968
 969		/*
 970		 * Need a PMD page?
 971		 */
 972		if (pud_none(*pud))
 973			if (alloc_pmd_page(pud))
 974				return -1;
 975
 976		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
 977					 pud, pgprot);
 978		if (cur_pages < 0)
 979			return cur_pages;
 980
 981		start = pre_end;
 982	}
 983
 984	/* We mapped them all? */
 985	if (cpa->numpages == cur_pages)
 986		return cur_pages;
 987
 988	pud = pud_offset(pgd, start);
 989
 990	/*
 991	 * Map everything starting from the Gb boundary, possibly with 1G pages
 992	 */
 993	while (end - start >= PUD_SIZE) {
 994		set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
 995
 996		start	  += PUD_SIZE;
 997		cpa->pfn  += PUD_SIZE;
 998		cur_pages += PUD_SIZE >> PAGE_SHIFT;
 999		pud++;
1000	}
1001
1002	/* Map trailing leftover */
1003	if (start < end) {
1004		int tmp;
1005
1006		pud = pud_offset(pgd, start);
1007		if (pud_none(*pud))
1008			if (alloc_pmd_page(pud))
1009				return -1;
1010
1011		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1012				   pud, pgprot);
1013		if (tmp < 0)
1014			return cur_pages;
1015
1016		cur_pages += tmp;
1017	}
1018	return cur_pages;
1019}
1020
1021/*
1022 * Restrictions for kernel page table do not necessarily apply when mapping in
1023 * an alternate PGD.
1024 */
1025static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1026{
1027	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1028	pud_t *pud = NULL;	/* shut up gcc */
1029	pgd_t *pgd_entry;
1030	int ret;
1031
1032	pgd_entry = cpa->pgd + pgd_index(addr);
1033
1034	/*
1035	 * Allocate a PUD page and hand it down for mapping.
1036	 */
1037	if (pgd_none(*pgd_entry)) {
1038		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1039		if (!pud)
1040			return -1;
1041
1042		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1043	}
1044
1045	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1046	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
1047
1048	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1049	if (ret < 0) {
1050		unmap_pgd_range(cpa->pgd, addr,
1051				addr + (cpa->numpages << PAGE_SHIFT));
1052		return ret;
1053	}
1054
1055	cpa->numpages = ret;
1056	return 0;
1057}
1058
1059static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1060			       int primary)
1061{
1062	if (cpa->pgd)
1063		return populate_pgd(cpa, vaddr);
1064
1065	/*
1066	 * Ignore all non primary paths.
1067	 */
1068	if (!primary)
1069		return 0;
1070
1071	/*
1072	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1073	 * to have holes.
1074	 * Also set numpages to '1' indicating that we processed cpa req for
1075	 * one virtual address page and its pfn. TBD: numpages can be set based
1076	 * on the initial value and the level returned by lookup_address().
1077	 */
1078	if (within(vaddr, PAGE_OFFSET,
1079		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1080		cpa->numpages = 1;
1081		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1082		return 0;
1083	} else {
1084		WARN(1, KERN_WARNING "CPA: called for zero pte. "
1085			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1086			*cpa->vaddr);
1087
1088		return -EFAULT;
1089	}
1090}
1091
1092static int __change_page_attr(struct cpa_data *cpa, int primary)
1093{
1094	unsigned long address;
1095	int do_split, err;
1096	unsigned int level;
1097	pte_t *kpte, old_pte;
1098
1099	if (cpa->flags & CPA_PAGES_ARRAY) {
1100		struct page *page = cpa->pages[cpa->curpage];
1101		if (unlikely(PageHighMem(page)))
1102			return 0;
1103		address = (unsigned long)page_address(page);
1104	} else if (cpa->flags & CPA_ARRAY)
1105		address = cpa->vaddr[cpa->curpage];
1106	else
1107		address = *cpa->vaddr;
1108repeat:
1109	kpte = _lookup_address_cpa(cpa, address, &level);
1110	if (!kpte)
1111		return __cpa_process_fault(cpa, address, primary);
1112
1113	old_pte = *kpte;
1114	if (!pte_val(old_pte))
1115		return __cpa_process_fault(cpa, address, primary);
1116
1117	if (level == PG_LEVEL_4K) {
1118		pte_t new_pte;
1119		pgprot_t new_prot = pte_pgprot(old_pte);
1120		unsigned long pfn = pte_pfn(old_pte);
1121
1122		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1123		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1124
1125		new_prot = static_protections(new_prot, address, pfn);
1126
1127		/*
 1128		 * Set the GLOBAL flag only if the PRESENT flag is
 1129		 * set; otherwise pte_present will return true even on
 1130		 * a non-present pte. canon_pgprot() will clear
1131		 * _PAGE_GLOBAL for the ancient hardware that doesn't
1132		 * support it.
1133		 */
1134		if (pgprot_val(new_prot) & _PAGE_PRESENT)
1135			pgprot_val(new_prot) |= _PAGE_GLOBAL;
1136		else
1137			pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
1138
1139		/*
 1140		 * We need to keep the pfn from the existing PTE;
 1141		 * after all, we're only going to change its attributes,
 1142		 * not the memory it points to.
1143		 */
1144		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
1145		cpa->pfn = pfn;
1146		/*
1147		 * Do we really change anything ?
1148		 */
1149		if (pte_val(old_pte) != pte_val(new_pte)) {
1150			set_pte_atomic(kpte, new_pte);
1151			cpa->flags |= CPA_FLUSHTLB;
1152		}
1153		cpa->numpages = 1;
1154		return 0;
1155	}
1156
1157	/*
 1158	 * Check whether we can keep the large page intact
1159	 * and just change the pte:
1160	 */
1161	do_split = try_preserve_large_page(kpte, address, cpa);
1162	/*
 1163	 * When the range fits into the existing large page,
 1164	 * return. cpa->numpages and the CPA_FLUSHTLB flag have been
 1165	 * updated in try_preserve_large_page():
1166	 */
1167	if (do_split <= 0)
1168		return do_split;
1169
1170	/*
1171	 * We have to split the large page:
1172	 */
1173	err = split_large_page(cpa, kpte, address);
1174	if (!err) {
1175		/*
 1176		 * Do a global TLB flush after splitting the large page
 1177		 * and before we do the actual change page attribute in the PTE.
 1178		 *
 1179		 * Without this, we violate the TLB application note, which says
 1180		 * "The TLBs may contain both ordinary and large-page
 1181		 *  translations for a 4-KByte range of linear addresses. This
 1182		 *  may occur if software modifies the paging structures so that
 1183		 *  the page size used for the address range changes. If the two
 1184		 *  translations differ with respect to page frame or attributes
 1185		 *  (e.g., permissions), processor behavior is undefined and may
 1186		 *  be implementation-specific."
 1187		 *
 1188		 * We do this global TLB flush inside the cpa_lock, so that we
 1189		 * don't allow any other CPU with stale TLB entries to change the
 1190		 * page attribute in parallel for an address that also falls into
 1191		 * the just-split large page entry.
 1192		 */
1193		flush_tlb_all();
1194		goto repeat;
1195	}
1196
1197	return err;
1198}
1199
1200static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
1201
1202static int cpa_process_alias(struct cpa_data *cpa)
1203{
1204	struct cpa_data alias_cpa;
1205	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1206	unsigned long vaddr;
1207	int ret;
1208
1209	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1210		return 0;
1211
1212	/*
1213	 * No need to redo, when the primary call touched the direct
1214	 * mapping already:
1215	 */
1216	if (cpa->flags & CPA_PAGES_ARRAY) {
1217		struct page *page = cpa->pages[cpa->curpage];
1218		if (unlikely(PageHighMem(page)))
1219			return 0;
1220		vaddr = (unsigned long)page_address(page);
1221	} else if (cpa->flags & CPA_ARRAY)
1222		vaddr = cpa->vaddr[cpa->curpage];
1223	else
1224		vaddr = *cpa->vaddr;
1225
1226	if (!(within(vaddr, PAGE_OFFSET,
1227		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1228
1229		alias_cpa = *cpa;
1230		alias_cpa.vaddr = &laddr;
1231		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1232
1233		ret = __change_page_attr_set_clr(&alias_cpa, 0);
1234		if (ret)
1235			return ret;
1236	}
1237
1238#ifdef CONFIG_X86_64
1239	/*
1240	 * If the primary call didn't touch the high mapping already
1241	 * and the physical address is inside the kernel map, we need
1242	 * to touch the high mapped kernel as well:
1243	 */
1244	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1245	    within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
1246		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1247					       __START_KERNEL_map - phys_base;
1248		alias_cpa = *cpa;
1249		alias_cpa.vaddr = &temp_cpa_vaddr;
1250		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1251
1252		/*
1253		 * The high mapping range is imprecise, so ignore the
1254		 * return value.
1255		 */
1256		__change_page_attr_set_clr(&alias_cpa, 0);
1257	}
1258#endif
1259
1260	return 0;
1261}
1262
1263static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1264{
1265	int ret, numpages = cpa->numpages;
1266
1267	while (numpages) {
1268		/*
1269		 * Store the remaining nr of pages for the large page
1270		 * preservation check.
1271		 */
1272		cpa->numpages = numpages;
1273		/* for array changes, we can't use large page */
1274		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1275			cpa->numpages = 1;
1276
1277		if (!debug_pagealloc)
1278			spin_lock(&cpa_lock);
1279		ret = __change_page_attr(cpa, checkalias);
1280		if (!debug_pagealloc)
1281			spin_unlock(&cpa_lock);
1282		if (ret)
1283			return ret;
1284
1285		if (checkalias) {
1286			ret = cpa_process_alias(cpa);
1287			if (ret)
1288				return ret;
1289		}
1290
1291		/*
1292		 * Adjust the number of pages with the result of the
1293		 * CPA operation. Either a large page has been
1294		 * preserved or a single page update happened.
1295		 */
1296		BUG_ON(cpa->numpages > numpages);
1297		numpages -= cpa->numpages;
1298		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
1299			cpa->curpage++;
1300		else
1301			*cpa->vaddr += cpa->numpages * PAGE_SIZE;
1302
1303	}
1304	return 0;
1305}
1306
1307static inline int cache_attr(pgprot_t attr)
1308{
1309	return pgprot_val(attr) &
1310		(_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
1311}
1312
1313static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1314				    pgprot_t mask_set, pgprot_t mask_clr,
1315				    int force_split, int in_flag,
1316				    struct page **pages)
1317{
1318	struct cpa_data cpa;
1319	int ret, cache, checkalias;
1320	unsigned long baddr = 0;
1321
1322	memset(&cpa, 0, sizeof(cpa));
1323
1324	/*
 1325	 * Check if we are requested to change an unsupported
 1326	 * feature:
1327	 */
1328	mask_set = canon_pgprot(mask_set);
1329	mask_clr = canon_pgprot(mask_clr);
1330	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
1331		return 0;
1332
1333	/* Ensure we are PAGE_SIZE aligned */
1334	if (in_flag & CPA_ARRAY) {
1335		int i;
1336		for (i = 0; i < numpages; i++) {
1337			if (addr[i] & ~PAGE_MASK) {
1338				addr[i] &= PAGE_MASK;
1339				WARN_ON_ONCE(1);
1340			}
1341		}
1342	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
1343		/*
1344		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
 1345		 * No need to check in that case.
1346		 */
1347		if (*addr & ~PAGE_MASK) {
1348			*addr &= PAGE_MASK;
1349			/*
1350			 * People should not be passing in unaligned addresses:
1351			 */
1352			WARN_ON_ONCE(1);
1353		}
1354		/*
1355		 * Save address for cache flush. *addr is modified in the call
1356		 * to __change_page_attr_set_clr() below.
1357		 */
1358		baddr = *addr;
1359	}
1360
1361	/* Must avoid aliasing mappings in the highmem code */
1362	kmap_flush_unused();
1363
1364	vm_unmap_aliases();
1365
1366	cpa.vaddr = addr;
1367	cpa.pages = pages;
1368	cpa.numpages = numpages;
1369	cpa.mask_set = mask_set;
1370	cpa.mask_clr = mask_clr;
1371	cpa.flags = 0;
1372	cpa.curpage = 0;
1373	cpa.force_split = force_split;
1374
1375	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
1376		cpa.flags |= in_flag;
1377
1378	/* No alias checking for _NX bit modifications */
1379	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1380
1381	ret = __change_page_attr_set_clr(&cpa, checkalias);
1382
1383	/*
1384	 * Check whether we really changed something:
1385	 */
1386	if (!(cpa.flags & CPA_FLUSHTLB))
1387		goto out;
1388
1389	/*
 1390	 * No need to flush when we did not set any of the caching
1391	 * attributes:
1392	 */
1393	cache = cache_attr(mask_set);
1394
1395	/*
 1396	 * On success we use CLFLUSH, when the CPU supports it, to
 1397	 * avoid the WBINVD. If the CPU does not support CLFLUSH, or in
 1398	 * the error case, we fall back to cpa_flush_all() (which uses
 1399	 * WBINVD):
1400	 */
1401	if (!ret && cpu_has_clflush) {
1402		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1403			cpa_flush_array(addr, numpages, cache,
1404					cpa.flags, pages);
1405		} else
1406			cpa_flush_range(baddr, numpages, cache);
1407	} else
1408		cpa_flush_all(cache);
1409
1410out:
1411	return ret;
1412}
1413
1414static inline int change_page_attr_set(unsigned long *addr, int numpages,
1415				       pgprot_t mask, int array)
1416{
1417	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
1418		(array ? CPA_ARRAY : 0), NULL);
1419}
1420
1421static inline int change_page_attr_clear(unsigned long *addr, int numpages,
1422					 pgprot_t mask, int array)
1423{
1424	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
1425		(array ? CPA_ARRAY : 0), NULL);
1426}
1427
1428static inline int cpa_set_pages_array(struct page **pages, int numpages,
1429				       pgprot_t mask)
1430{
1431	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
1432		CPA_PAGES_ARRAY, pages);
1433}
1434
1435static inline int cpa_clear_pages_array(struct page **pages, int numpages,
1436					 pgprot_t mask)
1437{
1438	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
1439		CPA_PAGES_ARRAY, pages);
1440}
1441
1442int _set_memory_uc(unsigned long addr, int numpages)
1443{
1444	/*
1445	 * for now UC MINUS. see comments in ioremap_nocache()
1446	 */
1447	return change_page_attr_set(&addr, numpages,
1448				    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
1449}
1450
1451int set_memory_uc(unsigned long addr, int numpages)
1452{
1453	int ret;
1454
1455	/*
1456	 * for now UC MINUS. see comments in ioremap_nocache()
1457	 */
1458	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1459			    _PAGE_CACHE_UC_MINUS, NULL);
1460	if (ret)
1461		goto out_err;
1462
1463	ret = _set_memory_uc(addr, numpages);
1464	if (ret)
1465		goto out_free;
1466
1467	return 0;
1468
1469out_free:
1470	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1471out_err:
1472	return ret;
1473}
1474EXPORT_SYMBOL(set_memory_uc);
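/*
 * Usage sketch (illustrative only): mark a kernel buffer uncached for a
 * hypothetical device and restore write-back caching before freeing it.
 * The allocation is an assumption; set_memory_uc()/set_memory_wb() are the
 * interfaces defined in this file.
 */
#if 0
static int example_uncached_buffer(void)
{
	unsigned long addr = __get_free_pages(GFP_KERNEL, 2);	/* 4 pages */
	int ret;

	if (!addr)
		return -ENOMEM;

	ret = set_memory_uc(addr, 4);
	if (ret)
		goto out_free;

	/* ... hand the buffer to the device ... */

	set_memory_wb(addr, 4);
out_free:
	free_pages(addr, 2);
	return ret;
}
#endif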
1475
1476static int _set_memory_array(unsigned long *addr, int addrinarray,
1477		unsigned long new_type)
1478{
1479	int i, j;
1480	int ret;
1481
1482	/*
1483	 * for now UC MINUS. see comments in ioremap_nocache()
1484	 */
1485	for (i = 0; i < addrinarray; i++) {
1486		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1487					new_type, NULL);
1488		if (ret)
1489			goto out_free;
1490	}
1491
1492	ret = change_page_attr_set(addr, addrinarray,
1493				    __pgprot(_PAGE_CACHE_UC_MINUS), 1);
1494
1495	if (!ret && new_type == _PAGE_CACHE_WC)
1496		ret = change_page_attr_set_clr(addr, addrinarray,
1497					       __pgprot(_PAGE_CACHE_WC),
1498					       __pgprot(_PAGE_CACHE_MASK),
1499					       0, CPA_ARRAY, NULL);
1500	if (ret)
1501		goto out_free;
1502
1503	return 0;
1504
1505out_free:
1506	for (j = 0; j < i; j++)
1507		free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
1508
1509	return ret;
1510}
1511
1512int set_memory_array_uc(unsigned long *addr, int addrinarray)
1513{
1514	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS);
1515}
1516EXPORT_SYMBOL(set_memory_array_uc);
1517
1518int set_memory_array_wc(unsigned long *addr, int addrinarray)
1519{
1520	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC);
1521}
1522EXPORT_SYMBOL(set_memory_array_wc);
1523
1524int _set_memory_wc(unsigned long addr, int numpages)
1525{
1526	int ret;
1527	unsigned long addr_copy = addr;
1528
1529	ret = change_page_attr_set(&addr, numpages,
1530				    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
1531	if (!ret) {
1532		ret = change_page_attr_set_clr(&addr_copy, numpages,
1533					       __pgprot(_PAGE_CACHE_WC),
1534					       __pgprot(_PAGE_CACHE_MASK),
1535					       0, 0, NULL);
1536	}
1537	return ret;
1538}
1539
1540int set_memory_wc(unsigned long addr, int numpages)
1541{
1542	int ret;
1543
1544	if (!pat_enabled)
1545		return set_memory_uc(addr, numpages);
1546
1547	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1548		_PAGE_CACHE_WC, NULL);
1549	if (ret)
1550		goto out_err;
1551
1552	ret = _set_memory_wc(addr, numpages);
1553	if (ret)
1554		goto out_free;
1555
1556	return 0;
1557
1558out_free:
1559	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1560out_err:
1561	return ret;
1562}
1563EXPORT_SYMBOL(set_memory_wc);
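/*
 * Usage sketch (illustrative only): request write-combining for a buffer;
 * note that when PAT is disabled, set_memory_wc() transparently falls back
 * to UC- via set_memory_uc(). The buffer is an assumption.
 */
#if 0
static int example_wc_buffer(unsigned long addr, int numpages)
{
	int ret = set_memory_wc(addr, numpages);

	if (ret)
		return ret;

	/* ... stream data into the buffer ... */

	return set_memory_wb(addr, numpages);
}
#endif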
1564
1565int _set_memory_wb(unsigned long addr, int numpages)
1566{
1567	return change_page_attr_clear(&addr, numpages,
1568				      __pgprot(_PAGE_CACHE_MASK), 0);
1569}
1570
1571int set_memory_wb(unsigned long addr, int numpages)
1572{
1573	int ret;
1574
1575	ret = _set_memory_wb(addr, numpages);
1576	if (ret)
1577		return ret;
1578
1579	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1580	return 0;
1581}
1582EXPORT_SYMBOL(set_memory_wb);
1583
1584int set_memory_array_wb(unsigned long *addr, int addrinarray)
1585{
1586	int i;
1587	int ret;
1588
1589	ret = change_page_attr_clear(addr, addrinarray,
1590				      __pgprot(_PAGE_CACHE_MASK), 1);
1591	if (ret)
1592		return ret;
1593
1594	for (i = 0; i < addrinarray; i++)
1595		free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
1596
1597	return 0;
1598}
1599EXPORT_SYMBOL(set_memory_array_wb);
1600
1601int set_memory_x(unsigned long addr, int numpages)
1602{
1603	if (!(__supported_pte_mask & _PAGE_NX))
1604		return 0;
1605
1606	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1607}
1608EXPORT_SYMBOL(set_memory_x);
1609
1610int set_memory_nx(unsigned long addr, int numpages)
1611{
1612	if (!(__supported_pte_mask & _PAGE_NX))
1613		return 0;
1614
1615	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1616}
1617EXPORT_SYMBOL(set_memory_nx);
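/*
 * Usage sketch (illustrative only): toggle executability of a page-aligned
 * code buffer, e.g. for a hypothetical trampoline. On CPUs without NX
 * support both calls are no-ops that return 0.
 */
#if 0
static int example_make_executable(unsigned long addr, int numpages)
{
	return set_memory_x(addr, numpages);
}
#endif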
1618
1619int set_memory_ro(unsigned long addr, int numpages)
1620{
1621	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1622}
1623EXPORT_SYMBOL_GPL(set_memory_ro);
1624
1625int set_memory_rw(unsigned long addr, int numpages)
1626{
1627	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1628}
1629EXPORT_SYMBOL_GPL(set_memory_rw);
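/*
 * Usage sketch (illustrative only): write-protect a page-aligned kernel
 * data structure after initialization and unprotect it for updates. The
 * table below is hypothetical.
 */
#if 0
static u8 example_table[PAGE_SIZE] __aligned(PAGE_SIZE);

static void example_seal_table(void)
{
	set_memory_ro((unsigned long)example_table, 1);
}

static void example_unseal_table(void)
{
	set_memory_rw((unsigned long)example_table, 1);
}
#endif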
1630
1631int set_memory_np(unsigned long addr, int numpages)
1632{
1633	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1634}
1635
1636int set_memory_4k(unsigned long addr, int numpages)
1637{
1638	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1639					__pgprot(0), 1, 0, NULL);
1640}
1641
1642int set_pages_uc(struct page *page, int numpages)
1643{
1644	unsigned long addr = (unsigned long)page_address(page);
1645
1646	return set_memory_uc(addr, numpages);
1647}
1648EXPORT_SYMBOL(set_pages_uc);
1649
1650static int _set_pages_array(struct page **pages, int addrinarray,
1651		unsigned long new_type)
1652{
1653	unsigned long start;
1654	unsigned long end;
1655	int i;
1656	int free_idx;
1657	int ret;
1658
1659	for (i = 0; i < addrinarray; i++) {
1660		if (PageHighMem(pages[i]))
1661			continue;
1662		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1663		end = start + PAGE_SIZE;
1664		if (reserve_memtype(start, end, new_type, NULL))
1665			goto err_out;
1666	}
1667
1668	ret = cpa_set_pages_array(pages, addrinarray,
1669			__pgprot(_PAGE_CACHE_UC_MINUS));
1670	if (!ret && new_type == _PAGE_CACHE_WC)
1671		ret = change_page_attr_set_clr(NULL, addrinarray,
1672					       __pgprot(_PAGE_CACHE_WC),
1673					       __pgprot(_PAGE_CACHE_MASK),
1674					       0, CPA_PAGES_ARRAY, pages);
1675	if (ret)
1676		goto err_out;
1677	return 0; /* Success */
1678err_out:
1679	free_idx = i;
1680	for (i = 0; i < free_idx; i++) {
1681		if (PageHighMem(pages[i]))
1682			continue;
1683		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1684		end = start + PAGE_SIZE;
1685		free_memtype(start, end);
1686	}
1687	return -EINVAL;
1688}
1689
1690int set_pages_array_uc(struct page **pages, int addrinarray)
1691{
1692	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS);
1693}
1694EXPORT_SYMBOL(set_pages_array_uc);
1695
1696int set_pages_array_wc(struct page **pages, int addrinarray)
1697{
1698	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC);
1699}
1700EXPORT_SYMBOL(set_pages_array_wc);
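/*
 * Usage sketch (illustrative only): a GPU-style driver switching an array
 * of pages to write-combining before mapping them into a device and back
 * to write-back afterwards. The page array is an assumption.
 */
#if 0
static int example_wc_page_array(struct page **pages, int count)
{
	int ret = set_pages_array_wc(pages, count);

	if (ret)
		return ret;

	/* ... map the pages into the device ... */

	return set_pages_array_wb(pages, count);
}
#endif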
1701
1702int set_pages_wb(struct page *page, int numpages)
1703{
1704	unsigned long addr = (unsigned long)page_address(page);
1705
1706	return set_memory_wb(addr, numpages);
1707}
1708EXPORT_SYMBOL(set_pages_wb);
1709
1710int set_pages_array_wb(struct page **pages, int addrinarray)
1711{
1712	int retval;
1713	unsigned long start;
1714	unsigned long end;
1715	int i;
1716
1717	retval = cpa_clear_pages_array(pages, addrinarray,
1718			__pgprot(_PAGE_CACHE_MASK));
1719	if (retval)
1720		return retval;
1721
1722	for (i = 0; i < addrinarray; i++) {
1723		if (PageHighMem(pages[i]))
1724			continue;
1725		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1726		end = start + PAGE_SIZE;
1727		free_memtype(start, end);
1728	}
1729
1730	return 0;
1731}
1732EXPORT_SYMBOL(set_pages_array_wb);
1733
1734int set_pages_x(struct page *page, int numpages)
1735{
1736	unsigned long addr = (unsigned long)page_address(page);
1737
1738	return set_memory_x(addr, numpages);
1739}
1740EXPORT_SYMBOL(set_pages_x);
1741
1742int set_pages_nx(struct page *page, int numpages)
1743{
1744	unsigned long addr = (unsigned long)page_address(page);
1745
1746	return set_memory_nx(addr, numpages);
1747}
1748EXPORT_SYMBOL(set_pages_nx);
1749
1750int set_pages_ro(struct page *page, int numpages)
1751{
1752	unsigned long addr = (unsigned long)page_address(page);
1753
1754	return set_memory_ro(addr, numpages);
1755}
1756
1757int set_pages_rw(struct page *page, int numpages)
1758{
1759	unsigned long addr = (unsigned long)page_address(page);
1760
1761	return set_memory_rw(addr, numpages);
1762}
1763
1764#ifdef CONFIG_DEBUG_PAGEALLOC
1765
1766static int __set_pages_p(struct page *page, int numpages)
1767{
1768	unsigned long tempaddr = (unsigned long) page_address(page);
1769	struct cpa_data cpa = { .vaddr = &tempaddr,
1770				.pgd = NULL,
1771				.numpages = numpages,
1772				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1773				.mask_clr = __pgprot(0),
1774				.flags = 0};
1775
1776	/*
 1777	 * No alias checking needed for setting the present flag; otherwise,
1778	 * we may need to break large pages for 64-bit kernel text
1779	 * mappings (this adds to complexity if we want to do this from
1780	 * atomic context especially). Let's keep it simple!
1781	 */
1782	return __change_page_attr_set_clr(&cpa, 0);
1783}
1784
1785static int __set_pages_np(struct page *page, int numpages)
1786{
1787	unsigned long tempaddr = (unsigned long) page_address(page);
1788	struct cpa_data cpa = { .vaddr = &tempaddr,
1789				.pgd = NULL,
1790				.numpages = numpages,
1791				.mask_set = __pgprot(0),
1792				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1793				.flags = 0};
1794
1795	/*
 1796	 * No alias checking needed for clearing the present flag; otherwise,
1797	 * we may need to break large pages for 64-bit kernel text
1798	 * mappings (this adds to complexity if we want to do this from
1799	 * atomic context especially). Let's keep it simple!
1800	 */
1801	return __change_page_attr_set_clr(&cpa, 0);
1802}
1803
1804void kernel_map_pages(struct page *page, int numpages, int enable)
1805{
1806	if (PageHighMem(page))
1807		return;
1808	if (!enable) {
1809		debug_check_no_locks_freed(page_address(page),
1810					   numpages * PAGE_SIZE);
1811	}
1812
1813	/*
 1814	 * The return value is ignored as the calls cannot fail.
 1815	 * Large pages for identity mappings are not used at boot time,
 1816	 * so no memory allocations happen during a large page split.
1817	 */
1818	if (enable)
1819		__set_pages_p(page, numpages);
1820	else
1821		__set_pages_np(page, numpages);
1822
1823	/*
 1824	 * We should perform an IPI and flush all TLBs,
 1825	 * but that can deadlock, so flush only the current CPU:
1826	 */
1827	__flush_tlb_all();
1828
1829	arch_flush_lazy_mmu_mode();
1830}
1831
1832#ifdef CONFIG_HIBERNATION
1833
1834bool kernel_page_present(struct page *page)
1835{
1836	unsigned int level;
1837	pte_t *pte;
1838
1839	if (PageHighMem(page))
1840		return false;
1841
1842	pte = lookup_address((unsigned long)page_address(page), &level);
1843	return (pte_val(*pte) & _PAGE_PRESENT);
1844}
1845
1846#endif /* CONFIG_HIBERNATION */
1847
1848#endif /* CONFIG_DEBUG_PAGEALLOC */
1849
1850int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1851			    unsigned numpages, unsigned long page_flags)
1852{
1853	int retval = -EINVAL;
1854
1855	struct cpa_data cpa = {
1856		.vaddr = &address,
1857		.pfn = pfn,
1858		.pgd = pgd,
1859		.numpages = numpages,
1860		.mask_set = __pgprot(0),
1861		.mask_clr = __pgprot(0),
1862		.flags = 0,
1863	};
1864
1865	if (!(__supported_pte_mask & _PAGE_NX))
1866		goto out;
1867
1868	if (!(page_flags & _PAGE_NX))
1869		cpa.mask_clr = __pgprot(_PAGE_NX);
1870
1871	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1872
1873	retval = __change_page_attr_set_clr(&cpa, 0);
1874	__flush_tlb_all();
1875
1876out:
1877	return retval;
1878}
1879
1880void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
1881			       unsigned numpages)
1882{
1883	unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
1884}
1885
1886/*
1887 * The testcases use internal knowledge of the implementation that shouldn't
1888 * be exposed to the rest of the kernel. Include these directly here.
1889 */
1890#ifdef CONFIG_CPA_DEBUG
1891#include "pageattr-test.c"
1892#endif