/*
 * Copyright 2002 Andi Kleen, SuSE Labs.
 * Thanks to Ben LaHaise for precious feedback.
 */
#include <linux/highmem.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>

#include <asm/e820.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
#include <asm/pat.h>

/*
 * The current flushing context - we pass it instead of 5 arguments:
 */
struct cpa_data {
        unsigned long   *vaddr;
        pgprot_t        mask_set;
        pgprot_t        mask_clr;
        int             numpages;
        int             flags;
        unsigned long   pfn;
        unsigned        force_split : 1;
        int             curpage;
        struct page     **pages;
};
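
/*
 * For orientation, a sketch (illustrative values only, the field names are
 * the ones above) of how a cpa_data is typically filled before the page walk;
 * change_page_attr_set_clr() and __set_pages_p() below do essentially this:
 *
 *	struct cpa_data cpa = {
 *		.vaddr     = &addr,                   // virtual start address
 *		.numpages  = numpages,                // pages left to process
 *		.mask_set  = __pgprot(_PAGE_NX),      // attribute bits to set
 *		.mask_clr  = __pgprot(0),             // attribute bits to clear
 *		.flags     = 0,                       // CPA_* flags
 *	};
 */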

/*
 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
 * using cpa_lock, so that we don't allow any other CPU with stale large TLB
 * entries to change the page attributes in parallel while some other CPU is
 * splitting a large page entry and changing the attributes.
 */
static DEFINE_SPINLOCK(cpa_lock);

#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4

#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];

void update_page_count(int level, unsigned long pages)
{
        /* Protect against CPA */
        spin_lock(&pgd_lock);
        direct_pages_count[level] += pages;
        spin_unlock(&pgd_lock);
}

static void split_page_count(int level)
{
        direct_pages_count[level]--;
        direct_pages_count[level - 1] += PTRS_PER_PTE;
}

void arch_report_meminfo(struct seq_file *m)
{
        seq_printf(m, "DirectMap4k:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_4K] << 2);
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
        seq_printf(m, "DirectMap2M:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_2M] << 11);
#else
        seq_printf(m, "DirectMap4M:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_2M] << 12);
#endif
#ifdef CONFIG_X86_64
        if (direct_gbpages)
                seq_printf(m, "DirectMap1G:    %8lu kB\n",
                        direct_pages_count[PG_LEVEL_1G] << 20);
#endif
}
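
/*
 * The shifts above convert a count of mappings at a given level into kB:
 * a 4 kB page is count << 2 (4096/1024 = 4 kB), a 2 MB page is count << 11
 * (2048 kB), a 4 MB page is count << 12 (4096 kB) and a 1 GB page is
 * count << 20 (1048576 kB).
 */
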
#else
static inline void split_page_count(int level) { }
#endif

#ifdef CONFIG_X86_64

static inline unsigned long highmap_start_pfn(void)
{
        return __pa(_text) >> PAGE_SHIFT;
}

static inline unsigned long highmap_end_pfn(void)
{
        return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
}

#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
# define debug_pagealloc 1
#else
# define debug_pagealloc 0
#endif

static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
        return addr >= start && addr < end;
}

/*
 * Flushing functions
 */

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:	virtual start address
 * @size:	number of bytes to flush
 *
 * clflush is an unordered instruction which needs fencing with mfence
 * to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
        void *vend = vaddr + size - 1;

        mb();

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
                clflush(vaddr);
        /*
         * Flush any possible final partial cacheline:
         */
        clflush(vend);

        mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);
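
/*
 * Illustrative use only ("buf" and "len" are hypothetical): a caller that
 * just wrote to a range it will subsequently access with a non-write-back
 * memory type can push the dirty cachelines out with the exported helper
 * above; the mfence barriers inside it order the clflushes:
 *
 *	clflush_cache_range(buf, len);
 */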

static void __cpa_flush_all(void *arg)
{
        unsigned long cache = (unsigned long)arg;

        /*
         * Flush all to work around an erratum in early Athlons regarding
         * large page flushing.
         */
        __flush_tlb_all();

        if (cache && boot_cpu_data.x86 >= 4)
                wbinvd();
}

static void cpa_flush_all(unsigned long cache)
{
        BUG_ON(irqs_disabled());

        on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}

static void __cpa_flush_range(void *arg)
{
        /*
         * We could optimize that further and do individual per-page
         * TLB invalidates for a low number of pages. Caveat: we must
         * flush the high aliases on 64-bit as well.
         */
        __flush_tlb_all();
}

static void cpa_flush_range(unsigned long start, int numpages, int cache)
{
        unsigned int i, level;
        unsigned long addr;

        BUG_ON(irqs_disabled());
        WARN_ON(PAGE_ALIGN(start) != start);

        on_each_cpu(__cpa_flush_range, NULL, 1);

        if (!cache)
                return;

        /*
         * We only need to flush on one CPU;
         * clflush is a MESI-coherent instruction that
         * will cause all other CPUs to flush the same
         * cachelines:
         */
        for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
                pte_t *pte = lookup_address(addr, &level);

                /*
                 * Only flush present addresses:
                 */
                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
                        clflush_cache_range((void *) addr, PAGE_SIZE);
        }
}

static void cpa_flush_array(unsigned long *start, int numpages, int cache,
                            int in_flags, struct page **pages)
{
        unsigned int i, level;
        unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */

        BUG_ON(irqs_disabled());

        on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);

        if (!cache || do_wbinvd)
                return;

        /*
         * We only need to flush on one CPU;
         * clflush is a MESI-coherent instruction that
         * will cause all other CPUs to flush the same
         * cachelines:
         */
        for (i = 0; i < numpages; i++) {
                unsigned long addr;
                pte_t *pte;

                if (in_flags & CPA_PAGES_ARRAY)
                        addr = (unsigned long)page_address(pages[i]);
                else
                        addr = start[i];

                pte = lookup_address(addr, &level);

                /*
                 * Only flush present addresses:
                 */
                if (pte && (pte_val(*pte) & _PAGE_PRESENT))
                        clflush_cache_range((void *)addr, PAGE_SIZE);
        }
}
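
/*
 * Threshold note: 1024 pages * 4 kB = 4 MB, hence the "4M threshold" comment
 * above. Past that point a full cache flush with wbinvd() is assumed to be
 * cheaper than issuing clflush for every cacheline of every page.
 */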

/*
 * Certain areas of memory on x86 require very specific protection flags,
 * for example the BIOS area or kernel text. Callers don't always get this
 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 * checks and fixes these known static required protection bits.
 */
static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
                                   unsigned long pfn)
{
        pgprot_t forbidden = __pgprot(0);

        /*
         * The BIOS area between 640k and 1MB needs to be executable for
         * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
         */
#ifdef CONFIG_PCI_BIOS
        if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
                pgprot_val(forbidden) |= _PAGE_NX;
#endif

        /*
         * The kernel text needs to be executable for obvious reasons.
         * This does not cover __inittext, since that is gone later on.
         * On 64-bit we do not enforce !NX on the low mapping.
         */
        if (within(address, (unsigned long)_text, (unsigned long)_etext))
                pgprot_val(forbidden) |= _PAGE_NX;

        /*
         * The .rodata section needs to be read-only. Using the pfn
         * catches all aliases.
         */
        if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
                   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
                pgprot_val(forbidden) |= _PAGE_RW;

#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
        /*
         * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
         * the kernel text mappings for the large-page-aligned text and rodata
         * sections will always be read-only. The kernel identity mappings
         * covering the holes caused by this alignment can be anything the
         * caller asks for.
         *
         * This preserves the large page mappings for kernel text/data
         * at no extra cost.
         */
        if (kernel_set_to_readonly &&
            within(address, (unsigned long)_text,
                   (unsigned long)__end_rodata_hpage_align)) {
                unsigned int level;

                /*
                 * Don't enforce the !RW mapping for the kernel text mapping
                 * if the current mapping is already using small pages.
                 * There is no need to work hard to preserve large page
                 * mappings in this case.
                 *
                 * This also fixes the Linux Xen paravirt guest boot failure
                 * caused by unexpected read-only mappings for kernel identity
                 * mappings. In the paravirt guest case, the kernel text
                 * mapping and the kernel identity mapping share the same
                 * page-table pages, so we can't really use different
                 * protections for the kernel text and identity mappings.
                 * These shared mappings are made of small page mappings,
                 * so not enforcing the !RW mapping for small-page kernel
                 * text mappings helps the Linux Xen paravirt guest boot
                 * as well.
                 */
                if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
                        pgprot_val(forbidden) |= _PAGE_RW;
        }
#endif

        prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));

        return prot;
}

/*
 * Look up the page table entry for a virtual address. Return a pointer
 * to the entry and the level of the mapping.
 *
 * Note: We return pud and pmd either when the entry is marked large
 * or when the present bit is not set. Otherwise we would return a
 * pointer to a nonexistent mapping.
 */
pte_t *lookup_address(unsigned long address, unsigned int *level)
{
        pgd_t *pgd = pgd_offset_k(address);
        pud_t *pud;
        pmd_t *pmd;

        *level = PG_LEVEL_NONE;

        if (pgd_none(*pgd))
                return NULL;

        pud = pud_offset(pgd, address);
        if (pud_none(*pud))
                return NULL;

        *level = PG_LEVEL_1G;
        if (pud_large(*pud) || !pud_present(*pud))
                return (pte_t *)pud;

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd))
                return NULL;

        *level = PG_LEVEL_2M;
        if (pmd_large(*pmd) || !pmd_present(*pmd))
                return (pte_t *)pmd;

        *level = PG_LEVEL_4K;

        return pte_offset_kernel(pmd, address);
}
EXPORT_SYMBOL_GPL(lookup_address);
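
/*
 * Usage sketch (illustrative; "addr" is a hypothetical kernel virtual
 * address): check whether an address is currently backed by a present
 * mapping and at which level, the same pattern cpa_flush_range() and
 * cpa_flush_array() use above:
 *
 *	unsigned int level;
 *	pte_t *pte = lookup_address(addr, &level);
 *
 *	if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 *		pr_info("mapped at level %u\n", level);
 *
 * Note that for a large or non-present entry the returned pointer is
 * really a pud/pmd entry, per the comment above.
 */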

/*
 * Set the new pmd in all the pgds we know about:
 */
static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
{
        /* change init_mm */
        set_pte_atomic(kpte, pte);
#ifdef CONFIG_X86_32
        if (!SHARED_KERNEL_PMD) {
                struct page *page;

                list_for_each_entry(page, &pgd_list, lru) {
                        pgd_t *pgd;
                        pud_t *pud;
                        pmd_t *pmd;

                        pgd = (pgd_t *)page_address(page) + pgd_index(address);
                        pud = pud_offset(pgd, address);
                        pmd = pmd_offset(pud, address);
                        set_pte_atomic((pte_t *)pmd, pte);
                }
        }
#endif
}

static int
try_preserve_large_page(pte_t *kpte, unsigned long address,
                        struct cpa_data *cpa)
{
        unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
        pte_t new_pte, old_pte, *tmp;
        pgprot_t old_prot, new_prot, req_prot;
        int i, do_split = 1;
        unsigned int level;

        if (cpa->force_split)
                return 1;

        spin_lock(&pgd_lock);
        /*
         * Check for races; another CPU might have split this page
         * up already:
         */
        tmp = lookup_address(address, &level);
        if (tmp != kpte)
                goto out_unlock;

        switch (level) {
        case PG_LEVEL_2M:
                psize = PMD_PAGE_SIZE;
                pmask = PMD_PAGE_MASK;
                break;
#ifdef CONFIG_X86_64
        case PG_LEVEL_1G:
                psize = PUD_PAGE_SIZE;
                pmask = PUD_PAGE_MASK;
                break;
#endif
        default:
                do_split = -EINVAL;
                goto out_unlock;
        }

        /*
         * Calculate the number of pages that fit into this large
         * page starting at address:
         */
        nextpage_addr = (address + psize) & pmask;
        numpages = (nextpage_addr - address) >> PAGE_SHIFT;
        if (numpages < cpa->numpages)
                cpa->numpages = numpages;

        /*
         * We are safe now. Check whether the new pgprot is the same:
         */
        old_pte = *kpte;
        old_prot = new_prot = req_prot = pte_pgprot(old_pte);

        pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
        pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);

        /*
         * old_pte points to the large page base address. So we need
         * to add the offset of the virtual address:
         */
        pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
        cpa->pfn = pfn;

        new_prot = static_protections(req_prot, address, pfn);

        /*
         * We need to check the full range to see whether
         * static_protections() requires a different pgprot for one of
         * the pages in the range we try to preserve:
         */
        addr = address & pmask;
        pfn = pte_pfn(old_pte);
        for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
                pgprot_t chk_prot = static_protections(req_prot, addr, pfn);

                if (pgprot_val(chk_prot) != pgprot_val(new_prot))
                        goto out_unlock;
        }

        /*
         * If there are no changes, return. cpa->numpages has been
         * updated above:
         */
        if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
                do_split = 0;
                goto out_unlock;
        }

        /*
         * We need to change the attributes. Check whether we can
         * change the large page in one go. We request a split when
         * the address is not aligned or the number of pages is
         * smaller than the number of pages in the large page. Note
         * that we already limited the number of possible pages to
         * the number of pages in the large page.
         */
        if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
                /*
                 * The address is aligned and the number of pages
                 * covers the full page.
                 */
                new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
                __set_pmd_pte(kpte, address, new_pte);
                cpa->flags |= CPA_FLUSHTLB;
                do_split = 0;
        }

out_unlock:
        spin_unlock(&pgd_lock);

        return do_split;
}
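
/*
 * A worked example of the clamping above (numbers chosen only for
 * illustration): for a 2 MB page (psize = 0x200000) containing
 * address 0x1f0000, nextpage_addr = (0x1f0000 + 0x200000) & pmask
 * = 0x200000, so numpages = (0x200000 - 0x1f0000) >> 12 = 16 pages
 * remain until the next large page boundary, and cpa->numpages is
 * clamped to at most that many.
 */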

static int split_large_page(pte_t *kpte, unsigned long address)
{
        unsigned long pfn, pfninc = 1;
        unsigned int i, level;
        pte_t *pbase, *tmp;
        pgprot_t ref_prot;
        struct page *base;

        if (!debug_pagealloc)
                spin_unlock(&cpa_lock);
        base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
        if (!debug_pagealloc)
                spin_lock(&cpa_lock);
        if (!base)
                return -ENOMEM;

        spin_lock(&pgd_lock);
        /*
         * Check for races; another CPU might have split this page
         * up for us already:
         */
        tmp = lookup_address(address, &level);
        if (tmp != kpte)
                goto out_unlock;

        pbase = (pte_t *)page_address(base);
        paravirt_alloc_pte(&init_mm, page_to_pfn(base));
        ref_prot = pte_pgprot(pte_clrhuge(*kpte));
        /*
         * If we ever want to utilize the PAT bit, we need to
         * update this function to make sure it's converted from
         * bit 12 to bit 7 when we cross from the 2MB level to
         * the 4K level:
         */
        WARN_ON_ONCE(pgprot_val(ref_prot) & _PAGE_PAT_LARGE);

#ifdef CONFIG_X86_64
        if (level == PG_LEVEL_1G) {
                pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
                pgprot_val(ref_prot) |= _PAGE_PSE;
        }
#endif

        /*
         * Get the target pfn from the original entry:
         */
        pfn = pte_pfn(*kpte);
        for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
                set_pte(&pbase[i], pfn_pte(pfn, ref_prot));

        if (address >= (unsigned long)__va(0) &&
                address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
                split_page_count(level);

#ifdef CONFIG_X86_64
        if (address >= (unsigned long)__va(1UL<<32) &&
                address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
                split_page_count(level);
#endif

        /*
         * Install the new, split-up pagetable.
         *
         * We use the standard kernel pagetable protections for the new
         * pagetable; the actual PTEs set above control the primary
         * protection behavior:
         */
        __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));

        /*
         * Intel Atom errata AAH41 workaround.
         *
         * The real fix should be in hardware or in a microcode update, but
         * we also probabilistically try to reduce the window of having
         * a large TLB mixed with 4K TLBs while instruction fetches are
         * going on.
         */
        __flush_tlb_all();

        base = NULL;

out_unlock:
        /*
         * If we dropped out via the lookup_address check under
         * pgd_lock then stick the page back into the pool:
         */
        if (base)
                __free_page(base);
        spin_unlock(&pgd_lock);

        return 0;
}

static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
                               int primary)
{
        /*
         * Ignore all non-primary paths.
         */
        if (!primary)
                return 0;

        /*
         * Ignore the NULL PTE for kernel identity mapping, as it is expected
         * to have holes.
         * Also set numpages to '1', indicating that we processed the cpa
         * request for one virtual address page and its pfn. TBD: numpages
         * can be set based on the initial value and the level returned by
         * lookup_address().
         */
        if (within(vaddr, PAGE_OFFSET,
                   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
                cpa->numpages = 1;
                cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
                return 0;
        } else {
                WARN(1, KERN_WARNING "CPA: called for zero pte. "
                        "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
                        *cpa->vaddr);

                return -EFAULT;
        }
}

static int __change_page_attr(struct cpa_data *cpa, int primary)
{
        unsigned long address;
        int do_split, err;
        unsigned int level;
        pte_t *kpte, old_pte;

        if (cpa->flags & CPA_PAGES_ARRAY) {
                struct page *page = cpa->pages[cpa->curpage];
                if (unlikely(PageHighMem(page)))
                        return 0;
                address = (unsigned long)page_address(page);
        } else if (cpa->flags & CPA_ARRAY)
                address = cpa->vaddr[cpa->curpage];
        else
                address = *cpa->vaddr;
repeat:
        kpte = lookup_address(address, &level);
        if (!kpte)
                return __cpa_process_fault(cpa, address, primary);

        old_pte = *kpte;
        if (!pte_val(old_pte))
                return __cpa_process_fault(cpa, address, primary);

        if (level == PG_LEVEL_4K) {
                pte_t new_pte;
                pgprot_t new_prot = pte_pgprot(old_pte);
                unsigned long pfn = pte_pfn(old_pte);

                pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
                pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);

                new_prot = static_protections(new_prot, address, pfn);

                /*
                 * We need to keep the pfn from the existing PTE;
                 * after all we're only going to change its attributes,
                 * not the memory it points to.
                 */
                new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
                cpa->pfn = pfn;
                /*
                 * Do we really change anything?
                 */
                if (pte_val(old_pte) != pte_val(new_pte)) {
                        set_pte_atomic(kpte, new_pte);
                        cpa->flags |= CPA_FLUSHTLB;
                }
                cpa->numpages = 1;
                return 0;
        }

        /*
         * Check whether we can keep the large page intact
         * and just change the pte:
         */
        do_split = try_preserve_large_page(kpte, address, cpa);
        /*
         * When the range fits into the existing large page,
         * return. cpa->numpages and cpa->flags have been updated in
         * try_preserve_large_page():
         */
        if (do_split <= 0)
                return do_split;

        /*
         * We have to split the large page:
         */
        err = split_large_page(kpte, address);
        if (!err) {
                /*
                 * Do a global TLB flush after splitting the large page
                 * and before we do the actual change of the page attribute
                 * in the PTE.
                 *
                 * Without this, we violate the TLB application note, which
                 * says "The TLBs may contain both ordinary and large-page
                 *  translations for a 4-KByte range of linear addresses. This
                 *  may occur if software modifies the paging structures so that
                 *  the page size used for the address range changes. If the two
                 *  translations differ with respect to page frame or attributes
                 *  (e.g., permissions), processor behavior is undefined and may
                 *  be implementation-specific."
                 *
                 * We do this global TLB flush inside the cpa_lock, so that we
                 * don't allow any other CPU with stale TLB entries to change
                 * the page attributes, in parallel, of the just-split large
                 * page entry.
                 */
                flush_tlb_all();
                goto repeat;
        }

        return err;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);

static int cpa_process_alias(struct cpa_data *cpa)
{
        struct cpa_data alias_cpa;
        unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
        unsigned long vaddr;
        int ret;

        if (cpa->pfn >= max_pfn_mapped)
                return 0;

#ifdef CONFIG_X86_64
        if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
                return 0;
#endif
        /*
         * No need to redo when the primary call already touched the
         * direct mapping:
         */
        if (cpa->flags & CPA_PAGES_ARRAY) {
                struct page *page = cpa->pages[cpa->curpage];
                if (unlikely(PageHighMem(page)))
                        return 0;
                vaddr = (unsigned long)page_address(page);
        } else if (cpa->flags & CPA_ARRAY)
                vaddr = cpa->vaddr[cpa->curpage];
        else
                vaddr = *cpa->vaddr;

        if (!(within(vaddr, PAGE_OFFSET,
                    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {

                alias_cpa = *cpa;
                alias_cpa.vaddr = &laddr;
                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);

                ret = __change_page_attr_set_clr(&alias_cpa, 0);
                if (ret)
                        return ret;
        }

#ifdef CONFIG_X86_64
        /*
         * If the primary call didn't touch the high mapping already
         * and the physical address is inside the kernel map, we need
         * to touch the high mapped kernel as well:
         */
        if (!within(vaddr, (unsigned long)_text, _brk_end) &&
            within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
                unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
                                               __START_KERNEL_map - phys_base;
                alias_cpa = *cpa;
                alias_cpa.vaddr = &temp_cpa_vaddr;
                alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);

                /*
                 * The high mapping range is imprecise, so ignore the
                 * return value.
                 */
                __change_page_attr_set_clr(&alias_cpa, 0);
        }
#endif

        return 0;
}

static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
        int ret, numpages = cpa->numpages;

        while (numpages) {
                /*
                 * Store the remaining nr of pages for the large page
                 * preservation check.
                 */
                cpa->numpages = numpages;
                /* for array changes, we can't use large pages */
                if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
                        cpa->numpages = 1;

                if (!debug_pagealloc)
                        spin_lock(&cpa_lock);
                ret = __change_page_attr(cpa, checkalias);
                if (!debug_pagealloc)
                        spin_unlock(&cpa_lock);
                if (ret)
                        return ret;

                if (checkalias) {
                        ret = cpa_process_alias(cpa);
                        if (ret)
                                return ret;
                }

                /*
                 * Adjust the number of pages with the result of the
                 * CPA operation. Either a large page has been
                 * preserved or a single page update happened.
                 */
                BUG_ON(cpa->numpages > numpages);
                numpages -= cpa->numpages;
                if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
                        cpa->curpage++;
                else
                        *cpa->vaddr += cpa->numpages * PAGE_SIZE;

        }
        return 0;
}

static inline int cache_attr(pgprot_t attr)
{
        return pgprot_val(attr) &
                (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
}

static int change_page_attr_set_clr(unsigned long *addr, int numpages,
                                    pgprot_t mask_set, pgprot_t mask_clr,
                                    int force_split, int in_flag,
                                    struct page **pages)
{
        struct cpa_data cpa;
        int ret, cache, checkalias;
        unsigned long baddr = 0;

        /*
         * Check whether we are requested to change an unsupported
         * feature:
         */
        mask_set = canon_pgprot(mask_set);
        mask_clr = canon_pgprot(mask_clr);
        if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
                return 0;

        /* Ensure we are PAGE_SIZE aligned */
        if (in_flag & CPA_ARRAY) {
                int i;
                for (i = 0; i < numpages; i++) {
                        if (addr[i] & ~PAGE_MASK) {
                                addr[i] &= PAGE_MASK;
                                WARN_ON_ONCE(1);
                        }
                }
        } else if (!(in_flag & CPA_PAGES_ARRAY)) {
                /*
                 * An in_flag of CPA_PAGES_ARRAY implies the addresses are
                 * aligned. No need to check in that case.
                 */
                if (*addr & ~PAGE_MASK) {
                        *addr &= PAGE_MASK;
                        /*
                         * People should not be passing in unaligned addresses:
                         */
                        WARN_ON_ONCE(1);
                }
                /*
                 * Save address for cache flush. *addr is modified in the call
                 * to __change_page_attr_set_clr() below.
                 */
                baddr = *addr;
        }

        /* Must avoid aliasing mappings in the highmem code */
        kmap_flush_unused();

        vm_unmap_aliases();

        cpa.vaddr = addr;
        cpa.pages = pages;
        cpa.numpages = numpages;
        cpa.mask_set = mask_set;
        cpa.mask_clr = mask_clr;
        cpa.flags = 0;
        cpa.curpage = 0;
        cpa.force_split = force_split;

        if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
                cpa.flags |= in_flag;

        /* No alias checking for _NX bit modifications */
        checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;

        ret = __change_page_attr_set_clr(&cpa, checkalias);

        /*
         * Check whether we really changed something:
         */
        if (!(cpa.flags & CPA_FLUSHTLB))
                goto out;

        /*
         * No need to flush when we did not set any of the caching
         * attributes:
         */
        cache = cache_attr(mask_set);

        /*
         * On success we use clflush, when the CPU supports it, to
         * avoid wbinvd. If the CPU does not support clflush, and in
         * the error case, we fall back to cpa_flush_all() (which uses
         * wbinvd):
         */
        if (!ret && cpu_has_clflush) {
                if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
                        cpa_flush_array(addr, numpages, cache,
                                        cpa.flags, pages);
                } else
                        cpa_flush_range(baddr, numpages, cache);
        } else
                cpa_flush_all(cache);

out:
        return ret;
}

static inline int change_page_attr_set(unsigned long *addr, int numpages,
                                       pgprot_t mask, int array)
{
        return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
                (array ? CPA_ARRAY : 0), NULL);
}

static inline int change_page_attr_clear(unsigned long *addr, int numpages,
                                         pgprot_t mask, int array)
{
        return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
                (array ? CPA_ARRAY : 0), NULL);
}

static inline int cpa_set_pages_array(struct page **pages, int numpages,
                                       pgprot_t mask)
{
        return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
                CPA_PAGES_ARRAY, pages);
}

static inline int cpa_clear_pages_array(struct page **pages, int numpages,
                                         pgprot_t mask)
{
        return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
                CPA_PAGES_ARRAY, pages);
}

int _set_memory_uc(unsigned long addr, int numpages)
{
        /*
         * for now UC MINUS. see comments in ioremap_nocache()
         */
        return change_page_attr_set(&addr, numpages,
                                    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
}

int set_memory_uc(unsigned long addr, int numpages)
{
        int ret;

        /*
         * for now UC MINUS. see comments in ioremap_nocache()
         */
        ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                            _PAGE_CACHE_UC_MINUS, NULL);
        if (ret)
                goto out_err;

        ret = _set_memory_uc(addr, numpages);
        if (ret)
                goto out_free;

        return 0;

out_free:
        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
        return ret;
}
EXPORT_SYMBOL(set_memory_uc);
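
/*
 * Typical pairing (illustrative; "buf" and "npages" are hypothetical):
 * a caller that needs a kernel-mapped buffer to be uncached switches it
 * with set_memory_uc() and restores the default write-back type when done:
 *
 *	if (set_memory_uc((unsigned long)buf, npages))
 *		goto fail;
 *	...
 *	set_memory_wb((unsigned long)buf, npages);
 *
 * set_memory_wb() below also releases the memtype reservation taken here.
 */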

static int _set_memory_array(unsigned long *addr, int addrinarray,
                unsigned long new_type)
{
        int i, j;
        int ret;

        /*
         * for now UC MINUS. see comments in ioremap_nocache()
         */
        for (i = 0; i < addrinarray; i++) {
                ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
                                        new_type, NULL);
                if (ret)
                        goto out_free;
        }

        ret = change_page_attr_set(addr, addrinarray,
                                    __pgprot(_PAGE_CACHE_UC_MINUS), 1);

        if (!ret && new_type == _PAGE_CACHE_WC)
                ret = change_page_attr_set_clr(addr, addrinarray,
                                               __pgprot(_PAGE_CACHE_WC),
                                               __pgprot(_PAGE_CACHE_MASK),
                                               0, CPA_ARRAY, NULL);
        if (ret)
                goto out_free;

        return 0;

out_free:
        for (j = 0; j < i; j++)
                free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);

        return ret;
}

int set_memory_array_uc(unsigned long *addr, int addrinarray)
{
        return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS);
}
EXPORT_SYMBOL(set_memory_array_uc);

int set_memory_array_wc(unsigned long *addr, int addrinarray)
{
        return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC);
}
EXPORT_SYMBOL(set_memory_array_wc);

int _set_memory_wc(unsigned long addr, int numpages)
{
        int ret;
        unsigned long addr_copy = addr;

        ret = change_page_attr_set(&addr, numpages,
                                    __pgprot(_PAGE_CACHE_UC_MINUS), 0);
        if (!ret) {
                ret = change_page_attr_set_clr(&addr_copy, numpages,
                                               __pgprot(_PAGE_CACHE_WC),
                                               __pgprot(_PAGE_CACHE_MASK),
                                               0, 0, NULL);
        }
        return ret;
}

int set_memory_wc(unsigned long addr, int numpages)
{
        int ret;

        if (!pat_enabled)
                return set_memory_uc(addr, numpages);

        ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
                _PAGE_CACHE_WC, NULL);
        if (ret)
                goto out_err;

        ret = _set_memory_wc(addr, numpages);
        if (ret)
                goto out_free;

        return 0;

out_free:
        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
        return ret;
}
EXPORT_SYMBOL(set_memory_wc);

int _set_memory_wb(unsigned long addr, int numpages)
{
        return change_page_attr_clear(&addr, numpages,
                                      __pgprot(_PAGE_CACHE_MASK), 0);
}

int set_memory_wb(unsigned long addr, int numpages)
{
        int ret;

        ret = _set_memory_wb(addr, numpages);
        if (ret)
                return ret;

        free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
        return 0;
}
EXPORT_SYMBOL(set_memory_wb);

int set_memory_array_wb(unsigned long *addr, int addrinarray)
{
        int i;
        int ret;

        ret = change_page_attr_clear(addr, addrinarray,
                                      __pgprot(_PAGE_CACHE_MASK), 1);
        if (ret)
                return ret;

        for (i = 0; i < addrinarray; i++)
                free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);

        return 0;
}
EXPORT_SYMBOL(set_memory_array_wb);

int set_memory_x(unsigned long addr, int numpages)
{
        if (!(__supported_pte_mask & _PAGE_NX))
                return 0;

        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);

int set_memory_nx(unsigned long addr, int numpages)
{
        if (!(__supported_pte_mask & _PAGE_NX))
                return 0;

        return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);

int set_memory_ro(unsigned long addr, int numpages)
{
        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
EXPORT_SYMBOL_GPL(set_memory_ro);

int set_memory_rw(unsigned long addr, int numpages)
{
        return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
EXPORT_SYMBOL_GPL(set_memory_rw);
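
/*
 * Illustrative pairing ("table" is a hypothetical page-aligned structure):
 * code that wants to write-protect kernel data after initialization, and
 * briefly lift the protection for an update, can use the two helpers above:
 *
 *	set_memory_ro((unsigned long)table, 1);
 *	...
 *	set_memory_rw((unsigned long)table, 1);
 *	// update *table
 *	set_memory_ro((unsigned long)table, 1);
 */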

int set_memory_np(unsigned long addr, int numpages)
{
        return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}

int set_memory_4k(unsigned long addr, int numpages)
{
        return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
                                        __pgprot(0), 1, 0, NULL);
}

int set_pages_uc(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_pages_uc);

static int _set_pages_array(struct page **pages, int addrinarray,
                unsigned long new_type)
{
        unsigned long start;
        unsigned long end;
        int i;
        int free_idx;
        int ret;

        for (i = 0; i < addrinarray; i++) {
                if (PageHighMem(pages[i]))
                        continue;
                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                if (reserve_memtype(start, end, new_type, NULL))
                        goto err_out;
        }

        ret = cpa_set_pages_array(pages, addrinarray,
                        __pgprot(_PAGE_CACHE_UC_MINUS));
        if (!ret && new_type == _PAGE_CACHE_WC)
                ret = change_page_attr_set_clr(NULL, addrinarray,
                                               __pgprot(_PAGE_CACHE_WC),
                                               __pgprot(_PAGE_CACHE_MASK),
                                               0, CPA_PAGES_ARRAY, pages);
        if (ret)
                goto err_out;
        return 0; /* Success */
err_out:
        free_idx = i;
        for (i = 0; i < free_idx; i++) {
                if (PageHighMem(pages[i]))
                        continue;
                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                free_memtype(start, end);
        }
        return -EINVAL;
}

int set_pages_array_uc(struct page **pages, int addrinarray)
{
        return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);

int set_pages_array_wc(struct page **pages, int addrinarray)
{
        return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);

int set_pages_wb(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_pages_wb);

int set_pages_array_wb(struct page **pages, int addrinarray)
{
        int retval;
        unsigned long start;
        unsigned long end;
        int i;

        retval = cpa_clear_pages_array(pages, addrinarray,
                        __pgprot(_PAGE_CACHE_MASK));
        if (retval)
                return retval;

        for (i = 0; i < addrinarray; i++) {
                if (PageHighMem(pages[i]))
                        continue;
                start = page_to_pfn(pages[i]) << PAGE_SHIFT;
                end = start + PAGE_SIZE;
                free_memtype(start, end);
        }

        return 0;
}
EXPORT_SYMBOL(set_pages_array_wb);
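
/*
 * Illustrative pairing for the page-array interface ("pages" and "count"
 * are hypothetical): callers that remap many scattered pages, for example
 * for write-combined device buffers, set the attribute in one batch and
 * undo it the same way:
 *
 *	set_pages_array_wc(pages, count);
 *	...
 *	set_pages_array_wb(pages, count);
 */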

int set_pages_x(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_x(addr, numpages);
}
EXPORT_SYMBOL(set_pages_x);

int set_pages_nx(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_nx(addr, numpages);
}
EXPORT_SYMBOL(set_pages_nx);

int set_pages_ro(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_ro(addr, numpages);
}

int set_pages_rw(struct page *page, int numpages)
{
        unsigned long addr = (unsigned long)page_address(page);

        return set_memory_rw(addr, numpages);
}

#ifdef CONFIG_DEBUG_PAGEALLOC

static int __set_pages_p(struct page *page, int numpages)
{
        unsigned long tempaddr = (unsigned long) page_address(page);
        struct cpa_data cpa = { .vaddr = &tempaddr,
                                .numpages = numpages,
                                .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                                .mask_clr = __pgprot(0),
                                .flags = 0};

        /*
         * No alias checking needed for setting the present flag. Otherwise,
         * we may need to break large pages for 64-bit kernel text
         * mappings (this adds complexity, especially if we want to do this
         * from atomic context). Let's keep it simple!
         */
        return __change_page_attr_set_clr(&cpa, 0);
}

static int __set_pages_np(struct page *page, int numpages)
{
        unsigned long tempaddr = (unsigned long) page_address(page);
        struct cpa_data cpa = { .vaddr = &tempaddr,
                                .numpages = numpages,
                                .mask_set = __pgprot(0),
                                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                                .flags = 0};

        /*
         * No alias checking needed for clearing the present flag. Otherwise,
         * we may need to break large pages for 64-bit kernel text
         * mappings (this adds complexity, especially if we want to do this
         * from atomic context). Let's keep it simple!
         */
        return __change_page_attr_set_clr(&cpa, 0);
}

void kernel_map_pages(struct page *page, int numpages, int enable)
{
        if (PageHighMem(page))
                return;
        if (!enable) {
                debug_check_no_locks_freed(page_address(page),
                                           numpages * PAGE_SIZE);
        }

        /*
         * The return value is ignored as the calls cannot fail.
         * Large pages for identity mappings are not used at boot time
         * and hence there are no memory allocations during large page
         * splits.
         */
        if (enable)
                __set_pages_p(page, numpages);
        else
                __set_pages_np(page, numpages);

        /*
         * We should perform an IPI and flush all TLBs,
         * but that can deadlock, so flush only the current CPU:
         */
        __flush_tlb_all();
}

#ifdef CONFIG_HIBERNATION

bool kernel_page_present(struct page *page)
{
        unsigned int level;
        pte_t *pte;

        if (PageHighMem(page))
                return false;

        pte = lookup_address((unsigned long)page_address(page), &level);
        return (pte_val(*pte) & _PAGE_PRESENT);
}

#endif /* CONFIG_HIBERNATION */

#endif /* CONFIG_DEBUG_PAGEALLOC */

/*
 * The testcases use internal knowledge of the implementation that shouldn't
 * be exposed to the rest of the kernel. Include these directly here.
 */
#ifdef CONFIG_CPA_DEBUG
#include "pageattr-test.c"
#endif