   1/*
   2 * Copyright 2002 Andi Kleen, SuSE Labs.
   3 * Thanks to Ben LaHaise for precious feedback.
   4 */
   5#include <linux/highmem.h>
   6#include <linux/bootmem.h>
   7#include <linux/sched.h>
   8#include <linux/mm.h>
   9#include <linux/interrupt.h>
  10#include <linux/seq_file.h>
  11#include <linux/debugfs.h>
  12#include <linux/pfn.h>
  13#include <linux/percpu.h>
  14#include <linux/gfp.h>
  15#include <linux/pci.h>
  16#include <linux/vmalloc.h>
  17
  18#include <asm/e820.h>
  19#include <asm/processor.h>
  20#include <asm/tlbflush.h>
  21#include <asm/sections.h>
  22#include <asm/setup.h>
  23#include <linux/uaccess.h>
  24#include <asm/pgalloc.h>
  25#include <asm/proto.h>
  26#include <asm/pat.h>
  27
  28/*
  29 * The current flushing context - we pass it instead of 5 arguments:
  30 */
  31struct cpa_data {
  32	unsigned long	*vaddr;
  33	pgd_t		*pgd;
  34	pgprot_t	mask_set;
  35	pgprot_t	mask_clr;
  36	unsigned long	numpages;
  37	int		flags;
  38	unsigned long	pfn;
  39	unsigned	force_split : 1;
  40	int		curpage;
  41	struct page	**pages;
  42};
  43
  44/*
  45 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
   46 * using cpa_lock, so that we don't allow any other CPU, with stale large TLB
   47 * entries, to change the page attribute in parallel while some other CPU is
   48 * splitting a large page entry and changing the attribute.
  49 */
  50static DEFINE_SPINLOCK(cpa_lock);
  51
  52#define CPA_FLUSHTLB 1
  53#define CPA_ARRAY 2
  54#define CPA_PAGES_ARRAY 4
  55
  56#ifdef CONFIG_PROC_FS
  57static unsigned long direct_pages_count[PG_LEVEL_NUM];
  58
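/*
 * Book-keeping for /proc/meminfo's DirectMapXX lines: track how many pages
 * of the kernel direct mapping are mapped at each page-table level.
 */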
  59void update_page_count(int level, unsigned long pages)
  60{
  61	/* Protect against CPA */
  62	spin_lock(&pgd_lock);
  63	direct_pages_count[level] += pages;
  64	spin_unlock(&pgd_lock);
  65}
  66
  67static void split_page_count(int level)
  68{
  69	if (direct_pages_count[level] == 0)
  70		return;
  71
  72	direct_pages_count[level]--;
  73	direct_pages_count[level - 1] += PTRS_PER_PTE;
  74}
  75
  76void arch_report_meminfo(struct seq_file *m)
  77{
  78	seq_printf(m, "DirectMap4k:    %8lu kB\n",
  79			direct_pages_count[PG_LEVEL_4K] << 2);
  80#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
  81	seq_printf(m, "DirectMap2M:    %8lu kB\n",
  82			direct_pages_count[PG_LEVEL_2M] << 11);
  83#else
  84	seq_printf(m, "DirectMap4M:    %8lu kB\n",
  85			direct_pages_count[PG_LEVEL_2M] << 12);
  86#endif
  87	if (direct_gbpages)
  88		seq_printf(m, "DirectMap1G:    %8lu kB\n",
  89			direct_pages_count[PG_LEVEL_1G] << 20);
  90}
  91#else
  92static inline void split_page_count(int level) { }
  93#endif
  94
  95#ifdef CONFIG_X86_64
  96
  97static inline unsigned long highmap_start_pfn(void)
  98{
  99	return __pa_symbol(_text) >> PAGE_SHIFT;
 100}
 101
 102static inline unsigned long highmap_end_pfn(void)
 103{
 104	/* Do not reference physical address outside the kernel. */
 105	return __pa_symbol(roundup(_brk_end, PMD_SIZE) - 1) >> PAGE_SHIFT;
 106}
 107
 108#endif
 109
 110static inline int
 111within(unsigned long addr, unsigned long start, unsigned long end)
 112{
 113	return addr >= start && addr < end;
 114}
 115
 116static inline int
 117within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
 118{
 119	return addr >= start && addr <= end;
 120}
 121
 122/*
 123 * Flushing functions
 124 */
 125
 126/**
 127 * clflush_cache_range - flush a cache range with clflush
 128 * @vaddr:	virtual start address
 129 * @size:	number of bytes to flush
 130 *
 131 * clflushopt is an unordered instruction which needs fencing with mfence or
 132 * sfence to avoid ordering issues.
 133 */
 134void clflush_cache_range(void *vaddr, unsigned int size)
 135{
 136	const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
 137	void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
 138	void *vend = vaddr + size;
 139
 140	if (p >= vend)
 141		return;
 142
 143	mb();
 144
 145	for (; p < vend; p += clflush_size)
 146		clflushopt(p);
 147
 148	mb();
 149}
 150EXPORT_SYMBOL_GPL(clflush_cache_range);
 151
 152static void __cpa_flush_all(void *arg)
 153{
 154	unsigned long cache = (unsigned long)arg;
 155
 156	/*
  157	 * Flush all to work around errata in early Athlons regarding
 158	 * large page flushing.
 159	 */
 160	__flush_tlb_all();
 161
 162	if (cache && boot_cpu_data.x86 >= 4)
 163		wbinvd();
 164}
 165
 166static void cpa_flush_all(unsigned long cache)
 167{
 168	BUG_ON(irqs_disabled());
 169
 170	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 171}
 172
 173static void __cpa_flush_range(void *arg)
 174{
 175	/*
 176	 * We could optimize that further and do individual per page
 177	 * tlb invalidates for a low number of pages. Caveat: we must
 178	 * flush the high aliases on 64bit as well.
 179	 */
 180	__flush_tlb_all();
 181}
 182
 183static void cpa_flush_range(unsigned long start, int numpages, int cache)
 184{
 185	unsigned int i, level;
 186	unsigned long addr;
 187
 188	BUG_ON(irqs_disabled());
 189	WARN_ON(PAGE_ALIGN(start) != start);
 190
 191	on_each_cpu(__cpa_flush_range, NULL, 1);
 192
 193	if (!cache)
 194		return;
 195
 196	/*
 197	 * We only need to flush on one CPU,
 198	 * clflush is a MESI-coherent instruction that
 199	 * will cause all other CPUs to flush the same
 200	 * cachelines:
 201	 */
 202	for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
 203		pte_t *pte = lookup_address(addr, &level);
 204
 205		/*
 206		 * Only flush present addresses:
 207		 */
 208		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 209			clflush_cache_range((void *) addr, PAGE_SIZE);
 210	}
 211}
 212
 213static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 214			    int in_flags, struct page **pages)
 215{
 216	unsigned int i, level;
 217	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
 218
 219	BUG_ON(irqs_disabled());
 220
 221	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
 222
 223	if (!cache || do_wbinvd)
 224		return;
 225
 226	/*
 227	 * We only need to flush on one CPU,
 228	 * clflush is a MESI-coherent instruction that
 229	 * will cause all other CPUs to flush the same
 230	 * cachelines:
 231	 */
 232	for (i = 0; i < numpages; i++) {
 233		unsigned long addr;
 234		pte_t *pte;
 235
 236		if (in_flags & CPA_PAGES_ARRAY)
 237			addr = (unsigned long)page_address(pages[i]);
 238		else
 239			addr = start[i];
 240
 241		pte = lookup_address(addr, &level);
 242
 243		/*
 244		 * Only flush present addresses:
 245		 */
 246		if (pte && (pte_val(*pte) & _PAGE_PRESENT))
 247			clflush_cache_range((void *)addr, PAGE_SIZE);
 248	}
 249}
 250
 251/*
 252 * Certain areas of memory on x86 require very specific protection flags,
 253 * for example the BIOS area or kernel text. Callers don't always get this
 254 * right (again, ioremap() on BIOS memory is not uncommon) so this function
 255 * checks and fixes these known static required protection bits.
 256 */
 257static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 258				   unsigned long pfn)
 259{
 260	pgprot_t forbidden = __pgprot(0);
 261
 262	/*
 263	 * The BIOS area between 640k and 1Mb needs to be executable for
 264	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
 265	 */
 266#ifdef CONFIG_PCI_BIOS
 267	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
 268		pgprot_val(forbidden) |= _PAGE_NX;
 269#endif
 270
 271	/*
 272	 * The kernel text needs to be executable for obvious reasons
  273	 * The kernel text needs to be executable for obvious reasons.
  274	 * This does not cover __inittext since that is gone later on. On
  275	 * 64bit we do not enforce !NX on the low mapping.
 276	if (within(address, (unsigned long)_text, (unsigned long)_etext))
 277		pgprot_val(forbidden) |= _PAGE_NX;
 278
 279	/*
 280	 * The .rodata section needs to be read-only. Using the pfn
 281	 * catches all aliases.
 282	 */
 283	if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
 284		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
 285		pgprot_val(forbidden) |= _PAGE_RW;
 286
 287#if defined(CONFIG_X86_64)
 288	/*
 289	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
  290	 * the kernel text mappings for the large page aligned text and rodata
  291	 * sections will always be read-only. The kernel identity mappings covering
  292	 * the holes caused by this alignment can be anything the caller asks for.
 293	 *
 294	 * This will preserve the large page mappings for kernel text/data
 295	 * at no extra cost.
 296	 */
 297	if (kernel_set_to_readonly &&
 298	    within(address, (unsigned long)_text,
 299		   (unsigned long)__end_rodata_hpage_align)) {
 300		unsigned int level;
 301
 302		/*
 303		 * Don't enforce the !RW mapping for the kernel text mapping,
 304		 * if the current mapping is already using small page mapping.
 305		 * No need to work hard to preserve large page mappings in this
 306		 * case.
 307		 *
 308		 * This also fixes the Linux Xen paravirt guest boot failure
 309		 * (because of unexpected read-only mappings for kernel identity
 310		 * mappings). In this paravirt guest case, the kernel text
 311		 * mapping and the kernel identity mapping share the same
 312		 * page-table pages. Thus we can't really use different
 313		 * protections for the kernel text and identity mappings. Also,
 314		 * these shared mappings are made of small page mappings.
  315		 * Thus this "don't enforce !RW for small page kernel text
  316		 * mappings" logic also helps the Linux Xen paravirt guest
  317		 * boot.
 318		 */
 319		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
 320			pgprot_val(forbidden) |= _PAGE_RW;
 321	}
 322#endif
 323
 324	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
 325
 326	return prot;
 327}
 328
 329/*
 330 * Lookup the page table entry for a virtual address in a specific pgd.
 331 * Return a pointer to the entry and the level of the mapping.
 332 */
 333pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
 334			     unsigned int *level)
 335{
 336	pud_t *pud;
 337	pmd_t *pmd;
 338
 339	*level = PG_LEVEL_NONE;
 340
 341	if (pgd_none(*pgd))
 342		return NULL;
 343
 344	pud = pud_offset(pgd, address);
 345	if (pud_none(*pud))
 346		return NULL;
 347
 348	*level = PG_LEVEL_1G;
 349	if (pud_large(*pud) || !pud_present(*pud))
 350		return (pte_t *)pud;
 351
 352	pmd = pmd_offset(pud, address);
 353	if (pmd_none(*pmd))
 354		return NULL;
 355
 356	*level = PG_LEVEL_2M;
 357	if (pmd_large(*pmd) || !pmd_present(*pmd))
 358		return (pte_t *)pmd;
 359
 360	*level = PG_LEVEL_4K;
 361
 362	return pte_offset_kernel(pmd, address);
 363}
 364
 365/*
 366 * Lookup the page table entry for a virtual address. Return a pointer
 367 * to the entry and the level of the mapping.
 368 *
 369 * Note: We return pud and pmd either when the entry is marked large
 370 * or when the present bit is not set. Otherwise we would return a
 371 * pointer to a nonexisting mapping.
 372 */
 373pte_t *lookup_address(unsigned long address, unsigned int *level)
 374{
  375	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
 376}
 377EXPORT_SYMBOL_GPL(lookup_address);
 378
 379static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
 380				  unsigned int *level)
 381{
  382	if (cpa->pgd)
  383		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
  384					       address, level);
  385
  386	return lookup_address(address, level);
 387}
 388
 389/*
 390 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
 391 * or NULL if not present.
 392 */
 393pmd_t *lookup_pmd_address(unsigned long address)
 394{
 395	pgd_t *pgd;
 396	pud_t *pud;
 397
 398	pgd = pgd_offset_k(address);
 399	if (pgd_none(*pgd))
 400		return NULL;
 401
 402	pud = pud_offset(pgd, address);
 403	if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
 404		return NULL;
 405
 406	return pmd_offset(pud, address);
 407}
 408
 409/*
 410 * This is necessary because __pa() does not work on some
 411 * kinds of memory, like vmalloc() or the alloc_remap()
 412 * areas on 32-bit NUMA systems.  The percpu areas can
 413 * end up in this kind of memory, for instance.
 414 *
 415 * This could be optimized, but it is only intended to be
  416 * used at initialization time, and keeping it
 417 * unoptimized should increase the testing coverage for
 418 * the more obscure platforms.
 419 */
 420phys_addr_t slow_virt_to_phys(void *__virt_addr)
 421{
 422	unsigned long virt_addr = (unsigned long)__virt_addr;
 423	phys_addr_t phys_addr;
 424	unsigned long offset;
 425	enum pg_level level;
 426	pte_t *pte;
 427
 428	pte = lookup_address(virt_addr, &level);
 429	BUG_ON(!pte);
 430
 431	/*
 432	 * pXX_pfn() returns unsigned long, which must be cast to phys_addr_t
 433	 * before being left-shifted PAGE_SHIFT bits -- this trick is to
  434	 * make the 32-bit PAE kernel work correctly.
 435	 */
 436	switch (level) {
 437	case PG_LEVEL_1G:
 438		phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
 439		offset = virt_addr & ~PUD_PAGE_MASK;
 440		break;
 441	case PG_LEVEL_2M:
 442		phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
 443		offset = virt_addr & ~PMD_PAGE_MASK;
 444		break;
 445	default:
 446		phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
 447		offset = virt_addr & ~PAGE_MASK;
 448	}
 449
 450	return (phys_addr_t)(phys_addr | offset);
 451}
 452EXPORT_SYMBOL_GPL(slow_virt_to_phys);
 453
 454/*
 455 * Set the new pmd in all the pgds we know about:
 456 */
 457static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 458{
 459	/* change init_mm */
 460	set_pte_atomic(kpte, pte);
 461#ifdef CONFIG_X86_32
 462	if (!SHARED_KERNEL_PMD) {
 463		struct page *page;
 464
 465		list_for_each_entry(page, &pgd_list, lru) {
 466			pgd_t *pgd;
 467			pud_t *pud;
 468			pmd_t *pmd;
 469
 470			pgd = (pgd_t *)page_address(page) + pgd_index(address);
 471			pud = pud_offset(pgd, address);
 472			pmd = pmd_offset(pud, address);
 473			set_pte_atomic((pte_t *)pmd, pte);
 474		}
 475	}
 476#endif
 477}
 478
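/*
 * Check whether the large page mapping @address can be kept with the
 * requested protection change instead of being split. Returns 0 if the
 * large page was preserved (updating it in place if needed), 1 if it has
 * to be split, and a negative error code for an unexpected mapping level.
 */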
 479static int
 480try_preserve_large_page(pte_t *kpte, unsigned long address,
 481			struct cpa_data *cpa)
 482{
 483	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
 484	pte_t new_pte, old_pte, *tmp;
 485	pgprot_t old_prot, new_prot, req_prot;
 486	int i, do_split = 1;
 487	enum pg_level level;
 488
 489	if (cpa->force_split)
 490		return 1;
 491
 492	spin_lock(&pgd_lock);
 493	/*
 494	 * Check for races, another CPU might have split this page
 495	 * up already:
 496	 */
 497	tmp = _lookup_address_cpa(cpa, address, &level);
 498	if (tmp != kpte)
 499		goto out_unlock;
 500
 501	switch (level) {
 502	case PG_LEVEL_2M:
 503		old_prot = pmd_pgprot(*(pmd_t *)kpte);
 504		old_pfn = pmd_pfn(*(pmd_t *)kpte);
 505		break;
 506	case PG_LEVEL_1G:
 507		old_prot = pud_pgprot(*(pud_t *)kpte);
 508		old_pfn = pud_pfn(*(pud_t *)kpte);
 509		break;
 510	default:
 511		do_split = -EINVAL;
 512		goto out_unlock;
 513	}
 514
 515	psize = page_level_size(level);
 516	pmask = page_level_mask(level);
 517
 518	/*
 519	 * Calculate the number of pages, which fit into this large
 520	 * page starting at address:
 521	 */
 522	nextpage_addr = (address + psize) & pmask;
 523	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
 524	if (numpages < cpa->numpages)
 525		cpa->numpages = numpages;
 526
 527	/*
 528	 * We are safe now. Check whether the new pgprot is the same:
 529	 * Convert protection attributes to 4k-format, as cpa->mask* are set
 530	 * up accordingly.
 531	 */
 532	old_pte = *kpte;
 533	req_prot = pgprot_large_2_4k(old_prot);
 534
 535	pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
 536	pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
 537
 538	/*
 539	 * req_prot is in format of 4k pages. It must be converted to large
 540	 * page format: the caching mode includes the PAT bit located at
 541	 * different bit positions in the two formats.
 542	 */
 543	req_prot = pgprot_4k_2_large(req_prot);
 544
 545	/*
 546	 * Set the PSE and GLOBAL flags only if the PRESENT flag is
  547	 * set, otherwise pmd_present/pmd_huge will return true even on
  548	 * a non-present pmd. The canon_pgprot will clear _PAGE_GLOBAL
 549	 * for the ancient hardware that doesn't support it.
 550	 */
 551	if (pgprot_val(req_prot) & _PAGE_PRESENT)
 552		pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
 553	else
 554		pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
 555
 556	req_prot = canon_pgprot(req_prot);
 557
 558	/*
 559	 * old_pfn points to the large page base pfn. So we need
 560	 * to add the offset of the virtual address:
 561	 */
 562	pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
 563	cpa->pfn = pfn;
 564
 565	new_prot = static_protections(req_prot, address, pfn);
 566
 567	/*
 568	 * We need to check the full range, whether
 569	 * static_protection() requires a different pgprot for one of
 570	 * the pages in the range we try to preserve:
 571	 */
 572	addr = address & pmask;
 573	pfn = old_pfn;
 574	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
 575		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
 576
 577		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
 578			goto out_unlock;
 579	}
 580
 581	/*
  582	 * If there are no changes, return. cpa->numpages has been
  583	 * updated above:
 584	 */
 585	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
 586		do_split = 0;
 587		goto out_unlock;
 588	}
 589
 590	/*
 591	 * We need to change the attributes. Check, whether we can
 592	 * change the large page in one go. We request a split, when
 593	 * the address is not aligned and the number of pages is
 594	 * smaller than the number of pages in the large page. Note
 595	 * that we limited the number of possible pages already to
 596	 * the number of pages in the large page.
 597	 */
 598	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
 599		/*
 600		 * The address is aligned and the number of pages
 601		 * covers the full page.
 602		 */
 603		new_pte = pfn_pte(old_pfn, new_prot);
 604		__set_pmd_pte(kpte, address, new_pte);
 605		cpa->flags |= CPA_FLUSHTLB;
 606		do_split = 0;
 607	}
 608
 609out_unlock:
 610	spin_unlock(&pgd_lock);
 611
 612	return do_split;
 613}
 614
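/*
 * Split the large page that maps @address into 4k entries stored in @base
 * and hook @base into the page tables. Returns 0 on success, or 1 if the
 * entry changed under us or has an unexpected level; the caller then
 * frees @base.
 */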
 615static int
 616__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 617		   struct page *base)
 618{
 619	pte_t *pbase = (pte_t *)page_address(base);
 620	unsigned long ref_pfn, pfn, pfninc = 1;
 621	unsigned int i, level;
 622	pte_t *tmp;
 623	pgprot_t ref_prot;
 624
 625	spin_lock(&pgd_lock);
 626	/*
 627	 * Check for races, another CPU might have split this page
 628	 * up for us already:
 629	 */
 630	tmp = _lookup_address_cpa(cpa, address, &level);
 631	if (tmp != kpte) {
 632		spin_unlock(&pgd_lock);
 633		return 1;
 634	}
 635
 636	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 637
 638	switch (level) {
 639	case PG_LEVEL_2M:
 640		ref_prot = pmd_pgprot(*(pmd_t *)kpte);
 641		/* clear PSE and promote PAT bit to correct position */
 642		ref_prot = pgprot_large_2_4k(ref_prot);
 643		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
 644		break;
 645
 646	case PG_LEVEL_1G:
 647		ref_prot = pud_pgprot(*(pud_t *)kpte);
 648		ref_pfn = pud_pfn(*(pud_t *)kpte);
 649		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
 650
 651		/*
  652		 * Clear the PSE flag if the PRESENT flag is not set,
  653		 * otherwise pmd_present/pmd_huge will return true
  654		 * even on a non-present pmd.
 655		 */
 656		if (!(pgprot_val(ref_prot) & _PAGE_PRESENT))
 657			pgprot_val(ref_prot) &= ~_PAGE_PSE;
 658		break;
 659
 660	default:
 661		spin_unlock(&pgd_lock);
 662		return 1;
 663	}
 664
 665	/*
  666	 * Set the GLOBAL flag only if the PRESENT flag is set,
  667	 * otherwise pmd/pte_present will return true even on a
  668	 * non-present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
 669	 * for the ancient hardware that doesn't support it.
 670	 */
 671	if (pgprot_val(ref_prot) & _PAGE_PRESENT)
 672		pgprot_val(ref_prot) |= _PAGE_GLOBAL;
 673	else
 674		pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
 675
 676	/*
 677	 * Get the target pfn from the original entry:
 678	 */
 679	pfn = ref_pfn;
 680	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
 681		set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
 682
 683	if (virt_addr_valid(address)) {
 684		unsigned long pfn = PFN_DOWN(__pa(address));
 685
 686		if (pfn_range_is_mapped(pfn, pfn + 1))
 687			split_page_count(level);
 688	}
 689
 690	/*
 691	 * Install the new, split up pagetable.
 692	 *
 693	 * We use the standard kernel pagetable protections for the new
 694	 * pagetable protections, the actual ptes set above control the
 695	 * primary protection behavior:
 696	 */
 697	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
 698
 699	/*
 700	 * Intel Atom errata AAH41 workaround.
 701	 *
 702	 * The real fix should be in hw or in a microcode update, but
 703	 * we also probabilistically try to reduce the window of having
 704	 * a large TLB mixed with 4K TLBs while instruction fetches are
 705	 * going on.
 706	 */
 707	__flush_tlb_all();
 708	spin_unlock(&pgd_lock);
 709
 710	return 0;
 711}
 712
 713static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 714			    unsigned long address)
 715{
 716	struct page *base;
 717
 718	if (!debug_pagealloc_enabled())
 719		spin_unlock(&cpa_lock);
 720	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
 721	if (!debug_pagealloc_enabled())
 722		spin_lock(&cpa_lock);
 723	if (!base)
 724		return -ENOMEM;
 725
 726	if (__split_large_page(cpa, kpte, address, base))
 727		__free_page(base);
 728
 729	return 0;
 730}
 731
 732static bool try_to_free_pte_page(pte_t *pte)
 733{
 734	int i;
 735
 736	for (i = 0; i < PTRS_PER_PTE; i++)
 737		if (!pte_none(pte[i]))
 738			return false;
 739
 740	free_page((unsigned long)pte);
 741	return true;
 742}
 743
 744static bool try_to_free_pmd_page(pmd_t *pmd)
 745{
 746	int i;
 747
 748	for (i = 0; i < PTRS_PER_PMD; i++)
 749		if (!pmd_none(pmd[i]))
 750			return false;
 751
 752	free_page((unsigned long)pmd);
 753	return true;
 754}
 755
 756static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
 757{
 758	pte_t *pte = pte_offset_kernel(pmd, start);
 759
 760	while (start < end) {
 761		set_pte(pte, __pte(0));
 762
 763		start += PAGE_SIZE;
 764		pte++;
 765	}
 766
 767	if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
 768		pmd_clear(pmd);
 769		return true;
 770	}
 771	return false;
 772}
 773
 774static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
 775			      unsigned long start, unsigned long end)
 776{
 777	if (unmap_pte_range(pmd, start, end))
 778		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
 779			pud_clear(pud);
 780}
 781
 782static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 783{
 784	pmd_t *pmd = pmd_offset(pud, start);
 785
 786	/*
 787	 * Not on a 2MB page boundary?
 788	 */
 789	if (start & (PMD_SIZE - 1)) {
 790		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 791		unsigned long pre_end = min_t(unsigned long, end, next_page);
 792
 793		__unmap_pmd_range(pud, pmd, start, pre_end);
 794
 795		start = pre_end;
 796		pmd++;
 797	}
 798
 799	/*
 800	 * Try to unmap in 2M chunks.
 801	 */
 802	while (end - start >= PMD_SIZE) {
 803		if (pmd_large(*pmd))
 804			pmd_clear(pmd);
 805		else
 806			__unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
 807
 808		start += PMD_SIZE;
 809		pmd++;
 810	}
 811
 812	/*
 813	 * 4K leftovers?
 814	 */
 815	if (start < end)
 816		return __unmap_pmd_range(pud, pmd, start, end);
 817
 818	/*
  819	 * Try again to free the PMD page if we haven't succeeded above.
 820	 */
 821	if (!pud_none(*pud))
 822		if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
 823			pud_clear(pud);
 824}
 825
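/*
 * Tear down the mappings in [start, end) below @pgd, freeing page-table
 * pages that become completely empty. In this file it is only used on
 * populate_pgd()'s error path to undo a partially populated range.
 */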
 826static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 827{
 828	pud_t *pud = pud_offset(pgd, start);
 829
 830	/*
 831	 * Not on a GB page boundary?
 832	 */
 833	if (start & (PUD_SIZE - 1)) {
 834		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
 835		unsigned long pre_end	= min_t(unsigned long, end, next_page);
 836
 837		unmap_pmd_range(pud, start, pre_end);
 838
 839		start = pre_end;
 840		pud++;
 841	}
 842
 843	/*
  844	 * Try to unmap in 1G chunks.
 845	 */
 846	while (end - start >= PUD_SIZE) {
 847
 848		if (pud_large(*pud))
 849			pud_clear(pud);
 850		else
 851			unmap_pmd_range(pud, start, start + PUD_SIZE);
 852
 853		start += PUD_SIZE;
 854		pud++;
 855	}
 856
 857	/*
 858	 * 2M leftovers?
 859	 */
 860	if (start < end)
 861		unmap_pmd_range(pud, start, end);
 862
 863	/*
 864	 * No need to try to free the PUD page because we'll free it in
 865	 * populate_pgd's error path
 866	 */
 867}
 868
 869static int alloc_pte_page(pmd_t *pmd)
 870{
 871	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 872	if (!pte)
 873		return -1;
 874
 875	set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
 876	return 0;
 877}
 878
 879static int alloc_pmd_page(pud_t *pud)
 880{
 881	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
 882	if (!pmd)
 883		return -1;
 884
 885	set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 886	return 0;
 887}
 888
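/*
 * Install up to @num_pages 4k PTEs starting at @start, mapping consecutive
 * pfns from cpa->pfn with the given protections.
 */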
 889static void populate_pte(struct cpa_data *cpa,
 890			 unsigned long start, unsigned long end,
 891			 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
 892{
 893	pte_t *pte;
 894
 895	pte = pte_offset_kernel(pmd, start);
 896
 897	/*
  898	 * Set the GLOBAL flag only if the PRESENT flag is
  899	 * set, otherwise pte_present will return true even on
  900	 * a non-present pte. The canon_pgprot will clear
 901	 * _PAGE_GLOBAL for the ancient hardware that doesn't
 902	 * support it.
 903	 */
 904	if (pgprot_val(pgprot) & _PAGE_PRESENT)
 905		pgprot_val(pgprot) |= _PAGE_GLOBAL;
 906	else
 907		pgprot_val(pgprot) &= ~_PAGE_GLOBAL;
 908
 909	pgprot = canon_pgprot(pgprot);
 910
 911	while (num_pages-- && start < end) {
 912		set_pte(pte, pfn_pte(cpa->pfn, pgprot));
 913
 914		start	 += PAGE_SIZE;
 915		cpa->pfn++;
 916		pte++;
 917	}
 918}
 919
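/*
 * Map [start, end) under @pud, using 2M pages wherever the alignment
 * allows and 4k PTEs for the unaligned head and tail. Returns the number
 * of pages handled, or -1 if a page-table page could not be allocated.
 */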
 920static long populate_pmd(struct cpa_data *cpa,
 921			 unsigned long start, unsigned long end,
 922			 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
 923{
 924	long cur_pages = 0;
 925	pmd_t *pmd;
 926	pgprot_t pmd_pgprot;
 927
 928	/*
 929	 * Not on a 2M boundary?
 930	 */
 931	if (start & (PMD_SIZE - 1)) {
 932		unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
 933		unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
 934
 935		pre_end   = min_t(unsigned long, pre_end, next_page);
 936		cur_pages = (pre_end - start) >> PAGE_SHIFT;
 937		cur_pages = min_t(unsigned int, num_pages, cur_pages);
 938
 939		/*
 940		 * Need a PTE page?
 941		 */
 942		pmd = pmd_offset(pud, start);
 943		if (pmd_none(*pmd))
 944			if (alloc_pte_page(pmd))
 945				return -1;
 946
 947		populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
 948
 949		start = pre_end;
 950	}
 951
 952	/*
 953	 * We mapped them all?
 954	 */
 955	if (num_pages == cur_pages)
 956		return cur_pages;
 957
 958	pmd_pgprot = pgprot_4k_2_large(pgprot);
 959
 960	while (end - start >= PMD_SIZE) {
 961
 962		/*
 963		 * We cannot use a 1G page so allocate a PMD page if needed.
 964		 */
 965		if (pud_none(*pud))
 966			if (alloc_pmd_page(pud))
 967				return -1;
 968
 969		pmd = pmd_offset(pud, start);
 970
 971		set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
 972				   massage_pgprot(pmd_pgprot)));
 973
 974		start	  += PMD_SIZE;
 975		cpa->pfn  += PMD_SIZE >> PAGE_SHIFT;
 976		cur_pages += PMD_SIZE >> PAGE_SHIFT;
 977	}
 978
 979	/*
 980	 * Map trailing 4K pages.
 981	 */
 982	if (start < end) {
 983		pmd = pmd_offset(pud, start);
 984		if (pmd_none(*pmd))
 985			if (alloc_pte_page(pmd))
 986				return -1;
 987
 988		populate_pte(cpa, start, end, num_pages - cur_pages,
 989			     pmd, pgprot);
 990	}
 991	return num_pages;
 992}
 993
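/*
 * Map cpa->numpages pages starting at @start under @pgd, preferring 1G
 * pages when the CPU supports them and the alignment allows, and falling
 * back to 2M/4k mappings otherwise. Returns the number of pages mapped,
 * or -1 on an early page-table allocation failure.
 */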
 994static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
 995			 pgprot_t pgprot)
 996{
 997	pud_t *pud;
 998	unsigned long end;
 999	long cur_pages = 0;
1000	pgprot_t pud_pgprot;
1001
1002	end = start + (cpa->numpages << PAGE_SHIFT);
1003
1004	/*
1005	 * Not on a Gb page boundary? => map everything up to it with
1006	 * smaller pages.
1007	 */
1008	if (start & (PUD_SIZE - 1)) {
1009		unsigned long pre_end;
1010		unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
1011
1012		pre_end   = min_t(unsigned long, end, next_page);
1013		cur_pages = (pre_end - start) >> PAGE_SHIFT;
1014		cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
1015
1016		pud = pud_offset(pgd, start);
1017
1018		/*
1019		 * Need a PMD page?
1020		 */
1021		if (pud_none(*pud))
1022			if (alloc_pmd_page(pud))
1023				return -1;
1024
1025		cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
1026					 pud, pgprot);
1027		if (cur_pages < 0)
1028			return cur_pages;
1029
1030		start = pre_end;
1031	}
1032
1033	/* We mapped them all? */
1034	if (cpa->numpages == cur_pages)
1035		return cur_pages;
1036
1037	pud = pud_offset(pgd, start);
1038	pud_pgprot = pgprot_4k_2_large(pgprot);
1039
1040	/*
1041	 * Map everything starting from the Gb boundary, possibly with 1G pages
1042	 */
1043	while (boot_cpu_has(X86_FEATURE_GBPAGES) && end - start >= PUD_SIZE) {
1044		set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
1045				   massage_pgprot(pud_pgprot)));
1046
1047		start	  += PUD_SIZE;
1048		cpa->pfn  += PUD_SIZE >> PAGE_SHIFT;
1049		cur_pages += PUD_SIZE >> PAGE_SHIFT;
1050		pud++;
1051	}
1052
1053	/* Map trailing leftover */
1054	if (start < end) {
1055		long tmp;
1056
1057		pud = pud_offset(pgd, start);
1058		if (pud_none(*pud))
1059			if (alloc_pmd_page(pud))
1060				return -1;
1061
1062		tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1063				   pud, pgprot);
1064		if (tmp < 0)
1065			return cur_pages;
1066
1067		cur_pages += tmp;
1068	}
1069	return cur_pages;
1070}
1071
1072/*
1073 * Restrictions for kernel page table do not necessarily apply when mapping in
1074 * an alternate PGD.
1075 */
1076static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1077{
1078	pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1079	pud_t *pud = NULL;	/* shut up gcc */
1080	pgd_t *pgd_entry;
1081	long ret;
1082
1083	pgd_entry = cpa->pgd + pgd_index(addr);
1084
1085	/*
1086	 * Allocate a PUD page and hand it down for mapping.
1087	 */
1088	if (pgd_none(*pgd_entry)) {
1089		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1090		if (!pud)
1091			return -1;
1092
1093		set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1094	}
1095
1096	pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1097	pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
1098
1099	ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1100	if (ret < 0) {
1101		/*
1102		 * Leave the PUD page in place in case some other CPU or thread
1103		 * already found it, but remove any useless entries we just
1104		 * added to it.
1105		 */
1106		unmap_pud_range(pgd_entry, addr,
1107				addr + (cpa->numpages << PAGE_SHIFT));
1108		return ret;
1109	}
1110
1111	cpa->numpages = ret;
1112	return 0;
1113}
1114
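/*
 * Handle the case where no (present) PTE was found for the address. With
 * an alternate PGD this is where the mapping gets populated; for the
 * kernel identity mapping a hole is tolerated, anything else is reported
 * and fails with -EFAULT.
 */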
1115static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1116			       int primary)
1117{
1118	if (cpa->pgd) {
1119		/*
1120		 * Right now, we only execute this code path when mapping
1121		 * the EFI virtual memory map regions, no other users
1122		 * provide a ->pgd value. This may change in the future.
1123		 */
1124		return populate_pgd(cpa, vaddr);
1125	}
1126
1127	/*
1128	 * Ignore all non primary paths.
1129	 */
1130	if (!primary) {
1131		cpa->numpages = 1;
1132		return 0;
1133	}
1134
1135	/*
1136	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1137	 * to have holes.
1138	 * Also set numpages to '1' indicating that we processed cpa req for
1139	 * one virtual address page and its pfn. TBD: numpages can be set based
1140	 * on the initial value and the level returned by lookup_address().
1141	 */
1142	if (within(vaddr, PAGE_OFFSET,
1143		   PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1144		cpa->numpages = 1;
1145		cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1146		return 0;
1147	} else {
1148		WARN(1, KERN_WARNING "CPA: called for zero pte. "
1149			"vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1150			*cpa->vaddr);
1151
1152		return -EFAULT;
1153	}
1154}
1155
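/*
 * Apply cpa->mask_set/mask_clr to the single mapping covering the current
 * address: 4k PTEs are updated directly, large pages are preserved when
 * possible and otherwise split, after which the operation is retried.
 */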
1156static int __change_page_attr(struct cpa_data *cpa, int primary)
1157{
1158	unsigned long address;
1159	int do_split, err;
1160	unsigned int level;
1161	pte_t *kpte, old_pte;
1162
1163	if (cpa->flags & CPA_PAGES_ARRAY) {
1164		struct page *page = cpa->pages[cpa->curpage];
1165		if (unlikely(PageHighMem(page)))
1166			return 0;
1167		address = (unsigned long)page_address(page);
1168	} else if (cpa->flags & CPA_ARRAY)
1169		address = cpa->vaddr[cpa->curpage];
1170	else
1171		address = *cpa->vaddr;
1172repeat:
1173	kpte = _lookup_address_cpa(cpa, address, &level);
1174	if (!kpte)
1175		return __cpa_process_fault(cpa, address, primary);
1176
1177	old_pte = *kpte;
1178	if (pte_none(old_pte))
1179		return __cpa_process_fault(cpa, address, primary);
1180
1181	if (level == PG_LEVEL_4K) {
1182		pte_t new_pte;
1183		pgprot_t new_prot = pte_pgprot(old_pte);
1184		unsigned long pfn = pte_pfn(old_pte);
1185
1186		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1187		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1188
1189		new_prot = static_protections(new_prot, address, pfn);
1190
1191		/*
 1192		 * Set the GLOBAL flag only if the PRESENT flag is
 1193		 * set, otherwise pte_present will return true even on
 1194		 * a non-present pte. The canon_pgprot will clear
1195		 * _PAGE_GLOBAL for the ancient hardware that doesn't
1196		 * support it.
1197		 */
1198		if (pgprot_val(new_prot) & _PAGE_PRESENT)
1199			pgprot_val(new_prot) |= _PAGE_GLOBAL;
1200		else
1201			pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
1202
1203		/*
1204		 * We need to keep the pfn from the existing PTE,
 1205		 * after all we're only going to change its attributes,
 1206		 * not the memory it points to.
1207		 */
1208		new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
1209		cpa->pfn = pfn;
1210		/*
1211		 * Do we really change anything ?
1212		 */
1213		if (pte_val(old_pte) != pte_val(new_pte)) {
1214			set_pte_atomic(kpte, new_pte);
1215			cpa->flags |= CPA_FLUSHTLB;
1216		}
1217		cpa->numpages = 1;
1218		return 0;
1219	}
1220
1221	/*
1222	 * Check, whether we can keep the large page intact
1223	 * and just change the pte:
1224	 */
1225	do_split = try_preserve_large_page(kpte, address, cpa);
1226	/*
1227	 * When the range fits into the existing large page,
 1228	 * return. cpa->numpages and the CPA_FLUSHTLB flag have been
 1229	 * updated in try_preserve_large_page():
1230	 */
1231	if (do_split <= 0)
1232		return do_split;
1233
1234	/*
1235	 * We have to split the large page:
1236	 */
1237	err = split_large_page(cpa, kpte, address);
1238	if (!err) {
1239		/*
 1240		 * Do a global TLB flush after splitting the large page
 1241		 * and before we do the actual change page attribute in the PTE.
 1242		 *
 1243		 * Without this, we violate the TLB application note, which says
 1244		 * "The TLBs may contain both ordinary and large-page
 1245		 *  translations for a 4-KByte range of linear addresses. This
 1246		 *  may occur if software modifies the paging structures so that
 1247		 *  the page size used for the address range changes. If the two
 1248		 *  translations differ with respect to page frame or attributes
 1249		 *  (e.g., permissions), processor behavior is undefined and may
 1250		 *  be implementation-specific."
 1251		 *
 1252		 * We do this global TLB flush inside the cpa_lock, so that we
 1253		 * don't allow any other CPU, with stale TLB entries, to change
 1254		 * the page attribute in parallel on a page that falls into the
 1255		 * just-split large page entry.
 1256		 */
1257		flush_tlb_all();
1258		goto repeat;
1259	}
1260
1261	return err;
1262}
1263
1264static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
1265
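/*
 * A physical page may be reachable through several virtual addresses (the
 * direct mapping and, on 64-bit, the high kernel mapping). Replay the
 * attribute change on those aliases so all mappings stay consistent.
 */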
1266static int cpa_process_alias(struct cpa_data *cpa)
1267{
1268	struct cpa_data alias_cpa;
1269	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1270	unsigned long vaddr;
1271	int ret;
1272
1273	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1274		return 0;
1275
1276	/*
1277	 * No need to redo, when the primary call touched the direct
1278	 * mapping already:
1279	 */
1280	if (cpa->flags & CPA_PAGES_ARRAY) {
1281		struct page *page = cpa->pages[cpa->curpage];
1282		if (unlikely(PageHighMem(page)))
1283			return 0;
1284		vaddr = (unsigned long)page_address(page);
1285	} else if (cpa->flags & CPA_ARRAY)
1286		vaddr = cpa->vaddr[cpa->curpage];
1287	else
1288		vaddr = *cpa->vaddr;
1289
1290	if (!(within(vaddr, PAGE_OFFSET,
1291		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1292
1293		alias_cpa = *cpa;
1294		alias_cpa.vaddr = &laddr;
1295		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1296
1297		ret = __change_page_attr_set_clr(&alias_cpa, 0);
1298		if (ret)
1299			return ret;
1300	}
1301
1302#ifdef CONFIG_X86_64
1303	/*
1304	 * If the primary call didn't touch the high mapping already
1305	 * and the physical address is inside the kernel map, we need
1306	 * to touch the high mapped kernel as well:
1307	 */
1308	if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1309	    within_inclusive(cpa->pfn, highmap_start_pfn(),
1310			     highmap_end_pfn())) {
1311		unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1312					       __START_KERNEL_map - phys_base;
1313		alias_cpa = *cpa;
1314		alias_cpa.vaddr = &temp_cpa_vaddr;
1315		alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1316
1317		/*
1318		 * The high mapping range is imprecise, so ignore the
1319		 * return value.
1320		 */
1321		__change_page_attr_set_clr(&alias_cpa, 0);
1322	}
1323#endif
1324
1325	return 0;
1326}
1327
1328static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1329{
1330	unsigned long numpages = cpa->numpages;
1331	int ret;
1332
1333	while (numpages) {
1334		/*
1335		 * Store the remaining nr of pages for the large page
1336		 * preservation check.
1337		 */
1338		cpa->numpages = numpages;
1339		/* for array changes, we can't use large page */
1340		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1341			cpa->numpages = 1;
1342
1343		if (!debug_pagealloc_enabled())
1344			spin_lock(&cpa_lock);
1345		ret = __change_page_attr(cpa, checkalias);
1346		if (!debug_pagealloc_enabled())
1347			spin_unlock(&cpa_lock);
1348		if (ret)
1349			return ret;
1350
1351		if (checkalias) {
1352			ret = cpa_process_alias(cpa);
1353			if (ret)
1354				return ret;
1355		}
1356
1357		/*
1358		 * Adjust the number of pages with the result of the
1359		 * CPA operation. Either a large page has been
1360		 * preserved or a single page update happened.
1361		 */
1362		BUG_ON(cpa->numpages > numpages || !cpa->numpages);
1363		numpages -= cpa->numpages;
1364		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
1365			cpa->curpage++;
1366		else
1367			*cpa->vaddr += cpa->numpages * PAGE_SIZE;
1368
1369	}
1370	return 0;
1371}
1372
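/*
 * Common backend for the set_memory_*() and set_pages_*() helpers:
 * validate and page-align the request, walk the page tables to apply the
 * masks, and then do the required TLB and cache flushing.
 */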
1373static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1374				    pgprot_t mask_set, pgprot_t mask_clr,
1375				    int force_split, int in_flag,
1376				    struct page **pages)
1377{
1378	struct cpa_data cpa;
1379	int ret, cache, checkalias;
1380	unsigned long baddr = 0;
1381
1382	memset(&cpa, 0, sizeof(cpa));
1383
1384	/*
1385	 * Check, if we are requested to change a not supported
1386	 * feature:
1387	 */
1388	mask_set = canon_pgprot(mask_set);
1389	mask_clr = canon_pgprot(mask_clr);
1390	if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
1391		return 0;
1392
1393	/* Ensure we are PAGE_SIZE aligned */
1394	if (in_flag & CPA_ARRAY) {
1395		int i;
1396		for (i = 0; i < numpages; i++) {
1397			if (addr[i] & ~PAGE_MASK) {
1398				addr[i] &= PAGE_MASK;
1399				WARN_ON_ONCE(1);
1400			}
1401		}
1402	} else if (!(in_flag & CPA_PAGES_ARRAY)) {
1403		/*
1404		 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
 1405		 * No need to check in that case.
1406		 */
1407		if (*addr & ~PAGE_MASK) {
1408			*addr &= PAGE_MASK;
1409			/*
1410			 * People should not be passing in unaligned addresses:
1411			 */
1412			WARN_ON_ONCE(1);
1413		}
1414		/*
1415		 * Save address for cache flush. *addr is modified in the call
1416		 * to __change_page_attr_set_clr() below.
1417		 */
1418		baddr = *addr;
1419	}
1420
1421	/* Must avoid aliasing mappings in the highmem code */
1422	kmap_flush_unused();
1423
1424	vm_unmap_aliases();
1425
1426	cpa.vaddr = addr;
1427	cpa.pages = pages;
1428	cpa.numpages = numpages;
1429	cpa.mask_set = mask_set;
1430	cpa.mask_clr = mask_clr;
1431	cpa.flags = 0;
1432	cpa.curpage = 0;
1433	cpa.force_split = force_split;
1434
1435	if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
1436		cpa.flags |= in_flag;
1437
1438	/* No alias checking for _NX bit modifications */
1439	checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1440
1441	ret = __change_page_attr_set_clr(&cpa, checkalias);
1442
1443	/*
1444	 * Check whether we really changed something:
1445	 */
1446	if (!(cpa.flags & CPA_FLUSHTLB))
1447		goto out;
1448
1449	/*
1450	 * No need to flush, when we did not set any of the caching
1451	 * attributes:
1452	 */
1453	cache = !!pgprot2cachemode(mask_set);
1454
1455	/*
1456	 * On success we use CLFLUSH, when the CPU supports it to
1457	 * avoid the WBINVD. If the CPU does not support it and in the
1458	 * error case we fall back to cpa_flush_all (which uses
1459	 * WBINVD):
1460	 */
1461	if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
1462		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1463			cpa_flush_array(addr, numpages, cache,
1464					cpa.flags, pages);
1465		} else
1466			cpa_flush_range(baddr, numpages, cache);
1467	} else
1468		cpa_flush_all(cache);
1469
1470out:
1471	return ret;
1472}
1473
1474static inline int change_page_attr_set(unsigned long *addr, int numpages,
1475				       pgprot_t mask, int array)
1476{
1477	return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
1478		(array ? CPA_ARRAY : 0), NULL);
1479}
1480
1481static inline int change_page_attr_clear(unsigned long *addr, int numpages,
1482					 pgprot_t mask, int array)
1483{
1484	return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
1485		(array ? CPA_ARRAY : 0), NULL);
1486}
1487
1488static inline int cpa_set_pages_array(struct page **pages, int numpages,
1489				       pgprot_t mask)
1490{
1491	return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
1492		CPA_PAGES_ARRAY, pages);
1493}
1494
1495static inline int cpa_clear_pages_array(struct page **pages, int numpages,
1496					 pgprot_t mask)
1497{
1498	return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
1499		CPA_PAGES_ARRAY, pages);
1500}
1501
1502int _set_memory_uc(unsigned long addr, int numpages)
1503{
1504	/*
1505	 * for now UC MINUS. see comments in ioremap_nocache()
1506	 * If you really need strong UC use ioremap_uc(), but note
1507	 * that you cannot override IO areas with set_memory_*() as
1508	 * these helpers cannot work with IO memory.
1509	 */
1510	return change_page_attr_set(&addr, numpages,
1511				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1512				    0);
1513}
1514
1515int set_memory_uc(unsigned long addr, int numpages)
1516{
1517	int ret;
1518
1519	/*
1520	 * for now UC MINUS. see comments in ioremap_nocache()
1521	 */
1522	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1523			      _PAGE_CACHE_MODE_UC_MINUS, NULL);
1524	if (ret)
1525		goto out_err;
1526
1527	ret = _set_memory_uc(addr, numpages);
1528	if (ret)
1529		goto out_free;
1530
1531	return 0;
1532
1533out_free:
1534	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1535out_err:
1536	return ret;
1537}
1538EXPORT_SYMBOL(set_memory_uc);
1539
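/*
 * Reserve the requested memtype for each address in the array and apply
 * the cache mode; WC is reached by first setting UC- and then upgrading
 * to WC, as in _set_memory_wc().
 */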
1540static int _set_memory_array(unsigned long *addr, int addrinarray,
1541		enum page_cache_mode new_type)
1542{
1543	enum page_cache_mode set_type;
1544	int i, j;
1545	int ret;
1546
1547	for (i = 0; i < addrinarray; i++) {
1548		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1549					new_type, NULL);
1550		if (ret)
1551			goto out_free;
1552	}
1553
1554	/* If WC, set to UC- first and then WC */
1555	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
1556				_PAGE_CACHE_MODE_UC_MINUS : new_type;
1557
1558	ret = change_page_attr_set(addr, addrinarray,
1559				   cachemode2pgprot(set_type), 1);
1560
1561	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1562		ret = change_page_attr_set_clr(addr, addrinarray,
1563					       cachemode2pgprot(
1564						_PAGE_CACHE_MODE_WC),
1565					       __pgprot(_PAGE_CACHE_MASK),
1566					       0, CPA_ARRAY, NULL);
1567	if (ret)
1568		goto out_free;
1569
1570	return 0;
1571
1572out_free:
1573	for (j = 0; j < i; j++)
1574		free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
1575
1576	return ret;
1577}
1578
1579int set_memory_array_uc(unsigned long *addr, int addrinarray)
1580{
1581	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1582}
1583EXPORT_SYMBOL(set_memory_array_uc);
1584
1585int set_memory_array_wc(unsigned long *addr, int addrinarray)
1586{
1587	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
1588}
1589EXPORT_SYMBOL(set_memory_array_wc);
1590
1591int set_memory_array_wt(unsigned long *addr, int addrinarray)
1592{
1593	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
1594}
1595EXPORT_SYMBOL_GPL(set_memory_array_wt);
1596
1597int _set_memory_wc(unsigned long addr, int numpages)
1598{
1599	int ret;
1600	unsigned long addr_copy = addr;
1601
1602	ret = change_page_attr_set(&addr, numpages,
1603				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1604				   0);
1605	if (!ret) {
1606		ret = change_page_attr_set_clr(&addr_copy, numpages,
1607					       cachemode2pgprot(
1608						_PAGE_CACHE_MODE_WC),
1609					       __pgprot(_PAGE_CACHE_MASK),
1610					       0, 0, NULL);
1611	}
1612	return ret;
1613}
1614
1615int set_memory_wc(unsigned long addr, int numpages)
1616{
1617	int ret;
1618
1619	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1620		_PAGE_CACHE_MODE_WC, NULL);
1621	if (ret)
1622		return ret;
1623
1624	ret = _set_memory_wc(addr, numpages);
1625	if (ret)
1626		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1627
1628	return ret;
1629}
1630EXPORT_SYMBOL(set_memory_wc);
1631
1632int _set_memory_wt(unsigned long addr, int numpages)
1633{
1634	return change_page_attr_set(&addr, numpages,
1635				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
1636}
1637
1638int set_memory_wt(unsigned long addr, int numpages)
1639{
1640	int ret;
1641
1642	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1643			      _PAGE_CACHE_MODE_WT, NULL);
1644	if (ret)
1645		return ret;
1646
1647	ret = _set_memory_wt(addr, numpages);
1648	if (ret)
1649		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1650
1651	return ret;
1652}
1653EXPORT_SYMBOL_GPL(set_memory_wt);
1654
1655int _set_memory_wb(unsigned long addr, int numpages)
1656{
1657	/* WB cache mode is hard wired to all cache attribute bits being 0 */
1658	return change_page_attr_clear(&addr, numpages,
1659				      __pgprot(_PAGE_CACHE_MASK), 0);
1660}
1661
1662int set_memory_wb(unsigned long addr, int numpages)
1663{
1664	int ret;
1665
1666	ret = _set_memory_wb(addr, numpages);
1667	if (ret)
1668		return ret;
1669
1670	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1671	return 0;
1672}
1673EXPORT_SYMBOL(set_memory_wb);
1674
1675int set_memory_array_wb(unsigned long *addr, int addrinarray)
1676{
1677	int i;
1678	int ret;
1679
1680	/* WB cache mode is hard wired to all cache attribute bits being 0 */
1681	ret = change_page_attr_clear(addr, addrinarray,
1682				      __pgprot(_PAGE_CACHE_MASK), 1);
1683	if (ret)
1684		return ret;
1685
1686	for (i = 0; i < addrinarray; i++)
1687		free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
1688
1689	return 0;
1690}
1691EXPORT_SYMBOL(set_memory_array_wb);
1692
1693int set_memory_x(unsigned long addr, int numpages)
1694{
1695	if (!(__supported_pte_mask & _PAGE_NX))
1696		return 0;
1697
1698	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1699}
1700EXPORT_SYMBOL(set_memory_x);
1701
1702int set_memory_nx(unsigned long addr, int numpages)
1703{
1704	if (!(__supported_pte_mask & _PAGE_NX))
1705		return 0;
1706
1707	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1708}
1709EXPORT_SYMBOL(set_memory_nx);
1710
1711int set_memory_ro(unsigned long addr, int numpages)
1712{
1713	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1714}
1715
1716int set_memory_rw(unsigned long addr, int numpages)
1717{
1718	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1719}
1720
1721int set_memory_np(unsigned long addr, int numpages)
1722{
1723	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1724}
1725
1726int set_memory_4k(unsigned long addr, int numpages)
1727{
1728	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1729					__pgprot(0), 1, 0, NULL);
1730}
1731
1732int set_pages_uc(struct page *page, int numpages)
1733{
1734	unsigned long addr = (unsigned long)page_address(page);
1735
1736	return set_memory_uc(addr, numpages);
1737}
1738EXPORT_SYMBOL(set_pages_uc);
1739
1740static int _set_pages_array(struct page **pages, int addrinarray,
1741		enum page_cache_mode new_type)
1742{
1743	unsigned long start;
1744	unsigned long end;
1745	enum page_cache_mode set_type;
1746	int i;
1747	int free_idx;
1748	int ret;
1749
1750	for (i = 0; i < addrinarray; i++) {
1751		if (PageHighMem(pages[i]))
1752			continue;
1753		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1754		end = start + PAGE_SIZE;
1755		if (reserve_memtype(start, end, new_type, NULL))
1756			goto err_out;
1757	}
1758
1759	/* If WC, set to UC- first and then WC */
1760	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
1761				_PAGE_CACHE_MODE_UC_MINUS : new_type;
1762
1763	ret = cpa_set_pages_array(pages, addrinarray,
1764				  cachemode2pgprot(set_type));
1765	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1766		ret = change_page_attr_set_clr(NULL, addrinarray,
1767					       cachemode2pgprot(
1768						_PAGE_CACHE_MODE_WC),
1769					       __pgprot(_PAGE_CACHE_MASK),
1770					       0, CPA_PAGES_ARRAY, pages);
1771	if (ret)
1772		goto err_out;
1773	return 0; /* Success */
1774err_out:
1775	free_idx = i;
1776	for (i = 0; i < free_idx; i++) {
1777		if (PageHighMem(pages[i]))
1778			continue;
1779		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1780		end = start + PAGE_SIZE;
1781		free_memtype(start, end);
1782	}
1783	return -EINVAL;
1784}
1785
1786int set_pages_array_uc(struct page **pages, int addrinarray)
1787{
1788	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1789}
1790EXPORT_SYMBOL(set_pages_array_uc);
1791
1792int set_pages_array_wc(struct page **pages, int addrinarray)
1793{
1794	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
1795}
1796EXPORT_SYMBOL(set_pages_array_wc);
1797
1798int set_pages_array_wt(struct page **pages, int addrinarray)
1799{
1800	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
1801}
1802EXPORT_SYMBOL_GPL(set_pages_array_wt);
1803
1804int set_pages_wb(struct page *page, int numpages)
1805{
1806	unsigned long addr = (unsigned long)page_address(page);
1807
1808	return set_memory_wb(addr, numpages);
1809}
1810EXPORT_SYMBOL(set_pages_wb);
1811
1812int set_pages_array_wb(struct page **pages, int addrinarray)
1813{
1814	int retval;
1815	unsigned long start;
1816	unsigned long end;
1817	int i;
1818
1819	/* WB cache mode is hard wired to all cache attribute bits being 0 */
1820	retval = cpa_clear_pages_array(pages, addrinarray,
1821			__pgprot(_PAGE_CACHE_MASK));
1822	if (retval)
1823		return retval;
1824
1825	for (i = 0; i < addrinarray; i++) {
1826		if (PageHighMem(pages[i]))
1827			continue;
1828		start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1829		end = start + PAGE_SIZE;
1830		free_memtype(start, end);
1831	}
1832
1833	return 0;
1834}
1835EXPORT_SYMBOL(set_pages_array_wb);
1836
1837int set_pages_x(struct page *page, int numpages)
1838{
1839	unsigned long addr = (unsigned long)page_address(page);
1840
1841	return set_memory_x(addr, numpages);
1842}
1843EXPORT_SYMBOL(set_pages_x);
1844
1845int set_pages_nx(struct page *page, int numpages)
1846{
1847	unsigned long addr = (unsigned long)page_address(page);
1848
1849	return set_memory_nx(addr, numpages);
1850}
1851EXPORT_SYMBOL(set_pages_nx);
1852
1853int set_pages_ro(struct page *page, int numpages)
1854{
1855	unsigned long addr = (unsigned long)page_address(page);
1856
1857	return set_memory_ro(addr, numpages);
1858}
1859
1860int set_pages_rw(struct page *page, int numpages)
1861{
1862	unsigned long addr = (unsigned long)page_address(page);
1863
1864	return set_memory_rw(addr, numpages);
1865}
1866
1867#ifdef CONFIG_DEBUG_PAGEALLOC
1868
1869static int __set_pages_p(struct page *page, int numpages)
1870{
1871	unsigned long tempaddr = (unsigned long) page_address(page);
1872	struct cpa_data cpa = { .vaddr = &tempaddr,
1873				.pgd = NULL,
1874				.numpages = numpages,
1875				.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1876				.mask_clr = __pgprot(0),
1877				.flags = 0};
1878
1879	/*
 1880	 * No alias checking needed for setting the present flag; otherwise,
1881	 * we may need to break large pages for 64-bit kernel text
1882	 * mappings (this adds to complexity if we want to do this from
1883	 * atomic context especially). Let's keep it simple!
1884	 */
1885	return __change_page_attr_set_clr(&cpa, 0);
1886}
1887
1888static int __set_pages_np(struct page *page, int numpages)
1889{
1890	unsigned long tempaddr = (unsigned long) page_address(page);
1891	struct cpa_data cpa = { .vaddr = &tempaddr,
1892				.pgd = NULL,
1893				.numpages = numpages,
1894				.mask_set = __pgprot(0),
1895				.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1896				.flags = 0};
1897
1898	/*
 1899	 * No alias checking needed for clearing the present flag; otherwise,
1900	 * we may need to break large pages for 64-bit kernel text
1901	 * mappings (this adds to complexity if we want to do this from
1902	 * atomic context especially). Let's keep it simple!
1903	 */
1904	return __change_page_attr_set_clr(&cpa, 0);
1905}
1906
1907void __kernel_map_pages(struct page *page, int numpages, int enable)
1908{
1909	if (PageHighMem(page))
1910		return;
1911	if (!enable) {
1912		debug_check_no_locks_freed(page_address(page),
1913					   numpages * PAGE_SIZE);
1914	}
1915
1916	/*
1917	 * The return value is ignored as the calls cannot fail.
1918	 * Large pages for identity mappings are not used at boot time
1919	 * and hence no memory allocations during large page split.
1920	 */
1921	if (enable)
1922		__set_pages_p(page, numpages);
1923	else
1924		__set_pages_np(page, numpages);
1925
1926	/*
1927	 * We should perform an IPI and flush all tlbs,
1928	 * but that can deadlock->flush only current cpu:
1929	 */
1930	__flush_tlb_all();
1931
1932	arch_flush_lazy_mmu_mode();
1933}
1934
1935#ifdef CONFIG_HIBERNATION
1936
1937bool kernel_page_present(struct page *page)
1938{
1939	unsigned int level;
1940	pte_t *pte;
1941
1942	if (PageHighMem(page))
1943		return false;
1944
1945	pte = lookup_address((unsigned long)page_address(page), &level);
1946	return (pte_val(*pte) & _PAGE_PRESENT);
1947}
1948
1949#endif /* CONFIG_HIBERNATION */
1950
1951#endif /* CONFIG_DEBUG_PAGEALLOC */
1952
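/*
 * Map @numpages pages at @address to @pfn in the page tables rooted at
 * @pgd (not necessarily the kernel's own), applying @page_flags on top of
 * _PAGE_PRESENT. Used by callers that maintain their own page tables,
 * such as the EFI mapping code.
 */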
1953int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1954			    unsigned numpages, unsigned long page_flags)
1955{
1956	int retval = -EINVAL;
1957
1958	struct cpa_data cpa = {
1959		.vaddr = &address,
1960		.pfn = pfn,
1961		.pgd = pgd,
1962		.numpages = numpages,
1963		.mask_set = __pgprot(0),
1964		.mask_clr = __pgprot(0),
1965		.flags = 0,
1966	};
1967
1968	if (!(__supported_pte_mask & _PAGE_NX))
1969		goto out;
1970
1971	if (!(page_flags & _PAGE_NX))
1972		cpa.mask_clr = __pgprot(_PAGE_NX);
1973
1974	if (!(page_flags & _PAGE_RW))
1975		cpa.mask_clr = __pgprot(_PAGE_RW);
1976
1977	cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1978
1979	retval = __change_page_attr_set_clr(&cpa, 0);
1980	__flush_tlb_all();
1981
1982out:
1983	return retval;
1984}
1985
1986/*
1987 * The testcases use internal knowledge of the implementation that shouldn't
1988 * be exposed to the rest of the kernel. Include these directly here.
1989 */
1990#ifdef CONFIG_CPA_DEBUG
1991#include "pageattr-test.c"
1992#endif