Linux Audio

Check our new training course

Loading...
   1/*
   2 * Xen mmu operations
   3 *
   4 * This file contains the various mmu fetch and update operations.
   5 * The most important job they must perform is the mapping between the
   6 * domain's pfn and the overall machine mfns.
   7 *
   8 * Xen allows guests to directly update the pagetable, in a controlled
   9 * fashion.  In other words, the guest modifies the same pagetable
  10 * that the CPU actually uses, which eliminates the overhead of having
  11 * a separate shadow pagetable.
  12 *
  13 * In order to allow this, it falls on the guest domain to map its
  14 * notion of a "physical" pfn - which is just a domain-local linear
  15 * address - into a real "machine address" which the CPU's MMU can
  16 * use.
  17 *
  18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19 * inserted directly into the pagetable.  When creating a new
  20 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22 * the mfn back into a pfn.
  23 *
  24 * The other constraint is that all pages which make up a pagetable
  25 * must be mapped read-only in the guest.  This prevents uncontrolled
  26 * guest updates to the pagetable.  Xen strictly enforces this, and
  27 * will disallow any pagetable update which will end up mapping a
  28 * pagetable page RW, and will disallow using any writable page as a
  29 * pagetable.
  30 *
  31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32 * would need to validate the whole pagetable before going on.
  33 * Naturally, this is quite slow.  The solution is to "pin" a
  34 * pagetable, which enforces all the constraints on the pagetable even
  35 * when it is not actively in use.  This means that Xen can be assured
  36 * that it is still valid when you do load it into %cr3, and doesn't
  37 * need to revalidate it.
  38 *
  39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40 */
  41#include <linux/sched.h>
  42#include <linux/highmem.h>
  43#include <linux/debugfs.h>
  44#include <linux/bug.h>
  45#include <linux/vmalloc.h>
  46#include <linux/module.h>
  47#include <linux/gfp.h>
  48#include <linux/memblock.h>
  49#include <linux/seq_file.h>
  50
  51#include <trace/events/xen.h>
  52
  53#include <asm/pgtable.h>
  54#include <asm/tlbflush.h>
  55#include <asm/fixmap.h>
  56#include <asm/mmu_context.h>
  57#include <asm/setup.h>
  58#include <asm/paravirt.h>
  59#include <asm/e820.h>
  60#include <asm/linkage.h>
  61#include <asm/page.h>
  62#include <asm/init.h>
  63#include <asm/pat.h>
  64#include <asm/smp.h>
  65
  66#include <asm/xen/hypercall.h>
  67#include <asm/xen/hypervisor.h>
  68
  69#include <xen/xen.h>
  70#include <xen/page.h>
  71#include <xen/interface/xen.h>
  72#include <xen/interface/hvm/hvm_op.h>
  73#include <xen/interface/version.h>
  74#include <xen/interface/memory.h>
  75#include <xen/hvc-console.h>
  76
  77#include "multicalls.h"
  78#include "mmu.h"
  79#include "debugfs.h"
  80
  81/*
  82 * Protects atomic reservation decrease/increase against concurrent increases.
  83 * Also protects non-atomic updates of current_pages and balloon lists.
  84 */
  85DEFINE_SPINLOCK(xen_reservation_lock);
  86
  87/*
  88 * Identity map, in addition to plain kernel map.  This needs to be
  89 * large enough to allocate page table pages to allocate the rest.
  90 * Each page can map 2MB.
  91 */
  92#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
  93static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
  94
  95#ifdef CONFIG_X86_64
  96/* l3 pud for userspace vsyscall mapping */
  97static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
  98#endif /* CONFIG_X86_64 */
  99
 100/*
 101 * Note about cr3 (pagetable base) values:
 102 *
 103 * xen_cr3 contains the current logical cr3 value; it contains the
 104 * last set cr3.  This may not be the current effective cr3, because
 105 * its update may be being lazily deferred.  However, a vcpu looking
 106 * at its own cr3 can use this value knowing that it everything will
 107 * be self-consistent.
 108 *
 109 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 110 * hypercall to set the vcpu cr3 is complete (so it may be a little
 111 * out of date, but it will never be set early).  If one vcpu is
 112 * looking at another vcpu's cr3 value, it should use this variable.
 113 */
 114DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
 115DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
 116
 117
 118/*
 119 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 120 * redzone above it, so round it up to a PGD boundary.
 121 */
 122#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 123
 124unsigned long arbitrary_virt_to_mfn(void *vaddr)
 125{
 126	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 127
 128	return PFN_DOWN(maddr.maddr);
 129}
 130
 131xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 132{
 133	unsigned long address = (unsigned long)vaddr;
 134	unsigned int level;
 135	pte_t *pte;
 136	unsigned offset;
 137
 138	/*
 139	 * if the PFN is in the linear mapped vaddr range, we can just use
 140	 * the (quick) virt_to_machine() p2m lookup
 141	 */
 142	if (virt_addr_valid(vaddr))
 143		return virt_to_machine(vaddr);
 144
 145	/* otherwise we have to do a (slower) full page-table walk */
 146
 147	pte = lookup_address(address, &level);
 148	BUG_ON(pte == NULL);
 149	offset = address & ~PAGE_MASK;
 150	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 151}
 152EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
 153
 154void make_lowmem_page_readonly(void *vaddr)
 155{
 156	pte_t *pte, ptev;
 157	unsigned long address = (unsigned long)vaddr;
 158	unsigned int level;
 159
 160	pte = lookup_address(address, &level);
 161	if (pte == NULL)
 162		return;		/* vaddr missing */
 163
 164	ptev = pte_wrprotect(*pte);
 165
 166	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 167		BUG();
 168}
 169
 170void make_lowmem_page_readwrite(void *vaddr)
 171{
 172	pte_t *pte, ptev;
 173	unsigned long address = (unsigned long)vaddr;
 174	unsigned int level;
 175
 176	pte = lookup_address(address, &level);
 177	if (pte == NULL)
 178		return;		/* vaddr missing */
 179
 180	ptev = pte_mkwrite(*pte);
 181
 182	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 183		BUG();
 184}
 185
 186
 187static bool xen_page_pinned(void *ptr)
 188{
 189	struct page *page = virt_to_page(ptr);
 190
 191	return PagePinned(page);
 192}
 193
 194void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 195{
 196	struct multicall_space mcs;
 197	struct mmu_update *u;
 198
 199	trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
 200
 201	mcs = xen_mc_entry(sizeof(*u));
 202	u = mcs.args;
 203
 204	/* ptep might be kmapped when using 32-bit HIGHPTE */
 205	u->ptr = virt_to_machine(ptep).maddr;
 206	u->val = pte_val_ma(pteval);
 207
 208	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 209
 210	xen_mc_issue(PARAVIRT_LAZY_MMU);
 211}
 212EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 213
 214static void xen_extend_mmu_update(const struct mmu_update *update)
 215{
 216	struct multicall_space mcs;
 217	struct mmu_update *u;
 218
 219	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 220
 221	if (mcs.mc != NULL) {
 222		mcs.mc->args[1]++;
 223	} else {
 224		mcs = __xen_mc_entry(sizeof(*u));
 225		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 226	}
 227
 228	u = mcs.args;
 229	*u = *update;
 230}
 231
 232static void xen_extend_mmuext_op(const struct mmuext_op *op)
 233{
 234	struct multicall_space mcs;
 235	struct mmuext_op *u;
 236
 237	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
 238
 239	if (mcs.mc != NULL) {
 240		mcs.mc->args[1]++;
 241	} else {
 242		mcs = __xen_mc_entry(sizeof(*u));
 243		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 244	}
 245
 246	u = mcs.args;
 247	*u = *op;
 248}
 249
 250static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 251{
 252	struct mmu_update u;
 253
 254	preempt_disable();
 255
 256	xen_mc_batch();
 257
 258	/* ptr may be ioremapped for 64-bit pagetable setup */
 259	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 260	u.val = pmd_val_ma(val);
 261	xen_extend_mmu_update(&u);
 262
 263	xen_mc_issue(PARAVIRT_LAZY_MMU);
 264
 265	preempt_enable();
 266}
 267
 268static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 269{
 270	trace_xen_mmu_set_pmd(ptr, val);
 271
 272	/* If page is not pinned, we can just update the entry
 273	   directly */
 274	if (!xen_page_pinned(ptr)) {
 275		*ptr = val;
 276		return;
 277	}
 278
 279	xen_set_pmd_hyper(ptr, val);
 280}
 281
 282/*
 283 * Associate a virtual page frame with a given physical page frame
 284 * and protection flags for that frame.
 285 */
 286void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 287{
 288	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 289}
 290
 291static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 292{
 293	struct mmu_update u;
 294
 295	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
 296		return false;
 297
 298	xen_mc_batch();
 299
 300	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 301	u.val = pte_val_ma(pteval);
 302	xen_extend_mmu_update(&u);
 303
 304	xen_mc_issue(PARAVIRT_LAZY_MMU);
 305
 306	return true;
 307}
 308
 309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 310{
 311	if (!xen_batched_set_pte(ptep, pteval))
 312		native_set_pte(ptep, pteval);
 313}
 314
 315static void xen_set_pte(pte_t *ptep, pte_t pteval)
 316{
 317	trace_xen_mmu_set_pte(ptep, pteval);
 318	__xen_set_pte(ptep, pteval);
 319}
 320
 321static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 322		    pte_t *ptep, pte_t pteval)
 323{
 324	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
 325	__xen_set_pte(ptep, pteval);
 326}
 327
 328pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 329				 unsigned long addr, pte_t *ptep)
 330{
 331	/* Just return the pte as-is.  We preserve the bits on commit */
 332	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
 333	return *ptep;
 334}
 335
 336void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 337				 pte_t *ptep, pte_t pte)
 338{
 339	struct mmu_update u;
 340
 341	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
 342	xen_mc_batch();
 343
 344	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 345	u.val = pte_val_ma(pte);
 346	xen_extend_mmu_update(&u);
 347
 348	xen_mc_issue(PARAVIRT_LAZY_MMU);
 349}
 350
 351/* Assume pteval_t is equivalent to all the other *val_t types. */
 352static pteval_t pte_mfn_to_pfn(pteval_t val)
 353{
 354	if (val & _PAGE_PRESENT) {
 355		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 356		unsigned long pfn = mfn_to_pfn(mfn);
 357
 358		pteval_t flags = val & PTE_FLAGS_MASK;
 359		if (unlikely(pfn == ~0))
 360			val = flags & ~_PAGE_PRESENT;
 361		else
 362			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 363	}
 364
 365	return val;
 366}
 367
 368static pteval_t pte_pfn_to_mfn(pteval_t val)
 369{
 370	if (val & _PAGE_PRESENT) {
 371		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 372		pteval_t flags = val & PTE_FLAGS_MASK;
 373		unsigned long mfn;
 374
 375		if (!xen_feature(XENFEAT_auto_translated_physmap))
 376			mfn = get_phys_to_machine(pfn);
 377		else
 378			mfn = pfn;
 379		/*
 380		 * If there's no mfn for the pfn, then just create an
 381		 * empty non-present pte.  Unfortunately this loses
 382		 * information about the original pfn, so
 383		 * pte_mfn_to_pfn is asymmetric.
 384		 */
 385		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 386			mfn = 0;
 387			flags = 0;
 388		} else {
 389			/*
 390			 * Paramount to do this test _after_ the
 391			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
 392			 * IDENTITY_FRAME_BIT resolves to true.
 393			 */
 394			mfn &= ~FOREIGN_FRAME_BIT;
 395			if (mfn & IDENTITY_FRAME_BIT) {
 396				mfn &= ~IDENTITY_FRAME_BIT;
 397				flags |= _PAGE_IOMAP;
 398			}
 399		}
 400		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 401	}
 402
 403	return val;
 404}
 405
 406static pteval_t iomap_pte(pteval_t val)
 407{
 408	if (val & _PAGE_PRESENT) {
 409		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 410		pteval_t flags = val & PTE_FLAGS_MASK;
 411
 412		/* We assume the pte frame number is a MFN, so
 413		   just use it as-is. */
 414		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 415	}
 416
 417	return val;
 418}
 419
 420static pteval_t xen_pte_val(pte_t pte)
 421{
 422	pteval_t pteval = pte.pte;
 423#if 0
 424	/* If this is a WC pte, convert back from Xen WC to Linux WC */
 425	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
 426		WARN_ON(!pat_enabled);
 427		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 428	}
 429#endif
 430	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
 431		return pteval;
 432
 433	return pte_mfn_to_pfn(pteval);
 434}
 435PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 436
 437static pgdval_t xen_pgd_val(pgd_t pgd)
 438{
 439	return pte_mfn_to_pfn(pgd.pgd);
 440}
 441PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 442
 443/*
 444 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
 445 * are reserved for now, to correspond to the Intel-reserved PAT
 446 * types.
 447 *
 448 * We expect Linux's PAT set as follows:
 449 *
 450 * Idx  PTE flags        Linux    Xen    Default
 451 * 0                     WB       WB     WB
 452 * 1            PWT      WC       WT     WT
 453 * 2        PCD          UC-      UC-    UC-
 454 * 3        PCD PWT      UC       UC     UC
 455 * 4    PAT              WB       WC     WB
 456 * 5    PAT     PWT      WC       WP     WT
 457 * 6    PAT PCD          UC-      UC     UC-
 458 * 7    PAT PCD PWT      UC       UC     UC
 459 */
 460
 461void xen_set_pat(u64 pat)
 462{
 463	/* We expect Linux to use a PAT setting of
 464	 * UC UC- WC WB (ignoring the PAT flag) */
 465	WARN_ON(pat != 0x0007010600070106ull);
 466}
 467
 468static pte_t xen_make_pte(pteval_t pte)
 469{
 470	phys_addr_t addr = (pte & PTE_PFN_MASK);
 471#if 0
 472	/* If Linux is trying to set a WC pte, then map to the Xen WC.
 473	 * If _PAGE_PAT is set, then it probably means it is really
 474	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
 475	 * things work out OK...
 476	 *
 477	 * (We should never see kernel mappings with _PAGE_PSE set,
 478	 * but we could see hugetlbfs mappings, I think.).
 479	 */
 480	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
 481		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
 482			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 483	}
 484#endif
 485	/*
 486	 * Unprivileged domains are allowed to do IOMAPpings for
 487	 * PCI passthrough, but not map ISA space.  The ISA
 488	 * mappings are just dummy local mappings to keep other
 489	 * parts of the kernel happy.
 490	 */
 491	if (unlikely(pte & _PAGE_IOMAP) &&
 492	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 493		pte = iomap_pte(pte);
 494	} else {
 495		pte &= ~_PAGE_IOMAP;
 496		pte = pte_pfn_to_mfn(pte);
 497	}
 498
 499	return native_make_pte(pte);
 500}
 501PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 502
 503static pgd_t xen_make_pgd(pgdval_t pgd)
 504{
 505	pgd = pte_pfn_to_mfn(pgd);
 506	return native_make_pgd(pgd);
 507}
 508PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 509
 510static pmdval_t xen_pmd_val(pmd_t pmd)
 511{
 512	return pte_mfn_to_pfn(pmd.pmd);
 513}
 514PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 515
 516static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 517{
 518	struct mmu_update u;
 519
 520	preempt_disable();
 521
 522	xen_mc_batch();
 523
 524	/* ptr may be ioremapped for 64-bit pagetable setup */
 525	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 526	u.val = pud_val_ma(val);
 527	xen_extend_mmu_update(&u);
 528
 529	xen_mc_issue(PARAVIRT_LAZY_MMU);
 530
 531	preempt_enable();
 532}
 533
 534static void xen_set_pud(pud_t *ptr, pud_t val)
 535{
 536	trace_xen_mmu_set_pud(ptr, val);
 537
 538	/* If page is not pinned, we can just update the entry
 539	   directly */
 540	if (!xen_page_pinned(ptr)) {
 541		*ptr = val;
 542		return;
 543	}
 544
 545	xen_set_pud_hyper(ptr, val);
 546}
 547
 548#ifdef CONFIG_X86_PAE
 549static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 550{
 551	trace_xen_mmu_set_pte_atomic(ptep, pte);
 552	set_64bit((u64 *)ptep, native_pte_val(pte));
 553}
 554
 555static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 556{
 557	trace_xen_mmu_pte_clear(mm, addr, ptep);
 558	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
 559		native_pte_clear(mm, addr, ptep);
 560}
 561
 562static void xen_pmd_clear(pmd_t *pmdp)
 563{
 564	trace_xen_mmu_pmd_clear(pmdp);
 565	set_pmd(pmdp, __pmd(0));
 566}
 567#endif	/* CONFIG_X86_PAE */
 568
 569static pmd_t xen_make_pmd(pmdval_t pmd)
 570{
 571	pmd = pte_pfn_to_mfn(pmd);
 572	return native_make_pmd(pmd);
 573}
 574PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 575
 576#if PAGETABLE_LEVELS == 4
 577static pudval_t xen_pud_val(pud_t pud)
 578{
 579	return pte_mfn_to_pfn(pud.pud);
 580}
 581PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 582
 583static pud_t xen_make_pud(pudval_t pud)
 584{
 585	pud = pte_pfn_to_mfn(pud);
 586
 587	return native_make_pud(pud);
 588}
 589PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 590
 591static pgd_t *xen_get_user_pgd(pgd_t *pgd)
 592{
 593	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 594	unsigned offset = pgd - pgd_page;
 595	pgd_t *user_ptr = NULL;
 596
 597	if (offset < pgd_index(USER_LIMIT)) {
 598		struct page *page = virt_to_page(pgd_page);
 599		user_ptr = (pgd_t *)page->private;
 600		if (user_ptr)
 601			user_ptr += offset;
 602	}
 603
 604	return user_ptr;
 605}
 606
 607static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 608{
 609	struct mmu_update u;
 610
 611	u.ptr = virt_to_machine(ptr).maddr;
 612	u.val = pgd_val_ma(val);
 613	xen_extend_mmu_update(&u);
 614}
 615
 616/*
 617 * Raw hypercall-based set_pgd, intended for in early boot before
 618 * there's a page structure.  This implies:
 619 *  1. The only existing pagetable is the kernel's
 620 *  2. It is always pinned
 621 *  3. It has no user pagetable attached to it
 622 */
 623static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 624{
 625	preempt_disable();
 626
 627	xen_mc_batch();
 628
 629	__xen_set_pgd_hyper(ptr, val);
 630
 631	xen_mc_issue(PARAVIRT_LAZY_MMU);
 632
 633	preempt_enable();
 634}
 635
 636static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 637{
 638	pgd_t *user_ptr = xen_get_user_pgd(ptr);
 639
 640	trace_xen_mmu_set_pgd(ptr, user_ptr, val);
 641
 642	/* If page is not pinned, we can just update the entry
 643	   directly */
 644	if (!xen_page_pinned(ptr)) {
 645		*ptr = val;
 646		if (user_ptr) {
 647			WARN_ON(xen_page_pinned(user_ptr));
 648			*user_ptr = val;
 649		}
 650		return;
 651	}
 652
 653	/* If it's pinned, then we can at least batch the kernel and
 654	   user updates together. */
 655	xen_mc_batch();
 656
 657	__xen_set_pgd_hyper(ptr, val);
 658	if (user_ptr)
 659		__xen_set_pgd_hyper(user_ptr, val);
 660
 661	xen_mc_issue(PARAVIRT_LAZY_MMU);
 662}
 663#endif	/* PAGETABLE_LEVELS == 4 */
 664
 665/*
 666 * (Yet another) pagetable walker.  This one is intended for pinning a
 667 * pagetable.  This means that it walks a pagetable and calls the
 668 * callback function on each page it finds making up the page table,
 669 * at every level.  It walks the entire pagetable, but it only bothers
 670 * pinning pte pages which are below limit.  In the normal case this
 671 * will be STACK_TOP_MAX, but at boot we need to pin up to
 672 * FIXADDR_TOP.
 673 *
 674 * For 32-bit the important bit is that we don't pin beyond there,
 675 * because then we start getting into Xen's ptes.
 676 *
 677 * For 64-bit, we must skip the Xen hole in the middle of the address
 678 * space, just after the big x86-64 virtual hole.
 679 */
 680static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 681			  int (*func)(struct mm_struct *mm, struct page *,
 682				      enum pt_level),
 683			  unsigned long limit)
 684{
 685	int flush = 0;
 686	unsigned hole_low, hole_high;
 687	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
 688	unsigned pgdidx, pudidx, pmdidx;
 689
 690	/* The limit is the last byte to be touched */
 691	limit--;
 692	BUG_ON(limit >= FIXADDR_TOP);
 693
 694	if (xen_feature(XENFEAT_auto_translated_physmap))
 695		return 0;
 696
 697	/*
 698	 * 64-bit has a great big hole in the middle of the address
 699	 * space, which contains the Xen mappings.  On 32-bit these
 700	 * will end up making a zero-sized hole and so is a no-op.
 701	 */
 702	hole_low = pgd_index(USER_LIMIT);
 703	hole_high = pgd_index(PAGE_OFFSET);
 704
 705	pgdidx_limit = pgd_index(limit);
 706#if PTRS_PER_PUD > 1
 707	pudidx_limit = pud_index(limit);
 708#else
 709	pudidx_limit = 0;
 710#endif
 711#if PTRS_PER_PMD > 1
 712	pmdidx_limit = pmd_index(limit);
 713#else
 714	pmdidx_limit = 0;
 715#endif
 716
 717	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 718		pud_t *pud;
 719
 720		if (pgdidx >= hole_low && pgdidx < hole_high)
 721			continue;
 722
 723		if (!pgd_val(pgd[pgdidx]))
 724			continue;
 725
 726		pud = pud_offset(&pgd[pgdidx], 0);
 727
 728		if (PTRS_PER_PUD > 1) /* not folded */
 729			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 730
 731		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 732			pmd_t *pmd;
 733
 734			if (pgdidx == pgdidx_limit &&
 735			    pudidx > pudidx_limit)
 736				goto out;
 737
 738			if (pud_none(pud[pudidx]))
 739				continue;
 740
 741			pmd = pmd_offset(&pud[pudidx], 0);
 742
 743			if (PTRS_PER_PMD > 1) /* not folded */
 744				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 745
 746			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
 747				struct page *pte;
 748
 749				if (pgdidx == pgdidx_limit &&
 750				    pudidx == pudidx_limit &&
 751				    pmdidx > pmdidx_limit)
 752					goto out;
 753
 754				if (pmd_none(pmd[pmdidx]))
 755					continue;
 756
 757				pte = pmd_page(pmd[pmdidx]);
 758				flush |= (*func)(mm, pte, PT_PTE);
 759			}
 760		}
 761	}
 762
 763out:
 764	/* Do the top level last, so that the callbacks can use it as
 765	   a cue to do final things like tlb flushes. */
 766	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 767
 768	return flush;
 769}
 770
 771static int xen_pgd_walk(struct mm_struct *mm,
 772			int (*func)(struct mm_struct *mm, struct page *,
 773				    enum pt_level),
 774			unsigned long limit)
 775{
 776	return __xen_pgd_walk(mm, mm->pgd, func, limit);
 777}
 778
 779/* If we're using split pte locks, then take the page's lock and
 780   return a pointer to it.  Otherwise return NULL. */
 781static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 782{
 783	spinlock_t *ptl = NULL;
 784
 785#if USE_SPLIT_PTLOCKS
 786	ptl = __pte_lockptr(page);
 787	spin_lock_nest_lock(ptl, &mm->page_table_lock);
 788#endif
 789
 790	return ptl;
 791}
 792
 793static void xen_pte_unlock(void *v)
 794{
 795	spinlock_t *ptl = v;
 796	spin_unlock(ptl);
 797}
 798
 799static void xen_do_pin(unsigned level, unsigned long pfn)
 800{
 801	struct mmuext_op op;
 802
 803	op.cmd = level;
 804	op.arg1.mfn = pfn_to_mfn(pfn);
 805
 806	xen_extend_mmuext_op(&op);
 807}
 808
 809static int xen_pin_page(struct mm_struct *mm, struct page *page,
 810			enum pt_level level)
 811{
 812	unsigned pgfl = TestSetPagePinned(page);
 813	int flush;
 814
 815	if (pgfl)
 816		flush = 0;		/* already pinned */
 817	else if (PageHighMem(page))
 818		/* kmaps need flushing if we found an unpinned
 819		   highpage */
 820		flush = 1;
 821	else {
 822		void *pt = lowmem_page_address(page);
 823		unsigned long pfn = page_to_pfn(page);
 824		struct multicall_space mcs = __xen_mc_entry(0);
 825		spinlock_t *ptl;
 826
 827		flush = 0;
 828
 829		/*
 830		 * We need to hold the pagetable lock between the time
 831		 * we make the pagetable RO and when we actually pin
 832		 * it.  If we don't, then other users may come in and
 833		 * attempt to update the pagetable by writing it,
 834		 * which will fail because the memory is RO but not
 835		 * pinned, so Xen won't do the trap'n'emulate.
 836		 *
 837		 * If we're using split pte locks, we can't hold the
 838		 * entire pagetable's worth of locks during the
 839		 * traverse, because we may wrap the preempt count (8
 840		 * bits).  The solution is to mark RO and pin each PTE
 841		 * page while holding the lock.  This means the number
 842		 * of locks we end up holding is never more than a
 843		 * batch size (~32 entries, at present).
 844		 *
 845		 * If we're not using split pte locks, we needn't pin
 846		 * the PTE pages independently, because we're
 847		 * protected by the overall pagetable lock.
 848		 */
 849		ptl = NULL;
 850		if (level == PT_PTE)
 851			ptl = xen_pte_lock(page, mm);
 852
 853		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 854					pfn_pte(pfn, PAGE_KERNEL_RO),
 855					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 856
 857		if (ptl) {
 858			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 859
 860			/* Queue a deferred unlock for when this batch
 861			   is completed. */
 862			xen_mc_callback(xen_pte_unlock, ptl);
 863		}
 864	}
 865
 866	return flush;
 867}
 868
 869/* This is called just after a mm has been created, but it has not
 870   been used yet.  We need to make sure that its pagetable is all
 871   read-only, and can be pinned. */
 872static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 873{
 874	trace_xen_mmu_pgd_pin(mm, pgd);
 875
 876	xen_mc_batch();
 877
 878	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
 879		/* re-enable interrupts for flushing */
 880		xen_mc_issue(0);
 881
 882		kmap_flush_unused();
 883
 884		xen_mc_batch();
 885	}
 886
 887#ifdef CONFIG_X86_64
 888	{
 889		pgd_t *user_pgd = xen_get_user_pgd(pgd);
 890
 891		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 892
 893		if (user_pgd) {
 894			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
 895			xen_do_pin(MMUEXT_PIN_L4_TABLE,
 896				   PFN_DOWN(__pa(user_pgd)));
 897		}
 898	}
 899#else /* CONFIG_X86_32 */
 900#ifdef CONFIG_X86_PAE
 901	/* Need to make sure unshared kernel PMD is pinnable */
 902	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 903		     PT_PMD);
 904#endif
 905	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 906#endif /* CONFIG_X86_64 */
 907	xen_mc_issue(0);
 908}
 909
 910static void xen_pgd_pin(struct mm_struct *mm)
 911{
 912	__xen_pgd_pin(mm, mm->pgd);
 913}
 914
 915/*
 916 * On save, we need to pin all pagetables to make sure they get their
 917 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 918 * them (unpinned pgds are not currently in use, probably because the
 919 * process is under construction or destruction).
 920 *
 921 * Expected to be called in stop_machine() ("equivalent to taking
 922 * every spinlock in the system"), so the locking doesn't really
 923 * matter all that much.
 924 */
 925void xen_mm_pin_all(void)
 926{
 927	struct page *page;
 928
 929	spin_lock(&pgd_lock);
 930
 931	list_for_each_entry(page, &pgd_list, lru) {
 932		if (!PagePinned(page)) {
 933			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
 934			SetPageSavePinned(page);
 935		}
 936	}
 937
 938	spin_unlock(&pgd_lock);
 939}
 940
 941/*
 942 * The init_mm pagetable is really pinned as soon as its created, but
 943 * that's before we have page structures to store the bits.  So do all
 944 * the book-keeping now.
 945 */
 946static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
 947				  enum pt_level level)
 948{
 949	SetPagePinned(page);
 950	return 0;
 951}
 952
 953static void __init xen_mark_init_mm_pinned(void)
 954{
 955	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 956}
 957
 958static int xen_unpin_page(struct mm_struct *mm, struct page *page,
 959			  enum pt_level level)
 960{
 961	unsigned pgfl = TestClearPagePinned(page);
 962
 963	if (pgfl && !PageHighMem(page)) {
 964		void *pt = lowmem_page_address(page);
 965		unsigned long pfn = page_to_pfn(page);
 966		spinlock_t *ptl = NULL;
 967		struct multicall_space mcs;
 968
 969		/*
 970		 * Do the converse to pin_page.  If we're using split
 971		 * pte locks, we must be holding the lock for while
 972		 * the pte page is unpinned but still RO to prevent
 973		 * concurrent updates from seeing it in this
 974		 * partially-pinned state.
 975		 */
 976		if (level == PT_PTE) {
 977			ptl = xen_pte_lock(page, mm);
 978
 979			if (ptl)
 980				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
 981		}
 982
 983		mcs = __xen_mc_entry(0);
 984
 985		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 986					pfn_pte(pfn, PAGE_KERNEL),
 987					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 988
 989		if (ptl) {
 990			/* unlock when batch completed */
 991			xen_mc_callback(xen_pte_unlock, ptl);
 992		}
 993	}
 994
 995	return 0;		/* never need to flush on unpin */
 996}
 997
 998/* Release a pagetables pages back as normal RW */
 999static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1000{
1001	trace_xen_mmu_pgd_unpin(mm, pgd);
1002
1003	xen_mc_batch();
1004
1005	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1006
1007#ifdef CONFIG_X86_64
1008	{
1009		pgd_t *user_pgd = xen_get_user_pgd(pgd);
1010
1011		if (user_pgd) {
1012			xen_do_pin(MMUEXT_UNPIN_TABLE,
1013				   PFN_DOWN(__pa(user_pgd)));
1014			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1015		}
1016	}
1017#endif
1018
1019#ifdef CONFIG_X86_PAE
1020	/* Need to make sure unshared kernel PMD is unpinned */
1021	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1022		       PT_PMD);
1023#endif
1024
1025	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1026
1027	xen_mc_issue(0);
1028}
1029
1030static void xen_pgd_unpin(struct mm_struct *mm)
1031{
1032	__xen_pgd_unpin(mm, mm->pgd);
1033}
1034
1035/*
1036 * On resume, undo any pinning done at save, so that the rest of the
1037 * kernel doesn't see any unexpected pinned pagetables.
1038 */
1039void xen_mm_unpin_all(void)
1040{
1041	struct page *page;
1042
1043	spin_lock(&pgd_lock);
1044
1045	list_for_each_entry(page, &pgd_list, lru) {
1046		if (PageSavePinned(page)) {
1047			BUG_ON(!PagePinned(page));
1048			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1049			ClearPageSavePinned(page);
1050		}
1051	}
1052
1053	spin_unlock(&pgd_lock);
1054}
1055
1056static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1057{
1058	spin_lock(&next->page_table_lock);
1059	xen_pgd_pin(next);
1060	spin_unlock(&next->page_table_lock);
1061}
1062
1063static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1064{
1065	spin_lock(&mm->page_table_lock);
1066	xen_pgd_pin(mm);
1067	spin_unlock(&mm->page_table_lock);
1068}
1069
1070
1071#ifdef CONFIG_SMP
1072/* Another cpu may still have their %cr3 pointing at the pagetable, so
1073   we need to repoint it somewhere else before we can unpin it. */
1074static void drop_other_mm_ref(void *info)
1075{
1076	struct mm_struct *mm = info;
1077	struct mm_struct *active_mm;
1078
1079	active_mm = this_cpu_read(cpu_tlbstate.active_mm);
1080
1081	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1082		leave_mm(smp_processor_id());
1083
1084	/* If this cpu still has a stale cr3 reference, then make sure
1085	   it has been flushed. */
1086	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1087		load_cr3(swapper_pg_dir);
1088}
1089
1090static void xen_drop_mm_ref(struct mm_struct *mm)
1091{
1092	cpumask_var_t mask;
1093	unsigned cpu;
1094
1095	if (current->active_mm == mm) {
1096		if (current->mm == mm)
1097			load_cr3(swapper_pg_dir);
1098		else
1099			leave_mm(smp_processor_id());
1100	}
1101
1102	/* Get the "official" set of cpus referring to our pagetable. */
1103	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1104		for_each_online_cpu(cpu) {
1105			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1106			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1107				continue;
1108			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1109		}
1110		return;
1111	}
1112	cpumask_copy(mask, mm_cpumask(mm));
1113
1114	/* It's possible that a vcpu may have a stale reference to our
1115	   cr3, because its in lazy mode, and it hasn't yet flushed
1116	   its set of pending hypercalls yet.  In this case, we can
1117	   look at its actual current cr3 value, and force it to flush
1118	   if needed. */
1119	for_each_online_cpu(cpu) {
1120		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1121			cpumask_set_cpu(cpu, mask);
1122	}
1123
1124	if (!cpumask_empty(mask))
1125		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1126	free_cpumask_var(mask);
1127}
1128#else
/* !CONFIG_SMP variant: there is only the local cpu, so just move off
   @mm's pagetable if it is the active one. */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
1134#endif
1135
/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
static void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	xen_drop_mm_ref(mm);
	put_cpu();

	/* The unpin itself is done under the mm's page_table_lock. */
	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (xen_page_pinned(mm->pgd))
		xen_pgd_unpin(mm);

	spin_unlock(&mm->page_table_lock);
}
1164
/* pagetable_setup_start hook: intentionally empty, @base is unused. */
static void __init xen_pagetable_setup_start(pgd_t *base)
{
}
1168
1169static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1170{
1171	/* reserve the range used */
1172	native_pagetable_reserve(start, end);
1173
1174	/* set as RW the rest */
1175	printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1176			PFN_PHYS(pgt_buf_top));
1177	while (end < PFN_PHYS(pgt_buf_top)) {
1178		make_lowmem_page_readwrite(__va(end));
1179		end += PAGE_SIZE;
1180	}
1181}
1182
1183static void xen_post_allocator_init(void);
1184
/* pagetable_setup_done hook: set up the shared info page, then switch
   over to the post-allocator mmu ops.  @base is unused. */
static void __init xen_pagetable_setup_done(pgd_t *base)
{
	xen_setup_shared_info();
	xen_post_allocator_init();
}
1190
/* Store @cr2 in the arch.cr2 field of this cpu's xen_vcpu structure. */
static void xen_write_cr2(unsigned long cr2)
{
	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
}
1195
/* Return arch.cr2 via this cpu's xen_vcpu pointer. */
static unsigned long xen_read_cr2(void)
{
	return this_cpu_read(xen_vcpu)->arch.cr2;
}
1200
/* Like xen_read_cr2(), but reads the per-cpu xen_vcpu_info structure
   directly instead of going through the xen_vcpu pointer. */
unsigned long xen_read_cr2_direct(void)
{
	return this_cpu_read(xen_vcpu_info.arch.cr2);
}
1205
1206static void xen_flush_tlb(void)
1207{
1208	struct mmuext_op *op;
1209	struct multicall_space mcs;
1210
1211	trace_xen_mmu_flush_tlb(0);
1212
1213	preempt_disable();
1214
1215	mcs = xen_mc_entry(sizeof(*op));
1216
1217	op = mcs.args;
1218	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1219	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1220
1221	xen_mc_issue(PARAVIRT_LAZY_MMU);
1222
1223	preempt_enable();
1224}
1225
1226static void xen_flush_tlb_single(unsigned long addr)
1227{
1228	struct mmuext_op *op;
1229	struct multicall_space mcs;
1230
1231	trace_xen_mmu_flush_tlb_single(addr);
1232
1233	preempt_disable();
1234
1235	mcs = xen_mc_entry(sizeof(*op));
1236	op = mcs.args;
1237	op->cmd = MMUEXT_INVLPG_LOCAL;
1238	op->arg1.linear_addr = addr & PAGE_MASK;
1239	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1240
1241	xen_mc_issue(PARAVIRT_LAZY_MMU);
1242
1243	preempt_enable();
1244}
1245
/*
 * Flush TLB entries on a set of other cpus with one multi-vcpu
 * mmuext op.
 *
 * @cpus: candidate cpus; offline cpus and the calling cpu are removed
 * @mm:   only passed to the tracepoint here
 * @va:   TLB_FLUSH_ALL for a full flush, otherwise the linear address
 *        to invalidate
 */
static void xen_flush_tlb_others(const struct cpumask *cpus,
				 struct mm_struct *mm, unsigned long va)
{
	/* The op and the vcpu bitmap it points at live together in the
	   multicall argument space. */
	struct {
		struct mmuext_op op;
#ifdef CONFIG_SMP
		DECLARE_BITMAP(mask, num_processors);
#else
		DECLARE_BITMAP(mask, NR_CPUS);
#endif
	} *args;
	struct multicall_space mcs;

	trace_xen_mmu_flush_tlb_others(cpus, mm, va);

	if (cpumask_empty(cpus))
		return;		/* nothing to do */

	mcs = xen_mc_entry(sizeof(*args));
	args = mcs.args;
	args->op.arg2.vcpumask = to_cpumask(args->mask);

	/* Remove us, and any offline CPUS. */
	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));

	if (va == TLB_FLUSH_ALL) {
		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	} else {
		args->op.cmd = MMUEXT_INVLPG_MULTI;
		args->op.arg1.linear_addr = va;
	}

	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
1283
/* Return this cpu's xen_cr3 value (written by the kernel path of
   __xen_write_cr3() and by xen_write_cr3()). */
static unsigned long xen_read_cr3(void)
{
	return this_cpu_read(xen_cr3);
}
1288
/* Multicall completion callback: record @v (a cr3 value carried as a
   pointer) in this cpu's xen_current_cr3. */
static void set_current_cr3(void *v)
{
	this_cpu_write(xen_current_cr3, (unsigned long)v);
}
1293
1294static void __xen_write_cr3(bool kernel, unsigned long cr3)
1295{
1296	struct mmuext_op op;
1297	unsigned long mfn;
1298
1299	trace_xen_mmu_write_cr3(kernel, cr3);
1300
1301	if (cr3)
1302		mfn = pfn_to_mfn(PFN_DOWN(cr3));
1303	else
1304		mfn = 0;
1305
1306	WARN_ON(mfn == 0 && kernel);
1307
1308	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1309	op.arg1.mfn = mfn;
1310
1311	xen_extend_mmuext_op(&op);
1312
1313	if (kernel) {
1314		this_cpu_write(xen_cr3, cr3);
1315
1316		/* Update xen_current_cr3 once the batch has actually
1317		   been submitted. */
1318		xen_mc_callback(set_current_cr3, (void *)cr3);
1319	}
1320}
1321
1322static void xen_write_cr3(unsigned long cr3)
1323{
1324	BUG_ON(preemptible());
1325
1326	xen_mc_batch();  /* disables interrupts */
1327
1328	/* Update while interrupts are disabled, so its atomic with
1329	   respect to ipis */
1330	this_cpu_write(xen_cr3, cr3);
1331
1332	__xen_write_cr3(true, cr3);
1333
1334#ifdef CONFIG_X86_64
1335	{
1336		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1337		if (user_pgd)
1338			__xen_write_cr3(false, __pa(user_pgd));
1339		else
1340			__xen_write_cr3(false, 0);
1341	}
1342#endif
1343
1344	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1345}
1346
1347static int xen_pgd_alloc(struct mm_struct *mm)
1348{
1349	pgd_t *pgd = mm->pgd;
1350	int ret = 0;
1351
1352	BUG_ON(PagePinned(virt_to_page(pgd)));
1353
1354#ifdef CONFIG_X86_64
1355	{
1356		struct page *page = virt_to_page(pgd);
1357		pgd_t *user_pgd;
1358
1359		BUG_ON(page->private != 0);
1360
1361		ret = -ENOMEM;
1362
1363		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1364		page->private = (unsigned long)user_pgd;
1365
1366		if (user_pgd != NULL) {
1367			user_pgd[pgd_index(VSYSCALL_START)] =
1368				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1369			ret = 0;
1370		}
1371
1372		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1373	}
1374#endif
1375
1376	return ret;
1377}
1378
/* pgd_free hook: on 64-bit, release the user pgd page attached to
   @pgd, if there is one.  Nothing to do on 32-bit.  @mm is unused. */
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	if (user_pgd)
		free_page((unsigned long)user_pgd);
#endif
}
1388
1389#ifdef CONFIG_X86_32
/*
 * 32-bit variant.  When *ptep is already present, the new pte may only
 * keep _PAGE_RW if the existing pte has it; all other bits of @pte
 * pass through unchanged.
 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
		/* mask = every bit except RW, plus RW iff the old pte had it */
		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
			       pte_val_ma(pte));

	return pte;
}
1399#else /* CONFIG_X86_64 */
1400static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1401{
1402	unsigned long pfn = pte_pfn(pte);
1403
1404	/*
1405	 * If the new pfn is within the range of the newly allocated
1406	 * kernel pagetable, and it isn't being mapped into an
1407	 * early_ioremap fixmap slot as a freshly allocated page, make sure
1408	 * it is RO.
1409	 */
1410	if (((!is_early_ioremap_ptep(ptep) &&
1411			pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
1412			(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1413		pte = pte_wrprotect(pte);
1414
1415	return pte;
1416}
1417#endif /* CONFIG_X86_64 */
1418
/* Init-time set_pte while constructing initial pagetables, which
   doesn't allow RO pagetable pages to be remapped RW.  The pte is
   filtered through mask_rw_pte() before being written. */
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
	pte = mask_rw_pte(ptep, pte);

	xen_set_pte(ptep, pte);
}
1427
1428static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1429{
1430	struct mmuext_op op;
1431	op.cmd = cmd;
1432	op.arg1.mfn = pfn_to_mfn(pfn);
1433	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1434		BUG();
1435}
1436
/* Early in boot, while setting up the initial pagetable, assume
   everything is pinned: the new pte page is made read-only in the
   lowmem mapping and pinned as an L1 table.  @mm is unused. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);	/* should only be used early */
#endif
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
}
1447
/* Used for pmd and pud.  Only makes the page read-only; unlike the
   pte variant no explicit pin is issued.  @mm is unused. */
static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);	/* should only be used early */
#endif
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}
1456
/* Early release_pte assumes that all pts are pinned, since there's
   only init_mm and anything attached to that is pinned.  The page is
   unpinned first and only then made writable again. */
static void __init xen_release_pte_init(unsigned long pfn)
{
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
1464
/* Early release for pmd (also installed as release_pud in
   xen_mmu_ops): just make the page writable again. */
static void __init xen_release_pmd_init(unsigned long pfn)
{
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
1469
1470static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1471{
1472	struct multicall_space mcs;
1473	struct mmuext_op *op;
1474
1475	mcs = __xen_mc_entry(sizeof(*op));
1476	op = mcs.args;
1477	op->cmd = cmd;
1478	op->arg1.mfn = pfn_to_mfn(pfn);
1479
1480	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1481}
1482
1483static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1484{
1485	struct multicall_space mcs;
1486	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1487
1488	mcs = __xen_mc_entry(0);
1489	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1490				pfn_pte(pfn, prot), 0);
1491}
1492
1493/* This needs to make sure the new pte page is pinned iff its being
1494   attached to a pinned pagetable. */
1495static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1496				    unsigned level)
1497{
1498	bool pinned = PagePinned(virt_to_page(mm->pgd));
1499
1500	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1501
1502	if (pinned) {
1503		struct page *page = pfn_to_page(pfn);
1504
1505		SetPagePinned(page);
1506
1507		if (!PageHighMem(page)) {
1508			xen_mc_batch();
1509
1510			__set_pfn_prot(pfn, PAGE_KERNEL_RO);
1511
1512			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1513				__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1514
1515			xen_mc_issue(PARAVIRT_LAZY_MMU);
1516		} else {
1517			/* make sure there are no stray mappings of
1518			   this page */
1519			kmap_flush_unused();
1520		}
1521	}
1522}
1523
/* alloc_pte hook once the allocator is up: PT_PTE level. */
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PTE);
}
1528
/* alloc_pmd hook once the allocator is up: PT_PMD level. */
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PMD);
}
1533
1534/* This should never happen until we're OK to use struct page */
1535static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1536{
1537	struct page *page = pfn_to_page(pfn);
1538	bool pinned = PagePinned(page);
1539
1540	trace_xen_mmu_release_ptpage(pfn, level, pinned);
1541
1542	if (pinned) {
1543		if (!PageHighMem(page)) {
1544			xen_mc_batch();
1545
1546			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1547				__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1548
1549			__set_pfn_prot(pfn, PAGE_KERNEL);
1550
1551			xen_mc_issue(PARAVIRT_LAZY_MMU);
1552		}
1553		ClearPagePinned(page);
1554	}
1555}
1556
/* release_pte hook once the allocator is up: PT_PTE level. */
static void xen_release_pte(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PTE);
}
1561
/* release_pmd hook once the allocator is up: PT_PMD level. */
static void xen_release_pmd(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PMD);
}
1566
1567#if PAGETABLE_LEVELS == 4
/* alloc_pud hook (4-level pagetables only): PT_PUD level. */
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PUD);
}
1572
/* release_pud hook (4-level pagetables only): PT_PUD level. */
static void xen_release_pud(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PUD);
}
1577#endif
1578
1579void __init xen_reserve_top(void)
1580{
1581#ifdef CONFIG_X86_32
1582	unsigned long top = HYPERVISOR_VIRT_START;
1583	struct xen_platform_parameters pp;
1584
1585	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1586		top = pp.virt_start;
1587
1588	reserve_top_address(-top);
1589#endif	/* CONFIG_X86_32 */
1590}
1591
/*
 * Like __va(), but returns address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 * On 32-bit this is simply __va().
 */
static void *__ka(phys_addr_t paddr)
{
#ifdef CONFIG_X86_64
	return (void *)(paddr + __START_KERNEL_map);
#else
	return __va(paddr);
#endif
}
1604
1605/* Convert a machine address to physical address */
1606static unsigned long m2p(phys_addr_t maddr)
1607{
1608	phys_addr_t paddr;
1609
1610	maddr &= PTE_PFN_MASK;
1611	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1612
1613	return paddr;
1614}
1615
/* Convert a machine address to kernel virtual (via m2p() and the
   kernel mapping given by __ka()). */
static void *m2v(phys_addr_t maddr)
{
	return __ka(m2p(maddr));
}
1621
1622/* Set the page permissions on an identity-mapped pages */
1623static void set_page_prot(void *addr, pgprot_t prot)
1624{
1625	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1626	pte_t pte = pfn_pte(pfn, prot);
1627
1628	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1629		BUG();
1630}
1631
/*
 * Fill in identity mappings for pfns [0, max_pfn) under @pmd.
 *
 * pmd entries that are already present have their pte page reused;
 * missing ones get a pte page carved out of level1_ident_pgt, which
 * is allocated from brk here.  Existing non-empty ptes are left
 * alone.  Afterwards the pte pages taken from level1_ident_pgt and
 * the pmd page itself are made read-only.
 */
static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
	unsigned pmdidx, pteidx;
	unsigned ident_pte;	/* next unused entry in level1_ident_pgt */
	unsigned long pfn;

	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
				      PAGE_SIZE);

	ident_pte = 0;
	pfn = 0;
	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
		pte_t *pte_page;

		/* Reuse or allocate a page of ptes */
		if (pmd_present(pmd[pmdidx]))
			pte_page = m2v(pmd[pmdidx].pmd);
		else {
			/* Check for free pte pages */
			if (ident_pte == LEVEL1_IDENT_ENTRIES)
				break;

			pte_page = &level1_ident_pgt[ident_pte];
			ident_pte += PTRS_PER_PTE;

			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
		}

		/* Install mappings */
		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
			pte_t pte;

#ifdef CONFIG_X86_32
			/* Track the highest pfn walked so far. */
			if (pfn > max_pfn_mapped)
				max_pfn_mapped = pfn;
#endif

			/* Keep whatever mapping is already there. */
			if (!pte_none(pte_page[pteidx]))
				continue;

			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
			pte_page[pteidx] = pte;
		}
	}

	/* Write-protect every pte page handed out above... */
	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

	/* ...and the pmd page itself. */
	set_page_prot(pmd, PAGE_KERNEL_RO);
}
1682
1683void __init xen_setup_machphys_mapping(void)
1684{
1685	struct xen_machphys_mapping mapping;
1686
1687	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1688		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1689		machine_to_phys_nr = mapping.max_mfn + 1;
1690	} else {
1691		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1692	}
1693#ifdef CONFIG_X86_32
1694	WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1695		< machine_to_phys_mapping);
1696#endif
1697}
1698
1699#ifdef CONFIG_X86_64
1700static void convert_pfn_mfn(void *v)
1701{
1702	pte_t *pte = v;
1703	int i;
1704
1705	/* All levels are converted the same way, so just treat them
1706	   as ptes. */
1707	for (i = 0; i < PTRS_PER_PTE; i++)
1708		pte[i] = xen_make_pte(pte[i].pte);
1709}
1710
/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen provided pagetable into
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
 * means that only the kernel has a physical mapping to start with -
 * but that's enough to get __va working.  We need to fill in the rest
 * of the physical mapping once some sort of allocator has been set
 * up.
 *
 * @pgd:     the Xen-provided top-level pagetable
 * @max_pfn: upper bound for the early identity map
 *
 * Returns init_level4_pgt, which is the active pagetable on return.
 */
pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
					 unsigned long max_pfn)
{
	pud_t *l3;
	pmd_t *l2;

	/* max_pfn_mapped is the last pfn mapped in the initial memory
	 * mappings. Considering that on Xen after the kernel mappings we
	 * have the mappings of some pages that don't exist in pfn space, we
	 * set max_pfn_mapped to the last real pfn mapped. */
	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));

	/* Zap identity mapping */
	init_level4_pgt[0] = __pgd(0);

	/* Pre-constructed entries are in pfn, so convert to mfn */
	convert_pfn_mfn(init_level4_pgt);
	convert_pfn_mfn(level3_ident_pgt);
	convert_pfn_mfn(level3_kernel_pgt);

	/* Locate Xen's L2 covering __START_KERNEL_map and copy it. */
	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);

	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

	/* Same lookup for __START_KERNEL_map + PMD_SIZE; copied into the
	   fixmap L2. */
	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);

	/* Set up identity map */
	xen_map_identity_early(level2_ident_pgt, max_pfn);

	/* Make pagetable pieces RO */
	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);

	/* Pin down new L4 */
	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
			  PFN_DOWN(__pa_symbol(init_level4_pgt)));

	/* Unpin Xen-provided one */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	/* Switch over */
	pgd = init_level4_pgt;

	/*
	 * At this stage there can be no user pgd, and no page
	 * structure to attach it to, so make sure we just set kernel
	 * pgd.
	 */
	xen_mc_batch();
	__xen_write_cr3(true, __pa(pgd));
	xen_mc_issue(PARAVIRT_LAZY_CPU);

	/* Reserve the frames of the Xen-provided pagetable. */
	memblock_reserve(__pa(xen_start_info->pt_base),
			 xen_start_info->nr_pt_frames * PAGE_SIZE);

	return pgd;
}
1787#else	/* !CONFIG_X86_64 */
/* brk-backed kernel PMDs for the 32-bit bring-up: one used with
   initial_page_table, one used with swapper_pg_dir. */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1790
/*
 * 32-bit boot-time write_cr3 hook.  It expects exactly one call, the
 * switch from initial_page_table to swapper_pg_dir (both conditions
 * are BUG_ON-checked), and finishes by installing xen_write_cr3 as
 * the write_cr3 operation.
 */
static void __init xen_write_cr3_init(unsigned long cr3)
{
	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));

	BUG_ON(read_cr3() != __pa(initial_page_table));
	BUG_ON(cr3 != __pa(swapper_pg_dir));

	/*
	 * We are switching to swapper_pg_dir for the first time (from
	 * initial_page_table) and therefore need to mark that page
	 * read-only and then pin it.
	 *
	 * Xen disallows sharing of kernel PMDs for PAE
	 * guests. Therefore we must copy the kernel PMD from
	 * initial_page_table into a new kernel PMD to be used in
	 * swapper_pg_dir.
	 */
	swapper_kernel_pmd =
		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
	memcpy(swapper_kernel_pmd, initial_kernel_pmd,
	       sizeof(pmd_t) * PTRS_PER_PMD);
	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);

	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
	xen_write_cr3(cr3);
	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);

	/* The old pagetable is no longer in use: unpin it and make its
	   pages writable again. */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
			  PFN_DOWN(__pa(initial_page_table)));
	set_page_prot(initial_page_table, PAGE_KERNEL);
	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);

	/* From now on use the regular implementation. */
	pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
1827
/*
 * 32-bit variant: build initial_page_table from the Xen-provided @pgd.
 *
 * The kernel PMD is copied into a brk-allocated initial_kernel_pmd,
 * extended with an early identity map up to @max_pfn, and installed in
 * a copy of the pgd.  The new tables are made read-only, the Xen
 * pagetable is unpinned, the new one is pinned and loaded.
 *
 * Returns initial_page_table.
 */
pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
					 unsigned long max_pfn)
{
	pmd_t *kernel_pmd;

	initial_kernel_pmd =
		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);

	/* End of the Xen-provided pagetable frames plus 512kB. */
	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
				  xen_start_info->nr_pt_frames * PAGE_SIZE +
				  512*1024);

	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);

	xen_map_identity_early(initial_kernel_pmd, max_pfn);

	memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
	initial_page_table[KERNEL_PGD_BOUNDARY] =
		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);

	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);

	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
			  PFN_DOWN(__pa(initial_page_table)));
	xen_write_cr3(__pa(initial_page_table));

	/* Reserve the frames of the Xen-provided pagetable. */
	memblock_reserve(__pa(xen_start_info->pt_base),
			 xen_start_info->nr_pt_frames * PAGE_SIZE);

	return initial_page_table;
}
1864#endif	/* CONFIG_X86_64 */
1865
/* Page mapped in place of the local APIC and IO APIC fixmap slots;
   filled with 0xff by xen_init_mmu_ops(). */
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1867
/*
 * set_fixmap hook.  Depending on the fixmap slot @idx, @phys is
 * treated as a pfn (local page mappings), replaced by dummy_mapping
 * (APIC slots), or treated as an mfn (paravirt bootmap and, by
 * default, hardware mappings, the latter with _PAGE_IOMAP added).
 * On 64-bit, vsyscall/vvar slots are mirrored into
 * level3_user_vsyscall.
 */
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
	pte_t pte;

	/* From here on @phys is a frame number, not an address. */
	phys >>= PAGE_SHIFT;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_F00F_BUG
	case FIX_F00F_IDT:
#endif
#ifdef CONFIG_X86_32
	case FIX_WP_TEST:
	case FIX_VDSO:
# ifdef CONFIG_HIGHMEM
	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#else
	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
	case VVAR_PAGE:
#endif
	case FIX_TEXT_POKE0:
	case FIX_TEXT_POKE1:
		/* All local page mappings */
		pte = pfn_pte(phys, prot);
		break;

#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:	/* maps dummy local APIC */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

#ifdef CONFIG_X86_IO_APIC
	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
		/*
		 * We just don't map the IO APIC - all access is via
		 * hypercalls.  The slot is pointed at dummy_mapping.
		 * NOTE(review): an earlier wording said the address is
		 * kept in the pte for reference, but @phys is not used
		 * in this branch.
		 */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

	case FIX_PARAVIRT_BOOTMAP:
		/* This is an MFN, but it isn't an IO mapping from the
		   IO domain */
		pte = mfn_pte(phys, prot);
		break;

	default:
		/* By default, set_fixmap is used for hardware mappings */
		pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
		break;
	}

	__native_set_fixmap(idx, pte);

#ifdef CONFIG_X86_64
	/* Replicate changes to map the vsyscall page into the user
	   pagetable vsyscall mapping. */
	if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) ||
	    idx == VVAR_PAGE) {
		unsigned long vaddr = __fix_to_virt(idx);
		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
	}
#endif
}
1935
/*
 * Replace the boot-time (*_init / *_hyper) entries of pv_mmu_ops with
 * the regular run-time implementations, then mark the init_mm
 * pagetable (and, on 64-bit, level3_user_vsyscall) as pinned.
 */
static void __init xen_post_allocator_init(void)
{
	pv_mmu_ops.set_pte = xen_set_pte;
	pv_mmu_ops.set_pmd = xen_set_pmd;
	pv_mmu_ops.set_pud = xen_set_pud;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.set_pgd = xen_set_pgd;
#endif

	/* This will work as long as patching hasn't happened yet
	   (which it hasn't) */
	pv_mmu_ops.alloc_pte = xen_alloc_pte;
	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
	pv_mmu_ops.release_pte = xen_release_pte;
	pv_mmu_ops.release_pmd = xen_release_pmd;
#if PAGETABLE_LEVELS == 4
	pv_mmu_ops.alloc_pud = xen_alloc_pud;
	pv_mmu_ops.release_pud = xen_release_pud;
#endif

#ifdef CONFIG_X86_64
	SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
	xen_mark_init_mm_pinned();
}
1961
/* lazy_mode.leave hook: flush any batched multicalls before leaving
   lazy MMU mode, with preemption disabled throughout. */
static void xen_leave_lazy_mmu(void)
{
	preempt_disable();
	xen_mc_flush();
	paravirt_leave_lazy_mmu();
	preempt_enable();
}
1969
/*
 * Boot-time template for pv_mmu_ops; xen_init_mmu_ops() copies it
 * wholesale.  The *_init and *_hyper entries are the early variants
 * that xen_post_allocator_init() (and, on 32-bit,
 * xen_write_cr3_init()) later replace in pv_mmu_ops.
 */
static const struct pv_mmu_ops xen_mmu_ops __initconst = {
	.read_cr2 = xen_read_cr2,
	.write_cr2 = xen_write_cr2,

	.read_cr3 = xen_read_cr3,
#ifdef CONFIG_X86_32
	.write_cr3 = xen_write_cr3_init,
#else
	.write_cr3 = xen_write_cr3,
#endif

	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,
	.pte_update_defer = paravirt_nop,

	.pgd_alloc = xen_pgd_alloc,
	.pgd_free = xen_pgd_free,

	.alloc_pte = xen_alloc_pte_init,
	.release_pte = xen_release_pte_init,
	.alloc_pmd = xen_alloc_pmd_init,
	.release_pmd = xen_release_pmd_init,

	.set_pte = xen_set_pte_init,
	.set_pte_at = xen_set_pte_at,
	.set_pmd = xen_set_pmd_hyper,

	.ptep_modify_prot_start = __ptep_modify_prot_start,
	.ptep_modify_prot_commit = __ptep_modify_prot_commit,

	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

#ifdef CONFIG_X86_PAE
	.set_pte_atomic = xen_set_pte_atomic,
	.pte_clear = xen_pte_clear,
	.pmd_clear = xen_pmd_clear,
#endif	/* CONFIG_X86_PAE */
	.set_pud = xen_set_pud_hyper,

	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

#if PAGETABLE_LEVELS == 4
	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
	.set_pgd = xen_set_pgd_hyper,

	/* The early pmd helpers double as the pud ones. */
	.alloc_pud = xen_alloc_pmd_init,
	.release_pud = xen_release_pmd_init,
#endif	/* PAGETABLE_LEVELS == 4 */

	.activate_mm = xen_activate_mm,
	.dup_mmap = xen_dup_mmap,
	.exit_mmap = xen_exit_mmap,

	.lazy_mode = {
		.enter = paravirt_enter_lazy_mmu,
		.leave = xen_leave_lazy_mmu,
	},

	.set_fixmap = xen_set_fixmap,
};
2040
/*
 * Install the Xen MMU hooks: the x86_init pagetable callbacks and the
 * whole pv_mmu_ops table.  Also fills dummy_mapping with 0xff bytes.
 */
void __init xen_init_mmu_ops(void)
{
	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
	pv_mmu_ops = xen_mmu_ops;

	memset(dummy_mapping, 0xff, PAGE_SIZE);
}
2050
/*
 * Scratch array of frame numbers for the contiguous-region helpers
 * below; sized for the largest supported order.
 * Protected by xen_reservation_lock.
 */
#define MAX_CONTIG_ORDER 9 /* 2MB */
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2054
2055#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2056static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2057				unsigned long *in_frames,
2058				unsigned long *out_frames)
2059{
2060	int i;
2061	struct multicall_space mcs;
2062
2063	xen_mc_batch();
2064	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2065		mcs = __xen_mc_entry(0);
2066
2067		if (in_frames)
2068			in_frames[i] = virt_to_mfn(vaddr);
2069
2070		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2071		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2072
2073		if (out_frames)
2074			out_frames[i] = virt_to_pfn(vaddr);
2075	}
2076	xen_mc_issue(0);
2077}
2078
2079/*
2080 * Update the pfn-to-mfn mappings for a virtual address range, either to
2081 * point to an array of mfns, or contiguously from a single starting
2082 * mfn.
2083 */
2084static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2085				     unsigned long *mfns,
2086				     unsigned long first_mfn)
2087{
2088	unsigned i, limit;
2089	unsigned long mfn;
2090
2091	xen_mc_batch();
2092
2093	limit = 1u << order;
2094	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2095		struct multicall_space mcs;
2096		unsigned flags;
2097
2098		mcs = __xen_mc_entry(0);
2099		if (mfns)
2100			mfn = mfns[i];
2101		else
2102			mfn = first_mfn + i;
2103
2104		if (i < (limit - 1))
2105			flags = 0;
2106		else {
2107			if (order == 0)
2108				flags = UVMF_INVLPG | UVMF_ALL;
2109			else
2110				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2111		}
2112
2113		MULTI_update_va_mapping(mcs.mc, vaddr,
2114				mfn_pte(mfn, PAGE_KERNEL), flags);
2115
2116		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2117	}
2118
2119	xen_mc_issue(0);
2120}
2121
2122/*
2123 * Perform the hypercall to exchange a region of our pfns to point to
2124 * memory with the required contiguous alignment.  Takes the pfns as
2125 * input, and populates mfns as output.
2126 *
2127 * Returns a success code indicating whether the hypervisor was able to
2128 * satisfy the request or not.
2129 */
2130static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2131			       unsigned long *pfns_in,
2132			       unsigned long extents_out,
2133			       unsigned int order_out,
2134			       unsigned long *mfns_out,
2135			       unsigned int address_bits)
2136{
2137	long rc;
2138	int success;
2139
2140	struct xen_memory_exchange exchange = {
2141		.in = {
2142			.nr_extents   = extents_in,
2143			.extent_order = order_in,
2144			.extent_start = pfns_in,
2145			.domid        = DOMID_SELF
2146		},
2147		.out = {
2148			.nr_extents   = extents_out,
2149			.extent_order = order_out,
2150			.extent_start = mfns_out,
2151			.address_bits = address_bits,
2152			.domid        = DOMID_SELF
2153		}
2154	};
2155
2156	BUG_ON(extents_in << order_in != extents_out << order_out);
2157
2158	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2159	success = (exchange.nr_exchanged == extents_in);
2160
2161	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2162	BUG_ON(success && (rc != 0));
2163
2164	return success;
2165}
2166
2167int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2168				 unsigned int address_bits)
2169{
2170	unsigned long *in_frames = discontig_frames, out_frame;
2171	unsigned long  flags;
2172	int            success;
2173
2174	/*
2175	 * Currently an auto-translated guest will not perform I/O, nor will
2176	 * it require PAE page directories below 4GB. Therefore any calls to
2177	 * this function are redundant and can be ignored.
2178	 */
2179
2180	if (xen_feature(XENFEAT_auto_translated_physmap))
2181		return 0;
2182
2183	if (unlikely(order > MAX_CONTIG_ORDER))
2184		return -ENOMEM;
2185
2186	memset((void *) vstart, 0, PAGE_SIZE << order);
2187
2188	spin_lock_irqsave(&xen_reservation_lock, flags);
2189
2190	/* 1. Zap current PTEs, remembering MFNs. */
2191	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2192
2193	/* 2. Get a new contiguous memory extent. */
2194	out_frame = virt_to_pfn(vstart);
2195	success = xen_exchange_memory(1UL << order, 0, in_frames,
2196				      1, order, &out_frame,
2197				      address_bits);
2198
2199	/* 3. Map the new extent in place of old pages. */
2200	if (success)
2201		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2202	else
2203		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2204
2205	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2206
2207	return success ? 0 : -ENOMEM;
2208}
2209EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2210
2211void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2212{
2213	unsigned long *out_frames = discontig_frames, in_frame;
2214	unsigned long  flags;
2215	int success;
2216
2217	if (xen_feature(XENFEAT_auto_translated_physmap))
2218		return;
2219
2220	if (unlikely(order > MAX_CONTIG_ORDER))
2221		return;
2222
2223	memset((void *) vstart, 0, PAGE_SIZE << order);
2224
2225	spin_lock_irqsave(&xen_reservation_lock, flags);
2226
2227	/* 1. Find start MFN of contiguous extent. */
2228	in_frame = virt_to_mfn(vstart);
2229
2230	/* 2. Zap current PTEs. */
2231	xen_zap_pfn_range(vstart, order, NULL, out_frames);
2232
2233	/* 3. Do the exchange for non-contiguous MFNs. */
2234	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2235					0, out_frames, 0);
2236
2237	/* 4. Map new pages in place of old pages. */
2238	if (success)
2239		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2240	else
2241		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2242
2243	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2244}
2245EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2246
2247#ifdef CONFIG_XEN_PVHVM
2248static void xen_hvm_exit_mmap(struct mm_struct *mm)
2249{
2250	struct xen_hvm_pagetable_dying a;
2251	int rc;
2252
2253	a.domid = DOMID_SELF;
2254	a.gpa = __pa(mm->pgd);
2255	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2256	WARN_ON_ONCE(rc < 0);
2257}
2258
2259static int is_pagetable_dying_supported(void)
2260{
2261	struct xen_hvm_pagetable_dying a;
2262	int rc = 0;
2263
2264	a.domid = DOMID_SELF;
2265	a.gpa = 0x00;
2266	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2267	if (rc < 0) {
2268		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2269		return 0;
2270	}
2271	return 1;
2272}
2273
2274void __init xen_hvm_init_mmu_ops(void)
2275{
2276	if (is_pagetable_dying_supported())
2277		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2278}
2279#endif
2280
2281#define REMAP_BATCH_SIZE 16
2282
2283struct remap_data {
2284	unsigned long mfn;
2285	pgprot_t prot;
2286	struct mmu_update *mmu_update;
2287};
2288
2289static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2290				 unsigned long addr, void *data)
2291{
2292	struct remap_data *rmd = data;
2293	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2294
2295	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2296	rmd->mmu_update->val = pte_val_ma(pte);
2297	rmd->mmu_update++;
2298
2299	return 0;
2300}
2301
2302int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2303			       unsigned long addr,
2304			       unsigned long mfn, int nr,
2305			       pgprot_t prot, unsigned domid)
2306{
2307	struct remap_data rmd;
2308	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2309	int batch;
2310	unsigned long range;
2311	int err = 0;
2312
2313	prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2314
2315	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2316				(VM_PFNMAP | VM_RESERVED | VM_IO)));
2317
2318	rmd.mfn = mfn;
2319	rmd.prot = prot;
2320
2321	while (nr) {
2322		batch = min(REMAP_BATCH_SIZE, nr);
2323		range = (unsigned long)batch << PAGE_SHIFT;
2324
2325		rmd.mmu_update = mmu_update;
2326		err = apply_to_page_range(vma->vm_mm, addr, range,
2327					  remap_area_mfn_pte_fn, &rmd);
2328		if (err)
2329			goto out;
2330
2331		err = -EFAULT;
2332		if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2333			goto out;
2334
2335		nr -= batch;
2336		addr += range;
2337	}
2338
2339	err = 0;
2340out:
2341
2342	flush_tlb_all();
2343
2344	return err;
2345}
2346EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);