Linux Audio

Check our new training course

Loading...
v3.5.6
   1/*
   2 * Xen mmu operations
   3 *
   4 * This file contains the various mmu fetch and update operations.
   5 * The most important job they must perform is the mapping between the
   6 * domain's pfn and the overall machine mfns.
   7 *
   8 * Xen allows guests to directly update the pagetable, in a controlled
   9 * fashion.  In other words, the guest modifies the same pagetable
  10 * that the CPU actually uses, which eliminates the overhead of having
  11 * a separate shadow pagetable.
  12 *
  13 * In order to allow this, it falls on the guest domain to map its
  14 * notion of a "physical" pfn - which is just a domain-local linear
  15 * address - into a real "machine address" which the CPU's MMU can
  16 * use.
  17 *
  18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19 * inserted directly into the pagetable.  When creating a new
  20 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22 * the mfn back into a pfn.
  23 *
  24 * The other constraint is that all pages which make up a pagetable
  25 * must be mapped read-only in the guest.  This prevents uncontrolled
  26 * guest updates to the pagetable.  Xen strictly enforces this, and
  27 * will disallow any pagetable update which will end up mapping a
  28 * pagetable page RW, and will disallow using any writable page as a
  29 * pagetable.
  30 *
  31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32 * would need to validate the whole pagetable before going on.
  33 * Naturally, this is quite slow.  The solution is to "pin" a
  34 * pagetable, which enforces all the constraints on the pagetable even
  35 * when it is not actively in use.  This means that Xen can be assured
  36 * that it is still valid when you do load it into %cr3, and doesn't
  37 * need to revalidate it.
  38 *
  39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40 */
  41#include <linux/sched.h>
  42#include <linux/highmem.h>
  43#include <linux/debugfs.h>
  44#include <linux/bug.h>
  45#include <linux/vmalloc.h>
  46#include <linux/module.h>
  47#include <linux/gfp.h>
  48#include <linux/memblock.h>
  49#include <linux/seq_file.h>
 
  50
  51#include <trace/events/xen.h>
  52
  53#include <asm/pgtable.h>
  54#include <asm/tlbflush.h>
  55#include <asm/fixmap.h>
  56#include <asm/mmu_context.h>
  57#include <asm/setup.h>
  58#include <asm/paravirt.h>
  59#include <asm/e820.h>
  60#include <asm/linkage.h>
  61#include <asm/page.h>
  62#include <asm/init.h>
  63#include <asm/pat.h>
  64#include <asm/smp.h>
  65
  66#include <asm/xen/hypercall.h>
  67#include <asm/xen/hypervisor.h>
  68
  69#include <xen/xen.h>
  70#include <xen/page.h>
  71#include <xen/interface/xen.h>
  72#include <xen/interface/hvm/hvm_op.h>
  73#include <xen/interface/version.h>
  74#include <xen/interface/memory.h>
  75#include <xen/hvc-console.h>
  76
  77#include "multicalls.h"
  78#include "mmu.h"
  79#include "debugfs.h"
  80
  81/*
  82 * Protects atomic reservation decrease/increase against concurrent increases.
  83 * Also protects non-atomic updates of current_pages and balloon lists.
     *
     * Defined without 'static', so the lock is visible outside this file.
  84 */
  85DEFINE_SPINLOCK(xen_reservation_lock);
  86
 
  87/*
  88 * Identity map, in addition to plain kernel map.  This needs to be
  89 * large enough to allocate page table pages to allocate the rest.
  90 * Each page can map 2MB.
     *
     * LEVEL1_IDENT_ENTRIES is four pte pages' worth of entries
     * (PTRS_PER_PTE * 4); level1_ident_pgt is the pte array of that size
     * reserved through RESERVE_BRK_ARRAY().
  91 */
  92#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
  93static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
  94
  95#ifdef CONFIG_X86_64
  96/*
     * Level-3 (pud) table for the userspace vsyscall mapping: one table of
     * PTRS_PER_PUD entries, declared __page_aligned_bss.  Only built on
     * 64-bit kernels.
     */
  97static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
  98#endif /* CONFIG_X86_64 */
  99
 100/*
 101 * Note about cr3 (pagetable base) values:
 102 *
 103 * xen_cr3 contains the current logical cr3 value; it contains the
 104 * last set cr3.  This may not be the current effective cr3, because
 105 * its update may be being lazily deferred.  However, a vcpu looking
 106 * at its own cr3 can use this value knowing that everything will
 107 * be self-consistent.
 108 *
 109 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 110 * hypercall to set the vcpu cr3 is complete (so it may be a little
 111 * out of date, but it will never be set early).  If one vcpu is
 112 * looking at another vcpu's cr3 value, it should use this variable.
     *
     * Both are per-cpu variables of type unsigned long.
 113 */
 114DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
 115DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
 116
 
 117
 118/*
 119 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 120 * redzone above it, so round it up to a PGD boundary.
     *
     * The value is an address, not a pgd index; users in this file turn
     * it into an index with pgd_index(USER_LIMIT) or pass it as a walk
     * limit.
 121 */
 122#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 123
 124unsigned long arbitrary_virt_to_mfn(void *vaddr)
 125{
 126	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 127
 128	return PFN_DOWN(maddr.maddr);
 129}
 130
 131xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 132{
 133	unsigned long address = (unsigned long)vaddr;
 134	unsigned int level;
 135	pte_t *pte;
 136	unsigned offset;
 137
 138	/*
 139	 * if the PFN is in the linear mapped vaddr range, we can just use
 140	 * the (quick) virt_to_machine() p2m lookup
 141	 */
 142	if (virt_addr_valid(vaddr))
 143		return virt_to_machine(vaddr);
 144
 145	/* otherwise we have to do a (slower) full page-table walk */
 146
 147	pte = lookup_address(address, &level);
 148	BUG_ON(pte == NULL);
 149	offset = address & ~PAGE_MASK;
 150	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 151}
 152EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
 153
 154void make_lowmem_page_readonly(void *vaddr)
 155{
 156	pte_t *pte, ptev;
 157	unsigned long address = (unsigned long)vaddr;
 158	unsigned int level;
 159
 160	pte = lookup_address(address, &level);
 161	if (pte == NULL)
 162		return;		/* vaddr missing */
 163
 164	ptev = pte_wrprotect(*pte);
 165
 166	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 167		BUG();
 168}
 169
 170void make_lowmem_page_readwrite(void *vaddr)
 171{
 172	pte_t *pte, ptev;
 173	unsigned long address = (unsigned long)vaddr;
 174	unsigned int level;
 175
 176	pte = lookup_address(address, &level);
 177	if (pte == NULL)
 178		return;		/* vaddr missing */
 179
 180	ptev = pte_mkwrite(*pte);
 181
 182	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 183		BUG();
 184}
 185
 186
 187static bool xen_page_pinned(void *ptr)
 188{
 189	struct page *page = virt_to_page(ptr);
 190
 191	return PagePinned(page);
 192}
 193
 194void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 195{
 196	struct multicall_space mcs;
 197	struct mmu_update *u;
 198
 199	trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
 200
 201	mcs = xen_mc_entry(sizeof(*u));
 202	u = mcs.args;
 203
 204	/* ptep might be kmapped when using 32-bit HIGHPTE */
 205	u->ptr = virt_to_machine(ptep).maddr;
 206	u->val = pte_val_ma(pteval);
 207
 208	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 209
 210	xen_mc_issue(PARAVIRT_LAZY_MMU);
 211}
 212EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 213
 214static void xen_extend_mmu_update(const struct mmu_update *update)
 215{
 216	struct multicall_space mcs;
 217	struct mmu_update *u;
 218
 219	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 220
 221	if (mcs.mc != NULL) {
 222		mcs.mc->args[1]++;
 223	} else {
 224		mcs = __xen_mc_entry(sizeof(*u));
 225		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 226	}
 227
 228	u = mcs.args;
 229	*u = *update;
 230}
 231
 232static void xen_extend_mmuext_op(const struct mmuext_op *op)
 233{
 234	struct multicall_space mcs;
 235	struct mmuext_op *u;
 236
 237	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
 238
 239	if (mcs.mc != NULL) {
 240		mcs.mc->args[1]++;
 241	} else {
 242		mcs = __xen_mc_entry(sizeof(*u));
 243		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 244	}
 245
 246	u = mcs.args;
 247	*u = *op;
 248}
 249
 250static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 251{
 252	struct mmu_update u;
 253
 254	preempt_disable();
 255
 256	xen_mc_batch();
 257
 258	/* ptr may be ioremapped for 64-bit pagetable setup */
 259	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 260	u.val = pmd_val_ma(val);
 261	xen_extend_mmu_update(&u);
 262
 263	xen_mc_issue(PARAVIRT_LAZY_MMU);
 264
 265	preempt_enable();
 266}
 267
 268static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 269{
 270	trace_xen_mmu_set_pmd(ptr, val);
 271
 272	/* If page is not pinned, we can just update the entry
 273	   directly */
 274	if (!xen_page_pinned(ptr)) {
 275		*ptr = val;
 276		return;
 277	}
 278
 279	xen_set_pmd_hyper(ptr, val);
 280}
 281
 282/*
 283 * Associate a virtual page frame with a given physical page frame
 284 * and protection flags for that frame.
 285 */
 286void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 287{
 288	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 289}
 290
 291static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 292{
 293	struct mmu_update u;
 294
 295	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
 296		return false;
 297
 298	xen_mc_batch();
 299
 300	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 301	u.val = pte_val_ma(pteval);
 302	xen_extend_mmu_update(&u);
 303
 304	xen_mc_issue(PARAVIRT_LAZY_MMU);
 305
 306	return true;
 307}
 308
 309static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 310{
 311	if (!xen_batched_set_pte(ptep, pteval))
 312		native_set_pte(ptep, pteval);
 
 
 
 
 
 
 
 
 
 
 
 
 313}
 314
 315static void xen_set_pte(pte_t *ptep, pte_t pteval)
 316{
 317	trace_xen_mmu_set_pte(ptep, pteval);
 318	__xen_set_pte(ptep, pteval);
 319}
 320
 321static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 322		    pte_t *ptep, pte_t pteval)
 323{
 324	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
 325	__xen_set_pte(ptep, pteval);
 326}
 327
 328pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 329				 unsigned long addr, pte_t *ptep)
 330{
 331	/* Just return the pte as-is.  We preserve the bits on commit */
 332	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
 333	return *ptep;
 334}
 335
 336void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 337				 pte_t *ptep, pte_t pte)
 338{
 339	struct mmu_update u;
 340
 341	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
 342	xen_mc_batch();
 343
 344	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 345	u.val = pte_val_ma(pte);
 346	xen_extend_mmu_update(&u);
 347
 348	xen_mc_issue(PARAVIRT_LAZY_MMU);
 349}
 350
 351/* Assume pteval_t is equivalent to all the other *val_t types. */
 352static pteval_t pte_mfn_to_pfn(pteval_t val)
 353{
 354	if (val & _PAGE_PRESENT) {
 355		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 356		unsigned long pfn = mfn_to_pfn(mfn);
 357
 358		pteval_t flags = val & PTE_FLAGS_MASK;
 359		if (unlikely(pfn == ~0))
 360			val = flags & ~_PAGE_PRESENT;
 361		else
 362			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 363	}
 364
 365	return val;
 366}
 367
 368static pteval_t pte_pfn_to_mfn(pteval_t val)
 369{
 370	if (val & _PAGE_PRESENT) {
 371		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 372		pteval_t flags = val & PTE_FLAGS_MASK;
 373		unsigned long mfn;
 374
 375		if (!xen_feature(XENFEAT_auto_translated_physmap))
 376			mfn = get_phys_to_machine(pfn);
 377		else
 378			mfn = pfn;
 379		/*
 380		 * If there's no mfn for the pfn, then just create an
 381		 * empty non-present pte.  Unfortunately this loses
 382		 * information about the original pfn, so
 383		 * pte_mfn_to_pfn is asymmetric.
 384		 */
 385		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 386			mfn = 0;
 387			flags = 0;
 388		} else {
 389			/*
 390			 * Paramount to do this test _after_ the
 391			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
 392			 * IDENTITY_FRAME_BIT resolves to true.
 393			 */
 394			mfn &= ~FOREIGN_FRAME_BIT;
 395			if (mfn & IDENTITY_FRAME_BIT) {
 396				mfn &= ~IDENTITY_FRAME_BIT;
 397				flags |= _PAGE_IOMAP;
 398			}
 399		}
 400		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 401	}
 402
 403	return val;
 404}
 405
 406static pteval_t iomap_pte(pteval_t val)
 407{
 408	if (val & _PAGE_PRESENT) {
 409		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 410		pteval_t flags = val & PTE_FLAGS_MASK;
 411
 412		/* We assume the pte frame number is a MFN, so
 413		   just use it as-is. */
 414		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 415	}
 416
 417	return val;
 418}
 419
 420static pteval_t xen_pte_val(pte_t pte)
 421{
 422	pteval_t pteval = pte.pte;
 423#if 0
 424	/* If this is a WC pte, convert back from Xen WC to Linux WC */
 425	if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
 426		WARN_ON(!pat_enabled);
 427		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 428	}
 429#endif
 430	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
 431		return pteval;
 432
 433	return pte_mfn_to_pfn(pteval);
 434}
 435PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 436
 437static pgdval_t xen_pgd_val(pgd_t pgd)
 438{
 439	return pte_mfn_to_pfn(pgd.pgd);
 440}
 441PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 442
 443/*
 444 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
 445 * are reserved for now, to correspond to the Intel-reserved PAT
 446 * types.
 447 *
 448 * We expect Linux's PAT set as follows:
 449 *
 450 * Idx  PTE flags        Linux    Xen    Default
 451 * 0                     WB       WB     WB
 452 * 1            PWT      WC       WT     WT
 453 * 2        PCD          UC-      UC-    UC-
 454 * 3        PCD PWT      UC       UC     UC
 455 * 4    PAT              WB       WC     WB
 456 * 5    PAT     PWT      WC       WP     WT
 457 * 6    PAT PCD          UC-      UC     UC-
 458 * 7    PAT PCD PWT      UC       UC     UC
 459 */
 460
 461void xen_set_pat(u64 pat)
 462{
 463	/* We expect Linux to use a PAT setting of
 464	 * UC UC- WC WB (ignoring the PAT flag) */
 465	WARN_ON(pat != 0x0007010600070106ull);
 466}
 467
 468static pte_t xen_make_pte(pteval_t pte)
 469{
 470	phys_addr_t addr = (pte & PTE_PFN_MASK);
 471#if 0
 472	/* If Linux is trying to set a WC pte, then map to the Xen WC.
 473	 * If _PAGE_PAT is set, then it probably means it is really
 474	 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
 475	 * things work out OK...
 476	 *
 477	 * (We should never see kernel mappings with _PAGE_PSE set,
 478	 * but we could see hugetlbfs mappings, I think.).
 479	 */
 480	if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
 481		if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
 482			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 483	}
 484#endif
 485	/*
 486	 * Unprivileged domains are allowed to do IOMAPpings for
 487	 * PCI passthrough, but not map ISA space.  The ISA
 488	 * mappings are just dummy local mappings to keep other
 489	 * parts of the kernel happy.
 490	 */
 491	if (unlikely(pte & _PAGE_IOMAP) &&
 492	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 493		pte = iomap_pte(pte);
 494	} else {
 495		pte &= ~_PAGE_IOMAP;
 496		pte = pte_pfn_to_mfn(pte);
 497	}
 498
 499	return native_make_pte(pte);
 500}
 501PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 502
 503static pgd_t xen_make_pgd(pgdval_t pgd)
 504{
 505	pgd = pte_pfn_to_mfn(pgd);
 506	return native_make_pgd(pgd);
 507}
 508PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 509
 510static pmdval_t xen_pmd_val(pmd_t pmd)
 511{
 512	return pte_mfn_to_pfn(pmd.pmd);
 513}
 514PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 515
 516static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 517{
 518	struct mmu_update u;
 519
 520	preempt_disable();
 521
 522	xen_mc_batch();
 523
 524	/* ptr may be ioremapped for 64-bit pagetable setup */
 525	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 526	u.val = pud_val_ma(val);
 527	xen_extend_mmu_update(&u);
 528
 529	xen_mc_issue(PARAVIRT_LAZY_MMU);
 530
 531	preempt_enable();
 532}
 533
 534static void xen_set_pud(pud_t *ptr, pud_t val)
 535{
 536	trace_xen_mmu_set_pud(ptr, val);
 537
 538	/* If page is not pinned, we can just update the entry
 539	   directly */
 540	if (!xen_page_pinned(ptr)) {
 541		*ptr = val;
 542		return;
 543	}
 544
 545	xen_set_pud_hyper(ptr, val);
 546}
 547
 548#ifdef CONFIG_X86_PAE
 549static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 550{
 551	trace_xen_mmu_set_pte_atomic(ptep, pte);
 552	set_64bit((u64 *)ptep, native_pte_val(pte));
 553}
 554
 555static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 556{
 557	trace_xen_mmu_pte_clear(mm, addr, ptep);
 558	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
 559		native_pte_clear(mm, addr, ptep);
 560}
 561
 562static void xen_pmd_clear(pmd_t *pmdp)
 563{
 564	trace_xen_mmu_pmd_clear(pmdp);
 565	set_pmd(pmdp, __pmd(0));
 566}
 567#endif	/* CONFIG_X86_PAE */
 568
 569static pmd_t xen_make_pmd(pmdval_t pmd)
 570{
 571	pmd = pte_pfn_to_mfn(pmd);
 572	return native_make_pmd(pmd);
 573}
 574PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 575
 576#if PAGETABLE_LEVELS == 4
 577static pudval_t xen_pud_val(pud_t pud)
 578{
 579	return pte_mfn_to_pfn(pud.pud);
 580}
 581PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 582
 583static pud_t xen_make_pud(pudval_t pud)
 584{
 585	pud = pte_pfn_to_mfn(pud);
 586
 587	return native_make_pud(pud);
 588}
 589PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 590
 591static pgd_t *xen_get_user_pgd(pgd_t *pgd)
 592{
 593	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 594	unsigned offset = pgd - pgd_page;
 595	pgd_t *user_ptr = NULL;
 596
 597	if (offset < pgd_index(USER_LIMIT)) {
 598		struct page *page = virt_to_page(pgd_page);
 599		user_ptr = (pgd_t *)page->private;
 600		if (user_ptr)
 601			user_ptr += offset;
 602	}
 603
 604	return user_ptr;
 605}
 606
 607static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 608{
 609	struct mmu_update u;
 610
 611	u.ptr = virt_to_machine(ptr).maddr;
 612	u.val = pgd_val_ma(val);
 613	xen_extend_mmu_update(&u);
 614}
 615
 616/*
 617 * Raw hypercall-based set_pgd, intended for in early boot before
 618 * there's a page structure.  This implies:
 619 *  1. The only existing pagetable is the kernel's
 620 *  2. It is always pinned
 621 *  3. It has no user pagetable attached to it
 622 */
 623static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 624{
 625	preempt_disable();
 626
 627	xen_mc_batch();
 628
 629	__xen_set_pgd_hyper(ptr, val);
 630
 631	xen_mc_issue(PARAVIRT_LAZY_MMU);
 632
 633	preempt_enable();
 634}
 635
 636static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 637{
 638	pgd_t *user_ptr = xen_get_user_pgd(ptr);
 639
 640	trace_xen_mmu_set_pgd(ptr, user_ptr, val);
 641
 642	/* If page is not pinned, we can just update the entry
 643	   directly */
 644	if (!xen_page_pinned(ptr)) {
 645		*ptr = val;
 646		if (user_ptr) {
 647			WARN_ON(xen_page_pinned(user_ptr));
 648			*user_ptr = val;
 649		}
 650		return;
 651	}
 652
 653	/* If it's pinned, then we can at least batch the kernel and
 654	   user updates together. */
 655	xen_mc_batch();
 656
 657	__xen_set_pgd_hyper(ptr, val);
 658	if (user_ptr)
 659		__xen_set_pgd_hyper(user_ptr, val);
 660
 661	xen_mc_issue(PARAVIRT_LAZY_MMU);
 662}
 663#endif	/* PAGETABLE_LEVELS == 4 */
 664
 665/*
 666 * (Yet another) pagetable walker.  This one is intended for pinning a
 667 * pagetable.  This means that it walks a pagetable and calls the
 668 * callback function on each page it finds making up the page table,
 669 * at every level.  It walks the entire pagetable, but it only bothers
 670 * pinning pte pages which are below limit.  In the normal case this
 671 * will be STACK_TOP_MAX, but at boot we need to pin up to
 672 * FIXADDR_TOP.
 673 *
 674 * For 32-bit the important bit is that we don't pin beyond there,
 675 * because then we start getting into Xen's ptes.
 676 *
 677 * For 64-bit, we must skip the Xen hole in the middle of the address
 678 * space, just after the big x86-64 virtual hole.
 679 */
 680static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 681			  int (*func)(struct mm_struct *mm, struct page *,
 682				      enum pt_level),
 683			  unsigned long limit)
 684{
 685	int flush = 0;
 686	unsigned hole_low, hole_high;
 687	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
 688	unsigned pgdidx, pudidx, pmdidx;
 689
 690	/* The limit is the last byte to be touched */
 691	limit--;
 692	BUG_ON(limit >= FIXADDR_TOP);
 693
 694	if (xen_feature(XENFEAT_auto_translated_physmap))
 695		return 0;
 696
 697	/*
 698	 * 64-bit has a great big hole in the middle of the address
 699	 * space, which contains the Xen mappings.  On 32-bit these
 700	 * will end up making a zero-sized hole and so is a no-op.
 701	 */
 702	hole_low = pgd_index(USER_LIMIT);
 703	hole_high = pgd_index(PAGE_OFFSET);
 704
 705	pgdidx_limit = pgd_index(limit);
 706#if PTRS_PER_PUD > 1
 707	pudidx_limit = pud_index(limit);
 708#else
 709	pudidx_limit = 0;
 710#endif
 711#if PTRS_PER_PMD > 1
 712	pmdidx_limit = pmd_index(limit);
 713#else
 714	pmdidx_limit = 0;
 715#endif
 716
 717	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 718		pud_t *pud;
 719
 720		if (pgdidx >= hole_low && pgdidx < hole_high)
 721			continue;
 722
 723		if (!pgd_val(pgd[pgdidx]))
 724			continue;
 725
 726		pud = pud_offset(&pgd[pgdidx], 0);
 727
 728		if (PTRS_PER_PUD > 1) /* not folded */
 729			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 730
 731		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 732			pmd_t *pmd;
 733
 734			if (pgdidx == pgdidx_limit &&
 735			    pudidx > pudidx_limit)
 736				goto out;
 737
 738			if (pud_none(pud[pudidx]))
 739				continue;
 740
 741			pmd = pmd_offset(&pud[pudidx], 0);
 742
 743			if (PTRS_PER_PMD > 1) /* not folded */
 744				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 745
 746			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
 747				struct page *pte;
 748
 749				if (pgdidx == pgdidx_limit &&
 750				    pudidx == pudidx_limit &&
 751				    pmdidx > pmdidx_limit)
 752					goto out;
 753
 754				if (pmd_none(pmd[pmdidx]))
 755					continue;
 756
 757				pte = pmd_page(pmd[pmdidx]);
 758				flush |= (*func)(mm, pte, PT_PTE);
 759			}
 760		}
 761	}
 762
 763out:
 764	/* Do the top level last, so that the callbacks can use it as
 765	   a cue to do final things like tlb flushes. */
 766	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 767
 768	return flush;
 769}
 770
 771static int xen_pgd_walk(struct mm_struct *mm,
 772			int (*func)(struct mm_struct *mm, struct page *,
 773				    enum pt_level),
 774			unsigned long limit)
 775{
 776	return __xen_pgd_walk(mm, mm->pgd, func, limit);
 777}
 778
 779/* If we're using split pte locks, then take the page's lock and
 780   return a pointer to it.  Otherwise return NULL. */
 781static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 782{
 783	spinlock_t *ptl = NULL;
 784
 785#if USE_SPLIT_PTLOCKS
 786	ptl = __pte_lockptr(page);
 787	spin_lock_nest_lock(ptl, &mm->page_table_lock);
 788#endif
 789
 790	return ptl;
 791}
 792
 793static void xen_pte_unlock(void *v)
 794{
 795	spinlock_t *ptl = v;
 796	spin_unlock(ptl);
 797}
 798
 799static void xen_do_pin(unsigned level, unsigned long pfn)
 800{
 801	struct mmuext_op op;
 802
 803	op.cmd = level;
 804	op.arg1.mfn = pfn_to_mfn(pfn);
 805
 806	xen_extend_mmuext_op(&op);
 807}
 808
 809static int xen_pin_page(struct mm_struct *mm, struct page *page,
 810			enum pt_level level)
 811{
 812	unsigned pgfl = TestSetPagePinned(page);
 813	int flush;
 814
 815	if (pgfl)
 816		flush = 0;		/* already pinned */
 817	else if (PageHighMem(page))
 818		/* kmaps need flushing if we found an unpinned
 819		   highpage */
 820		flush = 1;
 821	else {
 822		void *pt = lowmem_page_address(page);
 823		unsigned long pfn = page_to_pfn(page);
 824		struct multicall_space mcs = __xen_mc_entry(0);
 825		spinlock_t *ptl;
 826
 827		flush = 0;
 828
 829		/*
 830		 * We need to hold the pagetable lock between the time
 831		 * we make the pagetable RO and when we actually pin
 832		 * it.  If we don't, then other users may come in and
 833		 * attempt to update the pagetable by writing it,
 834		 * which will fail because the memory is RO but not
 835		 * pinned, so Xen won't do the trap'n'emulate.
 836		 *
 837		 * If we're using split pte locks, we can't hold the
 838		 * entire pagetable's worth of locks during the
 839		 * traverse, because we may wrap the preempt count (8
 840		 * bits).  The solution is to mark RO and pin each PTE
 841		 * page while holding the lock.  This means the number
 842		 * of locks we end up holding is never more than a
 843		 * batch size (~32 entries, at present).
 844		 *
 845		 * If we're not using split pte locks, we needn't pin
 846		 * the PTE pages independently, because we're
 847		 * protected by the overall pagetable lock.
 848		 */
 849		ptl = NULL;
 850		if (level == PT_PTE)
 851			ptl = xen_pte_lock(page, mm);
 852
 853		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 854					pfn_pte(pfn, PAGE_KERNEL_RO),
 855					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 856
 857		if (ptl) {
 858			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 859
 860			/* Queue a deferred unlock for when this batch
 861			   is completed. */
 862			xen_mc_callback(xen_pte_unlock, ptl);
 863		}
 864	}
 865
 866	return flush;
 867}
 868
 869/* This is called just after a mm has been created, but it has not
 870   been used yet.  We need to make sure that its pagetable is all
 871   read-only, and can be pinned. */
 872static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 873{
 874	trace_xen_mmu_pgd_pin(mm, pgd);
 875
 876	xen_mc_batch();
 877
 878	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
 879		/* re-enable interrupts for flushing */
 880		xen_mc_issue(0);
 881
 882		kmap_flush_unused();
 883
 884		xen_mc_batch();
 885	}
 886
 887#ifdef CONFIG_X86_64
 888	{
 889		pgd_t *user_pgd = xen_get_user_pgd(pgd);
 890
 891		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 892
 893		if (user_pgd) {
 894			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
 895			xen_do_pin(MMUEXT_PIN_L4_TABLE,
 896				   PFN_DOWN(__pa(user_pgd)));
 897		}
 898	}
 899#else /* CONFIG_X86_32 */
 900#ifdef CONFIG_X86_PAE
 901	/* Need to make sure unshared kernel PMD is pinnable */
 902	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 903		     PT_PMD);
 904#endif
 905	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 906#endif /* CONFIG_X86_64 */
 907	xen_mc_issue(0);
 908}
 909
 910static void xen_pgd_pin(struct mm_struct *mm)
 911{
 912	__xen_pgd_pin(mm, mm->pgd);
 913}
 914
 915/*
 916 * On save, we need to pin all pagetables to make sure they get their
 917 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 918 * them (unpinned pgds are not currently in use, probably because the
 919 * process is under construction or destruction).
 920 *
 921 * Expected to be called in stop_machine() ("equivalent to taking
 922 * every spinlock in the system"), so the locking doesn't really
 923 * matter all that much.
 924 */
 925void xen_mm_pin_all(void)
 926{
 927	struct page *page;
 928
 929	spin_lock(&pgd_lock);
 930
 931	list_for_each_entry(page, &pgd_list, lru) {
 932		if (!PagePinned(page)) {
 933			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
 934			SetPageSavePinned(page);
 935		}
 936	}
 937
 938	spin_unlock(&pgd_lock);
 939}
 940
 941/*
 942 * The init_mm pagetable is really pinned as soon as its created, but
 943 * that's before we have page structures to store the bits.  So do all
 944 * the book-keeping now.
 945 */
 946static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
 947				  enum pt_level level)
 948{
 949	SetPagePinned(page);
 950	return 0;
 951}
 952
 953static void __init xen_mark_init_mm_pinned(void)
 954{
 955	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 956}
 957
 958static int xen_unpin_page(struct mm_struct *mm, struct page *page,
 959			  enum pt_level level)
 960{
 961	unsigned pgfl = TestClearPagePinned(page);
 962
 963	if (pgfl && !PageHighMem(page)) {
 964		void *pt = lowmem_page_address(page);
 965		unsigned long pfn = page_to_pfn(page);
 966		spinlock_t *ptl = NULL;
 967		struct multicall_space mcs;
 968
 969		/*
 970		 * Do the converse to pin_page.  If we're using split
 971		 * pte locks, we must be holding the lock for while
 972		 * the pte page is unpinned but still RO to prevent
 973		 * concurrent updates from seeing it in this
 974		 * partially-pinned state.
 975		 */
 976		if (level == PT_PTE) {
 977			ptl = xen_pte_lock(page, mm);
 978
 979			if (ptl)
 980				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
 981		}
 982
 983		mcs = __xen_mc_entry(0);
 984
 985		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 986					pfn_pte(pfn, PAGE_KERNEL),
 987					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 988
 989		if (ptl) {
 990			/* unlock when batch completed */
 991			xen_mc_callback(xen_pte_unlock, ptl);
 992		}
 993	}
 994
 995	return 0;		/* never need to flush on unpin */
 996}
 997
 998/* Release a pagetables pages back as normal RW */
 999static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1000{
1001	trace_xen_mmu_pgd_unpin(mm, pgd);
1002
1003	xen_mc_batch();
1004
1005	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1006
1007#ifdef CONFIG_X86_64
1008	{
1009		pgd_t *user_pgd = xen_get_user_pgd(pgd);
1010
1011		if (user_pgd) {
1012			xen_do_pin(MMUEXT_UNPIN_TABLE,
1013				   PFN_DOWN(__pa(user_pgd)));
1014			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1015		}
1016	}
1017#endif
1018
1019#ifdef CONFIG_X86_PAE
1020	/* Need to make sure unshared kernel PMD is unpinned */
1021	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1022		       PT_PMD);
1023#endif
1024
1025	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1026
1027	xen_mc_issue(0);
1028}
1029
1030static void xen_pgd_unpin(struct mm_struct *mm)
1031{
1032	__xen_pgd_unpin(mm, mm->pgd);
1033}
1034
1035/*
1036 * On resume, undo any pinning done at save, so that the rest of the
1037 * kernel doesn't see any unexpected pinned pagetables.
1038 */
1039void xen_mm_unpin_all(void)
1040{
1041	struct page *page;
1042
1043	spin_lock(&pgd_lock);
1044
1045	list_for_each_entry(page, &pgd_list, lru) {
1046		if (PageSavePinned(page)) {
1047			BUG_ON(!PagePinned(page));
1048			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1049			ClearPageSavePinned(page);
1050		}
1051	}
1052
1053	spin_unlock(&pgd_lock);
1054}
1055
1056static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1057{
1058	spin_lock(&next->page_table_lock);
1059	xen_pgd_pin(next);
1060	spin_unlock(&next->page_table_lock);
1061}
1062
1063static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1064{
1065	spin_lock(&mm->page_table_lock);
1066	xen_pgd_pin(mm);
1067	spin_unlock(&mm->page_table_lock);
1068}
1069
1070
1071#ifdef CONFIG_SMP
1072/* Another cpu may still have their %cr3 pointing at the pagetable, so
1073   we need to repoint it somewhere else before we can unpin it. */
1074static void drop_other_mm_ref(void *info)
1075{
1076	struct mm_struct *mm = info;
1077	struct mm_struct *active_mm;
1078
1079	active_mm = this_cpu_read(cpu_tlbstate.active_mm);
1080
1081	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1082		leave_mm(smp_processor_id());
1083
1084	/* If this cpu still has a stale cr3 reference, then make sure
1085	   it has been flushed. */
1086	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1087		load_cr3(swapper_pg_dir);
1088}
1089
1090static void xen_drop_mm_ref(struct mm_struct *mm)
1091{
1092	cpumask_var_t mask;
1093	unsigned cpu;
1094
1095	if (current->active_mm == mm) {
1096		if (current->mm == mm)
1097			load_cr3(swapper_pg_dir);
1098		else
1099			leave_mm(smp_processor_id());
1100	}
1101
1102	/* Get the "official" set of cpus referring to our pagetable. */
1103	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1104		for_each_online_cpu(cpu) {
1105			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1106			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1107				continue;
1108			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1109		}
1110		return;
1111	}
1112	cpumask_copy(mask, mm_cpumask(mm));
1113
1114	/* It's possible that a vcpu may have a stale reference to our
1115	   cr3, because its in lazy mode, and it hasn't yet flushed
1116	   its set of pending hypercalls yet.  In this case, we can
1117	   look at its actual current cr3 value, and force it to flush
1118	   if needed. */
1119	for_each_online_cpu(cpu) {
1120		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1121			cpumask_set_cpu(cpu, mask);
1122	}
1123
1124	if (!cpumask_empty(mask))
1125		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1126	free_cpumask_var(mask);
1127}
1128#else
1129static void xen_drop_mm_ref(struct mm_struct *mm)
1130{
1131	if (current->active_mm == mm)
1132		load_cr3(swapper_pg_dir);
1133}
1134#endif
1135
1136/*
1137 * While a process runs, Xen pins its pagetables, which means that the
1138 * hypervisor forces it to be read-only, and it controls all updates
1139 * to it.  This means that all pagetable updates have to go via the
1140 * hypervisor, which is moderately expensive.
1141 *
1142 * Since we're pulling the pagetable down, we switch to use init_mm,
1143 * unpin old process pagetable and mark it all read-write, which
1144 * allows further operations on it to be simple memory accesses.
1145 *
1146 * The only subtle point is that another CPU may be still using the
1147 * pagetable because of lazy tlb flushing.  This means we need need to
1148 * switch all CPUs off this pagetable before we can unpin it.
1149 */
1150static void xen_exit_mmap(struct mm_struct *mm)
1151{
1152	get_cpu();		/* make sure we don't move around */
1153	xen_drop_mm_ref(mm);
1154	put_cpu();
1155
1156	spin_lock(&mm->page_table_lock);
1157
1158	/* pgd may not be pinned in the error exit path of execve */
1159	if (xen_page_pinned(mm->pgd))
1160		xen_pgd_unpin(mm);
1161
1162	spin_unlock(&mm->page_table_lock);
1163}
1164
1165static void __init xen_pagetable_setup_start(pgd_t *base)
 
 
1166{
 
 
 
 
 
 
1167}
1168
1169static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
 
 
1170{
1171	/* reserve the range used */
1172	native_pagetable_reserve(start, end);
1173
1174	/* set as RW the rest */
1175	printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1176			PFN_PHYS(pgt_buf_top));
1177	while (end < PFN_PHYS(pgt_buf_top)) {
1178		make_lowmem_page_readwrite(__va(end));
1179		end += PAGE_SIZE;
 
 
1180	}
 
 
 
1181}
1182
1183static void xen_post_allocator_init(void);
 
 
 
 
 
 
 
 
 
 
 
 
1184
1185static void __init xen_pagetable_setup_done(pgd_t *base)
1186{
1187	xen_setup_shared_info();
1188	xen_post_allocator_init();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1189}
 
 
 
 
 
 
 
 
1190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1191static void xen_write_cr2(unsigned long cr2)
1192{
1193	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1194}
1195
1196static unsigned long xen_read_cr2(void)
1197{
1198	return this_cpu_read(xen_vcpu)->arch.cr2;
1199}
1200
1201unsigned long xen_read_cr2_direct(void)
1202{
1203	return this_cpu_read(xen_vcpu_info.arch.cr2);
1204}
1205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1206static void xen_flush_tlb(void)
1207{
1208	struct mmuext_op *op;
1209	struct multicall_space mcs;
1210
1211	trace_xen_mmu_flush_tlb(0);
1212
1213	preempt_disable();
1214
1215	mcs = xen_mc_entry(sizeof(*op));
1216
1217	op = mcs.args;
1218	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1219	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1220
1221	xen_mc_issue(PARAVIRT_LAZY_MMU);
1222
1223	preempt_enable();
1224}
1225
1226static void xen_flush_tlb_single(unsigned long addr)
1227{
1228	struct mmuext_op *op;
1229	struct multicall_space mcs;
1230
1231	trace_xen_mmu_flush_tlb_single(addr);
1232
1233	preempt_disable();
1234
1235	mcs = xen_mc_entry(sizeof(*op));
1236	op = mcs.args;
1237	op->cmd = MMUEXT_INVLPG_LOCAL;
1238	op->arg1.linear_addr = addr & PAGE_MASK;
1239	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1240
1241	xen_mc_issue(PARAVIRT_LAZY_MMU);
1242
1243	preempt_enable();
1244}
1245
1246static void xen_flush_tlb_others(const struct cpumask *cpus,
1247				 struct mm_struct *mm, unsigned long va)
 
1248{
1249	struct {
1250		struct mmuext_op op;
1251#ifdef CONFIG_SMP
1252		DECLARE_BITMAP(mask, num_processors);
1253#else
1254		DECLARE_BITMAP(mask, NR_CPUS);
1255#endif
1256	} *args;
1257	struct multicall_space mcs;
1258
1259	trace_xen_mmu_flush_tlb_others(cpus, mm, va);
1260
1261	if (cpumask_empty(cpus))
1262		return;		/* nothing to do */
1263
1264	mcs = xen_mc_entry(sizeof(*args));
1265	args = mcs.args;
1266	args->op.arg2.vcpumask = to_cpumask(args->mask);
1267
1268	/* Remove us, and any offline CPUS. */
1269	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1270	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1271
1272	if (va == TLB_FLUSH_ALL) {
1273		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1274	} else {
1275		args->op.cmd = MMUEXT_INVLPG_MULTI;
1276		args->op.arg1.linear_addr = va;
1277	}
1278
1279	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1280
1281	xen_mc_issue(PARAVIRT_LAZY_MMU);
1282}
1283
1284static unsigned long xen_read_cr3(void)
1285{
1286	return this_cpu_read(xen_cr3);
1287}
1288
1289static void set_current_cr3(void *v)
1290{
1291	this_cpu_write(xen_current_cr3, (unsigned long)v);
1292}
1293
1294static void __xen_write_cr3(bool kernel, unsigned long cr3)
1295{
1296	struct mmuext_op op;
1297	unsigned long mfn;
1298
1299	trace_xen_mmu_write_cr3(kernel, cr3);
1300
1301	if (cr3)
1302		mfn = pfn_to_mfn(PFN_DOWN(cr3));
1303	else
1304		mfn = 0;
1305
1306	WARN_ON(mfn == 0 && kernel);
1307
1308	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1309	op.arg1.mfn = mfn;
1310
1311	xen_extend_mmuext_op(&op);
1312
1313	if (kernel) {
1314		this_cpu_write(xen_cr3, cr3);
1315
1316		/* Update xen_current_cr3 once the batch has actually
1317		   been submitted. */
1318		xen_mc_callback(set_current_cr3, (void *)cr3);
1319	}
1320}
1321
1322static void xen_write_cr3(unsigned long cr3)
1323{
1324	BUG_ON(preemptible());
1325
1326	xen_mc_batch();  /* disables interrupts */
1327
1328	/* Update while interrupts are disabled, so its atomic with
1329	   respect to ipis */
1330	this_cpu_write(xen_cr3, cr3);
1331
1332	__xen_write_cr3(true, cr3);
1333
1334#ifdef CONFIG_X86_64
1335	{
1336		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1337		if (user_pgd)
1338			__xen_write_cr3(false, __pa(user_pgd));
1339		else
1340			__xen_write_cr3(false, 0);
1341	}
1342#endif
1343
1344	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1345}
1346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1347static int xen_pgd_alloc(struct mm_struct *mm)
1348{
1349	pgd_t *pgd = mm->pgd;
1350	int ret = 0;
1351
1352	BUG_ON(PagePinned(virt_to_page(pgd)));
1353
1354#ifdef CONFIG_X86_64
1355	{
1356		struct page *page = virt_to_page(pgd);
1357		pgd_t *user_pgd;
1358
1359		BUG_ON(page->private != 0);
1360
1361		ret = -ENOMEM;
1362
1363		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1364		page->private = (unsigned long)user_pgd;
1365
1366		if (user_pgd != NULL) {
1367			user_pgd[pgd_index(VSYSCALL_START)] =
 
1368				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
 
1369			ret = 0;
1370		}
1371
1372		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1373	}
1374#endif
1375
1376	return ret;
1377}
1378
1379static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1380{
1381#ifdef CONFIG_X86_64
1382	pgd_t *user_pgd = xen_get_user_pgd(pgd);
1383
1384	if (user_pgd)
1385		free_page((unsigned long)user_pgd);
1386#endif
1387}
1388
1389#ifdef CONFIG_X86_32
1390static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1391{
1392	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
1393	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1394		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1395			       pte_val_ma(pte));
1396
1397	return pte;
1398}
1399#else /* CONFIG_X86_64 */
1400static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1401{
1402	unsigned long pfn = pte_pfn(pte);
 
 
 
 
 
1403
1404	/*
1405	 * If the new pfn is within the range of the newly allocated
1406	 * kernel pagetable, and it isn't being mapped into an
1407	 * early_ioremap fixmap slot as a freshly allocated page, make sure
1408	 * it is RO.
1409	 */
1410	if (((!is_early_ioremap_ptep(ptep) &&
1411			pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
1412			(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1413		pte = pte_wrprotect(pte);
1414
1415	return pte;
1416}
1417#endif /* CONFIG_X86_64 */
1418
1419/* Init-time set_pte while constructing initial pagetables, which
1420   doesn't allow RO pagetable pages to be remapped RW */
 
 
 
 
 
 
 
 
 
 
 
 
1421static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1422{
1423	pte = mask_rw_pte(ptep, pte);
1424
1425	xen_set_pte(ptep, pte);
1426}
1427
1428static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1429{
1430	struct mmuext_op op;
1431	op.cmd = cmd;
1432	op.arg1.mfn = pfn_to_mfn(pfn);
1433	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1434		BUG();
1435}
1436
1437/* Early in boot, while setting up the initial pagetable, assume
1438   everything is pinned. */
1439static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1440{
1441#ifdef CONFIG_FLATMEM
1442	BUG_ON(mem_map);	/* should only be used early */
1443#endif
1444	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1445	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1446}
1447
1448/* Used for pmd and pud */
1449static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1450{
1451#ifdef CONFIG_FLATMEM
1452	BUG_ON(mem_map);	/* should only be used early */
1453#endif
1454	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1455}
1456
1457/* Early release_pte assumes that all pts are pinned, since there's
1458   only init_mm and anything attached to that is pinned. */
1459static void __init xen_release_pte_init(unsigned long pfn)
1460{
1461	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1462	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1463}
1464
1465static void __init xen_release_pmd_init(unsigned long pfn)
1466{
1467	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1468}
1469
1470static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1471{
1472	struct multicall_space mcs;
1473	struct mmuext_op *op;
1474
1475	mcs = __xen_mc_entry(sizeof(*op));
1476	op = mcs.args;
1477	op->cmd = cmd;
1478	op->arg1.mfn = pfn_to_mfn(pfn);
1479
1480	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1481}
1482
1483static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1484{
1485	struct multicall_space mcs;
1486	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1487
1488	mcs = __xen_mc_entry(0);
1489	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1490				pfn_pte(pfn, prot), 0);
1491}
1492
1493/* This needs to make sure the new pte page is pinned iff its being
1494   attached to a pinned pagetable. */
1495static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1496				    unsigned level)
1497{
1498	bool pinned = PagePinned(virt_to_page(mm->pgd));
1499
1500	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1501
1502	if (pinned) {
1503		struct page *page = pfn_to_page(pfn);
1504
1505		SetPagePinned(page);
1506
1507		if (!PageHighMem(page)) {
1508			xen_mc_batch();
1509
1510			__set_pfn_prot(pfn, PAGE_KERNEL_RO);
1511
1512			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1513				__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1514
1515			xen_mc_issue(PARAVIRT_LAZY_MMU);
1516		} else {
1517			/* make sure there are no stray mappings of
1518			   this page */
1519			kmap_flush_unused();
1520		}
1521	}
1522}
1523
1524static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1525{
1526	xen_alloc_ptpage(mm, pfn, PT_PTE);
1527}
1528
1529static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1530{
1531	xen_alloc_ptpage(mm, pfn, PT_PMD);
1532}
1533
1534/* This should never happen until we're OK to use struct page */
1535static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1536{
1537	struct page *page = pfn_to_page(pfn);
1538	bool pinned = PagePinned(page);
1539
1540	trace_xen_mmu_release_ptpage(pfn, level, pinned);
1541
1542	if (pinned) {
1543		if (!PageHighMem(page)) {
1544			xen_mc_batch();
1545
1546			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1547				__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1548
1549			__set_pfn_prot(pfn, PAGE_KERNEL);
1550
1551			xen_mc_issue(PARAVIRT_LAZY_MMU);
1552		}
1553		ClearPagePinned(page);
1554	}
1555}
1556
1557static void xen_release_pte(unsigned long pfn)
1558{
1559	xen_release_ptpage(pfn, PT_PTE);
1560}
1561
1562static void xen_release_pmd(unsigned long pfn)
1563{
1564	xen_release_ptpage(pfn, PT_PMD);
1565}
1566
#if PAGETABLE_LEVELS == 4
/* alloc_pud hook: a new page at the PT_PUD level. */
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
	const unsigned level = PT_PUD;

	xen_alloc_ptpage(mm, pfn, level);
}

/* release_pud hook: release a page at the PT_PUD level. */
static void xen_release_pud(unsigned long pfn)
{
	const unsigned level = PT_PUD;

	xen_release_ptpage(pfn, level);
}
#endif
1578
1579void __init xen_reserve_top(void)
1580{
1581#ifdef CONFIG_X86_32
1582	unsigned long top = HYPERVISOR_VIRT_START;
1583	struct xen_platform_parameters pp;
1584
1585	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1586		top = pp.virt_start;
1587
1588	reserve_top_address(-top);
1589#endif	/* CONFIG_X86_32 */
1590}
1591
1592/*
1593 * Like __va(), but returns address in the kernel mapping (which is
1594 * all we have until the physical memory mapping has been set up.
1595 */
1596static void *__ka(phys_addr_t paddr)
1597{
1598#ifdef CONFIG_X86_64
1599	return (void *)(paddr + __START_KERNEL_map);
1600#else
1601	return __va(paddr);
1602#endif
1603}
1604
1605/* Convert a machine address to physical address */
1606static unsigned long m2p(phys_addr_t maddr)
1607{
1608	phys_addr_t paddr;
1609
1610	maddr &= PTE_PFN_MASK;
1611	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1612
1613	return paddr;
1614}
1615
1616/* Convert a machine address to kernel virtual */
1617static void *m2v(phys_addr_t maddr)
1618{
1619	return __ka(m2p(maddr));
1620}
1621
1622/* Set the page permissions on an identity-mapped pages */
1623static void set_page_prot(void *addr, pgprot_t prot)
 
1624{
1625	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1626	pte_t pte = pfn_pte(pfn, prot);
1627
1628	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
 
 
 
 
1629		BUG();
1630}
1631
 
 
 
 
1632static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1633{
1634	unsigned pmdidx, pteidx;
1635	unsigned ident_pte;
1636	unsigned long pfn;
1637
1638	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1639				      PAGE_SIZE);
1640
1641	ident_pte = 0;
1642	pfn = 0;
1643	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1644		pte_t *pte_page;
1645
1646		/* Reuse or allocate a page of ptes */
1647		if (pmd_present(pmd[pmdidx]))
1648			pte_page = m2v(pmd[pmdidx].pmd);
1649		else {
1650			/* Check for free pte pages */
1651			if (ident_pte == LEVEL1_IDENT_ENTRIES)
1652				break;
1653
1654			pte_page = &level1_ident_pgt[ident_pte];
1655			ident_pte += PTRS_PER_PTE;
1656
1657			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1658		}
1659
1660		/* Install mappings */
1661		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1662			pte_t pte;
1663
1664#ifdef CONFIG_X86_32
1665			if (pfn > max_pfn_mapped)
1666				max_pfn_mapped = pfn;
1667#endif
1668
1669			if (!pte_none(pte_page[pteidx]))
1670				continue;
1671
1672			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1673			pte_page[pteidx] = pte;
1674		}
1675	}
1676
1677	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1678		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1679
1680	set_page_prot(pmd, PAGE_KERNEL_RO);
1681}
1682
1683void __init xen_setup_machphys_mapping(void)
1684{
1685	struct xen_machphys_mapping mapping;
1686
1687	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1688		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1689		machine_to_phys_nr = mapping.max_mfn + 1;
1690	} else {
1691		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1692	}
1693#ifdef CONFIG_X86_32
1694	WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1695		< machine_to_phys_mapping);
1696#endif
1697}
1698
1699#ifdef CONFIG_X86_64
1700static void convert_pfn_mfn(void *v)
1701{
1702	pte_t *pte = v;
1703	int i;
1704
1705	/* All levels are converted the same way, so just treat them
1706	   as ptes. */
1707	for (i = 0; i < PTRS_PER_PTE; i++)
1708		pte[i] = xen_make_pte(pte[i].pte);
1709}
1710
 
 
 
 
 
 
 
 
 
 
 
 
 
1711/*
1712 * Set up the initial kernel pagetable.
1713 *
1714 * We can construct this by grafting the Xen provided pagetable into
1715 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1716 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
1717 * means that only the kernel has a physical mapping to start with -
1718 * but that's enough to get __va working.  We need to fill in the rest
1719 * of the physical mapping once some sort of allocator has been set
1720 * up.
1721 */
1722pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1723					 unsigned long max_pfn)
1724{
1725	pud_t *l3;
1726	pmd_t *l2;
 
 
 
1727
1728	/* max_pfn_mapped is the last pfn mapped in the initial memory
1729	 * mappings. Considering that on Xen after the kernel mappings we
1730	 * have the mappings of some pages that don't exist in pfn space, we
1731	 * set max_pfn_mapped to the last real pfn mapped. */
1732	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
 
 
 
 
 
 
1733
1734	/* Zap identity mapping */
1735	init_level4_pgt[0] = __pgd(0);
1736
1737	/* Pre-constructed entries are in pfn, so convert to mfn */
1738	convert_pfn_mfn(init_level4_pgt);
1739	convert_pfn_mfn(level3_ident_pgt);
1740	convert_pfn_mfn(level3_kernel_pgt);
 
 
 
 
 
 
 
1741
 
 
 
 
1742	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1743	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1744
1745	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1746	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1747
1748	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1749	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1750	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1751
1752	/* Set up identity map */
1753	xen_map_identity_early(level2_ident_pgt, max_pfn);
1754
1755	/* Make pagetable pieces RO */
1756	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1757	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1758	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1759	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1760	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1761	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1762
1763	/* Pin down new L4 */
1764	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1765			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
1766
1767	/* Unpin Xen-provided one */
1768	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1769
1770	/* Switch over */
1771	pgd = init_level4_pgt;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1772
1773	/*
1774	 * At this stage there can be no user pgd, and no page
1775	 * structure to attach it to, so make sure we just set kernel
1776	 * pgd.
 
 
 
1777	 */
1778	xen_mc_batch();
1779	__xen_write_cr3(true, __pa(pgd));
1780	xen_mc_issue(PARAVIRT_LAZY_CPU);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1781
1782	memblock_reserve(__pa(xen_start_info->pt_base),
1783			 xen_start_info->nr_pt_frames * PAGE_SIZE);
 
 
 
 
 
 
 
1784
1785	return pgd;
 
 
1786}
 
1787#else	/* !CONFIG_X86_64 */
1788static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1789static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1790
1791static void __init xen_write_cr3_init(unsigned long cr3)
1792{
1793	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1794
1795	BUG_ON(read_cr3() != __pa(initial_page_table));
1796	BUG_ON(cr3 != __pa(swapper_pg_dir));
1797
1798	/*
1799	 * We are switching to swapper_pg_dir for the first time (from
1800	 * initial_page_table) and therefore need to mark that page
1801	 * read-only and then pin it.
1802	 *
1803	 * Xen disallows sharing of kernel PMDs for PAE
1804	 * guests. Therefore we must copy the kernel PMD from
1805	 * initial_page_table into a new kernel PMD to be used in
1806	 * swapper_pg_dir.
1807	 */
1808	swapper_kernel_pmd =
1809		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1810	memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1811	       sizeof(pmd_t) * PTRS_PER_PMD);
1812	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1813		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1814	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1815
1816	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1817	xen_write_cr3(cr3);
1818	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1819
1820	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1821			  PFN_DOWN(__pa(initial_page_table)));
1822	set_page_prot(initial_page_table, PAGE_KERNEL);
1823	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1824
1825	pv_mmu_ops.write_cr3 = &xen_write_cr3;
1826}
1827
1828pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1829					 unsigned long max_pfn)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1830{
1831	pmd_t *kernel_pmd;
1832
 
 
 
 
 
1833	initial_kernel_pmd =
1834		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1835
1836	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1837				  xen_start_info->nr_pt_frames * PAGE_SIZE +
1838				  512*1024);
1839
1840	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1841	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1842
1843	xen_map_identity_early(initial_kernel_pmd, max_pfn);
1844
1845	memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1846	initial_page_table[KERNEL_PGD_BOUNDARY] =
1847		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1848
1849	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1850	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1851	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1852
1853	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1854
1855	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1856			  PFN_DOWN(__pa(initial_page_table)));
1857	xen_write_cr3(__pa(initial_page_table));
1858
1859	memblock_reserve(__pa(xen_start_info->pt_base),
1860			 xen_start_info->nr_pt_frames * PAGE_SIZE);
1861
1862	return initial_page_table;
1863}
1864#endif	/* CONFIG_X86_64 */
1865
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1866static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1867
1868static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1869{
1870	pte_t pte;
1871
1872	phys >>= PAGE_SHIFT;
1873
1874	switch (idx) {
1875	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1876#ifdef CONFIG_X86_F00F_BUG
1877	case FIX_F00F_IDT:
1878#endif
1879#ifdef CONFIG_X86_32
1880	case FIX_WP_TEST:
1881	case FIX_VDSO:
1882# ifdef CONFIG_HIGHMEM
1883	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1884# endif
1885#else
1886	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1887	case VVAR_PAGE:
1888#endif
1889	case FIX_TEXT_POKE0:
1890	case FIX_TEXT_POKE1:
1891		/* All local page mappings */
1892		pte = pfn_pte(phys, prot);
1893		break;
1894
1895#ifdef CONFIG_X86_LOCAL_APIC
1896	case FIX_APIC_BASE:	/* maps dummy local APIC */
1897		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1898		break;
1899#endif
1900
1901#ifdef CONFIG_X86_IO_APIC
1902	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1903		/*
1904		 * We just don't map the IO APIC - all access is via
1905		 * hypercalls.  Keep the address in the pte for reference.
1906		 */
1907		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1908		break;
1909#endif
1910
1911	case FIX_PARAVIRT_BOOTMAP:
1912		/* This is an MFN, but it isn't an IO mapping from the
1913		   IO domain */
1914		pte = mfn_pte(phys, prot);
1915		break;
1916
1917	default:
1918		/* By default, set_fixmap is used for hardware mappings */
1919		pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1920		break;
1921	}
1922
1923	__native_set_fixmap(idx, pte);
1924
1925#ifdef CONFIG_X86_64
1926	/* Replicate changes to map the vsyscall page into the user
1927	   pagetable vsyscall mapping. */
1928	if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) ||
1929	    idx == VVAR_PAGE) {
1930		unsigned long vaddr = __fix_to_virt(idx);
1931		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1932	}
1933#endif
1934}
1935
1936static void __init xen_post_allocator_init(void)
1937{
 
 
 
1938	pv_mmu_ops.set_pte = xen_set_pte;
1939	pv_mmu_ops.set_pmd = xen_set_pmd;
1940	pv_mmu_ops.set_pud = xen_set_pud;
1941#if PAGETABLE_LEVELS == 4
1942	pv_mmu_ops.set_pgd = xen_set_pgd;
1943#endif
1944
1945	/* This will work as long as patching hasn't happened yet
1946	   (which it hasn't) */
1947	pv_mmu_ops.alloc_pte = xen_alloc_pte;
1948	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1949	pv_mmu_ops.release_pte = xen_release_pte;
1950	pv_mmu_ops.release_pmd = xen_release_pmd;
1951#if PAGETABLE_LEVELS == 4
1952	pv_mmu_ops.alloc_pud = xen_alloc_pud;
1953	pv_mmu_ops.release_pud = xen_release_pud;
1954#endif
1955
1956#ifdef CONFIG_X86_64
 
1957	SetPagePinned(virt_to_page(level3_user_vsyscall));
1958#endif
1959	xen_mark_init_mm_pinned();
1960}
1961
/*
 * lazy_mode.leave hook: flush the pending multicalls, then leave
 * lazy MMU mode, all with preemption disabled.
 */
static void xen_leave_lazy_mmu(void)
{
	preempt_disable();

	xen_mc_flush();
	paravirt_leave_lazy_mmu();

	preempt_enable();
}
1969
1970static const struct pv_mmu_ops xen_mmu_ops __initconst = {
1971	.read_cr2 = xen_read_cr2,
1972	.write_cr2 = xen_write_cr2,
1973
1974	.read_cr3 = xen_read_cr3,
1975#ifdef CONFIG_X86_32
1976	.write_cr3 = xen_write_cr3_init,
1977#else
1978	.write_cr3 = xen_write_cr3,
1979#endif
1980
1981	.flush_tlb_user = xen_flush_tlb,
1982	.flush_tlb_kernel = xen_flush_tlb,
1983	.flush_tlb_single = xen_flush_tlb_single,
1984	.flush_tlb_others = xen_flush_tlb_others,
1985
1986	.pte_update = paravirt_nop,
1987	.pte_update_defer = paravirt_nop,
1988
1989	.pgd_alloc = xen_pgd_alloc,
1990	.pgd_free = xen_pgd_free,
1991
1992	.alloc_pte = xen_alloc_pte_init,
1993	.release_pte = xen_release_pte_init,
1994	.alloc_pmd = xen_alloc_pmd_init,
1995	.release_pmd = xen_release_pmd_init,
1996
1997	.set_pte = xen_set_pte_init,
1998	.set_pte_at = xen_set_pte_at,
1999	.set_pmd = xen_set_pmd_hyper,
2000
2001	.ptep_modify_prot_start = __ptep_modify_prot_start,
2002	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
2003
2004	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
2005	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2006
2007	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
2008	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2009
2010#ifdef CONFIG_X86_PAE
2011	.set_pte_atomic = xen_set_pte_atomic,
2012	.pte_clear = xen_pte_clear,
2013	.pmd_clear = xen_pmd_clear,
2014#endif	/* CONFIG_X86_PAE */
2015	.set_pud = xen_set_pud_hyper,
2016
2017	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2018	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2019
2020#if PAGETABLE_LEVELS == 4
2021	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
2022	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
2023	.set_pgd = xen_set_pgd_hyper,
2024
2025	.alloc_pud = xen_alloc_pmd_init,
2026	.release_pud = xen_release_pmd_init,
2027#endif	/* PAGETABLE_LEVELS == 4 */
2028
2029	.activate_mm = xen_activate_mm,
2030	.dup_mmap = xen_dup_mmap,
2031	.exit_mmap = xen_exit_mmap,
2032
2033	.lazy_mode = {
2034		.enter = paravirt_enter_lazy_mmu,
2035		.leave = xen_leave_lazy_mmu,
 
2036	},
2037
2038	.set_fixmap = xen_set_fixmap,
2039};
2040
2041void __init xen_init_mmu_ops(void)
2042{
2043	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
2044	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2045	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
 
 
2046	pv_mmu_ops = xen_mmu_ops;
2047
2048	memset(dummy_mapping, 0xff, PAGE_SIZE);
2049}
2050
2051/* Protected by xen_reservation_lock. */
2052#define MAX_CONTIG_ORDER 9 /* 2MB */
2053static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2054
2055#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2056static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2057				unsigned long *in_frames,
2058				unsigned long *out_frames)
2059{
2060	int i;
2061	struct multicall_space mcs;
2062
2063	xen_mc_batch();
2064	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2065		mcs = __xen_mc_entry(0);
2066
2067		if (in_frames)
2068			in_frames[i] = virt_to_mfn(vaddr);
2069
2070		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2071		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2072
2073		if (out_frames)
2074			out_frames[i] = virt_to_pfn(vaddr);
2075	}
2076	xen_mc_issue(0);
2077}
2078
2079/*
2080 * Update the pfn-to-mfn mappings for a virtual address range, either to
2081 * point to an array of mfns, or contiguously from a single starting
2082 * mfn.
2083 */
2084static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2085				     unsigned long *mfns,
2086				     unsigned long first_mfn)
2087{
2088	unsigned i, limit;
2089	unsigned long mfn;
2090
2091	xen_mc_batch();
2092
2093	limit = 1u << order;
2094	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2095		struct multicall_space mcs;
2096		unsigned flags;
2097
2098		mcs = __xen_mc_entry(0);
2099		if (mfns)
2100			mfn = mfns[i];
2101		else
2102			mfn = first_mfn + i;
2103
2104		if (i < (limit - 1))
2105			flags = 0;
2106		else {
2107			if (order == 0)
2108				flags = UVMF_INVLPG | UVMF_ALL;
2109			else
2110				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2111		}
2112
2113		MULTI_update_va_mapping(mcs.mc, vaddr,
2114				mfn_pte(mfn, PAGE_KERNEL), flags);
2115
2116		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2117	}
2118
2119	xen_mc_issue(0);
2120}
2121
2122/*
2123 * Perform the hypercall to exchange a region of our pfns to point to
2124 * memory with the required contiguous alignment.  Takes the pfns as
2125 * input, and populates mfns as output.
2126 *
2127 * Returns a success code indicating whether the hypervisor was able to
2128 * satisfy the request or not.
2129 */
2130static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2131			       unsigned long *pfns_in,
2132			       unsigned long extents_out,
2133			       unsigned int order_out,
2134			       unsigned long *mfns_out,
2135			       unsigned int address_bits)
2136{
2137	long rc;
2138	int success;
2139
2140	struct xen_memory_exchange exchange = {
2141		.in = {
2142			.nr_extents   = extents_in,
2143			.extent_order = order_in,
2144			.extent_start = pfns_in,
2145			.domid        = DOMID_SELF
2146		},
2147		.out = {
2148			.nr_extents   = extents_out,
2149			.extent_order = order_out,
2150			.extent_start = mfns_out,
2151			.address_bits = address_bits,
2152			.domid        = DOMID_SELF
2153		}
2154	};
2155
2156	BUG_ON(extents_in << order_in != extents_out << order_out);
2157
2158	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2159	success = (exchange.nr_exchanged == extents_in);
2160
2161	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2162	BUG_ON(success && (rc != 0));
2163
2164	return success;
2165}
2166
2167int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2168				 unsigned int address_bits)
 
2169{
2170	unsigned long *in_frames = discontig_frames, out_frame;
2171	unsigned long  flags;
2172	int            success;
 
2173
2174	/*
2175	 * Currently an auto-translated guest will not perform I/O, nor will
2176	 * it require PAE page directories below 4GB. Therefore any calls to
2177	 * this function are redundant and can be ignored.
2178	 */
2179
2180	if (xen_feature(XENFEAT_auto_translated_physmap))
2181		return 0;
2182
2183	if (unlikely(order > MAX_CONTIG_ORDER))
2184		return -ENOMEM;
2185
2186	memset((void *) vstart, 0, PAGE_SIZE << order);
2187
2188	spin_lock_irqsave(&xen_reservation_lock, flags);
2189
2190	/* 1. Zap current PTEs, remembering MFNs. */
2191	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2192
2193	/* 2. Get a new contiguous memory extent. */
2194	out_frame = virt_to_pfn(vstart);
2195	success = xen_exchange_memory(1UL << order, 0, in_frames,
2196				      1, order, &out_frame,
2197				      address_bits);
2198
2199	/* 3. Map the new extent in place of old pages. */
2200	if (success)
2201		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2202	else
2203		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2204
2205	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2206
 
2207	return success ? 0 : -ENOMEM;
2208}
2209EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2210
2211void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2212{
2213	unsigned long *out_frames = discontig_frames, in_frame;
2214	unsigned long  flags;
2215	int success;
 
2216
2217	if (xen_feature(XENFEAT_auto_translated_physmap))
2218		return;
2219
2220	if (unlikely(order > MAX_CONTIG_ORDER))
2221		return;
2222
 
2223	memset((void *) vstart, 0, PAGE_SIZE << order);
2224
2225	spin_lock_irqsave(&xen_reservation_lock, flags);
2226
2227	/* 1. Find start MFN of contiguous extent. */
2228	in_frame = virt_to_mfn(vstart);
2229
2230	/* 2. Zap current PTEs. */
2231	xen_zap_pfn_range(vstart, order, NULL, out_frames);
2232
2233	/* 3. Do the exchange for non-contiguous MFNs. */
2234	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2235					0, out_frames, 0);
2236
2237	/* 4. Map new pages in place of old pages. */
2238	if (success)
2239		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2240	else
2241		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2242
2243	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2244}
2245EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2246
2247#ifdef CONFIG_XEN_PVHVM
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2248static void xen_hvm_exit_mmap(struct mm_struct *mm)
2249{
2250	struct xen_hvm_pagetable_dying a;
2251	int rc;
2252
2253	a.domid = DOMID_SELF;
2254	a.gpa = __pa(mm->pgd);
2255	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2256	WARN_ON_ONCE(rc < 0);
2257}
2258
2259static int is_pagetable_dying_supported(void)
2260{
2261	struct xen_hvm_pagetable_dying a;
2262	int rc = 0;
2263
2264	a.domid = DOMID_SELF;
2265	a.gpa = 0x00;
2266	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2267	if (rc < 0) {
2268		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2269		return 0;
2270	}
2271	return 1;
2272}
2273
2274void __init xen_hvm_init_mmu_ops(void)
2275{
2276	if (is_pagetable_dying_supported())
2277		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
 
 
 
2278}
2279#endif
2280
2281#define REMAP_BATCH_SIZE 16
2282
2283struct remap_data {
2284	unsigned long mfn;
 
2285	pgprot_t prot;
2286	struct mmu_update *mmu_update;
2287};
2288
2289static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2290				 unsigned long addr, void *data)
2291{
2292	struct remap_data *rmd = data;
2293	pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
 
 
 
 
 
 
 
2294
2295	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2296	rmd->mmu_update->val = pte_val_ma(pte);
2297	rmd->mmu_update++;
2298
2299	return 0;
2300}
2301
2302int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2303			       unsigned long addr,
2304			       unsigned long mfn, int nr,
2305			       pgprot_t prot, unsigned domid)
 
 
2306{
 
2307	struct remap_data rmd;
2308	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2309	int batch;
2310	unsigned long range;
2311	int err = 0;
2312
2313	prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2314
2315	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2316				(VM_PFNMAP | VM_RESERVED | VM_IO)));
 
 
 
 
 
 
 
2317
2318	rmd.mfn = mfn;
2319	rmd.prot = prot;
 
 
 
2320
2321	while (nr) {
2322		batch = min(REMAP_BATCH_SIZE, nr);
 
 
 
2323		range = (unsigned long)batch << PAGE_SHIFT;
2324
2325		rmd.mmu_update = mmu_update;
2326		err = apply_to_page_range(vma->vm_mm, addr, range,
2327					  remap_area_mfn_pte_fn, &rmd);
2328		if (err)
2329			goto out;
2330
2331		err = -EFAULT;
2332		if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2333			goto out;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2334
2335		nr -= batch;
2336		addr += range;
 
 
 
2337	}
2338
2339	err = 0;
2340out:
2341
2342	flush_tlb_all();
 
 
 
 
 
 
 
 
 
 
 
 
 
2343
2344	return err;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2345}
2346EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
v4.6
   1/*
   2 * Xen mmu operations
   3 *
   4 * This file contains the various mmu fetch and update operations.
   5 * The most important job they must perform is the mapping between the
   6 * domain's pfn and the overall machine mfns.
   7 *
   8 * Xen allows guests to directly update the pagetable, in a controlled
   9 * fashion.  In other words, the guest modifies the same pagetable
  10 * that the CPU actually uses, which eliminates the overhead of having
  11 * a separate shadow pagetable.
  12 *
  13 * In order to allow this, it falls on the guest domain to map its
  14 * notion of a "physical" pfn - which is just a domain-local linear
  15 * address - into a real "machine address" which the CPU's MMU can
  16 * use.
  17 *
  18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19 * inserted directly into the pagetable.  When creating a new
  20 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22 * the mfn back into a pfn.
  23 *
  24 * The other constraint is that all pages which make up a pagetable
  25 * must be mapped read-only in the guest.  This prevents uncontrolled
  26 * guest updates to the pagetable.  Xen strictly enforces this, and
  27 * will disallow any pagetable update which will end up mapping a
  28 * pagetable page RW, and will disallow using any writable page as a
  29 * pagetable.
  30 *
  31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32 * would need to validate the whole pagetable before going on.
  33 * Naturally, this is quite slow.  The solution is to "pin" a
  34 * pagetable, which enforces all the constraints on the pagetable even
  35 * when it is not actively in use.  This means that Xen can be assured
  36 * that it is still valid when you do load it into %cr3, and doesn't
  37 * need to revalidate it.
  38 *
  39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40 */
  41#include <linux/sched.h>
  42#include <linux/highmem.h>
  43#include <linux/debugfs.h>
  44#include <linux/bug.h>
  45#include <linux/vmalloc.h>
  46#include <linux/module.h>
  47#include <linux/gfp.h>
  48#include <linux/memblock.h>
  49#include <linux/seq_file.h>
  50#include <linux/crash_dump.h>
  51
  52#include <trace/events/xen.h>
  53
  54#include <asm/pgtable.h>
  55#include <asm/tlbflush.h>
  56#include <asm/fixmap.h>
  57#include <asm/mmu_context.h>
  58#include <asm/setup.h>
  59#include <asm/paravirt.h>
  60#include <asm/e820.h>
  61#include <asm/linkage.h>
  62#include <asm/page.h>
  63#include <asm/init.h>
  64#include <asm/pat.h>
  65#include <asm/smp.h>
  66
  67#include <asm/xen/hypercall.h>
  68#include <asm/xen/hypervisor.h>
  69
  70#include <xen/xen.h>
  71#include <xen/page.h>
  72#include <xen/interface/xen.h>
  73#include <xen/interface/hvm/hvm_op.h>
  74#include <xen/interface/version.h>
  75#include <xen/interface/memory.h>
  76#include <xen/hvc-console.h>
  77
  78#include "multicalls.h"
  79#include "mmu.h"
  80#include "debugfs.h"
  81
  82/*
  83 * Protects atomic reservation decrease/increase against concurrent increases.
  84 * Also protects non-atomic updates of current_pages and balloon lists.
  85 */
  86DEFINE_SPINLOCK(xen_reservation_lock);
  87
  88#ifdef CONFIG_X86_32
  89/*
  90 * Identity map, in addition to plain kernel map.  This needs to be
  91 * large enough to allocate page table pages to allocate the rest.
  92 * Each page can map 2MB.
  93 */
  94#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
  95static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
  96#endif
  97#ifdef CONFIG_X86_64
  98/* l3 pud for userspace vsyscall mapping */
  99static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 100#endif /* CONFIG_X86_64 */
 101
 102/*
 103 * Note about cr3 (pagetable base) values:
 104 *
 105 * xen_cr3 contains the current logical cr3 value; it contains the
 106 * last set cr3.  This may not be the current effective cr3, because
 107 * its update may be being lazily deferred.  However, a vcpu looking
 108 * at its own cr3 can use this value knowing that everything will
 109 * be self-consistent.
 110 *
 111 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 112 * hypercall to set the vcpu cr3 is complete (so it may be a little
 113 * out of date, but it will never be set early).  If one vcpu is
 114 * looking at another vcpu's cr3 value, it should use this variable.
 115 */
 116DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
 117DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
 118
 119static phys_addr_t xen_pt_base, xen_pt_size __initdata;
 120
 121/*
 122 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 123 * redzone above it, so round it up to a PGD boundary.
 124 */
 125#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 126
 127unsigned long arbitrary_virt_to_mfn(void *vaddr)
 128{
 129	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 130
 131	return PFN_DOWN(maddr.maddr);
 132}
 133
 134xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 135{
 136	unsigned long address = (unsigned long)vaddr;
 137	unsigned int level;
 138	pte_t *pte;
 139	unsigned offset;
 140
 141	/*
 142	 * if the PFN is in the linear mapped vaddr range, we can just use
 143	 * the (quick) virt_to_machine() p2m lookup
 144	 */
 145	if (virt_addr_valid(vaddr))
 146		return virt_to_machine(vaddr);
 147
 148	/* otherwise we have to do a (slower) full page-table walk */
 149
 150	pte = lookup_address(address, &level);
 151	BUG_ON(pte == NULL);
 152	offset = address & ~PAGE_MASK;
 153	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 154}
 155EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
 156
 157void make_lowmem_page_readonly(void *vaddr)
 158{
 159	pte_t *pte, ptev;
 160	unsigned long address = (unsigned long)vaddr;
 161	unsigned int level;
 162
 163	pte = lookup_address(address, &level);
 164	if (pte == NULL)
 165		return;		/* vaddr missing */
 166
 167	ptev = pte_wrprotect(*pte);
 168
 169	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 170		BUG();
 171}
 172
 173void make_lowmem_page_readwrite(void *vaddr)
 174{
 175	pte_t *pte, ptev;
 176	unsigned long address = (unsigned long)vaddr;
 177	unsigned int level;
 178
 179	pte = lookup_address(address, &level);
 180	if (pte == NULL)
 181		return;		/* vaddr missing */
 182
 183	ptev = pte_mkwrite(*pte);
 184
 185	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 186		BUG();
 187}
 188
 189
 190static bool xen_page_pinned(void *ptr)
 191{
 192	struct page *page = virt_to_page(ptr);
 193
 194	return PagePinned(page);
 195}
 196
 197void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 198{
 199	struct multicall_space mcs;
 200	struct mmu_update *u;
 201
 202	trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
 203
 204	mcs = xen_mc_entry(sizeof(*u));
 205	u = mcs.args;
 206
 207	/* ptep might be kmapped when using 32-bit HIGHPTE */
 208	u->ptr = virt_to_machine(ptep).maddr;
 209	u->val = pte_val_ma(pteval);
 210
 211	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 212
 213	xen_mc_issue(PARAVIRT_LAZY_MMU);
 214}
 215EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 216
 217static void xen_extend_mmu_update(const struct mmu_update *update)
 218{
 219	struct multicall_space mcs;
 220	struct mmu_update *u;
 221
 222	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 223
 224	if (mcs.mc != NULL) {
 225		mcs.mc->args[1]++;
 226	} else {
 227		mcs = __xen_mc_entry(sizeof(*u));
 228		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 229	}
 230
 231	u = mcs.args;
 232	*u = *update;
 233}
 234
 235static void xen_extend_mmuext_op(const struct mmuext_op *op)
 236{
 237	struct multicall_space mcs;
 238	struct mmuext_op *u;
 239
 240	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
 241
 242	if (mcs.mc != NULL) {
 243		mcs.mc->args[1]++;
 244	} else {
 245		mcs = __xen_mc_entry(sizeof(*u));
 246		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 247	}
 248
 249	u = mcs.args;
 250	*u = *op;
 251}
 252
 253static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 254{
 255	struct mmu_update u;
 256
 257	preempt_disable();
 258
 259	xen_mc_batch();
 260
 261	/* ptr may be ioremapped for 64-bit pagetable setup */
 262	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 263	u.val = pmd_val_ma(val);
 264	xen_extend_mmu_update(&u);
 265
 266	xen_mc_issue(PARAVIRT_LAZY_MMU);
 267
 268	preempt_enable();
 269}
 270
 271static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 272{
 273	trace_xen_mmu_set_pmd(ptr, val);
 274
 275	/* If page is not pinned, we can just update the entry
 276	   directly */
 277	if (!xen_page_pinned(ptr)) {
 278		*ptr = val;
 279		return;
 280	}
 281
 282	xen_set_pmd_hyper(ptr, val);
 283}
 284
 285/*
 286 * Associate a virtual page frame with a given physical page frame
 287 * and protection flags for that frame.
 288 */
 289void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 290{
 291	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 292}
 293
 294static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 295{
 296	struct mmu_update u;
 297
 298	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
 299		return false;
 300
 301	xen_mc_batch();
 302
 303	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 304	u.val = pte_val_ma(pteval);
 305	xen_extend_mmu_update(&u);
 306
 307	xen_mc_issue(PARAVIRT_LAZY_MMU);
 308
 309	return true;
 310}
 311
 312static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 313{
 314	if (!xen_batched_set_pte(ptep, pteval)) {
 315		/*
 316		 * Could call native_set_pte() here and trap and
 317		 * emulate the PTE write but with 32-bit guests this
 318		 * needs two traps (one for each of the two 32-bit
 319		 * words in the PTE) so do one hypercall directly
 320		 * instead.
 321		 */
 322		struct mmu_update u;
 323
 324		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 325		u.val = pte_val_ma(pteval);
 326		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 327	}
 328}
 329
 330static void xen_set_pte(pte_t *ptep, pte_t pteval)
 331{
 332	trace_xen_mmu_set_pte(ptep, pteval);
 333	__xen_set_pte(ptep, pteval);
 334}
 335
 336static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 337		    pte_t *ptep, pte_t pteval)
 338{
 339	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
 340	__xen_set_pte(ptep, pteval);
 341}
 342
 343pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 344				 unsigned long addr, pte_t *ptep)
 345{
 346	/* Just return the pte as-is.  We preserve the bits on commit */
 347	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
 348	return *ptep;
 349}
 350
 351void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 352				 pte_t *ptep, pte_t pte)
 353{
 354	struct mmu_update u;
 355
 356	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
 357	xen_mc_batch();
 358
 359	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 360	u.val = pte_val_ma(pte);
 361	xen_extend_mmu_update(&u);
 362
 363	xen_mc_issue(PARAVIRT_LAZY_MMU);
 364}
 365
 366/* Assume pteval_t is equivalent to all the other *val_t types. */
 367static pteval_t pte_mfn_to_pfn(pteval_t val)
 368{
 369	if (val & _PAGE_PRESENT) {
 370		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 371		unsigned long pfn = mfn_to_pfn(mfn);
 372
 373		pteval_t flags = val & PTE_FLAGS_MASK;
 374		if (unlikely(pfn == ~0))
 375			val = flags & ~_PAGE_PRESENT;
 376		else
 377			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 378	}
 379
 380	return val;
 381}
 382
 383static pteval_t pte_pfn_to_mfn(pteval_t val)
 384{
 385	if (val & _PAGE_PRESENT) {
 386		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 387		pteval_t flags = val & PTE_FLAGS_MASK;
 388		unsigned long mfn;
 389
 390		if (!xen_feature(XENFEAT_auto_translated_physmap))
 391			mfn = __pfn_to_mfn(pfn);
 392		else
 393			mfn = pfn;
 394		/*
 395		 * If there's no mfn for the pfn, then just create an
 396		 * empty non-present pte.  Unfortunately this loses
 397		 * information about the original pfn, so
 398		 * pte_mfn_to_pfn is asymmetric.
 399		 */
 400		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 401			mfn = 0;
 402			flags = 0;
 403		} else
 404			mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
 
 
 
 
 
 
 
 
 
 
 405		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 406	}
 407
 408	return val;
 409}
 410
 411__visible pteval_t xen_pte_val(pte_t pte)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 412{
 413	pteval_t pteval = pte.pte;
 
 
 
 
 
 
 
 
 
 414
 415	return pte_mfn_to_pfn(pteval);
 416}
 417PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 418
 419__visible pgdval_t xen_pgd_val(pgd_t pgd)
 420{
 421	return pte_mfn_to_pfn(pgd.pgd);
 422}
 423PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 424
 425__visible pte_t xen_make_pte(pteval_t pte)
 426{
 427	pte = pte_pfn_to_mfn(pte);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 428
 429	return native_make_pte(pte);
 430}
 431PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 432
 433__visible pgd_t xen_make_pgd(pgdval_t pgd)
 434{
 435	pgd = pte_pfn_to_mfn(pgd);
 436	return native_make_pgd(pgd);
 437}
 438PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 439
 440__visible pmdval_t xen_pmd_val(pmd_t pmd)
 441{
 442	return pte_mfn_to_pfn(pmd.pmd);
 443}
 444PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 445
 446static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 447{
 448	struct mmu_update u;
 449
 450	preempt_disable();
 451
 452	xen_mc_batch();
 453
 454	/* ptr may be ioremapped for 64-bit pagetable setup */
 455	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 456	u.val = pud_val_ma(val);
 457	xen_extend_mmu_update(&u);
 458
 459	xen_mc_issue(PARAVIRT_LAZY_MMU);
 460
 461	preempt_enable();
 462}
 463
 464static void xen_set_pud(pud_t *ptr, pud_t val)
 465{
 466	trace_xen_mmu_set_pud(ptr, val);
 467
 468	/* If page is not pinned, we can just update the entry
 469	   directly */
 470	if (!xen_page_pinned(ptr)) {
 471		*ptr = val;
 472		return;
 473	}
 474
 475	xen_set_pud_hyper(ptr, val);
 476}
 477
 478#ifdef CONFIG_X86_PAE
 479static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 480{
 481	trace_xen_mmu_set_pte_atomic(ptep, pte);
 482	set_64bit((u64 *)ptep, native_pte_val(pte));
 483}
 484
 485static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 486{
 487	trace_xen_mmu_pte_clear(mm, addr, ptep);
 488	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
 489		native_pte_clear(mm, addr, ptep);
 490}
 491
 492static void xen_pmd_clear(pmd_t *pmdp)
 493{
 494	trace_xen_mmu_pmd_clear(pmdp);
 495	set_pmd(pmdp, __pmd(0));
 496}
 497#endif	/* CONFIG_X86_PAE */
 498
 499__visible pmd_t xen_make_pmd(pmdval_t pmd)
 500{
 501	pmd = pte_pfn_to_mfn(pmd);
 502	return native_make_pmd(pmd);
 503}
 504PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 505
 506#if CONFIG_PGTABLE_LEVELS == 4
 507__visible pudval_t xen_pud_val(pud_t pud)
 508{
 509	return pte_mfn_to_pfn(pud.pud);
 510}
 511PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 512
 513__visible pud_t xen_make_pud(pudval_t pud)
 514{
 515	pud = pte_pfn_to_mfn(pud);
 516
 517	return native_make_pud(pud);
 518}
 519PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 520
 521static pgd_t *xen_get_user_pgd(pgd_t *pgd)
 522{
 523	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 524	unsigned offset = pgd - pgd_page;
 525	pgd_t *user_ptr = NULL;
 526
 527	if (offset < pgd_index(USER_LIMIT)) {
 528		struct page *page = virt_to_page(pgd_page);
 529		user_ptr = (pgd_t *)page->private;
 530		if (user_ptr)
 531			user_ptr += offset;
 532	}
 533
 534	return user_ptr;
 535}
 536
 537static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 538{
 539	struct mmu_update u;
 540
 541	u.ptr = virt_to_machine(ptr).maddr;
 542	u.val = pgd_val_ma(val);
 543	xen_extend_mmu_update(&u);
 544}
 545
 546/*
 547 * Raw hypercall-based set_pgd, intended for in early boot before
 548 * there's a page structure.  This implies:
 549 *  1. The only existing pagetable is the kernel's
 550 *  2. It is always pinned
 551 *  3. It has no user pagetable attached to it
 552 */
 553static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 554{
 555	preempt_disable();
 556
 557	xen_mc_batch();
 558
 559	__xen_set_pgd_hyper(ptr, val);
 560
 561	xen_mc_issue(PARAVIRT_LAZY_MMU);
 562
 563	preempt_enable();
 564}
 565
 566static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 567{
 568	pgd_t *user_ptr = xen_get_user_pgd(ptr);
 569
 570	trace_xen_mmu_set_pgd(ptr, user_ptr, val);
 571
 572	/* If page is not pinned, we can just update the entry
 573	   directly */
 574	if (!xen_page_pinned(ptr)) {
 575		*ptr = val;
 576		if (user_ptr) {
 577			WARN_ON(xen_page_pinned(user_ptr));
 578			*user_ptr = val;
 579		}
 580		return;
 581	}
 582
 583	/* If it's pinned, then we can at least batch the kernel and
 584	   user updates together. */
 585	xen_mc_batch();
 586
 587	__xen_set_pgd_hyper(ptr, val);
 588	if (user_ptr)
 589		__xen_set_pgd_hyper(user_ptr, val);
 590
 591	xen_mc_issue(PARAVIRT_LAZY_MMU);
 592}
 593#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
 594
 595/*
 596 * (Yet another) pagetable walker.  This one is intended for pinning a
 597 * pagetable.  This means that it walks a pagetable and calls the
 598 * callback function on each page it finds making up the page table,
 599 * at every level.  It walks the entire pagetable, but it only bothers
 600 * pinning pte pages which are below limit.  In the normal case this
 601 * will be STACK_TOP_MAX, but at boot we need to pin up to
 602 * FIXADDR_TOP.
 603 *
 604 * For 32-bit the important bit is that we don't pin beyond there,
 605 * because then we start getting into Xen's ptes.
 606 *
 607 * For 64-bit, we must skip the Xen hole in the middle of the address
 608 * space, just after the big x86-64 virtual hole.
 609 */
 610static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 611			  int (*func)(struct mm_struct *mm, struct page *,
 612				      enum pt_level),
 613			  unsigned long limit)
 614{
 615	int flush = 0;
 616	unsigned hole_low, hole_high;
 617	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
 618	unsigned pgdidx, pudidx, pmdidx;
 619
 620	/* The limit is the last byte to be touched */
 621	limit--;
 622	BUG_ON(limit >= FIXADDR_TOP);
 623
 624	if (xen_feature(XENFEAT_auto_translated_physmap))
 625		return 0;
 626
 627	/*
 628	 * 64-bit has a great big hole in the middle of the address
 629	 * space, which contains the Xen mappings.  On 32-bit these
 630	 * will end up making a zero-sized hole and so is a no-op.
 631	 */
 632	hole_low = pgd_index(USER_LIMIT);
 633	hole_high = pgd_index(PAGE_OFFSET);
 634
 635	pgdidx_limit = pgd_index(limit);
 636#if PTRS_PER_PUD > 1
 637	pudidx_limit = pud_index(limit);
 638#else
 639	pudidx_limit = 0;
 640#endif
 641#if PTRS_PER_PMD > 1
 642	pmdidx_limit = pmd_index(limit);
 643#else
 644	pmdidx_limit = 0;
 645#endif
 646
 647	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 648		pud_t *pud;
 649
 650		if (pgdidx >= hole_low && pgdidx < hole_high)
 651			continue;
 652
 653		if (!pgd_val(pgd[pgdidx]))
 654			continue;
 655
 656		pud = pud_offset(&pgd[pgdidx], 0);
 657
 658		if (PTRS_PER_PUD > 1) /* not folded */
 659			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 660
 661		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 662			pmd_t *pmd;
 663
 664			if (pgdidx == pgdidx_limit &&
 665			    pudidx > pudidx_limit)
 666				goto out;
 667
 668			if (pud_none(pud[pudidx]))
 669				continue;
 670
 671			pmd = pmd_offset(&pud[pudidx], 0);
 672
 673			if (PTRS_PER_PMD > 1) /* not folded */
 674				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 675
 676			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
 677				struct page *pte;
 678
 679				if (pgdidx == pgdidx_limit &&
 680				    pudidx == pudidx_limit &&
 681				    pmdidx > pmdidx_limit)
 682					goto out;
 683
 684				if (pmd_none(pmd[pmdidx]))
 685					continue;
 686
 687				pte = pmd_page(pmd[pmdidx]);
 688				flush |= (*func)(mm, pte, PT_PTE);
 689			}
 690		}
 691	}
 692
 693out:
 694	/* Do the top level last, so that the callbacks can use it as
 695	   a cue to do final things like tlb flushes. */
 696	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 697
 698	return flush;
 699}
 700
 701static int xen_pgd_walk(struct mm_struct *mm,
 702			int (*func)(struct mm_struct *mm, struct page *,
 703				    enum pt_level),
 704			unsigned long limit)
 705{
 706	return __xen_pgd_walk(mm, mm->pgd, func, limit);
 707}
 708
 709/* If we're using split pte locks, then take the page's lock and
 710   return a pointer to it.  Otherwise return NULL. */
 711static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 712{
 713	spinlock_t *ptl = NULL;
 714
 715#if USE_SPLIT_PTE_PTLOCKS
 716	ptl = ptlock_ptr(page);
 717	spin_lock_nest_lock(ptl, &mm->page_table_lock);
 718#endif
 719
 720	return ptl;
 721}
 722
 723static void xen_pte_unlock(void *v)
 724{
 725	spinlock_t *ptl = v;
 726	spin_unlock(ptl);
 727}
 728
 729static void xen_do_pin(unsigned level, unsigned long pfn)
 730{
 731	struct mmuext_op op;
 732
 733	op.cmd = level;
 734	op.arg1.mfn = pfn_to_mfn(pfn);
 735
 736	xen_extend_mmuext_op(&op);
 737}
 738
 739static int xen_pin_page(struct mm_struct *mm, struct page *page,
 740			enum pt_level level)
 741{
 742	unsigned pgfl = TestSetPagePinned(page);
 743	int flush;
 744
 745	if (pgfl)
 746		flush = 0;		/* already pinned */
 747	else if (PageHighMem(page))
 748		/* kmaps need flushing if we found an unpinned
 749		   highpage */
 750		flush = 1;
 751	else {
 752		void *pt = lowmem_page_address(page);
 753		unsigned long pfn = page_to_pfn(page);
 754		struct multicall_space mcs = __xen_mc_entry(0);
 755		spinlock_t *ptl;
 756
 757		flush = 0;
 758
 759		/*
 760		 * We need to hold the pagetable lock between the time
 761		 * we make the pagetable RO and when we actually pin
 762		 * it.  If we don't, then other users may come in and
 763		 * attempt to update the pagetable by writing it,
 764		 * which will fail because the memory is RO but not
 765		 * pinned, so Xen won't do the trap'n'emulate.
 766		 *
 767		 * If we're using split pte locks, we can't hold the
 768		 * entire pagetable's worth of locks during the
 769		 * traverse, because we may wrap the preempt count (8
 770		 * bits).  The solution is to mark RO and pin each PTE
 771		 * page while holding the lock.  This means the number
 772		 * of locks we end up holding is never more than a
 773		 * batch size (~32 entries, at present).
 774		 *
 775		 * If we're not using split pte locks, we needn't pin
 776		 * the PTE pages independently, because we're
 777		 * protected by the overall pagetable lock.
 778		 */
 779		ptl = NULL;
 780		if (level == PT_PTE)
 781			ptl = xen_pte_lock(page, mm);
 782
 783		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 784					pfn_pte(pfn, PAGE_KERNEL_RO),
 785					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 786
 787		if (ptl) {
 788			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 789
 790			/* Queue a deferred unlock for when this batch
 791			   is completed. */
 792			xen_mc_callback(xen_pte_unlock, ptl);
 793		}
 794	}
 795
 796	return flush;
 797}
 798
 799/* This is called just after a mm has been created, but it has not
 800   been used yet.  We need to make sure that its pagetable is all
 801   read-only, and can be pinned. */
 802static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 803{
 804	trace_xen_mmu_pgd_pin(mm, pgd);
 805
 806	xen_mc_batch();
 807
 808	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
 809		/* re-enable interrupts for flushing */
 810		xen_mc_issue(0);
 811
 812		kmap_flush_unused();
 813
 814		xen_mc_batch();
 815	}
 816
 817#ifdef CONFIG_X86_64
 818	{
 819		pgd_t *user_pgd = xen_get_user_pgd(pgd);
 820
 821		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 822
 823		if (user_pgd) {
 824			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
 825			xen_do_pin(MMUEXT_PIN_L4_TABLE,
 826				   PFN_DOWN(__pa(user_pgd)));
 827		}
 828	}
 829#else /* CONFIG_X86_32 */
 830#ifdef CONFIG_X86_PAE
 831	/* Need to make sure unshared kernel PMD is pinnable */
 832	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 833		     PT_PMD);
 834#endif
 835	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 836#endif /* CONFIG_X86_64 */
 837	xen_mc_issue(0);
 838}
 839
 840static void xen_pgd_pin(struct mm_struct *mm)
 841{
 842	__xen_pgd_pin(mm, mm->pgd);
 843}
 844
 845/*
 846 * On save, we need to pin all pagetables to make sure they get their
 847 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 848 * them (unpinned pgds are not currently in use, probably because the
 849 * process is under construction or destruction).
 850 *
 851 * Expected to be called in stop_machine() ("equivalent to taking
 852 * every spinlock in the system"), so the locking doesn't really
 853 * matter all that much.
 854 */
 855void xen_mm_pin_all(void)
 856{
 857	struct page *page;
 858
 859	spin_lock(&pgd_lock);
 860
 861	list_for_each_entry(page, &pgd_list, lru) {
 862		if (!PagePinned(page)) {
 863			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
 864			SetPageSavePinned(page);
 865		}
 866	}
 867
 868	spin_unlock(&pgd_lock);
 869}
 870
 871/*
 872 * The init_mm pagetable is really pinned as soon as its created, but
 873 * that's before we have page structures to store the bits.  So do all
 874 * the book-keeping now.
 875 */
 876static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
 877				  enum pt_level level)
 878{
 879	SetPagePinned(page);
 880	return 0;
 881}
 882
 883static void __init xen_mark_init_mm_pinned(void)
 884{
 885	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 886}
 887
 888static int xen_unpin_page(struct mm_struct *mm, struct page *page,
 889			  enum pt_level level)
 890{
 891	unsigned pgfl = TestClearPagePinned(page);
 892
 893	if (pgfl && !PageHighMem(page)) {
 894		void *pt = lowmem_page_address(page);
 895		unsigned long pfn = page_to_pfn(page);
 896		spinlock_t *ptl = NULL;
 897		struct multicall_space mcs;
 898
 899		/*
 900		 * Do the converse to pin_page.  If we're using split
 901		 * pte locks, we must be holding the lock for while
 902		 * the pte page is unpinned but still RO to prevent
 903		 * concurrent updates from seeing it in this
 904		 * partially-pinned state.
 905		 */
 906		if (level == PT_PTE) {
 907			ptl = xen_pte_lock(page, mm);
 908
 909			if (ptl)
 910				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
 911		}
 912
 913		mcs = __xen_mc_entry(0);
 914
 915		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 916					pfn_pte(pfn, PAGE_KERNEL),
 917					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 918
 919		if (ptl) {
 920			/* unlock when batch completed */
 921			xen_mc_callback(xen_pte_unlock, ptl);
 922		}
 923	}
 924
 925	return 0;		/* never need to flush on unpin */
 926}
 927
 928/* Release a pagetables pages back as normal RW */
 929static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
 930{
 931	trace_xen_mmu_pgd_unpin(mm, pgd);
 932
 933	xen_mc_batch();
 934
 935	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 936
 937#ifdef CONFIG_X86_64
 938	{
 939		pgd_t *user_pgd = xen_get_user_pgd(pgd);
 940
 941		if (user_pgd) {
 942			xen_do_pin(MMUEXT_UNPIN_TABLE,
 943				   PFN_DOWN(__pa(user_pgd)));
 944			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
 945		}
 946	}
 947#endif
 948
 949#ifdef CONFIG_X86_PAE
 950	/* Need to make sure unshared kernel PMD is unpinned */
 951	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 952		       PT_PMD);
 953#endif
 954
 955	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
 956
 957	xen_mc_issue(0);
 958}
 959
 960static void xen_pgd_unpin(struct mm_struct *mm)
 961{
 962	__xen_pgd_unpin(mm, mm->pgd);
 963}
 964
 965/*
 966 * On resume, undo any pinning done at save, so that the rest of the
 967 * kernel doesn't see any unexpected pinned pagetables.
 968 */
 969void xen_mm_unpin_all(void)
 970{
 971	struct page *page;
 972
 973	spin_lock(&pgd_lock);
 974
 975	list_for_each_entry(page, &pgd_list, lru) {
 976		if (PageSavePinned(page)) {
 977			BUG_ON(!PagePinned(page));
 978			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
 979			ClearPageSavePinned(page);
 980		}
 981	}
 982
 983	spin_unlock(&pgd_lock);
 984}
 985
 986static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 987{
 988	spin_lock(&next->page_table_lock);
 989	xen_pgd_pin(next);
 990	spin_unlock(&next->page_table_lock);
 991}
 992
 993static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 994{
 995	spin_lock(&mm->page_table_lock);
 996	xen_pgd_pin(mm);
 997	spin_unlock(&mm->page_table_lock);
 998}
 999
1000
1001#ifdef CONFIG_SMP
1002/* Another cpu may still have their %cr3 pointing at the pagetable, so
1003   we need to repoint it somewhere else before we can unpin it. */
1004static void drop_other_mm_ref(void *info)
1005{
1006	struct mm_struct *mm = info;
1007	struct mm_struct *active_mm;
1008
1009	active_mm = this_cpu_read(cpu_tlbstate.active_mm);
1010
1011	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1012		leave_mm(smp_processor_id());
1013
1014	/* If this cpu still has a stale cr3 reference, then make sure
1015	   it has been flushed. */
1016	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1017		load_cr3(swapper_pg_dir);
1018}
1019
1020static void xen_drop_mm_ref(struct mm_struct *mm)
1021{
1022	cpumask_var_t mask;
1023	unsigned cpu;
1024
1025	if (current->active_mm == mm) {
1026		if (current->mm == mm)
1027			load_cr3(swapper_pg_dir);
1028		else
1029			leave_mm(smp_processor_id());
1030	}
1031
1032	/* Get the "official" set of cpus referring to our pagetable. */
1033	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1034		for_each_online_cpu(cpu) {
1035			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1036			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1037				continue;
1038			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1039		}
1040		return;
1041	}
1042	cpumask_copy(mask, mm_cpumask(mm));
1043
1044	/* It's possible that a vcpu may have a stale reference to our
1045	   cr3, because its in lazy mode, and it hasn't yet flushed
1046	   its set of pending hypercalls yet.  In this case, we can
1047	   look at its actual current cr3 value, and force it to flush
1048	   if needed. */
1049	for_each_online_cpu(cpu) {
1050		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1051			cpumask_set_cpu(cpu, mask);
1052	}
1053
1054	if (!cpumask_empty(mask))
1055		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1056	free_cpumask_var(mask);
1057}
1058#else
1059static void xen_drop_mm_ref(struct mm_struct *mm)
1060{
1061	if (current->active_mm == mm)
1062		load_cr3(swapper_pg_dir);
1063}
1064#endif
1065
1066/*
1067 * While a process runs, Xen pins its pagetables, which means that the
1068 * hypervisor forces it to be read-only, and it controls all updates
1069 * to it.  This means that all pagetable updates have to go via the
1070 * hypervisor, which is moderately expensive.
1071 *
1072 * Since we're pulling the pagetable down, we switch to use init_mm,
1073 * unpin old process pagetable and mark it all read-write, which
1074 * allows further operations on it to be simple memory accesses.
1075 *
1076 * The only subtle point is that another CPU may be still using the
1077 * pagetable because of lazy tlb flushing.  This means we need need to
1078 * switch all CPUs off this pagetable before we can unpin it.
1079 */
1080static void xen_exit_mmap(struct mm_struct *mm)
1081{
1082	get_cpu();		/* make sure we don't move around */
1083	xen_drop_mm_ref(mm);
1084	put_cpu();
1085
1086	spin_lock(&mm->page_table_lock);
1087
1088	/* pgd may not be pinned in the error exit path of execve */
1089	if (xen_page_pinned(mm->pgd))
1090		xen_pgd_unpin(mm);
1091
1092	spin_unlock(&mm->page_table_lock);
1093}
1094
1095static void xen_post_allocator_init(void);
1096
1097static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1098{
1099	struct mmuext_op op;
1100
1101	op.cmd = cmd;
1102	op.arg1.mfn = pfn_to_mfn(pfn);
1103	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1104		BUG();
1105}
1106
1107#ifdef CONFIG_X86_64
1108static void __init xen_cleanhighmap(unsigned long vaddr,
1109				    unsigned long vaddr_end)
1110{
1111	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1112	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1113
1114	/* NOTE: The loop is more greedy than the cleanup_highmap variant.
1115	 * We include the PMD passed in on _both_ boundaries. */
1116	for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
1117			pmd++, vaddr += PMD_SIZE) {
1118		if (pmd_none(*pmd))
1119			continue;
1120		if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1121			set_pmd(pmd, __pmd(0));
1122	}
1123	/* In case we did something silly, we should crash in this function
1124	 * instead of somewhere later and be confusing. */
1125	xen_mc_flush();
1126}
1127
1128/*
1129 * Make a page range writeable and free it.
1130 */
1131static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1132{
1133	void *vaddr = __va(paddr);
1134	void *vaddr_end = vaddr + size;
1135
1136	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1137		make_lowmem_page_readwrite(vaddr);
1138
1139	memblock_free(paddr, size);
1140}
1141
1142static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
1143{
1144	unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
1145
1146	if (unpin)
1147		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
1148	ClearPagePinned(virt_to_page(__va(pa)));
1149	xen_free_ro_pages(pa, PAGE_SIZE);
1150}
1151
1152/*
1153 * Since it is well isolated we can (and since it is perhaps large we should)
1154 * also free the page tables mapping the initial P->M table.
1155 */
1156static void __init xen_cleanmfnmap(unsigned long vaddr)
1157{
1158	unsigned long va = vaddr & PMD_MASK;
1159	unsigned long pa;
1160	pgd_t *pgd = pgd_offset_k(va);
1161	pud_t *pud_page = pud_offset(pgd, 0);
1162	pud_t *pud;
1163	pmd_t *pmd;
1164	pte_t *pte;
1165	unsigned int i;
1166	bool unpin;
1167
1168	unpin = (vaddr == 2 * PGDIR_SIZE);
1169	set_pgd(pgd, __pgd(0));
1170	do {
1171		pud = pud_page + pud_index(va);
1172		if (pud_none(*pud)) {
1173			va += PUD_SIZE;
1174		} else if (pud_large(*pud)) {
1175			pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1176			xen_free_ro_pages(pa, PUD_SIZE);
1177			va += PUD_SIZE;
1178		} else {
1179			pmd = pmd_offset(pud, va);
1180			if (pmd_large(*pmd)) {
1181				pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1182				xen_free_ro_pages(pa, PMD_SIZE);
1183			} else if (!pmd_none(*pmd)) {
1184				pte = pte_offset_kernel(pmd, va);
1185				set_pmd(pmd, __pmd(0));
1186				for (i = 0; i < PTRS_PER_PTE; ++i) {
1187					if (pte_none(pte[i]))
1188						break;
1189					pa = pte_pfn(pte[i]) << PAGE_SHIFT;
1190					xen_free_ro_pages(pa, PAGE_SIZE);
1191				}
1192				xen_cleanmfnmap_free_pgtbl(pte, unpin);
1193			}
1194			va += PMD_SIZE;
1195			if (pmd_index(va))
1196				continue;
1197			set_pud(pud, __pud(0));
1198			xen_cleanmfnmap_free_pgtbl(pmd, unpin);
1199		}
1200
1201	} while (pud_index(va) || pmd_index(va));
1202	xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
1203}
1204
1205static void __init xen_pagetable_p2m_free(void)
1206{
1207	unsigned long size;
1208	unsigned long addr;
1209
1210	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1211
1212	/* No memory or already called. */
1213	if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
1214		return;
1215
1216	/* using __ka address and sticking INVALID_P2M_ENTRY! */
1217	memset((void *)xen_start_info->mfn_list, 0xff, size);
1218
1219	addr = xen_start_info->mfn_list;
1220	/*
1221	 * We could be in __ka space.
1222	 * We roundup to the PMD, which means that if anybody at this stage is
1223	 * using the __ka address of xen_start_info or
1224	 * xen_start_info->shared_info they are in going to crash. Fortunatly
1225	 * we have already revectored in xen_setup_kernel_pagetable and in
1226	 * xen_setup_shared_info.
1227	 */
1228	size = roundup(size, PMD_SIZE);
1229
1230	if (addr >= __START_KERNEL_map) {
1231		xen_cleanhighmap(addr, addr + size);
1232		size = PAGE_ALIGN(xen_start_info->nr_pages *
1233				  sizeof(unsigned long));
1234		memblock_free(__pa(addr), size);
1235	} else {
1236		xen_cleanmfnmap(addr);
1237	}
1238}
1239
1240static void __init xen_pagetable_cleanhighmap(void)
1241{
1242	unsigned long size;
1243	unsigned long addr;
1244
1245	/* At this stage, cleanup_highmap has already cleaned __ka space
1246	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1247	 * the ramdisk). We continue on, erasing PMD entries that point to page
1248	 * tables - do note that they are accessible at this stage via __va.
1249	 * For good measure we also round up to the PMD - which means that if
1250	 * anybody is using __ka address to the initial boot-stack - and try
1251	 * to use it - they are going to crash. The xen_start_info has been
1252	 * taken care of already in xen_setup_kernel_pagetable. */
1253	addr = xen_start_info->pt_base;
1254	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1255
1256	xen_cleanhighmap(addr, addr + size);
1257	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1258#ifdef DEBUG
1259	/* This is superfluous and is not necessary, but you know what
1260	 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
1261	 * anything at this stage. */
1262	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1263#endif
1264}
1265#endif
1266
1267static void __init xen_pagetable_p2m_setup(void)
1268{
1269	if (xen_feature(XENFEAT_auto_translated_physmap))
1270		return;
1271
1272	xen_vmalloc_p2m_tree();
1273
1274#ifdef CONFIG_X86_64
1275	xen_pagetable_p2m_free();
1276
1277	xen_pagetable_cleanhighmap();
1278#endif
1279	/* And revector! Bye bye old array */
1280	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
1281}
1282
1283static void __init xen_pagetable_init(void)
1284{
1285	paging_init();
1286	xen_post_allocator_init();
1287
1288	xen_pagetable_p2m_setup();
1289
1290	/* Allocate and initialize top and mid mfn levels for p2m structure */
1291	xen_build_mfn_list_list();
1292
1293	/* Remap memory freed due to conflicts with E820 map */
1294	if (!xen_feature(XENFEAT_auto_translated_physmap))
1295		xen_remap_memory();
1296
1297	xen_setup_shared_info();
1298}
1299static void xen_write_cr2(unsigned long cr2)
1300{
1301	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1302}
1303
1304static unsigned long xen_read_cr2(void)
1305{
1306	return this_cpu_read(xen_vcpu)->arch.cr2;
1307}
1308
1309unsigned long xen_read_cr2_direct(void)
1310{
1311	return this_cpu_read(xen_vcpu_info.arch.cr2);
1312}
1313
1314void xen_flush_tlb_all(void)
1315{
1316	struct mmuext_op *op;
1317	struct multicall_space mcs;
1318
1319	trace_xen_mmu_flush_tlb_all(0);
1320
1321	preempt_disable();
1322
1323	mcs = xen_mc_entry(sizeof(*op));
1324
1325	op = mcs.args;
1326	op->cmd = MMUEXT_TLB_FLUSH_ALL;
1327	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1328
1329	xen_mc_issue(PARAVIRT_LAZY_MMU);
1330
1331	preempt_enable();
1332}
1333static void xen_flush_tlb(void)
1334{
1335	struct mmuext_op *op;
1336	struct multicall_space mcs;
1337
1338	trace_xen_mmu_flush_tlb(0);
1339
1340	preempt_disable();
1341
1342	mcs = xen_mc_entry(sizeof(*op));
1343
1344	op = mcs.args;
1345	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1346	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1347
1348	xen_mc_issue(PARAVIRT_LAZY_MMU);
1349
1350	preempt_enable();
1351}
1352
1353static void xen_flush_tlb_single(unsigned long addr)
1354{
1355	struct mmuext_op *op;
1356	struct multicall_space mcs;
1357
1358	trace_xen_mmu_flush_tlb_single(addr);
1359
1360	preempt_disable();
1361
1362	mcs = xen_mc_entry(sizeof(*op));
1363	op = mcs.args;
1364	op->cmd = MMUEXT_INVLPG_LOCAL;
1365	op->arg1.linear_addr = addr & PAGE_MASK;
1366	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1367
1368	xen_mc_issue(PARAVIRT_LAZY_MMU);
1369
1370	preempt_enable();
1371}
1372
1373static void xen_flush_tlb_others(const struct cpumask *cpus,
1374				 struct mm_struct *mm, unsigned long start,
1375				 unsigned long end)
1376{
1377	struct {
1378		struct mmuext_op op;
1379#ifdef CONFIG_SMP
1380		DECLARE_BITMAP(mask, num_processors);
1381#else
1382		DECLARE_BITMAP(mask, NR_CPUS);
1383#endif
1384	} *args;
1385	struct multicall_space mcs;
1386
1387	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1388
1389	if (cpumask_empty(cpus))
1390		return;		/* nothing to do */
1391
1392	mcs = xen_mc_entry(sizeof(*args));
1393	args = mcs.args;
1394	args->op.arg2.vcpumask = to_cpumask(args->mask);
1395
1396	/* Remove us, and any offline CPUS. */
1397	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1398	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1399
1400	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1401	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
 
1402		args->op.cmd = MMUEXT_INVLPG_MULTI;
1403		args->op.arg1.linear_addr = start;
1404	}
1405
1406	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1407
1408	xen_mc_issue(PARAVIRT_LAZY_MMU);
1409}
1410
1411static unsigned long xen_read_cr3(void)
1412{
1413	return this_cpu_read(xen_cr3);
1414}
1415
1416static void set_current_cr3(void *v)
1417{
1418	this_cpu_write(xen_current_cr3, (unsigned long)v);
1419}
1420
1421static void __xen_write_cr3(bool kernel, unsigned long cr3)
1422{
1423	struct mmuext_op op;
1424	unsigned long mfn;
1425
1426	trace_xen_mmu_write_cr3(kernel, cr3);
1427
1428	if (cr3)
1429		mfn = pfn_to_mfn(PFN_DOWN(cr3));
1430	else
1431		mfn = 0;
1432
1433	WARN_ON(mfn == 0 && kernel);
1434
1435	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1436	op.arg1.mfn = mfn;
1437
1438	xen_extend_mmuext_op(&op);
1439
1440	if (kernel) {
1441		this_cpu_write(xen_cr3, cr3);
1442
1443		/* Update xen_current_cr3 once the batch has actually
1444		   been submitted. */
1445		xen_mc_callback(set_current_cr3, (void *)cr3);
1446	}
1447}
 
1448static void xen_write_cr3(unsigned long cr3)
1449{
1450	BUG_ON(preemptible());
1451
1452	xen_mc_batch();  /* disables interrupts */
1453
1454	/* Update while interrupts are disabled, so its atomic with
1455	   respect to ipis */
1456	this_cpu_write(xen_cr3, cr3);
1457
1458	__xen_write_cr3(true, cr3);
1459
1460#ifdef CONFIG_X86_64
1461	{
1462		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1463		if (user_pgd)
1464			__xen_write_cr3(false, __pa(user_pgd));
1465		else
1466			__xen_write_cr3(false, 0);
1467	}
1468#endif
1469
1470	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1471}
1472
1473#ifdef CONFIG_X86_64
1474/*
1475 * At the start of the day - when Xen launches a guest, it has already
1476 * built pagetables for the guest. We diligently look over them
1477 * in xen_setup_kernel_pagetable and graft as appropriate them in the
1478 * init_level4_pgt and its friends. Then when we are happy we load
1479 * the new init_level4_pgt - and continue on.
1480 *
1481 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1482 * up the rest of the pagetables. When it has completed it loads the cr3.
1483 * N.B. that baremetal would start at 'start_kernel' (and the early
1484 * #PF handler would create bootstrap pagetables) - so we are running
1485 * with the same assumptions as what to do when write_cr3 is executed
1486 * at this point.
1487 *
1488 * Since there are no user-page tables at all, we have two variants
1489 * of xen_write_cr3 - the early bootup (this one), and the late one
1490 * (xen_write_cr3). The reason we have to do that is that in 64-bit
1491 * the Linux kernel and user-space are both in ring 3 while the
1492 * hypervisor is in ring 0.
1493 */
1494static void __init xen_write_cr3_init(unsigned long cr3)
1495{
1496	BUG_ON(preemptible());
1497
1498	xen_mc_batch();  /* disables interrupts */
1499
1500	/* Update while interrupts are disabled, so its atomic with
1501	   respect to ipis */
1502	this_cpu_write(xen_cr3, cr3);
1503
1504	__xen_write_cr3(true, cr3);
1505
1506	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1507}
1508#endif
1509
1510static int xen_pgd_alloc(struct mm_struct *mm)
1511{
1512	pgd_t *pgd = mm->pgd;
1513	int ret = 0;
1514
1515	BUG_ON(PagePinned(virt_to_page(pgd)));
1516
1517#ifdef CONFIG_X86_64
1518	{
1519		struct page *page = virt_to_page(pgd);
1520		pgd_t *user_pgd;
1521
1522		BUG_ON(page->private != 0);
1523
1524		ret = -ENOMEM;
1525
1526		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1527		page->private = (unsigned long)user_pgd;
1528
1529		if (user_pgd != NULL) {
1530#ifdef CONFIG_X86_VSYSCALL_EMULATION
1531			user_pgd[pgd_index(VSYSCALL_ADDR)] =
1532				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1533#endif
1534			ret = 0;
1535		}
1536
1537		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1538	}
1539#endif
1540
1541	return ret;
1542}
1543
1544static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1545{
1546#ifdef CONFIG_X86_64
1547	pgd_t *user_pgd = xen_get_user_pgd(pgd);
1548
1549	if (user_pgd)
1550		free_page((unsigned long)user_pgd);
1551#endif
1552}
1553
1554#ifdef CONFIG_X86_32
1555static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1556{
1557	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
1558	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1559		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1560			       pte_val_ma(pte));
1561
1562	return pte;
1563}
1564#else /* CONFIG_X86_64 */
1565static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1566{
1567	unsigned long pfn;
1568
1569	if (xen_feature(XENFEAT_writable_page_tables) ||
1570	    xen_feature(XENFEAT_auto_translated_physmap) ||
1571	    xen_start_info->mfn_list >= __START_KERNEL_map)
1572		return pte;
1573
1574	/*
1575	 * Pages belonging to the initial p2m list mapped outside the default
1576	 * address range must be mapped read-only. This region contains the
1577	 * page tables for mapping the p2m list, too, and page tables MUST be
1578	 * mapped read-only.
1579	 */
1580	pfn = pte_pfn(pte);
1581	if (pfn >= xen_start_info->first_p2m_pfn &&
1582	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
1583		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
1584
1585	return pte;
1586}
1587#endif /* CONFIG_X86_64 */
1588
1589/*
1590 * Init-time set_pte while constructing initial pagetables, which
1591 * doesn't allow RO page table pages to be remapped RW.
1592 *
1593 * If there is no MFN for this PFN then this page is initially
1594 * ballooned out so clear the PTE (as in decrease_reservation() in
1595 * drivers/xen/balloon.c).
1596 *
1597 * Many of these PTE updates are done on unpinned and writable pages
1598 * and doing a hypercall for these is unnecessary and expensive.  At
1599 * this point it is not possible to tell if a page is pinned or not,
1600 * so always write the PTE directly and rely on Xen trapping and
1601 * emulating any updates as necessary.
1602 */
1603static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1604{
1605	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1606		pte = mask_rw_pte(ptep, pte);
1607	else
1608		pte = __pte_ma(0);
1609
1610	native_set_pte(ptep, pte);
 
 
 
 
 
 
1611}
1612
1613/* Early in boot, while setting up the initial pagetable, assume
1614   everything is pinned. */
1615static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1616{
1617#ifdef CONFIG_FLATMEM
1618	BUG_ON(mem_map);	/* should only be used early */
1619#endif
1620	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1621	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1622}
1623
1624/* Used for pmd and pud */
1625static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1626{
1627#ifdef CONFIG_FLATMEM
1628	BUG_ON(mem_map);	/* should only be used early */
1629#endif
1630	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1631}
1632
1633/* Early release_pte assumes that all pts are pinned, since there's
1634   only init_mm and anything attached to that is pinned. */
1635static void __init xen_release_pte_init(unsigned long pfn)
1636{
1637	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1638	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1639}
1640
1641static void __init xen_release_pmd_init(unsigned long pfn)
1642{
1643	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1644}
1645
1646static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1647{
1648	struct multicall_space mcs;
1649	struct mmuext_op *op;
1650
1651	mcs = __xen_mc_entry(sizeof(*op));
1652	op = mcs.args;
1653	op->cmd = cmd;
1654	op->arg1.mfn = pfn_to_mfn(pfn);
1655
1656	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1657}
1658
1659static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1660{
1661	struct multicall_space mcs;
1662	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1663
1664	mcs = __xen_mc_entry(0);
1665	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1666				pfn_pte(pfn, prot), 0);
1667}
1668
1669/* This needs to make sure the new pte page is pinned iff its being
1670   attached to a pinned pagetable. */
1671static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1672				    unsigned level)
1673{
1674	bool pinned = PagePinned(virt_to_page(mm->pgd));
1675
1676	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1677
1678	if (pinned) {
1679		struct page *page = pfn_to_page(pfn);
1680
1681		SetPagePinned(page);
1682
1683		if (!PageHighMem(page)) {
1684			xen_mc_batch();
1685
1686			__set_pfn_prot(pfn, PAGE_KERNEL_RO);
1687
1688			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1689				__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1690
1691			xen_mc_issue(PARAVIRT_LAZY_MMU);
1692		} else {
1693			/* make sure there are no stray mappings of
1694			   this page */
1695			kmap_flush_unused();
1696		}
1697	}
1698}
1699
1700static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1701{
1702	xen_alloc_ptpage(mm, pfn, PT_PTE);
1703}
1704
1705static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1706{
1707	xen_alloc_ptpage(mm, pfn, PT_PMD);
1708}
1709
1710/* This should never happen until we're OK to use struct page */
1711static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1712{
1713	struct page *page = pfn_to_page(pfn);
1714	bool pinned = PagePinned(page);
1715
1716	trace_xen_mmu_release_ptpage(pfn, level, pinned);
1717
1718	if (pinned) {
1719		if (!PageHighMem(page)) {
1720			xen_mc_batch();
1721
1722			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1723				__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1724
1725			__set_pfn_prot(pfn, PAGE_KERNEL);
1726
1727			xen_mc_issue(PARAVIRT_LAZY_MMU);
1728		}
1729		ClearPagePinned(page);
1730	}
1731}
1732
1733static void xen_release_pte(unsigned long pfn)
1734{
1735	xen_release_ptpage(pfn, PT_PTE);
1736}
1737
1738static void xen_release_pmd(unsigned long pfn)
1739{
1740	xen_release_ptpage(pfn, PT_PMD);
1741}
1742
1743#if CONFIG_PGTABLE_LEVELS == 4
1744static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1745{
1746	xen_alloc_ptpage(mm, pfn, PT_PUD);
1747}
1748
1749static void xen_release_pud(unsigned long pfn)
1750{
1751	xen_release_ptpage(pfn, PT_PUD);
1752}
1753#endif
1754
1755void __init xen_reserve_top(void)
1756{
1757#ifdef CONFIG_X86_32
1758	unsigned long top = HYPERVISOR_VIRT_START;
1759	struct xen_platform_parameters pp;
1760
1761	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1762		top = pp.virt_start;
1763
1764	reserve_top_address(-top);
1765#endif	/* CONFIG_X86_32 */
1766}
1767
1768/*
1769 * Like __va(), but returns address in the kernel mapping (which is
1770 * all we have until the physical memory mapping has been set up.
1771 */
1772static void * __init __ka(phys_addr_t paddr)
1773{
1774#ifdef CONFIG_X86_64
1775	return (void *)(paddr + __START_KERNEL_map);
1776#else
1777	return __va(paddr);
1778#endif
1779}
1780
1781/* Convert a machine address to physical address */
1782static unsigned long __init m2p(phys_addr_t maddr)
1783{
1784	phys_addr_t paddr;
1785
1786	maddr &= PTE_PFN_MASK;
1787	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1788
1789	return paddr;
1790}
1791
1792/* Convert a machine address to kernel virtual */
1793static void * __init m2v(phys_addr_t maddr)
1794{
1795	return __ka(m2p(maddr));
1796}
1797
1798/* Set the page permissions on an identity-mapped pages */
1799static void __init set_page_prot_flags(void *addr, pgprot_t prot,
1800				       unsigned long flags)
1801{
1802	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1803	pte_t pte = pfn_pte(pfn, prot);
1804
1805	/* For PVH no need to set R/O or R/W to pin them or unpin them. */
1806	if (xen_feature(XENFEAT_auto_translated_physmap))
1807		return;
1808
1809	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1810		BUG();
1811}
1812static void __init set_page_prot(void *addr, pgprot_t prot)
1813{
1814	return set_page_prot_flags(addr, prot, UVMF_NONE);
1815}
1816#ifdef CONFIG_X86_32
1817static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1818{
1819	unsigned pmdidx, pteidx;
1820	unsigned ident_pte;
1821	unsigned long pfn;
1822
1823	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1824				      PAGE_SIZE);
1825
1826	ident_pte = 0;
1827	pfn = 0;
1828	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1829		pte_t *pte_page;
1830
1831		/* Reuse or allocate a page of ptes */
1832		if (pmd_present(pmd[pmdidx]))
1833			pte_page = m2v(pmd[pmdidx].pmd);
1834		else {
1835			/* Check for free pte pages */
1836			if (ident_pte == LEVEL1_IDENT_ENTRIES)
1837				break;
1838
1839			pte_page = &level1_ident_pgt[ident_pte];
1840			ident_pte += PTRS_PER_PTE;
1841
1842			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1843		}
1844
1845		/* Install mappings */
1846		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1847			pte_t pte;
1848
 
1849			if (pfn > max_pfn_mapped)
1850				max_pfn_mapped = pfn;
 
1851
1852			if (!pte_none(pte_page[pteidx]))
1853				continue;
1854
1855			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1856			pte_page[pteidx] = pte;
1857		}
1858	}
1859
1860	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1861		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1862
1863	set_page_prot(pmd, PAGE_KERNEL_RO);
1864}
1865#endif
1866void __init xen_setup_machphys_mapping(void)
1867{
1868	struct xen_machphys_mapping mapping;
1869
1870	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1871		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1872		machine_to_phys_nr = mapping.max_mfn + 1;
1873	} else {
1874		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1875	}
1876#ifdef CONFIG_X86_32
1877	WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1878		< machine_to_phys_mapping);
1879#endif
1880}
1881
1882#ifdef CONFIG_X86_64
1883static void __init convert_pfn_mfn(void *v)
1884{
1885	pte_t *pte = v;
1886	int i;
1887
1888	/* All levels are converted the same way, so just treat them
1889	   as ptes. */
1890	for (i = 0; i < PTRS_PER_PTE; i++)
1891		pte[i] = xen_make_pte(pte[i].pte);
1892}
1893static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1894				 unsigned long addr)
1895{
1896	if (*pt_base == PFN_DOWN(__pa(addr))) {
1897		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1898		clear_page((void *)addr);
1899		(*pt_base)++;
1900	}
1901	if (*pt_end == PFN_DOWN(__pa(addr))) {
1902		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1903		clear_page((void *)addr);
1904		(*pt_end)--;
1905	}
1906}
1907/*
1908 * Set up the initial kernel pagetable.
1909 *
1910 * We can construct this by grafting the Xen provided pagetable into
1911 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1912 * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
1913 * kernel has a physical mapping to start with - but that's enough to
1914 * get __va working.  We need to fill in the rest of the physical
1915 * mapping once some sort of allocator has been set up.  NOTE: for
1916 * PVH, the page tables are native.
1917 */
1918void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 
1919{
1920	pud_t *l3;
1921	pmd_t *l2;
1922	unsigned long addr[3];
1923	unsigned long pt_base, pt_end;
1924	unsigned i;
1925
1926	/* max_pfn_mapped is the last pfn mapped in the initial memory
1927	 * mappings. Considering that on Xen after the kernel mappings we
1928	 * have the mappings of some pages that don't exist in pfn space, we
1929	 * set max_pfn_mapped to the last real pfn mapped. */
1930	if (xen_start_info->mfn_list < __START_KERNEL_map)
1931		max_pfn_mapped = xen_start_info->first_p2m_pfn;
1932	else
1933		max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1934
1935	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1936	pt_end = pt_base + xen_start_info->nr_pt_frames;
1937
1938	/* Zap identity mapping */
1939	init_level4_pgt[0] = __pgd(0);
1940
1941	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1942		/* Pre-constructed entries are in pfn, so convert to mfn */
1943		/* L4[272] -> level3_ident_pgt
1944		 * L4[511] -> level3_kernel_pgt */
1945		convert_pfn_mfn(init_level4_pgt);
1946
1947		/* L3_i[0] -> level2_ident_pgt */
1948		convert_pfn_mfn(level3_ident_pgt);
1949		/* L3_k[510] -> level2_kernel_pgt
1950		 * L3_k[511] -> level2_fixmap_pgt */
1951		convert_pfn_mfn(level3_kernel_pgt);
1952
1953		/* L3_k[511][506] -> level1_fixmap_pgt */
1954		convert_pfn_mfn(level2_fixmap_pgt);
1955	}
1956	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
1957	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1958	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1959
1960	addr[0] = (unsigned long)pgd;
1961	addr[1] = (unsigned long)l3;
1962	addr[2] = (unsigned long)l2;
 1963	/* Graft it onto L4[272][0]. Note that we are creating an aliasing problem:
1964	 * Both L4[272][0] and L4[511][510] have entries that point to the same
1965	 * L2 (PMD) tables. Meaning that if you modify it in __va space
1966	 * it will be also modified in the __ka space! (But if you just
1967	 * modify the PMD table to point to other PTE's or none, then you
1968	 * are OK - which is what cleanup_highmap does) */
1969	copy_page(level2_ident_pgt, l2);
1970	/* Graft it onto L4[511][510] */
1971	copy_page(level2_kernel_pgt, l2);
1972
1973	/* Copy the initial P->M table mappings if necessary. */
1974	i = pgd_index(xen_start_info->mfn_list);
1975	if (i && i < pgd_index(__START_KERNEL_map))
1976		init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1977
1978	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1979		/* Make pagetable pieces RO */
1980		set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1981		set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1982		set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1983		set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1984		set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1985		set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1986		set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1987		set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
1988
1989		/* Pin down new L4 */
1990		pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1991				  PFN_DOWN(__pa_symbol(init_level4_pgt)));
1992
1993		/* Unpin Xen-provided one */
1994		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1995
1996		/*
1997		 * At this stage there can be no user pgd, and no page
1998		 * structure to attach it to, so make sure we just set kernel
1999		 * pgd.
2000		 */
2001		xen_mc_batch();
2002		__xen_write_cr3(true, __pa(init_level4_pgt));
2003		xen_mc_issue(PARAVIRT_LAZY_CPU);
2004	} else
2005		native_write_cr3(__pa(init_level4_pgt));
2006
2007	/* We can't easily rip out L3 and L2, as the Xen pagetables are
2008	 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ...  for
2009	 * the initial domain. For guests using the toolstack, they are in:
2010	 * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
2011	 * rip out the [L4] (pgd), but for guests we shave off three pages.
2012	 */
2013	for (i = 0; i < ARRAY_SIZE(addr); i++)
2014		check_pt_base(&pt_base, &pt_end, addr[i]);
2015
2016	/* Our (by three pages) smaller Xen pagetable that we are using */
2017	xen_pt_base = PFN_PHYS(pt_base);
2018	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
2019	memblock_reserve(xen_pt_base, xen_pt_size);
2020
2021	/* Revector the xen_start_info */
2022	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
2023}
2024
2025/*
2026 * Read a value from a physical address.
2027 */
2028static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
2029{
2030	unsigned long *vaddr;
2031	unsigned long val;
2032
2033	vaddr = early_memremap_ro(addr, sizeof(val));
2034	val = *vaddr;
2035	early_memunmap(vaddr, sizeof(val));
2036	return val;
2037}
2038
2039/*
2040 * Translate a virtual address to a physical one without relying on mapped
2041 * page tables.
2042 */
2043static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2044{
2045	phys_addr_t pa;
2046	pgd_t pgd;
2047	pud_t pud;
2048	pmd_t pmd;
2049	pte_t pte;
2050
2051	pa = read_cr3();
2052	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2053						       sizeof(pgd)));
2054	if (!pgd_present(pgd))
2055		return 0;
2056
2057	pa = pgd_val(pgd) & PTE_PFN_MASK;
2058	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
2059						       sizeof(pud)));
2060	if (!pud_present(pud))
2061		return 0;
2062	pa = pud_pfn(pud) << PAGE_SHIFT;
2063	if (pud_large(pud))
2064		return pa + (vaddr & ~PUD_MASK);
2065
2066	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
2067						       sizeof(pmd)));
2068	if (!pmd_present(pmd))
2069		return 0;
2070	pa = pmd_pfn(pmd) << PAGE_SHIFT;
2071	if (pmd_large(pmd))
2072		return pa + (vaddr & ~PMD_MASK);
2073
2074	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
2075						       sizeof(pte)));
2076	if (!pte_present(pte))
2077		return 0;
2078	pa = pte_pfn(pte) << PAGE_SHIFT;
2079
2080	return pa | (vaddr & ~PAGE_MASK);
2081}
2082
2083/*
2084 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
2085 * this area.
2086 */
2087void __init xen_relocate_p2m(void)
2088{
2089	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
2090	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
2091	int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
2092	pte_t *pt;
2093	pmd_t *pmd;
2094	pud_t *pud;
2095	pgd_t *pgd;
2096	unsigned long *new_p2m;
2097
2098	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
2099	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
2100	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
2101	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
2102	n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
2103	n_frames = n_pte + n_pt + n_pmd + n_pud;
2104
2105	new_area = xen_find_free_area(PFN_PHYS(n_frames));
2106	if (!new_area) {
2107		xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
2108		BUG();
2109	}
2110
2111	/*
2112	 * Setup the page tables for addressing the new p2m list.
2113	 * We have asked the hypervisor to map the p2m list at the user address
2114	 * PUD_SIZE. It may have done so, or it may have used a kernel space
2115	 * address depending on the Xen version.
2116	 * To avoid any possible virtual address collision, just use
2117	 * 2 * PUD_SIZE for the new area.
2118	 */
2119	pud_phys = new_area;
2120	pmd_phys = pud_phys + PFN_PHYS(n_pud);
2121	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2122	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2123
2124	pgd = __va(read_cr3());
2125	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
2126	for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
2127		pud = early_memremap(pud_phys, PAGE_SIZE);
2128		clear_page(pud);
2129		for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
2130		     idx_pmd++) {
2131			pmd = early_memremap(pmd_phys, PAGE_SIZE);
2132			clear_page(pmd);
2133			for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
2134			     idx_pt++) {
2135				pt = early_memremap(pt_phys, PAGE_SIZE);
2136				clear_page(pt);
2137				for (idx_pte = 0;
2138				     idx_pte < min(n_pte, PTRS_PER_PTE);
2139				     idx_pte++) {
2140					set_pte(pt + idx_pte,
2141						pfn_pte(p2m_pfn, PAGE_KERNEL));
2142					p2m_pfn++;
2143				}
2144				n_pte -= PTRS_PER_PTE;
2145				early_memunmap(pt, PAGE_SIZE);
2146				make_lowmem_page_readonly(__va(pt_phys));
2147				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
2148						  PFN_DOWN(pt_phys));
2149				set_pmd(pmd + idx_pt,
2150					__pmd(_PAGE_TABLE | pt_phys));
2151				pt_phys += PAGE_SIZE;
2152			}
2153			n_pt -= PTRS_PER_PMD;
2154			early_memunmap(pmd, PAGE_SIZE);
2155			make_lowmem_page_readonly(__va(pmd_phys));
2156			pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
2157					  PFN_DOWN(pmd_phys));
2158			set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
2159			pmd_phys += PAGE_SIZE;
2160		}
2161		n_pmd -= PTRS_PER_PUD;
2162		early_memunmap(pud, PAGE_SIZE);
2163		make_lowmem_page_readonly(__va(pud_phys));
2164		pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
2165		set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
2166		pud_phys += PAGE_SIZE;
2167	}
2168
2169	/* Now copy the old p2m info to the new area. */
2170	memcpy(new_p2m, xen_p2m_addr, size);
2171	xen_p2m_addr = new_p2m;
2172
2173	/* Release the old p2m list and set new list info. */
2174	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
2175	BUG_ON(!p2m_pfn);
2176	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
2177
2178	if (xen_start_info->mfn_list < __START_KERNEL_map) {
2179		pfn = xen_start_info->first_p2m_pfn;
2180		pfn_end = xen_start_info->first_p2m_pfn +
2181			  xen_start_info->nr_p2m_frames;
2182		set_pgd(pgd + 1, __pgd(0));
2183	} else {
2184		pfn = p2m_pfn;
2185		pfn_end = p2m_pfn_end;
2186	}
2187
2188	memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
2189	while (pfn < pfn_end) {
2190		if (pfn == p2m_pfn) {
2191			pfn = p2m_pfn_end;
2192			continue;
2193		}
2194		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
2195		pfn++;
2196	}
2197
2198	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
2199	xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
2200	xen_start_info->nr_p2m_frames = n_frames;
2201}
2202
2203#else	/* !CONFIG_X86_64 */
2204static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
2205static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2206
2207static void __init xen_write_cr3_init(unsigned long cr3)
2208{
2209	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2210
2211	BUG_ON(read_cr3() != __pa(initial_page_table));
2212	BUG_ON(cr3 != __pa(swapper_pg_dir));
2213
2214	/*
2215	 * We are switching to swapper_pg_dir for the first time (from
2216	 * initial_page_table) and therefore need to mark that page
2217	 * read-only and then pin it.
2218	 *
2219	 * Xen disallows sharing of kernel PMDs for PAE
2220	 * guests. Therefore we must copy the kernel PMD from
2221	 * initial_page_table into a new kernel PMD to be used in
2222	 * swapper_pg_dir.
2223	 */
2224	swapper_kernel_pmd =
2225		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2226	copy_page(swapper_kernel_pmd, initial_kernel_pmd);
 
2227	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
2228		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
2229	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
2230
2231	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2232	xen_write_cr3(cr3);
2233	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
2234
2235	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
2236			  PFN_DOWN(__pa(initial_page_table)));
2237	set_page_prot(initial_page_table, PAGE_KERNEL);
2238	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
2239
2240	pv_mmu_ops.write_cr3 = &xen_write_cr3;
2241}
2242
2243/*
2244 * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
2245 * not the first page table in the page table pool.
2246 * Iterate through the initial page tables to find the real page table base.
2247 */
2248static phys_addr_t xen_find_pt_base(pmd_t *pmd)
2249{
2250	phys_addr_t pt_base, paddr;
2251	unsigned pmdidx;
2252
2253	pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
2254
2255	for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
2256		if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
2257			paddr = m2p(pmd[pmdidx].pmd);
2258			pt_base = min(pt_base, paddr);
2259		}
2260
2261	return pt_base;
2262}
2263
2264void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
2265{
2266	pmd_t *kernel_pmd;
2267
2268	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2269
2270	xen_pt_base = xen_find_pt_base(kernel_pmd);
2271	xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
2272
2273	initial_kernel_pmd =
2274		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2275
2276	max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
 
 
2277
2278	copy_page(initial_kernel_pmd, kernel_pmd);
 
2279
2280	xen_map_identity_early(initial_kernel_pmd, max_pfn);
2281
2282	copy_page(initial_page_table, pgd);
2283	initial_page_table[KERNEL_PGD_BOUNDARY] =
2284		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2285
2286	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2287	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
2288	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2289
2290	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2291
2292	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2293			  PFN_DOWN(__pa(initial_page_table)));
2294	xen_write_cr3(__pa(initial_page_table));
2295
2296	memblock_reserve(xen_pt_base, xen_pt_size);
 
 
 
2297}
2298#endif	/* CONFIG_X86_64 */
2299
2300void __init xen_reserve_special_pages(void)
2301{
2302	phys_addr_t paddr;
2303
2304	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
2305	if (xen_start_info->store_mfn) {
2306		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
2307		memblock_reserve(paddr, PAGE_SIZE);
2308	}
2309	if (!xen_initial_domain()) {
2310		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
2311		memblock_reserve(paddr, PAGE_SIZE);
2312	}
2313}
2314
2315void __init xen_pt_check_e820(void)
2316{
2317	if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
2318		xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
2319		BUG();
2320	}
2321}
2322
2323static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2324
2325static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2326{
2327	pte_t pte;
2328
2329	phys >>= PAGE_SHIFT;
2330
2331	switch (idx) {
2332	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
2333	case FIX_RO_IDT:
 
 
2334#ifdef CONFIG_X86_32
2335	case FIX_WP_TEST:
 
2336# ifdef CONFIG_HIGHMEM
2337	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2338# endif
2339#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
2340	case VSYSCALL_PAGE:
 
2341#endif
2342	case FIX_TEXT_POKE0:
2343	case FIX_TEXT_POKE1:
2344		/* All local page mappings */
2345		pte = pfn_pte(phys, prot);
2346		break;
2347
2348#ifdef CONFIG_X86_LOCAL_APIC
2349	case FIX_APIC_BASE:	/* maps dummy local APIC */
2350		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2351		break;
2352#endif
2353
2354#ifdef CONFIG_X86_IO_APIC
2355	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2356		/*
2357		 * We just don't map the IO APIC - all access is via
2358		 * hypercalls.  Keep the address in the pte for reference.
2359		 */
2360		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2361		break;
2362#endif
2363
2364	case FIX_PARAVIRT_BOOTMAP:
2365		/* This is an MFN, but it isn't an IO mapping from the
2366		   IO domain */
2367		pte = mfn_pte(phys, prot);
2368		break;
2369
2370	default:
2371		/* By default, set_fixmap is used for hardware mappings */
2372		pte = mfn_pte(phys, prot);
2373		break;
2374	}
2375
2376	__native_set_fixmap(idx, pte);
2377
2378#ifdef CONFIG_X86_VSYSCALL_EMULATION
2379	/* Replicate changes to map the vsyscall page into the user
2380	   pagetable vsyscall mapping. */
2381	if (idx == VSYSCALL_PAGE) {
 
2382		unsigned long vaddr = __fix_to_virt(idx);
2383		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2384	}
2385#endif
2386}
2387
2388static void __init xen_post_allocator_init(void)
2389{
2390	if (xen_feature(XENFEAT_auto_translated_physmap))
2391		return;
2392
2393	pv_mmu_ops.set_pte = xen_set_pte;
2394	pv_mmu_ops.set_pmd = xen_set_pmd;
2395	pv_mmu_ops.set_pud = xen_set_pud;
2396#if CONFIG_PGTABLE_LEVELS == 4
2397	pv_mmu_ops.set_pgd = xen_set_pgd;
2398#endif
2399
2400	/* This will work as long as patching hasn't happened yet
2401	   (which it hasn't) */
2402	pv_mmu_ops.alloc_pte = xen_alloc_pte;
2403	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2404	pv_mmu_ops.release_pte = xen_release_pte;
2405	pv_mmu_ops.release_pmd = xen_release_pmd;
2406#if CONFIG_PGTABLE_LEVELS == 4
2407	pv_mmu_ops.alloc_pud = xen_alloc_pud;
2408	pv_mmu_ops.release_pud = xen_release_pud;
2409#endif
2410
2411#ifdef CONFIG_X86_64
2412	pv_mmu_ops.write_cr3 = &xen_write_cr3;
2413	SetPagePinned(virt_to_page(level3_user_vsyscall));
2414#endif
2415	xen_mark_init_mm_pinned();
2416}
2417
/*
 * lazy_mode.leave handler: flush the pending multicalls with xen_mc_flush()
 * and then leave lazy MMU mode, all with preemption disabled.
 */
static void xen_leave_lazy_mmu(void)
{
	preempt_disable();

	/* Flush first, then drop out of lazy mode. */
	xen_mc_flush();
	paravirt_leave_lazy_mmu();

	preempt_enable();
}
2425
2426static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2427	.read_cr2 = xen_read_cr2,
2428	.write_cr2 = xen_write_cr2,
2429
2430	.read_cr3 = xen_read_cr3,
 
2431	.write_cr3 = xen_write_cr3_init,
 
 
 
2432
2433	.flush_tlb_user = xen_flush_tlb,
2434	.flush_tlb_kernel = xen_flush_tlb,
2435	.flush_tlb_single = xen_flush_tlb_single,
2436	.flush_tlb_others = xen_flush_tlb_others,
2437
2438	.pte_update = paravirt_nop,
 
2439
2440	.pgd_alloc = xen_pgd_alloc,
2441	.pgd_free = xen_pgd_free,
2442
2443	.alloc_pte = xen_alloc_pte_init,
2444	.release_pte = xen_release_pte_init,
2445	.alloc_pmd = xen_alloc_pmd_init,
2446	.release_pmd = xen_release_pmd_init,
2447
2448	.set_pte = xen_set_pte_init,
2449	.set_pte_at = xen_set_pte_at,
2450	.set_pmd = xen_set_pmd_hyper,
2451
2452	.ptep_modify_prot_start = __ptep_modify_prot_start,
2453	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
2454
2455	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
2456	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2457
2458	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
2459	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2460
2461#ifdef CONFIG_X86_PAE
2462	.set_pte_atomic = xen_set_pte_atomic,
2463	.pte_clear = xen_pte_clear,
2464	.pmd_clear = xen_pmd_clear,
2465#endif	/* CONFIG_X86_PAE */
2466	.set_pud = xen_set_pud_hyper,
2467
2468	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2469	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2470
2471#if CONFIG_PGTABLE_LEVELS == 4
2472	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
2473	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
2474	.set_pgd = xen_set_pgd_hyper,
2475
2476	.alloc_pud = xen_alloc_pmd_init,
2477	.release_pud = xen_release_pmd_init,
2478#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
2479
2480	.activate_mm = xen_activate_mm,
2481	.dup_mmap = xen_dup_mmap,
2482	.exit_mmap = xen_exit_mmap,
2483
2484	.lazy_mode = {
2485		.enter = paravirt_enter_lazy_mmu,
2486		.leave = xen_leave_lazy_mmu,
2487		.flush = paravirt_flush_lazy_mmu,
2488	},
2489
2490	.set_fixmap = xen_set_fixmap,
2491};
2492
2493void __init xen_init_mmu_ops(void)
2494{
2495	x86_init.paging.pagetable_init = xen_pagetable_init;
2496
2497	if (xen_feature(XENFEAT_auto_translated_physmap))
2498		return;
2499
2500	pv_mmu_ops = xen_mmu_ops;
2501
2502	memset(dummy_mapping, 0xff, PAGE_SIZE);
2503}
2504
2505/* Protected by xen_reservation_lock. */
2506#define MAX_CONTIG_ORDER 9 /* 2MB */
2507static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2508
2509#define VOID_PTE (mfn_pte(0, __pgprot(0)))
2510static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2511				unsigned long *in_frames,
2512				unsigned long *out_frames)
2513{
2514	int i;
2515	struct multicall_space mcs;
2516
2517	xen_mc_batch();
2518	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2519		mcs = __xen_mc_entry(0);
2520
2521		if (in_frames)
2522			in_frames[i] = virt_to_mfn(vaddr);
2523
2524		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2525		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2526
2527		if (out_frames)
2528			out_frames[i] = virt_to_pfn(vaddr);
2529	}
2530	xen_mc_issue(0);
2531}
2532
2533/*
2534 * Update the pfn-to-mfn mappings for a virtual address range, either to
2535 * point to an array of mfns, or contiguously from a single starting
2536 * mfn.
2537 */
2538static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2539				     unsigned long *mfns,
2540				     unsigned long first_mfn)
2541{
2542	unsigned i, limit;
2543	unsigned long mfn;
2544
2545	xen_mc_batch();
2546
2547	limit = 1u << order;
2548	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2549		struct multicall_space mcs;
2550		unsigned flags;
2551
2552		mcs = __xen_mc_entry(0);
2553		if (mfns)
2554			mfn = mfns[i];
2555		else
2556			mfn = first_mfn + i;
2557
2558		if (i < (limit - 1))
2559			flags = 0;
2560		else {
2561			if (order == 0)
2562				flags = UVMF_INVLPG | UVMF_ALL;
2563			else
2564				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2565		}
2566
2567		MULTI_update_va_mapping(mcs.mc, vaddr,
2568				mfn_pte(mfn, PAGE_KERNEL), flags);
2569
2570		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2571	}
2572
2573	xen_mc_issue(0);
2574}
2575
2576/*
2577 * Perform the hypercall to exchange a region of our pfns to point to
2578 * memory with the required contiguous alignment.  Takes the pfns as
2579 * input, and populates mfns as output.
2580 *
2581 * Returns a success code indicating whether the hypervisor was able to
2582 * satisfy the request or not.
2583 */
2584static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2585			       unsigned long *pfns_in,
2586			       unsigned long extents_out,
2587			       unsigned int order_out,
2588			       unsigned long *mfns_out,
2589			       unsigned int address_bits)
2590{
2591	long rc;
2592	int success;
2593
2594	struct xen_memory_exchange exchange = {
2595		.in = {
2596			.nr_extents   = extents_in,
2597			.extent_order = order_in,
2598			.extent_start = pfns_in,
2599			.domid        = DOMID_SELF
2600		},
2601		.out = {
2602			.nr_extents   = extents_out,
2603			.extent_order = order_out,
2604			.extent_start = mfns_out,
2605			.address_bits = address_bits,
2606			.domid        = DOMID_SELF
2607		}
2608	};
2609
2610	BUG_ON(extents_in << order_in != extents_out << order_out);
2611
2612	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2613	success = (exchange.nr_exchanged == extents_in);
2614
2615	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2616	BUG_ON(success && (rc != 0));
2617
2618	return success;
2619}
2620
2621int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2622				 unsigned int address_bits,
2623				 dma_addr_t *dma_handle)
2624{
2625	unsigned long *in_frames = discontig_frames, out_frame;
2626	unsigned long  flags;
2627	int            success;
2628	unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2629
2630	/*
2631	 * Currently an auto-translated guest will not perform I/O, nor will
2632	 * it require PAE page directories below 4GB. Therefore any calls to
2633	 * this function are redundant and can be ignored.
2634	 */
2635
2636	if (xen_feature(XENFEAT_auto_translated_physmap))
2637		return 0;
2638
2639	if (unlikely(order > MAX_CONTIG_ORDER))
2640		return -ENOMEM;
2641
2642	memset((void *) vstart, 0, PAGE_SIZE << order);
2643
2644	spin_lock_irqsave(&xen_reservation_lock, flags);
2645
2646	/* 1. Zap current PTEs, remembering MFNs. */
2647	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2648
2649	/* 2. Get a new contiguous memory extent. */
2650	out_frame = virt_to_pfn(vstart);
2651	success = xen_exchange_memory(1UL << order, 0, in_frames,
2652				      1, order, &out_frame,
2653				      address_bits);
2654
2655	/* 3. Map the new extent in place of old pages. */
2656	if (success)
2657		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2658	else
2659		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2660
2661	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2662
2663	*dma_handle = virt_to_machine(vstart).maddr;
2664	return success ? 0 : -ENOMEM;
2665}
2666EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2667
2668void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
2669{
2670	unsigned long *out_frames = discontig_frames, in_frame;
2671	unsigned long  flags;
2672	int success;
2673	unsigned long vstart;
2674
2675	if (xen_feature(XENFEAT_auto_translated_physmap))
2676		return;
2677
2678	if (unlikely(order > MAX_CONTIG_ORDER))
2679		return;
2680
2681	vstart = (unsigned long)phys_to_virt(pstart);
2682	memset((void *) vstart, 0, PAGE_SIZE << order);
2683
2684	spin_lock_irqsave(&xen_reservation_lock, flags);
2685
2686	/* 1. Find start MFN of contiguous extent. */
2687	in_frame = virt_to_mfn(vstart);
2688
2689	/* 2. Zap current PTEs. */
2690	xen_zap_pfn_range(vstart, order, NULL, out_frames);
2691
2692	/* 3. Do the exchange for non-contiguous MFNs. */
2693	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2694					0, out_frames, 0);
2695
2696	/* 4. Map new pages in place of old pages. */
2697	if (success)
2698		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2699	else
2700		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2701
2702	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2703}
2704EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2705
2706#ifdef CONFIG_XEN_PVHVM
2707#ifdef CONFIG_PROC_VMCORE
2708/*
2709 * This function is used in two contexts:
2710 * - the kdump kernel has to check whether a pfn of the crashed kernel
2711 *   was a ballooned page. vmcore is using this function to decide
2712 *   whether to access a pfn of the crashed kernel.
2713 * - the kexec kernel has to check whether a pfn was ballooned by the
2714 *   previous kernel. If the pfn is ballooned, handle it properly.
2715 * Returns 0 if the pfn is not backed by a RAM page, the caller may
2716 * handle the pfn special in this case.
2717 */
2718static int xen_oldmem_pfn_is_ram(unsigned long pfn)
2719{
2720	struct xen_hvm_get_mem_type a = {
2721		.domid = DOMID_SELF,
2722		.pfn = pfn,
2723	};
2724	int ram;
2725
2726	if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
2727		return -ENXIO;
2728
2729	switch (a.mem_type) {
2730		case HVMMEM_mmio_dm:
2731			ram = 0;
2732			break;
2733		case HVMMEM_ram_rw:
2734		case HVMMEM_ram_ro:
2735		default:
2736			ram = 1;
2737			break;
2738	}
2739
2740	return ram;
2741}
2742#endif
2743
2744static void xen_hvm_exit_mmap(struct mm_struct *mm)
2745{
2746	struct xen_hvm_pagetable_dying a;
2747	int rc;
2748
2749	a.domid = DOMID_SELF;
2750	a.gpa = __pa(mm->pgd);
2751	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2752	WARN_ON_ONCE(rc < 0);
2753}
2754
2755static int is_pagetable_dying_supported(void)
2756{
2757	struct xen_hvm_pagetable_dying a;
2758	int rc = 0;
2759
2760	a.domid = DOMID_SELF;
2761	a.gpa = 0x00;
2762	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2763	if (rc < 0) {
2764		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2765		return 0;
2766	}
2767	return 1;
2768}
2769
2770void __init xen_hvm_init_mmu_ops(void)
2771{
2772	if (is_pagetable_dying_supported())
2773		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2774#ifdef CONFIG_PROC_VMCORE
2775	register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
2776#endif
2777}
2778#endif
2779
2780#define REMAP_BATCH_SIZE 16
2781
2782struct remap_data {
2783	xen_pfn_t *mfn;
2784	bool contiguous;
2785	pgprot_t prot;
2786	struct mmu_update *mmu_update;
2787};
2788
2789static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2790				 unsigned long addr, void *data)
2791{
2792	struct remap_data *rmd = data;
2793	pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
2794
2795	/* If we have a contiguous range, just update the mfn itself,
2796	   else update pointer to be "next mfn". */
2797	if (rmd->contiguous)
2798		(*rmd->mfn)++;
2799	else
2800		rmd->mfn++;
2801
2802	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2803	rmd->mmu_update->val = pte_val_ma(pte);
2804	rmd->mmu_update++;
2805
2806	return 0;
2807}
2808
2809static int do_remap_gfn(struct vm_area_struct *vma,
2810			unsigned long addr,
2811			xen_pfn_t *gfn, int nr,
2812			int *err_ptr, pgprot_t prot,
2813			unsigned domid,
2814			struct page **pages)
2815{
2816	int err = 0;
2817	struct remap_data rmd;
2818	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
 
2819	unsigned long range;
2820	int mapped = 0;
2821
2822	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2823
2824	if (xen_feature(XENFEAT_auto_translated_physmap)) {
2825#ifdef CONFIG_XEN_PVH
2826		/* We need to update the local page tables and the xen HAP */
2827		return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
2828						 prot, domid, pages);
2829#else
2830		return -EINVAL;
2831#endif
2832        }
2833
2834	rmd.mfn = gfn;
2835	rmd.prot = prot;
2836	/* We use the err_ptr to indicate if there we are doing a contiguous
2837	 * mapping or a discontigious mapping. */
2838	rmd.contiguous = !err_ptr;
2839
2840	while (nr) {
2841		int index = 0;
2842		int done = 0;
2843		int batch = min(REMAP_BATCH_SIZE, nr);
2844		int batch_left = batch;
2845		range = (unsigned long)batch << PAGE_SHIFT;
2846
2847		rmd.mmu_update = mmu_update;
2848		err = apply_to_page_range(vma->vm_mm, addr, range,
2849					  remap_area_mfn_pte_fn, &rmd);
2850		if (err)
2851			goto out;
2852
2853		/* We record the error for each page that gives an error, but
2854		 * continue mapping until the whole set is done */
2855		do {
2856			int i;
2857
2858			err = HYPERVISOR_mmu_update(&mmu_update[index],
2859						    batch_left, &done, domid);
2860
2861			/*
2862			 * @err_ptr may be the same buffer as @gfn, so
2863			 * only clear it after each chunk of @gfn is
2864			 * used.
2865			 */
2866			if (err_ptr) {
2867				for (i = index; i < index + done; i++)
2868					err_ptr[i] = 0;
2869			}
2870			if (err < 0) {
2871				if (!err_ptr)
2872					goto out;
2873				err_ptr[i] = err;
2874				done++; /* Skip failed frame. */
2875			} else
2876				mapped += done;
2877			batch_left -= done;
2878			index += done;
2879		} while (batch_left);
2880
2881		nr -= batch;
2882		addr += range;
2883		if (err_ptr)
2884			err_ptr += batch;
2885		cond_resched();
2886	}
 
 
2887out:
2888
2889	xen_flush_tlb_all();
2890
2891	return err < 0 ? err : mapped;
2892}
2893
2894int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
2895			       unsigned long addr,
2896			       xen_pfn_t gfn, int nr,
2897			       pgprot_t prot, unsigned domid,
2898			       struct page **pages)
2899{
2900	return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
2901}
2902EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
2903
2904int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
2905			       unsigned long addr,
2906			       xen_pfn_t *gfn, int nr,
2907			       int *err_ptr, pgprot_t prot,
2908			       unsigned domid, struct page **pages)
2909{
2910	/* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
2911	 * and the consequences later is quite hard to detect what the actual
2912	 * cause of "wrong memory was mapped in".
2913	 */
2914	BUG_ON(err_ptr == NULL);
2915	return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages);
2916}
2917EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
2918
2919
2920/* Returns: 0 success */
2921int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
2922			       int numpgs, struct page **pages)
2923{
2924	if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
2925		return 0;
2926
2927#ifdef CONFIG_XEN_PVH
2928	return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
2929#else
2930	return -EINVAL;
2931#endif
2932}
2933EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);