v4.10.11
   1/*
   2 * Handle caching attributes in page tables (PAT)
   3 *
   4 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
   5 *          Suresh B Siddha <suresh.b.siddha@intel.com>
   6 *
   7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
   8 */
   9
  10#include <linux/seq_file.h>
  11#include <linux/bootmem.h>
  12#include <linux/debugfs.h>
  13#include <linux/kernel.h>
  14#include <linux/pfn_t.h>
  15#include <linux/slab.h>
  16#include <linux/mm.h>
  17#include <linux/fs.h>
  18#include <linux/rbtree.h>
  19
  20#include <asm/cacheflush.h>
  21#include <asm/processor.h>
  22#include <asm/tlbflush.h>
  23#include <asm/x86_init.h>
  24#include <asm/pgtable.h>
  25#include <asm/fcntl.h>
  26#include <asm/e820.h>
  27#include <asm/mtrr.h>
  28#include <asm/page.h>
  29#include <asm/msr.h>
  30#include <asm/pat.h>
  31#include <asm/io.h>
  32
  33#include "pat_internal.h"
  34#include "mm_internal.h"
  35
  36#undef pr_fmt
  37#define pr_fmt(fmt) "" fmt
  38
  39static bool boot_cpu_done;
  40
  41static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);
  42static void init_cache_modes(void);
  43
  44void pat_disable(const char *reason)
  45{
  46	if (!__pat_enabled)
  47		return;
  48
  49	if (boot_cpu_done) {
  50		WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
  51		return;
  52	}
  53
  54	__pat_enabled = 0;
  55	pr_info("x86/PAT: %s\n", reason);
  56
  57	init_cache_modes();
  58}
  59
  60static int __init nopat(char *str)
  61{
  62	pat_disable("PAT support disabled.");
  63	return 0;
  64}
  65early_param("nopat", nopat);
  66
  67bool pat_enabled(void)
  68{
  69	return !!__pat_enabled;
  70}
  71EXPORT_SYMBOL_GPL(pat_enabled);
  72
  73int pat_debug_enable;
  74
  75static int __init pat_debug_setup(char *str)
  76{
  77	pat_debug_enable = 1;
  78	return 0;
  79}
  80__setup("debugpat", pat_debug_setup);
  81
  82#ifdef CONFIG_X86_PAT
  83/*
  84 * X86 PAT uses page flags arch_1 and uncached together to keep track of
  85 * memory type of pages that have backing page struct.
  86 *
  87 * X86 PAT supports 4 different memory types:
  88 *  - _PAGE_CACHE_MODE_WB
  89 *  - _PAGE_CACHE_MODE_WC
  90 *  - _PAGE_CACHE_MODE_UC_MINUS
  91 *  - _PAGE_CACHE_MODE_WT
  92 *
  93 * _PAGE_CACHE_MODE_WB is the default type.
  94 */
  95
  96#define _PGMT_WB		0
  97#define _PGMT_WC		(1UL << PG_arch_1)
  98#define _PGMT_UC_MINUS		(1UL << PG_uncached)
  99#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
 100#define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
 101#define _PGMT_CLEAR_MASK	(~_PGMT_MASK)
 102
 103static inline enum page_cache_mode get_page_memtype(struct page *pg)
 104{
 105	unsigned long pg_flags = pg->flags & _PGMT_MASK;
 106
 107	if (pg_flags == _PGMT_WB)
 108		return _PAGE_CACHE_MODE_WB;
 109	else if (pg_flags == _PGMT_WC)
 110		return _PAGE_CACHE_MODE_WC;
 111	else if (pg_flags == _PGMT_UC_MINUS)
 112		return _PAGE_CACHE_MODE_UC_MINUS;
 113	else
 114		return _PAGE_CACHE_MODE_WT;
 115}
 116
 117static inline void set_page_memtype(struct page *pg,
 118				    enum page_cache_mode memtype)
 119{
 120	unsigned long memtype_flags;
 121	unsigned long old_flags;
 122	unsigned long new_flags;
 123
 124	switch (memtype) {
 125	case _PAGE_CACHE_MODE_WC:
 126		memtype_flags = _PGMT_WC;
 127		break;
 128	case _PAGE_CACHE_MODE_UC_MINUS:
 129		memtype_flags = _PGMT_UC_MINUS;
 130		break;
 131	case _PAGE_CACHE_MODE_WT:
 132		memtype_flags = _PGMT_WT;
 133		break;
 134	case _PAGE_CACHE_MODE_WB:
 135	default:
 136		memtype_flags = _PGMT_WB;
 137		break;
 138	}
 139
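	/*
	 * Editor's note: lock-free read-modify-write of page->flags.
	 * cmpxchg() only commits new_flags if no other CPU changed the
	 * flags since old_flags was sampled; otherwise the loop re-reads
	 * and retries.
	 */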
 140	do {
 141		old_flags = pg->flags;
 142		new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags;
 143	} while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags);
 144}
 145#else
 146static inline enum page_cache_mode get_page_memtype(struct page *pg)
 147{
 148	return -1;
 149}
 150static inline void set_page_memtype(struct page *pg,
 151				    enum page_cache_mode memtype)
 152{
 153}
 154#endif
 155
 156enum {
 157	PAT_UC = 0,		/* uncached */
 158	PAT_WC = 1,		/* Write combining */
 159	PAT_WT = 4,		/* Write Through */
 160	PAT_WP = 5,		/* Write Protected */
 161	PAT_WB = 6,		/* Write Back (default) */
 162	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
 163};
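/*
 * Editor's note: these values are the architectural memory-type encodings
 * held in the eight 8-bit fields of the IA32_PAT MSR (see the Intel SDM
 * chapter on the Page Attribute Table); encodings 2 and 3 are reserved,
 * which is why they are absent here.
 */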
 164
 165#define CM(c) (_PAGE_CACHE_MODE_ ## c)
 166
 167static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
 168{
 169	enum page_cache_mode cache;
 170	char *cache_mode;
 171
 172	switch (pat_val) {
 173	case PAT_UC:       cache = CM(UC);       cache_mode = "UC  "; break;
 174	case PAT_WC:       cache = CM(WC);       cache_mode = "WC  "; break;
 175	case PAT_WT:       cache = CM(WT);       cache_mode = "WT  "; break;
 176	case PAT_WP:       cache = CM(WP);       cache_mode = "WP  "; break;
 177	case PAT_WB:       cache = CM(WB);       cache_mode = "WB  "; break;
 178	case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
 179	default:           cache = CM(WB);       cache_mode = "WB  "; break;
 180	}
 181
 182	memcpy(msg, cache_mode, 4);
 183
 184	return cache;
 185}
 186
 187#undef CM
 188
 189/*
 190 * Update the cache mode to pgprot translation tables according to PAT
 191 * configuration.
 192 * Using lower indices is preferred, so we start with highest index.
 193 */
 194static void __init_cache_modes(u64 pat)
 195{
 196	enum page_cache_mode cache;
 197	char pat_msg[33];
 198	int i;
 199
 200	pat_msg[32] = 0;
 201	for (i = 7; i >= 0; i--) {
 202		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
 203					   pat_msg + 4 * i);
 204		update_cache_mode_entry(i, cache);
 205	}
 206	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
 207}
 208
 209#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
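/*
 * Editor's illustration: PAT(1, WC) expands to (u64)PAT_WC << 8, i.e. it
 * places the WC encoding (0x01) into PAT entry 1, bits 15:8 of the MSR
 * value assembled below.
 */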
 210
 211static void pat_bsp_init(u64 pat)
 212{
 213	u64 tmp_pat;
 214
 215	if (!boot_cpu_has(X86_FEATURE_PAT)) {
 216		pat_disable("PAT not supported by CPU.");
 217		return;
 218	}
 219
 220	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
 221	if (!tmp_pat) {
 222		pat_disable("PAT MSR is 0, disabled.");
 223		return;
 224	}
 225
 226	wrmsrl(MSR_IA32_CR_PAT, pat);
 227
 228	__init_cache_modes(pat);
 229}
 230
 231static void pat_ap_init(u64 pat)
 232{
 233	if (!boot_cpu_has(X86_FEATURE_PAT)) {
 234		/*
 235		 * If this happens we are on a secondary CPU, but switched to
 236		 * PAT on the boot CPU. We have no way to undo PAT.
 237		 */
 238		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
 239	}
 240
 241	wrmsrl(MSR_IA32_CR_PAT, pat);
 242}
 243
 244static void init_cache_modes(void)
 245{
 246	u64 pat = 0;
 247	static int init_cm_done;
 248
 249	if (init_cm_done)
 250		return;
 251
 252	if (boot_cpu_has(X86_FEATURE_PAT)) {
 253		/*
 254		 * CPU supports PAT. Set PAT table to be consistent with
 255		 * PAT MSR. This case supports "nopat" boot option, and
 256		 * virtual machine environments which support PAT without
  257		 * MTRRs. Specifically, Xen has a unique setup for the PAT MSR.
 258		 *
 259		 * If PAT MSR returns 0, it is considered invalid and emulates
 260		 * as No PAT.
 261		 */
 262		rdmsrl(MSR_IA32_CR_PAT, pat);
 263	}
 264
 265	if (!pat) {
 266		/*
 267		 * No PAT. Emulate the PAT table that corresponds to the two
 268		 * cache bits, PWT (Write Through) and PCD (Cache Disable).
 269		 * This setup is also the same as the BIOS default setup.
 270		 *
 271		 * PTE encoding:
 272		 *
 273		 *       PCD
 274		 *       |PWT  PAT
 275		 *       ||    slot
 276		 *       00    0    WB : _PAGE_CACHE_MODE_WB
 277		 *       01    1    WT : _PAGE_CACHE_MODE_WT
 278		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
 279		 *       11    3    UC : _PAGE_CACHE_MODE_UC
 280		 *
 281		 * NOTE: When WC or WP is used, it is redirected to UC- per
 282		 * the default setup in __cachemode2pte_tbl[].
 283		 */
 284		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
 285		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
 286	}
 287
 288	__init_cache_modes(pat);
 289
 290	init_cm_done = 1;
 291}
 292
 293/**
 294 * pat_init - Initialize PAT MSR and PAT table
 295 *
 296 * This function initializes PAT MSR and PAT table with an OS-defined value
 297 * to enable additional cache attributes, WC and WT.
 298 *
 299 * This function must be called on all CPUs using the specific sequence of
 300 * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this
 301 * procedure for PAT.
 302 */
 303void pat_init(void)
 304{
 305	u64 pat;
 306	struct cpuinfo_x86 *c = &boot_cpu_data;
 307
 308	if (!pat_enabled()) {
 309		init_cache_modes();
 310		return;
 311	}
 312
 313	if ((c->x86_vendor == X86_VENDOR_INTEL) &&
 314	    (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
 315	     ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
 316		/*
 317		 * PAT support with the lower four entries. Intel Pentium 2,
 318		 * 3, M, and 4 are affected by PAT errata, which makes the
 319		 * upper four entries unusable. To be on the safe side, we don't
 320		 * use those.
 321		 *
 322		 *  PTE encoding:
 323		 *      PAT
 324		 *      |PCD
 325		 *      ||PWT  PAT
 326		 *      |||    slot
 327		 *      000    0    WB : _PAGE_CACHE_MODE_WB
 328		 *      001    1    WC : _PAGE_CACHE_MODE_WC
 329		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
 330		 *      011    3    UC : _PAGE_CACHE_MODE_UC
 331		 * PAT bit unused
 332		 *
 333		 * NOTE: When WT or WP is used, it is redirected to UC- per
 334		 * the default setup in __cachemode2pte_tbl[].
 335		 */
 336		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
 337		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
 338	} else {
 339		/*
 340		 * Full PAT support.  We put WT in slot 7 to improve
 341		 * robustness in the presence of errata that might cause
 342		 * the high PAT bit to be ignored.  This way, a buggy slot 7
 343		 * access will hit slot 3, and slot 3 is UC, so at worst
 344		 * we lose performance without causing a correctness issue.
  345		 * Pentium 4 erratum N46 is an example of such an erratum,
 346		 * although we try not to use PAT at all on affected CPUs.
 347		 *
 348		 *  PTE encoding:
 349		 *      PAT
 350		 *      |PCD
 351		 *      ||PWT  PAT
 352		 *      |||    slot
 353		 *      000    0    WB : _PAGE_CACHE_MODE_WB
 354		 *      001    1    WC : _PAGE_CACHE_MODE_WC
 355		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
 356		 *      011    3    UC : _PAGE_CACHE_MODE_UC
 357		 *      100    4    WB : Reserved
 358		 *      101    5    WC : Reserved
 359		 *      110    6    UC-: Reserved
 360		 *      111    7    WT : _PAGE_CACHE_MODE_WT
 361		 *
 362		 * The reserved slots are unused, but mapped to their
 363		 * corresponding types in the presence of PAT errata.
 364		 */
 365		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
 366		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
 367	}
 368
 369	if (!boot_cpu_done) {
 370		pat_bsp_init(pat);
 371		boot_cpu_done = true;
 372	} else {
 373		pat_ap_init(pat);
 374	}
 375}
 376
 377#undef PAT
 378
 379static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */
 380
 381/*
 382 * Does intersection of PAT memory type and MTRR memory type and returns
 383 * the resulting memory type as PAT understands it.
  384 * (The types in pat and mtrr will not have the same value.)
 385 * The intersection is based on "Effective Memory Type" tables in IA-32
 386 * SDM vol 3a
 387 */
 388static unsigned long pat_x_mtrr_type(u64 start, u64 end,
 389				     enum page_cache_mode req_type)
 390{
 391	/*
 392	 * Look for MTRR hint to get the effective type in case where PAT
 393	 * request is for WB.
 394	 */
 395	if (req_type == _PAGE_CACHE_MODE_WB) {
 396		u8 mtrr_type, uniform;
 397
 398		mtrr_type = mtrr_type_lookup(start, end, &uniform);
 399		if (mtrr_type != MTRR_TYPE_WRBACK)
 400			return _PAGE_CACHE_MODE_UC_MINUS;
 401
 402		return _PAGE_CACHE_MODE_WB;
 403	}
 404
 405	return req_type;
 406}
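/*
 * Editor's note: for example, a WB request over a range that the MTRRs do
 * not mark as write-back is downgraded to UC-, so the effective type stays
 * uncached; any request other than WB is passed through unchanged.
 */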
 407
 408struct pagerange_state {
 409	unsigned long		cur_pfn;
 410	int			ram;
 411	int			not_ram;
 412};
 413
 414static int
 415pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
 416{
 417	struct pagerange_state *state = arg;
 418
 419	state->not_ram	|= initial_pfn > state->cur_pfn;
 420	state->ram	|= total_nr_pages > 0;
 421	state->cur_pfn	 = initial_pfn + total_nr_pages;
 422
 423	return state->ram && state->not_ram;
 424}
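/*
 * Editor's note: walk_system_ram_range() invokes this callback once per
 * contiguous RAM chunk. A chunk that starts past cur_pfn implies a non-RAM
 * gap, so the range is mixed; returning non-zero stops the walk early and
 * the caller turns that into -1 ("mix of RAM and non-RAM, cannot handle").
 */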
 425
 426static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
 427{
 428	int ret = 0;
 429	unsigned long start_pfn = start >> PAGE_SHIFT;
 430	unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
 431	struct pagerange_state state = {start_pfn, 0, 0};
 432
 433	/*
 434	 * For legacy reasons, physical address range in the legacy ISA
 435	 * region is tracked as non-RAM. This will allow users of
 436	 * /dev/mem to map portions of legacy ISA region, even when
 437	 * some of those portions are listed(or not even listed) with
 438	 * different e820 types(RAM/reserved/..)
 439	 */
 440	if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
 441		start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
 442
 443	if (start_pfn < end_pfn) {
 444		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
 445				&state, pagerange_is_ram_callback);
 446	}
 447
 448	return (ret > 0) ? -1 : (state.ram ? 1 : 0);
 449}
 450
 451/*
 452 * For RAM pages, we use page flags to mark the pages with appropriate type.
 453 * The page flags are limited to four types, WB (default), WC, WT and UC-.
 454 * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
 455 * a new memory type is only allowed for a page mapped with the default WB
 456 * type.
 457 *
 458 * Here we do two passes:
 459 * - Find the memtype of all the pages in the range, look for any conflicts.
 460 * - In case of no conflicts, set the new memtype for pages in the range.
 461 */
 462static int reserve_ram_pages_type(u64 start, u64 end,
 463				  enum page_cache_mode req_type,
 464				  enum page_cache_mode *new_type)
 465{
 466	struct page *page;
 467	u64 pfn;
 468
 469	if (req_type == _PAGE_CACHE_MODE_WP) {
 470		if (new_type)
 471			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
 472		return -EINVAL;
 473	}
 474
 475	if (req_type == _PAGE_CACHE_MODE_UC) {
 476		/* We do not support strong UC */
 477		WARN_ON_ONCE(1);
 478		req_type = _PAGE_CACHE_MODE_UC_MINUS;
 479	}
 480
 481	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
 482		enum page_cache_mode type;
 483
 484		page = pfn_to_page(pfn);
 485		type = get_page_memtype(page);
 486		if (type != _PAGE_CACHE_MODE_WB) {
 487			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
 488				start, end - 1, type, req_type);
 489			if (new_type)
 490				*new_type = type;
 491
 492			return -EBUSY;
 493		}
 494	}
 495
 496	if (new_type)
 497		*new_type = req_type;
 498
 499	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
 500		page = pfn_to_page(pfn);
 501		set_page_memtype(page, req_type);
 502	}
 503	return 0;
 504}
 505
 506static int free_ram_pages_type(u64 start, u64 end)
 507{
 508	struct page *page;
 509	u64 pfn;
 510
 511	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
 512		page = pfn_to_page(pfn);
 513		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
 514	}
 515	return 0;
 516}
 517
 518/*
  519 * req_type typically has one of the following:
 520 * - _PAGE_CACHE_MODE_WB
 521 * - _PAGE_CACHE_MODE_WC
 522 * - _PAGE_CACHE_MODE_UC_MINUS
 523 * - _PAGE_CACHE_MODE_UC
 524 * - _PAGE_CACHE_MODE_WT
 525 *
 526 * If new_type is NULL, function will return an error if it cannot reserve the
 527 * region with req_type. If new_type is non-NULL, function will return
 528 * available type in new_type in case of no error. In case of any error
 529 * it will return a negative return value.
 530 */
 531int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
 532		    enum page_cache_mode *new_type)
 533{
 534	struct memtype *new;
 535	enum page_cache_mode actual_type;
 536	int is_range_ram;
 537	int err = 0;
 538
 539	BUG_ON(start >= end); /* end is exclusive */
 540
 541	if (!pat_enabled()) {
 542		/* This is identical to page table setting without PAT */
 543		if (new_type)
 544			*new_type = req_type;
 545		return 0;
 546	}
 547
 548	/* Low ISA region is always mapped WB in page table. No need to track */
 549	if (x86_platform.is_untracked_pat_range(start, end)) {
 550		if (new_type)
 551			*new_type = _PAGE_CACHE_MODE_WB;
 552		return 0;
 553	}
 554
 555	/*
 556	 * Call mtrr_lookup to get the type hint. This is an
 557	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
 558	 * tools and ACPI tools). Use WB request for WB memory and use
 559	 * UC_MINUS otherwise.
 560	 */
 561	actual_type = pat_x_mtrr_type(start, end, req_type);
 562
 563	if (new_type)
 564		*new_type = actual_type;
 565
 566	is_range_ram = pat_pagerange_is_ram(start, end);
 567	if (is_range_ram == 1) {
 568
 569		err = reserve_ram_pages_type(start, end, req_type, new_type);
 570
 571		return err;
 572	} else if (is_range_ram < 0) {
 573		return -EINVAL;
 574	}
 575
 576	new  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
 577	if (!new)
 578		return -ENOMEM;
 579
 580	new->start	= start;
 581	new->end	= end;
 582	new->type	= actual_type;
 583
 584	spin_lock(&memtype_lock);
 585
 586	err = rbt_memtype_check_insert(new, new_type);
 587	if (err) {
 588		pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
 589			start, end - 1,
 590			cattr_name(new->type), cattr_name(req_type));
 591		kfree(new);
 592		spin_unlock(&memtype_lock);
 593
 594		return err;
 595	}
 596
 597	spin_unlock(&memtype_lock);
 598
 599	dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
 600		start, end - 1, cattr_name(new->type), cattr_name(req_type),
 601		new_type ? cattr_name(*new_type) : "-");
 602
 603	return err;
 604}
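/*
 * Editor's sketch (not part of the original file): typical use by a caller
 * that prefers WC but will accept whatever type PAT can grant for the
 * range; every identifier below already exists in this file.
 *
 *	enum page_cache_mode got;
 *	int ret;
 *
 *	ret = reserve_memtype(paddr, paddr + size, _PAGE_CACHE_MODE_WC, &got);
 *	if (ret)
 *		return ret;
 *	prot = __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_MASK) |
 *			cachemode2protval(got));
 *	... map and use the region, then ...
 *	free_memtype(paddr, paddr + size);
 */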
 605
 606int free_memtype(u64 start, u64 end)
 607{
 608	int err = -EINVAL;
 609	int is_range_ram;
 610	struct memtype *entry;
 611
 612	if (!pat_enabled())
 613		return 0;
 614
 615	/* Low ISA region is always mapped WB. No need to track */
 616	if (x86_platform.is_untracked_pat_range(start, end))
 617		return 0;
 618
 619	is_range_ram = pat_pagerange_is_ram(start, end);
 620	if (is_range_ram == 1) {
 621
 622		err = free_ram_pages_type(start, end);
 623
 624		return err;
 625	} else if (is_range_ram < 0) {
 626		return -EINVAL;
 627	}
 628
 629	spin_lock(&memtype_lock);
 630	entry = rbt_memtype_erase(start, end);
 631	spin_unlock(&memtype_lock);
 632
 633	if (IS_ERR(entry)) {
 634		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
 635			current->comm, current->pid, start, end - 1);
 636		return -EINVAL;
 637	}
 638
 639	kfree(entry);
 640
 641	dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
 642
 643	return 0;
 644}
 645
 646
 647/**
  648 * lookup_memtype - Looks up the memory type for a physical address
 649 * @paddr: physical address of which memory type needs to be looked up
 650 *
 651 * Only to be called when PAT is enabled
 652 *
 653 * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
 654 * or _PAGE_CACHE_MODE_WT.
 655 */
 656static enum page_cache_mode lookup_memtype(u64 paddr)
 657{
 658	enum page_cache_mode rettype = _PAGE_CACHE_MODE_WB;
 659	struct memtype *entry;
 660
 661	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
 662		return rettype;
 663
 664	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
 665		struct page *page;
 666
 667		page = pfn_to_page(paddr >> PAGE_SHIFT);
 668		return get_page_memtype(page);
 669	}
 670
 671	spin_lock(&memtype_lock);
 672
 673	entry = rbt_memtype_lookup(paddr);
 674	if (entry != NULL)
 675		rettype = entry->type;
 676	else
 677		rettype = _PAGE_CACHE_MODE_UC_MINUS;
 678
 679	spin_unlock(&memtype_lock);
 680	return rettype;
 681}
 682
 683/**
 684 * io_reserve_memtype - Request a memory type mapping for a region of memory
 685 * @start: start (physical address) of the region
 686 * @end: end (physical address) of the region
 687 * @type: A pointer to memtype, with requested type. On success, requested
 688 * or any other compatible type that was available for the region is returned
 689 *
 690 * On success, returns 0
 691 * On failure, returns non-zero
 692 */
 693int io_reserve_memtype(resource_size_t start, resource_size_t end,
 694			enum page_cache_mode *type)
 695{
 696	resource_size_t size = end - start;
 697	enum page_cache_mode req_type = *type;
 698	enum page_cache_mode new_type;
 699	int ret;
 700
 701	WARN_ON_ONCE(iomem_map_sanity_check(start, size));
 702
 703	ret = reserve_memtype(start, end, req_type, &new_type);
 704	if (ret)
 705		goto out_err;
 706
 707	if (!is_new_memtype_allowed(start, size, req_type, new_type))
 708		goto out_free;
 709
 710	if (kernel_map_sync_memtype(start, size, new_type) < 0)
 711		goto out_free;
 712
 713	*type = new_type;
 714	return 0;
 715
 716out_free:
 717	free_memtype(start, end);
 718	ret = -EBUSY;
 719out_err:
 720	return ret;
 721}
 722
 723/**
 724 * io_free_memtype - Release a memory type mapping for a region of memory
 725 * @start: start (physical address) of the region
 726 * @end: end (physical address) of the region
 727 */
 728void io_free_memtype(resource_size_t start, resource_size_t end)
 729{
 730	free_memtype(start, end);
 731}
 732
 733int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
 734{
 735	enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
 736
 737	return io_reserve_memtype(start, start + size, &type);
 738}
 739EXPORT_SYMBOL(arch_io_reserve_memtype_wc);
 740
 741void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
 742{
 743	io_free_memtype(start, start + size);
 744}
 745EXPORT_SYMBOL(arch_io_free_memtype_wc);
 746
 747pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 748				unsigned long size, pgprot_t vma_prot)
 749{
 750	return vma_prot;
 751}
 752
 753#ifdef CONFIG_STRICT_DEVMEM
 754/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
 755static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 756{
 757	return 1;
 758}
 759#else
 760/* This check is needed to avoid cache aliasing when PAT is enabled */
 761static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 762{
 763	u64 from = ((u64)pfn) << PAGE_SHIFT;
 764	u64 to = from + size;
 765	u64 cursor = from;
 766
 767	if (!pat_enabled())
 768		return 1;
 769
 770	while (cursor < to) {
 771		if (!devmem_is_allowed(pfn))
 772			return 0;
 773		cursor += PAGE_SIZE;
 774		pfn++;
 775	}
 776	return 1;
 777}
 778#endif /* CONFIG_STRICT_DEVMEM */
 779
 780int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 781				unsigned long size, pgprot_t *vma_prot)
 782{
 783	enum page_cache_mode pcm = _PAGE_CACHE_MODE_WB;
 784
 785	if (!range_is_allowed(pfn, size))
 786		return 0;
 787
 788	if (file->f_flags & O_DSYNC)
 789		pcm = _PAGE_CACHE_MODE_UC_MINUS;
 790
 791	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
 792			     cachemode2protval(pcm));
 793	return 1;
 794}
 795
 796/*
  797 * Change the memory type for the physical address range in the kernel identity
  798 * mapping space if that range is part of the identity map.
 799 */
 800int kernel_map_sync_memtype(u64 base, unsigned long size,
 801			    enum page_cache_mode pcm)
 802{
 803	unsigned long id_sz;
 804
 805	if (base > __pa(high_memory-1))
 806		return 0;
 807
 808	/*
 809	 * some areas in the middle of the kernel identity range
 810	 * are not mapped, like the PCI space.
 811	 */
 812	if (!page_is_ram(base >> PAGE_SHIFT))
 813		return 0;
 814
 815	id_sz = (__pa(high_memory-1) <= base + size) ?
 816				__pa(high_memory) - base :
 817				size;
 818
 819	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
 820		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
 821			current->comm, current->pid,
 822			cattr_name(pcm),
 823			base, (unsigned long long)(base + size-1));
 824		return -EINVAL;
 825	}
 826	return 0;
 827}
 828
 829/*
 830 * Internal interface to reserve a range of physical memory with prot.
 831 * Reserved non RAM regions only and after successful reserve_memtype,
 832 * this func also keeps identity mapping (if any) in sync with this new prot.
 833 */
 834static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 835				int strict_prot)
 836{
 837	int is_ram = 0;
 838	int ret;
 839	enum page_cache_mode want_pcm = pgprot2cachemode(*vma_prot);
 840	enum page_cache_mode pcm = want_pcm;
 841
 842	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 843
 844	/*
 845	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
 846	 * track of number of mappings of RAM pages. We can assert that
 847	 * the type requested matches the type of first page in the range.
 848	 */
 849	if (is_ram) {
 850		if (!pat_enabled())
 851			return 0;
 852
 853		pcm = lookup_memtype(paddr);
 854		if (want_pcm != pcm) {
 855			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 856				current->comm, current->pid,
 857				cattr_name(want_pcm),
 858				(unsigned long long)paddr,
 859				(unsigned long long)(paddr + size - 1),
 860				cattr_name(pcm));
 861			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
 862					     (~_PAGE_CACHE_MASK)) |
 863					     cachemode2protval(pcm));
 864		}
 865		return 0;
 866	}
 867
 868	ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
 869	if (ret)
 870		return ret;
 871
 872	if (pcm != want_pcm) {
 873		if (strict_prot ||
 874		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 875			free_memtype(paddr, paddr + size);
 876			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
 877			       current->comm, current->pid,
 878			       cattr_name(want_pcm),
 879			       (unsigned long long)paddr,
 880			       (unsigned long long)(paddr + size - 1),
 881			       cattr_name(pcm));
 882			return -EINVAL;
 883		}
 884		/*
 885		 * We allow returning different type than the one requested in
 886		 * non strict case.
 887		 */
 888		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
 889				      (~_PAGE_CACHE_MASK)) |
 890				     cachemode2protval(pcm));
 891	}
 892
 893	if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
 894		free_memtype(paddr, paddr + size);
 895		return -EINVAL;
 896	}
 897	return 0;
 898}
 899
 900/*
 901 * Internal interface to free a range of physical memory.
 902 * Frees non RAM regions only.
 903 */
 904static void free_pfn_range(u64 paddr, unsigned long size)
 905{
 906	int is_ram;
 907
 908	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
 909	if (is_ram == 0)
 910		free_memtype(paddr, paddr + size);
 911}
 912
 913/*
 914 * track_pfn_copy is called when vma that is covering the pfnmap gets
 915 * copied through copy_page_range().
 916 *
 917 * If the vma has a linear pfn mapping for the entire range, we get the prot
 918 * from pte and reserve the entire vma range with single reserve_pfn_range call.
 919 */
 920int track_pfn_copy(struct vm_area_struct *vma)
 921{
 922	resource_size_t paddr;
 923	unsigned long prot;
 924	unsigned long vma_size = vma->vm_end - vma->vm_start;
 925	pgprot_t pgprot;
 926
 927	if (vma->vm_flags & VM_PAT) {
 928		/*
 929		 * reserve the whole chunk covered by vma. We need the
 930		 * starting address and protection from pte.
 931		 */
 932		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
 933			WARN_ON_ONCE(1);
 934			return -EINVAL;
 935		}
 936		pgprot = __pgprot(prot);
 937		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
 938	}
 939
 940	return 0;
 941}
 942
 943/*
 944 * prot is passed in as a parameter for the new mapping. If the vma has
 945 * a linear pfn mapping for the entire range, or no vma is provided,
 946 * reserve the entire pfn + size range with single reserve_pfn_range
 947 * call.
 948 */
 949int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 950		    unsigned long pfn, unsigned long addr, unsigned long size)
 951{
 952	resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
 953	enum page_cache_mode pcm;
 954
 955	/* reserve the whole chunk starting from paddr */
 956	if (!vma || (addr == vma->vm_start
 957				&& size == (vma->vm_end - vma->vm_start))) {
 958		int ret;
 959
 960		ret = reserve_pfn_range(paddr, size, prot, 0);
 961		if (ret == 0 && vma)
 962			vma->vm_flags |= VM_PAT;
 963		return ret;
 964	}
 965
 966	if (!pat_enabled())
 967		return 0;
 968
 969	/*
 970	 * For anything smaller than the vma size we set prot based on the
 971	 * lookup.
 972	 */
 973	pcm = lookup_memtype(paddr);
 974
 975	/* Check memtype for the remaining pages */
 976	while (size > PAGE_SIZE) {
 977		size -= PAGE_SIZE;
 978		paddr += PAGE_SIZE;
 979		if (pcm != lookup_memtype(paddr))
 980			return -EINVAL;
 981	}
 982
 983	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
 984			 cachemode2protval(pcm));
 985
 986	return 0;
 987}
 988
 989void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
 990{
 991	enum page_cache_mode pcm;
 992
 993	if (!pat_enabled())
 994		return;
 995
 996	/* Set prot based on lookup */
 997	pcm = lookup_memtype(pfn_t_to_phys(pfn));
 998	*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
 999			 cachemode2protval(pcm));
1000}
1001
1002/*
1003 * untrack_pfn is called while unmapping a pfnmap for a region.
1004 * untrack can be called for a specific region indicated by pfn and size or
1005 * can be for the entire vma (in which case pfn, size are zero).
1006 */
1007void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
1008		 unsigned long size)
1009{
1010	resource_size_t paddr;
1011	unsigned long prot;
1012
1013	if (vma && !(vma->vm_flags & VM_PAT))
1014		return;
1015
1016	/* free the chunk starting from pfn or the whole chunk */
1017	paddr = (resource_size_t)pfn << PAGE_SHIFT;
1018	if (!paddr && !size) {
1019		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
1020			WARN_ON_ONCE(1);
1021			return;
1022		}
1023
1024		size = vma->vm_end - vma->vm_start;
1025	}
1026	free_pfn_range(paddr, size);
1027	if (vma)
1028		vma->vm_flags &= ~VM_PAT;
1029}
1030
1031/*
1032 * untrack_pfn_moved is called, while mremapping a pfnmap for a new region,
1033 * with the old vma after its pfnmap page table has been removed.  The new
1034 * vma has a new pfnmap to the same pfn & cache type with VM_PAT set.
1035 */
1036void untrack_pfn_moved(struct vm_area_struct *vma)
1037{
1038	vma->vm_flags &= ~VM_PAT;
1039}
1040
1041pgprot_t pgprot_writecombine(pgprot_t prot)
1042{
1043	return __pgprot(pgprot_val(prot) |
1044				cachemode2protval(_PAGE_CACHE_MODE_WC));
1045}
1046EXPORT_SYMBOL_GPL(pgprot_writecombine);
1047
1048pgprot_t pgprot_writethrough(pgprot_t prot)
1049{
1050	return __pgprot(pgprot_val(prot) |
1051				cachemode2protval(_PAGE_CACHE_MODE_WT));
1052}
1053EXPORT_SYMBOL_GPL(pgprot_writethrough);
1054
1055#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
1056
1057static struct memtype *memtype_get_idx(loff_t pos)
1058{
1059	struct memtype *print_entry;
1060	int ret;
1061
1062	print_entry  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
1063	if (!print_entry)
1064		return NULL;
1065
1066	spin_lock(&memtype_lock);
1067	ret = rbt_memtype_copy_nth_element(print_entry, pos);
1068	spin_unlock(&memtype_lock);
1069
1070	if (!ret) {
1071		return print_entry;
1072	} else {
1073		kfree(print_entry);
1074		return NULL;
1075	}
1076}
1077
1078static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
1079{
1080	if (*pos == 0) {
1081		++*pos;
1082		seq_puts(seq, "PAT memtype list:\n");
1083	}
1084
1085	return memtype_get_idx(*pos);
1086}
1087
1088static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1089{
1090	++*pos;
1091	return memtype_get_idx(*pos);
1092}
1093
1094static void memtype_seq_stop(struct seq_file *seq, void *v)
1095{
1096}
1097
1098static int memtype_seq_show(struct seq_file *seq, void *v)
1099{
1100	struct memtype *print_entry = (struct memtype *)v;
1101
1102	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
1103			print_entry->start, print_entry->end);
1104	kfree(print_entry);
1105
1106	return 0;
1107}
1108
1109static const struct seq_operations memtype_seq_ops = {
1110	.start = memtype_seq_start,
1111	.next  = memtype_seq_next,
1112	.stop  = memtype_seq_stop,
1113	.show  = memtype_seq_show,
1114};
1115
1116static int memtype_seq_open(struct inode *inode, struct file *file)
1117{
1118	return seq_open(file, &memtype_seq_ops);
1119}
1120
1121static const struct file_operations memtype_fops = {
1122	.open    = memtype_seq_open,
1123	.read    = seq_read,
1124	.llseek  = seq_lseek,
1125	.release = seq_release,
1126};
1127
1128static int __init pat_memtype_list_init(void)
1129{
1130	if (pat_enabled()) {
1131		debugfs_create_file("pat_memtype_list", S_IRUSR,
1132				    arch_debugfs_dir, NULL, &memtype_fops);
1133	}
1134	return 0;
1135}
1136
1137late_initcall(pat_memtype_list_init);
1138
1139#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
v3.1
  1/*
  2 * Handle caching attributes in page tables (PAT)
  3 *
  4 * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
  5 *          Suresh B Siddha <suresh.b.siddha@intel.com>
  6 *
  7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
  8 */
  9
 10#include <linux/seq_file.h>
 11#include <linux/bootmem.h>
 12#include <linux/debugfs.h>
 13#include <linux/kernel.h>
 14#include <linux/module.h>
 15#include <linux/slab.h>
 16#include <linux/mm.h>
 17#include <linux/fs.h>
 18#include <linux/rbtree.h>
 19
 20#include <asm/cacheflush.h>
 21#include <asm/processor.h>
 22#include <asm/tlbflush.h>
 23#include <asm/x86_init.h>
 24#include <asm/pgtable.h>
 25#include <asm/fcntl.h>
 26#include <asm/e820.h>
 27#include <asm/mtrr.h>
 28#include <asm/page.h>
 29#include <asm/msr.h>
 30#include <asm/pat.h>
 31#include <asm/io.h>
 32
 33#include "pat_internal.h"
 34
 35#ifdef CONFIG_X86_PAT
 36int __read_mostly pat_enabled = 1;
 37
 38static inline void pat_disable(const char *reason)
 39{
 40	pat_enabled = 0;
 41	printk(KERN_INFO "%s\n", reason);
 42}
 43
 44static int __init nopat(char *str)
 45{
 46	pat_disable("PAT support disabled.");
 47	return 0;
 48}
 49early_param("nopat", nopat);
 50#else
 51static inline void pat_disable(const char *reason)
 52{
 53	(void)reason;
 54}
 55#endif
 56
 57
 58int pat_debug_enable;
 59
 60static int __init pat_debug_setup(char *str)
 61{
 62	pat_debug_enable = 1;
 63	return 0;
 64}
 65__setup("debugpat", pat_debug_setup);
 66
 67static u64 __read_mostly boot_pat_state;
 68
 69enum {
 70	PAT_UC = 0,		/* uncached */
 71	PAT_WC = 1,		/* Write combining */
 72	PAT_WT = 4,		/* Write Through */
 73	PAT_WP = 5,		/* Write Protected */
 74	PAT_WB = 6,		/* Write Back (default) */
  75	PAT_UC_MINUS = 7,	/* UC, but can be overridden by MTRR */
 76};
 77
 78#define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
 79
 80void pat_init(void)
 81{
 82	u64 pat;
 83	bool boot_cpu = !boot_pat_state;
 84
 85	if (!pat_enabled)
 86		return;
 87
 88	if (!cpu_has_pat) {
 89		if (!boot_pat_state) {
 90			pat_disable("PAT not supported by CPU.");
 91			return;
 92		} else {
 93			/*
 94			 * If this happens we are on a secondary CPU, but
 95			 * switched to PAT on the boot CPU. We have no way to
 96			 * undo PAT.
 97			 */
 98			printk(KERN_ERR "PAT enabled, "
 99			       "but not supported by secondary CPU\n");
100			BUG();
101		}
102	}
103
104	/* Set PWT to Write-Combining. All other bits stay the same */
105	/*
106	 * PTE encoding used in Linux:
107	 *      PAT
108	 *      |PCD
109	 *      ||PWT
110	 *      |||
111	 *      000 WB		_PAGE_CACHE_WB
112	 *      001 WC		_PAGE_CACHE_WC
113	 *      010 UC-		_PAGE_CACHE_UC_MINUS
114	 *      011 UC		_PAGE_CACHE_UC
115	 * PAT bit unused
116	 */
117	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
118	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
119
120	/* Boot CPU check */
121	if (!boot_pat_state)
122		rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
123
124	wrmsrl(MSR_IA32_CR_PAT, pat);
125
126	if (boot_cpu)
127		printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
128		       smp_processor_id(), boot_pat_state, pat);
129}
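/*
 * Editor's note: in this older version the boot CPU is recognised by
 * boot_pat_state still being zero; the first call reads the firmware PAT
 * value into it, so later calls (on secondary CPUs) skip the rdmsrl and
 * only reprogram the MSR. This relies on the pre-existing PAT value being
 * non-zero, which the power-on default (0x0007040600070406) guarantees.
 */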
130
131#undef PAT
132
133static DEFINE_SPINLOCK(memtype_lock);	/* protects memtype accesses */
134
135/*
136 * Does intersection of PAT memory type and MTRR memory type and returns
137 * the resulting memory type as PAT understands it.
 138 * (The types in pat and mtrr will not have the same value.)
139 * The intersection is based on "Effective Memory Type" tables in IA-32
140 * SDM vol 3a
141 */
142static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
143{
144	/*
145	 * Look for MTRR hint to get the effective type in case where PAT
146	 * request is for WB.
147	 */
148	if (req_type == _PAGE_CACHE_WB) {
149		u8 mtrr_type;
150
151		mtrr_type = mtrr_type_lookup(start, end);
152		if (mtrr_type != MTRR_TYPE_WRBACK)
153			return _PAGE_CACHE_UC_MINUS;
154
155		return _PAGE_CACHE_WB;
156	}
157
158	return req_type;
159}
160
161static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
162{
163	int ram_page = 0, not_rampage = 0;
164	unsigned long page_nr;
165
166	for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
167	     ++page_nr) {
168		/*
169		 * For legacy reasons, physical address range in the legacy ISA
170		 * region is tracked as non-RAM. This will allow users of
171		 * /dev/mem to map portions of legacy ISA region, even when
172		 * some of those portions are listed(or not even listed) with
173		 * different e820 types(RAM/reserved/..)
174		 */
175		if (page_nr >= (ISA_END_ADDRESS >> PAGE_SHIFT) &&
176		    page_is_ram(page_nr))
177			ram_page = 1;
178		else
179			not_rampage = 1;
180
181		if (ram_page == not_rampage)
182			return -1;
183	}
184
185	return ram_page;
186}
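/*
 * Editor's note: return convention -- 1 if every page in the range is RAM
 * (above the legacy ISA window), 0 if none of it is, and -1 if the range
 * mixes RAM and non-RAM pages, which callers treat as an error.
 */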
187
188/*
189 * For RAM pages, we use page flags to mark the pages with appropriate type.
 190 * Here we do two passes:
191 * - Find the memtype of all the pages in the range, look for any conflicts
192 * - In case of no conflicts, set the new memtype for pages in the range
193 */
194static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
195				  unsigned long *new_type)
196{
197	struct page *page;
198	u64 pfn;
199
200	if (req_type == _PAGE_CACHE_UC) {
201		/* We do not support strong UC */
202		WARN_ON_ONCE(1);
203		req_type = _PAGE_CACHE_UC_MINUS;
204	}
205
206	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
207		unsigned long type;
208
209		page = pfn_to_page(pfn);
210		type = get_page_memtype(page);
211		if (type != -1) {
212			printk(KERN_INFO "reserve_ram_pages_type failed "
213				"0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n",
214				start, end, type, req_type);
215			if (new_type)
216				*new_type = type;
217
218			return -EBUSY;
219		}
220	}
221
222	if (new_type)
223		*new_type = req_type;
224
225	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
226		page = pfn_to_page(pfn);
227		set_page_memtype(page, req_type);
228	}
229	return 0;
230}
231
232static int free_ram_pages_type(u64 start, u64 end)
233{
234	struct page *page;
235	u64 pfn;
236
237	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
238		page = pfn_to_page(pfn);
239		set_page_memtype(page, -1);
240	}
241	return 0;
242}
243
244/*
 245 * req_type typically has one of the following:
246 * - _PAGE_CACHE_WB
247 * - _PAGE_CACHE_WC
248 * - _PAGE_CACHE_UC_MINUS
249 * - _PAGE_CACHE_UC
250 *
251 * If new_type is NULL, function will return an error if it cannot reserve the
252 * region with req_type. If new_type is non-NULL, function will return
253 * available type in new_type in case of no error. In case of any error
254 * it will return a negative return value.
255 */
256int reserve_memtype(u64 start, u64 end, unsigned long req_type,
257		    unsigned long *new_type)
258{
259	struct memtype *new;
260	unsigned long actual_type;
261	int is_range_ram;
262	int err = 0;
263
264	BUG_ON(start >= end); /* end is exclusive */
265
266	if (!pat_enabled) {
267		/* This is identical to page table setting without PAT */
268		if (new_type) {
269			if (req_type == _PAGE_CACHE_WC)
270				*new_type = _PAGE_CACHE_UC_MINUS;
271			else
272				*new_type = req_type & _PAGE_CACHE_MASK;
273		}
274		return 0;
275	}
276
277	/* Low ISA region is always mapped WB in page table. No need to track */
278	if (x86_platform.is_untracked_pat_range(start, end)) {
279		if (new_type)
280			*new_type = _PAGE_CACHE_WB;
281		return 0;
282	}
283
284	/*
285	 * Call mtrr_lookup to get the type hint. This is an
286	 * optimization for /dev/mem mmap'ers into WB memory (BIOS
287	 * tools and ACPI tools). Use WB request for WB memory and use
288	 * UC_MINUS otherwise.
289	 */
290	actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
291
292	if (new_type)
293		*new_type = actual_type;
294
295	is_range_ram = pat_pagerange_is_ram(start, end);
296	if (is_range_ram == 1) {
297
298		err = reserve_ram_pages_type(start, end, req_type, new_type);
299
300		return err;
301	} else if (is_range_ram < 0) {
302		return -EINVAL;
303	}
304
305	new  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
306	if (!new)
307		return -ENOMEM;
308
309	new->start	= start;
310	new->end	= end;
311	new->type	= actual_type;
312
313	spin_lock(&memtype_lock);
314
315	err = rbt_memtype_check_insert(new, new_type);
316	if (err) {
317		printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, "
318		       "track %s, req %s\n",
319		       start, end, cattr_name(new->type), cattr_name(req_type));
320		kfree(new);
321		spin_unlock(&memtype_lock);
322
323		return err;
324	}
325
326	spin_unlock(&memtype_lock);
327
328	dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
329		start, end, cattr_name(new->type), cattr_name(req_type),
330		new_type ? cattr_name(*new_type) : "-");
331
332	return err;
333}
334
335int free_memtype(u64 start, u64 end)
336{
337	int err = -EINVAL;
338	int is_range_ram;
339	struct memtype *entry;
340
341	if (!pat_enabled)
342		return 0;
343
344	/* Low ISA region is always mapped WB. No need to track */
345	if (x86_platform.is_untracked_pat_range(start, end))
346		return 0;
347
348	is_range_ram = pat_pagerange_is_ram(start, end);
349	if (is_range_ram == 1) {
350
351		err = free_ram_pages_type(start, end);
352
353		return err;
354	} else if (is_range_ram < 0) {
355		return -EINVAL;
356	}
357
358	spin_lock(&memtype_lock);
359	entry = rbt_memtype_erase(start, end);
360	spin_unlock(&memtype_lock);
361
362	if (!entry) {
363		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
364			current->comm, current->pid, start, end);
365		return -EINVAL;
366	}
367
368	kfree(entry);
369
370	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
371
372	return 0;
373}
374
375
376/**
 377 * lookup_memtype - Looks up the memory type for a physical address
378 * @paddr: physical address of which memory type needs to be looked up
379 *
380 * Only to be called when PAT is enabled
381 *
382 * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
383 * _PAGE_CACHE_UC
384 */
385static unsigned long lookup_memtype(u64 paddr)
386{
387	int rettype = _PAGE_CACHE_WB;
388	struct memtype *entry;
389
390	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
391		return rettype;
392
393	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
394		struct page *page;
395		page = pfn_to_page(paddr >> PAGE_SHIFT);
396		rettype = get_page_memtype(page);
397		/*
398		 * -1 from get_page_memtype() implies RAM page is in its
399		 * default state and not reserved, and hence of type WB
400		 */
401		if (rettype == -1)
402			rettype = _PAGE_CACHE_WB;
403
404		return rettype;
405	}
406
407	spin_lock(&memtype_lock);
408
409	entry = rbt_memtype_lookup(paddr);
410	if (entry != NULL)
411		rettype = entry->type;
412	else
413		rettype = _PAGE_CACHE_UC_MINUS;
414
415	spin_unlock(&memtype_lock);
416	return rettype;
417}
418
419/**
420 * io_reserve_memtype - Request a memory type mapping for a region of memory
421 * @start: start (physical address) of the region
422 * @end: end (physical address) of the region
423 * @type: A pointer to memtype, with requested type. On success, requested
424 * or any other compatible type that was available for the region is returned
425 *
426 * On success, returns 0
427 * On failure, returns non-zero
428 */
429int io_reserve_memtype(resource_size_t start, resource_size_t end,
430			unsigned long *type)
431{
432	resource_size_t size = end - start;
433	unsigned long req_type = *type;
434	unsigned long new_type;
435	int ret;
436
437	WARN_ON_ONCE(iomem_map_sanity_check(start, size));
438
439	ret = reserve_memtype(start, end, req_type, &new_type);
440	if (ret)
441		goto out_err;
442
443	if (!is_new_memtype_allowed(start, size, req_type, new_type))
444		goto out_free;
445
446	if (kernel_map_sync_memtype(start, size, new_type) < 0)
447		goto out_free;
448
449	*type = new_type;
450	return 0;
451
452out_free:
453	free_memtype(start, end);
454	ret = -EBUSY;
455out_err:
456	return ret;
457}
458
459/**
460 * io_free_memtype - Release a memory type mapping for a region of memory
461 * @start: start (physical address) of the region
462 * @end: end (physical address) of the region
463 */
464void io_free_memtype(resource_size_t start, resource_size_t end)
465{
466	free_memtype(start, end);
467}
468
469pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
470				unsigned long size, pgprot_t vma_prot)
471{
472	return vma_prot;
473}
474
475#ifdef CONFIG_STRICT_DEVMEM
 476/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
477static inline int range_is_allowed(unsigned long pfn, unsigned long size)
478{
479	return 1;
480}
481#else
482/* This check is needed to avoid cache aliasing when PAT is enabled */
483static inline int range_is_allowed(unsigned long pfn, unsigned long size)
484{
485	u64 from = ((u64)pfn) << PAGE_SHIFT;
486	u64 to = from + size;
487	u64 cursor = from;
488
489	if (!pat_enabled)
490		return 1;
491
492	while (cursor < to) {
493		if (!devmem_is_allowed(pfn)) {
494			printk(KERN_INFO
495		"Program %s tried to access /dev/mem between %Lx->%Lx.\n",
496				current->comm, from, to);
497			return 0;
498		}
499		cursor += PAGE_SIZE;
500		pfn++;
501	}
502	return 1;
503}
504#endif /* CONFIG_STRICT_DEVMEM */
505
506int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
507				unsigned long size, pgprot_t *vma_prot)
508{
509	unsigned long flags = _PAGE_CACHE_WB;
510
511	if (!range_is_allowed(pfn, size))
512		return 0;
513
514	if (file->f_flags & O_DSYNC)
515		flags = _PAGE_CACHE_UC_MINUS;
516
517#ifdef CONFIG_X86_32
518	/*
519	 * On the PPro and successors, the MTRRs are used to set
520	 * memory types for physical addresses outside main memory,
521	 * so blindly setting UC or PWT on those pages is wrong.
522	 * For Pentiums and earlier, the surround logic should disable
523	 * caching for the high addresses through the KEN pin, but
524	 * we maintain the tradition of paranoia in this code.
525	 */
526	if (!pat_enabled &&
527	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
528	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
529	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
530	      boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
531	    (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
532		flags = _PAGE_CACHE_UC;
533	}
534#endif
535
536	*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
537			     flags);
538	return 1;
539}
540
541/*
 542 * Change the memory type for the physical address range in the kernel identity
 543 * mapping space if that range is part of the identity map.
544 */
545int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
546{
547	unsigned long id_sz;
548
549	if (base >= __pa(high_memory))
550		return 0;
551
552	id_sz = (__pa(high_memory) < base + size) ?
553				__pa(high_memory) - base :
554				size;
555
556	if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
557		printk(KERN_INFO
558			"%s:%d ioremap_change_attr failed %s "
559			"for %Lx-%Lx\n",
560			current->comm, current->pid,
561			cattr_name(flags),
562			base, (unsigned long long)(base + size));
563		return -EINVAL;
564	}
565	return 0;
566}
567
568/*
569 * Internal interface to reserve a range of physical memory with prot.
570 * Reserved non RAM regions only and after successful reserve_memtype,
571 * this func also keeps identity mapping (if any) in sync with this new prot.
572 */
573static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
574				int strict_prot)
575{
576	int is_ram = 0;
577	int ret;
578	unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
579	unsigned long flags = want_flags;
580
581	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
582
583	/*
584	 * reserve_pfn_range() for RAM pages. We do not refcount to keep
585	 * track of number of mappings of RAM pages. We can assert that
586	 * the type requested matches the type of first page in the range.
587	 */
588	if (is_ram) {
589		if (!pat_enabled)
590			return 0;
591
592		flags = lookup_memtype(paddr);
593		if (want_flags != flags) {
594			printk(KERN_WARNING
595			"%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n",
596				current->comm, current->pid,
597				cattr_name(want_flags),
598				(unsigned long long)paddr,
599				(unsigned long long)(paddr + size),
600				cattr_name(flags));
601			*vma_prot = __pgprot((pgprot_val(*vma_prot) &
602					      (~_PAGE_CACHE_MASK)) |
603					     flags);
604		}
605		return 0;
606	}
607
608	ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
609	if (ret)
610		return ret;
611
612	if (flags != want_flags) {
613		if (strict_prot ||
614		    !is_new_memtype_allowed(paddr, size, want_flags, flags)) {
615			free_memtype(paddr, paddr + size);
616			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
617				" for %Lx-%Lx, got %s\n",
618				current->comm, current->pid,
619				cattr_name(want_flags),
620				(unsigned long long)paddr,
621				(unsigned long long)(paddr + size),
622				cattr_name(flags));
623			return -EINVAL;
624		}
625		/*
626		 * We allow returning different type than the one requested in
627		 * non strict case.
628		 */
629		*vma_prot = __pgprot((pgprot_val(*vma_prot) &
630				      (~_PAGE_CACHE_MASK)) |
631				     flags);
632	}
633
634	if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
635		free_memtype(paddr, paddr + size);
636		return -EINVAL;
637	}
638	return 0;
639}
640
641/*
642 * Internal interface to free a range of physical memory.
643 * Frees non RAM regions only.
644 */
645static void free_pfn_range(u64 paddr, unsigned long size)
646{
647	int is_ram;
648
649	is_ram = pat_pagerange_is_ram(paddr, paddr + size);
650	if (is_ram == 0)
651		free_memtype(paddr, paddr + size);
652}
653
654/*
655 * track_pfn_vma_copy is called when vma that is covering the pfnmap gets
656 * copied through copy_page_range().
657 *
658 * If the vma has a linear pfn mapping for the entire range, we get the prot
659 * from pte and reserve the entire vma range with single reserve_pfn_range call.
660 */
661int track_pfn_vma_copy(struct vm_area_struct *vma)
662{
663	resource_size_t paddr;
664	unsigned long prot;
665	unsigned long vma_size = vma->vm_end - vma->vm_start;
666	pgprot_t pgprot;
667
668	if (is_linear_pfn_mapping(vma)) {
669		/*
670		 * reserve the whole chunk covered by vma. We need the
671		 * starting address and protection from pte.
672		 */
673		if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
674			WARN_ON_ONCE(1);
675			return -EINVAL;
676		}
677		pgprot = __pgprot(prot);
678		return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
679	}
680
681	return 0;
682}
683
684/*
685 * track_pfn_vma_new is called when a _new_ pfn mapping is being established
686 * for physical range indicated by pfn and size.
687 *
688 * prot is passed in as a parameter for the new mapping. If the vma has a
689 * linear pfn mapping for the entire range reserve the entire vma range with
690 * single reserve_pfn_range call.
691 */
692int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot,
693			unsigned long pfn, unsigned long size)
694{
695	unsigned long flags;
696	resource_size_t paddr;
697	unsigned long vma_size = vma->vm_end - vma->vm_start;
698
699	if (is_linear_pfn_mapping(vma)) {
700		/* reserve the whole chunk starting from vm_pgoff */
701		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
702		return reserve_pfn_range(paddr, vma_size, prot, 0);
703	}
704
705	if (!pat_enabled)
706		return 0;
707
708	/* for vm_insert_pfn and friends, we set prot based on lookup */
709	flags = lookup_memtype(pfn << PAGE_SHIFT);
710	*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
711			 flags);
712
713	return 0;
714}
715
716/*
717 * untrack_pfn_vma is called while unmapping a pfnmap for a region.
718 * untrack can be called for a specific region indicated by pfn and size or
719 * can be for the entire vma (in which case size can be zero).
720 */
721void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn,
722			unsigned long size)
723{
724	resource_size_t paddr;
725	unsigned long vma_size = vma->vm_end - vma->vm_start;
726
727	if (is_linear_pfn_mapping(vma)) {
728		/* free the whole chunk starting from vm_pgoff */
729		paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT;
730		free_pfn_range(paddr, vma_size);
731		return;
732	}
733}
734
735pgprot_t pgprot_writecombine(pgprot_t prot)
736{
737	if (pat_enabled)
738		return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
739	else
740		return pgprot_noncached(prot);
741}
742EXPORT_SYMBOL_GPL(pgprot_writecombine);
743
744#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
745
746static struct memtype *memtype_get_idx(loff_t pos)
747{
748	struct memtype *print_entry;
749	int ret;
750
751	print_entry  = kzalloc(sizeof(struct memtype), GFP_KERNEL);
752	if (!print_entry)
753		return NULL;
754
755	spin_lock(&memtype_lock);
756	ret = rbt_memtype_copy_nth_element(print_entry, pos);
757	spin_unlock(&memtype_lock);
758
759	if (!ret) {
760		return print_entry;
761	} else {
762		kfree(print_entry);
763		return NULL;
764	}
765}
766
767static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
768{
769	if (*pos == 0) {
770		++*pos;
771		seq_printf(seq, "PAT memtype list:\n");
772	}
773
774	return memtype_get_idx(*pos);
775}
776
777static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
778{
779	++*pos;
780	return memtype_get_idx(*pos);
781}
782
783static void memtype_seq_stop(struct seq_file *seq, void *v)
784{
785}
786
787static int memtype_seq_show(struct seq_file *seq, void *v)
788{
789	struct memtype *print_entry = (struct memtype *)v;
790
791	seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
792			print_entry->start, print_entry->end);
793	kfree(print_entry);
794
795	return 0;
796}
797
798static const struct seq_operations memtype_seq_ops = {
799	.start = memtype_seq_start,
800	.next  = memtype_seq_next,
801	.stop  = memtype_seq_stop,
802	.show  = memtype_seq_show,
803};
804
805static int memtype_seq_open(struct inode *inode, struct file *file)
806{
807	return seq_open(file, &memtype_seq_ops);
808}
809
810static const struct file_operations memtype_fops = {
811	.open    = memtype_seq_open,
812	.read    = seq_read,
813	.llseek  = seq_lseek,
814	.release = seq_release,
815};
816
817static int __init pat_memtype_list_init(void)
818{
819	if (pat_enabled) {
820		debugfs_create_file("pat_memtype_list", S_IRUSR,
821				    arch_debugfs_dir, NULL, &memtype_fops);
822	}
823	return 0;
824}
825
826late_initcall(pat_memtype_list_init);
827
828#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */