vmw_balloon.c - drivers/misc/vmw_balloon.c - Linux diff v4.10.11

   1/*
   2 * VMware Balloon driver.
   3 *
   4 * Copyright (C) 2000-2014, VMware, Inc. All Rights Reserved.
   5 *
   6 * This program is free software; you can redistribute it and/or modify it
   7 * under the terms of the GNU General Public License as published by the
   8 * Free Software Foundation; version 2 of the License and no later version.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  13 * NON INFRINGEMENT.  See the GNU General Public License for more
  14 * details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19 *
  20 * Maintained by:	Xavier Deguillard <xdeguillard@vmware.com>
  21 *			Philip Moltmann <moltmann@vmware.com>
  22 */
  23
/*
 * This is the VMware physical memory management driver for Linux. The
 * driver acts like a "balloon" that can be inflated to reclaim physical
 * pages by reserving them in the guest and invalidating them in the
 * monitor, freeing up the underlying machine pages so they can be
 * allocated to other guests.  The balloon can also be deflated to allow
 * the guest to use more physical memory. Higher level policies can control
 * the sizes of balloons in VMs in order to manage physical memory resources.
 */
  33
  34//#define DEBUG
  35#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  36
  37#include <linux/types.h>
  38#include <linux/kernel.h>
  39#include <linux/mm.h>
  40#include <linux/vmalloc.h>
  41#include <linux/sched.h>
  42#include <linux/module.h>
  43#include <linux/workqueue.h>
  44#include <linux/debugfs.h>
  45#include <linux/seq_file.h>
  46#include <linux/vmw_vmci_defs.h>
  47#include <linux/vmw_vmci_api.h>
  48#include <asm/hypervisor.h>
  49
/* Module metadata: author, description, version, aliases and license. */
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
MODULE_VERSION("1.5.0.0-k");
/* NOTE(review): presumably a DMI pattern for VMware system vendor strings. */
MODULE_ALIAS("dmi:*:svnVMware*:*");
MODULE_ALIAS("vmware_vmmemctl");
MODULE_LICENSE("GPL");
  56
/*
 * Various constants controlling the rate of inflating/deflating the
 * balloon, measured in pages.
 */

/*
 * Rates of memory allocation when the guest experiences memory pressure
 * (driver performs sleeping allocations).
 *
 * MIN is the floor applied when the rate is halved after a failed sleeping
 * allocation, MAX is the cap applied when the rate is raised, and INC is
 * the step used when raising it (see vmballoon_inflate()).
 */
#define VMW_BALLOON_RATE_ALLOC_MIN	512U
#define VMW_BALLOON_RATE_ALLOC_MAX	2048U
#define VMW_BALLOON_RATE_ALLOC_INC	16U

/*
 * When guest is under memory pressure, use a reduced page allocation
 * rate for next several cycles.
 */
#define VMW_BALLOON_SLOW_CYCLES		4
  75
/*
 * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
 * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
 * __GFP_NOWARN, to suppress page allocation failure warnings.
 */
#define VMW_PAGE_ALLOC_NOSLEEP		(__GFP_HIGHMEM|__GFP_NOWARN)

/*
 * Use GFP_HIGHUSER when executing in a separate kernel thread
 * context and allocation can sleep.  This is less stressful to
 * the guest memory system, since it allows the thread to block
 * while memory is reclaimed, and won't take pages from emergency
 * low-memory pools.
 */
#define VMW_PAGE_ALLOC_CANSLEEP		(GFP_HIGHUSER)

/*
 * Maximum number of refused pages we accumulate during inflation cycle.
 * The limit is applied per page size (see struct vmballoon_page_size).
 */
#define VMW_BALLOON_MAX_REFUSED		16

/*
 * Hypervisor communication port definitions.
 *
 * VMWARE_BALLOON_CMD() loads the magic value, the port number and the
 * command into registers before executing the "inl" instruction.
 */
#define VMW_BALLOON_HV_PORT		0x5670
#define VMW_BALLOON_HV_MAGIC		0x456c6d6f
#define VMW_BALLOON_GUEST_ID		1	/* Linux */
 101
/*
 * Capability bits exchanged with the host by the START command. The guest
 * sends the set it supports; the host answers with the set to be used.
 */
enum vmwballoon_capabilities {
	/*
	 * Bit 0 is reserved and not associated to any capability.
	 */
	VMW_BALLOON_BASIC_CMDS			= (1 << 1),
	VMW_BALLOON_BATCHED_CMDS		= (1 << 2),
	VMW_BALLOON_BATCHED_2M_CMDS		= (1 << 3),
	VMW_BALLOON_SIGNALLED_WAKEUP_CMD	= (1 << 4),
};

/* Every capability implemented by this driver; requested on reset. */
#define VMW_BALLOON_CAPABILITIES	(VMW_BALLOON_BASIC_CMDS \
					| VMW_BALLOON_BATCHED_CMDS \
					| VMW_BALLOON_BATCHED_2M_CMDS \
					| VMW_BALLOON_SIGNALLED_WAKEUP_CMD)

/* Allocation order of a 2m page: 2^9 base pages. */
#define VMW_BALLOON_2M_SHIFT		(9)
/* Number of page sizes tracked; index 0 is 4k pages, index 1 is 2m pages. */
#define VMW_BALLOON_NUM_PAGE_SIZES	(2)
 119
/*
 * Backdoor commands availability:
 *
 * START, GET_TARGET and GUEST_ID are always available,
 *
 * VMW_BALLOON_BASIC_CMDS:
 *	LOCK and UNLOCK commands,
 * VMW_BALLOON_BATCHED_CMDS:
 *	BATCHED_LOCK and BATCHED_UNLOCK commands.
 * VMW_BALLOON_BATCHED_2M_CMDS:
 *	BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
 * VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
 *	VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
 *
 * Command number 5 is not defined in this file.
 */
#define VMW_BALLOON_CMD_START			0
#define VMW_BALLOON_CMD_GET_TARGET		1
#define VMW_BALLOON_CMD_LOCK			2
#define VMW_BALLOON_CMD_UNLOCK			3
#define VMW_BALLOON_CMD_GUEST_ID		4
#define VMW_BALLOON_CMD_BATCHED_LOCK		6
#define VMW_BALLOON_CMD_BATCHED_UNLOCK		7
#define VMW_BALLOON_CMD_BATCHED_2M_LOCK		8
#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK	9
#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET	10
 144
 145
/*
 * Status codes returned by the host, either as the status of a backdoor
 * command or, for batched commands, in the status bits of a batch entry.
 */
#define VMW_BALLOON_SUCCESS		        0
#define VMW_BALLOON_FAILURE		        -1
#define VMW_BALLOON_ERROR_CMD_INVALID	        1
#define VMW_BALLOON_ERROR_PPN_INVALID	        2
#define VMW_BALLOON_ERROR_PPN_LOCKED	        3
#define VMW_BALLOON_ERROR_PPN_UNLOCKED	        4
#define VMW_BALLOON_ERROR_PPN_PINNED	        5
#define VMW_BALLOON_ERROR_PPN_NOTNEEDED	        6
/* The host asks the guest to run the reset sequence (see vmballoon_reset). */
#define VMW_BALLOON_ERROR_RESET		        7
#define VMW_BALLOON_ERROR_BUSY		        8

/*
 * START status telling the guest that the result register holds the
 * capability set to use (see vmballoon_send_start()).
 */
#define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES	(0x03000000)
 159
 160/* Batch page description */
 161
 162/*
 163 * Layout of a page in the batch page:
 164 *
 165 * +-------------+----------+--------+
 166 * |             |          |        |
 167 * | Page number | Reserved | Status |
 168 * |             |          |        |
 169 * +-------------+----------+--------+
 170 * 64  PAGE_SHIFT          6         0
 171 *
 172 * The reserved field should be set to 0.
 173 */
 174#define VMW_BALLOON_BATCH_MAX_PAGES	(PAGE_SIZE / sizeof(u64))
 175#define VMW_BALLOON_BATCH_STATUS_MASK	((1UL << 5) - 1)
 176#define VMW_BALLOON_BATCH_PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
 177
 178struct vmballoon_batch_page {
 179	u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
 180};
 181
 182static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
 183{
 184	return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
 185}
 186
 187static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
 188				int idx)
 189{
 190	return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
 191}
 192
 193static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
 194				u64 pa)
 195{
 196	batch->pages[idx] = pa;
 197}
 198
 199
/*
 * Issue backdoor command VMW_BALLOON_CMD_<cmd> to the hypervisor.
 *
 * Register use, as given by the asm constraints:
 *   in:  a = VMW_BALLOON_HV_MAGIC, c = command number,
 *        d = VMW_BALLOON_HV_PORT, b = arg1, S = arg2
 *   out: a = status, b = result, c/d/S = scratch
 *
 * For the START command the result is taken from the c register instead
 * of b. The macro evaluates to the status; @result must be an lvalue and
 * is overwritten.
 *
 * The two "& -1UL" operations leave the values unchanged, since -1UL has
 * every bit set.
 */
#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)		\
({								\
	unsigned long __status, __dummy1, __dummy2, __dummy3;	\
	__asm__ __volatile__ ("inl %%dx" :			\
		"=a"(__status),					\
		"=c"(__dummy1),					\
		"=d"(__dummy2),					\
		"=b"(result),					\
		"=S" (__dummy3) :				\
		"0"(VMW_BALLOON_HV_MAGIC),			\
		"1"(VMW_BALLOON_CMD_##cmd),			\
		"2"(VMW_BALLOON_HV_PORT),			\
		"3"(arg1),					\
		"4" (arg2) :					\
		"memory");					\
	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)	\
		result = __dummy1;				\
	result &= -1UL;						\
	__status & -1UL;					\
})
 220
#ifdef CONFIG_DEBUG_FS
/*
 * Event counters, only compiled in with CONFIG_DEBUG_FS. Arrays sized
 * VMW_BALLOON_NUM_PAGE_SIZES are indexed by "is this a 2m page".
 */
struct vmballoon_stats {
	/* runs of the work function / doorbell notifications received */
	unsigned int timer;
	unsigned int doorbell;

	/* allocation statistics */
	/* non-sleeping allocation attempts, and allocation failures */
	unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	/* sleeping allocation attempts and their failures */
	unsigned int sleep_alloc;
	unsigned int sleep_alloc_fail;
	/* pages refused by the host, and refused pages given back */
	unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
	/* balloon pages returned to the guest */
	unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];

	/* monitor operations */
	unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
	unsigned int target;
	unsigned int target_fail;
	unsigned int start;
	unsigned int start_fail;
	unsigned int guest_type;
	unsigned int guest_type_fail;
	unsigned int doorbell_set;
	unsigned int doorbell_unset;
};

/* Bump a counter; expands to nothing when statistics are compiled out. */
#define STATS_INC(stat) (stat)++
#else
#define STATS_INC(stat)
#endif
 254
struct vmballoon;

/*
 * Protocol-specific operations; two implementations exist, one per page
 * (basic) and one per batch of pages (batched).
 *
 * @add_page: stage page @p as item @idx of the next lock/unlock request.
 * @lock:     offer the @num_pages staged pages to the host; returns 0 or
 *            a negative errno.
 * @unlock:   ask the host to release the @num_pages staged pages; returns
 *            0 or a negative errno.
 *
 * For @lock and @unlock, *@target receives the result value of the
 * backdoor command.
 */
struct vmballoon_ops {
	void (*add_page)(struct vmballoon *b, int idx, struct page *p);
	int (*lock)(struct vmballoon *b, unsigned int num_pages,
			bool is_2m_pages, unsigned int *target);
	int (*unlock)(struct vmballoon *b, unsigned int num_pages,
			bool is_2m_pages, unsigned int *target);
};
 264
/* Per page size bookkeeping; pages are linked through their lru field. */
struct vmballoon_page_size {
	/* list of reserved physical pages */
	struct list_head pages;

	/* transient list of non-balloonable pages */
	struct list_head refused_pages;
	/* length of refused_pages, capped at VMW_BALLOON_MAX_REFUSED */
	unsigned int n_refused_pages;
};
 273
/* Complete state of the balloon. */
struct vmballoon {
	/* index 0: 4k pages, index 1: 2m pages */
	struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];

	/* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
	unsigned supported_page_sizes;

	/* balloon size in pages */
	unsigned int size;
	/* size requested by the host, in the same unit as size */
	unsigned int target;

	/* reset flag */
	bool reset_required;

	/* adjustment rates (pages per second) */
	unsigned int rate_alloc;

	/* slowdown page allocations for next few cycles */
	unsigned int slow_allocation_cycles;

	/* capability bits in use, as stored by vmballoon_send_start() */
	unsigned long capabilities;

	/* kernel mapping of the batch page; NULL when batching is not set up */
	struct vmballoon_batch_page *batch_page;
	/* number of pages staged before a lock/unlock request is sent */
	unsigned int batch_max_pages;
	/*
	 * Batched ops: the page backing batch_page.
	 * Basic ops: the page staged by the last add_page call.
	 */
	struct page *page;

	/* basic or batched operations, selected on reset */
	const struct vmballoon_ops *ops;

#ifdef CONFIG_DEBUG_FS
	/* statistics */
	struct vmballoon_stats stats;

	/* debugfs file exporting statistics */
	struct dentry *dbg_entry;
#endif

	/* refreshed by si_meminfo() on every GET_TARGET request */
	struct sysinfo sysinfo;

	/* work item running the balloon work function */
	struct delayed_work dwork;

	/* doorbell used by the host to trigger the work item immediately */
	struct vmci_handle vmci_doorbell;
};

/* The single balloon instance of this driver. */
static struct vmballoon balloon;
 317
 318/*
 319 * Send "start" command to the host, communicating supported version
 320 * of the protocol.
 321 */
 322static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
 323{
 324	unsigned long status, capabilities, dummy = 0;
 325	bool success;
 326
 327	STATS_INC(b->stats.start);
 328
 329	status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
 330
 331	switch (status) {
 332	case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
 333		b->capabilities = capabilities;
 334		success = true;
 335		break;
 336	case VMW_BALLOON_SUCCESS:
 337		b->capabilities = VMW_BALLOON_BASIC_CMDS;
 338		success = true;
 339		break;
 340	default:
 341		success = false;
 342	}
 343
 344	if (b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS)
 345		b->supported_page_sizes = 2;
 346	else
 347		b->supported_page_sizes = 1;
 348
 349	if (!success) {
 350		pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 351		STATS_INC(b->stats.start_fail);
 352	}
 353	return success;
 354}
 355
 356static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
 357{
 358	switch (status) {
 359	case VMW_BALLOON_SUCCESS:
 360		return true;
 361
 362	case VMW_BALLOON_ERROR_RESET:
 363		b->reset_required = true;
 364		/* fall through */
 365
 366	default:
 367		return false;
 368	}
 369}
 370
 371/*
 372 * Communicate guest type to the host so that it can adjust ballooning
 373 * algorithm to the one most appropriate for the guest. This command
 374 * is normally issued after sending "start" command and is part of
 375 * standard reset sequence.
 376 */
 377static bool vmballoon_send_guest_id(struct vmballoon *b)
 378{
 379	unsigned long status, dummy = 0;
 380
 381	status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
 382				dummy);
 383
 384	STATS_INC(b->stats.guest_type);
 385
 386	if (vmballoon_check_status(b, status))
 387		return true;
 388
 389	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 390	STATS_INC(b->stats.guest_type_fail);
 391	return false;
 392}
 393
 394static u16 vmballoon_page_size(bool is_2m_page)
 395{
 396	if (is_2m_page)
 397		return 1 << VMW_BALLOON_2M_SHIFT;
 398
 399	return 1;
 400}
 401
 402/*
 403 * Retrieve desired balloon size from the host.
 404 */
 405static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
 406{
 407	unsigned long status;
 408	unsigned long target;
 409	unsigned long limit;
 410	unsigned long dummy = 0;
 411	u32 limit32;
 412
 413	/*
 414	 * si_meminfo() is cheap. Moreover, we want to provide dynamic
 415	 * max balloon size later. So let us call si_meminfo() every
 416	 * iteration.
 417	 */
 418	si_meminfo(&b->sysinfo);
 419	limit = b->sysinfo.totalram;
 420
 421	/* Ensure limit fits in 32-bits */
 422	limit32 = (u32)limit;
 423	if (limit != limit32)
 424		return false;
 425
 426	/* update stats */
 427	STATS_INC(b->stats.target);
 428
 429	status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
 430	if (vmballoon_check_status(b, status)) {
 431		*new_target = target;
 432		return true;
 433	}
 434
 435	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 436	STATS_INC(b->stats.target_fail);
 437	return false;
 438}
 439
 440/*
 441 * Notify the host about allocated page so that host can use it without
 442 * fear that guest will need it. Host may reject some pages, we need to
 443 * check the return value and maybe submit a different page.
 444 */
 445static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
 446				unsigned int *hv_status, unsigned int *target)
 447{
 448	unsigned long status, dummy = 0;
 449	u32 pfn32;
 450
 451	pfn32 = (u32)pfn;
 452	if (pfn32 != pfn)
 453		return -1;
 454
 455	STATS_INC(b->stats.lock[false]);
 456
 457	*hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
 458	if (vmballoon_check_status(b, status))
 459		return 0;
 460
 461	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
 462	STATS_INC(b->stats.lock_fail[false]);
 463	return 1;
 464}
 465
 466static int vmballoon_send_batched_lock(struct vmballoon *b,
 467		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 468{
 469	unsigned long status;
 470	unsigned long pfn = page_to_pfn(b->page);
 471
 472	STATS_INC(b->stats.lock[is_2m_pages]);
 473
 474	if (is_2m_pages)
 475		status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
 476				*target);
 477	else
 478		status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
 479				*target);
 480
 481	if (vmballoon_check_status(b, status))
 482		return 0;
 483
 484	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
 485	STATS_INC(b->stats.lock_fail[is_2m_pages]);
 486	return 1;
 487}
 488
 489/*
 490 * Notify the host that guest intends to release given page back into
 491 * the pool of available (to the guest) pages.
 492 */
 493static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
 494							unsigned int *target)
 495{
 496	unsigned long status, dummy = 0;
 497	u32 pfn32;
 498
 499	pfn32 = (u32)pfn;
 500	if (pfn32 != pfn)
 501		return false;
 502
 503	STATS_INC(b->stats.unlock[false]);
 504
 505	status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
 506	if (vmballoon_check_status(b, status))
 507		return true;
 508
 509	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
 510	STATS_INC(b->stats.unlock_fail[false]);
 511	return false;
 512}
 513
 514static bool vmballoon_send_batched_unlock(struct vmballoon *b,
 515		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 516{
 517	unsigned long status;
 518	unsigned long pfn = page_to_pfn(b->page);
 519
 520	STATS_INC(b->stats.unlock[is_2m_pages]);
 521
 522	if (is_2m_pages)
 523		status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
 524				*target);
 525	else
 526		status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
 527				*target);
 528
 529	if (vmballoon_check_status(b, status))
 530		return true;
 531
 532	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
 533	STATS_INC(b->stats.unlock_fail[is_2m_pages]);
 534	return false;
 535}
 536
 537static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
 538{
 539	if (is_2m_page)
 540		return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
 541
 542	return alloc_page(flags);
 543}
 544
 545static void vmballoon_free_page(struct page *page, bool is_2m_page)
 546{
 547	if (is_2m_page)
 548		__free_pages(page, VMW_BALLOON_2M_SHIFT);
 549	else
 550		__free_page(page);
 551}
 552
 553/*
 554 * Quickly release all pages allocated for the balloon. This function is
 555 * called when host decides to "reset" balloon for one reason or another.
 556 * Unlike normal "deflate" we do not (shall not) notify host of the pages
 557 * being released.
 558 */
 559static void vmballoon_pop(struct vmballoon *b)
 560{
 561	struct page *page, *next;
 562	unsigned is_2m_pages;
 563
 564	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
 565			is_2m_pages++) {
 566		struct vmballoon_page_size *page_size =
 567				&b->page_sizes[is_2m_pages];
 568		u16 size_per_page = vmballoon_page_size(is_2m_pages);
 569
 570		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
 571			list_del(&page->lru);
 572			vmballoon_free_page(page, is_2m_pages);
 573			STATS_INC(b->stats.free[is_2m_pages]);
 574			b->size -= size_per_page;
 575			cond_resched();
 576		}
 577	}
 578
 579	if (b->batch_page) {
 580		vunmap(b->batch_page);
 581		b->batch_page = NULL;
 582	}
 583
 584	if (b->page) {
 585		__free_page(b->page);
 586		b->page = NULL;
 587	}
 588}
 589
 590/*
 591 * Notify the host of a ballooned page. If host rejects the page put it on the
 592 * refuse list, those refused page are then released at the end of the
 593 * inflation cycle.
 594 */
 595static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
 596				bool is_2m_pages, unsigned int *target)
 597{
 598	int locked, hv_status;
 599	struct page *page = b->page;
 600	struct vmballoon_page_size *page_size = &b->page_sizes[false];
 601
 602	/* is_2m_pages can never happen as 2m pages support implies batching */
 603
 604	locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
 605								target);
 606	if (locked > 0) {
 607		STATS_INC(b->stats.refused_alloc[false]);
 608
 609		if (hv_status == VMW_BALLOON_ERROR_RESET ||
 610				hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED) {
 611			vmballoon_free_page(page, false);
 612			return -EIO;
 613		}
 614
 615		/*
 616		 * Place page on the list of non-balloonable pages
 617		 * and retry allocation, unless we already accumulated
 618		 * too many of them, in which case take a breather.
 619		 */
 620		if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
 621			page_size->n_refused_pages++;
 622			list_add(&page->lru, &page_size->refused_pages);
 623		} else {
 624			vmballoon_free_page(page, false);
 625		}
 626		return -EIO;
 627	}
 628
 629	/* track allocated page */
 630	list_add(&page->lru, &page_size->pages);
 631
 632	/* update balloon size */
 633	b->size++;
 634
 635	return 0;
 636}
 637
 638static int vmballoon_lock_batched_page(struct vmballoon *b,
 639		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 640{
 641	int locked, i;
 642	u16 size_per_page = vmballoon_page_size(is_2m_pages);
 643
 644	locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
 645			target);
 646	if (locked > 0) {
 647		for (i = 0; i < num_pages; i++) {
 648			u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 649			struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 650
 651			vmballoon_free_page(p, is_2m_pages);
 652		}
 653
 654		return -EIO;
 655	}
 656
 657	for (i = 0; i < num_pages; i++) {
 658		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 659		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 660		struct vmballoon_page_size *page_size =
 661				&b->page_sizes[is_2m_pages];
 662
 663		locked = vmballoon_batch_get_status(b->batch_page, i);
 664
 665		switch (locked) {
 666		case VMW_BALLOON_SUCCESS:
 667			list_add(&p->lru, &page_size->pages);
 668			b->size += size_per_page;
 669			break;
 670		case VMW_BALLOON_ERROR_PPN_PINNED:
 671		case VMW_BALLOON_ERROR_PPN_INVALID:
 672			if (page_size->n_refused_pages
 673					< VMW_BALLOON_MAX_REFUSED) {
 674				list_add(&p->lru, &page_size->refused_pages);
 675				page_size->n_refused_pages++;
 676				break;
 677			}
 678			/* Fallthrough */
 679		case VMW_BALLOON_ERROR_RESET:
 680		case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
 681			vmballoon_free_page(p, is_2m_pages);
 682			break;
 683		default:
 684			/* This should never happen */
 685			WARN_ON_ONCE(true);
 686		}
 687	}
 688
 689	return 0;
 690}
 691
 692/*
 693 * Release the page allocated for the balloon. Note that we first notify
 694 * the host so it can make sure the page will be available for the guest
 695 * to use, if needed.
 696 */
 697static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
 698		bool is_2m_pages, unsigned int *target)
 699{
 700	struct page *page = b->page;
 701	struct vmballoon_page_size *page_size = &b->page_sizes[false];
 702
 703	/* is_2m_pages can never happen as 2m pages support implies batching */
 704
 705	if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
 706		list_add(&page->lru, &page_size->pages);
 707		return -EIO;
 708	}
 709
 710	/* deallocate page */
 711	vmballoon_free_page(page, false);
 712	STATS_INC(b->stats.free[false]);
 713
 714	/* update balloon size */
 715	b->size--;
 716
 717	return 0;
 718}
 719
 720static int vmballoon_unlock_batched_page(struct vmballoon *b,
 721				unsigned int num_pages, bool is_2m_pages,
 722				unsigned int *target)
 723{
 724	int locked, i, ret = 0;
 725	bool hv_success;
 726	u16 size_per_page = vmballoon_page_size(is_2m_pages);
 727
 728	hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
 729			target);
 730	if (!hv_success)
 731		ret = -EIO;
 732
 733	for (i = 0; i < num_pages; i++) {
 734		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 735		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 736		struct vmballoon_page_size *page_size =
 737				&b->page_sizes[is_2m_pages];
 738
 739		locked = vmballoon_batch_get_status(b->batch_page, i);
 740		if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
 741			/*
 742			 * That page wasn't successfully unlocked by the
 743			 * hypervisor, re-add it to the list of pages owned by
 744			 * the balloon driver.
 745			 */
 746			list_add(&p->lru, &page_size->pages);
 747		} else {
 748			/* deallocate page */
 749			vmballoon_free_page(p, is_2m_pages);
 750			STATS_INC(b->stats.free[is_2m_pages]);
 751
 752			/* update balloon size */
 753			b->size -= size_per_page;
 754		}
 755	}
 756
 757	return ret;
 758}
 759
 760/*
 761 * Release pages that were allocated while attempting to inflate the
 762 * balloon but were refused by the host for one reason or another.
 763 */
 764static void vmballoon_release_refused_pages(struct vmballoon *b,
 765		bool is_2m_pages)
 766{
 767	struct page *page, *next;
 768	struct vmballoon_page_size *page_size =
 769			&b->page_sizes[is_2m_pages];
 770
 771	list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
 772		list_del(&page->lru);
 773		vmballoon_free_page(page, is_2m_pages);
 774		STATS_INC(b->stats.refused_free[is_2m_pages]);
 775	}
 776
 777	page_size->n_refused_pages = 0;
 778}
 779
 780static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
 781{
 782	b->page = p;
 783}
 784
 785static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
 786				struct page *p)
 787{
 788	vmballoon_batch_set_pa(b->batch_page, idx,
 789			(u64)page_to_pfn(p) << PAGE_SHIFT);
 790}
 791
/*
 * Inflate the balloon towards its target size. Note that we try to limit
 * the rate of allocation to make sure we are not choking the rest of the
 * system.
 *
 * Pages are allocated one by one, staged through ops->add_page and handed
 * to the host through ops->lock once batch_max_pages of them are staged.
 * Allocation starts with 2m pages (when supported and not in a slow
 * cycle), falls back to 4k pages when a 2m allocation fails, and from
 * non-sleeping to sleeping allocations when a 4k allocation fails.
 *
 * NOTE(review): "allocations" is initialised to 0 and never modified in
 * this function, so "allocations >= rate" and
 * "allocations >= b->rate_alloc" can only be true for a zero rate. The
 * throttling and the rate increase described in the comments therefore
 * look inert; confirm whether a counter increment is missing or the
 * limiting is meant to be disabled.
 */
static void vmballoon_inflate(struct vmballoon *b)
{
	unsigned rate;
	unsigned int allocations = 0;
	/* pages staged via add_page but not yet sent to the host */
	unsigned int num_pages = 0;
	int error = 0;
	gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
	bool is_2m_pages;

	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);

	/*
	 * First try NOSLEEP page allocations to inflate balloon.
	 *
	 * If we do not throttle nosleep allocations, we can drain all
	 * free pages in the guest quickly (if the balloon target is high).
	 * As a side-effect, draining free pages helps to inform (force)
	 * the guest to start swapping if balloon target is not met yet,
	 * which is a desired behavior. However, balloon driver can consume
	 * all available CPU cycles if too many pages are allocated in a
	 * second. Therefore, we throttle nosleep allocations even when
	 * the guest is not under memory pressure. OTOH, if we have already
	 * predicted that the guest is under memory pressure, then we
	 * slowdown page allocations considerably.
	 */

	/*
	 * Start with no sleep allocation rate which may be higher
	 * than sleeping allocation rate.
	 */
	if (b->slow_allocation_cycles) {
		rate = b->rate_alloc;
		is_2m_pages = false;
	} else {
		rate = UINT_MAX;
		is_2m_pages =
			b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;
	}

	pr_debug("%s - goal: %d, no-sleep rate: %u, sleep rate: %d\n",
		 __func__, b->target - b->size, rate, b->rate_alloc);

	/*
	 * Staged pages count towards the target; b->target may change
	 * inside the loop because it is passed to ops->lock as the result
	 * location.
	 */
	while (!b->reset_required &&
		b->size + num_pages * vmballoon_page_size(is_2m_pages)
		< b->target) {
		struct page *page;

		if (flags == VMW_PAGE_ALLOC_NOSLEEP)
			STATS_INC(b->stats.alloc[is_2m_pages]);
		else
			STATS_INC(b->stats.sleep_alloc);

		page = vmballoon_alloc_page(flags, is_2m_pages);
		if (!page) {
			STATS_INC(b->stats.alloc_fail[is_2m_pages]);

			if (is_2m_pages) {
				/* flush the 2m pages staged so far */
				b->ops->lock(b, num_pages, true, &b->target);

				/*
				 * ignore errors from locking as we now switch
				 * to 4k pages and we might get different
				 * errors.
				 */

				num_pages = 0;
				is_2m_pages = false;
				continue;
			}

			if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
				/*
				 * CANSLEEP page allocation failed, so guest
				 * is under severe memory pressure. Quickly
				 * decrease allocation rate.
				 */
				b->rate_alloc = max(b->rate_alloc / 2,
						    VMW_BALLOON_RATE_ALLOC_MIN);
				STATS_INC(b->stats.sleep_alloc_fail);
				break;
			}

			/*
			 * NOSLEEP page allocation failed, so the guest is
			 * under memory pressure. Let us slow down page
			 * allocations for next few cycles so that the guest
			 * gets out of memory pressure. Also, if we already
			 * allocated b->rate_alloc pages, let's pause,
			 * otherwise switch to sleeping allocations.
			 */
			b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;

			if (allocations >= b->rate_alloc)
				break;

			flags = VMW_PAGE_ALLOC_CANSLEEP;
			/* Lower rate for sleeping allocations. */
			rate = b->rate_alloc;
			continue;
		}

		b->ops->add_page(b, num_pages++, page);
		if (num_pages == b->batch_max_pages) {
			/* batch is full: hand it to the host */
			error = b->ops->lock(b, num_pages, is_2m_pages,
					&b->target);
			num_pages = 0;
			if (error)
				break;
		}

		cond_resched();

		if (allocations >= rate) {
			/* We allocated enough pages, let's take a break. */
			break;
		}
	}

	/* send whatever is still staged; the result is not checked */
	if (num_pages > 0)
		b->ops->lock(b, num_pages, is_2m_pages, &b->target);

	/*
	 * We reached our goal without failures so try increasing
	 * allocation rate.
	 */
	if (error == 0 && allocations >= b->rate_alloc) {
		unsigned int mult = allocations / b->rate_alloc;

		b->rate_alloc =
			min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
			    VMW_BALLOON_RATE_ALLOC_MAX);
	}

	/* pages the host refused during this cycle go back to the guest */
	vmballoon_release_refused_pages(b, true);
	vmballoon_release_refused_pages(b, false);
}
 933
 934/*
 935 * Decrease the size of the balloon allowing guest to use more memory.
 936 */
 937static void vmballoon_deflate(struct vmballoon *b)
 938{
 939	unsigned is_2m_pages;
 940
 941	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
 942
 943	/* free pages to reach target */
 944	for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
 945			is_2m_pages++) {
 946		struct page *page, *next;
 947		unsigned int num_pages = 0;
 948		struct vmballoon_page_size *page_size =
 949				&b->page_sizes[is_2m_pages];
 950
 951		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
 952			if (b->reset_required ||
 953				(b->target > 0 &&
 954					b->size - num_pages
 955					* vmballoon_page_size(is_2m_pages)
 956				< b->target + vmballoon_page_size(true)))
 957				break;
 958
 959			list_del(&page->lru);
 960			b->ops->add_page(b, num_pages++, page);
 961
 962			if (num_pages == b->batch_max_pages) {
 963				int error;
 964
 965				error = b->ops->unlock(b, num_pages,
 966						is_2m_pages, &b->target);
 967				num_pages = 0;
 968				if (error)
 969					return;
 970			}
 971
 972			cond_resched();
 973		}
 974
 975		if (num_pages > 0)
 976			b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
 977	}
 978}
 979
/*
 * Operations for the basic protocol: one 4k page per backdoor command.
 * Selected by vmballoon_reset() when only VMW_BALLOON_BASIC_CMDS is
 * available.
 */
static const struct vmballoon_ops vmballoon_basic_ops = {
	.add_page = vmballoon_add_page,
	.lock = vmballoon_lock_page,
	.unlock = vmballoon_unlock_page
};
 985
/*
 * Operations for the batched protocol: pages are described in the shared
 * batch page and submitted together. Selected by vmballoon_reset() when
 * VMW_BALLOON_BATCHED_CMDS is available.
 */
static const struct vmballoon_ops vmballoon_batched_ops = {
	.add_page = vmballoon_add_batched_page,
	.lock = vmballoon_lock_batched_page,
	.unlock = vmballoon_unlock_batched_page
};
 991
 992static bool vmballoon_init_batching(struct vmballoon *b)
 993{
 994	b->page = alloc_page(VMW_PAGE_ALLOC_NOSLEEP);
 995	if (!b->page)
 996		return false;
 997
 998	b->batch_page = vmap(&b->page, 1, VM_MAP, PAGE_KERNEL);
 999	if (!b->batch_page) {
1000		__free_page(b->page);
1001		return false;
1002	}
1003
1004	return true;
1005}
1006
1007/*
1008 * Receive notification and resize balloon
1009 */
1010static void vmballoon_doorbell(void *client_data)
1011{
1012	struct vmballoon *b = client_data;
1013
1014	STATS_INC(b->stats.doorbell);
1015
1016	mod_delayed_work(system_freezable_wq, &b->dwork, 0);
1017}
1018
1019/*
1020 * Clean up vmci doorbell
1021 */
1022static void vmballoon_vmci_cleanup(struct vmballoon *b)
1023{
1024	int error;
1025
1026	VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
1027			VMCI_INVALID_ID, error);
1028	STATS_INC(b->stats.doorbell_unset);
1029
1030	if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
1031		vmci_doorbell_destroy(b->vmci_doorbell);
1032		b->vmci_doorbell = VMCI_INVALID_HANDLE;
1033	}
1034}
1035
1036/*
1037 * Initialize vmci doorbell, to get notified as soon as balloon changes
1038 */
1039static int vmballoon_vmci_init(struct vmballoon *b)
1040{
1041	int error = 0;
1042
1043	if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) != 0) {
1044		error = vmci_doorbell_create(&b->vmci_doorbell,
1045				VMCI_FLAG_DELAYED_CB,
1046				VMCI_PRIVILEGE_FLAG_RESTRICTED,
1047				vmballoon_doorbell, b);
1048
1049		if (error == VMCI_SUCCESS) {
1050			VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET,
1051					b->vmci_doorbell.context,
1052					b->vmci_doorbell.resource, error);
1053			STATS_INC(b->stats.doorbell_set);
1054		}
1055	}
1056
1057	if (error != 0) {
1058		vmballoon_vmci_cleanup(b);
1059
1060		return -EIO;
1061	}
1062
1063	return 0;
1064}
1065
1066/*
1067 * Perform standard reset sequence by popping the balloon (in case it
1068 * is not  empty) and then restarting protocol. This operation normally
1069 * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
1070 */
1071static void vmballoon_reset(struct vmballoon *b)
1072{
1073	int error;
1074
1075	vmballoon_vmci_cleanup(b);
1076
1077	/* free all pages, skipping monitor unlock */
1078	vmballoon_pop(b);
1079
1080	if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
1081		return;
1082
1083	if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
1084		b->ops = &vmballoon_batched_ops;
1085		b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
1086		if (!vmballoon_init_batching(b)) {
1087			/*
1088			 * We failed to initialize batching, inform the monitor
1089			 * about it by sending a null capability.
1090			 *
1091			 * The guest will retry in one second.
1092			 */
1093			vmballoon_send_start(b, 0);
1094			return;
1095		}
1096	} else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
1097		b->ops = &vmballoon_basic_ops;
1098		b->batch_max_pages = 1;
1099	}
1100
1101	b->reset_required = false;
1102
1103	error = vmballoon_vmci_init(b);
1104	if (error)
1105		pr_err("failed to initialize vmci doorbell\n");
1106
1107	if (!vmballoon_send_guest_id(b))
1108		pr_err("failed to send guest ID to the host\n");
1109}
1110
1111/*
1112 * Balloon work function: reset protocol, if needed, get the new size and
1113 * adjust balloon as needed. Repeat in 1 sec.
1114 */
1115static void vmballoon_work(struct work_struct *work)
1116{
1117	struct delayed_work *dwork = to_delayed_work(work);
1118	struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
1119	unsigned int target;
1120
1121	STATS_INC(b->stats.timer);
1122
1123	if (b->reset_required)
1124		vmballoon_reset(b);
1125
1126	if (b->slow_allocation_cycles > 0)
1127		b->slow_allocation_cycles--;
1128
1129	if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
1130		/* update target, adjust size */
1131		b->target = target;
1132
1133		if (b->size < target)
1134			vmballoon_inflate(b);
1135		else if (target == 0 ||
1136				b->size > target + vmballoon_page_size(true))
1137			vmballoon_deflate(b);
1138	}
1139
1140	/*
1141	 * We are using a freezable workqueue so that balloon operations are
1142	 * stopped while the system transitions to/from sleep/hibernation.
1143	 */
1144	queue_delayed_work(system_freezable_wq,
1145			   dwork, round_jiffies_relative(HZ));
1146}
1147
1148/*
1149 * DEBUGFS Interface
1150 */
1151#ifdef CONFIG_DEBUG_FS
1152
1153static int vmballoon_debug_show(struct seq_file *f, void *offset)
1154{
1155	struct vmballoon *b = f->private;
1156	struct vmballoon_stats *stats = &b->stats;
1157
1158	/* format capabilities info */
1159	seq_printf(f,
1160		   "balloon capabilities:   %#4x\n"
1161		   "used capabilities:      %#4lx\n"
1162		   "is resetting:           %c\n",
1163		   VMW_BALLOON_CAPABILITIES, b->capabilities,
1164		   b->reset_required ? 'y' : 'n');
1165
1166	/* format size info */
1167	seq_printf(f,
1168		   "target:             %8d pages\n"
1169		   "current:            %8d pages\n",
1170		   b->target, b->size);
1171
1172	/* format rate info */
1173	seq_printf(f,
1174		   "rateSleepAlloc:     %8d pages/sec\n",
1175		   b->rate_alloc);
1176
1177	seq_printf(f,
1178		   "\n"
1179		   "timer:              %8u\n"
1180		   "doorbell:           %8u\n"
1181		   "start:              %8u (%4u failed)\n"
1182		   "guestType:          %8u (%4u failed)\n"
1183		   "2m-lock:            %8u (%4u failed)\n"
1184		   "lock:               %8u (%4u failed)\n"
1185		   "2m-unlock:          %8u (%4u failed)\n"
1186		   "unlock:             %8u (%4u failed)\n"
1187		   "target:             %8u (%4u failed)\n"
1188		   "prim2mAlloc:        %8u (%4u failed)\n"
1189		   "primNoSleepAlloc:   %8u (%4u failed)\n"
1190		   "primCanSleepAlloc:  %8u (%4u failed)\n"
1191		   "prim2mFree:         %8u\n"
1192		   "primFree:           %8u\n"
1193		   "err2mAlloc:         %8u\n"
1194		   "errAlloc:           %8u\n"
1195		   "err2mFree:          %8u\n"
1196		   "errFree:            %8u\n"
1197		   "doorbellSet:        %8u\n"
1198		   "doorbellUnset:      %8u\n",
1199		   stats->timer,
1200		   stats->doorbell,
1201		   stats->start, stats->start_fail,
1202		   stats->guest_type, stats->guest_type_fail,
1203		   stats->lock[true],  stats->lock_fail[true],
1204		   stats->lock[false],  stats->lock_fail[false],
1205		   stats->unlock[true], stats->unlock_fail[true],
1206		   stats->unlock[false], stats->unlock_fail[false],
1207		   stats->target, stats->target_fail,
1208		   stats->alloc[true], stats->alloc_fail[true],
1209		   stats->alloc[false], stats->alloc_fail[false],
1210		   stats->sleep_alloc, stats->sleep_alloc_fail,
1211		   stats->free[true],
1212		   stats->free[false],
1213		   stats->refused_alloc[true], stats->refused_alloc[false],
1214		   stats->refused_free[true], stats->refused_free[false],
1215		   stats->doorbell_set, stats->doorbell_unset);
1216
1217	return 0;
1218}
1219
1220static int vmballoon_debug_open(struct inode *inode, struct file *file)
1221{
1222	return single_open(file, vmballoon_debug_show, inode->i_private);
1223}
1224
1225static const struct file_operations vmballoon_debug_fops = {
1226	.owner		= THIS_MODULE,
1227	.open		= vmballoon_debug_open,
1228	.read		= seq_read,
1229	.llseek		= seq_lseek,
1230	.release	= single_release,
1231};
1232
1233static int __init vmballoon_debugfs_init(struct vmballoon *b)
1234{
1235	int error;
1236
1237	b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
1238					   &vmballoon_debug_fops);
1239	if (IS_ERR(b->dbg_entry)) {
1240		error = PTR_ERR(b->dbg_entry);
1241		pr_err("failed to create debugfs entry, error: %d\n", error);
1242		return error;
1243	}
1244
1245	return 0;
1246}
1247
1248static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
1249{
1250	debugfs_remove(b->dbg_entry);
1251}
1252
1253#else
1254
/* Without CONFIG_DEBUG_FS there is nothing to create; always succeeds. */
static inline int vmballoon_debugfs_init(struct vmballoon *b)
{
	return 0;
}

/* Without CONFIG_DEBUG_FS there is nothing to remove. */
static inline void vmballoon_debugfs_exit(struct vmballoon *b)
{
}
1263
1264#endif	/* CONFIG_DEBUG_FS */
1265
1266static int __init vmballoon_init(void)
1267{
1268	int error;
1269	unsigned is_2m_pages;
1270	/*
1271	 * Check if we are running on VMware's hypervisor and bail out
1272	 * if we are not.
1273	 */
1274	if (x86_hyper != &x86_hyper_vmware)
1275		return -ENODEV;
1276
1277	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
1278			is_2m_pages++) {
1279		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
1280		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
1281	}
1282
1283	/* initialize rates */
1284	balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
1285
1286	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
1287
1288	error = vmballoon_debugfs_init(&balloon);
1289	if (error)
1290		return error;
1291
1292	balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
1293	balloon.batch_page = NULL;
1294	balloon.page = NULL;
1295	balloon.reset_required = true;
1296
1297	queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);
1298
1299	return 0;
1300}
1301module_init(vmballoon_init);
1302
1303static void __exit vmballoon_exit(void)
1304{
1305	vmballoon_vmci_cleanup(&balloon);
1306	cancel_delayed_work_sync(&balloon.dwork);
1307
1308	vmballoon_debugfs_exit(&balloon);
1309
1310	/*
1311	 * Deallocate all reserved memory, and reset connection with monitor.
1312	 * Reset connection before deallocating memory to avoid potential for
1313	 * additional spurious resets from guest touching deallocated pages.
1314	 */
1315	vmballoon_send_start(&balloon, 0);
1316	vmballoon_pop(&balloon);
1317}
1318module_exit(vmballoon_exit);

   1/*
   2 * VMware Balloon driver.
   3 *
   4 * Copyright (C) 2000-2014, VMware, Inc. All Rights Reserved.
   5 *
   6 * This program is free software; you can redistribute it and/or modify it
   7 * under the terms of the GNU General Public License as published by the
   8 * Free Software Foundation; version 2 of the License and no later version.
   9 *
  10 * This program is distributed in the hope that it will be useful, but
  11 * WITHOUT ANY WARRANTY; without even the implied warranty of
  12 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
  13 * NON INFRINGEMENT.  See the GNU General Public License for more
  14 * details.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * along with this program; if not, write to the Free Software
  18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19 *
  20 * Maintained by:	Xavier Deguillard <xdeguillard@vmware.com>
  21 *			Philip Moltmann <moltmann@vmware.com>
  22 */
  23
  24/*
  25 * This is VMware physical memory management driver for Linux. The driver
  26 * acts like a "balloon" that can be inflated to reclaim physical pages by
  27 * reserving them in the guest and invalidating them in the monitor,
  28 * freeing up the underlying machine pages so they can be allocated to
  29 * other guests.  The balloon can also be deflated to allow the guest to
  30 * use more physical memory. Higher level policies can control the sizes
  31 * of balloons in VMs in order to manage physical memory resources.
  32 */
  33
  34//#define DEBUG
  35#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  36
  37#include <linux/types.h>
  38#include <linux/kernel.h>
  39#include <linux/mm.h>
  40#include <linux/vmalloc.h>
  41#include <linux/sched.h>
  42#include <linux/module.h>
  43#include <linux/workqueue.h>
  44#include <linux/debugfs.h>
  45#include <linux/seq_file.h>
  46#include <linux/vmw_vmci_defs.h>
  47#include <linux/vmw_vmci_api.h>
  48#include <asm/hypervisor.h>
  49
  50MODULE_AUTHOR("VMware, Inc.");
  51MODULE_DESCRIPTION("VMware Memory Control (Balloon) Driver");
  52MODULE_VERSION("1.5.0.0-k");
  53MODULE_ALIAS("dmi:*:svnVMware*:*");
  54MODULE_ALIAS("vmware_vmmemctl");
  55MODULE_LICENSE("GPL");
  56
  57/*
  58 * Various constants controlling rate of inflaint/deflating balloon,
  59 * measured in pages.
  60 */
  61
  62/*
  63 * Rates of memory allocaton when guest experiences memory pressure
  64 * (driver performs sleeping allocations).
  65 */
  66#define VMW_BALLOON_RATE_ALLOC_MIN	512U
  67#define VMW_BALLOON_RATE_ALLOC_MAX	2048U
  68#define VMW_BALLOON_RATE_ALLOC_INC	16U
  69
  70/*
  71 * When guest is under memory pressure, use a reduced page allocation
  72 * rate for next several cycles.
  73 */
  74#define VMW_BALLOON_SLOW_CYCLES		4
  75
  76/*
  77 * Use __GFP_HIGHMEM to allow pages from HIGHMEM zone. We don't
  78 * allow wait (__GFP_RECLAIM) for NOSLEEP page allocations. Use
  79 * __GFP_NOWARN, to suppress page allocation failure warnings.
  80 */
  81#define VMW_PAGE_ALLOC_NOSLEEP		(__GFP_HIGHMEM|__GFP_NOWARN)
  82
  83/*
  84 * Use GFP_HIGHUSER when executing in a separate kernel thread
  85 * context and allocation can sleep.  This is less stressful to
  86 * the guest memory system, since it allows the thread to block
  87 * while memory is reclaimed, and won't take pages from emergency
  88 * low-memory pools.
  89 */
  90#define VMW_PAGE_ALLOC_CANSLEEP		(GFP_HIGHUSER)
  91
  92/* Maximum number of refused pages we accumulate during inflation cycle */
  93#define VMW_BALLOON_MAX_REFUSED		16
  94
  95/*
  96 * Hypervisor communication port definitions.
  97 */
  98#define VMW_BALLOON_HV_PORT		0x5670
  99#define VMW_BALLOON_HV_MAGIC		0x456c6d6f
 100#define VMW_BALLOON_GUEST_ID		1	/* Linux */
 101
 102enum vmwballoon_capabilities {
 103	/*
 104	 * Bit 0 is reserved and not associated to any capability.
 105	 */
 106	VMW_BALLOON_BASIC_CMDS			= (1 << 1),
 107	VMW_BALLOON_BATCHED_CMDS		= (1 << 2),
 108	VMW_BALLOON_BATCHED_2M_CMDS		= (1 << 3),
 109	VMW_BALLOON_SIGNALLED_WAKEUP_CMD	= (1 << 4),
 110};
 111
 112#define VMW_BALLOON_CAPABILITIES	(VMW_BALLOON_BASIC_CMDS \
 113					| VMW_BALLOON_BATCHED_CMDS \
 114					| VMW_BALLOON_BATCHED_2M_CMDS \
 115					| VMW_BALLOON_SIGNALLED_WAKEUP_CMD)
 116
 117#define VMW_BALLOON_2M_SHIFT		(9)
 118#define VMW_BALLOON_NUM_PAGE_SIZES	(2)
 119
 120/*
 121 * Backdoor commands availability:
 122 *
 123 * START, GET_TARGET and GUEST_ID are always available,
 124 *
 125 * VMW_BALLOON_BASIC_CMDS:
 126 *	LOCK and UNLOCK commands,
 127 * VMW_BALLOON_BATCHED_CMDS:
 128 *	BATCHED_LOCK and BATCHED_UNLOCK commands.
 129 * VMW BALLOON_BATCHED_2M_CMDS:
 130 *	BATCHED_2M_LOCK and BATCHED_2M_UNLOCK commands,
 131 * VMW VMW_BALLOON_SIGNALLED_WAKEUP_CMD:
 132 *	VMW_BALLOON_CMD_VMCI_DOORBELL_SET command.
 133 */
 134#define VMW_BALLOON_CMD_START			0
 135#define VMW_BALLOON_CMD_GET_TARGET		1
 136#define VMW_BALLOON_CMD_LOCK			2
 137#define VMW_BALLOON_CMD_UNLOCK			3
 138#define VMW_BALLOON_CMD_GUEST_ID		4
 139#define VMW_BALLOON_CMD_BATCHED_LOCK		6
 140#define VMW_BALLOON_CMD_BATCHED_UNLOCK		7
 141#define VMW_BALLOON_CMD_BATCHED_2M_LOCK		8
 142#define VMW_BALLOON_CMD_BATCHED_2M_UNLOCK	9
 143#define VMW_BALLOON_CMD_VMCI_DOORBELL_SET	10
 144
 145
 146/* error codes */
 147#define VMW_BALLOON_SUCCESS		        0
 148#define VMW_BALLOON_FAILURE		        -1
 149#define VMW_BALLOON_ERROR_CMD_INVALID	        1
 150#define VMW_BALLOON_ERROR_PPN_INVALID	        2
 151#define VMW_BALLOON_ERROR_PPN_LOCKED	        3
 152#define VMW_BALLOON_ERROR_PPN_UNLOCKED	        4
 153#define VMW_BALLOON_ERROR_PPN_PINNED	        5
 154#define VMW_BALLOON_ERROR_PPN_NOTNEEDED	        6
 155#define VMW_BALLOON_ERROR_RESET		        7
 156#define VMW_BALLOON_ERROR_BUSY		        8
 157
 158#define VMW_BALLOON_SUCCESS_WITH_CAPABILITIES	(0x03000000)
 159
 160/* Batch page description */
 161
 162/*
 163 * Layout of a page in the batch page:
 164 *
 165 * +-------------+----------+--------+
 166 * |             |          |        |
 167 * | Page number | Reserved | Status |
 168 * |             |          |        |
 169 * +-------------+----------+--------+
 170 * 64  PAGE_SHIFT          6         0
 171 *
 172 * The reserved field should be set to 0.
 173 */
 174#define VMW_BALLOON_BATCH_MAX_PAGES	(PAGE_SIZE / sizeof(u64))
 175#define VMW_BALLOON_BATCH_STATUS_MASK	((1UL << 5) - 1)
 176#define VMW_BALLOON_BATCH_PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
 177
 178struct vmballoon_batch_page {
 179	u64 pages[VMW_BALLOON_BATCH_MAX_PAGES];
 180};
 181
 182static u64 vmballoon_batch_get_pa(struct vmballoon_batch_page *batch, int idx)
 183{
 184	return batch->pages[idx] & VMW_BALLOON_BATCH_PAGE_MASK;
 185}
 186
 187static int vmballoon_batch_get_status(struct vmballoon_batch_page *batch,
 188				int idx)
 189{
 190	return (int)(batch->pages[idx] & VMW_BALLOON_BATCH_STATUS_MASK);
 191}
 192
 193static void vmballoon_batch_set_pa(struct vmballoon_batch_page *batch, int idx,
 194				u64 pa)
 195{
 196	batch->pages[idx] = pa;
 197}
 198
 199
 200#define VMWARE_BALLOON_CMD(cmd, arg1, arg2, result)		\
 201({								\
 202	unsigned long __status, __dummy1, __dummy2, __dummy3;	\
 203	__asm__ __volatile__ ("inl %%dx" :			\
 204		"=a"(__status),					\
 205		"=c"(__dummy1),					\
 206		"=d"(__dummy2),					\
 207		"=b"(result),					\
 208		"=S" (__dummy3) :				\
 209		"0"(VMW_BALLOON_HV_MAGIC),			\
 210		"1"(VMW_BALLOON_CMD_##cmd),			\
 211		"2"(VMW_BALLOON_HV_PORT),			\
 212		"3"(arg1),					\
 213		"4" (arg2) :					\
 214		"memory");					\
 215	if (VMW_BALLOON_CMD_##cmd == VMW_BALLOON_CMD_START)	\
 216		result = __dummy1;				\
 217	result &= -1UL;						\
 218	__status & -1UL;					\
 219})
 220
 221#ifdef CONFIG_DEBUG_FS
 222struct vmballoon_stats {
 223	unsigned int timer;
 224	unsigned int doorbell;
 225
 226	/* allocation statistics */
 227	unsigned int alloc[VMW_BALLOON_NUM_PAGE_SIZES];
 228	unsigned int alloc_fail[VMW_BALLOON_NUM_PAGE_SIZES];
 229	unsigned int sleep_alloc;
 230	unsigned int sleep_alloc_fail;
 231	unsigned int refused_alloc[VMW_BALLOON_NUM_PAGE_SIZES];
 232	unsigned int refused_free[VMW_BALLOON_NUM_PAGE_SIZES];
 233	unsigned int free[VMW_BALLOON_NUM_PAGE_SIZES];
 234
 235	/* monitor operations */
 236	unsigned int lock[VMW_BALLOON_NUM_PAGE_SIZES];
 237	unsigned int lock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
 238	unsigned int unlock[VMW_BALLOON_NUM_PAGE_SIZES];
 239	unsigned int unlock_fail[VMW_BALLOON_NUM_PAGE_SIZES];
 240	unsigned int target;
 241	unsigned int target_fail;
 242	unsigned int start;
 243	unsigned int start_fail;
 244	unsigned int guest_type;
 245	unsigned int guest_type_fail;
 246	unsigned int doorbell_set;
 247	unsigned int doorbell_unset;
 248};
 249
 250#define STATS_INC(stat) (stat)++
 251#else
 252#define STATS_INC(stat)
 253#endif
 254
 255struct vmballoon;
 256
 257struct vmballoon_ops {
 258	void (*add_page)(struct vmballoon *b, int idx, struct page *p);
 259	int (*lock)(struct vmballoon *b, unsigned int num_pages,
 260			bool is_2m_pages, unsigned int *target);
 261	int (*unlock)(struct vmballoon *b, unsigned int num_pages,
 262			bool is_2m_pages, unsigned int *target);
 263};
 264
 265struct vmballoon_page_size {
 266	/* list of reserved physical pages */
 267	struct list_head pages;
 268
 269	/* transient list of non-balloonable pages */
 270	struct list_head refused_pages;
 271	unsigned int n_refused_pages;
 272};
 273
 274struct vmballoon {
 275	struct vmballoon_page_size page_sizes[VMW_BALLOON_NUM_PAGE_SIZES];
 276
 277	/* supported page sizes. 1 == 4k pages only, 2 == 4k and 2m pages */
 278	unsigned supported_page_sizes;
 279
 280	/* balloon size in pages */
 281	unsigned int size;
 282	unsigned int target;
 283
 284	/* reset flag */
 285	bool reset_required;
 286
 287	/* adjustment rates (pages per second) */
 288	unsigned int rate_alloc;
 289
 290	/* slowdown page allocations for next few cycles */
 291	unsigned int slow_allocation_cycles;
 292
 293	unsigned long capabilities;
 294
 295	struct vmballoon_batch_page *batch_page;
 296	unsigned int batch_max_pages;
 297	struct page *page;
 298
 299	const struct vmballoon_ops *ops;
 300
 301#ifdef CONFIG_DEBUG_FS
 302	/* statistics */
 303	struct vmballoon_stats stats;
 304
 305	/* debugfs file exporting statistics */
 306	struct dentry *dbg_entry;
 307#endif
 308
 309	struct sysinfo sysinfo;
 310
 311	struct delayed_work dwork;
 312
 313	struct vmci_handle vmci_doorbell;
 314};
 315
 316static struct vmballoon balloon;
 317
 318/*
 319 * Send "start" command to the host, communicating supported version
 320 * of the protocol.
 321 */
 322static bool vmballoon_send_start(struct vmballoon *b, unsigned long req_caps)
 323{
 324	unsigned long status, capabilities, dummy = 0;
 325	bool success;
 326
 327	STATS_INC(b->stats.start);
 328
 329	status = VMWARE_BALLOON_CMD(START, req_caps, dummy, capabilities);
 330
 331	switch (status) {
 332	case VMW_BALLOON_SUCCESS_WITH_CAPABILITIES:
 333		b->capabilities = capabilities;
 334		success = true;
 335		break;
 336	case VMW_BALLOON_SUCCESS:
 337		b->capabilities = VMW_BALLOON_BASIC_CMDS;
 338		success = true;
 339		break;
 340	default:
 341		success = false;
 342	}
 343
 344	if (b->capabilities & VMW_BALLOON_BATCHED_2M_CMDS)
 345		b->supported_page_sizes = 2;
 346	else
 347		b->supported_page_sizes = 1;
 348
 349	if (!success) {
 350		pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 351		STATS_INC(b->stats.start_fail);
 352	}
 353	return success;
 354}
 355
 356static bool vmballoon_check_status(struct vmballoon *b, unsigned long status)
 357{
 358	switch (status) {
 359	case VMW_BALLOON_SUCCESS:
 360		return true;
 361
 362	case VMW_BALLOON_ERROR_RESET:
 363		b->reset_required = true;
 364		/* fall through */
 365
 366	default:
 367		return false;
 368	}
 369}
 370
 371/*
 372 * Communicate guest type to the host so that it can adjust ballooning
 373 * algorithm to the one most appropriate for the guest. This command
 374 * is normally issued after sending "start" command and is part of
 375 * standard reset sequence.
 376 */
 377static bool vmballoon_send_guest_id(struct vmballoon *b)
 378{
 379	unsigned long status, dummy = 0;
 380
 381	status = VMWARE_BALLOON_CMD(GUEST_ID, VMW_BALLOON_GUEST_ID, dummy,
 382				dummy);
 383
 384	STATS_INC(b->stats.guest_type);
 385
 386	if (vmballoon_check_status(b, status))
 387		return true;
 388
 389	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 390	STATS_INC(b->stats.guest_type_fail);
 391	return false;
 392}
 393
 394static u16 vmballoon_page_size(bool is_2m_page)
 395{
 396	if (is_2m_page)
 397		return 1 << VMW_BALLOON_2M_SHIFT;
 398
 399	return 1;
 400}
 401
 402/*
 403 * Retrieve desired balloon size from the host.
 404 */
 405static bool vmballoon_send_get_target(struct vmballoon *b, u32 *new_target)
 406{
 407	unsigned long status;
 408	unsigned long target;
 409	unsigned long limit;
 410	unsigned long dummy = 0;
 411	u32 limit32;
 412
 413	/*
 414	 * si_meminfo() is cheap. Moreover, we want to provide dynamic
 415	 * max balloon size later. So let us call si_meminfo() every
 416	 * iteration.
 417	 */
 418	si_meminfo(&b->sysinfo);
 419	limit = b->sysinfo.totalram;
 420
 421	/* Ensure limit fits in 32-bits */
 422	limit32 = (u32)limit;
 423	if (limit != limit32)
 424		return false;
 425
 426	/* update stats */
 427	STATS_INC(b->stats.target);
 428
 429	status = VMWARE_BALLOON_CMD(GET_TARGET, limit, dummy, target);
 430	if (vmballoon_check_status(b, status)) {
 431		*new_target = target;
 432		return true;
 433	}
 434
 435	pr_debug("%s - failed, hv returns %ld\n", __func__, status);
 436	STATS_INC(b->stats.target_fail);
 437	return false;
 438}
 439
 440/*
 441 * Notify the host about allocated page so that host can use it without
 442 * fear that guest will need it. Host may reject some pages, we need to
 443 * check the return value and maybe submit a different page.
 444 */
 445static int vmballoon_send_lock_page(struct vmballoon *b, unsigned long pfn,
 446				unsigned int *hv_status, unsigned int *target)
 447{
 448	unsigned long status, dummy = 0;
 449	u32 pfn32;
 450
 451	pfn32 = (u32)pfn;
 452	if (pfn32 != pfn)
 453		return -1;
 454
 455	STATS_INC(b->stats.lock[false]);
 456
 457	*hv_status = status = VMWARE_BALLOON_CMD(LOCK, pfn, dummy, *target);
 458	if (vmballoon_check_status(b, status))
 459		return 0;
 460
 461	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
 462	STATS_INC(b->stats.lock_fail[false]);
 463	return 1;
 464}
 465
 466static int vmballoon_send_batched_lock(struct vmballoon *b,
 467		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 468{
 469	unsigned long status;
 470	unsigned long pfn = page_to_pfn(b->page);
 471
 472	STATS_INC(b->stats.lock[is_2m_pages]);
 473
 474	if (is_2m_pages)
 475		status = VMWARE_BALLOON_CMD(BATCHED_2M_LOCK, pfn, num_pages,
 476				*target);
 477	else
 478		status = VMWARE_BALLOON_CMD(BATCHED_LOCK, pfn, num_pages,
 479				*target);
 480
 481	if (vmballoon_check_status(b, status))
 482		return 0;
 483
 484	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
 485	STATS_INC(b->stats.lock_fail[is_2m_pages]);
 486	return 1;
 487}
 488
 489/*
 490 * Notify the host that guest intends to release given page back into
 491 * the pool of available (to the guest) pages.
 492 */
 493static bool vmballoon_send_unlock_page(struct vmballoon *b, unsigned long pfn,
 494							unsigned int *target)
 495{
 496	unsigned long status, dummy = 0;
 497	u32 pfn32;
 498
 499	pfn32 = (u32)pfn;
 500	if (pfn32 != pfn)
 501		return false;
 502
 503	STATS_INC(b->stats.unlock[false]);
 504
 505	status = VMWARE_BALLOON_CMD(UNLOCK, pfn, dummy, *target);
 506	if (vmballoon_check_status(b, status))
 507		return true;
 508
 509	pr_debug("%s - ppn %lx, hv returns %ld\n", __func__, pfn, status);
 510	STATS_INC(b->stats.unlock_fail[false]);
 511	return false;
 512}
 513
 514static bool vmballoon_send_batched_unlock(struct vmballoon *b,
 515		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 516{
 517	unsigned long status;
 518	unsigned long pfn = page_to_pfn(b->page);
 519
 520	STATS_INC(b->stats.unlock[is_2m_pages]);
 521
 522	if (is_2m_pages)
 523		status = VMWARE_BALLOON_CMD(BATCHED_2M_UNLOCK, pfn, num_pages,
 524				*target);
 525	else
 526		status = VMWARE_BALLOON_CMD(BATCHED_UNLOCK, pfn, num_pages,
 527				*target);
 528
 529	if (vmballoon_check_status(b, status))
 530		return true;
 531
 532	pr_debug("%s - batch ppn %lx, hv returns %ld\n", __func__, pfn, status);
 533	STATS_INC(b->stats.unlock_fail[is_2m_pages]);
 534	return false;
 535}
 536
 537static struct page *vmballoon_alloc_page(gfp_t flags, bool is_2m_page)
 538{
 539	if (is_2m_page)
 540		return alloc_pages(flags, VMW_BALLOON_2M_SHIFT);
 541
 542	return alloc_page(flags);
 543}
 544
 545static void vmballoon_free_page(struct page *page, bool is_2m_page)
 546{
 547	if (is_2m_page)
 548		__free_pages(page, VMW_BALLOON_2M_SHIFT);
 549	else
 550		__free_page(page);
 551}
 552
 553/*
 554 * Quickly release all pages allocated for the balloon. This function is
 555 * called when host decides to "reset" balloon for one reason or another.
 556 * Unlike normal "deflate" we do not (shall not) notify host of the pages
 557 * being released.
 558 */
 559static void vmballoon_pop(struct vmballoon *b)
 560{
 561	struct page *page, *next;
 562	unsigned is_2m_pages;
 563
 564	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
 565			is_2m_pages++) {
 566		struct vmballoon_page_size *page_size =
 567				&b->page_sizes[is_2m_pages];
 568		u16 size_per_page = vmballoon_page_size(is_2m_pages);
 569
 570		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
 571			list_del(&page->lru);
 572			vmballoon_free_page(page, is_2m_pages);
 573			STATS_INC(b->stats.free[is_2m_pages]);
 574			b->size -= size_per_page;
 575			cond_resched();
 576		}
 577	}
 578
 579	if (b->batch_page) {
 580		vunmap(b->batch_page);
 581		b->batch_page = NULL;
 582	}
 583
 584	if (b->page) {
 585		__free_page(b->page);
 586		b->page = NULL;
 587	}
 588}
 589
 590/*
 591 * Notify the host of a ballooned page. If host rejects the page put it on the
 592 * refuse list, those refused page are then released at the end of the
 593 * inflation cycle.
 594 */
 595static int vmballoon_lock_page(struct vmballoon *b, unsigned int num_pages,
 596				bool is_2m_pages, unsigned int *target)
 597{
 598	int locked, hv_status;
 599	struct page *page = b->page;
 600	struct vmballoon_page_size *page_size = &b->page_sizes[false];
 601
 602	/* is_2m_pages can never happen as 2m pages support implies batching */
 603
 604	locked = vmballoon_send_lock_page(b, page_to_pfn(page), &hv_status,
 605								target);
 606	if (locked > 0) {
 607		STATS_INC(b->stats.refused_alloc[false]);
 608
 609		if (hv_status == VMW_BALLOON_ERROR_RESET ||
 610				hv_status == VMW_BALLOON_ERROR_PPN_NOTNEEDED) {
 611			vmballoon_free_page(page, false);
 612			return -EIO;
 613		}
 614
 615		/*
 616		 * Place page on the list of non-balloonable pages
 617		 * and retry allocation, unless we already accumulated
 618		 * too many of them, in which case take a breather.
 619		 */
 620		if (page_size->n_refused_pages < VMW_BALLOON_MAX_REFUSED) {
 621			page_size->n_refused_pages++;
 622			list_add(&page->lru, &page_size->refused_pages);
 623		} else {
 624			vmballoon_free_page(page, false);
 625		}
 626		return -EIO;
 627	}
 628
 629	/* track allocated page */
 630	list_add(&page->lru, &page_size->pages);
 631
 632	/* update balloon size */
 633	b->size++;
 634
 635	return 0;
 636}
 637
 638static int vmballoon_lock_batched_page(struct vmballoon *b,
 639		unsigned int num_pages, bool is_2m_pages, unsigned int *target)
 640{
 641	int locked, i;
 642	u16 size_per_page = vmballoon_page_size(is_2m_pages);
 643
 644	locked = vmballoon_send_batched_lock(b, num_pages, is_2m_pages,
 645			target);
 646	if (locked > 0) {
 647		for (i = 0; i < num_pages; i++) {
 648			u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 649			struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 650
 651			vmballoon_free_page(p, is_2m_pages);
 652		}
 653
 654		return -EIO;
 655	}
 656
 657	for (i = 0; i < num_pages; i++) {
 658		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 659		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 660		struct vmballoon_page_size *page_size =
 661				&b->page_sizes[is_2m_pages];
 662
 663		locked = vmballoon_batch_get_status(b->batch_page, i);
 664
 665		switch (locked) {
 666		case VMW_BALLOON_SUCCESS:
 667			list_add(&p->lru, &page_size->pages);
 668			b->size += size_per_page;
 669			break;
 670		case VMW_BALLOON_ERROR_PPN_PINNED:
 671		case VMW_BALLOON_ERROR_PPN_INVALID:
 672			if (page_size->n_refused_pages
 673					< VMW_BALLOON_MAX_REFUSED) {
 674				list_add(&p->lru, &page_size->refused_pages);
 675				page_size->n_refused_pages++;
 676				break;
 677			}
 678			/* Fallthrough */
 679		case VMW_BALLOON_ERROR_RESET:
 680		case VMW_BALLOON_ERROR_PPN_NOTNEEDED:
 681			vmballoon_free_page(p, is_2m_pages);
 682			break;
 683		default:
 684			/* This should never happen */
 685			WARN_ON_ONCE(true);
 686		}
 687	}
 688
 689	return 0;
 690}
 691
 692/*
 693 * Release the page allocated for the balloon. Note that we first notify
 694 * the host so it can make sure the page will be available for the guest
 695 * to use, if needed.
 696 */
 697static int vmballoon_unlock_page(struct vmballoon *b, unsigned int num_pages,
 698		bool is_2m_pages, unsigned int *target)
 699{
 700	struct page *page = b->page;
 701	struct vmballoon_page_size *page_size = &b->page_sizes[false];
 702
 703	/* is_2m_pages can never happen as 2m pages support implies batching */
 704
 705	if (!vmballoon_send_unlock_page(b, page_to_pfn(page), target)) {
 706		list_add(&page->lru, &page_size->pages);
 707		return -EIO;
 708	}
 709
 710	/* deallocate page */
 711	vmballoon_free_page(page, false);
 712	STATS_INC(b->stats.free[false]);
 713
 714	/* update balloon size */
 715	b->size--;
 716
 717	return 0;
 718}
 719
 720static int vmballoon_unlock_batched_page(struct vmballoon *b,
 721				unsigned int num_pages, bool is_2m_pages,
 722				unsigned int *target)
 723{
 724	int locked, i, ret = 0;
 725	bool hv_success;
 726	u16 size_per_page = vmballoon_page_size(is_2m_pages);
 727
 728	hv_success = vmballoon_send_batched_unlock(b, num_pages, is_2m_pages,
 729			target);
 730	if (!hv_success)
 731		ret = -EIO;
 732
 733	for (i = 0; i < num_pages; i++) {
 734		u64 pa = vmballoon_batch_get_pa(b->batch_page, i);
 735		struct page *p = pfn_to_page(pa >> PAGE_SHIFT);
 736		struct vmballoon_page_size *page_size =
 737				&b->page_sizes[is_2m_pages];
 738
 739		locked = vmballoon_batch_get_status(b->batch_page, i);
 740		if (!hv_success || locked != VMW_BALLOON_SUCCESS) {
 741			/*
 742			 * That page wasn't successfully unlocked by the
 743			 * hypervisor, re-add it to the list of pages owned by
 744			 * the balloon driver.
 745			 */
 746			list_add(&p->lru, &page_size->pages);
 747		} else {
 748			/* deallocate page */
 749			vmballoon_free_page(p, is_2m_pages);
 750			STATS_INC(b->stats.free[is_2m_pages]);
 751
 752			/* update balloon size */
 753			b->size -= size_per_page;
 754		}
 755	}
 756
 757	return ret;
 758}
 759
 760/*
 761 * Release pages that were allocated while attempting to inflate the
 762 * balloon but were refused by the host for one reason or another.
 763 */
 764static void vmballoon_release_refused_pages(struct vmballoon *b,
 765		bool is_2m_pages)
 766{
 767	struct page *page, *next;
 768	struct vmballoon_page_size *page_size =
 769			&b->page_sizes[is_2m_pages];
 770
 771	list_for_each_entry_safe(page, next, &page_size->refused_pages, lru) {
 772		list_del(&page->lru);
 773		vmballoon_free_page(page, is_2m_pages);
 774		STATS_INC(b->stats.refused_free[is_2m_pages]);
 775	}
 776
 777	page_size->n_refused_pages = 0;
 778}
 779
 780static void vmballoon_add_page(struct vmballoon *b, int idx, struct page *p)
 781{
 782	b->page = p;
 783}
 784
 785static void vmballoon_add_batched_page(struct vmballoon *b, int idx,
 786				struct page *p)
 787{
 788	vmballoon_batch_set_pa(b->batch_page, idx,
 789			(u64)page_to_pfn(p) << PAGE_SHIFT);
 790}
 791
 792/*
 793 * Inflate the balloon towards its target size. Note that we try to limit
 794 * the rate of allocation to make sure we are not choking the rest of the
 795 * system.
 796 */
 797static void vmballoon_inflate(struct vmballoon *b)
 798{
 799	unsigned rate;
 800	unsigned int allocations = 0;
 801	unsigned int num_pages = 0;
 802	int error = 0;
 803	gfp_t flags = VMW_PAGE_ALLOC_NOSLEEP;
 804	bool is_2m_pages;
 805
 806	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
 807
 808	/*
 809	 * First try NOSLEEP page allocations to inflate balloon.
 810	 *
 811	 * If we do not throttle nosleep allocations, we can drain all
 812	 * free pages in the guest quickly (if the balloon target is high).
 813	 * As a side-effect, draining free pages helps to inform (force)
 814	 * the guest to start swapping if balloon target is not met yet,
 815	 * which is a desired behavior. However, balloon driver can consume
 816	 * all available CPU cycles if too many pages are allocated in a
 817	 * second. Therefore, we throttle nosleep allocations even when
 818	 * the guest is not under memory pressure. OTOH, if we have already
 819	 * predicted that the guest is under memory pressure, then we
 820	 * slowdown page allocations considerably.
 821	 */
 822
 823	/*
 824	 * Start with no sleep allocation rate which may be higher
 825	 * than sleeping allocation rate.
 826	 */
 827	if (b->slow_allocation_cycles) {
 828		rate = b->rate_alloc;
 829		is_2m_pages = false;
 830	} else {
 831		rate = UINT_MAX;
 832		is_2m_pages =
 833			b->supported_page_sizes == VMW_BALLOON_NUM_PAGE_SIZES;
 834	}
 835
 836	pr_debug("%s - goal: %d, no-sleep rate: %u, sleep rate: %d\n",
 837		 __func__, b->target - b->size, rate, b->rate_alloc);
 838
 839	while (!b->reset_required &&
 840		b->size + num_pages * vmballoon_page_size(is_2m_pages)
 841		< b->target) {
 842		struct page *page;
 843
 844		if (flags == VMW_PAGE_ALLOC_NOSLEEP)
 845			STATS_INC(b->stats.alloc[is_2m_pages]);
 846		else
 847			STATS_INC(b->stats.sleep_alloc);
 848
 849		page = vmballoon_alloc_page(flags, is_2m_pages);
 850		if (!page) {
 851			STATS_INC(b->stats.alloc_fail[is_2m_pages]);
 852
 853			if (is_2m_pages) {
 854				b->ops->lock(b, num_pages, true, &b->target);
 855
 856				/*
 857				 * ignore errors from locking as we now switch
 858				 * to 4k pages and we might get different
 859				 * errors.
 860				 */
 861
 862				num_pages = 0;
 863				is_2m_pages = false;
 864				continue;
 865			}
 866
 867			if (flags == VMW_PAGE_ALLOC_CANSLEEP) {
 868				/*
 869				 * CANSLEEP page allocation failed, so guest
 870				 * is under severe memory pressure. Quickly
 871				 * decrease allocation rate.
 872				 */
 873				b->rate_alloc = max(b->rate_alloc / 2,
 874						    VMW_BALLOON_RATE_ALLOC_MIN);
 875				STATS_INC(b->stats.sleep_alloc_fail);
 876				break;
 877			}
 878
 879			/*
 880			 * NOSLEEP page allocation failed, so the guest is
 881			 * under memory pressure. Let us slow down page
 882			 * allocations for next few cycles so that the guest
 883			 * gets out of memory pressure. Also, if we already
 884			 * allocated b->rate_alloc pages, let's pause,
 885			 * otherwise switch to sleeping allocations.
 886			 */
 887			b->slow_allocation_cycles = VMW_BALLOON_SLOW_CYCLES;
 888
 889			if (allocations >= b->rate_alloc)
 890				break;
 891
 892			flags = VMW_PAGE_ALLOC_CANSLEEP;
 893			/* Lower rate for sleeping allocations. */
 894			rate = b->rate_alloc;
 895			continue;
 896		}
 897
 898		b->ops->add_page(b, num_pages++, page);
 899		if (num_pages == b->batch_max_pages) {
 900			error = b->ops->lock(b, num_pages, is_2m_pages,
 901					&b->target);
 902			num_pages = 0;
 903			if (error)
 904				break;
 905		}
 906
 907		cond_resched();
 908
 909		if (allocations >= rate) {
 910			/* We allocated enough pages, let's take a break. */
 911			break;
 912		}
 913	}
 914
 915	if (num_pages > 0)
 916		b->ops->lock(b, num_pages, is_2m_pages, &b->target);
 917
 918	/*
 919	 * We reached our goal without failures so try increasing
 920	 * allocation rate.
 921	 */
 922	if (error == 0 && allocations >= b->rate_alloc) {
 923		unsigned int mult = allocations / b->rate_alloc;
 924
 925		b->rate_alloc =
 926			min(b->rate_alloc + mult * VMW_BALLOON_RATE_ALLOC_INC,
 927			    VMW_BALLOON_RATE_ALLOC_MAX);
 928	}
 929
 930	vmballoon_release_refused_pages(b, true);
 931	vmballoon_release_refused_pages(b, false);
 932}
 933
 934/*
 935 * Decrease the size of the balloon allowing guest to use more memory.
 936 */
 937static void vmballoon_deflate(struct vmballoon *b)
 938{
 939	unsigned is_2m_pages;
 940
 941	pr_debug("%s - size: %d, target %d\n", __func__, b->size, b->target);
 942
 943	/* free pages to reach target */
 944	for (is_2m_pages = 0; is_2m_pages < b->supported_page_sizes;
 945			is_2m_pages++) {
 946		struct page *page, *next;
 947		unsigned int num_pages = 0;
 948		struct vmballoon_page_size *page_size =
 949				&b->page_sizes[is_2m_pages];
 950
 951		list_for_each_entry_safe(page, next, &page_size->pages, lru) {
 952			if (b->reset_required ||
 953				(b->target > 0 &&
 954					b->size - num_pages
 955					* vmballoon_page_size(is_2m_pages)
 956				< b->target + vmballoon_page_size(true)))
 957				break;
 958
 959			list_del(&page->lru);
 960			b->ops->add_page(b, num_pages++, page);
 961
 962			if (num_pages == b->batch_max_pages) {
 963				int error;
 964
 965				error = b->ops->unlock(b, num_pages,
 966						is_2m_pages, &b->target);
 967				num_pages = 0;
 968				if (error)
 969					return;
 970			}
 971
 972			cond_resched();
 973		}
 974
 975		if (num_pages > 0)
 976			b->ops->unlock(b, num_pages, is_2m_pages, &b->target);
 977	}
 978}
 979
 980static const struct vmballoon_ops vmballoon_basic_ops = {
 981	.add_page = vmballoon_add_page,
 982	.lock = vmballoon_lock_page,
 983	.unlock = vmballoon_unlock_page
 984};
 985
 986static const struct vmballoon_ops vmballoon_batched_ops = {
 987	.add_page = vmballoon_add_batched_page,
 988	.lock = vmballoon_lock_batched_page,
 989	.unlock = vmballoon_unlock_batched_page
 990};
 991
 992static bool vmballoon_init_batching(struct vmballoon *b)
 993{
 994	b->page = alloc_page(VMW_PAGE_ALLOC_NOSLEEP);
 995	if (!b->page)
 996		return false;
 997
 998	b->batch_page = vmap(&b->page, 1, VM_MAP, PAGE_KERNEL);
 999	if (!b->batch_page) {
1000		__free_page(b->page);
1001		return false;
1002	}
1003
1004	return true;
1005}
1006
1007/*
1008 * Receive notification and resize balloon
1009 */
1010static void vmballoon_doorbell(void *client_data)
1011{
1012	struct vmballoon *b = client_data;
1013
1014	STATS_INC(b->stats.doorbell);
1015
1016	mod_delayed_work(system_freezable_wq, &b->dwork, 0);
1017}
1018
1019/*
1020 * Clean up vmci doorbell
1021 */
1022static void vmballoon_vmci_cleanup(struct vmballoon *b)
1023{
1024	int error;
1025
1026	VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET, VMCI_INVALID_ID,
1027			VMCI_INVALID_ID, error);
1028	STATS_INC(b->stats.doorbell_unset);
1029
1030	if (!vmci_handle_is_invalid(b->vmci_doorbell)) {
1031		vmci_doorbell_destroy(b->vmci_doorbell);
1032		b->vmci_doorbell = VMCI_INVALID_HANDLE;
1033	}
1034}
1035
1036/*
1037 * Initialize vmci doorbell, to get notified as soon as balloon changes
1038 */
1039static int vmballoon_vmci_init(struct vmballoon *b)
1040{
1041	int error = 0;
1042
1043	if ((b->capabilities & VMW_BALLOON_SIGNALLED_WAKEUP_CMD) != 0) {
1044		error = vmci_doorbell_create(&b->vmci_doorbell,
1045				VMCI_FLAG_DELAYED_CB,
1046				VMCI_PRIVILEGE_FLAG_RESTRICTED,
1047				vmballoon_doorbell, b);
1048
1049		if (error == VMCI_SUCCESS) {
1050			VMWARE_BALLOON_CMD(VMCI_DOORBELL_SET,
1051					b->vmci_doorbell.context,
1052					b->vmci_doorbell.resource, error);
1053			STATS_INC(b->stats.doorbell_set);
1054		}
1055	}
1056
1057	if (error != 0) {
1058		vmballoon_vmci_cleanup(b);
1059
1060		return -EIO;
1061	}
1062
1063	return 0;
1064}
1065
1066/*
1067 * Perform standard reset sequence by popping the balloon (in case it
1068 * is not  empty) and then restarting protocol. This operation normally
1069 * happens when host responds with VMW_BALLOON_ERROR_RESET to a command.
1070 */
1071static void vmballoon_reset(struct vmballoon *b)
1072{
1073	int error;
1074
1075	vmballoon_vmci_cleanup(b);
1076
1077	/* free all pages, skipping monitor unlock */
1078	vmballoon_pop(b);
1079
1080	if (!vmballoon_send_start(b, VMW_BALLOON_CAPABILITIES))
1081		return;
1082
1083	if ((b->capabilities & VMW_BALLOON_BATCHED_CMDS) != 0) {
1084		b->ops = &vmballoon_batched_ops;
1085		b->batch_max_pages = VMW_BALLOON_BATCH_MAX_PAGES;
1086		if (!vmballoon_init_batching(b)) {
1087			/*
1088			 * We failed to initialize batching, inform the monitor
1089			 * about it by sending a null capability.
1090			 *
1091			 * The guest will retry in one second.
1092			 */
1093			vmballoon_send_start(b, 0);
1094			return;
1095		}
1096	} else if ((b->capabilities & VMW_BALLOON_BASIC_CMDS) != 0) {
1097		b->ops = &vmballoon_basic_ops;
1098		b->batch_max_pages = 1;
1099	}
1100
1101	b->reset_required = false;
1102
1103	error = vmballoon_vmci_init(b);
1104	if (error)
1105		pr_err("failed to initialize vmci doorbell\n");
1106
1107	if (!vmballoon_send_guest_id(b))
1108		pr_err("failed to send guest ID to the host\n");
1109}
1110
1111/*
1112 * Balloon work function: reset protocol, if needed, get the new size and
1113 * adjust balloon as needed. Repeat in 1 sec.
1114 */
1115static void vmballoon_work(struct work_struct *work)
1116{
1117	struct delayed_work *dwork = to_delayed_work(work);
1118	struct vmballoon *b = container_of(dwork, struct vmballoon, dwork);
1119	unsigned int target;
1120
1121	STATS_INC(b->stats.timer);
1122
1123	if (b->reset_required)
1124		vmballoon_reset(b);
1125
1126	if (b->slow_allocation_cycles > 0)
1127		b->slow_allocation_cycles--;
1128
1129	if (!b->reset_required && vmballoon_send_get_target(b, &target)) {
1130		/* update target, adjust size */
1131		b->target = target;
1132
1133		if (b->size < target)
1134			vmballoon_inflate(b);
1135		else if (target == 0 ||
1136				b->size > target + vmballoon_page_size(true))
1137			vmballoon_deflate(b);
1138	}
1139
1140	/*
1141	 * We are using a freezable workqueue so that balloon operations are
1142	 * stopped while the system transitions to/from sleep/hibernation.
1143	 */
1144	queue_delayed_work(system_freezable_wq,
1145			   dwork, round_jiffies_relative(HZ));
1146}
1147
1148/*
1149 * DEBUGFS Interface
1150 */
1151#ifdef CONFIG_DEBUG_FS
1152
1153static int vmballoon_debug_show(struct seq_file *f, void *offset)
1154{
1155	struct vmballoon *b = f->private;
1156	struct vmballoon_stats *stats = &b->stats;
1157
1158	/* format capabilities info */
1159	seq_printf(f,
1160		   "balloon capabilities:   %#4x\n"
1161		   "used capabilities:      %#4lx\n"
1162		   "is resetting:           %c\n",
1163		   VMW_BALLOON_CAPABILITIES, b->capabilities,
1164		   b->reset_required ? 'y' : 'n');
1165
1166	/* format size info */
1167	seq_printf(f,
1168		   "target:             %8d pages\n"
1169		   "current:            %8d pages\n",
1170		   b->target, b->size);
1171
1172	/* format rate info */
1173	seq_printf(f,
1174		   "rateSleepAlloc:     %8d pages/sec\n",
1175		   b->rate_alloc);
1176
1177	seq_printf(f,
1178		   "\n"
1179		   "timer:              %8u\n"
1180		   "doorbell:           %8u\n"
1181		   "start:              %8u (%4u failed)\n"
1182		   "guestType:          %8u (%4u failed)\n"
1183		   "2m-lock:            %8u (%4u failed)\n"
1184		   "lock:               %8u (%4u failed)\n"
1185		   "2m-unlock:          %8u (%4u failed)\n"
1186		   "unlock:             %8u (%4u failed)\n"
1187		   "target:             %8u (%4u failed)\n"
1188		   "prim2mAlloc:        %8u (%4u failed)\n"
1189		   "primNoSleepAlloc:   %8u (%4u failed)\n"
1190		   "primCanSleepAlloc:  %8u (%4u failed)\n"
1191		   "prim2mFree:         %8u\n"
1192		   "primFree:           %8u\n"
1193		   "err2mAlloc:         %8u\n"
1194		   "errAlloc:           %8u\n"
1195		   "err2mFree:          %8u\n"
1196		   "errFree:            %8u\n"
1197		   "doorbellSet:        %8u\n"
1198		   "doorbellUnset:      %8u\n",
1199		   stats->timer,
1200		   stats->doorbell,
1201		   stats->start, stats->start_fail,
1202		   stats->guest_type, stats->guest_type_fail,
1203		   stats->lock[true],  stats->lock_fail[true],
1204		   stats->lock[false],  stats->lock_fail[false],
1205		   stats->unlock[true], stats->unlock_fail[true],
1206		   stats->unlock[false], stats->unlock_fail[false],
1207		   stats->target, stats->target_fail,
1208		   stats->alloc[true], stats->alloc_fail[true],
1209		   stats->alloc[false], stats->alloc_fail[false],
1210		   stats->sleep_alloc, stats->sleep_alloc_fail,
1211		   stats->free[true],
1212		   stats->free[false],
1213		   stats->refused_alloc[true], stats->refused_alloc[false],
1214		   stats->refused_free[true], stats->refused_free[false],
1215		   stats->doorbell_set, stats->doorbell_unset);
1216
1217	return 0;
1218}
1219
1220static int vmballoon_debug_open(struct inode *inode, struct file *file)
1221{
1222	return single_open(file, vmballoon_debug_show, inode->i_private);
1223}
1224
1225static const struct file_operations vmballoon_debug_fops = {
1226	.owner		= THIS_MODULE,
1227	.open		= vmballoon_debug_open,
1228	.read		= seq_read,
1229	.llseek		= seq_lseek,
1230	.release	= single_release,
1231};
1232
1233static int __init vmballoon_debugfs_init(struct vmballoon *b)
1234{
1235	int error;
1236
1237	b->dbg_entry = debugfs_create_file("vmmemctl", S_IRUGO, NULL, b,
1238					   &vmballoon_debug_fops);
1239	if (IS_ERR(b->dbg_entry)) {
1240		error = PTR_ERR(b->dbg_entry);
1241		pr_err("failed to create debugfs entry, error: %d\n", error);
1242		return error;
1243	}
1244
1245	return 0;
1246}
1247
1248static void __exit vmballoon_debugfs_exit(struct vmballoon *b)
1249{
1250	debugfs_remove(b->dbg_entry);
1251}
1252
1253#else
1254
/* Without CONFIG_DEBUG_FS there is nothing to create: report success. */
static inline int vmballoon_debugfs_init(struct vmballoon *b)
{
	return 0;
}
1259
/* Without CONFIG_DEBUG_FS there is nothing to remove. */
static inline void vmballoon_debugfs_exit(struct vmballoon *b)
{
}
1263
1264#endif	/* CONFIG_DEBUG_FS */
1265
1266static int __init vmballoon_init(void)
1267{
1268	int error;
1269	unsigned is_2m_pages;
1270	/*
1271	 * Check if we are running on VMware's hypervisor and bail out
1272	 * if we are not.
1273	 */
1274	if (x86_hyper != &x86_hyper_vmware)
1275		return -ENODEV;
1276
1277	for (is_2m_pages = 0; is_2m_pages < VMW_BALLOON_NUM_PAGE_SIZES;
1278			is_2m_pages++) {
1279		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].pages);
1280		INIT_LIST_HEAD(&balloon.page_sizes[is_2m_pages].refused_pages);
1281	}
1282
1283	/* initialize rates */
1284	balloon.rate_alloc = VMW_BALLOON_RATE_ALLOC_MAX;
1285
1286	INIT_DELAYED_WORK(&balloon.dwork, vmballoon_work);
1287
1288	error = vmballoon_debugfs_init(&balloon);
1289	if (error)
1290		return error;
1291
1292	balloon.vmci_doorbell = VMCI_INVALID_HANDLE;
1293	balloon.batch_page = NULL;
1294	balloon.page = NULL;
1295	balloon.reset_required = true;
1296
1297	queue_delayed_work(system_freezable_wq, &balloon.dwork, 0);
1298
1299	return 0;
1300}
1301module_init(vmballoon_init);
1302
1303static void __exit vmballoon_exit(void)
1304{
1305	vmballoon_vmci_cleanup(&balloon);
1306	cancel_delayed_work_sync(&balloon.dwork);
1307
1308	vmballoon_debugfs_exit(&balloon);
1309
1310	/*
1311	 * Deallocate all reserved memory, and reset connection with monitor.
1312	 * Reset connection before deallocating memory to avoid potential for
1313	 * additional spurious resets from guest touching deallocated pages.
1314	 */
1315	vmballoon_send_start(&balloon, 0);
1316	vmballoon_pop(&balloon);
1317}
1318module_exit(vmballoon_exit);