   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Virtio-mem device driver.
   4 *
   5 * Copyright Red Hat, Inc. 2020
   6 *
   7 * Author(s): David Hildenbrand <david@redhat.com>
   8 */
   9
  10#include <linux/virtio.h>
  11#include <linux/virtio_mem.h>
  12#include <linux/workqueue.h>
  13#include <linux/slab.h>
  14#include <linux/module.h>
  15#include <linux/mm.h>
  16#include <linux/memory_hotplug.h>
  17#include <linux/memory.h>
  18#include <linux/hrtimer.h>
  19#include <linux/crash_dump.h>
  20#include <linux/mutex.h>
  21#include <linux/bitmap.h>
  22#include <linux/lockdep.h>
  23#include <linux/log2.h>
  24#include <linux/vmalloc.h>
  25#include <linux/suspend.h>
  26
  27#include <acpi/acpi_numa.h>
  28
  29static bool unplug_online = true;
  30module_param(unplug_online, bool, 0644);
  31MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");
  32
  33static bool force_bbm;
  34module_param(force_bbm, bool, 0444);
  35MODULE_PARM_DESC(force_bbm,
  36		"Force Big Block Mode. Default is 0 (auto-selection)");
  37
  38static unsigned long bbm_block_size;
  39module_param(bbm_block_size, ulong, 0444);
  40MODULE_PARM_DESC(bbm_block_size,
  41		 "Big Block size in bytes. Default is 0 (auto-detection).");
  42
  43/*
  44 * virtio-mem currently supports the following modes of operation:
  45 *
  46 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
  47 *   size of a Sub Block (SB) is determined based on the device block size, the
  48 *   pageblock size, and the maximum allocation granularity of the buddy.
  49 *   Subblocks within a Linux memory block might either be plugged or unplugged.
  50 *   Memory is added/removed to Linux MM in Linux memory block granularity.
  51 *
  52 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
  53 *   Memory is added/removed to Linux MM in Big Block granularity.
  54 *
  55 * The mode is determined automatically based on the Linux memory block size
  56 * and the device block size.
  57 *
  58 * User space / core MM (auto onlining) is responsible for onlining added
  59 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
  60 * always onlined separately, and all memory within a Linux memory block is
  61 * onlined to the same zone - virtio-mem relies on this behavior.
  62 */
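
/*
 * Editor's sketch (not part of the driver): the auto-selection described
 * above, reduced to plain arithmetic. The exact rule here is an assumption
 * for illustration; the real probe logic also honors the force_bbm and
 * bbm_block_size module parameters. The idea: SBM is only possible when
 * device blocks can subdivide a single Linux memory block.
 */
static inline bool __maybe_unused
virtio_mem_example_wants_sbm(uint64_t device_block_size,
			     uint64_t memory_block_size)
{
	return device_block_size <= memory_block_size &&
	       memory_block_size % device_block_size == 0;
}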
  63
  64/*
  65 * State of a Linux memory block in SBM.
  66 */
  67enum virtio_mem_sbm_mb_state {
  68	/* Unplugged, not added to Linux. Can be reused later. */
  69	VIRTIO_MEM_SBM_MB_UNUSED = 0,
  70	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
  71	VIRTIO_MEM_SBM_MB_PLUGGED,
  72	/* Fully plugged, fully added to Linux, offline. */
  73	VIRTIO_MEM_SBM_MB_OFFLINE,
  74	/* Partially plugged, fully added to Linux, offline. */
  75	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
  76	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
  77	VIRTIO_MEM_SBM_MB_KERNEL,
   78	/* Partially plugged, fully added to Linux, onlined to a kernel zone. */
  79	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
  80	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
  81	VIRTIO_MEM_SBM_MB_MOVABLE,
  82	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
  83	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
  84	VIRTIO_MEM_SBM_MB_COUNT
  85};
  86
  87/*
  88 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
  89 */
  90enum virtio_mem_bbm_bb_state {
  91	/* Unplugged, not added to Linux. Can be reused later. */
  92	VIRTIO_MEM_BBM_BB_UNUSED = 0,
  93	/* Plugged, not added to Linux. Error on add_memory(). */
  94	VIRTIO_MEM_BBM_BB_PLUGGED,
  95	/* Plugged and added to Linux. */
  96	VIRTIO_MEM_BBM_BB_ADDED,
  97	/* All online parts are fake-offline, ready to remove. */
  98	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
  99	VIRTIO_MEM_BBM_BB_COUNT
 100};
 101
 102struct virtio_mem {
 103	struct virtio_device *vdev;
 104
 105	/* We might first have to unplug all memory when starting up. */
 106	bool unplug_all_required;
 107
 108	/* Workqueue that processes the plug/unplug requests. */
 109	struct work_struct wq;
 110	atomic_t wq_active;
 111	atomic_t config_changed;
 112
 113	/* Virtqueue for guest->host requests. */
 114	struct virtqueue *vq;
 115
 116	/* Wait for a host response to a guest request. */
 117	wait_queue_head_t host_resp;
 118
 119	/* Space for one guest request and the host response. */
 120	struct virtio_mem_req req;
 121	struct virtio_mem_resp resp;
 122
 123	/* The current size of the device. */
 124	uint64_t plugged_size;
 125	/* The requested size of the device. */
 126	uint64_t requested_size;
 127
 128	/* The device block size (for communicating with the device). */
 129	uint64_t device_block_size;
 130	/* The determined node id for all memory of the device. */
 131	int nid;
 132	/* Physical start address of the memory region. */
 133	uint64_t addr;
 134	/* Maximum region size in bytes. */
 135	uint64_t region_size;
 136
 137	/* The parent resource for all memory added via this device. */
 138	struct resource *parent_resource;
 139	/*
 140	 * Copy of "System RAM (virtio_mem)" to be used for
 141	 * add_memory_driver_managed().
 142	 */
 143	const char *resource_name;
 144	/* Memory group identification. */
 145	int mgid;
 146
 147	/*
 148	 * We don't want to add too much memory if it's not getting onlined,
  149	 * to avoid running OOM. Besides this threshold, we allow having at
 150	 * least two offline blocks at a time (whatever is bigger).
 151	 */
 152#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
 153	atomic64_t offline_size;
 154	uint64_t offline_threshold;
 155
 156	/* If set, the driver is in SBM, otherwise in BBM. */
 157	bool in_sbm;
 158
 159	union {
 160		struct {
 161			/* Id of the first memory block of this device. */
 162			unsigned long first_mb_id;
 163			/* Id of the last usable memory block of this device. */
 164			unsigned long last_usable_mb_id;
  165			/* Id of the next memory block to prepare when needed. */
 166			unsigned long next_mb_id;
 167
 168			/* The subblock size. */
 169			uint64_t sb_size;
 170			/* The number of subblocks per Linux memory block. */
 171			uint32_t sbs_per_mb;
 172
 173			/*
 174			 * Some of the Linux memory blocks tracked as "partially
 175			 * plugged" are completely unplugged and can be offlined
 176			 * and removed -- which previously failed.
 177			 */
 178			bool have_unplugged_mb;
 179
 180			/* Summary of all memory block states. */
 181			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];
 182
 183			/*
 184			 * One byte state per memory block. Allocated via
 185			 * vmalloc(). Resized (alloc+copy+free) on demand.
 186			 *
 187			 * With 128 MiB memory blocks, we have states for 512
 188			 * GiB of memory in one 4 KiB page.
 189			 */
 190			uint8_t *mb_states;
 191
 192			/*
 193			 * Bitmap: one bit per subblock. Allocated similar to
 194			 * sbm.mb_states.
 195			 *
 196			 * A set bit means the corresponding subblock is
  197			 * plugged, otherwise it's unplugged.
 198			 *
 199			 * With 4 MiB subblocks, we manage 128 GiB of memory
 200			 * in one 4 KiB page.
 201			 */
 202			unsigned long *sb_states;
 203		} sbm;
 204
 205		struct {
 206			/* Id of the first big block of this device. */
 207			unsigned long first_bb_id;
 208			/* Id of the last usable big block of this device. */
 209			unsigned long last_usable_bb_id;
  210			/* Id of the next big block to prepare when needed. */
 211			unsigned long next_bb_id;
 212
 213			/* Summary of all big block states. */
 214			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];
 215
 216			/* One byte state per big block. See sbm.mb_states. */
 217			uint8_t *bb_states;
 218
 219			/* The block size used for plugging/adding/removing. */
 220			uint64_t bb_size;
 221		} bbm;
 222	};
 223
 224	/*
 225	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
 226	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states
 227	 *
 228	 * When this lock is held the pointers can't change, ONLINE and
 229	 * OFFLINE blocks can't change the state and no subblocks will get
 230	 * plugged/unplugged.
 231	 *
 232	 * In kdump mode, used to serialize requests, last_block_addr and
 233	 * last_block_plugged.
 234	 */
 235	struct mutex hotplug_mutex;
 236	bool hotplug_active;
 237
 238	/* An error occurred we cannot handle - stop processing requests. */
 239	bool broken;
 240
  241	/* Cached value of is_kdump_kernel() when the device was probed. */
 242	bool in_kdump;
 243
 244	/* The driver is being removed. */
 245	spinlock_t removal_lock;
 246	bool removing;
 247
 248	/* Timer for retrying to plug/unplug memory. */
 249	struct hrtimer retry_timer;
 250	unsigned int retry_timer_ms;
 251#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
 252#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000
 253
 254	/* Memory notifier (online/offline events). */
 255	struct notifier_block memory_notifier;
 256
 257	/* Notifier to block hibernation image storing/reloading. */
 258	struct notifier_block pm_notifier;
 259
 260#ifdef CONFIG_PROC_VMCORE
 261	/* vmcore callback for /proc/vmcore handling in kdump mode */
 262	struct vmcore_cb vmcore_cb;
 263	uint64_t last_block_addr;
 264	bool last_block_plugged;
 265#endif /* CONFIG_PROC_VMCORE */
 266
 267	/* Next device in the list of virtio-mem devices. */
 268	struct list_head next;
 269};
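
/*
 * Editor's usage sketch (not part of the driver): sbm and bbm overlay each
 * other in the union above, with vm->in_sbm as the discriminator. The
 * smallest plug/unplug granularity therefore depends on the mode:
 */
static inline uint64_t __maybe_unused
virtio_mem_example_granularity(struct virtio_mem *vm)
{
	return vm->in_sbm ? vm->sbm.sb_size : vm->bbm.bb_size;
}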
 270
 271/*
 272 * We have to share a single online_page callback among all virtio-mem
 273 * devices. We use RCU to iterate the list in the callback.
 274 */
 275static DEFINE_MUTEX(virtio_mem_mutex);
 276static LIST_HEAD(virtio_mem_devices);
 277
 278static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
 279static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
 280						  unsigned long nr_pages);
 281static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
 282						   unsigned long nr_pages);
 283static void virtio_mem_retry(struct virtio_mem *vm);
 284static int virtio_mem_create_resource(struct virtio_mem *vm);
 285static void virtio_mem_delete_resource(struct virtio_mem *vm);
 286
 287/*
 288 * Register a virtio-mem device so it will be considered for the online_page
 289 * callback.
 290 */
 291static int register_virtio_mem_device(struct virtio_mem *vm)
 292{
 293	int rc = 0;
 294
 295	/* First device registers the callback. */
 296	mutex_lock(&virtio_mem_mutex);
 297	if (list_empty(&virtio_mem_devices))
 298		rc = set_online_page_callback(&virtio_mem_online_page_cb);
 299	if (!rc)
 300		list_add_rcu(&vm->next, &virtio_mem_devices);
 301	mutex_unlock(&virtio_mem_mutex);
 302
 303	return rc;
 304}
 305
 306/*
 307 * Unregister a virtio-mem device so it will no longer be considered for the
 308 * online_page callback.
 309 */
 310static void unregister_virtio_mem_device(struct virtio_mem *vm)
 311{
 312	/* Last device unregisters the callback. */
 313	mutex_lock(&virtio_mem_mutex);
 314	list_del_rcu(&vm->next);
 315	if (list_empty(&virtio_mem_devices))
 316		restore_online_page_callback(&virtio_mem_online_page_cb);
 317	mutex_unlock(&virtio_mem_mutex);
 318
 319	synchronize_rcu();
 320}
 321
 322/*
 323 * Calculate the memory block id of a given address.
 324 */
 325static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
 326{
 327	return addr / memory_block_size_bytes();
 328}
 329
 330/*
 331 * Calculate the physical start address of a given memory block id.
 332 */
 333static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
 334{
 335	return mb_id * memory_block_size_bytes();
 336}
 337
 338/*
 339 * Calculate the big block id of a given address.
 340 */
 341static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
 342					      uint64_t addr)
 343{
 344	return addr / vm->bbm.bb_size;
 345}
 346
 347/*
 348 * Calculate the physical start address of a given big block id.
 349 */
 350static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
 351					 unsigned long bb_id)
 352{
 353	return bb_id * vm->bbm.bb_size;
 354}
 355
 356/*
 357 * Calculate the subblock id of a given address.
 358 */
 359static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
 360					      unsigned long addr)
 361{
 362	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
 363	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);
 364
 365	return (addr - mb_addr) / vm->sbm.sb_size;
 366}
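
/*
 * Editor's worked example (not part of the driver): the id arithmetic above
 * with concrete, assumed sizes. With 128 MiB Linux memory blocks and 4 MiB
 * subblocks, address 0x48600000 (1158 MiB) lies in memory block
 * 1158 / 128 = 9, which starts at 1152 MiB; the 6 MiB offset into that
 * block yields subblock 6 / 4 = 1.
 */
static unsigned long __maybe_unused virtio_mem_example_id_math(void)
{
	const uint64_t mb_size = 128ULL << 20;	/* memory_block_size_bytes() */
	const uint64_t sb_size = 4ULL << 20;	/* vm->sbm.sb_size */
	const uint64_t addr = 0x48600000ULL;
	const unsigned long mb_id = addr / mb_size;	/* 9 */

	return (addr - mb_id * mb_size) / sb_size;	/* 1 */
}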
 367
 368/*
 369 * Set the state of a big block, taking care of the state counter.
 370 */
 371static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
 372					unsigned long bb_id,
 373					enum virtio_mem_bbm_bb_state state)
 374{
 375	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
 376	enum virtio_mem_bbm_bb_state old_state;
 377
 378	old_state = vm->bbm.bb_states[idx];
 379	vm->bbm.bb_states[idx] = state;
 380
 381	BUG_ON(vm->bbm.bb_count[old_state] == 0);
 382	vm->bbm.bb_count[old_state]--;
 383	vm->bbm.bb_count[state]++;
 384}
 385
 386/*
 387 * Get the state of a big block.
 388 */
 389static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
 390								unsigned long bb_id)
 391{
 392	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
 393}
 394
 395/*
 396 * Prepare the big block state array for the next big block.
 397 */
 398static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
 399{
 400	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
 401	unsigned long new_bytes = old_bytes + 1;
 402	int old_pages = PFN_UP(old_bytes);
 403	int new_pages = PFN_UP(new_bytes);
 404	uint8_t *new_array;
 405
 406	if (vm->bbm.bb_states && old_pages == new_pages)
 407		return 0;
 408
 409	new_array = vzalloc(new_pages * PAGE_SIZE);
 410	if (!new_array)
 411		return -ENOMEM;
 412
 413	mutex_lock(&vm->hotplug_mutex);
 414	if (vm->bbm.bb_states)
 415		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
 416	vfree(vm->bbm.bb_states);
 417	vm->bbm.bb_states = new_array;
 418	mutex_unlock(&vm->hotplug_mutex);
 419
 420	return 0;
 421}
 422
 423#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
  424	for (_bb_id = _vm->bbm.first_bb_id; \
  425	     _bb_id < _vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
 426	     _bb_id++) \
 427		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
 428
 429#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
  430	for (_bb_id = _vm->bbm.next_bb_id - 1; \
  431	     _bb_id >= _vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
 432	     _bb_id--) \
 433		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
 434
 435/*
 436 * Set the state of a memory block, taking care of the state counter.
 437 */
 438static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
 439					unsigned long mb_id, uint8_t state)
 440{
 441	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
 442	uint8_t old_state;
 443
 444	old_state = vm->sbm.mb_states[idx];
 445	vm->sbm.mb_states[idx] = state;
 446
 447	BUG_ON(vm->sbm.mb_count[old_state] == 0);
 448	vm->sbm.mb_count[old_state]--;
 449	vm->sbm.mb_count[state]++;
 450}
 451
 452/*
 453 * Get the state of a memory block.
 454 */
 455static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
 456					   unsigned long mb_id)
 457{
 458	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
 459
 460	return vm->sbm.mb_states[idx];
 461}
 462
 463/*
 464 * Prepare the state array for the next memory block.
 465 */
 466static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
 467{
 468	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
 469	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
 470	uint8_t *new_array;
 471
 472	if (vm->sbm.mb_states && old_pages == new_pages)
 473		return 0;
 474
 475	new_array = vzalloc(new_pages * PAGE_SIZE);
 476	if (!new_array)
 477		return -ENOMEM;
 478
 479	mutex_lock(&vm->hotplug_mutex);
 480	if (vm->sbm.mb_states)
 481		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
 482	vfree(vm->sbm.mb_states);
 483	vm->sbm.mb_states = new_array;
 484	mutex_unlock(&vm->hotplug_mutex);
 485
 486	return 0;
 487}
 488
 489#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
 490	for (_mb_id = _vm->sbm.first_mb_id; \
 491	     _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
 492	     _mb_id++) \
 493		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
 494
 495#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
 496	for (_mb_id = _vm->sbm.next_mb_id - 1; \
 497	     _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
 498	     _mb_id--) \
 499		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)
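
/*
 * Editor's usage sketch (not part of the driver): the iterators above expand
 * to a for-loop plus an if-filter, so they take a plain loop body. The
 * "&& _vm->sbm.mb_count[_state]" condition stops the scan early once no
 * block in the requested state remains. E.g., counting fully plugged
 * offline memory blocks:
 */
static unsigned long __maybe_unused
virtio_mem_example_count_offline(struct virtio_mem *vm)
{
	unsigned long mb_id, count = 0;

	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_OFFLINE)
		count++;
	return count;
}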
 500
 501/*
 502 * Calculate the bit number in the subblock bitmap for the given subblock
 503 * inside the given memory block.
 504 */
 505static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
 506					  unsigned long mb_id, int sb_id)
 507{
 508	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
 509}
 510
 511/*
 512 * Mark all selected subblocks plugged.
 513 *
 514 * Will not modify the state of the memory block.
 515 */
 516static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
 517					  unsigned long mb_id, int sb_id,
 518					  int count)
 519{
 520	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 521
 522	__bitmap_set(vm->sbm.sb_states, bit, count);
 523}
 524
 525/*
 526 * Mark all selected subblocks unplugged.
 527 *
 528 * Will not modify the state of the memory block.
 529 */
 530static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
 531					    unsigned long mb_id, int sb_id,
 532					    int count)
 533{
 534	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 535
 536	__bitmap_clear(vm->sbm.sb_states, bit, count);
 537}
 538
 539/*
 540 * Test if all selected subblocks are plugged.
 541 */
 542static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
 543					   unsigned long mb_id, int sb_id,
 544					   int count)
 545{
 546	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 547
 548	if (count == 1)
 549		return test_bit(bit, vm->sbm.sb_states);
 550
 551	/* TODO: Helper similar to bitmap_set() */
 552	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
 553	       bit + count;
 554}
 555
 556/*
 557 * Test if all selected subblocks are unplugged.
 558 */
 559static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
 560					     unsigned long mb_id, int sb_id,
 561					     int count)
 562{
 563	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);
 564
 565	/* TODO: Helper similar to bitmap_set() */
 566	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
 567	       bit + count;
 568}
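
/*
 * Editor's demonstration (not part of the driver): the find_next_*_bit()
 * pattern above answers "is the whole range [bit, bit + count) set/clear?".
 * find_next_zero_bit(map, size, offset) returns the first zero bit in
 * [offset, size), or a value >= size if there is none, so a result of at
 * least bit + count means every bit in the range is set.
 */
static void __maybe_unused virtio_mem_example_bitmap_range_test(void)
{
	DECLARE_BITMAP(map, 8) = { 0 };

	__bitmap_set(map, 2, 3);		/* bits 2..4 set */
	/* All of [2, 5) set? Yes: the first zero bit at/after 2 is bit 5. */
	WARN_ON(find_next_zero_bit(map, 5, 2) < 5);
	/* All of [2, 6) set? No: bit 5 is clear, so the result is 5 < 6. */
	WARN_ON(find_next_zero_bit(map, 6, 2) >= 6);
}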
 569
 570/*
 571 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
 572 * none.
 573 */
 574static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
 575					    unsigned long mb_id)
 576{
 577	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);
 578
 579	return find_next_zero_bit(vm->sbm.sb_states,
 580				  bit + vm->sbm.sbs_per_mb, bit) - bit;
 581}
 582
 583/*
 584 * Prepare the subblock bitmap for the next memory block.
 585 */
 586static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
 587{
 588	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
 589	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
 590	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
 591	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
 592	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
 593	unsigned long *new_bitmap, *old_bitmap;
 594
 595	if (vm->sbm.sb_states && old_pages == new_pages)
 596		return 0;
 597
 598	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
 599	if (!new_bitmap)
 600		return -ENOMEM;
 601
 602	mutex_lock(&vm->hotplug_mutex);
 603	if (vm->sbm.sb_states)
 604		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);
 605
 606	old_bitmap = vm->sbm.sb_states;
 607	vm->sbm.sb_states = new_bitmap;
 608	mutex_unlock(&vm->hotplug_mutex);
 609
 610	vfree(old_bitmap);
 611	return 0;
 612}
 613
 614/*
 615 * Test if we could add memory without creating too much offline memory -
  616 * to avoid running OOM if memory is getting onlined in a deferred fashion.
 617 */
 618static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
 619{
 620	if (WARN_ON_ONCE(size > vm->offline_threshold))
 621		return false;
 622
 623	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
 624}
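
/*
 * Editor's worked example (not part of the driver): with the default 1 GiB
 * offline threshold and 768 MiB of added-but-offline memory, adding another
 * 256 MiB still passes (768 + 256 <= 1024 MiB), while adding 512 MiB would
 * be deferred until more of the already-added memory gets onlined.
 */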
 625
 626/*
 627 * Try adding memory to Linux. Will usually only fail if out of memory.
 628 *
 629 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 630 * onlining code).
 631 *
 632 * Will not modify the state of memory blocks in virtio-mem.
 633 */
 634static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
 635				 uint64_t size)
 636{
 637	int rc;
 638
 639	/*
 640	 * When force-unloading the driver and we still have memory added to
 641	 * Linux, the resource name has to stay.
 642	 */
 643	if (!vm->resource_name) {
 644		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
 645						  GFP_KERNEL);
 646		if (!vm->resource_name)
 647			return -ENOMEM;
 648	}
 649
 650	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
 651		addr + size - 1);
 652	/* Memory might get onlined immediately. */
 653	atomic64_add(size, &vm->offline_size);
 654	rc = add_memory_driver_managed(vm->mgid, addr, size, vm->resource_name,
 655				       MHP_MERGE_RESOURCE | MHP_NID_IS_MGID);
 656	if (rc) {
 657		atomic64_sub(size, &vm->offline_size);
 658		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
 659		/*
 660		 * TODO: Linux MM does not properly clean up yet in all cases
 661		 * where adding of memory failed - especially on -ENOMEM.
 662		 */
 663	}
 664	return rc;
 665}
 666
 667/*
 668 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
 669 */
 670static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
 671{
 672	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 673	const uint64_t size = memory_block_size_bytes();
 674
 675	return virtio_mem_add_memory(vm, addr, size);
 676}
 677
 678/*
 679 * See virtio_mem_add_memory(): Try adding a big block.
 680 */
 681static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
 682{
 683	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
 684	const uint64_t size = vm->bbm.bb_size;
 685
 686	return virtio_mem_add_memory(vm, addr, size);
 687}
 688
 689/*
 690 * Try removing memory from Linux. Will only fail if memory blocks aren't
 691 * offline.
 692 *
 693 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 694 * onlining code).
 695 *
 696 * Will not modify the state of memory blocks in virtio-mem.
 697 */
 698static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
 699				    uint64_t size)
 700{
 701	int rc;
 702
 703	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
 704		addr + size - 1);
 705	rc = remove_memory(addr, size);
 706	if (!rc) {
 707		atomic64_sub(size, &vm->offline_size);
 708		/*
 709		 * We might have freed up memory we can now unplug, retry
 710		 * immediately instead of waiting.
 711		 */
 712		virtio_mem_retry(vm);
 713	} else {
 714		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
 715	}
 716	return rc;
 717}
 718
 719/*
 720 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
 721 */
 722static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
 723{
 724	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 725	const uint64_t size = memory_block_size_bytes();
 726
 727	return virtio_mem_remove_memory(vm, addr, size);
 728}
 729
 730/*
 731 * Try offlining and removing memory from Linux.
 732 *
 733 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 734 * onlining code).
 735 *
 736 * Will not modify the state of memory blocks in virtio-mem.
 737 */
 738static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
 739						uint64_t addr,
 740						uint64_t size)
 741{
 742	int rc;
 743
 744	dev_dbg(&vm->vdev->dev,
 745		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
 746		addr + size - 1);
 747
 748	rc = offline_and_remove_memory(addr, size);
 749	if (!rc) {
 750		atomic64_sub(size, &vm->offline_size);
 751		/*
 752		 * We might have freed up memory we can now unplug, retry
 753		 * immediately instead of waiting.
 754		 */
 755		virtio_mem_retry(vm);
 756		return 0;
 757	}
 758	dev_dbg(&vm->vdev->dev, "offlining and removing memory failed: %d\n", rc);
 759	/*
 760	 * We don't really expect this to fail, because we fake-offlined all
 761	 * memory already. But it could fail in corner cases.
 762	 */
 763	WARN_ON_ONCE(rc != -ENOMEM && rc != -EBUSY);
 764	return rc == -ENOMEM ? -ENOMEM : -EBUSY;
 765}
 766
 767/*
 768 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
 769 * a single Linux memory block.
 770 */
 771static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
 772						unsigned long mb_id)
 773{
 774	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
 775	const uint64_t size = memory_block_size_bytes();
 776
 777	return virtio_mem_offline_and_remove_memory(vm, addr, size);
 778}
 779
 780/*
 781 * Try (offlining and) removing memory from Linux in case all subblocks are
 782 * unplugged. Can be called on online and offline memory blocks.
 783 *
 784 * May modify the state of memory blocks in virtio-mem.
 785 */
 786static int virtio_mem_sbm_try_remove_unplugged_mb(struct virtio_mem *vm,
 787						  unsigned long mb_id)
 788{
 789	int rc;
 790
 791	/*
 792	 * Once all subblocks of a memory block were unplugged, offline and
 793	 * remove it.
 794	 */
 795	if (!virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
 796		return 0;
 797
 798	/* offline_and_remove_memory() works for online and offline memory. */
 799	mutex_unlock(&vm->hotplug_mutex);
 800	rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
 801	mutex_lock(&vm->hotplug_mutex);
 802	if (!rc)
 803		virtio_mem_sbm_set_mb_state(vm, mb_id,
 804					    VIRTIO_MEM_SBM_MB_UNUSED);
 805	return rc;
 806}
 807
 808/*
  809 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
 810 * all Linux memory blocks covered by the big block.
 811 */
 812static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
 813						unsigned long bb_id)
 814{
 815	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
 816	const uint64_t size = vm->bbm.bb_size;
 817
 818	return virtio_mem_offline_and_remove_memory(vm, addr, size);
 819}
 820
 821/*
 822 * Trigger the workqueue so the device can perform its magic.
 823 */
 824static void virtio_mem_retry(struct virtio_mem *vm)
 825{
 826	unsigned long flags;
 827
 828	spin_lock_irqsave(&vm->removal_lock, flags);
 829	if (!vm->removing)
 830		queue_work(system_freezable_wq, &vm->wq);
 831	spin_unlock_irqrestore(&vm->removal_lock, flags);
 832}
 833
 834static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
 835{
 836	int node = NUMA_NO_NODE;
 837
 838#if defined(CONFIG_ACPI_NUMA)
 839	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
 840		node = pxm_to_node(node_id);
 841#endif
 842	return node;
 843}
 844
 845/*
 846 * Test if a virtio-mem device overlaps with the given range. Can be called
 847 * from (notifier) callbacks lockless.
 848 */
 849static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
 850				      uint64_t size)
 851{
 852	return start < vm->addr + vm->region_size && vm->addr < start + size;
 853}
 854
 855/*
 856 * Test if a virtio-mem device contains a given range. Can be called from
 857 * (notifier) callbacks lockless.
 858 */
 859static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
 860				      uint64_t size)
 861{
 862	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
 863}
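
/*
 * Editor's worked example (not part of the driver): the two predicates above
 * are the usual half-open interval tests. For a device at [1 GiB, 3 GiB),
 * a range starting at 2 GiB with size 2 GiB overlaps it (2G < 3G and
 * 1G < 4G) but is not contained in it (2G + 2G > 3G).
 */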
 864
 865static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
 866					      unsigned long mb_id)
 867{
 868	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 869	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
 870	case VIRTIO_MEM_SBM_MB_OFFLINE:
 871		return NOTIFY_OK;
 872	default:
 873		break;
 874	}
 875	dev_warn_ratelimited(&vm->vdev->dev,
 876			     "memory block onlining denied\n");
 877	return NOTIFY_BAD;
 878}
 879
 880static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
 881					  unsigned long mb_id)
 882{
 883	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 884	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
 885	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
 886		virtio_mem_sbm_set_mb_state(vm, mb_id,
 887					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
 888		break;
 889	case VIRTIO_MEM_SBM_MB_KERNEL:
 890	case VIRTIO_MEM_SBM_MB_MOVABLE:
 891		virtio_mem_sbm_set_mb_state(vm, mb_id,
 892					    VIRTIO_MEM_SBM_MB_OFFLINE);
 893		break;
 894	default:
 895		BUG();
 896		break;
 897	}
 898}
 899
 900static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
 901					 unsigned long mb_id,
 902					 unsigned long start_pfn)
 903{
 904	const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn));
 905	int new_state;
 906
 907	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
 908	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
 909		new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
 910		if (is_movable)
 911			new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
 912		break;
 913	case VIRTIO_MEM_SBM_MB_OFFLINE:
 914		new_state = VIRTIO_MEM_SBM_MB_KERNEL;
 915		if (is_movable)
 916			new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
 917		break;
 918	default:
 919		BUG();
 920		break;
 921	}
 922	virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
 923}
 924
 925static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
 926						unsigned long mb_id)
 927{
 928	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
 929	unsigned long pfn;
 930	int sb_id;
 931
 932	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
 933		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
 934			continue;
 935		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 936			       sb_id * vm->sbm.sb_size);
 937		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
 938	}
 939}
 940
 941static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
 942						 unsigned long mb_id)
 943{
 944	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
 945	unsigned long pfn;
 946	int sb_id;
 947
 948	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
 949		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
 950			continue;
 951		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
 952			       sb_id * vm->sbm.sb_size);
 953		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
 954	}
 955}
 956
 957static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
 958						unsigned long bb_id,
 959						unsigned long pfn,
 960						unsigned long nr_pages)
 961{
 962	/*
  963	 * When marked as "fake-offline", all online memory of this big block
 964	 * is allocated by us. Otherwise, we don't have any memory allocated.
 965	 */
 966	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
 967	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
 968		return;
 969	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
 970}
 971
 972static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
 973						 unsigned long bb_id,
 974						 unsigned long pfn,
 975						 unsigned long nr_pages)
 976{
 977	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
 978	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
 979		return;
 980	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
 981}
 982
 983/*
 984 * This callback will either be called synchronously from add_memory() or
 985 * asynchronously (e.g., triggered via user space). We have to be careful
 986 * with locking when calling add_memory().
 987 */
 988static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
 989					 unsigned long action, void *arg)
 990{
 991	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
 992					     memory_notifier);
 993	struct memory_notify *mhp = arg;
 994	const unsigned long start = PFN_PHYS(mhp->start_pfn);
 995	const unsigned long size = PFN_PHYS(mhp->nr_pages);
 996	int rc = NOTIFY_OK;
 997	unsigned long id;
 998
 999	if (!virtio_mem_overlaps_range(vm, start, size))
1000		return NOTIFY_DONE;
1001
1002	if (vm->in_sbm) {
1003		id = virtio_mem_phys_to_mb_id(start);
1004		/*
1005		 * In SBM, we add memory in separate memory blocks - we expect
1006		 * it to be onlined/offlined in the same granularity. Bail out
1007		 * if this ever changes.
1008		 */
1009		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
1010				 !IS_ALIGNED(start, memory_block_size_bytes())))
1011			return NOTIFY_BAD;
1012	} else {
1013		id = virtio_mem_phys_to_bb_id(vm, start);
1014		/*
1015		 * In BBM, we only care about onlining/offlining happening
1016		 * within a single big block, we don't care about the
1017		 * actual granularity as we don't track individual Linux
1018		 * memory blocks.
1019		 */
1020		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
1021			return NOTIFY_BAD;
1022	}
1023
1024	/*
1025	 * Avoid circular locking lockdep warnings. We lock the mutex
1026	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
 1027	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
1028	 * between both notifier calls and will bail out. False positive.
1029	 */
1030	lockdep_off();
1031
1032	switch (action) {
1033	case MEM_GOING_OFFLINE:
1034		mutex_lock(&vm->hotplug_mutex);
1035		if (vm->removing) {
1036			rc = notifier_from_errno(-EBUSY);
1037			mutex_unlock(&vm->hotplug_mutex);
1038			break;
1039		}
1040		vm->hotplug_active = true;
1041		if (vm->in_sbm)
1042			virtio_mem_sbm_notify_going_offline(vm, id);
1043		else
1044			virtio_mem_bbm_notify_going_offline(vm, id,
1045							    mhp->start_pfn,
1046							    mhp->nr_pages);
1047		break;
1048	case MEM_GOING_ONLINE:
1049		mutex_lock(&vm->hotplug_mutex);
1050		if (vm->removing) {
1051			rc = notifier_from_errno(-EBUSY);
1052			mutex_unlock(&vm->hotplug_mutex);
1053			break;
1054		}
1055		vm->hotplug_active = true;
1056		if (vm->in_sbm)
1057			rc = virtio_mem_sbm_notify_going_online(vm, id);
1058		break;
1059	case MEM_OFFLINE:
1060		if (vm->in_sbm)
1061			virtio_mem_sbm_notify_offline(vm, id);
1062
1063		atomic64_add(size, &vm->offline_size);
1064		/*
1065		 * Trigger the workqueue. Now that we have some offline memory,
1066		 * maybe we can handle pending unplug requests.
1067		 */
1068		if (!unplug_online)
1069			virtio_mem_retry(vm);
1070
1071		vm->hotplug_active = false;
1072		mutex_unlock(&vm->hotplug_mutex);
1073		break;
1074	case MEM_ONLINE:
1075		if (vm->in_sbm)
1076			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);
1077
1078		atomic64_sub(size, &vm->offline_size);
1079		/*
1080		 * Start adding more memory once we onlined half of our
 1081		 * threshold. Don't trigger if it's possibly due to our action
1082		 * (e.g., us adding memory which gets onlined immediately from
1083		 * the core).
1084		 */
1085		if (!atomic_read(&vm->wq_active) &&
1086		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
1087			virtio_mem_retry(vm);
1088
1089		vm->hotplug_active = false;
1090		mutex_unlock(&vm->hotplug_mutex);
1091		break;
1092	case MEM_CANCEL_OFFLINE:
1093		if (!vm->hotplug_active)
1094			break;
1095		if (vm->in_sbm)
1096			virtio_mem_sbm_notify_cancel_offline(vm, id);
1097		else
1098			virtio_mem_bbm_notify_cancel_offline(vm, id,
1099							     mhp->start_pfn,
1100							     mhp->nr_pages);
1101		vm->hotplug_active = false;
1102		mutex_unlock(&vm->hotplug_mutex);
1103		break;
1104	case MEM_CANCEL_ONLINE:
1105		if (!vm->hotplug_active)
1106			break;
1107		vm->hotplug_active = false;
1108		mutex_unlock(&vm->hotplug_mutex);
1109		break;
1110	default:
1111		break;
1112	}
1113
1114	lockdep_on();
1115
1116	return rc;
1117}
1118
1119static int virtio_mem_pm_notifier_cb(struct notifier_block *nb,
1120				     unsigned long action, void *arg)
1121{
1122	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
1123					     pm_notifier);
1124	switch (action) {
1125	case PM_HIBERNATION_PREPARE:
1126	case PM_RESTORE_PREPARE:
1127		/*
1128		 * When restarting the VM, all memory is unplugged. Don't
1129		 * allow to hibernate and restore from an image.
1130		 */
1131		dev_err(&vm->vdev->dev, "hibernation is not supported.\n");
1132		return NOTIFY_BAD;
1133	default:
1134		return NOTIFY_OK;
1135	}
1136}
1137
1138/*
1139 * Set a range of pages PG_offline. Remember pages that were never onlined
1140 * (via generic_online_page()) using PageDirty().
1141 */
1142static void virtio_mem_set_fake_offline(unsigned long pfn,
1143					unsigned long nr_pages, bool onlined)
1144{
1145	page_offline_begin();
1146	for (; nr_pages--; pfn++) {
1147		struct page *page = pfn_to_page(pfn);
1148
1149		if (!onlined)
1150			/*
1151			 * Pages that have not been onlined yet were initialized
1152			 * to PageOffline(). Remember that we have to route them
1153			 * through generic_online_page().
1154			 */
1155			SetPageDirty(page);
1156		else
1157			__SetPageOffline(page);
1158		VM_WARN_ON_ONCE(!PageOffline(page));
1159	}
1160	page_offline_end();
1161}
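
/*
 * Editor's note (not part of the driver): PG_dirty doubles as a marker here.
 * A fake-offline page that was never onlined is PageOffline() + PageDirty()
 * and must later be routed through generic_online_page(); a fake-offline
 * page that was onlined and then grabbed via alloc_contig_range() is
 * PageOffline() only and is returned via free_contig_range(). See
 * virtio_mem_fake_online() below, which keys off exactly this distinction.
 */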
1162
1163/*
 1164 * Clear PG_offline from a range of pages. If the pages were never onlined
1165 * (via generic_online_page()), clear PageDirty().
1166 */
1167static void virtio_mem_clear_fake_offline(unsigned long pfn,
1168					  unsigned long nr_pages, bool onlined)
1169{
1170	for (; nr_pages--; pfn++) {
1171		struct page *page = pfn_to_page(pfn);
1172
1173		if (!onlined)
1174			/* generic_online_page() will clear PageOffline(). */
1175			ClearPageDirty(page);
1176		else
1177			__ClearPageOffline(page);
1178	}
1179}
1180
1181/*
1182 * Release a range of fake-offline pages to the buddy, effectively
1183 * fake-onlining them.
1184 */
1185static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
1186{
1187	unsigned long order = MAX_PAGE_ORDER;
1188	unsigned long i;
1189
1190	/*
1191	 * We might get called for ranges that don't cover properly aligned
1192	 * MAX_PAGE_ORDER pages; however, we can only online properly aligned
1193	 * pages with an order of MAX_PAGE_ORDER at maximum.
1194	 */
1195	while (!IS_ALIGNED(pfn | nr_pages, 1 << order))
1196		order--;
1197
1198	for (i = 0; i < nr_pages; i += 1 << order) {
1199		struct page *page = pfn_to_page(pfn + i);
1200
1201		/*
1202		 * If the page is PageDirty(), it was kept fake-offline when
1203		 * onlining the memory block. Otherwise, it was allocated
1204		 * using alloc_contig_range(). All pages in a subblock are
1205		 * alike.
1206		 */
1207		if (PageDirty(page)) {
1208			virtio_mem_clear_fake_offline(pfn + i, 1 << order, false);
1209			generic_online_page(page, order);
1210		} else {
1211			virtio_mem_clear_fake_offline(pfn + i, 1 << order, true);
1212			free_contig_range(pfn + i, 1 << order);
1213			adjust_managed_page_count(page, 1 << order);
1214		}
1215	}
1216}
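
/*
 * Editor's worked example (not part of the driver): the alignment loop above
 * picks the largest order that both the start pfn and the length are aligned
 * to. E.g., for pfn = 0x800 and nr_pages = 0x200: pfn | nr_pages = 0xa00,
 * whose lowest set bit is 1 << 9, so the loop settles on order = 9 and the
 * range is released as a single 512-page (2 MiB, with 4 KiB pages) chunk.
 */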
1217
1218/*
1219 * Try to allocate a range, marking pages fake-offline, effectively
1220 * fake-offlining them.
1221 */
1222static int virtio_mem_fake_offline(struct virtio_mem *vm, unsigned long pfn,
1223				   unsigned long nr_pages)
1224{
1225	const bool is_movable = is_zone_movable_page(pfn_to_page(pfn));
1226	int rc, retry_count;
1227
1228	/*
1229	 * TODO: We want an alloc_contig_range() mode that tries to allocate
1230	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
1231	 * with ZONE_MOVABLE. So for now, retry a couple of times with
1232	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
1233	 * some guarantees.
1234	 */
1235	for (retry_count = 0; retry_count < 5; retry_count++) {
1236		/*
1237		 * If the config changed, stop immediately and go back to the
1238		 * main loop: avoid trying to keep unplugging if the device
1239		 * might have decided to not remove any more memory.
1240		 */
1241		if (atomic_read(&vm->config_changed))
1242			return -EAGAIN;
1243
1244		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
1245					GFP_KERNEL);
1246		if (rc == -ENOMEM)
1247			/* whoops, out of memory */
1248			return rc;
1249		else if (rc && !is_movable)
1250			break;
1251		else if (rc)
1252			continue;
1253
1254		virtio_mem_set_fake_offline(pfn, nr_pages, true);
1255		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
1256		return 0;
1257	}
1258
1259	return -EBUSY;
1260}
1261
1262/*
1263 * Handle fake-offline pages when memory is going offline - such that the
1264 * pages can be skipped by mm-core when offlining.
1265 */
1266static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
1267						  unsigned long nr_pages)
1268{
1269	struct page *page;
1270	unsigned long i;
1271
1272	/* Drop our reference to the pages so the memory can get offlined. */
1273	for (i = 0; i < nr_pages; i++) {
1274		page = pfn_to_page(pfn + i);
1275		if (WARN_ON(!page_ref_dec_and_test(page)))
1276			dump_page(page, "fake-offline page referenced");
1277	}
1278}
1279
1280/*
1281 * Handle fake-offline pages when memory offlining is canceled - to undo
1282 * what we did in virtio_mem_fake_offline_going_offline().
1283 */
1284static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
1285						   unsigned long nr_pages)
1286{
1287	unsigned long i;
1288
1289	/*
1290	 * Get the reference again that we dropped via page_ref_dec_and_test()
1291	 * when going offline.
1292	 */
1293	for (i = 0; i < nr_pages; i++)
1294		page_ref_inc(pfn_to_page(pfn + i));
1295}
1296
1297static void virtio_mem_online_page(struct virtio_mem *vm,
1298				   struct page *page, unsigned int order)
1299{
1300	const unsigned long start = page_to_phys(page);
1301	const unsigned long end = start + PFN_PHYS(1 << order);
1302	unsigned long addr, next, id, sb_id, count;
1303	bool do_online;
1304
1305	/*
1306	 * We can get called with any order up to MAX_PAGE_ORDER. If our subblock
1307	 * size is smaller than that and we have a mixture of plugged and
1308	 * unplugged subblocks within such a page, we have to process in
1309	 * smaller granularity. In that case we'll adjust the order exactly once
1310	 * within the loop.
1311	 */
1312	for (addr = start; addr < end; ) {
1313		next = addr + PFN_PHYS(1 << order);
1314
1315		if (vm->in_sbm) {
1316			id = virtio_mem_phys_to_mb_id(addr);
1317			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
1318			count = virtio_mem_phys_to_sb_id(vm, next - 1) - sb_id + 1;
1319
1320			if (virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, count)) {
1321				/* Fully plugged. */
1322				do_online = true;
1323			} else if (count == 1 ||
1324				   virtio_mem_sbm_test_sb_unplugged(vm, id, sb_id, count)) {
1325				/* Fully unplugged. */
1326				do_online = false;
1327			} else {
1328				/*
1329				 * Mixture, process sub-blocks instead. This
1330				 * will be at least the size of a pageblock.
1331				 * We'll run into this case exactly once.
1332				 */
1333				order = ilog2(vm->sbm.sb_size) - PAGE_SHIFT;
1334				do_online = virtio_mem_sbm_test_sb_plugged(vm, id, sb_id, 1);
1335				continue;
1336			}
1337		} else {
1338			/*
1339			 * If the whole block is marked fake offline, keep
1340			 * everything that way.
1341			 */
1342			id = virtio_mem_phys_to_bb_id(vm, addr);
1343			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
1344				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
1345		}
1346
1347		if (do_online)
1348			generic_online_page(pfn_to_page(PFN_DOWN(addr)), order);
1349		else
1350			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
1351						    false);
1352		addr = next;
1353	}
1354}
1355
1356static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
1357{
1358	const unsigned long addr = page_to_phys(page);
1359	struct virtio_mem *vm;
1360
1361	rcu_read_lock();
1362	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
1363		/*
1364		 * Pages we're onlining will never cross memory blocks and,
1365		 * therefore, not virtio-mem devices.
1366		 */
1367		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
1368			continue;
1369
1370		/*
1371		 * virtio_mem_set_fake_offline() might sleep. We can safely
1372		 * drop the RCU lock at this point because the device
1373		 * cannot go away. See virtio_mem_remove() how races
1374		 * between memory onlining and device removal are handled.
1375		 */
1376		rcu_read_unlock();
1377
1378		virtio_mem_online_page(vm, page, order);
1379		return;
1380	}
1381	rcu_read_unlock();
1382
1383	/* not virtio-mem memory, but e.g., a DIMM. online it */
1384	generic_online_page(page, order);
1385}
1386
1387static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
1388					const struct virtio_mem_req *req)
1389{
1390	struct scatterlist *sgs[2], sg_req, sg_resp;
1391	unsigned int len;
1392	int rc;
1393
1394	/* don't use the request residing on the stack (vaddr) */
1395	vm->req = *req;
1396
1397	/* out: buffer for request */
1398	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
1399	sgs[0] = &sg_req;
1400
1401	/* in: buffer for response */
1402	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
1403	sgs[1] = &sg_resp;
1404
1405	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
1406	if (rc < 0)
1407		return rc;
1408
1409	virtqueue_kick(vm->vq);
1410
1411	/* wait for a response */
1412	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));
1413
1414	return virtio16_to_cpu(vm->vdev, vm->resp.type);
1415}
1416
1417static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
1418					uint64_t size)
1419{
1420	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1421	const struct virtio_mem_req req = {
1422		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
1423		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
1424		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1425	};
1426	int rc = -ENOMEM;
1427
1428	if (atomic_read(&vm->config_changed))
1429		return -EAGAIN;
1430
1431	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
1432		addr + size - 1);
1433
1434	switch (virtio_mem_send_request(vm, &req)) {
1435	case VIRTIO_MEM_RESP_ACK:
1436		vm->plugged_size += size;
1437		return 0;
1438	case VIRTIO_MEM_RESP_NACK:
1439		rc = -EAGAIN;
1440		break;
1441	case VIRTIO_MEM_RESP_BUSY:
1442		rc = -ETXTBSY;
1443		break;
1444	case VIRTIO_MEM_RESP_ERROR:
1445		rc = -EINVAL;
1446		break;
1447	default:
1448		break;
1449	}
1450
1451	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
1452	return rc;
1453}
1454
1455static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
1456					  uint64_t size)
1457{
1458	const uint64_t nb_vm_blocks = size / vm->device_block_size;
1459	const struct virtio_mem_req req = {
1460		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
1461		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
1462		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
1463	};
1464	int rc = -ENOMEM;
1465
1466	if (atomic_read(&vm->config_changed))
1467		return -EAGAIN;
1468
1469	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
1470		addr + size - 1);
1471
1472	switch (virtio_mem_send_request(vm, &req)) {
1473	case VIRTIO_MEM_RESP_ACK:
1474		vm->plugged_size -= size;
1475		return 0;
1476	case VIRTIO_MEM_RESP_BUSY:
1477		rc = -ETXTBSY;
1478		break;
1479	case VIRTIO_MEM_RESP_ERROR:
1480		rc = -EINVAL;
1481		break;
1482	default:
1483		break;
1484	}
1485
1486	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
1487	return rc;
1488}
1489
1490static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
1491{
1492	const struct virtio_mem_req req = {
1493		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
1494	};
1495	int rc = -ENOMEM;
1496
1497	dev_dbg(&vm->vdev->dev, "unplugging all memory");
1498
1499	switch (virtio_mem_send_request(vm, &req)) {
1500	case VIRTIO_MEM_RESP_ACK:
1501		vm->unplug_all_required = false;
1502		vm->plugged_size = 0;
1503		/* usable region might have shrunk */
1504		atomic_set(&vm->config_changed, 1);
1505		return 0;
1506	case VIRTIO_MEM_RESP_BUSY:
1507		rc = -ETXTBSY;
1508		break;
1509	default:
1510		break;
1511	}
1512
1513	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
1514	return rc;
1515}
1516
1517/*
1518 * Plug selected subblocks. Updates the plugged state, but not the state
1519 * of the memory block.
1520 */
1521static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
1522				  int sb_id, int count)
1523{
1524	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1525			      sb_id * vm->sbm.sb_size;
1526	const uint64_t size = count * vm->sbm.sb_size;
1527	int rc;
1528
1529	rc = virtio_mem_send_plug_request(vm, addr, size);
1530	if (!rc)
1531		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
1532	return rc;
1533}
1534
1535/*
1536 * Unplug selected subblocks. Updates the plugged state, but not the state
1537 * of the memory block.
1538 */
1539static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
1540				    int sb_id, int count)
1541{
1542	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
1543			      sb_id * vm->sbm.sb_size;
1544	const uint64_t size = count * vm->sbm.sb_size;
1545	int rc;
1546
1547	rc = virtio_mem_send_unplug_request(vm, addr, size);
1548	if (!rc)
1549		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
1550	return rc;
1551}
1552
1553/*
1554 * Request to unplug a big block.
1555 *
1556 * Will not modify the state of the big block.
1557 */
1558static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
1559{
1560	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1561	const uint64_t size = vm->bbm.bb_size;
1562
1563	return virtio_mem_send_unplug_request(vm, addr, size);
1564}
1565
1566/*
1567 * Request to plug a big block.
1568 *
1569 * Will not modify the state of the big block.
1570 */
1571static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
1572{
1573	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
1574	const uint64_t size = vm->bbm.bb_size;
1575
1576	return virtio_mem_send_plug_request(vm, addr, size);
1577}
1578
1579/*
 1580 * Unplug the desired number of plugged subblocks of an offline or not-added
1581 * memory block. Will fail if any subblock cannot get unplugged (instead of
1582 * skipping it).
1583 *
1584 * Will not modify the state of the memory block.
1585 *
1586 * Note: can fail after some subblocks were unplugged.
1587 */
1588static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
1589					    unsigned long mb_id, uint64_t *nb_sb)
1590{
1591	int sb_id, count;
1592	int rc;
1593
1594	sb_id = vm->sbm.sbs_per_mb - 1;
1595	while (*nb_sb) {
1596		/* Find the next candidate subblock */
1597		while (sb_id >= 0 &&
1598		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
1599			sb_id--;
1600		if (sb_id < 0)
1601			break;
1602		/* Try to unplug multiple subblocks at a time */
1603		count = 1;
1604		while (count < *nb_sb && sb_id > 0 &&
1605		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
1606			count++;
1607			sb_id--;
1608		}
1609
1610		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1611		if (rc)
1612			return rc;
1613		*nb_sb -= count;
1614		sb_id--;
1615	}
1616
1617	return 0;
1618}
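
/*
 * Editor's worked trace (not part of the driver): assume sbs_per_mb = 8, a
 * plugged bitmap of 0b00110110 (subblocks 1, 2, 4, 5 plugged) and
 * *nb_sb = 4. The scan starts at sb_id = 7, skips the unplugged subblocks
 * 7 and 6, batches the plugged run {4, 5} into one unplug request, skips
 * subblock 3, then unplugs the run {1, 2} - two device requests instead of
 * four, leaving *nb_sb = 0.
 */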
1619
1620/*
1621 * Unplug all plugged subblocks of an offline or not-added memory block.
1622 *
1623 * Will not modify the state of the memory block.
1624 *
1625 * Note: can fail after some subblocks were unplugged.
1626 */
1627static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
1628{
1629	uint64_t nb_sb = vm->sbm.sbs_per_mb;
1630
1631	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
1632}
1633
1634/*
1635 * Prepare tracking data for the next memory block.
1636 */
1637static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
1638					  unsigned long *mb_id)
1639{
1640	int rc;
1641
1642	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
1643		return -ENOSPC;
1644
1645	/* Resize the state array if required. */
1646	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
1647	if (rc)
1648		return rc;
1649
1650	/* Resize the subblock bitmap if required. */
1651	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
1652	if (rc)
1653		return rc;
1654
1655	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
1656	*mb_id = vm->sbm.next_mb_id++;
1657	return 0;
1658}
1659
1660/*
1661 * Try to plug the desired number of subblocks and add the memory block
1662 * to Linux.
1663 *
1664 * Will modify the state of the memory block.
1665 */
1666static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
1667					  unsigned long mb_id, uint64_t *nb_sb)
1668{
1669	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
1670	int rc;
1671
1672	if (WARN_ON_ONCE(!count))
1673		return -EINVAL;
1674
1675	/*
 1676	 * Plug the requested number of subblocks before adding it to Linux,
1677	 * so that onlining will directly online all plugged subblocks.
1678	 */
1679	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
1680	if (rc)
1681		return rc;
1682
1683	/*
1684	 * Mark the block properly offline before adding it to Linux,
1685	 * so the memory notifiers will find the block in the right state.
1686	 */
1687	if (count == vm->sbm.sbs_per_mb)
1688		virtio_mem_sbm_set_mb_state(vm, mb_id,
1689					    VIRTIO_MEM_SBM_MB_OFFLINE);
1690	else
1691		virtio_mem_sbm_set_mb_state(vm, mb_id,
1692					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1693
 1694	/* Add the memory block to Linux - if that fails, try to unplug. */
1695	rc = virtio_mem_sbm_add_mb(vm, mb_id);
1696	if (rc) {
1697		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;
1698
1699		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
1700			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
1701		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
1702		return rc;
1703	}
1704
1705	*nb_sb -= count;
1706	return 0;
1707}
1708
1709/*
1710 * Try to plug the desired number of subblocks of a memory block that
1711 * is already added to Linux.
1712 *
1713 * Will modify the state of the memory block.
1714 *
1715 * Note: Can fail after some subblocks were successfully plugged.
1716 */
1717static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
1718				      unsigned long mb_id, uint64_t *nb_sb)
1719{
1720	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1721	unsigned long pfn, nr_pages;
1722	int sb_id, count;
1723	int rc;
1724
1725	if (WARN_ON_ONCE(!*nb_sb))
1726		return -EINVAL;
1727
1728	while (*nb_sb) {
1729		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
1730		if (sb_id >= vm->sbm.sbs_per_mb)
1731			break;
1732		count = 1;
1733		while (count < *nb_sb &&
1734		       sb_id + count < vm->sbm.sbs_per_mb &&
1735		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
1736			count++;
1737
1738		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
1739		if (rc)
1740			return rc;
1741		*nb_sb -= count;
1742		if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
1743			continue;
1744
1745		/* fake-online the pages if the memory block is online */
1746		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1747			       sb_id * vm->sbm.sb_size);
1748		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
1749		virtio_mem_fake_online(pfn, nr_pages);
1750	}
1751
1752	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1753		virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
1754
1755	return 0;
1756}
1757
1758static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1759{
1760	const int mb_states[] = {
1761		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1762		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1763		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1764	};
1765	uint64_t nb_sb = diff / vm->sbm.sb_size;
1766	unsigned long mb_id;
1767	int rc, i;
1768
1769	if (!nb_sb)
1770		return 0;
1771
1772	/* Don't race with onlining/offlining */
1773	mutex_lock(&vm->hotplug_mutex);
1774
1775	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
1776		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
1777			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
1778			if (rc || !nb_sb)
1779				goto out_unlock;
1780			cond_resched();
1781		}
1782	}
1783
1784	/*
1785	 * We won't be working on online/offline memory blocks from this point,
1786	 * so we can't race with memory onlining/offlining. Drop the mutex.
1787	 */
1788	mutex_unlock(&vm->hotplug_mutex);
1789
1790	/* Try to plug and add unused blocks */
1791	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
1792		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1793			return -ENOSPC;
1794
1795		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1796		if (rc || !nb_sb)
1797			return rc;
1798		cond_resched();
1799	}
1800
1801	/* Try to prepare, plug and add new blocks */
1802	while (nb_sb) {
1803		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1804			return -ENOSPC;
1805
1806		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
1807		if (rc)
1808			return rc;
1809		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1810		if (rc)
1811			return rc;
1812		cond_resched();
1813	}
1814
1815	return 0;
1816out_unlock:
1817	mutex_unlock(&vm->hotplug_mutex);
1818	return rc;
1819}
1820
1821/*
1822 * Plug a big block and add it to Linux.
1823 *
1824 * Will modify the state of the big block.
1825 */
1826static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
1827					  unsigned long bb_id)
1828{
1829	int rc;
1830
1831	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
1832			 VIRTIO_MEM_BBM_BB_UNUSED))
1833		return -EINVAL;
1834
1835	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
1836	if (rc)
1837		return rc;
1838	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
1839
1840	rc = virtio_mem_bbm_add_bb(vm, bb_id);
1841	if (rc) {
1842		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
1843			virtio_mem_bbm_set_bb_state(vm, bb_id,
1844						    VIRTIO_MEM_BBM_BB_UNUSED);
1845		else
1846			/* Retry from the main loop. */
1847			virtio_mem_bbm_set_bb_state(vm, bb_id,
1848						    VIRTIO_MEM_BBM_BB_PLUGGED);
1849		return rc;
1850	}
1851	return 0;
1852}
1853
1854/*
1855 * Prepare tracking data for the next big block.
1856 */
1857static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
1858					  unsigned long *bb_id)
1859{
1860	int rc;
1861
1862	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
1863		return -ENOSPC;
1864
1865	/* Resize the big block state array if required. */
1866	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
1867	if (rc)
1868		return rc;
1869
1870	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
1871	*bb_id = vm->bbm.next_bb_id;
1872	vm->bbm.next_bb_id++;
1873	return 0;
1874}
1875
1876static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1877{
1878	uint64_t nb_bb = diff / vm->bbm.bb_size;
1879	unsigned long bb_id;
1880	int rc;
1881
1882	if (!nb_bb)
1883		return 0;
1884
1885	/* Try to plug and add unused big blocks */
1886	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
1887		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1888			return -ENOSPC;
1889
1890		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1891		if (!rc)
1892			nb_bb--;
1893		if (rc || !nb_bb)
1894			return rc;
1895		cond_resched();
1896	}
1897
1898	/* Try to prepare, plug and add new big blocks */
1899	while (nb_bb) {
1900		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1901			return -ENOSPC;
1902
1903		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
1904		if (rc)
1905			return rc;
1906		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1907		if (!rc)
1908			nb_bb--;
1909		if (rc)
1910			return rc;
1911		cond_resched();
1912	}
1913
1914	return 0;
1915}
1916
1917/*
1918 * Try to plug the requested amount of memory.
1919 */
1920static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1921{
1922	if (vm->in_sbm)
1923		return virtio_mem_sbm_plug_request(vm, diff);
1924	return virtio_mem_bbm_plug_request(vm, diff);
1925}
1926
1927/*
1928 * Unplug the desired number of plugged subblocks of an offline memory block.
1929 * Will fail if any subblock cannot get unplugged (instead of skipping it).
1930 *
1931 * Will modify the state of the memory block. Might temporarily drop the
1932 * hotplug_mutex.
1933 *
1934 * Note: Can fail after some subblocks were successfully unplugged.
1935 */
1936static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
1937						unsigned long mb_id,
1938						uint64_t *nb_sb)
1939{
1940	int rc;
1941
1942	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
1943
1944	/* some subblocks might have been unplugged even on failure */
1945	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1946		virtio_mem_sbm_set_mb_state(vm, mb_id,
1947					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1948	if (rc)
1949		return rc;
1950
1951	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1952		/*
1953		 * Remove the block from Linux - this should never fail.
1954		 * Prevent the block from getting onlined by marking it
1955		 * unplugged. Temporarily drop the mutex, so
1956		 * any pending GOING_ONLINE requests can be serviced/rejected.
1957		 */
1958		virtio_mem_sbm_set_mb_state(vm, mb_id,
1959					    VIRTIO_MEM_SBM_MB_UNUSED);
1960
1961		mutex_unlock(&vm->hotplug_mutex);
1962		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
1963		BUG_ON(rc);
1964		mutex_lock(&vm->hotplug_mutex);
1965	}
1966	return 0;
1967}
1968
1969/*
1970 * Unplug the given plugged subblocks of an online memory block.
1971 *
1972 * Will modify the state of the memory block.
1973 */
1974static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
1975					   unsigned long mb_id, int sb_id,
1976					   int count)
1977{
1978	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
1979	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1980	unsigned long start_pfn;
1981	int rc;
1982
1983	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1984			     sb_id * vm->sbm.sb_size);
1985
1986	rc = virtio_mem_fake_offline(vm, start_pfn, nr_pages);
1987	if (rc)
1988		return rc;
1989
1990	/* Try to unplug the allocated memory */
1991	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1992	if (rc) {
1993		/* Return the memory to the buddy. */
1994		virtio_mem_fake_online(start_pfn, nr_pages);
1995		return rc;
1996	}
1997
1998	switch (old_state) {
1999	case VIRTIO_MEM_SBM_MB_KERNEL:
2000		virtio_mem_sbm_set_mb_state(vm, mb_id,
2001					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
2002		break;
2003	case VIRTIO_MEM_SBM_MB_MOVABLE:
2004		virtio_mem_sbm_set_mb_state(vm, mb_id,
2005					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
2006		break;
2007	}
2008
2009	return 0;
2010}
2011
2012/*
2013 * Unplug the desired number of plugged subblocks of an online memory block.
2014 * Will skip subblocks that are busy.
2015 *
2016 * Will modify the state of the memory block. Might temporarily drop the
2017 * hotplug_mutex.
2018 *
2019 * Note: Can fail after some subblocks were successfully unplugged. Can
2020 *       return 0 even if subblocks were busy and could not get unplugged.
2021 */
2022static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
2023					       unsigned long mb_id,
2024					       uint64_t *nb_sb)
2025{
2026	int rc, sb_id;
2027
2028	/* If possible, try to unplug the complete block in one shot. */
2029	if (*nb_sb >= vm->sbm.sbs_per_mb &&
2030	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
2031		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
2032						     vm->sbm.sbs_per_mb);
2033		if (!rc) {
2034			*nb_sb -= vm->sbm.sbs_per_mb;
2035			goto unplugged;
2036		} else if (rc != -EBUSY)
2037			return rc;
2038	}
2039
2040	/* Fallback to single subblocks. */
2041	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
2042		/* Find the next candidate subblock */
2043		while (sb_id >= 0 &&
2044		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
2045			sb_id--;
2046		if (sb_id < 0)
2047			break;
2048
2049		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
2050		if (rc == -EBUSY)
2051			continue;
2052		else if (rc)
2053			return rc;
2054		*nb_sb -= 1;
2055	}
2056
2057unplugged:
2058	rc = virtio_mem_sbm_try_remove_unplugged_mb(vm, mb_id);
2059	if (rc)
2060		vm->sbm.have_unplugged_mb = 1;
2061	/* Ignore errors, this is not critical. We'll retry later. */
2062	return 0;
2063}
2064
2065/*
2066 * Unplug the desired number of plugged subblocks of a memory block that is
2067 * already added to Linux. Will skip subblocks of online memory blocks that
2068 * are busy (in use by the OS). Will fail if any subblock that's not busy
2069 * cannot get unplugged.
2070 *
2071 * Will modify the state of the memory block. Might temporarily drop the
2072 * hotplug_mutex.
2073 *
2074 * Note: Can fail after some subblocks were successfully unplugged. Can
2075 *       return 0 even if subblocks were busy and could not get unplugged.
2076 */
2077static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
2078					unsigned long mb_id,
2079					uint64_t *nb_sb)
2080{
2081	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
2082
2083	switch (old_state) {
2084	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
2085	case VIRTIO_MEM_SBM_MB_KERNEL:
2086	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
2087	case VIRTIO_MEM_SBM_MB_MOVABLE:
2088		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
2089	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
2090	case VIRTIO_MEM_SBM_MB_OFFLINE:
2091		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
2092	}
2093	return -EINVAL;
2094}
2095
2096static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2097{
2098	const int mb_states[] = {
2099		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
2100		VIRTIO_MEM_SBM_MB_OFFLINE,
2101		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
2102		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
2103		VIRTIO_MEM_SBM_MB_MOVABLE,
2104		VIRTIO_MEM_SBM_MB_KERNEL,
2105	};
2106	uint64_t nb_sb = diff / vm->sbm.sb_size;
2107	unsigned long mb_id;
2108	int rc, i;
2109
2110	if (!nb_sb)
2111		return 0;
2112
2113	/*
2114	 * We'll drop the mutex a couple of times when it is safe to do so.
2115	 * This might result in some blocks switching state (online/offline)
2116	 * and we could miss them in this run - we will retry later.
2117	 */
2118	mutex_lock(&vm->hotplug_mutex);
2119
2120	/*
2121	 * We try to unplug from partially plugged blocks first, to try
2122	 * removing whole memory blocks along with metadata. We prioritize
2123	 * ZONE_MOVABLE as it's more reliable to unplug memory and remove
2124	 * whole memory blocks, and we don't want to trigger zone imbalances
2125	 * by accidentally removing too much kernel memory.
2126	 */
2127	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
2128		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
2129			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
2130			if (rc || !nb_sb)
2131				goto out_unlock;
2132			mutex_unlock(&vm->hotplug_mutex);
2133			cond_resched();
2134			mutex_lock(&vm->hotplug_mutex);
2135		}
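		/*
		 * The first two entries in mb_states[] cover offline blocks;
		 * if we may not unplug online memory, stop after processing
		 * them.
		 */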
2136		if (!unplug_online && i == 1) {
2137			mutex_unlock(&vm->hotplug_mutex);
2138			return 0;
2139		}
2140	}
2141
2142	mutex_unlock(&vm->hotplug_mutex);
2143	return nb_sb ? -EBUSY : 0;
2144out_unlock:
2145	mutex_unlock(&vm->hotplug_mutex);
2146	return rc;
2147}
2148
2149/*
2150 * Try to offline and remove a big block from Linux and unplug it. Will fail
2151 * with -EBUSY if some memory is busy and cannot get unplugged.
2152 *
2153 * Will modify the state of the memory block. Might temporarily drop the
2154 * hotplug_mutex.
2155 */
2156static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
2157						       unsigned long bb_id)
2158{
2159	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2160	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2161	unsigned long end_pfn = start_pfn + nr_pages;
2162	unsigned long pfn;
2163	struct page *page;
2164	int rc;
2165
2166	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
2167			 VIRTIO_MEM_BBM_BB_ADDED))
2168		return -EINVAL;
2169
2170	/*
2171	 * Start by fake-offlining all memory. Once we have marked the
2172	 * device block as fake-offline, all newly onlined memory will
2173	 * automatically be kept fake-offline. Protect from concurrent
2174	 * onlining/offlining until we have a consistent state.
2175	 */
2176	mutex_lock(&vm->hotplug_mutex);
2177	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
2178
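	/*
	 * Walk the big block in memory section granularity; sections are
	 * online/offline as a whole, so probing the first PFN of each
	 * section is sufficient.
	 */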
2179	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2180		page = pfn_to_online_page(pfn);
2181		if (!page)
2182			continue;
2183
2184		rc = virtio_mem_fake_offline(vm, pfn, PAGES_PER_SECTION);
2185		if (rc) {
2186			end_pfn = pfn;
2187			goto rollback;
2188		}
2189	}
2190	mutex_unlock(&vm->hotplug_mutex);
2191
2192	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
2193	if (rc) {
2194		mutex_lock(&vm->hotplug_mutex);
2195		goto rollback;
2196	}
2197
2198	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
2199	if (rc)
2200		virtio_mem_bbm_set_bb_state(vm, bb_id,
2201					    VIRTIO_MEM_BBM_BB_PLUGGED);
2202	else
2203		virtio_mem_bbm_set_bb_state(vm, bb_id,
2204					    VIRTIO_MEM_BBM_BB_UNUSED);
2205	return rc;
2206
2207rollback:
2208	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2209		page = pfn_to_online_page(pfn);
2210		if (!page)
2211			continue;
2212		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
2213	}
2214	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
2215	mutex_unlock(&vm->hotplug_mutex);
2216	return rc;
2217}
2218
2219/*
2220 * Test if a big block is completely offline.
2221 */
2222static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
2223					 unsigned long bb_id)
2224{
2225	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2226	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2227	unsigned long pfn;
2228
2229	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2230	     pfn += PAGES_PER_SECTION) {
2231		if (pfn_to_online_page(pfn))
2232			return false;
2233	}
2234
2235	return true;
2236}
2237
2238/*
2239 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2240 */
2241static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
2242					 unsigned long bb_id)
2243{
2244	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2245	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2246	struct page *page;
2247	unsigned long pfn;
2248
2249	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2250	     pfn += PAGES_PER_SECTION) {
2251		page = pfn_to_online_page(pfn);
2252		if (!page)
2253			continue;
2254		if (page_zonenum(page) != ZONE_MOVABLE)
2255			return false;
2256	}
2257
2258	return true;
2259}
2260
2261static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2262{
2263	uint64_t nb_bb = diff / vm->bbm.bb_size;
2264	uint64_t bb_id;
2265	int rc, i;
2266
2267	if (!nb_bb)
2268		return 0;
2269
2270	/*
2271	 * Try to unplug big blocks. Similar to SBM, start with offline big
2272	 * blocks (pass 0), then ZONE_MOVABLE ones (pass 1), then any (pass 2).
2273	 */
2274	for (i = 0; i < 3; i++) {
2275		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
2276			cond_resched();
2277
2278			/*
2279			 * As we're holding no locks, these checks are racy,
2280			 * but we don't care.
2281			 */
2282			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
2283				continue;
2284			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
2285				continue;
2286			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
2287			if (rc == -EBUSY)
2288				continue;
2289			if (!rc)
2290				nb_bb--;
2291			if (rc || !nb_bb)
2292				return rc;
2293		}
2294		if (i == 0 && !unplug_online)
2295			return 0;
2296	}
2297
2298	return nb_bb ? -EBUSY : 0;
2299}
2300
2301/*
2302 * Try to unplug the requested amount of memory.
2303 */
2304static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
2305{
2306	if (vm->in_sbm)
2307		return virtio_mem_sbm_unplug_request(vm, diff);
2308	return virtio_mem_bbm_unplug_request(vm, diff);
2309}
2310
2311/*
2312 * Try to unplug all blocks that couldn't be unplugged before, for example,
2313 * because the hypervisor was busy. Further, offline and remove any
2314 * completely unplugged memory blocks where we previously failed to do so.
2315 */
2316static int virtio_mem_cleanup_pending_mb(struct virtio_mem *vm)
2317{
2318	unsigned long id;
2319	int rc = 0;
2320
2321	if (!vm->in_sbm) {
2322		virtio_mem_bbm_for_each_bb(vm, id,
2323					   VIRTIO_MEM_BBM_BB_PLUGGED) {
2324			rc = virtio_mem_bbm_unplug_bb(vm, id);
2325			if (rc)
2326				return rc;
2327			virtio_mem_bbm_set_bb_state(vm, id,
2328						    VIRTIO_MEM_BBM_BB_UNUSED);
2329		}
2330		return 0;
2331	}
2332
2333	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
2334		rc = virtio_mem_sbm_unplug_mb(vm, id);
2335		if (rc)
2336			return rc;
2337		virtio_mem_sbm_set_mb_state(vm, id,
2338					    VIRTIO_MEM_SBM_MB_UNUSED);
2339	}
2340
2341	if (!vm->sbm.have_unplugged_mb)
2342		return 0;
2343
2344	/*
2345	 * Let's retry (offlining and) removing completely unplugged Linux
2346	 * memory blocks.
2347	 */
2348	vm->sbm.have_unplugged_mb = false;
2349
2350	mutex_lock(&vm->hotplug_mutex);
2351	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL)
2352		rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2353	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL)
2354		rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2355	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
2356		rc |= virtio_mem_sbm_try_remove_unplugged_mb(vm, id);
2357	mutex_unlock(&vm->hotplug_mutex);
2358
2359	if (rc)
2360		vm->sbm.have_unplugged_mb = true;
2361	/* Ignore errors, this is not critical. We'll retry later. */
2362	return 0;
2363}
2364
2365/*
2366 * Update all parts of the config that could have changed.
2367 */
2368static void virtio_mem_refresh_config(struct virtio_mem *vm)
2369{
2370	const struct range pluggable_range = mhp_get_pluggable_range(true);
2371	uint64_t new_plugged_size, usable_region_size, end_addr;
2372
2373	/* the plugged_size is just a reflection of what _we_ did previously */
2374	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2375			&new_plugged_size);
2376	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
2377		vm->plugged_size = new_plugged_size;
2378
2379	/* calculate the last usable memory block id */
2380	virtio_cread_le(vm->vdev, struct virtio_mem_config,
2381			usable_region_size, &usable_region_size);
2382	end_addr = min(vm->addr + usable_region_size - 1,
2383		       pluggable_range.end);
2384
2385	if (vm->in_sbm) {
2386		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
2387		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
2388			vm->sbm.last_usable_mb_id--;
2389	} else {
2390		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
2391								     end_addr);
2392		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
2393			vm->bbm.last_usable_bb_id--;
2394	}
2395	/*
2396	 * If we cannot plug any of our device memory (e.g., nothing in the
2397	 * usable region is addressable), the last usable memory block id will
2398	 * be smaller than the first usable memory block id. We'll stop
2399	 * attempting to add memory with -ENOSPC from our main loop.
2400	 */
2401
2402	/* see if there is a request to change the size */
2403	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
2404			&vm->requested_size);
2405
2406	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
2407	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
2408}
2409
2410/*
2411 * Workqueue function for handling plug/unplug requests and config updates.
2412 */
2413static void virtio_mem_run_wq(struct work_struct *work)
2414{
2415	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
2416	uint64_t diff;
2417	int rc;
2418
2419	if (unlikely(vm->in_kdump)) {
2420		dev_warn_once(&vm->vdev->dev,
2421			     "unexpected workqueue run in kdump kernel\n");
2422		return;
2423	}
2424
2425	hrtimer_cancel(&vm->retry_timer);
2426
2427	if (vm->broken)
2428		return;
2429
2430	atomic_set(&vm->wq_active, 1);
2431retry:
2432	rc = 0;
2433
2434	/* Make sure we start with a clean state if there are leftovers. */
2435	if (unlikely(vm->unplug_all_required))
2436		rc = virtio_mem_send_unplug_all_request(vm);
2437
2438	if (atomic_read(&vm->config_changed)) {
2439		atomic_set(&vm->config_changed, 0);
2440		virtio_mem_refresh_config(vm);
2441	}
2442
2443	/* Cleanup any leftovers from previous runs */
2444	if (!rc)
2445		rc = virtio_mem_cleanup_pending_mb(vm);
2446
2447	if (!rc && vm->requested_size != vm->plugged_size) {
2448		if (vm->requested_size > vm->plugged_size) {
2449			diff = vm->requested_size - vm->plugged_size;
2450			rc = virtio_mem_plug_request(vm, diff);
2451		} else {
2452			diff = vm->plugged_size - vm->requested_size;
2453			rc = virtio_mem_unplug_request(vm, diff);
2454		}
2455	}
2456
2457	/*
2458	 * Keep retrying to offline and remove completely unplugged Linux
2459	 * memory blocks.
2460	 */
2461	if (!rc && vm->in_sbm && vm->sbm.have_unplugged_mb)
2462		rc = -EBUSY;
2463
2464	switch (rc) {
2465	case 0:
2466		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2467		break;
2468	case -ENOSPC:
2469		/*
2470		 * We cannot add any more memory (alignment, physical limit)
2471		 * or we have too many offline memory blocks.
2472		 */
2473		break;
2474	case -ETXTBSY:
2475		/*
2476		 * The hypervisor cannot process our request right now
2477		 * (e.g., out of memory, migrating).
2478		 */
2479	case -EBUSY:
2480		/*
2481		 * We cannot free up any memory to unplug it (all plugged memory
2482		 * is busy).
2483		 */
2484	case -ENOMEM:
2485		/* Out of memory, try again later. */
2486		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
2487			      HRTIMER_MODE_REL);
2488		break;
2489	case -EAGAIN:
2490		/* Retry immediately (e.g., the config changed). */
2491		goto retry;
2492	default:
2493		/* Unknown error, mark as broken */
2494		dev_err(&vm->vdev->dev,
2495			"unknown error, marking device broken: %d\n", rc);
2496		vm->broken = true;
2497	}
2498
2499	atomic_set(&vm->wq_active, 0);
2500}
2501
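/*
 * The retry timer implements exponential backoff: each expiry re-triggers
 * the workqueue and doubles the retry interval, capped at
 * VIRTIO_MEM_RETRY_TIMER_MAX_MS; a successful workqueue run resets it to
 * VIRTIO_MEM_RETRY_TIMER_MIN_MS.
 */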
2502static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
2503{
2504	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
2505					     retry_timer);
2506
2507	virtio_mem_retry(vm);
2508	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
2509				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
2510	return HRTIMER_NORESTART;
2511}
2512
2513static void virtio_mem_handle_response(struct virtqueue *vq)
2514{
2515	struct virtio_mem *vm = vq->vdev->priv;
2516
2517	wake_up(&vm->host_resp);
2518}
2519
2520static int virtio_mem_init_vq(struct virtio_mem *vm)
2521{
2522	struct virtqueue *vq;
2523
2524	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
2525				   "guest-request");
2526	if (IS_ERR(vq))
2527		return PTR_ERR(vq);
2528	vm->vq = vq;
2529
2530	return 0;
2531}
2532
2533static int virtio_mem_init_hotplug(struct virtio_mem *vm)
2534{
2535	const struct range pluggable_range = mhp_get_pluggable_range(true);
2536	uint64_t unit_pages, sb_size, addr;
2537	int rc;
2538
2539	/* bad device setup - warn only */
2540	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
2541		dev_warn(&vm->vdev->dev,
2542			 "The alignment of the physical start address can make some memory unusable.\n");
2543	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
2544		dev_warn(&vm->vdev->dev,
2545			 "The alignment of the physical end address can make some memory unusable.\n");
2546	if (vm->addr < pluggable_range.start ||
2547	    vm->addr + vm->region_size - 1 > pluggable_range.end)
2548		dev_warn(&vm->vdev->dev,
2549			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
2550
2551	/* Prepare the offline threshold - make sure we can add two blocks. */
2552	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
2553				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
2554
2555	/*
2556	 * alloc_contig_range() works reliably with pageblock
2557	 * granularity on ZONE_NORMAL, use pageblock_nr_pages.
2558	 */
2559	sb_size = PAGE_SIZE * pageblock_nr_pages;
2560	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
2561
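	/*
	 * Example (typical x86-64 configuration, for illustration): with
	 * 4 KiB pages and 512 pages per pageblock, sb_size is 2 MiB; with
	 * 128 MiB Linux memory blocks and a device block size of at most
	 * 2 MiB, SBM below ends up with sbs_per_mb = 64.
	 */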
2562	if (sb_size < memory_block_size_bytes() && !force_bbm) {
2563		/* SBM: At least two subblocks per Linux memory block. */
2564		vm->in_sbm = true;
2565		vm->sbm.sb_size = sb_size;
2566		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
2567				     vm->sbm.sb_size;
2568
2569		/* Round up to the next full memory block */
2570		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2571		       memory_block_size_bytes() - 1;
2572		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
2573		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
2574	} else {
2575		/* BBM: At least one Linux memory block. */
2576		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
2577					memory_block_size_bytes());
2578
2579		if (bbm_block_size) {
2580			if (!is_power_of_2(bbm_block_size)) {
2581				dev_warn(&vm->vdev->dev,
2582					 "bbm_block_size is not a power of 2");
2583			} else if (bbm_block_size < vm->bbm.bb_size) {
2584				dev_warn(&vm->vdev->dev,
2585					 "bbm_block_size is too small");
2586			} else {
2587				vm->bbm.bb_size = bbm_block_size;
2588			}
2589		}
2590
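		/*
		 * Example: booting with "virtio_mem.bbm_block_size=0x10000000"
		 * selects 256 MiB big blocks, provided that value is a power
		 * of two and not smaller than the auto-detected bb_size.
		 */
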
2591		/* Round up to the next aligned big block */
2592		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2593		       vm->bbm.bb_size - 1;
2594		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
2595		vm->bbm.next_bb_id = vm->bbm.first_bb_id;
2596
2597		/* Make sure we can add two big blocks. */
2598		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
2599					      vm->offline_threshold);
2600	}
2601
2602	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
2603		 memory_block_size_bytes());
2604	if (vm->in_sbm)
2605		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
2606			 (unsigned long long)vm->sbm.sb_size);
2607	else
2608		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
2609			 (unsigned long long)vm->bbm.bb_size);
2610
2611	/* create the parent resource for all memory */
2612	rc = virtio_mem_create_resource(vm);
2613	if (rc)
2614		return rc;
2615
2616	/* use a single dynamic memory group to cover the whole memory device */
2617	if (vm->in_sbm)
2618		unit_pages = PHYS_PFN(memory_block_size_bytes());
2619	else
2620		unit_pages = PHYS_PFN(vm->bbm.bb_size);
2621	rc = memory_group_register_dynamic(vm->nid, unit_pages);
2622	if (rc < 0)
2623		goto out_del_resource;
2624	vm->mgid = rc;
2625
2626	/*
2627	 * If we still have memory plugged, we have to unplug all memory first.
2628	 * Registering our parent resource makes sure that this memory isn't
2629	 * actually in use (e.g., trying to reload the driver).
2630	 */
2631	if (vm->plugged_size) {
2632		vm->unplug_all_required = true;
2633		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
2634	}
2635
2636	/* register callbacks */
2637	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
2638	rc = register_memory_notifier(&vm->memory_notifier);
2639	if (rc)
2640		goto out_unreg_group;
2641	/* Block hibernation as early as possible. */
2642	vm->pm_notifier.priority = INT_MAX;
2643	vm->pm_notifier.notifier_call = virtio_mem_pm_notifier_cb;
2644	rc = register_pm_notifier(&vm->pm_notifier);
2645	if (rc)
2646		goto out_unreg_mem;
2647	rc = register_virtio_mem_device(vm);
2648	if (rc)
2649		goto out_unreg_pm;
2650
2651	return 0;
2652out_unreg_pm:
2653	unregister_pm_notifier(&vm->pm_notifier);
2654out_unreg_mem:
2655	unregister_memory_notifier(&vm->memory_notifier);
2656out_unreg_group:
2657	memory_group_unregister(vm->mgid);
2658out_del_resource:
2659	virtio_mem_delete_resource(vm);
2660	return rc;
2661}
2662
2663#ifdef CONFIG_PROC_VMCORE
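/*
 * In the kdump kernel, query the plug state of a device block, so the
 * vmcore callback below can avoid reading unplugged (inaccessible) memory.
 */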
2664static int virtio_mem_send_state_request(struct virtio_mem *vm, uint64_t addr,
2665					 uint64_t size)
2666{
2667	const uint64_t nb_vm_blocks = size / vm->device_block_size;
2668	const struct virtio_mem_req req = {
2669		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_STATE),
2670		.u.state.addr = cpu_to_virtio64(vm->vdev, addr),
2671		.u.state.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
2672	};
2673	int rc = -ENOMEM;
2674
2675	dev_dbg(&vm->vdev->dev, "requesting state: 0x%llx - 0x%llx\n", addr,
2676		addr + size - 1);
2677
2678	switch (virtio_mem_send_request(vm, &req)) {
2679	case VIRTIO_MEM_RESP_ACK:
2680		return virtio16_to_cpu(vm->vdev, vm->resp.u.state.state);
2681	case VIRTIO_MEM_RESP_ERROR:
2682		rc = -EINVAL;
2683		break;
2684	default:
2685		break;
2686	}
2687
2688	dev_dbg(&vm->vdev->dev, "requesting state failed: %d\n", rc);
2689	return rc;
2690}
2691
2692static bool virtio_mem_vmcore_pfn_is_ram(struct vmcore_cb *cb,
2693					 unsigned long pfn)
2694{
2695	struct virtio_mem *vm = container_of(cb, struct virtio_mem,
2696					     vmcore_cb);
2697	uint64_t addr = PFN_PHYS(pfn);
2698	bool is_ram;
2699	int rc;
2700
2701	if (!virtio_mem_contains_range(vm, addr, PAGE_SIZE))
2702		return true;
2703	if (!vm->plugged_size)
2704		return false;
2705
2706	/*
2707	 * We have to serialize device requests and access to the information
2708	 * about the block queried last.
2709	 */
2710	mutex_lock(&vm->hotplug_mutex);
2711
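	/*
	 * The result of the last query is cached per device block;
	 * consecutive PFNs usually fall into the same block.
	 */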
2712	addr = ALIGN_DOWN(addr, vm->device_block_size);
2713	if (addr != vm->last_block_addr) {
2714		rc = virtio_mem_send_state_request(vm, addr,
2715						   vm->device_block_size);
2716		/* On any kind of error, we're going to signal !ram. */
2717		if (rc == VIRTIO_MEM_STATE_PLUGGED)
2718			vm->last_block_plugged = true;
2719		else
2720			vm->last_block_plugged = false;
2721		vm->last_block_addr = addr;
2722	}
2723
2724	is_ram = vm->last_block_plugged;
2725	mutex_unlock(&vm->hotplug_mutex);
2726	return is_ram;
2727}
2728#endif /* CONFIG_PROC_VMCORE */
2729
2730static int virtio_mem_init_kdump(struct virtio_mem *vm)
2731{
2732#ifdef CONFIG_PROC_VMCORE
2733	dev_info(&vm->vdev->dev, "memory hot(un)plug disabled in kdump kernel\n");
2734	vm->vmcore_cb.pfn_is_ram = virtio_mem_vmcore_pfn_is_ram;
2735	register_vmcore_cb(&vm->vmcore_cb);
2736	return 0;
2737#else /* CONFIG_PROC_VMCORE */
2738	dev_warn(&vm->vdev->dev, "disabled in kdump kernel without vmcore\n");
2739	return -EBUSY;
2740#endif /* CONFIG_PROC_VMCORE */
2741}
2742
2743static int virtio_mem_init(struct virtio_mem *vm)
2744{
2745	uint16_t node_id;
2746
2747	if (!vm->vdev->config->get) {
2748		dev_err(&vm->vdev->dev, "config access disabled\n");
2749		return -EINVAL;
2750	}
2751
2752	/* Fetch all properties that can't change. */
2753	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2754			&vm->plugged_size);
2755	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
2756			&vm->device_block_size);
2757	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
2758			&node_id);
2759	vm->nid = virtio_mem_translate_node_id(vm, node_id);
2760	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
2761	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
2762			&vm->region_size);
2763
2764	/* Determine the nid for the device based on the lowest address. */
2765	if (vm->nid == NUMA_NO_NODE)
2766		vm->nid = memory_add_physaddr_to_nid(vm->addr);
2767
2768	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
2769	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
2770	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
2771		 (unsigned long long)vm->device_block_size);
2772	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
2773		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
2774
2775	/*
2776	 * We don't want to (un)plug or reuse any memory when in kdump. The
2777	 * memory is still accessible (but not exposed to Linux).
2778	 */
2779	if (vm->in_kdump)
2780		return virtio_mem_init_kdump(vm);
2781	return virtio_mem_init_hotplug(vm);
2782}
2783
2784static int virtio_mem_create_resource(struct virtio_mem *vm)
2785{
2786	/*
2787	 * When force-unloading the driver and removing the device, we
2788	 * could have a garbage pointer. Duplicate the string.
2789	 */
2790	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
2791
2792	if (!name)
2793		return -ENOMEM;
2794
2795	/* Disallow mapping device memory via /dev/mem completely. */
2796	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
2797						   name, IORESOURCE_SYSTEM_RAM |
2798						   IORESOURCE_EXCLUSIVE);
2799	if (!vm->parent_resource) {
2800		kfree(name);
2801		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
2802		dev_info(&vm->vdev->dev,
2803			 "reloading the driver is not supported\n");
2804		return -EBUSY;
2805	}
2806
2807	/* The memory is not actually busy - make add_memory() work. */
2808	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
2809	return 0;
2810}
2811
2812static void virtio_mem_delete_resource(struct virtio_mem *vm)
2813{
2814	const char *name;
2815
2816	if (!vm->parent_resource)
2817		return;
2818
2819	name = vm->parent_resource->name;
2820	release_resource(vm->parent_resource);
2821	kfree(vm->parent_resource);
2822	kfree(name);
2823	vm->parent_resource = NULL;
2824}
2825
2826static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
2827{
2828	return 1;
2829}
2830
2831static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
2832{
2833	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
2834
2835	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
2836				   vm->addr + vm->region_size, NULL,
2837				   virtio_mem_range_has_system_ram) == 1;
2838}
2839
2840static int virtio_mem_probe(struct virtio_device *vdev)
2841{
2842	struct virtio_mem *vm;
2843	int rc;
2844
2845	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
2846	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
2847
2848	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2849	if (!vm)
2850		return -ENOMEM;
2851
2852	init_waitqueue_head(&vm->host_resp);
2853	vm->vdev = vdev;
2854	INIT_WORK(&vm->wq, virtio_mem_run_wq);
2855	mutex_init(&vm->hotplug_mutex);
2856	INIT_LIST_HEAD(&vm->next);
2857	spin_lock_init(&vm->removal_lock);
2858	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2859	vm->retry_timer.function = virtio_mem_timer_expired;
2860	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2861	vm->in_kdump = is_kdump_kernel();
2862
2863	/* register the virtqueue */
2864	rc = virtio_mem_init_vq(vm);
2865	if (rc)
2866		goto out_free_vm;
2867
2868	/* initialize the device by querying the config */
2869	rc = virtio_mem_init(vm);
2870	if (rc)
2871		goto out_del_vq;
2872
2873	virtio_device_ready(vdev);
2874
2875	/* trigger a config update to start processing the requested_size */
2876	if (!vm->in_kdump) {
2877		atomic_set(&vm->config_changed, 1);
2878		queue_work(system_freezable_wq, &vm->wq);
2879	}
2880
2881	return 0;
2882out_del_vq:
2883	vdev->config->del_vqs(vdev);
2884out_free_vm:
2885	kfree(vm);
2886	vdev->priv = NULL;
2887
2888	return rc;
2889}
2890
2891static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
2892{
2893	unsigned long mb_id;
2894	int rc;
2895
2896	/*
2897	 * Make sure the workqueue won't be triggered anymore and no memory
2898	 * blocks can be onlined/offlined until we're finished here.
2899	 */
2900	mutex_lock(&vm->hotplug_mutex);
2901	spin_lock_irq(&vm->removal_lock);
2902	vm->removing = true;
2903	spin_unlock_irq(&vm->removal_lock);
2904	mutex_unlock(&vm->hotplug_mutex);
2905
2906	/* wait until the workqueue stopped */
2907	cancel_work_sync(&vm->wq);
2908	hrtimer_cancel(&vm->retry_timer);
2909
2910	if (vm->in_sbm) {
2911		/*
2912		 * After we unregistered our callbacks, user space can online
2913		 * partially plugged offline blocks. Make sure to remove them.
2914		 */
2915		virtio_mem_sbm_for_each_mb(vm, mb_id,
2916					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
2917			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
2918			BUG_ON(rc);
2919			virtio_mem_sbm_set_mb_state(vm, mb_id,
2920						    VIRTIO_MEM_SBM_MB_UNUSED);
2921		}
2922		/*
2923		 * After we unregistered our callbacks, user space can no longer
2924		 * offline partially plugged online memory blocks. No need to
2925		 * worry about them.
2926		 */
2927	}
2928
2929	/* unregister callbacks */
2930	unregister_virtio_mem_device(vm);
2931	unregister_pm_notifier(&vm->pm_notifier);
2932	unregister_memory_notifier(&vm->memory_notifier);
2933
2934	/*
2935	 * There is no way we could reliably remove all memory we have added to
2936	 * the system. And there is no way to stop the driver/device from going
2937	 * away. Warn at least.
2938	 */
2939	if (virtio_mem_has_memory_added(vm)) {
2940		dev_warn(&vm->vdev->dev,
2941			 "device still has system memory added\n");
2942	} else {
2943		virtio_mem_delete_resource(vm);
2944		kfree_const(vm->resource_name);
2945		memory_group_unregister(vm->mgid);
2946	}
2947
2948	/* remove all tracking data - no locking needed */
2949	if (vm->in_sbm) {
2950		vfree(vm->sbm.mb_states);
2951		vfree(vm->sbm.sb_states);
2952	} else {
2953		vfree(vm->bbm.bb_states);
2954	}
2955}
2956
2957static void virtio_mem_deinit_kdump(struct virtio_mem *vm)
2958{
2959#ifdef CONFIG_PROC_VMCORE
2960	unregister_vmcore_cb(&vm->vmcore_cb);
2961#endif /* CONFIG_PROC_VMCORE */
2962}
2963
2964static void virtio_mem_remove(struct virtio_device *vdev)
2965{
2966	struct virtio_mem *vm = vdev->priv;
2967
2968	if (vm->in_kdump)
2969		virtio_mem_deinit_kdump(vm);
2970	else
2971		virtio_mem_deinit_hotplug(vm);
2972
2973	/* reset the device and cleanup the queues */
2974	virtio_reset_device(vdev);
2975	vdev->config->del_vqs(vdev);
2976
2977	kfree(vm);
2978	vdev->priv = NULL;
2979}
2980
2981static void virtio_mem_config_changed(struct virtio_device *vdev)
2982{
2983	struct virtio_mem *vm = vdev->priv;
2984
2985	if (unlikely(vm->in_kdump))
2986		return;
2987
2988	atomic_set(&vm->config_changed, 1);
2989	virtio_mem_retry(vm);
2990}
2991
2992#ifdef CONFIG_PM_SLEEP
2993static int virtio_mem_freeze(struct virtio_device *vdev)
2994{
2995	struct virtio_mem *vm = vdev->priv;
2996
2997	/*
2998	 * We block hibernation using the PM notifier completely. The workqueue
2999	 * is already frozen by the PM core at this point, so we simply
3000	 * reset the device and cleanup the queues.
3001	 */
3002	if (pm_suspend_target_state != PM_SUSPEND_TO_IDLE &&
3003	    vm->plugged_size &&
3004	    !virtio_has_feature(vm->vdev, VIRTIO_MEM_F_PERSISTENT_SUSPEND)) {
3005		dev_err(&vm->vdev->dev,
3006			"suspending with plugged memory is not supported\n");
3007		return -EPERM;
3008	}
3009
3010	virtio_reset_device(vdev);
3011	vdev->config->del_vqs(vdev);
3012	vm->vq = NULL;
3013	return 0;
3014}
3015
3016static int virtio_mem_restore(struct virtio_device *vdev)
3017{
3018	struct virtio_mem *vm = vdev->priv;
3019	int ret;
3020
3021	ret = virtio_mem_init_vq(vm);
3022	if (ret)
3023		return ret;
3024	virtio_device_ready(vdev);
3025
3026	/* Let's check if anything changed. */
3027	virtio_mem_config_changed(vdev);
3028	return 0;
3029}
3030#endif
3031
3032static unsigned int virtio_mem_features[] = {
3033#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
3034	VIRTIO_MEM_F_ACPI_PXM,
3035#endif
3036	VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
3037	VIRTIO_MEM_F_PERSISTENT_SUSPEND,
3038};
3039
3040static const struct virtio_device_id virtio_mem_id_table[] = {
3041	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
3042	{ 0 },
3043};
3044
3045static struct virtio_driver virtio_mem_driver = {
3046	.feature_table = virtio_mem_features,
3047	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
3048	.driver.name = KBUILD_MODNAME,
3049	.id_table = virtio_mem_id_table,
3050	.probe = virtio_mem_probe,
3051	.remove = virtio_mem_remove,
3052	.config_changed = virtio_mem_config_changed,
3053#ifdef CONFIG_PM_SLEEP
3054	.freeze	=	virtio_mem_freeze,
3055	.restore =	virtio_mem_restore,
3056#endif
3057};
3058
3059module_virtio_driver(virtio_mem_driver);
3060MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
3061MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
3062MODULE_DESCRIPTION("Virtio-mem driver");
3063MODULE_LICENSE("GPL");