// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Virtio-mem device driver.
 *
 * Copyright Red Hat, Inc. 2020
 *
 * Author(s): David Hildenbrand <david@redhat.com>
 */

#include <linux/virtio.h>
#include <linux/virtio_mem.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/hrtimer.h>
#include <linux/crash_dump.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
#include <linux/lockdep.h>

#include <acpi/acpi_numa.h>

static bool unplug_online = true;
module_param(unplug_online, bool, 0644);
MODULE_PARM_DESC(unplug_online, "Try to unplug online memory");

static bool force_bbm;
module_param(force_bbm, bool, 0444);
MODULE_PARM_DESC(force_bbm,
		"Force Big Block Mode. Default is 0 (auto-selection)");

static unsigned long bbm_block_size;
module_param(bbm_block_size, ulong, 0444);
MODULE_PARM_DESC(bbm_block_size,
		 "Big Block size in bytes. Default is 0 (auto-detection).");

static bool bbm_safe_unplug = true;
module_param(bbm_safe_unplug, bool, 0444);
MODULE_PARM_DESC(bbm_safe_unplug,
	     "Use a safe unplug mechanism in BBM, avoiding long/endless loops");
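
/*
 * Illustrative usage (example values only, not a recommendation):
 *
 *   modprobe virtio_mem force_bbm=1 bbm_block_size=0x40000000
 *
 * would force Big Block Mode with 1 GiB big blocks, assuming the device
 * and platform support that granularity.
 */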
  44
  45/*
  46 * virtio-mem currently supports the following modes of operation:
  47 *
  48 * * Sub Block Mode (SBM): A Linux memory block spans 2..X subblocks (SB). The
  49 *   size of a Sub Block (SB) is determined based on the device block size, the
  50 *   pageblock size, and the maximum allocation granularity of the buddy.
  51 *   Subblocks within a Linux memory block might either be plugged or unplugged.
  52 *   Memory is added/removed to Linux MM in Linux memory block granularity.
  53 *
  54 * * Big Block Mode (BBM): A Big Block (BB) spans 1..X Linux memory blocks.
  55 *   Memory is added/removed to Linux MM in Big Block granularity.
  56 *
  57 * The mode is determined automatically based on the Linux memory block size
  58 * and the device block size.
  59 *
  60 * User space / core MM (auto onlining) is responsible for onlining added
  61 * Linux memory blocks - and for selecting a zone. Linux Memory Blocks are
  62 * always onlined separately, and all memory within a Linux memory block is
  63 * onlined to the same zone - virtio-mem relies on this behavior.
  64 */
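
/*
 * Example (illustrative, typical x86-64 values): with a 128 MiB Linux
 * memory block size and a 2 MiB device block size, SBM is used. The
 * subblock size is the largest of the device block size, the pageblock
 * size and the maximum buddy allocation granularity - e.g., 4 MiB,
 * yielding 32 subblocks per memory block. A 1 GiB device block size
 * instead results in BBM with (at least) 1 GiB big blocks.
 */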

/*
 * State of a Linux memory block in SBM.
 */
enum virtio_mem_sbm_mb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_SBM_MB_UNUSED = 0,
	/* (Partially) plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_SBM_MB_PLUGGED,
	/* Fully plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE,
	/* Partially plugged, fully added to Linux, offline. */
	VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL,
	/* Partially plugged, fully added to Linux, onlined to a kernel zone. */
	VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
	/* Fully plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE,
	/* Partially plugged, fully added to Linux, onlined to ZONE_MOVABLE. */
	VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
	VIRTIO_MEM_SBM_MB_COUNT
};
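
/*
 * Sketch of the common SBM state transitions (simplified):
 *
 *   UNUSED   -> OFFLINE / OFFLINE_PARTIAL  (subblocks plugged, add_memory())
 *   OFFLINE* -> KERNEL* / MOVABLE*         (memory block onlined)
 *   KERNEL* / MOVABLE* -> OFFLINE*         (memory block offlined)
 *   OFFLINE* -> UNUSED                     (all subblocks unplugged, removed)
 *
 * PLUGGED is the error case: add_memory() failed and unplugging failed as
 * well; such blocks are dealt with again later.
 */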

/*
 * State of a Big Block (BB) in BBM, covering 1..X Linux memory blocks.
 */
enum virtio_mem_bbm_bb_state {
	/* Unplugged, not added to Linux. Can be reused later. */
	VIRTIO_MEM_BBM_BB_UNUSED = 0,
	/* Plugged, not added to Linux. Error on add_memory(). */
	VIRTIO_MEM_BBM_BB_PLUGGED,
	/* Plugged and added to Linux. */
	VIRTIO_MEM_BBM_BB_ADDED,
	/* All online parts are fake-offline, ready to remove. */
	VIRTIO_MEM_BBM_BB_FAKE_OFFLINE,
	VIRTIO_MEM_BBM_BB_COUNT
};

struct virtio_mem {
	struct virtio_device *vdev;

	/* We might first have to unplug all memory when starting up. */
	bool unplug_all_required;

	/* Workqueue that processes the plug/unplug requests. */
	struct work_struct wq;
	atomic_t wq_active;
	atomic_t config_changed;

	/* Virtqueue for guest->host requests. */
	struct virtqueue *vq;

	/* Wait for a host response to a guest request. */
	wait_queue_head_t host_resp;

	/* Space for one guest request and the host response. */
	struct virtio_mem_req req;
	struct virtio_mem_resp resp;

	/* The current size of the device. */
	uint64_t plugged_size;
	/* The requested size of the device. */
	uint64_t requested_size;

	/* The device block size (for communicating with the device). */
	uint64_t device_block_size;
	/* The determined node id for all memory of the device. */
	int nid;
	/* Physical start address of the memory region. */
	uint64_t addr;
	/* Maximum region size in bytes. */
	uint64_t region_size;

	/* The parent resource for all memory added via this device. */
	struct resource *parent_resource;
	/*
	 * Copy of "System RAM (virtio_mem)" to be used for
	 * add_memory_driver_managed().
	 */
	const char *resource_name;

	/*
	 * We don't want to add too much memory if it's not getting onlined,
	 * to avoid running OOM. Besides this threshold, we allow to have at
	 * least two offline blocks at a time (whatever is bigger).
	 */
#define VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD		(1024 * 1024 * 1024)
	atomic64_t offline_size;
	uint64_t offline_threshold;

	/* If set, the driver is in SBM, otherwise in BBM. */
	bool in_sbm;

	union {
		struct {
			/* Id of the first memory block of this device. */
			unsigned long first_mb_id;
			/* Id of the last usable memory block of this device. */
			unsigned long last_usable_mb_id;
			/* Id of the next memory block to prepare when needed. */
			unsigned long next_mb_id;

			/* The subblock size. */
			uint64_t sb_size;
			/* The number of subblocks per Linux memory block. */
			uint32_t sbs_per_mb;

			/* Summary of all memory block states. */
			unsigned long mb_count[VIRTIO_MEM_SBM_MB_COUNT];

			/*
			 * One byte state per memory block. Allocated via
			 * vmalloc(). Resized (alloc+copy+free) on demand.
			 *
			 * With 128 MiB memory blocks, we have states for 512
			 * GiB of memory in one 4 KiB page.
			 */
			uint8_t *mb_states;

			/*
			 * Bitmap: one bit per subblock. Allocated similar to
			 * sbm.mb_states.
			 *
			 * A set bit means the corresponding subblock is
			 * plugged, otherwise it's unplugged.
			 *
			 * With 4 MiB subblocks, we manage 128 GiB of memory
			 * in one 4 KiB page.
			 */
			unsigned long *sb_states;
		} sbm;

		struct {
			/* Id of the first big block of this device. */
			unsigned long first_bb_id;
			/* Id of the last usable big block of this device. */
			unsigned long last_usable_bb_id;
			/* Id of the next big block to prepare when needed. */
			unsigned long next_bb_id;

			/* Summary of all big block states. */
			unsigned long bb_count[VIRTIO_MEM_BBM_BB_COUNT];

			/* One byte state per big block. See sbm.mb_states. */
			uint8_t *bb_states;

			/* The block size used for plugging/adding/removing. */
			uint64_t bb_size;
		} bbm;
	};

	/*
	 * Mutex that protects the sbm.mb_count, sbm.mb_states,
	 * sbm.sb_states, bbm.bb_count, and bbm.bb_states.
	 *
	 * When this lock is held the pointers can't change, ONLINE and
	 * OFFLINE blocks can't change the state and no subblocks will get
	 * plugged/unplugged.
	 */
	struct mutex hotplug_mutex;
	bool hotplug_active;

	/* An error occurred we cannot handle - stop processing requests. */
	bool broken;

	/* The driver is being removed. */
	spinlock_t removal_lock;
	bool removing;

	/* Timer for retrying to plug/unplug memory. */
	struct hrtimer retry_timer;
	unsigned int retry_timer_ms;
#define VIRTIO_MEM_RETRY_TIMER_MIN_MS		50000
#define VIRTIO_MEM_RETRY_TIMER_MAX_MS		300000

	/* Memory notifier (online/offline events). */
	struct notifier_block memory_notifier;

	/* Next device in the list of virtio-mem devices. */
	struct list_head next;
};

/*
 * We have to share a single online_page callback among all virtio-mem
 * devices. We use RCU to iterate the list in the callback.
 */
static DEFINE_MUTEX(virtio_mem_mutex);
static LIST_HEAD(virtio_mem_devices);

static void virtio_mem_online_page_cb(struct page *page, unsigned int order);
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages);
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages);
static void virtio_mem_retry(struct virtio_mem *vm);

/*
 * Register a virtio-mem device so it will be considered for the online_page
 * callback.
 */
static int register_virtio_mem_device(struct virtio_mem *vm)
{
	int rc = 0;

	/* First device registers the callback. */
	mutex_lock(&virtio_mem_mutex);
	if (list_empty(&virtio_mem_devices))
		rc = set_online_page_callback(&virtio_mem_online_page_cb);
	if (!rc)
		list_add_rcu(&vm->next, &virtio_mem_devices);
	mutex_unlock(&virtio_mem_mutex);

	return rc;
}

/*
 * Unregister a virtio-mem device so it will no longer be considered for the
 * online_page callback.
 */
static void unregister_virtio_mem_device(struct virtio_mem *vm)
{
	/* Last device unregisters the callback. */
	mutex_lock(&virtio_mem_mutex);
	list_del_rcu(&vm->next);
	if (list_empty(&virtio_mem_devices))
		restore_online_page_callback(&virtio_mem_online_page_cb);
	mutex_unlock(&virtio_mem_mutex);

	synchronize_rcu();
}

/*
 * Calculate the memory block id of a given address.
 */
static unsigned long virtio_mem_phys_to_mb_id(unsigned long addr)
{
	return addr / memory_block_size_bytes();
}

/*
 * Calculate the physical start address of a given memory block id.
 */
static unsigned long virtio_mem_mb_id_to_phys(unsigned long mb_id)
{
	return mb_id * memory_block_size_bytes();
}

/*
 * Calculate the big block id of a given address.
 */
static unsigned long virtio_mem_phys_to_bb_id(struct virtio_mem *vm,
					      uint64_t addr)
{
	return addr / vm->bbm.bb_size;
}

/*
 * Calculate the physical start address of a given big block id.
 */
static uint64_t virtio_mem_bb_id_to_phys(struct virtio_mem *vm,
					 unsigned long bb_id)
{
	return bb_id * vm->bbm.bb_size;
}

/*
 * Calculate the subblock id of a given address.
 */
static unsigned long virtio_mem_phys_to_sb_id(struct virtio_mem *vm,
					      unsigned long addr)
{
	const unsigned long mb_id = virtio_mem_phys_to_mb_id(addr);
	const unsigned long mb_addr = virtio_mem_mb_id_to_phys(mb_id);

	return (addr - mb_addr) / vm->sbm.sb_size;
}
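
/*
 * Example (illustrative): with 128 MiB Linux memory blocks and a 4 MiB
 * subblock size, addr = 0x12c00000 (300 MiB) maps to mb_id = 2 (the block
 * starting at 256 MiB) and, within that block, to
 * sb_id = (300 MiB - 256 MiB) / 4 MiB = 11.
 */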

/*
 * Set the state of a big block, taking care of the state counter.
 */
static void virtio_mem_bbm_set_bb_state(struct virtio_mem *vm,
					unsigned long bb_id,
					enum virtio_mem_bbm_bb_state state)
{
	const unsigned long idx = bb_id - vm->bbm.first_bb_id;
	enum virtio_mem_bbm_bb_state old_state;

	old_state = vm->bbm.bb_states[idx];
	vm->bbm.bb_states[idx] = state;

	BUG_ON(vm->bbm.bb_count[old_state] == 0);
	vm->bbm.bb_count[old_state]--;
	vm->bbm.bb_count[state]++;
}

/*
 * Get the state of a big block.
 */
static enum virtio_mem_bbm_bb_state virtio_mem_bbm_get_bb_state(struct virtio_mem *vm,
								unsigned long bb_id)
{
	return vm->bbm.bb_states[bb_id - vm->bbm.first_bb_id];
}

/*
 * Prepare the big block state array for the next big block.
 */
static int virtio_mem_bbm_bb_states_prepare_next_bb(struct virtio_mem *vm)
{
	unsigned long old_bytes = vm->bbm.next_bb_id - vm->bbm.first_bb_id;
	unsigned long new_bytes = old_bytes + 1;
	int old_pages = PFN_UP(old_bytes);
	int new_pages = PFN_UP(new_bytes);
	uint8_t *new_array;

	if (vm->bbm.bb_states && old_pages == new_pages)
		return 0;

	new_array = vzalloc(new_pages * PAGE_SIZE);
	if (!new_array)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->bbm.bb_states)
		memcpy(new_array, vm->bbm.bb_states, old_pages * PAGE_SIZE);
	vfree(vm->bbm.bb_states);
	vm->bbm.bb_states = new_array;
	mutex_unlock(&vm->hotplug_mutex);

	return 0;
}
#define virtio_mem_bbm_for_each_bb(_vm, _bb_id, _state) \
	for (_bb_id = _vm->bbm.first_bb_id; \
	     _bb_id < _vm->bbm.next_bb_id && _vm->bbm.bb_count[_state]; \
	     _bb_id++) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)

#define virtio_mem_bbm_for_each_bb_rev(_vm, _bb_id, _state) \
	for (_bb_id = _vm->bbm.next_bb_id - 1; \
	     _bb_id >= _vm->bbm.first_bb_id && _vm->bbm.bb_count[_state]; \
	     _bb_id--) \
		if (virtio_mem_bbm_get_bb_state(_vm, _bb_id) == _state)
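
/*
 * Usage sketch (illustrative):
 *
 *	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
 *		...
 *	}
 *
 * iterates all big blocks currently in the given state; the _rev variant
 * walks from the highest id downwards. The bb_count[] check lets both
 * loops terminate early once no more blocks in that state can exist.
 */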

/*
 * Set the state of a memory block, taking care of the state counter.
 */
static void virtio_mem_sbm_set_mb_state(struct virtio_mem *vm,
					unsigned long mb_id, uint8_t state)
{
	const unsigned long idx = mb_id - vm->sbm.first_mb_id;
	uint8_t old_state;

	old_state = vm->sbm.mb_states[idx];
	vm->sbm.mb_states[idx] = state;

	BUG_ON(vm->sbm.mb_count[old_state] == 0);
	vm->sbm.mb_count[old_state]--;
	vm->sbm.mb_count[state]++;
}

/*
 * Get the state of a memory block.
 */
static uint8_t virtio_mem_sbm_get_mb_state(struct virtio_mem *vm,
					   unsigned long mb_id)
{
	const unsigned long idx = mb_id - vm->sbm.first_mb_id;

	return vm->sbm.mb_states[idx];
}

/*
 * Prepare the state array for the next memory block.
 */
static int virtio_mem_sbm_mb_states_prepare_next_mb(struct virtio_mem *vm)
{
	int old_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id);
	int new_pages = PFN_UP(vm->sbm.next_mb_id - vm->sbm.first_mb_id + 1);
	uint8_t *new_array;

	if (vm->sbm.mb_states && old_pages == new_pages)
		return 0;

	new_array = vzalloc(new_pages * PAGE_SIZE);
	if (!new_array)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->sbm.mb_states)
		memcpy(new_array, vm->sbm.mb_states, old_pages * PAGE_SIZE);
	vfree(vm->sbm.mb_states);
	vm->sbm.mb_states = new_array;
	mutex_unlock(&vm->hotplug_mutex);

	return 0;
}

#define virtio_mem_sbm_for_each_mb(_vm, _mb_id, _state) \
	for (_mb_id = _vm->sbm.first_mb_id; \
	     _mb_id < _vm->sbm.next_mb_id && _vm->sbm.mb_count[_state]; \
	     _mb_id++) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)

#define virtio_mem_sbm_for_each_mb_rev(_vm, _mb_id, _state) \
	for (_mb_id = _vm->sbm.next_mb_id - 1; \
	     _mb_id >= _vm->sbm.first_mb_id && _vm->sbm.mb_count[_state]; \
	     _mb_id--) \
		if (virtio_mem_sbm_get_mb_state(_vm, _mb_id) == _state)

/*
 * Calculate the bit number in the subblock bitmap for the given subblock
 * inside the given memory block.
 */
static int virtio_mem_sbm_sb_state_bit_nr(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id)
{
	return (mb_id - vm->sbm.first_mb_id) * vm->sbm.sbs_per_mb + sb_id;
}

/*
 * Mark all selected subblocks plugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_sbm_set_sb_plugged(struct virtio_mem *vm,
					  unsigned long mb_id, int sb_id,
					  int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	__bitmap_set(vm->sbm.sb_states, bit, count);
}

/*
 * Mark all selected subblocks unplugged.
 *
 * Will not modify the state of the memory block.
 */
static void virtio_mem_sbm_set_sb_unplugged(struct virtio_mem *vm,
					    unsigned long mb_id, int sb_id,
					    int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	__bitmap_clear(vm->sbm.sb_states, bit, count);
}

/*
 * Test if all selected subblocks are plugged.
 */
static bool virtio_mem_sbm_test_sb_plugged(struct virtio_mem *vm,
					   unsigned long mb_id, int sb_id,
					   int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	if (count == 1)
		return test_bit(bit, vm->sbm.sb_states);

	/* TODO: Helper similar to bitmap_set() */
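	/*
	 * All bits in [bit, bit + count) are set iff no zero bit is found
	 * before bit + count.
	 */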
	return find_next_zero_bit(vm->sbm.sb_states, bit + count, bit) >=
	       bit + count;
}

/*
 * Test if all selected subblocks are unplugged.
 */
static bool virtio_mem_sbm_test_sb_unplugged(struct virtio_mem *vm,
					     unsigned long mb_id, int sb_id,
					     int count)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, sb_id);

	/* TODO: Helper similar to bitmap_set() */
	return find_next_bit(vm->sbm.sb_states, bit + count, bit) >=
	       bit + count;
}

/*
 * Find the first unplugged subblock. Returns vm->sbm.sbs_per_mb in case there is
 * none.
 */
static int virtio_mem_sbm_first_unplugged_sb(struct virtio_mem *vm,
					    unsigned long mb_id)
{
	const int bit = virtio_mem_sbm_sb_state_bit_nr(vm, mb_id, 0);

	return find_next_zero_bit(vm->sbm.sb_states,
				  bit + vm->sbm.sbs_per_mb, bit) - bit;
}

/*
 * Prepare the subblock bitmap for the next memory block.
 */
static int virtio_mem_sbm_sb_states_prepare_next_mb(struct virtio_mem *vm)
{
	const unsigned long old_nb_mb = vm->sbm.next_mb_id - vm->sbm.first_mb_id;
	const unsigned long old_nb_bits = old_nb_mb * vm->sbm.sbs_per_mb;
	const unsigned long new_nb_bits = (old_nb_mb + 1) * vm->sbm.sbs_per_mb;
	int old_pages = PFN_UP(BITS_TO_LONGS(old_nb_bits) * sizeof(long));
	int new_pages = PFN_UP(BITS_TO_LONGS(new_nb_bits) * sizeof(long));
	unsigned long *new_bitmap, *old_bitmap;

	if (vm->sbm.sb_states && old_pages == new_pages)
		return 0;

	new_bitmap = vzalloc(new_pages * PAGE_SIZE);
	if (!new_bitmap)
		return -ENOMEM;

	mutex_lock(&vm->hotplug_mutex);
	if (vm->sbm.sb_states)
		memcpy(new_bitmap, vm->sbm.sb_states, old_pages * PAGE_SIZE);

	old_bitmap = vm->sbm.sb_states;
	vm->sbm.sb_states = new_bitmap;
	mutex_unlock(&vm->hotplug_mutex);

	vfree(old_bitmap);
	return 0;
}

/*
 * Test if we could add memory without creating too much offline memory -
 * to avoid running OOM if memory is getting onlined deferred.
 */
static bool virtio_mem_could_add_memory(struct virtio_mem *vm, uint64_t size)
{
	if (WARN_ON_ONCE(size > vm->offline_threshold))
		return false;

	return atomic64_read(&vm->offline_size) + size <= vm->offline_threshold;
}
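
/*
 * Example (illustrative): with the default 1 GiB offline threshold and
 * 128 MiB Linux memory blocks, up to 8 fully offline blocks may be added
 * before the driver waits for some of them to get onlined.
 */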

/*
 * Try adding memory to Linux. Will usually only fail if out of memory.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_add_memory(struct virtio_mem *vm, uint64_t addr,
				 uint64_t size)
{
	int rc;

	/*
	 * When force-unloading the driver and we still have memory added to
	 * Linux, the resource name has to stay.
	 */
	if (!vm->resource_name) {
		vm->resource_name = kstrdup_const("System RAM (virtio_mem)",
						  GFP_KERNEL);
		if (!vm->resource_name)
			return -ENOMEM;
	}

	dev_dbg(&vm->vdev->dev, "adding memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);
	/* Memory might get onlined immediately. */
	atomic64_add(size, &vm->offline_size);
	rc = add_memory_driver_managed(vm->nid, addr, size, vm->resource_name,
				       MHP_MERGE_RESOURCE);
	if (rc) {
		atomic64_sub(size, &vm->offline_size);
		dev_warn(&vm->vdev->dev, "adding memory failed: %d\n", rc);
		/*
		 * TODO: Linux MM does not properly clean up yet in all cases
		 * where adding of memory failed - especially on -ENOMEM.
		 */
	}
	return rc;
}

/*
 * See virtio_mem_add_memory(): Try adding a single Linux memory block.
 */
static int virtio_mem_sbm_add_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_add_memory(vm, addr, size);
}

/*
 * See virtio_mem_add_memory(): Try adding a big block.
 */
static int virtio_mem_bbm_add_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_add_memory(vm, addr, size);
}

/*
 * Try removing memory from Linux. Will only fail if memory blocks aren't
 * offline.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_remove_memory(struct virtio_mem *vm, uint64_t addr,
				    uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev, "removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);
	rc = remove_memory(vm->nid, addr, size);
	if (!rc) {
		atomic64_sub(size, &vm->offline_size);
		/*
		 * We might have freed up memory we can now unplug, retry
		 * immediately instead of waiting.
		 */
		virtio_mem_retry(vm);
	} else {
		dev_dbg(&vm->vdev->dev, "removing memory failed: %d\n", rc);
	}
	return rc;
}

/*
 * See virtio_mem_remove_memory(): Try removing a single Linux memory block.
 */
static int virtio_mem_sbm_remove_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_remove_memory(vm, addr, size);
}

/*
 * Try offlining and removing memory from Linux.
 *
 * Must not be called with the vm->hotplug_mutex held (possible deadlock with
 * onlining code).
 *
 * Will not modify the state of memory blocks in virtio-mem.
 */
static int virtio_mem_offline_and_remove_memory(struct virtio_mem *vm,
						uint64_t addr,
						uint64_t size)
{
	int rc;

	dev_dbg(&vm->vdev->dev,
		"offlining and removing memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	rc = offline_and_remove_memory(vm->nid, addr, size);
	if (!rc) {
		atomic64_sub(size, &vm->offline_size);
		/*
		 * We might have freed up memory we can now unplug, retry
		 * immediately instead of waiting.
		 */
		virtio_mem_retry(vm);
	} else {
		dev_dbg(&vm->vdev->dev,
			"offlining and removing memory failed: %d\n", rc);
	}
	return rc;
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try offlining and removing
 * a single Linux memory block.
 */
static int virtio_mem_sbm_offline_and_remove_mb(struct virtio_mem *vm,
						unsigned long mb_id)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id);
	const uint64_t size = memory_block_size_bytes();

	return virtio_mem_offline_and_remove_memory(vm, addr, size);
}

/*
 * See virtio_mem_offline_and_remove_memory(): Try to offline and remove
 * all Linux memory blocks covered by the big block.
 */
static int virtio_mem_bbm_offline_and_remove_bb(struct virtio_mem *vm,
						unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_offline_and_remove_memory(vm, addr, size);
}

/*
 * Trigger the workqueue so the device can perform its magic.
 */
static void virtio_mem_retry(struct virtio_mem *vm)
{
	unsigned long flags;

	spin_lock_irqsave(&vm->removal_lock, flags);
	if (!vm->removing)
		queue_work(system_freezable_wq, &vm->wq);
	spin_unlock_irqrestore(&vm->removal_lock, flags);
}

static int virtio_mem_translate_node_id(struct virtio_mem *vm, uint16_t node_id)
{
	int node = NUMA_NO_NODE;

#if defined(CONFIG_ACPI_NUMA)
	if (virtio_has_feature(vm->vdev, VIRTIO_MEM_F_ACPI_PXM))
		node = pxm_to_node(node_id);
#endif
	return node;
}

/*
 * Test if a virtio-mem device overlaps with the given range. Can be called
 * from (notifier) callbacks lockless.
 */
static bool virtio_mem_overlaps_range(struct virtio_mem *vm, uint64_t start,
				      uint64_t size)
{
	return start < vm->addr + vm->region_size && vm->addr < start + size;
}

/*
 * Test if a virtio-mem device contains a given range. Can be called from
 * (notifier) callbacks lockless.
 */
static bool virtio_mem_contains_range(struct virtio_mem *vm, uint64_t start,
				      uint64_t size)
{
	return start >= vm->addr && start + size <= vm->addr + vm->region_size;
}

static int virtio_mem_sbm_notify_going_online(struct virtio_mem *vm,
					      unsigned long mb_id)
{
	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		return NOTIFY_OK;
	default:
		break;
	}
	dev_warn_ratelimited(&vm->vdev->dev,
			     "memory block onlining denied\n");
	return NOTIFY_BAD;
}

static void virtio_mem_sbm_notify_offline(struct virtio_mem *vm,
					  unsigned long mb_id)
{
	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
		break;
	case VIRTIO_MEM_SBM_MB_KERNEL:
	case VIRTIO_MEM_SBM_MB_MOVABLE:
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE);
		break;
	default:
		BUG();
		break;
	}
}

static void virtio_mem_sbm_notify_online(struct virtio_mem *vm,
					 unsigned long mb_id,
					 unsigned long start_pfn)
{
	const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) ==
				ZONE_MOVABLE;
	int new_state;

	switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) {
	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
		new_state = VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL;
		if (is_movable)
			new_state = VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL;
		break;
	case VIRTIO_MEM_SBM_MB_OFFLINE:
		new_state = VIRTIO_MEM_SBM_MB_KERNEL;
		if (is_movable)
			new_state = VIRTIO_MEM_SBM_MB_MOVABLE;
		break;
	default:
		BUG();
		break;
	}
	virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
}

static void virtio_mem_sbm_notify_going_offline(struct virtio_mem *vm,
						unsigned long mb_id)
{
	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
	unsigned long pfn;
	int sb_id;

	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			continue;
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		virtio_mem_fake_offline_going_offline(pfn, nr_pages);
	}
}

static void virtio_mem_sbm_notify_cancel_offline(struct virtio_mem *vm,
						 unsigned long mb_id)
{
	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size);
	unsigned long pfn;
	int sb_id;

	for (sb_id = 0; sb_id < vm->sbm.sbs_per_mb; sb_id++) {
		if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
			continue;
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
	}
}

static void virtio_mem_bbm_notify_going_offline(struct virtio_mem *vm,
						unsigned long bb_id,
						unsigned long pfn,
						unsigned long nr_pages)
{
	/*
	 * When marked as "fake-offline", all online memory of this device block
	 * is allocated by us. Otherwise, we don't have any memory allocated.
	 */
	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
		return;
	virtio_mem_fake_offline_going_offline(pfn, nr_pages);
}

static void virtio_mem_bbm_notify_cancel_offline(struct virtio_mem *vm,
						 unsigned long bb_id,
						 unsigned long pfn,
						 unsigned long nr_pages)
{
	if (virtio_mem_bbm_get_bb_state(vm, bb_id) !=
	    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE)
		return;
	virtio_mem_fake_offline_cancel_offline(pfn, nr_pages);
}

/*
 * This callback will either be called synchronously from add_memory() or
 * asynchronously (e.g., triggered via user space). We have to be careful
 * with locking when calling add_memory().
 */
static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
					 unsigned long action, void *arg)
{
	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
					     memory_notifier);
	struct memory_notify *mhp = arg;
	const unsigned long start = PFN_PHYS(mhp->start_pfn);
	const unsigned long size = PFN_PHYS(mhp->nr_pages);
	int rc = NOTIFY_OK;
	unsigned long id;

	if (!virtio_mem_overlaps_range(vm, start, size))
		return NOTIFY_DONE;

	if (vm->in_sbm) {
		id = virtio_mem_phys_to_mb_id(start);
		/*
		 * In SBM, we add memory in separate memory blocks - we expect
		 * it to be onlined/offlined in the same granularity. Bail out
		 * if this ever changes.
		 */
		if (WARN_ON_ONCE(size != memory_block_size_bytes() ||
				 !IS_ALIGNED(start, memory_block_size_bytes())))
			return NOTIFY_BAD;
	} else {
		id = virtio_mem_phys_to_bb_id(vm, start);
		/*
		 * In BBM, we only care about onlining/offlining happening
		 * within a single big block, we don't care about the
		 * actual granularity as we don't track individual Linux
		 * memory blocks.
		 */
		if (WARN_ON_ONCE(id != virtio_mem_phys_to_bb_id(vm, start + size - 1)))
			return NOTIFY_BAD;
	}

	/*
	 * Avoid circular locking lockdep warnings. We lock the mutex
	 * e.g., in MEM_GOING_ONLINE and unlock it in MEM_ONLINE. The
	 * blocking_notifier_call_chain() has its own lock, which gets unlocked
	 * between both notifier calls and will bail out. False positive.
	 */
	lockdep_off();

	switch (action) {
	case MEM_GOING_OFFLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_going_offline(vm, id);
		else
			virtio_mem_bbm_notify_going_offline(vm, id,
							    mhp->start_pfn,
							    mhp->nr_pages);
		break;
	case MEM_GOING_ONLINE:
		mutex_lock(&vm->hotplug_mutex);
		if (vm->removing) {
			rc = notifier_from_errno(-EBUSY);
			mutex_unlock(&vm->hotplug_mutex);
			break;
		}
		vm->hotplug_active = true;
		if (vm->in_sbm)
			rc = virtio_mem_sbm_notify_going_online(vm, id);
		break;
	case MEM_OFFLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_offline(vm, id);

		atomic64_add(size, &vm->offline_size);
		/*
		 * Trigger the workqueue. Now that we have some offline memory,
		 * maybe we can handle pending unplug requests.
		 */
		if (!unplug_online)
			virtio_mem_retry(vm);

		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_ONLINE:
		if (vm->in_sbm)
			virtio_mem_sbm_notify_online(vm, id, mhp->start_pfn);

		atomic64_sub(size, &vm->offline_size);
		/*
		 * Start adding more memory once we onlined half of our
		 * threshold. Don't trigger if it's possibly due to our action
		 * (e.g., us adding memory which gets onlined immediately from
		 * the core).
		 */
		if (!atomic_read(&vm->wq_active) &&
		    virtio_mem_could_add_memory(vm, vm->offline_threshold / 2))
			virtio_mem_retry(vm);

		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_OFFLINE:
		if (!vm->hotplug_active)
			break;
		if (vm->in_sbm)
			virtio_mem_sbm_notify_cancel_offline(vm, id);
		else
			virtio_mem_bbm_notify_cancel_offline(vm, id,
							     mhp->start_pfn,
							     mhp->nr_pages);
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	case MEM_CANCEL_ONLINE:
		if (!vm->hotplug_active)
			break;
		vm->hotplug_active = false;
		mutex_unlock(&vm->hotplug_mutex);
		break;
	default:
		break;
	}

	lockdep_on();

	return rc;
}

/*
 * Set a range of pages PG_offline. Remember pages that were never onlined
 * (via generic_online_page()) using PageDirty().
 */
static void virtio_mem_set_fake_offline(unsigned long pfn,
					unsigned long nr_pages, bool onlined)
{
	page_offline_begin();
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__SetPageOffline(page);
		if (!onlined) {
			SetPageDirty(page);
			/* FIXME: remove after cleanups */
			ClearPageReserved(page);
		}
	}
	page_offline_end();
}

/*
 * Clear PG_offline from a range of pages. If the pages were never onlined,
 * (via generic_online_page()), clear PageDirty().
 */
static void virtio_mem_clear_fake_offline(unsigned long pfn,
					  unsigned long nr_pages, bool onlined)
{
	for (; nr_pages--; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__ClearPageOffline(page);
		if (!onlined)
			ClearPageDirty(page);
	}
}

/*
 * Release a range of fake-offline pages to the buddy, effectively
 * fake-onlining them.
 */
static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages)
{
	const unsigned long max_nr_pages = MAX_ORDER_NR_PAGES;
	unsigned long i;

	/*
	 * We are always called at least with MAX_ORDER_NR_PAGES
	 * granularity/alignment (e.g., the way subblocks work). All pages
	 * inside such a block are alike.
	 */
	for (i = 0; i < nr_pages; i += max_nr_pages) {
		struct page *page = pfn_to_page(pfn + i);

		/*
		 * If the page is PageDirty(), it was kept fake-offline when
		 * onlining the memory block. Otherwise, it was allocated
		 * using alloc_contig_range(). All pages in a subblock are
		 * alike.
		 */
		if (PageDirty(page)) {
			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
						      false);
			generic_online_page(page, MAX_ORDER - 1);
		} else {
			virtio_mem_clear_fake_offline(pfn + i, max_nr_pages,
						      true);
			free_contig_range(pfn + i, max_nr_pages);
			adjust_managed_page_count(page, max_nr_pages);
		}
	}
}

/*
 * Try to allocate a range, marking pages fake-offline, effectively
 * fake-offlining them.
 */
static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages)
{
	const bool is_movable = page_zonenum(pfn_to_page(pfn)) ==
				ZONE_MOVABLE;
	int rc, retry_count;

	/*
	 * TODO: We want an alloc_contig_range() mode that tries to allocate
	 * harder (e.g., dealing with temporarily pinned pages, PCP), especially
	 * with ZONE_MOVABLE. So for now, retry a couple of times with
	 * ZONE_MOVABLE before giving up - because that zone is supposed to give
	 * some guarantees.
	 */
	for (retry_count = 0; retry_count < 5; retry_count++) {
		rc = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE,
					GFP_KERNEL);
		if (rc == -ENOMEM)
			/* whoops, out of memory */
			return rc;
		else if (rc && !is_movable)
			break;
		else if (rc)
			continue;

		virtio_mem_set_fake_offline(pfn, nr_pages, true);
		adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
		return 0;
	}

	return -EBUSY;
}

/*
 * Handle fake-offline pages when memory is going offline - such that the
 * pages can be skipped by mm-core when offlining.
 */
static void virtio_mem_fake_offline_going_offline(unsigned long pfn,
						  unsigned long nr_pages)
{
	struct page *page;
	unsigned long i;

	/*
	 * Drop our reference to the pages so the memory can get offlined
	 * and add the unplugged pages to the managed page counters (so
	 * offlining code can correctly subtract them again).
	 */
	adjust_managed_page_count(pfn_to_page(pfn), nr_pages);
	/* Drop our reference to the pages so the memory can get offlined. */
	for (i = 0; i < nr_pages; i++) {
		page = pfn_to_page(pfn + i);
		if (WARN_ON(!page_ref_dec_and_test(page)))
			dump_page(page, "fake-offline page referenced");
	}
}

/*
 * Handle fake-offline pages when memory offlining is canceled - to undo
 * what we did in virtio_mem_fake_offline_going_offline().
 */
static void virtio_mem_fake_offline_cancel_offline(unsigned long pfn,
						   unsigned long nr_pages)
{
	unsigned long i;

	/*
	 * Get the reference we dropped when going offline and subtract the
	 * unplugged pages from the managed page counters.
	 */
	adjust_managed_page_count(pfn_to_page(pfn), -nr_pages);
	for (i = 0; i < nr_pages; i++)
		page_ref_inc(pfn_to_page(pfn + i));
}

static void virtio_mem_online_page_cb(struct page *page, unsigned int order)
{
	const unsigned long addr = page_to_phys(page);
	unsigned long id, sb_id;
	struct virtio_mem *vm;
	bool do_online;

	rcu_read_lock();
	list_for_each_entry_rcu(vm, &virtio_mem_devices, next) {
		if (!virtio_mem_contains_range(vm, addr, PFN_PHYS(1 << order)))
			continue;

		if (vm->in_sbm) {
			/*
			 * We exploit here that subblocks have at least
			 * MAX_ORDER_NR_PAGES size/alignment - so we cannot
			 * cross subblocks within one call.
			 */
			id = virtio_mem_phys_to_mb_id(addr);
			sb_id = virtio_mem_phys_to_sb_id(vm, addr);
			do_online = virtio_mem_sbm_test_sb_plugged(vm, id,
								   sb_id, 1);
		} else {
			/*
			 * If the whole block is marked fake offline, keep
			 * everything that way.
			 */
			id = virtio_mem_phys_to_bb_id(vm, addr);
			do_online = virtio_mem_bbm_get_bb_state(vm, id) !=
				    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE;
		}

		/*
		 * virtio_mem_set_fake_offline() might sleep, we don't need
		 * the device anymore. See virtio_mem_remove() how races
		 * between memory onlining and device removal are handled.
		 */
		rcu_read_unlock();

		if (do_online)
			generic_online_page(page, order);
		else
			virtio_mem_set_fake_offline(PFN_DOWN(addr), 1 << order,
						    false);
		return;
	}
	rcu_read_unlock();

	/* not virtio-mem memory, but e.g., a DIMM. online it */
	generic_online_page(page, order);
}

static uint64_t virtio_mem_send_request(struct virtio_mem *vm,
					const struct virtio_mem_req *req)
{
	struct scatterlist *sgs[2], sg_req, sg_resp;
	unsigned int len;
	int rc;

	/*
	 * Don't use the request residing on the stack (vaddr); e.g., with
	 * CONFIG_VMAP_STACK, the stack may not be part of the linear mapping
	 * and must not be used for virtqueue buffers.
	 */
	vm->req = *req;

	/* out: buffer for request */
	sg_init_one(&sg_req, &vm->req, sizeof(vm->req));
	sgs[0] = &sg_req;

	/* in: buffer for response */
	sg_init_one(&sg_resp, &vm->resp, sizeof(vm->resp));
	sgs[1] = &sg_resp;

	rc = virtqueue_add_sgs(vm->vq, sgs, 1, 1, vm, GFP_KERNEL);
	if (rc < 0)
		return rc;

	virtqueue_kick(vm->vq);

	/* wait for a response */
	wait_event(vm->host_resp, virtqueue_get_buf(vm->vq, &len));

	return virtio16_to_cpu(vm->vdev, vm->resp.type);
}

static int virtio_mem_send_plug_request(struct virtio_mem *vm, uint64_t addr,
					uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_PLUG),
		.u.plug.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.plug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	int rc = -ENOMEM;

	if (atomic_read(&vm->config_changed))
		return -EAGAIN;

	dev_dbg(&vm->vdev->dev, "plugging memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->plugged_size += size;
		return 0;
	case VIRTIO_MEM_RESP_NACK:
		rc = -EAGAIN;
		break;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "plugging memory failed: %d\n", rc);
	return rc;
}

static int virtio_mem_send_unplug_request(struct virtio_mem *vm, uint64_t addr,
					  uint64_t size)
{
	const uint64_t nb_vm_blocks = size / vm->device_block_size;
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG),
		.u.unplug.addr = cpu_to_virtio64(vm->vdev, addr),
		.u.unplug.nb_blocks = cpu_to_virtio16(vm->vdev, nb_vm_blocks),
	};
	int rc = -ENOMEM;

	if (atomic_read(&vm->config_changed))
		return -EAGAIN;

	dev_dbg(&vm->vdev->dev, "unplugging memory: 0x%llx - 0x%llx\n", addr,
		addr + size - 1);

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->plugged_size -= size;
		return 0;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	case VIRTIO_MEM_RESP_ERROR:
		rc = -EINVAL;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "unplugging memory failed: %d\n", rc);
	return rc;
}

static int virtio_mem_send_unplug_all_request(struct virtio_mem *vm)
{
	const struct virtio_mem_req req = {
		.type = cpu_to_virtio16(vm->vdev, VIRTIO_MEM_REQ_UNPLUG_ALL),
	};
	int rc = -ENOMEM;

	dev_dbg(&vm->vdev->dev, "unplugging all memory");

	switch (virtio_mem_send_request(vm, &req)) {
	case VIRTIO_MEM_RESP_ACK:
		vm->unplug_all_required = false;
		vm->plugged_size = 0;
		/* usable region might have shrunk */
		atomic_set(&vm->config_changed, 1);
		return 0;
	case VIRTIO_MEM_RESP_BUSY:
		rc = -ETXTBSY;
		break;
	default:
		break;
	}

	dev_dbg(&vm->vdev->dev, "unplugging all memory failed: %d\n", rc);
	return rc;
}

/*
 * Plug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_plug_sb(struct virtio_mem *vm, unsigned long mb_id,
				  int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->sbm.sb_size;
	const uint64_t size = count * vm->sbm.sb_size;
	int rc;

	rc = virtio_mem_send_plug_request(vm, addr, size);
	if (!rc)
		virtio_mem_sbm_set_sb_plugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Unplug selected subblocks. Updates the plugged state, but not the state
 * of the memory block.
 */
static int virtio_mem_sbm_unplug_sb(struct virtio_mem *vm, unsigned long mb_id,
				    int sb_id, int count)
{
	const uint64_t addr = virtio_mem_mb_id_to_phys(mb_id) +
			      sb_id * vm->sbm.sb_size;
	const uint64_t size = count * vm->sbm.sb_size;
	int rc;

	rc = virtio_mem_send_unplug_request(vm, addr, size);
	if (!rc)
		virtio_mem_sbm_set_sb_unplugged(vm, mb_id, sb_id, count);
	return rc;
}

/*
 * Request to unplug a big block.
 *
 * Will not modify the state of the big block.
 */
static int virtio_mem_bbm_unplug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_send_unplug_request(vm, addr, size);
}

/*
 * Request to plug a big block.
 *
 * Will not modify the state of the big block.
 */
static int virtio_mem_bbm_plug_bb(struct virtio_mem *vm, unsigned long bb_id)
{
	const uint64_t addr = virtio_mem_bb_id_to_phys(vm, bb_id);
	const uint64_t size = vm->bbm.bb_size;

	return virtio_mem_send_plug_request(vm, addr, size);
}

/*
 * Unplug the desired number of plugged subblocks of an offline or not-added
 * memory block. Will fail if any subblock cannot get unplugged (instead of
 * skipping it).
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_any_sb_raw(struct virtio_mem *vm,
					    unsigned long mb_id, uint64_t *nb_sb)
{
	int sb_id, count;
	int rc;

	sb_id = vm->sbm.sbs_per_mb - 1;
	while (*nb_sb) {
		/* Find the next candidate subblock */
		while (sb_id >= 0 &&
		       virtio_mem_sbm_test_sb_unplugged(vm, mb_id, sb_id, 1))
			sb_id--;
		if (sb_id < 0)
			break;
		/* Try to unplug multiple subblocks at a time */
		count = 1;
		while (count < *nb_sb && sb_id > 0 &&
		       virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id - 1, 1)) {
			count++;
			sb_id--;
		}

		rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		sb_id--;
	}

	return 0;
}

/*
 * Unplug all plugged subblocks of an offline or not-added memory block.
 *
 * Will not modify the state of the memory block.
 *
 * Note: can fail after some subblocks were unplugged.
 */
static int virtio_mem_sbm_unplug_mb(struct virtio_mem *vm, unsigned long mb_id)
{
	uint64_t nb_sb = vm->sbm.sbs_per_mb;

	return virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, &nb_sb);
}

/*
 * Prepare tracking data for the next memory block.
 */
static int virtio_mem_sbm_prepare_next_mb(struct virtio_mem *vm,
					  unsigned long *mb_id)
{
	int rc;

	if (vm->sbm.next_mb_id > vm->sbm.last_usable_mb_id)
		return -ENOSPC;

	/* Resize the state array if required. */
	rc = virtio_mem_sbm_mb_states_prepare_next_mb(vm);
	if (rc)
		return rc;

	/* Resize the subblock bitmap if required. */
	rc = virtio_mem_sbm_sb_states_prepare_next_mb(vm);
	if (rc)
		return rc;

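	/*
	 * The new block starts out UNUSED; bump the counter directly, as
	 * virtio_mem_sbm_set_mb_state() would decrement the count of an
	 * old state that does not exist yet.
	 */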
	vm->sbm.mb_count[VIRTIO_MEM_SBM_MB_UNUSED]++;
	*mb_id = vm->sbm.next_mb_id++;
	return 0;
}

/*
 * Try to plug the desired number of subblocks and add the memory block
 * to Linux.
 *
 * Will modify the state of the memory block.
 */
static int virtio_mem_sbm_plug_and_add_mb(struct virtio_mem *vm,
					  unsigned long mb_id, uint64_t *nb_sb)
{
	const int count = min_t(int, *nb_sb, vm->sbm.sbs_per_mb);
	int rc;

	if (WARN_ON_ONCE(!count))
		return -EINVAL;

	/*
	 * Plug the requested number of subblocks before adding it to linux,
	 * so that onlining will directly online all plugged subblocks.
	 */
	rc = virtio_mem_sbm_plug_sb(vm, mb_id, 0, count);
	if (rc)
		return rc;

	/*
	 * Mark the block properly offline before adding it to Linux,
	 * so the memory notifiers will find the block in the right state.
	 */
	if (count == vm->sbm.sbs_per_mb)
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE);
	else
		virtio_mem_sbm_set_mb_state(vm, mb_id,
					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);

	/* Add the memory block to linux - if that fails, try to unplug. */
	rc = virtio_mem_sbm_add_mb(vm, mb_id);
	if (rc) {
		int new_state = VIRTIO_MEM_SBM_MB_UNUSED;

		if (virtio_mem_sbm_unplug_sb(vm, mb_id, 0, count))
			new_state = VIRTIO_MEM_SBM_MB_PLUGGED;
		virtio_mem_sbm_set_mb_state(vm, mb_id, new_state);
		return rc;
	}

	*nb_sb -= count;
	return 0;
}

/*
 * Try to plug the desired number of subblocks of a memory block that
 * is already added to Linux.
 *
 * Will modify the state of the memory block.
 *
 * Note: Can fail after some subblocks were successfully plugged.
 */
static int virtio_mem_sbm_plug_any_sb(struct virtio_mem *vm,
				      unsigned long mb_id, uint64_t *nb_sb)
{
	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
	unsigned long pfn, nr_pages;
	int sb_id, count;
	int rc;

	if (WARN_ON_ONCE(!*nb_sb))
		return -EINVAL;

	while (*nb_sb) {
		sb_id = virtio_mem_sbm_first_unplugged_sb(vm, mb_id);
		if (sb_id >= vm->sbm.sbs_per_mb)
			break;
		count = 1;
		while (count < *nb_sb &&
		       sb_id + count < vm->sbm.sbs_per_mb &&
		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id + count, 1))
			count++;

		rc = virtio_mem_sbm_plug_sb(vm, mb_id, sb_id, count);
		if (rc)
			return rc;
		*nb_sb -= count;
		if (old_state == VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL)
			continue;

		/* fake-online the pages if the memory block is online */
		pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
			       sb_id * vm->sbm.sb_size);
		nr_pages = PFN_DOWN(count * vm->sbm.sb_size);
		virtio_mem_fake_online(pfn, nr_pages);
	}

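	/*
	 * Once all subblocks are plugged, the block leaves its _PARTIAL
	 * state: the enum is laid out such that each fully plugged state
	 * directly precedes its partially plugged counterpart (old_state - 1).
	 */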
1631	if (virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1632		virtio_mem_sbm_set_mb_state(vm, mb_id, old_state - 1);
1633
1634	return 0;
1635}
1636
1637static int virtio_mem_sbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1638{
1639	const int mb_states[] = {
1640		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1641		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1642		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1643	};
1644	uint64_t nb_sb = diff / vm->sbm.sb_size;
1645	unsigned long mb_id;
1646	int rc, i;
1647
1648	if (!nb_sb)
1649		return 0;
1650
1651	/* Don't race with onlining/offlining */
1652	mutex_lock(&vm->hotplug_mutex);
1653
1654	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
1655		virtio_mem_sbm_for_each_mb(vm, mb_id, mb_states[i]) {
1656			rc = virtio_mem_sbm_plug_any_sb(vm, mb_id, &nb_sb);
1657			if (rc || !nb_sb)
1658				goto out_unlock;
1659			cond_resched();
1660		}
1661	}
1662
1663	/*
1664	 * We won't be working on online/offline memory blocks from this point,
1665	 * so we can't race with memory onlining/offlining. Drop the mutex.
1666	 */
1667	mutex_unlock(&vm->hotplug_mutex);
1668
1669	/* Try to plug and add unused blocks */
1670	virtio_mem_sbm_for_each_mb(vm, mb_id, VIRTIO_MEM_SBM_MB_UNUSED) {
1671		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1672			return -ENOSPC;
1673
1674		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1675		if (rc || !nb_sb)
1676			return rc;
1677		cond_resched();
1678	}
1679
1680	/* Try to prepare, plug and add new blocks */
1681	while (nb_sb) {
1682		if (!virtio_mem_could_add_memory(vm, memory_block_size_bytes()))
1683			return -ENOSPC;
1684
1685		rc = virtio_mem_sbm_prepare_next_mb(vm, &mb_id);
1686		if (rc)
1687			return rc;
1688		rc = virtio_mem_sbm_plug_and_add_mb(vm, mb_id, &nb_sb);
1689		if (rc)
1690			return rc;
1691		cond_resched();
1692	}
1693
1694	return 0;
1695out_unlock:
1696	mutex_unlock(&vm->hotplug_mutex);
1697	return rc;
1698}
1699
1700/*
1701 * Plug a big block and add it to Linux.
1702 *
1703 * Will modify the state of the big block.
1704 */
1705static int virtio_mem_bbm_plug_and_add_bb(struct virtio_mem *vm,
1706					  unsigned long bb_id)
1707{
1708	int rc;
1709
1710	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
1711			 VIRTIO_MEM_BBM_BB_UNUSED))
1712		return -EINVAL;
1713
1714	rc = virtio_mem_bbm_plug_bb(vm, bb_id);
1715	if (rc)
1716		return rc;
1717	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
1718
1719	rc = virtio_mem_bbm_add_bb(vm, bb_id);
1720	if (rc) {
1721		if (!virtio_mem_bbm_unplug_bb(vm, bb_id))
1722			virtio_mem_bbm_set_bb_state(vm, bb_id,
1723						    VIRTIO_MEM_BBM_BB_UNUSED);
1724		else
1725			/* Retry from the main loop. */
1726			virtio_mem_bbm_set_bb_state(vm, bb_id,
1727						    VIRTIO_MEM_BBM_BB_PLUGGED);
1728		return rc;
1729	}
1730	return 0;
1731}
1732
1733/*
1734 * Prepare tracking data for the next big block.
1735 */
1736static int virtio_mem_bbm_prepare_next_bb(struct virtio_mem *vm,
1737					  unsigned long *bb_id)
1738{
1739	int rc;
1740
1741	if (vm->bbm.next_bb_id > vm->bbm.last_usable_bb_id)
1742		return -ENOSPC;
1743
1744	/* Resize the big block state array if required. */
1745	rc = virtio_mem_bbm_bb_states_prepare_next_bb(vm);
1746	if (rc)
1747		return rc;
1748
1749	vm->bbm.bb_count[VIRTIO_MEM_BBM_BB_UNUSED]++;
1750	*bb_id = vm->bbm.next_bb_id;
1751	vm->bbm.next_bb_id++;
1752	return 0;
1753}
1754
1755static int virtio_mem_bbm_plug_request(struct virtio_mem *vm, uint64_t diff)
1756{
1757	uint64_t nb_bb = diff / vm->bbm.bb_size;
1758	unsigned long bb_id;
1759	int rc;
1760
1761	if (!nb_bb)
1762		return 0;
1763
1764	/* Try to plug and add unused big blocks */
1765	virtio_mem_bbm_for_each_bb(vm, bb_id, VIRTIO_MEM_BBM_BB_UNUSED) {
1766		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1767			return -ENOSPC;
1768
1769		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1770		if (!rc)
1771			nb_bb--;
1772		if (rc || !nb_bb)
1773			return rc;
1774		cond_resched();
1775	}
1776
1777	/* Try to prepare, plug and add new big blocks */
1778	while (nb_bb) {
1779		if (!virtio_mem_could_add_memory(vm, vm->bbm.bb_size))
1780			return -ENOSPC;
1781
1782		rc = virtio_mem_bbm_prepare_next_bb(vm, &bb_id);
1783		if (rc)
1784			return rc;
1785		rc = virtio_mem_bbm_plug_and_add_bb(vm, bb_id);
1786		if (!rc)
1787			nb_bb--;
1788		if (rc)
1789			return rc;
1790		cond_resched();
1791	}
1792
1793	return 0;
1794}
1795
1796/*
1797 * Try to plug the requested amount of memory.
1798 */
1799static int virtio_mem_plug_request(struct virtio_mem *vm, uint64_t diff)
1800{
1801	if (vm->in_sbm)
1802		return virtio_mem_sbm_plug_request(vm, diff);
1803	return virtio_mem_bbm_plug_request(vm, diff);
1804}
1805
1806/*
1807 * Unplug the desired number of plugged subblocks of an offline memory block.
1808 * Will fail if any subblock cannot get unplugged (instead of skipping it).
1809 *
1810 * Will modify the state of the memory block. Might temporarily drop the
1811 * hotplug_mutex.
1812 *
1813 * Note: Can fail after some subblocks were successfully unplugged.
1814 */
1815static int virtio_mem_sbm_unplug_any_sb_offline(struct virtio_mem *vm,
1816						unsigned long mb_id,
1817						uint64_t *nb_sb)
1818{
1819	int rc;
1820
1821	rc = virtio_mem_sbm_unplug_any_sb_raw(vm, mb_id, nb_sb);
1822
1823	/* some subblocks might have been unplugged even on failure */
1824	if (!virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb))
1825		virtio_mem_sbm_set_mb_state(vm, mb_id,
1826					    VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL);
1827	if (rc)
1828		return rc;
1829
1830	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1831		/*
1832		 * Remove the block from Linux - this should never fail.
1833		 * Prevent the block from getting onlined by marking it
1834		 * unplugged. Temporarily drop the mutex, so
1835		 * any pending GOING_ONLINE requests can be serviced/rejected.
1836		 */
1837		virtio_mem_sbm_set_mb_state(vm, mb_id,
1838					    VIRTIO_MEM_SBM_MB_UNUSED);
1839
1840		mutex_unlock(&vm->hotplug_mutex);
1841		rc = virtio_mem_sbm_remove_mb(vm, mb_id);
1842		BUG_ON(rc);
1843		mutex_lock(&vm->hotplug_mutex);
1844	}
1845	return 0;
1846}
1847
1848/*
1849 * Unplug the given plugged subblocks of an online memory block.
1850 *
1851 * Will modify the state of the memory block.
1852 */
1853static int virtio_mem_sbm_unplug_sb_online(struct virtio_mem *vm,
1854					   unsigned long mb_id, int sb_id,
1855					   int count)
1856{
1857	const unsigned long nr_pages = PFN_DOWN(vm->sbm.sb_size) * count;
1858	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1859	unsigned long start_pfn;
1860	int rc;
1861
1862	start_pfn = PFN_DOWN(virtio_mem_mb_id_to_phys(mb_id) +
1863			     sb_id * vm->sbm.sb_size);
1864
1865	rc = virtio_mem_fake_offline(start_pfn, nr_pages);
1866	if (rc)
1867		return rc;
1868
1869	/* Try to unplug the allocated memory */
1870	rc = virtio_mem_sbm_unplug_sb(vm, mb_id, sb_id, count);
1871	if (rc) {
1872		/* Return the memory to the buddy. */
1873		virtio_mem_fake_online(start_pfn, nr_pages);
1874		return rc;
1875	}
1876
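	/*
	 * Fully plugged blocks become partially plugged; blocks that were
	 * already partially plugged keep their state.
	 */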
1877	switch (old_state) {
1878	case VIRTIO_MEM_SBM_MB_KERNEL:
1879		virtio_mem_sbm_set_mb_state(vm, mb_id,
1880					    VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL);
1881		break;
1882	case VIRTIO_MEM_SBM_MB_MOVABLE:
1883		virtio_mem_sbm_set_mb_state(vm, mb_id,
1884					    VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL);
1885		break;
1886	}
1887
1888	return 0;
1889}
1890
1891/*
1892 * Unplug the desired number of plugged subblocks of an online memory block.
1893 * Will skip subblocks that are busy.
1894 *
1895 * Will modify the state of the memory block. Might temporarily drop the
1896 * hotplug_mutex.
1897 *
1898 * Note: Can fail after some subblocks were successfully unplugged. Can
1899 *       return 0 even if subblocks were busy and could not get unplugged.
1900 */
1901static int virtio_mem_sbm_unplug_any_sb_online(struct virtio_mem *vm,
1902					       unsigned long mb_id,
1903					       uint64_t *nb_sb)
1904{
1905	int rc, sb_id;
1906
1907	/* If possible, try to unplug the complete block in one shot. */
1908	if (*nb_sb >= vm->sbm.sbs_per_mb &&
1909	    virtio_mem_sbm_test_sb_plugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1910		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, 0,
1911						     vm->sbm.sbs_per_mb);
1912		if (!rc) {
1913			*nb_sb -= vm->sbm.sbs_per_mb;
1914			goto unplugged;
1915		} else if (rc != -EBUSY)
1916			return rc;
1917	}
1918
1919	/* Fallback to single subblocks. */
1920	for (sb_id = vm->sbm.sbs_per_mb - 1; sb_id >= 0 && *nb_sb; sb_id--) {
1921		/* Find the next candidate subblock */
1922		while (sb_id >= 0 &&
1923		       !virtio_mem_sbm_test_sb_plugged(vm, mb_id, sb_id, 1))
1924			sb_id--;
1925		if (sb_id < 0)
1926			break;
1927
1928		rc = virtio_mem_sbm_unplug_sb_online(vm, mb_id, sb_id, 1);
1929		if (rc == -EBUSY)
1930			continue;
1931		else if (rc)
1932			return rc;
1933		*nb_sb -= 1;
1934	}
1935
1936unplugged:
1937	/*
1938	 * Once all subblocks of a memory block were unplugged, offline and
1939	 * remove it. This will usually not fail, as no memory is in use
1940	 * anymore - however some other notifiers might NACK the request.
1941	 */
1942	if (virtio_mem_sbm_test_sb_unplugged(vm, mb_id, 0, vm->sbm.sbs_per_mb)) {
1943		mutex_unlock(&vm->hotplug_mutex);
1944		rc = virtio_mem_sbm_offline_and_remove_mb(vm, mb_id);
1945		mutex_lock(&vm->hotplug_mutex);
1946		if (!rc)
1947			virtio_mem_sbm_set_mb_state(vm, mb_id,
1948						    VIRTIO_MEM_SBM_MB_UNUSED);
1949	}
1950
1951	return 0;
1952}
1953
1954/*
1955 * Unplug the desired number of plugged subblocks of a memory block that is
1956 * already added to Linux. Will skip subblocks of online memory blocks that are
1957 * busy (by the OS). Will fail if any subblock that's not busy cannot get
1958 * unplugged.
1959 *
1960 * Will modify the state of the memory block. Might temporarily drop the
1961 * hotplug_mutex.
1962 *
1963 * Note: Can fail after some subblocks were successfully unplugged. Can
1964 *       return 0 even if subblocks were busy and could not get unplugged.
1965 */
1966static int virtio_mem_sbm_unplug_any_sb(struct virtio_mem *vm,
1967					unsigned long mb_id,
1968					uint64_t *nb_sb)
1969{
1970	const int old_state = virtio_mem_sbm_get_mb_state(vm, mb_id);
1971
1972	switch (old_state) {
1973	case VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL:
1974	case VIRTIO_MEM_SBM_MB_KERNEL:
1975	case VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL:
1976	case VIRTIO_MEM_SBM_MB_MOVABLE:
1977		return virtio_mem_sbm_unplug_any_sb_online(vm, mb_id, nb_sb);
1978	case VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL:
1979	case VIRTIO_MEM_SBM_MB_OFFLINE:
1980		return virtio_mem_sbm_unplug_any_sb_offline(vm, mb_id, nb_sb);
1981	}
1982	return -EINVAL;
1983}
1984
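/*
 * Try to unplug the requested amount of memory in SBM.
 *
 * Might temporarily drop the hotplug_mutex.
 */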
1985static int virtio_mem_sbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
1986{
1987	const int mb_states[] = {
1988		VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL,
1989		VIRTIO_MEM_SBM_MB_OFFLINE,
1990		VIRTIO_MEM_SBM_MB_MOVABLE_PARTIAL,
1991		VIRTIO_MEM_SBM_MB_KERNEL_PARTIAL,
1992		VIRTIO_MEM_SBM_MB_MOVABLE,
1993		VIRTIO_MEM_SBM_MB_KERNEL,
1994	};
1995	uint64_t nb_sb = diff / vm->sbm.sb_size;
1996	unsigned long mb_id;
1997	int rc, i;
1998
1999	if (!nb_sb)
2000		return 0;
2001
2002	/*
2003	 * We'll drop the mutex a couple of times when it is safe to do so.
2004	 * This might result in some blocks switching the state (online/offline)
2005	 * and we could miss them in this run - we will retry again later.
2006	 */
2007	mutex_lock(&vm->hotplug_mutex);
2008
2009	/*
2010	 * We try to unplug from partially plugged blocks first, aiming to
2011	 * remove whole memory blocks along with their metadata. We prioritize
2012	 * ZONE_MOVABLE, as it's more reliable to unplug memory and remove
2013	 * whole memory blocks there, and we don't want to trigger a zone
2014	 * imbalance by accidentally removing too much kernel memory.
2015	 */
2016	for (i = 0; i < ARRAY_SIZE(mb_states); i++) {
2017		virtio_mem_sbm_for_each_mb_rev(vm, mb_id, mb_states[i]) {
2018			rc = virtio_mem_sbm_unplug_any_sb(vm, mb_id, &nb_sb);
2019			if (rc || !nb_sb)
2020				goto out_unlock;
2021			mutex_unlock(&vm->hotplug_mutex);
2022			cond_resched();
2023			mutex_lock(&vm->hotplug_mutex);
2024		}
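		/*
		 * With "unplug_online=0", stop after the two offline states
		 * (i == 1); never unplug from online memory blocks.
		 */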
2025		if (!unplug_online && i == 1) {
2026			mutex_unlock(&vm->hotplug_mutex);
2027			return 0;
2028		}
2029	}
2030
2031	mutex_unlock(&vm->hotplug_mutex);
2032	return nb_sb ? -EBUSY : 0;
2033out_unlock:
2034	mutex_unlock(&vm->hotplug_mutex);
2035	return rc;
2036}
2037
2038/*
2039 * Try to offline and remove a big block from Linux and unplug it. Will fail
2040 * with -EBUSY if some memory is busy and cannot get unplugged.
2041 *
2042 * Will modify the state of the memory block. Might temporarily drop the
2043 * hotplug_mutex.
2044 */
2045static int virtio_mem_bbm_offline_remove_and_unplug_bb(struct virtio_mem *vm,
2046						       unsigned long bb_id)
2047{
2048	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2049	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2050	unsigned long end_pfn = start_pfn + nr_pages;
2051	unsigned long pfn;
2052	struct page *page;
2053	int rc;
2054
2055	if (WARN_ON_ONCE(virtio_mem_bbm_get_bb_state(vm, bb_id) !=
2056			 VIRTIO_MEM_BBM_BB_ADDED))
2057		return -EINVAL;
2058
2059	if (bbm_safe_unplug) {
2060		/*
2061		 * Start by fake-offlining all memory. Once we marked the device
2062		 * block as fake-offline, all newly onlined memory will
2063		 * automatically be kept fake-offline. Protect from concurrent
2064		 * onlining/offlining until we have a consistent state.
2065		 */
2066		mutex_lock(&vm->hotplug_mutex);
2067		virtio_mem_bbm_set_bb_state(vm, bb_id,
2068					    VIRTIO_MEM_BBM_BB_FAKE_OFFLINE);
2069
2070		for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2071			page = pfn_to_online_page(pfn);
2072			if (!page)
2073				continue;
2074
2075			rc = virtio_mem_fake_offline(pfn, PAGES_PER_SECTION);
2076			if (rc) {
2077				end_pfn = pfn;
2078				goto rollback_safe_unplug;
2079			}
2080		}
2081		mutex_unlock(&vm->hotplug_mutex);
2082	}
2083
2084	rc = virtio_mem_bbm_offline_and_remove_bb(vm, bb_id);
2085	if (rc) {
2086		if (bbm_safe_unplug) {
2087			mutex_lock(&vm->hotplug_mutex);
2088			goto rollback_safe_unplug;
2089		}
2090		return rc;
2091	}
2092
2093	rc = virtio_mem_bbm_unplug_bb(vm, bb_id);
2094	if (rc)
2095		virtio_mem_bbm_set_bb_state(vm, bb_id,
2096					    VIRTIO_MEM_BBM_BB_PLUGGED);
2097	else
2098		virtio_mem_bbm_set_bb_state(vm, bb_id,
2099					    VIRTIO_MEM_BBM_BB_UNUSED);
2100	return rc;
2101
2102rollback_safe_unplug:
2103	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
2104		page = pfn_to_online_page(pfn);
2105		if (!page)
2106			continue;
2107		virtio_mem_fake_online(pfn, PAGES_PER_SECTION);
2108	}
2109	virtio_mem_bbm_set_bb_state(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED);
2110	mutex_unlock(&vm->hotplug_mutex);
2111	return rc;
2112}
2113
2114/*
2115 * Test if a big block is completely offline.
2116 */
2117static bool virtio_mem_bbm_bb_is_offline(struct virtio_mem *vm,
2118					 unsigned long bb_id)
2119{
2120	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2121	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2122	unsigned long pfn;
2123
2124	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2125	     pfn += PAGES_PER_SECTION) {
2126		if (pfn_to_online_page(pfn))
2127			return false;
2128	}
2129
2130	return true;
2131}
2132
2133/*
2134 * Test if a big block is completely onlined to ZONE_MOVABLE (or offline).
2135 */
2136static bool virtio_mem_bbm_bb_is_movable(struct virtio_mem *vm,
2137					 unsigned long bb_id)
2138{
2139	const unsigned long start_pfn = PFN_DOWN(virtio_mem_bb_id_to_phys(vm, bb_id));
2140	const unsigned long nr_pages = PFN_DOWN(vm->bbm.bb_size);
2141	struct page *page;
2142	unsigned long pfn;
2143
2144	for (pfn = start_pfn; pfn < start_pfn + nr_pages;
2145	     pfn += PAGES_PER_SECTION) {
2146		page = pfn_to_online_page(pfn);
2147		if (!page)
2148			continue;
2149		if (page_zonenum(page) != ZONE_MOVABLE)
2150			return false;
2151	}
2152
2153	return true;
2154}
2155
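/*
 * Try to unplug the requested amount of memory in BBM, in three passes:
 * completely offline big blocks first, then big blocks onlined to
 * ZONE_MOVABLE, finally any remaining added big blocks.
 */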
2156static int virtio_mem_bbm_unplug_request(struct virtio_mem *vm, uint64_t diff)
2157{
2158	uint64_t nb_bb = diff / vm->bbm.bb_size;
2159	uint64_t bb_id;
2160	int rc, i;
2161
2162	if (!nb_bb)
2163		return 0;
2164
2165	/*
2166	 * Try to unplug big blocks. Similar to SBM, start with offline
2167	 * big blocks.
2168	 */
2169	for (i = 0; i < 3; i++) {
2170		virtio_mem_bbm_for_each_bb_rev(vm, bb_id, VIRTIO_MEM_BBM_BB_ADDED) {
2171			cond_resched();
2172
2173			/*
2174			 * As we're holding no locks, these checks are racy,
2175			 * but we don't care.
2176			 */
2177			if (i == 0 && !virtio_mem_bbm_bb_is_offline(vm, bb_id))
2178				continue;
2179			if (i == 1 && !virtio_mem_bbm_bb_is_movable(vm, bb_id))
2180				continue;
2181			rc = virtio_mem_bbm_offline_remove_and_unplug_bb(vm, bb_id);
2182			if (rc == -EBUSY)
2183				continue;
2184			if (!rc)
2185				nb_bb--;
2186			if (rc || !nb_bb)
2187				return rc;
2188		}
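		/* With "unplug_online=0", don't proceed to online big blocks. */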
2189		if (i == 0 && !unplug_online)
2190			return 0;
2191	}
2192
2193	return nb_bb ? -EBUSY : 0;
2194}
2195
2196/*
2197 * Try to unplug the requested amount of memory.
2198 */
2199static int virtio_mem_unplug_request(struct virtio_mem *vm, uint64_t diff)
2200{
2201	if (vm->in_sbm)
2202		return virtio_mem_sbm_unplug_request(vm, diff);
2203	return virtio_mem_bbm_unplug_request(vm, diff);
2204}
2205
2206/*
2207 * Try to unplug all blocks that couldn't be unplugged before, for example,
2208 * because the hypervisor was busy.
2209 */
2210static int virtio_mem_unplug_pending_mb(struct virtio_mem *vm)
2211{
2212	unsigned long id;
2213	int rc;
2214
2215	if (!vm->in_sbm) {
2216		virtio_mem_bbm_for_each_bb(vm, id,
2217					   VIRTIO_MEM_BBM_BB_PLUGGED) {
2218			rc = virtio_mem_bbm_unplug_bb(vm, id);
2219			if (rc)
2220				return rc;
2221			virtio_mem_bbm_set_bb_state(vm, id,
2222						    VIRTIO_MEM_BBM_BB_UNUSED);
2223		}
2224		return 0;
2225	}
2226
2227	virtio_mem_sbm_for_each_mb(vm, id, VIRTIO_MEM_SBM_MB_PLUGGED) {
2228		rc = virtio_mem_sbm_unplug_mb(vm, id);
2229		if (rc)
2230			return rc;
2231		virtio_mem_sbm_set_mb_state(vm, id,
2232					    VIRTIO_MEM_SBM_MB_UNUSED);
2233	}
2234
2235	return 0;
2236}
2237
2238/*
2239 * Update all parts of the config that could have changed.
2240 */
2241static void virtio_mem_refresh_config(struct virtio_mem *vm)
2242{
2243	const struct range pluggable_range = mhp_get_pluggable_range(true);
2244	uint64_t new_plugged_size, usable_region_size, end_addr;
2245
2246	/* the plugged_size is just a reflection of what _we_ did previously */
2247	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2248			&new_plugged_size);
2249	if (WARN_ON_ONCE(new_plugged_size != vm->plugged_size))
2250		vm->plugged_size = new_plugged_size;
2251
2252	/* calculate the last usable memory block id */
2253	virtio_cread_le(vm->vdev, struct virtio_mem_config,
2254			usable_region_size, &usable_region_size);
2255	end_addr = min(vm->addr + usable_region_size - 1,
2256		       pluggable_range.end);
2257
2258	if (vm->in_sbm) {
2259		vm->sbm.last_usable_mb_id = virtio_mem_phys_to_mb_id(end_addr);
2260		if (!IS_ALIGNED(end_addr + 1, memory_block_size_bytes()))
2261			vm->sbm.last_usable_mb_id--;
2262	} else {
2263		vm->bbm.last_usable_bb_id = virtio_mem_phys_to_bb_id(vm,
2264								     end_addr);
2265		if (!IS_ALIGNED(end_addr + 1, vm->bbm.bb_size))
2266			vm->bbm.last_usable_bb_id--;
2267	}
2268	/*
2269	 * If we cannot plug any of our device memory (e.g., nothing in the
2270	 * usable region is addressable), the last usable memory block id will
2271	 * be smaller than the first usable memory block id. We'll stop
2272	 * attempting to add memory with -ENOSPC from our main loop.
2273	 */
2274
2275	/* see if there is a request to change the size */
2276	virtio_cread_le(vm->vdev, struct virtio_mem_config, requested_size,
2277			&vm->requested_size);
2278
2279	dev_info(&vm->vdev->dev, "plugged size: 0x%llx", vm->plugged_size);
2280	dev_info(&vm->vdev->dev, "requested size: 0x%llx", vm->requested_size);
2281}
2282
2283/*
2284 * Workqueue function for handling plug/unplug requests and config updates.
2285 */
2286static void virtio_mem_run_wq(struct work_struct *work)
2287{
2288	struct virtio_mem *vm = container_of(work, struct virtio_mem, wq);
2289	uint64_t diff;
2290	int rc;
2291
2292	hrtimer_cancel(&vm->retry_timer);
2293
2294	if (vm->broken)
2295		return;
2296
2297	atomic_set(&vm->wq_active, 1);
2298retry:
2299	rc = 0;
2300
2301	/* Make sure we start with a clean state if there are leftovers. */
2302	if (unlikely(vm->unplug_all_required))
2303		rc = virtio_mem_send_unplug_all_request(vm);
2304
2305	if (atomic_read(&vm->config_changed)) {
2306		atomic_set(&vm->config_changed, 0);
2307		virtio_mem_refresh_config(vm);
2308	}
2309
2310	/* Unplug any leftovers from previous runs */
2311	if (!rc)
2312		rc = virtio_mem_unplug_pending_mb(vm);
2313
2314	if (!rc && vm->requested_size != vm->plugged_size) {
2315		if (vm->requested_size > vm->plugged_size) {
2316			diff = vm->requested_size - vm->plugged_size;
2317			rc = virtio_mem_plug_request(vm, diff);
2318		} else {
2319			diff = vm->plugged_size - vm->requested_size;
2320			rc = virtio_mem_unplug_request(vm, diff);
2321		}
2322	}
2323
2324	switch (rc) {
2325	case 0:
2326		vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2327		break;
2328	case -ENOSPC:
2329		/*
2330		 * We cannot add any more memory (alignment, physical limit)
2331		 * or we have too many offline memory blocks.
2332		 */
2333		break;
2334	case -ETXTBSY:
2335		/*
2336		 * The hypervisor cannot process our request right now
2337		 * (e.g., out of memory, migrating).
2338		 */
2339	case -EBUSY:
2340		/*
2341		 * We cannot free up any memory to unplug it (all plugged memory
2342		 * is busy).
2343		 */
2344	case -ENOMEM:
2345		/* Out of memory, try again later. */
2346		hrtimer_start(&vm->retry_timer, ms_to_ktime(vm->retry_timer_ms),
2347			      HRTIMER_MODE_REL);
2348		break;
2349	case -EAGAIN:
2350		/* Retry immediately (e.g., the config changed). */
2351		goto retry;
2352	default:
2353		/* Unknown error, mark as broken */
2354		dev_err(&vm->vdev->dev,
2355			"unknown error, marking device broken: %d\n", rc);
2356		vm->broken = true;
2357	}
2358
2359	atomic_set(&vm->wq_active, 0);
2360}
2361
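/*
 * Retry timer callback: trigger a workqueue retry and double the retry
 * interval (exponential backoff), capped at VIRTIO_MEM_RETRY_TIMER_MAX_MS.
 */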
2362static enum hrtimer_restart virtio_mem_timer_expired(struct hrtimer *timer)
2363{
2364	struct virtio_mem *vm = container_of(timer, struct virtio_mem,
2365					     retry_timer);
2366
2367	virtio_mem_retry(vm);
2368	vm->retry_timer_ms = min_t(unsigned int, vm->retry_timer_ms * 2,
2369				   VIRTIO_MEM_RETRY_TIMER_MAX_MS);
2370	return HRTIMER_NORESTART;
2371}
2372
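/* The host processed a request: wake up anybody waiting for a response. */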
2373static void virtio_mem_handle_response(struct virtqueue *vq)
2374{
2375	struct virtio_mem *vm = vq->vdev->priv;
2376
2377	wake_up(&vm->host_resp);
2378}
2379
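/* Set up the single virtqueue used for guest requests. */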
2380static int virtio_mem_init_vq(struct virtio_mem *vm)
2381{
2382	struct virtqueue *vq;
2383
2384	vq = virtio_find_single_vq(vm->vdev, virtio_mem_handle_response,
2385				   "guest-request");
2386	if (IS_ERR(vq))
2387		return PTR_ERR(vq);
2388	vm->vq = vq;
2389
2390	return 0;
2391}
2392
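/*
 * Initialize the device by querying the config: sanity-check the setup
 * and decide between SBM and BBM based on the resulting subblock size and
 * the Linux memory block size.
 */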
2393static int virtio_mem_init(struct virtio_mem *vm)
2394{
2395	const struct range pluggable_range = mhp_get_pluggable_range(true);
2396	uint64_t sb_size, addr;
2397	uint16_t node_id;
2398
2399	if (!vm->vdev->config->get) {
2400		dev_err(&vm->vdev->dev, "config access disabled\n");
2401		return -EINVAL;
2402	}
2403
2404	/*
2405	 * We don't want to (un)plug or reuse any memory when in kdump. The
2406	 * memory is still accessible (but not mapped).
2407	 */
2408	if (is_kdump_kernel()) {
2409		dev_warn(&vm->vdev->dev, "disabled in kdump kernel\n");
2410		return -EBUSY;
2411	}
2412
2413	/* Fetch all properties that can't change. */
2414	virtio_cread_le(vm->vdev, struct virtio_mem_config, plugged_size,
2415			&vm->plugged_size);
2416	virtio_cread_le(vm->vdev, struct virtio_mem_config, block_size,
2417			&vm->device_block_size);
2418	virtio_cread_le(vm->vdev, struct virtio_mem_config, node_id,
2419			&node_id);
2420	vm->nid = virtio_mem_translate_node_id(vm, node_id);
2421	virtio_cread_le(vm->vdev, struct virtio_mem_config, addr, &vm->addr);
2422	virtio_cread_le(vm->vdev, struct virtio_mem_config, region_size,
2423			&vm->region_size);
2424
2425	/* Determine the nid for the device based on the lowest address. */
2426	if (vm->nid == NUMA_NO_NODE)
2427		vm->nid = memory_add_physaddr_to_nid(vm->addr);
2428
2429	/* bad device setup - warn only */
2430	if (!IS_ALIGNED(vm->addr, memory_block_size_bytes()))
2431		dev_warn(&vm->vdev->dev,
2432			 "The alignment of the physical start address can make some memory unusable.\n");
2433	if (!IS_ALIGNED(vm->addr + vm->region_size, memory_block_size_bytes()))
2434		dev_warn(&vm->vdev->dev,
2435			 "The alignment of the physical end address can make some memory unusable.\n");
2436	if (vm->addr < pluggable_range.start ||
2437	    vm->addr + vm->region_size - 1 > pluggable_range.end)
2438		dev_warn(&vm->vdev->dev,
2439			 "Some device memory is not addressable/pluggable. This can make some memory unusable.\n");
2440
2441	/* Prepare the offline threshold - make sure we can add two blocks. */
2442	vm->offline_threshold = max_t(uint64_t, 2 * memory_block_size_bytes(),
2443				      VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
2444
2445	/*
2446	 * We want subblocks to span at least MAX_ORDER_NR_PAGES and
2447	 * pageblock_nr_pages pages. This:
2448	 * - Simplifies our page onlining code (virtio_mem_online_page_cb)
2449	 *   and fake page onlining code (virtio_mem_fake_online).
2450	 * - Is required for now for alloc_contig_range() to work reliably -
2451	 *   it doesn't properly handle smaller granularity on ZONE_NORMAL.
2452	 */
2453	sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES,
2454			pageblock_nr_pages) * PAGE_SIZE;
2455	sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
2456
2457	if (sb_size < memory_block_size_bytes() && !force_bbm) {
2458		/* SBM: At least two subblocks per Linux memory block. */
2459		vm->in_sbm = true;
2460		vm->sbm.sb_size = sb_size;
2461		vm->sbm.sbs_per_mb = memory_block_size_bytes() /
2462				     vm->sbm.sb_size;
2463
2464		/* Round up to the next full memory block */
2465		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2466		       memory_block_size_bytes() - 1;
2467		vm->sbm.first_mb_id = virtio_mem_phys_to_mb_id(addr);
2468		vm->sbm.next_mb_id = vm->sbm.first_mb_id;
2469	} else {
2470		/* BBM: At least one Linux memory block. */
2471		vm->bbm.bb_size = max_t(uint64_t, vm->device_block_size,
2472					memory_block_size_bytes());
2473
2474		if (bbm_block_size) {
2475			if (!is_power_of_2(bbm_block_size)) {
2476				dev_warn(&vm->vdev->dev,
2477					 "bbm_block_size is not a power of 2");
2478			} else if (bbm_block_size < vm->bbm.bb_size) {
2479				dev_warn(&vm->vdev->dev,
2480					 "bbm_block_size is too small");
2481			} else {
2482				vm->bbm.bb_size = bbm_block_size;
2483			}
2484		}
2485
2486		/* Round up to the next aligned big block */
2487		addr = max_t(uint64_t, vm->addr, pluggable_range.start) +
2488		       vm->bbm.bb_size - 1;
2489		vm->bbm.first_bb_id = virtio_mem_phys_to_bb_id(vm, addr);
2490		vm->bbm.next_bb_id = vm->bbm.first_bb_id;
2491
2492		/* Make sure we can add two big blocks. */
2493		vm->offline_threshold = max_t(uint64_t, 2 * vm->bbm.bb_size,
2494					      vm->offline_threshold);
2495	}
2496
2497	dev_info(&vm->vdev->dev, "start address: 0x%llx", vm->addr);
2498	dev_info(&vm->vdev->dev, "region size: 0x%llx", vm->region_size);
2499	dev_info(&vm->vdev->dev, "device block size: 0x%llx",
2500		 (unsigned long long)vm->device_block_size);
2501	dev_info(&vm->vdev->dev, "memory block size: 0x%lx",
2502		 memory_block_size_bytes());
2503	if (vm->in_sbm)
2504		dev_info(&vm->vdev->dev, "subblock size: 0x%llx",
2505			 (unsigned long long)vm->sbm.sb_size);
2506	else
2507		dev_info(&vm->vdev->dev, "big block size: 0x%llx",
2508			 (unsigned long long)vm->bbm.bb_size);
2509	if (vm->nid != NUMA_NO_NODE && IS_ENABLED(CONFIG_NUMA))
2510		dev_info(&vm->vdev->dev, "nid: %d", vm->nid);
2511
2512	return 0;
2513}
2514
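/*
 * Reserve the whole device-managed region as a SYSTEM_RAM resource and
 * clear IORESOURCE_BUSY, such that add_memory() can succeed for memory
 * within this region.
 */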
2515static int virtio_mem_create_resource(struct virtio_mem *vm)
2516{
2517	/*
2518	 * When force-unloading the driver and removing the device, we
2519	 * could have a garbage pointer. Duplicate the string.
2520	 */
2521	const char *name = kstrdup(dev_name(&vm->vdev->dev), GFP_KERNEL);
2522
2523	if (!name)
2524		return -ENOMEM;
2525
2526	vm->parent_resource = __request_mem_region(vm->addr, vm->region_size,
2527						   name, IORESOURCE_SYSTEM_RAM);
2528	if (!vm->parent_resource) {
2529		kfree(name);
2530		dev_warn(&vm->vdev->dev, "could not reserve device region\n");
2531		dev_info(&vm->vdev->dev,
2532			 "reloading the driver is not supported\n");
2533		return -EBUSY;
2534	}
2535
2536	/* The memory is not actually busy - make add_memory() work. */
2537	vm->parent_resource->flags &= ~IORESOURCE_BUSY;
2538	return 0;
2539}
2540
2541static void virtio_mem_delete_resource(struct virtio_mem *vm)
2542{
2543	const char *name;
2544
2545	if (!vm->parent_resource)
2546		return;
2547
2548	name = vm->parent_resource->name;
2549	release_resource(vm->parent_resource);
2550	kfree(vm->parent_resource);
2551	kfree(name);
2552	vm->parent_resource = NULL;
2553}
2554
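/* walk_iomem_res_desc() callback: any invocation means a range matched. */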
2555static int virtio_mem_range_has_system_ram(struct resource *res, void *arg)
2556{
2557	return 1;
2558}
2559
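/*
 * Test if any busy system RAM - IOW, memory this device added to Linux -
 * still intersects the device-managed region.
 */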
2560static bool virtio_mem_has_memory_added(struct virtio_mem *vm)
2561{
2562	const unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
2563
2564	return walk_iomem_res_desc(IORES_DESC_NONE, flags, vm->addr,
2565				   vm->addr + vm->region_size, NULL,
2566				   virtio_mem_range_has_system_ram) == 1;
2567}
2568
2569static int virtio_mem_probe(struct virtio_device *vdev)
2570{
2571	struct virtio_mem *vm;
2572	int rc;
2573
2574	BUILD_BUG_ON(sizeof(struct virtio_mem_req) != 24);
2575	BUILD_BUG_ON(sizeof(struct virtio_mem_resp) != 10);
2576
2577	vdev->priv = vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2578	if (!vm)
2579		return -ENOMEM;
2580
2581	init_waitqueue_head(&vm->host_resp);
2582	vm->vdev = vdev;
2583	INIT_WORK(&vm->wq, virtio_mem_run_wq);
2584	mutex_init(&vm->hotplug_mutex);
2585	INIT_LIST_HEAD(&vm->next);
2586	spin_lock_init(&vm->removal_lock);
2587	hrtimer_init(&vm->retry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2588	vm->retry_timer.function = virtio_mem_timer_expired;
2589	vm->retry_timer_ms = VIRTIO_MEM_RETRY_TIMER_MIN_MS;
2590
2591	/* register the virtqueue */
2592	rc = virtio_mem_init_vq(vm);
2593	if (rc)
2594		goto out_free_vm;
2595
2596	/* initialize the device by querying the config */
2597	rc = virtio_mem_init(vm);
2598	if (rc)
2599		goto out_del_vq;
2600
2601	/* create the parent resource for all memory */
2602	rc = virtio_mem_create_resource(vm);
2603	if (rc)
2604		goto out_del_vq;
2605
2606	/*
2607	 * If we still have memory plugged, we have to unplug all memory first.
2608	 * Registering our parent resource makes sure that this memory isn't
2609	 * actually in use (e.g., trying to reload the driver).
2610	 */
2611	if (vm->plugged_size) {
2612		vm->unplug_all_required = true;
2613		dev_info(&vm->vdev->dev, "unplugging all memory is required\n");
2614	}
2615
2616	/* register callbacks */
2617	vm->memory_notifier.notifier_call = virtio_mem_memory_notifier_cb;
2618	rc = register_memory_notifier(&vm->memory_notifier);
2619	if (rc)
2620		goto out_del_resource;
2621	rc = register_virtio_mem_device(vm);
2622	if (rc)
2623		goto out_unreg_mem;
2624
2625	virtio_device_ready(vdev);
2626
2627	/* trigger a config update to start processing the requested_size */
2628	atomic_set(&vm->config_changed, 1);
2629	queue_work(system_freezable_wq, &vm->wq);
2630
2631	return 0;
2632out_unreg_mem:
2633	unregister_memory_notifier(&vm->memory_notifier);
2634out_del_resource:
2635	virtio_mem_delete_resource(vm);
2636out_del_vq:
2637	vdev->config->del_vqs(vdev);
2638out_free_vm:
2639	kfree(vm);
2640	vdev->priv = NULL;
2641
2642	return rc;
2643}
2644
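/*
 * Tear down the device: stop the workqueue and the retry timer, remove
 * whatever can safely be removed, and warn if memory added to Linux is
 * left behind.
 */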
2645static void virtio_mem_remove(struct virtio_device *vdev)
2646{
2647	struct virtio_mem *vm = vdev->priv;
2648	unsigned long mb_id;
2649	int rc;
2650
2651	/*
2652	 * Make sure the workqueue won't be triggered anymore and no memory
2653	 * blocks can be onlined/offlined until we're finished here.
2654	 */
2655	mutex_lock(&vm->hotplug_mutex);
2656	spin_lock_irq(&vm->removal_lock);
2657	vm->removing = true;
2658	spin_unlock_irq(&vm->removal_lock);
2659	mutex_unlock(&vm->hotplug_mutex);
2660
2661	/* wait until the workqueue stopped */
2662	cancel_work_sync(&vm->wq);
2663	hrtimer_cancel(&vm->retry_timer);
2664
2665	if (vm->in_sbm) {
2666		/*
2667		 * After we unregistered our callbacks, user space can online
2668		 * partially plugged offline blocks. Make sure to remove them.
2669		 */
2670		virtio_mem_sbm_for_each_mb(vm, mb_id,
2671					   VIRTIO_MEM_SBM_MB_OFFLINE_PARTIAL) {
2672			rc = virtio_mem_sbm_remove_mb(vm, mb_id);
2673			BUG_ON(rc);
2674			virtio_mem_sbm_set_mb_state(vm, mb_id,
2675						    VIRTIO_MEM_SBM_MB_UNUSED);
2676		}
2677		/*
2678		 * After we unregistered our callbacks, user space can no longer
2679		 * offline partially plugged online memory blocks. No need to
2680		 * worry about them.
2681		 */
2682	}
2683
2684	/* unregister callbacks */
2685	unregister_virtio_mem_device(vm);
2686	unregister_memory_notifier(&vm->memory_notifier);
2687
2688	/*
2689	 * There is no way we could reliably remove all memory we have added to
2690	 * the system. And there is no way to stop the driver/device from going
2691	 * away. Warn at least.
2692	 */
2693	if (virtio_mem_has_memory_added(vm)) {
2694		dev_warn(&vdev->dev, "device still has system memory added\n");
2695	} else {
2696		virtio_mem_delete_resource(vm);
2697		kfree_const(vm->resource_name);
2698	}
2699
2700	/* remove all tracking data - no locking needed */
2701	if (vm->in_sbm) {
2702		vfree(vm->sbm.mb_states);
2703		vfree(vm->sbm.sb_states);
2704	} else {
2705		vfree(vm->bbm.bb_states);
2706	}
2707
2708	/* reset the device and cleanup the queues */
2709	vdev->config->reset(vdev);
2710	vdev->config->del_vqs(vdev);
2711
2712	kfree(vm);
2713	vdev->priv = NULL;
2714}
2715
2716static void virtio_mem_config_changed(struct virtio_device *vdev)
2717{
2718	struct virtio_mem *vm = vdev->priv;
2719
2720	atomic_set(&vm->config_changed, 1);
2721	virtio_mem_retry(vm);
2722}
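
/*
 * Example (assuming a QEMU hypervisor and a virtio-mem device with the
 * id "vmem0"): resizing the device via
 *   (qemu) qom-set vmem0 requested-size 16G
 * updates requested_size in the config and triggers this callback; the
 * workqueue then plugs/unplugs the difference to plugged_size.
 */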
2723
2724#ifdef CONFIG_PM_SLEEP
2725static int virtio_mem_freeze(struct virtio_device *vdev)
2726{
2727	/*
2728	 * When restarting the VM, all memory is usually unplugged. Don't
2729	 * allow suspending/hibernating.
2730	 */
2731	dev_err(&vdev->dev, "save/restore not supported.\n");
2732	return -EPERM;
2733}
2734
2735static int virtio_mem_restore(struct virtio_device *vdev)
2736{
2737	return -EPERM;
2738}
2739#endif
2740
2741static unsigned int virtio_mem_features[] = {
2742#if defined(CONFIG_NUMA) && defined(CONFIG_ACPI_NUMA)
2743	VIRTIO_MEM_F_ACPI_PXM,
2744#endif
2745};
2746
2747static const struct virtio_device_id virtio_mem_id_table[] = {
2748	{ VIRTIO_ID_MEM, VIRTIO_DEV_ANY_ID },
2749	{ 0 },
2750};
2751
2752static struct virtio_driver virtio_mem_driver = {
2753	.feature_table = virtio_mem_features,
2754	.feature_table_size = ARRAY_SIZE(virtio_mem_features),
2755	.driver.name = KBUILD_MODNAME,
2756	.driver.owner = THIS_MODULE,
2757	.id_table = virtio_mem_id_table,
2758	.probe = virtio_mem_probe,
2759	.remove = virtio_mem_remove,
2760	.config_changed = virtio_mem_config_changed,
2761#ifdef CONFIG_PM_SLEEP
2762	.freeze	=	virtio_mem_freeze,
2763	.restore =	virtio_mem_restore,
2764#endif
2765};
2766
2767module_virtio_driver(virtio_mem_driver);
2768MODULE_DEVICE_TABLE(virtio, virtio_mem_id_table);
2769MODULE_AUTHOR("David Hildenbrand <david@redhat.com>");
2770MODULE_DESCRIPTION("Virtio-mem driver");
2771MODULE_LICENSE("GPL");